Closed lpinca closed 5 months ago
I'm not sure why, but with a simple echo benchmark it seems to improve performance in all cases, especially on Linux
# macOS, allowSynchronousEvents = true
./load_test 32 127.0.0.1 8080 0 0 125
Using message size of 125 bytes
Running benchmark now...
Msg/sec: 79436.500000
Msg/sec: 81366.750000
Msg/sec: 81826.250000
Msg/sec: 81597.500000
Msg/sec: 81824.500000
Msg/sec: 81801.250000
# macOS, allowSynchronousEvents = false
./load_test 32 127.0.0.1 8080 0 0 125
Using message size of 125 bytes
Running benchmark now...
Msg/sec: 81455.250000
Msg/sec: 82939.000000
Msg/sec: 83173.500000
Msg/sec: 83274.750000
Msg/sec: 83354.250000
Msg/sec: 83395.500000
# virtualized Linux, allowSynchronousEvents = true
./load_test 32 127.0.0.1 8080 0 0 125
Using message size of 125 bytes
Running benchmark now...
Msg/sec: 61000.000000
Msg/sec: 64534.500000
Msg/sec: 65011.250000
Msg/sec: 65059.500000
Msg/sec: 64989.000000
Msg/sec: 64912.000000
# virtualized Linux, allowSynchronousEvents = false
./load_test 32 127.0.0.1 8080 0 0 125
Using message size of 125 bytes
Running benchmark now...
Msg/sec: 97008.500000
Msg/sec: 101144.000000
Msg/sec: 101579.000000
Msg/sec: 101271.000000
Msg/sec: 101091.750000
Msg/sec: 101559.000000
cc: @OrKon
This is more in line with the expectations
luigi@imac:bench (master)$ pwd
/Users/luigi/code/ws/bench
luigi@imac:bench (master)$ node parser.benchmark.js
ping frame (5 bytes payload) x 1,968,413 ops/sec ±0.50% (87 runs sampled)
ping frame (no payload) x 2,595,798 ops/sec ±0.19% (90 runs sampled)
text frame (20 bytes payload) x 1,500,100 ops/sec ±0.66% (85 runs sampled)
binary frame (125 bytes payload) x 1,499,442 ops/sec ±0.54% (82 runs sampled)
binary frame (65535 bytes payload) x 165,904 ops/sec ±0.26% (90 runs sampled)
binary frame (200 KiB payload) x 58,271 ops/sec ±0.29% (90 runs sampled)
binary frame (1 MiB payload) x 11,380 ops/sec ±0.27% (89 runs sampled)
luigi@imac:bench (master)$ git checkout fix/issue-2216
Switched to branch 'fix/issue-2216'
Your branch is up to date with 'origin/fix/issue-2216'.
luigi@imac:bench (fix/issue-2216)$ node parser.benchmark.js
ping frame (5 bytes payload) x 63,608 ops/sec ±0.41% (82 runs sampled)
ping frame (no payload) x 64,907 ops/sec ±0.30% (86 runs sampled)
text frame (20 bytes payload) x 63,806 ops/sec ±0.31% (89 runs sampled)
binary frame (125 bytes payload) x 63,565 ops/sec ±0.30% (87 runs sampled)
binary frame (65535 bytes payload) x 47,378 ops/sec ±0.41% (90 runs sampled)
binary frame (200 KiB payload) x 30,985 ops/sec ±0.56% (87 runs sampled)
binary frame (1 MiB payload) x 9,406 ops/sec ±0.38% (88 runs sampled)
The echo benchmark is a special case.
// server.js
const { createHash } = require('crypto');
const { createServer } = require('http');
const { Readable } = require('stream');
const { Sender } = require('ws');

// WebSocket handshake GUID (RFC 6455).
const GUID = '258EAFA5-E914-47DA-95CA-C5AB0DC85B11';

// Pre-build one 125-byte binary frame and stream it to clients forever.
const payload = Buffer.alloc(125);
const frame = Buffer.concat(Sender.frame(payload, { fin: true, opcode: 2 }));

const readable = new Readable({
  read() {
    this.push(frame);
  }
});

const server = createServer();

server.on('upgrade', function (request, socket) {
  // Complete the WebSocket opening handshake manually.
  const accept = createHash('sha1')
    .update(request.headers['sec-websocket-key'] + GUID)
    .digest('base64');

  const handshake = [
    'HTTP/1.1 101 Switching Protocols',
    'Upgrade: websocket',
    'Connection: Upgrade',
    `Sec-WebSocket-Accept: ${accept}`,
    '\r\n'
  ].join('\r\n');

  socket.write(handshake);
  readable.pipe(socket);
});

server.listen(8080, function () {
  console.log('Server listening on [::]:8080');
});
// client.js
const prettierBytes = require('prettier-bytes');
const speedometer = require('speedometer');
const WebSocket = require('ws');

const speed = speedometer();
const ws = new WebSocket('ws://localhost:8080');

// Feed the byte count of each message into the speedometer.
ws.on('message', (data) => {
  speed(data.length);
});

// Report throughput every two seconds.
setInterval(() => {
  console.log(prettierBytes(speed()) + '/s');
}, 2000);
# websockets/ws#master
node client.js
56 MB/s
62 MB/s
63 MB/s
63 MB/s
63 MB/s
63 MB/s
63 MB/s
63 MB/s
# websockets/ws#fix/issue-2216
node client.js
59 MB/s
66 MB/s
67 MB/s
66 MB/s
66 MB/s
67 MB/s
66 MB/s
65 MB/s
On my macbook pro m1 I see different results:
# websockets/ws#master
node client.js
29 MB/s
33 MB/s
33 MB/s
31 MB/s
31 MB/s
32 MB/s
# websockets/ws#fix/issue-2216
node client.js
8.1 MB/s
9.0 MB/s
9.1 MB/s
9.1 MB/s
9.2 MB/s
Yes, I also saw similar results on macOS yesterday. I can't make this the new default if the same happens also on Linux. The results in https://github.com/websockets/ws/pull/2218#issuecomment-2051375946 are coming from WSL 2. Below are the results on Windows.
# websockets/ws#master
node client.js
11 MB/s
12 MB/s
12 MB/s
12 MB/s
9.4 MB/s
6.9 MB/s
6.4 MB/s
6.6 MB/s
6.3 MB/s
6.2 MB/s
# websockets/ws#fix/issue-2216
node client.js
11 MB/s
12 MB/s
12 MB/s
12 MB/s
9.4 MB/s
7.0 MB/s
6.4 MB/s
6.2 MB/s
6.1 MB/s
6.1 MB/s
Below are the results on Linux on bare metal
# websockets/ws#master
node client.js
35 MB/s
39 MB/s
40 MB/s
40 MB/s
40 MB/s
40 MB/s
40 MB/s
39 MB/s
40 MB/s
40 MB/s
# websockets/ws#fix/issue-2216
node client.js
27 MB/s
30 MB/s
31 MB/s
31 MB/s
31 MB/s
31 MB/s
31 MB/s
31 MB/s
31 MB/s
31 MB/s
I have also tried the following client benchmark:
const WebSocket = require('ws'); // omit in the browser

const ws = new WebSocket('ws://localhost:8080');

console.time("ws");

// Stop the timer once one million messages have arrived.
let received = 0;
ws.on('message', function (data) { // addEventListener in browser
  received += 1;
  if (received === 1000000) {
    console.timeEnd("ws");
    ws.close();
  }
});
#websockets/ws#fix/issue-2216
node client.cjs
ws: 12.662s
#websockets/ws#master
node client.cjs
ws: 3.670s
#chrome-canary
ws: 101s
Edit: corrected result labels
I'm not sure how to proceed. This adds a very big performance penalty but the behavior on master is broken. We can make the whole "one event per tick" optional but that means reverting 93e3552e95ba5ad656c30b94f6be96afe22d4805 and 4ed7fe58b42a87d06452b6bc19028d167262c30b and adding a new option for it and that is semver-major.
We are not able to replicate the regression on a Linux bare metal machine. I think the benchmark is a bit flawed since it exercises the network stack and it is probably affected by the OS scheduler, network stack, etc. For example, in the browser the network messages probably go through the network service, resulting in some IPC roundtrips. I guess to measure the impact correctly we would need to mock the network stack and dispatch client events in some other way.
Also, I think I mentioned this, but this sort of logic for event dispatch is only specified for the client, and the server side does not need it as there is no need for interoperability.
We are not able to replicate the regression on a Linux bare metal machine. I think the benchmark is a bit flawed since it exercises the network stack and it is probably affected by the OS scheduler, network stack, etc. For example, in the browser the network messages probably go through the network service, resulting in some IPC roundtrips. I guess to measure the impact correctly we would need to mock the network stack and dispatch client events in some other way.
It should exercise the network stack; otherwise, a reliable benchmark is this one https://github.com/websockets/ws/pull/2218#issuecomment-2050331136 🥲
Also, I think I mentioned this, but this sort of logic for event dispatch is only specified for the client, and the server side does not need it as there is no need for interoperability.
Yes, but there is only one WebSocket
class in ws
. It is shared by the client and the server. We had a similar discussion about this here https://github.com/websockets/ws/issues/2159#issuecomment-1692171593.
What do you think about flipping the default value of the allowSynchronousEvents
option? It is technically semver-major but we can do it in a minor release. I don't think it is widely used.
I don't like to restore the original non WHATWG compliant behavior but I see no other way. In hindsight following the spec here was a mistake.
Many small messages (30 bytes) in the same data chunk
// server.js
const { createHash } = require('crypto');
const { createServer } = require('http');
const { Readable } = require('stream');
const { Sender } = require('ws');

// WebSocket handshake GUID (RFC 6455).
const GUID = '258EAFA5-E914-47DA-95CA-C5AB0DC85B11';

// Coalesce 512 small (30-byte payload) frames into one chunk so that
// each socket write delivers many messages at once.
const payload = Buffer.alloc(30);
const frame = Buffer.concat(Sender.frame(payload, { fin: true, opcode: 2 }));
const data = Buffer.concat(new Array(512).fill(frame));

const readable = new Readable({
  read() {
    this.push(data);
  }
});

const server = createServer();

server.on('upgrade', function (request, socket) {
  // Complete the WebSocket opening handshake manually.
  const accept = createHash('sha1')
    .update(request.headers['sec-websocket-key'] + GUID)
    .digest('base64');

  const handshake = [
    'HTTP/1.1 101 Switching Protocols',
    'Upgrade: websocket',
    'Connection: Upgrade',
    `Sec-WebSocket-Accept: ${accept}`,
    '\r\n'
  ].join('\r\n');

  socket.write(handshake);
  readable.pipe(socket);
});

server.listen(8080, function () {
  console.log('Server listening on [::]:8080');
});
// client.js
const WebSocket = require('ws');

const ws = new WebSocket('ws://127.0.0.1:8080');

let received = 0;

// Start timing as soon as the connection is established.
ws.on('open', () => {
  console.time('ws');
});

// Stop after ten million messages and tear the connection down.
ws.on('message', () => {
  received += 1;
  if (received === 10_000_000) {
    console.timeEnd('ws');
    ws.terminate();
  }
});
# WSL 2, websockets/ws#master
node client.js
ws: 3.461s
# WSL 2, websockets/ws#fix/issue-2216
node client.js
ws: 11.426s
# Windows, websockets/ws#master
node client.js
ws: 3.743s
# Windows, websockets/ws#fix/issue-2216
node client.js
ws: 16.089s
# Linux on bare metal, websockets/ws#master
node client.js
ws: 3.976s
# Linux on bare metal, websockets/ws#fix/issue-2216
node client.js
ws: 12.114s
# macOS, websockets/ws#master
node client.js
ws: 4.963s
# macOS, websockets/ws#fix/issue-2216
node client.js
ws: 2:33.128 (m:ss.mmm)
@OrKoN any feedback/suggestions? Is it inconvenient for you to use the allowSynchronousEvents
option to get the desired behavior? FWIW, I noticed that both Bun and Deno leak memory when running the benchmark above (https://github.com/websockets/ws/pull/2218#issuecomment-2056889841) without a limit on the number of messages on the client.
@lpinca sorry for the delay; requiring an option to opt in to more spec-aligned behaviour would work for us, thanks!
Fixes #2216