websockets / ws

Simple to use, blazing fast and thoroughly tested WebSocket client and server for Node.js
MIT License
21.61k stars 2.42k forks source link

Emit at most one event per event loop iteration #2218

Closed lpinca closed 5 months ago

lpinca commented 5 months ago

Fixes #2216

lpinca commented 5 months ago

I'm not sure why, but with a simple echo benchmark it seems to improve performance in all cases, especially on Linux

# macOS, allowSynchronousEvents = true
./load_test 32 127.0.0.1 8080 0 0 125
Using message size of 125 bytes
Running benchmark now...
Msg/sec: 79436.500000
Msg/sec: 81366.750000
Msg/sec: 81826.250000
Msg/sec: 81597.500000
Msg/sec: 81824.500000
Msg/sec: 81801.250000
# macOS, allowSynchronousEvents = false
./load_test 32 127.0.0.1 8080 0 0 125
Using message size of 125 bytes
Running benchmark now...
Msg/sec: 81455.250000
Msg/sec: 82939.000000
Msg/sec: 83173.500000
Msg/sec: 83274.750000
Msg/sec: 83354.250000
Msg/sec: 83395.500000
# virtualized Linux, allowSynchronousEvents = true
./load_test 32 127.0.0.1 8080 0 0 125
Using message size of 125 bytes
Running benchmark now...
Msg/sec: 61000.000000
Msg/sec: 64534.500000
Msg/sec: 65011.250000
Msg/sec: 65059.500000
Msg/sec: 64989.000000
Msg/sec: 64912.000000
# virtualized Linux, allowSynchronousEvents = false
./load_test 32 127.0.0.1 8080 0 0 125
Using message size of 125 bytes
Running benchmark now...
Msg/sec: 97008.500000
Msg/sec: 101144.000000
Msg/sec: 101579.000000
Msg/sec: 101271.000000
Msg/sec: 101091.750000
Msg/sec: 101559.000000

cc: @OrKon

lpinca commented 5 months ago

This is more in line with the expectations

luigi@imac:bench (master)$ pwd
/Users/luigi/code/ws/bench
luigi@imac:bench (master)$ node parser.benchmark.js 
ping frame (5 bytes payload) x 1,968,413 ops/sec ±0.50% (87 runs sampled)
ping frame (no payload) x 2,595,798 ops/sec ±0.19% (90 runs sampled)
text frame (20 bytes payload) x 1,500,100 ops/sec ±0.66% (85 runs sampled)
binary frame (125 bytes payload) x 1,499,442 ops/sec ±0.54% (82 runs sampled)
binary frame (65535 bytes payload) x 165,904 ops/sec ±0.26% (90 runs sampled)
binary frame (200 KiB payload) x 58,271 ops/sec ±0.29% (90 runs sampled)
binary frame (1 MiB payload) x 11,380 ops/sec ±0.27% (89 runs sampled)
luigi@imac:bench (master)$ git checkout fix/issue-2216 
Switched to branch 'fix/issue-2216'
Your branch is up to date with 'origin/fix/issue-2216'.
luigi@imac:bench (fix/issue-2216)$ node parser.benchmark.js 
ping frame (5 bytes payload) x 63,608 ops/sec ±0.41% (82 runs sampled)
ping frame (no payload) x 64,907 ops/sec ±0.30% (86 runs sampled)
text frame (20 bytes payload) x 63,806 ops/sec ±0.31% (89 runs sampled)
binary frame (125 bytes payload) x 63,565 ops/sec ±0.30% (87 runs sampled)
binary frame (65535 bytes payload) x 47,378 ops/sec ±0.41% (90 runs sampled)
binary frame (200 KiB payload) x 30,985 ops/sec ±0.56% (87 runs sampled)
binary frame (1 MiB payload) x 9,406 ops/sec ±0.38% (88 runs sampled)

The echo benchmark is a special case.

lpinca commented 5 months ago
// server.js

// Minimal hand-rolled WebSocket server for benchmarking: it completes the
// RFC 6455 opening handshake manually and then streams a pre-built 125-byte
// binary frame to the socket as fast as backpressure allows, so only the
// client's receive path is being measured.
const { createHash } = require('crypto');
const { createServer } = require('http');
const { Readable } = require('stream');
const { Sender } = require('ws');

// Fixed GUID from RFC 6455 used to derive the Sec-WebSocket-Accept value.
const GUID = '258EAFA5-E914-47DA-95CA-C5AB0DC85B11';

// Build one unmasked binary frame (FIN, opcode 2, 125-byte payload) up front
// so the read loop only pushes the same buffer over and over.
const payload = Buffer.alloc(125);
const frame = Buffer.concat(Sender.frame(payload, { fin: true, opcode: 2 }));

// Endless readable source: every read() request supplies the same frame.
const readable = new Readable({
  read() {
    this.push(frame);
  }
});

// Compute the Sec-WebSocket-Accept value for an upgrade request.
const acceptKey = (request) =>
  createHash('sha1')
    .update(request.headers['sec-websocket-key'] + GUID)
    .digest('base64');

const server = createServer();

server.on('upgrade', (request, socket) => {
  // Reply with the 101 response ourselves instead of using WebSocketServer,
  // keeping the server side as thin as possible.
  const response = [
    'HTTP/1.1 101 Switching Protocols',
    'Upgrade: websocket',
    'Connection: Upgrade',
    `Sec-WebSocket-Accept: ${acceptKey(request)}`,
    '\r\n'
  ].join('\r\n');

  socket.write(response);
  readable.pipe(socket);
});

server.listen(8080, () => {
  console.log('Server listening on [::]:8080');
});
// client.js

// Benchmark client: connects to the local frame-flooding server, feeds every
// received message's length into a rolling speedometer, and prints the
// current throughput every two seconds.
const prettierBytes = require('prettier-bytes');
const speedometer = require('speedometer');
const WebSocket = require('ws');

const speed = speedometer();

const ws = new WebSocket('ws://localhost:8080');

// Account for each message by its byte length only; the payload is ignored.
ws.on('message', (data) => {
  speed(data.length);
});

// Periodically report the measured rate in human-readable units.
setInterval(() => {
  console.log(`${prettierBytes(speed())}/s`);
}, 2000);
# websockets/ws#master
node client.js
56 MB/s
62 MB/s
63 MB/s
63 MB/s
63 MB/s
63 MB/s
63 MB/s
63 MB/s
# websockets/ws#fix/issue-2216
node client.js
59 MB/s
66 MB/s
67 MB/s
66 MB/s
66 MB/s
67 MB/s
66 MB/s
65 MB/s
OrKoN commented 5 months ago

On my macbook pro m1 I see different results:

# websockets/ws#master
node client.js
29 MB/s
33 MB/s
33 MB/s
31 MB/s
31 MB/s
32 MB/s
# websockets/ws#fix/issue-2216
node client.js
8.1 MB/s
9.0 MB/s
9.1 MB/s
9.1 MB/s
9.2 MB/s
lpinca commented 5 months ago

Yes, I also saw similar results on macOS yesterday. I can't make this the new default if the same happens also on Linux. The results in https://github.com/websockets/ws/pull/2218#issuecomment-2051375946 are coming from WSL 2. Below are the results on Windows.

# websockets/ws#master
node client.js
11 MB/s
12 MB/s
12 MB/s
12 MB/s
9.4 MB/s
6.9 MB/s
6.4 MB/s
6.6 MB/s
6.3 MB/s
6.2 MB/s
# websockets/ws#fix/issue-2216
node client.js
11 MB/s
12 MB/s
12 MB/s
12 MB/s
9.4 MB/s
7.0 MB/s
6.4 MB/s
6.2 MB/s
6.1 MB/s
6.1 MB/s
lpinca commented 5 months ago

Below are the results on Linux on bare metal

# websockets/ws#master
node client.js 
35 MB/s
39 MB/s
40 MB/s
40 MB/s
40 MB/s
40 MB/s
40 MB/s
39 MB/s
40 MB/s
40 MB/s
# websockets/ws#fix/issue-2216
node client.js 
27 MB/s
30 MB/s
31 MB/s
31 MB/s
31 MB/s
31 MB/s
31 MB/s
31 MB/s
31 MB/s
31 MB/s
OrKoN commented 5 months ago

I have also tried the following client benchmark:

const WebSocket = require('ws'); // omit in the browser

// Timing client: measures how long it takes to receive one million messages
// from the local benchmark server, then closes the connection.
const ws = new WebSocket('ws://localhost:8080');

console.time('ws');

let received = 0;
ws.on('message', () => { // addEventListener in browser
  // Stop the timer once the millionth message has arrived.
  if (++received === 1_000_000) {
    console.timeEnd('ws');
    ws.close();
  }
});
#websockets/ws#fix/issue-2216
node client.cjs
ws: 12.662s
#websockets/ws#master
node client.cjs
ws: 3.670s
#chrome-canary
ws: 101s

Edit: corrected result labels

lpinca commented 5 months ago

I'm not sure how to proceed. This adds a very big performance penalty but the behavior on master is broken. We can make the whole "one event per tick" optional but that means reverting 93e3552e95ba5ad656c30b94f6be96afe22d4805 and 4ed7fe58b42a87d06452b6bc19028d167262c30b and adding a new option for it and that is semver-major.

OrKoN commented 5 months ago

We are not able to replicate the regression on a Linux bare metal machine. I think the benchmark is a bit flawed since it exercises the network stack and it is probably affected by the OS scheduler, network stack etc. For example, in the browser the network messages probably go through the network service, resulting in some IPC roundtrips. I guess to measure the impact correctly we would need to mock the network stack and dispatch client events in some other way.

Also, I think I mentioned this but this sort of logic for event dispatch is only specced for the client, and the server side does not need it as there is no need for interoperability.

lpinca commented 5 months ago

We are not able to replicate the regression on a Linux bare metal machine. I think the benchmark is a bit flawed since it exercises the network stack and it is probably affected by the OS scheduler, network stack etc. For example, in the browser the network messages probably go throw the network service resulting in some IPC roundtrips. I guess to measure the impact correctly we would need to mock the network stack and dispatch client events in some other way.

It should exercise the network stack, otherwise the only reliable benchmark is this one https://github.com/websockets/ws/pull/2218#issuecomment-2050331136 🥲

Also, I think I mentioned this but this sort of logic for event dispatch is only-speced for the client and the server side does not need it as there is no need for interoperability.

Yes, but there is only one WebSocket class in ws. It is shared by the client and the server. We had a similar discussion about this here https://github.com/websockets/ws/issues/2159#issuecomment-1692171593.

lpinca commented 5 months ago

What do you think about flipping the default value of the allowSynchronousEvents option? It is technically semver-major but we can do it in a minor release. I don't think it is widely used.

I don't like to restore the original non WHATWG compliant behavior but I see no other way. In hindsight following the spec here was a mistake.

lpinca commented 5 months ago

Many small messages (30 bytes) in the same data chunk

// server.js

// Benchmark server variant: emits many small WebSocket messages per data
// chunk. It performs the RFC 6455 handshake by hand and then streams a chunk
// containing 512 identical 30-byte binary frames on every read() request, so
// the client must dispatch hundreds of 'message' events per chunk.
const { createHash } = require('crypto');
const { createServer } = require('http');
const { Readable } = require('stream');
const { Sender } = require('ws');

// Fixed GUID from RFC 6455 used to derive the Sec-WebSocket-Accept value.
const GUID = '258EAFA5-E914-47DA-95CA-C5AB0DC85B11';

// One unmasked binary frame (FIN, opcode 2, 30-byte payload), repeated 512
// times inside a single buffer so each socket write carries many messages.
const payload = Buffer.alloc(30);
const frame = Buffer.concat(Sender.frame(payload, { fin: true, opcode: 2 }));
const data = Buffer.concat(new Array(512).fill(frame));

// Endless readable source: every read() request supplies the same chunk.
const readable = new Readable({
  read() {
    this.push(data);
  }
});

// Compute the Sec-WebSocket-Accept value for an upgrade request.
const acceptKey = (request) =>
  createHash('sha1')
    .update(request.headers['sec-websocket-key'] + GUID)
    .digest('base64');

const server = createServer();

server.on('upgrade', (request, socket) => {
  // Reply with the 101 response ourselves instead of using WebSocketServer,
  // keeping the server side as thin as possible.
  const response = [
    'HTTP/1.1 101 Switching Protocols',
    'Upgrade: websocket',
    'Connection: Upgrade',
    `Sec-WebSocket-Accept: ${acceptKey(request)}`,
    '\r\n'
  ].join('\r\n');

  socket.write(response);
  readable.pipe(socket);
});

server.listen(8080, () => {
  console.log('Server listening on [::]:8080');
});
// client.js

// Timing client: starts a timer on connection open and stops it after ten
// million messages have been received, then tears the connection down.
const WebSocket = require('ws');

let received = 0;

const ws = new WebSocket('ws://127.0.0.1:8080');

// Begin timing only once the handshake has completed.
ws.on('open', () => {
  console.time('ws');
});

ws.on('message', () => {
  // Stop the timer at exactly ten million messages and kill the socket.
  if (++received === 10_000_000) {
    console.timeEnd('ws');

    ws.terminate();
  }
});
# WSL 2, websockets/ws#master
node client.js
ws: 3.461s
# WSL 2, websockets/ws#fix/issue-2216
node client.js
ws: 11.426s
# Windows, websockets/ws#master
node client.js
ws: 3.743s
# Windows, websockets/ws#fix/issue-2216
node client.js
ws: 16.089s
# Linux on bare metal, websockets/ws#master
node client.js 
ws: 3.976s
# Linux on bare metal, websockets/ws#fix/issue-2216
node client.js 
ws: 12.114s
# macOS, websockets/ws#master
node client.js 
ws: 4.963s
# macOS, websockets/ws#fix/issue-2216
node client.js 
ws: 2:33.128 (m:ss.mmm)
lpinca commented 5 months ago

@OrKoN any feedback/suggestions? Is it inconvenient for you to use the allowSynchronousEvents option to get the desired behavior? FWIW, I noticed that both Bun and Deno leak memory when running the benchmark above (https://github.com/websockets/ws/pull/2218#issuecomment-2056889841) without a limit on the number of messages on the client.

OrKoN commented 5 months ago

@lpinca sorry, for the delay, requiring an option to opt-in into more spec-aligned behaviour would work for us, thanks!