Closed: merih-sakarya closed this issue 9 months ago
@merih-sakarya, your issue sounds like #663. What do the logs show on the provider server side and on the main server side?
@denis99999 thanks for your feedback. I contacted the network team to get more information.
For now, here are the WebSocket logs for both cases:
WebSocket Logs (Not Working)
- 0{"sid":"ib-zM-zNB9ZSpiH-AAAG","upgrades":[],"pingInterval":25000,"pingTimeout":20000}
- 40
+ 42["group.invite","yWNYYJk9EWwMQn593oCfzrv1v/4=","tx.668ce94d-3cb5-4fa8-986b-336b5d2fcf5f",{"requirements":{"serial":{"value":"1b8beee7","match":"exact"}}}]
- 42["socket.ip","::ffff:10.1.0.10"]
- 42["device.change",{"important":false,"data":{"serial":"ce10160aad263e1e03","battery":{"status":"full","health":"good","source":"usb","level":100,"scale":100,"temp":23.3,"voltage":3.943}}}]
- 42["device.change",{"important":false,"data":{"serial":"ce10160aad263e1e03","battery":{"status":"full","health":"good","source":"usb","level":100,"scale":100,"temp":23.4,"voltage":3.795}}}]
WebSocket Logs (Working)
- 0{"sid":"slS5P4ZqhQj8EForAAAN","upgrades":[],"pingInterval":25000,"pingTimeout":20000}
- 40
+ 42["group.invite","KgHlbSjaXDERTL7IMdnazMTO6kM=","tx.b11eafbb-325b-4b32-a925-c4d2de40b790",{"requirements":{"serial":{"value":"a4b2185f","match":"exact"}}}]
- 42["socket.ip","::ffff:10.1.0.7"]
- 42["device.log",{"serial":"a4b2185f","timestamp":1685035260.988,"priority":4,"tag":"device:plugins:group","pid":87,"message":"Now owned by \"test_user\"","identifier":"a4b2185f"}]
- 42["device.log",{"serial":"a4b2185f","timestamp":1685035260.989,"priority":3,"tag":"device:plugins:group","pid":87,"message":"Subscribing to group channel \"+ou+z8eSSH+dVaixt0Dh7w==\"","identifier":"a4b2185f"}]
- 42["device.change",{"important":true,"data":{"serial":"a4b2185f","owner":{"email":"test_user","name":"test_user","group":"+ou+z8eSSH+dVaixt0Dh7w=="},"likelyLeaveReason":"owner_change","usage":null,"using":true}}]
- 42["tx.done","tx.b11eafbb-325b-4b32-a925-c4d2de40b790",{"source":"a4b2185f","seq":0,"success":true,"data":"success","body":null}]
+ 42["tx.cleanup","tx.b11eafbb-325b-4b32-a925-c4d2de40b790"]
+ 42["user.settings.update",{"lastUsedDevice":"a4b2185f"}]
@denis99999, I've added new logs to lib/units/websocket/index.js. In the case where it doesn't work, I noticed that the following part isn't triggered:
.on(wire.TransactionDoneMessage, function(channel, message) {
  log.info(`DEBUG ::: lib.units.websocket ::: on TransactionDoneMessage ::: tx.done ::: channel : ${channel.toString()}, message : ${JSON.stringify(message)}`)
  socket.emit('tx.done', channel.toString(), message)
})
Do you know where exactly 'TransactionDoneMessage' is being triggered? I have a meeting with the network team on Monday, but before that, I would like to have a clear understanding of the network and data flow.
@merih-sakarya, as said before, your problem looks like a network problem, not one related to STF itself, which is working fine. That's why I advised you to take a look at the STF logs everywhere on the network path, and also to take network traces (tcpdump or Wireshark) to see what happens when the network connection is interrupted. I have no idea what your STF deployment is, but I recommend following the full deployment described in the DEPLOYMENT.md file.
@denis99999 Thank you for your previous advice. We have thoroughly investigated our network configuration, including using network trace tools such as tcpdump and Wireshark. So far, we haven't found any unusual behaviour or errors that could be causing the issue.
We have multiple mobile devices connected and initially, all of them work perfectly fine. However, after some time, a portion of these devices starts to encounter the issue while others continue to function normally. This inconsistent behavior makes it difficult to pinpoint a network issue as the root cause.
Upon comparing the logs from when the issue arises and when it doesn't, we noticed a discrepancy. Specifically, the log entry from the malfunctioning instance indicates the device owner as null:
0.f9b6ef9ef5d285a41a39.chunk.js:90943 DEBUG ::: GroupService ::: invite ::: device : {abi: 'arm64-v8a', airplaneMode: false, battery: {…}, browser: {…}, channel: 'EEidwqTUpEh8fpLvQw59Wel8LBE=', owner: null, …}
In contrast, the logs from the working instances show an actual owner for the device. Furthermore, the working instances are successfully updating the device owner in the database.
Could the issue potentially stem from the 'group.invite' function not properly assigning an owner to the device? Any further advice or recommendations based on this information would be greatly appreciated.
@merih-sakarya No, as I said before, to my knowledge this is not a functional problem with STF, but probably a problem in your STF deployment (of which I don't know the details). The problem is either network related, system related, or hardware related (whatever you connect your devices to). That probably also explains your trace showing "owner" is not up to date; it is not the update function that is faulty, otherwise it would probably be faulty all the time. To convince yourself of this, deploy all the STF service units locally on a single machine and you will see that this problem never happens. I'm sorry, but that's all I can do for you based on the information you have given me.
I have run into this same issue, with all of the services running on an EC2 instance except for the adb/provider side. Everything works fine all day; the next day the devices are still listed in STF, but clicking on one to control it gives a gray screen and it doesn't actually connect.
If I restart just the provider portion, the devices are reaped and then re-established, and it works fine for a while again.
While it could be a networking or other "not code" issue, strictly speaking, ideally STF would be explicit about failures and perhaps re-attempt to connect. As it is, because of the complexity of the infrastructure and the project itself, there is a large surface area to troubleshoot, so any information about non-happy-path handling is critical.
It would be great if the software would explicitly convey a failure to establish a connection instead of failing silently... and if there are situations where it could "self heal", that would be even better!
Even after reading the documentation and looking through what logging is available, on the networking/hosting side I don't see any evident cause in my case either. Given that the devices all work again after I restart the provider each day, I suspect a connection "implicitly/silently" times out somewhere, but I am not sure where to look next.
@merih-sakarya, @spinningD20, assuming it's a network issue on the network path as I suspect, for example a proxy or firewall closing the TCP connection when a timer expires between your provider's machine and the main machine, you can apply these settings on each machine:
# set kernel TCP keepalive properties to maintain connections to devices 24h/24h
sysctl -w net.ipv4.tcp_keepalive_time=60
sysctl -w net.ipv4.tcp_keepalive_intvl=10
# set ZMQ TCP keepalive properties to maintain connections to devices 24h/24h
ZMQ_TCP_KEEPALIVE=1
ZMQ_TCP_KEEPALIVE_IDLE=120
Let me know if this fixes your issue.
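A side note on the kernel settings: `sysctl -w` changes do not survive a reboot. To persist them, they can also be written to a sysctl configuration file; the filename below is a hypothetical example, following the usual `/etc/sysctl.d/` convention:

```shell
# /etc/sysctl.d/99-stf-keepalive.conf (hypothetical filename)
# Send the first keepalive probe after 60s of idleness, then every 10s.
net.ipv4.tcp_keepalive_time = 60
net.ipv4.tcp_keepalive_intvl = 10
```

The file can be loaded without rebooting via `sysctl --system`.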
Hi @denis99999,
I wanted to let you know that your suggestion worked for me. I applied the changes to the ZeroMQ settings as you recommended. Here's the updated code snippet I used:
var zmq = require('zeromq')
var log = require('./logger').createLogger('util:zmqutil')

module.exports.socket = function() {
  var sock = zmq.socket.apply(zmq, arguments)

  try {
    // These parameters are important for maintaining open, idle TCP connections
    // and ensuring that the other end of the connection is still reachable and active.
    // They are particularly useful in scenarios where connections might go idle
    // but still need to be kept open for future communication.
    // For example, in a client-server application, if a client goes silent
    // (no data exchange), these keepalive settings help the server determine
    // if the client is still connected or has potentially lost network connectivity.
    // If the client doesn't respond to a certain number of keepalive probes,
    // the server may assume the connection is dead and close it.
    //
    // @see https://github.com/DeviceFarmer/stf/issues/662
    // @see https://stackoverflow.com/questions/58143675/zmq-losing-subscribe-connection
    // @see https://github.com/openstf/stf/issues/100
    // @see https://libzmq.readthedocs.io/en/latest/zmq_setsockopt.html
    sock.setsockopt(zmq.ZMQ_TCP_KEEPALIVE, 1) // Enables TCP keepalive. The default is 0 (off).
    sock.setsockopt(zmq.ZMQ_TCP_KEEPALIVE_IDLE, 300) // Sets the idle time before the first keepalive probe is sent to 300 seconds (5 minutes).
    sock.setsockopt(zmq.ZMQ_TCP_KEEPALIVE_CNT, 10) // Specifies the number of keepalive probes to send before considering the connection dead if no response is received.
    sock.setsockopt(zmq.ZMQ_TCP_KEEPALIVE_INTVL, 300) // Sets the interval between individual keepalive probes to 300 seconds.
  }
  catch (err) {
    log.warn('ZeroMQ library too old, no support for some options: ' + err.message)
  }

  return sock
}
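One caveat with the single try block above: if one setsockopt call throws, the options after it are silently skipped. A standalone sketch of a variant that applies each option independently, so one unsupported option does not prevent the rest from taking effect. The fake socket and option names here are purely illustrative, not zeromq's API:

```javascript
// Illustrative stand-in for a socket whose setsockopt may reject
// options the underlying library does not know about.
function makeFakeSock(supported) {
  return {
    opts: {},
    setsockopt: function(name, value) {
      if (supported.indexOf(name) === -1) {
        throw new Error('Invalid argument: ' + name)
      }
      this.opts[name] = value
    }
  }
}

// Apply each option in its own try/catch and report which ones failed,
// instead of aborting on the first unsupported option.
function applyOpts(sock, opts) {
  var failed = []
  Object.keys(opts).forEach(function(name) {
    try {
      sock.setsockopt(name, opts[name])
    }
    catch (err) {
      failed.push(name)
    }
  })
  return failed
}
```

With an old libzmq this would still enable keepalive itself even if, say, the probe-count option is unknown, whereas the all-or-nothing try block may leave keepalive partially configured.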
This fix was crucial for maintaining stable connections in my setup. The detailed configuration for TCP keepalive options really made the difference.
Thanks for sharing this solution!
Best, Merih Sakarya
Description: After upgrading device-farmer from version 3.6.1 to 3.6.5, I've been encountering an issue where devices stop being displayed after 3-4 hours of operation.
Steps to reproduce:
Expected result: Devices should continue to display and function normally. I expect to get the following log:
Actual result: Devices stop being displayed after 3-4 hours. The log above does not appear.
Debugging: In an effort to diagnose the problem, I added some debug logs and found that the socket connection couldn't be established during the "group.invite" request.
Below are the relevant logs:
And these are the logs of a working example:
And a snippet of the code (group-service.js) where I believe the issue resides:
Additionally, I want to mention that the function getDevice in control-panes-controller.js cannot proceed to the second step, ControlService.create, because groupService.invite is not completed. Here is the getDevice function:
Update:
In response to this issue, I have implemented additional logging in lib/units/websocket/index.js to assist with diagnosis. Here's the updated section of the code: