TechAndCheck / tech-and-check-alerts

Daily tip sheet for fact checkers
MIT License
13 stars 6 forks source link

Improve handling of Twitter accounts we can't scrape #216

Open reefdog opened 5 years ago

reefdog commented 5 years ago

Watching the Twitter scrape logs, we get a lot of 401 (not authorized) and 404 (not found) errors. This could be a Twitter service flicker, but more likely means some accounts on our list have been turned into protected/private accounts, or have been deactivated.

We should do something with these — probably just send ourselves a notification so we can decide whether to remove the affected accounts from the list.

reefdog commented 5 years ago

Update: combined.log actually does give us enough info to go on (the entry below is expanded from the original single-line log dump):

{
  "name": "StatusCodeError",
  "statusCode": 401,
  "message": "401 - \"{\\\"request\\\":\\\"\\\\/1.1\\\\/statuses\\\\/user_timeline.json\\\",\\\"error\\\":\\\"Not authorized.\\\"}\"",
  "error": "{\"request\":\"\\/1.1\\/statuses\\/user_timeline.json\",\"error\":\"Not authorized.\"}",
  "options": {
    "url": "https://api.twitter.com/1.1/statuses/user_timeline.json?screen_name=SusanFisher114&include_rts=false&tweet_mode=extended",
    "headers": {
      "Authorization": "OAuth oauth_consumer_key=\"X0DK65ToKH0FwLS8a4RuEK2S9\",oauth_nonce=\"hT46DuRFZB4yKy1wbM2XgNTlhe1XBtjI\",oauth_signature_method=\"HMAC-SHA1\",oauth_timestamp=\"1568386879\",oauth_token=\"4924987672-sDq1GzhX0VE24sX7DCmdrJQXre1b7ZoKm6ga1uc\",oauth_version=\"1.0\",oauth_signature=\"lmZuxJn5%2FhEH7xcPud7pA26tufE%3D\""
    },
    "simple": true,
    "resolveWithFullResponse": false,
    "transform2xxOnly": false
  },
  "response": {
    "statusCode": 401,
    "body": "{\"request\":\"\\/1.1\\/statuses\\/user_timeline.json\",\"error\":\"Not authorized.\"}",
    "headers": {
      "cache-control": "no-cache, no-store, must-revalidate, pre-check=0, post-check=0",
      "connection": "close",
      "content-disposition": "attachment; filename=json.json",
      "content-length": "75",
      "content-type": "application/json;charset=utf-8",
      "date": "Fri, 13 Sep 2019 15:01:19 GMT",
      "expires": "Tue, 31 Mar 1981 05:00:00 GMT",
      "last-modified": "Fri, 13 Sep 2019 15:01:19 GMT",
      "pragma": "no-cache",
      "server": "tsa_b",
      "set-cookie": [
        "personalization_id=\"v1_kBEPIjr9aOy8eecveoCMPw==\"; Max-Age=63072000; Expires=Sun, 12 Sep 2021 15:01:19 GMT; Path=/; Domain=.twitter.com",
        "lang=en; Path=/",
        "guest_id=v1%3A156838687914851032; Max-Age=63072000; Expires=Sun, 12 Sep 2021 15:01:19 GMT; Path=/; Domain=.twitter.com"
      ],
      "status": "401 Unauthorized",
      "strict-transport-security": "max-age=631138519",
      "www-authenticate": "OAuth realm=\"https://api.twitter.com\"",
      "x-access-level": "read-write",
      "x-connection-hash": "02e70a00ccc972747b70b248a3739ebf",
      "x-content-type-options": "nosniff",
      "x-frame-options": "SAMEORIGIN",
      "x-rate-limit-limit": "900",
      "x-rate-limit-remaining": "560",
      "x-rate-limit-reset": "1568387700",
      "x-response-time": "25",
      "x-transaction": "0033439d0038df0a",
      "x-twitter-response-tags": "BouncerCompliant",
      "x-xss-protection": "0"
    },
    "request": {
      "uri": {
        "protocol": "https:",
        "slashes": true,
        "auth": null,
        "host": "api.twitter.com",
        "port": 443,
        "hostname": "api.twitter.com",
        "hash": null,
        "search": "?screen_name=SusanFisher114&include_rts=false&tweet_mode=extended",
        "query": "screen_name=SusanFisher114&include_rts=false&tweet_mode=extended",
        "pathname": "/1.1/statuses/user_timeline.json",
        "path": "/1.1/statuses/user_timeline.json?screen_name=SusanFisher114&include_rts=false&tweet_mode=extended",
        "href": "https://api.twitter.com/1.1/statuses/user_timeline.json?screen_name=SusanFisher114&include_rts=false&tweet_mode=extended"
      },
      "method": "GET",
      "headers": {
        "Authorization": "OAuth oauth_consumer_key=\"X0DK65ToKH0FwLS8a4RuEK2S9\",oauth_nonce=\"hT46DuRFZB4yKy1wbM2XgNTlhe1XBtjI\",oauth_signature_method=\"HMAC-SHA1\",oauth_timestamp=\"1568386879\",oauth_token=\"4924987672-sDq1GzhX0VE24sX7DCmdrJQXre1b7ZoKm6ga1uc\",oauth_version=\"1.0\",oauth_signature=\"lmZuxJn5%2FhEH7xcPud7pA26tufE%3D\""
      }
    }
  },
  "level": "warn",
  "service": "tech-and-check-alerts",
  "timestamp": "2019-09-13 15:01:19",
  "stack": "StatusCodeError: 401 - \"{\\\"request\\\":\\\"\\\\/1.1\\\\/statuses\\\\/user_timeline.json\\\",\\\"error\\\":\\\"Not authorized.\\\"}\"\n    at new StatusCodeError (/data/repos/tech-and-check-alerts/node_modules/request-promise-core/lib/errors.js:32:15)\n    at Request.plumbing.callback (/data/repos/tech-and-check-alerts/node_modules/request-promise-core/lib/plumbing.js:104:33)\n    at Request.RP$callback [as _callback] (/data/repos/tech-and-check-alerts/node_modules/request-promise-core/lib/plumbing.js:46:31)\n    at Request.self.callback (/data/repos/tech-and-check-alerts/node_modules/request/request.js:185:22)\n    at Request.emit (events.js:209:13)\n    at Request.<anonymous> (/data/repos/tech-and-check-alerts/node_modules/request/request.js:1161:10)\n    at Request.emit (events.js:209:13)\n    at IncomingMessage.<anonymous> (/data/repos/tech-and-check-alerts/node_modules/request/request.js:1083:12)\n    at Object.onceWrapper (events.js:297:20)\n    at IncomingMessage.emit (events.js:214:15)\n    at endReadableNT (_stream_readable.js:1178:12)\n    at processTicksAndRejections (internal/process/task_queues.js:77:11)"
}

We should monitor the logs for these access errors, pluck the account name from the request URL (screen_name={account}), and notify ourselves that the account wasn't accessible.