Closed jroakes closed 2 years ago
BTW I haven't added the linter to this repo yet - but it's on my to-do list! It depends on whether we retire this repo (as its name suggests!) or think it's worth adding in the meantime. It's probably worth adding, to be honest, so I will have a look...
Status: Passed
{
"redirected": false,
"status": 200,
"size": 7191,
"size_kib": 7.0224609375,
"over_google_limit": false,
"comment_count": 2,
"record_counts": {
"by_type": {
"sitemap": 1,
"user_agent": 4,
"allow": 54,
"disallow": 228,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"by_useragent": {
"*": {
"allow": 51,
"disallow": 224,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"adsbot-google": {
"allow": 1,
"disallow": 4,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"twitterbot": {
"allow": 1,
"disallow": 0,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"facebookexternalhit": {
"allow": 1,
"disallow": 0,
"crawl_delay": 0,
"noindex": 0,
"other": 0
}
}
}
}
Status: Passed
{
"redirected": false,
"status": 200,
"size": 7482,
"size_kib": 7.306640625,
"over_google_limit": false,
"comment_count": 1,
"record_counts": {
"by_type": {
"sitemap": 5,
"user_agent": 6,
"allow": 7,
"disallow": 264,
"crawl_delay": 0,
"noindex": 0,
"other": 1
},
"by_useragent": {
"*": {
"allow": 1,
"disallow": 45,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"baiduspider": {
"allow": 0,
"disallow": 163,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"haosouspider": {
"allow": 0,
"disallow": 53,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"sogou web spider": {
"allow": 2,
"disallow": 1,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"sogou inst spider": {
"allow": 2,
"disallow": 1,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"sogou spider2": {
"allow": 2,
"disallow": 1,
"crawl_delay": 0,
"noindex": 0,
"other": 1
}
}
}
}
Status: Passed
{
"redirected": false,
"status": 200,
"size": 2279,
"size_kib": 2.2255859375,
"over_google_limit": false,
"comment_count": 19,
"record_counts": {
"by_type": {
"sitemap": 0,
"user_agent": 1,
"allow": 32,
"disallow": 36,
"crawl_delay": 1,
"noindex": 0,
"other": 0
},
"by_useragent": {
"*": {
"allow": 32,
"disallow": 36,
"crawl_delay": 1,
"noindex": 0,
"other": 0
}
}
}
}
Status: Passed
{
"redirected": false,
"status": 200,
"size": 14,
"size_kib": 0.013671875,
"over_google_limit": false,
"comment_count": 0,
"record_counts": {
"by_type": {
"sitemap": 0,
"user_agent": 1,
"allow": 0,
"disallow": 0,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"by_useragent": {
"*": {
"allow": 0,
"disallow": 0,
"crawl_delay": 0,
"noindex": 0,
"other": 0
}
}
}
}
Status: Passed
{
"redirected": false,
"status": 404,
"size": 1245,
"size_kib": 1.2158203125,
"over_google_limit": false,
"comment_count": 2,
"record_counts": {
"by_type": {
"sitemap": 0,
"user_agent": 0,
"allow": 0,
"disallow": 0,
"crawl_delay": 0,
"noindex": 0,
"other": 27
},
"by_useragent": {}
}
}
Status: Passed
{
"redirected": false,
"status": 200,
"size": 2426760,
"size_kib": 2369.8828125,
"over_google_limit": true,
"comment_count": 501,
"record_counts": {
"by_type": {
"sitemap": 1,
"user_agent": 1,
"allow": 22076,
"disallow": 22424,
"crawl_delay": 0,
"noindex": 5506,
"other": 0
},
"by_useragent": {
"*": {
"allow": 22076,
"disallow": 22424,
"crawl_delay": 0,
"noindex": 5506,
"other": 0
}
}
}
}
Status: Passed
{
"redirected": false,
"status": 404,
"size": 103,
"size_kib": 0.1005859375,
"over_google_limit": false,
"comment_count": 0,
"record_counts": {
"by_type": {
"sitemap": 0,
"user_agent": 0,
"allow": 0,
"disallow": 0,
"crawl_delay": 0,
"noindex": 0,
"other": 1
},
"by_useragent": {}
}
}
Status: Passed
{
"redirected": false,
"status": 200,
"size": 80784,
"size_kib": 78.890625,
"over_google_limit": false,
"comment_count": 12,
"record_counts": {
"by_type": {
"sitemap": 0,
"user_agent": 36,
"allow": 251,
"disallow": 2863,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"by_useragent": {
"googlebot": {
"allow": 9,
"disallow": 87,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"applebot": {
"allow": 9,
"disallow": 83,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"bingbot": {
"allow": 9,
"disallow": 87,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"msnbot": {
"allow": 7,
"disallow": 84,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"slurp": {
"allow": 7,
"disallow": 84,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"googlebot-image": {
"allow": 7,
"disallow": 85,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"baiduspider": {
"allow": 14,
"disallow": 170,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"seznambot": {
"allow": 7,
"disallow": 84,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"teoma": {
"allow": 7,
"disallow": 84,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"yandex": {
"allow": 7,
"disallow": 84,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"yeti": {
"allow": 7,
"disallow": 84,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"msnbot-media": {
"allow": 7,
"disallow": 84,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"googlebot-news": {
"allow": 7,
"disallow": 85,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"googlebot-video": {
"allow": 7,
"disallow": 85,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"googlebot-mobile": {
"allow": 7,
"disallow": 85,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"mediapartners-google": {
"allow": 7,
"disallow": 86,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"adsbot-google": {
"allow": 7,
"disallow": 85,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"duckduckbot": {
"allow": 7,
"disallow": 84,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"daumoa": {
"allow": 7,
"disallow": 84,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"orangebot": {
"allow": 7,
"disallow": 84,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"orangebot-collector": {
"allow": 7,
"disallow": 84,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"stackrambler": {
"allow": 7,
"disallow": 84,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"twitterbot/1.0": {
"allow": 7,
"disallow": 82,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"mail.ru_bot": {
"allow": 7,
"disallow": 84,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"naverbot": {
"allow": 7,
"disallow": 84,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"vebidoobot": {
"allow": 6,
"disallow": 78,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"mj12bot": {
"allow": 6,
"disallow": 78,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"360spider": {
"allow": 7,
"disallow": 84,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"sogou": {
"allow": 7,
"disallow": 84,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"rogerbot": {
"allow": 9,
"disallow": 84,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"deepcrawl": {
"allow": 9,
"disallow": 84,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"bytespider": {
"allow": 9,
"disallow": 87,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"neevabot": {
"allow": 9,
"disallow": 87,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"cincobot": {
"allow": 1,
"disallow": 0,
"crawl_delay": 0,
"noindex": 0,
"other": 0
},
"*": {
"allow": 0,
"disallow": 1,
"crawl_delay": 0,
"noindex": 0,
"other": 0
}
}
}
}
Made adjustments to correct for testing errors. I decided not to special-case 40X/50X errors by returning no data, because 1) it would make the code more complex, and 2) users can easily configure their queries to exclude non-200 responses.
There is a bug in this script: when tested on johnmu.com, it reports URL fragments as comments. I need to update the script to correct this.
I think I am ready for review. I have updated to 8 test cases that cover various use-cases. Please let me know if there is anything else needed.
Discussed in this comment: https://github.com/HTTPArchive/almanac.httparchive.org/pull/2351#issuecomment-932691221. The linter is going to complain, and I will fix it. I stink at JS.