mozilla / tippy-top-sites-deprecated

[deprecated][unmaintained]
6 stars 7 forks source link

Add Canadian sites #79

Open pdehaan opened 7 years ago

pdehaan commented 7 years ago

Not sure how far we want to push for Canadian parity with top site URLs... Also, the list below can be slightly misleading, since it mostly just checks whether an existing Top Site URL has a similar .ca domain that doesn't redirect or error out. It doesn't in any way ensure that bing.ca is related to bing.com...

$ node parser.js

200 'http://www.about.ca/'
200 'http://abcnews.go.ca/'
429 'http://www.accuweather.ca/'
200 'https://www.airbnb.ca/'
405 'https://www.amazon.ca/'
200 'http://www.answers.ca/'
200 'http://www.aol.ca/'
403 'http://www.ask.ca/'
405 'http://www.bing.ca/'
200 'http://www.bild.ca/'
200 'http://www.blackboard.ca/'
404 'http://www.buzzfeed.ca/index'
200 'https://www.capitalone.ca/'
200 'http://www.cbc.ca/'
200 'http://www.cnn.ca/'
200 'http://www.dailymail.ca/'
200 'http://digg.ca/'
200 'http://diply.ca/'
200 'https://www.ebay.ca/'
200 'http://espn.go.ca/'
200 'https://www.eventbrite.ca/'
200 'http://www.foodnetwork.ca/'
200 'http://go.ca/'
403 'http://www.goodreads.ca/'
200 'https://www.google.ca/'
200 'https://images.google.ca/'
200 'https://www.groupon.ca/'
200 'http://www.huffingtonpost.ca/'
200 'http://www.ikea.ca/'
200 'http://www.latimes.ca/'
200 'http://lifehacker.ca/'
404 'http://mashable.ca/stories/'
200 'http://www.nbcnews.ca/'
405 'https://www.netflix.ca/'
404 'http://www.nih.ca/'
200 'http://www.npr.ca/'
403 'http://nypost.ca/'
200 'http://www.nytimes.ca/'
200 'http://www.pandora.ca/'
200 'http://www.people.ca/'
200 'http://www.sears.ca/'
403 'http://www.staples.ca/'
200 'http://www.strava.ca/'
200 'https://www.surveymonkey.ca/'
400 'http://www.target.ca/'
200 'https://www.tripadvisor.ca/'
403 'http://www.trulia.ca/'
200 'http://www.usatoday.ca/'
403 'http://www.verizon.ca/'
403 'http://www.wayfair.ca/'
Done...
const url = require('url');

const Queue = require('p-queue');
const fetch = require('node-fetch');
const parseDomain = require('parse-domain');

const topSites = require('./top_sites.json');

// At most two site checks are in flight at any time.
const queue = new Queue({concurrency: 2});

// Probe only the first 1000 entries. `slice` (rather than `splice`) is used so
// the array loaded from top_sites.json is not emptied as a side effect.
topSites.slice(0, 1000).forEach(site => {
  // An entry provides either a list of `urls` or a single `url`.
  const siteUrls = site.urls || [site.url];
  siteUrls.forEach(siteUrl => {
    queue
      .add(async () => {
        const res = await checkSite(siteUrl);
        // Report a .ca URL only when it did not answer with a 5xx/sentinel
        // status and the URL reported back by the request is still the .ca
        // URL we asked for (i.e. it did not redirect somewhere else).
        if (res.status < 500 && res.ca === res.uri) {
          console.log(res.status, res.ca);
        }
      })
      .catch(err => {
        // One URL that cannot be processed must not abort the whole run or
        // surface as an unhandled promise rejection.
        console.error(`Skipping ${siteUrl}: ${err.message}`);
      });
  });
});

// Print a completion marker once every queued check has settled.
queue.onIdle().then(() => {
  console.log('Done...');
});

/**
 * Array#filter-style predicate that accepts an item only when it occurs
 * exactly once in `items` (its first and last positions coincide).
 *
 * @param {*} item - Element being tested.
 * @param {number} idx - Index of the element (unused; present to match the
 *   filter callback signature).
 * @param {Array} items - The array being searched.
 * @returns {boolean} `true` when `item` has no duplicate in `items`.
 */
function unique(item, idx, items) {
  const firstPosition = items.indexOf(item);
  const lastPosition = items.lastIndexOf(item);
  return firstPosition === lastPosition;
}

/**
 * Issues a HEAD request against the `.ca` variant of `uri` and reports the
 * outcome. This function never rejects: any failure (URL that cannot be
 * converted, network error, timeout) is reported with the sentinel status 999.
 *
 * @param {string} uri - Original site URL.
 * @returns {Promise<{uri: string, ca: (string|undefined), status: number}>}
 *   `ca` is the `.ca` URL that was requested (undefined if it could not be
 *   built). On success `status` is the response status and `uri` is replaced
 *   by the URL reported on the response; on failure `uri` is the input and
 *   `status` is 999.
 */
async function checkSite(uri) {
  const data = {uri, ca: undefined};
  const opts = {method: 'HEAD', timeout: 3000};
  try {
    // Build the .ca URL inside the try block so that an unparseable URL is
    // reported with the same 999 sentinel as a failed request instead of
    // rejecting the returned promise.
    data.ca = canadianizer(uri);
    const res = await fetch(data.ca, opts);
    return Object.assign(data, {status: res.status, uri: res.url});
  } catch (err) {
    // Best effort: callers only look at `status`, and 999 is outside the
    // range of real HTTP status codes.
    return Object.assign(data, {status: 999});
  }
}

/**
 * Rewrites a URL so that its host ends in `.ca` instead of its current
 * top-level domain. Only the protocol, host and path are carried over to the
 * result; nothing else from the parsed URL is passed to `url.format`.
 *
 * @param {string} uri - URL to convert.
 * @returns {string} The same URL with the top-level domain swapped for `ca`.
 * @throws {TypeError} If no domain can be extracted from the URL's hostname.
 */
function canadianizer(uri) {
  const {protocol, hostname, pathname} = url.parse(uri);
  const parsed = parseDomain(hostname);
  if (!parsed) {
    // NOTE(review): assumes parseDomain signals an unparseable hostname with a
    // falsy return value; fail with a message that names the offending URL
    // rather than an opaque destructuring error.
    throw new TypeError(`Unable to parse a domain from "${uri}"`);
  }
  const {subdomain, domain} = parsed;
  // Drop an empty subdomain so the host does not start with a stray dot.
  const host = [subdomain, domain, 'ca'].filter(Boolean).join('.');
  return url.format({protocol, host, pathname});
}