mozilla / page-metadata-parser

DEPRECATED - A Javascript library for parsing metadata on a web page.
https://www.npmjs.com/package/page-metadata-parser
Mozilla Public License 2.0
270 stars 42 forks source link

Add a provider field fixes #77 #78

Closed jaredlockhart closed 7 years ago

jaredlockhart commented 7 years ago

Added support for subdomains, so 'sports.bing.com' will render as 'sports bing', which I think is a reasonable compromise.

coveralls commented 7 years ago

Coverage Status

Coverage remained the same at 100.0% when pulling 6e73102b18143628dba9a88306f5df63ab62c016 on 77 into ee886dc978d9cf7baf664bcc93bbaa1f15b96357 on master.

coveralls commented 7 years ago

Coverage Status

Coverage remained the same at 100.0% when pulling f2acfefcab319a0fd63e56b77be9eb46a71205ab on 77 into ee886dc978d9cf7baf664bcc93bbaa1f15b96357 on master.

pdehaan commented 7 years ago

Last one: here are the sorted and deduped results of running the domains from the first 5 pages of /r/worldnews through your provider function (showing only the results where the provider is more than 1 word):

[ 'abc net',    // http://www.abc.net.au/
  'abcnews go',    // http://abcnews.go.com/
  'bigstory ap',    // http://bigstory.ap.org
  'businessinsider com',    // http://www.businessinsider.com.au/
  'dailystar com',    // http://www.dailystar.com.lb/
  'economictimes indiatimes',    // http://economictimes.indiatimes.com/
  'edition cnn',    // http://edition.cnn.com/
  'globalnation inquirer',    // http://globalnation.inquirer.net/
  'm ndtv',    // http://m.ndtv.com/
  'mobile reuters',    // http://mobile.reuters.com/
  'motherboard vice',    // http://motherboard.vice.com/
  'nakedsecurity sophos',    // https://nakedsecurity.sophos.com/
  'news sky',    // http://news.sky.com/
  'news vice',    // https://news.vice.com/
  'timesofindia indiatimes'    // http://timesofindia.indiatimes.com/
]
const urlparse = require('url');
const { fetchSubreddit, domainReducer } = require('reddit-as-json');

// Fetch 5 pages of /r/worldnews, map every domain to its provider string,
// then log the sorted, de-duplicated providers that contain more than one word.
fetchSubreddit('worldnews', 5)
  .then(domainReducer)
  .then((result) => result.data.map((entry) => getProvider(`https://${entry.name}`)))
  .then((providers) => {
    // Use an object as a set: each distinct provider becomes exactly one key.
    const seen = {};
    providers.forEach((provider) => {
      seen[provider] = true;
    });
    return Object.keys(seen).sort();
  })
  .then((uniqueProviders) => uniqueProviders.filter((provider) => {
    // Keep only providers made of two or more space-separated words.
    const words = provider.split(' ');
    return words.length > 1;
  }))
  .then((multiWordProviders) => console.log(multiWordProviders))
  .catch((err) => console.error(err));

/**
 * Derive a human-readable "provider" name from a URL's hostname.
 *
 * The hostname has a leading `www`-style label and a second-level `co`
 * label (as in `bbc.co.uk`) removed, the final label (the TLD) dropped, and
 * the remaining labels joined with spaces, e.g.
 * `https://sports.bing.com` -> `'sports bing'`.
 *
 * Other multi-part suffixes (e.g. `.com.au`, `.net.au`) are not recognised,
 * so `www.abc.net.au` yields `'abc net'`.
 *
 * @param {string} url - URL to extract the provider from.
 * @returns {string} Space-separated provider name; empty when the hostname
 *   has a single label.
 * @throws {TypeError} If no hostname can be parsed from `url` (the parsed
 *   hostname is then not a string).
 */
function getProvider(url) {
  return urlparse.parse(url)
    .hostname
    // Strip a leading "www"/"www2"-style label, but only when it is the first
    // label and at least two more labels follow, so a domain that merely
    // starts with "www" (e.g. "wwwhatever.com") is left intact.
    .replace(/^www[a-zA-Z0-9]*\.(?=[^.]+\.)/, '')
    // Strip "co" only when it is a whole label directly before the TLD
    // ("bbc.co.uk"), never a substring of another label ("costco.com").
    .replace(/(^|\.)co\.(?=[^.]+$)/, '$1')
    .split('.')
    .slice(0, -1)
    .join(' ');
}