mozilla / page-metadata-parser

DEPRECATED - A Javascript library for parsing metadata on a web page.
https://www.npmjs.com/package/page-metadata-parser
Mozilla Public License 2.0
270 stars 42 forks source link

Suspicious www URL parsing in getProvider() #83

Closed pdehaan closed 2 years ago

pdehaan commented 7 years ago

Ref /parser.js:14-22:

/**
 * Build a provider name from the hostname of a URL.
 *
 * The hostname has its first "www<alphanumerics>." run removed, its first
 * ".co." collapsed to ".", and its final dot-separated label dropped; the
 * remaining labels are joined with single spaces.
 *
 * @param {string} url - URL whose hostname is inspected.
 * @returns {string} Space-separated hostname labels (may be empty).
 */
function getProvider(url) {
  const { hostname } = urlparse.parse(url);

  // Note: the pattern is unanchored, so it matches the first "www…." run
  // wherever it occurs in the hostname.
  const withoutWww = hostname.replace(/www[a-zA-Z0-9]*\./, '');
  const withoutCo = withoutWww.replace('.co.', '.');

  // Discard the last label (the TLD) and keep everything before it.
  const labels = withoutCo.split('.');
  labels.pop();

  return labels.join(' ');
}

The one suspicious bit in there is the `.replace(/www[a-zA-Z0-9]*\./, '')` call. Because the pattern is not anchored and `[a-zA-Z0-9]*` consumes everything up to the next dot, it looks like [in theory] it would murder any domain that starts with "www", such as "wwwapple.com":

const urlparse = require('url');

/**
 * Derive a human-readable provider name from a URL's hostname.
 *
 * Processing steps:
 *   1. Strip a leading "www." / "wwwN." label (e.g. "www.", "www2.").
 *   2. Collapse the first ".co." into "." (so "bbc.co.uk" -> "bbc.uk").
 *   3. Drop the last dot-separated label (the TLD).
 *   4. Join the remaining labels with spaces.
 *
 * @param {string} url - URL to inspect; it must contain a hostname.
 * @returns {string} Space-separated hostname labels, e.g. "aol go".
 * @throws {TypeError} If the parsed URL has no hostname (`hostname` is null).
 */
function getProvider(url) {
  return urlparse.parse(url)
    .hostname
    // Only remove a *leading* "www" label, optionally followed by digits.
    // The previous unanchored /www[a-zA-Z0-9]*\./ swallowed whole labels that
    // merely begin with "www" (e.g. "wwwapple.com" -> ""), and also matched
    // such labels in the middle of the hostname.
    .replace(/^www\d*\./, '')
    .replace('.co.', '.')
    .split('.')
    .slice(0, -1)
    .join(' ');
}

console.log(getProvider('https://www.apple.com')); // "apple"
console.log(getProvider('https://bbc.co.uk')); // "bbc"
console.log(getProvider('https://redirect.ca')); // "redirect"
console.log(getProvider('https://aol.go.com')); // "aol go"
console.log(getProvider('https://wwwwwapple.com')); // "wwwwwapple" (was "" with the unanchored pattern)