mozilla / heatmap

INACTIVE - http://mzl.la/ghe-archive - Context Graph Data Ingestion
Mozilla Public License 2.0
14 stars 6 forks source link

Update filter with top 500 sites from Alexa? #20

Closed pdehaan closed 8 years ago

pdehaan commented 8 years ago

I ran a quick scan comparing the sites in your NSFW filter against the Alexa Top 500 adult sites (http://www.alexa.com/topsites/category/Top/Adult) and got only about a 33% match rate. I'm not sure how recent our filter file is, whether it is ever updated, or whether we want to add new links.

Steps to reproduce:

const urlParse = require('url').parse;

const fetch = require('node-fetch');
const alexa = require('alexa-top-sites');

// Kick off both lookups at once; each helper yields a list of site names
// (the filter entries and the Alexa hosts for the 'Adult' category).
const promises = [
  getNSFWFilter(),
  getAlexaByCategory('Adult')
];

Promise.all(promises)
  .then(([nsfwFilter, alexaSites]) => {
    // Build the lookup once so each membership check is O(1) instead of
    // rescanning the whole filter list for every Alexa site.
    const knownSites = new Set(nsfwFilter);
    const results = alexaSites.filter((site) => !knownSites.has(site));
    // An empty Alexa list would otherwise divide by zero and report "NaN%".
    const pct = alexaSites.length === 0
      ? '0'
      : (results.length / alexaSites.length * 100).toFixed(0);
    const summary = `${results.length} of ${alexaSites.length} (${pct}%) results not found in master list.`;
    return {summary, results};
  })
  // Print the report as pretty-printed JSON on stdout.
  .then((results) => console.log(JSON.stringify(results, null, 2)))
  // Any failure from either lookup or the comparison ends up on stderr.
  .catch((err) => console.error(err));

/**
 * Downloads the heatmap NSFW filter file and returns its entries.
 *
 * Each line is trimmed; blank lines and lines starting with `#` are dropped.
 * The entries are presumably bare hostnames, since the caller compares them
 * against parsed URL hosts (verify against the filter file).
 *
 * @returns {Promise<string[]>} Resolves to the cleaned list of entries.
 *   Rejects if the request fails or the server answers with a non-2xx status.
 */
function getNSFWFilter() {
  return fetch('https://raw.githubusercontent.com/mozilla/heatmap/master/pornfilter/nofap.txt')
    .then((res) => {
      // Without this check an error page body would be parsed as the filter
      // list, making every site look like it is missing from the filter.
      if (!res.ok) {
        throw new Error(`Failed to fetch NSFW filter: HTTP ${res.status}`);
      }
      return res.text();
    })
    .then((list) => list
      .split(/\r?\n/)
      // Strip stray whitespace (including a leftover "\r") so entries can
      // be compared with exact string equality.
      .map((item) => item.trim())
      // A trailing newline yields an empty entry; `#` marks a comment line.
      .filter((item) => item !== '' && !item.startsWith('#')));
}

/**
 * Looks up the Alexa top sites for a category and reduces each entry to the
 * `host` component of its URL.
 *
 * @param {string} category - Category name handed to `alexa.byCategory`.
 * @returns {Promise<Array>} Resolves to one `host` value per listed site.
 */
function getAlexaByCategory(category) {
  const toHost = (siteUrl) => urlParse(siteUrl).host;
  const extractHosts = (response) => {
    const {sites} = response;
    return sites.map((siteUrl) => toHost(siteUrl));
  };

  return alexa.byCategory(category).then(extractHosts);
}
**OUTPUT:**

```json
{
  "summary": "17 of 25 (68%) results not found in master list.",
  "results": [
    "livejasmin.com",
    "g.e-hentai.org",
    "nudevista.com",
    "fetlife.com",
    "nhentai.net",
    "literotica.com",
    "furaffinity.net",
    "freeones.com",
    "adam4adam.com",
    "newgrounds.com",
    "clips4sale.com",
    "ebaumsworld.com",
    "manhunt.net",
    "luscious.net",
    "mrskin.com",
    "hentai-foundry.com",
    "digitalplayground.com"
  ]
}
```

NOTE: My lame alexa-top-sites module only scrapes the front page of the http://www.alexa.com/topsites/category/Top/ pages, so it only checks the first 25 results rather than all 20 pages (500 results).

pdehaan commented 8 years ago

I updated my alexa-top-sites scraper to support pagination so I could grab all 500 results, and Alexa seems to have a large number of results not found in the heatmap list.

"summary": "464 of 500 (93%) results not found in master list.",

Let me know if you want me to try to submit a PR adding these to nofap, or if you'd rather I just submit a separate file.


New version of Alexa scraper (with support for crude local caching) is:

const fs = require('fs');
const urlParse = require('url').parse;

const fetch = require('node-fetch');
const alexa = require('alexa-top-sites');

// Kick off both lookups at once; each helper yields a list of site names
// (either straight from its local cache or via a promise).
const promises = [
  getNSFWFilter(),
  getAlexaByCategory('Adult')
];

// Promise.all accepts plain values as well as promises, so cache hits that
// return an array directly are handled the same way as fresh downloads.
Promise.all(promises)
  .then(([nsfwFilter, alexaSites]) => {
    // Build the lookup once so each membership check is O(1) instead of
    // rescanning the whole filter list for every Alexa site.
    const knownSites = new Set(nsfwFilter);
    const results = alexaSites.filter((site) => !knownSites.has(site));
    // An empty Alexa list would otherwise divide by zero and report "NaN%".
    const pct = alexaSites.length === 0
      ? '0'
      : (results.length / alexaSites.length * 100).toFixed(0);
    const summary = `${results.length} of ${alexaSites.length} (${pct}%) results not found in master list.`;
    return {summary, results};
  })
  // Print the report as pretty-printed JSON on stdout.
  .then((results) => console.log(JSON.stringify(results, null, 2)))
  // Any failure from either lookup or the comparison ends up on stderr.
  .catch((err) => console.error(err));

/**
 * Returns the heatmap NSFW filter entries, using a local JSON cache file
 * when one exists and downloading (then caching) the list otherwise.
 *
 * Each downloaded line is trimmed; blank lines and lines starting with `#`
 * are dropped before the list is cached.
 *
 * @returns {string[]|Promise<string[]>} The cached array on a cache hit,
 *   otherwise a promise resolving to the freshly downloaded entries. The
 *   promise rejects if the request fails or returns a non-2xx status.
 */
function getNSFWFilter() {
  const cacheName = './nsfwfilter.cache.json';

  try {
    // Read through fs rather than require(): require() resolves './…'
    // relative to this module, while writeFileSync below resolves it
    // relative to the working directory, so the two could disagree.
    const cached = JSON.parse(fs.readFileSync(cacheName, 'utf8'));
    if (Array.isArray(cached)) {
      return cached;
    }
    console.error('cache ignored (not a JSON array): %s', cacheName);
  } catch (err) {
    if (err.code === 'ENOENT') {
      // Expected on the first run; no stack trace needed.
      console.error('cache not found: %s', cacheName);
    } else {
      // Corrupt or unreadable cache: report it and fall back to the network.
      console.error('cache unreadable: %s', cacheName);
      console.error(err);
    }
  }

  return fetch('https://raw.githubusercontent.com/mozilla/heatmap/master/pornfilter/nofap.txt')
    .then((res) => {
      // Never parse (or cache) an error page as if it were the filter list.
      if (!res.ok) {
        throw new Error(`Failed to fetch NSFW filter: HTTP ${res.status}`);
      }
      return res.text();
    })
    .then((list) => list
      .split(/\r?\n/)
      // Strip stray whitespace (including a leftover "\r") so entries can
      // be compared with exact string equality.
      .map((item) => item.trim())
      // A trailing newline yields an empty entry; `#` marks a comment line.
      .filter((item) => item !== '' && !item.startsWith('#')))
    .then((list) => {
      fs.writeFileSync(cacheName, JSON.stringify(list, null, 2));
      return list;
    });
}

/**
 * Returns the `host` of every Alexa top site in a category, using a local
 * JSON cache file when one exists and scraping (then caching) otherwise.
 *
 * @param {string} category - Category name passed to the Alexa scraper; it
 *   is also embedded in the cache file name.
 * @returns {Array|Promise<Array>} The cached array on a cache hit, otherwise
 *   a promise resolving to one `host` value per scraped site.
 */
function getAlexaByCategory(category) {
  const cacheName = `./alexa-${category}.cache.json`;

  try {
    // Read through fs rather than require(): require() resolves './…'
    // relative to this module, while writeFileSync below resolves it
    // relative to the working directory, so the two could disagree.
    const cached = JSON.parse(fs.readFileSync(cacheName, 'utf8'));
    if (Array.isArray(cached)) {
      return cached;
    }
    console.error('cache ignored (not a JSON array): %s', cacheName);
  } catch (err) {
    if (err.code === 'ENOENT') {
      // Expected on the first run; no stack trace needed.
      console.error('cache not found: %s', cacheName);
    } else {
      // Corrupt or unreadable cache: report it and fall back to scraping.
      console.error('cache unreadable: %s', cacheName);
      console.error(err);
    }
  }

  // 20 is the page count handed to the scraper (presumably 25 results per
  // page, i.e. the full top 500 — confirm against alexa-top-sites).
  return alexa.getPages(alexa.byCategory, category, 20)
    .then((list) => list.map((site) => urlParse(site).host))
    .then((list) => {
      fs.writeFileSync(cacheName, JSON.stringify(list, null, 2));
      return list;
    });
}
crankycoder commented 8 years ago

@pdehaan I've added an issue over at https://github.com/crankycoder/nopornjs/issues/1, which is where the bloom filter is generated.