Scrape by language? - Githubissues

ghost commented 3 years ago

Hey! How would I go about editing the script to download all fonts for a specific language? Say for example Chinese or Korean? Thanks for your help!

chiumax commented 3 years ago

Hmm. Interesting. Adobe Fonts didn't have other languages in the past. I can't really think of a good way to do this off the top of my head so you'll have to get your hands a little bit dirty to choose a specific language.

In the next comment I will paste a copy of fonts.js that I modified to download Chinese (Traditional).

There are a couple of things to note (partly because some steps in the README are slightly incorrect):

How To Scrape All Fonts

npm install
cd into src and replace fonts.js with the code in the next comment.

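Open the Adobe Fonts browse page in your browser and filter it by the language you want; for Korean, the URL in the address bar should look like https://fonts.adobe.com/fonts?browse_mode=ko&languages=ko. Copy everything after ?browse_mode=. So, in this case, ko&languages=ko should be in your clipboard.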
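In fonts.js, uncomment line 176 (the scrapeForFontLinks(...) call near the bottom of the file).
On line 19 (the browserMode array), replace ["zh-Hant&languages=zh-Hant"] with, in our example, ["ko&languages=ko"].
Run node fonts.js. This will create a JSON file (fontHrefs.json) which holds all the font links.
After the JSON file has been created, comment line 176 out again.
Uncomment the block of code below line 176 (the lines that read fontHrefs.json and call scrapeFonts for each link).
Run node fonts.js again.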
If the script breaks, just run it again; it checks for already downloaded files and skips them.
The fonts should show up in src/zips

Okay, I apologize for the confusing instructions - I put it together hastily - so let me know if you run into any issues. Happy scraping !

chiumax commented 3 years ago

replace your fonts.js with this one

const puppeteer = require("puppeteer");
const path = require("path");
const getUrls = require("get-urls");
const request = require("request");
const opentype = require("opentype.js");
// const opentype = require("opentype/src/opentype.js");
// const woff2 = require(path.join(__dirname, "woff2", "src", "woff2.js"));
const woff2 = require("wawoff2");
const { zip } = require("zip-a-folder");
const fs = require("fs");
const rimraf = require("rimraf");

// Module-level state shared by the scraping functions below.
let fontFamily; // last path segment of the font link; set in scrapeFonts, read in writeFonts to name the zip
let fontHrefCache = []; // font page links accumulated by scrapeForFontLinks

let fontBrowseUrl = ``; // browse-page URL most recently built by scrapeForFontLinks
// const browserMode = ["default", "japanese"];
const browserMode=["zh-Hant&languages=zh-Hant"]; // each entry is inserted right after "browse_mode=" in the browse URL

let fontMetaData = {}; // keyed by browse mode; scrapeForFontLinks stores the page's family-count text under .count

// Scrape the Adobe Fonts browse pages for links to every font family and save
// them to fontHrefs.json next to this script.
//
// For each entry in `browserMode` the browse listing is walked page by page
// (page=1, 2, ...) until a page yields no font links. The links found are
// appended to the module-level `fontHrefCache`, and the family-count text shown
// on the first page is stored in `fontMetaData[browseMode].count` (null when
// the count element is not present on the page).
//
// `link` is accepted for backward compatibility but is not read; the URLs are
// built from `browserMode`.
//
// Resolves to undefined once the JSON file has been written. Rejects if the
// browser cannot be launched or a navigation fails; the browser is closed in
// either case.
const scrapeForFontLinks = async (link) => {
  const browser = await puppeteer.launch({
    headless: true,
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });

  try {
    const page = await browser.newPage();

    for (const browseMode of browserMode) {
      // Declared outside the do-block because the loop condition reads it.
      let unique;
      let pageNum = 1;
      console.log(browseMode);
      fontMetaData[browseMode] = {};

      do {
        fontBrowseUrl = `https://fonts.adobe.com/fonts?browse_mode=${browseMode}&page=${pageNum}&sort=alpha`;
        await page.goto(fontBrowseUrl);

        if (pageNum === 1) {
          // Read the family count from the first page while we are on it,
          // instead of loading page 1 a second time just for the count.
          const fontCount = await page.evaluate(() => {
            const countEl = document.querySelector(
              "div[data-id='family-count-message']"
            );
            // The count is only informational, so a missing element must not
            // abort the whole scrape.
            return countEl ? countEl.innerText : null;
          });
          fontMetaData[browseMode].count = fontCount;
          console.log(fontMetaData);
        }

        // all links on page
        const hrefs = await page.$$eval("a", (as) => as.map((a) => a.href));
        // all links to fonts: the second-to-last path segment is "fonts"
        const fontPages = hrefs.filter((href) => {
          const splitHref = href.split("/");
          return splitHref[splitHref.length - 2] === "fonts";
        });
        // all unique links to fonts
        unique = [...new Set(fontPages)];
        fontHrefCache = fontHrefCache.concat(unique);
        console.log("scraped ", unique.length, "fonts");
        pageNum++;
      } while (unique.length > 0);
    }
  } finally {
    // Always shut the browser down, even when a navigation times out;
    // otherwise the launched browser process is left running.
    console.log("closing browser...");
    await browser.close();
    console.log("browser closed");
  }

  console.log(fontMetaData);
  console.log("We recorded a total of: " + fontHrefCache.length + " fonts");

  // write font links to json
  const temp = { fontArray: fontHrefCache };
  const jsonString = JSON.stringify(temp);
  fs.writeFileSync(path.join(__dirname, "fontHrefs.json"), jsonString);
  console.log("done");
};

// Scrape one Adobe Fonts family page and zip its fonts into zips/<family>.zip.
//
// `link` is the URL of a font family page; its last path segment is used as
// the family name (and stored in the module-level `fontFamily`, which
// writeFonts reads to name the zip file).
//
// Resolves to true both when the zip already exists (nothing is downloaded)
// and when the family has been scraped and zipped. Rejects if the browser
// cannot load the page or if writing the fonts fails twice in a row.
const scrapeFonts = async (link) => {
  const linkArr = link.split("/");
  fontFamily = linkArr[linkArr.length - 1];

  if (fs.existsSync(path.join(__dirname, "zips", fontFamily + ".zip"))) {
    console.log(fontFamily, "exists! \nSkipping...");
    return true;
  }

  const browser = await puppeteer.launch({
    headless: true,
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });

  let fonts;
  try {
    const page = await browser.newPage();
    // scrape fonts from font link here
    await page.goto(link);

    // Executing code in the DOM
    fonts = await page.evaluate(() => {
      let temp = window.Typekit.fonts.fonts;
      let fonts = [];
      temp.forEach((font) => {
        fonts.push({ family: font.w.family, source: font.source });
      });

      return fonts;
    });
  } finally {
    // page.goto can reject (e.g. on a navigation timeout); close the browser
    // regardless so a failed run does not leave a browser process behind.
    await browser.close();
  }

  const formattedFontArr = parseFonts(fonts);
  try {
    await writeFonts(formattedFontArr);
  } catch (err) {
    // Report the first failure, then retry exactly once. A second failure
    // propagates to the caller.
    console.log("uh-oh");
    console.log(err);
    await writeFonts(formattedFontArr);
  }
  return true;
};

// Drops every font whose family name contains "adobe" and, for each remaining
// font, keeps the family name together with the first URL that getUrls
// extracts from its source string (undefined when no URL is found).
const parseFonts = (fonts) => {
  const fontURLs = [];
  for (const { family, source } of fonts) {
    if (family.includes("adobe")) {
      continue;
    }
    const [firstUrl] = Array.from(getUrls(source));
    fontURLs.push({ family: family, source: firstUrl });
  }
  return fontURLs;
};

// Download each WOFF2 font, convert it to TTF, and zip the results.
//
// `fonts` is an array of { family, source } objects as returned by parseFonts,
// where `source` is the URL to download. Each download is written to
// out.woff2, decompressed to out.ttf (both next to this script and overwritten
// on every iteration), and the TTF is then copied into temp/ under its
// PostScript name. Finally temp/ is zipped to zips/<fontFamily>.zip, where
// `fontFamily` is the module-level name set by scrapeFonts.
//
// Resolves to true on success. Rejects if a download, the conversion, or the
// zip step fails.
const writeFonts = async (fonts) => {
  const tempDir = path.join(__dirname, "temp");
  const zipsDir = path.join(__dirname, "zips");
  const woff2Path = path.join(__dirname, "out.woff2");
  const ttfPath = path.join(__dirname, "out.ttf");

  // Start from an empty temp directory so the zip only contains this family.
  rimraf.sync(tempDir);
  fs.mkdirSync(tempDir);
  // Make sure the output directory exists before zipping into it.
  fs.mkdirSync(zipsDir, { recursive: true });

  for (const font of fonts) {
    await new Promise((resolve, reject) =>
      request(font.source)
        // Without these handlers a failed request or write would never settle
        // the promise and the script would hang.
        .on("error", reject)
        .pipe(fs.createWriteStream(woff2Path))
        .on("error", reject)
        .on("finish", () => {
          resolve();
        })
    );
    const buffer = fs.readFileSync(woff2Path);
    const ttfBuffer = await woff2.decompress(buffer);
    fs.writeFileSync(ttfPath, ttfBuffer);
    // Opentype can ONLY parse TTFs, not WOFF2s
    const metadata = opentype.loadSync(ttfPath).names;
    fs.writeFileSync(
      path.join(tempDir, metadata.postScriptName.en + ".ttf"),
      ttfBuffer
    );
  }
  await zip(tempDir, path.join(zipsDir, fontFamily + ".zip"));

  console.log(fontFamily, "scraped and zipped");
  return true;
};

// scrapeFonts("https://fonts.adobe.com/fonts/fira-sans");

// Run one time only: scrapes all the font links from Adobe and generates fontHrefs.json (the URL argument is not read; browse modes come from browserMode).

// scrapeForFontLinks("https://fonts.adobe.com/fonts?browse_mode=default");
// const scrapedFontLinks = fs.readFileSync("fontHrefs.json");
// let fontArray = JSON.parse(scrapedFontLinks).fontArray;
// (async () => {
//   try {
//     for (href of fontArray) {
//       await scrapeFonts(href);
//     }
//   } catch (e) {
//     console.log(e);
//   }
// })();

//////////////////////////
// Export the single-family scraper so other modules can call it.

module.exports = scrapeFonts;

ghost commented 3 years ago

Woooow! Working like a charm! I sometimes get a Puppeteer timeout. Is this due to the rate limiting in adobe servers? Happens only sometimes. Thanks so much for the help :)

chiumax commented 3 years ago

Haha, glad it works for you !

About the Puppeteer timeout, is it breaking the script (causing you to need to rerun the code)? If so, is there an error message associated with it?

I didn't run into any issues scraping anything recently.

ghost commented 3 years ago

Hey, yep! Seems to be a timeout problem:

TimeoutError: Navigation timeout of 30000 ms exceeded
    at /home/yannicko/adobe-font-scraper/node_modules/puppeteer/lib/cjs/puppeteer/common/LifecycleWatcher.js:106:111

Only happens sometimes though. I need to stop the script and rerun it because it stops at that error. It's working nonetheless once I rerun it. Just a matter of running it a few times :)

Edit: The cause seems to be ProtonVPN, which doesn't always seem to be able to connect to the server. If I disable it, it works perfectly, although the timeout still sometimes happens, just to a much lesser extent.

chiumax commented 3 years ago

Hmm, I see. If I have time I'll try to update the repo with everything.

Closing for now, feel free to make a new issue if you come across anything else.

chiumax / adobe-font-scraper

Scrape by language? #5

How To Scrape All Fonts