Closed ghost closed 3 years ago
Hmm. Interesting. Adobe Fonts didn't have other languages in the past. I can't really think of a good way to do this off the top of my head so you'll have to get your hands a little bit dirty to choose a specific language.
In the next comment I will paste a copy of fonts.js
that I modified to download Chinese (Traditional).
There are a couple things to note (partly due to some steps being slightly incorrect in the README):
npm install
cd
into src
and replace fonts.js
with the code in the next comment. Copy everything after ?browse_mode=
. So, in this case, ko&languages=ko
should be in your clipboard.
In fonts.js
, uncomment line 176
On line 19 replace ["zh-Hant&languages=zh-Hant"]
with (in our example), ["ko&languages=ko"]
and run node fonts.js
. This will create a JSON file which holds all the links.
After the JSON has been created, comment line 176
Uncomment the block of code below 176
run node fonts.js
.
In the event that the script breaks, just run it again, it will check against already downloaded files.
The fonts should show up in src/zips
Okay, I apologize for the confusing instructions - I put it together hastily - so let me know if you run into any issues. Happy scraping!
replace your fonts.js
with this one
const puppeteer = require("puppeteer");
const path = require("path");
const getUrls = require("get-urls");
const request = require("request");
const opentype = require("opentype.js");
// const opentype = require("opentype/src/opentype.js");
// const woff2 = require(path.join(__dirname, "woff2", "src", "woff2.js"));
const woff2 = require("wawoff2");
const { zip } = require("zip-a-folder");
const fs = require("fs");
const rimraf = require("rimraf");
// global stuff...
let fontFamily;
let fontHrefCache = [];
let fontBrowseUrl = ``;
// const browserMode = ["default", "japanese"];
const browserMode=["zh-Hant&languages=zh-Hant"];
let fontMetaData = {};
// scrape outer page for all font links to json
const scrapeForFontLinks = async (link) => {
const browser = await puppeteer.launch({
headless: true,
args: ["--no-sandbox", "--disable-setuid-sandbox"],
});
const page = await browser.newPage();
for (browseMode of browserMode) {
let hrefs;
let fontPages;
let unique;
let pageNum = 1;
console.log(browseMode);
fontMetaData[browseMode] = {};
fontBrowseUrl = `https://fonts.adobe.com/fonts?browse_mode=${browseMode}&page=${pageNum}&sort=alpha`;
await page.goto(fontBrowseUrl);
const fontCount = await page.evaluate(() => {
return document.querySelector("div[data-id='family-count-message']")
.innerText;
});
fontMetaData[browseMode].count = fontCount;
console.log(fontMetaData);
do {
fontBrowseUrl = `https://fonts.adobe.com/fonts?browse_mode=${browseMode}&page=${pageNum}&sort=alpha`;
await page.goto(fontBrowseUrl);
// all links on page
hrefs = await page.$$eval("a", (as) => as.map((a) => a.href));
// all links to fonts
fontPages = hrefs.filter((href) => {
splitHref = href.split("/");
return splitHref[splitHref.length - 2] == "fonts";
});
// all unique links to fonts
unique = [...new Set(fontPages)];
fontHrefCache = fontHrefCache.concat(unique);
// console.log(fontHrefCache);
console.log("scraped ", unique.length, "fonts");
pageNum++;
} while (unique.length > 0);
}
console.log("closing browser...");
await browser.close();
console.log("browser closed");
console.log(fontMetaData);
// console.log(
// "There are a total of: " +
// fontMetaData.default.count +
// fontMetaData.japanese.count +
// "fonts"
// );
console.log("We recorded a total of: " + fontHrefCache.length + " fonts");
// write font links to json
let temp = { fontArray: fontHrefCache };
const jsonString = JSON.stringify(temp);
fs.writeFileSync(path.join(__dirname, "fontHrefs.json"), jsonString);
console.log("done");
};
const scrapeFonts = async (link) => {
let fontWritten = false;
linkArr = link.split("/");
fontFamily = linkArr[linkArr.length - 1];
if (fs.existsSync(path.join(__dirname, "zips", fontFamily + ".zip"))) {
console.log(fontFamily, "exists! \nSkipping...");
return true;
}
const browser = await puppeteer.launch({
headless: true,
args: ["--no-sandbox", "--disable-setuid-sandbox"],
});
const page = await browser.newPage();
// scrape fonts from font link here
await page.goto(link);
// Executing code in the DOM
const fonts = await page.evaluate(() => {
let temp = window.Typekit.fonts.fonts;
let fonts = [];
temp.forEach((font) => {
fonts.push({ family: font.w.family, source: font.source });
});
return fonts;
});
await browser.close();
const formattedFontArr = parseFonts(fonts);
while (!fontWritten) {
try {
fontWritten = await writeFonts(formattedFontArr);
} catch (err) {
await writeFonts(formattedFontArr);
fontWritten = true;
console.log("uh-oh");
console.log(err);
}
}
};
// Filters out adobe fonts and returns an array with links to WOFF2 fonts
const parseFonts = (fonts) => {
const filteredFonts = fonts.filter((font) => {
return !font.family.includes("adobe");
});
const fontURLs = filteredFonts.map((font) => {
return { family: font.family, source: Array.from(getUrls(font.source))[0] };
});
return fontURLs;
};
// Write and convert WOFF2 to TTF fonts to file
const writeFonts = async (fonts) => {
rimraf.sync(path.join(__dirname, "temp"));
fs.mkdirSync(path.join(__dirname, "temp"));
for (url of fonts) {
await new Promise((resolve) =>
request(url.source)
.pipe(fs.createWriteStream(path.join(__dirname, "out.woff2")))
.on("finish", () => {
resolve();
})
);
let buffer = fs.readFileSync(path.join(__dirname, "out.woff2"));
let ttfBuffer = await woff2.decompress(buffer);
// let ttfBuffer = woff2.decode(buffer);
fs.writeFileSync(path.join(__dirname, "out.ttf"), ttfBuffer);
// Opentype can ONLY parse TTFs, not WOFF2s
let metadata = opentype.loadSync(path.join(__dirname, "out.ttf")).names;
fs.writeFileSync(
path.join(__dirname, "temp", metadata.postScriptName.en + ".ttf"),
ttfBuffer
);
}
await zip(
path.join(__dirname, "temp"),
path.join(__dirname, "zips", fontFamily + ".zip")
);
console.log(fontFamily, "scraped and zipped");
return true;
};
// scrapeFonts("https://fonts.adobe.com/fonts/fira-sans");
// run one time only, scrapes all the font links from adobe. This will generate a fontHrefs.json
// scrapeForFontLinks("https://fonts.adobe.com/fonts?browse_mode=default");
// const scrapedFontLinks = fs.readFileSync("fontHrefs.json");
// let fontArray = JSON.parse(scrapedFontLinks).fontArray;
// (async () => {
// try {
// for (href of fontArray) {
// await scrapeFonts(href);
// }
// } catch (e) {
// console.log(e);
// }
// })();
//////////////////////////
module.exports = scrapeFonts;
Woooow! Working like a charm! I sometimes get a Puppeteer timeout. Is this due to the rate limiting in adobe servers? Happens only sometimes. Thanks so much for the help :)
Haha, glad it works for you!
About the Puppeteer timeout, is it breaking the script (causing you to need to rerun the code)? If so, is there an error message associated with it?
I didn't run into any issues scraping anything recently.
Hey, yep! Seems to be a timeout problem:
TimeoutError: Navigation timeout of 30000 ms exceeded
at /home/yannicko/adobe-font-scraper/node_modules/puppeteer/lib/cjs/puppeteer/common/LifecycleWatcher.js:106:111
Only happens sometimes though. I need to stop the script and rerun it because it stops at that error. It's working nonetheless once I rerun it. Just a matter of running it a few times :)
Edit: Seems to be ProtonVPN which doesn't seem to be able to connect to the server. If I disable it it works perfectly, although it still sometimes happens. To a much lesser extent though.
Hmm, I see. If I have time I'll try to update the repo with everything.
Closing for now, feel free to make a new issue if you come across anything else.
Hey! How would I go about editing the script to download all fonts for a specific language? Say for example Chinese or Korean? Thanks for your help!