Closed dan085 closed 4 years ago
Hello, I see your point
The problem is that the website references some JS files by relative path, and they are being resolved against the Microlink API as the root URL instead of against the URL provided.
This behavior can be considered a bug in the Microlink API; let me investigate a bit more so I can find a solution 🙂
@Kikobeats already great !!! Thank you very much for the quick answer!
Hello again @dverdugo85
it should be working fine if you use prerender
since the website is a SPA:
https://api.microlink.io/?url=https://www.t13.cl/noticia/tendencias/series-netflix-distopicas-maratonear-26-05-2020&prerender
Hello @Kikobeats
To prerender the link I use puppeteer (https://github.com/puppeteer/puppeteer), but I cannot read the metadata. +++++++++Code +++++
// Render the target page in a headless browser, then pass the rendered HTML
// to metascraper and print the metadata it extracts.
const puppeteer = require('puppeteer');

const url = "https://www.t13.cl/noticia/tendencias/series-netflix-distopicas-maratonear-26-05-2020";

// The browser sandbox is switched off only when the process runs as root.
const browser = await puppeteer.launch({
  headless: true,
  args: isCurrentUserRoot() ? ['--no-sandbox'] : undefined,
});
const page = await browser.newPage();

// NOTE(review): 'networkidle0' presumably delays until network traffic has
// settled so that tags injected by client-side scripts are present - confirm
// against the puppeteer documentation.
await page.goto(url, { waitUntil: 'networkidle0' });
const html = await page.content();

// `metascraper` is not defined in this snippet; it is expected to be set up elsewhere.
const metadata_info = await metascraper({ html, url });
console.log(metadata_info);
/**
 * Reports whether the current process is running as the root user.
 *
 * @returns {boolean} `true` when the process UID is 0; `false` otherwise,
 *   including on platforms where `process.getuid` is not available.
 */
function isCurrentUserRoot() {
  // process.getuid only exists on POSIX platforms; without it there is no
  // UID to compare, so report "not root" instead of throwing.
  if (typeof process.getuid !== 'function') {
    return false;
  }
  // UID 0 is always root
  return process.getuid() === 0;
}
It returns this:
{ author: null, date: null, description: null, image: null, logo: 'https://logo.clearbit.com/t13.cl', publisher: 'T13', title: null, url: 'https://www.t13.cl/noticia/tendencias/series-netflix-distopicas-maratonear-26-05-2020', audio: null, lang: null }
Regards!!
I use this function to read metadata
// Combine the metascraper result with the Open Graph / keywords meta tags
// read from the rendered page; targetUrl is echoed back as `url_amp` and `amp`.
var metadata = await getMetaTags(metadata_info,page, url )
const isImageUrl = require('is-image-url');
/**
 * Reads the `content` attribute of the first element matching `selector`.
 *
 * @param {object} page - Puppeteer page (only `page.$eval` is used).
 * @param {string} selector - CSS selector of the meta tag to read.
 * @param {*} fallback - Value returned when the tag cannot be read.
 * @returns {Promise<*>} The tag's content, or `fallback`.
 */
async function readMetaContent(page, selector, fallback) {
  try {
    return await page.$eval(selector, (element) => element.content);
  } catch (e) {
    // Best effort: a failed lookup (e.g. the tag is not in the page) is not
    // fatal, the caller simply keeps the value it already had.
    return fallback;
  }
}

/**
 * Extracts a usable image URL from a value that may contain several URLs
 * glued together (e.g. a resizer prefix followed by the real image URL).
 *
 * The value is split on `https://` first and `http://` second; the first piece
 * that `isImageUrl` accepts (with its scheme restored) wins. If nothing is
 * accepted, the value is returned unchanged.
 *
 * @param {string} rawImage - Image value as found in the metadata.
 * @returns {string} The selected image URL, or `rawImage` unchanged.
 */
function pickImageUrl(rawImage) {
  for (const scheme of ['https://', 'http://']) {
    const pieces = rawImage.split(scheme);
    // A single piece means the scheme does not occur in the value at all.
    if (pieces.length > 1) {
      for (const piece of pieces) {
        const candidate = scheme + piece;
        if (isImageUrl(candidate)) {
          return candidate;
        }
      }
    }
  }
  return rawImage;
}

/**
 * Builds a metadata record for a page, preferring the Open Graph / keywords
 * meta tags found in the rendered page over the values already in `metadata`.
 *
 * @param {object} metadata - Previously extracted metadata; its `title`,
 *   `description`, `image`, `keywords` and `type` entries are the fallbacks.
 * @param {object} page - Puppeteer page that has already loaded the target.
 * @param {string} targetUrl - URL of the page; returned as `url_amp` and `amp`.
 * @returns {Promise<object>} Object with the keys `title`, `description`,
 *   `keywords`, `type`, `url_amp`, `amp` and `image`.
 */
async function getMetaTags(metadata, page, targetUrl) {
  const title = await readMetaContent(
    page, "head > meta[property='og:title']", metadata['title']);
  const description = await readMetaContent(
    page, "head > meta[property='og:description']", metadata['description']);
  const rawImage = await readMetaContent(
    page, "head > meta[property='og:image']", metadata['image']);

  // Only a string can be split and inspected; a missing image (null or
  // undefined) is passed through untouched instead of raising a TypeError.
  const image = typeof rawImage === 'string' ? pickImageUrl(rawImage) : rawImage;

  const keywords = await readMetaContent(
    page, "head > meta[name='keywords']", metadata['keywords']);
  const type = await readMetaContent(
    page, "head > meta[property='og:type']", metadata['type']);

  return {
    'title': title,
    'description': description,
    'keywords': keywords,
    'type': type,
    'url_amp': targetUrl,
    'amp': targetUrl,
    'image': image,
  };
}
I can't address the puppeteer issue, just use Microlink API!
I hope you can help me. I am trying to get the HTML from this link: https://www.t13.cl/noticia/tendencias/series-netflix-distopicas-maratonear-26-05-2020
I get different HTML when I use got than when I use Edge. Rather than pasting the whole header, would I have to define a user-agent? I hope you can help me!!
with got I obtain:
{ "status":"success", "data":{ "title":"T13 | Tele 13", "description":"inés matte urrejola #0848, santiago, chile fono (562) 2 251 4000 © todos los derechos reservados. 13.cl 2014", "lang":"en", "author":null, "publisher":"t13.cl", "image":{ "url":"https://www-storage.13.cl/t13/site/img/main/t13_logo_splash.svg", "type":"image2", "size":2152, "size_pretty":"2.15 kB" }, "url":"https://www.t13.cl/noticia/tendencias/series-netflix-distopicas-maratonear-26-05-2020", "date":"2020-05-29T17:07:17.000Z", "logo":{ "url":"https://logo.clearbit.com/t13.cl", "type":"png", "size":7463, "height":128, "width":128, "size_pretty":"7.46 kB" } } }