not get correct html - Githubissues

dan085 commented 4 years ago

I hope you can help me. I am trying to get the HTML from this link: https://www.t13.cl/noticia/tendencias/series-netflix-distopicas-maratonear-26-05-2020

I get different HTML when I use got than when I use Edge. Don't paste the whole header. Would I have to define a user-agent? I hope you can help me!!

with got I obtain:

<head>
 <title>T13 | Tele 13</title>
</head>

6 series (Netflix, Amazon) por si el mundo no vuelve a ser igual | Tele 13


With the **Edge browser** I obtain:

6 series (Netflix, Amazon) por si el mundo no vuelve a ser igual | Tele 13

I want to read metadata!
Regards!!

### Expected behaviour

I hope to get metadata
Tell us what should happen

### Actual behaviour

Tell us what happens instead

I get this result!

{
    "msg": "true",
    "url": "https://www.t13.cl/noticia/tendencias/series-netflix-distopicas-maratonear-26-05-2020",
    "title": "T13 | Tele 13",
    "description": "T13",
    "url_image": "https://logo.clearbit.com/t13.cl",
}

I test with https://api.microlink.io/?url=https://www.t13.cl/noticia/tendencias/series-netflix-distopicas-maratonear-26-05-2020

I get this

{ "status":"success", "data":{ "title":"T13 | Tele 13", "description":"inés matte urrejola #0848, santiago, chile fono (562) 2 251 4000 © todos los derechos reservados. 13.cl 2014", "lang":"en", "author":null, "publisher":"t13.cl", "image":{ "url":"https://www-storage.13.cl/t13/site/img/main/t13_logo_splash.svg", "type":"image2", "size":2152, "size_pretty":"2.15 kB" }, "url":"https://www.t13.cl/noticia/tendencias/series-netflix-distopicas-maratonear-26-05-2020", "date":"2020-05-29T17:07:17.000Z", "logo":{ "url":"https://logo.clearbit.com/t13.cl", "type":"png", "size":7463, "height":128, "width":128, "size_pretty":"7.46 kB" } } }



Regards!!

Kikobeats commented 4 years ago

Hello, I see your point

The problem is that the website references some JS files by relative path, and they are being resolved against the Microlink API as the root URL instead of against the URL provided.

https://api.microlink.io/?url=https://www.t13.cl/noticia/tendencias/series-netflix-distopicas-maratonear-26-05-2020&data.html.selector=html&embed=html&prerender

This behavior can be considered a bug in the Microlink API; let me investigate a bit more so I can find a solution 🙂

dan085 commented 4 years ago

@Kikobeats already great !!! Thank you very much for the quick answer!

Kikobeats commented 4 years ago

Hello again @dverdugo85

it should be working fine if you use prerender since the website is a SPA:

https://api.microlink.io/?url=https://www.t13.cl/noticia/tendencias/series-netflix-distopicas-maratonear-26-05-2020&prerender

dan085 commented 4 years ago

Hello @Kikobeats

To prerender the link I use puppeteer (https://github.com/puppeteer/puppeteer), but I cannot read the metadata. +++++++++ Code +++++

// Render the page in headless Chromium so that client-side scripts run,
// then hand the resulting HTML to metascraper.
const puppeteer = require('puppeteer');

const url = "https://www.t13.cl/noticia/tendencias/series-netflix-distopicas-maratonear-26-05-2020";

// '--no-sandbox' is only passed when running as root
// (presumably because Chromium's sandbox does not start as root; confirm).
const browser = await puppeteer.launch({
    headless: true,
    args: isCurrentUserRoot() ? ['--no-sandbox'] : undefined
});
const page = await browser.newPage();

// 'networkidle0' is the wait condition passed to page.goto.
await page.goto(url, {waitUntil: 'networkidle0'});
const html = await page.content();

// NOTE(review): `metascraper` is not defined in this snippet; it is assumed
// to be an already-configured metascraper instance.
const metadata_info = await metascraper({html, url});
console.log(metadata_info);

/**
 * Tells whether the current process runs as the root user.
 *
 * @returns {boolean} true when the process UID is 0; false otherwise,
 *   including on platforms where `process.getuid` does not exist.
 */
function isCurrentUserRoot() {
    // process.getuid is only provided on POSIX platforms; treat its absence as "not root".
    if (typeof process.getuid !== 'function') {
        return false;
    }
    return process.getuid() === 0; // UID 0 is always root
}

It returns this:

{ author: null, date: null, description: null, image: null, logo: 'https://logo.clearbit.com/t13.cl', publisher: 'T13', title: null, url: 'https://www.t13.cl/noticia/tendencias/series-netflix-distopicas-maratonear-26-05-2020', audio: null, lang: null }

Regards!!

dan085 commented 4 years ago

I use this function to read metadata

// Merge the metascraper result with the tags read from the rendered page.
var metadata = await getMetaTags(metadata_info, page, url);


const isImageUrl = require('is-image-url');
/**
 * Builds a metadata record for a rendered page, preferring the Open Graph /
 * keywords <meta> tags found in the page and falling back to the values
 * already present in `metadata` when a tag is missing.
 *
 * @param {Object} metadata - previously extracted metadata; the keys read are
 *   `title`, `description`, `image`, `keywords` and `type`.
 * @param {Object} page - object exposing an async `$eval(selector, fn)`
 *   method (a puppeteer Page in the calling code).
 * @param {string} targetUrl - URL copied verbatim into `url_amp` and `amp`.
 * @returns {Promise<Object>} object with the keys `title`, `description`,
 *   `keywords`, `type`, `url_amp`, `amp` and `image`.
 */
async function getMetaTags(metadata,page,targetUrl){
    const title = await readMetaContent(page, "head > meta[property='og:title']", metadata['title']);
    const description = await readMetaContent(page, "head > meta[property='og:description']", metadata['description']);
    let image = await readMetaContent(page, "head > meta[property='og:image']", metadata['image']);

    // Only a string can be untangled. The previous guard (`=== null`) was
    // inverted: it called .split on null and threw when no image was found,
    // and never ran when an image was actually present.
    if (typeof image === 'string') {
        // An og:image value may hold several URLs glued together; keep the
        // first fragment that is an image URL, trying https before http.
        const embedded = findEmbeddedImageUrl(image, 'https://')
            || findEmbeddedImageUrl(image, 'http://');
        if (embedded) {
            image = embedded;
        }
    }

    const keywords = await readMetaContent(page, "head > meta[name='keywords']", metadata['keywords']);
    const type = await readMetaContent(page, "head > meta[property='og:type']", metadata['type']);

    return {
        'title': title,
        'description': description,
        'keywords': keywords,
        'type': type,
        'url_amp': targetUrl,
        'amp': targetUrl,
        'image': image
    };
}

/**
 * Reads the `content` of the first element matching `selector`.
 *
 * @param {Object} page - object exposing an async `$eval(selector, fn)`.
 * @param {string} selector - CSS selector of the <meta> element.
 * @param {*} fallback - value returned when the lookup fails.
 * @returns {Promise<*>} the element's `content`, or `fallback`.
 */
async function readMetaContent(page, selector, fallback) {
    try {
        return await page.$eval(selector, element => element.content);
    } catch (e) {
        // Deliberately best-effort: $eval rejects when no element matches,
        // and a missing tag is an expected situation, not an error.
        return fallback;
    }
}

/**
 * Splits `value` on `scheme` and returns the first fragment that, once the
 * scheme is put back, is recognised by `isImageUrl`.
 *
 * @param {string} value - raw og:image value, possibly several URLs joined.
 * @param {string} scheme - 'https://' or 'http://'.
 * @returns {string|null} the matching URL, or null when `value` does not
 *   contain `scheme` or no fragment is an image URL.
 */
function findEmbeddedImageUrl(value, scheme) {
    const fragments = value.split(scheme);
    if (fragments.length < 2) {
        return null;
    }
    for (const fragment of fragments) {
        // Validate exactly the URL that will be returned (the old http
        // branch checked an https URL but stored an http one).
        const candidate = scheme + fragment;
        if (isImageUrl(candidate)) {
            return candidate;
        }
    }
    return null;
}

Kikobeats commented 4 years ago

I can't address the puppeteer issue, just use Microlink API!

microlinkhq / open

not get correct html #35