mvdan / xurls

Extract urls from text
BSD 3-Clause "New" or "Revised" License
1.19k stars 116 forks source link

parsing issue with json file #41

Closed hmsta closed 4 years ago

hmsta commented 4 years ago

Hi,

If a website body contains a JSON string, I get garbage URLs.

Here is a specific case:

    string := `{"props":{"pageProps":{"theme":{"key":"leaf","mode":"light","colors":{"body":"palette.slate13","linkText":"#fff","linkBackground":"#39e09b","linkShadow":"#000"},"components":{"ProfileBackground":{"backgroundColor":"#fff","backgroundStyle":"flat"},"LinkContainer":{"borderType":"squared","styleType":"fill"},"SocialLink":{"fill":"linkBackground"},"Banner":{"default":{"backgroundColor":"linkBackground","color":"linkText"}}}},"username":"adrianphoto_bcn","pageTitle":"@adrianphoto_bcn","metaTitle":"@adrianphoto_bcn","metaDescription":"Linktree. Make your link do more.","profilePictureUrl":"https://d15mvavv27jnvy.cloudfront.net/zdKaK/660bb5ffef7d46960c5c1be349944840.jpg","description":null,"links":[{"id":"11987649","url":"https://onlyfans.com/adrianphotobcn","animation":null,"amazonAffiliate":null,"thumbnail":null,"title":"Onlyfans","type":"CLASSIC","context":{}},{"id":"7730208","url":"http://Photoproducer.manyvids.com","animation":null,"amazonAffiliate":null,"thumbnail":null,"title":"ManyVids","type":"CLASSIC","context":{}},{"id":"11994192","url":"https://www.suicidegirls.com/members/adrianphoto_bcn/","animation":null,"amazonAffiliate":null,"thumbnail":null,"title":"Suicidegirls","type":"CLASSIC","context":{}},{"id":"7730413","url":"https://mobile.twitter.com/adrianphoto_bcn","animation":null,"amazonAffiliate":null,"thumbnail":null,"title":"Twitter","type":"CLASSIC","context":{}},{"id":"7730346","url":"https://www.instagram.com/adrianphotobcn","animation":null,"amazonAffiliate":null,"thumbnail":null,"title":"Instagram","type":"CLASSIC","context":{}},{"id":"16064948","url":"https://www.instagram.com/afoto.bcn","animation":null,"amazonAffiliate":null,"thumbnail":null,"title":"Instagram 
sec","type":"CLASSIC","context":{}}],"socialLinks":[],"integrations":[],"leapLink":null,"isOwner":false,"isLogoVisible":true,"isProfileVerified":true,"hasConsentedToView":true,"account":{"id":1848934,"username":"adrianphoto_bcn","isActive":true,"profilePictureUrl":"https://d15mvavv27jnvy.cloudfront.net/zdKaK/660bb5ffef7d46960c5c1be349944840.jpg","pageTitle":"@adrianphoto_bcn","googleAnalyticsId":null,"facebookPixelId":null,"donationsActive":false,"contentWarning":null,"description":null,"isLogoVisible":true,"owner":{"id":2054277,"isEmailVerified":true},"pageMeta":null,"integrations":[],"links":[{"id":11987649,"type":"CLASSIC","title":"Onlyfans","url":"https://onlyfans.com/adrianphotobcn","formattedUrl":"https://onlyfans.com/adrianphotobcn","thumbnailUrl":null,"animation":null,"isLeapLink":false,"isLeapLinkActive":false,"amazonAffiliate":null,"context":null},{"id":7730208,"type":"CLASSIC","title":"ManyVids","url":"Photoproducer.manyvids.com","formattedUrl":"http://Photoproducer.manyvids.com","thumbnailUrl":null,"animation":null,"isLeapLink":false,"isLeapLinkActive":false,"amazonAffiliate":null,"context":null},{"id":11994192,"type":"CLASSIC","title":"Suicidegirls","url":"https://www.suicidegirls.com/members/adrianphoto_bcn/","formattedUrl":"https://www.suicidegirls.com/members/adrianphoto_bcn/","thumbnailUrl":null,"animation":null,"isLeapLink":false,"isLeapLinkActive":false,"amazonAffiliate":null,"context":null},{"id":7730413,"type":"CLASSIC","title":"Twitter","url":"https://mobile.twitter.com/adrianphoto_bcn","formattedUrl":"https://mobile.twitter.com/adrianphoto_bcn","thumbnailUrl":null,"animation":null,"isLeapLink":false,"isLeapLinkActive":false,"amazonAffiliate":null,"context":null},{"id":7730346,"type":"CLASSIC","title":"Instagram","url":"https://www.instagram.com/adrianphotobcn","formattedUrl":"https://www.instagram.com/adrianphotobcn","thumbnailUrl":null,"animation":null,"isLeapLink":false,"isLeapLinkActive":false,"amazonAffiliate":null,"context":null},{"id":16
064948,"type":"CLASSIC","title":"Instagram sec","url":"https://www.instagram.com/afoto.bcn","formattedUrl":"https://www.instagram.com/afoto.bcn","thumbnailUrl":null,"animation":null,"isLeapLink":false,"isLeapLinkActive":false,"amazonAffiliate":null,"context":null}],"socialLinks":[],"theme":{"key":"leaf"}}},"__N_SSP":true},"page":"/[profile]","query":{"profile":"adrianphoto_bcn"}`
    rxStrict := xurls.Strict()
    urls := rxStrict.FindAllString(string, -1)
    for _, url := range urls {
        fmt.Printf("%s\n",url)
    }

thanks

mvdan commented 4 years ago

Thanks for raising this. I think you should be parsing the JSON first, to avoid weird edge cases like this one, and because you would want to unescape JSON strings before extracting URLs anyway.

Having said that, it probably doesn't make sense for us to support double quotes in the middle of a URL. I'll see what I can do about that.

hmsta commented 4 years ago

the json is somewhere inside the Githubissues.

  • Githubissues is a development platform for aggregating issues.