inhumantsar / slurp

Slurps webpages and saves them as clean, uncluttered Markdown. Think Pocket, but better.
https://inhumantsar.github.io/slurp/
MIT License
156 stars 5 forks source link

1722303056697 | ERROR | Parsed article missing critical content #53

Open TrumanXia opened 1 month ago

TrumanXia commented 1 month ago

Thank you in advance for the help.

site:https://www.toutiao.com/article/6745730819765043720/

log:
1722303056697 | ERROR | Parsed article missing critical content

null
1722303071813 | DEBUG | onValidate called, no changes detected
{
  "hash": 1309301853
}
1722303119651 | ERROR | Parsed article missing critical content
null
1722303124069 | DEBUG | onValidate called, no changes detected
{
  "hash": 1309301853
}
1722303283706 | DEBUG | attempting to parse prop metadata
{
  "enabled": true,
  "custom": false,
  "_key": "link",
  "_idx": 0,
  "id": "link",
  "metaFields": [
    "url",
    "og:url",
    "parsely-link",
    "twitter:url"
  ],
  "defaultIdx": 0,
  "defaultKey": "link",
  "description": "Page URL provided or a permalink discovered in metadata."
}
1722303283706 | DEBUG | found prop elements
"url"
"meta[name=\"url\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
1722303283706 | DEBUG | found prop elements
"og:url"
"meta[name=\"og:url\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
1722303283706 | DEBUG | found prop elements
"parsely-link"
"meta[name=\"parsely-link\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
1722303283706 | DEBUG | found prop elements
"twitter:url"
"meta[name=\"twitter:url\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
1722303283706 | DEBUG | attempting to parse prop metadata
{
  "enabled": true,
  "custom": false,
  "_key": "byline",
  "_idx": 1,
  "id": "byline",
  "metaFields": [
    "author",
    "article:author",
    "parsely-author",
    "cXenseParse:author"
  ],
  "defaultIdx": 1,
  "defaultKey": "byline",
  "description": "Name of the primary author or the first author detected."
}
1722303283706 | DEBUG | found prop elements
"author"
"meta[name=\"author\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
1722303283706 | DEBUG | found prop elements
"article:author"
"meta[name=\"article:author\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
1722303283706 | DEBUG | found prop elements
"parsely-author"
"meta[name=\"parsely-author\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
1722303283706 | DEBUG | found prop elements
"cXenseParse:author"
"meta[name=\"cXenseParse:author\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
1722303283706 | DEBUG | attempting to parse prop metadata
{
  "enabled": true,
  "custom": false,
  "_key": "site",
  "_idx": 2,
  "id": "siteName",
  "metaFields": [
    "og:site_name",
    "page.content.source",
    "application-name",
    "apple-mobile-web-app-title",
    "twitter:site"
  ],
  "defaultIdx": 2,
  "defaultKey": "site",
  "description": "Website or publication name."
}
1722303283706 | DEBUG | found prop elements
"og:site_name"
"meta[name=\"og:site_name\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
1722303283707 | DEBUG | found prop elements
"page.content.source"
"meta[name=\"page.content.source\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
1722303283707 | DEBUG | found prop elements
"application-name"
"meta[name=\"application-name\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
1722303283707 | DEBUG | found prop elements
"apple-mobile-web-app-title"
"meta[name=\"apple-mobile-web-app-title\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
1722303283707 | DEBUG | found prop elements
"twitter:site"
"meta[name=\"twitter:site\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
1722303283707 | DEBUG | attempting to parse prop metadata
{
  "enabled": true,
  "custom": false,
  "_key": "date",
  "_idx": 3,
  "_format": "d|YYYY-MM-DDTHH:mm",
  "id": "publishedTime",
  "metaFields": [
    "article:published_time",
    "parsely-pub-date",
    "datePublished",
    "article.published"
  ],
  "defaultIdx": 3,
  "defaultKey": "date",
  "description": "Date/time that the page was initially published.",
  "defaultFormat": "d|YYYY-MM-DDTHH:mm"
}
1722303283707 | DEBUG | found prop elements
"article:published_time"
"meta[name=\"article:published_time\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
1722303283707 | DEBUG | found prop elements
"parsely-pub-date"
"meta[name=\"parsely-pub-date\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
1722303283707 | DEBUG | found prop elements
"datePublished"
"meta[name=\"datePublished\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
1722303283707 | DEBUG | found prop elements
"article.published"
"meta[name=\"article.published\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
inhumantsar commented 1 month ago

it looks like this site uses JavaScript-based forwarding. when going to the URL you shared, some JavaScript waits for the page to fully load before changing the window location to the "proper" URL; only then does the content load.

the only way to support sites like this would be to allow all sites to run arbitrary scripts within Obsidian. unfortunately this isn't something I can (or would) do, so there's no real fix for this. I will keep this issue open for now though, as it's a good reminder to add some kind of pre-slurp check for sites which do this, and to display a more appropriate error message.