inhumantsar / slurp

Slurps webpages and saves them as clean, uncluttered Markdown. Think Pocket, but better.
https://inhumantsar.github.io/slurp/
MIT License
127 stars 2 forks source link

Error when slurping a web page #48

Open victor-kozlov opened 3 weeks ago

victor-kozlov commented 3 weeks ago

Link to the page I tried to slurp.

Logs:

1718357280673 | DEBUG | attempting to parse prop metadata
{
  "enabled": true,
  "custom": false,
  "_key": "link",
  "_idx": 0,
  "id": "link",
  "metaFields": [
    "url",
    "og:url",
    "parsely-link",
    "twitter:url"
  ],
  "defaultIdx": 0,
  "defaultKey": "link",
  "description": "Page URL provided or a permalink discovered in metadata."
}
1718357280673 | DEBUG | found prop elements
"url"
"meta[name=\"url\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
1718357280673 | DEBUG | found prop elements
"og:url"
"meta[name=\"og:url\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
1718357280673 | DEBUG | found prop elements
"parsely-link"
"meta[name=\"parsely-link\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
1718357280673 | DEBUG | found prop elements
"twitter:url"
"meta[name=\"twitter:url\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
1718357280673 | DEBUG | attempting to parse prop metadata
{
  "enabled": true,
  "custom": false,
  "_key": "byline",
  "_idx": 1,
  "id": "byline",
  "metaFields": [
    "author",
    "article:author",
    "parsely-author",
    "cXenseParse:author"
  ],
  "defaultIdx": 1,
  "defaultKey": "byline",
  "description": "Name of the primary author or the first author detected."
}
1718357280673 | DEBUG | found prop elements
"author"
"meta[name=\"author\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{
  "0": {}
}
1718357280674 | DEBUG | adding metadata
{
  "prop": {
    "enabled": true,
    "custom": false,
    "_key": "byline",
    "_idx": 1,
    "id": "byline",
    "metaFields": [
      "author",
      "article:author",
      "parsely-author",
      "cXenseParse:author"
    ],
    "defaultIdx": 1,
    "defaultKey": "byline",
    "description": "Name of the primary author or the first author detected."
  },
  "elements": {
    "0": {}
  },
  "metaFields": {},
  "querySelector": "meta[name=\"author\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
}
1718357280674 | DEBUG | found prop elements
"article:author"
"meta[name=\"article:author\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
1718357280674 | DEBUG | found prop elements
"parsely-author"
"meta[name=\"parsely-author\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
1718357280674 | DEBUG | found prop elements
"cXenseParse:author"
"meta[name=\"cXenseParse:author\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
1718357280674 | DEBUG | attempting to parse prop metadata
{
  "enabled": true,
  "custom": false,
  "_key": "site",
  "_idx": 2,
  "id": "siteName",
  "metaFields": [
    "og:site_name",
    "page.content.source",
    "application-name",
    "apple-mobile-web-app-title",
    "twitter:site"
  ],
  "defaultIdx": 2,
  "defaultKey": "site",
  "description": "Website or publication name."
}
1718357280674 | DEBUG | found prop elements
"og:site_name"
"meta[name=\"og:site_name\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
1718357280674 | DEBUG | found prop elements
"page.content.source"
"meta[name=\"page.content.source\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
1718357280674 | DEBUG | found prop elements
"application-name"
"meta[name=\"application-name\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
1718357280674 | DEBUG | found prop elements
"apple-mobile-web-app-title"
"meta[name=\"apple-mobile-web-app-title\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
1718357280674 | DEBUG | found prop elements
"twitter:site"
"meta[name=\"twitter:site\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{
  "0": {}
}
1718357280674 | DEBUG | adding metadata
{
  "prop": {
    "enabled": true,
    "custom": false,
    "_key": "site",
    "_idx": 2,
    "id": "siteName",
    "metaFields": [
      "og:site_name",
      "page.content.source",
      "application-name",
      "apple-mobile-web-app-title",
      "twitter:site"
    ],
    "defaultIdx": 2,
    "defaultKey": "site",
    "description": "Website or publication name."
  },
  "elements": {
    "0": {}
  },
  "metaFields": {},
  "querySelector": "meta[name=\"twitter:site\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
}
1718357280674 | DEBUG | attempting to parse prop metadata
{
  "enabled": true,
  "custom": false,
  "_key": "date",
  "_idx": 3,
  "_format": "d|YYYY-MM-DDTHH:mm",
  "id": "publishedTime",
  "metaFields": [
    "article:published_time",
    "parsely-pub-date",
    "datePublished",
    "article.published"
  ],
  "defaultIdx": 3,
  "defaultKey": "date",
  "description": "Date/time that the page was initially published.",
  "defaultFormat": "d|YYYY-MM-DDTHH:mm"
}
1718357280674 | DEBUG | found prop elements
"article:published_time"
"meta[name=\"article:published_time\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
1718357280674 | DEBUG | found prop elements
"parsely-pub-date"
"meta[name=\"parsely-pub-date\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
1718357280674 | DEBUG | found prop elements
"datePublished"
"meta[name=\"datePublished\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
1718357280674 | DEBUG | found prop elements
"article.published"
"meta[name=\"article.published\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
1718357280674 | DEBUG | attempting to parse prop metadata
{
  "enabled": true,
  "custom": false,
  "_key": "updated",
  "_idx": 4,
  "_format": "d|YYYY-MM-DDTHH:mm",
  "id": "modifiedTime",
  "metaFields": [
    "article:modified_time",
    "dateModified",
    "dateLastPubbed"
  ],
  "defaultIdx": 4,
  "defaultKey": "updated",
  "description": "Date/time that the page was last modified, if available.",
  "defaultFormat": "d|YYYY-MM-DDTHH:mm"
}
1718357280674 | DEBUG | found prop elements
"article:modified_time"
"meta[name=\"article:modified_time\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
inhumantsar commented 1 week ago

sorry for the delay! life in meatspace has been busy lately.

It looks like this site has some kind of dynamic loading happening. Rather than serving the post directly, the site serves a page with a bunch of JavaScript, which then looks at the URL and loads the content for it.

Readability (the library Slurp uses to generate simplified versions of web pages) generally can't handle sites like this, since it would mean running third-party code outside of a sandboxed browser tab, which opens up a whole host of security concerns.

I'll do a bit more digging when I have more time to dive into the code, and see if there's something that can be done to work around that dynamic loading issue safely, as it's been a problem elsewhere too. I wouldn't get my hopes up, though.

One thing I can absolutely do though is improve the error message that pops up.