Open TheBestPessimist opened 3 weeks ago
t.wrapper [as addSettingTab] (plugin:settings-search:59:20)
{
"hash": 1309301853
}
t.wrapper [as openTab] (plugin:settings-search:59:20)
{
"hash": 1309301853
}
SlurpPlugin.slurp (plugin:slurp:12508:30)
{
"enabled": true,
"custom": false,
"_key": "link",
"_idx": 0,
"id": "link",
"metaFields": [
"url",
"og:url",
"parsely-link",
"twitter:url"
],
"defaultIdx": 0,
"defaultKey": "link",
"description": "Page URL provided or a permalink discovered in metadata."
}
SlurpPlugin.slurp (plugin:slurp:12508:30)
"url"
"meta[name=\"url\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
SlurpPlugin.slurp (plugin:slurp:12508:30)
"og:url"
"meta[name=\"og:url\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
SlurpPlugin.slurp (plugin:slurp:12508:30)
"parsely-link"
"meta[name=\"parsely-link\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
SlurpPlugin.slurp (plugin:slurp:12508:30)
"twitter:url"
"meta[name=\"twitter:url\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
SlurpPlugin.slurp (plugin:slurp:12508:30)
{
"enabled": true,
"custom": false,
"_key": "byline",
"_idx": 1,
"id": "byline",
"metaFields": [
"author",
"article:author",
"parsely-author",
"cXenseParse:author"
],
"defaultIdx": 1,
"defaultKey": "byline",
"description": "Name of the primary author or the first author detected."
}
SlurpPlugin.slurp (plugin:slurp:12508:30)
"author"
"meta[name=\"author\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
SlurpPlugin.slurp (plugin:slurp:12508:30)
"article:author"
"meta[name=\"article:author\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
SlurpPlugin.slurp (plugin:slurp:12508:30)
"parsely-author"
"meta[name=\"parsely-author\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
SlurpPlugin.slurp (plugin:slurp:12508:30)
"cXenseParse:author"
"meta[name=\"cXenseParse:author\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
SlurpPlugin.slurp (plugin:slurp:12508:30)
{
"enabled": true,
"custom": false,
"_key": "site",
"_idx": 2,
"id": "siteName",
"metaFields": [
"og:site_name",
"page.content.source",
"application-name",
"apple-mobile-web-app-title",
"twitter:site"
],
"defaultIdx": 2,
"defaultKey": "site",
"description": "Website or publication name."
}
SlurpPlugin.slurp (plugin:slurp:12508:30)
"og:site_name"
"meta[name=\"og:site_name\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
SlurpPlugin.slurp (plugin:slurp:12508:30)
"page.content.source"
"meta[name=\"page.content.source\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
SlurpPlugin.slurp (plugin:slurp:12508:30)
"application-name"
"meta[name=\"application-name\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
SlurpPlugin.slurp (plugin:slurp:12508:30)
"apple-mobile-web-app-title"
"meta[name=\"apple-mobile-web-app-title\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
SlurpPlugin.slurp (plugin:slurp:12508:30)
"twitter:site"
"meta[name=\"twitter:site\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
SlurpPlugin.slurp (plugin:slurp:12508:30)
{
"enabled": true,
"custom": false,
"_key": "date",
"_idx": 3,
"_format": "d|YYYY-MM-DDTHH:mm",
"id": "publishedTime",
"metaFields": [
"article:published_time",
"parsely-pub-date",
"datePublished",
"article.published"
],
"defaultIdx": 3,
"defaultKey": "date",
"description": "Date/time that the page was initially published.",
"defaultFormat": "d|YYYY-MM-DDTHH:mm"
}
SlurpPlugin.slurp (plugin:slurp:12508:30)
"article:published_time"
"meta[name=\"article:published_time\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
SlurpPlugin.slurp (plugin:slurp:12508:30)
"parsely-pub-date"
"meta[name=\"parsely-pub-date\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
SlurpPlugin.slurp (plugin:slurp:12508:30)
"datePublished"
"meta[name=\"datePublished\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
SlurpPlugin.slurp (plugin:slurp:12508:30)
"article.published"
"meta[name=\"article.published\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
SlurpPlugin.slurp (plugin:slurp:12508:30)
{
"enabled": true,
"custom": false,
"_key": "updated",
"_idx": 4,
"_format": "d|YYYY-MM-DDTHH:mm",
"id": "modifiedTime",
"metaFields": [
"article:modified_time",
"dateModified",
"dateLastPubbed"
],
"defaultIdx": 4,
"defaultKey": "updated",
"description": "Date/time that the page was last modified, if available.",
"defaultFormat": "d|YYYY-MM-DDTHH:mm"
}
SlurpPlugin.slurp (plugin:slurp:12508:30)
"article:modified_time"
"meta[name=\"article:modified_time\"], meta[property=\"{s}\"], meta[itemprop=\"{s}\"], meta[http-equiv=\"{s}\"]"
{}
thanks for the report! often this is due to Readability thinking the block is a nav header or similar. I'll have a look and see whether that can be parsed out safely.
are there any more logs to go with this? seems to just sort of end mid-parse.
Here's another log: slurp-2024-06-08.md
I tried slurping the same url as above twice.
yeah so this is a readability thing. it works by scoring nodes individually. the scores are based on things like link density, classes which are commonly associated with content or non-content, content length, etc.
nodes with any score > 0 become candidates and the node with the highest score becomes the "top candidate" for the node which contains the page's actual content. once it has its top candidate, it moves up the tree to check for an ancestor which contains at least three other candidates and has a score that's no more than 25% lower than the top candidate's score.
your site produces these candidates:
Reader: (Readability) Candidate: <div class="about__content"> with score 31.716417910447763
Reader: (Readability) Candidate: <div id="about" class="effect8"> with score 6.598726114649682
Reader: (Readability) Candidate: <div class="main-content"> with score 30.894308943089435
Reader: (Readability) Candidate: <div class="main-container"> with score 29.864498644986444
Reader: (Readability) Candidate: <body > with score 1.5447154471544717
Reader: (Readability) Candidate: <div class="work__content"> with score 42.23109243697479
Reader: (Readability) Candidate: <div id="work" class="effect8"> with score 11.945945945945946
note that the main container's score is 29.86 and the work content's score is 42.23, so the main container's score is about 30% lower and it gets disqualified. it might be possible to give extra weight to an ancestor with multiple high scoring children but it might degrade the experience when parsing more complex sites.
i'll leave this issue open for now as a reminder for the next time i dive into readability work and tinker with the scoring mechanism. if that tinkering seems promising, i'll open an issue upstream and link it here.
i have to point out though: readability and slurp are geared toward news sites, blogs, and long-form writeups as that is by far the most common use case. reliably extracting page content while excluding irrelevancies like ads and nav bars requires a lot of fuzzy logic which won't work as intended on every possible page structure.
Thank you for the explanation and for keeping this issue open for further work.
> readability and slurp are geared toward news sites, blogs, and long-form writeups
That makes sense. I have tried multiple "url to .MD" tools and I found that most of them ignore the first part of my website, so that's why I opened this issue. After your explanation, I have more insight about why this happens. Thanks again!
What was slurped: ![image](https://github.com/inhumantsar/slurp/assets/4482210/d83417b1-d92f-42d7-847e-9fab4bea4063)
the website: https://tbp.land