Closed lef-est closed 1 year ago
Please provide the following information for further investigation:
- The version of your OS, Browser, and WebScrapBook
- The web page URL you are trying to capture.
- The full capture options (can be obtained from `Capture as` > `Advanced`)
Thanks for the quick reply!
`Rewrite styles` except "no rewrite", as well as changing `Remove hidden elements` to disabled.
{
"tasks": [
{
"comment": "",
"tabId": 1699685497,
"title": "which should use to archive webpages singlefile or webscrapbook? : DataHoarder",
"url": "https://www.reddit.com/r/DataHoarder/comments/rf3tca/which_should_use_to_archive_webpages_singlefile/"
}
],
"bookId": null,
"parentId": "root",
"index": null,
"mode": "",
"delay": null,
"options": {
"capture.applet": "link",
"capture.audio": "save-current",
"capture.backupForRecapture": true,
"capture.base": "blank",
"capture.canvas": "save",
"capture.contentSecurityPolicy": "remove",
"capture.deleteErasedOnCapture": true,
"capture.deleteErasedOnSave": false,
"capture.downLink.doc.delay": null,
"capture.downLink.doc.depth": 0,
"capture.downLink.doc.mode": "source",
"capture.downLink.doc.urlFilter": "",
"capture.downLink.file.extFilter": "###image\n#bmp, gif, ico, jpg, jpeg, jpe, jp2, png, tif, tiff, svg\n###audio\n#aac, ape, flac, mid, midi, mp3, ogg, oga, ra, ram, rm, rmx, wav, wma\n###video\n#avc, avi, flv, mkv, mov, mpg, mpeg, mp4, wmv\n###archive\n#zip, rar, jar, bz2, gz, tar, rpm, 7z, 7zip, xz, jar, xpi, lzh, lha, lzma\n#/z[0-9]{2}|r[0-9]{2}/\n###document\npdf, doc, docx, xls, xlsx, ppt, pptx, odt, ods, odp, odg, odf, rtf, txt, csv\n###executable\n#exe, msi, dmg, bin, xpi, iso\n###any non-web-page\n#/(?!$|html?|xht(ml)?|php|py|pl|aspx?|cgi|jsp)(.*)/i",
"capture.downLink.file.mode": "none",
"capture.downLink.urlExtra": "",
"capture.downLink.urlFilter": "###skip common logout URL\n/[/=]logout\\b/i",
"capture.downloadRetryCount": 3,
"capture.downloadRetryDelay": 1000,
"capture.embed": "link",
"capture.favicon": "save",
"capture.faviconAttrs": "",
"capture.font": "link",
"capture.formStatus": "save",
"capture.frame": "save",
"capture.frameRename": true,
"capture.helpers": "",
"capture.helpersEnabled": false,
"capture.image": "save-current",
"capture.imageBackground": "save-used",
"capture.insertInfoBar": true,
"capture.linkUnsavedUri": true,
"capture.mergeCssResources": true,
"capture.noscript": "save",
"capture.object": "link",
"capture.ping": "blank",
"capture.prefetch": "remove",
"capture.preload": "remove",
"capture.prettyPrint": false,
"capture.recordDocumentMeta": true,
"capture.recordRewrites": true,
"capture.referrerPolicy": "strict-origin-when-cross-origin",
"capture.referrerSpoofSource": false,
"capture.remoteTabDelay": null,
"capture.removeHidden": "undisplayed",
"capture.resourceSizeLimit": null,
"capture.rewriteCss": "match",
"capture.saveAs": "zip",
"capture.saveAsciiFilename": false,
"capture.saveDataUriAsFile": true,
"capture.saveDataUriAsSrcdoc": true,
"capture.saveFileAsHtml": true,
"capture.saveFilename": "%source-hostname% - %title% %source-hash% [%create-Y%%create-m%%create-d%T%create-H%%create-M%%create-S%]",
"capture.saveFilenameMaxLenUtf16": 127,
"capture.saveFilenameMaxLenUtf8": 255,
"capture.saveFolder": "WebScrapBook/data",
"capture.saveOverwrite": false,
"capture.saveResourcesSequentially": false,
"capture.saveTo": "folder",
"capture.script": "link",
"capture.serverUploadRetryCount": 3,
"capture.serverUploadRetryDelay": 2000,
"capture.serverUploadWorkers": 4,
"capture.shadowDom": "save",
"capture.style": "save",
"capture.styleInline": "save",
"capture.video": "save-current",
"capture.zipCompressLevel": null
}
}
This is the output I get (in order to upload I changed the extension from HTZ to ZIP): www.reddit.com - which should use to archive webpages singlefile or webscrapbook DataHoarder [20230528T195824].zip
It seems that some `<style>` elements in the page cannot be correctly saved, as the browser returns escaped HTML for them (but not in other pages). This happens on both Firefox and Chrome and doesn't seem to be a browser bug.
We need further investigation about why it happens.
Thank you. If it helps, this is the SingleFileZ output that retains the style (after changing the extension from `.zip.html` to `.zip`):
www.reddit.com which should use to archive webpages singlefile or webscrapbook DataHoarder [20230528T142419].zip
```json { "profiles": { "__Default_Settings__": { "acceptHeaders": { "audio": "audio/webm,audio/ogg,audio/wav,audio/*;q=0.9,application/ogg;q=0.7,video/*;q=0.6,*/*;q=0.5", "document": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "font": "application/font-woff2;q=1.0,application/font-woff;q=0.9,*/*;q=0.8", "image": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8", "script": "*/*", "stylesheet": "text/css,*/*;q=0.1", "video": "video/webm,video/ogg,video/*;q=0.9,application/ogg;q=0.7,audio/*;q=0.6,*/*;q=0.5" }, "allowedBookmarkFolders": [ "" ], "applySystemTheme": true, "autoClose": false, "autoOpenEditor": false, "autoSaveDelay": 1, "autoSaveDiscard": false, "autoSaveLoad": true, "autoSaveLoadOrUnload": false, "autoSaveRemove": false, "autoSaveRepeat": false, "autoSaveRepeatDelay": 10, "autoSaveUnload": false, "backgroundSave": true, "blockAudios": false, "blockFonts": false, "blockImages": false, "blockMixedContent": true, "blockScripts": true, "blockStylesheets": false, "blockVideos": false, "browserActionMenuEnabled": true, "compressCSS": false, "compressHTML": true, "confirmFilename": true, "confirmInfobarContent": true, "contextMenuEnabled": true, "createRootDirectory": false, "defaultEditorMode": "cut", "displayInfobar": true, "displayStats": false, "filenameConflictAction": "prompt", "filenameMaxLength": "192", "filenameMaxLengthUnit": "bytes", "filenameReplacedCharacters": [ "~", "+", "\\\\", "?", "%", "*", ":", "|", "\"", "<", ">", "\u0000-\u001f", "" ], "filenameReplacementCharacter": "_", "filenameTemplate": "{url-hostname} {page-title} [{year-locale}{month-locale}{visit-day-locale}T{visit-hours-locale}{visit-minutes-locale}{visit-seconds-locale}].zip.html", "forceWebAuthFlow": false, "githubBranch": "main", "githubRepository": "SingleFileZ-Archives", "githubToken": "", "githubUser": "", "ignoredBookmarkFolders": [ "" ], "includeBOM": false, "includeInfobar": true, "infobarTemplate": "{url-href} 
[{visit-year-locale}{visit-month-locale}{visit-day-locale}T{visit-hours-locale}{visit-minutes-locale}{visit-seconds-locale}]\\n{page-title} by {page-author}\\n{page-creator} {page-publisher}", "insertMetaCSP": true, "insertMetaNoIndex": false, "insertSingleFileComment": true, "insertTextBody": true, "loadDeferredImages": true, "loadDeferredImagesBlockCookies": false, "loadDeferredImagesBlockStorage": false, "loadDeferredImagesDispatchScrollEvent": false, "loadDeferredImagesKeepZoomLevel": false, "loadDeferredImagesMaxIdleTime": 1500, "logsEnabled": true, "maxResourceSize": 10, "maxResourceSizeEnabled": false, "moveStylesInHead": false, "networkTimeout": 0, "openEditor": true, "openSavedPage": false, "passReferrerOnError": false, "password": "", "progressBarEnabled": true, "removeAlternativeFonts": true, "removeAlternativeImages": true, "removeAlternativeMedias": true, "removeFrames": false, "removeHiddenElements": true, "removeSavedDate": false, "removeUnusedFonts": true, "removeUnusedStyles": true, "replaceBookmarkURL": true, "resolveFragmentIdentifierURLs": false, "saveCreatedBookmarks": false, "saveFavicon": true, "saveOriginalURLs": true, "saveRawPage": false, "saveToGDrive": false, "saveToGitHub": false, "saveWithWebDAV": false, "selfExtractingArchive": true, "shadowEnabled": true, "tabMenuEnabled": true, "userScriptEnabled": false, "warnUnsavedPage": true, "webDAVPassword": "", "webDAVURL": "", "webDAVUser": "" } }, "rules": [ { "autoSaveProfile": "__Default_Settings__", "profile": "__Default_Settings__", "url": "file:" } ], "maxParallelWorkers": 4 } ```
Fortunately we have found the root cause: some `<style>` tags dynamically generated by the web page are in the SVG namespace (rather than the HTML namespace), which is bad practice and arguably a violation of the HTML standard.
SingleFile doesn't suffer from this issue as it dumps the HTML code using a self-implemented algorithm rather than the native API. It's less performant and has a potential risk of breakage, though.
Anyway, we have implemented a workaround to fix the bad tags. It should work in 1.14.4 now.
Wow the fix is fast. Thank you so much. Have a nice week ahead!
I tried changing various settings under `Capture - content`, but none seem to save the guiding threads on Reddit; they just show up blank. I'm talking about this:
![image](https://github.com/danny0838/webscrapbook/assets/9643637/e746773d-c9cc-4aa2-b823-a7f046ddcd1b)
Any guidance would be helpful, thanks!