Closed benoit74 closed 4 months ago
Hello. I have the same issue while downloading https://lurkmore.online/.
My docker-compose.yml:
# Compose file that runs a single zimit container crawling
# https://lurkmore.online/ and writing its results to the `output` volume.
services:
  zimit:
    image: ghcr.io/openzim/zimit:zimit2
    container_name: zimit
    # NOTE: with this policy Docker starts the container again whenever the
    # command exits, including after a failed crawl.
    restart: unless-stopped
    command: 'zimit --url https://lurkmore.online/ --name lurkmore --workers 6 --waitUntil domcontentloaded'
    # Size of /dev/shm inside the container.
    shm_size: 1gb
    volumes:
      # Named volume `output` (declared below) mounted at /output.
      - output:/output

volumes:
  # Named volume using the local driver with bind options, pointing at a
  # host directory.
  output:
    driver: local
    driver_opts:
      type: none
      o: bind
      # $VOLUMES_PATH_PREFIX is interpolated by Compose from the environment.
      device: $VOLUMES_PATH_PREFIX/zimit/volumes/output
Logs:
{"timestamp":"2024-05-14T12:52:00.533Z","logLevel":"info","context":"worker","message":"Starting page","details":{"workerid":2,"page":"https://lurkmore.online/%D0%9A%D0%B0%D1%82%D1%8F_%D0%93%D0%BE%D1%80%D0%B4%D0%BE%D0%BD"}}
{"timestamp":"2024-05-14T12:52:00.534Z","logLevel":"info","context":"crawlStatus","message":"Crawl statistics","details":{"crawled":948,"total":31621,"pending":6,"failed":0,"limit":{"max":0,"hit":false},"pendingPages":["{\"seedId\":1,\"started\":\"2024-05-14T12:52:00.533Z\",\"extraHops\":0,\"url\":\"https://lurkmore.online/%D0%9A%D0%B0%D1%82%D1%8F_%D0%93%D0%BE%D1%80%D0%B4%D0%BE%D0%BD\",\"added\":\"2024-05-14T12:32:01.781Z\",\"depth\":2}","{\"seedId\":1,\"started\":\"2024-05-14T12:51:59.695Z\",\"extraHops\":0,\"url\":\"https://lurkmore.online/%D0%95%D0%BA%D0%B0%D1%82%D0%B5%D1%80%D0%B8%D0%BD%D0%B0_II\",\"added\":\"2024-05-14T12:32:01.781Z\",\"depth\":2}","{\"seedId\":1,\"started\":\"2024-05-14T12:51:52.537Z\",\"extraHops\":0,\"url\":\"https://lurkmore.online/BadComedian\",\"added\":\"2024-05-14T12:32:01.780Z\",\"depth\":2}","{\"seedId\":1,\"started\":\"2024-05-14T12:51:10.376Z\",\"extraHops\":0,\"url\":\"https://lurkmore.online/%D0%9D%D0%B0%D0%B4%D0%BC%D0%BE%D0%B7%D0%B3\",\"added\":\"2024-05-14T12:32:01.774Z\",\"depth\":2}","{\"seedId\":1,\"started\":\"2024-05-14T12:51:06.149Z\",\"extraHops\":0,\"url\":\"https://lurkmore.online/%D0%94%D0%BE%D0%BA%D1%82%D0%BE%D1%80_%D0%9A%D1%82%D0%BE\",\"added\":\"2024-05-14T12:32:01.774Z\",\"depth\":2}","{\"seedId\":1,\"started\":\"2024-05-14T12:51:59.580Z\",\"extraHops\":0,\"url\":\"https://lurkmore.online/%D0%95%D0%BA%D0%B0%D1%82%D0%B5%D1%80%D0%B8%D0%BD%D0%B0_%D0%94%D0%BE%D0%BB%D0%B3%D0%BE%D1%80%D1%83%D0%BA%D0%BE%D0%B2%D0%B0\",\"added\":\"2024-05-14T12:32:01.781Z\",\"depth\":2}"]}}
{"timestamp":"2024-05-14T12:52:02.190Z","logLevel":"info","context":"general","message":"Awaiting page load","details":{"page":"https://lurkmore.online/%D0%9A%D0%B0%D1%82%D1%8F_%D0%93%D0%BE%D1%80%D0%B4%D0%BE%D0%BD","workerid":2}}
node:internal/deps/undici/undici:1834
throw new TypeError(
^
TypeError: Cannot convert argument to a ByteString because the character at index 25 has a value of 1050 which is greater than 255.
at webidl.converters.ByteString (node:internal/deps/undici/undici:1834:17)
at Object.record<ByteString, ByteString> (node:internal/deps/undici/undici:1743:32)
at webidl.converters.HeadersInit (node:internal/deps/undici/undici:2283:67)
at new Headers (node:internal/deps/undici/undici:2080:36)
at RequestResponseInfo.getMimeType (file:///app/dist/util/reqresp.js:196:25)
at Recorder.addPageRecord (file:///app/dist/util/recorder.js:448:34)
at Recorder.serializeToWARC (file:///app/dist/util/recorder.js:669:14)
at Recorder.handleRedirectResponse (file:///app/dist/util/recorder.js:197:14)
at Recorder.handleRequestWillBeSent (file:///app/dist/util/recorder.js:160:18)
at file:///app/dist/util/recorder.js:71:62
Node.js v20.11.1
Traceback (most recent call last):
File "/usr/bin/zimit", line 8, in <module>
sys.exit(zimit.zimit())
^^^^^^^^^^^^^
File "/app/zimit/lib/python3.12/site-packages/zimit/zimit.py", line 580, in zimit
run(sys.argv[1:])
File "/app/zimit/lib/python3.12/site-packages/zimit/zimit.py", line 485, in run
raise subprocess.CalledProcessError(crawl.returncode, cmd_args)
subprocess.CalledProcessError: Command '['crawl', '--failOnFailedSeed', '--workers', '6', '--waitUntil', 'domcontentloaded', '--depth', '-1', '--timeout', '90', '--behaviors', 'autoplay,autofetch,siteSpecific', '--behaviorTimeout', '90', '--diskUtilization', '90', '--url', 'https://lurkmore.online/', '--userAgentSuffix', '+Zimit', '--mobileDevice', 'Pixel 2', '--cwd', '/output/.tmp5b44wtc3']' returned non-zero exit status 1.
[zimit::2024-05-14 12:52:02,477] INFO:
[zimit::2024-05-14 12:52:02,477] INFO:----------
[zimit::2024-05-14 12:52:02,477] INFO:Cleanup, removing temp dir: /output/.tmp5b44wtc3
alex@MacBook-Air compose % docker logs zimit --tail 50 -f
[zimit::2024-05-14 13:16:51,667] INFO:----------
[zimit::2024-05-14 13:16:51,667] INFO:Testing warc2zim args
[zimit::2024-05-14 13:16:51,667] INFO:Running: warc2zim --name lurkmore --scraper-suffix + zimit 2.0.0-dev4 + Browsertrix crawler 1.1.1 --output /output --url https://lurkmore.online/
[warc2zim::2024-05-14 13:16:51,668] INFO:Arguments valid, no inputs to process. Exiting with return code 100
[zimit::2024-05-14 13:16:51,668] INFO:
[zimit::2024-05-14 13:16:51,668] INFO:----------
[zimit::2024-05-14 13:16:51,668] INFO:Output to tempdir: /output/.tmpwnbjnffm - will delete
[zimit::2024-05-14 13:16:51,668] INFO:Running browsertrix-crawler crawl: crawl --failOnFailedSeed --workers 6 --waitUntil domcontentloaded --depth -1 --timeout 90 --behaviors autoplay,autofetch,siteSpecific --behaviorTimeout 90 --diskUtilization 90 --url https://lurkmore.online/ --userAgentSuffix +Zimit --mobileDevice Pixel 2 --cwd /output/.tmpwnbjnffm
{"timestamp":"2024-05-14T13:16:51.990Z","logLevel":"info","context":"general","message":"Browsertrix-Crawler 1.1.1 (with warcio.js 2.2.1)","details":{}}
{"timestamp":"2024-05-14T13:16:51.991Z","logLevel":"info","context":"general","message":"Seeds","details":[{"url":"https://lurkmore.online/","scopeType":"prefix","include":["/^https?:\\/\\/lurkmore\\.online\\//"],"exclude":[],"allowHash":false,"depth":-1,"sitemap":null,"maxExtraHops":0,"maxDepth":1000000}]}
{"timestamp":"2024-05-14T13:16:52.086Z","logLevel":"error","context":"general","message":"Crawl failed","details":{"type":"exception","message":"Failed to launch the browser process!\n[48:64:0514/131652.075426:ERROR:bus.cc(407)] Failed to connect to the bus: Failed to connect to socket /run/dbus/system_bus_socket: No such file or directory\n[48:48:0514/131652.080419:ERROR:ozone_platform_x11.cc(243)] Missing X server or $DISPLAY\n[48:48:0514/131652.080430:ERROR:env.cc(257)] The platform failed to initialize. Exiting.\n\n\nTROUBLESHOOTING: https://pptr.dev/troubleshooting\n","stack":"Error: Failed to launch the browser process!\n[48:64:0514/131652.075426:ERROR:bus.cc(407)] Failed to connect to the bus: Failed to connect to socket /run/dbus/system_bus_socket: No such file or directory\n[48:48:0514/131652.080419:ERROR:ozone_platform_x11.cc(243)] Missing X server or $DISPLAY\n[48:48:0514/131652.080430:ERROR:env.cc(257)] The platform failed to initialize. Exiting.\n\n\nTROUBLESHOOTING: https://pptr.dev/troubleshooting\n\n at ChildProcess.onClose (file:///app/node_modules/@puppeteer/browsers/lib/esm/launch.js:301:24)\n at ChildProcess.emit (node:events:530:35)\n at ChildProcess._handle.onexit (node:internal/child_process:294:12)"}}
{"timestamp":"2024-05-14T13:16:52.086Z","logLevel":"info","context":"general","message":"Exiting, Crawl status: failing","details":{}}
Traceback (most recent call last):
File "/usr/bin/zimit", line 8, in <module>
sys.exit(zimit.zimit())
^^^^^^^^^^^^^
File "/app/zimit/lib/python3.12/site-packages/zimit/zimit.py", line 580, in zimit
run(sys.argv[1:])
File "/app/zimit/lib/python3.12/site-packages/zimit/zimit.py", line 485, in run
raise subprocess.CalledProcessError(crawl.returncode, cmd_args)
subprocess.CalledProcessError: Command '['crawl', '--failOnFailedSeed', '--workers', '6', '--waitUntil', 'domcontentloaded', '--depth', '-1', '--timeout', '90', '--behaviors', 'autoplay,autofetch,siteSpecific', '--behaviorTimeout', '90', '--diskUtilization', '90', '--url', 'https://lurkmore.online/', '--userAgentSuffix', '+Zimit', '--mobileDevice', 'Pixel 2', '--cwd', '/output/.tmpwnbjnffm']' returned non-zero exit status 9.
[zimit::2024-05-14 13:16:52,093] INFO:
[zimit::2024-05-14 13:16:52,093] INFO:----------
[zimit::2024-05-14 13:16:52,093] INFO:Cleanup, removing temp dir: /output/.tmpwnbjnffm
[zimit::2024-05-14 13:17:52,573] INFO:Checking browsertrix-crawler version
[zimit::2024-05-14 13:17:52,867] INFO:Browsertrix crawler: version 1.1.1
[zimit::2024-05-14 13:17:52,867] INFO:----------
[zimit::2024-05-14 13:17:52,867] INFO:Testing warc2zim args
[zimit::2024-05-14 13:17:52,867] INFO:Running: warc2zim --name lurkmore --scraper-suffix + zimit 2.0.0-dev4 + Browsertrix crawler 1.1.1 --output /output --url https://lurkmore.online/
[warc2zim::2024-05-14 13:17:52,869] INFO:Arguments valid, no inputs to process. Exiting with return code 100
[zimit::2024-05-14 13:17:52,869] INFO:
[zimit::2024-05-14 13:17:52,869] INFO:----------
[zimit::2024-05-14 13:17:52,869] INFO:Output to tempdir: /output/.tmppg34hkqa - will delete
[zimit::2024-05-14 13:17:52,869] INFO:Running browsertrix-crawler crawl: crawl --failOnFailedSeed --workers 6 --waitUntil domcontentloaded --depth -1 --timeout 90 --behaviors autoplay,autofetch,siteSpecific --behaviorTimeout 90 --diskUtilization 90 --url https://lurkmore.online/ --userAgentSuffix +Zimit --mobileDevice Pixel 2 --cwd /output/.tmppg34hkqa
{"timestamp":"2024-05-14T13:17:53.176Z","logLevel":"info","context":"general","message":"Browsertrix-Crawler 1.1.1 (with warcio.js 2.2.1)","details":{}}
{"timestamp":"2024-05-14T13:17:53.176Z","logLevel":"info","context":"general","message":"Seeds","details":[{"url":"https://lurkmore.online/","scopeType":"prefix","include":["/^https?:\\/\\/lurkmore\\.online\\//"],"exclude":[],"allowHash":false,"depth":-1,"sitemap":null,"maxExtraHops":0,"maxDepth":1000000}]}
{"timestamp":"2024-05-14T13:17:54.266Z","logLevel":"error","context":"general","message":"Crawl failed","details":{"type":"exception","message":"Failed to launch the browser process!\n[48:64:0514/131754.255443:ERROR:bus.cc(407)] Failed to connect to the bus: Failed to connect to socket /run/dbus/system_bus_socket: No such file or directory\n[48:48:0514/131754.260410:ERROR:ozone_platform_x11.cc(243)] Missing X server or $DISPLAY\n[48:48:0514/131754.260418:ERROR:env.cc(257)] The platform failed to initialize. Exiting.\n\n\nTROUBLESHOOTING: https://pptr.dev/troubleshooting\n","stack":"Error: Failed to launch the browser process!\n[48:64:0514/131754.255443:ERROR:bus.cc(407)] Failed to connect to the bus: Failed to connect to socket /run/dbus/system_bus_socket: No such file or directory\n[48:48:0514/131754.260410:ERROR:ozone_platform_x11.cc(243)] Missing X server or $DISPLAY\n[48:48:0514/131754.260418:ERROR:env.cc(257)] The platform failed to initialize. Exiting.\n\n\nTROUBLESHOOTING: https://pptr.dev/troubleshooting\n\n at ChildProcess.onClose (file:///app/node_modules/@puppeteer/browsers/lib/esm/launch.js:301:24)\n at ChildProcess.emit (node:events:530:35)\n at ChildProcess._handle.onexit (node:internal/child_process:294:12)"}}
{"timestamp":"2024-05-14T13:17:54.267Z","logLevel":"info","context":"general","message":"Exiting, Crawl status: failing","details":{}}
Traceback (most recent call last):
File "/usr/bin/zimit", line 8, in <module>
sys.exit(zimit.zimit())
^^^^^^^^^^^^^
File "/app/zimit/lib/python3.12/site-packages/zimit/zimit.py", line 580, in zimit
run(sys.argv[1:])
File "/app/zimit/lib/python3.12/site-packages/zimit/zimit.py", line 485, in run
raise subprocess.CalledProcessError(crawl.returncode, cmd_args)
subprocess.CalledProcessError: Command '['crawl', '--failOnFailedSeed', '--workers', '6', '--waitUntil', 'domcontentloaded', '--depth', '-1', '--timeout', '90', '--behaviors', 'autoplay,autofetch,siteSpecific', '--behaviorTimeout', '90', '--diskUtilization', '90', '--url', 'https://lurkmore.online/', '--userAgentSuffix', '+Zimit', '--mobileDevice', 'Pixel 2', '--cwd', '/output/.tmppg34hkqa']' returned non-zero exit status 9.
[zimit::2024-05-14 13:17:54,275] INFO:
[zimit::2024-05-14 13:17:54,275] INFO:----------
[zimit::2024-05-14 13:17:54,275] INFO:Cleanup, removing temp dir: /output/.tmppg34hkqa
Thank you @seniyakk
Upstream issue solved.
See https://github.com/webrecorder/browsertrix-crawler/issues/569
Zimfarm task: https://farm.openzim.org/pipeline/c05c7a6d-9b5f-44e8-8786-6e4adf593f9b/debug