Closed ivan closed 10 years ago
FWIW I'm on Ubuntu 14.04 and my ulimit -n
is the default 1024
.
This is still happening with 69d34de29a8c6e082c99f7bce05a766363b9fe8d
# ~/.local/bin/wpull3 --database=mydb --delete-after --span-hosts-allow=page-requisites,linked-pages --page-requisites --no-check-certificate --no-robots --inet4-only --warc-file=/tmp/dummywarc --concurrent=20 -r -np https://ludios.org/tmp/alexa-top-10000.html
[...]
Requesting http://www1.firstdirect.com/... Requesting http://teasernet.ru/... ERROR Fetching ‘http://aiqianming.cn/’ encountered an error: DNS resolution failed: [Errno -5] No address associated with hostname
INFO Fetching ‘http://mydealz.de/’.
301 Moved Permanently
Length: 0 [None]
[O ] 0.0 B 0:00:02 -- B/s
Bytes received: 0
ERROR Fetching ‘https://www.germanwings.com/’ encountered an error: Redirect location missing.
301 Moved Permanently
Length: 229 [text/html; charset=iso-8859-1]
301 Moved Permanently
Length: 229 [text/html; charset=iso-8859-1]
302 Moved Temporarily
Length: 0 [None]
100.0% [=========================] 229.0 B 0:00:01 168.6 B/sINFO Rolling back file /tmp/dummywarc.warc.gz to length 36535228.
INFO Rolling back file /tmp/dummywarc.warc.gz to length 36535228.
INFO Rolling back file /tmp/dummywarc.warc.gz to length 36535228.
ERROR Fatal exception.
Traceback (most recent call last):
File "/home/at/.local/lib/python3.4/site-packages/wpull/recorder.py", line 341, in write_record
with open_func(self._warc_filename, mode='ab') as out_file:
File "/usr/lib/python3.4/gzip.py", line 181, in __init__
fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
OSError: [Errno 24] Too many open files: '/tmp/dummywarc.warc.gz'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/at/.local/lib/python3.4/site-packages/wpull/app.py", line 101, in run
yield From(self._builder.factory['Engine']())
File "/home/at/.local/lib/python3.4/site-packages/trollius/tasks.py", line 314, in _wakeup
value = future.result()
File "/home/at/.local/lib/python3.4/site-packages/trollius/futures.py", line 277, in result
raise self._exception
File "/home/at/.local/lib/python3.4/site-packages/trollius/tasks.py", line 245, in _step
result = coro.throw(exc)
File "/home/at/.local/lib/python3.4/site-packages/wpull/engine.py", line 250, in __call__
yield From(self._run_workers())
File "/home/at/.local/lib/python3.4/site-packages/trollius/tasks.py", line 314, in _wakeup
value = future.result()
File "/home/at/.local/lib/python3.4/site-packages/trollius/futures.py", line 277, in result
raise self._exception
File "/home/at/.local/lib/python3.4/site-packages/trollius/tasks.py", line 247, in _step
result = coro.send(value)
File "/home/at/.local/lib/python3.4/site-packages/wpull/engine.py", line 64, in _run_workers
task.result()
File "/home/at/.local/lib/python3.4/site-packages/trollius/futures.py", line 277, in result
raise self._exception
File "/home/at/.local/lib/python3.4/site-packages/trollius/tasks.py", line 245, in _step
result = coro.throw(exc)
File "/home/at/.local/lib/python3.4/site-packages/wpull/engine.py", line 131, in _run_worker
yield From(self._process_item(item))
File "/home/at/.local/lib/python3.4/site-packages/trollius/tasks.py", line 314, in _wakeup
value = future.result()
File "/home/at/.local/lib/python3.4/site-packages/trollius/futures.py", line 277, in result
raise self._exception
File "/home/at/.local/lib/python3.4/site-packages/trollius/tasks.py", line 245, in _step
result = coro.throw(exc)
File "/home/at/.local/lib/python3.4/site-packages/wpull/engine.py", line 300, in _process_item
yield From(self._process_url_item(url_item))
File "/home/at/.local/lib/python3.4/site-packages/trollius/tasks.py", line 314, in _wakeup
value = future.result()
File "/home/at/.local/lib/python3.4/site-packages/trollius/futures.py", line 277, in result
raise self._exception
File "/home/at/.local/lib/python3.4/site-packages/trollius/tasks.py", line 245, in _step
result = coro.throw(exc)
File "/home/at/.local/lib/python3.4/site-packages/wpull/engine.py", line 322, in _process_url_item
yield From(self._processor.process(url_item))
File "/home/at/.local/lib/python3.4/site-packages/trollius/tasks.py", line 314, in _wakeup
value = future.result()
File "/home/at/.local/lib/python3.4/site-packages/trollius/futures.py", line 277, in result
raise self._exception
File "/home/at/.local/lib/python3.4/site-packages/trollius/tasks.py", line 245, in _step
result = coro.throw(exc)
File "/home/at/.local/lib/python3.4/site-packages/wpull/processor/web.py", line 147, in process
raise Return((yield From(session.process())))
File "/home/at/.local/lib/python3.4/site-packages/trollius/tasks.py", line 314, in _wakeup
value = future.result()
File "/home/at/.local/lib/python3.4/site-packages/trollius/futures.py", line 277, in result
raise self._exception
File "/home/at/.local/lib/python3.4/site-packages/trollius/tasks.py", line 245, in _step
result = coro.throw(exc)
File "/home/at/.local/lib/python3.4/site-packages/wpull/processor/web.py", line 234, in process
is_done = yield From(self._process_one())
File "/home/at/.local/lib/python3.4/site-packages/trollius/tasks.py", line 314, in _wakeup
value = future.result()
File "/home/at/.local/lib/python3.4/site-packages/trollius/futures.py", line 277, in result
raise self._exception
File "/home/at/.local/lib/python3.4/site-packages/trollius/tasks.py", line 245, in _step
result = coro.throw(exc)
File "/home/at/.local/lib/python3.4/site-packages/wpull/processor/web.py", line 262, in _process_one
callback=self._response_callback)
File "/home/at/.local/lib/python3.4/site-packages/trollius/tasks.py", line 314, in _wakeup
value = future.result()
File "/home/at/.local/lib/python3.4/site-packages/trollius/futures.py", line 277, in result
raise self._exception
File "/home/at/.local/lib/python3.4/site-packages/trollius/tasks.py", line 245, in _step
result = coro.throw(exc)
File "/home/at/.local/lib/python3.4/site-packages/wpull/http/web.py", line 160, in fetch
yield From(session.read_content(file))
File "/home/at/.local/lib/python3.4/site-packages/trollius/tasks.py", line 314, in _wakeup
value = future.result()
File "/home/at/.local/lib/python3.4/site-packages/trollius/futures.py", line 277, in result
raise self._exception
File "/home/at/.local/lib/python3.4/site-packages/trollius/tasks.py", line 249, in _step
result = next(coro)
File "/home/at/.local/lib/python3.4/site-packages/wpull/http/client.py", line 173, in read_content
self._recorder_session.response(self._response)
File "/home/at/.local/lib/python3.4/site-packages/wpull/recorder.py", line 123, in response
session.response(response)
File "/home/at/.local/lib/python3.4/site-packages/wpull/recorder.py", line 559, in response
self._recorder.write_record(self._response_record)
File "/home/at/.local/lib/python3.4/site-packages/wpull/recorder.py", line 349, in write_record
with open(self._warc_filename, mode='wb') as out_file:
OSError: [Errno 24] Too many open files: '/tmp/dummywarc.warc.gz'
INFO FINISHED.
INFO Duration: 0:15:43. Speed: 115.1 KiB/s.
INFO Downloaded: 2019 files, 171.0 MiB.
INFO Exiting with status 3.
I can't seem to reproduce this. I ran the same thing in Ubuntu 14.04 too but under another user and it ran for more than an hour. lsof for the user showed no deleted files and the number temp files remained constant. The number of files and TCP connections stayed around 500 also.
I'm still seeing this with 2721ab4deb939a4343dda48dab4967bfab79ff28
I ran lsof
on the wpull process every 20 seconds using
while true; do lsof -n -P -p 1678 > lsof-dump/$(date +'%s'); sleep 20; done
The dumps show that the number of temporary files stays nearly constant:
# grep -c \.tmp$ *
1409001056:1
1409001076:38
1409001096:40
1409001116:34
1409001136:42
1409001156:39
1409001176:37
1409001196:39
1409001216:36
1409001236:39
1409001256:41
1409001276:35
1409001296:36
1409001316:33
1409001336:33
1409001357:41
1409001377:39
1409001397:37
1409001417:41
1409001437:36
1409001457:43
1409001477:43
1409001497:45
1409001517:39
but the number of TCP connections keeps growing:
# grep -c TCP\ *
1409001056:1
1409001076:78
1409001096:122
1409001116:172
1409001136:187
1409001156:202
1409001176:215
1409001196:227
1409001216:254
1409001236:287
1409001256:315
1409001276:329
1409001296:347
1409001316:370
1409001336:402
1409001357:420
1409001377:445
1409001397:442
1409001417:465
1409001437:488
1409001457:520
1409001477:552
1409001497:553
(This is after just 8 minutes, I'm sure it'll crash soon.)
Some speculation about why you're not seeing a crash: maybe your network or ISP closes idle TCP connections?
After 34 minutes:
ERROR Fatal exception.
Traceback (most recent call last):
File "/home/at/.local/lib/python3.4/site-packages/wpull/recorder.py", line 341, in write_record
with open_func(self._warc_filename, mode='ab') as out_file:
File "/usr/lib/python3.4/gzip.py", line 181, in __init__
fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
OSError: [Errno 24] Too many open files: '/tmp/dummywarc.warc.gz'
[...]
INFO FINISHED.
INFO Duration: 0:33:57. Speed: 87.2 KiB/s.
INFO Downloaded: 3242 files, 261.8 MiB.
INFO Exiting with status 3.
# grep -c TCP\ *
1409001056:1
1409001076:78
1409001096:122
1409001116:172
1409001136:187
1409001156:202
1409001176:215
1409001196:227
1409001216:254
1409001236:287
1409001256:315
1409001276:329
1409001296:347
1409001316:370
1409001336:402
1409001357:420
1409001377:445
1409001397:442
1409001417:465
1409001437:488
1409001457:520
1409001477:552
1409001497:553
1409001517:554
1409001537:563
1409001557:597
1409001577:599
1409001597:609
1409001617:609
1409001637:628
1409001657:647
1409001677:632
1409001697:650
1409001717:662
1409001737:684
1409001757:694
1409001778:705
1409001798:696
1409001818:706
1409001838:699
1409001858:705
1409001878:714
1409001898:734
1409001918:740
1409001938:756
1409001958:752
1409001978:745
1409001998:744
1409002018:747
1409002038:741
1409002058:732
1409002078:731
1409002098:726
1409002119:739
1409002139:750
1409002159:767
1409002179:768
1409002199:786
1409002219:789
1409002239:811
1409002259:825
1409002279:825
1409002299:829
1409002319:830
1409002339:835
1409002359:846
1409002379:870
1409002399:866
1409002419:869
1409002439:865
1409002460:852
1409002480:864
1409002500:869
1409002520:877
1409002540:887
1409002560:892
1409002580:900
1409002600:893
1409002620:897
1409002640:888
1409002660:896
1409002680:894
1409002700:890
1409002720:893
1409002740:893
1409002761:899
1409002781:896
1409002801:896
1409002821:896
1409002841:879
1409002861:880
1409002881:876
1409002901:872
1409002921:878
1409002941:876
1409002961:883
1409002981:895
1409003001:897
1409003021:907
1409003041:919
1409003061:933
1409003082:955
My router is running a version of Tomato which might explain why the number of TCP connections remained constant for me. a8c5fc8 should be closing idle connections now.
Thanks! The connection count stays steady at ~24-65 now.
Manually closing issue as fixed in 152b186cb20cdf05c3d2c4a766dc6dccf79618de (v0.1000)
54 minutes into
I see