Open rom1504 opened 1 year ago
Maybe another path is to have a flexible thread pool: instead of using a fixed number like 32 threads, increase or decrease the count depending on the current speed — if many requests are already in flight, don't start more; if not, do.
measuring the byte/s could help estimate this
New idea:
Another idea: use timeout arg of join https://docs.python.org/3/library/threading.html#threading.Thread.join It ignores the thread if it doesn't join in time
So simply put the thread in bad pool and continue starting more threads
reproducing example:
import urllib.request
import time
import io
# Sample (key, url) pairs used to reproduce slow downloads.
# The integer key mimics a dataset row index; URLs point at small thumbnails.
example_urls = [
(12, 'http://www.herteldenbirname.com/wp-content/uploads/2014/05/Italia-Independent-Flocked-Aviator-Sunglasses-150x150.jpg'),
(124, 'http://image.rakuten.co.jp/sneak/cabinet/shoes-03/cr-ucrocs5-a.jpg?_ex=128x128'),
(146, 'http://www.slicingupeyeballs.com/wp-content/uploads/2009/05/stoneroses452.jpg'),
(122, 'https://media.mwcradio.com/mimesis/2013-03/01/2013-03-01T153415Z_1_CBRE920179600_RTROPTP_3_TECH-US-GERMANY-EREADER_JPG_475x310_q85.jpg'),
(282, 'https://8d1aee3bcc.site.internapcdn.net/00/images/media/5/5cfb2eba8f1f6244c6f7e261b9320a90-1.jpg'),
(298, 'https://my-furniture.com.au/media/catalog/product/cache/1/small_image/295x295/9df78eab33525d08d6e5fb8d27136e95/a/u/au0019-stool-01.jpg'),
(300, 'http://images.tastespotting.com/thumbnails/889506.jpg'),
(330, 'https://www.infoworld.pk/wp-content/uploads/2016/02/Cool-HD-Valentines-Day-Wallpapers-480x300.jpeg'),
(361, 'http://pendantscarf.com/image/cache/data/necklace/JW0013-(2)-150x150.jpg'),
(408, 'https://www.solidrop.net/photo-6/animorphia-coloring-books-for-adults-children-drawing-book-secret-garden-style-relieve-stress-graffiti-painting-book.jpg'),
]
def download_image(row, timeout):
    """Download an image with urllib.

    row is a (key, url) pair; timeout is passed to urlopen in seconds.
    Returns (key, stream, error, duration): on success `stream` is a BytesIO
    of the body and `error` is None; on failure `stream` is None and `error`
    is the stringified exception. `duration` is the elapsed wall time.
    """
    key, url = row
    agent = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0"
    stream = None
    started = time.time()
    try:
        req = urllib.request.Request(url, data=None, headers={"User-Agent": agent})
        with urllib.request.urlopen(req, timeout=timeout) as response:
            stream = io.BytesIO(response.read())
        return key, stream, None, time.time() - started
    except Exception as err:  # pylint: disable=broad-except
        # stream is only non-None if the read succeeded; close it defensively.
        if stream is not None:
            stream.close()
        return key, None, str(err), time.time() - started
def main():
    """Sequentially download every example URL with a 2s timeout and print each result."""
    for row in example_urls:
        outcome = download_image(row, 2)
        print(outcome)


if __name__ == "__main__":
    main()
one try:
import queue
from threading import Thread
import time
class GoodBadPool:
    """Thread pool that sidelines slow ("bad") worker threads instead of blocking on them.

    Items are pulled from `generator` and each is run by `runner` in its own
    thread. A thread that finishes within `timeout` seconds delivers its
    result to the output queue; a slower thread is reported as a timeout
    tuple (key, None, "timeout") and moved to a "bad" pool, where it is only
    ever reaped opportunistically with join(0) so it never stalls progress.
    """

    def __init__(self, generator, runner, timeout, pool_size, out_queue_max) -> None:
        self.generator = generator          # iterator of items; item[0] is the key
        self.runner = runner                # callable(item) -> result tuple
        self.timeout = timeout              # seconds before a thread is declared slow
        self.pool_size = pool_size          # max number of concurrently "good" threads
        self.out_queue_max = out_queue_max  # back-pressure threshold on pending results
        self.results = []
        self.good_threads = []              # list of (start_time, key, Thread)
        self.bad_threads = []               # timed-out threads, reaped lazily
        self.outqueue = queue.SimpleQueue()
        self.good_done = {}                 # keys whose result was delivered in time
        self.item_left = True               # False once the generator is exhausted

    def call(self, start_time, item):
        """Thread target: run one item and publish its result only if still in time.

        A result arriving after `timeout` is dropped, because cleanup has
        already emitted a timeout tuple for this key.
        """
        result = self.runner(item)
        key = item[0]
        if time.time() - start_time < self.timeout:
            self.outqueue.put(result)
            self.good_done[key] = True

    def cleanup_bad_threads(self):
        """Reap finished bad threads without blocking (join with zero timeout)."""
        still_bad_threads = []
        for thread in self.bad_threads:
            thread.join(0)
            if thread.is_alive():
                still_bad_threads.append(thread)
        self.bad_threads = still_bad_threads

    def cleanup_good_threads(self):
        """Reap finished good threads and move slow ones to the bad pool."""
        still_good_threads = []
        for start_time, key, thread in self.good_threads:
            if key in self.good_done:
                # Result already delivered; thread is done (or about to be).
                thread.join(0)
                del self.good_done[key]
                continue
            if time.time() - start_time > self.timeout:
                # Report the timeout now; the thread keeps running in the bad pool.
                self.outqueue.put((key, None, "timeout"))
                self.bad_threads.append(thread)
            else:
                still_good_threads.append((start_time, key, thread))
        self.good_threads = still_good_threads

    def provider(self):
        """Loops infinitely, if we need new values, try to get them
        1. clean up bad threads (join them)
        2. clean up good threads by moving the slow ones to bad threads
        3. start new threads if possible
        """
        while True:
            if self.outqueue.qsize() > self.out_queue_max:
                # Consumer is behind; apply back-pressure.
                time.sleep(0.1)
                continue
            self.cleanup_bad_threads()
            self.cleanup_good_threads()
            if self.item_left and len(self.good_threads) < self.pool_size:
                try:
                    item = next(self.generator)
                    key = item[0]
                except StopIteration:
                    self.item_left = False
                    continue
                start_time = time.time()
                thread = Thread(target=self.call, args=(start_time, item))
                thread.start()
                self.good_threads.append((start_time, key, thread))
            else:
                if not self.good_threads and not self.item_left:
                    # No work left and nothing in flight: signal end of stream.
                    self.outqueue.put(None)
                    return
                time.sleep(0.1)

    def run(self):
        """Yield results as they arrive; terminates when the None sentinel is seen."""
        t = Thread(target=self.provider)
        t.start()
        while True:
            item = self.outqueue.get()
            if item is None:
                break
            yield item
        # provider() has already returned once the sentinel was consumed, so
        # a full join is safe and actually reaps the thread (join(0) did not).
        t.join()
But that actually doesn't seem to help much — something else must be slowing things down.
I think the only reasonable paths forward here are
2 new ideas:
from pycurl import Curl
import pycurl
from io import BytesIO
import time
def download_image(row, timeout):
    """Download an image with pycurl.

    row is a (key, url) pair; timeout is the whole-transfer limit in seconds.
    Returns (key, bytes, None) on success or (key, None, error_message) on
    failure. SSL verification is deliberately disabled to match the
    benchmark's permissive urllib behavior.
    """
    key, url = row
    user_agent_string = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0"
    mycurl = None
    try:
        mycurl = Curl()
        mycurl.setopt(pycurl.SSL_VERIFYPEER, 0)
        mycurl.setopt(pycurl.SSL_VERIFYHOST, 0)
        mycurl.setopt(pycurl.TIMEOUT, timeout)
        mycurl.setopt(pycurl.URL, url)
        body = BytesIO()
        mycurl.setopt(pycurl.WRITEFUNCTION, body.write)
        mycurl.setopt(pycurl.USERAGENT, user_agent_string)
        mycurl.perform()
        val = body.getvalue()
        body.close()
        return key, val, None
    except Exception as e:  # pylint: disable=broad-except
        return key, None, str(e)
    finally:
        # Release the libcurl handle; the original leaked one handle per call.
        if mycurl is not None:
            mycurl.close()
# Same sample (key, url) pairs as the urllib reproduction above, reused to
# benchmark the pycurl variant against identical inputs.
example_urls = [
(12, 'http://www.herteldenbirname.com/wp-content/uploads/2014/05/Italia-Independent-Flocked-Aviator-Sunglasses-150x150.jpg'),
(124, 'http://image.rakuten.co.jp/sneak/cabinet/shoes-03/cr-ucrocs5-a.jpg?_ex=128x128'),
(146, 'http://www.slicingupeyeballs.com/wp-content/uploads/2009/05/stoneroses452.jpg'),
(122, 'https://media.mwcradio.com/mimesis/2013-03/01/2013-03-01T153415Z_1_CBRE920179600_RTROPTP_3_TECH-US-GERMANY-EREADER_JPG_475x310_q85.jpg'),
(282, 'https://8d1aee3bcc.site.internapcdn.net/00/images/media/5/5cfb2eba8f1f6244c6f7e261b9320a90-1.jpg'),
(298, 'https://my-furniture.com.au/media/catalog/product/cache/1/small_image/295x295/9df78eab33525d08d6e5fb8d27136e95/a/u/au0019-stool-01.jpg'),
(300, 'http://images.tastespotting.com/thumbnails/889506.jpg'),
(330, 'https://www.infoworld.pk/wp-content/uploads/2016/02/Cool-HD-Valentines-Day-Wallpapers-480x300.jpeg'),
(361, 'http://pendantscarf.com/image/cache/data/necklace/JW0013-(2)-150x150.jpg'),
(408, 'https://www.solidrop.net/photo-6/animorphia-coloring-books-for-adults-children-drawing-book-secret-garden-style-relieve-stress-graffiti-painting-book.jpg'),
]
# Time each download sequentially to compare pycurl against the urllib version.
for row in example_urls:
    t0 = time.time()
    result = download_image(row, 2)
    print(time.time() - t0)
important:
curl is not faster
tinyproxy solution is working !!
instructions:
proxies = {'http': 'http://127.0.0.1:8888', 'https': 'https://127.0.0.1:8888'}
proxy_support = urllib.request.ProxyHandler(proxies)
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)
in download
2x faster
but success rate is low...
New idea: maintain a pool of fast domains and a pool of slow domains. Give more threads to the slow domains and/or discard them; compute statistics live.
New idea: keep track of domains that fail to resolve and don't even try to download from those next time. Also, maybe limit the number of redirects.
Writing down the download time for each url in metadata can also help here for further analysis
to compute domain stats:
import glob
from urllib.parse import urlparse
import pandas as pd
# Aggregate per-domain download statistics from the metadata parquet shards
# in the current directory, then show the 10 slowest domains by mean duration.
parquet_paths = glob.glob("*.parquet")
df = pd.concat([pd.read_parquet(path) for path in parquet_paths])
df["domain"] = df["url"].apply(lambda url: urlparse(url).netloc)
stats = df[["domain", "duration"]]
stats.groupby("domain").agg({'domain': 'size', 'duration': 'mean'}).sort_values("duration")[-10:]
but seems like longer urls (up to 40s...) are from unique domains among 10k items. Checking among more.
Among 160k items there seems to be no clear correlation between domain and duration.
ok only reasonable way forward here is to decouple completely the downloading from the rest, and then set up a clean benchmark on "here is 100 shards of 1000 items, how can you get them fast using whatever technology (other languages, libs, ...)"
I implemented some new metrics and found that many URLs time out after 20s, which clearly slows everything down.
here is some examples: Downloaded (12, 'http://www.herteldenbirname.com/wp-content/uploads/2014/05/Italia-Independent-Flocked-Aviator-Sunglasses-150x150.jpg') in 10.019284009933472 Downloaded (124, 'http://image.rakuten.co.jp/sneak/cabinet/shoes-03/cr-ucrocs5-a.jpg?_ex=128x128') in 10.01184344291687 Downloaded (146, 'http://www.slicingupeyeballs.com/wp-content/uploads/2009/05/stoneroses452.jpg') in 10.006474256515503 Downloaded (122, 'https://media.mwcradio.com/mimesis/2013-03/01/2013-03-01T153415Z_1_CBRE920179600_RTROPTP_3_TECH-US-GERMANY-EREADER_JPG_475x310_q85.jpg') in 10.241626739501953 Downloaded (282, 'https://8d1aee3bcc.site.internapcdn.net/00/images/media/5/5cfb2eba8f1f6244c6f7e261b9320a90-1.jpg') in 10.431355476379395 Downloaded (298, 'https://my-furniture.com.au/media/catalog/product/cache/1/small_image/295x295/9df78eab33525d08d6e5fb8d27136e95/a/u/au0019-stool-01.jpg') in 10.005694150924683 Downloaded (300, 'http://images.tastespotting.com/thumbnails/889506.jpg') in 10.007027387619019 Downloaded (330, 'https://www.infoworld.pk/wp-content/uploads/2016/02/Cool-HD-Valentines-Day-Wallpapers-480x300.jpeg') in 10.004335880279541 Downloaded (361, 'http://pendantscarf.com/image/cache/data/necklace/JW0013-(2)-150x150.jpg') in 10.00539231300354 Downloaded (408, 'https://www.solidrop.net/photo-6/animorphia-coloring-books-for-adults-children-drawing-book-secret-garden-style-relieve-stress-graffiti-painting-book.jpg') in 10.004313945770264
Let's try to implement request timeout
I tried #153 , eventlet and #260 and none of them can timeout properly
A good value for timeout is 2s