bbolli / tumblr-utils

Utilities for dealing with Tumblr blogs, Tumblr backup
GNU General Public License v3.0
667 stars 124 forks source link

[Enhancement] Threading #24

Closed WyohKnott closed 9 years ago

WyohKnott commented 9 years ago

We could improve the backup speed by decoupling post fetching from image saving. Let me know what you think about it.

TODO: add an option to control the number of threads. TODO: fix the output messages.

--- tumblr_backup.py.orig   2014-10-01 09:57:32.142436800 +0200
+++ tumblr_backup_threading.py  2014-10-01 16:51:47.713677400 +0200
@@ -5,6 +5,8 @@
 from __future__ import with_statement
 import os
 from os.path import join, split, splitext
+import threading
+from threading import Thread, Event
 import sys
 import urllib
 import urllib2
@@ -68,6 +70,7 @@
 have_custom_css = False

 MAX_POSTS = 50
+imagePool = {}

 # ensure the right date/time format
 try:
@@ -199,6 +202,57 @@
         sys.stderr.write('Writing metadata failed for tags: %s in: %s\n' % (tags, image_name))

+def get_image_url(self, image_url, offset):
+   """Saves an image if not saved yet. Returns the new URL or
+   the original URL in case of download errors."""
+
+   def _url(fn):
+       return u'%s%s/%s' % (save_dir, image_dir, fn)
+
+   def _addexif(fn):
+       if options.exif and fn.endswith('.jpg'):
+           add_exif(fn, set(self.tags))
+
+   # determine the image file name
+   offset = '_' + offset if offset else ''
+   if options.image_names == 'i':
+       image_filename = self.ident + offset
+   elif options.image_names == 'bi':
+       image_filename = account + '_' + self.ident + offset
+   else:
+       image_filename = image_url.split('/')[-1]
+   glob_filter = '' if '.' in image_filename else '.*'
+   # check if a file with this name already exists
+   image_glob = glob(join(image_folder, image_filename + glob_filter))
+   if image_glob:
+       _addexif(image_glob[0])
+   # download the image data
+   try:
+       image_response = urllib2.urlopen(image_url)
+   except urllib2.HTTPError:
+       # return the original URL
+       return image_url
+   try:
+       image_data = image_response.read()
+   except urllib2.HTTPError:
+       # return the original URL
+       return image_url
+   image_response.close()
+   # determine the file type if it's unknown
+   if '.' not in image_filename:
+       image_type = imghdr.what(None, image_data[:32])
+       if image_type:
+           image_filename += '.' + image_type.replace('jpeg', 'jpg')
+   # save the image
+   with open_image(image_dir, image_filename) as image_file:
+       image_file.write(image_data)
+   _addexif(join(image_folder, image_filename))
+
+
+def add_to_pool(self, image_url, offset):
+    imagePool[image_url] = [self, offset];
+
+
 def save_style():
     with open_text(backup_css) as css:
         css.write('''\
@@ -396,7 +450,16 @@
                 self.post_count += 1
             return True

-        # Get the XML entries from the API, which we can only do for max 50 posts at once.
+        poolThreads = []
+        quitEvent = Event()
+        for t in range(0,7):
+            ts = SavePool(quitEvent)
+            ts.daemon = True
+            poolThreads.append(ts)
+        for j in poolThreads:
+            j.start()
+
+       # Get the XML entries from the API, which we can only do for max 50 posts at once.
         # Posts "arrive" in reverse chronological order. Post #0 is the most recent one.
         i = options.skip
         while i < last_post:
@@ -426,6 +489,9 @@

         log(account, "%d posts backed up\n" % self.post_count)
         self.total_count += self.post_count
+        quitEvent.set()
+        while (threading.activeCount() > 1):
+            time.sleep(1)

 class TumblrPost:
@@ -481,7 +547,8 @@
             url = escape(get_try('photo-link-url'))
             for p in post.photoset['photo':] if hasattr(post, 'photoset') else [post]:
                 src = unicode(p['photo-url'])
-                append(escape(self.get_image_url(src, p().get('offset'))), u'<img alt="" src="%s">')
+                add_to_pool(self, src, p().get('offset'))
+                append(escape(self.get_image_filename(src, p().get('offset'))), u'<img alt="" src="%s">')
                 if url:
                     content[-1] = u'<a href="%s">%s</a>' % (url, content[-1])
                 content[-1] = '<p>' + content[-1] + '</p>'
@@ -541,18 +608,7 @@
         for p in ('<p>(<(%s)>)', '(</(%s)>)</p>'):
             self.content = re.sub(p % 'p|ol|iframe[^>]*', r'\1', self.content)

-    def get_image_url(self, image_url, offset):
-        """Saves an image if not saved yet. Returns the new URL or
-        the original URL in case of download errors."""
-
-        def _url(fn):
-            return u'%s%s/%s' % (save_dir, image_dir, fn)
-
-        def _addexif(fn):
-            if options.exif and fn.endswith('.jpg'):
-                add_exif(fn, set(self.tags))
-
-        # determine the image file name
+    def get_image_filename(self, image_url, offset):
         offset = '_' + offset if offset else ''
         if options.image_names == 'i':
             image_filename = self.ident + offset
@@ -560,30 +616,8 @@
             image_filename = account + '_' + self.ident + offset
         else:
             image_filename = image_url.split('/')[-1]
-        glob_filter = '' if '.' in image_filename else '.*'
-        # check if a file with this name already exists
-        image_glob = glob(join(image_folder, image_filename + glob_filter))
-        if image_glob:
-            _addexif(image_glob[0])
-            return _url(split(image_glob[0])[1])
-        # download the image data
-        try:
-            image_response = urllib2.urlopen(image_url)
-        except urllib2.HTTPError:
-            # return the original URL
-            return image_url
-        image_data = image_response.read()
-        image_response.close()
-        # determine the file type if it's unknown
-        if '.' not in image_filename:
-            image_type = imghdr.what(None, image_data[:32])
-            if image_type:
-                image_filename += '.' + image_type.replace('jpeg', 'jpg')
-        # save the image
-        with open_image(image_dir, image_filename) as image_file:
-            image_file.write(image_data)
-        _addexif(join(image_folder, image_filename))
-        return _url(image_filename)
+        return u'%s%s/%s' % (save_dir, image_dir, image_filename + image_url[-4:])
+

     def get_post(self):
         """returns this post in HTML"""
@@ -621,6 +655,21 @@
                 f.write(self.xml_content)

+class SavePool(threading.Thread):
+    def __init__(self, quit):
+        threading.Thread.__init__(self)
+        self.quit = quit
+    def run(self):
+        imagecounter = 0
+        while not self.quit.isSet() or imagePool:
+            if imagePool:
+                key, value = imagePool.popitem()
+                get_image_url(value[0],key,value[1])
+                log(account, "%d images remaining to save\r" % (len(imagePool)))
+                imagecounter += 1 
+        log(account, "%d images backed up\n" % imagecounter)
+
+
 class BlosxomPost(TumblrPost):

     def get_image_url(self, image_url, offset):
bbolli commented 9 years ago

Hi! Thanks for the idea and initial implementation. There are a few points I have to think about, though.

But, as I said, I like the idea and will look into it.

WyohKnott commented 9 years ago

I've got another idea that might require rewriting what I've just posted, though — it's just a general draft.

I've never really used threads before, so I'm not sure what the dos and don'ts are yet.

Thanks for the comment.