clear-datacenter / plan


Build a test dataset using Google/Bing image search #11

wanghaisheng opened this issue 8 years ago

wanghaisheng commented 8 years ago

https://github.com/syurchi/google-image-scraper

http://stackoverflow.com/questions/35242151/batch-download-google-images-with-tags http://stackoverflow.com/questions/4082966/what-are-the-alternatives-now-that-the-google-web-search-api-has-been-deprecated http://stackoverflow.com/questions/25133865/download-images-from-google-image-search-python http://stackoverflow.com/questions/32035973/in-python-is-there-a-way-i-can-download-all-some-the-image-files-e-g-jpg-png?rq=1
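
Since the Google Web Search API discussed in the second link is deprecated, the documented replacement is the Custom Search JSON API. A minimal sketch of an image query against it, assuming you have created an API key and a custom search engine ID (the API_KEY and CX placeholders below):

# Minimal sketch: image search via the Google Custom Search JSON API
# (the documented replacement for the deprecated Web Search API).
# API_KEY and CX are placeholders created in the Google developer console.
import requests

API_KEY = '<your API key>'
CX = '<your custom search engine id>'

def google_image_urls(query, num=10):
    params = {'key': API_KEY, 'cx': CX, 'q': query,
              'searchType': 'image', 'num': num}  # num is capped at 10 per request
    r = requests.get('https://www.googleapis.com/customsearch/v1', params=params)
    r.raise_for_status()
    return [item['link'] for item in r.json().get('items', [])]

print(google_image_urls('Linkin Park'))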

wanghaisheng commented 8 years ago
#!/bin/bash
# DESCRIPTION
#    This is a Google Image batch download tool. It takes a search query as input,
#    resulting in a folder containing the Google Image search results (usually a couple of images).
# SYNOPSIS
#    ./GoogleImageSearch.sh QUERY
# EXAMPLE
#    ./GoogleImageSearch.sh 'Linkin Park'

echo "searching Google Image for $1 ...";
# replace spaces with '+', e.g. "Linkin Park" -> "Linkin+Park"
query=$(echo "$1" | sed 's/ /+/g');
#echo "$query"
url="http://www.google.com.hk/search?q=$query&tbm=isch&sout=1&tbs=isz:ex,iszw:600,iszh:600";
echo "$url";

# Step 1: use w3m to download the webpage source
w3m -dump_source "$url" > GoogleImageSearch.html;

# Step 2: extract the image URLs (imgurl=...) from the page source
  #insert newline in front of where string "imgurl" appears
  awk '{gsub(/imgurl/,"\nimgurl");print}' < GoogleImageSearch.html > newline_imgurl;
  #insert newline at the end of where string "jpg" or "png" appears
  awk '{gsub(/jpg/,"jpg\n");print}' < newline_imgurl > newline_jpg;
  awk '{gsub(/png/,"png\n");print}' < newline_jpg > newline_png;
  #grep imgurls
  grep -E "(imgurl=http:[-/.[:alnum:]]*jpg|imgurl=http:[-/.[:alnum:]]*png)" newline_png > remove_imgurl;
  #remove string "imgurl=", left pure url list
  awk '{gsub(/imgurl=/,"");print}' < remove_imgurl > urlList;
  #clean up temporary files
  rm newline_imgurl newline_jpg newline_png remove_imgurl;
  # to examine url list: remove '#' below
  #vi urlList;

# Step 3: download the image files (input: urlList; retries: 2; output: folder $query)
wget -i urlList -t 2 -P "$query";
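
The three awk passes plus grep in Step 2 amount to a single regular-expression scan; an equivalent sketch in Python over the saved page, assuming the same imgurl=... markup the grep above targets:

# The Step 2 pipeline as one regex scan: pull imgurl=...jpg/png out of the
# saved search page and write one URL per line to urlList.
import re

with open('GoogleImageSearch.html') as f:
    html = f.read()
urls = re.findall(r'imgurl=(http:[-/.A-Za-z0-9]*(?:jpg|png))', html)
with open('urlList', 'w') as out:
    out.write('\n'.join(urls))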
wanghaisheng commented 8 years ago
#!/usr/bin/env python

'''
Query Bing image search and download the resulting images by scraping.
(The function is named after Google Image Search, but the URL it hits is Bing's.)

To use this script, install the mechanize and BeautifulSoup packages:
easy_install mechanize
easy_install BeautifulSoup

Example run:
installQueriedGoogleImages('spotty')

Eren Golge erengolge@gmail.com - www.erengolge.com - 17 April 2013
'''

import urllib
import mechanize
import cookielib
import re
import sys
import os
from BeautifulSoup import BeautifulSoup

def installQueriedGoogleImages(query):
    br = mechanize.Browser()
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)

    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)

    # Follows refresh 0 but not hangs on refresh > 0
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    # Want debugging messages?
    #br.set_debug_http(True)
    #br.set_debug_redirects(True)
    #br.set_debug_responses(True)

    # User-Agent (this is cheating, ok?)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

    main_url = 'http://www.bing.com/images/search?q=' + query
    r = br.open(main_url)
    counter = 1
    # The original wrapped the block below in `for i in range(6):`, but a
    # response body can only be read once, so only the first page is parsed.
    html = r.read()
    soup = BeautifulSoup(html)
    divs = soup.findAll('div', {'class': "dg_u"})
    for div in divs:
        links = div.findAll('a')
        link = links[0]
        text = link['m']
        img_link = re.search('imgurl:"([^"]+)', text).group(1)
        print 'Downloading image %d - %s ...' % (counter, img_link)
        try:
            ext = img_link[-4:]
            download_photo(img_link, query + '/image' + str(counter) + ext)
        except IOError:
            print 'image %d could not be downloaded because of a server error!' % counter
        except UnicodeError:
            print 'image %d could not be downloaded because of the naming of the website!' % counter
        counter += 1

def download_photo(img_url, filename):
    try:
        image_on_web = urllib.urlopen(img_url)
        if image_on_web.headers.maintype == 'image':
            buf = image_on_web.read()
            downloaded_image = file(filename, "wb")
            downloaded_image.write(buf)
            downloaded_image.close()
            image_on_web.close()
        else:
            return False    
    except:
        return False
    return True

if __name__ == '__main__':
    # each command-line argument is a separate query; one folder is created per query
    for query in sys.argv[1:]:
        if not os.path.exists(query):
            os.makedirs(query)
        installQueriedGoogleImages(query)
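
Usage, per the __main__ block above (the file name is arbitrary): python scraper.py spotty dog creates the folders spotty/ and dog/ and fills each with the images scraped for that query.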
wanghaisheng commented 8 years ago
# coding: utf-8

import sys
import os
import urllib
import urllib2
import json
import requests

KEY = '<Your Bing Developer Key>'
OUTPUT = '/images/bing/'
MAX = 100
count = 1

def bing_search(query, directory, skip):
    global count

    bing_url = 'https://api.datamarket.azure.com/Bing/Search/v1/Image'
    print 'search count: ' + str(count) + ', url: ' + bing_url + ', skip: ' + str(skip)
    pm = urllib2.HTTPPasswordMgrWithDefaultRealm()
    pm.add_password(None, bing_url, KEY, KEY)

    handler = urllib2.HTTPBasicAuthHandler(pm)
    opener = urllib2.build_opener(handler)
    urllib2.install_opener(opener)
    if skip > 0:
        params = urllib.urlencode({'Query': "'" + query + "'", 'Adult': "'Off'", '$skip': skip, '$format': 'json'})
    else:
        params = urllib.urlencode({'Query': "'" + query + "'", 'Adult': "'Off'", '$format': 'json'})
    response = urllib2.urlopen(bing_url + '?' + params)
    data = json.loads(response.read())

    results = data['d']['results']
    if not results:  # no more results: stop instead of recursing forever
        return

    for item in results:
        if count > MAX:
            print 'finished. count: ' + str(MAX)
            return

        image_url = item['MediaUrl']
        root, ext = os.path.splitext(image_url)
        if ext.lower() == '.jpg':
            # build the file name before the try block so the except branch can print it
            fname = OUTPUT + directory + "/bing%04d.jpg" % count
            print image_url,
            try:
                r = requests.get(image_url)
                f = open(fname, 'wb')
                f.write(r.content)
                f.close()
                print "...saved", fname
            except:
                print "error", fname
            count += 1

    bing_search(query, directory, count)

if __name__ == '__main__':
    argvs = sys.argv
    argc = len(argvs)
    if argc != 3:
        print 'Usage: python %s query directory' % argvs[0]
        quit()
    query = argvs[1]
    directory = argvs[2]
    print 'get bing images: %s' % query
    bing_search(query, directory, 0)
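
Note: the DataMarket endpoint used above has since been retired by Microsoft; the Bing Search APIs now live under Cognitive Services. A rough sketch of the equivalent call against the v7 Image Search endpoint; the endpoint, header, and response field names below reflect the current service and are not part of the original script:

# Rough v7 equivalent of the DataMarket call above (the old API is retired).
# SUBSCRIPTION_KEY is a Cognitive Services key; 'value'/'contentUrl' are the
# v7 response fields corresponding to 'd'/'results'/'MediaUrl'.
import requests

SUBSCRIPTION_KEY = '<your Cognitive Services key>'
SEARCH_URL = 'https://api.cognitive.microsoft.com/bing/v7.0/images/search'

def bing_image_urls(query, count=50, offset=0):
    headers = {'Ocp-Apim-Subscription-Key': SUBSCRIPTION_KEY}
    params = {'q': query, 'count': count, 'offset': offset}
    r = requests.get(SEARCH_URL, headers=headers, params=params)
    r.raise_for_status()
    return [img['contentUrl'] for img in r.json().get('value', [])]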
wanghaisheng commented 8 years ago

Bulk Bing Image Downloader

Bulk Bing Image Downloader (BBID) is a downloader that fetches full-size images from Bing image search results; per the script below, it is multithreaded, skips MD5-duplicate images, and keeps a download history so interrupted runs can be resumed.

Usage

chmod +x bbid.py
./bbid.py [-h] [-s SEARCH_STRING] [-f SEARCH_FILE] [-o OUTPUT] [--filter] [--no-filter]

Example

./bbid.py -s earth

#!/usr/bin/env python3
import os, sys, urllib.request, re, threading, posixpath, urllib.parse, argparse, atexit, random, socket, time, hashlib, pickle, signal, subprocess

#config
output_dir = './bing' #default output dir
adult_filter = True #Do not disable adult filter by default
pool_sema = threading.BoundedSemaphore(value = 20) #max number of download threads
bingcount = 35 #default bing paging
socket.setdefaulttimeout(2)

in_progress = []
tried_urls = []
finished_keywords=[]
failed_urls = []
image_md5s = {}
urlopenheader={ 'User-Agent' : 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:42.0) Gecko/20100101 Firefox/42.0'}
def download(url,output_dir,retry=False):
    global tried_urls, failed_urls
    url_hash=hashlib.sha224(url.encode('utf-8')).digest()
    if url_hash in tried_urls:
        return
    pool_sema.acquire() 
    path = urllib.parse.urlsplit(url).path
    filename = posixpath.basename(path)
    if len(filename)>40:
        filename=filename[:36]+filename[-4:]
    while os.path.exists(output_dir + '/' + filename):
        filename = str(random.randint(0,100)) + filename
    in_progress.append(filename)
    try:
        request=urllib.request.Request(url,None,urlopenheader)
        image=urllib.request.urlopen(request).read()
        if len(image)==0:
            print('no image')

        md5 = hashlib.md5()
        md5.update(image)
        md5_key = md5.hexdigest()
        if md5_key in image_md5s:
            print('FAIL Image is a duplicate of ' + image_md5s[md5_key] + ', not saving ' + filename)
            in_progress.remove(filename)
            pool_sema.release()  # fix: the original returned without releasing the semaphore
            return

        image_md5s[md5_key] = filename

        imagefile=open(output_dir + '/' + filename,'wb')
        imagefile.write(image)
        imagefile.close()
        in_progress.remove(filename)
        if retry:
            print('Retry OK '+ filename)
        else:
            print("OK " + filename)
        tried_urls.append(url_hash)
    except Exception as e:
        if retry:
            print('Retry Fail ' + filename)
        else:
            print("FAIL " + filename)
            failed_urls.append((url, output_dir))
    pool_sema.release()

def removeNotFinished():
    for filename in in_progress:
        try:
            os.remove(output_dir + '/' + filename)
        except FileNotFoundError:
            pass

def fetch_images_from_keyword(keyword,output_dir):
    current = 1
    last = ''
    while True:
        request_url='https://www.bing.com/images/async?q=' + urllib.parse.quote_plus(keyword) + '&async=content&first=' + str(current) + '&adlt=' + adlt
        request=urllib.request.Request(request_url,None,headers=urlopenheader)
        response=urllib.request.urlopen(request)
        html = response.read().decode('utf8')
        links = re.findall('imgurl:&quot;(.*?)&quot;',html)
        try:
            if links[-1] == last:
                break
            last = links[-1]
            current += bingcount
            for link in links:
                t = threading.Thread(target = download,args = (link,output_dir))
                t.start()
        except IndexError:
            print('No search results for "{0}"'.format(keyword))
            return False
        time.sleep(0.1)
    return True

def backup_history(*args):
    download_history=open(output_dir + '/download_history.pickle','wb')
    pickle.dump(tried_urls,download_history)
    pickle.dump(finished_keywords, download_history)
    pickle.dump(image_md5s, download_history)
    download_history.close()
    print('history_dumped')
    if args:
        exit(0)

if __name__ == "__main__":
    atexit.register(removeNotFinished)
    parser = argparse.ArgumentParser(description = 'Bing image bulk downloader')
    parser.add_argument('-s', '--search-string', help = 'Keyword to search', required = False)
    parser.add_argument('-f', '--search-file', help = 'Path to a file containing search strings line by line', required = False)
    parser.add_argument('-o', '--output', help = 'Output directory', required = False)
    parser.add_argument('--filter', help = 'Enable adult filter', action = 'store_true', required = False)
    parser.add_argument('--no-filter', help=  'Disable adult filter', action = 'store_true', required = False)
    args = parser.parse_args()
    if (not args.search_string) and (not args.search_file):
        parser.error('Provide Either search string or path to file containing search strings')
    if args.output:
        output_dir = args.output
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    output_dir_origin = output_dir
    signal.signal(signal.SIGINT, backup_history)
    try:
        download_history=open(output_dir + '/download_history.pickle','rb')
        tried_urls=pickle.load(download_history)
        finished_keywords=pickle.load(download_history)
        image_md5s=pickle.load(download_history)
        download_history.close()
    except (OSError, IOError):
        tried_urls=[]
    if adult_filter:
        adlt = ''
    else:
        adlt = 'off'
    if args.no_filter:
        adlt = 'off'
    elif args.filter:
        adlt = ''
    if args.search_string:
        keyword = args.search_string
        fetch_images_from_keyword(args.search_string,output_dir)
    elif args.search_file:
        try:
            inputFile=open(args.search_file)
        except (OSError, IOError):
            print("Couldn't open file {}".format(args.search_file))
            exit(1)
        for keyword in inputFile.readlines():
            keyword_hash=hashlib.sha224(keyword.strip().encode('utf-8')).digest()
            if keyword_hash in finished_keywords:
                print('"{0}" Already downloaded'.format(keyword.strip()))
                continue
            output_dir = output_dir_origin + '/' + keyword.strip().replace(' ','_')
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            if fetch_images_from_keyword(keyword,output_dir):
                finished_keywords.append(keyword_hash)
                for failed_url in failed_urls:
                    t = threading.Thread(target = download,args = (failed_url[0],failed_url[1],True))
                    t.start()
                failed_urls=[]
            backup_history()
        inputFile.close()
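
For building a multi-class dataset, put one keyword per line in a text file and pass it with -f; each keyword then gets its own subdirectory under the output directory (keywords.txt and ./dataset are placeholder names):

./bbid.py -f keywords.txt -o ./dataset --no-filter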