wanghaisheng opened this issue 8 years ago
# DESCRIPTION
# This is a Google Image batch download tool: it takes a search query as input
# and produces a folder containing the Google Image search results (usually a couple of images).
# SYNOPSIS
# ./GoogleImageSearch.sh QUERY
# EXAMPLE
# ./GoogleImageSearch.sh 'Linkin Park'
echo "searching Google Image for $1 ...";
# replace spaces with '+', e.g. "Linkin Park" -> "Linkin+Park"
query=$(echo "$1" | sed 's/ /+/g');
#echo "$query"
url="http://www.google.com.hk/search?q=$query&tbm=isch&sout=1&tbs=isz:ex,iszw:600,iszh:600";
echo "$url";
# Step 1: use w3m to download the page source
w3m -dump_source "$url" > GoogleImageSearch.html;
# Step 2: extract image URLs from the page source
# insert a newline before each occurrence of the string "imgurl"
awk '{gsub(/imgurl/,"\nimgurl");print}' < GoogleImageSearch.html > newline_imgurl;
# insert a newline after each occurrence of "jpg" or "png"
awk '{gsub(/jpg/,"jpg\n");print}' < newline_imgurl > newline_jpg;
awk '{gsub(/png/,"png\n");print}' < newline_jpg > newline_png;
# grep the image URLs (http or https)
grep -E "(imgurl=https?:[-/.[:alnum:]]*jpg|imgurl=https?:[-/.[:alnum:]]*png)" newline_png > remove_imgurl;
# strip the "imgurl=" prefix, leaving a plain URL list
awk '{gsub(/imgurl=/,"");print}' < remove_imgurl > urlList;
# clean up intermediate files
rm newline_imgurl newline_jpg newline_png remove_imgurl;
# to examine the URL list, uncomment the line below
#vi urlList;
# Step 3: download the images (input: urlList; retries: 2; output: folder $query)
wget -i urlList -t 2 -P "$query";
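
Note that the sed step above only handles spaces; a query containing &, #, or non-ASCII characters would still produce a broken URL. A minimal Python 3 sketch of a safer URL builder (build_search_url is my name for it, not part of the script above):

# build_search_url is a hypothetical helper, not part of the script:
# urlencode handles '&', '#', spaces, and non-ASCII, not just spaces.
import urllib.parse

def build_search_url(query):
    params = urllib.parse.urlencode({
        'q': query,                          # the raw query, safely encoded
        'tbm': 'isch',                       # image search
        'sout': '1',
        'tbs': 'isz:ex,iszw:600,iszh:600',   # exact 600x600 size filter
    })
    return 'http://www.google.com.hk/search?' + params

print(build_search_url('Linkin Park'))  # ...?q=Linkin+Park&tbm=isch&...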
#!/usr/bin/env python
'''
Query Bing Image Search (the function name says Google, but main_url below
points at bing.com) and download the resulting images by scraping.
To use this script, install the mechanize and BeautifulSoup packages:
easy_install mechanize
easy_install BeautifulSoup
Example run:
installQueriedGoogleImages('spotty')
Eren Golge erengolge@gmail.com - www.erengolge.com - 17 April 2013
'''
import json
import pdb
import urllib
import mechanize
import cookielib
import re
import sys
import os
from BeautifulSoup import BeautifulSoup
def installQueriedGoogleImages(query):
    br = mechanize.Browser()
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)

    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    # Follows refresh 0 but doesn't hang on refresh > 0
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    # Want debugging messages?
    #br.set_debug_http(True)
    #br.set_debug_redirects(True)
    #br.set_debug_responses(True)

    # User-Agent (this is cheating, ok?)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

    main_url = 'http://www.bing.com/images/search?q=' + query
    r = br.open(main_url)
    counter = 1
    # NOTE: r.read() consumes the response, so only the first pass finds
    # results; paging would require re-opening a next-page URL each pass.
    for i in range(6):
        html = r.read()
        soup = BeautifulSoup(html)
        divs = soup.findAll('div', {'class': "dg_u"})
        for div in divs:
            links = div.findAll('a')
            link = links[0]
            text = link['m']
            img_link = re.search('imgurl:"([^"]+)', text).group(1)
            print 'Downloading image %d - %s ...' % (counter, img_link)
            try:
                # pdb.set_trace()
                ext = img_link[-4:]
                # urllib.urlretrieve(img_link, query+'/image'+str(counter)+ext)
                download_photo(img_link, query + '/image' + str(counter) + ext)
            except IOError:
                print 'image %d cannot be downloaded because of server error!...' % counter
            except UnicodeError:
                print 'image %d cannot be downloaded because of naming of website!...' % counter
            counter += 1
        # r = br.open(link)
def download_photo(img_url, filename):
    try:
        image_on_web = urllib.urlopen(img_url)
        if image_on_web.headers.maintype == 'image':
            buf = image_on_web.read()
            downloaded_image = open(filename, "wb")
            downloaded_image.write(buf)
            downloaded_image.close()
            image_on_web.close()
        else:
            return False
    except:
        return False
    return True
if __name__ == '__main__':
    iteration_num = len(sys.argv) - 1
    for i in range(iteration_num):
        color = sys.argv[i + 1]
        if not os.path.exists(color):
            os.makedirs(color)
        installQueriedGoogleImages(color)
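
mechanize and this BeautifulSoup import are Python 2 era. For reference, the same first-page extraction can be sketched in Python 3 with nothing but the standard library (hedged: Bing's result markup changes over time, so the imgurl pattern may need updating):

# Python 3 sketch of the same first-page scrape; standard library only.
# Assumes Bing still embeds image URLs as imgurl:"..." in the result page.
import re
import urllib.parse
import urllib.request

def first_page_image_links(query):
    url = 'https://www.bing.com/images/search?q=' + urllib.parse.quote_plus(query)
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    html = urllib.request.urlopen(req).read().decode('utf-8', 'replace')
    return re.findall(r'imgurl:"(.*?)"', html)

for link in first_page_image_links('spotty'):
    print(link)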
# coding: utf-8
import sys
import os
import urllib
import urllib2
import json
import requests
KEY = '<Your Bing Developer Key>'
OUTPUT = '/images/bing/'
MAX = 100
count = 1
def bing_search(query, directory, skip):
    global count
    bing_url = 'https://api.datamarket.azure.com/Bing/Search/v1/Image'
    print 'search count: ' + str(count) + ', url: ' + bing_url + ', skip: ' + str(skip)
    pm = urllib2.HTTPPasswordMgrWithDefaultRealm()
    pm.add_password(None, bing_url, KEY, KEY)
    handler = urllib2.HTTPBasicAuthHandler(pm)
    opener = urllib2.build_opener(handler)
    urllib2.install_opener(opener)
    if skip > 0:
        params = urllib.urlencode({'Query': "'" + query + "'", 'Adult': "'Off'", '$skip': skip, '$format': 'json'})
    else:
        params = urllib.urlencode({'Query': "'" + query + "'", 'Adult': "'Off'", '$format': 'json'})
    response = urllib2.urlopen(bing_url + '?' + params)
    data = json.loads(response.read())
    results = data['d']['results']
    if not results:
        # no more results: stop instead of recursing forever
        return
    for item in results:
        if count > MAX:
            print 'finish. count: ' + str(MAX)
            return
        image_url = item['MediaUrl']
        root, ext = os.path.splitext(image_url)
        if ext.lower() == '.jpg':
            print image_url,
            try:
                r = requests.get(image_url)
                fname = OUTPUT + directory + "/bing%04d.jpg" % count
                f = open(fname, 'wb')
                f.write(r.content)
                f.close()
                print "...save", fname
            except:
                print "error", fname
            count += 1
    # page through the results by recursing with the next offset
    bing_search(query, directory, count)
if __name__ == '__main__':
    argvs = sys.argv
    argc = len(argvs)
    if argc != 3:
        print 'Usage: python %s query directory' % argvs[0]
        quit()
    query = argvs[1]
    directory = argvs[2]
    print 'get bing image: %s ' % query
    bing_search(query, directory, 0)
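
Note that the api.datamarket.azure.com endpoint used above was retired when Azure DataMarket shut down; Bing search later moved to Cognitive Services. A hedged sketch against the v7.0 image search API (endpoint, header, and field names as I recall them; verify against current Microsoft docs):

# Hedged sketch: Bing Image Search v7.0 under Cognitive Services.
# 'Ocp-Apim-Subscription-Key' and 'contentUrl' are the documented names
# as I recall them; confirm before relying on this.
import requests

def bing_api_image_urls(query, key, count=35):
    url = 'https://api.cognitive.microsoft.com/bing/v7.0/images/search'
    headers = {'Ocp-Apim-Subscription-Key': key}
    params = {'q': query, 'count': count}
    r = requests.get(url, headers=headers, params=params)
    r.raise_for_status()
    return [item['contentUrl'] for item in r.json().get('value', [])]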
Bulk Bing Image Downloader (BBID) is a multithreaded Python 3 script that bulk-downloads full-size images from Bing image search by scraping (no API key required). Usage:
chmod +x bbid.py
./bbid.py [-h] [-s SEARCH_STRING] [-f SEARCH_FILE] [-o OUTPUT] [--filter] [--no-filter]
./bbid.py -s earth
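A hedged batch-run example (queries.txt is a hypothetical file with one search string per line; the script creates one subfolder per keyword under the output directory):
./bbid.py -f queries.txt -o ./bing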
#!/usr/bin/env python3
import os, sys, urllib.request, re, threading, posixpath, urllib.parse, argparse, atexit, random, socket, time, hashlib, pickle, signal, subprocess
#config
output_dir = './bing' #default output dir
adult_filter = True #Do not disable adult filter by default
pool_sema = threading.BoundedSemaphore(value = 20) #max number of download threads
bingcount = 35 #default bing paging
socket.setdefaulttimeout(2)
in_progress = []
tried_urls = []
finished_keywords=[]
failed_urls = []
image_md5s = {}
urlopenheader={ 'User-Agent' : 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:42.0) Gecko/20100101 Firefox/42.0'}
def download(url, output_dir, retry=False):
    global tried_urls, failed_urls
    url_hash = hashlib.sha224(url.encode('utf-8')).digest()
    if url_hash in tried_urls:
        return
    pool_sema.acquire()
    path = urllib.parse.urlsplit(url).path
    filename = posixpath.basename(path)
    if len(filename) > 40:
        filename = filename[:36] + filename[-4:]
    while os.path.exists(output_dir + '/' + filename):
        filename = str(random.randint(0, 100)) + filename
    in_progress.append(filename)
    try:
        request = urllib.request.Request(url, None, urlopenheader)
        image = urllib.request.urlopen(request).read()
        if len(image) == 0:
            print('no image')
        md5 = hashlib.md5()
        md5.update(image)
        md5_key = md5.hexdigest()
        if md5_key in image_md5s:
            print('FAIL Image is a duplicate of ' + image_md5s[md5_key] + ', not saving ' + filename)
            return
        image_md5s[md5_key] = filename
        imagefile = open(output_dir + '/' + filename, 'wb')
        imagefile.write(image)
        imagefile.close()
        in_progress.remove(filename)
        if retry:
            print('Retry OK ' + filename)
        else:
            print("OK " + filename)
        tried_urls.append(url_hash)
    except Exception as e:
        if retry:
            print('Retry Fail ' + filename)
        else:
            print("FAIL " + filename)
        failed_urls.append((url, output_dir))
    finally:
        # release in a finally block so the early duplicate-image return
        # above cannot leak the semaphore slot
        pool_sema.release()
def removeNotFinished():
    for filename in in_progress:
        try:
            os.remove(output_dir + '/' + filename)
        except FileNotFoundError:
            pass
def fetch_images_from_keyword(keyword, output_dir):
    current = 1
    last = ''
    while True:
        request_url = 'https://www.bing.com/images/async?q=' + urllib.parse.quote_plus(keyword) + '&async=content&first=' + str(current) + '&adlt=' + adlt
        request = urllib.request.Request(request_url, None, headers=urlopenheader)
        response = urllib.request.urlopen(request)
        html = response.read().decode('utf8')
        links = re.findall('imgurl:"(.*?)"', html)
        try:
            if links[-1] == last:
                break
            last = links[-1]
            current += bingcount
            for link in links:
                t = threading.Thread(target=download, args=(link, output_dir))
                t.start()
        except IndexError:
            print('No search results for "{0}"'.format(keyword))
            return False
        time.sleep(0.1)
    return True
def backup_history(*args):
    download_history = open(output_dir + '/download_history.pickle', 'wb')
    pickle.dump(tried_urls, download_history)
    pickle.dump(finished_keywords, download_history)
    pickle.dump(image_md5s, download_history)
    download_history.close()
    print('history_dumped')
    if args:
        exit(0)
if __name__ == "__main__":
    atexit.register(removeNotFinished)

    parser = argparse.ArgumentParser(description='Bing image bulk downloader')
    parser.add_argument('-s', '--search-string', help='Keyword to search', required=False)
    parser.add_argument('-f', '--search-file', help='Path to a file containing search strings line by line', required=False)
    parser.add_argument('-o', '--output', help='Output directory', required=False)
    parser.add_argument('--filter', help='Enable adult filter', action='store_true', required=False)
    parser.add_argument('--no-filter', help='Disable adult filter', action='store_true', required=False)
    args = parser.parse_args()

    if (not args.search_string) and (not args.search_file):
        parser.error('Provide either a search string or a path to a file containing search strings')
    if args.output:
        output_dir = args.output
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    output_dir_origin = output_dir

    signal.signal(signal.SIGINT, backup_history)
    try:
        download_history = open(output_dir + '/download_history.pickle', 'rb')
        tried_urls = pickle.load(download_history)
        finished_keywords = pickle.load(download_history)
        image_md5s = pickle.load(download_history)
        download_history.close()
    except (OSError, IOError):
        tried_urls = []

    if adult_filter:
        adlt = ''
    else:
        adlt = 'off'
    if args.no_filter:
        adlt = 'off'
    elif args.filter:
        adlt = ''

    if args.search_string:
        keyword = args.search_string
        fetch_images_from_keyword(args.search_string, output_dir)
    elif args.search_file:
        try:
            inputFile = open(args.search_file)
        except (OSError, IOError):
            print("Couldn't open file {}".format(args.search_file))
            exit(1)
        for keyword in inputFile.readlines():
            keyword_hash = hashlib.sha224(keyword.strip().encode('utf-8')).digest()
            if keyword_hash in finished_keywords:
                print('"{0}" Already downloaded'.format(keyword.strip()))
                continue
            output_dir = output_dir_origin + '/' + keyword.strip().replace(' ', '_')
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            if fetch_images_from_keyword(keyword, output_dir):
                finished_keywords.append(keyword_hash)
                # retry any downloads that failed for this keyword
                for failed_url in failed_urls:
                    t = threading.Thread(target=download, args=(failed_url[0], failed_url[1], True))
                    t.start()
                failed_urls = []
                backup_history()
        inputFile.close()
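
bbid.py persists its state as three consecutive pickles in download_history.pickle (see backup_history above). A small sketch to inspect that file, assuming the default ./bing output directory:

# Reads the three objects backup_history() dumps, in the same order.
import pickle

with open('./bing/download_history.pickle', 'rb') as fh:
    tried_urls = pickle.load(fh)
    finished_keywords = pickle.load(fh)
    image_md5s = pickle.load(fh)

print(len(tried_urls), 'URL hashes tried,', len(image_md5s), 'unique images saved')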
https://github.com/syurchi/google-image-scraper
http://stackoverflow.com/questions/35242151/batch-download-google-images-with-tags
http://stackoverflow.com/questions/4082966/what-are-the-alternatives-now-that-the-google-web-search-api-has-been-deprecated
http://stackoverflow.com/questions/25133865/download-images-from-google-image-search-python
http://stackoverflow.com/questions/32035973/in-python-is-there-a-way-i-can-download-all-some-the-image-files-e-g-jpg-png?rq=1