I worked with GoogleScraper as you directed, and everything worked fine except that when I perform image scraping with the GoogleScraper code, I am able to download a maximum of 30 images and no more. No matter what I try, only 30 images are downloaded.
Also, the Yahoo search engine included in the code is not working: it is unable to fetch the images.
Here is a copy of my code-
Any suggestions Mr.Nikolai?
from GoogleScraper import scrape_with_config, GoogleSearchError

try:
    search = scrape_with_config(config)
except GoogleSearchError as e:
    # Without a successful scrape there is no `search` object to read from;
    # the original printed the error and then crashed with a NameError below.
    raise SystemExit('[!] Scrape failed: {}'.format(e))

# Collect every image URL from all result pages of all configured engines.
image_urls = []
for serp in search.serps:
    image_urls.extend([link.link for link in serp.links])

print('[i] Going to scrape {num} images and saving them in "{dir}"'.format(
    num=len(image_urls),
    dir=target_directory
))
import os
import threading
import urllib.parse

import requests
class FetchResource(threading.Thread):
    """Grabs web resources and stores them in the target directory.

    Args:
        target: Directory in which downloaded files are saved.
        urls: The list of URLs this thread should download.
    """

    def __init__(self, target, urls):
        # The original defined plain ``init``/``super().init()`` — Python never
        # calls those, so the Thread was never initialised and construction
        # with arguments raised TypeError. The dunder names are required.
        super().__init__()
        self.target = target
        self.urls = urls

    def run(self):
        for url in self.urls:
            url = urllib.parse.unquote(url)
            try:
                # Fetch BEFORE opening the file so a failed request does not
                # leave a zero-byte file behind (the original opened first).
                content = requests.get(url).content
            except requests.RequestException:
                # Skip unreachable/failed URLs instead of silently swallowing
                # every possible exception.
                continue
            with open(os.path.join(self.target, url.split('/')[-1]), 'wb') as f:
                f.write(content)
            print('[+] Fetched {}'.format(url))
I worked with GoogleScraper as you directed, and everything worked fine except that when I perform image scraping with the GoogleScraper code, I am able to download a maximum of 30 images and no more. No matter what I try, only 30 images are downloaded.
Also, the Yahoo search engine included in the code is not working: it is unable to fetch the images.
Here is a copy of my code — any suggestions, Mr. Nikolai?
from GoogleScraper import scrape_with_config, GoogleSearchError

target_directory = 'Pictures/tdg/'

# NOTE(review): one results page per engine yields roughly 30 images, which is
# why only ~30 ever download. Request additional result pages per keyword —
# confirm the exact config key ('num_pages_for_keyword') against your
# installed GoogleScraper version.
config = {
    'keyword': 'Sachin',
    'search_engines': ['yandex', 'google', 'bing', 'baidu'],
    'search_type': 'image',
    'scrape_method': 'selenium',
    'do_caching': True,
    'num_pages_for_keyword': 5,
}

try:
    search = scrape_with_config(config)
except GoogleSearchError as e:
    # Abort: with no `search` object nothing below can run.
    raise SystemExit('[!] Scrape failed: {}'.format(e))

# Collect the image URLs from every result page of every engine.
image_urls = []
for serp in search.serps:
    image_urls.extend([link.link for link in serp.links])

print('[i] Going to scrape {num} images and saving them in "{dir}"'.format(
    num=len(image_urls),
    dir=target_directory
))

import os
import threading
import urllib.parse

import requests


class FetchResource(threading.Thread):
    """Grabs web resources and stores them in the target directory.

    Args:
        target: Directory in which downloaded files are saved.
        urls: The list of URLs this thread should download.
    """

    def __init__(self, target, urls):
        # Must be the dunder names — the original's ``init`` was never called.
        super().__init__()
        self.target = target
        self.urls = urls

    def run(self):
        for url in self.urls:
            url = urllib.parse.unquote(url)
            try:
                # Fetch before opening so failures leave no empty files.
                content = requests.get(url).content
            except requests.RequestException:
                continue
            with open(os.path.join(self.target, url.split('/')[-1]), 'wb') as f:
                f.write(content)
            print('[+] Fetched {}'.format(url))


# make a directory for the results
try:
    os.mkdir(target_directory)
except FileExistsError:
    pass

# fire up 100 threads to get the images
num_threads = 100
threads = [FetchResource(target_directory, []) for _ in range(num_threads)]

# Deal the collected URLs out round-robin onto the worker threads.
while image_urls:
    for t in threads:
        try:
            t.urls.append(image_urls.pop())
        except IndexError:
            break

# Start only the threads that actually received work.
threads = [t for t in threads if t.urls]
for t in threads:
    t.start()