sejongresearch / FlowerClassification

aicoco team, flower classifier (2019)

[CRAW] Crawling 20 #5

Open Eomdangyeong opened 5 years ago

Eomdangyeong commented 5 years ago
# Hey guys, this code only crawls 20 images
# I don't know why it stops at 20
# If I run it several times in a loop, the same 20 images just get saved again...

import requests
import urllib.request

from scrapy.selector import Selector

# If you change count here, the 20 downloaded files get numbered from it automatically

count = 2040

# Put the search term in inputSearch, so for roses put in 장미, then 장미꽃,
# then rose, rose picture, and so on; several variants should give several different sets,,?
# 1000/20 is 50, so just try about 50 search terms..
# Some useless images get saved along the way though, so you have to filter those out
# Where do they get saved? In the same directory as the code

inputSearch = "백합"  # "백합" = lily
base_url = "https://www.google.co.kr/search?biw=1597&bih=925&" \
             "tbm=isch&sa=1&btnG=%EA%B2%80%EC%83%89&q=" + inputSearch

def img_url_from_page(url):
    # Fetch the results page and pull every thumbnail src out of the static HTML.
    # requests never runs JavaScript, and Google's script-free results page only
    # embeds about 20 thumbnails, which is likely why the crawl stops at 20.
    html = requests.get(url).text
    sel = Selector(text=html)
    img_names = sel.css('td a img::attr(src)').extract()
    return img_names

def img_from_url(image_url):
    # Download a single image, numbering the files sequentially from count.
    global count
    count += 1
    full_name = str(count) + ".jpg"
    urllib.request.urlretrieve(image_url, full_name)

for i in img_url_from_page(base_url):
    img_from_url(i)

Reference: https://okky.kr/article/405238
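The 20-image cap is most likely because the script-free results page serves results 20 at a time. One way to collect more without Selenium is to page through the start= offset and deduplicate; a minimal sketch under that assumption (same selector as above, and Google may change or throttle this at any time):

import requests
from scrapy.selector import Selector

def img_urls_paged(base_url, pages=5):
    # Walk the paginated results 20 at a time via the start= offset
    # and collect the unique thumbnail URLs across pages.
    seen = []
    for page in range(pages):
        html = requests.get(base_url + "&start=" + str(page * 20)).text
        sel = Selector(text=html)
        for src in sel.css('td a img::attr(src)').extract():
            if src not in seen:
                seen.append(src)
    return seen

Each URL in the returned list can then be fed to img_from_url above.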

Eomdangyeong commented 5 years ago
# This one should be able to collect a lot more, but it throws an error ㅠㅠㅠ
# Before you run it you have to download chromedriver
# When you use it, it does run and a folder gets created,
# then a cmd-like window shows chromedriver.exe, a Chrome window opens and scrolls way down,
# and it prints an error, ERROR:browser_process_sub_thread.cc, that I can't identify even after googling
# The console prints the image urls but says can't get image, and it ends with 0 pictures successfully downloaded
# Help... ^_________^

import os
import json
import urllib.request
from selenium import webdriver

# Put the search term here; the download folder gets named after it too
searchterm = 'Lily'

url = "https://www.google.co.in/search?q="+searchterm+"&source=lnms&tbm=isch"

# Download chromedriver and put its path here
browser = webdriver.Chrome(r"C:\Users\eksru\AppData\Local\Programs\Python\Python36\chromedriver.exe")
browser.get(url)
header={'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}
counter = 0
succounter = 0

if not os.path.exists(searchterm):
    os.mkdir(searchterm)
# Scroll down
# It does scroll as many times as the number in range,,
for _ in range(200):
    browser.execute_script("window.scrollBy(0,10000)")

for x in browser.find_elements_by_xpath('//div[contains(@class,"rg_meta")]'):
    counter = counter + 1
    print("Total Count:", counter)
    print("Successful Count:", succounter)
    print("URL:", json.loads(x.get_attribute('innerHTML'))["ou"])

    img = json.loads(x.get_attribute('innerHTML'))["ou"]
    imgtype = json.loads(x.get_attribute('innerHTML'))["ity"]
    try:
        # urllib.request is a module, not a callable: build a Request and pass
        # the header dict directly, otherwise every download dies right here
        # and the bare except prints "can't get img" for all of them.
        req = urllib.request.Request(img, headers=header)
        raw_img = urllib.request.urlopen(req).read()
        with open(os.path.join(searchterm, searchterm + "_" + str(counter) + "." + imgtype), "wb") as f:
            f.write(raw_img)
        succounter = succounter + 1
    except Exception:
        print("can't get img")

print(succounter, "pictures successfully downloaded")
browser.close()

Reference: https://jeongmin-lee.tistory.com/4
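For what it's worth, "can't get img" on every URL points at the urllib call rather than the browser message: urllib.request is a module, so calling urllib.request(img, ...) raises a TypeError that the bare except swallows for every single image, while the ERROR:browser_process_sub_thread.cc line is usually just Chrome shutdown noise and probably a red herring. A minimal standalone check of the corrected download path (the URL here is only a placeholder test image):

import urllib.request

header = {'User-Agent': "Mozilla/5.0"}
url = "https://www.python.org/static/img/python-logo.png"  # placeholder test image

req = urllib.request.Request(url, headers=header)  # Request object, not a module call
with urllib.request.urlopen(req) as resp:
    raw_img = resp.read()
with open("test.png", "wb") as f:
    f.write(raw_img)
print(len(raw_img), "bytes downloaded")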

Eomdangyeong commented 5 years ago

I tried this site too and it looks like a path configuration problem, so have a look!!!!! I really wanted to do a thousand at once.............it says this can do a thousand at once.........this person...........me..........why....................................... https://simply-python.com/2015/05/18/saving-images-from-google-search-using-selenium-and-python/

Eomdangyeong commented 5 years ago

craw.zip

ghost commented 5 years ago
import re, os, time
from selenium import webdriver

from pattern.web import URL, DOM

class GoogleImageExtractor(object):

    def __init__(self, search_key = '' ):
        """ Google image search class
            Args:
                search_key to be entered.

        """
        if type(search_key) == str:
            ## convert to list even for one search keyword to standardize the pulling.
            self.g_search_key_list = [search_key]
        elif type(search_key) == list:
            self.g_search_key_list = search_key
        else:
            ## raise an explicit exception (a bare raise here would itself fail)
            raise TypeError('google_search_keyword not of type str or list')

        self.g_search_key = ''

        ## user options
        self.image_dl_per_search = 200

        ## url construct string text
        self.prefix_of_search_url = "https://www.google.com.sg/search?q="
        self.postfix_of_search_url = '&source=lnms&tbm=isch&sa=X&ei=0eZEVbj3IJG5uATalICQAQ&ved=0CAcQ_AUoAQ&biw=939&bih=591' # fixed query-string text, not meant to be changed
        self.target_url_str = ''

        ## storage
        self.pic_url_list = []
        self.pic_info_list = []

        ## file and folder path
        self.folder_main_dir_prefix = r'C:\data'

    def reformat_search_for_spaces(self):
        """
            Called when forming the search url:
            strip any trailing spaces, then replace the remaining
            spaces with "+" for use in the search term.
            Eg: "Cookie fast" to "Cookie+fast"
        """
        self.g_search_key = self.g_search_key.rstrip().replace(' ', '+')

    def set_num_image_to_dl(self, num_image):
        """ Set the number of image to download. Set to self.image_dl_per_search.
            Args:
                num_image (int): num of image to download.
        """
        self.image_dl_per_search = num_image

    def get_searchlist_fr_file(self, filename):
        """Get search list from filename. Ability to add in a lot of phrases.
            Will replace the self.g_search_key_list
            Args:
                filename (str): full file path
        """
        with open(filename,'r') as f:
            ## readlines() keeps trailing newlines (rstripped later); an empty
            ## file yields an empty list, so the download loop silently does nothing.
            self.g_search_key_list = f.readlines()

    def formed_search_url(self):
        ''' Form the url for the currently selected key phrase.
            Builds from self.g_search_key and stores the result
            in self.target_url_str.
        '''
        self.reformat_search_for_spaces()
        self.target_url_str = self.prefix_of_search_url + self.g_search_key +\
                                self.postfix_of_search_url

    def retrieve_source_fr_html(self):
        """ Use selenium to load the results page, scroll it, click
            "show more results", and keep the final page source.
        """
        driver = webdriver.Firefox()
        driver.get(self.target_url_str)

        ## wait for the page to load, then get the page source.
        try:
            driver.execute_script("window.scrollTo(0, 30000)")
            time.sleep(2)
            self.temp_page_source = driver.page_source
            #driver.find_element_by_css_selector('ksb _kvc').click()#cant find the class
            driver.find_element_by_id('smb').click() #ok
            time.sleep(2)
            driver.execute_script("window.scrollTo(0, 60000)")
            time.sleep(2)
            driver.execute_script("window.scrollTo(0, 60000)")

        except:
            print('not able to find')
            driver.quit()
            ## the driver is gone at this point; bail out with an empty source
            ## instead of reading driver.page_source below and crashing again.
            self.page_source = ''
            return

        self.page_source = driver.page_source

        driver.close()

    def extract_pic_url(self):
        """ extract all the raw pic url in list

        """
        dom = DOM(self.page_source)
        tag_list = dom('a.rg_l')

        for tag in tag_list[:self.image_dl_per_search]:
            tar_str = re.search('imgurl=(.*)&imgrefurl', tag.attributes['href'])
            try:
                self.pic_url_list.append(tar_str.group(1))
            except:
                print( 'error parsing', tag)

    def multi_search_download(self):
        """ Mutli search download"""
        for indiv_search in self.g_search_key_list:
            self.pic_url_list = []
            self.pic_info_list = []

            self.g_search_key = indiv_search

            self.formed_search_url()
            self.retrieve_source_fr_html()
            self.extract_pic_url()
            self.downloading_all_photos() #some downloads might not be jpg?? use selenium to download??
            self.save_infolist_to_file()

    def downloading_all_photos(self):
        """ download all photos to particular folder

        """
        self.create_folder()
        pic_counter = 1
        for url_link in self.pic_url_list:
            print( pic_counter)
            pic_prefix_str = self.g_search_key  + str(pic_counter)
            self.download_single_image(url_link, pic_prefix_str) # was url_link.encode(), which breaks the str concatenation under Python 3
            pic_counter = pic_counter +1

    def download_single_image(self, url_link, pic_prefix_str):
        """ Download data according to the url link given.
            Args:
                url_link (str): url str.
                pic_prefix_str (str): pic_prefix_str for unique label the pic
        """
        self.download_fault = 0
        file_ext = os.path.splitext(url_link)[1] #use for checking valid pic ext
        temp_filename = pic_prefix_str + file_ext
        temp_filename_full_path = os.path.join(self.gs_raw_dirpath, temp_filename )

        valid_image_ext_list = ['.png','.jpg','.jpeg', '.gif', '.bmp', '.tiff'] #not comprehensive

        url = URL(url_link)
        if url.redirect:
            return # if there is re-direct, return

        if file_ext not in valid_image_ext_list:
            return #return if not valid image extension

        f = open(temp_filename_full_path, 'wb') # save as test.gif
        print( url_link)
        self.pic_info_list.append(pic_prefix_str + ': ' + url_link )
        try:
            f.write(url.download())#if have problem skip
        except:
            #if self.__print_download_fault:
            print( 'Problem with processing this data: ', url_link)
            self.download_fault =1
        f.close()

    def create_folder(self):
        """
            Create a folder to put the log data segregate by date

        """
        self.gs_raw_dirpath = os.path.join(self.folder_main_dir_prefix, time.strftime("_%d_%b%y", time.localtime()))
        if not os.path.exists(self.gs_raw_dirpath):
            os.makedirs(self.gs_raw_dirpath)

    def save_infolist_to_file(self):
        """ Save the info list to file.

        """
        temp_filename_full_path = os.path.join(self.gs_raw_dirpath, self.g_search_key + '_info.txt' )

        with open(temp_filename_full_path, 'w') as f:
            for n in self.pic_info_list:
                f.write(n)
                f.write('\n')

if __name__ == '__main__':

    choice = 4
    print('doing')
    if choice == 4:
        """test the downloading of files"""
        w = GoogleImageExtractor('') #leave blank if the search list comes from a file
        searchlist_filename = r'C:\data\imgsearch_list.txt'
        w.set_num_image_to_dl(200)
        w.get_searchlist_fr_file(searchlist_filename) #replace the search list
        w.multi_search_download()
#I fixed the code so it doesn't error any more, and it really doesn't, but then nothing happens at all hahaha...
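If it runs clean but nothing happens, the most likely culprit given the code above is an empty C:\data\imgsearch_list.txt: readlines() on an empty file returns an empty list, so multi_search_download loops zero times and exits without a word. A quick sanity check that bypasses the file entirely (keyword and count are just examples; note also that pattern.web is a Python 2 era library, so under Python 3 you may need a ported build of pattern):

w = GoogleImageExtractor('lily')  # pass the keyword directly, skipping the search-list file
w.set_num_image_to_dl(10)         # keep the count small while testing
w.multi_search_download()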