jellimin / 2023-kopis

0 stars 0 forks source link

데이터 수집, 적재 | 방송데이터 #8

Open jellimin opened 1 year ago

sum-k commented 1 year ago

드라마 줄거리 추출 완료했고, 디스코드에 알림도 보냈습니다!

## 드라마 줄거리 추출

## 시청률 관련 크롤링 - 드라마
# 필요한 모듈 불러오기
import time
import os
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
from tqdm import tqdm_notebook
from selenium.common.exceptions import NoSuchElementException
import datetime
from datetime import datetime
from datetime import date
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options as ChromeOptions
import warnings
warnings.filterwarnings('ignore')

# Suppress crawler error output: Chrome is started with its 'enable-logging'
# switch excluded (presumably to keep ChromeDriver log noise off the console).
options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-logging'])

def send_message(message):
    """Post *message* to the project's Discord webhook (best-effort).

    Args:
        message: Form fields for the webhook, e.g. ``{'content': 'text'}``.

    The notification is sent after the crawl has already finished, so a
    delivery failure is reported on stdout instead of raising.
    """
    # NOTE(review): the fallback URL embeds the webhook's secret token and is
    # committed in plain text; rotate it and supply the new one through the
    # DISCORD_WEBHOOK_URL environment variable instead.
    webhook_url = os.environ.get(
        "DISCORD_WEBHOOK_URL",
        "https://discord.com/api/webhooks/1140278554609844264/aUubW_3WgjV_hwzVcQPjeWrhQzm1lZBS481VVIHqO7Cq_4A7E0xcJ2FPKsxRtaWy6R1r",
    )
    try:
        # Without a timeout requests may block forever on a stalled connection.
        response = requests.post(webhook_url, data=message, timeout=10)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Discord notification failed: {exc}")

# Naver search results page for the TV-rating ranking widget.
query = "https://search.naver.com/search.naver?sm=tab_hty.top&where=nexearch&query=%EC%8B%9C%EC%B2%AD%EB%A5%A0+%EC%88%9C%EC%9C%84&oquery=08%EC%9B%9407%EC%9D%BC%EC%A3%BC+%EC%A7%80%EC%83%81%ED%8C%8C+%EC%8B%9C%EC%B2%AD%EB%A5%A0&tqi=iLF5ZlprvN8ssix3jbsssssssn4-214140"
# The `executable_path` keyword was removed from webdriver.Chrome in
# Selenium 4.10; the driver binary location is passed through a Service object.
driver = webdriver.Chrome(service=ChromeService("./chromedriver.exe"), options=options)

rank = []
title = []
url = []

try:
    driver.get(query)

    # Click the 'weekly' tab
    driver.find_element(By.XPATH, "/html/body/div[3]/div[2]/div/div[1]/section[1]/div/div[2]/div[2]/div[1]/ul/li[2]/a").click()

    for j in range(1, 4):
        # Three channel-group tabs exist: terrestrial, general programming, cable
        page = "/html/body/div[3]/div[2]/div/div[1]/section[1]/div/div[2]/div[1]/ul/li[{}]/a".format(j)
        driver.find_element(By.XPATH, page).click()

        # Choose the second option of the genre drop-down (drama)
        select = "/html/body/div[3]/div[2]/div/div[1]/section[1]/div/div[2]/div[2]/div[2]/select/option[2]"
        driver.find_element(By.XPATH, select).click()

        # Top three rows of the ranking table
        for i in range(1, 4):
            b = "/html/body/div[3]/div[2]/div/div[1]/section[1]/div/div[2]/div[3]/div/table/tbody/tr[{}]/td[2]/p/a".format(i)

            # Look the link up once; it carries both the title and the URL.
            link = driver.find_element(By.XPATH, b)

            rank.append(str(i))
            title.append(link.text)
            url.append(link.get_attribute('href'))
finally:
    # Always close the browser, even when an element lookup fails.
    driver.quit()

# Named `columns` rather than `dict` so the builtin is not shadowed.
columns = {'순위':rank, '제목':title, 'url':url}
df = pd.DataFrame(columns)
df.to_csv("drama.csv", index=False, sep=',')

########## Extract the plot summaries
data = pd.read_csv("drama.csv")
url = data['url']

content = [] # plot summary of each drama, in the same order as the CSV rows

# One browser session is reused for every drama page and is always closed
# afterwards; launching a new Chrome per URL left every window running.
driver = webdriver.Chrome(options=options)
try:
    for query in url:
        driver.get(query)

        cont = driver.find_element(By.CSS_SELECTOR, 'span.desc._text').text
        content.append(cont)
finally:
    driver.quit()

# Replace the url column with the scraped summaries and overwrite the CSV.
data['줄거리'] = content
data.drop('url', axis=1, inplace=True)
data.to_csv("drama.csv", index=False, sep=",")

# Notify Discord once the crawl has finished
message = {'content':'드라마 줄거리 크롤링을 완료했습니다.'}
send_message(message)
sum-k commented 1 year ago

예능 출연진도 추출완료하고, 디스코드 알림과 연결해두었습니다!

## 시청률 관련 크롤링 - 예능
# 필요한 모듈 불러오기
import time
import os
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
from tqdm import tqdm_notebook
from selenium.common.exceptions import NoSuchElementException
import datetime
from datetime import datetime
from datetime import date
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options as ChromeOptions
import warnings
warnings.filterwarnings('ignore')

# Suppress crawler error output: Chrome is started with its 'enable-logging'
# switch excluded (presumably to keep ChromeDriver log noise off the console).
options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-logging'])

def send_message(message):
    """Post *message* to the project's Discord webhook (best-effort).

    Args:
        message: Form fields for the webhook, e.g. ``{'content': 'text'}``.

    The notification is sent after the crawl has already finished, so a
    delivery failure is reported on stdout instead of raising.
    """
    # NOTE(review): the fallback URL embeds the webhook's secret token and is
    # committed in plain text; rotate it and supply the new one through the
    # DISCORD_WEBHOOK_URL environment variable instead.
    webhook_url = os.environ.get(
        "DISCORD_WEBHOOK_URL",
        "https://discord.com/api/webhooks/1140278554609844264/aUubW_3WgjV_hwzVcQPjeWrhQzm1lZBS481VVIHqO7Cq_4A7E0xcJ2FPKsxRtaWy6R1r",
    )
    try:
        # Without a timeout requests may block forever on a stalled connection.
        response = requests.post(webhook_url, data=message, timeout=10)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Discord notification failed: {exc}")
# Scrape the cast of recent variety-show episodes

ent_title = ['나 혼자 산다', '유 퀴즈 온 더 블럭', '놀라운 토요일'] # variety-show titles to look up

title = []
actor = [] # cast of each episode as one comma-joined string
series = [] # episode label
# Named `air_date` rather than `date` so the `date` class imported from
# datetime at the top of the script is not overwritten.
air_date = [] # air date of each episode

# One browser session is reused for every show and is always closed afterwards;
# launching a new Chrome per title left every window running.
driver = webdriver.Chrome(options=options)
try:
    for show in ent_title:
        query = "https://search.naver.com/search.naver?query={}".format(show)
        driver.get(query)

        driver.find_element(By.XPATH,"/html/body/div[3]/div[2]/div/div[1]/div[2]/div[1]/div[4]/div/div/ul/li[5]/a").click() # switch to the episode-info tab

        for j in range(1, 3): # first two episode entries
            title.append(show)
            a = "/html/body/div[3]/div[2]/div/div[1]/div[2]/div[2]/div/div/div/div[{}]/ul/li[1]/div/div[1]/strong/a/span".format(j)
            series.append(driver.find_element(By.XPATH, a).text)
            b = "/html/body/div[3]/div[2]/div/div[1]/div[2]/div[2]/div/div/div/div[{}]/ul/li[1]/div/div[1]/span".format(j)
            air_date.append(driver.find_element(By.XPATH, b).text)

            # Collect up to five cast links. The previous while/try loop only
            # joined the names when all five links existed and otherwise stored
            # a raw list, so the column mixed strings and lists.
            cast = []
            for k in range(1, 6):
                c = "/html/body/div[3]/div[2]/div/div[1]/div[2]/div[2]/div/div/div/div[{0}]/ul/li[1]/div/dl/dd/a[{1}]".format(j, k)
                try:
                    cast.append(driver.find_element(By.XPATH, c).text)
                except NoSuchElementException:
                    break # fewer than five cast links for this episode
            actor.append(",".join(cast))
finally:
    driver.quit()

# Named `columns` rather than `dict` so the builtin is not shadowed.
columns = {'제목':title, '회차정보':series, '방영날짜':air_date, '출연진':actor}
df = pd.DataFrame(columns)
df.to_csv("entertainment.csv", index=False, sep=",")

# Notify Discord once the crawl has finished
message = {'content':'예능 출연진 크롤링을 완료했습니다.'}
send_message(message)
sum-k commented 1 year ago

drama.csv entertainment.csv