Closed stonecropa closed 1 year ago
This issue was already fixed (see #1306). Please upgrade to the 2.2.6 version of PSReadLine from PowerShell Gallery. See the upgrading section for instructions. Please let us know if you run into the same issue with the latest version.
Prerequisites
Exception report
Screenshot
Environment data
Steps to reproduce
from utils import write_source, write_tweet from selenium import webdriver from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.by import By from selenium.webdriver.chrome.options import Options from selenium.webdriver import ChromeOptions import time from datetime import datetime import re import os import json
# Crawl limits.
tweet_num_limit = 1  # number of post URLs extracted per batch
comment_num_limit = 600  # maximum number of first-level comments captured
second_comment_num_limit = 600  # presumably the cap for second-level (reply) comments -- confirm

# Raw string: the Windows backslashes must not be parsed as escape sequences.
driver_path = r'C:\Program Files\Google\Chrome\Application\chromedriver.exe'

wb_login_url = 'https://weibo.com/login.php'

# XPath selectors for the Weibo post page.
content_xpath = '//div[@class="detail_wbtext_4CRf9"]'  # post body text
tweet_time_xpath = '//a[@class="head-info_time_6sFQg"]'  # post publication time
comment_num_xpath = '//div[@class="woo-box-item-flex toolbar_item_1ky_D toolbar_cursor_34j5V"]/div/span[@class="toolbar_num_JXZul"]'  # comment count
comment_list_bottom_xpath = '//div[@class="Bottom_text_1kFLe"]'  # very bottom of the comment list page
popup_comment_list_bottom_xpath = '//div[@class="woo-box-flex woo-box-alignCenter woo-box-justifyCenter woo-modal-wrap ReplyModal_wrap_2j1bg"]//div[@class="Bottom_text_1kFLe"]'  # very bottom of the popup comment list
close_popup_button_xpath = '//div[@class="woo-box-flex woo-box-alignCenter woo-box-justifyCenter woo-modal-wrap ReplyModal_wrap_2j1bg"]//div[@class="wbpro-layer-tit-opt woo-box-flex woo-box-alignCenter woo-box-justifyCenter"]/i'
tweet_user_xpath = '//a[@class="ALink_default_2ibt1 head_cut_2Zcft head_name_24eEB"]'  # id of the user who published the post
def login():
    """Start Chrome, open the Weibo login page and return the driver.

    Returns:
        The ``webdriver.Chrome`` instance, so the caller can keep using the
        same browser session (the entry point does ``driver = login()``).
    """
    option = ChromeOptions()
    # Start Chrome without its 'enable-automation' switch.
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    driver = webdriver.Chrome(driver_path, options=option)
    driver.maximize_window()
    driver.get(wb_login_url)
    print('正在打开微博登录页面......')
    # NOTE(review): the pasted source stops here; any wait for the user to
    # finish logging in is not visible -- confirm against the real file.
    return driver
# Get the post text and publication time; returns a dict.
# get_tweet_content(driver): reads the 'innerText' of the element matched by
# content_xpath, runs it through a chain of str.replace calls, then reads the
# 'innerText' of the element matched by tweet_time_xpath.
# The escaped literals in the chain spell '&quot;' (replaced by '"'),
# '&nbsp;' (replaced by ' ') and '&amp;' (replaced by '&').
# NOTE(review): this definition is damaged in the pasted source -- the
# statements are collapsed onto one line, the first replace() argument is a
# string literal split across two lines, and the function ends without a
# return although crawl_tweet() uses its result as a dict. Restore it from the
# real file; the code below is left untouched.
def get_tweet_content(driver): tweet_text = driver.find_element_by_xpath(content_xpath).get_attribute('innerText') tweet_text = tweet_text.replace('
', '\n').replace('\u200b', '').replace('\u2006', ' ').replace( '\u0026\u0071\u0075\u006f\u0074\u003b', '\"').replace('\u0026\u006e\u0062\u0073\u0070\u003b', ' ').replace( '\u0026\u0061\u006d\u0070\u003b', '&') tweet_time = driver.find_element_by_xpath(tweet_time_xpath).get_attribute('innerText')
# Extract the user id and the post id from a post URL; returns two strings.
def get_user_tweet_id(tweet_url):
    """Extract the user id and the post id from a post URL.

    A post URL looks like ``https://www.weibo.com/1883881851/LkJwDqkmO``:
    the last two path segments are the user id and the post id.

    Args:
        tweet_url: URL of a single post page.

    Returns:
        A ``(user_id, tweet_id)`` tuple of strings.

    Raises:
        ValueError: if the URL does not end in two non-empty path segments.
    """
    # Drop any query string, fragment and trailing slash so that the last two
    # segments really are the ids.
    path = tweet_url.split('?', 1)[0].split('#', 1)[0].rstrip('/')
    split_list = path.split('/')
    if len(split_list) < 2 or not split_list[-2] or not split_list[-1]:
        raise ValueError(f'cannot extract user id and tweet id from {tweet_url!r}')
    return split_list[-2], split_list[-1]
def get_tweet_comment(driver):
    """Collect the comments of the currently loaded post.

    Only the early-exit checks are visible: an empty list is returned when the
    post appears to have no comments.

    NOTE(review): the pasted source is truncated after these checks, so the
    code that actually gathers comments is missing -- restore it from the
    real file.
    """
    # Text of the comment counter in the toolbar, whitespace-trimmed.
    comment_num = driver.find_element_by_xpath(comment_num_xpath).get_attribute('innerText').strip()
    tip_xpath = '//span[@class="woo-tip-text"]'
    tip = driver.find_elements_by_xpath(tip_xpath)
    # Comment view with data-index 0, i.e. the first comment if any is rendered.
    first = find_comment_view(driver, 0)
    # NOTE(review): comment_num is a str, so `comment_num == 0` can never be
    # true; '0' was probably intended -- confirm.
    if comment_num == '评论' or comment_num == 0 or len(first) == 0:
        return []
    # The last tip element carries the "no comments yet" message.
    if len(tip) != 0 and tip[-1].get_attribute('innerText').strip() == '暂无评论,发表你的评论或看看推荐吧':
        return []
# Scroll down to load more comments.
def scroll_to_show_enough_comment(driver):
    """Scroll the page until enough comments are loaded or the list ends.

    Polls once per second. Each pass scrolls to the bottom and then stops when
    one of the following holds:

    * the comment with index ``comment_num_limit - 1`` exists;
    * the end-of-list marker exists, in which case the highest existing
      comment index decides the count;
    * ``check_miss`` reports that the page has moved past the limit (checked
      every 10 polls);
    * more than 4 such checks have passed without success, which gives up
      with a count of 0.

    Args:
        driver: Selenium WebDriver showing a post page.

    Returns:
        The number of usable comments (0 when loading timed out).
    """
    useful_comment_num = 0
    # Integer counters: the float start value 0.1 could never compare equal
    # to 10, so the timeout never triggered.
    polls_since_check = 0
    checks_done = 0
    while True:
        scrolling(driver, 100000)
        bottom_comment = find_comment_view(driver, comment_num_limit - 1)
        comment_list_bottom = driver.find_elements_by_xpath(comment_list_bottom_xpath)
        if bottom_comment:
            useful_comment_num = comment_num_limit
            break
        if comment_list_bottom:
            # End of list reached: search downwards for the last comment
            # index that is actually present.
            for i in range(comment_num_limit - 1, -1, -1):
                if find_comment_view(driver, i):
                    useful_comment_num = i + 1
                    break
            break
        time.sleep(1)
        polls_since_check += 1
        if polls_since_check >= 10:
            polls_since_check = 0
            checks_done += 1
            if check_miss(driver):
                useful_comment_num = comment_num_limit
                break
        if checks_done > 4:
            useful_comment_num = 0
            break
    # Scroll back to the top; otherwise the end-of-list hint does not show up
    # (the page stays stuck in the refreshing state).
    scrolling(driver, 0)
    time.sleep(2)
    return useful_comment_num
def scroll_to_show_enough_comment(driver, max_retry=5):
    """Variant of the comment-loading scroll routine with a retry budget.

    NOTE(review): the pasted source is truncated -- only the two counter
    initialisations are visible and nothing is returned. This definition has
    the same name as an earlier one in the file and therefore replaces it at
    import time; restore the full body from the real file.
    """
    useful_comment_num = 0
    retry_times = 0
def check_miss(driver):
    """Report whether the first active comment view is past the comment limit.

    Args:
        driver: Selenium WebDriver showing a post page.

    Returns:
        True when the ``data-index`` of the first active comment view is
        greater than ``comment_num_limit``; False otherwise, including when
        no active comment view exists.
    """
    comment_view_xpath = '//div[@class="RepostCommentList_mar1_3VHkS"]//div[@class="vue-recycle-scroller__item-view"]/div[@data-active="true"]'
    # Query once and reuse the result for both the log line and the check.
    active_views = driver.find_elements_by_xpath(comment_view_xpath)
    if not active_views:
        # Nothing active to inspect: no evidence that the limit was passed.
        return False
    first_active_index = int(active_views[0].get_attribute("data-index"))
    print('check num', first_active_index)
    return first_active_index > comment_num_limit
# Scroll down to load all comments inside the popup.
def scroll_popup_to_show_all_comment(driver):
    """Scroll the reply popup to load its comments.

    NOTE(review): the pasted source is truncated -- only the counter
    initialisation is visible; restore the body from the real file.
    """
    useful_comment_num = 0
# Scroll down to load multiple posts.
def scroll_to_show_enough_tweet(driver):
    """Keep scrolling the page down to load more posts.

    NOTE(review): the pasted source is truncated -- the loop has no visible
    exit condition, so as written it never returns; restore the rest of the
    body from the real file.
    """
    while 1:
        # Jump far past the current page height to trigger loading.
        scrolling(driver, 100000)
def scrolling(driver, location):
    """Set the vertical scroll position of the page.

    Args:
        driver: Selenium WebDriver (anything with ``execute_script``).
        location: value assigned to ``document.documentElement.scrollTop``.
    """
    js = f"var q=document.documentElement.scrollTop={location}"
    driver.execute_script(js)
def scrolling_popup(driver, location):
    """Set the vertical scroll position of the reply popup's scroll container.

    Args:
        driver: Selenium WebDriver.
        location: value assigned to the container's ``scrollTop``.
    """
    scroll = driver.find_element_by_xpath('//div[@class="ReplyModal_scroll3_2kADQ"]')
    # The element is passed as arguments[0] to the script.
    driver.execute_script(f'arguments[0].scrollTop={location}', scroll)
def find_comment_view(driver, data_index):
    """Find the comment view(s) with the given ``data-index`` in the comment list.

    Args:
        driver: Selenium WebDriver.
        data_index: value of the ``data-index`` attribute to match.

    Returns:
        The list returned by ``find_elements_by_xpath`` (empty when nothing
        matches).
    """
    comment_view_xpath = f'//div[@class="RepostCommentList_mar1_3VHkS"]//div[@class="vue-recycle-scroller__item-view"]/div[@data-index="{data_index}"]'
    return driver.find_elements_by_xpath(comment_view_xpath)
def find_active_comment_view(driver, data_index):
    """Find the active comment view(s) with the given ``data-index``.

    Same lookup as for comment views, restricted to elements whose
    ``data-active`` attribute is ``"true"``.

    Args:
        driver: Selenium WebDriver.
        data_index: value of the ``data-index`` attribute to match.

    Returns:
        The list returned by ``find_elements_by_xpath`` (empty when nothing
        matches).
    """
    comment_view_xpath = f'//div[@class="RepostCommentList_mar1_3VHkS"]//div[@class="vue-recycle-scroller__item-view"]/div[@data-active="true" and @data-index="{data_index}"]'
    return driver.find_elements_by_xpath(comment_view_xpath)
def find_second_comment_view(driver, data_index):
    """Find the view(s) with the given ``data-index`` inside the reply popup.

    Args:
        driver: Selenium WebDriver.
        data_index: value of the ``data-index`` attribute to match.

    Returns:
        The list returned by ``find_elements_by_xpath`` (empty when nothing
        matches).
    """
    comment_view_xpath = f'//div[@class="woo-box-flex woo-box-alignCenter woo-box-justifyCenter woo-modal-wrap ReplyModal_wrap_2j1bg"]//div[@data-index="{data_index}"]'
    return driver.find_elements_by_xpath(comment_view_xpath)
def find_active_second_comment_view(driver, data_index):
    """Find the active view(s) with the given ``data-index`` inside the reply popup.

    Only elements whose ``data-active`` attribute is ``"true"`` are matched.

    Args:
        driver: Selenium WebDriver.
        data_index: value of the ``data-index`` attribute to match.

    Returns:
        The list returned by ``find_elements_by_xpath`` (empty when nothing
        matches).
    """
    comment_view_xpath = f'//div[@class="woo-box-flex woo-box-alignCenter woo-box-justifyCenter woo-modal-wrap ReplyModal_wrap_2j1bg"]//div[@data-index="{data_index}" and @data-active="true"]'
    return driver.find_elements_by_xpath(comment_view_xpath)
def find_tweet_view(driver, data_index):
    """Find the post view(s) with the given ``data-index`` in the post list.

    Args:
        driver: Selenium WebDriver.
        data_index: value of the ``data-index`` attribute to match.

    Returns:
        The list returned by ``find_elements_by_xpath`` (empty when nothing
        matches).
    """
    # Same XPath shape as the one used for comments.
    tweet_view_xpath = f'//div[@class="vue-recycle-scroller__item-view"]/div[@data-index="{data_index}"]'
    return driver.find_elements_by_xpath(tweet_view_xpath)
def find_active_tweet_view(driver, data_index):
    """Find the active post view(s) with the given ``data-index``.

    Only elements whose ``data-active`` attribute is ``"true"`` are matched.

    Args:
        driver: Selenium WebDriver.
        data_index: value of the ``data-index`` attribute to match.

    Returns:
        The list returned by ``find_elements_by_xpath`` (empty when nothing
        matches).
    """
    # Same XPath shape as the one used for comments.
    tweet_view_xpath = f'//div[@class="vue-recycle-scroller__item-view"]/div[@data-index="{data_index}" and @data-active="true"]'
    return driver.find_elements_by_xpath(tweet_view_xpath)
def check_second_display(comment_view):
    """Tell whether a popup comment view is currently displayed.

    Args:
        comment_view: element whose parent's ``style`` attribute is inspected.

    Returns:
        False when the parent's ``style`` is exactly the off-screen style
        string below, True otherwise.
    """
    # Presumably the style the list applies to views it has parked off-screen.
    parked_style = 'transform: translateY(-9999px); z-index: -1;'
    wrapper = comment_view.find_element_by_xpath('./..')
    current_style = wrapper.get_attribute('style')
    return current_style != parked_style
# tweet_url is the page of a single post, e.g. https://www.weibo.com/1883881851/LkJwDqkmO
def crawl_tweet(driver, tweet_url):
    """Crawl one post with its comments and save the result as JSON.

    The file is written to ``./Data/<theme>/<tweet id>.json``.

    NOTE(review): ``theme`` is not a parameter; it is read from module scope
    (the entry point assigns it before crawling) -- confirm this is intended.

    Args:
        driver: Selenium WebDriver.
        tweet_url: URL of a single post page.

    Returns:
        A dict ``{'source': <post dict>, 'comment': <comment list>}``, or
        None when the post has no comments (nothing is written then).
    """
    driver.get(tweet_url)
    # Wait up to 20 s for the post body to appear.
    WebDriverWait(driver, 20).until(lambda d: d.find_elements_by_xpath(content_xpath))
    time.sleep(0.3)

    tweet_dict = get_tweet_content(driver)
    user_id, tweet_id = get_user_tweet_id(tweet_url)
    tweet_dict['user id'] = user_id
    tweet_dict['tweet id'] = tweet_id

    comment_list = get_tweet_comment(driver)
    if comment_list == []:
        return None

    tweet = {
        'source': tweet_dict,
        'comment': comment_list,
    }

    # Save the tweet to a file.
    dirpath = os.path.join('.', 'Data', theme)
    # exist_ok replaces the separate existence check.
    os.makedirs(dirpath, exist_ok=True)
    filepath = os.path.join(dirpath, f'{tweet["source"]["tweet id"]}.json')
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(tweet, f, ensure_ascii=False)
    return tweet
def crawl_tweet(driver, tweet_url):
    """Open a post page and wait for its body to load.

    NOTE(review): the pasted source is truncated after the initial wait. This
    definition has the same name as an earlier one in the file and therefore
    replaces it at import time; restore the full body from the real file.
    """
    driver.get(tweet_url)
    # Wait up to 20 s for the post body to appear.
    WebDriverWait(driver, 20).until(lambda driver: driver.find_elements_by_xpath(content_xpath))
    time.sleep(0.3)
def crawl_theme(driver, crawl_num, theme, theme_url):
    """Crawl the posts of one theme.

    Only the creation of the output directory ``./Data/<theme>`` is visible.

    NOTE(review): the pasted source is truncated after the directory setup;
    how driver, crawl_num and theme_url are used cannot be seen here.
    """
    dirpath = os.path.join('.', 'Data', theme)
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)
# Script entry point: log in, then crawl the configured theme.
# The guard must use the dunder names; `name == 'main'` raises NameError.
if __name__ == '__main__':
    driver = login()
    crawl_num = 10000
    theme = 'No Theme Long'
    theme_url = 'https://s.weibo.com/weibo?q=%23%E6%B5%B7%E5%A4%A9%E5%91%B3%E4%B8%9A%E5%9B%9E%E5%BA%94%E9%85%B1%E6%B2%B9%E6%B7%BB%E5%8A%A0%E5%89%82%E4%BA%89%E8%AE%AE%23'
    crawl_theme(driver, crawl_num, theme, theme_url)
    # NOTE(review): `test_url` is not defined anywhere in the visible source,
    # so this call is kept disabled; it looks like a leftover single-post
    # debug call -- confirm.
    # crawl_tweet(driver, test_url)
Expected behavior
make the comment about weibo and write
Actual behavior
error