```python
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
import xlwt

book = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = book.add_sheet('菜徐坤', cell_overwrite_ok=True)
sheet.write(0, 0, '标题')
sheet.write(0, 1, '视频地址')
sheet.write(0, 2, '观看人数')
sheet.write(0, 3, '弹幕数')
sheet.write(0, 4, '发布时间')


def test_open_brower():
    with sync_playwright() as p:
        browser = p.chromium.launch()
        context = browser.new_context()
        page = context.new_page()
        page.goto("https://www.bilibili.com//")
        page.locator('//*[@id="nav-searchform"]/div[1]/input').fill("菜徐坤 篮球")
        with page.expect_popup() as page1_info:
            page.locator("#nav-searchform").get_by_role("img").nth(1).click()
        page1 = page1_info.value

        # Get the total number of pages
        max_page = page1.locator(
            "//button[@class='vui_button vui_pagenation--btn vui_pagenation--btn-side' and text()='下一页']"
            "/preceding-sibling::button[contains(@class, 'vui_pagenation--btn-num')][1]")
        total = int(max_page.inner_text())  # inner_text() returns the rendered text

        # Click through pages 1, 2, 3 and collect the results
        for i in range(1, 3):
            # For testing: on the second iteration the parsed list came back empty;
            # page1.content() returned only the frame source, without the results
            if i > 1:
                page1.get_by_role("button", name=f"{i}", exact=True).click()
            # Wait for the new page to finish loading
            page1.wait_for_load_state('domcontentloaded')
            # Grab the page source after loading
            html = page1.content()
            # Parse the page
            ikun_movie_list = test_parse_html(html)
            # Write the rows into the sheet
            test_save_ikun(ikun_movie_list, 1 + 25 * (i - 1))
        # Save the workbook
        book.save('菜徐坤的篮球.xls')
```
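About the comment that the list came back empty on the second iteration: `wait_for_load_state('domcontentloaded')` can fire before the JavaScript-rendered result list exists, so `page1.content()` may only contain the page frame. A minimal sketch that waits for a concrete result card instead; the `div.bili-video-card` selector is an assumption about the markup, not a verified class name:

```python
def grab_rendered_html(page, page_number):
    # Sketch: go to the requested results page (page 1 is already open) and
    # wait for at least one result card to be attached before reading the source.
    if page_number > 1:
        page.get_by_role("button", name=str(page_number), exact=True).click()
    # Waiting on a concrete selector is more reliable than
    # wait_for_load_state('domcontentloaded') for JS-rendered content.
    page.wait_for_selector("div.bili-video-card", state="attached")
    return page.content()
```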
```python
def test_parse_html(html):
    soup = BeautifulSoup(html, 'lxml')
    card_list = soup.find(class_='video-list row').find_all(
        class_='col_3 col_xs_1_5 col_md_2 col_xl_1_7 mb_x40')
    ikun_movies_lists = []
    for item in card_list:
        item_title = item.find(class_='bili-video-card__info--tit').text
        item_link = item.find('a').get('href')
        item_view = item.find_all('span', class_='bili-video-card__stats--item')[0].text
        item_biubiu = item.find_all('span', class_='bili-video-card__stats--item')[1].text
        item_date = item.find(class_='bili-video-card__info--date').string
        ikun_movie_list = [item_title, item_link, item_view, item_biubiu, item_date]
        ikun_movies_lists.append(ikun_movie_list)
    return ikun_movies_lists
```
```python
def test_save_ikun(ikun_movies_lists, row):
    for item in ikun_movies_lists:
        sheet.write(row, 0, item[0])
        sheet.write(row, 1, item[1])
        sheet.write(row, 2, item[2])
        sheet.write(row, 3, item[3])
        sheet.write(row, 4, item[4])
        row += 1
```
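A side note on the row bookkeeping: the caller passes `1 + 25 * (i - 1)`, which assumes every results page yields exactly 25 cards. A small sketch of a variant (my own suggestion, with the hypothetical name `save_ikun_rows`) that returns the next free row, so the offset no longer depends on the page size:

```python
def save_ikun_rows(ikun_movies_lists, row, sheet):
    # Write one parsed page starting at `row` and return the next free row,
    # so the caller can chain pages of any length:
    #     row = 1
    #     row = save_ikun_rows(page_items, row, sheet)
    for item in ikun_movies_lists:
        for col, value in enumerate(item):
            sheet.write(row, col, value)
        row += 1
    return row
```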
Found the problem: on Bilibili, the class attribute of the video list on the second page is different from the one on the first page.
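If that is the case, one workaround is to match the container by a partial class name rather than the full class string, so the same parser works on every page. A sketch using BeautifulSoup's CSS selectors; the `video-list` and `bili-video-card` fragments are assumptions about the markup, not verified class names:

```python
from bs4 import BeautifulSoup

def find_cards(html):
    soup = BeautifulSoup(html, 'lxml')
    # Substring match on the class attribute, so extra or renamed classes on
    # the container (page 1 vs page 2) do not break the lookup.
    container = soup.select_one('div[class*="video-list"]')
    if container is None:
        return []
    return container.select('div[class*="bili-video-card"]')
```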
```python
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
import xlwt

book = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = book.add_sheet('菜徐坤', cell_overwrite_ok=True)
sheet.write(0, 0, '标题')
sheet.write(0, 1, '视频地址')
sheet.write(0, 2, '观看人数')
sheet.write(0, 3, '弹幕数')
sheet.write(0, 4, '发布时间')
```
Get the page source after the page has finished loading
```python
def test_open_brower():
    with sync_playwright() as p:
        browser = p.chromium.launch()
        context = browser.new_context()
        page = context.new_page()
        page.goto("https://www.bilibili.com//")
        page.locator('//*[@id="nav-searchform"]/div[1]/input').fill("菜徐坤 篮球")
        with page.expect_popup() as page1_info:
            page.locator("#nav-searchform").get_by_role("img").nth(1).click()
        page1 = page1_info.value


def test_parse_html(html):
    soup = BeautifulSoup(html, 'lxml')
    card_list = soup.find(class_='video-list row').find_all(
        class_='col_3 col_xs_1_5 col_md_2 col_xl_1_7 mb_x40')
    ikun_movies_lists = []
    for item in card_list:
        item_title = item.find(class_='bili-video-card__info--tit').text
        item_link = item.find('a').get('href')
        item_view = item.find_all('span', class_='bili-video-card__stats--item')[0].text
        item_biubiu = item.find_all('span', class_='bili-video-card__stats--item')[1].text
        item_date = item.find(class_='bili-video-card__info--date').string
        ikun_movie_list = [item_title, item_link, item_view, item_biubiu, item_date]
        ikun_movies_lists.append(ikun_movie_list)
    return ikun_movies_lists


def test_save_ikun(ikun_movies_lists, row):
    for item in ikun_movies_lists:
        sheet.write(row, 0, item[0])
        sheet.write(row, 1, item[1])
        sheet.write(row, 2, item[2])
        sheet.write(row, 3, item[3])
        sheet.write(row, 4, item[4])
        row += 1
```
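Since the functions are named `test_*`, they are presumably run through pytest; for a quick manual run, a minimal entry point (my addition, assuming everything above lives in one module) also works:

```python
# Sketch: run the scraper directly instead of through a test runner.
if __name__ == "__main__":
    test_open_brower()
```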