NanmiCoder / MediaCrawler

小红书笔记 | 评论爬虫、抖音视频 | 评论爬虫、快手视频 | 评论爬虫、B 站视频 | 评论爬虫、微博帖子 | 评论爬虫、百度贴吧帖子 | 百度贴吧评论回复爬虫
Other
16.06k stars 5.14k forks source link

爬取能正常运行,但是在爬取评论时一条评论信息也爬取不到 #31

Closed InMyDreammer closed 1 year ago

InMyDreammer commented 1 year ago

报错信息如下 MediaCrawler ERROR aweme_id: 7266050530072481076 get comments failed, error: Expecting value: line 1 column 1 (char 0),
按理说,即使有抖音的反爬取机制,也应该能爬到一两条数据,但实际上一条也没有。以下是我修改过后的保存到本地csv的代码: import json from typing import Dict, List

from tortoise import fields from tortoise.models import Model import os import config from tools import utils import pandas as pd

class DouyinBaseModel(Model):
    """Abstract base for the Douyin models in this file.

    Declares the author-related columns and the two bookkeeping timestamps
    that both ``DouyinAweme`` and ``DouyinAwemeComment`` inherit.
    """

    # Surrogate primary key.
    id = fields.IntField(pk=True, autoincrement=True, description="自增ID")

    # Author identity columns; every one of them is nullable.
    user_id = fields.CharField(null=True, max_length=64, description="用户ID")
    sec_uid = fields.CharField(null=True, max_length=128, description="用户sec_uid")
    short_user_id = fields.CharField(null=True, max_length=64, description="用户短ID")
    user_unique_id = fields.CharField(null=True, max_length=64, description="用户唯一ID")

    # Author profile columns.
    nickname = fields.CharField(null=True, max_length=64, description="用户昵称")
    avatar = fields.CharField(null=True, max_length=255, description="用户头像地址")
    user_signature = fields.CharField(null=True, max_length=500, description="用户签名")
    ip_location = fields.CharField(null=True, max_length=255, description="评论时的IP地址")

    # Row bookkeeping: creation time and last modification time.
    add_ts = fields.BigIntField(description="记录添加时间戳")
    last_modify_ts = fields.BigIntField(description="记录最后修改时间戳")

    class Meta:
        # Flags this model as abstract; concrete subclasses set their own table.
        abstract = True

class DouyinAweme(DouyinBaseModel):
    """Douyin video record, stored in the ``douyin_aweme`` table."""

    # Video identity and classification.
    aweme_id = fields.CharField(max_length=64, index=True, description="视频ID")
    aweme_type = fields.CharField(max_length=16, description="视频类型")

    # Video text content.
    title = fields.CharField(null=True, max_length=500, description="视频标题")
    desc = fields.TextField(null=True, description="视频描述")

    # Publish timestamp (indexed).
    create_time = fields.BigIntField(description="视频发布时间戳", index=True)

    # Interaction counters; declared as character columns, not integers.
    liked_count = fields.CharField(null=True, max_length=16, description="视频点赞数")
    comment_count = fields.CharField(null=True, max_length=16, description="视频评论数")
    share_count = fields.CharField(null=True, max_length=16, description="视频分享数")
    collected_count = fields.CharField(null=True, max_length=16, description="视频收藏数")

    class Meta:
        table = "douyin_aweme"
        table_description = "抖音视频"

    def __str__(self):
        """Return ``"<aweme_id> - <title>"``."""
        return f"{self.aweme_id} - {self.title}"

def save_data_to_excel(data: Dict, sheet_name: str, file_path: str = r'D:\douyin.xlsx'):
    """Append one record as a row to a sheet of an Excel workbook.

    Fixes over the previous version:

    * The first record is now written when the workbook does not exist yet
      (previously only an empty header row was created and the record was lost).
    * Only the target sheet is replaced; other sheets in the workbook are kept
      (previously the whole file was overwritten with a single sheet).
    * The workbook is closed for reading before it is reopened for writing.
    * The path is a raw string, so ``\\d`` is no longer an invalid escape
      sequence; the resulting path value is unchanged.

    Args:
        data: Flat mapping of column name to value; becomes exactly one row.
        sheet_name: Name of the worksheet to append to. It is created when
            the workbook exists but does not contain it yet.
        file_path: Workbook location. Defaults to the path that was
            previously hard-coded.
    """
    new_row = pd.DataFrame([data])

    if not os.path.exists(file_path):
        # Nothing on disk yet: the new record is the entire sheet.
        new_row.to_excel(file_path, sheet_name=sheet_name, index=False, engine='openpyxl')
        return

    # Read the existing rows, then close the file before writing to it.
    with pd.ExcelFile(file_path, engine='openpyxl') as workbook:
        if sheet_name in workbook.sheet_names:
            existing_rows = workbook.parse(sheet_name)
            combined = pd.concat([existing_rows, new_row], ignore_index=True)
        else:
            combined = new_row

    # Append mode keeps the other sheets; 'replace' swaps out only this sheet.
    with pd.ExcelWriter(file_path, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
        combined.to_excel(writer, sheet_name=sheet_name, index=False)

async def save_aweme_to_excel(aweme_data: Dict):
    """Store one video record in the "aweme" sheet via ``save_data_to_excel``.

    The underlying call is synchronous; this coroutine does not await anything.
    """
    target_sheet = "aweme"
    save_data_to_excel(aweme_data, target_sheet)

async def save_comment_to_excel(comment_data: Dict):
    """Store one comment record in the "comments" sheet via ``save_data_to_excel``.

    The underlying call is synchronous; this coroutine does not await anything.
    """
    target_sheet = "comments"
    save_data_to_excel(comment_data, target_sheet)

async def save_aweme_to_excel(aweme_data: Dict):
    """Append one video record to the 'aweme' sheet of the Excel workbook.

    Fixes over the previous draft:

    * The unconditional "File not found" raise made the create-the-file branch
      unreachable; a missing workbook is now created with the record in it.
    * ``DataFrame.append`` (removed in pandas 2.0) is replaced by ``pd.concat``.
    * Only the 'aweme' sheet is replaced, so other sheets in the workbook
      are kept instead of being dropped by a whole-file rewrite.
    * The path is a raw string (same value, no invalid ``\\d`` escape).

    Args:
        aweme_data: Flat mapping of column name to value; becomes one row.
    """
    file_path = r'D:\douyin.xlsx'
    sheet_name = 'aweme'
    new_row = pd.DataFrame([aweme_data])

    if not os.path.exists(file_path):
        # First record: it becomes the whole sheet.
        new_row.to_excel(file_path, sheet_name=sheet_name, index=False, engine='openpyxl')
        return

    # Read existing rows and close the file before reopening it for writing.
    with pd.ExcelFile(file_path, engine='openpyxl') as workbook:
        if sheet_name in workbook.sheet_names:
            combined = pd.concat([workbook.parse(sheet_name), new_row], ignore_index=True)
        else:
            combined = new_row

    # Append mode keeps the other sheets; 'replace' swaps out only this sheet.
    with pd.ExcelWriter(file_path, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
        combined.to_excel(writer, sheet_name=sheet_name, index=False)

#

async def save_comment_to_excel(comment_data: Dict):
    """Append one comment record to the 'comments' sheet of the Excel workbook.

    Fixes over the previous draft:

    * The unconditional "File not found" raise made the create-the-file branch
      unreachable; a missing workbook is now created with the record in it.
    * ``DataFrame.append`` (removed in pandas 2.0) is replaced by ``pd.concat``.
    * Only the 'comments' sheet is replaced, so other sheets in the workbook
      are kept instead of being dropped by a whole-file rewrite.
    * The path is a raw string (same value, no invalid ``\\d`` escape).

    Args:
        comment_data: Flat mapping of column name to value; becomes one row.
    """
    file_path = r'D:\douyin.xlsx'
    sheet_name = 'comments'
    new_row = pd.DataFrame([comment_data])

    if not os.path.exists(file_path):
        # First record: it becomes the whole sheet.
        new_row.to_excel(file_path, sheet_name=sheet_name, index=False, engine='openpyxl')
        return

    # Read existing rows and close the file before reopening it for writing.
    with pd.ExcelFile(file_path, engine='openpyxl') as workbook:
        if sheet_name in workbook.sheet_names:
            combined = pd.concat([workbook.parse(sheet_name), new_row], ignore_index=True)
        else:
            combined = new_row

    # Append mode keeps the other sheets; 'replace' swaps out only this sheet.
    with pd.ExcelWriter(file_path, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
        combined.to_excel(writer, sheet_name=sheet_name, index=False)

class DouyinAwemeComment(DouyinBaseModel):
    """Douyin video comment record, stored in the ``douyin_aweme_comment`` table."""

    # Comment id and the id of the video it belongs to (both indexed).
    comment_id = fields.CharField(max_length=64, index=True, description="评论ID")
    aweme_id = fields.CharField(max_length=64, index=True, description="视频ID")

    # Comment body and its timestamp.
    content = fields.TextField(null=True, description="评论内容")
    create_time = fields.BigIntField(description="评论时间戳")

    # Reply count; declared as a character column, not an integer.
    sub_comment_count = fields.CharField(max_length=16, description="评论回复数")

    class Meta:
        table = "douyin_aweme_comment"
        table_description = "抖音视频评论"

    def __str__(self):
        """Return ``"<comment_id> - <content>"``."""
        return f"{self.comment_id} - {self.content}"

async def update_douyin_aweme(aweme_item: Dict):
    """Flatten one raw video payload and persist it.

    Builds a flat record from ``aweme_item`` and its ``"author"`` and
    ``"statistics"`` sub-dicts. When ``config.IS_SAVED_DATABASED`` is truthy
    the record is inserted into ``DouyinAweme`` if no row with the same
    ``aweme_id`` exists, otherwise the existing row is updated. When it is
    falsy the record is handed to ``save_aweme_to_excel``.

    Fix: the record is now unpacked with ``**`` into ``create`` and ``update``.
    It was previously passed as a single positional argument, so the column
    values never reached the ORM as keyword arguments.

    Args:
        aweme_item: Raw video dict as returned by the crawler.
    """
    aweme_id = aweme_item.get("aweme_id")
    user_info = aweme_item.get("author", {})
    interact_info = aweme_item.get("statistics", {})
    local_db_item = {
        "aweme_id": aweme_id,
        "aweme_type": aweme_item.get("aweme_type"),
        # Both title and desc are read from the payload's "desc" key.
        "title": aweme_item.get("desc", ""),
        "desc": aweme_item.get("desc", ""),
        "create_time": aweme_item.get("create_time"),
        "user_id": user_info.get("uid"),
        "sec_uid": user_info.get("sec_uid"),
        "short_user_id": user_info.get("short_id"),
        "user_unique_id": user_info.get("unique_id"),
        "user_signature": user_info.get("signature"),
        "nickname": user_info.get("nickname"),
        "avatar": user_info.get("avatar_thumb", {}).get("url_list", [""])[0],
        "liked_count": interact_info.get("digg_count"),
        "collected_count": interact_info.get("collect_count"),
        "comment_count": interact_info.get("comment_count"),
        "share_count": interact_info.get("share_count"),
        "ip_location": aweme_item.get("ip_label", ""),
        "last_modify_ts": utils.get_current_timestamp(),
    }
    print(f"douyin aweme id:{aweme_id}, title:{local_db_item.get('title')}")
    if config.IS_SAVED_DATABASED:
        if not await DouyinAweme.filter(aweme_id=aweme_id).exists():
            # add_ts is only stamped when the row is first created.
            local_db_item["add_ts"] = utils.get_current_timestamp()
            await DouyinAweme.create(**local_db_item)
        else:
            await DouyinAweme.filter(aweme_id=aweme_id).update(**local_db_item)
    else:
        await save_aweme_to_excel(local_db_item)

async def batch_update_dy_aweme_comments(aweme_id: str, comments: List[Dict]):
    """Persist every comment of one video, one after another.

    Does nothing when ``comments`` is empty or otherwise falsy.

    Args:
        aweme_id: Id of the video the comments belong to.
        comments: Raw comment dicts, each passed to ``update_dy_aweme_comment``.
    """
    if comments:
        for single_comment in comments:
            await update_dy_aweme_comment(aweme_id, single_comment)

async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict):
    """Flatten one raw comment payload and persist it.

    The comment is skipped (after printing a notice) when its own
    ``"aweme_id"`` differs from ``aweme_id``. Otherwise a flat record is built
    from the comment and its ``"user"`` sub-dict. When
    ``config.IS_SAVED_DATABASED`` is truthy the record is inserted into
    ``DouyinAwemeComment`` if no row with the same ``comment_id`` exists,
    otherwise the existing row is updated. When it is falsy the record is
    handed to ``save_comment_to_excel``.

    Fix: the record is now unpacked with ``**`` into ``create`` and ``update``.
    It was previously passed as a single positional argument, so the column
    values never reached the ORM as keyword arguments.

    Args:
        aweme_id: Id of the video the comment is expected to belong to.
        comment_item: Raw comment dict as returned by the crawler.
    """
    comment_aweme_id = comment_item.get("aweme_id")
    if aweme_id != comment_aweme_id:
        print(f"comment_aweme_id: {comment_aweme_id} != aweme_id: {aweme_id}")
        return
    user_info = comment_item.get("user", {})
    comment_id = comment_item.get("cid")
    # Use the first avatar variant that is present and non-empty.
    avatar_info = user_info.get("avatar_medium", {}) or user_info.get("avatar_300x300", {}) or user_info.get(
        "avatar_168x168", {}) or user_info.get("avatar_thumb", {}) or {}
    local_db_item = {
        "comment_id": comment_id,
        "create_time": comment_item.get("create_time"),
        "ip_location": comment_item.get("ip_label", ""),
        "aweme_id": aweme_id,
        "content": comment_item.get("text"),
        # NOTE(review): DouyinAwemeComment as declared in this file has no
        # content_extra column - confirm how the ORM treats this extra key.
        "content_extra": json.dumps(comment_item.get("text_extra", [])),
        "user_id": user_info.get("uid"),
        "sec_uid": user_info.get("sec_uid"),
        "short_user_id": user_info.get("short_id"),
        "user_unique_id": user_info.get("unique_id"),
        "user_signature": user_info.get("signature"),
        "nickname": user_info.get("nickname"),
        "avatar": avatar_info.get("url_list", [""])[0],
        "sub_comment_count": comment_item.get("reply_comment_total", 0),
        "last_modify_ts": utils.get_current_timestamp(),
    }
    print(f"douyin aweme comment: {comment_id}, content: {local_db_item.get('content')}")
    if config.IS_SAVED_DATABASED:
        if not await DouyinAwemeComment.filter(comment_id=comment_id).exists():
            # add_ts is only stamped when the row is first created.
            local_db_item["add_ts"] = utils.get_current_timestamp()
            await DouyinAwemeComment.create(**local_db_item)
        else:
            await DouyinAwemeComment.filter(comment_id=comment_id).update(**local_db_item)
    else:
        await save_comment_to_excel(local_db_item)

NanmiCoder commented 1 year ago

抖音评论爬取失效了,等待后续更新

NanmiCoder commented 1 year ago

fix it