通过av号或bv号获取在线弹幕及使用python内置xml库处理xml数据

SennriSyunnga / Video-editing-auxiliary-tool-based-on-barrage-analysis-

原理是这样子的：假设观众的弹幕是大众趣味的正确反馈，在精彩时刻就会有大量特定弹幕，那么可以反向通过统计一段时间内特定弹幕的数量，来识别出一个长视频中受欢迎的部分。统计是通过输入关键词来进行的，例如对于特别有趣的片段，观众倾向于发送“草”这一弹幕。假设在一段时间内，草的弹幕超出了预设的限制（limit），工具应当记下该时间轴，并提供给剪辑者一个参考时间轴。在一段时间内，由于网络延迟的问题，弹幕的出现不是同步的，观众见证一个亮点时刻时，作为反应的弹幕也会分散在数秒之中。因此将统计的范围放到当前时刻开始的5秒之内。只要总计的弹幕达到了阈值（limit）就记下当前时刻，并且从20秒间隔（interval）后重新开始新一轮判定。若当前时刻并不达标，将当前时间往后推进1秒。之所以设置了20秒的冷却间隔（interval），是为了减少重复记轴——精彩镜头从来不是瞬间，而是从某个时间点开始的时间段。如果不设置冷却时间，那么相邻的数秒都会被纳入统计，这样输出的数个结果的参考意义是很有限的。统计开始位置容易，统计结束位置很难，但是结束位置可以人为判断，未必要越俎代庖。因此这个工具旨在提供一个“精彩片段的开始大致时间”，给剪辑者、补长视频的观众作为跳转参考。这样一来工具的运行时间也能一定程度上减少。

MIT License

7 stars 3 forks source link

import requests  # third-party library; install it with pip


def getDanmu(vid):
    """Fetch the danmu list for the first page of a Bilibili video.

    The video's page list is requested first (by BV id or by av number),
    then the danmu list is requested for the ``cid`` of the first page.

    Args:
        vid: Video id as a string: ``"BV..."``, ``"av<digits>"`` or bare digits.

    Returns:
        The ``requests.Response`` of the danmu list request.

    Raises:
        Exception: If ``vid`` matches none of the accepted forms, or if the
            page list request did not succeed (the message is the HTTP reason).
    """
    if vid.startswith("BV"):  # BV id
        resp = requests.get(f"https://api.bilibili.com/x/player/pagelist?bvid={vid}")
    elif vid.startswith("av"):  # av number with the "av" prefix
        resp = requests.get(f"https://api.bilibili.com/x/player/pagelist?aid={vid[2:]}")
    elif vid.isdigit():  # av number given as digits only
        resp = requests.get(f"https://api.bilibili.com/x/player/pagelist?aid={vid}")
    else:
        raise Exception("无效的av号或bv号")
    if resp.ok:
        data = resp.json()
        # Only the first page of the video is used.
        return requests.get(f"https://api.bilibili.com/x/v1/dm/list.so?oid={data['data'][0]['cid']}")
    else:
        raise Exception(resp.reason)

昨天想了一下，如果只是粗略地看弹幕浓度的话，可以借鉴Bilibili视频底部弹幕曲线的思路，用matplotlib做一个折线图或者柱形图统计关键弹幕。简单写了一个例子。

import math
import re
import requests  # 第三方库，通过pip安装
import xml.etree.ElementTree as ET

from datetime import timedelta

import matplotlib.pyplot as plt  # 第三方库，通过pip安装
from matplotlib.ticker import FuncFormatter, StrMethodFormatter  # 第三方库，通过pip安装

plt.rcParams["font.sans-serif"] = ["SimHei"]  # use a font that can display Chinese and Japanese characters

# Common prefix of every Bilibili endpoint used below.
BILIBILI_API = "https://api.bilibili.com/x"
# Page list lookup by av number; the numeric id is appended to the URL.
BILIBILI_API_PAGELIST_AVID = BILIBILI_API + "/player/pagelist?aid="
# Page list lookup by BV id; the id is appended to the URL.
BILIBILI_API_PAGELIST_BVID = BILIBILI_API + "/player/pagelist?bvid="
# Danmu list endpoint; the cid of a page is appended to the URL.
BILIBILI_API_DANMU_LIST = BILIBILI_API + "/v1/dm/list.so?oid="

def timeformatter(y, pos):
    """Format a tick value given in seconds as an ``H:MM:SS`` style string.

    Args:
        y: Tick value in seconds; the fractional part is dropped.
        pos: Tick position (second argument of the formatter callback); unused.

    Returns:
        ``str(timedelta)`` of the whole seconds, or "无效时间" when ``y`` is
        not greater than or equal to zero.
    """
    # Written as "not >=" so that NaN also takes the invalid-time branch.
    if not y >= 0:
        return "无效时间"
    whole_seconds = math.floor(y)
    return str(timedelta(seconds=whole_seconds))

# Tick formatter that renders axis values through timeformatter.
formatter = FuncFormatter(timeformatter)

def getDanmu(vid, timeout=10):
    """Fetch the danmu XML response for the first page of a Bilibili video.

    The page list of the video is requested first (by BV id or av number),
    then the danmu list is requested for the ``cid`` of the first page.

    Args:
        vid: Video id as a string: ``"BV..."``, ``"av<digits>"`` or bare digits.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The ``requests.Response`` of the danmu list request, or ``None`` when
        ``vid`` has an unsupported form, a request fails, or the page list
        payload does not contain a first page with a ``cid``.
    """
    if vid.startswith("BV"):
        pagelist_url = f"{BILIBILI_API_PAGELIST_BVID}{vid}"
    elif vid.startswith("av"):
        pagelist_url = f"{BILIBILI_API_PAGELIST_AVID}{vid[2:]}"
    elif vid.isdigit():
        pagelist_url = f"{BILIBILI_API_PAGELIST_AVID}{vid}"
    else:
        return None

    try:
        # Without a timeout a stalled connection would block forever.
        resp = requests.get(pagelist_url, timeout=timeout)
        if not resp.ok:
            return None
        # A successful HTTP status does not guarantee a usable payload:
        # "data" may be missing, null or empty, which is reported as None
        # like every other failure of this function.
        cid = resp.json()["data"][0]["cid"]
        return requests.get(f"{BILIBILI_API_DANMU_LIST}{cid}", timeout=timeout)
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        return None

# Translation table: ideographic space (U+3000) -> ASCII space, and the
# full-width forms U+FF01..U+FF5E -> their ASCII counterparts (offset 0xFEE0).
_FULLWIDTH_TO_HALFWIDTH = {0x3000: 0x20}
_FULLWIDTH_TO_HALFWIDTH.update(
    {code: code - 0xFEE0 for code in range(0xFF01, 0xFF5E + 1)}
)


def strQ2B(text):
    """Convert full-width characters in ``text`` to their half-width forms.

    Args:
        text: String to convert. ``None`` is accepted and treated as empty,
            because an XML element without content has ``text`` set to None.

    Returns:
        The converted string; characters outside the full-width ranges are
        left unchanged.
    """
    if not text:
        return ""
    return text.translate(_FULLWIDTH_TO_HALFWIDTH)

# Keywords searched for in each danmu; "|" only separates the entries.
keywords = "kksk|草|めあ|mea|谁|誰|??".split("|")
# Keywords that mean the same thing share one label, and so one histogram series.
similar_words = {"めあ": "めあ/mea", "mea": "めあ/mea", "谁": "谁/誰", "誰": "谁/誰"}
# label -> list of timestamps (seconds) of the danmu that matched it.
matches = {similar_words.get(keyword, keyword): [] for keyword in keywords}

# One case-insensitive pattern per label, compiled once instead of once per
# danmu. re.escape makes every keyword a literal (e.g. "??"), and merging the
# synonyms into one alternation means a danmu containing several synonyms is
# counted once for the label rather than once per synonym.
_label_alternatives = {}
for keyword in keywords:
    _label_alternatives.setdefault(similar_words.get(keyword, keyword), []).append(
        re.escape(keyword)
    )
_label_patterns = {
    label: re.compile("|".join(alternatives), re.I)
    for label, alternatives in _label_alternatives.items()
}

last = 0  # time of the latest danmu seen so far, rounded up to whole seconds

danmuxml = getDanmu("BV1RJ411C7jy")
if danmuxml and danmuxml.ok:
    root = ET.fromstring(danmuxml.content)
    for danmu in root.findall("d"):
        # The first comma-separated field of the "p" attribute is the danmu time.
        t = float(danmu.attrib["p"].split(",")[0])
        if t > last:
            last = math.ceil(t)
        # An empty <d/> element has text None; treat it as an empty string.
        # Full-width symbols, digits and letters are folded to half-width
        # before matching.
        text = strQ2B(danmu.text or "")
        for label, pattern in _label_patterns.items():
            if pattern.search(text):
                matches[label].append(t)
    # Draw the histogram.
    fig, ax = plt.subplots()
    ax.set_xlabel("时间")
    ax.set_ylabel("个数")
    # Optional: time axis from 00:00:00 to the last danmu. A right limit of 0
    # would make both limits identical, so fall back to 1 second.
    ax.set_xlim(left=0, right=last if last > 0 else 1)
    ax.set_ylim(bottom=8, top=32)  # optional: lower and upper bound of the counts
    ax.xaxis.set_major_formatter(formatter)
    ax.yaxis.set_major_formatter(StrMethodFormatter("{x:.0f}"))
    plt.hist(
        # Real lists instead of dict views: the views are not sequences, and
        # the label argument in particular is coerced through numpy.
        list(matches.values()),  # one series of timestamps per label
        label=list(matches.keys()),  # legend entry of each series
        bins=256,  # number of bars
    )
    plt.legend()
    ax.set_title("关键弹幕次数统计")
    plt.show()
else:
    print("无法获取弹幕xml")

SennriSyunnga / Video-editing-auxiliary-tool-based-on-barrage-analysis-

通过av号或bv号获取在线弹幕及使用python内置xml库处理xml数据 #1