Huanshere / VideoLingo

Netflix-level subtitle cutting, translation, alignment, and even dubbing - one-click fully automated AI video subtitle team
https://docs.videolingo.io
Apache License 2.0

Error when processing a long Japanese video #238

Open · DeepFal opened this issue 1 week ago

DeepFal commented 1 week ago

```
2024-11-07 21:11:45.306 Uncaught app exception
Traceback (most recent call last):
  File "C:\Users\deepf\anaconda3\envs\videolingo\lib\site-packages\streamlit\runtime\scriptrunner\exec_code.py", line 88, in exec_func_with_error_handling
    result = func()
  File "C:\Users\deepf\anaconda3\envs\videolingo\lib\site-packages\streamlit\runtime\scriptrunner\script_runner.py", line 590, in code_to_exec
    exec(code, module.__dict__)
  File "C:\Users\deepf\Desktop\VideoLingo\VideoLingo\st.py", line 117, in <module>
    main()
  File "C:\Users\deepf\Desktop\VideoLingo\VideoLingo\st.py", line 113, in main
    text_processing_section()
  File "C:\Users\deepf\Desktop\VideoLingo\VideoLingo\st.py", line 30, in text_processing_section
    process_text()
  File "C:\Users\deepf\Desktop\VideoLingo\VideoLingo\st.py", line 47, in process_text
    step3_1_spacy_split.split_by_spacy()
  File "C:\Users\deepf\Desktop\VideoLingo\VideoLingo\core\step3_1_spacy_split.py", line 17, in split_by_spacy
    split_by_mark(nlp)
  File "C:\Users\deepf\Desktop\VideoLingo\VideoLingo\core\spacy_utils\split_by_mark.py", line 21, in split_by_mark
    doc = nlp(input_text)
  File "C:\Users\deepf\anaconda3\envs\videolingo\lib\site-packages\spacy\language.py", line 1037, in __call__
    doc = self._ensure_doc(text)
  File "C:\Users\deepf\anaconda3\envs\videolingo\lib\site-packages\spacy\language.py", line 1128, in _ensure_doc
    return self.make_doc(doc_like)
  File "C:\Users\deepf\anaconda3\envs\videolingo\lib\site-packages\spacy\language.py", line 1120, in make_doc
    return self.tokenizer(text)
  File "C:\Users\deepf\anaconda3\envs\videolingo\lib\site-packages\spacy\lang\ja\__init__.py", line 56, in __call__
    sudachipy_tokens = self.tokenizer.tokenize(text)
Exception: Tokenization error: Input is too long, it can't be more than 49149 bytes, was 116123
```
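The root cause is a hard input-size limit in SudachiPy, the tokenizer backing spaCy's Japanese pipeline: it rejects any input over 49149 bytes of UTF-8, as the traceback shows. A minimal sketch to confirm the limit (assuming `spacy`, `sudachipy`, and `sudachidict_core` are installed; the byte threshold is taken from the traceback above, not from documentation):

```python
# Minimal reproduction sketch: spaCy's Japanese pipeline delegates to
# SudachiPy, which raises once the input exceeds 49149 bytes of UTF-8.
import spacy

nlp = spacy.blank("ja")  # requires sudachipy + sudachidict_core

# "これはテストです。" is 27 bytes in UTF-8, so 2000 copies is ~54 KB,
# comfortably past the 49149-byte limit reported in the traceback.
long_text = "これはテストです。" * 2000

try:
    nlp(long_text)
except Exception as e:
    print(e)  # Tokenization error: Input is too long, ...
```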


DeepFal commented 1 week ago

For now I've patched it like this to get by; looking forward to a proper fix for this kind of input.

```python
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import os, sys
import pandas as pd
from core.spacy_utils.load_nlp_model import init_nlp
from core.config_utils import load_key, get_joiner
from rich import print

# Cap on chunk size in bytes, kept safely below SudachiPy's 49149-byte limit
MAX_LENGTH = 49000

def split_text_into_chunks(text, max_length):
    """Split text on punctuation into chunks of at most max_length bytes each."""
    chunks = []
    current_chunk = []
    current_length = 0

    # Rough pre-split on the Japanese full stop; change the delimiter
    # as needed for other punctuation conventions.
    sentences = text.split('。')

    for sentence in sentences:
        sentence_length = len(sentence.encode('utf-8'))

        # If adding this sentence would push the chunk over the limit,
        # flush the current chunk and start a new one (the current_chunk
        # guard avoids emitting a stray empty '。' chunk when the very
        # first sentence already exceeds the limit)
        if current_chunk and current_length + sentence_length > max_length:
            chunks.append('。'.join(current_chunk) + '。')
            current_chunk = [sentence]
            current_length = sentence_length
        else:
            current_chunk.append(sentence)
            current_length += sentence_length

    # Flush whatever is left over
    if current_chunk:
        chunks.append('。'.join(current_chunk) + '。')

    return chunks

def split_by_mark(nlp):
    whisper_language = load_key("whisper.language")
    language = load_key("whisper.detected_language") if whisper_language == 'auto' else whisper_language
    joiner = get_joiner(language)
    print(f"[blue]🔍 Using {language} language joiner: '{joiner}'[/blue]")

    # Read and clean the transcribed chunks
    chunks = pd.read_excel("output/log/cleaned_chunks.xlsx")
    chunks.text = chunks.text.apply(lambda x: x.strip('"'))

    # Join all text, then split it into byte-limited chunks
    input_text = joiner.join(chunks.text.to_list())
    text_chunks = split_text_into_chunks(input_text, MAX_LENGTH)

    sentences_by_mark = []

    # Run the NLP model on each chunk separately
    for chunk in text_chunks:
        doc = nlp(chunk)
        assert doc.has_annotation("SENT_START")
        sentences_by_mark.extend([sent.text for sent in doc.sents])

    # Write the results to file; a line holding only punctuation is merged
    # back onto the previous line by seeking over the trailing newline
    with open("output/log/sentence_by_mark.txt", "w", encoding="utf-8") as output_file:
        for i, sentence in enumerate(sentences_by_mark):
            if i > 0 and sentence.strip() in [',', '.', ',', '。', '?', '!']:
                output_file.seek(output_file.tell() - 1, os.SEEK_SET)
                output_file.write(sentence)
            else:
                output_file.write(sentence + "\n")

    print("[green]💾 Sentences split by punctuation marks saved to →  `sentences_by_mark.txt`[/green]")

if __name__ == "__main__":
    nlp = init_nlp()
    split_by_mark(nlp)
```
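One edge case this workaround doesn't cover: if a single sentence (text between two '。') is itself longer than MAX_LENGTH bytes, the resulting chunk still exceeds the limit and SudachiPy will raise again. A sketch of a possible fallback that hard-splits such a sentence on character boundaries (illustrative only, not part of the project's code; the helper name `split_oversized` is hypothetical):

```python
# Hypothetical fallback: hard-split one over-long sentence so no piece
# exceeds max_length bytes of UTF-8. Splitting on codepoints rather than
# raw bytes avoids producing invalid UTF-8 mid-character.
def split_oversized(sentence: str, max_length: int) -> list[str]:
    pieces, piece, piece_len = [], [], 0
    for ch in sentence:
        ch_len = len(ch.encode('utf-8'))
        if piece and piece_len + ch_len > max_length:
            pieces.append(''.join(piece))
            piece, piece_len = [], 0
        piece.append(ch)
        piece_len += ch_len
    if piece:
        pieces.append(''.join(piece))
    return pieces

# Example: a 60,000-byte run with no '。' still comes back in safe pieces.
print([len(p.encode('utf-8')) for p in split_oversized('あ' * 20000, 49000)])
# -> [48999, 11001]
```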


Huanshere commented 1 week ago

Japanese support isn't great right now: first, Whisper's punctuation for Japanese is poor, and second, LLM support for Japanese isn't perfect either. We'll try swapping in a wav2vec alignment model later.

DeepFal commented 1 week ago

Finished a 3-hour musical, with some bumps along the way; overall it turned out pretty well. I can DM you the finished product if you'd like to see it. Ran it on a Sonnet API endpoint I sourced myself.