Open DeepFal opened 1 week ago
I've patched it like this for now as a stopgap; looking forward to a proper fix for this kind of problem.
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import os, sys
import pandas as pd
from core.spacy_utils.load_nlp_model import init_nlp
from core.config_utils import load_key, get_joiner
from rich import print

# Maximum byte limit per tokenizer call
MAX_LENGTH = 49000

def split_text_into_chunks(text, max_length):
    """Split the text at punctuation into chunks of no more than max_length bytes each."""
    chunks = []
    current_chunk = []
    current_length = 0
    # Rough first split on the full stop; the separator can be changed as needed for other Chinese punctuation
    sentences = text.split('。')
    for sentence in sentences:
        sentence_length = len(sentence.encode('utf-8'))
        # If adding this sentence would push the current chunk over the limit, save it and start a new one
        if current_length + sentence_length > max_length:
            chunks.append('。'.join(current_chunk) + '。')
            current_chunk = [sentence]
            current_length = sentence_length
        else:
            current_chunk.append(sentence)
            current_length += sentence_length
    # Flush whatever is left over
    if current_chunk:
        chunks.append('。'.join(current_chunk) + '。')
    return chunks

def split_by_mark(nlp):
    whisper_language = load_key("whisper.language")
    language = load_key("whisper.detected_language") if whisper_language == 'auto' else whisper_language
    joiner = get_joiner(language)
    print(f"[blue]🔍 Using {language} language joiner: '{joiner}'[/blue]")
    # Read and clean the data
    chunks = pd.read_excel("output/log/cleaned_chunks.xlsx")
    chunks.text = chunks.text.apply(lambda x: x.strip('"'))
    # Join all the text together, then split it into chunks under the byte limit
    input_text = joiner.join(chunks.text.to_list())
    text_chunks = split_text_into_chunks(input_text, MAX_LENGTH)
    sentences_by_mark = []
    # Run the NLP model on each chunk separately
    for chunk in text_chunks:
        doc = nlp(chunk)
        assert doc.has_annotation("SENT_START")
        sentences_by_mark.extend([sent.text for sent in doc.sents])
    # Write the results to file
    with open("output/log/sentence_by_mark.txt", "w", encoding="utf-8") as output_file:
        for i, sentence in enumerate(sentences_by_mark):
            if i > 0 and sentence.strip() in [',', '.', ',', '。', '?', '!']:
                # Lone punctuation mark: seek back over the trailing newline and append it to the previous line
                output_file.seek(output_file.tell() - 1, os.SEEK_SET)
                output_file.write(sentence)
            else:
                output_file.write(sentence + "\n")
    print("[green]💾 Sentences split by punctuation marks saved to → `sentences_by_mark.txt`[/green]")

if __name__ == "__main__":
    nlp = init_nlp()
    split_by_mark(nlp)
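As a side note, the per-chunk loop above could also go through spaCy's nlp.pipe, which is the usual way to stream many texts through a pipeline. A minimal sketch of that variant, reusing the text_chunks and sentences_by_mark names from the workaround above:

# Optional variant of the per-chunk loop, not part of the original workaround:
# nlp.pipe processes the chunks in batches instead of calling nlp() once per chunk.
sentences_by_mark = []
for doc in nlp.pipe(text_chunks):
    assert doc.has_annotation("SENT_START")
    sentences_by_mark.extend(sent.text for sent in doc.sents)

The output is the same either way; nlp.pipe just avoids repeated per-call overhead when there are many chunks.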
Japanese support isn't great: first, Whisper's punctuation for Japanese is poor, and second, LLM support for Japanese isn't perfect either. I'll try swapping in a wav2vec alignment model later.
With some bumps along the way I finished a 3-hour musical; overall it turned out okay. I can DM you the final result if you want to take a look. I ran it through a Sonnet API endpoint I found myself.
2024-11-07 21:11:45.306 Uncaught app exception
Traceback (most recent call last):
  File "C:\Users\deepf\anaconda3\envs\videolingo\lib\site-packages\streamlit\runtime\scriptrunner\exec_code.py", line 88, in exec_func_with_error_handling
    result = func()
  File "C:\Users\deepf\anaconda3\envs\videolingo\lib\site-packages\streamlit\runtime\scriptrunner\script_runner.py", line 590, in code_to_exec
    exec(code, module.__dict__)
  File "C:\Users\deepf\Desktop\VideoLingo\VideoLingo\st.py", line 117, in <module>
    main()
  File "C:\Users\deepf\Desktop\VideoLingo\VideoLingo\st.py", line 113, in main
    text_processing_section()
  File "C:\Users\deepf\Desktop\VideoLingo\VideoLingo\st.py", line 30, in text_processing_section
    process_text()
  File "C:\Users\deepf\Desktop\VideoLingo\VideoLingo\st.py", line 47, in process_text
    step3_1_spacy_split.split_by_spacy()
  File "C:\Users\deepf\Desktop\VideoLingo\VideoLingo\core\step3_1_spacy_split.py", line 17, in split_by_spacy
    split_by_mark(nlp)
  File "C:\Users\deepf\Desktop\VideoLingo\VideoLingo\core\spacy_utils\split_by_mark.py", line 21, in split_by_mark
    doc = nlp(input_text)
  File "C:\Users\deepf\anaconda3\envs\videolingo\lib\site-packages\spacy\language.py", line 1037, in __call__
    doc = self._ensure_doc(text)
  File "C:\Users\deepf\anaconda3\envs\videolingo\lib\site-packages\spacy\language.py", line 1128, in _ensure_doc
    return self.make_doc(doc_like)
  File "C:\Users\deepf\anaconda3\envs\videolingo\lib\site-packages\spacy\language.py", line 1120, in make_doc
    return self.tokenizer(text)
  File "C:\Users\deepf\anaconda3\envs\videolingo\lib\site-packages\spacy\lang\ja\__init__.py", line 56, in __call__
    sudachipy_tokens = self.tokenizer.tokenize(text)
Exception: Tokenization error: Input is too long, it can't be more than 49149 bytes, was 116123
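For context, the last frame shows the limit being hit inside spaCy's Japanese tokenizer, which delegates to SudachiPy, so any single call with more than 49149 bytes of UTF-8 fails regardless of nlp.max_length. A minimal sketch that should reproduce the error, assuming sudachipy and sudachidict_core are installed (spacy.blank("ja") uses the same SudachiPy-backed tokenizer the traceback points at):

# Illustrative reproduction, not from the repo:
import spacy

nlp = spacy.blank("ja")                      # Japanese pipeline backed by SudachiPy
too_long = "これはテストです。" * 10000        # ~270,000 bytes of UTF-8, far past the limit
print(len(too_long.encode("utf-8")))
try:
    nlp(too_long)
except Exception as e:
    print(e)                                 # expect the same "Input is too long ... 49149 bytes" message

Splitting the input into sub-49149-byte chunks before calling nlp, as in the workaround above, avoids this.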