Open rikuto125 opened 2 weeks ago
import html
import os
import re

from bs4 import BeautifulSoup, Tag
def split_html(input_file, output_dir='split_html_output'):
    """Split an HTML file into one standalone HTML file per heading.

    Every ``h1``/``h2``/``h3`` element returned by ``soup.find_all`` starts a
    section. A section is the heading itself plus the siblings that follow
    it, up to (but not including) the next sibling that is an
    ``h1``/``h2``/``h3`` tag. Whitespace-only text nodes are dropped. Each
    section is wrapped in a minimal HTML skeleton and written to
    ``<output_dir>/section_<n>_<tag>_<title>.html``, where ``<n>`` is the
    1-based position of the heading in the ``find_all`` result and
    ``<title>`` is the first 30 characters of the heading text with
    filename-unsafe characters replaced by ``_``.

    Args:
        input_file: Path of the HTML file to split; it is read as UTF-8.
        output_dir: Directory that receives the section files. It is created
            when it does not exist. Defaults to ``'split_html_output'``.

    Returns:
        None. A confirmation message is printed once all sections are
        written.
    """
    heading_names = ['h1', 'h2', 'h3']

    with open(input_file, 'r', encoding='utf-8') as source:
        markup = source.read()
    soup = BeautifulSoup(markup, 'html.parser')
    headers = soup.find_all(heading_names)

    os.makedirs(output_dir, exist_ok=True)

    for index, header in enumerate(headers, start=1):
        # Collect everything from the current heading up to the next heading.
        section_nodes = [header]
        node = header.next_sibling
        # Compare against None explicitly: an empty text node is falsy and
        # must not end the section early.
        while node is not None and not (
            isinstance(node, Tag) and node.name in heading_names
        ):
            # Keep tags and non-blank text; skip whitespace-only text nodes.
            if isinstance(node, Tag) or (isinstance(node, str) and node.strip()):
                section_nodes.append(node)
            node = node.next_sibling

        # Build the output HTML file.
        title = header.text.strip()
        # Heading text may contain path separators or other characters that
        # are invalid in file names; replace them so open() cannot fail or
        # escape output_dir.
        safe_title = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', title)[:30]
        filename = os.path.join(
            output_dir, f"section_{index}_{header.name}_{safe_title}.html"
        )
        with open(filename, 'w', encoding='utf-8') as out:
            # header.text is decoded text, so it has to be escaped again
            # before being embedded in markup.
            out.write(
                "<!DOCTYPE html>\n<html>\n<head>\n"
                f"<title>{html.escape(title)}</title>\n</head>\n<body>\n"
            )
            out.write(''.join(str(elem) for elem in section_nodes))
            out.write("\n</body>\n</html>")

    print(f"HTMLファイルが {output_dir} ディレクトリに分割されました。")
# Example usage of the script
if __name__ == "__main__":
    import sys

    # Take the input path from the first command-line argument when one is
    # given; otherwise fall back to the placeholder, which must be edited to
    # point at a real HTML file.
    input_file = sys.argv[1] if len(sys.argv) > 1 else "path/to/your/input.html"
    split_html(input_file)