delete css tag - Githubissues

from bs4 import BeautifulSoup
import re

def clean_html(input_file, output_file):
    """Strip CSS-related markup from an HTML file and save the result.

    Removes every ``<style>`` element, every ``<link>`` element whose
    ``rel`` matches ``stylesheet``, and the ``class``, ``height``,
    ``width`` and ``style`` attributes from all remaining tags. A
    confirmation message is printed once the output file is written.

    Args:
        input_file: Path of the HTML file to read (decoded as UTF-8).
        output_file: Path the cleaned HTML is written to (encoded as UTF-8).
    """
    # Load the HTML file.
    with open(input_file, 'r', encoding='utf-8') as f:
        html_content = f.read()

    # Parse it into a BeautifulSoup tree.
    soup = BeautifulSoup(html_content, 'html.parser')

    # Remove <style> elements.
    for style in soup.find_all('style'):
        style.decompose()

    # Remove <link> elements that reference a stylesheet.
    for link in soup.find_all('link', rel='stylesheet'):
        link.decompose()

    # Remove presentation attributes (including inline styles) from every tag.
    for tag in soup.find_all(True):
        for attr in ('class', 'height', 'width', 'style'):
            tag.attrs.pop(attr, None)

    # The style attributes are removed on the parsed tree above. A textual
    # regex pass over the serialized output is deliberately not applied: it
    # would also rewrite unrelated attributes such as data-style="..." and
    # any matching text inside the document (code samples, scripts).
    cleaned_html = str(soup)

    # Save the cleaned HTML to the output file.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(cleaned_html)

    print(f"クリーンされたHTMLが '{output_file}' として保存されました。")

# Usage example: name of the HTML file to process, and name of the output file.
input_file, output_file = 'hoge.html', 'hoge_cleaned_output.html'
clean_html(input_file=input_file, output_file=output_file)

import html
import os
import re

from bs4 import BeautifulSoup, Tag


def split_html(input_file):
    """Split an HTML file into one file per h1/h2/h3 heading.

    For every ``h1``, ``h2`` or ``h3`` element found in the document, the
    heading and the siblings that follow it (up to, but not including, the
    next sibling that is itself an ``h1``/``h2``/``h3``) are written to a
    standalone HTML file in the ``split_html_output`` directory, which is
    created if it does not exist. A confirmation message is printed at the
    end.

    Args:
        input_file: Path of the HTML file to read (decoded as UTF-8).
    """
    with open(input_file, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file.read(), 'html.parser')

    heading_names = ['h1', 'h2', 'h3']
    headers = soup.find_all(heading_names)

    output_dir = 'split_html_output'
    os.makedirs(output_dir, exist_ok=True)

    for i, header in enumerate(headers):
        # Collect everything from this heading up to the next heading.
        section = [header]
        sibling = header.next_sibling
        # Compare against None explicitly: an empty text node is falsy and
        # must not end the walk early.
        while sibling is not None:
            if isinstance(sibling, Tag) and sibling.name in heading_names:
                break
            # Keep tags and non-blank text; skip whitespace-only text nodes.
            if isinstance(sibling, Tag) or (isinstance(sibling, str) and sibling.strip()):
                section.append(sibling)
            sibling = sibling.next_sibling

        title = header.text.strip()
        # Heading text is free-form; replace characters that are path
        # separators or otherwise invalid in file names so open() cannot
        # fail or write outside output_dir.
        safe_title = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', title[:30])
        filename = f"{output_dir}/section_{i+1}_{header.name}_{safe_title}.html"

        # Write the section as a standalone HTML document.
        with open(filename, 'w', encoding='utf-8') as out:
            # header.text is unescaped text, so escape it before embedding
            # it in the <title> element.
            out.write(
                "<!DOCTYPE html>\n<html>\n<head>\n"
                f"<title>{html.escape(title, quote=False)}</title>\n"
                "</head>\n<body>\n"
            )
            out.write(''.join(str(elem) for elem in section))
            out.write("\n</body>\n</html>")

    print(f"HTMLファイルが {output_dir} ディレクトリに分割されました。")


# Script usage example.
if __name__ == "__main__":
    input_file = "path/to/your/input.html"  # Set this to the path of the input file.
    split_html(input_file)

rikuto125 / -

delete css tag #6