# 使用正则表达式匹配各个部分

pattern = r'^(.+?)\s+\[(.*?)\]\s+\[([^\]]*)\]\s+\[([^\]]*)\]\s+\[([^\]]*)\]\s+\[([^\]]*)\]$'
match = re.match(pattern, line)

if not match:
    raise ValueError("Invalid line format")

# 提取匹配的各部分
url = match.group(1)
status_code = int(match.group(2).strip('[]'))
title = match.group(3).strip('[]')
server = match.group(4).strip('[]')
jump_url = match.group(5).strip('[]')
content_length = match.group(6).strip('[]')

# 如果 status_code 不是 302 或 301，则将 jump_url 设置为空字符串
if status_code not in [301, 302]:
    jump_url = ''

return {
    'url': url,
    'status_code': status_code,
    'title': title,
    'server': server,
    'jump_url': jump_url,
    'content_length': content_length
}

def process_file_and_write_to_excel(input_file_path, output_excel_path):
    """Parse every line of a text file and export the results to Excel.

    Each line is stripped of surrounding whitespace and passed to
    ``parse_line``. A line that makes ``parse_line`` raise ``ValueError`` is
    reported on stdout together with its 1-based line number and is skipped,
    so a single malformed line does not abort the whole export.

    Args:
        input_file_path: Path of the text file to read (opened as UTF-8).
        output_excel_path: Path of the Excel file to write.
    """
    data = []

    # Read the raw text file and parse it line by line.
    with open(input_file_path, 'r', encoding='utf-8') as infile:
        for line_number, line in enumerate(infile, start=1):
            stripped = line.strip()
            try:
                data.append(parse_line(stripped))
            except ValueError as e:
                print(f"Error parsing line {line_number}: {e} - Line: '{stripped}'")

    # Use a pandas DataFrame to write the collected rows to the Excel file;
    # index=False keeps the DataFrame's row index out of the sheet.
    df = pd.DataFrame(data)
    df.to_excel(output_excel_path, index=False)

def main():
    """Convert ``input.txt`` into ``output.xlsx`` using fixed relative paths."""
    input_file_path = 'input.txt'
    output_excel_path = 'output.xlsx'

    # Process the file directly and write the result to the Excel file.
    process_file_and_write_to_excel(input_file_path, output_excel_path)

# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

muddlelife commented 1 week ago

感谢支持，下个版本会加上

muddlelife commented 4 days ago

v1.5.0版本已经支持了将批量扫描结果导出为csv格式

muddlelife / windfire

增加xlsx格式导出结果 #3

# 使用正则表达式匹配各个部分