ghosert / cmd-editor

报告 【Cmd 在线 Markdown 编辑阅读器】的问题,臭虫,改进建议
703 stars 62 forks source link

🤡写了个自动备份的脚本,取需,有问题可以留言~ #2647

Closed coder-pig closed 7 months ago

coder-pig commented 12 months ago

支持将作者发的备份数据(xlsx 文件)批量生成 md 文件,并将 md 文件本地化(图片下载到本地)。需要 Python 环境,先安装依赖库:

pip install aiofiles==23.2.1
pip install aiohttp_requests==0.1.3
pip install openpyxl==3.1.2

复制下述代码保存为 zybl_xlsx_to_local_md.py 文件,把备份数据xlsx放到同一目录下:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
   File     : zybl_xlsx_to_local_md.py
   Author   : CoderPig
   date     : 2023-09-22 17:08
   Desc     : cmd markdown(作业部落) 数据备份xlsx文件 批量生成本地MD文件的脚本
-------------------------------------------------
"""
import asyncio
import os
import re
import time
from functools import partial

import aiofiles
from aiohttp_requests import requests
from openpyxl import load_workbook

# Output directory for the raw md files generated from the xlsx backup.
origin_md_dir = os.path.join(os.getcwd(), "origin_md")
# Output directory for the localized md files (one sub-directory per article).
local_md_dir = os.path.join(os.getcwd(), "local_md")
# "<image URL>\t<local image path>" records collected while rewriting links;
# iterated afterwards to download the images in one batch.
pic_url_path_record_list = []
# Matches an image URL that follows "]: " or "(" in markdown text.
#   group 1: the leading "]: " or "(" (last repetition)
#   group 2: the URL, from "http" up to and including the file extension
#   group 3: the file extension
#   group 4: an optional closing ")"
# A single "?" right after the extension is consumed by the match but not captured.
# NOTE(review): any query string after that "?" is not part of the match and
# therefore stays in the text after substitution — confirm this is intended.
pic_match_pattern = re.compile(r'(\]: |\()+(http.*?\.(png|PNG|jpg|JPG|gif|GIF|svg|SVG|webp|awebp))\??(\)?)',
                               re.M)
# Pool of integers; one is popped per image so generated file names do not collide.
order_set = {i for i in range(1, 500000)}
# Request headers used for image downloads when the caller supplies none.
default_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/85.0.4183.121 Safari/537.36 '
}

# Check whether a path exists, optionally creating it as a directory.
def is_dir_existed(file_path, mkdir=True):
    """Create ``file_path`` when missing, or just report whether it exists.

    Args:
        file_path: Path to check.
        mkdir: When true (the default), a missing path is created with
            ``os.makedirs`` and nothing is returned. When false, nothing is
            created and the existence check result is returned.

    Returns:
        ``None`` when ``mkdir`` is true; otherwise the boolean result of
        ``os.path.exists(file_path)``.
    """
    present = os.path.exists(file_path)
    if not mkdir:
        return present
    if not present:
        os.makedirs(file_path)

# Write text to a file, terminated by a newline.
def write_text_to_file(content, file_path, mode="w+"):
    """Write ``content`` plus a trailing newline to ``file_path`` as UTF-8.

    Args:
        content: Text to write.
        file_path: Destination file path.
        mode: Mode string passed to ``open`` (defaults to ``"w+"``).

    An ``OSError`` raised while opening or writing is not propagated; its
    message is printed instead.
    """
    try:
        with open(file_path, mode, encoding='utf-8') as out:
            text = content + "\n"
            out.write(text)
    except OSError as reason:
        print(reason)

# Read a file as text.
def read_file_text_content(file_path):
    """Return the whole content of ``file_path`` decoded as UTF-8.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file content as a string, or ``None`` when the path does not exist.
    """
    if not os.path.exists(file_path):
        return None
    # Open read-only: the previous 'r+' mode also demanded write permission,
    # so reading a read-only file failed even though nothing is written here.
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()

# List the entries of a directory whose names end with a given suffix.
def scan_file_list_by_suffix(file_dir=None, suffix=""):
    """Return the paths of entries in ``file_dir`` whose names end with ``suffix``.

    Args:
        file_dir: Directory to list. Defaults to the current working
            directory, resolved at call time (a default of ``os.getcwd()`` in
            the signature would be frozen when the function is defined and go
            stale after a later ``os.chdir``).
        suffix: Name ending to filter on; the empty string keeps every entry.

    Returns:
        A list of ``os.path.join(file_dir, name)`` for each matching entry, in
        the order ``os.listdir`` yields them.
    """
    if file_dir is None:
        file_dir = os.getcwd()
    return [os.path.join(file_dir, x) for x in os.listdir(file_dir) if x.endswith(suffix)]

# Download one image asynchronously.
async def download_pic(pic_path, url, headers=None):
    """Download ``url`` into ``pic_path`` unless that file already exists.

    Args:
        pic_path: Local path the image bytes are written to.
        url: Remote image address; must start with ``http://`` or ``https://``.
        headers: Request headers; ``default_headers`` is used when omitted.

    Returns:
        None. Each outcome (skipped, downloaded, missing, failed) is reported
        with ``print``; any exception is caught and printed rather than raised.
    """
    try:
        if not url.startswith(("http://", "https://")):
            print("图片链接格式不正确:%s - %s" % (pic_path, url))
            return
        if os.path.exists(pic_path):
            print("图片已存在,跳过下载:%s" % pic_path)
            return
        if headers is None:
            headers = default_headers
        resp = await requests.get(url, headers=headers)
        print("下载图片:%s" % resp.url)
        if resp.status == 404:
            print("图片不存在:{}".format(url))
        elif resp.status >= 400:
            # Other error responses carry an error page, not image data, so
            # they must not be saved under the image's file name.
            print("图片下载失败({}):{}".format(resp.status, url))
        else:
            # Read the body before creating the file: if the read fails, no
            # empty file is left behind to be skipped as "already exists".
            data = await resp.read()
            async with aiofiles.open(pic_path, "wb+") as f:
                await f.write(data)
                print("图片下载完毕:%s" % pic_path)
    except Exception as e:
        print("下载异常:{}\n{}".format(url, e))

# Parse the xlsx backup and generate one raw md file per article row.
def analysis_xlsx(xlsx_file):
    """Write every article found in ``xlsx_file`` to ``origin_md_dir``.

    The ``SheetJS`` worksheet is read row by row. The second cell of a row is
    used as the article title (and file name), the third cell as the markdown
    content. Rows without a title are reported and skipped; rows without
    content produce an md file holding only a newline.

    Args:
        xlsx_file: Path of the xlsx backup file.

    Raises:
        KeyError: If the workbook has no sheet named ``SheetJS``.
    """
    workbook = load_workbook(xlsx_file)
    try:
        sheet = workbook['SheetJS']
        # min_row: first row to iterate; values_only=True yields cell values only.
        for row in sheet.iter_rows(min_row=1, values_only=True):
            # Skip the header row.
            # NOTE(review): this also skips any data row whose first cell is
            # "name" or whose title is "title" — confirm that is intended.
            if row[0] == "name" or row[1] == "title":
                continue
            title, content = row[1], row[2]
            if title is None:
                # The content cell may be empty too, so never concatenate it raw.
                print("未检测到标题" + str(content or ""))
                continue
            if content is None:
                print("未检测到文章内容【{}】".format(title))
                content = ""
            # Replace characters that are not allowed in file names. The "?"
            # is swapped for its full-width form; the remaining reserved
            # characters become underscores. Cell values are coerced to text
            # because a cell may hold a number instead of a string.
            safe_title = str(title).replace('"', "'").replace("?", "?")
            for bad_char in '|/\\:*<>':
                safe_title = safe_title.replace(bad_char, "_")
            md_file = os.path.join(origin_md_dir, safe_title + ".md")
            write_text_to_file(str(content), md_file)
            print("【{}】原始MD文件生成完毕".format(md_file))
    finally:
        # Release the workbook even when a row fails to process.
        workbook.close()

# Localize the raw md files and download the images they reference.
def md_to_local():
    """Rewrite every raw md file to use local image paths, then fetch the images.

    For each file found in ``origin_md_dir`` a sub-directory named after the
    file (minus its last three characters) is created under ``local_md_dir``
    together with an ``images`` folder. Image links matched by
    ``pic_match_pattern`` are replaced through ``pic_to_local`` and the result
    is written next to the ``images`` folder. Afterwards every record in
    ``pic_url_path_record_list`` is downloaded, one at a time, on the
    module-level ``loop``.

    Exits the program when no md file is found.
    """
    source_files = scan_file_list_by_suffix(origin_md_dir, "md")
    if not source_files:
        exit("未检测到md文件")

    for source_path in source_files:
        file_name = os.path.basename(source_path)
        article_dir = os.path.join(local_md_dir, file_name[:-3])
        images_dir = os.path.join(article_dir, "images")
        is_dir_existed(article_dir)
        is_dir_existed(images_dir)
        # Destination of the rewritten md file.
        target_path = os.path.join(article_dir, file_name)
        # Load the original text and swap remote image links for local ones.
        original_text = read_file_text_content(source_path)
        rewritten_text = pic_match_pattern.sub(
            lambda found: pic_to_local(found, pic_save_dir=images_dir),
            original_text,
        )
        # Persist the rewritten md file.
        write_text_to_file(rewritten_text, target_path)
        print("新md文件已生成 → {}".format(target_path))

    print("所有本地md文件生成完毕!开始批量下载图片文件")
    for record in pic_url_path_record_list:
        # Each record is "<url>\t<local path>".
        fields = record.split("\t")
        remote_url = fields[0]
        local_path = fields[1]
        loop.run_until_complete(download_pic(local_path, remote_url))

# Turn one matched remote image link into a local relative link.
def pic_to_local(match_result, pic_save_dir):
    """Build the replacement text for one image link match.

    A new file name is generated from the current timestamp, a number popped
    from ``order_set`` and the matched file extension. The remote URL and the
    absolute local path are appended to ``pic_url_path_record_list`` as a
    tab-separated record for the later download step.

    Args:
        match_result: Regex match whose groups 1-4 are the link prefix, the
            URL, the file extension and the optional closing parenthesis.
        pic_save_dir: Directory the image will be saved to.

    Returns:
        The link prefix, the relative path ``images/<new name>`` and the
        closing text, joined together.
    """
    remote_url = match_result.group(2)
    print("替换前的图片路径:{}".format(remote_url))
    # New, collision-free image file name.
    timestamp = int(round(time.time()))
    sequence_no = order_set.pop()
    img_file_name = "{}_{}.{}".format(timestamp, sequence_no, match_result.group(3))
    # Relative path written into the markdown text.
    relative_path = 'images/{}'.format(img_file_name)
    # Absolute path used when the image is downloaded.
    absolute_path = os.path.join(pic_save_dir, img_file_name)
    print("替换后的图片路径:{}".format(relative_path))
    record = "{}\t{}".format(match_result.group(2), absolute_path)
    pic_url_path_record_list.append(record)
    # Re-attach the surrounding prefix and closing parenthesis.
    prefix = match_result.group(1)
    closing = match_result.group(4)
    return "{}{}{}".format(prefix, relative_path, closing)

if __name__ == '__main__':
    # Make sure both output directories exist before any processing.
    is_dir_existed(origin_md_dir)
    is_dir_existed(local_md_dir)
    try:
        choose = int(input("请输入要批处理的操作序号:\n1、xlsx生成原始md文件;\n2、原始md文件生成本地md文件:\n"))
    except ValueError:
        # Non-numeric input is rejected the same way as an unknown number,
        # instead of ending in a traceback.
        exit("错误输入")
    if choose == 1:
        xlsx_files = scan_file_list_by_suffix(suffix="xlsx")
        if len(xlsx_files) == 0:
            exit("未检测到数据备份的xlsx文件!!!")
        else:
            # Only the first xlsx file found in the working directory is processed.
            analysis_xlsx(xlsx_files[0])
    elif choose == 2:
        # md_to_local() runs its downloads on this module-level ``loop``.
        # Create the loop explicitly: asyncio.get_event_loop() with no current
        # loop is deprecated and raises RuntimeError on newer Python versions.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        md_to_local()
    else:
        exit("错误输入")

接着运行脚本:

python  zybl_xlsx_to_local_md.py

输入1回车可以批量生成md文件,输入2回车可以批量转换为本地md文件

image

sakz commented 9 months ago

作者发的备份数据(xlsx)怎么获取

coder-pig commented 8 months ago

作者发的备份数据(xlsx)怎么获取

你问群主要啊,或者发邮件给他

tianh commented 7 months ago

为什么发给我的是txt?

ghosert commented 7 months ago

谢谢大家的等待,网站基础服务已经恢复。