summer506hai / Study-Notes

学习记录合辑
0 stars 0 forks source link

python存档 #15

Open summer506hai opened 11 months ago

summer506hai commented 11 months ago

Python 打包自己的库到 PYPI

your_project/ ├── your_package/ │ ├── __init__.py │ └── your_module.py ├── setup.py └── README.md

summer506hai commented 10 months ago

函数执行超时

import time
from threading import Thread

import func_timeout
from func_timeout import func_set_timeout

@func_set_timeout(10)
def f():
    """Loop forever, printing a heartbeat once per second.

    The third-party ``func_set_timeout(10)`` decorator aborts the call by
    raising ``FunctionTimedOut`` after 10 seconds, so this otherwise
    infinite loop is the demo payload for the timeout mechanism.
    """
    while True:
        print("running.....")
        time.sleep(1)

def try_f():
    """Run f() and report (in Chinese) when it exceeds its 10 s budget.

    ``FunctionTimedOut`` is the exception raised by the func_timeout
    package when the decorated call overruns its limit.
    """
    try:
        f()
    except func_timeout.exceptions.FunctionTimedOut:
        print("执行已超时10s")

if __name__ == '__main__':
    # Run the timeout demo off the main thread so this print is reached
    # immediately, before f() times out.
    worker = Thread(target=try_f)
    worker.start()
    print("start.....")
summer506hai commented 10 months ago

获取url的域名

from urllib.parse import urlparse

def get_domain(url):
    """Return the network location (domain, plus port if present) of *url*.

    The original snippet read a pre-existing ``url`` variable and crashed
    with NameError when run standalone; taking the url as a parameter makes
    the recipe reusable.
    """
    parsed_url = urlparse(url)
    return parsed_url.netloc

print(get_domain("https://www.example.com/path?q=1"))
summer506hai commented 10 months ago

excel

excel中写数据


    # Write your_data to an Excel sheet, stripping the control characters
    # (backspace, bell, etc.) that openpyxl rejects as illegal cell values.
    import re

    # Compile the pattern once: data_clean is applied to every single cell,
    # and the original recompiled the regex on each call.
    ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')

    def data_clean(text):
        # Remove non-printable characters that openpyxl refuses to store.
        return ILLEGAL_CHARACTERS_RE.sub('', text)

    df = pd.DataFrame(your_data)
    # Stringify everything so the regex substitution applies uniformly.
    df = df.fillna('').astype(str)
    for col in df.columns:
        df[col] = df[col].apply(data_clean)
    # The context manager guarantees the writer is closed even on error and
    # replaces the deprecated writer.save() the original had commented out.
    with pd.ExcelWriter(your_file, engine="openpyxl") as writer:
        df.to_excel(writer, sheet_name="your_sheet_name", index=False)

读excel中数据

import pandas as pd

# Read an Excel sheet whose columns are: query, top1_url, top1_title, ...
df = pd.read_excel('your_excel.xlsx',sheet_name="your_sheet_name")

# For every row, print the query followed by its five url/title pairs.
for _, record in df.iterrows():
    print(f'query is:{record["query"]}')
    # The url/title columns share a name apart from their rank number,
    # so address them by formatting the rank into the column name.
    for rank in range(1, 6):
        url = record[f'top{rank}_url']
        title = record[f'top{rank}_title']
        print(f'url is: {url}, title is: {title}')

读excel中数据,并以json格式写到文件中

json格式如下:


{
    "query": "宝宝洗澡怎么洗",
    "data": [
        {
            "url": "https://www.    .htm",
            "title": "xxxxxxxxx"
        },
        {
            "url": "https://www.  .htm",
            "title": "xxxxxxx"
        },
   ...
    ]
}
import json
import pandas as pd

def read_excel_data(excel_name, sheet_name):
    """Read an Excel sheet into a list of ``{"query", "data"}`` records.

    The sheet is expected to have columns ``query`` and
    ``top1_url``/``top1_title`` .. ``top5_url``/``top5_title``.

    Args:
        excel_name: path of the ``.xlsx`` file to read.
        sheet_name: name of the sheet inside the workbook.

    Returns:
        A list of dicts shaped ``{"query": ..., "data": [{"url", "title"}, ...]}``;
        rows with no usable url/title pair are dropped.
    """
    df = pd.read_excel(excel_name, sheet_name=sheet_name)
    all_data = []
    # The row index from iterrows() is not needed, only the row contents.
    for _, row in df.iterrows():
        results = _extract_results(row)
        if results:  # skip rows where every top-N pair was empty/NaN
            all_data.append({"query": row["query"], "data": results})
    return all_data

def _extract_results(row):
    # Collect the top1..top5 url/title pairs from one row, skipping NaN cells.
    results = []
    for i in range(1, 6):
        url = row[f'top{i}_url']
        title = row[f'top{i}_title']
        # Empty Excel cells come back as NaN (a float); keep only complete pairs.
        if not pd.isna(url) and not pd.isna(title):
            results.append({"url": url, "title": title})
    return results

def write_to_json(txt_name, all_data):
    """Write *all_data* as JSON Lines: one JSON object per line.

    Args:
        txt_name: path of the output text file (overwritten if it exists).
        all_data: list of JSON-serializable records.

    ``ensure_ascii=False`` keeps Chinese text readable instead of
    escaping it to ``\\uXXXX`` sequences.
    """
    with open(txt_name, "w", encoding='utf-8') as f:
        # Iterate the records directly instead of indexing via range(len(...)).
        for record in all_data:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

if __name__ == '__main__':
    # Convert the spreadsheet into a JSON-Lines text file.
    records = read_excel_data('your_excel_file.xlsx', "your_sheet_name")
    write_to_json("your_json_file.txt", records)
summer506hai commented 9 months ago

处理json数据

json格式如下:

{
    "query": "宝宝洗澡怎么洗",
    "data": [
        {
            "url": "https://www.    .htm",
            "title": "xxxxxxxxx"
        },
        {
            "url": "https://www.  .htm",
            "title": "xxxxxxx"
        }
    ]
}
def delete_same_url_data(data_list):
    """Drop entries with duplicate urls from each record's "data" list.

    Keeps the first occurrence of every url within a record. Records are
    mutated in place (their "data" list is replaced) and the same record
    objects are returned in a new list, matching the original behavior.

    Args:
        data_list: list of ``{"query": ..., "data": [{"url", "title"}, ...]}``.

    Returns:
        The list of (mutated) records, in input order.
    """
    new_line = []
    for each_data in data_list:
        seen_urls = set()  # O(1) membership test vs. the original list scan
        new_data = []
        for item in each_data["data"]:
            url = item["url"]
            if url not in seen_urls:
                seen_urls.add(url)
                new_data.append(item)
        each_data["data"] = new_data
        new_line.append(each_data)
    return new_line