Open summer506hai opened 11 months ago
import time
from threading import Thread
import func_timeout
from func_timeout import func_set_timeout
@func_set_timeout(10)
def f():
    """Print a heartbeat line once per second, forever.

    The loop has no exit condition of its own; the function is wrapped in
    ``func_set_timeout(10)``, so it is expected to be interrupted by the
    decorator rather than to return.
    """
    heartbeat = "running....."
    while True:
        print(heartbeat)
        time.sleep(1)
def try_f():
    """Run ``f`` and print a notice when it is stopped by its timeout."""
    timed_out = func_timeout.exceptions.FunctionTimedOut
    try:
        f()
    except timed_out:
        print("执行已超时10s")
if __name__ == '__main__':
    # Run the timed function on a worker thread; the main thread goes
    # straight on to print its own message.
    worker = Thread(target=try_f)
    worker.start()
    print("start.....")
from urllib.parse import urlparse
# Split the URL into its components; `url` must already be defined by the
# surrounding code before this snippet runs.
parsed_url = urlparse(url)
# Show only the network-location part of the URL.
print(parsed_url.netloc)
# Create the Excel writer using the openpyxl engine; `your_file` is a
# placeholder for the output target and must be defined beforehand.
writer = pd.ExcelWriter(your_file, engine="openpyxl")
import re
# Control characters that are illegal in Excel cells: \x00-\x08, \x0b-\x0c and
# \x0e-\x1f (e.g. backspace, bell). Tab, newline and carriage return are not
# matched. Compiled once here so per-cell calls do not rebuild the pattern.
_ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')


def data_clean(text):
    """Return *text* with Excel-illegal control characters removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* without the non-printable control characters
        matched by ``_ILLEGAL_CHARACTERS_RE``; all other characters,
        including tab, newline and carriage return, are kept.
    """
    return _ILLEGAL_CHARACTERS_RE.sub('', text)
# Build the frame, replace missing values with empty strings and cast every
# cell to str so that each one can be passed through data_clean().
df = pd.DataFrame(your_data)
df = df.fillna('')
df = df.astype(str)
# Strip Excel-illegal characters column by column.
for column in df.columns:
    df[column] = df[column].apply(data_clean)
df.to_excel(writer, sheet_name="your_sheet_name", index=False)
# close() is used here in place of the older writer.save() call.
writer.close()
import pandas as pd
# Load the sheet; its columns are expected to be: query, top1_url, top1_title, ...
df = pd.read_excel('your_excel.xlsx', sheet_name="your_sheet_name")
# The url/title columns share a name and differ only by their rank number,
# so they are addressed by looping over the ranks.
ranks = range(1, 6)
# Print each row's query followed by all of its url/title pairs.
for _, row in df.iterrows():
    print(f'query is:{row["query"]}')
    for rank in ranks:
        url = row[f'top{rank}_url']
        title = row[f'top{rank}_title']
        print(f'url is: {url}, title is: {title}')
json格式如下:
{
"query": "宝宝洗澡怎么洗",
"data": [
{
"url": "https://www. .htm",
"title": "xxxxxxxxx"
},
{
"url": "https://www. .htm",
"title": "xxxxxxx"
},
...
]
}
import json
import pandas as pd
def read_excel_data(excel_name, sheet_name, top_n=5):
    """Read ranked url/title results from an Excel sheet into per-query records.

    The sheet must contain a ``query`` column plus ``top{i}_url`` and
    ``top{i}_title`` columns for every ``i`` in ``1..top_n``.

    Args:
        excel_name: Path of the Excel file to read.
        sheet_name: Name of the sheet to read.
        top_n: How many ranked url/title column pairs to read per row.
            Defaults to 5, the previously hard-coded value.

    Returns:
        A list of ``{"query": ..., "data": [{"url": ..., "title": ...}, ...]}``
        dicts, one per row that has at least one complete url/title pair.

    Raises:
        KeyError: If an expected column is missing from the sheet.
    """
    df = pd.read_excel(excel_name, sheet_name=sheet_name)
    all_data = []
    for _, row in df.iterrows():
        results = []
        for i in range(1, top_n + 1):
            url = row[f'top{i}_url']
            title = row[f'top{i}_title']
            # Missing cells show up as NaN (a float); skip incomplete pairs.
            if pd.isna(url) or pd.isna(title):
                continue
            results.append({"url": url, "title": title})
        # Rows without any usable result are dropped entirely.
        if results:
            all_data.append({"query": row["query"], "data": results})
    return all_data
def write_to_json(txt_name, all_data):
    """Write records to a text file as one JSON document per line.

    Args:
        txt_name: Path of the output file; it is opened in "w" mode, so any
            existing content is replaced.
        all_data: Iterable of JSON-serialisable records (any iterable works,
            not only sequences).

    Non-ASCII characters are written as-is in UTF-8 rather than escaped.
    """
    with open(txt_name, "w", encoding='utf-8') as f:
        f.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in all_data
        )
if __name__ == '__main__':
    # Convert the spreadsheet rows into records, then dump them one JSON
    # document per line.
    records = read_excel_data('your_excel_file.xlsx', "your_sheet_name")
    write_to_json("your_json_file.txt", records)
json格式如下:
{
"query": "宝宝洗澡怎么洗",
"data": [
{
"url": "https://www. .htm",
"title": "xxxxxxxxx"
},
{
"url": "https://www. .htm",
"title": "xxxxxxx"
}
]
}
读json
def read_json(json_file):
    """Read a file holding one JSON document per line and return them as a list.

    Args:
        json_file: Path of the UTF-8 encoded input file.

    Returns:
        A list with the decoded value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    all_lines = []
    # The context manager guarantees the handle is closed, even when a line
    # fails to decode.
    with open(json_file, encoding='utf-8') as f:
        for line in f:
            # Blank lines carry no record; skip them instead of failing.
            if not line.strip():
                continue
            all_lines.append(json.loads(line))
    return all_lines
写json
def write_to_json(txt_name, all_data):
    """Write records to a text file as one JSON document per line.

    Args:
        txt_name: Path of the output file; it is opened in "w" mode, so any
            existing content is replaced.
        all_data: Iterable of JSON-serialisable records (any iterable works,
            not only sequences).

    Non-ASCII characters are written as-is in UTF-8 rather than escaped.
    """
    with open(txt_name, "w", encoding='utf-8') as f:
        f.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in all_data
        )
合并 query 相同的数据
def merge_same_query_data(data_list):
    """Merge records that share the same query into a single record per query.

    Args:
        data_list: Iterable of ``{"query": ..., "data": [...]}`` dicts.

    Returns:
        A list of ``{"query": ..., "data": [...]}`` dicts, one per distinct
        query in order of first appearance, whose ``data`` is the
        concatenation of the ``data`` lists of all records with that query.
        The input records and their ``data`` lists are left unmodified.
    """
    merged = {}
    for record in data_list:
        query = record["query"]
        if query not in merged:
            # Start from a fresh list: reusing the record's own list here
            # would make the later extend() calls mutate the caller's input.
            merged[query] = {"query": query, "data": []}
        merged[query]["data"].extend(record["data"])
    return list(merged.values())
剔除每一行(同一个 query)中 url 重复的数据
def delete_same_url_data(data_list):
    """Drop items with a repeated url inside each record, keeping the first one.

    Args:
        data_list: Iterable of ``{"query": ..., "data": [{"url": ...}, ...]}``
            dicts.

    Returns:
        A list of the same record objects, in the same order. Each record's
        ``data`` is replaced in place by a new list that keeps only the
        first item for every url.

    Raises:
        KeyError: If a record has no ``data`` key or an item has no ``url``.
    """
    new_line = []
    for each_data in data_list:
        # A set gives constant-time membership checks instead of a list scan.
        seen_urls = set()
        unique_items = []
        for item in each_data["data"]:
            url = item["url"]
            if url in seen_urls:
                continue
            seen_urls.add(url)
            unique_items.append(item)
        each_data["data"] = unique_items
        new_line.append(each_data)
    return new_line
将自己的 Python 库打包并发布到 PyPI
https://zhuanlan.zhihu.com/p/79164800
用 pip 安装该包之后,在命令行输入 your-command 即可执行自定义的方法
entry_points={ 'console_scripts': [ 'your-command = your_package.your_module:main', ], },
your_project/
├── your_package/
│   ├── __init__.py
│   └── your_module.py
├── setup.py
└── README.md