## Find the flower names that start with bai
```python
l = ['qiaosong', 'huasheng', 'baijing', 'bailian', 'jingchu', 'yinshuang', 'zilu']

def search_start_with_bai(lt):
    # Print every name whose first three characters are 'bai'.
    for tmpl in lt:
        if tmpl[0:3] == 'bai':
            print(tmpl)

def senior_search_with_bai(lt):
    # The same filter expressed as a list comprehension.
    tmp = [tmpl for tmpl in lt if tmpl[:3] == 'bai']
    print(tmp)
```
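Slicing off the first three characters works, but `str.startswith` states the intent more directly. A minimal alternative sketch (the function name `search_start_with_bai_v2` is not from the original):

```python
def search_start_with_bai_v2(lt):
    # str.startswith avoids hard-coding the slice length.
    return [name for name in lt if name.startswith('bai')]

print(search_start_with_bai_v2(l))  # -> ['baijing', 'bailian']
```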
## Extract the consecutive digits from a string
```python
def extract_str():
    tmp = "qs23hs24bj25bl45jc123ys234zl0"
    # Replace every lowercase letter with a space so only the digit
    # runs remain, separated by whitespace.
    for ch in tmp:
        if 'a' <= ch <= 'z':
            tmp = tmp.replace(ch, ' ')
    # Split on the spaces and keep the non-empty digit groups.
    print([s for s in tmp.split(' ') if s])
```
```python
def search(lt):
    search_start_with_bai(lt)
    senior_search_with_bai(lt)

if __name__ == '__main__':
    search(l)
    extract_str()
```
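As an alternative to the character-replacement loop in `extract_str`, the standard-library `re` module can pull out the digit runs directly. A minimal sketch (the function name `extract_str_re` is not from the original):

```python
import re

def extract_str_re(s="qs23hs24bj25bl45jc123ys234zl0"):
    # \d+ matches each maximal run of consecutive digits.
    return re.findall(r'\d+', s)

print(extract_str_re())  # -> ['23', '24', '25', '45', '123', '234', '0']
```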
## Deduplicate URLs from a log file

```python
import re

def get_urls():
    # Collect the third space-separated field of every line into a set,
    # so each URL is kept and printed only once.
    urls = set()
    line_number = 0
    with open('log.result', 'r') as f:
        for line in f:
            urls.add(line.split(' ')[2])
            line_number += 1
    for url in urls:
        print(url)
    print("Total distinct URLs =", len(urls))
    print("Total lines in file =", line_number)
```
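`get_urls` assumes the URL is the third space-separated field of each line in `log.result`; the actual log format is not shown here, so the sample line below is hypothetical:

```python
# Hypothetical log line; the real layout of log.result may differ.
sample = '127.0.0.1 GET /github/reposity/issues/index.html 200'
print(sample.split(' ')[2])  # -> /github/reposity/issues/index.html
```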
```python
def get_urls_with_exclude_productids():
    # Normalize issue URLs whose last path segment is a long (32+ character)
    # id to a fixed placeholder id, then deduplicate with a set.
    urls = set()
    with open('log.result', 'r') as f:
        for line in f:
            temp = line.split(' ')[2]
            # flags must be passed by keyword: the fourth positional argument
            # of re.sub is count, not flags. re.sub returns the string
            # unchanged when the pattern does not match, never None.
            value = re.sub(r'/github/reposity/issues/\w{32,}',
                           '/github/reposity/issues/20190304000001',
                           temp, flags=re.IGNORECASE)
            urls.add(value)
    for url in urls:
        print(url)
    print("Total distinct URLs after normalization:", len(urls))
```
```python
get_urls()
get_urls_with_exclude_productids()
```
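To check what the normalization does in isolation, the substitution can be run on a single made-up URL (the 32-character id below is invented for illustration):

```python
import re

url = '/github/reposity/issues/' + 'a' * 32  # hypothetical long issue id
normalized = re.sub(r'/github/reposity/issues/\w{32,}',
                    '/github/reposity/issues/20190304000001',
                    url, flags=re.IGNORECASE)
print(normalized)  # -> /github/reposity/issues/20190304000001
```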
Reference:
https://mbd.baidu.com/newspage/data/landingsuper?context=%7B%22nid%22%3A%22news_9442500400653324518%22%7D&n_type=1&p_from=3