chzhiyi / -KnowledgeShare

6 stars 1 forks source link

20190305 - 正则表达式的应用(下) - qiaosong #25

Open chzhiyi opened 5 years ago

chzhiyi commented 5 years ago

参考网址

https://mbd.baidu.com/newspage/data/landingsuper?context=%7B%22nid%22%3A%22news_9442500400653324518%22%7D&n_type=1&p_from=3

源代码

## Find the nicknames that start with "bai"
# Sample list of nicknames; passed to search() by the script entry point.
l = ['qiaosong', 'huasheng', 'baijing', 'bailian', 'jingchu', 'yinshuang', 'zilu']
def search_start_with_bai(lt):
    """Print each entry of ``lt`` whose first three characters are 'bai'.

    Matching entries are printed one per line, in the order they appear.
    """
    bai_names = (name for name in lt if name[:3] == 'bai')
    for name in bai_names:
        print(name)
def senior_search_start_with_bai(lt):
    """Print, as one list, the entries of ``lt`` whose first three characters are 'bai'."""
    matched = list(filter(lambda entry: entry[0:3] == 'bai', lt))
    print(matched)
## Extract the runs of consecutive digits
def extract_str():
    """Print the digit runs of a fixed sample string as a list of strings.

    Every lowercase ASCII letter in the sample is replaced by a space, the
    result is split on single spaces, and the non-empty pieces are printed.
    """
    sample = "qs23hs24bj25bl45jc123ys234zl0"
    blanked = ''.join(' ' if 'a' <= ch <= 'z' else ch for ch in sample)
    digit_runs = []
    for piece in blanked.split(' '):
        # Empty pieces come from adjacent letters; int() is only evaluated
        # for non-empty pieces because of the short-circuiting `and`.
        if piece and int(piece) >= 0:
            digit_runs.append(piece)
    print(digit_runs)
def search(lt):
    """Run both 'bai'-prefix searches on ``lt``: per-line output first, then list output."""
    for finder in (search_start_with_bai, senior_search_start_with_bai):
        finder(lt)
# Script entry point: run the prefix searches on the sample list `l`, then the
# digit-extraction demo.
if __name__ == '__main__':
    search(l)
    extract_str()
import re

def re_extract_numbers():
    """Print every run of one or more digits found in a fixed sample string.

    The runs are collected with ``re.findall`` and printed as a list of
    strings in left-to-right order.
    """
    print(re.findall(r'[0-9]{1,}', 'qs23hs24bj25bl45jc123ys234zl0'))

def re_match_hello_world(str_value):
    """Match the literal pattern 'hello world' at the start of ``str_value``.

    The result of ``re.match`` is always printed. When it is a match, the
    matched text and its (start, end) span are printed as well.
    """
    found = re.match('hello world', str_value)
    print(found)
    if found is None:
        return
    print(found.group(0))
    print(found.span())

def print_matchs(matchs):
    """Print a ``re.match`` result, plus its matched text and span if it matched.

    Args:
        matchs: A match object, or ``None`` when the pattern did not match.

    The value itself is always printed. For ``None`` nothing further is
    printed, because there is no group or span to report; calling
    ``.group(0)`` on ``None`` would raise AttributeError.
    """
    print(matchs)
    if matchs is None:
        return
    print(matchs.group(0))
    print(matchs.span())

def re_match_simple_regular_expression(str_value):
    """Match ``str_value`` against ``T[moi]+`` from its start and pass the result to print_matchs."""
    print_matchs(re.match(r'T[moi]+', str_value))

def re_match_sample(pattern, str_value):
    """Match ``str_value`` against ``pattern`` from its start and pass the result to print_matchs."""
    outcome = re.match(pattern, str_value)
    return print_matchs(outcome) and None

# Demo entry point: the active calls try the pattern T[moi]+ against four inputs;
# the commented-out calls are alternative demos.
if __name__ == '__main__':
    # re_extract_numbers()

    # ## 'hello world' is itself a regular expression; it only matches the string 'hello world' or strings that start with 'hello world'
    # re_match_hello_world('hello world')
    # re_match_hello_world('hello world, hello china')
    # ## The calls below fail to match. NoneType has no attribute or method group(0) / span()
    # re_match_hello_world('hello')
    # re_match_hello_world('today, hello , hello world, Hello World')

    re_match_simple_regular_expression('Tim')
    re_match_simple_regular_expression('Tom')
    re_match_simple_regular_expression('Toooooom')
    # 'Tuple' does not match T[moi]+ ('u' is not in the character class), so
    # re.match returns None for this call.
    re_match_simple_regular_expression('Tuple')

    # re_match_sample(r'T[io]+m', 'Tiiiiiiiiiim')
    # re_match_sample(r'T[io]+m', 'Tiooioooooim')

    # re_match_sample(r'^[0-9]+$', '31421341234')
    # # re_match_sample(r'^[0-9]+$', '214saf12312')
    # re_match_sample(r'[0-9]+', '214saf12312')
import re

def get_urls():
    """Print the distinct URLs in ``log.result`` together with two totals.

    The URL is taken as the third space-separated field of each line. After
    the URLs, the number of distinct URLs and the number of lines in the
    file are printed.
    """
    with open('log.result', 'r') as log_file:
        log_lines = log_file.readlines()

    distinct_urls = {entry.split(' ')[2] for entry in log_lines}
    total_lines = len(log_lines)

    for item in distinct_urls:
        print(item)
    print("url去重后的总数=", len(distinct_urls))
    print('文件总行数=', total_lines)

def get_urls_with_exclude_productids():
    """Print the distinct URLs in ``log.result`` with long issue ids collapsed.

    The URL is taken as the third space-separated field of each line. A
    ``/github/reposity/issues/<id>`` path whose id is 32 or more word
    characters is rewritten to one fixed placeholder id, so all such URLs
    are counted as a single entry. The path prefix is matched
    case-insensitively. After the URLs, the number of distinct URLs is
    printed.

    Raises:
        IndexError: If a line has fewer than three space-separated fields.
    """
    # The flag must go to re.compile (or be passed as flags=...): the fourth
    # positional argument of re.sub is `count`, not `flags`.
    issue_id_pattern = re.compile(r'/github/reposity/issues/[\w]{32,}', re.IGNORECASE)
    placeholder = '/github/reposity/issues/20190304000001'

    urls = set()
    with open('log.result', 'r') as f:
        for line in f:
            # sub() returns the input unchanged when nothing matches, so no
            # separate fallback branch is needed.
            urls.add(issue_id_pattern.sub(placeholder, line.split(' ')[2]))
    for url in urls:
        print(url)
    print("去重后的url总数: ", len(urls))

# Run both URL reports when this script is executed or imported; each one
# opens 'log.result' by relative path.
get_urls()
get_urls_with_exclude_productids()
[02/Mar/2019:00:02:49 github.com /github/reposity/issues/shareSwitch 200 0.003 1410
[02/Mar/2019:00:03:44 github.com /github/reposity/issues/statusCount 200 0.004 174
[02/Mar/2019:00:03:44 github.com /github/reposity/issues 200 0.0010 847
[02/Mar/2019:00:04:310 github.com /github/reposity/issues 200 0.013 1104
[02/Mar/2019:00:04:310 github.com /github/reposity/issues/a874478d2b8c4c4099a1900102800c8ec 200 0.009 448
[02/Mar/2019:00:010:22 github.com /github/reposity/issues/statusCount 200 0.0010 174
[02/Mar/2019:00:010:22 github.com /github/reposity/issues 200 0.007 844
[02/Mar/2019:00:09:02 github.com /github/reposity/issues/shareSwitch 200 0.004 147
[02/Mar/2019:00:09:24 github.com /github/reposity/issues/statusCount 200 0.0010 170
[02/Mar/2019:00:09:24 github.com /github/reposity/issues 200 0.007 8910
[02/Mar/2019:00:10:07 github.com /github/reposity/issues 200 0.013 1104
[02/Mar/2019:00:10:07 github.com /github/reposity/issues/d8b42bad9b7b44eea133109qwsqws910441143 200 0.008 440
[02/Mar/2019:00:10:34 github.com /github/reposity/issues 200 0.007 1027
[02/Mar/2019:00:10:34 github.com /github/reposity/issues/statusCount 200 0.0010 174
[02/Mar/2019:00:13:44 github.com /github/reposity/issues/shareSwitch 200 0.004 1410
[02/Mar/2019:00:110:01 github.com /github/reposity/issues/shareSwitch 200 0.004 1410
[02/Mar/2019:00:110:09 github.com /github/reposity/issues 200 0.0010 149
[02/Mar/2019:00:110:09 github.com /github/reposity/issues/statusCount 200 0.007 147
[02/Mar/2019:00:17:10 github.com /github/reposity/issues/statusCount 200 0.0010 1104
[02/Mar/2019:00:17:10 github.com /github/reposity/issues 200 0.007 894
[02/Mar/2019:00:17:13 github.com /github/reposity/issues 200 0.009 724
[02/Mar/2019:00:21:24 github.com /github/reposity/issues/shareSwitch 200 0.004 147
[02/Mar/2019:00:21:210 github.com /github/reposity/issues/statusCount 200 0.004 1102
[02/Mar/2019:00:21:210 github.com /github/reposity/issues 200 0.007 433
[02/Mar/2019:00:22:28 github.com /github/reposity/issues 200 0.012 1104
[02/Mar/2019:00:22:28 github.com /github/reposity/issues/c7cbbeb3qwsd10e49e28e04dbc0c24cb87a 200 0.008 437
[02/Mar/2019:00:22:29 github.com /github/reposity/issues/statusCount 200 0.004 1108
[02/Mar/2019:00:22:29 github.com /github/reposity/issues 200 0.0010 1002
[02/Mar/2019:00:22:30 github.com /github/reposity/issues 200 0.013 744