WGLab / doc-ANNOVAR

Documentation for the ANNOVAR software
http://annovar.openbioinformatics.org
234 stars 359 forks source link

index the dbfile by python #247

Closed pickpingk closed 5 months ago

pickpingk commented 5 months ago
from tqdm import tqdm
def index_file_generate(file_path, index_file, bin_size=1000):
    """Build a bin-based byte-offset index for a tab-separated database file.

    According to the original note this is a Python port of a Perl indexing
    script. Each non-comment line is expected to start with
    ``<chrom>\\t<start>``. Lines are grouped by ``(chrom, bin)`` where
    ``bin = start - start % bin_size``, and for every group the index records
    the byte offset where its first line begins and the byte offset just past
    the end of its last line.

    The index file layout is::

        #BIN\\t<bin_size>\\t<size of file_path in bytes>
        <chrom>\\t<bin>\\t<min offset>\\t<max offset>
        ...

    Group lines are written in plain string order of ``"<chrom>\\t<bin>"``.

    Args:
        file_path: Path of the database file to index.
        index_file: Path of the index file to create (overwritten if present).
        bin_size: Width of a position bin; defaults to 1000.

    Raises:
        ValueError: If a data line has fewer than two tab-separated fields or
            its second field is not an integer.
        OSError: If either file cannot be opened.
    """
    # Local import: the surrounding file does not show a module-level
    # ``import os``, so bring it into scope here.
    import os

    # The progress bar is purely cosmetic; run without it when tqdm is absent.
    try:
        from tqdm import tqdm as make_progress
    except ImportError:
        make_progress = None

    file_size = os.path.getsize(file_path)
    regions = {}  # "<chrom>\t<bin>" -> [min_offset, max_offset]
    offset = 0
    progress = make_progress(total=file_size) if make_progress else None

    try:
        # Read in binary mode so offsets are real byte positions: text mode
        # would count decoded characters and hide "\r\n" line endings, which
        # disagrees with the byte-based file size written in the header.
        with open(file_path, 'rb') as rf, open(index_file, 'w') as wf:
            wf.write(f"#BIN\t{bin_size}\t{file_size}\n")
            for raw in rf:
                line_start = offset
                offset += len(raw)
                if progress is not None:
                    # Count every line (comments too) so the bar reaches 100%.
                    progress.update(len(raw))

                stripped = raw.strip()
                # Comment and blank lines take up bytes but define no region.
                if raw.startswith(b"#") or not stripped:
                    continue

                chrom, start = stripped.split(b'\t')[:2]
                start = int(start)
                curbin = start - (start % bin_size)
                key = f"{chrom.decode()}\t{curbin}"

                span = regions.get(key)
                if span is None:
                    regions[key] = [line_start, offset]
                else:
                    span[1] = offset

            for key in sorted(regions):
                low, high = regions[key]
                wf.write(f"{key}\t{low}\t{high}\n")
    finally:
        # Release the progress bar even when parsing fails part-way through.
        if progress is not None:
            progress.close()

    print('generate index successfully')

You may need to change something in the code to fit your setup.