Closed carolinecunningham closed 4 years ago
mutagene currently can't process VCFs due to updates in MAF reader
steps to reproduce error: run: mutagene motif search -g hg19 -i sample2.vcf
caroline's draft code to fix:
in motif_menu.py:
if ".vcf" not in args.infile.name: mutations, mutations_with_context, processing_stats = read_MAF_with_context_window(args.infile, args.genome, args.window_size) if len(mutations_with_context) == 0: logger.warning("No mutations loaded") matching_motifs = identify_motifs(mutations_with_context, custom_motif, args.strand) if mutations_with_context is not None else [] else: mutations, mutations_with_context, processing_stats = read_VCF_with_context_window(args.infile, args.genome, args.window_size) if len(mutations_with_context) == 0: logger.warning("No mutations loaded") matching_motifs = identify_motifs(mutations_with_context, custom_motif, args.strand) if mutations_with_context is not None else []
in motifs/init.py:
try: for sample, mutations in samples_mutations.items(): if mutations is not None and len(mutations) > 0: first_mut_seq_with_coords = mutations[0][-1] window_size = (len(first_mut_seq_with_coords) - 1) // 2 for m in search_motifs: for s in strand: # print("IDENTIFYING MOTIF: ", m['name']) result = get_enrichment(mutations, m['motif'], m['position'], m['ref'], m['alt'], window_size, s) debug_data = {'sample': sample, 'motif': m['logo'], 'strand': s} debug_data.update(result) debug_string = pprint.pformat(debug_data, indent=4) logger.debug(debug_string) if result['mutation_load'] == 0: continue motif_dict = { 'sample': sample, 'name': m['name'], 'motif': m['logo'], 'strand': s, 'enrichment': result['enrichment'], 'pvalue': result['pvalue_fisher'], 'mutations_low_est': result['mutation_load'], 'mutations_high_est': result['bases_mutated_in_motif'], # 'mut_motif': result['bases_mutated_in_motif'], # 'mut_not_in_motif': result['bases_mutated_not_in_motif'], # 'stat_count': result['bases_not_mutated_in_motif'], # 'ref_count': result['bases_not_mutated_not_in_motif'] } motif_matches.append(motif_dict) except AttributeError: # print(sample, len(mutations)) if samples_mutations is not None and len(samples_mutations) > 0: first_mut_seq_with_coords = samples_mutations[0][-1] window_size = (len(first_mut_seq_with_coords) - 1) // 2 for m in search_motifs: for s in strand: # print("IDENTIFYING MOTIF: ", m['name']) result = get_enrichment(samples_mutations, m['motif'], m['position'], m['ref'], m['alt'], window_size, s) debug_data = {'sample': "VCF", 'motif': m['logo'], 'strand': s} debug_data.update(result) debug_string = pprint.pformat(debug_data, indent=4) logger.debug(debug_string) if result['mutation_load'] == 0: continue motif_dict = { 'sample': "VCF", 'name': m['name'], 'motif': m['logo'], 'strand': s, 'enrichment': result['enrichment'], 'pvalue': result['pvalue_fisher'], 'mutations_low_est': result['mutation_load'], 'mutations_high_est': result['bases_mutated_in_motif'], # 'mut_motif': result['bases_mutated_in_motif'], # 'mut_not_in_motif': result['bases_mutated_not_in_motif'], # 'stat_count': result['bases_not_mutated_in_motif'], # 'ref_count': result['bases_not_mutated_not_in_motif'] } motif_matches.append(motif_dict) return motif_matches
in context_window.py:
def read_VCF_with_context_window(muts, asm, window_size): cn = complementary_nucleotide mutations = defaultdict(float) N_skipped = 0
raw_mutations = [] for line in muts.read().split("\n"): if line.startswith("#"): continue if len(line) < 10: continue col_list = line.split() if len(col_list) < 4: continue # ID = col_list[2] # chromosome is expected to be one or two number or one letter chrom = col_list[0] # VCF CHROM if chrom.lower().startswith("chr"): chrom = chrom[3:] # if len(chrom) == 2 and chrom[1] not in "0123456789": # chrom = chrom[0] pos = int(col_list[1]) # VCF POS x = col_list[3] # VCF REF y = col_list[4] # VCF ALT # if multiple REF or ALT alleles are given, ignore mutation entry (could mean seq error, could mean deletion) if len(x) != 1: N_skipped += 1 continue if len(y) != 1: N_skipped += 1 continue transcript_strand = '+' # bad assumption about transcript strand raw_mutations.append((chrom, pos, transcript_strand, x, y)) # print("RAW", raw_mutations) # print("INDELS", N_skipped) mutations_with_context = [] if len(raw_mutations) > 0: contexts = get_context_twobit_window(raw_mutations, asm, window_size) if contexts is None or len(contexts) == 0: return None, None for (chrom, pos, transcript_strand, x, y) in raw_mutations: (p5, p3), seq_with_coords = contexts.get((chrom, pos), (("N", "N"), [])) # print("RESULT: {} {}".format(p5, p3)) if len(set([p5, x, y, p3]) - set(nucleotides)) > 0: # print(chrom, pos, p5, p3, x) # print("Skipping invalid nucleotides") N_skipped += 1 continue if x in "CT": mutations[p5 + p3 + x + y] += 1.0 else: # complementary mutation mutations[cn[p3] + cn[p5] + cn[x] + cn[y]] += 1.0 mutations_with_context.append((chrom, pos, transcript_strand, x, y, seq_with_coords)) N_loaded = int(sum(mutations.values())) processing_stats = {'loaded': N_loaded, 'skipped': N_skipped, 'format': 'VCF'} return mutations, mutations_with_context, processing_stats
mutagene currently can't process VCFs due to updates in MAF reader
steps to reproduce error: run: mutagene motif search -g hg19 -i sample2.vcf
caroline's draft code to fix:
in motif_menu.py:
in motifs/init.py:
in context_window.py:
def read_VCF_with_context_window(muts, asm, window_size): cn = complementary_nucleotide mutations = defaultdict(float) N_skipped = 0
N_skipped_indels = 0