neksa / mutagene

Python library and package for mutational analysis
https://www.ncbi.nlm.nih.gov/projects/mutagene/
Other
3 stars 3 forks source link

VCF processing #17

Closed carolinecunningham closed 4 years ago

carolinecunningham commented 5 years ago

mutagene currently can't process VCFs due to updates in MAF reader

steps to reproduce error: run: mutagene motif search -g hg19 -i sample2.vcf

caroline's draft code to fix:

in motif_menu.py:

    if ".vcf" not in args.infile.name:
        mutations, mutations_with_context, processing_stats = read_MAF_with_context_window(args.infile, args.genome, args.window_size)

        if len(mutations_with_context) == 0:
            logger.warning("No mutations loaded")

        matching_motifs = identify_motifs(mutations_with_context, custom_motif,
                                          args.strand) if mutations_with_context is not None else []

    else:
        mutations, mutations_with_context, processing_stats = read_VCF_with_context_window(args.infile, args.genome, args.window_size)

        if len(mutations_with_context) == 0:
            logger.warning("No mutations loaded")

        matching_motifs = identify_motifs(mutations_with_context, custom_motif,
                                          args.strand) if mutations_with_context is not None else []

in motifs/init.py:

try:
    for sample, mutations in samples_mutations.items():
        if mutations is not None and len(mutations) > 0:
            first_mut_seq_with_coords = mutations[0][-1]
            window_size = (len(first_mut_seq_with_coords) - 1) // 2

            for m in search_motifs:
                for s in strand:
                    # print("IDENTIFYING MOTIF: ", m['name'])
                    result = get_enrichment(mutations, m['motif'], m['position'], m['ref'], m['alt'], window_size, s)

                    debug_data = {'sample': sample, 'motif': m['logo'], 'strand': s}
                    debug_data.update(result)
                    debug_string = pprint.pformat(debug_data, indent=4)
                    logger.debug(debug_string)

                    if result['mutation_load'] == 0:
                        continue

                    motif_dict = {
                        'sample': sample,
                        'name': m['name'],
                        'motif': m['logo'],
                        'strand': s,
                        'enrichment': result['enrichment'],
                        'pvalue': result['pvalue_fisher'],
                        'mutations_low_est': result['mutation_load'],
                        'mutations_high_est': result['bases_mutated_in_motif'],
                        # 'mut_motif': result['bases_mutated_in_motif'],
                        # 'mut_not_in_motif': result['bases_mutated_not_in_motif'],
                        # 'stat_count': result['bases_not_mutated_in_motif'],
                        # 'ref_count': result['bases_not_mutated_not_in_motif']
                    }

                    motif_matches.append(motif_dict)

except AttributeError:
    # print(sample, len(mutations))
    if samples_mutations is not None and len(samples_mutations) > 0:
        first_mut_seq_with_coords = samples_mutations[0][-1]
        window_size = (len(first_mut_seq_with_coords) - 1) // 2

        for m in search_motifs:
            for s in strand:
                # print("IDENTIFYING MOTIF: ", m['name'])
                result = get_enrichment(samples_mutations, m['motif'], m['position'], m['ref'], m['alt'], window_size, s)

                debug_data = {'sample': "VCF", 'motif': m['logo'], 'strand': s}
                debug_data.update(result)
                debug_string = pprint.pformat(debug_data, indent=4)
                logger.debug(debug_string)

                if result['mutation_load'] == 0:
                    continue

                motif_dict = {
                    'sample': "VCF",
                    'name': m['name'],
                    'motif': m['logo'],
                    'strand': s,
                    'enrichment': result['enrichment'],
                    'pvalue': result['pvalue_fisher'],
                    'mutations_low_est': result['mutation_load'],
                    'mutations_high_est': result['bases_mutated_in_motif'],
                    # 'mut_motif': result['bases_mutated_in_motif'],
                    # 'mut_not_in_motif': result['bases_mutated_not_in_motif'],
                    # 'stat_count': result['bases_not_mutated_in_motif'],
                    # 'ref_count': result['bases_not_mutated_not_in_motif']
                }

                motif_matches.append(motif_dict)

return motif_matches

in context_window.py:

def read_VCF_with_context_window(muts, asm, window_size): cn = complementary_nucleotide mutations = defaultdict(float) N_skipped = 0

N_skipped_indels = 0

raw_mutations = []

for line in muts.read().split("\n"):
    if line.startswith("#"):
        continue
    if len(line) < 10:
        continue

    col_list = line.split()
    if len(col_list) < 4:
        continue

    # ID = col_list[2]

    # chromosome is expected to be one or two number or one letter
    chrom = col_list[0]  # VCF CHROM
    if chrom.lower().startswith("chr"):
        chrom = chrom[3:]
    # if len(chrom) == 2 and chrom[1] not in "0123456789":
    #     chrom = chrom[0]

    pos = int(col_list[1])  # VCF POS
    x = col_list[3]         # VCF REF
    y = col_list[4]         # VCF ALT

    # if multiple REF or ALT alleles are given, ignore mutation entry (could mean seq error, could mean deletion)
    if len(x) != 1:
        N_skipped += 1
        continue
    if len(y) != 1:
        N_skipped += 1
        continue

    transcript_strand = '+'  # bad assumption about transcript strand

    raw_mutations.append((chrom, pos, transcript_strand, x, y))

# print("RAW", raw_mutations)
# print("INDELS", N_skipped)

mutations_with_context = []
if len(raw_mutations) > 0:
    contexts = get_context_twobit_window(raw_mutations, asm, window_size)
    if contexts is None or len(contexts) == 0:
        return None, None

    for (chrom, pos, transcript_strand, x, y) in raw_mutations:
        (p5, p3), seq_with_coords = contexts.get((chrom, pos), (("N", "N"), []))
        # print("RESULT: {} {}".format(p5, p3))

        if len(set([p5, x, y, p3]) - set(nucleotides)) > 0:
            # print(chrom, pos, p5, p3, x)
            # print("Skipping invalid nucleotides")
            N_skipped += 1
            continue
        if x in "CT":
            mutations[p5 + p3 + x + y] += 1.0
        else:
            # complementary mutation
            mutations[cn[p3] + cn[p5] + cn[x] + cn[y]] += 1.0

        mutations_with_context.append((chrom, pos, transcript_strand, x, y, seq_with_coords))
N_loaded = int(sum(mutations.values()))
processing_stats = {'loaded': N_loaded, 'skipped': N_skipped, 'format': 'VCF'}
return mutations, mutations_with_context, processing_stats