Closed edsu7 closed 3 months ago
Develop a nextflow pipeline using nf-core sample for the following steps:
Python code to convert the xml to vcf: https://drive.google.com/drive/folders/1Pb7lnI1txPF8K2M1ntrBNIhJzV_hdrxy
import xml.etree.ElementTree as ET
import pandas as pd
def extract_data_from_short_variant(short_variant):
return {
# Extract useful information from the xml file
'AF': short_variant.get('allele-fraction'),
'ALT': short_variant.get('alternate-sequence'),
'#CHROM': short_variant.get('chromosome'),
'DP': short_variant.get('depth'),
'POS': short_variant.get('genomic-start'),
'REF': short_variant.get('reference-sequence'),
}
# Process dataframe by adding "ID", "QUAL", "FILTER" with default value "." and "INFO" with "DP=#;AF=#"
# where DP means depth and AF means allele frequency
def process_dataframe(df):
df['ID'] = '.'
df['QUAL'] = '.'
df['FILTER'] = '.'
df['INFO'] = 'DP=' + df['DP'].astype(str) + ';AF=' + df['AF'].astype(str)
df.drop(['DP', 'AF'], axis=1, inplace=True)
desired_order = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']
df = df[desired_order]
return df
tree = ET.parse('../example.xml')
root = tree.getroot()
# Extract information from xml
data = []
for short_variant in root.findall('.//short-variant'):
short_variant_data = extract_data_from_short_variant(short_variant)
data.append(short_variant_data)
df = pd.DataFrame(data)
# Process the DataFrame
df_processed = process_dataframe(df)
# Output into vcf file
df_processed.to_csv('shortvariant.vcf', sep='\t', index=False)
VCF file for short variant:
$ python3 short_variant_to_vcf_update.py
Extract short variant info from the xml file Output order sorted based on "CHROM" and "POS" short_variant_to_vcf_update.py:
import xml.etree.ElementTree as ET
import pandas as pd
def extract_data_from_short_variant(short_variant):
return {
# Extract useful information from the xml file
'AF': short_variant.get('allele-fraction'),
'ALT': short_variant.get('alternate-sequence'),
'#CHROM': short_variant.get('chromosome'),
'DP': short_variant.get('depth'),
'POS': short_variant.get('genomic-start'),
'REF': short_variant.get('reference-sequence'),
}
def chr_pos_sort(df):
# Define a custom sort order for chromosome numbers
chrom_order = {'chr' + str(i): i for i in range(1, 23)}
chrom_order['chrX'] = 23
chrom_order['chrY'] = 24
# Apply the custom sort order
df['#CHROM'] = df['#CHROM'].map(chrom_order)
df['POS'] = df['POS'].astype(int)
# Sort the DataFrame first by '#CHROM' and then by 'POS'
sorted_df = df.sort_values(by=['#CHROM', 'POS'])
# Convert the chromosome numbers back to their original format
sorted_df['#CHROM'] = 'chr' + sorted_df['#CHROM'].astype(str)
return sorted_df
# Process dataframe by adding "ID", "QUAL", "FILTER" with default value "." and "INFO" with "DP=#;AF=#"
# where DP means depth and AF means allele frequency
def process_dataframe(df):
df['ID'] = '.'
df['QUAL'] = '.'
df['FILTER'] = '.'
df['INFO'] = 'DP=' + df['DP'].astype(str) + ';AF=' + df['AF'].astype(str)
df.drop(['DP', 'AF'], axis=1, inplace=True)
desired_order = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']
df = df[desired_order]
sorted_df = chr_pos_sort(df.copy())
return sorted_df
tree = ET.parse('../example.xml')
root = tree.getroot()
# Extract information from xml
data = []
for short_variant in root.findall('.//short-variant'):
short_variant_data = extract_data_from_short_variant(short_variant)
data.append(short_variant_data)
df = pd.DataFrame(data)
# Process the DataFrame
df_processed = process_dataframe(df)
# Output into vcf file
df_processed.to_csv('shortvariant.vcf', sep='\t', index=False)
shortvariant.vcf:
$ bash bash_script.sh
bash_script.sh:
# extract header information from xml file
grep -E "\<variant-report|\<sample" example.xml | sed 's/ /\n/g' | grep "=" | sed 's/"[^"]*$/"/g' | sed 's/^/##/g' > header_2.vcf
# Complete header information & merge with shortvariant info
cat header_1.vcf header_2.vcf header_3.vcf shortvariant.vcf > shortvariant_complete.vcf
header_1.vcf:
##fileformat=VCFv4.3
header_2.vcf:
##xmlns:vr="http://foundationmedicine.com/compbio/variant-report"
##curation-version-id="3600702"
##disease="OVARY"
##flowcell-analysis="2000013681"
##gender="female"
##human-genome-assembly="hg19"
##specimen-id="2001205343"
##study="FACT/bTMB-AMC-PRO-19-1209-CAM"
##test-request="ORD-0961212-01"
##test-type="BFAST"
##name="ORD-0961212-01*US0939362.01"
##bait-set="CF3"
header_3.vcf:
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
shortvariant_complete.vcf:
rearrangement_to_vcf_update.py:
import xml.etree.ElementTree as ET
import pandas as pd
from pyfaidx import Fasta
def custom_sort_key(item):
parts = item.split(':')
try:
chr_num = int(parts[0].replace('chr', ''))
coord = int(parts[1])
return (chr_num, coord)
except (ValueError, IndexError):
return item
# extract ref nucletide
def extract_nucleotide(fasta_file, pos):
fasta = Fasta(fasta_file)
sequence = fasta[pos.split(':')[0]][int(pos.split(':')[1])-1].seq
fasta.close()
return sequence
# create alt
def alternative(pos1, pos2):
dir1 = dir[pos1]
dir2 = dir[pos2]
ref_nt = ref[pos1]
if dir1 == '+':
if dir2 == '+':
return ref_nt + ']' + pos2 + ']'
elif dir2 == '-':
return ref_nt + '[' + pos2 + '['
elif dir1 == '-':
if dir2 == '+':
return ']' + pos2 + ']' + ref_nt
elif dir2 == '-':
return '[' + pos2 + '[' + ref_nt
return None
# Sort the final output with chr and pos
def chr_pos_sort(df):
# Define a custom sort order for chromosome numbers
chrom_order = {'chr' + str(i): i for i in range(1, 23)}
chrom_order['chrX'] = 23
chrom_order['chrY'] = 24
# Apply the custom sort order
df['#CHROM'] = df['#CHROM'].map(chrom_order)
df['POS'] = df['POS'].astype(int)
# Sort the DataFrame first by '#CHROM' and then by 'POS'
sorted_df = df.sort_values(by=['#CHROM', 'POS'])
# Convert the chromosome numbers back to their original format
sorted_df['#CHROM'] = 'chr' + sorted_df['#CHROM'].astype(str)
return sorted_df
# Parse the XML file
tree = ET.parse('example.xml')
root = tree.getroot()
fasta_file = 'hg19.fa'
data = [] # Create a list to store the extracted data
ref = {} # contian reference nt
dir = {} # contian direction info
junc = [] # contain junction
# Iterate over each "short-variant"
for rearrangement in root.findall('.//rearrangement'):
# Extract data from "short-variant"
rearrangement_data = {
'AF': rearrangement.get('allele-fraction'),
'Pos1': rearrangement.get('pos1'),
'Pos2': rearrangement.get('pos2'),
'DP': rearrangement.get('supporting-read-pairs'),
'type': rearrangement.get('type'),
'genomic-type': rearrangement.find('.//chimeric-junctions').get('genomic-type'),
'description': rearrangement.find('.//chimeric-junction').get('description'),
}
junc.append(rearrangement.get('pos1'))
junc.append(rearrangement.get('pos2'))
ref[rearrangement.get('pos1')] = extract_nucleotide(fasta_file, rearrangement.get('pos1'))
ref[rearrangement.get('pos2')] = extract_nucleotide(fasta_file, rearrangement.get('pos2'))
print(rearrangement.get('pos1'), ref[rearrangement.get('pos1')])
# Check if there are any "short-variant-intragenic-annotation" elements
partner_breakpoints = rearrangement.findall('.//partner-breakpoint')
for partner_breakpoint in partner_breakpoints:
dir[partner_breakpoint.get('chromosome') + ":" + partner_breakpoint.get('annotation-position')] = partner_breakpoint.get('genomic-disruption-direction')
# print(partner_breakpoint.get('chromosome') + ":" + partner_breakpoint.get('annotation-position'), dir[partner_breakpoint.get('chromosome') + ":" + partner_breakpoint.get('annotation-position')])
data.append(rearrangement_data)
sorted_junc = sorted(junc, key=custom_sort_key) # sort junction based on first chr, than pos
# create a dictonary, naming bnd_# for all junctions based on sorted order
junction = {}
num = 0
for item in sorted_junc:
num +=1
junction[item] = f'bnd_{num}'
# Create a DataFrame from the extracted data
df = pd.DataFrame(data)
# Create a DataFrame for vcf file
df_a = pd.DataFrame()
df_a[['#CHROM', 'POS']] = df['Pos1'].str.split(':', expand=True)
df_a['POS'] = df_a['POS'].astype(int)
df_a['ID'] = df['Pos1'].map(junction)
df_a['REF'] = df['Pos1'].map(ref)
df_a['ALT'] = df.apply(lambda row: alternative(row['Pos1'], row['Pos2']), axis=1)
df_a['QUAL'] = '.'
df_a['FILTER'] = '.'
df_a['INFO'] = 'SVTYPE=BND;MATEID=' + df['Pos2'].map(junction) + ';DP=' + df['DP'].astype(str) + ';AF=' + df['AF'].astype(str)
df_b = pd.DataFrame()
df_b[['#CHROM', 'POS']] = df['Pos2'].str.split(':', expand=True)
df_b['POS'] = df_b['POS'].astype(int)
df_b['ID'] = df['Pos2'].map(junction)
df_b['REF'] = df['Pos2'].map(ref)
df_b['ALT'] = df.apply(lambda row: alternative(row['Pos2'], row['Pos1']), axis=1)
df_b['QUAL'] = '.'
df_b['FILTER'] = '.'
df_b['INFO'] = 'SVTYPE=BND;MATEID=' + df['Pos1'].map(junction) + ';DP=' + df['DP'].astype(str) + ';AF=' + df['AF'].astype(str)
result = pd.concat([df_a, df_b], ignore_index=True)
sorted_df = chr_pos_sort(result.copy())
# Output the DataFrame to a TSV file
sorted_df.to_csv('rearrangement.vcf', sep='\t', index=False)
rearrangement_complete.vcf:
https://gatk.broadinstitute.org/hc/en-us/articles/5358875253147-LiftoverVcf-Picard
java -jar picard.jar CreateSequenceDictionary \
R=hg38.fa \
O=hg38.dict
java -jar picard.jar LiftoverVcf I=shortvariant_complete.vcf \
O=shortvariant_lifted_over.vcf \
CHAIN=hg19ToHg38.over.chain \
REJECT=rejected_variants.vcf \
R=hg38.fa
java -jar picard.jar LiftoverVcf I=rearrangement_complete.vcf \
O=rearrangement_lifted_over.vcf \
CHAIN=hg19ToHg38.over.chain \
REJECT=ra_rejected_variants.vcf \
R=hg38.fa
shortvariant_lifted_over.vcf:
...
rearrangement_lifted_over.vcf:
...
Some issues:
Update
Would argue to use base hg38 genome assembly reference
reason1. the chain file (hg19ToHg38.over.chain.gz) would only have link to base hg38 reference
reason2. GRCh38_hla_decoy_ebv reference contains much more contigs and these contigs will carry no information in the vcf files, but they will use lots of space in the header of vcf files
wc -l GRCh38_hla_decoy_ebv.fa.fai
3366 GRCh38_hla_decoy_ebv.fa.fai
wc -l hg38.fa.fai
455 hg38.fa.fai
Completed. Workflow available https://github.com/icgc-argo-workflows/xml_variant_ingestion
Based on Gaungqiao's findings in ticket MONSTAR's XML to VCF looks convertible.
A pipeline will be needed to trackable and perform uniform conversion of MONSTAR's XMLs