Dear all,
I am sharing a tailored script for converting RFMix2 output into TRACTS input.
# rfmix2tracts.py
import pandas as pd
import sys
import os
def find_header_row(file_path):
"""
Find the header row in the file. It is the first line that starts with '#'.
"""
with open(file_path, 'r') as file:
for i, line in enumerate(file):
if line.startswith('#'):
return i
return None
def process_rfmix_to_tracts(input_file, output_folder):
"""
Process an RFMix output file and generate TRACTS input files.
"""
# Find the header row
header_row = find_header_row(input_file)
if header_row is None:
print("Header row could not be found.")
return
# Read the input file
data = pd.read_csv(input_file, sep='\t', header=header_row, skiprows=1)
# Get the list of samples (excluding the first 6 columns)
samples = data.columns[6:]
# Process each sample (in case of a file with multiple samples)
for sample in samples:
# Determine the output file name based on the haplotype (.0 or .1)
file_suffix = '_B.viterbi.bed.cm' if sample.endswith('.1') else '_A.viterbi.bed.cm'
output_file = os.path.join(output_folder, f"{sample.split('.')[0]}{file_suffix}")
# Extract the relevant columns for the sample and reorder them
sample_data = data[['#chm', 'spos', 'epos', sample, 'sgpos', 'egpos']]
sample_data.columns = ['chrom', 'begin', 'end', 'assignment', 'cmBegin', 'cmEnd']
# Save to file
sample_data.to_csv(output_file, sep='\t', index=False)
print("Process completed. The files have been generated.")
if __name__ == "__main__":
if len(sys.argv) != 3:
print("Usage: python script.py <input_file> <output_folder>")
else:
input_file_path = sys.argv[1]
output_folder_path = sys.argv[2]
process_rfmix_to_tracts(input_file_path, output_folder_path)
# Contact: franco.lmarsico@gmail.com
# GitHub: marsicoFL
# Example of how to run the script:
# python rfmix2tracts.py /path/to/rfmix2output.msp.tsv /path/to/output/folder/
# Or if you want to run it in the same folder:
# python rfmix2tracts.py toy.msp.tsv .
Dear all, I am sharing a tailored script for converting RFMix2 output into TRACTS input.
Best, Franco