gravelLab / tracts

A set of tools for modelling ancestry patterns along the genome.
GNU General Public License v2.0
22 stars 14 forks source link

From rfmix2 to Tracts #33

Open MarsicoFL opened 9 months ago

MarsicoFL commented 9 months ago

Dear all, I am sharing a tailored script for converting RFMix2 output into TRACTS input.


# rfmix2tracts.py

import pandas as pd
import sys
import os

def find_header_row(file_path):
    """
    Find the header row in the file. It is the first line that starts with '#'.
    """
    with open(file_path, 'r') as file:
        for i, line in enumerate(file):
            if line.startswith('#'):
                return i
    return None

def process_rfmix_to_tracts(input_file, output_folder):
    """
    Process an RFMix output file and generate TRACTS input files.
    """
    # Find the header row
    header_row = find_header_row(input_file)
    if header_row is None:
        print("Header row could not be found.")
        return

    # Read the input file
    data = pd.read_csv(input_file, sep='\t', header=header_row, skiprows=1)

    # Get the list of samples (excluding the first 6 columns)
    samples = data.columns[6:]

    # Process each sample (in case of a file with multiple samples)
    for sample in samples:
        # Determine the output file name based on the haplotype (.0 or .1)
        file_suffix = '_B.viterbi.bed.cm' if sample.endswith('.1') else '_A.viterbi.bed.cm'
        output_file = os.path.join(output_folder, f"{sample.split('.')[0]}{file_suffix}")

        # Extract the relevant columns for the sample and reorder them
        sample_data = data[['#chm', 'spos', 'epos', sample, 'sgpos', 'egpos']]
        sample_data.columns = ['chrom', 'begin', 'end', 'assignment', 'cmBegin', 'cmEnd']

        # Save to file
        sample_data.to_csv(output_file, sep='\t', index=False)

    print("Process completed. The files have been generated.")

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python script.py <input_file> <output_folder>")
    else:
        input_file_path = sys.argv[1]
        output_folder_path = sys.argv[2]
        process_rfmix_to_tracts(input_file_path, output_folder_path)

# Contact: franco.lmarsico@gmail.com
# GitHub: marsicoFL

# Example of how to run the script:
# python rfmix2tracts.py /path/to/rfmix2output.msp.tsv /path/to/output/folder/
# Or if you want to run it in the same folder:
# python rfmix2tracts.py toy.msp.tsv .

Best, Franco