openvax / gtfparse

Parsing tools for GTF (gene transfer format) files
Apache License 2.0
109 stars 25 forks source link

Read_gtf but no write_gtf? #21

Open rsalz opened 3 years ago

rsalz commented 3 years ago

It would be a nice feature to also be able to write the DataFrame back to a GTF file. With the attribute column parsed as it is, it's impossible to go in the opposite direction.

Benoitdw commented 11 months ago

For what it's worth, here is a small function to write a GTF file. I wanted to be quick, so it's not very polished...

import polars
from pathlib import Path
import typing as t

# Names of the fixed columns written ahead of the attribute column, in output
# order; every other column of a row is serialized as a GTF attribute.
COMMONS_COL = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame']

def write_gtf(df: polars.DataFrame, export_path: str | Path, headers: t.Optional[t.List[str]] = None):
    """Write *df* to *export_path* as a GTF file, one record per row.

    Args:
        df: DataFrame containing the fixed GTF columns plus any number of
            attribute columns.
        export_path: Destination file; it is created or overwritten.
        headers: Optional lines written verbatim (each followed by a newline)
            before the records. No comment prefix is added to them.
    """
    # Pin the encoding so the output does not depend on the platform default.
    with open(export_path, 'w', encoding='utf-8') as f:
        for header in headers or ():
            f.write(f"{header}\n")
        for row in df.iter_rows(named=True):
            # Fixed columns first, then the attribute column, tab-separated.
            f.write(f"{commons_cols(row)}\t{custom_fields(row)}\n")

def commons_cols(row) -> str:
    """Render the fixed GTF columns of *row* as one tab-separated string.

    Args:
        row: Mapping from column name to value that contains every name in
            COMMONS_COL (e.g. a dict from ``DataFrame.iter_rows(named=True)``).

    Returns:
        The COMMONS_COL values, in that order, joined by tabs. Missing values
        (``None``, an empty string or a float NaN) are written as ``.``.
    """
    def render(value) -> str:
        # Only None / "" / NaN mean "missing". A plain truthiness test would
        # also turn legitimate zeros (frame 0, score 0) into ".".
        is_nan = isinstance(value, float) and value != value
        if value is None or value == "" or is_nan:
            return "."
        return str(value)

    return "\t".join(render(row[field]) for field in COMMONS_COL)

def custom_fields(row) -> str:
    """Render the non-fixed columns of *row* as the GTF attribute column.

    Args:
        row: Mapping from column name to value (e.g. a dict from
            ``DataFrame.iter_rows(named=True)``).

    Returns:
        ``key "value";`` pairs separated by a single space, for every column
        that is not in COMMONS_COL and has a value. Returns an empty string
        when the row has no attributes.
    """
    attributes = []
    for field, value in row.items():
        if field in COMMONS_COL:
            continue
        # Skip only genuinely missing values; a truthiness test would also
        # drop attributes whose value is 0 or False.
        if value is None or value == "":
            continue
        # Every attribute is terminated by a semicolon, including the last.
        attributes.append(f'{field} "{value}";')
    return " ".join(attributes)

And the test :

from gtfparse import read_gtf, write_gtf
from data import data_path
from polars import DataFrame

# Input GTF used by the round-trip test below; presumably resolved by
# data_path against the test data directory — TODO confirm.
REFSEQ_GTF_PATH = data_path("refseq.ucsc.small.gtf")

def test_write_gtf(tmp_path):
    """Smoke test: a GTF written by write_gtf can be parsed by read_gtf again."""
    output_file = tmp_path / "dummy_gtf.gtf"
    parsed = read_gtf(REFSEQ_GTF_PATH)
    write_gtf(parsed, output_file)
    # Only the result type is checked here, not the round-tripped content.
    reloaded = read_gtf(str(output_file))
    assert isinstance(reloaded, DataFrame)
svenbioinf commented 6 months ago

For what it's worth, here is a small function to write a GTF file. I wanted to be quick, so it's not very polished...

import polars
from pathlib import Path
import typing as t

# Names of the fixed columns written ahead of the attribute column, in output
# order; every other column of a row is serialized as a GTF attribute.
COMMONS_COL = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame']

def write_gtf(df: polars.DataFrame, export_path: str | Path, headers: t.Optional[t.List[str]] = None):
    """Write *df* to *export_path* as a GTF file, one record per row.

    Args:
        df: DataFrame containing the fixed GTF columns plus any number of
            attribute columns.
        export_path: Destination file; it is created or overwritten.
        headers: Optional lines written verbatim (each followed by a newline)
            before the records. No comment prefix is added to them.
    """
    # Pin the encoding so the output does not depend on the platform default.
    with open(export_path, 'w', encoding='utf-8') as f:
        for header in headers or ():
            f.write(f"{header}\n")
        for row in df.iter_rows(named=True):
            # Fixed columns first, then the attribute column, tab-separated.
            f.write(f"{commons_cols(row)}\t{custom_fields(row)}\n")

def commons_cols(row) -> str:
    """Render the fixed GTF columns of *row* as one tab-separated string.

    Args:
        row: Mapping from column name to value that contains every name in
            COMMONS_COL (e.g. a dict from ``DataFrame.iter_rows(named=True)``).

    Returns:
        The COMMONS_COL values, in that order, joined by tabs. Missing values
        (``None``, an empty string or a float NaN) are written as ``.``.
    """
    def render(value) -> str:
        # Only None / "" / NaN mean "missing". A plain truthiness test would
        # also turn legitimate zeros (frame 0, score 0) into ".".
        is_nan = isinstance(value, float) and value != value
        if value is None or value == "" or is_nan:
            return "."
        return str(value)

    return "\t".join(render(row[field]) for field in COMMONS_COL)

def custom_fields(row) -> str:
    """Render the non-fixed columns of *row* as the GTF attribute column.

    Args:
        row: Mapping from column name to value (e.g. a dict from
            ``DataFrame.iter_rows(named=True)``).

    Returns:
        ``key "value";`` pairs separated by a single space, for every column
        that is not in COMMONS_COL and has a value. Returns an empty string
        when the row has no attributes.
    """
    attributes = []
    for field, value in row.items():
        if field in COMMONS_COL:
            continue
        # Skip only genuinely missing values; a truthiness test would also
        # drop attributes whose value is 0 or False.
        if value is None or value == "":
            continue
        # Every attribute is terminated by a semicolon, including the last.
        attributes.append(f'{field} "{value}";')
    return " ".join(attributes)

And the test :

from gtfparse import read_gtf, write_gtf
from data import data_path
from polars import DataFrame

# Input GTF used by the round-trip test below; presumably resolved by
# data_path against the test data directory — TODO confirm.
REFSEQ_GTF_PATH = data_path("refseq.ucsc.small.gtf")

def test_write_gtf(tmp_path):
    """Smoke test: a GTF written by write_gtf can be parsed by read_gtf again."""
    output_file = tmp_path / "dummy_gtf.gtf"
    parsed = read_gtf(REFSEQ_GTF_PATH)
    write_gtf(parsed, output_file)
    # Only the result type is checked here, not the round-tripped content.
    reloaded = read_gtf(str(output_file))
    assert isinstance(reloaded, DataFrame)

Nice one. Thank you!

gamazeps commented 5 months ago

Is there a reason why this is not merged?

Happy to do the PR if needed :)

Benoitdw commented 5 months ago

You've given me the motivation to make a PR :)

gamazeps commented 5 months ago

Go for it, otherwise happy to do it at the end of the week :)