Open rsalz opened 3 years ago
For what it's worth, here is a small function to write a GTF file. I wanted to be quick, so it's not very polished...
import polars
from pathlib import Path
import typing as t
# Names of the fixed GTF columns, in the order they are written to each line.
# Every other column of a row is emitted as a key/value attribute instead.
COMMONS_COL = [
    'seqname',
    'source',
    'feature',
    'start',
    'end',
    'score',
    'strand',
    'frame',
]
def write_gtf(df: polars.DataFrame, export_path: str | Path, headers: t.Optional[t.List[str]] = None):
    """Write ``df`` to ``export_path`` as a GTF text file.

    Each header string is written first, on its own line and exactly as
    given. Every DataFrame row then becomes one line: the fixed columns
    rendered by ``commons_cols``, a tab, and the attribute text rendered by
    ``custom_fields``.

    Args:
        df: Frame to export; rows are read with ``iter_rows(named=True)`` and
            must carry every column named in ``COMMONS_COL``.
        export_path: Destination file; an existing file is overwritten.
        headers: Optional lines to emit before the records. ``None`` (the
            default) or an empty list writes no header.
    """
    # Pin the encoding so the output does not depend on the platform locale.
    with open(export_path, 'w', encoding='utf-8') as f:
        for header in headers or ():
            f.write(f"{header}\n")
        for row in df.iter_rows(named=True):
            f.write(f"{commons_cols(row)}\t{custom_fields(row)}\n")
def commons_cols(row) -> str:
    """Render the fixed GTF columns of ``row`` as one tab-separated string.

    Args:
        row: Mapping from column name to value; it must contain every name
            listed in ``COMMONS_COL``.

    Returns:
        The values in ``COMMONS_COL`` order joined by tabs, with ``"."``
        standing in for missing values.

    Raises:
        KeyError: If ``row`` lacks one of the ``COMMONS_COL`` names.
    """
    rendered = []
    for field in COMMONS_COL:
        value = row[field]
        # Only None and the empty string count as missing. A plain truthiness
        # test would also replace legitimate zeros (frame 0, score 0) by ".".
        is_missing = value is None or value == ""
        rendered.append("." if is_missing else str(value))
    return "\t".join(rendered)
def custom_fields(row) -> str:
    """Render the non-fixed columns of ``row`` as a GTF attribute string.

    Every key of ``row`` that is not listed in ``COMMONS_COL`` and has a
    value becomes ``key "value";``. Attributes are separated by one space.

    Args:
        row: Mapping from column name to value.

    Returns:
        The attribute text, or an empty string when no attribute has a value.
    """
    attributes = []
    for field, value in row.items():
        if field in COMMONS_COL:
            continue
        # Skip only absent values (None / empty string). A truthiness test
        # would also discard attributes whose value is a legitimate 0.
        if value is None or value == "":
            continue
        # Each attribute carries its own terminating semicolon, so the last
        # one is terminated as well.
        attributes.append(f'{field} "{value}";')
    return " ".join(attributes)
And the test:
from gtfparse import read_gtf, write_gtf
from data import data_path
from polars import DataFrame
# Sample GTF input used by the test below, located through data_path.
REFSEQ_GTF_PATH = data_path("refseq.ucsc.small.gtf")
def test_write_gtf(tmp_path):
    """Smoke test: a GTF written by write_gtf can be parsed again by read_gtf."""
    output_file = tmp_path / "dummy_gtf.gtf"

    # Load the reference annotation and write it back out to a scratch file.
    source_frame = read_gtf(REFSEQ_GTF_PATH)
    write_gtf(source_frame, output_file)

    # Re-reading the written file must still give a polars DataFrame.
    reloaded = read_gtf(str(output_file))
    assert isinstance(reloaded, DataFrame)
For what it's worth, here is a small function to write a GTF file. I wanted to be quick, so it's not very polished...
import polars
from pathlib import Path
import typing as t

# Names of the fixed GTF columns, in the order they are written to each line.
COMMONS_COL = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame']


def write_gtf(df: polars.DataFrame, export_path: str | Path, headers: t.Optional[t.List[str]] = None):
    """Write ``df`` to ``export_path`` as a GTF text file.

    Header strings are written first, one per line; then one line per
    DataFrame row made of the fixed columns, a tab, and the attributes.
    """
    # Pin the encoding so the output does not depend on the platform locale.
    with open(export_path, 'w', encoding='utf-8') as f:
        for header in headers or ():
            f.write(f"{header}\n")
        for row in df.iter_rows(named=True):
            f.write(f"{commons_cols(row)}\t{custom_fields(row)}\n")


def commons_cols(row) -> str:
    """Return the ``COMMONS_COL`` values of ``row`` joined by tabs ("." if missing)."""
    rendered = []
    for field in COMMONS_COL:
        value = row[field]
        # Only None / empty string are missing; zeros (e.g. frame 0) are kept.
        is_missing = value is None or value == ""
        rendered.append("." if is_missing else str(value))
    return "\t".join(rendered)


def custom_fields(row) -> str:
    """Return the non-``COMMONS_COL`` values of ``row`` as ``key "value";`` pairs."""
    attributes = []
    for field, value in row.items():
        if field in COMMONS_COL:
            continue
        # Skip only absent values; a value of 0 is a real attribute value.
        if value is None or value == "":
            continue
        attributes.append(f'{field} "{value}";')
    return " ".join(attributes)
And the test:
from gtfparse import read_gtf, write_gtf
from data import data_path
from polars import DataFrame

# Sample GTF input used by the test below, located through data_path.
REFSEQ_GTF_PATH = data_path("refseq.ucsc.small.gtf")


def test_write_gtf(tmp_path):
    """Smoke test: a GTF written by write_gtf can be parsed again by read_gtf."""
    gtf_dict = read_gtf(REFSEQ_GTF_PATH)
    write_gtf(gtf_dict, tmp_path/"dummy_gtf.gtf")
    assert isinstance(read_gtf(str(tmp_path/"dummy_gtf.gtf")), DataFrame)
Nice one. Thank you!
Is there a reason why this is not merged?
Happy to do the PR if needed :)
You give me the motivation to make a PR :)
Go for it, otherwise happy to do it at the end of the week :)
It would be a nice feature to also be able to write the DataFrame back to a GTF file. With the attribute column parsed as it is, it's impossible to go in the opposite direction.