lgatto / rpx

R Interface to the ProteomeXchange Repository
http://lgatto.github.io/rpx/
4 stars 2 forks source link

File types #22

Closed lgatto closed 1 year ago

lgatto commented 1 year ago

Since versions 2.6.1 and 2.7.1, rpx doesn't rely on the (defunct) project README.txt file to get a listing of files. This README.txt file also provided other information, including the file types. Currently, these are set to NA.

It would be nice to infer these based on the file extensions, to identify raw files, binary raw files, identification files, parameter files, documents, ...

An example of these types and extensions, composed based on 20K+ files, is shown below:

## Named list mapping each file-type label to the file extensions assigned
## to it. Every element is a character vector; upper/lower/mixed-case
## spellings of an extension are listed as separate entries.
exts <- list(
  ## binary raw
  raw = c("raw", "RAW", "Raw", "wiff", "wiff2"),
  ## open raw
  mz = c(
    "mzML", "MZML", "mzXML", "mzxml", "MZXML", "mzml",
    "TraML", "traML", "traml",
    "imzML", "mzData", "mzdata", "CDF"
  ),
  ## peak lists
  pks = c("mgf", "MGF", "pkl", "PKL"),
  ## id
  id = c(
    "mzid", "mzId", "mzID", "idXML",
    "mzIdentML", "mzidentML", "mzidentml",
    "IdXML", "pepXML", "pepxml", "pepnovo", "idxml"
  ),
  ## sequences
  fas = c("fasta", "faa", "fas", "FASTA", "fa", "fasts"),
  ## tabular
  tbl = c(
    "csv", "CSV", "xls", "XLS", "xlsx", "XLSX", "xlsb", "xlsm",
    "txt", "TXT", "tsv", "mztab", "mzTab", "tabular", "tab", "psmtsv"
  ),
  ## archives
  arx = c(
    "zip", "ZIP", "tar", "tgz", "rar", "RAR",
    "7z", "bz2", "webarchive", "gz", "xy"
  ),
  ## parameters, metadata, configuration, ...
  par = c(
    "PARAMS", "params", "param", "par", "method", "Method",
    "ini", "mtd", "index", "apar", "json", "toml", "yaml",
    "config", "FAmethod", "properties"
  ),
  ## commercial software
  com = c(
    "dat",
    "pdResult", "pdStudy", "pdResultView", "pdAnalysis",
    "pdProcessingWF", "pdConsensusWF",
    "ProgenesisQIPExperiment",
    "ProgenesisQIPArchive",
    "ProgenesisLcmsExperiment",
    "ProgenesisQIPMultiFractionExperiment"
  ),
  ## open/free software such as skyline
  sft = c("view", "skyd", "sky", "skyl"),
  code = c("R", "r", "pl", "py", "js", "jar", "Rmd", "sh", "ipynb"),
  data = c("RData", "RDS", "sqlite", "mz5", "h5"),
  ## documents, descriptions
  doc = c(
    "pdf", "PDF", "pptx", "ppt", "doc", "docx",
    "html", "htm", "shtml", "rtf", "readme"
  ),
  ## images
  img = c(
    "tiff", "jpg", "gif", "tif", "TIF",
    "png", "PNG", "JPG", "svg"
  ),
  ## backup, download, executables, ...
  tmp = c(
    "bak", "download", "exe", "crdownload", "sgdownload", "css",
    "bin", "dll", "chksum", "cksum", "md5", "temp"
  ),
  xml = c("xml", "XML"),
  gen = c("gtf", "gff", "fastq", "vcf", "plink"),
  none = ""
)