ababaian / palmdb

Database of virus RdRp barcode sequences
Creative Commons Zero v1.0 Universal
0 stars 0 forks source link

[palmdb4] `palmdb` Table: Align all palmDB sequences to sOTU sequences #5

Open almosnow opened 2 months ago

ababaian commented 2 months ago

This issue documents how to build the `palmdb` table, which links every unique palmprint to the sOTU centroid palmprint it aligns to.

# Database of sOTU
# Build a DIAMOND protein database named "sotu" from the sOTU centroid
# palmprints. `makedb` is a DIAMOND subcommand, and the database name must
# follow --db directly (a '#' after whitespace starts a shell comment).
diamond makedb \
  --in "$WORK"/palmdb-2023-04/final/sotus.palmprint.faa \
  --db sotu

# Create unique_OTU to sOTU membership file
# NOTE THIS NEEDS TO BE UPDATED TO A GLOBAL ALIGNMENT
#   --masking 0 : do not mask the sequences
#   --unal 1    : also report queries without a hit (subject is '*')
#   -k 1        : keep only the top target per query
#   -f 6 ...    : tabular output with the twelve fields listed below
diamond blastp \
  -q "$WORK"/palmdb-2023-04/final/unique.palmprint.faa \
  -d sotu \
  --masking 0 --unal 1 \
  -k 1 \
  -f 6 qseqid  qstart qend qlen qstrand \
       sseqid  sstart send slen \
       pident evalue \
       full_qseq \
  > unique_to_sotu.pro

Assign sOTU nicknames (See Also #8)

Assigned sOTU nicknames from previous palmDB

Saved as: nick/assigned.sotu.csv

-- Export the sOTU / nickname pairs carried by the centroid rows of the
-- existing palmdb table
SELECT
    sotu,
    nickname
FROM palmdb
WHERE centroid = 'true'

Shard out assigned sOTU and nickname files

# Split the two-column CSV into one file per column:
# column 1 -> sOTU ids, column 2 -> nicknames
cut -d ',' -f 1 assigned.sotu.csv > assigned.sotu.otu
cut -d ',' -f 2 assigned.sotu.csv > assigned.sotu.nick

List of all sOTU (quoted)

# Take every FASTA header line, turn each '>' into a double quote and append
# a closing double quote, yielding one quoted sOTU id per line
grep ">" palmdb-2023-04/final/sotus.palmprint.faa \
  | sed -e 's/>/"/g' -e 's/$/"/g' \
  > nick/input.sotu.list

List of all novel sOTU

# Keep only the sOTU ids that have no previously assigned nickname.
# -F treats every id as a literal string rather than a regular expression,
# which is what is meant here and is much cheaper for a large pattern file.
# NOTE(review): this is still a substring match; it relies on the ids in
# assigned.sotu.otu being double-quoted like those in input.sotu.list — confirm.
grep -F -v -f assigned.sotu.otu input.sotu.list > new.otu.otu

Generate new nickname space

# Shuffle the adjective and noun word lists independently, join them
# line by line, strip the tab that paste inserts, and wrap each resulting
# candidate nickname in double quotes
shuf adj > adj.1
shuf Noun > Noun.1
paste adj.1 Noun.1 \
  | tr -d '\t' \
  | sed -e 's/^/"/g' -e 's/$/"/g' \
  > nick.1

# deplete if there is a name collision in new nicknames
# from the previously assigned nicknames
# The number of nicknames kept is taken from new.otu.otu instead of being
# hard-coded, so the two files pasted together afterwards always have the
# same number of lines. -F matches the assigned nicknames as literal strings.
n_new=$(( $(wc -l < new.otu.otu) ))
grep -F -v -f assigned.sotu.nick nick.1 | head -n "$n_new" > new.otu.nick

# Final CSV: the previously assigned rows first, followed by the novel
# sOTU ids joined to their new nicknames with a comma
{
  cat assigned.sotu.csv
  paste -d ',' new.otu.otu new.otu.nick
} > output.sotu.nick.csv

Cleanup nick/ workspace

# Delete the intermediate files produced by the steps above
rm \
  adj.1 Noun.1 nick.1 \
  assigned.sotu.otu assigned.sotu.nick \
  new.otu.otu new.otu.nick

R parse script: make_palmDB.Rmd

# Build the palmdb table: one row per unique palmprint, linked to the sOTU
# it aligned to and to that sOTU's nickname, then export it as CSV.

# DIAMOND tabular output; the column names mirror the `-f 6` field list
pro <- read.table(file = "unique_to_sotu.pro", header = FALSE)
colnames(pro) <- c("qseqid", "qstart", "qend", "qlen", "qstrand", "sseqid",
                   "sstart", "send", "slen", "pident", "evalue", "full_qseq")

# sOTU -> nickname lookup (merged on `sotu`, provides `nickname`)
nick <- read.csv(file = "nick/output.sotu.nick.csv")

# Initialize palmDB
palmdb <- data.frame(
  palm_id          = pro$qseqid,
  sotu             = pro$sseqid,
  percent_identity = pro$pident,
  # a palmprint is a centroid when its own id is the sOTU id
  centroid         = pro$qseqid == pro$sseqid,
  palmprint        = pro$full_qseq
)

# Left join: every palmprint is kept, even if its sOTU has no nickname row
palmdb <- merge(palmdb, nick, by = "sotu", all.x = TRUE)

# A subject of '*' is recorded as a missing sOTU
palmdb$sotu[palmdb$sotu == "*"] <- NA

# Restore the intended column order (merge() moves the key column first)
palmdb <- palmdb[, c("palm_id", "sotu", "nickname",
                     "percent_identity", "centroid", "palmprint")]

# Sort by the numeric part of the sOTU id (the "u" characters removed), then
# by descending identity; rows with a missing sOTU end up last
palmdb <- palmdb[order(as.numeric(gsub("u", "", palmdb$sotu)),
                       -palmdb$percent_identity), ]

# row.names = FALSE: by default write.csv() prepends an unnamed row-name
# column, which would give seven fields per row instead of the six selected
# above.
# NOTE(review): missing values are written as the string NA (write.csv
# default) — confirm the database import maps that to NULL.
write.csv(palmdb, file = "palmdb.sql.csv", row.names = FALSE)

Create palmdb2 table

-- palmdb2: one row per palmprint, keyed by palm_id
CREATE TABLE palmdb2 (
    palm_id          TEXT NOT NULL PRIMARY KEY,
    sotu             TEXT,
    nickname         TEXT,
    -- NOTE(review): NUMERIC(4) has scale 0, so fractional identities are
    -- rounded to whole numbers on insert — confirm this is intended
    percent_identity NUMERIC(4),
    centroid         BOOLEAN,
    palmprint        TEXT
)
WITH (OIDS = FALSE)
TABLESPACE pg_default;

-- Ownership and privileges: serratus owns the table and is granted ALL,
-- viewer is granted SELECT
ALTER TABLE public.palmdb2 OWNER TO serratus;

GRANT ALL ON TABLE public.palmdb2 TO serratus;
GRANT SELECT ON TABLE public.palmdb2 TO viewer;

Index palmdb2

-- B-tree indexes on palm_id and sotu (both with "C" collation, ascending)
-- and on the centroid flag (descending); NULLs sort last in all three
CREATE INDEX palmdb2_palm_id ON public.palmdb2
    USING btree (palm_id COLLATE pg_catalog."C" ASC NULLS LAST);

CREATE INDEX palmdb2_sotu ON public.palmdb2
    USING btree (sotu COLLATE pg_catalog."C" ASC NULLS LAST);

CREATE INDEX palmdb2_centroid ON public.palmdb2
    USING btree (centroid DESC NULLS LAST);