jump-cellpainting / datasets

Images and other data from the JUMP Cell Painting Consortium
BSD 3-Clause "New" or "Revised" License
155 stars 16 forks source link

Stats on number and size of image files #43

Closed shntnu closed 7 months ago

shntnu commented 1 year ago
# Per-source total size (in bytes) of everything under images/ for the
# production sources of cpg0016-jump.
# Needs: GNU parallel, aws CLI, GNU sed (for "\n" in the replacement), bc.
prod_sources="1 2 3 4 5 6 7 8 9 10 11 13"

# Every step below writes into stats/, so make sure it exists first.
mkdir -p stats

# 1. Save the recursive S3 listing of each source, gzip-compressed.
#    The whole pipeline is single-quoted so that parallel (not the calling
#    shell) runs the pipe and the redirection once per source.
parallel 'aws s3 ls --recursive "s3://cellpainting-gallery/cpg0016-jump/source_{}/" | gzip > stats/cpg0016-jump_source_{}.txt.gz' ::: ${prod_sources}

# 2. Reduce each listing to "bytes,source" rows for keys whose third path
#    component is exactly "images", and put a header line on top.
#    After the tr steps the comma-separated fields are:
#    1=date 2=time 3=size 4=dataset 5=source 6=third path component ...
parallel 'zcat stats/cpg0016-jump_source_{}.txt.gz |tr -s " "|cut -d" " -f1,2,3,4|tr " " ","|tr / ,|cut -d"," -f3,5,6|grep images$|cut -d, -f1,2 |sed "1s/^/bytes,Metadata_Source\n/"|gzip> stats/cpg0016-jump_source_{}_images.csv.gz' ::: ${prod_sources}

# 3. Sum the bytes column of each source. The header line is dropped first
#    (sed 1d): without that the expression handed to bc starts with the word
#    "bytes", which GNU bc only tolerates because it reads it as an unset
#    variable equal to 0.
parallel 'zcat stats/cpg0016-jump_source_{}_images.csv.gz|sed 1d|cut -d, -f1|paste -sd+ - | bc > stats/cpg0016-jump_source_{}_images_size.csv' ::: ${prod_sources}

# 4. Turn each one-number file into a "dataset,source,bytes" row.
parallel 'sed -i "1s/^/cpg0016-jump,source_{},/" stats/cpg0016-jump_source_{}_images_size.csv' ::: ${prod_sources}

# 5. Stack the per-source rows (sorted as text, so source_10 follows source_1).
cat stats/cpg0016-jump_source_*_images_size.csv | sort > stats/cpg0016-jump_source_images_size.csv

# 6. Add the header only after sorting so it stays on the first line.
sed -i "1s/^/dataset,source,bytes\n/" stats/cpg0016-jump_source_images_size.csv

cat stats/cpg0016-jump_source_images_size.csv

# dataset,source,bytes
# cpg0016-jump,source_1,5371241900768
# cpg0016-jump,source_10,8218380973898
# cpg0016-jump,source_11,8408864230727
# cpg0016-jump,source_13,7389117766045
# cpg0016-jump,source_2,8400817571909
# cpg0016-jump,source_3,18197060600282
# cpg0016-jump,source_4,19401189466298
# cpg0016-jump,source_5,14371076831150
# cpg0016-jump,source_6,12843800911238
# cpg0016-jump,source_7,6126096793600
# cpg0016-jump,source_8,7931651003318
# cpg0016-jump,source_9,10096282694514

# Grand total of image bytes over all production sources: drop the header
# row, take the third column (bytes; column 2 holds the source name, which bc
# would evaluate as 0), join the values with "+" and let bc add them up.
sed 1d stats/cpg0016-jump_source_images_size.csv | cut -d, -f3 | paste -sd+ - | bc
# 126755580743747

# Per-source count of TIFF objects in the saved S3 listings of cpg0016-jump.
# Relies on prod_sources and on stats/cpg0016-jump_source_<n>.txt.gz existing.

# 1. Count listing lines that end in "tif" or "tiff". Unlike the size
#    pipeline, the whole key is kept (-f3,5,6-) and there is no "images"
#    filter, so matches anywhere under the source are counted.
#    grep -E is used because egrep is obsolescent in current GNU grep.
parallel 'zcat stats/cpg0016-jump_source_{}.txt.gz |tr -s " "|cut -d" " -f1,2,3,4|tr " " ","|tr / ,|cut -d"," -f3,5,6-|grep -E "ti[f]{1,2}"$|wc -l> stats/cpg0016-jump_source_{}_images_tiff_count.csv' ::: ${prod_sources}

# 2. Turn each one-number file into a "dataset,source,tiffs" row.
parallel 'sed -i "1s/^/cpg0016-jump,source_{},/" stats/cpg0016-jump_source_{}_images_tiff_count.csv' ::: ${prod_sources}

# 3. Stack the per-source rows (sorted as text).
cat stats/cpg0016-jump_source_*_images_tiff_count.csv | sort > stats/cpg0016-jump_source_images_tiff_count.csv

# 4. Add the header only after sorting so it stays on the first line.
sed -i "1s/^/dataset,source,tiffs\n/" stats/cpg0016-jump_source_images_tiff_count.csv

cat stats/cpg0016-jump_source_images_tiff_count.csv

# dataset,source,tiffs
# cpg0016-jump,source_1,1943040
# cpg0016-jump,source_10,4093834
# cpg0016-jump,source_11,3504002
# cpg0016-jump,source_13,2662635
# cpg0016-jump,source_2,4217490
# cpg0016-jump,source_3,7188480
# cpg0016-jump,source_4,7658494
# cpg0016-jump,source_5,7191860
# cpg0016-jump,source_6,6804166
# cpg0016-jump,source_7,2211830
# cpg0016-jump,source_8,7473900
# cpg0016-jump,source_9,3317760

# Grand total of TIFF objects over all production sources: drop the header
# row, take the third column (tiffs; column 2 holds the source name, which bc
# would evaluate as 0), join the values with "+" and let bc add them up.
sed 1d stats/cpg0016-jump_source_images_tiff_count.csv | cut -d, -f3 | paste -sd+ - | bc
# 58267491
# Per-dataset total size (in bytes) of everything under source_4/images/ for
# the pilot datasets.
# Precondition: stats/<dataset>_source_4.txt.gz (gzip-compressed
# `aws s3 ls --recursive` listings) must already exist; the step that creates
# them is not part of this snippet.

# pilot_datasets is not assigned anywhere in this snippet. Keep a value set
# earlier in the session; otherwise fall back to the three datasets named in
# the recorded output below.
: "${pilot_datasets:=cpg0000-jump-pilot cpg0001-cellpainting-protocol cpg0002-jump-scope}"

# 1. Reduce each listing to "bytes,source" rows for keys whose third path
#    component is exactly "images", and put a header line on top.
parallel 'zcat stats/{}_source_4.txt.gz |tr -s " "|cut -d" " -f1,2,3,4|tr " " ","|tr / ,|cut -d"," -f3,5,6|grep images$|cut -d, -f1,2 |sed "1s/^/bytes,Metadata_Source\n/"|gzip> stats/{}_source_4_images.csv.gz' ::: ${pilot_datasets}

# 2. Sum the bytes column per dataset. The header line is dropped first
#    (sed 1d): without that the expression handed to bc starts with the word
#    "bytes", which GNU bc only tolerates because it reads it as an unset
#    variable equal to 0.
parallel 'zcat stats/{}_source_4_images.csv.gz|sed 1d|cut -d, -f1|paste -sd+ - | bc > stats/{}_source_4_images_size.csv' ::: ${pilot_datasets}

# 3. Turn each one-number file into a "dataset,source,bytes" row ...
parallel 'sed -i "1s/^/{},source_4,/" stats/{}_source_4_images_size.csv' ::: ${pilot_datasets}

# 4. ... and give every per-dataset file its own header so csvstack can merge them.
parallel 'sed -i "1s/^/dataset,source,bytes\n/" stats/{}_source_4_images_size.csv' ::: ${pilot_datasets}

# 5. Stack the per-dataset files into one CSV (csvstack keeps a single header).
csvstack $(parallel "echo stats/{}_source_4_images_size.csv" ::: ${pilot_datasets}) > stats/cpg0000-00001-0002_source_4_images_size.csv

cat stats/cpg0000-00001-0002_source_4_images_size.csv

# dataset,source,bytes
# cpg0000-jump-pilot,source_4,7481514441972
# cpg0001-cellpainting-protocol,source_4,20613341335336
# cpg0002-jump-scope,source_4,13724192889780

# Grand total of image bytes over the pilot datasets: skip the header row,
# keep the third column (bytes), join the values with "+" and hand the sum to bc.
tail -n +2 stats/cpg0000-00001-0002_source_4_images_size.csv | cut -d , -f 3 | paste -s -d + - | bc
# 41819048667088

# Per-dataset count of TIFF objects in the saved source_4 listings of the
# pilot datasets.
# Precondition: stats/<dataset>_source_4.txt.gz must already exist; the step
# that creates them is not part of this snippet.

# pilot_datasets is not assigned anywhere in this snippet. Keep a value set
# earlier in the session; otherwise fall back to the three datasets named in
# the recorded output below.
: "${pilot_datasets:=cpg0000-jump-pilot cpg0001-cellpainting-protocol cpg0002-jump-scope}"

# 1. Count listing lines that end in "tif" or "tiff" (whole key kept, no
#    "images" filter). grep -E is used because egrep is obsolescent in
#    current GNU grep.
parallel 'zcat stats/{}_source_4.txt.gz |tr -s " "|cut -d" " -f1,2,3,4|tr " " ","|tr / ,|cut -d"," -f3,5,6-|grep -E "ti[f]{1,2}"$|wc -l> stats/{}_source_4_images_tiff_count.csv' ::: ${pilot_datasets}

# 2. Turn each one-number file into a "dataset,source,tiffs" row. The source
#    is always source_4 here; writing "source_{}" would insert the dataset
#    name a second time (e.g. "source_cpg0000-jump-pilot").
parallel 'sed -i "1s/^/{},source_4,/" stats/{}_source_4_images_tiff_count.csv' ::: ${pilot_datasets}

# 3. Give every per-dataset file its own header so csvstack can merge them.
parallel 'sed -i "1s/^/dataset,source,tiffs\n/" stats/{}_source_4_images_tiff_count.csv' ::: ${pilot_datasets}

# 4. Stack the per-dataset files into one CSV (csvstack keeps a single header).
csvstack $(parallel "echo stats/{}_source_4_images_tiff_count.csv" ::: ${pilot_datasets}) > stats/cpg0000-00001-0002_source_4_images_tiff_count.csv

cat stats/cpg0000-00001-0002_source_4_images_tiff_count.csv

# dataset,source,tiffs
# cpg0000-jump-pilot,source_4,3029017
# cpg0001-cellpainting-protocol,source_4,4693691
# cpg0002-jump-scope,source_4,1238195

# Grand total of TIFF objects over the pilot datasets: skip the header row,
# keep the third column (tiffs), join the values with "+" and hand the sum to bc.
tail -n +2 stats/cpg0000-00001-0002_source_4_images_tiff_count.csv | cut -d , -f 3 | paste -s -d + - | bc

# 8960903