Closed shntnu closed 7 months ago
# ---------------------------------------------------------------------------
# cpg0016-jump: bytes stored under images/ and number of TIFF files, per source.
#
# Tools used: GNU parallel, AWS CLI, GNU sed (in-place edit and "\n" in the
# replacement text), coreutils and bc. Every output is written under stats/.
# NOTE(review): stats/ is assumed to exist already; nothing here creates it.
# ---------------------------------------------------------------------------

# Source numbers to process. Left unquoted at each use on purpose so the
# shell splits it into one GNU parallel argument per source.
prod_sources="1 2 3 4 5 6 7 8 9 10 11 13"

# 1. Recursive object listing for each source prefix, stored gzipped.
#    The pipe and redirection are inside the quotes so they are part of the
#    command GNU parallel runs per job rather than applying to parallel
#    itself. {1} and {} both expand to the source number.
parallel aws s3 ls --recursive "s3://cellpainting-gallery/cpg0016-jump/source_{1}/ |gzip > stats/cpg0016-jump_source_{}.txt.gz" ::: ${prod_sources}

# 2. Reduce each listing to "bytes,Metadata_Source" rows:
#    - squeeze repeated spaces and keep the first four space-separated fields;
#    - turn spaces and "/" into commas;
#    - keep comma fields 3, 5 and 6, then only rows whose last kept field
#      ends in "images";
#    - drop that field and prepend the CSV header.
parallel 'zcat stats/cpg0016-jump_source_{}.txt.gz |tr -s " "|cut -d" " -f1,2,3,4|tr " " ","|tr / ,|cut -d"," -f3,5,6|grep images$|cut -d, -f1,2 |sed "1s/^/bytes,Metadata_Source\n/"|gzip> stats/cpg0016-jump_source_{}_images.csv.gz' ::: ${prod_sources}

# 3. Sum the bytes column per source. `sed 1d` skips the header row added in
#    step 2 so that only numbers reach bc (previously the word "bytes" was
#    part of the expression and only worked because bc reads an unset
#    variable as 0).
parallel 'zcat stats/cpg0016-jump_source_{}_images.csv.gz|sed 1d|cut -d, -f1|paste -sd+ - | bc > stats/cpg0016-jump_source_{}_images_size.csv' ::: ${prod_sources}

# 4. Prefix each single-line total with its dataset and source labels.
parallel 'sed -i "1s/^/cpg0016-jump,source_{},/" stats/cpg0016-jump_source_{}_images_size.csv' ::: ${prod_sources}

# 5. Combine the per-source files (sorted as text, so source_10 follows
#    source_1) and add the header. The glob needs a "_<n>_" segment, so it
#    does not match the combined output file itself.
cat stats/cpg0016-jump_source_*_images_size.csv | sort > stats/cpg0016-jump_source_images_size.csv
sed -i "1s/^/dataset,source,bytes\n/" stats/cpg0016-jump_source_images_size.csv

cat stats/cpg0016-jump_source_images_size.csv
# dataset,source,bytes
# cpg0016-jump,source_1,5371241900768
# cpg0016-jump,source_10,8218380973898
# cpg0016-jump,source_11,8408864230727
# cpg0016-jump,source_13,7389117766045
# cpg0016-jump,source_2,8400817571909
# cpg0016-jump,source_3,18197060600282
# cpg0016-jump,source_4,19401189466298
# cpg0016-jump,source_5,14371076831150
# cpg0016-jump,source_6,12843800911238
# cpg0016-jump,source_7,6126096793600
# cpg0016-jump,source_8,7931651003318
# cpg0016-jump,source_9,10096282694514

# 6. Grand total in bytes. The numbers are in column 3 (dataset,source,bytes);
#    column 2 holds the source label and would sum to 0 in bc.
cat stats/cpg0016-jump_source_images_size.csv | sed 1d | cut -d, -f3 | paste -sd+ - | bc
# 126755580743747

# 7. Count listing rows that end in "tif" or "tiff", per source. Field 6- keeps
#    the rest of the object path so the end-of-line anchor sees the file name.
#    `grep -E` replaces the obsolescent `egrep`; the pattern is unchanged.
parallel 'zcat stats/cpg0016-jump_source_{}.txt.gz |tr -s " "|cut -d" " -f1,2,3,4|tr " " ","|tr / ,|cut -d"," -f3,5,6-|grep -E "ti[f]{1,2}"$|wc -l> stats/cpg0016-jump_source_{}_images_tiff_count.csv' ::: ${prod_sources}

# 8. Label, combine and add the header, exactly as for the byte totals.
parallel 'sed -i "1s/^/cpg0016-jump,source_{},/" stats/cpg0016-jump_source_{}_images_tiff_count.csv' ::: ${prod_sources}
cat stats/cpg0016-jump_source_*_images_tiff_count.csv | sort > stats/cpg0016-jump_source_images_tiff_count.csv
sed -i "1s/^/dataset,source,tiffs\n/" stats/cpg0016-jump_source_images_tiff_count.csv

cat stats/cpg0016-jump_source_images_tiff_count.csv
# dataset,source,tiffs
# cpg0016-jump,source_1,1943040
# cpg0016-jump,source_10,4093834
# cpg0016-jump,source_11,3504002
# cpg0016-jump,source_13,2662635
# cpg0016-jump,source_2,4217490
# cpg0016-jump,source_3,7188480
# cpg0016-jump,source_4,7658494
# cpg0016-jump,source_5,7191860
# cpg0016-jump,source_6,6804166
# cpg0016-jump,source_7,2211830
# cpg0016-jump,source_8,7473900
# cpg0016-jump,source_9,3317760

# 9. Grand total of TIFF files (column 3, as in step 6).
cat stats/cpg0016-jump_source_images_tiff_count.csv | sed 1d | cut -d, -f3 | paste -sd+ - | bc
# 58267491
# ---------------------------------------------------------------------------
# Pilot datasets, source_4: bytes stored under images/ and number of TIFF files.
#
# NOTE(review): ${pilot_datasets} and the stats/<dataset>_source_4.txt.gz
# listings are not created in this section; both are assumed to come from
# earlier commands. The recorded output below suggests the list is
# "cpg0000-jump-pilot cpg0001-cellpainting-protocol cpg0002-jump-scope";
# confirm before running.
# Tools used: GNU parallel, GNU sed, coreutils, bc and csvkit's csvstack.
# ---------------------------------------------------------------------------

# 1. Reduce each listing to "bytes,Metadata_Source" rows: keep the first four
#    space-separated fields, turn spaces and "/" into commas, keep comma
#    fields 3, 5 and 6, retain rows whose last kept field ends in "images",
#    drop that field and prepend the CSV header. {} is the dataset name.
parallel 'zcat stats/{}_source_4.txt.gz |tr -s " "|cut -d" " -f1,2,3,4|tr " " ","|tr / ,|cut -d"," -f3,5,6|grep images$|cut -d, -f1,2 |sed "1s/^/bytes,Metadata_Source\n/"|gzip> stats/{}_source_4_images.csv.gz' ::: ${pilot_datasets}

# 2. Sum the bytes column per dataset. `sed 1d` skips the header row added in
#    step 1 so that only numbers reach bc.
parallel 'zcat stats/{}_source_4_images.csv.gz|sed 1d|cut -d, -f1|paste -sd+ - | bc > stats/{}_source_4_images_size.csv' ::: ${pilot_datasets}

# 3. Prefix each total with its dataset and source labels, then give every
#    per-dataset file its own header so csvstack can merge them.
parallel 'sed -i "1s/^/{},source_4,/" stats/{}_source_4_images_size.csv' ::: ${pilot_datasets}
parallel 'sed -i "1s/^/dataset,source,bytes\n/" stats/{}_source_4_images_size.csv' ::: ${pilot_datasets}

# 4. Stack the per-dataset CSVs into one file.
csvstack $(parallel "echo stats/{}_source_4_images_size.csv" ::: ${pilot_datasets}) > stats/cpg0000-00001-0002_source_4_images_size.csv

cat stats/cpg0000-00001-0002_source_4_images_size.csv
# dataset,source,bytes
# cpg0000-jump-pilot,source_4,7481514441972
# cpg0001-cellpainting-protocol,source_4,20613341335336
# cpg0002-jump-scope,source_4,13724192889780

# 5. Grand total in bytes (column 3).
cat stats/cpg0000-00001-0002_source_4_images_size.csv|sed 1d|cut -d, -f3| paste -sd+ - | bc
# 41819048667088

# 6. Count listing rows that end in "tif" or "tiff", per dataset. Field 6-
#    keeps the rest of the object path so the end-of-line anchor sees the
#    file name. `grep -E` replaces the obsolescent `egrep`.
parallel 'zcat stats/{}_source_4.txt.gz |tr -s " "|cut -d" " -f1,2,3,4|tr " " ","|tr / ,|cut -d"," -f3,5,6-|grep -E "ti[f]{1,2}"$|wc -l> stats/{}_source_4_images_tiff_count.csv' ::: ${pilot_datasets}

# 7. Label each count. The source label is the literal "source_4", matching
#    the byte totals above; "source_{}" would expand to the dataset name and
#    produce rows such as "cpg0000-jump-pilot,source_cpg0000-jump-pilot,...".
parallel 'sed -i "1s/^/{},source_4,/" stats/{}_source_4_images_tiff_count.csv' ::: ${pilot_datasets}
parallel 'sed -i "1s/^/dataset,source,tiffs\n/" stats/{}_source_4_images_tiff_count.csv' ::: ${pilot_datasets}

# 8. Stack the per-dataset counts into one file.
csvstack $(parallel "echo stats/{}_source_4_images_tiff_count.csv" ::: ${pilot_datasets}) > stats/cpg0000-00001-0002_source_4_images_tiff_count.csv

cat stats/cpg0000-00001-0002_source_4_images_tiff_count.csv
# Counts as originally recorded; the source column is shown as the corrected
# prefix in step 7 writes it.
# dataset,source,tiffs
# cpg0000-jump-pilot,source_4,3029017
# cpg0001-cellpainting-protocol,source_4,4693691
# cpg0002-jump-scope,source_4,1238195

# 9. Grand total of TIFF files (column 3).
cat stats/cpg0000-00001-0002_source_4_images_tiff_count.csv|sed 1d|cut -d, -f3| paste -sd+ - | bc
# 8960903