icgc-argo / seq-tools

Command line tools for ARGO sequencing data validation
https://github.com/icgc-argo/seq-tools
GNU Affero General Public License v3.0
1 stars 0 forks source link

🐛 Validation does not check empty string for required fields #73

Closed hknahal closed 3 years ago

hknahal commented 3 years ago

Describe the bug

The platform_unit field is required in the sequencing_experiment schema, but if it is submitted as an empty string in the metadata JSON, the validation does not report it as an error.

Steps To Reproduce

BAM header file: the @RG line has no PU tag

@HD VN:1.5  GO:none SO:coordinate
@SQ SN:chrM LN:16569
@SQ SN:chr1 LN:249250621
@SQ SN:chr2 LN:243199373
@SQ SN:chr3 LN:198022430
@SQ SN:chr4 LN:191154276
@SQ SN:chr5 LN:180915260
@SQ SN:chr6 LN:171115067
@SQ SN:chr7 LN:159138663
@SQ SN:chr8 LN:146364022
@SQ SN:chr9 LN:141213431
@SQ SN:chr10    LN:135534747
@SQ SN:chr11    LN:135006516
@SQ SN:chr12    LN:133851895
@SQ SN:chr13    LN:115169878
@SQ SN:chr14    LN:107349540
@SQ SN:chr15    LN:102531392
@SQ SN:chr16    LN:90354753
@SQ SN:chr17    LN:81195210
@SQ SN:chr18    LN:78077248
@SQ SN:chr19    LN:59128983
@SQ SN:chr20    LN:63025520
@SQ SN:chr21    LN:48129895
@SQ SN:chr22    LN:51304566
@SQ SN:chrX LN:155270560
@SQ SN:chrY LN:59373566
@SQ SN:chr1_gl000191_random LN:106433
@SQ SN:chr1_gl000192_random LN:547496
@SQ SN:chr4_gl000193_random LN:189789
@SQ SN:chr4_gl000194_random LN:191469
@SQ SN:chr7_gl000195_random LN:182896
@SQ SN:chr8_gl000196_random LN:38914
@SQ SN:chr8_gl000197_random LN:37175
@SQ SN:chr9_gl000198_random LN:90085
@SQ SN:chr9_gl000199_random LN:169874
@SQ SN:chr9_gl000200_random LN:187035
@SQ SN:chr9_gl000201_random LN:36148
@SQ SN:chr11_gl000202_random    LN:40103
@SQ SN:chr17_gl000203_random    LN:37498
@SQ SN:chr17_gl000204_random    LN:81310
@SQ SN:chr17_gl000205_random    LN:174588
@SQ SN:chr17_gl000206_random    LN:41001
@SQ SN:chr18_gl000207_random    LN:4262
@SQ SN:chr19_gl000208_random    LN:92689
@SQ SN:chr19_gl000209_random    LN:159169
@SQ SN:chr21_gl000210_random    LN:27682
@SQ SN:chrUn_gl000211   LN:166566
@SQ SN:chrUn_gl000212   LN:186858
@SQ SN:chrUn_gl000213   LN:164239
@SQ SN:chrUn_gl000214   LN:137718
@SQ SN:chrUn_gl000215   LN:172545
@SQ SN:chrUn_gl000216   LN:172294
@SQ SN:chrUn_gl000217   LN:172149
@SQ SN:chrUn_gl000218   LN:161147
@SQ SN:chrUn_gl000219   LN:179198
@SQ SN:chrUn_gl000220   LN:161802
@SQ SN:chrUn_gl000221   LN:155397
@SQ SN:chrUn_gl000222   LN:186861
@SQ SN:chrUn_gl000223   LN:180455
@SQ SN:chrUn_gl000224   LN:179693
@SQ SN:chrUn_gl000225   LN:211173
@SQ SN:chrUn_gl000226   LN:15008
@SQ SN:chrUn_gl000227   LN:128374
@SQ SN:chrUn_gl000228   LN:129120
@SQ SN:chrUn_gl000229   LN:19913
@SQ SN:chrUn_gl000230   LN:43691
@SQ SN:chrUn_gl000231   LN:27386
@SQ SN:chrUn_gl000232   LN:40652
@SQ SN:chrUn_gl000233   LN:45941
@SQ SN:chrUn_gl000234   LN:40531
@SQ SN:chrUn_gl000235   LN:34474
@SQ SN:chrUn_gl000236   LN:41934
@SQ SN:chrUn_gl000237   LN:45867
@SQ SN:chrUn_gl000238   LN:39939
@SQ SN:chrUn_gl000239   LN:33824
@SQ SN:chrUn_gl000240   LN:41933
@SQ SN:chrUn_gl000241   LN:42152
@SQ SN:chrUn_gl000242   LN:43523
@SQ SN:chrUn_gl000243   LN:43341
@SQ SN:chrUn_gl000244   LN:39929
@SQ SN:chrUn_gl000245   LN:36651
@SQ SN:chrUn_gl000246   LN:38154
@SQ SN:chrUn_gl000247   LN:36422
@SQ SN:chrUn_gl000248   LN:39786
@SQ SN:chrUn_gl000249   LN:38502
@SQ SN:hs37d5   LN:35477943
@SQ SN:NC_007605    LN:171823
@RG ID:LP6008269-DNA_H08    PL:ILLUMINA LB:LP6008269-DNA_H08    SM:LP6008269-DNA_H08
@PG ID:bwa  PN:bwa  VN:0.7.10-r789  CL:/home/mib-cri/software/bwa//bwa-0.7.10/bwa mem -R @RG\tID:LP6008269-DNA_H08\tSM:LP6008269-DNA_H08\tLB:LP6008269-DNA_H08\tPL:ILLUMINA -p /lustre/projects/stlab-icgc/software/ref_genomes/GRCh37_g1k//bwa-0.7.5a/Hsa.GRCh37.bwa -
@PG ID:MarkDuplicates   PN:MarkDuplicates   VN:1.115(30b1e546cc4dd80c918e151dbfe46b061e63f315_1402927010)   CL:picard.sam.MarkDuplicates INPUT=[/lustre/projects/stlab-icgc/prod/batch18f_realign/alignment/LP6008269-DNA_H08/1.5/HFH7WALXX.5.lane.bam, /lustre/projects/stlab-icgc/prod/batch18f_realign/alignment/LP6008269-DNA_H08/1.5/HFH7WALXX.6.lane.bam] OUTPUT=/lustre/projects/stlab-icgc/prod/batch18f_realign/alignment/LP6008269-DNA_H08/1.5/LP6008269-DNA_H08.bam METRICS_FILE=/lustre/projects/stlab-icgc/prod/batch18f_realign/alignment/LP6008269-DNA_H08/1.5/LP6008269-DNA_H08.lane.dups.txt.pipetemp READ_NAME_REGEX=[a-zA-Z0-9_\-]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).* TMP_DIR=[/lustre/projects/stlab-icgc/prod/batch18f_realign/alignment/LP6008269-DNA_H08/1.5/temp] VALIDATION_STRINGENCY=SILENT COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=true    PROGRAM_RECORD_ID=MarkDuplicates PROGRAM_GROUP_NAME=MarkDuplicates REMOVE_DUPLICATES=false ASSUME_SORTED=false MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=50000 MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=8000 SORTING_COLLECTION_SIZE_RATIO=0.25 OPTICAL_DUPLICATE_PIXEL_DISTANCE=100 VERBOSITY=INFO QUIET=false CREATE_MD5_FILE=false

Metadata JSON file: platform_unit is an empty string:

{
    "studyId": "OCCAMS-GB",
    "analysisType": {
        "name": "sequencing_experiment"
    },
    "samples": [
        {
            "submitterSampleId": "LP6008269-DNA_H08",
            "matchedNormalSubmitterSampleId": "LP6008268-DNA_H08",
            "sampleType": "Total DNA",
            "specimen": {
                "submitterSpecimenId": "83fcf464cf46317238331e5c7da26690e645be11aee45d49c6ed91da490e313f",
                "specimenType": "Primary tumour",
                "tumourNormalDesignation": "Tumour",
                "specimenTissueSource": "Solid tissue"
            },
            "donor": {
                "submitterDonorId": "b8bfd29057a30c787f3ce32ad6fd64aeb349043a52c25d917cb28644bc0a48a0",
                "gender": "Male"
            }
        }
    ],
    "files": [
        {
            "dataType": "Submitted Reads",
            "fileName": "563f0b6e132965027e6ec742752a4757.LP6008269-DNA_H08.bam",
            "fileSize": 152209724860,
            "fileType": "BAM",
            "fileMd5sum": "563f0b6e132965027e6ec742752a4757",
            "fileAccess": "controlled",
            "info": {
                "legacyAnalysisId": "EGAR00001566506",
                "data_category": "Sequencing Reads"
            }
        }
    ],
    "read_groups": [
        {
            "file_r1": "563f0b6e132965027e6ec742752a4757.LP6008269-DNA_H08.bam",
            "file_r2": "563f0b6e132965027e6ec742752a4757.LP6008269-DNA_H08.bam",
            "insert_size": null,
            "platform_unit": "",
            "is_paired_end": true,
            "library_name": "LP6008269-DNA_H08",
            "read_length_r1": null,
            "read_length_r2": null,
            "sample_barcode": null,
            "read_group_id_in_bam": null,
            "submitter_read_group_id": "LP6008269-DNA_H08"
        }
    ],
    "experiment": {
        "submitter_sequencing_experiment_id": "EXP-927",
        "experimental_strategy": "WGS",
        "sequencing_center": "",
        "platform": "ILLUMINA",
        "platform_model": null,
        "sequencing_date": null
    },
    "read_group_count": 1
}

Snippet from report.json (other checks were correctly validated):

{
        "checker": "c140_platform_unit_uniqueness",
        "status": "VALID",
        "message": "Platform unit uniqueness check status: VALID"
      },

Expected behaviour

Validation should treat an empty string in a required field as a missing value and return INVALID.

junjun-zhang commented 3 years ago

addressed by #75