ebi-ait / morphic-website

A generic data catalogue web app to serve the AIT team projects.
0 stars 0 forks source link

generate test data set #9

Closed amnonkhen closed 9 months ago

amnonkhen commented 9 months ago

Generate a synthetic test data set that can be used to check how the system behaves when the data is at a larger scale.

amnonkhen commented 9 months ago

I used https://json-generator.com/ with the following template:

[
  // json-generator.com template. The repeat tag controls how many copies of the
  // record object that follows are emitted (min and max are both 1000 here).
  '{{repeat(1000,1000)}}',
  {
    // --- Study identity -----------------------------------------------------
    id: '5{{index()}}',
    dpc: '{{random("MSKCC", "JAX", "UCSF", "NW")}}',
    // Reads the `dpc` value of this same record via `this`.
    // NOTE(review): presumably fields are generated in key order, so keep `dpc` above this one.
    short_study_label: function (tags) {
      return this.dpc + '_KO_' + tags.integer(1000, 9999);
    },
    upload_status: 'Data in, metadata in progress',
    study_title: '{{lorem(1, "sentences")}}',

    // --- Experimental design --------------------------------------------------
    readout_assay: '{{random("Bulk RNA-seq", "scRNA-seq", "Perturb-seq","multiplexed scRNA-seq", "ATAC-Seq", "WGS")}}',
    perturbation_type: '{{random("CRISPR-Cas9 KO", "CRISPR-dCas9 KD","CRISPRi (dCas9-KRAB)","Auxin-inducible degron (AID)")}}',
    model_system: '{{lorem(1, "sentences")}}',
    pooled_perturbation: '{{random("Yes", "No")}}',
    longitudinal_study: '{{random("Yes", "No")}}',

    // --- Release and sharing --------------------------------------------------
    duo_code_for_data_sharing_restriction: 'already public',
    number_of_datasets: null,
    expected_release: 'Apr 2022',
    available_datasets: 'GSE{{integer(100000, 999999)}}',
    publication: '{{lorem(3, "paragraphs")}}',

    // --- Contacts -------------------------------------------------------------
    data_upload_contact_name: '{{firstName()}} {{surname()}}',
    data_upload_contact_email_address: '{{email()}}',
    contact: '{{firstName()}} {{surname()}}',

    // --- Donor, protocols and free text (several are intentionally left blank) ---
    donor_ancestry: '',
    gender: '{{gender()}}',
    protocols_io_link_cell_culture: '',
    protocols_io_link_for_differentiation_and_maintenance: '',
    general_comments: '{{lorem(1, "paragraphs")}}',
    sharing_mechanism_with_DRACC: '',
    comments: '{{lorem(1, "paragraphs")}}',

    // Comma-separated list of distinct cell lines: draws tags.integer(1, 10)
    // entries from the list (repeats possible), keeps each distinct value once
    // in first-drawn order, then joins them with ','.
    cell_line: function (tags) {
      const cellLines = ['H1', 'HUES8', 'KOLF2', 'KOLF2.2J', 'hiPSC'];
      const drawCount = tags.integer(1, 10);
      const distinct = new Set();
      for (let draw = 0; draw < drawCount; draw += 1) {
        distinct.add(cellLines[tags.integer(0, cellLines.length - 1)]);
      }
      return Array.from(distinct).join(',');
    },

    // Comma-separated list of distinct target genes / libraries, built the same
    // way: tags.integer(1, 10) draws, de-duplicated in first-drawn order.
    target_genes: function (tags) {
      const targets = [
        'HHEX',
        'genome-scale Brunello library',
        'MSK MorPhiC genes',
        'genome-scale Brunello and GeCKO (v2) libraries',
        'GHRL1',
        'POU2F3',
        'EPAS1',
        'FOSB',
        'GCM1',
        'PPARG',
        'ISL1'
      ];
      const drawCount = tags.integer(1, 10);
      const distinct = new Set();
      for (let draw = 0; draw < drawCount; draw += 1) {
        distinct.add(targets[tags.integer(0, targets.length - 1)]);
      }
      return Array.from(distinct).join(',');
    }
  }
]
I created a file called [src/assets/test-data.json](https://codesandbox.io/p/sandbox/ag-grid-packages-forked-vvf84k?file=%2Fassets%2Ftest-data.json) containing 5k records of random synthetic data.