Generate new test data for salmon tools

Context

Currently we fetch test data for salmontools from an external repo.

https://github.com/AlexsLemonade/refinebio/blob/c4b4a6ad9392b911eec754c81b50a0b05b5f4d41/workers/run_tests.sh#L82

We essentially just run salmontools on this test data and then compare each output file against an existing expected output file, asserting that their checksums match.

Below are the tests in their current implementation.

double read

https://github.com/AlexsLemonade/refinebio/blob/dev/workers/data_refinery_workers/processors/test_salmon.py#L572

    def test_double_reads(self):
        """Test outputs when the sample has both left and right reads.

        Runs ``salmon._run_salmontools`` on the two FASTQ files under
        ``double_input/`` and checks that each unpacked
        ``unmapped_by_salmon_1.fa`` / ``unmapped_by_salmon_2.fa`` file matches
        its counterpart in ``expected_double_output/`` according to
        ``identical_checksum``.
        """
        # Both "input_file_path" and "input_file_path_2" are supplied here,
        # one per read file. All paths are built by string concatenation, so
        # self.test_dir is expected to end with a path separator.
        job_context = {
            "job_id": 123,
            "job": ProcessorJob(),
            "pipeline": Pipeline(name="Salmon"),
            "input_file_path": self.test_dir + "double_input/reads_1.fastq",
            "input_file_path_2": self.test_dir + "double_input/reads_2.fastq",
            "salmontools_directory": self.test_dir + "double_salmontools/",
            "salmontools_archive": self.test_dir + "salmontools-result.tar.gz",
            "output_directory": self.test_dir + "double_output/",
            "computed_files": [],
        }
        # Create the salmontools directory up front; exist_ok avoids failing
        # when it is left over from a previous run.
        os.makedirs(job_context["salmontools_directory"], exist_ok=True)

        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS", taxonomy_id=9606)

        # The sample is saved before being attached to the job context.
        sample = Sample()
        sample.organism = homo_sapiens
        sample.save()
        job_context["sample"] = sample

        salmon._run_salmontools(job_context)

        # Confirm job status
        self.assertTrue(job_context["success"])

        # Unpack result for checking
        # The "*.gz" glob is expanded by the shell that os.system spawns.
        # NOTE(review): the exit status of gunzip is not checked here.
        os.system("gunzip " + job_context["salmontools_directory"] + "*.gz")

        # Check two output files
        # identical_checksum is defined outside this snippet; presumably it
        # compares the checksums of the two files -- confirm.
        output_file1 = job_context["salmontools_directory"] + "unmapped_by_salmon_1.fa"
        expected_output_file1 = self.test_dir + "expected_double_output/unmapped_by_salmon_1.fa"
        self.assertTrue(identical_checksum(output_file1, expected_output_file1))

        output_file2 = job_context["salmontools_directory"] + "unmapped_by_salmon_2.fa"
        expected_output_file2 = self.test_dir + "expected_double_output/unmapped_by_salmon_2.fa"
        self.assertTrue(identical_checksum(output_file2, expected_output_file2))

single read

https://github.com/AlexsLemonade/refinebio/blob/dev/workers/data_refinery_workers/processors/test_salmon.py#L612

    def test_single_read(self):
        """Test outputs when the sample has one read only.

        Runs ``salmon._run_salmontools`` on ``single_input/single_read.fastq``
        and checks that the unpacked ``unmapped_by_salmon.fa`` matches the
        copy in ``expected_single_output/`` according to
        ``identical_checksum``.
        """
        # Only "input_file_path" is supplied; there is no "input_file_path_2"
        # key in this context. All paths are built by string concatenation, so
        # self.test_dir is expected to end with a path separator.
        job_context = {
            "job_id": 456,
            "job": ProcessorJob(),
            "pipeline": Pipeline(name="Salmon"),
            "input_file_path": self.test_dir + "single_input/single_read.fastq",
            "output_directory": self.test_dir + "single_output/",
            "salmontools_directory": self.test_dir + "single_salmontools/",
            "salmontools_archive": self.test_dir + "salmontools-result.tar.gz",
            "computed_files": [],
        }
        # Create the salmontools directory up front; exist_ok avoids failing
        # when it is left over from a previous run.
        os.makedirs(job_context["salmontools_directory"], exist_ok=True)

        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS", taxonomy_id=9606)

        # The sample is saved before being attached to the job context.
        sample = Sample()
        sample.organism = homo_sapiens
        sample.save()
        job_context["sample"] = sample

        salmon._run_salmontools(job_context)

        # Confirm job status
        self.assertTrue(job_context["success"])

        # Unpack result for checking
        # The "*.gz" glob is expanded by the shell that os.system spawns.
        # NOTE(review): the exit status of gunzip is not checked here.
        os.system("gunzip " + job_context["salmontools_directory"] + "*.gz")

        # Check output file
        # identical_checksum is defined outside this snippet; presumably it
        # compares the checksums of the two files -- confirm.
        output_file = job_context["salmontools_directory"] + "unmapped_by_salmon.fa"
        expected_output_file = self.test_dir + "expected_single_output/unmapped_by_salmon.fa"
        self.assertTrue(identical_checksum(output_file, expected_output_file))

Problem or idea

Since the original test data is no longer available, we need to recreate it. I believe we are considering using Polyester to simulate RNA-seq reads for this purpose.

Solution or next step

Tagging @jaclyn-taroni @jashapiro for next steps.

AlexsLemonade / refinebio

Generate new test data for salmon tools #3223

Context

double read

single read

Problem or idea

Solution or next step