From 1a7fbf1e57881efa3ffc6c38e9ec9bdf49088327 Mon Sep 17 00:00:00 2001 From: tcezard Date: Wed, 4 Feb 2026 16:25:40 +0000 Subject: [PATCH] Fix multiple analyses with the same referenceGenome --- eva_sub_cli/executables/check_fasta_insdc.py | 4 ++-- tests/test_check_fasta_insdc.py | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/eva_sub_cli/executables/check_fasta_insdc.py b/eva_sub_cli/executables/check_fasta_insdc.py index d4f98727..82dd5711 100644 --- a/eva_sub_cli/executables/check_fasta_insdc.py +++ b/eva_sub_cli/executables/check_fasta_insdc.py @@ -132,11 +132,11 @@ def get_analyses_and_reference_genome_from_metadata(vcf_files_for_fasta, json_fi else: all_analyses.add(analysis_aliases[0]) # Get (single) assembly associated with these analyses - assemblies = [metadata.get_reference_assembly_for_analysis(analysis) for analysis in all_analyses] + assemblies = {metadata.get_reference_assembly_for_analysis(analysis) for analysis in all_analyses} if len(assemblies) != 1: logger.error(f'Could not determine assembly accession to check against fasta file, out of: {assemblies}') return all_analyses, None - return all_analyses, assemblies[0] + return all_analyses, assemblies.pop() def check_assembly_in_metadata(assembly_in_metadata): diff --git a/tests/test_check_fasta_insdc.py b/tests/test_check_fasta_insdc.py index 4fbe9b1a..1f72cffa 100644 --- a/tests/test_check_fasta_insdc.py +++ b/tests/test_check_fasta_insdc.py @@ -23,6 +23,22 @@ def test_get_analysis_and_reference_genome_from_metadata(self): assert analyses == {'VD1'} assert reference == 'GCA_000001405.27' + def test_get_analysis_and_reference_genome_from_metadata_multiple_analyses(self): + """Test that multiple analyses with the same referenceGenome return that single assembly.""" + working_dir = os.path.join(self.resource_dir, 'sample_checker') + metadata_json = os.path.join(working_dir, 'metadata.json') + vcf_file1 = os.path.join(working_dir, 'example1.vcf.gz') # VD1 + vcf_file2 = os.path.join(working_dir, 'example2.vcf') # VD2 + vcf_file3 = os.path.join(working_dir, 'example3.vcf') # VD3 + os.chdir(working_dir) + + # Multiple VCF files from different analyses, all with same referenceGenome + analyses, reference = get_analyses_and_reference_genome_from_metadata( + [vcf_file1, vcf_file2, vcf_file3], metadata_json + ) + assert analyses == {'VD1', 'VD2', 'VD3'} + assert reference == 'GCA_000001405.27' + def test_get_analysis_and_reference_genome_from_metadata_absolute_paths(self): working_dir = os.path.join(self.resource_dir, 'sample_checker') metadata_json = os.path.join(working_dir, 'metadata.json')