EBIvariation · apriltuesday · Aug 4, 2025 · Aug 5, 2025 · Aug 6, 2025 · Aug 6, 2025
diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -1,13 +1,16 @@
 FROM python:3.10
 
 ENV vcf_validator_version=0.10.2
-ENV NXF_VER=22.10.6
+ENV NXF_VER=23.10.0
 
 WORKDIR /opt
 
 # Install JAVA and Node
 RUN apt update && apt install -y default-jdk nodejs npm
 
+# Install bcftools.
+# The package index is refreshed in this same layer: the earlier `apt update` sits in its own
+# cacheable layer, so installing against it on a rebuild can hit stale package lists.
+# apt-get is used because `apt` does not offer a stable CLI for scripted use.
+RUN apt-get update && apt-get install -y --no-install-recommends bcftools
+
 # Install VCF validator
 RUN curl -LJo /usr/local/bin/vcf_validator  https://github.com/EBIvariation/vcf-validator/releases/download/v${vcf_validator_version}/vcf_validator_linux \
     && curl -LJo /usr/local/bin/vcf_assembly_checker  https://github.com/EBIvariation/vcf-validator/releases/download/v${vcf_validator_version}/vcf_assembly_checker_linux \

diff --git a/docs/installation.md b/docs/installation.md
@@ -40,10 +40,15 @@ To upgrade to the newest version, run `pip install --upgrade eva-sub-cli`.
 
 ## 3. From source natively
 
+Advanced users who want to manage their dependencies in a more granular way can install from source natively.
 This installation method requires the following:
 * Python 3.8+
 * [Nextflow](https://www.nextflow.io/docs/latest/getstarted.html) 21.10+
 * [biovalidator](https://github.com/elixir-europe/biovalidator) 2.1.0+
 * [vcf-validator](https://github.com/EBIvariation/vcf-validator) 0.9.7+
+* [bcftools](https://www.htslib.org/download/) 1.14+
 
-Install each of these and ensure they are included in your PATH. Then install the latest eva-sub-cli release as previously described.
+Install each of these and ensure they are included in your PATH. Then install the latest eva-sub-cli release from [PyPI](https://pypi.org/project/eva-sub-cli/):
+```bash
+pip install eva-sub-cli
+```
diff --git a/eva_sub_cli/jinja_templates/html/file_validation.html b/eva_sub_cli/jinja_templates/html/file_validation.html
@@ -6,6 +6,8 @@
             {{ assembly_check(result) }}
         {% elif check_type == "vcf_check" %}
             {{ vcf_check(result) }}
+        {% elif check_type == "norm_check" %}
+            {{ norm_check(result) }}
         {% endif %}
     {% endfor %}
 {%- endmacro %}
@@ -90,4 +92,33 @@
     {% endif %}
 {%- endmacro %}
 
+{#
+    norm_check(norm_check_result): render the normalisation check section for one VCF file.
+
+    Reads these keys from norm_check_result with .get():
+      nb_error (default 0), nb_total, nb_split, nb_realigned, nb_skipped (each default 0),
+      error_list (no default) and report_path (default '').
+
+    Output:
+      - one summary row; when nb_error is greater than 0 the row uses the
+        "report-section fail collapsible" classes and shows an expand icon,
+        otherwise it uses "report-section pass" and the expand icon is empty;
+      - when error_list is non-empty, a table listing at most the first 10 entries,
+        preceded by the path of the full report.
+#}
+{% macro norm_check(norm_check_result) %}
+    {% set error_count = norm_check_result.get("nb_error", 0) %}
+    {% set expand_icon = "" %}
+    {% if error_count > 0 %}
+        {% set expand_icon = "&#9654;" %}
+        {% set icon = "&#10060;" %}
+        {% set row_class = "report-section fail collapsible" %}
+    {% else %}
+        {% set icon = "&#10004;" %}
+        {% set row_class = "report-section pass" %}
+    {% endif %}
+    <div class='{{ row_class }}'><span class="expand_icon">{{ expand_icon }}</span> {{ icon }} Normalisation check: {{ error_count }} errors ({{ norm_check_result.get("nb_total", 0) }} total lines: {{ norm_check_result.get("nb_split", 0) }} split, {{ norm_check_result.get("nb_realigned", 0) }} realigned, {{ norm_check_result.get("nb_skipped", 0) }} skipped)</div>
+    {% set error_list = norm_check_result.get("error_list") %}
 
+    {% if error_list%}
+        <div class="error-list">
+            <div class="error-description">First 10 errors are below. <strong>Full report:</strong> {{ norm_check_result.get('report_path', '') }}</div>
+            <table>
+                <tr>
+                    <th>Category</th><th>Error</th>
+                </tr>
+                {% for error in error_list[:10] %}
+                    <tr>
+                        <td><strong>Error</strong></td><td> {{ error }}</td>
+                    </tr>
+                {% endfor %}
+            </table>
+        </div>
+    {% endif %}
+{%- endmacro %}
diff --git a/eva_sub_cli/jinja_templates/html/report.html b/eva_sub_cli/jinja_templates/html/report.html
@@ -70,7 +70,8 @@ <h2>Metadata validation results</h2>
     <h2>VCF validation results</h2>
     <div class="description">
         Checks whether each file is compliant with the <a href="http://samtools.github.io/hts-specs/VCFv4.4.pdf" target=”_blank”>VCF specification</a>.
-        Also checks whether the variants' reference alleles match against the reference assembly.
+        Also checks whether the variants' reference alleles match against the reference assembly,
+        and whether the file can be normalised (if necessary) using <a href="https://samtools.github.io/bcftools/bcftools.html#norm" target="_blank">bcftools norm</a>.
     </div>
     {% for file_name in vcf_files %}
         {% if file_name != "pass"%}

diff --git a/eva_sub_cli/jinja_templates/text/file_validation.txt b/eva_sub_cli/jinja_templates/text/file_validation.txt
@@ -6,6 +6,8 @@
             {{ assembly_check(result) }}
         {% elif check_type == "vcf_check" %}
             {{ vcf_check(result) }}
+        {% elif check_type == "norm_check" %}
+            {{ norm_check(result) }}
         {% endif %}
     {% endfor %}
 {%- endmacro %}
@@ -56,3 +58,21 @@
 		{% endfor %}
 	{% endif %}
 {%- endmacro %}
+
+{#
+    norm_check(norm_check_result): plain-text rendering of the normalisation check for one VCF file.
+
+    Reads these keys from norm_check_result with .get():
+      nb_error (default 0), nb_total, nb_split, nb_realigned, nb_skipped (each default 0),
+      error_list (no default) and report_path (default '').
+
+    Prints one summary line prefixed with U+274C when nb_error is greater than 0 and with
+    U+2714 otherwise. When error_list is non-empty it also prints the full report path
+    followed by at most the first 10 entries, one "Error:" line each.
+#}
+{% macro norm_check(norm_check_result) %}
+    {% set error_count = norm_check_result.get("nb_error", 0) %}
+    {% if error_count > 0 %}
+        {% set icon = "\u274C" %}
+    {% else %}
+        {% set icon = "\u2714" %}
+    {% endif %}
+		{{ icon }} Normalisation check: {{ error_count }} errors ({{ norm_check_result.get("nb_total", 0) }} total lines: {{ norm_check_result.get("nb_split", 0) }} split, {{ norm_check_result.get("nb_realigned", 0) }} realigned, {{ norm_check_result.get("nb_skipped", 0) }} skipped)
+
+	{% set error_list = norm_check_result.get("error_list") %}
+	{% if error_list%}
+		First 10 errors are below. Full report: {{ norm_check_result.get('report_path', '') }}
+		{% for error in error_list[:10] %}
+			Error: {{ error }}
+		{% endfor %}
+	{% endif %}
+{%- endmacro %}
diff --git a/eva_sub_cli/jinja_templates/text/report.txt b/eva_sub_cli/jinja_templates/text/report.txt
@@ -29,7 +29,8 @@ For requirements, please refer to the EVA website (https://www.ebi.ac.uk/eva/?Su
 
 VCF VALIDATION RESULTS
 Checks whether each file is compliant with the VCF specification (http://samtools.github.io/hts-specs/VCFv4.4.pdf).
-Also checks whether the variants' reference alleles match against the reference assembly.
+Also checks whether the variants' reference alleles match against the reference assembly, and whether the file can be
+normalised (if necessary) using bcftools norm (https://samtools.github.io/bcftools/bcftools.html#norm).
 
 {% for file_name in vcf_files %}
 {% if file_name != "pass"%}

diff --git a/eva_sub_cli/nextflow/validation.nf b/eva_sub_cli/nextflow/validation.nf
@@ -23,7 +23,8 @@ params.metadata_xlsx = null
 params.executable = [
     "vcf_validator": "vcf_validator",
     "vcf_assembly_checker": "vcf_assembly_checker",
-    "biovalidator": "biovalidator"
+    "biovalidator": "biovalidator",
+    "bcftools": "bcftools"
 ]
 // python scripts - installed as part of eva-sub-cli
 params.python_scripts = [
@@ -83,11 +84,11 @@ workflow {
     // VCF checks
     check_vcf_valid(vcf_and_ref_ch)
     check_vcf_reference(vcf_and_ref_ch)
+    check_vcf_normalised(vcf_and_ref_ch)
 
     generate_file_size_and_md5_digests(vcf_files)
     collect_file_size_and_md5(generate_file_size_and_md5_digests.out.file_size_and_digest_info.collect())
 
-
     // Metadata conversion
     if (params.metadata_xlsx && !params.metadata_json){
         convert_xlsx_2_json(joinBasePath(params.metadata_xlsx))
@@ -177,6 +178,30 @@ process check_vcf_reference {
     """
 }
 
+/*
+ * Check that the VCF file can be normalised using bcftools.
+ *
+ * Runs `bcftools norm` on the VCF against the FASTA. The normalised records written to
+ * stdout are discarded; only what bcftools writes to stderr is kept, in
+ * norm_check/<vcf>_bcftools_norm.log. Matching log files are copied into output_dir,
+ * overwriting any existing copy.
+ */
+process check_vcf_normalised {
+	publishDir output_dir,
+            overwrite: true,
+            mode: "copy"
+
+    input:
+    // `report` is staged as part of the tuple but is not referenced by the script below
+    tuple path(vcf), path(fasta), path(report)
+
+    output:
+    // TODO should we output the normalised file?
+    path "norm_check/*.log", emit: normalisation_log
+
+	// The trailing `|| echo` turns a non-zero bcftools exit into a successful command, so the
+	// log can still be collected. The echoed exit code goes to the task's stdout, not the log.
+	// NOTE(review): `-cw` is presumably `--check-ref w` (warn on reference mismatch) - confirm
+	// against the bcftools norm manual.
+	script:
+	"""
+	mkdir norm_check
+	# Trap exit code so failures can be reported
+    $params.executable.bcftools norm --no-version -cw -f $fasta -O u $vcf 1> /dev/null 2> norm_check/${vcf}_bcftools_norm.log \
+        || echo "exit code \$?"
+    """
+}
+
 process generate_file_size_and_md5_digests {
     input:
     path(vcf_file)

diff --git a/eva_sub_cli/report.py b/eva_sub_cli/report.py
@@ -19,7 +19,7 @@ def generate_report(validation_results, validation_date, submission_dir, vcf_fas
                     consent_statement_required, subdir, template_file):
     results_for_report = {k: v for k, v in validation_results.items() if k != 'ready_for_submission_to_eva'}
     vcf_files = sorted(set([file_name
-                            for check in results_for_report if check in ["vcf_check", "assembly_check"]
+                            for check in results_for_report if check in ["vcf_check", "assembly_check", "norm_check"]
                             for file_name in results_for_report[check]
                             ]))
     fasta_files = sorted([file_name for file_name in results_for_report['fasta_check']])

diff --git a/eva_sub_cli/validators/docker_validator.py b/eva_sub_cli/validators/docker_validator.py
@@ -12,7 +12,7 @@
 logger = logging_config.get_logger(__name__)
 
 default_container_image = 'ebivariation/eva-sub-cli'
-default_container_tag = 'v0.0.6'
+default_container_tag = 'v0.0.7.dev0'
 container_validation_dir = '/opt/vcf_validation'
 container_validation_output_dir = 'vcf_validation_output'
 

diff --git a/eva_sub_cli/validators/native_validator.py b/eva_sub_cli/validators/native_validator.py
@@ -13,14 +13,15 @@ class NativeValidator(Validator):
     def __init__(self, mapping_file, submission_dir, project_title, metadata_json=None, metadata_xlsx=None,
                  shallow_validation=False, vcf_validator_path='vcf_validator',
                  assembly_checker_path='vcf_assembly_checker', biovalidator_path='biovalidator',
-                 submission_config=None, nextflow_config=None):
+                 bcftools_path='bcftools', submission_config=None, nextflow_config=None):
         super().__init__(mapping_file, submission_dir, project_title, metadata_json=metadata_json,
                          metadata_xlsx=metadata_xlsx, shallow_validation=shallow_validation,
                          submission_config=submission_config)
         self.nextflow_config = nextflow_config
         self.vcf_validator_path = vcf_validator_path
         self.assembly_checker_path = assembly_checker_path
         self.biovalidator_path = biovalidator_path
+        self.bcftools_path = bcftools_path
 
     @staticmethod
     def _validation_file_path_for(file_path):
@@ -57,13 +58,15 @@ def get_validation_cmd(self):
             f" --executable.vcf_validator {self.vcf_validator_path}",
             f" --executable.vcf_assembly_checker {self.assembly_checker_path}",
             f" --executable.biovalidator {self.biovalidator_path}",
+            f" --executable.bcftools {self.bcftools_path}",
             f" -c {self.nextflow_config} " if self.nextflow_config else ""
         ])
 
     def verify_executables_installed(self):
         for name, path in [('vcf-validator', self.vcf_validator_path),
                            ('vcf-assembly-checker', self.assembly_checker_path),
-                           ('biovalidator', self.biovalidator_path)]:
+                           ('biovalidator', self.biovalidator_path),
+                           ('bcftools', self.bcftools_path)]:
             try:
                 self._run_quiet_command(
                     f"Check {name} is installed and available on the path",

diff --git a/eva_sub_cli/validators/validation_results_parsers.py b/eva_sub_cli/validators/validation_results_parsers.py
@@ -113,6 +113,19 @@ def vcf_check_errors_is_critical(error):
     return True
 
 
+def parse_bcftools_norm_report(norm_report):
+    """
+    Read a bcftools norm log and extract the reported problems and the summary counts.
+
+    The line starting with 'Lines   total/split/realigned/skipped:' supplies the four counts
+    (its last whitespace-separated token is split on '/'). Every other non-blank line is kept,
+    stripped of surrounding whitespace, as one error message. Blank lines are ignored so that
+    they are not counted as errors.
+
+    :param norm_report: path to the log file captured from bcftools norm
+    :return: tuple (error_list, total, split, realigned, skipped); the counts are ints and stay 0
+             when the log contains no summary line
+    :raises OSError: if the log file cannot be opened
+    :raises ValueError: if the summary line does not hold exactly four integer counts
+    """
+    summary_prefix = 'Lines   total/split/realigned/skipped:'
+    total = split = realigned = skipped = 0
+    error_list = []
+    with open(norm_report) as open_file:
+        for line in open_file:
+            if line.startswith(summary_prefix):
+                # Lines   total/split/realigned/skipped:  2/0/1/0
+                total, split, realigned, skipped = line.strip().split()[-1].split('/')
+                continue
+            message = line.strip()
+            # An empty line carries no diagnostic; recording it would inflate the error count
+            if message:
+                error_list.append(message)
+    return error_list, int(total), int(split), int(realigned), int(skipped)
+
+
 def parse_biovalidator_validation_results(metadata_check_file):
     """
     Read the biovalidator's report and extract the list of validation errors

diff --git a/eva_sub_cli/validators/validator.py b/eva_sub_cli/validators/validator.py
@@ -16,7 +16,7 @@
 from eva_sub_cli.report import generate_html_report, generate_text_report
 from eva_sub_cli.validators.validation_results_parsers import parse_assembly_check_log, parse_assembly_check_report, \
     parse_biovalidator_validation_results, convert_metadata_sheet, convert_metadata_row, convert_metadata_attribute, \
-    parse_vcf_check_report, parse_metadata_property
+    parse_vcf_check_report, parse_metadata_property, parse_bcftools_norm_report
 
 VALIDATION_OUTPUT_DIR = "validation_output"
 VALIDATION_RESULTS_KEY = 'validation_results'
@@ -153,7 +153,7 @@ def verify_ready_for_submission_to_eva(self):
         """ Checks if all the validation are passed """
         return all((
             all((value.get('pass', False) is True for key, value in self.results.items() if
-                 key in ['vcf_check', 'assembly_check', 'fasta_check', 'sample_check', 'metadata_check', 'evidence_type_check'])),
+                 key in ['vcf_check', 'assembly_check', 'fasta_check', 'sample_check', 'norm_check', 'metadata_check', 'evidence_type_check'])),
             any((
                 self.results['shallow_validation']['requested'] is False,
                 self.results['shallow_validation'].get('required', True) is False
@@ -167,6 +167,7 @@ def _collect_validation_workflow_results(self):
             self._collect_trim_down_metrics()
         self._collect_vcf_check_results()
         self._collect_assembly_check_results()
+        self._collect_norm_check_results()
         self._load_sample_check_results()
         self._load_evidence_check_results()
         self._load_fasta_check_results()
@@ -189,6 +190,11 @@ def _assess_validation_results(self):
                                    for vcf_name, asm_check in self.results.get('assembly_check', {}).items()))
         self.results['assembly_check']['pass'] = asm_nb_mismatch_result and asm_nb_error_result
 
+        # norm_check result
+        norm_check_result = all((norm_check.get('nb_error', 1) == 0
+                                 for vcf_name, norm_check in self.results.get('norm_check', {}).items()))
+        self.results['norm_check']['pass'] = norm_check_result
+
         # fasta_check result
         fasta_check_result = all((fa_file_check.get('all_insdc', False) is True
                                   for fa_file, fa_file_check in self.results.get('fasta_check', {}).items()))
@@ -249,6 +255,10 @@ def _assembly_check_text_report(self, vcf_name):
             os.path.join(self.output_dir, 'assembly_check', vcf_name + '*text_assembly_report*')
         )
 
+    @lru_cache
+    def _normalisation_log(self, vcf_name):
+        """
+        Locate the bcftools norm log written for the given VCF file name.
+
+        The log is looked up as <output_dir>/norm_check/<vcf_name>_bcftools_norm.log and the
+        lookup is delegated to resolve_single_file_path. Results are memoised by lru_cache.
+
+        :param vcf_name: base name of the VCF file the log belongs to
+        :return: whatever resolve_single_file_path returns for that location
+        """
+        log_file_name = vcf_name + '_bcftools_norm.log'
+        log_location = os.path.join(self.output_dir, 'norm_check', log_file_name)
+        return resolve_single_file_path(log_location)
+
     @cached_property
     def _sample_check_yaml(self):
         return resolve_single_file_path(os.path.join(self.output_dir, 'other_validations', 'sample_checker.yml'))
@@ -310,6 +320,25 @@ def _collect_assembly_check_results(self):
                 'total': total
             }
 
+    def _collect_norm_check_results(self):
+        """
+        Populate self.results['norm_check'] with one entry per VCF file.
+
+        For each file in self.vcf_files the matching bcftools norm log is parsed. When no log
+        is found, the entry records a single 'Process failed' error with all counts set to 0.
+        Each entry is keyed by the VCF base name and holds the report path, the error list,
+        the number of errors and the total/split/realigned/skipped counts.
+        """
+        self.results['norm_check'] = {}
+        for vcf_path in self.vcf_files:
+            file_name = os.path.basename(vcf_path)
+            log_path = self._normalisation_log(file_name)
+            if not log_path:
+                # No log could be resolved for this file: report the check itself as failed
+                parsed = (['Process failed'], 0, 0, 0, 0)
+            else:
+                parsed = parse_bcftools_norm_report(log_path)
+            errors, total_count, split_count, realigned_count, skipped_count = parsed
+            self.results['norm_check'][file_name] = {
+                'report_path': log_path,
+                'error_list': errors,
+                'nb_error': len(errors),
+                'nb_total': total_count,
+                'nb_split': split_count,
+                'nb_realigned': realigned_count,
+                'nb_skipped': skipped_count
+            }
+
     def _load_fasta_check_results(self):
         for fasta_file in self.fasta_files:
             fasta_file_name = os.path.basename(fasta_file)
@@ -338,8 +367,6 @@ def _load_evidence_check_results(self):
 
         self._update_metadata_with_evidence_type()
 
-
-
     def _collect_metadata_results(self):
         self.results['metadata_check'] = {}
         self._load_spreadsheet_conversion_errors()

diff --git a/tests/build_and_test_docker_locally.py b/tests/build_and_test_docker_locally.py
@@ -233,13 +233,12 @@ def get_docker_validation_cmd(self):
 
     def assert_validation_results(self, validator, expected_sample_checker, expected_metadata_files_json,
                                   expected_metadata_val, expected_semantic_val, expected_evidence_type_val):
+        # Assert VCF format check
         vcf_format_dir = os.path.join(validator.output_dir, 'vcf_format')
         self.assertTrue(os.path.exists(vcf_format_dir))
-
-        vcf_format_log_file = os.path.join(vcf_format_dir, 'input_passed.vcf.vcf_format.log')
-        self.assertTrue(os.path.exists(vcf_format_log_file))
-
-        with open(vcf_format_log_file) as vcf_format_log_file:
+        vcf_format_log_path = os.path.join(vcf_format_dir, 'input_passed.vcf.vcf_format.log')
+        self.assertTrue(os.path.exists(vcf_format_log_path))
+        with open(vcf_format_log_path) as vcf_format_log_file:
             vcf_format_logs = vcf_format_log_file.readlines()
             self.assertEqual('[info] According to the VCF specification, the input file is valid\n',
                              vcf_format_logs[2])
@@ -249,18 +248,24 @@ def assert_validation_results(self, validator, expected_sample_checker, expected
                 self.assertEqual('According to the VCF specification, the input file is valid\n',
                                  text_report_content[0])
 
-        # assert assembly report
+        # Assert assembly report check
         assembly_check_dir = os.path.join(validator.output_dir, 'assembly_check')
         self.assertTrue(os.path.exists(assembly_check_dir))
-
-        assembly_check_log_file = os.path.join(assembly_check_dir, 'input_passed.vcf.assembly_check.log')
-        self.assertTrue(os.path.exists(assembly_check_log_file))
-
-        with open(assembly_check_log_file) as assembly_check_log_file:
+        assembly_check_log_path = os.path.join(assembly_check_dir, 'input_passed.vcf.assembly_check.log')
+        self.assertTrue(os.path.exists(assembly_check_log_path))
+        with open(assembly_check_log_path) as assembly_check_log_file:
             assembly_check_logs = assembly_check_log_file.readlines()
             self.assertEqual('[info] Number of matches: 247/247\n', assembly_check_logs[4])
             self.assertEqual('[info] Percentage of matches: 100%\n', assembly_check_logs[5])
 
+        # Assert normalisation check
+        norm_check_dir = os.path.join(validator.output_dir, 'norm_check')
+        self.assertTrue(os.path.exists(norm_check_dir))
+        norm_check_log_path = os.path.join(norm_check_dir, 'input_passed.vcf_bcftools_norm.log')
+        with open(norm_check_log_path) as norm_check_log_file:
+            norm_check_logs = norm_check_log_file.readlines()
+            self.assertEqual('[E::faidx_adjust_position] The sequence "1" was not found\n', norm_check_logs[0])
+
         # Assert Samples concordance
         self.assert_yaml_file(validator._sample_check_yaml, expected_sample_checker)
 

diff --git a/tests/resources/norm_check/invalid.vcf_bcftools_norm.log b/tests/resources/norm_check/invalid.vcf_bcftools_norm.log
@@ -0,0 +1,5 @@
+NON_ACGTN_ALT	chr1	49338976	]chr1:49277505]T
+NON_ACGTN_ALT	chr1	49997014	TAT[chr1:50014208[
+NON_ACGTN_ALT	chr1	50014208	]chr1:49997014]ATT
+NON_ACGTN_ALT	chr1	191611692	[chr8:41723769[A
+Lines   total/split/realigned/skipped:	152/0/0/0