diff --git a/docker/Dockerfile b/docker/Dockerfile
index d078868..5037912 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,13 +1,16 @@
FROM python:3.10
ENV vcf_validator_version=0.10.2
-ENV NXF_VER=22.10.6
+ENV NXF_VER=23.10.0
WORKDIR /opt
# Install JAVA and Node
RUN apt update && apt install -y default-jdk nodejs npm
+# Install bcftools (refresh the package index in this same layer so a stale cached index cannot break the install)
+RUN apt-get update && apt-get install -y bcftools
+
# Install VCF validator
RUN curl -LJo /usr/local/bin/vcf_validator https://github.com/EBIvariation/vcf-validator/releases/download/v${vcf_validator_version}/vcf_validator_linux \
&& curl -LJo /usr/local/bin/vcf_assembly_checker https://github.com/EBIvariation/vcf-validator/releases/download/v${vcf_validator_version}/vcf_assembly_checker_linux \
diff --git a/docs/installation.md b/docs/installation.md
index 5525d0f..a38cdd8 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -40,10 +40,15 @@ To upgrade to the newest version, run `pip install --upgrade eva-sub-cli`.
## 3. From source natively
+Advanced users who want to manage their dependencies in a more granular way can install from source natively.
This installation method requires the following:
* Python 3.8+
* [Nextflow](https://www.nextflow.io/docs/latest/getstarted.html) 21.10+
* [biovalidator](https://github.com/elixir-europe/biovalidator) 2.1.0+
* [vcf-validator](https://github.com/EBIvariation/vcf-validator) 0.9.7+
+* [bcftools](https://www.htslib.org/download/) 1.14+
-Install each of these and ensure they are included in your PATH. Then install the latest eva-sub-cli release as previously described.
+Install each of these and ensure they are included in your PATH. Then install the latest eva-sub-cli release from [PyPI](https://pypi.org/project/eva-sub-cli/):
+```bash
+pip install eva-sub-cli
+```
\ No newline at end of file
diff --git a/eva_sub_cli/jinja_templates/html/file_validation.html b/eva_sub_cli/jinja_templates/html/file_validation.html
index dfd6c6d..6591108 100644
--- a/eva_sub_cli/jinja_templates/html/file_validation.html
+++ b/eva_sub_cli/jinja_templates/html/file_validation.html
@@ -6,6 +6,8 @@
{{ assembly_check(result) }}
{% elif check_type == "vcf_check" %}
{{ vcf_check(result) }}
+ {% elif check_type == "norm_check" %}
+ {{ norm_check(result) }}
{% endif %}
{% endfor %}
{%- endmacro %}
@@ -90,4 +92,33 @@
{% endif %}
{%- endmacro %}
+{% macro norm_check(norm_check_result) %}
+ {% set error_count = norm_check_result.get("nb_error", 0) %}
+ {% set expand_icon = "" %}
+ {% if error_count > 0 %}
+ {% set expand_icon = "▶" %}
+ {% set icon = "❌" %}
+ {% set row_class = "report-section fail collapsible" %}
+ {% else %}
+ {% set icon = "✔" %}
+ {% set row_class = "report-section pass" %}
+ {% endif %}
+
{{ expand_icon }} {{ icon }} Normalisation check: {{ error_count }} errors ({{ norm_check_result.get("nb_total", 0) }} total lines: {{ norm_check_result.get("nb_split", 0) }} split, {{ norm_check_result.get("nb_realigned", 0) }} realigned, {{ norm_check_result.get("nb_skipped", 0) }} skipped)
+ {% set error_list = norm_check_result.get("error_list") %}
+ {% if error_list%}
+
+
First 10 errors are below. Full report: {{ norm_check_result.get('report_path', '') }}
+
+
+ | Category | Error |
+
+ {% for error in error_list[:10] %}
+
+ | Error | {{ error }} |
+
+ {% endfor %}
+
+
+ {% endif %}
+{%- endmacro %}
diff --git a/eva_sub_cli/jinja_templates/html/report.html b/eva_sub_cli/jinja_templates/html/report.html
index 8630012..8173668 100644
--- a/eva_sub_cli/jinja_templates/html/report.html
+++ b/eva_sub_cli/jinja_templates/html/report.html
@@ -70,7 +70,8 @@ Metadata validation results
VCF validation results
Checks whether each file is compliant with the
VCF specification.
- Also checks whether the variants' reference alleles match against the reference assembly.
+ Also checks whether the variants' reference alleles match against the reference assembly,
+ and whether the file can be normalised (if necessary) using
bcftools norm.
{% for file_name in vcf_files %}
{% if file_name != "pass"%}
diff --git a/eva_sub_cli/jinja_templates/text/file_validation.txt b/eva_sub_cli/jinja_templates/text/file_validation.txt
index e27fa75..3fc94b8 100644
--- a/eva_sub_cli/jinja_templates/text/file_validation.txt
+++ b/eva_sub_cli/jinja_templates/text/file_validation.txt
@@ -6,6 +6,8 @@
{{ assembly_check(result) }}
{% elif check_type == "vcf_check" %}
{{ vcf_check(result) }}
+ {% elif check_type == "norm_check" %}
+ {{ norm_check(result) }}
{% endif %}
{% endfor %}
{%- endmacro %}
@@ -56,3 +58,21 @@
{% endfor %}
{% endif %}
{%- endmacro %}
+
+{% macro norm_check(norm_check_result) %}
+ {% set error_count = norm_check_result.get("nb_error", 0) %}
+ {% if error_count > 0 %}
+ {% set icon = "\u274C" %}
+ {% else %}
+ {% set icon = "\u2714" %}
+ {% endif %}
+ {{ icon }} Normalisation check: {{ error_count }} errors ({{ norm_check_result.get("nb_total", 0) }} total lines: {{ norm_check_result.get("nb_split", 0) }} split, {{ norm_check_result.get("nb_realigned", 0) }} realigned, {{ norm_check_result.get("nb_skipped", 0) }} skipped)
+
+ {% set error_list = norm_check_result.get("error_list") %}
+ {% if error_list%}
+ First 10 errors are below. Full report: {{ norm_check_result.get('report_path', '') }}
+ {% for error in error_list[:10] %}
+ Error: {{ error }}
+ {% endfor %}
+ {% endif %}
+{%- endmacro %}
diff --git a/eva_sub_cli/jinja_templates/text/report.txt b/eva_sub_cli/jinja_templates/text/report.txt
index a462d92..68bfde9 100644
--- a/eva_sub_cli/jinja_templates/text/report.txt
+++ b/eva_sub_cli/jinja_templates/text/report.txt
@@ -29,7 +29,8 @@ For requirements, please refer to the EVA website (https://www.ebi.ac.uk/eva/?Su
VCF VALIDATION RESULTS
Checks whether each file is compliant with the VCF specification (http://samtools.github.io/hts-specs/VCFv4.4.pdf).
-Also checks whether the variants' reference alleles match against the reference assembly.
+Also checks whether the variants' reference alleles match against the reference assembly, and whether the file can be
+normalised (if necessary) using bcftools norm (https://samtools.github.io/bcftools/bcftools.html#norm).
{% for file_name in vcf_files %}
{% if file_name != "pass"%}
diff --git a/eva_sub_cli/nextflow/validation.nf b/eva_sub_cli/nextflow/validation.nf
index 0fe26a8..7a409b7 100644
--- a/eva_sub_cli/nextflow/validation.nf
+++ b/eva_sub_cli/nextflow/validation.nf
@@ -23,7 +23,8 @@ params.metadata_xlsx = null
params.executable = [
"vcf_validator": "vcf_validator",
"vcf_assembly_checker": "vcf_assembly_checker",
- "biovalidator": "biovalidator"
+ "biovalidator": "biovalidator",
+ "bcftools": "bcftools"
]
// python scripts - installed as part of eva-sub-cli
params.python_scripts = [
@@ -83,11 +84,11 @@ workflow {
// VCF checks
check_vcf_valid(vcf_and_ref_ch)
check_vcf_reference(vcf_and_ref_ch)
+ check_vcf_normalised(vcf_and_ref_ch)
generate_file_size_and_md5_digests(vcf_files)
collect_file_size_and_md5(generate_file_size_and_md5_digests.out.file_size_and_digest_info.collect())
-
// Metadata conversion
if (params.metadata_xlsx && !params.metadata_json){
convert_xlsx_2_json(joinBasePath(params.metadata_xlsx))
@@ -177,6 +178,30 @@ process check_vcf_reference {
"""
}
+/*
+ * Check that the VCF file can be normalised with `bcftools norm` against its FASTA; only the stderr log is published
+ */
+process check_vcf_normalised {
+ publishDir output_dir,
+ overwrite: true,
+ mode: "copy"
+
+ input:
+ tuple path(vcf), path(fasta), path(report)
+
+ output:
+ // TODO should we output the normalised file?
+ path "norm_check/*.log", emit: normalisation_log
+
+ script:
+ """
+ mkdir norm_check
+ # Trap exit code so failures can be reported
+ $params.executable.bcftools norm --no-version -cw -f $fasta -O u $vcf 1> /dev/null 2> norm_check/${vcf}_bcftools_norm.log \
+ || echo "exit code \$?"
+ """
+}
+
process generate_file_size_and_md5_digests {
input:
path(vcf_file)
diff --git a/eva_sub_cli/report.py b/eva_sub_cli/report.py
index 88bfd6f..1c1b55c 100644
--- a/eva_sub_cli/report.py
+++ b/eva_sub_cli/report.py
@@ -19,7 +19,7 @@ def generate_report(validation_results, validation_date, submission_dir, vcf_fas
consent_statement_required, subdir, template_file):
results_for_report = {k: v for k, v in validation_results.items() if k != 'ready_for_submission_to_eva'}
vcf_files = sorted(set([file_name
- for check in results_for_report if check in ["vcf_check", "assembly_check"]
+ for check in results_for_report if check in ["vcf_check", "assembly_check", "norm_check"]
for file_name in results_for_report[check]
]))
fasta_files = sorted([file_name for file_name in results_for_report['fasta_check']])
diff --git a/eva_sub_cli/validators/docker_validator.py b/eva_sub_cli/validators/docker_validator.py
index 6073776..8bd9be0 100644
--- a/eva_sub_cli/validators/docker_validator.py
+++ b/eva_sub_cli/validators/docker_validator.py
@@ -12,7 +12,7 @@
logger = logging_config.get_logger(__name__)
default_container_image = 'ebivariation/eva-sub-cli'
-default_container_tag = 'v0.0.6'
+default_container_tag = 'v0.0.7.dev0'
container_validation_dir = '/opt/vcf_validation'
container_validation_output_dir = 'vcf_validation_output'
diff --git a/eva_sub_cli/validators/native_validator.py b/eva_sub_cli/validators/native_validator.py
index d8ebe9b..206f538 100644
--- a/eva_sub_cli/validators/native_validator.py
+++ b/eva_sub_cli/validators/native_validator.py
@@ -13,7 +13,7 @@ class NativeValidator(Validator):
def __init__(self, mapping_file, submission_dir, project_title, metadata_json=None, metadata_xlsx=None,
shallow_validation=False, vcf_validator_path='vcf_validator',
assembly_checker_path='vcf_assembly_checker', biovalidator_path='biovalidator',
- submission_config=None, nextflow_config=None):
+                 submission_config=None, nextflow_config=None, bcftools_path='bcftools'):
super().__init__(mapping_file, submission_dir, project_title, metadata_json=metadata_json,
metadata_xlsx=metadata_xlsx, shallow_validation=shallow_validation,
submission_config=submission_config)
@@ -21,6 +21,7 @@ def __init__(self, mapping_file, submission_dir, project_title, metadata_json=No
self.vcf_validator_path = vcf_validator_path
self.assembly_checker_path = assembly_checker_path
self.biovalidator_path = biovalidator_path
+ self.bcftools_path = bcftools_path
@staticmethod
def _validation_file_path_for(file_path):
@@ -57,13 +58,15 @@ def get_validation_cmd(self):
f" --executable.vcf_validator {self.vcf_validator_path}",
f" --executable.vcf_assembly_checker {self.assembly_checker_path}",
f" --executable.biovalidator {self.biovalidator_path}",
+ f" --executable.bcftools {self.bcftools_path}",
f" -c {self.nextflow_config} " if self.nextflow_config else ""
])
def verify_executables_installed(self):
for name, path in [('vcf-validator', self.vcf_validator_path),
('vcf-assembly-checker', self.assembly_checker_path),
- ('biovalidator', self.biovalidator_path)]:
+ ('biovalidator', self.biovalidator_path),
+ ('bcftools', self.bcftools_path)]:
try:
self._run_quiet_command(
f"Check {name} is installed and available on the path",
diff --git a/eva_sub_cli/validators/validation_results_parsers.py b/eva_sub_cli/validators/validation_results_parsers.py
index f38344e..4c5bb4a 100644
--- a/eva_sub_cli/validators/validation_results_parsers.py
+++ b/eva_sub_cli/validators/validation_results_parsers.py
@@ -113,6 +113,19 @@ def vcf_check_errors_is_critical(error):
return True
+def parse_bcftools_norm_report(norm_report):
+    """Parse a `bcftools norm` log; return (error_list, total, split, realigned, skipped), counts as ints."""
+    error_list, total, split, realigned, skipped = [], 0, 0, 0, 0
+    with open(norm_report) as open_file:
+        for line in map(str.strip, open_file):
+            if line.startswith('Lines total/split/realigned/skipped:'):
+                # Summary line, e.g. "Lines total/split/realigned/skipped: 2/0/1/0"
+                total, split, realigned, skipped = map(int, line.split()[-1].split('/'))
+            elif line:  # any other non-blank line is reported as an error; blank lines are ignored
+                error_list.append(line)
+    return error_list, total, split, realigned, skipped
+
+
def parse_biovalidator_validation_results(metadata_check_file):
"""
Read the biovalidator's report and extract the list of validation errors
diff --git a/eva_sub_cli/validators/validator.py b/eva_sub_cli/validators/validator.py
index 236ed42..3070927 100755
--- a/eva_sub_cli/validators/validator.py
+++ b/eva_sub_cli/validators/validator.py
@@ -16,7 +16,7 @@
from eva_sub_cli.report import generate_html_report, generate_text_report
from eva_sub_cli.validators.validation_results_parsers import parse_assembly_check_log, parse_assembly_check_report, \
parse_biovalidator_validation_results, convert_metadata_sheet, convert_metadata_row, convert_metadata_attribute, \
- parse_vcf_check_report, parse_metadata_property
+ parse_vcf_check_report, parse_metadata_property, parse_bcftools_norm_report
VALIDATION_OUTPUT_DIR = "validation_output"
VALIDATION_RESULTS_KEY = 'validation_results'
@@ -153,7 +153,7 @@ def verify_ready_for_submission_to_eva(self):
""" Checks if all the validation are passed """
return all((
all((value.get('pass', False) is True for key, value in self.results.items() if
- key in ['vcf_check', 'assembly_check', 'fasta_check', 'sample_check', 'metadata_check', 'evidence_type_check'])),
+ key in ['vcf_check', 'assembly_check', 'fasta_check', 'sample_check', 'norm_check', 'metadata_check', 'evidence_type_check'])),
any((
self.results['shallow_validation']['requested'] is False,
self.results['shallow_validation'].get('required', True) is False
@@ -167,6 +167,7 @@ def _collect_validation_workflow_results(self):
self._collect_trim_down_metrics()
self._collect_vcf_check_results()
self._collect_assembly_check_results()
+ self._collect_norm_check_results()
self._load_sample_check_results()
self._load_evidence_check_results()
self._load_fasta_check_results()
@@ -189,6 +190,11 @@ def _assess_validation_results(self):
for vcf_name, asm_check in self.results.get('assembly_check', {}).items()))
self.results['assembly_check']['pass'] = asm_nb_mismatch_result and asm_nb_error_result
+ # norm_check result
+ norm_check_result = all((norm_check.get('nb_error', 1) == 0
+ for vcf_name, norm_check in self.results.get('norm_check', {}).items()))
+ self.results['norm_check']['pass'] = norm_check_result
+
# fasta_check result
fasta_check_result = all((fa_file_check.get('all_insdc', False) is True
for fa_file, fa_file_check in self.results.get('fasta_check', {}).items()))
@@ -249,6 +255,10 @@ def _assembly_check_text_report(self, vcf_name):
os.path.join(self.output_dir, 'assembly_check', vcf_name + '*text_assembly_report*')
)
+ @lru_cache
+ def _normalisation_log(self, vcf_name):
+ return resolve_single_file_path(os.path.join(self.output_dir, 'norm_check', vcf_name + '_bcftools_norm.log'))
+
@cached_property
def _sample_check_yaml(self):
return resolve_single_file_path(os.path.join(self.output_dir, 'other_validations', 'sample_checker.yml'))
@@ -310,6 +320,25 @@ def _collect_assembly_check_results(self):
'total': total
}
+ def _collect_norm_check_results(self):
+ self.results['norm_check'] = {}
+ for vcf_file in self.vcf_files:
+ vcf_name = os.path.basename(vcf_file)
+ normalisation_log = self._normalisation_log(vcf_name)
+ if normalisation_log:
+ error_list, nb_total, nb_split, nb_realigned, nb_skipped = parse_bcftools_norm_report(normalisation_log)
+ else:
+ error_list, nb_total, nb_split, nb_realigned, nb_skipped = (['Process failed'], 0, 0, 0, 0)
+ self.results['norm_check'][vcf_name] = {
+ 'report_path': normalisation_log,
+ 'error_list': error_list,
+ 'nb_error': len(error_list),
+ 'nb_total': nb_total,
+ 'nb_split': nb_split,
+ 'nb_realigned': nb_realigned,
+ 'nb_skipped': nb_skipped
+ }
+
def _load_fasta_check_results(self):
for fasta_file in self.fasta_files:
fasta_file_name = os.path.basename(fasta_file)
@@ -338,8 +367,6 @@ def _load_evidence_check_results(self):
self._update_metadata_with_evidence_type()
-
-
def _collect_metadata_results(self):
self.results['metadata_check'] = {}
self._load_spreadsheet_conversion_errors()
diff --git a/tests/build_and_test_docker_locally.py b/tests/build_and_test_docker_locally.py
index 6b2b9e4..b65fb56 100644
--- a/tests/build_and_test_docker_locally.py
+++ b/tests/build_and_test_docker_locally.py
@@ -233,13 +233,12 @@ def get_docker_validation_cmd(self):
def assert_validation_results(self, validator, expected_sample_checker, expected_metadata_files_json,
expected_metadata_val, expected_semantic_val, expected_evidence_type_val):
+ # Assert VCF format check
vcf_format_dir = os.path.join(validator.output_dir, 'vcf_format')
self.assertTrue(os.path.exists(vcf_format_dir))
-
- vcf_format_log_file = os.path.join(vcf_format_dir, 'input_passed.vcf.vcf_format.log')
- self.assertTrue(os.path.exists(vcf_format_log_file))
-
- with open(vcf_format_log_file) as vcf_format_log_file:
+ vcf_format_log_path = os.path.join(vcf_format_dir, 'input_passed.vcf.vcf_format.log')
+ self.assertTrue(os.path.exists(vcf_format_log_path))
+ with open(vcf_format_log_path) as vcf_format_log_file:
vcf_format_logs = vcf_format_log_file.readlines()
self.assertEqual('[info] According to the VCF specification, the input file is valid\n',
vcf_format_logs[2])
@@ -249,18 +248,24 @@ def assert_validation_results(self, validator, expected_sample_checker, expected
self.assertEqual('According to the VCF specification, the input file is valid\n',
text_report_content[0])
- # assert assembly report
+ # Assert assembly report check
assembly_check_dir = os.path.join(validator.output_dir, 'assembly_check')
self.assertTrue(os.path.exists(assembly_check_dir))
-
- assembly_check_log_file = os.path.join(assembly_check_dir, 'input_passed.vcf.assembly_check.log')
- self.assertTrue(os.path.exists(assembly_check_log_file))
-
- with open(assembly_check_log_file) as assembly_check_log_file:
+ assembly_check_log_path = os.path.join(assembly_check_dir, 'input_passed.vcf.assembly_check.log')
+ self.assertTrue(os.path.exists(assembly_check_log_path))
+ with open(assembly_check_log_path) as assembly_check_log_file:
assembly_check_logs = assembly_check_log_file.readlines()
self.assertEqual('[info] Number of matches: 247/247\n', assembly_check_logs[4])
self.assertEqual('[info] Percentage of matches: 100%\n', assembly_check_logs[5])
+ # Assert normalisation check
+ norm_check_dir = os.path.join(validator.output_dir, 'norm_check')
+ self.assertTrue(os.path.exists(norm_check_dir))
+ norm_check_log_path = os.path.join(norm_check_dir, 'input_passed.vcf_bcftools_norm.log')
+ with open(norm_check_log_path) as norm_check_log_file:
+ norm_check_logs = norm_check_log_file.readlines()
+ self.assertEqual('[E::faidx_adjust_position] The sequence "1" was not found\n', norm_check_logs[0])
+
# Assert Samples concordance
self.assert_yaml_file(validator._sample_check_yaml, expected_sample_checker)
diff --git a/tests/resources/norm_check/invalid.vcf_bcftools_norm.log b/tests/resources/norm_check/invalid.vcf_bcftools_norm.log
new file mode 100644
index 0000000..e3e3bb8
--- /dev/null
+++ b/tests/resources/norm_check/invalid.vcf_bcftools_norm.log
@@ -0,0 +1,5 @@
+NON_ACGTN_ALT chr1 49338976 ]chr1:49277505]T
+NON_ACGTN_ALT chr1 49997014 TAT[chr1:50014208[
+NON_ACGTN_ALT chr1 50014208 ]chr1:49997014]ATT
+NON_ACGTN_ALT chr1 191611692 [chr8:41723769[A
+Lines total/split/realigned/skipped: 152/0/0/0
diff --git a/tests/resources/validation_reports/expected_report_metadata_json.html b/tests/resources/validation_reports/expected_metadata_json_report.html
similarity index 99%
rename from tests/resources/validation_reports/expected_report_metadata_json.html
rename to tests/resources/validation_reports/expected_metadata_json_report.html
index 6945ce8..b0cde25 100644
--- a/tests/resources/validation_reports/expected_report_metadata_json.html
+++ b/tests/resources/validation_reports/expected_metadata_json_report.html
@@ -158,7 +158,8 @@ Metadata validation results
VCF validation results
Checks whether each file is compliant with the
VCF specification.
- Also checks whether the variants' reference alleles match against the reference assembly.
+ Also checks whether the variants' reference alleles match against the reference assembly,
+ and whether the file can be normalised (if necessary) using
bcftools norm.
input_fail.vcf
▶ ❌ Assembly check: 26/36 (72.22%)
@@ -215,9 +216,31 @@ input_fail.vcf
+ ▶ ❌ Normalisation check: 4 errors (152 total lines: 0 split, 0 realigned, 0 skipped)
+
+
First 10 errors are below. Full report: /path/to/vcf_failed/norm/log
+
+
+ | Category | Error |
+
+
+ | Error | NON_ACGTN_ALT chr1 49338976 ]chr1:49277505]T |
+
+
+ | Error | NON_ACGTN_ALT chr1 49997014 TAT[chr1:50014208[ |
+
+
+ | Error | NON_ACGTN_ALT chr1 50014208 ]chr1:49997014]ATT |
+
+
+ | Error | NON_ACGTN_ALT chr1 191611692 [chr8:41723769[A |
+
+
+
input_passed.vcf
✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors
+ ✔ Normalisation check: 0 errors (152 total lines: 0 split, 0 realigned, 0 skipped)
Sample name concordance check
diff --git a/tests/resources/validation_reports/expected_report_metadata_json.txt b/tests/resources/validation_reports/expected_metadata_json_report.txt
similarity index 92%
rename from tests/resources/validation_reports/expected_report_metadata_json.txt
rename to tests/resources/validation_reports/expected_metadata_json_report.txt
index 58e7d4e..0b1e71e 100644
--- a/tests/resources/validation_reports/expected_report_metadata_json.txt
+++ b/tests/resources/validation_reports/expected_metadata_json_report.txt
@@ -71,7 +71,8 @@ For requirements, please refer to the EVA website (https://www.ebi.ac.uk/eva/?Su
-
VCF VALIDATION RESULTS
Checks whether each file is compliant with the VCF specification (http://samtools.github.io/hts-specs/VCFv4.4.pdf).
-Also checks whether the variants' reference alleles match against the reference assembly.
+Also checks whether the variants' reference alleles match against the reference assembly, and whether the file can be
+normalised (if necessary) using bcftools norm (https://samtools.github.io/bcftools/bcftools.html#norm).
input_fail.vcf
❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
@@ -89,9 +90,16 @@ Also checks whether the variants' reference alleles match against the reference
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
Critical error: Line 4: Error in meta-data section.
Non-critical error: Sample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=..
+ ❌ Normalisation check: 4 errors (152 total lines: 0 split, 0 realigned, 0 skipped)
+ First 10 errors are below. Full report: /path/to/vcf_failed/norm/log
+ Error: NON_ACGTN_ALT chr1 49338976 ]chr1:49277505]T
+ Error: NON_ACGTN_ALT chr1 49997014 TAT[chr1:50014208[
+ Error: NON_ACGTN_ALT chr1 50014208 ]chr1:49997014]ATT
+ Error: NON_ACGTN_ALT chr1 191611692 [chr8:41723769[A
input_passed.vcf
✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors
+ ✔ Normalisation check: 0 errors (152 total lines: 0 split, 0 realigned, 0 skipped)
-
SAMPLE NAME CONCORDANCE CHECK
Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
diff --git a/tests/resources/validation_reports/expected_report_metadata_xlsx.html b/tests/resources/validation_reports/expected_metadata_xlsx_report.html
similarity index 99%
rename from tests/resources/validation_reports/expected_report_metadata_xlsx.html
rename to tests/resources/validation_reports/expected_metadata_xlsx_report.html
index 078ceb1..197b069 100644
--- a/tests/resources/validation_reports/expected_report_metadata_xlsx.html
+++ b/tests/resources/validation_reports/expected_metadata_xlsx_report.html
@@ -170,7 +170,8 @@ Metadata validation results
VCF validation results
Checks whether each file is compliant with the
VCF specification.
- Also checks whether the variants' reference alleles match against the reference assembly.
+ Also checks whether the variants' reference alleles match against the reference assembly,
+ and whether the file can be normalised (if necessary) using
bcftools norm.
input_fail.vcf
▶ ❌ Assembly check: 26/36 (72.22%)
@@ -227,9 +228,31 @@ input_fail.vcf
+ ▶ ❌ Normalisation check: 4 errors (152 total lines: 0 split, 0 realigned, 0 skipped)
+
+
First 10 errors are below. Full report: /path/to/vcf_failed/norm/log
+
+
+ | Category | Error |
+
+
+ | Error | NON_ACGTN_ALT chr1 49338976 ]chr1:49277505]T |
+
+
+ | Error | NON_ACGTN_ALT chr1 49997014 TAT[chr1:50014208[ |
+
+
+ | Error | NON_ACGTN_ALT chr1 50014208 ]chr1:49997014]ATT |
+
+
+ | Error | NON_ACGTN_ALT chr1 191611692 [chr8:41723769[A |
+
+
+
input_passed.vcf
✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors
+ ✔ Normalisation check: 0 errors (152 total lines: 0 split, 0 realigned, 0 skipped)
Sample name concordance check
diff --git a/tests/resources/validation_reports/expected_report_metadata_xlsx.txt b/tests/resources/validation_reports/expected_metadata_xlsx_report.txt
similarity index 91%
rename from tests/resources/validation_reports/expected_report_metadata_xlsx.txt
rename to tests/resources/validation_reports/expected_metadata_xlsx_report.txt
index 8d7df7f..2c98656 100644
--- a/tests/resources/validation_reports/expected_report_metadata_xlsx.txt
+++ b/tests/resources/validation_reports/expected_metadata_xlsx_report.txt
@@ -65,7 +65,8 @@ For requirements, please refer to the EVA website (https://www.ebi.ac.uk/eva/?Su
-
VCF VALIDATION RESULTS
Checks whether each file is compliant with the VCF specification (http://samtools.github.io/hts-specs/VCFv4.4.pdf).
-Also checks whether the variants' reference alleles match against the reference assembly.
+Also checks whether the variants' reference alleles match against the reference assembly, and whether the file can be
+normalised (if necessary) using bcftools norm (https://samtools.github.io/bcftools/bcftools.html#norm).
input_fail.vcf
❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
@@ -83,9 +84,16 @@ Also checks whether the variants' reference alleles match against the reference
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
Critical error: Line 4: Error in meta-data section.
Non-critical error: Sample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=..
+ ❌ Normalisation check: 4 errors (152 total lines: 0 split, 0 realigned, 0 skipped)
+ First 10 errors are below. Full report: /path/to/vcf_failed/norm/log
+ Error: NON_ACGTN_ALT chr1 49338976 ]chr1:49277505]T
+ Error: NON_ACGTN_ALT chr1 49997014 TAT[chr1:50014208[
+ Error: NON_ACGTN_ALT chr1 50014208 ]chr1:49997014]ATT
+ Error: NON_ACGTN_ALT chr1 191611692 [chr8:41723769[A
input_passed.vcf
✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors
+ ✔ Normalisation check: 0 errors (152 total lines: 0 split, 0 realigned, 0 skipped)
-
SAMPLE NAME CONCORDANCE CHECK
Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
diff --git a/tests/resources/validation_reports/expected_shallow_metadata_xlsx_report.html b/tests/resources/validation_reports/expected_shallow_metadata_xlsx_report.html
index 27f141b..8b9de8a 100644
--- a/tests/resources/validation_reports/expected_shallow_metadata_xlsx_report.html
+++ b/tests/resources/validation_reports/expected_shallow_metadata_xlsx_report.html
@@ -194,7 +194,8 @@ Metadata validation results
VCF validation results
Checks whether each file is compliant with the
VCF specification.
- Also checks whether the variants' reference alleles match against the reference assembly.
+ Also checks whether the variants' reference alleles match against the reference assembly,
+ and whether the file can be normalised (if necessary) using
bcftools norm.
input_fail.vcf
▶ ❌ Assembly check: 26/36 (72.22%)
@@ -251,9 +252,31 @@ input_fail.vcf
+ ▶ ❌ Normalisation check: 4 errors (152 total lines: 0 split, 0 realigned, 0 skipped)
+
+
First 10 errors are below. Full report: /path/to/vcf_failed/norm/log
+
+
+ | Category | Error |
+
+
+ | Error | NON_ACGTN_ALT chr1 49338976 ]chr1:49277505]T |
+
+
+ | Error | NON_ACGTN_ALT chr1 49997014 TAT[chr1:50014208[ |
+
+
+ | Error | NON_ACGTN_ALT chr1 50014208 ]chr1:49997014]ATT |
+
+
+ | Error | NON_ACGTN_ALT chr1 191611692 [chr8:41723769[A |
+
+
+
input_passed.vcf
✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors
+ ✔ Normalisation check: 0 errors (152 total lines: 0 split, 0 realigned, 0 skipped)
Sample name concordance check
diff --git a/tests/resources/validation_reports/expected_shallow_metadata_xlsx_report.txt b/tests/resources/validation_reports/expected_shallow_metadata_xlsx_report.txt
index 03a731e..7aa135f 100644
--- a/tests/resources/validation_reports/expected_shallow_metadata_xlsx_report.txt
+++ b/tests/resources/validation_reports/expected_shallow_metadata_xlsx_report.txt
@@ -74,7 +74,8 @@ For requirements, please refer to the EVA website (https://www.ebi.ac.uk/eva/?Su
-
VCF VALIDATION RESULTS
Checks whether each file is compliant with the VCF specification (http://samtools.github.io/hts-specs/VCFv4.4.pdf).
-Also checks whether the variants' reference alleles match against the reference assembly.
+Also checks whether the variants' reference alleles match against the reference assembly, and whether the file can be
+normalised (if necessary) using bcftools norm (https://samtools.github.io/bcftools/bcftools.html#norm).
input_fail.vcf
❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
@@ -92,9 +93,16 @@ Also checks whether the variants' reference alleles match against the reference
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
Critical error: Line 4: Error in meta-data section.
Non-critical error: Sample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=..
+ ❌ Normalisation check: 4 errors (152 total lines: 0 split, 0 realigned, 0 skipped)
+ First 10 errors are below. Full report: /path/to/vcf_failed/norm/log
+ Error: NON_ACGTN_ALT chr1 49338976 ]chr1:49277505]T
+ Error: NON_ACGTN_ALT chr1 49997014 TAT[chr1:50014208[
+ Error: NON_ACGTN_ALT chr1 50014208 ]chr1:49997014]ATT
+ Error: NON_ACGTN_ALT chr1 191611692 [chr8:41723769[A
input_passed.vcf
✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors
+ ✔ Normalisation check: 0 errors (152 total lines: 0 split, 0 realigned, 0 skipped)
-
SAMPLE NAME CONCORDANCE CHECK
Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
diff --git a/tests/resources/validation_reports/validation_output/norm_check/input_passed.vcf_bcftools_norm.log b/tests/resources/validation_reports/validation_output/norm_check/input_passed.vcf_bcftools_norm.log
new file mode 100644
index 0000000..b1774ca
--- /dev/null
+++ b/tests/resources/validation_reports/validation_output/norm_check/input_passed.vcf_bcftools_norm.log
@@ -0,0 +1 @@
+Lines total/split/realigned/skipped: 152/0/0/0
diff --git a/tests/test_docker_validator.py b/tests/test_docker_validator.py
index a000688..66a62cc 100644
--- a/tests/test_docker_validator.py
+++ b/tests/test_docker_validator.py
@@ -92,35 +92,39 @@ def assert_sample_checker(self, sample_checker_file, expected_checker):
def assert_validation_results(self, validator, expected_sample_checker, expected_metadata_files_json,
expected_metadata_val, expected_semantic_val):
+ # Assert VCF format check
vcf_format_dir = os.path.join(validator.output_dir, 'vcf_format')
self.assertTrue(os.path.exists(vcf_format_dir))
-
- vcf_format_log_file = os.path.join(vcf_format_dir, 'input_passed.vcf.vcf_format.log')
- self.assertTrue(os.path.exists(vcf_format_log_file))
-
- with open(vcf_format_log_file) as vcf_format_log_file:
+ vcf_format_log_path = os.path.join(vcf_format_dir, 'input_passed.vcf.vcf_format.log')
+ self.assertTrue(os.path.exists(vcf_format_log_path))
+ with open(vcf_format_log_path) as vcf_format_log_file:
vcf_format_logs = vcf_format_log_file.readlines()
self.assertEqual('[info] According to the VCF specification, the input file is valid\n',
vcf_format_logs[2])
-
text_report = vcf_format_logs[1].split(':')[1].strip()
with open(os.path.join(validator.output_dir, text_report)) as text_report:
text_report_content = text_report.readlines()
self.assertEqual('According to the VCF specification, the input file is valid\n',
text_report_content[0])
- # assert assembly report
+ # Assert assembly report check
assembly_check_dir = os.path.join(validator.output_dir, 'assembly_check')
self.assertTrue(os.path.exists(assembly_check_dir))
-
- assembly_check_log_file = os.path.join(assembly_check_dir, 'input_passed.vcf.assembly_check.log')
- self.assertTrue(os.path.exists(assembly_check_log_file))
-
- with open(assembly_check_log_file) as assembly_check_log_file:
+ assembly_check_log_path = os.path.join(assembly_check_dir, 'input_passed.vcf.assembly_check.log')
+ self.assertTrue(os.path.exists(assembly_check_log_path))
+ with open(assembly_check_log_path) as assembly_check_log_file:
assembly_check_logs = assembly_check_log_file.readlines()
self.assertEqual('[info] Number of matches: 247/247\n', assembly_check_logs[4])
self.assertEqual('[info] Percentage of matches: 100%\n', assembly_check_logs[5])
+ # Assert normalisation check
+ norm_check_dir = os.path.join(validator.output_dir, 'norm_check')
+ self.assertTrue(os.path.exists(norm_check_dir))
+ norm_check_log_path = os.path.join(norm_check_dir, 'input_passed.vcf_bcftools_norm.log')
+ with open(norm_check_log_path) as norm_check_log_file:
+ norm_check_logs = norm_check_log_file.readlines()
+ self.assertEqual('[E::faidx_adjust_position] The sequence "1" was not found\n', norm_check_logs[0])
+
# Assert Samples concordance
self.assert_sample_checker(validator._sample_check_yaml, expected_sample_checker)
diff --git a/tests/test_report.py b/tests/test_report.py
index 1c8d2f2..fc3f2f6 100644
--- a/tests/test_report.py
+++ b/tests/test_report.py
@@ -6,7 +6,7 @@
import eva_sub_cli
from eva_sub_cli.report import generate_html_report, generate_text_report
-validation_results_xlsx = {
+common_validation_results = {
"ready_for_submission_to_eva": False,
"assembly_check": {
"input_passed.vcf": {
@@ -37,6 +37,7 @@
"nb_mismatch": 10,
"total": 36,
},
+ "pass": False,
},
"vcf_check": {
"input_passed.vcf": {
@@ -56,6 +57,31 @@
"valid": False,
"warning_count": 0,
},
+ "pass": False,
+ },
+ 'norm_check': {
+ 'input_passed.vcf': {
+ 'error_list': [],
+ 'nb_error': 0,
+ 'nb_realigned': 0,
+ 'nb_skipped': 0,
+ 'nb_split': 0,
+ 'nb_total': 152,
+ 'report_path': '/path/to/vcf_passed/norm/log'
+ },
+ 'input_fail.vcf': {
+ 'error_list': ['NON_ACGTN_ALT\tchr1\t49338976\t]chr1:49277505]T',
+ 'NON_ACGTN_ALT\tchr1\t49997014\tTAT[chr1:50014208[',
+ 'NON_ACGTN_ALT\tchr1\t50014208\t]chr1:49997014]ATT',
+ 'NON_ACGTN_ALT\tchr1\t191611692\t[chr8:41723769[A'],
+ 'nb_error': 4,
+ 'nb_realigned': 0,
+ 'nb_skipped': 0,
+ 'nb_split': 0,
+ 'nb_total': 152,
+ 'report_path': '/path/to/vcf_failed/norm/log'
+ },
+ 'pass': False,
},
"sample_check": {
'report_path': '/path/to/sample/report',
@@ -81,11 +107,13 @@
'more_per_submitted_files_metadata': {},
'more_submitted_files_metadata': ['C1Sample ', ' C2Sample', 'C3Sample', 'C4Sample']
}
- }
+ },
+ "pass": False,
},
# NB. obviously this doesn't make sense for the number of analyses in this report, but demonstrates the possible
# outputs for this check.
"fasta_check": {
+ "pass": False,
'not_all_insdc.fa': {
'report_path': '/path/to/not_all_insdc_check.yml',
'all_insdc': False,
@@ -145,7 +173,22 @@
'connection_error': '500 Server Error: Internal Server Error for url: https://www.ebi.ac.uk/eva/webservices/contig-alias/v1/chromosomes/md5checksum/hjfdoijsfc47hfg0gh9qwjrve'
}
},
+ 'evidence_type_check': {
+ 'pass': False,
+ 'Analysis A': {
+ 'evidence_type': None,
+ 'errors': 'VCF file evidence type could not be determined: vcf_files_1, vcf_files_2'
+ },
+ 'Analysis B': {
+ 'evidence_type': None,
+ 'errors': 'Multiple evidence types found: genotype, allele_frequency'
+ },
+ }
+}
+
+validation_results_xlsx = {
'metadata_check': {
+ "pass": False,
'spreadsheet_errors': [
{'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'},
{'sheet': 'Project', 'row': 2, 'column': 'Project Title',
@@ -166,164 +209,11 @@
'description': 'Column "Sample Accession" is not populated'}
],
'spreadsheet_report_path': '/path/to/metadata/metadata_spreadsheet_validation.txt',
- },
-
- 'evidence_type_check': {
- 'pass': False,
- 'Analysis A': {
- 'evidence_type': None,
- 'errors': 'VCF file evidence type could not be determined: vcf_files_1, vcf_files_2'
- },
- 'Analysis B': {
- 'evidence_type': None,
- 'errors': 'Multiple evidence types found: genotype, allele_frequency'
- },
}
}
+validation_results_xlsx.update(common_validation_results)
validation_results_json = {
- "ready_for_submission_to_eva": False,
- "assembly_check": {
- "input_passed.vcf": {
- "report_path": "/path/to/assembly_passed/report",
- "error_list": [],
- "match": 247,
- "mismatch_list": [],
- "nb_error": 0,
- "nb_mismatch": 0,
- "total": 247,
- },
- "input_fail.vcf": {
- "report_path": "/path/to/assembly_failed/report",
- "error_list": ["The assembly checking could not be completed: Contig 'chr23' not found in assembly report"],
- "match": 26,
- "mismatch_list": [
- "Chromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'",
- "Chromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a'",
- "Chromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c'",
- "Chromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g'",
- "Chromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c'",
- "Chromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g'",
- "Chromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g'",
- "Chromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'",
- "Chromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'",
- ],
- "nb_error": 1,
- "nb_mismatch": 10,
- "total": 36,
- },
- "pass": False,
- },
- "vcf_check": {
- "input_passed.vcf": {
- 'report_path': '/path/to/vcf_passed/report',
- "error_count": 0,
- "error_list": [],
- "valid": True,
- "warning_count": 0,
- },
- "input_fail.vcf": {
- 'report_path': '/path/to/vcf_failed/report',
- "critical_count": 1,
- "critical_list": ["Line 4: Error in meta-data section."],
- "error_count": 1,
- "error_list": [
- "Sample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=.."],
- "valid": False,
- "warning_count": 0,
- },
- "pass": False,
- },
- "sample_check": {
- 'report_path': '/path/to/sample/report',
- 'overall_differences': True,
- 'results_per_analysis': {
- 'Analysis A': {
- 'difference': True,
- 'more_metadata_submitted_files': [' SampleA1', 'SampleA2 ', 'SampleA3', 'SampleA4', 'SampleA5',
- 'SampleA6', 'SampleA7', 'SampleA8', 'SampleA9', 'SampleA10'],
- 'more_per_submitted_files_metadata': {},
- 'more_submitted_files_metadata': ['A1Sample ', ' A2Sample', 'A3Sample', 'A4Sample', 'A5Sample',
- 'A6Sample', 'A7Sample', 'A8Sample', 'A9Sample', 'A10Sample']
- },
- 'Analysis B': {
- 'difference': False,
- 'more_metadata_submitted_files': [],
- 'more_per_submitted_files_metadata': {},
- 'more_submitted_files_metadata': []
- },
- 'Analysis C': {
- 'difference': True,
- 'more_metadata_submitted_files': ['SampleC1 ', ' SampleC2', 'SampleC3', 'SampleC4'],
- 'more_per_submitted_files_metadata': {},
- 'more_submitted_files_metadata': ['C1Sample ', ' C2Sample', 'C3Sample', 'C4Sample']
- }
- },
- "pass": False,
- },
- # NB. obviously this doesn't make sense for the number of analyses in this report, but demonstrates the possible
- # outputs for this check.
- "fasta_check": {
- "pass": False,
- 'not_all_insdc.fa': {
- 'report_path': '/path/to/not_all_insdc_check.yml',
- 'all_insdc': False,
- 'sequences': [
- {'sequence_name': '1', 'sequence_md5': 'hsjvchdhdo3ate83jdfd76rp2', 'insdc': True},
- {'sequence_name': '2', 'sequence_md5': 'hjfdoijsfc47hfg0gh9qwjrve', 'insdc': False}
- ],
- 'metadata_assembly_compatible': True,
- 'possible_assemblies': {'GCA_1'},
- 'assembly_in_metadata': 'GCA_1',
- 'associated_analyses': ['Analysis A']
- },
- 'metadata_asm_not_found.fa': {
- 'report_path': '/path/to/metadata_asm_not_found.yml',
- 'all_insdc': True,
- 'sequences': [
- {'sequence_name': '1', 'sequence_md5': 'hsjvchdhdo3ate83jdfd76rp2', 'insdc': True},
- {'sequence_name': '2', 'sequence_md5': 'hjfdoijsfc47hfg0gh9qwjrve', 'insdc': True}
- ],
- 'possible_assemblies': {'GCA_1'}
- },
- 'metadata_asm_not_match.fa': {
- 'report_path': '/path/to/metadata_asm_not_match.yml',
- 'all_insdc': True,
- 'sequences': [
- {'sequence_name': '1', 'sequence_md5': 'hsjvchdhdo3ate83jdfd76rp2', 'insdc': True},
- {'sequence_name': '2', 'sequence_md5': 'hjfdoijsfc47hfg0gh9qwjrve', 'insdc': True}
- ],
- 'metadata_assembly_compatible': False,
- 'possible_assemblies': {'GCA_1'},
- 'assembly_in_metadata': 'GCA_2',
- 'associated_analyses': ['Analysis B']
- },
- 'metadata_asm_match.fa': {
- 'report_path': '/path/to/metadata_asm_match.yml',
- 'all_insdc': True,
- 'sequences': [
- {'sequence_name': '1', 'sequence_md5': 'hsjvchdhdo3ate83jdfd76rp2', 'insdc': True},
- {'sequence_name': '2', 'sequence_md5': 'hjfdoijsfc47hfg0gh9qwjrve', 'insdc': True}
- ],
- 'metadata_assembly_compatible': True,
- 'possible_assemblies': {'GCA_1'},
- 'assembly_in_metadata': 'GCA_1',
- 'associated_analyses': ['Analysis A']
- },
- 'metadata_error.fa': {
- 'report_path': '/path/to/metadata_error.yml',
- 'all_insdc': True,
- 'sequences': [
- {'sequence_name': '1', 'sequence_md5': 'hsjvchdhdo3ate83jdfd76rp2', 'insdc': True},
- {'sequence_name': '2', 'sequence_md5': 'hjfdoijsfc47hfg0gh9qwjrve', 'insdc': True}
- ],
- 'metadata_assembly_compatible': True,
- 'possible_assemblies': {'GCA_1'},
- 'assembly_in_metadata': 'GCA_1',
- 'associated_analyses': ['Analysis C'],
- 'connection_error': '500 Server Error: Internal Server Error for url: https://www.ebi.ac.uk/eva/webservices/contig-alias/v1/chromosomes/md5checksum/hjfdoijsfc47hfg0gh9qwjrve'
- }
- },
'metadata_check': {
"pass": False,
'json_errors': [
@@ -343,34 +233,23 @@
{'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'}
],
'json_report_path': '/path/to/json/metadata/report'
- },
-
- 'evidence_type_check': {
- 'pass': False,
- 'Analysis A': {
- 'evidence_type': None,
- 'errors': 'VCF file evidence type could not be determined: vcf_files_1, vcf_files_2'
- },
- 'Analysis B': {
- 'evidence_type': None,
- 'errors': 'Multiple evidence types found: genotype, allele_frequency'
- },
}
}
+validation_results_json.update(common_validation_results)
class TestReport(TestCase):
resource_dir = os.path.join(os.path.dirname(__file__), 'resources')
expected_report_metadata_xlsx = os.path.join(resource_dir, 'validation_reports',
- 'expected_report_metadata_xlsx.html')
+ 'expected_metadata_xlsx_report.html')
expected_report_metadata_json = os.path.join(resource_dir, 'validation_reports',
- 'expected_report_metadata_json.html')
+ 'expected_metadata_json_report.html')
expected_report_metadata_xlsx_shallow = os.path.join(resource_dir, 'validation_reports',
'expected_shallow_metadata_xlsx_report.html')
expected_text_report_metadata_xlsx = os.path.join(resource_dir, 'validation_reports',
- 'expected_report_metadata_xlsx.txt')
+ 'expected_metadata_xlsx_report.txt')
expected_text_report_metadata_json = os.path.join(resource_dir, 'validation_reports',
- 'expected_report_metadata_json.txt')
+ 'expected_metadata_json_report.txt')
expected_text_report_metadata_xlsx_shallow = os.path.join(resource_dir, 'validation_reports',
'expected_shallow_metadata_xlsx_report.txt')
test_project_name = "My cool project"
diff --git a/tests/test_validaton_results_parsers.py b/tests/test_validaton_results_parsers.py
index 29fec21..8f8bdf2 100644
--- a/tests/test_validaton_results_parsers.py
+++ b/tests/test_validaton_results_parsers.py
@@ -2,7 +2,7 @@
from unittest import TestCase
from eva_sub_cli.validators.validation_results_parsers import vcf_check_errors_is_critical, parse_assembly_check_log, \
- parse_assembly_check_report
+ parse_assembly_check_report, parse_bcftools_norm_report
class TestValidationParsers(TestCase):
@@ -42,3 +42,15 @@ def test_parse_assembly_check_report(self):
assert nb_mismatch == 12
assert error_list == ['Chromosome scaffold_chr1 is not present in FASTA file']
assert nb_error == 1
+
+ def test_parse_bcftools_norm_report(self):
+ normalisation_report = os.path.join(self.resource_dir, 'norm_check', 'invalid.vcf_bcftools_norm.log')
+ error_list, nb_total, nb_split, nb_realigned, nb_skipped = parse_bcftools_norm_report(normalisation_report)
+ assert error_list == [
+ "NON_ACGTN_ALT chr1 49338976 ]chr1:49277505]T",
+ "NON_ACGTN_ALT chr1 49997014 TAT[chr1:50014208[",
+ "NON_ACGTN_ALT chr1 50014208 ]chr1:49997014]ATT",
+ "NON_ACGTN_ALT chr1 191611692 [chr8:41723769[A"
+ ]
+ assert nb_total == 152
+ assert nb_split == nb_realigned == nb_skipped == 0
diff --git a/tests/test_validator.py b/tests/test_validator.py
index 89cb1ae..c839d99 100644
--- a/tests/test_validator.py
+++ b/tests/test_validator.py
@@ -16,6 +16,10 @@
'input_passed.vcf': {'error_list': [], 'mismatch_list': [], 'nb_mismatch': 0, 'nb_error': 0,
'match': 247, 'total': 247}
},
+ 'norm_check': {
+ 'input_passed.vcf': {'error_list': [], 'nb_error': 0, 'nb_realigned': 0, 'nb_skipped': 0, 'nb_split': 0,
+ 'nb_total': 152,
+ 'report_path': '{resource_dir}/validation_reports/validation_output/norm_check/input_passed.vcf_bcftools_norm.log'}},
'sample_check': {
'overall_differences': False,
'results_per_analysis': {