Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
FROM python:3.10

ENV vcf_validator_version=0.10.2
ENV NXF_VER=22.10.6
ENV NXF_VER=23.10.0

WORKDIR /opt

# Install JAVA and Node
RUN apt update && apt install -y default-jdk nodejs npm

# Install bcftools
RUN apt install -y bcftools

# Install VCF validator
RUN curl -LJo /usr/local/bin/vcf_validator https://github.com/EBIvariation/vcf-validator/releases/download/v${vcf_validator_version}/vcf_validator_linux \
&& curl -LJo /usr/local/bin/vcf_assembly_checker https://github.com/EBIvariation/vcf-validator/releases/download/v${vcf_validator_version}/vcf_assembly_checker_linux \
Expand Down
7 changes: 6 additions & 1 deletion docs/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,15 @@ To upgrade to the newest version, run `pip install --upgrade eva-sub-cli`.

## 3. From source natively

Advanced users who want to manage their dependencies in a more granular way can install from source natively.
This installation method requires the following:
* Python 3.8+
* [Nextflow](https://www.nextflow.io/docs/latest/getstarted.html) 21.10+
* [biovalidator](https://github.com/elixir-europe/biovalidator) 2.1.0+
* [vcf-validator](https://github.com/EBIvariation/vcf-validator) 0.9.7+
* [bcftools](https://www.htslib.org/download/) 1.14+

Install each of these and ensure they are included in your PATH. Then install the latest eva-sub-cli release as previously described.
Install each of these and ensure they are included in your PATH. Then install the latest eva-sub-cli release from [PyPI](https://pypi.org/project/eva-sub-cli/):
```bash
pip install eva-sub-cli
```
31 changes: 31 additions & 0 deletions eva_sub_cli/jinja_templates/html/file_validation.html
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
{{ assembly_check(result) }}
{% elif check_type == "vcf_check" %}
{{ vcf_check(result) }}
{% elif check_type == "norm_check" %}
{{ norm_check(result) }}
{% endif %}
{% endfor %}
{%- endmacro %}
Expand Down Expand Up @@ -90,4 +92,33 @@
{% endif %}
{%- endmacro %}

{# norm_check: render the normalisation result of one VCF file. Emits a summary row (error count plus total/split/realigned/skipped line counts, each defaulting to 0 when absent); the row is marked fail + collapsible when nb_error > 0, pass otherwise. When error_list is non-empty, the report path and a table of at most the first 10 errors follow. #}
{% macro norm_check(norm_check_result) %}
{% set error_count = norm_check_result.get("nb_error", 0) %}
{% set expand_icon = "" %}
{% if error_count > 0 %}
{% set expand_icon = "▶" %}
{% set icon = "❌" %}
{% set row_class = "report-section fail collapsible" %}
{% else %}
{% set icon = "✔" %}
{% set row_class = "report-section pass" %}
{% endif %}
<div class='{{ row_class }}'><span class="expand_icon">{{ expand_icon }}</span> {{ icon }} Normalisation check: {{ error_count }} errors ({{ norm_check_result.get("nb_total", 0) }} total lines: {{ norm_check_result.get("nb_split", 0) }} split, {{ norm_check_result.get("nb_realigned", 0) }} realigned, {{ norm_check_result.get("nb_skipped", 0) }} skipped)</div>
{% set error_list = norm_check_result.get("error_list") %}

{% if error_list%}
<div class="error-list">
<div class="error-description">First 10 errors are below. <strong>Full report:</strong> {{ norm_check_result.get('report_path', '') }}</div>
<table>
<tr>
<th>Category</th><th>Error</th>
</tr>
{% for error in error_list[:10] %}
<tr>
<td><strong>Error</strong></td><td> {{ error }}</td>
</tr>
{% endfor %}
</table>
</div>
{% endif %}
{%- endmacro %}
3 changes: 2 additions & 1 deletion eva_sub_cli/jinja_templates/html/report.html
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ <h2>Metadata validation results</h2>
<h2>VCF validation results</h2>
<div class="description">
Checks whether each file is compliant with the <a href="http://samtools.github.io/hts-specs/VCFv4.4.pdf" target="_blank">VCF specification</a>.
Also checks whether the variants' reference alleles match against the reference assembly.
Also checks whether the variants' reference alleles match against the reference assembly,
and whether the file can be normalised (if necessary) using <a href="https://samtools.github.io/bcftools/bcftools.html#norm" target="_blank">bcftools norm</a>.
</div>
{% for file_name in vcf_files %}
{% if file_name != "pass"%}
Expand Down
20 changes: 20 additions & 0 deletions eva_sub_cli/jinja_templates/text/file_validation.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
{{ assembly_check(result) }}
{% elif check_type == "vcf_check" %}
{{ vcf_check(result) }}
{% elif check_type == "norm_check" %}
{{ norm_check(result) }}
{% endif %}
{% endfor %}
{%- endmacro %}
Expand Down Expand Up @@ -56,3 +58,21 @@
{% endfor %}
{% endif %}
{%- endmacro %}

{# norm_check: plain-text rendering of the normalisation result of one VCF file. Prints a one-line summary (cross icon when nb_error > 0, check mark otherwise) with the total/split/realigned/skipped line counts, each defaulting to 0 when absent. When error_list is non-empty, the report path and at most the first 10 errors follow. #}
{% macro norm_check(norm_check_result) %}
{% set error_count = norm_check_result.get("nb_error", 0) %}
{% if error_count > 0 %}
{% set icon = "\u274C" %}
{% else %}
{% set icon = "\u2714" %}
{% endif %}
{{ icon }} Normalisation check: {{ error_count }} errors ({{ norm_check_result.get("nb_total", 0) }} total lines: {{ norm_check_result.get("nb_split", 0) }} split, {{ norm_check_result.get("nb_realigned", 0) }} realigned, {{ norm_check_result.get("nb_skipped", 0) }} skipped)

{% set error_list = norm_check_result.get("error_list") %}
{% if error_list%}
First 10 errors are below. Full report: {{ norm_check_result.get('report_path', '') }}
{% for error in error_list[:10] %}
Error: {{ error }}
{% endfor %}
{% endif %}
{%- endmacro %}
3 changes: 2 additions & 1 deletion eva_sub_cli/jinja_templates/text/report.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ For requirements, please refer to the EVA website (https://www.ebi.ac.uk/eva/?Su

VCF VALIDATION RESULTS
Checks whether each file is compliant with the VCF specification (http://samtools.github.io/hts-specs/VCFv4.4.pdf).
Also checks whether the variants' reference alleles match against the reference assembly.
Also checks whether the variants' reference alleles match against the reference assembly, and whether the file can be
normalised (if necessary) using bcftools norm (https://samtools.github.io/bcftools/bcftools.html#norm).

{% for file_name in vcf_files %}
{% if file_name != "pass"%}
Expand Down
29 changes: 27 additions & 2 deletions eva_sub_cli/nextflow/validation.nf
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ params.metadata_xlsx = null
params.executable = [
"vcf_validator": "vcf_validator",
"vcf_assembly_checker": "vcf_assembly_checker",
"biovalidator": "biovalidator"
"biovalidator": "biovalidator",
"bcftools": "bcftools"
]
// python scripts - installed as part of eva-sub-cli
params.python_scripts = [
Expand Down Expand Up @@ -83,11 +84,11 @@ workflow {
// VCF checks
check_vcf_valid(vcf_and_ref_ch)
check_vcf_reference(vcf_and_ref_ch)
check_vcf_normalised(vcf_and_ref_ch)

generate_file_size_and_md5_digests(vcf_files)
collect_file_size_and_md5(generate_file_size_and_md5_digests.out.file_size_and_digest_info.collect())


// Metadata conversion
if (params.metadata_xlsx && !params.metadata_json){
convert_xlsx_2_json(joinBasePath(params.metadata_xlsx))
Expand Down Expand Up @@ -177,6 +178,30 @@ process check_vcf_reference {
"""
}

/*
* Check that the VCF file can be normalised using bcftools.
*
* Runs `bcftools norm` against the supplied FASTA, discards the normalised records written to stdout
* and keeps only what bcftools prints on stderr, in norm_check/<vcf>_bcftools_norm.log.
* The log is copied to output_dir (overwriting any previous copy).
*/
process check_vcf_normalised {
publishDir output_dir,
overwrite: true,
mode: "copy"

input:
// NOTE: `report` is part of the incoming tuple but is not referenced by the script below
tuple path(vcf), path(fasta), path(report)

output:
// TODO should we output the normalised file?
path "norm_check/*.log", emit: normalisation_log

// The trailing `|| echo` makes the command line finish with the exit status of echo when bcftools
// exits non-zero; the "exit code" message itself goes to the task's stdout, not to the log file.
script:
"""
mkdir norm_check
# Trap exit code so failures can be reported
$params.executable.bcftools norm --no-version -cw -f $fasta -O u $vcf 1> /dev/null 2> norm_check/${vcf}_bcftools_norm.log \
|| echo "exit code \$?"
"""
}

process generate_file_size_and_md5_digests {
input:
path(vcf_file)
Expand Down
2 changes: 1 addition & 1 deletion eva_sub_cli/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def generate_report(validation_results, validation_date, submission_dir, vcf_fas
consent_statement_required, subdir, template_file):
results_for_report = {k: v for k, v in validation_results.items() if k != 'ready_for_submission_to_eva'}
vcf_files = sorted(set([file_name
for check in results_for_report if check in ["vcf_check", "assembly_check"]
for check in results_for_report if check in ["vcf_check", "assembly_check", "norm_check"]
for file_name in results_for_report[check]
]))
fasta_files = sorted([file_name for file_name in results_for_report['fasta_check']])
Expand Down
2 changes: 1 addition & 1 deletion eva_sub_cli/validators/docker_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
logger = logging_config.get_logger(__name__)

default_container_image = 'ebivariation/eva-sub-cli'
default_container_tag = 'v0.0.6'
default_container_tag = 'v0.0.7.dev0'
container_validation_dir = '/opt/vcf_validation'
container_validation_output_dir = 'vcf_validation_output'

Expand Down
7 changes: 5 additions & 2 deletions eva_sub_cli/validators/native_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,15 @@ class NativeValidator(Validator):
def __init__(self, mapping_file, submission_dir, project_title, metadata_json=None, metadata_xlsx=None,
shallow_validation=False, vcf_validator_path='vcf_validator',
assembly_checker_path='vcf_assembly_checker', biovalidator_path='biovalidator',
submission_config=None, nextflow_config=None):
bcftools_path='bcftools', submission_config=None, nextflow_config=None):
super().__init__(mapping_file, submission_dir, project_title, metadata_json=metadata_json,
metadata_xlsx=metadata_xlsx, shallow_validation=shallow_validation,
submission_config=submission_config)
self.nextflow_config = nextflow_config
self.vcf_validator_path = vcf_validator_path
self.assembly_checker_path = assembly_checker_path
self.biovalidator_path = biovalidator_path
self.bcftools_path = bcftools_path

@staticmethod
def _validation_file_path_for(file_path):
Expand Down Expand Up @@ -57,13 +58,15 @@ def get_validation_cmd(self):
f" --executable.vcf_validator {self.vcf_validator_path}",
f" --executable.vcf_assembly_checker {self.assembly_checker_path}",
f" --executable.biovalidator {self.biovalidator_path}",
f" --executable.bcftools {self.bcftools_path}",
f" -c {self.nextflow_config} " if self.nextflow_config else ""
])

def verify_executables_installed(self):
for name, path in [('vcf-validator', self.vcf_validator_path),
('vcf-assembly-checker', self.assembly_checker_path),
('biovalidator', self.biovalidator_path)]:
('biovalidator', self.biovalidator_path),
('bcftools', self.bcftools_path)]:
try:
self._run_quiet_command(
f"Check {name} is installed and available on the path",
Expand Down
13 changes: 13 additions & 0 deletions eva_sub_cli/validators/validation_results_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,19 @@ def vcf_check_errors_is_critical(error):
return True


def parse_bcftools_norm_report(norm_report):
    """
    Read the log written by bcftools norm for one VCF file and extract the errors and line counters.

    The summary line (e.g. "Lines total/split/realigned/skipped: 2/0/1/0") provides the four counters.
    It is recognised regardless of how much whitespace separates its words. Every other non-empty line
    is returned as an error message; blank lines are ignored.

    :param norm_report: path to the bcftools norm log file
    :return: tuple (error_list, total, split, realigned, skipped) where the counters are integers,
             all 0 when the log contains no summary line
    """
    summary_prefix = 'Lines total/split/realigned/skipped:'
    total = split = realigned = skipped = 0
    error_list = []
    with open(norm_report) as open_file:
        for line in open_file:
            stripped = line.strip()
            if not stripped:
                # A blank line is not an error and must not inflate the error count
                continue
            # Collapse runs of spaces/tabs so the match does not depend on the padding of the summary line
            if ' '.join(stripped.split()).startswith(summary_prefix):
                # Lines total/split/realigned/skipped: 2/0/1/0
                total, split, realigned, skipped = stripped.split()[-1].split('/')
            else:
                error_list.append(stripped)
    return error_list, int(total), int(split), int(realigned), int(skipped)


def parse_biovalidator_validation_results(metadata_check_file):
"""
Read the biovalidator's report and extract the list of validation errors
Expand Down
35 changes: 31 additions & 4 deletions eva_sub_cli/validators/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from eva_sub_cli.report import generate_html_report, generate_text_report
from eva_sub_cli.validators.validation_results_parsers import parse_assembly_check_log, parse_assembly_check_report, \
parse_biovalidator_validation_results, convert_metadata_sheet, convert_metadata_row, convert_metadata_attribute, \
parse_vcf_check_report, parse_metadata_property
parse_vcf_check_report, parse_metadata_property, parse_bcftools_norm_report

VALIDATION_OUTPUT_DIR = "validation_output"
VALIDATION_RESULTS_KEY = 'validation_results'
Expand Down Expand Up @@ -153,7 +153,7 @@ def verify_ready_for_submission_to_eva(self):
""" Checks if all the validation are passed """
return all((
all((value.get('pass', False) is True for key, value in self.results.items() if
key in ['vcf_check', 'assembly_check', 'fasta_check', 'sample_check', 'metadata_check', 'evidence_type_check'])),
key in ['vcf_check', 'assembly_check', 'fasta_check', 'sample_check', 'norm_check', 'metadata_check', 'evidence_type_check'])),
any((
self.results['shallow_validation']['requested'] is False,
self.results['shallow_validation'].get('required', True) is False
Expand All @@ -167,6 +167,7 @@ def _collect_validation_workflow_results(self):
self._collect_trim_down_metrics()
self._collect_vcf_check_results()
self._collect_assembly_check_results()
self._collect_norm_check_results()
self._load_sample_check_results()
self._load_evidence_check_results()
self._load_fasta_check_results()
Expand All @@ -189,6 +190,11 @@ def _assess_validation_results(self):
for vcf_name, asm_check in self.results.get('assembly_check', {}).items()))
self.results['assembly_check']['pass'] = asm_nb_mismatch_result and asm_nb_error_result

# norm_check result
norm_check_result = all((norm_check.get('nb_error', 1) == 0
for vcf_name, norm_check in self.results.get('norm_check', {}).items()))
self.results['norm_check']['pass'] = norm_check_result

# fasta_check result
fasta_check_result = all((fa_file_check.get('all_insdc', False) is True
for fa_file, fa_file_check in self.results.get('fasta_check', {}).items()))
Expand Down Expand Up @@ -249,6 +255,10 @@ def _assembly_check_text_report(self, vcf_name):
os.path.join(self.output_dir, 'assembly_check', vcf_name + '*text_assembly_report*')
)

@lru_cache
def _normalisation_log(self, vcf_name):
    """Resolve the bcftools norm log of vcf_name inside the norm_check folder of the output directory."""
    log_file_name = vcf_name + '_bcftools_norm.log'
    log_location = os.path.join(self.output_dir, 'norm_check', log_file_name)
    return resolve_single_file_path(log_location)

@cached_property
def _sample_check_yaml(self):
return resolve_single_file_path(os.path.join(self.output_dir, 'other_validations', 'sample_checker.yml'))
Expand Down Expand Up @@ -310,6 +320,25 @@ def _collect_assembly_check_results(self):
'total': total
}

def _collect_norm_check_results(self):
self.results['norm_check'] = {}
for vcf_file in self.vcf_files:
vcf_name = os.path.basename(vcf_file)
normalisation_log = self._normalisation_log(vcf_name)
if normalisation_log:
error_list, nb_total, nb_split, nb_realigned, nb_skipped = parse_bcftools_norm_report(normalisation_log)
else:
error_list, nb_total, nb_split, nb_realigned, nb_skipped = (['Process failed'], 0, 0, 0, 0)
self.results['norm_check'][vcf_name] = {
'report_path': normalisation_log,
'error_list': error_list,
'nb_error': len(error_list),
'nb_total': nb_total,
'nb_split': nb_split,
'nb_realigned': nb_realigned,
'nb_skipped': nb_skipped
}

def _load_fasta_check_results(self):
for fasta_file in self.fasta_files:
fasta_file_name = os.path.basename(fasta_file)
Expand Down Expand Up @@ -338,8 +367,6 @@ def _load_evidence_check_results(self):

self._update_metadata_with_evidence_type()



def _collect_metadata_results(self):
self.results['metadata_check'] = {}
self._load_spreadsheet_conversion_errors()
Expand Down
27 changes: 16 additions & 11 deletions tests/build_and_test_docker_locally.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,13 +233,12 @@ def get_docker_validation_cmd(self):

def assert_validation_results(self, validator, expected_sample_checker, expected_metadata_files_json,
expected_metadata_val, expected_semantic_val, expected_evidence_type_val):
# Assert VCF format check
vcf_format_dir = os.path.join(validator.output_dir, 'vcf_format')
self.assertTrue(os.path.exists(vcf_format_dir))

vcf_format_log_file = os.path.join(vcf_format_dir, 'input_passed.vcf.vcf_format.log')
self.assertTrue(os.path.exists(vcf_format_log_file))

with open(vcf_format_log_file) as vcf_format_log_file:
vcf_format_log_path = os.path.join(vcf_format_dir, 'input_passed.vcf.vcf_format.log')
self.assertTrue(os.path.exists(vcf_format_log_path))
with open(vcf_format_log_path) as vcf_format_log_file:
vcf_format_logs = vcf_format_log_file.readlines()
self.assertEqual('[info] According to the VCF specification, the input file is valid\n',
vcf_format_logs[2])
Expand All @@ -249,18 +248,24 @@ def assert_validation_results(self, validator, expected_sample_checker, expected
self.assertEqual('According to the VCF specification, the input file is valid\n',
text_report_content[0])

# assert assembly report
# Assert assembly report check
assembly_check_dir = os.path.join(validator.output_dir, 'assembly_check')
self.assertTrue(os.path.exists(assembly_check_dir))

assembly_check_log_file = os.path.join(assembly_check_dir, 'input_passed.vcf.assembly_check.log')
self.assertTrue(os.path.exists(assembly_check_log_file))

with open(assembly_check_log_file) as assembly_check_log_file:
assembly_check_log_path = os.path.join(assembly_check_dir, 'input_passed.vcf.assembly_check.log')
self.assertTrue(os.path.exists(assembly_check_log_path))
with open(assembly_check_log_path) as assembly_check_log_file:
assembly_check_logs = assembly_check_log_file.readlines()
self.assertEqual('[info] Number of matches: 247/247\n', assembly_check_logs[4])
self.assertEqual('[info] Percentage of matches: 100%\n', assembly_check_logs[5])

# Assert normalisation check
norm_check_dir = os.path.join(validator.output_dir, 'norm_check')
self.assertTrue(os.path.exists(norm_check_dir))
norm_check_log_path = os.path.join(norm_check_dir, 'input_passed.vcf_bcftools_norm.log')
with open(norm_check_log_path) as norm_check_log_file:
norm_check_logs = norm_check_log_file.readlines()
self.assertEqual('[E::faidx_adjust_position] The sequence "1" was not found\n', norm_check_logs[0])

# Assert Samples concordance
self.assert_yaml_file(validator._sample_check_yaml, expected_sample_checker)

Expand Down
5 changes: 5 additions & 0 deletions tests/resources/norm_check/invalid.vcf_bcftools_norm.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
NON_ACGTN_ALT chr1 49338976 ]chr1:49277505]T
NON_ACGTN_ALT chr1 49997014 TAT[chr1:50014208[
NON_ACGTN_ALT chr1 50014208 ]chr1:49997014]ATT
NON_ACGTN_ALT chr1 191611692 [chr8:41723769[A
Lines total/split/realigned/skipped: 152/0/0/0
Loading