Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
f73f0c6
edit test input file
khetherin Jan 6, 2026
4e11667
edited README.md to provide info on test dataset
khetherin Jan 6, 2026
10a54e4
bug fix for no start_range in GVF attributes
khetherin Jan 6, 2026
0db4b68
print warning to the log file when the ALT allele cannot be found or …
khetherin Jan 12, 2026
9575e68
edited warning message
khetherin Jan 12, 2026
d0f3f52
changed warning to logger
khetherin Jan 12, 2026
7ca6d01
changed warning to logger
khetherin Jan 12, 2026
26d8d65
Added statement to clarify the purpose of the attribute_mapper.yaml f…
khetherin Jan 12, 2026
6d1b546
add "." for empty FORMAT field
khetherin Jan 12, 2026
00289e0
amended SVLEN in attibute matter to fix the vcf-validator error
khetherin Jan 13, 2026
7ec2990
amended attribute_mapper.yaml to error in vcf-validator
khetherin Jan 13, 2026
4be6a50
added new functions and amended __str__ to ensure if FORMAT key is em…
khetherin Jan 13, 2026
942da4e
removed redundancy in attribute_mapper.yaml
khetherin Jan 13, 2026
eff50f0
sort VCF objects by chromosome and position
khetherin Jan 13, 2026
ee08a3e
amended __str__ so do not print format and samples if it is empty
khetherin Jan 13, 2026
6721b64
add method for gettiing the chromosome and position of a vcf object, …
khetherin Jan 14, 2026
e48e151
edit to sort and make unique the ID of a vcf line
khetherin Jan 14, 2026
57cd7c0
print vcf lines
khetherin Jan 14, 2026
10c5277
moved function above main method
khetherin Jan 14, 2026
c514eb1
added new unit tests
khetherin Jan 14, 2026
6123fde
edited unit tests
khetherin Jan 14, 2026
085ae47
function for has_svclaim_abundance_evidence and its unit test
khetherin Jan 14, 2026
6634f7f
edit list of variant call/region sequence ontology ids
khetherin Jan 15, 2026
0de41c4
removed duplicate test. edited existing test for SVCLAIM
khetherin Jan 16, 2026
2841ab5
added functionality for SVCLAIM - a temporary placeholder to be corre…
khetherin Jan 16, 2026
cc70045
added TODOs
khetherin Jan 19, 2026
2d08696
Merge branch 'upstream_main' into iterate
khetherin Jan 20, 2026
5f1dd11
edit test
khetherin Jan 20, 2026
1390892
edit tests
khetherin Jan 20, 2026
3d186cb
edit tests
khetherin Jan 20, 2026
7949312
print info to logger for no format keys detected
khetherin Jan 20, 2026
7fd761d
edit test
khetherin Jan 20, 2026
79254f4
use logger for warnings
khetherin Jan 20, 2026
94eec07
edit tests
khetherin Jan 20, 2026
bb36a06
edit tests
khetherin Jan 21, 2026
ed605ba
edit tests
khetherin Jan 21, 2026
dd3f5c5
refactored to ensure clarity in generate_symbolic_allele
khetherin Jan 21, 2026
9333761
refactor: replace co-ordinates with VariantRange dataclass
khetherin Jan 21, 2026
3da35ac
edit import
khetherin Jan 21, 2026
5170743
edit pos to variant_range_coordinate.pos
khetherin Jan 21, 2026
7eea89b
change any to all
khetherin Jan 22, 2026
f78ad96
addressing reviewers comments
khetherin Jan 22, 2026
33dc9ff
fix if statement
khetherin Jan 22, 2026
3e40277
indentation fix
khetherin Jan 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,6 @@ pip install -r requirements.txt
- zebrafish.fa = This is a dummy assembly
- zebrafish.gvf = This is a highly reduced GVF file https://ftp.ebi.ac.uk/pub/databases/dgva/nstd62_Brown_et_al_2012/gvf/
- human.fa = This is a full assembly which has been converted so the naming convention follows the corresponding GVF
- human_est199.gvf = This is a full GVF file https://ftp.ebi.ac.uk/pub/databases/dgva/estd199_1000_Genomes_Consortium_Phase_1/gvf/
- human_est199.gvf = This is a full GVF file https://ftp.ebi.ac.uk/pub/databases/dgva/estd199_1000_Genomes_Consortium_Phase_1/gvf/
- drosophila_nstd134.gvf = This is a full GVF file https://ftp.ebi.ac.uk/pub/databases/dgva/nstd134_Gilks_et_al_2016/gvf/
- drosophila_GCA_000001215.4.fa = This is a full assembly representing GCA_000001215.4
- drosophila_GCA_000001215.4.fa = This is a full assembly representing GCA_000001215.4
138 changes: 124 additions & 14 deletions convert_gvf_to_vcf/convertGVFtoVCF.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,15 +166,21 @@ def convert_gvf_pragmas_for_vcf_header(gvf_pragmas,
return unique_converted_pragmas, unique_sample_name

# the function below relates to the VCF headerline (Part 2)
def generate_vcf_header_line(is_missing_format, samples):
    """ Generates the VCF header line (the tab-separated row of column names).

    The eight mandatory fields are always written. FORMAT and the sample names
    are appended only when format keys are present.
    :param is_missing_format: boolean (False = Format + sample names, True = mandatory fields only)
    :param samples: list of samples, these will appear in the header line after FORMAT
    :return: vcf_header: a string
    """
    vcf_header_fields = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
    if not is_missing_format:
        # Sample columns are only written together with the FORMAT column that precedes them.
        vcf_header_fields.append("FORMAT")
        vcf_header_fields.extend(samples)
    return '\t'.join(vcf_header_fields)

# the functions below relate to the GVF header
Expand Down Expand Up @@ -259,6 +265,22 @@ def convert_gvf_features_to_vcf_objects(gvf_lines_obj_list, reference_lookup, or
return standard_header_lines, list_of_vcf_objects

# The functions below relate to the VCF objects
def collect_missing_format_flags(list_of_vcf_objects):
    """ Returns a list of booleans, one for each VCF object (True if format is missing, False if format keys present)

    A format is treated as missing when the object's format_keys is exactly ['.'].
    :params: list_of_vcf_objects: list of vcf objects
    :return: missing_format_flags: list of booleans (True if format is missing, False if format keys present)
    """
    # Every object is inspected, including the first one: skipping index 0 would
    # yield an empty list for a single-object input, and all([]) is True.
    return [vcf_object.format_keys == ['.'] for vcf_object in list_of_vcf_objects]


def compare_vcf_objects(list_of_vcf_objects):
""" Compares VCF objects in the list with the VCF object before it. Returns boolean values.
:params: list_of_vcf_objects: list of vcf objects
Expand Down Expand Up @@ -314,6 +336,69 @@ def determine_merge_or_keep_vcf_objects(list_of_vcf_objects, comparison_results,
merge_or_kept_objects.append(list_of_vcf_objects[-1])
return merge_or_kept_objects

def get_chrom_pos_of_vcf_object(obj):
    """
    Builds the (chromosome, position) pair of a VCF line object, for use as a sort key.
    :params: obj : vcf line object
    :return: tuple of obj.chrom (unchanged) and obj.pos converted to int
    """
    chromosome = obj.chrom
    # Convert the position to int so that positions order numerically.
    position = int(obj.pos)
    return chromosome, position

def has_duplicates(list_of_objects):
    """ Checks chrom and pos of vcf line objects to find positions that occur more than once.
    :params: list_of_objects: vcf line objects
    :return: tuple of (duplicate_flag, list_of_duplicate_chrom_pos)
             duplicate_flag - boolean value (True = has duplicates so merge again, False = no duplicates)
             list_of_duplicate_chrom_pos - one (chrom, int(pos)) entry for every repeated occurrence
    """
    positions_seen = set()
    repeated_positions = []
    for vcf_object in list_of_objects:
        position_key = (vcf_object.chrom, int(vcf_object.pos))
        if position_key not in positions_seen:
            positions_seen.add(position_key)
            continue
        # Already seen: record every further occurrence of this position.
        repeated_positions.append(position_key)
    # Any recorded repeat means the caller should merge again.
    return bool(repeated_positions), repeated_positions


def get_list_of_merged_vcf_objects(list_of_vcf_objects, samples):
    """ Runs one compare-and-merge pass over the VCF objects and sorts the outcome.
    :params: list_of_vcf_objects: list of vcf objects
    :params: samples: passed through to determine_merge_or_keep_vcf_objects
    :returns: list of merged or kept vcf objects, sorted by chromosome and position
    """
    # Flags from the comparison step tell the merge step which objects to combine.
    merge_flags = compare_vcf_objects(list_of_vcf_objects)
    merged_objects = determine_merge_or_keep_vcf_objects(list_of_vcf_objects, merge_flags, samples)
    # Order the result by (chromosome, numeric position).
    merged_objects.sort(key=get_chrom_pos_of_vcf_object)
    return merged_objects

def filter_duplicates_by_merging(chrom_pos_list, has_dups, list_of_vcf_objects,
                                 list_of_vcf_objects_to_be_filtered, samples):
    """
    Re-merges the VCF objects found at each duplicated position and swaps the result into
    the previously merged list.
    :params: chrom_pos_list: list of tuples - this represents duplicate positions
    :params: has_dups: boolean - True if it contains duplicates
    :params: list_of_vcf_objects: list of VCF objects (GVF converted to VCF; with no merging/remove dups)
    :params: list_of_vcf_objects_to_be_filtered: list of VCF objects from a previous merge
    :params: samples: names
    :returns: filtered_merge_or_kept_vcf_objects - list of vcf objects with duplicates from this iteration removed
              (a copy of list_of_vcf_objects_to_be_filtered when has_dups is False)
    """
    # Start from a copy so a result always exists, even when there is nothing to merge,
    # and so the caller's list is not modified.
    filtered_merge_or_kept_vcf_objects = list(list_of_vcf_objects_to_be_filtered)
    if not has_dups:
        return filtered_merge_or_kept_vcf_objects

    # The same position can be listed several times; handle each one once so its merged
    # objects are not appended repeatedly. Positions are coerced to int so that str and
    # int representations of a position compare equal.
    unique_positions = dict.fromkeys((chrom_pos[0], int(chrom_pos[1])) for chrom_pos in chrom_pos_list)
    for chrom_to_search, pos_to_search in unique_positions:
        vcf_objects_to_merge = [
            vcf_object for vcf_object in list_of_vcf_objects
            if vcf_object.chrom == chrom_to_search and int(vcf_object.pos) == pos_to_search
        ]
        if not vcf_objects_to_merge:
            # Nothing at this position in the unmerged list, so there is nothing to re-merge.
            continue
        merge_duplicates = get_list_of_merged_vcf_objects(vcf_objects_to_merge, samples)
        # Filter the running result (not the input list) so that every duplicated
        # position is handled within this single call.
        filtered_merge_or_kept_vcf_objects = [x for x in filtered_merge_or_kept_vcf_objects
                                              if x not in vcf_objects_to_merge]
        filtered_merge_or_kept_vcf_objects.extend(merge_duplicates)
    filtered_merge_or_kept_vcf_objects.sort(key=get_chrom_pos_of_vcf_object)
    return filtered_merge_or_kept_vcf_objects

def main():
# Parse command line arguments
parser = argparse.ArgumentParser()
Expand Down Expand Up @@ -375,26 +460,51 @@ def main():
for header_line in header_type:
vcf_output.write(f"{header_line}\n")

# Part 2 of VCF file: Write the VCF header line. This is the nine mandatory fields with its sample names.
header_fields = generate_vcf_header_line(samples)
# Part 2 of VCF file: Write the VCF header line.
# Determine if the header is the 8 mandatory fields or 8 mandatory fields + FORMAT + sample names.
missing_flags = collect_missing_format_flags(list_of_vcf_objects) # True if format keys are missing, False if present
if all(missing_flags):
is_missing_format_value = True
logger.info("No Format Keys detected. Printing mandatory VCF headers.")
else:
is_missing_format_value = False
# Write the header.
header_fields = generate_vcf_header_line(is_missing_format=is_missing_format_value, samples=samples)
vcf_output.write(f"{header_fields}\n")

# Part 3 of VCF file: Write the VCF data lines. This will contain info about the position in the genome,
# its variants and genotype information per sample.
if len(gvf_lines_obj_list) > 0:
if (list_of_vcf_objects):
logger.info("Generating the VCF datalines")
# Each GVF feature has been converted to a VCF object so begin comparing and merging the VCF objects.
comparison_flags = compare_vcf_objects(list_of_vcf_objects) # Identifies which VCF objects to merge
merge_or_kept_vcf_objects = determine_merge_or_keep_vcf_objects(list_of_vcf_objects, comparison_flags, samples)
# initial merge
merge_or_kept_vcf_objects = get_list_of_merged_vcf_objects(list_of_vcf_objects, samples)
# identify if duplicates are present after merging
has_dups, chrom_pos_list = has_duplicates(merge_or_kept_vcf_objects)
# while duplicates are present, merge, then re-check for dups
max_iterations = 100
iteration = 0
list_of_vcf_objects_to_be_filtered = merge_or_kept_vcf_objects
while has_dups and iteration < max_iterations:
filtered_merge_or_kept_vcf_objects = filter_duplicates_by_merging(chrom_pos_list, has_dups,
list_of_vcf_objects,
list_of_vcf_objects_to_be_filtered, samples)
has_dups, chrom_pos_list = has_duplicates(filtered_merge_or_kept_vcf_objects)
iteration += 1
list_of_vcf_objects_to_be_filtered = filtered_merge_or_kept_vcf_objects
logger.info(f"Iteration of merge (remove dups): {iteration}")
Comment on lines +481 to +495
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is the merging algorithm not capable of performing all the merges in one pass?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The merge function compares the current line with the previous line (it is limited to 2 lines in its comparison and merge). For example:
The lines in the file: lineA, lineB, lineC
After the merge: lineAandB, lineC
After another iteration: lineAandBandC

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The merge function compares the current line with the previous line (it is limited to 2 lines in its comparison and merge).

OK, so now that you have identified the limitation, you should work on removing it rather than engineering something around it.
The issue is in get_list_of_merged_vcf_objects where you compare and merge separately.
A better algorithm would be:

  • For every line, starting with line 2:
    • take the current and previous line and compare them
    • if they are equal:
      • merge and set the merge result as the previous line
    • otherwise set the current line as the previous line


# Write the VCF objects as data lines in the VCF file.
for vcf_line_object in merge_or_kept_vcf_objects:
vcf_output.write(str(vcf_line_object) + "\n")
# vcf_output.write("\t".join(str(val) for val in line) + "\n")
if iteration != 0:
for vcf_line_object in filtered_merge_or_kept_vcf_objects:
vcf_output.write(str(vcf_line_object) + "\n")
else:
for vcf_line_object in merge_or_kept_vcf_objects:
vcf_output.write(str(vcf_line_object) + "\n")
else:
logger.warning("No feature lines were found for this GVF file.")
vcf_output.close()
logger.info("GVF to VCF conversion complete")


if __name__ == "__main__":
main()
35 changes: 25 additions & 10 deletions convert_gvf_to_vcf/etc/attribute_mapper.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# The purpose of this file is to map the GVF attribute to its corresponding VCF tag (Field:FieldKey)
# Below is an example format
# name: (this can be the name of gvf_attribute or if unavailable, a general name)
# FIELD: (INFO/FORMAT/ALT)
Expand Down Expand Up @@ -494,6 +495,13 @@ parid:
Type: String
Description: "ID of partner breakend"
From: SVVCF
parent:
INFO:
FieldKey: PARENT
Number: .
Type: String
Description: "Parent"
From: [SVVCF, DGVa]
phred_likelihoods:
FORMAT:
FieldKey: PL
Expand Down Expand Up @@ -594,7 +602,7 @@ svlen:
INFO:
FieldKey: SVLEN
Number: A
Type: String
Type: Integer
Description: "Length of structural variant"
From: SVVCF
svtype:
Expand Down Expand Up @@ -734,13 +742,6 @@ Name:
Type: String
Description: "Name"
From: DGVa
Parent:
INFO:
FieldKey: PARENT
Number: .
Type: String
Description: "Parent"
From: DGVa
phenotype_description:
INFO:
FieldKey: PHENODESC
Expand Down Expand Up @@ -776,11 +777,18 @@ samples:
Type: String
Description: "Samples"
From: DGVa
sample_name:
INFO:
FieldKey: SAMPLENAME
Number: .
Type: String
Description: "Sample Name"
From: DGVa
submitter_variant_call_id:
INFO:
FieldKey: SVCID
Number: .
Type: Integer
Type: String
Description: "submitter variant call ID"
From: DGVa
submitter_variant_region_id:
Expand Down Expand Up @@ -824,7 +832,14 @@ variant_call_so_id:
Number: .
Type: String
Description: "Variant call Sequence ontology ID"
From:
From: DGVa
variant_region_description:
INFO:
FieldKey: VARREGDESC
Number: .
Type: String
Description: "Variant region description"
From: DGVa
ID:
INFO:
FieldKey: ID
Expand Down
Loading