script for reading kraken2 report, ktaxonomy, and ncbi_taxonomy into an internal tree structure which can be more easily edited.
Example:
from k2_tree.kraken_tree import Tree, ncbi_taxonomy_to_tree
LEVEL_CODES = {
"superkingdom": "d",
"phylum": "p",
"class": "c",
"order": "o",
"family": "f",
"genus": "g",
"species": "s",
}
ncbi_refs = pd.read_csv("RefSeq/reference_genomes.csv")
ncbi_taxonomy_dir = "ncbi_taxonomy"
tax_tree = ncbi_taxonomy_to_tree(
f"{ncbi_taxonomy_dir}/nodes.dmp", f"{ncbi_taxonomy_dir}/names.dmp"
)
taxa_to_node = {taxon.taxid: taxon for taxon in tax_tree.to_list()}
print("Taxonomy loaded")
def get_taxonomy(node: Tree | None) -> list[str]:
if node is None:
return ["unknown_taxa"]
if node.level_code in ["R"] or node.name == "cellular organisms":
return []
level_code = LEVEL_CODES.get(node.level_code, "x")
taxa_string = level_code + "__" + str(node.name)
return get_taxonomy(node.parent) + [taxa_string]
ncbi_refs["taxonomy"] = ncbi_refs["taxid"].apply(
lambda x: ";".join(get_taxonomy(taxa_to_node.get(x, None)))
)