From f4f2635c7b63c22920e4d79405c685b715a52a97 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Wed, 12 Mar 2025 10:59:16 +0100 Subject: [PATCH 1/6] WIP: Add partial subcommand --- vamb/__main__.py | 51 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/vamb/__main__.py b/vamb/__main__.py index 064d0cfb..3979a080 100755 --- a/vamb/__main__.py +++ b/vamb/__main__.py @@ -1789,7 +1789,7 @@ def add_help_arguments(parser: argparse.ArgumentParser): ) -def add_general_arguments(subparser: argparse.ArgumentParser): +def add_universal_arguments(subparser: argparse.ArgumentParser): add_help_arguments(subparser) reqos = subparser.add_argument_group(title="Output", description=None) reqos.add_argument( @@ -1801,9 +1801,22 @@ def add_general_arguments(subparser: argparse.ArgumentParser): required=True, ) + +# TODO: These are not general. +# Outdir: All +# Minlength: Composition and abundance +# Nthreads: abundance, encoding and clustering +# norefcheck: abundance, encoding, clustering +# cuda: encoding and clustering +# seed: encoding and clustering +def add_general_group(subparser: argparse.ArgumentParser) -> argparse._ArgumentGroup: general = subparser.add_argument_group( title="General optional arguments", description=None ) + return general + + +def add_minlength(general: argparse._ArgumentGroup): general.add_argument( "-m", dest="minlength", @@ -1813,6 +1826,13 @@ def add_general_arguments(subparser: argparse.ArgumentParser): help="Ignore contigs shorter than this [2000]", ) + +def add_general_arguments(subparser: argparse.ArgumentParser): + add_universal_arguments(subparser) + + general = add_general_group(subparser) + add_minlength(general) + general.add_argument( "-p", dest="nthreads", @@ -2247,6 +2267,7 @@ def main(): TAXVAMB = "taxvamb" AVAMB = "avamb" RECLUSTER = "recluster" + PARTIAL = "partial" vaevae_parserbin_parser = subparsers.add_parser( BIN, @@ -2358,6 +2379,19 @@ def main(): add_predictor_arguments(recluster_parser) add_taxonomy_arguments(recluster_parser) + partial_subparser = subparsers.add_parser( + PARTIAL, help="Process individual parts of the VAMB pipelines", add_help=False + ) + partial_part = partial_subparser.add_subparsers(dest="partial_part") + + composition_parser = partial_part.add_parser( + "composition", help="Process composition data", add_help=False + ) + add_universal_arguments(composition_parser) + general = add_general_group(composition_parser) + add_minlength(general) + add_composition_arguments(composition_parser) + args = parser.parse_args() if args.subcommand == TAXOMETER: @@ -2381,10 +2415,25 @@ def main(): opt = BinAvambOptions.from_args(args) runner = partial(run_bin_aae, opt) run(runner, opt.common.general) + else: + assert False # no other options elif args.subcommand == RECLUSTER: opt = ReclusteringOptions.from_args(args) runner = partial(run_reclustering, opt) run(runner, opt.general) + elif args.subcommand == PARTIAL: + # TODO: Not implemented. Need to refactor to universal options, then minlength, then add composition + # then make a function to run tnf only + if args.partial_part == "composition": + opt = CompositionOptions.from_args(args) + runner = partial(run_tnf, opt) + run(runner, opt) + else: + # TODO: Add abundance + # TODO: Add encoding w. VAE + # TODO: Add encoding w. VAEVAE + # TODO: Add clustering + assert False # no other options else: # There are no more subcommands assert False From 5964c95f331ee5c0898743141e7d01d0690c5ec8 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Wed, 19 Mar 2025 14:12:14 +0100 Subject: [PATCH 2/6] WIP: Refactor main --- vamb/__main__.py | 112 ++++++++++++++++++++++++++++------------------- 1 file changed, 68 insertions(+), 44 deletions(-) diff --git a/vamb/__main__.py b/vamb/__main__.py index 3979a080..65a2c5b7 100755 --- a/vamb/__main__.py +++ b/vamb/__main__.py @@ -68,6 +68,22 @@ def try_make_dir(name: Union[Path, str]): pass +class CreatablePath: + def __init__(self, path: Path): + if path.exists(): + raise FileExistsError( + f"Attempted to create path {path}, but it already exists" + ) + + # Cannot be created if the parent does not exist or is not a dir + parent_dir = path.parent + if not parent_dir.is_dir(): + raise NotADirectoryError( + f"Attempted to create path {path}, but its parent is not an existing directory" + ) + self.path = path + + class FASTAPath: def __init__(self, path: Path): self.path = check_existing_file(path) @@ -314,6 +330,19 @@ def __init__( self.dropout = dropout +class MinContigLength: + @classmethod + def from_args(cls, args: argparse.Namespace): + return cls(typeasserted(args.minlength, int)) + + def __init__(self, min_contig_length: int): + if min_contig_length < 250: + raise argparse.ArgumentTypeError( + "Minimum contig length must be at least 250" + ) + self.n = min_contig_length + + class GeneralOptions: __slots__ = [ "out_dir", @@ -328,7 +357,7 @@ class GeneralOptions: def from_args(cls, args: argparse.Namespace): return cls( typeasserted(args.outdir, Path), - typeasserted(args.minlength, int), + MinContigLength.from_args(args), typeasserted(args.nthreads, int), not typeasserted(args.norefcheck, bool), typeasserted(args.seed, int), @@ -338,30 +367,16 @@ def from_args(cls, args: argparse.Namespace): def __init__( self, out_dir: Path, - min_contig_length: int, + min_contig_length: MinContigLength, n_threads: int, refcheck: bool, seed: int, cuda: bool, ): - # Outdir does not exist - if out_dir.exists(): - raise FileExistsError(out_dir) - - # Outdir is in an existing parent dir - parent_dir = out_dir.parent - if not parent_dir.is_dir(): - raise NotADirectoryError(parent_dir) - self.out_dir = out_dir - + self.out_dir = CreatablePath(out_dir) if n_threads < 1: raise ValueError(f"Must pass at least 1 thread, not {n_threads}") self.n_threads = n_threads - - if min_contig_length < 250: - raise argparse.ArgumentTypeError( - "Minimum contig length must be at least 250" - ) self.min_contig_length = min_contig_length if cuda and not torch.cuda.is_available(): @@ -691,8 +706,8 @@ def run( general: GeneralOptions, ): torch.set_num_threads(general.n_threads) - try_make_dir(general.out_dir) - logger.add(general.out_dir.joinpath("log.txt"), format=format_log) + try_make_dir(general.out_dir.path) + logger.add(general.out_dir.path.joinpath("log.txt"), format=format_log) begintime = time.time() logger.info("Starting Vamb version " + vamb.__version_str__) logger.info("Random seed is " + str(general.seed)) @@ -1001,11 +1016,14 @@ def load_composition_and_abundance( binsplitter: vamb.vambtools.BinSplitter, ) -> Tuple[vamb.parsecontigs.Composition, vamb.parsebam.Abundance]: composition = calc_tnf( - comp_options, vamb_options.min_contig_length, vamb_options.out_dir, binsplitter + comp_options, + vamb_options.min_contig_length.n, + vamb_options.out_dir.path, + binsplitter, ) abundance = calc_abundance( abundance_options, - vamb_options.out_dir, + vamb_options.out_dir.path, vamb_options.refcheck, composition.metadata, vamb_options.n_threads, @@ -1070,7 +1088,7 @@ def trainvae( ) logger.info("\tCreated VAE") - modelpath = vamb_options.out_dir.joinpath("model.pt") + modelpath = vamb_options.out_dir.path.joinpath("model.pt") vae.trainmodel( vamb.encode.set_batchsize( data_loader, @@ -1084,7 +1102,7 @@ def trainvae( logger.info("\tEncoding to latent representation") latent = vae.encode(data_loader) - vamb.vambtools.write_npz(vamb_options.out_dir.joinpath("latent.npz"), latent) + vamb.vambtools.write_npz(vamb_options.out_dir.path.joinpath("latent.npz"), latent) del vae # Needed to free "latent" array's memory references? elapsed = round(time.time() - begintime, 2) @@ -1117,7 +1135,7 @@ def trainaae( ) logger.info("\tCreated AAE") - modelpath = os.path.join(vamb_options.out_dir, "aae_model.pt") + modelpath = os.path.join(vamb_options.out_dir.path, "aae_model.pt") n_obs = data_loader.dataset.tensors[0].shape[0] # type: ignore aae.trainmodel( vamb.encode.set_batchsize( @@ -1132,7 +1150,7 @@ def trainaae( logger.info("\tEncoding to latent representation") clusters_y_dict, latent = aae.get_latents(contignames, data_loader) vamb.vambtools.write_npz( - os.path.join(vamb_options.out_dir, "aae_z_latent.npz"), latent + os.path.join(vamb_options.out_dir.path, "aae_z_latent.npz"), latent ) del aae # Needed to free "latent" array's memory references? @@ -1155,7 +1173,7 @@ def try_from_common(cls, common: BinnerCommonOptions): assert isinstance(common.comp.path, FASTAPath) return cls( common.comp.path, - common.general.out_dir.joinpath("bins"), + common.general.out_dir.path.joinpath("bins"), common.output.min_fasta_output_size, ) else: @@ -1343,7 +1361,7 @@ def run_bin_default(opt: BinDefaultOptions): comp_metadata.lengths, opt.common.general.seed, opt.common.general.cuda, - str(opt.common.general.out_dir.joinpath("vae_clusters")), + str(opt.common.general.out_dir.path.joinpath("vae_clusters")), FastaOutput.try_from_common(opt.common), None, ) @@ -1390,7 +1408,7 @@ def run_bin_aae(opt: BinAvambOptions): comp_metadata.lengths, opt.common.general.seed, opt.common.general.cuda, - str(opt.common.general.out_dir.joinpath("aae_z_clusters")), + str(opt.common.general.out_dir.path.joinpath("aae_z_clusters")), FastaOutput.try_from_common(opt.common), "z_", ) @@ -1403,7 +1421,9 @@ def run_bin_aae(opt: BinAvambOptions): FastaOutput.try_from_common(opt.common), "y_", binsplitter=opt.common.output.binsplitter, - base_clusters_name=str(opt.common.general.out_dir.joinpath("aae_y_clusters")), + base_clusters_name=str( + opt.common.general.out_dir.path.joinpath("aae_y_clusters") + ), clusters=clusters_y_dict, sequence_names=cast(Sequence[str], comp_metadata.identifiers), sequence_lens=cast(Sequence[int], comp_metadata.lengths), @@ -1531,7 +1551,7 @@ def run_taxonomy_predictor(opt: TaxometerOptions): abundance=abundance, tnfs=tnfs, lengths=lengths, - out_dir=opt.general.out_dir, + out_dir=opt.general.out_dir.path, taxonomy_options=opt, cuda=opt.general.cuda, ) @@ -1557,7 +1577,7 @@ def run_vaevae(opt: BinTaxVambOptions): abundance=abundance, tnfs=tnfs, lengths=lengths, - out_dir=opt.common.general.out_dir, + out_dir=opt.common.general.out_dir.path, taxonomy_options=opt.taxonomy, cuda=opt.common.general.cuda, ) @@ -1637,7 +1657,7 @@ def run_vaevae(opt: BinTaxVambOptions): batchsize=vae_options.basic_options.starting_batch_size, cuda=opt.common.general.cuda, ) - model_path = opt.common.general.out_dir.joinpath("vaevae_model.pt") + model_path = opt.common.general.out_dir.path.joinpath("vaevae_model.pt") with open(model_path, "wb") as modelfile: vae.trainmodel( dataloader, @@ -1649,7 +1669,7 @@ def run_vaevae(opt: BinTaxVambOptions): latent_both = vae.VAEJoint.encode(dataloader_joint) logger.info(f"{latent_both.shape} embedding shape") - latent_path = opt.common.general.out_dir.joinpath("vaevae_latent.npz") + latent_path = opt.common.general.out_dir.path.joinpath("vaevae_latent.npz") vamb.vambtools.write_npz(latent_path, latent_both) # Cluster, save tsv file @@ -1661,7 +1681,7 @@ def run_vaevae(opt: BinTaxVambOptions): lengths, opt.common.general.seed, opt.common.general.cuda, - str(opt.common.general.out_dir.joinpath("vaevae_clusters")), + str(opt.common.general.out_dir.path.joinpath("vaevae_clusters")), FastaOutput.try_from_common(opt.common), None, ) @@ -1670,12 +1690,15 @@ def run_vaevae(opt: BinTaxVambOptions): def run_reclustering(opt: ReclusteringOptions): composition = calc_tnf( opt.composition, - opt.general.min_contig_length, - opt.general.out_dir, + opt.general.min_contig_length.n, + opt.general.out_dir.path, opt.output.binsplitter, ) markers = load_markers( - opt.markers, composition.metadata, opt.general.out_dir, opt.general.n_threads + opt.markers, + composition.metadata, + opt.general.out_dir.path, + opt.general.n_threads, ) latent = vamb.vambtools.read_npz(opt.latent_path) alg = opt.algorithm @@ -1686,7 +1709,7 @@ def run_reclustering(opt: ReclusteringOptions): taxopt = alg.taxonomy abundance = calc_abundance( taxopt.abundance, - taxopt.general.out_dir, + taxopt.general.out_dir.path, taxopt.general.refcheck, composition.metadata, taxopt.general.n_threads, @@ -1696,7 +1719,7 @@ def run_reclustering(opt: ReclusteringOptions): abundance.matrix, composition.matrix, composition.metadata.lengths, - taxopt.general.out_dir, + taxopt.general.out_dir.path, taxopt, taxopt.general.cuda, ) @@ -1764,7 +1787,7 @@ def run_reclustering(opt: ReclusteringOptions): assert isinstance(opt.composition.path, FASTAPath) fasta_output = FastaOutput( opt.composition.path, - opt.general.out_dir.joinpath("bins"), + opt.general.out_dir.path.joinpath("bins"), opt.output.min_fasta_output_size, ) @@ -1772,7 +1795,7 @@ def run_reclustering(opt: ReclusteringOptions): fasta_output, None, opt.output.binsplitter, - str(opt.general.out_dir.joinpath("clusters_reclustered")), + str(opt.general.out_dir.path.joinpath("clusters_reclustered")), clusters_dict, cast(Sequence[str], composition.metadata.identifiers), cast(Sequence[int], composition.metadata.lengths), @@ -2425,9 +2448,10 @@ def main(): # TODO: Not implemented. Need to refactor to universal options, then minlength, then add composition # then make a function to run tnf only if args.partial_part == "composition": - opt = CompositionOptions.from_args(args) - runner = partial(run_tnf, opt) - run(runner, opt) + # opt = CompositionOptions.from_args(args) + # runner = partial(run_tnf, opt) + # run(runner, opt) + pass else: # TODO: Add abundance # TODO: Add encoding w. VAE From 99d95180ec75887bdfd4c2460fd0260f51d5b45b Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Wed, 19 Mar 2025 14:32:47 +0100 Subject: [PATCH 3/6] WIP: Refactor main --- vamb/__main__.py | 92 ++++++++++++++++++++++++++++-------------------- 1 file changed, 53 insertions(+), 39 deletions(-) diff --git a/vamb/__main__.py b/vamb/__main__.py index 65a2c5b7..190c43e0 100755 --- a/vamb/__main__.py +++ b/vamb/__main__.py @@ -94,9 +94,20 @@ def __init__(self, path: Path): self.path = check_existing_file(path) -class CompositionOptions: - __slots__ = ["path"] +class MinContigLength: + @classmethod + def from_args(cls, args: argparse.Namespace): + return cls(typeasserted(args.minlength, int)) + def __init__(self, min_contig_length: int): + if min_contig_length < 250: + raise argparse.ArgumentTypeError( + "Minimum contig length must be at least 250" + ) + self.n = min_contig_length + + +class CompositionOptions: @staticmethod def are_args_present(args: argparse.Namespace) -> bool: return isinstance(args.fasta, Path) or isinstance(args.composition, Path) @@ -106,12 +117,14 @@ def from_args(cls, args: argparse.Namespace): return cls( typeasserted(args.fasta, (Path, type(None))), typeasserted(args.composition, (Path, type(None))), + MinContigLength.from_args(args), ) def __init__( self, fastapath: Optional[Path], npzpath: Optional[Path], + min_contig_length: MinContigLength, ): if not (fastapath is None) ^ (npzpath is None): raise argparse.ArgumentTypeError( @@ -128,6 +141,28 @@ def __init__( assert npzpath is not None self.path = CompositionPath(npzpath) + self.min_contig_length = min_contig_length + + +class PartialCompositionOptions: + @classmethod + def from_args(cls, args: argparse.Namespace): + return cls( + CreatablePath(typeasserted(args.outdir, Path)), + MinContigLength.from_args(args), + FASTAPath(typeasserted(args.fasta, Path)), + ) + + def __init__( + self, + outdir: CreatablePath, + min_contig_length: MinContigLength, + fasta_path: FASTAPath, + ): + self.outdir = outdir + self.min_contig_length = min_contig_length + self.fasta_path = fasta_path + class AbundancePath: def __init__(self, path: Path): @@ -178,8 +213,6 @@ def __init__(self, path: Path): class AbundanceOptions: - __slots__ = ["paths"] - @staticmethod def are_args_present(args: argparse.Namespace) -> bool: return ( @@ -197,6 +230,8 @@ def from_args(cls, args: argparse.Namespace): typeasserted(args.abundance_tsv, (Path, type(None))), typeasserted(args.abundancepath, (Path, type(None))), typeasserted(args.min_alignment_id, (float, type(None))), + MinContigLength.from_args(args), + not args.norefcheck, ) def __init__( @@ -206,6 +241,8 @@ def __init__( abundance_tsv: Optional[Path], abundancepath: Optional[Path], min_alignment_id: Optional[float], + min_contig_length: MinContigLength, + refcheck: bool, ): # Make sure only one abundance input is there if ( @@ -229,6 +266,8 @@ def __init__( self.paths = BAMPaths(bampaths, min_alignment_id) elif abundance_tsv is not None: self.paths = AbundanceTSVPath(abundance_tsv) + self.min_contig_length = min_contig_length + self.refcheck = refcheck class BasicTrainingOptions: @@ -330,25 +369,10 @@ def __init__( self.dropout = dropout -class MinContigLength: - @classmethod - def from_args(cls, args: argparse.Namespace): - return cls(typeasserted(args.minlength, int)) - - def __init__(self, min_contig_length: int): - if min_contig_length < 250: - raise argparse.ArgumentTypeError( - "Minimum contig length must be at least 250" - ) - self.n = min_contig_length - - class GeneralOptions: __slots__ = [ "out_dir", - "min_contig_length", "n_threads", - "refcheck", "seed", "cuda", ] @@ -357,9 +381,7 @@ class GeneralOptions: def from_args(cls, args: argparse.Namespace): return cls( typeasserted(args.outdir, Path), - MinContigLength.from_args(args), typeasserted(args.nthreads, int), - not typeasserted(args.norefcheck, bool), typeasserted(args.seed, int), typeasserted(args.cuda, bool), ) @@ -367,9 +389,7 @@ def from_args(cls, args: argparse.Namespace): def __init__( self, out_dir: Path, - min_contig_length: MinContigLength, n_threads: int, - refcheck: bool, seed: int, cuda: bool, ): @@ -377,7 +397,6 @@ def __init__( if n_threads < 1: raise ValueError(f"Must pass at least 1 thread, not {n_threads}") self.n_threads = n_threads - self.min_contig_length = min_contig_length if cuda and not torch.cuda.is_available(): raise ModuleNotFoundError( @@ -385,7 +404,6 @@ def __init__( ) self.seed = seed self.cuda = cuda - self.refcheck = refcheck class TaxonomyBase: @@ -885,26 +903,25 @@ def __init__( def calc_tnf( options: CompositionOptions, - min_contig_length: int, outdir: Path, binsplitter: vamb.vambtools.BinSplitter, ) -> vamb.parsecontigs.Composition: begintime = time.time() logger.info("Loading TNF") - logger.info(f"\tMinimum sequence length: {min_contig_length}") + logger.info(f"\tMinimum sequence length: {options.min_contig_length.n}") path = options.path if isinstance(path, CompositionPath): logger.info(f'\tLoading composition from npz at: "{path.path}"') composition = vamb.parsecontigs.Composition.load(path.path) - composition.filter_min_length(min_contig_length) + composition.filter_min_length(options.min_contig_length.n) else: assert isinstance(path, FASTAPath) logger.info(f"\tLoading data from FASTA file {path.path}") with vamb.vambtools.Reader(path.path) as file: composition = vamb.parsecontigs.Composition.from_file( - file, str(path.path), minlength=min_contig_length + file, str(path.path), minlength=options.min_contig_length.n ) composition.save(outdir.joinpath("composition.npz")) @@ -925,7 +942,7 @@ def calc_tnf( if not np.all(composition.metadata.mask): n_removed = len(composition.metadata.mask) - np.sum(composition.metadata.mask) message = ( - f"The minimum sequence length has been set to {min_contig_length}, " + f"The minimum sequence length has been set to {options.min_contig_length.n}, " f"but {n_removed} sequences fell below this threshold and was filtered away." "\nBetter results are obtained if the sequence file is filtered to the minimum " "sequence length before mapping.\n" @@ -945,14 +962,13 @@ def calc_tnf( def calc_abundance( abundance_options: AbundanceOptions, outdir: Path, - refcheck: bool, comp_metadata: vamb.parsecontigs.CompositionMetaData, nthreads: int, ) -> vamb.parsebam.Abundance: begintime = time.time() logger.info("Loading depths") logger.info( - f"\tReference hash: {comp_metadata.refhash.hex() if refcheck else 'None'}" + f"\tReference hash: {comp_metadata.refhash.hex() if abundance_options.refcheck else 'None'}" ) paths = abundance_options.paths @@ -961,13 +977,13 @@ def calc_abundance( abundance = vamb.parsebam.Abundance.load( paths.path, - comp_metadata.refhash if refcheck else None, + comp_metadata.refhash if abundance_options.refcheck else None, ) # I don't want this check in any constructors of abundance, since the constructors # should be able to skip this check in case comp and abundance are independent. # But when running the main Vamb workflow, we need to assert this. if abundance.nseqs != comp_metadata.nseqs: - assert not refcheck + assert not abundance_options.refcheck raise ValueError( f"Loaded abundance has {abundance.nseqs} sequences, " f"but composition has {comp_metadata.nseqs}." @@ -980,7 +996,7 @@ def calc_abundance( list(paths.paths), outdir.joinpath("tmp").joinpath("pycoverm"), comp_metadata, - refcheck, + abundance_options.refcheck, paths.min_alignment_id, nthreads, ) @@ -1017,14 +1033,12 @@ def load_composition_and_abundance( ) -> Tuple[vamb.parsecontigs.Composition, vamb.parsebam.Abundance]: composition = calc_tnf( comp_options, - vamb_options.min_contig_length.n, vamb_options.out_dir.path, binsplitter, ) abundance = calc_abundance( abundance_options, vamb_options.out_dir.path, - vamb_options.refcheck, composition.metadata, vamb_options.n_threads, ) @@ -1690,7 +1704,6 @@ def run_vaevae(opt: BinTaxVambOptions): def run_reclustering(opt: ReclusteringOptions): composition = calc_tnf( opt.composition, - opt.general.min_contig_length.n, opt.general.out_dir.path, opt.output.binsplitter, ) @@ -1710,7 +1723,6 @@ def run_reclustering(opt: ReclusteringOptions): abundance = calc_abundance( taxopt.abundance, taxopt.general.out_dir.path, - taxopt.general.refcheck, composition.metadata, taxopt.general.n_threads, ) @@ -2448,6 +2460,8 @@ def main(): # TODO: Not implemented. Need to refactor to universal options, then minlength, then add composition # then make a function to run tnf only if args.partial_part == "composition": + opt = GeneralOptions.from_args(args) + # opt = CompositionOptions.from_args(args) # runner = partial(run_tnf, opt) # run(runner, opt) From b582cbed9c1b38b4f81cdcc17625ffbade7d41fb Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Wed, 19 Mar 2025 14:47:26 +0100 Subject: [PATCH 4/6] WIP: Complete partial composition --- vamb/__main__.py | 121 +++++++++++++++++++++-------------------------- 1 file changed, 55 insertions(+), 66 deletions(-) diff --git a/vamb/__main__.py b/vamb/__main__.py index 190c43e0..73153158 100755 --- a/vamb/__main__.py +++ b/vamb/__main__.py @@ -144,26 +144,6 @@ def __init__( self.min_contig_length = min_contig_length -class PartialCompositionOptions: - @classmethod - def from_args(cls, args: argparse.Namespace): - return cls( - CreatablePath(typeasserted(args.outdir, Path)), - MinContigLength.from_args(args), - FASTAPath(typeasserted(args.fasta, Path)), - ) - - def __init__( - self, - outdir: CreatablePath, - min_contig_length: MinContigLength, - fasta_path: FASTAPath, - ): - self.outdir = outdir - self.min_contig_length = min_contig_length - self.fasta_path = fasta_path - - class AbundancePath: def __init__(self, path: Path): self.path = check_existing_file(path) @@ -406,6 +386,26 @@ def __init__( self.cuda = cuda +class PartialCompositionOptions: + @classmethod + def from_args(cls, args: argparse.Namespace): + return cls( + GeneralOptions.from_args(args), + MinContigLength.from_args(args), + FASTAPath(typeasserted(args.fasta, Path)), + ) + + def __init__( + self, + general: GeneralOptions, + min_contig_length: MinContigLength, + path: FASTAPath, + ): + self.general = general + self.min_contig_length = min_contig_length + self.path = path + + class TaxonomyBase: __slots__ = ["path"] @@ -902,9 +902,9 @@ def __init__( def calc_tnf( - options: CompositionOptions, + options: CompositionOptions | PartialCompositionOptions, outdir: Path, - binsplitter: vamb.vambtools.BinSplitter, + binsplitter: Optional[vamb.vambtools.BinSplitter], ) -> vamb.parsecontigs.Composition: begintime = time.time() logger.info("Loading TNF") @@ -925,7 +925,10 @@ def calc_tnf( ) composition.save(outdir.joinpath("composition.npz")) - binsplitter.initialize(composition.metadata.identifiers) + # Initialize binsplitter on the identifiers. Only done if we actually need to binsplit + # later. + if binsplitter is not None: + binsplitter.initialize(composition.metadata.identifiers) if composition.nseqs < MINIMUM_SEQS: err = ( @@ -1342,6 +1345,10 @@ def add_bin_prefix( return {prefix + b: c for (b, c) in clusters.items()} +def run_partial_composition(opt: PartialCompositionOptions): + calc_tnf(opt, opt.general.out_dir.path, None) + + def run_bin_default(opt: BinDefaultOptions): composition, abundance = load_composition_and_abundance( vamb_options=opt.common.general, @@ -1824,7 +1831,18 @@ def add_help_arguments(parser: argparse.ArgumentParser): ) -def add_universal_arguments(subparser: argparse.ArgumentParser): +def add_minlength(general: argparse._ArgumentGroup): + general.add_argument( + "-m", + dest="minlength", + metavar="", + type=int, + default=2000, + help="Ignore contigs shorter than this [2000]", + ) + + +def add_general_arguments(subparser: argparse.ArgumentParser): add_help_arguments(subparser) reqos = subparser.add_argument_group(title="Output", description=None) reqos.add_argument( @@ -1836,37 +1854,9 @@ def add_universal_arguments(subparser: argparse.ArgumentParser): required=True, ) - -# TODO: These are not general. -# Outdir: All -# Minlength: Composition and abundance -# Nthreads: abundance, encoding and clustering -# norefcheck: abundance, encoding, clustering -# cuda: encoding and clustering -# seed: encoding and clustering -def add_general_group(subparser: argparse.ArgumentParser) -> argparse._ArgumentGroup: general = subparser.add_argument_group( title="General optional arguments", description=None ) - return general - - -def add_minlength(general: argparse._ArgumentGroup): - general.add_argument( - "-m", - dest="minlength", - metavar="", - type=int, - default=2000, - help="Ignore contigs shorter than this [2000]", - ) - - -def add_general_arguments(subparser: argparse.ArgumentParser): - add_universal_arguments(subparser) - - general = add_general_group(subparser) - add_minlength(general) general.add_argument( "-p", @@ -1891,12 +1881,17 @@ def add_general_arguments(subparser: argparse.ArgumentParser): default=int.from_bytes(os.urandom(7), "little"), help="Random seed (determinism not guaranteed)", ) - return subparser + return general -def add_composition_arguments(subparser: argparse.ArgumentParser): +def add_fasta_arguments(subparser: argparse.ArgumentParser): tnfos = subparser.add_argument_group(title="Composition input") tnfos.add_argument("--fasta", metavar="", type=Path, help="Path to fasta file") + return tnfos + + +def add_composition_arguments(subparser: argparse.ArgumentParser): + tnfos = add_fasta_arguments(subparser) tnfos.add_argument( "--composition", metavar="", type=Path, help="Path to .npz of composition" ) @@ -2422,10 +2417,9 @@ def main(): composition_parser = partial_part.add_parser( "composition", help="Process composition data", add_help=False ) - add_universal_arguments(composition_parser) - general = add_general_group(composition_parser) - add_minlength(general) - add_composition_arguments(composition_parser) + general_group = add_general_arguments(composition_parser) + add_minlength(general_group) + add_fasta_arguments(composition_parser) args = parser.parse_args() @@ -2457,15 +2451,10 @@ def main(): runner = partial(run_reclustering, opt) run(runner, opt.general) elif args.subcommand == PARTIAL: - # TODO: Not implemented. Need to refactor to universal options, then minlength, then add composition - # then make a function to run tnf only if args.partial_part == "composition": - opt = GeneralOptions.from_args(args) - - # opt = CompositionOptions.from_args(args) - # runner = partial(run_tnf, opt) - # run(runner, opt) - pass + opt = PartialCompositionOptions.from_args(args) + runner = partial(run_partial_composition, opt) + run(runner, opt.general) else: # TODO: Add abundance # TODO: Add encoding w. VAE From 8556c9797afa87a32f6e705e37f274470d0bca9a Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Wed, 19 Mar 2025 14:52:12 +0100 Subject: [PATCH 5/6] Fixup: Add minimum length --- vamb/__main__.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/vamb/__main__.py b/vamb/__main__.py index 73153158..2a56593a 100755 --- a/vamb/__main__.py +++ b/vamb/__main__.py @@ -2321,7 +2321,8 @@ def main(): Required arguments: Outdir, at least one composition input and at least one abundance input""", ) - add_general_arguments(vae_parser) + general_group = add_general_arguments(vae_parser) + add_minlength(general_group) add_composition_arguments(vae_parser) add_abundance_arguments(vae_parser) add_bin_output_arguments(vae_parser) @@ -2340,7 +2341,8 @@ def main(): Required arguments: Outdir, taxonomy, at least one composition input and at least one abundance input""", ) - add_general_arguments(vaevae_parser) + general_group = add_general_arguments(vaevae_parser) + add_minlength(general_group) add_composition_arguments(vaevae_parser) add_abundance_arguments(vaevae_parser) add_taxonomy_arguments(vaevae_parser) @@ -2355,7 +2357,8 @@ def main(): add_help=False, usage="%(prog)s [options]", ) - add_general_arguments(vaeaae_parser) + general_group = add_general_arguments(vaeaae_parser) + add_minlength(general_group) add_composition_arguments(vaeaae_parser) add_abundance_arguments(vaeaae_parser) add_bin_output_arguments(vaeaae_parser) @@ -2375,7 +2378,8 @@ def main(): Required arguments: Outdir, unrefined taxonomy, at least one composition input and at least one abundance input""", ) - add_general_arguments(predict_parser) + general_group = add_general_arguments(predict_parser) + add_minlength(general_group) add_composition_arguments(predict_parser) add_abundance_arguments(predict_parser) add_taxonomy_arguments(predict_parser, taxonomy_only=True) @@ -2400,7 +2404,8 @@ def main(): at least one abundance input, at least one marker gene input, latent path and taxonomy """, ) - add_general_arguments(recluster_parser) + general_group = add_general_arguments(recluster_parser) + add_minlength(general_group) add_composition_arguments(recluster_parser) add_abundance_arguments(recluster_parser) add_marker_arguments(recluster_parser) From 734f38a9574d611c80b2cf1a65e78fe5667a0183 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Wed, 19 Mar 2025 15:40:01 +0100 Subject: [PATCH 6/6] Add partial abundance --- vamb/__main__.py | 119 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 104 insertions(+), 15 deletions(-) diff --git a/vamb/__main__.py b/vamb/__main__.py index 2a56593a..3790039b 100755 --- a/vamb/__main__.py +++ b/vamb/__main__.py @@ -406,6 +406,54 @@ def __init__( self.path = path +class PartialAbundanceOptions: + @classmethod + def from_args(cls, args: argparse.Namespace): + comp = CompositionOptions( + None, typeasserted(args.composition, Path), MinContigLength.from_args(args) + ) + return cls( + GeneralOptions.from_args(args), + comp, + typeasserted(args.abundance_tsv, (Path, type(None))), + typeasserted(args.bampaths, (list, type(None))), + typeasserted(args.bamdir, (Path, type(None))), + typeasserted(args.min_alignment_id, (float, type(None))), + not typeasserted(args.norefcheck, bool), + ) + + def __init__( + self, + general: GeneralOptions, + composition_options: CompositionOptions, + abundance_tsv: Optional[Path], + bam_paths: Optional[list[Path]], + bam_dir: Optional[Path], + min_alignment_id: Optional[float], + refcheck: bool, + ): + if ( + (bam_paths is not None) + + (abundance_tsv is not None) + + (bam_dir is not None) + ) != 1: + raise argparse.ArgumentTypeError( + "Must specify exactly one of BAM files, BAM dir or TSV file input" + ) + if bam_dir is not None: + self.paths = BAMPaths.from_dir(bam_dir, min_alignment_id) + elif bam_paths is not None: + logger.warning( + "The --bamfiles argument is deprecated. It works, but might be removed in future versions of Vamb. Please use --bamdir instead" + ) + self.paths = BAMPaths(bam_paths, min_alignment_id) + elif abundance_tsv is not None: + self.paths = AbundanceTSVPath(abundance_tsv) + self.general = general + self.refcheck = refcheck + self.composition_options = composition_options + + class TaxonomyBase: __slots__ = ["path"] @@ -923,6 +971,7 @@ def calc_tnf( composition = vamb.parsecontigs.Composition.from_file( file, str(path.path), minlength=options.min_contig_length.n ) + assert outdir is not None composition.save(outdir.joinpath("composition.npz")) # Initialize binsplitter on the identifiers. Only done if we actually need to binsplit @@ -963,7 +1012,7 @@ def calc_tnf( def calc_abundance( - abundance_options: AbundanceOptions, + abundance_options: AbundanceOptions | PartialAbundanceOptions, outdir: Path, comp_metadata: vamb.parsecontigs.CompositionMetaData, nthreads: int, @@ -1349,6 +1398,13 @@ def run_partial_composition(opt: PartialCompositionOptions): calc_tnf(opt, opt.general.out_dir.path, None) +def run_partial_abundance(opt: PartialAbundanceOptions): + composition = calc_tnf(opt.composition_options, opt.general.out_dir.path, None) + calc_abundance( + opt, opt.general.out_dir.path, composition.metadata, opt.general.n_threads + ) + + def run_bin_default(opt: BinDefaultOptions): composition, abundance = load_composition_and_abundance( vamb_options=opt.common.general, @@ -1884,21 +1940,37 @@ def add_general_arguments(subparser: argparse.ArgumentParser): return general -def add_fasta_arguments(subparser: argparse.ArgumentParser): - tnfos = subparser.add_argument_group(title="Composition input") +def make_composition_group(subparser: argparse.ArgumentParser): + return subparser.add_argument_group(title="Composition input") + + +def add_fasta_to_group(tnfos: argparse._ArgumentGroup): tnfos.add_argument("--fasta", metavar="", type=Path, help="Path to fasta file") - return tnfos -def add_composition_arguments(subparser: argparse.ArgumentParser): - tnfos = add_fasta_arguments(subparser) +def add_composition_npz_to_group(tnfos: argparse._ArgumentGroup): tnfos.add_argument( "--composition", metavar="", type=Path, help="Path to .npz of composition" ) - return subparser -def add_abundance_arguments(subparser: argparse.ArgumentParser): +def add_fasta_arguments(subparser: argparse.ArgumentParser): + tnfos = make_composition_group(subparser) + add_fasta_to_group(tnfos) + + +def add_composition_npz_argument(subparser: argparse.ArgumentParser): + tnfos = make_composition_group(subparser) + add_composition_npz_to_group(tnfos) + + +def add_composition_arguments(subparser: argparse.ArgumentParser): + tnfos = make_composition_group(subparser) + add_fasta_to_group(tnfos) + add_composition_npz_to_group(tnfos) + + +def add_abundance_args_nonpz(subparser: argparse.ArgumentParser): abundanceos = subparser.add_argument_group(title="Abundance input") # Note: This argument is deprecated, but we'll keep supporting it for now. # Instead, use --bamdir. @@ -1922,13 +1994,6 @@ def add_abundance_arguments(subparser: argparse.ArgumentParser): type=Path, help='Path to TSV file of precomputed abundances with header being "contigname(\\t)*"', ) - abundanceos.add_argument( - "--abundance", - metavar="", - dest="abundancepath", - type=Path, - help="Path to .npz of abundances", - ) abundanceos.add_argument( "-z", dest="min_alignment_id", @@ -1937,6 +2002,18 @@ def add_abundance_arguments(subparser: argparse.ArgumentParser): default=None, help=argparse.SUPPRESS, ) + return abundanceos + + +def add_abundance_arguments(subparser: argparse.ArgumentParser): + abundanceos = add_abundance_args_nonpz(subparser) + abundanceos.add_argument( + "--abundance", + metavar="", + dest="abundancepath", + type=Path, + help="Path to .npz of abundances", + ) return subparser @@ -2426,6 +2503,14 @@ def main(): add_minlength(general_group) add_fasta_arguments(composition_parser) + abundance_parser = partial_part.add_parser( + "abundance", help="Process abundance data", add_help=False + ) + general_group = add_general_arguments(abundance_parser) + add_minlength(general_group) + add_composition_npz_argument(abundance_parser) + add_abundance_args_nonpz(abundance_parser) + args = parser.parse_args() if args.subcommand == TAXOMETER: @@ -2460,6 +2545,10 @@ def main(): opt = PartialCompositionOptions.from_args(args) runner = partial(run_partial_composition, opt) run(runner, opt.general) + elif args.partial_part == "abundance": + opt = PartialAbundanceOptions.from_args(args) + runner = partial(run_partial_abundance, opt) + run(runner, opt.general) else: # TODO: Add abundance # TODO: Add encoding w. VAE