From db0f3a1380d49b2b58990281bf9733a3879922b5 Mon Sep 17 00:00:00 2001 From: Elek Lamoureux Date: Fri, 4 Jul 2025 17:02:50 -0400 Subject: [PATCH 1/7] Added a command that trains and returns latent.npz. Still in testing phase. --- vamb/__main__.py | 73 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 67 insertions(+), 6 deletions(-) diff --git a/vamb/__main__.py b/vamb/__main__.py index 3790039b..b7555adf 100755 --- a/vamb/__main__.py +++ b/vamb/__main__.py @@ -1076,6 +1076,46 @@ def calc_abundance( return abundance +def train(args): + logger.info("It works!!") + + composition = vamb.parsecontigs.Composition.load(args.composition_file) + abundance = vamb.parsebam.Abundance.load(args.abundance_file, composition.metadata.refhash) + + data_loader = vamb.encode.make_dataloader( + abundance.matrix, + composition.matrix, + composition.metadata.lengths, + batchsize=256, + destroy=True, + cuda=False, + ) + + vae_options = VAEOptions( + basic_options=BasicTrainingOptions( + num_epochs=4, + starting_batch_size=256, + batch_steps=[1, 3], + ), + nhiddens=[512, 512], + nlatent=32, + alpha=0.01, + beta=200.0, + dropout=0.2, + ) + + vamb_options = GeneralOptions( + out_dir=Path(args.outdir), + n_threads=4, + seed=42, + cuda=False, + ) + + os.makedirs(args.outdir, exist_ok=True) + + trainvae(vae_options, vamb_options, data_loader) + logger.info("Latent space written to latent.npz") + def load_composition_and_abundance( vamb_options: GeneralOptions, @@ -1088,6 +1128,7 @@ def load_composition_and_abundance( vamb_options.out_dir.path, binsplitter, ) + abundance = calc_abundance( abundance_options, vamb_options.out_dir.path, @@ -2016,6 +2057,13 @@ def add_abundance_arguments(subparser: argparse.ArgumentParser): ) return subparser +def add_training_arguments(subparser: argparse.ArgumentParser): + trainingos = subparser.add_argument_group(title="Training options") + trainingos.add_argument( + "--print_test", + type=str, + help="Print test output" + ) def 
add_taxonomy_arguments(subparser: argparse.ArgumentParser, taxonomy_only=False): taxonomys = subparser.add_argument_group(title="Taxonomy input") @@ -2383,8 +2431,7 @@ def main(): """, add_help=False, ) - add_help_arguments(vaevae_parserbin_parser) - subparsers_model = vaevae_parserbin_parser.add_subparsers(dest="model_subcommand") + subparsers_model = vaevae_parserbin_parser.add_subparsers(dest="model_subcommand", required=True) vae_parser = subparsers_model.add_parser( VAMB, @@ -2393,7 +2440,7 @@ def main(): default binner based on a variational autoencoder. See the paper 'Improved metagenome binning and assembly using deep variational autoencoders'""", add_help=False, - usage="%(prog)s [options]", + #usage="%(prog)s [options]", description="""Bin using a VAE that merges composition and abundance information. Required arguments: Outdir, at least one composition input and at least one abundance input""", @@ -2413,7 +2460,7 @@ def main(): taxonomy informed binner based on a bi-modal variational autoencoder. See the paper 'TaxVAMB: taxonomic annotations improve metagenome binning'""", add_help=False, - usage="%(prog)s [options]", + #usage="%(prog)s [options]", description="""Bin using a semi-supervised VAEVAE model that merges composition, abundance and taxonomic information. 
Required arguments: Outdir, taxonomy, at least one composition input and at least one abundance input""", @@ -2432,7 +2479,7 @@ def main(): AVAMB, help=argparse.SUPPRESS, add_help=False, - usage="%(prog)s [options]", + #usage="%(prog)s [options]", ) general_group = add_general_arguments(vaeaae_parser) add_minlength(general_group) @@ -2511,9 +2558,17 @@ def main(): add_composition_npz_argument(abundance_parser) add_abundance_args_nonpz(abundance_parser) + train_parser = partial_part.add_parser( + "train", help="Do training without clustering", add_help=False + ) + general_group = add_training_arguments(train_parser) + train_parser.add_argument('--abundance_file', type=str, help='Input filename') + train_parser.add_argument('--composition_file', type=str, help='Input filename') + train_parser.add_argument('--outdir', type=str, help='Output directory') + args = parser.parse_args() - if args.subcommand == TAXOMETER: + if args.subcommand == TAXOMETER: opt = TaxometerOptions.from_args(args) runner = partial(run_taxonomy_predictor, opt) run(runner, opt.general) @@ -2549,6 +2604,12 @@ def main(): opt = PartialAbundanceOptions.from_args(args) runner = partial(run_partial_abundance, opt) run(runner, opt.general) + elif args.partial_part == "train": + logger.info("Almost thereeeee!!!") + train(args) + #opt = PartialTrainOptions.from_args(args) + #runner = partial(run_partial_train, opt) + #run(runner, opt.general) else: # TODO: Add abundance # TODO: Add encoding w. VAE From f97e0b0f1aeeae507beafd1bc986d048d33c3236 Mon Sep 17 00:00:00 2001 From: Elek Lamoureux Date: Fri, 11 Jul 2025 13:23:13 -0400 Subject: [PATCH 2/7] Added more functions and divided some older functions to add the ability to use different options. Work in progress. 
--- vamb/__main__.py | 227 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 168 insertions(+), 59 deletions(-) diff --git a/vamb/__main__.py b/vamb/__main__.py index b7555adf..a8561219 100755 --- a/vamb/__main__.py +++ b/vamb/__main__.py @@ -360,12 +360,13 @@ class GeneralOptions: @classmethod def from_args(cls, args: argparse.Namespace): return cls( - typeasserted(args.outdir, Path), + typeasserted(Path(args.outdir), Path), typeasserted(args.nthreads, int), typeasserted(args.seed, int), typeasserted(args.cuda, bool), ) - + def training_args_assertions(cls, args: argparse.Namespace): + None def __init__( self, out_dir: Path, @@ -793,7 +794,6 @@ def from_args(cls, args: argparse.Namespace): ClusterOptions.from_args(args), BinOutputOptions.from_args(comp, args), ) - # We do not have BasicTrainingOptions because that is model-specific def __init__( self, @@ -809,6 +809,52 @@ def __init__( self.clustering = clustering self.output = output +class TrainingCommonOptions: + def __init__(self, general: GeneralOptions, comp: CompositionPath, abundance: AbundancePath): + self.general = general + self.comp = comp + self.abundance = abundance + +class PartialTrainingOptions: + def __init__( + self, + general: GeneralOptions, + common: TrainingCommonOptions, + comp: CompositionPath, + abundance: AbundancePath, + min_contig_length: MinContigLength, + vae: VAEOptions + ): + self.general = general + self.common = common + self.comp = comp + self.abundance = abundance + self.min_contig_length = min_contig_length + self.vae = vae + @classmethod + def from_args(cls, args: argparse.Namespace): + general = GeneralOptions.from_args(args) + comp = CompositionPath(Path(args.composition_file)) + abundance_path = Path(args.abundance_file) + min_contig_length = MinContigLength.from_args(args) + basic = BasicTrainingOptions.from_args_vae(args) + vae = VAEOptions.from_args(basic, args) + + + abundance = AbundanceOptions( + bampaths=None, + bamdir=None, + abundance_tsv=None, + 
abundancepath=abundance_path, + min_alignment_id=0.0, + min_contig_length=min_contig_length, + refcheck=False + ) + + common = TrainingCommonOptions(general, comp, abundance) + return cls(general, common, comp, abundance, min_contig_length, vae) + + class BinDefaultOptions: @classmethod @@ -948,6 +994,51 @@ def __init__( self.output = output self.algorithm = algorithm +def calc_tnf_train_only( + options: PartialTrainingOptions, + outdir: Path, +) -> vamb.parsecontigs.Composition: + begintime = time.time() + logger.info("Loading TNF") + #logger.info(f"\tMinimum sequence length: {options.min_contig_length.n}") + + path = options.comp + if isinstance(path, CompositionPath): + logger.info(f'\tLoading composition from npz at: "{path.path}"') + composition = vamb.parsecontigs.Composition.load(path.path) + composition.filter_min_length(options.min_contig_length.n) + else: + raise TypeError("Training-only mode requires a CompositionPath (precomputed .npz)") + if composition.nseqs < MINIMUM_SEQS: + err = ( + f"Found only {composition.nseqs} contigs, but Vamb currently requires at least " + f"{MINIMUM_SEQS} to work correctly. " + "If you have this few sequences in a metagenomic assembly, " + "it's probably an error somewhere in your workflow." + ) + logger.error(err) + raise ValueError(err) + + # Warn the user if any contigs have been observed, which is smaller + # than the threshold. + if not np.all(composition.metadata.mask): + n_removed = len(composition.metadata.mask) - np.sum(composition.metadata.mask) + message = ( + f"The minimum sequence length has been set to {options.min_contig_length.n}, " + f"but {n_removed} sequences fell below this threshold and was filtered away." 
+ "\nBetter results are obtained if the sequence file is filtered to the minimum " + "sequence length before mapping.\n" + ) + logger.opt(raw=True).info("\n") + logger.warning(message) + + elapsed = round(time.time() - begintime, 2) + logger.info( + f"\tKept {composition.count_bases()} bases in {composition.nseqs} sequences" + ) + logger.info(f"\tProcessed TNF in {elapsed} seconds.\n") + + return composition def calc_tnf( options: CompositionOptions | PartialCompositionOptions, @@ -1076,47 +1167,6 @@ def calc_abundance( return abundance -def train(args): - logger.info("It works!!") - - composition = vamb.parsecontigs.Composition.load(args.composition_file) - abundance = vamb.parsebam.Abundance.load(args.abundance_file, composition.metadata.refhash) - - data_loader = vamb.encode.make_dataloader( - abundance.matrix, - composition.matrix, - composition.metadata.lengths, - batchsize=256, - destroy=True, - cuda=False, - ) - - vae_options = VAEOptions( - basic_options=BasicTrainingOptions( - num_epochs=4, - starting_batch_size=256, - batch_steps=[1, 3], - ), - nhiddens=[512, 512], - nlatent=32, - alpha=0.01, - beta=200.0, - dropout=0.2, - ) - - vamb_options = GeneralOptions( - out_dir=Path(args.outdir), - n_threads=4, - seed=42, - cuda=False, - ) - - os.makedirs(args.outdir, exist_ok=True) - - trainvae(vae_options, vamb_options, data_loader) - logger.info("Latent space written to latent.npz") - - def load_composition_and_abundance( vamb_options: GeneralOptions, comp_options: CompositionOptions, @@ -1137,6 +1187,21 @@ def load_composition_and_abundance( ) return (composition, abundance) +def load_composition_and_abundance_train_only( + opt: PartialTrainingOptions, +) -> Tuple[vamb.parsecontigs.Composition, vamb.parsebam.Abundance]: + composition = calc_tnf_train_only( + opt, + opt.general.out_dir.path + ) + + abundance = calc_abundance( + opt.abundance, + opt.general.out_dir.path, + composition.metadata, + opt.general.n_threads, + ) + return (composition, abundance) def 
load_markers( options: MarkerOptions, @@ -1444,15 +1509,7 @@ def run_partial_abundance(opt: PartialAbundanceOptions): calc_abundance( opt, opt.general.out_dir.path, composition.metadata, opt.general.n_threads ) - - -def run_bin_default(opt: BinDefaultOptions): - composition, abundance = load_composition_and_abundance( - vamb_options=opt.common.general, - comp_options=opt.common.comp, - abundance_options=opt.common.abundance, - binsplitter=opt.common.output.binsplitter, - ) +def run_train_vae(opt: BinDefaultOptions, composition: CompositionPath, abundance: AbundancePath): data_loader = vamb.encode.make_dataloader( abundance.matrix, composition.matrix, @@ -1471,6 +1528,12 @@ def run_bin_default(opt: BinDefaultOptions): del composition, abundance assert comp_metadata.nseqs == len(latent) + return latent + +def run_cluster_and_write_files(latent, opt: BinDefaultOptions, composition: CompositionPath): + + comp_metadata = composition.metadata + assert comp_metadata.nseqs == len(latent) cluster_and_write_files( opt.common.clustering, opt.common.output.binsplitter, @@ -1483,6 +1546,15 @@ def run_bin_default(opt: BinDefaultOptions): FastaOutput.try_from_common(opt.common), None, ) +def run_bin_default(opt: BinDefaultOptions): + composition, abundance = load_composition_and_abundance( + vamb_options=opt.common.general, + comp_options=opt.common.comp, + abundance_options=opt.common.abundance, + binsplitter=opt.common.output.binsplitter, + ) + latent = run_train_vae(opt, composition, abundance) + run_cluster_and_write_files(latent, opt, composition) del latent @@ -2059,11 +2131,30 @@ def add_abundance_arguments(subparser: argparse.ArgumentParser): def add_training_arguments(subparser: argparse.ArgumentParser): trainingos = subparser.add_argument_group(title="Training options") + add_minlength(trainingos) trainingos.add_argument( "--print_test", type=str, help="Print test output" ) + trainingos.add_argument( + "-p", + dest="nthreads", + metavar="", + type=int, + 
default=DEFAULT_THREADS, + help="number of threads to use where customizable", + ) + trainingos.add_argument( + "--seed", + metavar="", + type=int, + default=int.from_bytes(os.urandom(7), "little"), + help="Random seed (determinism not guaranteed)", + ) + trainingos.add_argument( + "--cuda", help="Use GPU to train & cluster [False]", action="store_true" + ) def add_taxonomy_arguments(subparser: argparse.ArgumentParser, taxonomy_only=False): taxonomys = subparser.add_argument_group(title="Taxonomy input") @@ -2173,7 +2264,11 @@ def add_vae_arguments(subparser: argparse.ArgumentParser): trainos = subparser.add_argument_group(title="Training options", description=None) trainos.add_argument( - "-e", dest="nepochs", metavar="", type=int, default=300, help=argparse.SUPPRESS + "-e", dest="nepochs", + metavar="", + type=int, + default=70, + help=argparse.SUPPRESS, ) trainos.add_argument( "-t", @@ -2565,6 +2660,15 @@ def main(): train_parser.add_argument('--abundance_file', type=str, help='Input filename') train_parser.add_argument('--composition_file', type=str, help='Input filename') train_parser.add_argument('--outdir', type=str, help='Output directory') + train_parser.add_argument('--nepochs', type=int, default=70, help='Number of training epochs (default: 70)') + train_parser.add_argument('--batchsize', type=int, default=64, help='Batchsize') + train_parser.add_argument('--batchsteps', type=int, nargs='+', default=[], help='Epochs at which to update batch statistics (default: none)') + train_parser.add_argument('--nhiddens', type=int, nargs='*', default=None, help='List of hidden layer sizes for the VAE (e.g., --nhiddens 512 256)') + parser.add_argument('--nlatent', type=int, default=32, help='Size of the VAE latent space (default: 32)') + parser.add_argument('--alpha', type=float, default=None, help='Beta-VAE alpha parameter (optional, default: None)') + parser.add_argument('--beta', type=float, default=1.0, help='KL-divergence weight for Beta-VAE (default: 1.0)') + 
parser.add_argument('--dropout', type=float, default=None, help='Dropout rate for VAE layers (optional, default: None)') + args = parser.parse_args() @@ -2605,11 +2709,16 @@ def main(): runner = partial(run_partial_abundance, opt) run(runner, opt.general) elif args.partial_part == "train": - logger.info("Almost thereeeee!!!") - train(args) - #opt = PartialTrainOptions.from_args(args) - #runner = partial(run_partial_train, opt) - #run(runner, opt.general) + starting_time = time.time() + opt = PartialTrainingOptions.from_args(args) + os.makedirs(args.outdir, exist_ok=False) + composition, abundance = load_composition_and_abundance_train_only(opt) + run_train_vae(opt, composition, abundance) + logger.info(f"Saved latent.npz to /{args.outdir}") + ending_time = time.time() + elapsed = ending_time - starting_time + logger.info(f"Completed training in {elapsed:.2f} seconds") + else: # TODO: Add abundance # TODO: Add encoding w. VAE From 1e6ddd31ea0b50a3a616f4fdbd4aace8854f4f5d Mon Sep 17 00:00:00 2001 From: Elek Lamoureux Date: Mon, 28 Jul 2025 17:12:32 +0200 Subject: [PATCH 3/7] Got rid of unnecessary code, combined similar functions, moved functionality so it makes more sense --- vamb/__main__.py | 183 +++++++++++++++++++++-------------------------- 1 file changed, 81 insertions(+), 102 deletions(-) diff --git a/vamb/__main__.py b/vamb/__main__.py index a8561219..010ff454 100755 --- a/vamb/__main__.py +++ b/vamb/__main__.py @@ -365,8 +365,10 @@ def from_args(cls, args: argparse.Namespace): typeasserted(args.seed, int), typeasserted(args.cuda, bool), ) + def training_args_assertions(cls, args: argparse.Namespace): None + def __init__( self, out_dir: Path, @@ -794,6 +796,7 @@ def from_args(cls, args: argparse.Namespace): ClusterOptions.from_args(args), BinOutputOptions.from_args(comp, args), ) + # We do not have BasicTrainingOptions because that is model-specific def __init__( self, @@ -807,15 +810,18 @@ def __init__( self.comp = comp self.abundance = abundance 
self.clustering = clustering - self.output = output + class TrainingCommonOptions: - def __init__(self, general: GeneralOptions, comp: CompositionPath, abundance: AbundancePath): + def __init__( + self, general: GeneralOptions, comp: CompositionPath, abundance: AbundancePath + ): self.general = general self.comp = comp self.abundance = abundance -class PartialTrainingOptions: + +class PartialTrainingOptions: def __init__( self, general: GeneralOptions, @@ -823,7 +829,8 @@ def __init__( comp: CompositionPath, abundance: AbundancePath, min_contig_length: MinContigLength, - vae: VAEOptions + vae: VAEOptions, + outdir: Path, ): self.general = general self.common = common @@ -831,6 +838,8 @@ def __init__( self.abundance = abundance self.min_contig_length = min_contig_length self.vae = vae + self.outdir = outdir + @classmethod def from_args(cls, args: argparse.Namespace): general = GeneralOptions.from_args(args) @@ -839,8 +848,8 @@ def from_args(cls, args: argparse.Namespace): min_contig_length = MinContigLength.from_args(args) basic = BasicTrainingOptions.from_args_vae(args) vae = VAEOptions.from_args(basic, args) + outdir = Path(args.outdir) - abundance = AbundanceOptions( bampaths=None, bamdir=None, @@ -848,13 +857,18 @@ def from_args(cls, args: argparse.Namespace): abundancepath=abundance_path, min_alignment_id=0.0, min_contig_length=min_contig_length, - refcheck=False + refcheck=False, ) - common = TrainingCommonOptions(general, comp, abundance) - return cls(general, common, comp, abundance, min_contig_length, vae) + return cls(general, common, comp, abundance, min_contig_length, vae, outdir) + + def validate_comp_is_npz(self) -> Path: + if not isinstance(self.comp, CompositionPath): + raise TypeError( + "Training-only mode requires a CompositionPath (precomputed .npz)" + ) + return self.comp - class BinDefaultOptions: @classmethod @@ -994,68 +1008,25 @@ def __init__( self.output = output self.algorithm = algorithm -def calc_tnf_train_only( - options: 
PartialTrainingOptions, - outdir: Path, -) -> vamb.parsecontigs.Composition: - begintime = time.time() - logger.info("Loading TNF") - #logger.info(f"\tMinimum sequence length: {options.min_contig_length.n}") - - path = options.comp - if isinstance(path, CompositionPath): - logger.info(f'\tLoading composition from npz at: "{path.path}"') - composition = vamb.parsecontigs.Composition.load(path.path) - composition.filter_min_length(options.min_contig_length.n) - else: - raise TypeError("Training-only mode requires a CompositionPath (precomputed .npz)") - if composition.nseqs < MINIMUM_SEQS: - err = ( - f"Found only {composition.nseqs} contigs, but Vamb currently requires at least " - f"{MINIMUM_SEQS} to work correctly. " - "If you have this few sequences in a metagenomic assembly, " - "it's probably an error somewhere in your workflow." - ) - logger.error(err) - raise ValueError(err) - - # Warn the user if any contigs have been observed, which is smaller - # than the threshold. - if not np.all(composition.metadata.mask): - n_removed = len(composition.metadata.mask) - np.sum(composition.metadata.mask) - message = ( - f"The minimum sequence length has been set to {options.min_contig_length.n}, " - f"but {n_removed} sequences fell below this threshold and was filtered away." 
- "\nBetter results are obtained if the sequence file is filtered to the minimum " - "sequence length before mapping.\n" - ) - logger.opt(raw=True).info("\n") - logger.warning(message) - - elapsed = round(time.time() - begintime, 2) - logger.info( - f"\tKept {composition.count_bases()} bases in {composition.nseqs} sequences" - ) - logger.info(f"\tProcessed TNF in {elapsed} seconds.\n") - - return composition def calc_tnf( - options: CompositionOptions | PartialCompositionOptions, + options: CompositionOptions | PartialCompositionOptions | PartialTrainingOptions, outdir: Path, - binsplitter: Optional[vamb.vambtools.BinSplitter], + binsplitter: Optional[vamb.vambtools.BinSplitter] = None, + train_only: bool = False, ) -> vamb.parsecontigs.Composition: begintime = time.time() logger.info("Loading TNF") - logger.info(f"\tMinimum sequence length: {options.min_contig_length.n}") - - path = options.path - + if not train_only: + logger.info(f"\tMinimum sequence length: {options.min_contig_length.n}") + path = options.path + else: + path = options.validate_comp_is_npz() if isinstance(path, CompositionPath): logger.info(f'\tLoading composition from npz at: "{path.path}"') composition = vamb.parsecontigs.Composition.load(path.path) composition.filter_min_length(options.min_contig_length.n) - else: + elif not train_only: assert isinstance(path, FASTAPath) logger.info(f"\tLoading data from FASTA file {path.path}") with vamb.vambtools.Reader(path.path) as file: @@ -1064,11 +1035,16 @@ def calc_tnf( ) assert outdir is not None composition.save(outdir.joinpath("composition.npz")) + else: + raise TypeError( + "In training-only mode, path must be a CompositionPath with a valid .npz file" + ) - # Initialize binsplitter on the identifiers. Only done if we actually need to binsplit - # later. - if binsplitter is not None: - binsplitter.initialize(composition.metadata.identifiers) + if not train_only: + # Initialize binsplitter on the identifiers. 
Only done if we actually need to binsplit + # later. + if binsplitter is not None: + binsplitter.initialize(composition.metadata.identifiers) if composition.nseqs < MINIMUM_SEQS: err = ( @@ -1167,6 +1143,7 @@ def calc_abundance( return abundance + def load_composition_and_abundance( vamb_options: GeneralOptions, comp_options: CompositionOptions, @@ -1187,13 +1164,11 @@ def load_composition_and_abundance( ) return (composition, abundance) + def load_composition_and_abundance_train_only( opt: PartialTrainingOptions, ) -> Tuple[vamb.parsecontigs.Composition, vamb.parsebam.Abundance]: - composition = calc_tnf_train_only( - opt, - opt.general.out_dir.path - ) + composition = calc_tnf(opt, opt.general.out_dir.path, train_only=True) abundance = calc_abundance( opt.abundance, @@ -1203,6 +1178,7 @@ def load_composition_and_abundance_train_only( ) return (composition, abundance) + def load_markers( options: MarkerOptions, comp_metadata: vamb.parsecontigs.CompositionMetaData, @@ -1509,7 +1485,11 @@ def run_partial_abundance(opt: PartialAbundanceOptions): calc_abundance( opt, opt.general.out_dir.path, composition.metadata, opt.general.n_threads ) -def run_train_vae(opt: BinDefaultOptions, composition: CompositionPath, abundance: AbundancePath): + + +def run_train_vae( + opt: BinDefaultOptions, composition: CompositionPath, abundance: AbundancePath +): data_loader = vamb.encode.make_dataloader( abundance.matrix, composition.matrix, @@ -1528,10 +1508,14 @@ def run_train_vae(opt: BinDefaultOptions, composition: CompositionPath, abundanc del composition, abundance assert comp_metadata.nseqs == len(latent) + logger.info(f"Saved latent.npz to {opt.outdir}") + return latent -def run_cluster_and_write_files(latent, opt: BinDefaultOptions, composition: CompositionPath): - + +def run_cluster_and_write_files( + latent, opt: BinDefaultOptions, composition: CompositionPath +): comp_metadata = composition.metadata assert comp_metadata.nseqs == len(latent) cluster_and_write_files( @@ 
-1546,12 +1530,14 @@ def run_cluster_and_write_files(latent, opt: BinDefaultOptions, composition: Com FastaOutput.try_from_common(opt.common), None, ) + + def run_bin_default(opt: BinDefaultOptions): composition, abundance = load_composition_and_abundance( - vamb_options=opt.common.general, - comp_options=opt.common.comp, - abundance_options=opt.common.abundance, - binsplitter=opt.common.output.binsplitter, + vamb_options=opt.common.general, + comp_options=opt.common.comp, + abundance_options=opt.common.abundance, + binsplitter=opt.common.output.binsplitter, ) latent = run_train_vae(opt, composition, abundance) run_cluster_and_write_files(latent, opt, composition) @@ -2129,14 +2115,11 @@ def add_abundance_arguments(subparser: argparse.ArgumentParser): ) return subparser + def add_training_arguments(subparser: argparse.ArgumentParser): trainingos = subparser.add_argument_group(title="Training options") add_minlength(trainingos) - trainingos.add_argument( - "--print_test", - type=str, - help="Print test output" - ) + trainingos.add_argument("--print_test", type=str, help="Print test output") trainingos.add_argument( "-p", dest="nthreads", @@ -2144,7 +2127,7 @@ def add_training_arguments(subparser: argparse.ArgumentParser): type=int, default=DEFAULT_THREADS, help="number of threads to use where customizable", - ) + ) trainingos.add_argument( "--seed", metavar="", @@ -2156,6 +2139,7 @@ def add_training_arguments(subparser: argparse.ArgumentParser): "--cuda", help="Use GPU to train & cluster [False]", action="store_true" ) + def add_taxonomy_arguments(subparser: argparse.ArgumentParser, taxonomy_only=False): taxonomys = subparser.add_argument_group(title="Taxonomy input") taxonomys.add_argument( @@ -2264,7 +2248,8 @@ def add_vae_arguments(subparser: argparse.ArgumentParser): trainos = subparser.add_argument_group(title="Training options", description=None) trainos.add_argument( - "-e", dest="nepochs", + "-e", + dest="nepochs", metavar="", type=int, default=70, @@ 
-2526,7 +2511,9 @@ def main(): """, add_help=False, ) - subparsers_model = vaevae_parserbin_parser.add_subparsers(dest="model_subcommand", required=True) + subparsers_model = vaevae_parserbin_parser.add_subparsers( + dest="model_subcommand", required=True + ) vae_parser = subparsers_model.add_parser( VAMB, @@ -2535,7 +2522,7 @@ def main(): default binner based on a variational autoencoder. See the paper 'Improved metagenome binning and assembly using deep variational autoencoders'""", add_help=False, - #usage="%(prog)s [options]", + # usage="%(prog)s [options]", description="""Bin using a VAE that merges composition and abundance information. Required arguments: Outdir, at least one composition input and at least one abundance input""", @@ -2555,7 +2542,7 @@ def main(): taxonomy informed binner based on a bi-modal variational autoencoder. See the paper 'TaxVAMB: taxonomic annotations improve metagenome binning'""", add_help=False, - #usage="%(prog)s [options]", + # usage="%(prog)s [options]", description="""Bin using a semi-supervised VAEVAE model that merges composition, abundance and taxonomic information. 
Required arguments: Outdir, taxonomy, at least one composition input and at least one abundance input""", @@ -2574,7 +2561,7 @@ def main(): AVAMB, help=argparse.SUPPRESS, add_help=False, - #usage="%(prog)s [options]", + # usage="%(prog)s [options]", ) general_group = add_general_arguments(vaeaae_parser) add_minlength(general_group) @@ -2657,22 +2644,13 @@ def main(): "train", help="Do training without clustering", add_help=False ) general_group = add_training_arguments(train_parser) - train_parser.add_argument('--abundance_file', type=str, help='Input filename') - train_parser.add_argument('--composition_file', type=str, help='Input filename') - train_parser.add_argument('--outdir', type=str, help='Output directory') - train_parser.add_argument('--nepochs', type=int, default=70, help='Number of training epochs (default: 70)') - train_parser.add_argument('--batchsize', type=int, default=64, help='Batchsize') - train_parser.add_argument('--batchsteps', type=int, nargs='+', default=[], help='Epochs at which to update batch statistics (default: none)') - train_parser.add_argument('--nhiddens', type=int, nargs='*', default=None, help='List of hidden layer sizes for the VAE (e.g., --nhiddens 512 256)') - parser.add_argument('--nlatent', type=int, default=32, help='Size of the VAE latent space (default: 32)') - parser.add_argument('--alpha', type=float, default=None, help='Beta-VAE alpha parameter (optional, default: None)') - parser.add_argument('--beta', type=float, default=1.0, help='KL-divergence weight for Beta-VAE (default: 1.0)') - parser.add_argument('--dropout', type=float, default=None, help='Dropout rate for VAE layers (optional, default: None)') - - + add_vae_arguments(train_parser) + train_parser.add_argument("--abundance_file", type=str, help="Input filename") + train_parser.add_argument("--composition_file", type=str, help="Input filename") + train_parser.add_argument("--outdir", type=str, help="Output directory") args = parser.parse_args() - if 
args.subcommand == TAXOMETER: + if args.subcommand == TAXOMETER: opt = TaxometerOptions.from_args(args) runner = partial(run_taxonomy_predictor, opt) run(runner, opt.general) @@ -2714,7 +2692,6 @@ def main(): os.makedirs(args.outdir, exist_ok=False) composition, abundance = load_composition_and_abundance_train_only(opt) run_train_vae(opt, composition, abundance) - logger.info(f"Saved latent.npz to /{args.outdir}") ending_time = time.time() elapsed = ending_time - starting_time logger.info(f"Completed training in {elapsed:.2f} seconds") @@ -2732,3 +2709,5 @@ def main(): if __name__ == "__main__": main() + + From 492f703a1059624480882a389dce9fc7ad8b0a3d Mon Sep 17 00:00:00 2001 From: Elek Lamoureux Date: Wed, 13 Aug 2025 14:01:32 +0200 Subject: [PATCH 4/7] Deleted majority of previous code and got training and clustering working --- vamb/__main__.py | 152 ++++++++++++++++++++++------------------------- 1 file changed, 72 insertions(+), 80 deletions(-) diff --git a/vamb/__main__.py b/vamb/__main__.py index 010ff454..4d2b4e44 100755 --- a/vamb/__main__.py +++ b/vamb/__main__.py @@ -366,9 +366,6 @@ def from_args(cls, args: argparse.Namespace): typeasserted(args.cuda, bool), ) - def training_args_assertions(cls, args: argparse.Namespace): - None - def __init__( self, out_dir: Path, @@ -810,6 +807,7 @@ def __init__( self.comp = comp self.abundance = abundance self.clustering = clustering + self.output = output class TrainingCommonOptions: @@ -1010,23 +1008,21 @@ def __init__( def calc_tnf( - options: CompositionOptions | PartialCompositionOptions | PartialTrainingOptions, + options: CompositionOptions | PartialCompositionOptions, outdir: Path, - binsplitter: Optional[vamb.vambtools.BinSplitter] = None, - train_only: bool = False, + binsplitter: Optional[vamb.vambtools.BinSplitter], ) -> vamb.parsecontigs.Composition: begintime = time.time() logger.info("Loading TNF") - if not train_only: - logger.info(f"\tMinimum sequence length: {options.min_contig_length.n}") - path = 
options.path - else: - path = options.validate_comp_is_npz() + logger.info(f"\tMinimum sequence length: {options.min_contig_length.n}") + + path = options.path + if isinstance(path, CompositionPath): logger.info(f'\tLoading composition from npz at: "{path.path}"') composition = vamb.parsecontigs.Composition.load(path.path) composition.filter_min_length(options.min_contig_length.n) - elif not train_only: + else: assert isinstance(path, FASTAPath) logger.info(f"\tLoading data from FASTA file {path.path}") with vamb.vambtools.Reader(path.path) as file: @@ -1035,16 +1031,11 @@ def calc_tnf( ) assert outdir is not None composition.save(outdir.joinpath("composition.npz")) - else: - raise TypeError( - "In training-only mode, path must be a CompositionPath with a valid .npz file" - ) - if not train_only: - # Initialize binsplitter on the identifiers. Only done if we actually need to binsplit - # later. - if binsplitter is not None: - binsplitter.initialize(composition.metadata.identifiers) + # Initialize binsplitter on the identifiers. Only done if we actually need to binsplit + # later. 
+ if binsplitter is not None: + binsplitter.initialize(composition.metadata.identifiers) if composition.nseqs < MINIMUM_SEQS: err = ( @@ -1165,20 +1156,6 @@ def load_composition_and_abundance( return (composition, abundance) -def load_composition_and_abundance_train_only( - opt: PartialTrainingOptions, -) -> Tuple[vamb.parsecontigs.Composition, vamb.parsebam.Abundance]: - composition = calc_tnf(opt, opt.general.out_dir.path, train_only=True) - - abundance = calc_abundance( - opt.abundance, - opt.general.out_dir.path, - composition.metadata, - opt.general.n_threads, - ) - return (composition, abundance) - - def load_markers( options: MarkerOptions, comp_metadata: vamb.parsecontigs.CompositionMetaData, @@ -1508,8 +1485,6 @@ def run_train_vae( del composition, abundance assert comp_metadata.nseqs == len(latent) - logger.info(f"Saved latent.npz to {opt.outdir}") - return latent @@ -1532,16 +1507,19 @@ def run_cluster_and_write_files( ) -def run_bin_default(opt: BinDefaultOptions): +def load_train_bin(opt: BinDefaultOptions, partial_mode: str = "default", latent=None): composition, abundance = load_composition_and_abundance( vamb_options=opt.common.general, comp_options=opt.common.comp, abundance_options=opt.common.abundance, binsplitter=opt.common.output.binsplitter, ) - latent = run_train_vae(opt, composition, abundance) - run_cluster_and_write_files(latent, opt, composition) - del latent + if partial_mode == "default" or partial_mode == "train": + latent = run_train_vae(opt, composition, abundance) + + if partial_mode == "default" or partial_mode == "cluster": + run_cluster_and_write_files(latent, opt, composition) + del latent def run_bin_aae(opt: BinAvambOptions): @@ -2116,30 +2094,6 @@ def add_abundance_arguments(subparser: argparse.ArgumentParser): return subparser -def add_training_arguments(subparser: argparse.ArgumentParser): - trainingos = subparser.add_argument_group(title="Training options") - add_minlength(trainingos) - 
trainingos.add_argument("--print_test", type=str, help="Print test output") - trainingos.add_argument( - "-p", - dest="nthreads", - metavar="", - type=int, - default=DEFAULT_THREADS, - help="number of threads to use where customizable", - ) - trainingos.add_argument( - "--seed", - metavar="", - type=int, - default=int.from_bytes(os.urandom(7), "little"), - help="Random seed (determinism not guaranteed)", - ) - trainingos.add_argument( - "--cuda", help="Use GPU to train & cluster [False]", action="store_true" - ) - - def add_taxonomy_arguments(subparser: argparse.ArgumentParser, taxonomy_only=False): taxonomys = subparser.add_argument_group(title="Taxonomy input") taxonomys.add_argument( @@ -2334,6 +2288,20 @@ def add_predictor_arguments(subparser: argparse.ArgumentParser): return subparser +def add_cluster_only_args(subparser: argparse.ArgumentParser): + c_only_arg = subparser.add_argument_group( + title="Clustering options", description=None + ) + c_only_arg.add_argument( + "--latent", + dest="latent_file", + required=True, + metavar="", + type=lambda p: np.load(Path(p)), + help="Path to latent.npz file", + ) + + def add_clustering_arguments(subparser: argparse.ArgumentParser): # Clustering arguments clusto = subparser.add_argument_group(title="Clustering options", description=None) @@ -2643,11 +2611,33 @@ def main(): train_parser = partial_part.add_parser( "train", help="Do training without clustering", add_help=False ) - general_group = add_training_arguments(train_parser) + + train_parser.set_defaults(model_subcommand=VAMB) + + general_group = add_general_arguments(train_parser) + add_minlength(general_group) + add_composition_arguments(train_parser) + add_abundance_arguments(train_parser) + add_taxonomy_arguments(train_parser) + add_bin_output_arguments(train_parser) add_vae_arguments(train_parser) - train_parser.add_argument("--abundance_file", type=str, help="Input filename") - train_parser.add_argument("--composition_file", type=str, help="Input filename") 
- train_parser.add_argument("--outdir", type=str, help="Output directory") + add_clustering_arguments(train_parser) + + cluster_parser = partial_part.add_parser( + "cluster", help="Cluster after training", add_help=False + ) + cluster_parser.set_defaults(model_subcommand=VAMB) + + general_group = add_general_arguments(cluster_parser) + add_minlength(general_group) + add_composition_arguments(cluster_parser) + add_abundance_arguments(cluster_parser) + add_taxonomy_arguments(cluster_parser) + add_bin_output_arguments(cluster_parser) + add_vae_arguments(cluster_parser) + add_clustering_arguments(cluster_parser) + add_cluster_only_args(cluster_parser) + args = parser.parse_args() if args.subcommand == TAXOMETER: @@ -2661,7 +2651,7 @@ def main(): sys.exit(1) if model == VAMB: opt = BinDefaultOptions.from_args(args) - runner = partial(run_bin_default, opt) + runner = partial(load_train_bin, opt) run(runner, opt.common.general) elif model == TAXVAMB: opt = BinTaxVambOptions.from_args(args) @@ -2687,14 +2677,18 @@ def main(): runner = partial(run_partial_abundance, opt) run(runner, opt.general) elif args.partial_part == "train": - starting_time = time.time() - opt = PartialTrainingOptions.from_args(args) - os.makedirs(args.outdir, exist_ok=False) - composition, abundance = load_composition_and_abundance_train_only(opt) - run_train_vae(opt, composition, abundance) - ending_time = time.time() - elapsed = ending_time - starting_time - logger.info(f"Completed training in {elapsed:.2f} seconds") + opt = BinDefaultOptions.from_args(args) + runner = partial(load_train_bin, opt, partial_mode="train") + run(runner, opt.common.general) + elif args.partial_part == "cluster": + opt = BinDefaultOptions.from_args(args) + runner = partial( + load_train_bin, + opt, + partial_mode="cluster", + latent=args.latent_file["arr_0"], + ) + run(runner, opt.common.general) else: # TODO: Add abundance @@ -2709,5 +2703,3 @@ def main(): if __name__ == "__main__": main() - - From 
4ac46838de47d4f95e159b7c06ecbb56529efaf2 Mon Sep 17 00:00:00 2001 From: Elek Lamoureux Date: Thu, 14 Aug 2025 11:12:42 +0200 Subject: [PATCH 5/7] Removed a lot of code that was made redundant by my changes. Used classes instead of strings for mode checking. --- vamb/__main__.py | 98 +++++++++++++++--------------------------------- 1 file changed, 30 insertions(+), 68 deletions(-) diff --git a/vamb/__main__.py b/vamb/__main__.py index 4d2b4e44..04f1b3fb 100755 --- a/vamb/__main__.py +++ b/vamb/__main__.py @@ -810,64 +810,6 @@ def __init__( self.output = output -class TrainingCommonOptions: - def __init__( - self, general: GeneralOptions, comp: CompositionPath, abundance: AbundancePath - ): - self.general = general - self.comp = comp - self.abundance = abundance - - -class PartialTrainingOptions: - def __init__( - self, - general: GeneralOptions, - common: TrainingCommonOptions, - comp: CompositionPath, - abundance: AbundancePath, - min_contig_length: MinContigLength, - vae: VAEOptions, - outdir: Path, - ): - self.general = general - self.common = common - self.comp = comp - self.abundance = abundance - self.min_contig_length = min_contig_length - self.vae = vae - self.outdir = outdir - - @classmethod - def from_args(cls, args: argparse.Namespace): - general = GeneralOptions.from_args(args) - comp = CompositionPath(Path(args.composition_file)) - abundance_path = Path(args.abundance_file) - min_contig_length = MinContigLength.from_args(args) - basic = BasicTrainingOptions.from_args_vae(args) - vae = VAEOptions.from_args(basic, args) - outdir = Path(args.outdir) - - abundance = AbundanceOptions( - bampaths=None, - bamdir=None, - abundance_tsv=None, - abundancepath=abundance_path, - min_alignment_id=0.0, - min_contig_length=min_contig_length, - refcheck=False, - ) - common = TrainingCommonOptions(general, comp, abundance) - return cls(general, common, comp, abundance, min_contig_length, vae, outdir) - - def validate_comp_is_npz(self) -> Path: - if not 
isinstance(self.comp, CompositionPath): - raise TypeError( - "Training-only mode requires a CompositionPath (precomputed .npz)" - ) - return self.comp - - class BinDefaultOptions: @classmethod def from_args(cls, args: argparse.Namespace): @@ -1507,19 +1449,39 @@ def run_cluster_and_write_files( ) -def load_train_bin(opt: BinDefaultOptions, partial_mode: str = "default", latent=None): +class RunDefault: + pass + + +class RunTrain: + pass + + +class RunCluster: + def __init__(self, latent: Path): + self.latent = latent + + +def load_train_bin( + opt: BinDefaultOptions, partial_mode: Union[RunDefault, RunTrain, RunCluster] +): composition, abundance = load_composition_and_abundance( vamb_options=opt.common.general, comp_options=opt.common.comp, abundance_options=opt.common.abundance, binsplitter=opt.common.output.binsplitter, ) - if partial_mode == "default" or partial_mode == "train": + + latent = None + + if isinstance(partial_mode, (RunDefault, RunTrain)): latent = run_train_vae(opt, composition, abundance) - if partial_mode == "default" or partial_mode == "cluster": + if isinstance(partial_mode, RunCluster): + latent = partial_mode.latent + + if isinstance(partial_mode, (RunDefault, RunCluster)): run_cluster_and_write_files(latent, opt, composition) - del latent def run_bin_aae(opt: BinAvambOptions): @@ -2490,7 +2452,7 @@ def main(): default binner based on a variational autoencoder. See the paper 'Improved metagenome binning and assembly using deep variational autoencoders'""", add_help=False, - # usage="%(prog)s [options]", + usage="%(prog)s [options]", description="""Bin using a VAE that merges composition and abundance information. 
Required arguments: Outdir, at least one composition input and at least one abundance input""", @@ -2607,7 +2569,6 @@ def main(): add_minlength(general_group) add_composition_npz_argument(abundance_parser) add_abundance_args_nonpz(abundance_parser) - train_parser = partial_part.add_parser( "train", help="Do training without clustering", add_help=False ) @@ -2651,7 +2612,7 @@ def main(): sys.exit(1) if model == VAMB: opt = BinDefaultOptions.from_args(args) - runner = partial(load_train_bin, opt) + runner = partial(load_train_bin, opt, RunDefault()) run(runner, opt.common.general) elif model == TAXVAMB: opt = BinTaxVambOptions.from_args(args) @@ -2668,6 +2629,8 @@ def main(): runner = partial(run_reclustering, opt) run(runner, opt.general) elif args.subcommand == PARTIAL: + # TODO: args.partial_part is not a string, so why is it being + # compared to a string here?? if args.partial_part == "composition": opt = PartialCompositionOptions.from_args(args) runner = partial(run_partial_composition, opt) @@ -2678,15 +2641,14 @@ def main(): run(runner, opt.general) elif args.partial_part == "train": opt = BinDefaultOptions.from_args(args) - runner = partial(load_train_bin, opt, partial_mode="train") + runner = partial(load_train_bin, opt, partial_mode=RunTrain()) run(runner, opt.common.general) elif args.partial_part == "cluster": opt = BinDefaultOptions.from_args(args) runner = partial( load_train_bin, opt, - partial_mode="cluster", - latent=args.latent_file["arr_0"], + partial_mode=RunCluster(args.latent_file["arr_0"]), ) run(runner, opt.common.general) From 44dba9abde5875704f255501ac493ed81f44557b Mon Sep 17 00:00:00 2001 From: Elek Lamoureux Date: Thu, 14 Aug 2025 12:27:11 +0200 Subject: [PATCH 6/7] Tweaked some code in main and added testcases --- .github/workflows/cli_vamb.yml | 10 ++++++++++ vamb/__main__.py | 6 +++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cli_vamb.yml b/.github/workflows/cli_vamb.yml index 4b988af4..f2881ab2 
100644 --- a/.github/workflows/cli_vamb.yml +++ b/.github/workflows/cli_vamb.yml @@ -61,3 +61,13 @@ jobs: vamb recluster --outdir outdir_recluster --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz --latent_path outdir_taxvamb/vaevae_latent.npz --clusters_path outdir_taxvamb/vaevae_clusters_split.tsv --markers markers_mock.npz --algorithm kmeans --minfasta 200000 ls -la outdir_recluster cat outdir_recluster/log.txt + - name: Run Partial Train + run: | + vamb partial train --outdir latent_file --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz -e 3 -q + ls -la latent_file + cat latent_file/log.txt + - name: Run Partial Cluster + run: | + vamb partial cluster --outdir outdir_cluster --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz --latent latent_file/latent.npz + ls -la outdir_cluster + cat outdir_cluster/log.txt diff --git a/vamb/__main__.py b/vamb/__main__.py index 04f1b3fb..61eff26f 100755 --- a/vamb/__main__.py +++ b/vamb/__main__.py @@ -2168,7 +2168,7 @@ def add_vae_arguments(subparser: argparse.ArgumentParser): dest="nepochs", metavar="", type=int, - default=70, + default=300, help=argparse.SUPPRESS, ) trainos.add_argument( @@ -2472,7 +2472,7 @@ def main(): taxonomy informed binner based on a bi-modal variational autoencoder. See the paper 'TaxVAMB: taxonomic annotations improve metagenome binning'""", add_help=False, - # usage="%(prog)s [options]", + usage="%(prog)s [options]", description="""Bin using a semi-supervised VAEVAE model that merges composition, abundance and taxonomic information. 
Required arguments: Outdir, taxonomy, at least one composition input and at least one abundance input""", @@ -2491,7 +2491,7 @@ def main(): AVAMB, help=argparse.SUPPRESS, add_help=False, - # usage="%(prog)s [options]", + usage="%(prog)s [options]", ) general_group = add_general_arguments(vaeaae_parser) add_minlength(general_group) From 89199e5fc549da969c0e459c1f10c6137a4b55d2 Mon Sep 17 00:00:00 2001 From: Elek Lamoureux Date: Thu, 14 Aug 2025 14:19:42 +0200 Subject: [PATCH 7/7] Fixed type errors and added partial composition as a testcase --- .github/workflows/cli_vamb.yml | 5 +++++ vamb/__main__.py | 16 ++++++++-------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/.github/workflows/cli_vamb.yml b/.github/workflows/cli_vamb.yml index f2881ab2..bbbebccd 100644 --- a/.github/workflows/cli_vamb.yml +++ b/.github/workflows/cli_vamb.yml @@ -61,6 +61,11 @@ jobs: vamb recluster --outdir outdir_recluster --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz --latent_path outdir_taxvamb/vaevae_latent.npz --clusters_path outdir_taxvamb/vaevae_clusters_split.tsv --markers markers_mock.npz --algorithm kmeans --minfasta 200000 ls -la outdir_recluster cat outdir_recluster/log.txt + - name: Run Partial Composition + run: | + vamb partial composition --outdir outdir_composition --fasta catalogue_mock.fna.gz + ls -la outdir_composition + cat outdir_composition/log.txt - name: Run Partial Train run: | vamb partial train --outdir latent_file --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz -e 3 -q diff --git a/vamb/__main__.py b/vamb/__main__.py index 61eff26f..9feb7753 100755 --- a/vamb/__main__.py +++ b/vamb/__main__.py @@ -1407,8 +1407,10 @@ def run_partial_abundance(opt: PartialAbundanceOptions): def run_train_vae( - opt: BinDefaultOptions, composition: CompositionPath, abundance: AbundancePath -): + opt: BinDefaultOptions, + composition: vamb.parsecontigs.Composition, + abundance: vamb.parsebam.Abundance, +) -> np.ndarray: data_loader = 
vamb.encode.make_dataloader( abundance.matrix, composition.matrix, @@ -1431,7 +1433,7 @@ def run_train_vae( def run_cluster_and_write_files( - latent, opt: BinDefaultOptions, composition: CompositionPath + latent, opt: BinDefaultOptions, composition: vamb.parsecontigs.Composition ): comp_metadata = composition.metadata assert comp_metadata.nseqs == len(latent) @@ -1472,13 +1474,11 @@ def load_train_bin( binsplitter=opt.common.output.binsplitter, ) - latent = None - if isinstance(partial_mode, (RunDefault, RunTrain)): latent = run_train_vae(opt, composition, abundance) if isinstance(partial_mode, RunCluster): - latent = partial_mode.latent + latent = vamb.vambtools.read_npz(partial_mode.latent) if isinstance(partial_mode, (RunDefault, RunCluster)): run_cluster_and_write_files(latent, opt, composition) @@ -2259,7 +2259,7 @@ def add_cluster_only_args(subparser: argparse.ArgumentParser): dest="latent_file", required=True, metavar="", - type=lambda p: np.load(Path(p)), + type=Path, help="Path to latent.npz file", ) @@ -2648,7 +2648,7 @@ def main(): runner = partial( load_train_bin, opt, - partial_mode=RunCluster(args.latent_file["arr_0"]), + partial_mode=RunCluster(args.latent_file), ) run(runner, opt.common.general)