CompOmics · RalfG · Apr 24, 2025 · Jun 5, 2025 · Jun 6, 2025 · Sep 23, 2025
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,5 +1,4 @@
-include deeplc/mods/*
+include deeplc/models/*
 include deeplc/package_data/**/*
 include deeplc/unimod/*
-include deeplc/aa_comp_rel.csv
 include deeplc/baseline_performance/*
diff --git a/TODO.md b/TODO.md
@@ -0,0 +1,32 @@
+# DeepLC 4.0 to do list
+
+## Alpha 1 release
+
+- [x] scikit-learn-like API for calibration
+- [x] Streamlined use of Data and DataLoader
+- [x] Module with PyTorch-level model operations (train, predict, load, save)
+- [x] Refactor core functions to use new model operations module
+
+## Alpha 2 release
+
+- [ ] Add architecture module for training new models
+- [ ] Select calibration/fine-tuning PSMs from the main psm_list, using score/q-value to pick the best ones?
+- [ ] Add CLI commands with file I/O
+
+## Beta release
+
+- [ ] Ensure mapping of MaxQuant modifications
+- [ ] Update README
+- [ ] Update documentation to reflect new structure
+- [ ] Update examples to use new structure
+
+## Stable release
+
+- [ ] Decent unit test coverage
+- [ ] Update GUI (argparse is no longer used -> find an alternative to Gooey?)
+- [ ] Update Streamlit app
+
+## Open questions / issues
+
+- [ ] Should the library feature be reintroduced?
+- [ ] Integration into IM2Deep
diff --git a/config.ini b/config.ini
diff --git a/deeplc/__init__.py b/deeplc/__init__.py
@@ -1,8 +1,12 @@
-__all__ = ["DeepLC"]
+"""DeepLC: Retention time prediction for peptides carrying any modification."""
 
 from importlib.metadata import version
 
-__version__ = version("deeplc")
+from deeplc.core import calibrate_and_predict, finetune_and_predict, predict
 
-
-from deeplc.deeplc import DeepLC
+__version__: str = version("deeplc")
+__all__: list[str] = [
+    "predict",
+    "calibrate_and_predict",
+    "finetune_and_predict",
+]
diff --git a/deeplc/__main__.py b/deeplc/__main__.py
@@ -1,35 +1,12 @@
 """Main command line interface to DeepLC."""
 
-__author__ = ["Robbin Bouwmeester", "Ralf Gabriels"]
-__credits__ = [
-    "Robbin Bouwmeester",
-    "Ralf Gabriels",
-    "Prof. Lennart Martens",
-    "Sven Degroeve",
-]
-__license__ = "Apache License, Version 2.0"
-__maintainer__ = ["Robbin Bouwmeester", "Ralf Gabriels"]
-__email__ = ["Robbin.Bouwmeester@ugent.be", "Ralf.Gabriels@ugent.be"]
-
 import logging
-import os
 import sys
-import warnings
-
-import pandas as pd
-from psm_utils.io import read_file
-from psm_utils.io.peptide_record import peprec_to_proforma
-from psm_utils.psm import PSM
-from psm_utils.psm_list import PSMList
 
-from deeplc import DeepLC, __version__
-from deeplc._argument_parser import parse_arguments
-from deeplc._exceptions import DeepLCError
+LOGGER = logging.getLogger(__name__)
 
-logger = logging.getLogger(__name__)
 
-
-def setup_logging(passed_level):
+def _setup_logging(passed_level):
     log_mapping = {
         "critical": logging.CRITICAL,
         "error": logging.ERROR,
@@ -51,181 +28,3 @@ def setup_logging(passed_level):
         datefmt="%Y-%m-%d %H:%M:%S",
         level=log_mapping[passed_level.lower()],
     )
-
-
-def main(gui=False):
-    """Main function for the CLI."""
-    argu = parse_arguments(gui=gui)
-
-    setup_logging(argu.log_level)
-
-    # Reset logging levels if DEBUG (see deeplc.py)
-    if argu.log_level.lower() == "debug":
-        os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0"
-        logging.getLogger("tensorflow").setLevel(logging.DEBUG)
-        warnings.filterwarnings("default", category=DeprecationWarning)
-        warnings.filterwarnings("default", category=FutureWarning)
-        warnings.filterwarnings("default", category=UserWarning)
-    else:
-        os.environ["KMP_WARNINGS"] = "0"
-
-    try:
-        run(**vars(argu))
-    except DeepLCError as e:
-        logger.exception(e)
-        sys.exit(1)
-
-
-def run(
-    file_pred,
-    file_cal=None,
-    file_pred_out=None,
-    file_model=None,
-    pygam_calibration=True,
-    split_cal=50,
-    dict_divider=50,
-    use_library=None,
-    write_library=False,
-    batch_num=50000,
-    n_threads=None,
-    transfer_learning=False,
-    log_level="info",
-    verbose=True,
-):
-    """Run DeepLC."""
-    logger.info("Using DeepLC version %s", __version__)
-    logger.debug("Using %i CPU threads", n_threads)
-
-    df_pred = False
-    df_cal = False
-    first_line_pred = ""
-    first_line_cal = ""
-
-    if not file_cal and file_model != None:
-        fm_dict = {}
-        sel_group = ""
-        for fm in file_model:
-            if len(sel_group) == 0:
-                sel_group = "_".join(fm.split("_")[:-1])
-                fm_dict[sel_group] = fm
-                continue
-            m_group = "_".join(fm.split("_")[:-1])
-            if m_group == sel_group:
-                fm_dict[m_group] = fm
-        file_model = fm_dict
-
-    with open(file_pred) as f:
-        first_line_pred = f.readline().strip()
-    if file_cal:
-        with open(file_cal) as f:
-            first_line_cal = f.readline().strip()
-
-    if "modifications" in first_line_pred.split(",") and "seq" in first_line_pred.split(","):
-        # Read input files
-        df_pred = pd.read_csv(file_pred)
-        if len(df_pred.columns) < 2:
-            df_pred = pd.read_csv(file_pred, sep=" ")
-        df_pred = df_pred.fillna("")
-        file_pred = ""
-
-        list_of_psms = []
-        for seq, mod, ident in zip(df_pred["seq"], df_pred["modifications"], df_pred.index):
-            list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq, mod), spectrum_id=ident))
-        psm_list_pred = PSMList(psm_list=list_of_psms)
-        df_pred = None
-    else:
-        psm_list_pred = read_file(file_pred)
-        if "msms" in file_pred and ".txt" in file_pred:
-            mapper = pd.read_csv(
-                os.path.join(
-                    os.path.dirname(os.path.realpath(__file__)),
-                    "unimod/map_mq_file.csv",
-                ),
-                index_col=0,
-            )["value"].to_dict()
-            psm_list_pred.rename_modifications(mapper)
-
-    # Allow for calibration file to be empty (undefined), fill in if/elif if present
-    psm_list_cal = []
-    if (
-        "modifications" in first_line_cal.split(",")
-        and "seq" in first_line_cal.split(",")
-        and file_cal
-    ):
-        df_cal = pd.read_csv(file_cal)
-        if len(df_cal.columns) < 2:
-            df_cal = pd.read_csv(df_cal, sep=" ")
-        df_cal = df_cal.fillna("")
-        file_cal = ""
-
-        list_of_psms = []
-        for seq, mod, ident, tr in zip(
-            df_cal["seq"], df_cal["modifications"], df_cal.index, df_cal["tr"]
-        ):
-            list_of_psms.append(
-                PSM(
-                    peptidoform=peprec_to_proforma(seq, mod),
-                    spectrum_id=ident,
-                    retention_time=tr,
-                )
-            )
-        psm_list_cal = PSMList(psm_list=list_of_psms)
-        df_cal = None
-    elif file_cal:
-        psm_list_cal = read_file(file_cal)
-        if "msms" in file_cal and ".txt" in file_cal:
-            mapper = pd.read_csv(
-                os.path.join(
-                    os.path.dirname(os.path.realpath(__file__)),
-                    "unimod/map_mq_file.csv",
-                ),
-                index_col=0,
-            )["value"].to_dict()
-            psm_list_cal.rename_modifications(mapper)
-
-    # Make the DeepLC object that will handle making predictions and calibration
-    dlc = DeepLC(
-        path_model=file_model,
-        cnn_model=True,
-        split_cal=split_cal,
-        dict_cal_divider=dict_divider,
-        write_library=write_library,
-        use_library=use_library,
-        batch_num=batch_num,
-        n_jobs=n_threads,
-        verbose=verbose,
-        deeplc_retrain=transfer_learning,
-    )
-
-    # Calibrate the original model based on the new retention times
-    if len(psm_list_cal) > 0:
-        logger.info("Selecting best model and calibrating predictions...")
-        logger.info("Initiating transfer learning?")
-        dlc.calibrate_preds(psm_list=psm_list_cal)
-
-    # Make predictions; calibrated or uncalibrated
-    logger.info("Making predictions using model: %s", dlc.model)
-    if len(psm_list_cal) > 0:
-        preds = dlc._make_preds(seq_df=df_pred, infile=file_pred, psm_list=psm_list_pred)
-    else:
-        preds = dlc._make_preds(
-            seq_df=df_pred,
-            infile=file_pred,
-            psm_list=psm_list_pred,
-            calibrate=False,
-        )
-
-    # df_pred["predicted_tr"] = preds
-    logger.info("Writing predictions to file: %s", file_pred_out)
-
-    file_pred_out = open(file_pred_out, "w")
-    file_pred_out.write("Sequence proforma,predicted retention time\n")
-    for psm, tr in zip(psm_list_pred, preds):
-        file_pred_out.write(f"{psm.peptidoform.proforma},{tr}\n")
-    file_pred_out.close()
-
-    logger.info("DeepLC finished!")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/deeplc/_architecture.py b/deeplc/_architecture.py
@@ -0,0 +1 @@
+# TODO: Add architectures for training from scratch
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		# TODO: Add architectures for training from scratch