Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
include deeplc/mods/*
include deeplc/models/*
include deeplc/package_data/**/*
include deeplc/unimod/*
include deeplc/aa_comp_rel.csv
include deeplc/baseline_performance/*
32 changes: 32 additions & 0 deletions TODO.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# DeepLC 4.0 to do list

## Alpha 1 release

- [x] scikit-learn-like API for calibration
- [x] Streamlined use of Data and DataLoader
- [x] Module with PyTorch-level model operations (train, predict, load, save)
- [x] Refactor core functions to use new model operations module

## Alpha 2 release

- [ ] Add architecture module for training new models
- [ ] Get calibration/finetuning PSMs from main psm_list using score/q-value for best selection?
- [ ] Add CLI commands with file I/O

## Beta release

- [ ] Ensure mapping of MaxQuant modifications
- [ ] Update README
- [ ] Update documentation to reflect new structure
- [ ] Update examples to use new structure

## Stable release

- [ ] Decent coverage of unit tests
- [ ] Update GUI (no use of argparse -> alternative for Gooey?)
- [ ] Update Streamlit app

## Open questions / issues

- [ ] Should the library feature be reintroduced?
- [ ] Implementation into IM2Deep
32 changes: 0 additions & 32 deletions config.ini

This file was deleted.

12 changes: 8 additions & 4 deletions deeplc/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
__all__ = ["DeepLC"]
"""DeepLC: Retention time prediction for peptides carrying any modification."""

from importlib.metadata import version

__version__ = version("deeplc")
from deeplc.core import calibrate_and_predict, finetune_and_predict, predict


from deeplc.deeplc import DeepLC
__version__: str = version("deeplc")
__all__: list[str] = [
"predict",
"calibrate_and_predict",
"finetune_and_predict",
]
205 changes: 2 additions & 203 deletions deeplc/__main__.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,12 @@
"""Main command line interface to DeepLC."""

__author__ = ["Robbin Bouwmeester", "Ralf Gabriels"]
__credits__ = [
"Robbin Bouwmeester",
"Ralf Gabriels",
"Prof. Lennart Martens",
"Sven Degroeve",
]
__license__ = "Apache License, Version 2.0"
__maintainer__ = ["Robbin Bouwmeester", "Ralf Gabriels"]
__email__ = ["Robbin.Bouwmeester@ugent.be", "Ralf.Gabriels@ugent.be"]

import logging
import os
import sys
import warnings

import pandas as pd
from psm_utils.io import read_file
from psm_utils.io.peptide_record import peprec_to_proforma
from psm_utils.psm import PSM
from psm_utils.psm_list import PSMList

from deeplc import DeepLC, __version__
from deeplc._argument_parser import parse_arguments
from deeplc._exceptions import DeepLCError
LOGGER = logging.getLogger(__name__)

logger = logging.getLogger(__name__)


def setup_logging(passed_level):
def _setup_logging(passed_level):
log_mapping = {
"critical": logging.CRITICAL,
"error": logging.ERROR,
Expand All @@ -51,181 +28,3 @@ def setup_logging(passed_level):
datefmt="%Y-%m-%d %H:%M:%S",
level=log_mapping[passed_level.lower()],
)


def main(gui=False):
    """Entry point of the command line interface.

    Parses the command line arguments, configures logging, adjusts the
    verbosity-related environment variables and warning filters, and then
    hands all parsed arguments to :func:`run`.

    Args:
        gui: Forwarded unchanged to ``parse_arguments``.

    Exits the process with status 1 when ``run`` raises ``DeepLCError``.
    """
    argu = parse_arguments(gui=gui)
    setup_logging(argu.log_level)

    # Reset logging levels if DEBUG (see deeplc.py)
    debug_requested = argu.log_level.lower() == "debug"
    if not debug_requested:
        os.environ["KMP_WARNINGS"] = "0"
    else:
        os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0"
        logging.getLogger("tensorflow").setLevel(logging.DEBUG)
        # Same categories, same order as before, so the resulting filter
        # list is unchanged.
        for category in (DeprecationWarning, FutureWarning, UserWarning):
            warnings.filterwarnings("default", category=category)

    try:
        run(**vars(argu))
    except DeepLCError as err:
        # Log the full traceback and signal failure to the calling shell.
        logger.exception(err)
        sys.exit(1)


def _load_maxquant_modification_mapper():
    """Load the MaxQuant modification mapping stored next to this module.

    Returns:
        dict: The ``value`` column of ``unimod/map_mq_file.csv``, keyed by
        the first column of that file.
    """
    return pd.read_csv(
        os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            "unimod/map_mq_file.csv",
        ),
        index_col=0,
    )["value"].to_dict()


def run(
    file_pred,
    file_cal=None,
    file_pred_out=None,
    file_model=None,
    pygam_calibration=True,
    split_cal=50,
    dict_divider=50,
    use_library=None,
    write_library=False,
    batch_num=50000,
    n_threads=None,
    transfer_learning=False,
    log_level="info",
    verbose=True,
):
    """Run DeepLC: read PSMs, optionally calibrate, predict and write a CSV.

    Input files whose first line, split on commas, contains both ``seq`` and
    ``modifications`` are read with pandas and converted to PSMs through
    ``peprec_to_proforma``; any other file is handed to
    ``psm_utils.io.read_file``. Files whose name contains both ``msms`` and
    ``.txt`` additionally get their modifications renamed with the MaxQuant
    mapping.

    Args:
        file_pred: Path of the file with peptides to predict.
        file_cal: Optional path of the calibration file. A comma-separated
            calibration file must also provide a ``tr`` column.
        file_pred_out: Path of the CSV file the predictions are written to.
        file_model: Passed to ``DeepLC`` as ``path_model``. When no
            calibration file is given and this is not ``None``, it is
            iterated and reduced to a dict keyed by the file name prefix
            (everything before the last ``_``) of the first entry.
        pygam_calibration: Accepted for CLI compatibility; not used here.
        split_cal: Passed to ``DeepLC`` as ``split_cal``.
        dict_divider: Passed to ``DeepLC`` as ``dict_cal_divider``.
        use_library: Passed to ``DeepLC`` as ``use_library``.
        write_library: Passed to ``DeepLC`` as ``write_library``.
        batch_num: Passed to ``DeepLC`` as ``batch_num``.
        n_threads: Passed to ``DeepLC`` as ``n_jobs``.
        transfer_learning: Passed to ``DeepLC`` as ``deeplc_retrain``.
        log_level: Accepted for CLI compatibility; not used here.
        verbose: Passed to ``DeepLC`` as ``verbose``.
    """
    logger.info("Using DeepLC version %s", __version__)
    # ``%s`` rather than ``%i``: n_threads defaults to None, which ``%i``
    # cannot format (it would trigger a logging error at debug level).
    logger.debug("Using %s CPU threads", n_threads)

    # Stays False when the prediction file is not a comma-separated table;
    # the value is forwarded as-is to ``_make_preds``.
    df_pred = False
    first_line_cal = ""

    if not file_cal and file_model is not None:
        # Keep only models that share the file name prefix of the first model.
        fm_dict = {}
        sel_group = ""
        for fm in file_model:
            m_group = "_".join(fm.split("_")[:-1])
            if len(sel_group) == 0:
                sel_group = m_group
                fm_dict[sel_group] = fm
                continue
            if m_group == sel_group:
                fm_dict[m_group] = fm
        file_model = fm_dict

    # Peek at the header lines to decide how each file has to be parsed.
    with open(file_pred) as f:
        first_line_pred = f.readline().strip()
    if file_cal:
        with open(file_cal) as f:
            first_line_cal = f.readline().strip()

    pred_columns = first_line_pred.split(",")
    if "modifications" in pred_columns and "seq" in pred_columns:
        # Read input files
        df_pred = pd.read_csv(file_pred)
        if len(df_pred.columns) < 2:
            df_pred = pd.read_csv(file_pred, sep=" ")
        df_pred = df_pred.fillna("")
        file_pred = ""

        psm_list_pred = PSMList(
            psm_list=[
                PSM(peptidoform=peprec_to_proforma(seq, mod), spectrum_id=ident)
                for seq, mod, ident in zip(
                    df_pred["seq"], df_pred["modifications"], df_pred.index
                )
            ]
        )
        df_pred = None
    else:
        psm_list_pred = read_file(file_pred)
        if "msms" in file_pred and ".txt" in file_pred:
            psm_list_pred.rename_modifications(_load_maxquant_modification_mapper())

    # Allow for calibration file to be empty (undefined), fill in if/elif if present
    psm_list_cal = []
    cal_columns = first_line_cal.split(",")
    if file_cal and "modifications" in cal_columns and "seq" in cal_columns:
        df_cal = pd.read_csv(file_cal)
        if len(df_cal.columns) < 2:
            # Re-read from the path; the original passed the DataFrame itself
            # to ``read_csv`` here, which cannot work.
            df_cal = pd.read_csv(file_cal, sep=" ")
        df_cal = df_cal.fillna("")

        psm_list_cal = PSMList(
            psm_list=[
                PSM(
                    peptidoform=peprec_to_proforma(seq, mod),
                    spectrum_id=ident,
                    retention_time=tr,
                )
                for seq, mod, ident, tr in zip(
                    df_cal["seq"], df_cal["modifications"], df_cal.index, df_cal["tr"]
                )
            ]
        )
    elif file_cal:
        psm_list_cal = read_file(file_cal)
        if "msms" in file_cal and ".txt" in file_cal:
            psm_list_cal.rename_modifications(_load_maxquant_modification_mapper())

    # Make the DeepLC object that will handle making predictions and calibration
    dlc = DeepLC(
        path_model=file_model,
        cnn_model=True,
        split_cal=split_cal,
        dict_cal_divider=dict_divider,
        write_library=write_library,
        use_library=use_library,
        batch_num=batch_num,
        n_jobs=n_threads,
        verbose=verbose,
        deeplc_retrain=transfer_learning,
    )

    has_calibration = len(psm_list_cal) > 0

    # Calibrate the original model based on the new retention times
    if has_calibration:
        logger.info("Selecting best model and calibrating predictions...")
        logger.info("Initiating transfer learning?")
        dlc.calibrate_preds(psm_list=psm_list_cal)

    # Make predictions; calibrated or uncalibrated
    logger.info("Making predictions using model: %s", dlc.model)
    if has_calibration:
        preds = dlc._make_preds(seq_df=df_pred, infile=file_pred, psm_list=psm_list_pred)
    else:
        preds = dlc._make_preds(
            seq_df=df_pred,
            infile=file_pred,
            psm_list=psm_list_pred,
            calibrate=False,
        )

    logger.info("Writing predictions to file: %s", file_pred_out)

    # Context manager guarantees the output file is closed even if a write fails.
    with open(file_pred_out, "w") as out_handle:
        out_handle.write("Sequence proforma,predicted retention time\n")
        out_handle.writelines(
            f"{psm.peptidoform.proforma},{tr}\n" for psm, tr in zip(psm_list_pred, preds)
        )

    logger.info("DeepLC finished!")


# Run the CLI when this module is executed directly (this is deeplc/__main__.py).
if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions deeplc/_architecture.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# TODO: Add architectures for training from scratch
Loading