Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/workflows/pr-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ jobs:
steps:
- name: Check out repository
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0

- name: Verify required folders exist
run: |
Expand Down
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "slide2vec/hs2p"]
path = slide2vec/hs2p
url = https://github.com/clemsgrs/hs2p.git
35 changes: 18 additions & 17 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -35,27 +35,14 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
zip unzip \
git \
openssh-server \
build-essential \
ninja-build \
python3-pip python3-dev python-is-python3 \
&& mkdir /var/run/sshd \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

# install ASAP
ARG ASAP_URL=https://github.com/computationalpathologygroup/ASAP/releases/download/ASAP-2.2-(Nightly)/ASAP-2.2-Ubuntu2204.deb
RUN apt-get update && curl -L ${ASAP_URL} -o /tmp/ASAP.deb && apt-get install --assume-yes /tmp/ASAP.deb && \
SITE_PACKAGES=`python3 -c "import sysconfig; print(sysconfig.get_paths()['purelib'])"` && \
printf "/opt/ASAP/bin/\n" > "${SITE_PACKAGES}/asap.pth" && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# clone & install relevant repositories
RUN git clone https://github.com/prov-gigapath/prov-gigapath.git /home/user/prov-gigapath

# add folders to python path
ENV PYTHONPATH="/home/user/prov-gigapath:/home/user/CONCH:/home/user/MUSK:$PYTHONPATH"

WORKDIR /opt/app/

# you can add any Python dependencies to requirements.in
Expand All @@ -70,9 +57,16 @@ RUN python -m pip install \
--requirement /opt/app/requirements.in \
&& rm -rf /home/user/.cache/pip

COPY --chown=user:user . /opt/app/
COPY --chown=user:user slide2vec /opt/app/slide2vec
COPY --chown=user:user setup.py /opt/app/setup.py
COPY --chown=user:user setup.cfg /opt/app/setup.cfg
COPY --chown=user:user pyproject.toml /opt/app/pyproject.toml
COPY --chown=user:user MANIFEST.in /opt/app/MANIFEST.in
COPY --chown=user:user README.md /opt/app/README.md
COPY --chown=user:user LICENSE /opt/app/LICENSE

RUN python -m pip install /opt/app
RUN python -m pip install flash-attn>=2.5.8 --no-build-isolation
RUN python -m pip install 'flash-attn>=2.7.1,<=2.8.0' --no-build-isolation


##########################
Expand Down Expand Up @@ -112,12 +106,19 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

# install ASAP
ARG ASAP_URL=https://github.com/computationalpathologygroup/ASAP/releases/download/ASAP-2.2-(Nightly)/ASAP-2.2-Ubuntu2204.deb
RUN apt-get update && curl -L ${ASAP_URL} -o /tmp/ASAP.deb && apt-get install --assume-yes /tmp/ASAP.deb && \
SITE_PACKAGES=`python3 -c "import sysconfig; print(sysconfig.get_paths()['purelib'])"` && \
printf "/opt/ASAP/bin/\n" > "${SITE_PACKAGES}/asap.pth" && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# copy Python libs & entrypoints from build stage (includes flash-attn, your deps, ASAP .pth)
COPY --from=build /usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages
COPY --from=build /usr/local/bin /usr/local/bin

# copy ASAP installation, app code, and prov-gigapath
COPY --from=build /opt/ASAP /opt/ASAP
# copy app code and prov-gigapath
COPY --from=build /opt/app /opt/app
COPY --from=build /home/user/prov-gigapath /home/user/prov-gigapath

Expand Down
9 changes: 8 additions & 1 deletion Dockerfile.ci
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@

# prov-gigapath
RUN git clone https://github.com/prov-gigapath/prov-gigapath.git /home/user/prov-gigapath
ENV PYTHONPATH="/home/user/prov-gigapath:/home/user/CONCH:/home/user/MUSK:${PYTHONPATH}"

Check warning on line 48 in Dockerfile.ci

View workflow job for this annotation

GitHub Actions / docker-test

Variables should be defined before their use

UndefinedVar: Usage of undefined variable '$PYTHONPATH' More info: https://docs.docker.com/go/dockerfile/rule/undefined-var/

# Python deps & app
RUN python -m pip install --upgrade pip setuptools pip-tools \
Expand All @@ -58,7 +58,14 @@
--requirement /opt/app/requirements.in \
&& rm -rf /root/.cache/pip

COPY --chown=user:user . /opt/app/
COPY --chown=user:user slide2vec /opt/app/slide2vec
COPY --chown=user:user setup.py /opt/app/setup.py
COPY --chown=user:user setup.cfg /opt/app/setup.cfg
COPY --chown=user:user pyproject.toml /opt/app/pyproject.toml
COPY --chown=user:user MANIFEST.in /opt/app/MANIFEST.in
COPY --chown=user:user README.md /opt/app/README.md
COPY --chown=user:user LICENSE /opt/app/LICENSE

RUN python -m pip install /opt/app

USER user
Expand Down
19 changes: 15 additions & 4 deletions slide2vec/aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,20 @@ def get_args_parser(add_help: bool = True):
"--config-file", default="", metavar="FILE", help="path to config file"
)
parser.add_argument(
"--run-id",
"--output-dir",
type=str,
default="",
help="Name of output subdirectory",
default=None,
help="output directory to save logs and checkpoints",
)
parser.add_argument(
"--run-on-cpu", action="store_true", help="run inference on cpu"
)
parser.add_argument(
"opts",
help="Modify config options at the end of the command using \"path.key=value\".",
default=None,
nargs=argparse.REMAINDER,
)
return parser


Expand All @@ -54,7 +60,7 @@ def main(args):
# setup configuration
run_on_cpu = args.run_on_cpu
cfg = get_cfg_from_file(args.config_file)
output_dir = Path(cfg.output_dir, args.run_id)
output_dir = Path(cfg.output_dir, args.output_dir)
cfg.output_dir = str(output_dir)

coordinates_dir = Path(cfg.output_dir, "coordinates")
Expand All @@ -71,6 +77,11 @@ def main(args):
process_list.is_file()
), "Process list CSV not found. Ensure tiling has been run."
process_df = pd.read_csv(process_list)
if "aggregation_status" not in process_df.columns:
process_df["aggregation_status"] = ["tbp"] * len(process_df)
cols = ["wsi_name", "wsi_path", "mask_path", "tiling_status", "feature_status", "aggregation_status", "error", "traceback"]
process_df = process_df[cols]

skip_feature_aggregation = process_df["aggregation_status"].str.contains("success").all()

if skip_feature_aggregation and distributed.is_main_process():
Expand Down
19 changes: 15 additions & 4 deletions slide2vec/embed.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,20 @@ def get_args_parser(add_help: bool = True):
"--config-file", default="", metavar="FILE", help="path to config file"
)
parser.add_argument(
"--run-id",
"--output-dir",
type=str,
default="",
help="Name of output subdirectory",
default=None,
help="output directory to save logs and checkpoints",
)
parser.add_argument(
"--run-on-cpu", action="store_true", help="run inference on cpu"
)
parser.add_argument(
"opts",
help="Modify config options at the end of the command using \"path.key=value\".",
default=None,
nargs=argparse.REMAINDER,
)
return parser


Expand Down Expand Up @@ -123,7 +129,7 @@ def main(args):
# setup configuration
run_on_cpu = args.run_on_cpu
cfg = get_cfg_from_file(args.config_file)
output_dir = Path(cfg.output_dir, args.run_id)
output_dir = Path(cfg.output_dir, args.output_dir)
cfg.output_dir = str(output_dir)

if not run_on_cpu:
Expand All @@ -148,6 +154,11 @@ def main(args):
process_list.is_file()
), "Process list CSV not found. Ensure tiling has been run."
process_df = pd.read_csv(process_list)
if "feature_status" not in process_df.columns:
process_df["feature_status"] = ["tbp"] * len(process_df)
cols = ["wsi_name", "wsi_path", "mask_path", "tiling_status", "feature_status", "error", "traceback"]
process_df = process_df[cols]

skip_feature_extraction = process_df["feature_status"].str.contains("success").all()

if skip_feature_extraction:
Expand Down
1 change: 1 addition & 0 deletions slide2vec/hs2p
Submodule hs2p added at f1a9de
52 changes: 28 additions & 24 deletions slide2vec/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,24 +48,26 @@ def log_progress(features_dir: Path, stop_event: threading.Event, log_interval:
time.sleep(log_interval)


def run_tiling(config_file, run_id):
print("Running tiling.py...")
def run_tiling(root_dir, config_file, output_dir):
print(f"Running tiling.py from {root_dir}...")
cmd = [
sys.executable,
"slide2vec/tiling.py",
"--run-id",
run_id,
"hs2p/tiling.py",
"--config-file",
config_file,
os.path.abspath(config_file),
"--output-dir",
os.path.abspath(output_dir),
"--skip-datetime",
"--skip-logging",
"wandb.enable=false", # disable wandb to avoid duplicate logging
]
proc = subprocess.Popen(cmd)
proc.wait()
proc = subprocess.run(cmd, cwd=root_dir)
if proc.returncode != 0:
print("Slide tiling failed. Exiting.")
sys.exit(proc.returncode)


def run_feature_extraction(config_file, run_id, run_on_cpu: False):
def run_feature_extraction(config_file, output_dir, run_on_cpu: False):
print("Running embed.py...")
# find a free port
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
Expand All @@ -78,19 +80,19 @@ def run_feature_extraction(config_file, run_id, run_on_cpu: False):
f"--master_port={free_port}",
"--nproc_per_node=gpu",
"slide2vec/embed.py",
"--run-id",
run_id,
"--config-file",
config_file,
os.path.abspath(config_file),
"--output-dir",
os.path.abspath(output_dir),
]
if run_on_cpu:
cmd = [
sys.executable,
"slide2vec/embed.py",
"--run-id",
run_id,
"--config-file",
config_file,
os.path.abspath(config_file),
"--output-dir",
os.path.abspath(output_dir),
"--run-on-cpu",
]
# launch in its own process group.
Expand All @@ -107,16 +109,16 @@ def run_feature_extraction(config_file, run_id, run_on_cpu: False):
sys.exit(proc.returncode)


def run_feature_aggregation(config_file, run_id, run_on_cpu: False):
def run_feature_aggregation(config_file, output_dir, run_on_cpu: False):
print("Running aggregate.py...")
# build the command that runs aggregate.py
cmd = [
sys.executable,
"slide2vec/aggregate.py",
"--run-id",
run_id,
"--config-file",
config_file,
os.path.abspath(config_file),
"--output-dir",
os.path.abspath(output_dir),
]
if run_on_cpu:
cmd.append("--run-on-cpu")
Expand All @@ -137,15 +139,17 @@ def run_feature_aggregation(config_file, run_id, run_on_cpu: False):
def main(args):
run_on_cpu = args.run_on_cpu

cfg, cfg_path, run_id = setup(args)
cfg, cfg_path = setup(args)
output_dir = Path(cfg.output_dir)

hf_login()

run_tiling(cfg_path, run_id)
root_dir = "slide2vec/hs2p"
run_tiling(root_dir, cfg_path, output_dir)

print("Tiling completed.")
print("=+=" * 10)

output_dir = Path(cfg.output_dir)
features_dir = output_dir / "features"
if cfg.wandb.enable:
stop_event = threading.Event()
Expand All @@ -154,10 +158,10 @@ def main(args):
)
log_thread.start()

run_feature_extraction(cfg_path, run_id, run_on_cpu)
run_feature_extraction(cfg_path, output_dir, run_on_cpu)

if cfg.model.level == "slide":
run_feature_aggregation(cfg_path, run_id, run_on_cpu)
run_feature_aggregation(cfg_path, output_dir, run_on_cpu)
print("Feature extraction completed.")
print("=+=" * 10)
else:
Expand Down
Loading