diff --git a/.github/workflows/build-tests.yml b/.github/workflows/build-tests.yml
index 5c4bbb13..ece7eb68 100644
--- a/.github/workflows/build-tests.yml
+++ b/.github/workflows/build-tests.yml
@@ -17,11 +17,11 @@ jobs:
         with:
           repo: 'WGLab/ContextSV'
           version: 'tags/v0.1.0'
-          file: 'TestData.zip'
+          file: 'SampleData.zip'

       - name: Unzip assets
         shell: bash --login {0}
-        run: unzip TestData.zip
+        run: unzip SampleData.zip -d tests

       - name: Set up conda (Miniconda only)
         uses: conda-incubator/setup-miniconda@v2
@@ -47,7 +47,7 @@ jobs:
           ls -l $CONDA_PREFIX/include/htslib
           make CONDA_PREFIX=$CONDA_PREFIX

-      - name: Run unit tests
+      - name: Test installation
        shell: bash --login {0}
        run: |
          source $(conda info --base)/etc/profile.d/conda.sh
@@ -55,6 +55,14 @@ jobs:
          ./build/contextsv --version
          ./build/contextsv --help

+      - name: Run tests
+        shell: bash --login {0}
+        run: |
+          source $(conda info --base)/etc/profile.d/conda.sh
+          conda activate contextsv
+          mkdir -p output
+          python -m pytest -s -v
+
      # run: |
      #   source $(conda info --base)/etc/profile.d/conda.sh
      #   conda activate contextsv
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 00000000..95933f28
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,21 @@
+# Use the miniconda container
+FROM continuumio/miniconda3:main
+
+# Version argument (set during build)
+ARG CONTEXTSV_VERSION
+
+WORKDIR /app
+
+RUN apt-get update
+RUN conda update conda
+
+# Install ContextSV
+RUN conda config --add channels wglab
+RUN conda config --add channels conda-forge
+RUN conda config --add channels bioconda
+RUN conda create -n contextsv python=3.9
+RUN echo "conda activate contextsv" >> ~/.bashrc
+SHELL ["/bin/bash", "--login", "-c"]
+RUN conda install -n contextsv -c wglab -c conda-forge -c bioconda contextsv=${CONTEXTSV_VERSION} && conda clean -afy
+
+ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "contextsv", "contextsv"]
diff --git a/Makefile b/Makefile
index 207f7e09..ae39b32b 100644
--- a/Makefile
+++ b/Makefile
@@ -4,13 +4,6 @@ SRC_DIR := $(CURDIR)/src
 BUILD_DIR := $(CURDIR)/build
 LIB_DIR := $(CURDIR)/lib

-# Version header
-VERSION := $(shell git describe --tags --always)
-VERSION_HEADER := $(INCL_DIR)/version.h
-.PHONY: $(VERSION_HEADER)
-	@echo "#pragma once" > $@
-	@echo "#define VERSION \"$(VERSION)\"" >> $@
-
 # Conda environment directories
 CONDA_PREFIX := $(shell echo $$CONDA_PREFIX)
 CONDA_INCL_DIR := $(CONDA_PREFIX)/include
@@ -50,4 +43,3 @@ $(BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp
 # Clean the build directory
 clean:
 	rm -rf $(BUILD_DIR)
-
\ No newline at end of file
diff --git a/README.md b/README.md
index 2cb17bf8..3a7e0cda 100644
--- a/README.md
+++ b/README.md
@@ -13,8 +13,32 @@ Class documentation is available at (optional: )")
-        sys.exit(1)
-
-    # The second argument is the input VCF file.
-    input_vcf = sys.argv[2]
-
-    # The third argument is the epsilon value for the DBSCAN clustering. If
-    # empty, set to 34.
-    epsilon = sys.argv[3] if len(sys.argv) >= 4 else 34
-
-    # The fourth argument is the suffix for the output file. If empty, set
-    # to ".merged"
-    suffix = sys.argv[4] if len(sys.argv) >= 5 else ".merged"
-
-    # Run the SV merger.
-    from python import sv_merger
-    sv_merger.sv_merger(input_vcf, mode='dbscan', eps=int(epsilon), suffix=suffix)
-
-# Run the program.
-main()
diff --git a/conda/meta.yaml b/conda/meta.yaml
index 698a62e4..d217ee43 100644
--- a/conda/meta.yaml
+++ b/conda/meta.yaml
@@ -1,46 +1,45 @@
-{% set version = "1.0.0" %}
-{% set sha256 = "" %}
+{% set version = environ.get('GIT_DESCRIBE_TAG').lstrip('v') %}
+{% set revision = environ.get('GIT_FULL_HASH') %}

 package:
-  name: longreadsum
+  name: contextsv
   version: {{ version }}

 source:
-  url: https://github.com/WGLab/LongReadSum/archive/refs/tags/v{{ version }}.tar.gz
-  sha256: '{{ sha256 }}'
+  path: ../
+  git_lfs: false
+
+channels:
+  - conda-forge
+  - bioconda
+  - defaults

 build:
   number: 0
-  skip: true  # [py2k]
-  entry_points:
-    - contextsv = entry_point:main

 requirements:
   build:
     - {{ compiler('cxx') }}
     - make
   host:
-    - python
-    - swig
-    - htslib
+    - htslib=1.20
   run:
-    - python
-    - numpy
-    - htslib
-    - swig
-    - plotly
-    - pandas
-    - scikit-learn
-    - joblib
+    - htslib=1.20

 test:
   commands:
     - contextsv --help
+    - test -f $PREFIX/bin/contextsv
+    - contextsv --version

 about:
   home: https://github.com/WGLab/ContextSV
   license: MIT
-  summary: 'Long read structural variant calling tool'
+  license_family: MIT
+  license_file: LICENSE
+  summary: 'Long-read structural variant caller'
   description: |
-    An alignment-based, generalized structural variant caller for long-read sequencing/mapping data.
+    ContextSV is a tool for calling structural variants from long read
+    sequencing data that integrates important copy number and single-nucleotide
+    allele frequency information to improve accuracy.
   dev_url: https://github.com/WGLab/ContextSV
   doc_url: https://github.com/WGLab/ContextSV#readme
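For context on the recipe change above: when building from a local git checkout, conda-build exposes the current tag to meta.yaml through the GIT_DESCRIBE_TAG environment variable, so the version is now derived at build time instead of being hardcoded. A minimal Python sketch of the Jinja expression (the fallback tag here is illustrative, not part of the recipe):

```python
import os

# conda-build sets GIT_DESCRIBE_TAG (e.g. "v1.0.0") when the recipe source is
# a git checkout; the recipe strips the leading "v" to get the conda version.
tag = os.environ.get("GIT_DESCRIBE_TAG", "v1.0.0")  # fallback for illustration
version = tag.lstrip("v")
print(version)  # 1.0.0
```

Note that str.lstrip("v") strips a character set rather than a single prefix, which is harmless for "v"-prefixed semantic-version tags like v1.0.0.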
diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index afdd78b3..293e547a 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -105,9 +105,6 @@ class CNVCaller {
         void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pos, std::unordered_map& snp_baf, std::unordered_map& snp_pfb, const InputData& input_data) const;

-        // Save a TSV with B-allele frequencies, log2 ratios, and copy number predictions
-        void saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood) const;
-
         void saveSVCopyNumberToJSON(SNPData& before_sv, SNPData& after_sv, SNPData& snp_data, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood, const std::string& filepath) const;
 };
diff --git a/include/input_data.h b/include/input_data.h
index 1e2c3c1e..32c78ce2 100644
--- a/include/input_data.h
+++ b/include/input_data.h
@@ -77,11 +77,6 @@ class InputData {
         std::string getChromosome() const;
         bool isSingleChr() const;

-        // Set the region to analyze.
-        void setRegion(std::string region);
-        std::pair<int32_t, int32_t> getRegion() const;
-        bool isRegionSet() const;
-
         // Set the output directory where the results will be written.
         void setOutputDir(std::string dirpath);
         std::string getOutputDir() const;
@@ -116,7 +111,6 @@ class InputData {
         double dbscan_min_pts_pct;
         std::string chr;  // Chromosome to analyze
         std::pair<int32_t, int32_t> start_end;  // Region to analyze
-        bool region_set;  // True if a region is set
         int thread_count;
         std::string hmm_filepath;
         std::string cnv_filepath;
diff --git a/include/utils.h b/include/utils.h
index d95f0a8a..bb6d9847 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -13,40 +13,6 @@

 /// @endcond

-// Guard to close the BAM file
-// struct BamFileGuard {
-//     samFile* fp_in;
-//     hts_idx_t* idx;
-//     bam_hdr_t* bamHdr;
-
-//     BamFileGuard(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr)
-//         : fp_in(fp_in), idx(idx), bamHdr(bamHdr) {}
-
-//     ~BamFileGuard() {
-//         if (idx) {
-//             hts_idx_destroy(idx);
-//             idx = nullptr;
-//         }
-//         if (bamHdr) {
-//             bam_hdr_destroy(bamHdr);
-//             bamHdr = nullptr;
-//         }
-//         if (fp_in) {
-//             sam_close(fp_in);
-//             fp_in = nullptr;
-//         }
-//     }
-
-//     BamFileGuard(const BamFileGuard&) = delete;  // Non-copyable
-//     BamFileGuard& operator=(const BamFileGuard&) = delete;  // Non-assignable
-// };
-
-// Print the progress of a task
-void printProgress(int progress, int total);
-
-// Run bcftools to determine the chr notation of a VCF file
-bool isChrNotation(std::string vcf_filepath);
-
 // Print a message to stdout in a thread-safe manner
 void printMessage(std::string message);

@@ -55,8 +21,6 @@ void printError(std::string message);

 std::string getElapsedTime(std::chrono::high_resolution_clock::time_point start, std::chrono::high_resolution_clock::time_point end);

-std::string removeChrPrefix(std::string chr);
-
 void printMemoryUsage(const std::string &functionName);

 bool fileExists(const std::string &filepath);
diff --git a/include/version.h b/include/version.h
index d38178a8..5a232bc4 100644
--- a/include/version.h
+++ b/include/version.h
@@ -1,2 +1,9 @@
-#pragma once
-#define VERSION "v0.1.0-41-gd62fe12"
+// version.h
+#ifndef CONTEXTSV_VERSION_H
+#define CONTEXTSV_VERSION_H
+
+#define VERSION_MAJOR 1
+#define VERSION_MINOR 0
+#define VERSION_PATCH 0
+
+#endif  // CONTEXTSV_VERSION_H
diff --git a/python/cnv_plots_json.py b/python/cnv_plots_json.py
index 768058e9..9f2c9458 100644
--- a/python/cnv_plots_json.py
+++ b/python/cnv_plots_json.py
@@ -5,7 +5,7 @@ import plotly
 from plotly.subplots import make_subplots

-min_sv_length = 200000  # Minimum SV length in base pairs
+min_sv_length = 60000  # Minimum SV length in base pairs

 # Set up argument parser
 parser = argparse.ArgumentParser(description='Generate CNV plots from JSON data.')
@@ -20,12 +20,12 @@
 # State marker colors
 # https://community.plotly.com/t/plotly-colours-list/11730/6
 state_colors_dict = {
-    '1': 'red',
-    '2': 'darkred',
-    '3': 'darkgreen',
+    '1': 'darkred',
+    '2': 'red',
+    '3': 'gray',
     '4': 'green',
-    '5': 'darkblue',
-    '6': 'blue',
+    '5': 'blue',
+    '6': 'darkblue',
 }

 sv_type_dict = {
@@ -39,6 +39,7 @@
     # If a chromosome is specified, filter the SVs by that chromosome
     if args.chromosome and sv['chromosome'] != args.chromosome:
+        print(f"Skipping SV {sv['chromosome']}:{sv['start']}-{sv['end']} of type {sv['sv_type']} (not on chromosome {args.chromosome})")
         continue

     # Filter out SVs that are smaller than the minimum length
@@ -224,7 +225,7 @@
     # Set the title of the plot.
     fig.update_layout(
-        title_text = f"{sv_type_dict[sv_type]} at {chromosome}:{start}-{end} ({sv_length} bp) (LLH={likelihood})",
+        title_text = f"{sv_type_dict[sv_type]} at {chromosome}:{start}-{end} ({sv_length} bp)",
         title_x = 0.5,
         showlegend = False,
     )
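The plotting script above consumes the CNVCalls.json records written by saveSVCopyNumberToJSON. A minimal reader sketch, assuming the field names seen elsewhere in this diff (chromosome, start, end, sv_type, likelihood, size) and an illustrative file path:

```python
import json

min_sv_length = 60000  # matches the new threshold above

# "output/CNVCalls.json" is an illustrative path; the real file is written
# into the --outdir directory when --save-cnv is enabled.
with open("output/CNVCalls.json", encoding="utf-8") as f:
    svs = json.load(f)

for sv in svs:
    if sv["end"] - sv["start"] < min_sv_length:
        continue  # skip SVs below the minimum plotted length
    print(sv["chromosome"], sv["start"], sv["end"], sv["sv_type"], sv["likelihood"])
```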
diff --git a/setup.py b/setup.py
deleted file mode 100644
index c8591523..00000000
--- a/setup.py
+++ /dev/null
@@ -1,65 +0,0 @@
-"""
-setup.py:
-    This file is used to install the package.
-"""
-
-import os
-import glob
-from setuptools import setup, find_packages, Extension
-
-print("Running setup.py...")
-
-
-# Set the project metadata
-NAME = "contextsv"
-VERSION = "1.0.0"
-AUTHOR = "WGLab"
-DESCRIPTION = "ContextSV: A tool for integrative structural variant detection."
-
-# Get the conda environment's include path
-conda_prefix = os.environ.get("CONDA_PREFIX")
-if conda_prefix is None:
-    raise AssertionError("CONDA_PREFIX is not set.")
-
-conda_include_dir = os.path.join(conda_prefix, "include")
-
-# Get the conda environment's lib path
-conda_lib_dir = os.path.join(conda_prefix, "lib")
-
-# Set the project dependencies
-SRC_DIR = "src"
-# SRC_FILES = glob.glob(os.path.join(SRC_DIR, "*.cpp"))
-SRC_FILES = glob.glob(os.path.join(SRC_DIR, "*.cpp"))
-SRC_FILES = [f for f in SRC_FILES if "main.cpp" not in f]  # Ignore the main.cpp file
-
-INCLUDE_DIR = "include"
-INCLUDE_FILES = glob.glob(os.path.join(INCLUDE_DIR, "*.h"))
-
-# Set up the extension
-ext = Extension(
-    name="_" + NAME,
-    sources=SRC_FILES,
-    include_dirs=[INCLUDE_DIR, conda_include_dir],
-    extra_compile_args=["-std=c++17"],
-    language="c++",
-    libraries=["hts"],
-    library_dirs=[conda_lib_dir]
-)
-
-# Set up the module
-setup(
-    name=NAME,
-    version=VERSION,
-    author=AUTHOR,
-    description=DESCRIPTION,
-    ext_modules=[ext],
-    py_modules=[NAME],
-    packages=find_packages(),
-    test_suite="tests",
-    entry_points={
-        "console_scripts": [
-            "contextsv = contextsv:main"
-        ]
-    }
-)
-
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 66f1f146..c91d1d31 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -171,22 +171,6 @@ std::tuple<double, SVType, Genotype, int> CNVCaller::runCopyNumberPrediction(std
         printError("ERROR: Invalid SV region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
         return std::make_tuple(0.0, SVType::UNKNOWN, Genotype::UNKNOWN, 0);
     }
-    /*
-    // Check that there is no large number of zero-depth positions in the region
-    int zero_depth_count = 0;
-    for (uint32_t pos = start_pos; pos <= end_pos; pos++)
-    {
-        if (pos < pos_depth_map.size() && pos_depth_map[pos] == 0)
-        {
-            zero_depth_count++;
-        }
-    }
-    if (zero_depth_count > 0.1 * (end_pos - start_pos + 1))
-    {
-        printError("WARNING: Too many zero-depth positions in the SV region for copy number prediction, skipping: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
-        return std::make_tuple(0.0, SVType::UNKNOWN, Genotype::UNKNOWN, 0);
-    }
-    */

     // Run the Viterbi algorithm on SNPs in the SV region
     // Only extend the region if "save CNV data" is enabled
@@ -226,27 +210,34 @@ std::tuple<double, SVType, Genotype, int> CNVCaller::runCopyNumberPrediction(std
     std::vector<int>& state_sequence = prediction.first;
     double likelihood = prediction.second;

-    // Determine if there is a majority state within the SV region
-    int max_state = 0;
-    int max_count = 0;
+    // Get state percentages
+    std::unordered_map<int, double> state_pct;
+    double state_count = (double) state_sequence.size();
+    double largest_non_neutral_pct = 0.0;
+    int non_neutral_state = 0;
     for (int i = 0; i < 6; i++)
     {
-        int state_count = std::count(state_sequence.begin(), state_sequence.end(), i+1);
-        if (state_count > max_count)
+        state_pct[i+1] = (double)std::count(state_sequence.begin(), state_sequence.end(), i+1) / state_count;
+        if (i+1 != 3 && state_pct[i+1] > largest_non_neutral_pct)
         {
-            max_state = i+1;
-            max_count = state_count;
+            largest_non_neutral_pct = state_pct[i+1];
+            non_neutral_state = i+1;
         }
     }

-    // If there is no majority state, then set the state to unknown
-    double pct_threshold = 0.50;
-    int state_count = (int) state_sequence.size();
-    if ((double) max_count / (double) state_count < pct_threshold)
+    // Use the state exceeding the threshold if non-neutral
+    double pct_threshold = 0.3;
+    int max_state = 0;  // Unknown state
+    if (largest_non_neutral_pct > pct_threshold)
     {
-        max_state = 0;
+        max_state = non_neutral_state;
+
+    // Use the neutral state if it exceeds the threshold
+    } else if (state_pct[3] > pct_threshold) {
+        max_state = 3;
     }

+    // Determine the genotype and SV type
     Genotype genotype = getGenotypeFromCNState(max_state);
     SVType predicted_cnv_type = getSVTypeFromCNState(max_state);
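To make the new state-selection rule above concrete: the six HMM copy-number states are tallied as fractions of the Viterbi path, state 3 (copy-neutral) is treated specially, and a non-neutral state now wins if it covers more than 30% of the path even without an absolute majority. A Python sketch of the same logic (illustrative, not part of the codebase):

```python
from collections import Counter

def majority_cn_state(state_sequence, pct_threshold=0.3, neutral_state=3):
    """Mirror of the updated C++ selection: prefer the most common
    non-neutral HMM state if its fraction exceeds the threshold, fall
    back to the neutral state, otherwise return 0 (unknown)."""
    if not state_sequence:
        return 0
    total = len(state_sequence)
    counts = Counter(state_sequence)
    pct = {state: counts.get(state, 0) / total for state in range(1, 7)}
    # Most frequent non-neutral state
    non_neutral = max((s for s in pct if s != neutral_state), key=lambda s: pct[s])
    if pct[non_neutral] > pct_threshold:
        return non_neutral
    if pct[neutral_state] > pct_threshold:
        return neutral_state
    return 0

# 50% of the path in state 5 beats 33% neutral under the 0.3 threshold
assert majority_cn_state([3, 3, 5, 5, 5, 6]) == 5
```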
@@ -817,88 +808,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
     bcf_sr_destroy(pfb_reader);
 }

-void CNVCaller::saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood) const
-{
-    // Open the TSV file for writing
-    std::ofstream tsv_file(filepath);
-    if (!tsv_file.is_open())
-    {
-        std::cerr << "ERROR: Could not open TSV file for writing: " << filepath << std::endl;
-        exit(1);
-    }
-
-    // Ensure all values are valid, and print an error message if not
-    if (chr == "" || start == 0 || end == 0 || sv_type == "")
-    {
-        std::cerr << "ERROR: Invalid SV information for TSV file: " << chr << ":" << start << "-" << end << " " << sv_type << std::endl;
-        exit(1);
-    }
-
-    if (snp_data.pos.size() == 0)
-    {
-        std::cerr << "ERROR: No SNP data available for TSV file: " << chr << ":" << start << "-" << end << " " << sv_type << std::endl;
-        exit(1);
-    }
-
-    if (likelihood == 0.0)
-    {
-        std::cerr << "ERROR: Invalid likelihood value for TSV file: " << chr << ":" << start << "-" << end << " " << sv_type << std::endl;
-        exit(1);
-    }
-
-    if (sv_type.empty())
-    {
-        std::cerr << "ERROR: Invalid SV type for TSV file: " << chr << ":" << start << "-" << end << " " << sv_type << std::endl;
-        exit(1);
-    }
-
-    // Format the size string in kb
-    std::stringstream ss;
-    ss << std::fixed << std::setprecision(6) << likelihood;
-    std::string likelihood_str = ss.str();
-
-    // Print SV information to the TSV file
-    tsv_file << "SVTYPE=" << sv_type << std::endl;
-    tsv_file << "POS=" << chr << ":" << start << "-" << end << std::endl;
-    tsv_file << "HMM_LOGLH=" << likelihood_str << std::endl;
-
-    // Write the header
-    tsv_file << "chromosome\tposition\tsnp\tb_allele_freq\tlog2_ratio\tcnv_state\tpopulation_freq" << std::endl;
-
-    // Write the data
-    int snp_count = (int) snp_data.pos.size();
-    for (int i = 0; i < snp_count; i++)
-    {
-        // Get the SNP data
-        uint32_t pos = snp_data.pos[i];
-        bool is_snp = snp_data.is_snp[i];
-        double pfb = snp_data.pfb[i];
-        double baf = snp_data.baf[i];
-        double log2_ratio = snp_data.log2_cov[i];
-        int cn_state = snp_data.state_sequence[i];
-
-        // If the SNP is not a SNP, then set the BAF to 0.0
-        if (!is_snp)
-        {
-            baf = 0.0;
-        }
-
-        // Write the TSV line (chrom, pos, baf, lrr, state)
-        tsv_file << \
-            chr << "\t" << \
-            pos << "\t" << \
-            is_snp << "\t" << \
-            baf << "\t" << \
-            log2_ratio << "\t" << \
-            cn_state << "\t" << \
-            pfb << \
-            std::endl;
-    }
-
-    // Close the file
-    tsv_file.close();
-}
-
 void CNVCaller::saveSVCopyNumberToJSON(SNPData &before_sv, SNPData &after_sv, SNPData &snp_data, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood, const std::string& filepath) const
 {
     // Append the SV information to the JSON file
diff --git a/src/fasta_query.cpp b/src/fasta_query.cpp
index e4f0e1dc..237db443 100644
--- a/src/fasta_query.cpp
+++ b/src/fasta_query.cpp
@@ -174,7 +174,7 @@ uint32_t ReferenceGenome::getChromosomeLength(std::string chr) const
     }
     catch (const std::out_of_range& e)
     {
-        printError("Chromosome " + chr + " not found in reference genome");
+        printError("Length for chromosome " + chr + " not found in reference genome");
         return 0;
     }
 }
diff --git a/src/input_data.cpp b/src/input_data.cpp
index 3b53a7d7..936f7e62 100644
--- a/src/input_data.cpp
+++ b/src/input_data.cpp
@@ -21,8 +21,6 @@ InputData::InputData()
     this->ref_filepath = "";
     this->snp_vcf_filepath = "";
     this->chr = "";
-    this->start_end = std::make_pair(0, 0);
-    this->region_set = false;
     this->output_dir = "";
     this->sample_size = 20;
     this->min_cnv_length = 2000;  // Default minimum CNV length
@@ -49,14 +47,6 @@ void InputData::printParameters() const
     DEBUG_PRINT("Minimum CNV length: " << this->min_cnv_length);
     DEBUG_PRINT("DBSCAN epsilon: " << this->dbscan_epsilon);
     DEBUG_PRINT("DBSCAN minimum points percentage: " << this->dbscan_min_pts_pct * 100.0f << "%");
-    if (this->region_set)
-    {
-        DEBUG_PRINT("Region set to: chr" + this->chr + ":" + std::to_string(this->start_end.first) + "-" + std::to_string(this->start_end.second));
-    }
-    else
-    {
-        DEBUG_PRINT("Running on whole genome");
-    }
 }

 std::string InputData::getLongReadBam() const
@@ -218,47 +208,6 @@ bool InputData::isSingleChr() const
     return this->single_chr;
 }

-void InputData::setRegion(std::string region)
-{
-    // Check if the region is valid
-    if (region != "")
-    {
-        // Split the region by colon
-        std::istringstream ss(region);
-        std::string token;
-        std::vector<std::string> region_tokens;
-
-        while (std::getline(ss, token, '-'))
-        {
-            region_tokens.push_back(token);
-        }
-
-        // Check if the region is valid
-        if (region_tokens.size() == 2)
-        {
-            // Get the start and end positions
-            int32_t start = std::stoi(region_tokens[0]);
-            int32_t end = std::stoi(region_tokens[1]);
-
-            // Set the region
-            this->start_end = std::make_pair(start, end);
-            this->region_set = true;
-
-            std::cout << "Region set to " << this->chr << ":" << start << "-" << end << std::endl;
-        }
-    }
-}
-
-std::pair<int32_t, int32_t> InputData::getRegion() const
-{
-    return this->start_end;
-}
-
-bool InputData::isRegionSet() const
-{
-    return this->region_set;
-}
-
 void InputData::setAlleleFreqFilepaths(std::string filepath)
 {
     // Check if empty string
@@ -275,19 +224,18 @@ void InputData::setAlleleFreqFilepaths(std::string filepath)
         exit(1);
     }

-    // If a region is set, load only the chromosome in the region
+    // If a chr is specified, load only the file for that chromosome
     std::string target_chr;
     if (this->chr != "")
     {
         target_chr = this->chr;

-        // Check if the region is in chr notation
+        // Check if in chr notation
         if (target_chr.find("chr") != std::string::npos)
         {
             // Remove the chr notation
             target_chr = target_chr.substr(3, target_chr.size() - 3);
         }
-
-        //std::cout << "Loading population allele frequency file for chromosome " << target_chr << std::endl;
     }

     // Load the file and create a map of chromosome -> VCF file
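The --pfb input consumed by setAlleleFreqFilepaths is a plain-text list mapping each chromosome to a population allele frequency VCF, one chromosome=filepath pair per line (the test further below writes a "3=..." line). A sketch of a compatible parser, assuming that format:

```python
def load_pfb_map(filepath):
    """Parse a population allele frequency list file into a
    {chromosome: vcf_path} dict. Format inferred from tests/test_general.py,
    which writes lines like "3=/path/to/chr3_gnomad_snps_isec.vcf.gz"."""
    chr_to_vcf = {}
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or "=" not in line:
                continue  # skip blanks and malformed lines
            chrom, vcf_path = line.split("=", 1)
            # Keys use non-"chr" notation, matching the prefix stripping above
            chr_to_vcf[chrom.removeprefix("chr")] = vcf_path
    return chr_to_vcf
```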
diff --git a/src/main.cpp b/src/main.cpp
index 874f444f..45984eb2 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -5,18 +5,19 @@
 #include
 #include
 #include
+#include
+#include
+#include  // For signal handling

 #include
 #include
-// #include

 /// @endcond

 #include "input_data.h"
-#include "version.h"
 #include "utils.h"
-
+#include "version.h"

 void printStackTrace(int sig)
 {
@@ -40,7 +41,7 @@ void printBanner()
     std::strftime(date_str, sizeof(date_str), "%Y-%m-%d", std::localtime(&now));
     std::cout << "═══════════════════════════════════════════════════════════════" << std::endl;
     std::cout << "  ContextSV - Long-read Structural Variant Caller" << std::endl;
-    std::cout << "  Version: " << VERSION << std::endl;
+    std::cout << "  Version: " << VERSION_MAJOR << "." << VERSION_MINOR << "." << VERSION_PATCH << std::endl;
     std::cout << "  Date: " << date_str << std::endl;
     std::cout << "═══════════════════════════════════════════════════════════════" << std::endl;
 }
@@ -67,9 +68,6 @@ void runContextSV(const std::unordered_map<std::string, std::string>& args)
     if (args.find("chr") != args.end()) {
         input_data.setChromosome(args.at("chr"));
     }
-    if (args.find("region") != args.end()) {
-        input_data.setRegion(args.at("region"));
-    }
     if (args.find("thread-count") != args.end()) {
         input_data.setThreadCount(std::stoi(args.at("thread-count")));
     }
@@ -125,14 +123,14 @@ void runContextSV(const std::unordered_map<std::string, std::string>& args)
 }

 void printUsage(const std::string& programName) {
-    std::cerr << "Usage: " << programName << " [options]\n"
+    std::cout << "ContextSV v" << VERSION_MAJOR << "." << VERSION_MINOR << "." << VERSION_PATCH << std::endl;
+    std::cout << "Usage: " << programName << " [options]\n"
               << "Options:\n"
               << "  -b, --bam          Long-read BAM file (required)\n"
               << "  -r, --ref          Reference genome FASTA file (required)\n"
               << "  -s, --snp          SNPs VCF file (required)\n"
               << "  -o, --outdir       Output directory (required)\n"
               << "  -c, --chr          Chromosome\n"
-              << "  -r, --region       Region (start-end)\n"
               << "  -t, --threads      Number of threads\n"
               << "  -h, --hmm          HMM file\n"
               << "  -n, --sample-size  Sample size for HMM predictions\n"
@@ -191,7 +189,7 @@ std::unordered_map<std::string, std::string> parseArguments(int argc, char* argv
         } else if (arg == "--debug") {
             args["debug"] = "true";
         } else if ((arg == "-v" || arg == "--version")) {
-            std::cout << "ContextSV version " << VERSION << std::endl;
+            std::cout << "ContextSV v" << VERSION_MAJOR << "." << VERSION_MINOR << "." << VERSION_PATCH << std::endl;
             exit(0);
         } else if (arg == "-h" || arg == "--help") {
             printUsage(argv[0]);
@@ -218,6 +216,7 @@ std::unordered_map<std::string, std::string> parseArguments(int argc, char* argv
 int main(int argc, char* argv[]) {
     auto args = parseArguments(argc, argv);
     runContextSV(args);
+    std::cout << "ContextSV finished successfully!" << std::endl;
     return 0;
 }
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 29dab604..d59e6598 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -26,11 +26,11 @@
 #include "ThreadPool.h"
 #include "utils.h"
 #include "sv_types.h"
-#include "version.h"
 #include "fasta_query.h"
 #include "dbscan.h"
 #include "dbscan1d.h"
 #include "debug.h"
+#include "version.h"

 /// @endcond

 # define DUP_SEQSIM_THRESHOLD 0.9  // Sequence similarity threshold for duplication detection
@@ -149,7 +149,7 @@ void SVCaller::findSplitSVSignatures(std::unordered_map
                 std::pair qpos = getAlignmentReadPositions(bam1);
-                primary_map[bam1->core.tid][qname] = PrimaryAlignment{bam1->core.pos + 1, bam_endpos(bam1), qpos.first, qpos.second, !(bam1->core.flag & BAM_FREVERSE), 0};
+                primary_map[bam1->core.tid][qname] = PrimaryAlignment{static_cast<uint32_t>(bam1->core.pos + 1), static_cast<uint32_t>(bam_endpos(bam1)), static_cast<uint32_t>(qpos.first), static_cast<uint32_t>(qpos.second), !(bam1->core.flag & BAM_FREVERSE), 0};
                 alignment_tids.insert(bam1->core.tid);
                 primary_count++;
@@ -159,7 +159,7 @@ void SVCaller::findSplitSVSignatures(std::unordered_map
                 std::pair qpos = getAlignmentReadPositions(bam1);
-                supp_map[qname].push_back(SuppAlignment{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), qpos.first, qpos.second, !(bam1->core.flag & BAM_FREVERSE)});
+                supp_map[qname].push_back(SuppAlignment{bam1->core.tid, static_cast<uint32_t>(bam1->core.pos + 1), static_cast<uint32_t>(bam_endpos(bam1)), static_cast<uint32_t>(qpos.first), static_cast<uint32_t>(qpos.second), !(bam1->core.flag & BAM_FREVERSE)});
                 alignment_tids.insert(bam1->core.tid);
                 supp_qnames.insert(qname);
                 supplementary_count++;
@@ -415,23 +415,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map
-            // if (supp_positions.size() > 1) {
-            //     std::sort(supp_positions.begin(), supp_positions.end());
-            //     int supp_start = supp_positions.front();
-            //     int supp_end = supp_positions.back();
-            //     int sv_length = std::abs(supp_start - supp_end);
-
-            //     // Use 50bp as the minimum length for an inversion
-            //     if (sv_length >= 50 && sv_length <= max_length) {
-            //         SVEvidenceFlags aln_type;
-            //         aln_type.set(static_cast(SVDataType::SUPPINV));
-            //         SVCall sv_candidate(supp_start, supp_end, SVType::INV, getSVTypeSymbol(SVType::INV), aln_type, Genotype::UNKNOWN, 0.0, 0, 0, supp_cluster_size);
-            //         // SVCall sv_candidate(supp_start, supp_end, SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SUPPINV, Genotype::UNKNOWN, 0.0, 0, 0, supp_cluster_size);
-            //         addSVCall(chr_sv_calls, sv_candidate);
-            //     }
-            // }
-
             // -------------------------------
             // SPLIT INSERTION CALLS
             int read_distance = 0;
@@ -755,13 +738,6 @@ void SVCaller::processChromosome(const std::string& chr, std::vector& ch
     bam_hdr_destroy(bamHdr);

     printMessage(chr + ": Merging CIGAR...");
-    // Save JSON if chr21
-    // if (chr == "chr21") {
-    //     std::string json_fp = input_data.getOutputDir() + "/" + chr + ".json";
-    //     mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts, true, json_fp);
-    // } else {
-    //     mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts, false);
-    // }
     mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts, false);

     int region_sv_count = getSVCount(chr_sv_calls);
@@ -1185,7 +1161,7 @@ void SVCaller::saveToVCF(const std::unordered_map& sv_calls, double epsilon, int min_pts, bool k
         continue;
     }
-    DEBUG_PRINT("Merging SV type: " + getSVTypeString(sv_type) + " (epsilon=" + std::to_string(epsilon) + ", min_pts=" + std::to_string(min_pts) + ", num SVs=" + std::to_string(sv_calls.size()) + ")");
     std::vector<SVCall> merged_sv_type_calls;

     // Create a vector of SV calls for the current SV type and size interval
@@ -81,12 +80,13 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
     std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls),
         [sv_type](const SVCall& sv_call) { return sv_call.sv_type == sv_type; });

+    DEBUG_PRINT("Merging SV type: " + getSVTypeString(sv_type) + " (epsilon=" + std::to_string(epsilon) + ", min_pts=" + std::to_string(min_pts) + ", num SVs=" + std::to_string(sv_type_calls.size()) + ")");
     if (sv_type_calls.size() < 2)
     {
         // Add all unclustered points to the merged list
         for (const auto& sv_call : sv_type_calls)
         {
             SVCall noise_sv_call = sv_call;
-            merged_sv_type_calls.push_back(noise_sv_call);
+            merged_sv_calls.push_back(noise_sv_call);
         }
         continue;
     }
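For intuition on the merge step above: SV calls of one type are clustered with DBSCAN over their (start, end) breakpoints, clusters are collapsed to a single representative, and noise points pass through unmerged (which is what the merged_sv_calls fix restores). A minimal sketch, assuming dict-shaped calls and median-breakpoint representatives (the real C++ representative selection may differ):

```python
import numpy as np
from sklearn.cluster import DBSCAN

def merge_sv_calls(calls, eps, min_pts):
    """Cluster same-type SV calls by (start, end) and keep one
    representative per cluster; DBSCAN noise (label -1) passes through."""
    if len(calls) < 2:
        return list(calls)  # mirrors the size-below-2 early-out above
    points = np.array([(c["start"], c["end"]) for c in calls])
    labels = DBSCAN(eps=eps, min_samples=min_pts).fit_predict(points)
    merged = [c for c, lab in zip(calls, labels) if lab == -1]
    for label in set(labels) - {-1}:
        members = [c for c, lab in zip(calls, labels) if lab == label]
        merged.append({
            **members[0],
            "start": int(np.median([c["start"] for c in members])),
            "end": int(np.median([c["end"] for c in members])),
        })
    return merged
```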
diff --git a/src/utils.cpp b/src/utils.cpp
index a27263b7..20dd479e 100644
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -4,6 +4,7 @@
 #include  // getrusage
 #include
 #include
+#include
 #include
 #include
 #include
@@ -11,72 +12,8 @@

 /// @endcond

-// Define print mutex
 std::mutex print_mtx;

-// Print a progress bar
-void printProgress(int progress, int total)
-{
-    float percent = (float)progress / (float)total * 100.0;
-    int num_hashes = (int)(percent / 2.0);
-
-    // Print the progress bar
-    printf("\r[");
-    for (int i = 0; i < num_hashes; i++)
-    {
-        printf("#");
-    }
-    for (int i = 0; i < 50 - num_hashes; i++)
-    {
-        printf(" ");
-    }
-    printf("] %3.2f%%", percent);
-    fflush(stdout);
-
-    // Print a new line if finished
-    if (progress == total)
-    {
-        printf("\n");
-    }
-}
-
-// Run bcftools to determine the chr notation of a VCF file
-bool isChrNotation(std::string vcf_filepath)
-{
-    // Create the command to extract the chromosomes from the VCF
-    std::string cmd = "bcftools query -f '%CHROM\n' " + vcf_filepath + " 2>/dev/null";
-
-    // Open the pipe
-    FILE* pipe = popen(cmd.c_str(), "r");
-    if (pipe == NULL)
-    {
-        std::cerr << "Error: could not open pipe" << std::endl;
-        return false;
-    }
-
-    // Read the first line
-    const int line_size = 256;
-    char buffer[line_size];
-    if (!fgets(buffer, line_size, pipe))
-    {
-        std::cerr << "Error reading from pipe" << std::endl;
-        return false;
-    }
-
-    // Check if the first line contains "chr" using std::string::find
-    bool is_chr_notation = false;
-    std::string line(buffer);
-    if (line.find("chr") != std::string::npos)
-    {
-        is_chr_notation = true;
-    }
-
-    // Close the pipe
-    pclose(pipe);
-
-    return is_chr_notation;
-}
-
 // Thread-safe print message function
 void printMessage(std::string message)
 {
@@ -102,15 +39,6 @@ std::string getElapsedTime(std::chrono::high_resolution_clock::time_point start,
     return elapsed_time;
 }

-// Function to remove the 'chr' prefix from chromosome names
-std::string removeChrPrefix(std::string chr)
-{
-    if (chr.find("chr") != std::string::npos) {
-        return chr.substr(3);
-    }
-    return chr;
-}
-
 void printMemoryUsage(const std::string& functionName) {
     struct rusage usage;
     getrusage(RUSAGE_SELF, &usage);
@@ -141,3 +69,5 @@ void closeJSON(const std::string &filepath)
     json_file << "]";  // Close the JSON array
     json_file.close();
 }
+
+
diff --git a/tests/data/ref.fa b/tests/data/ref.fa
deleted file mode 100644
index e69de29b..00000000
diff --git a/tests/data/snps.vcf.gz b/tests/data/snps.vcf.gz
deleted file mode 100644
index 6f0175b0..00000000
Binary files a/tests/data/snps.vcf.gz and /dev/null differ
diff --git a/tests/data/snps.vcf.gz.csi b/tests/data/snps.vcf.gz.csi
deleted file mode 100644
index 2c809709..00000000
Binary files a/tests/data/snps.vcf.gz.csi and /dev/null differ
diff --git a/tests/data/snps.vcf.gz.tbi b/tests/data/snps.vcf.gz.tbi
deleted file mode 100644
index 5110e913..00000000
Binary files a/tests/data/snps.vcf.gz.tbi and /dev/null differ
diff --git a/tests/data/test.bam b/tests/data/test.bam
deleted file mode 100644
index 75a95e3e..00000000
Binary files a/tests/data/test.bam and /dev/null differ
diff --git a/tests/data/test.bam.bai b/tests/data/test.bam.bai
deleted file mode 100644
index 08baf8e9..00000000
Binary files a/tests/data/test.bam.bai and /dev/null differ
diff --git a/tests/test_general.py b/tests/test_general.py
index ff65faba..c1c59da7 100644
--- a/tests/test_general.py
+++ b/tests/test_general.py
@@ -3,22 +3,10 @@
 """

 import os
+import subprocess
 import sys
 import pytest

-# Import lib from the parent directory
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
-from lib import contextsv
-
-# Get the path to the test data directory.
-TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
-
-# Get the path to the test data files.
-TEST_BAM_FILE = os.path.join(TEST_DATA_DIR, 'test.bam')
-TEST_REF_FILE = ""
-TEST_SNPS_FILE = os.path.join(TEST_DATA_DIR, 'snps.vcf.gz')
-TEST_PFB_FILE = ""

 # Set the output directory.
 TEST_OUTDIR = os.path.join(os.path.dirname(__file__), 'output')
@@ -26,62 +14,155 @@
 # Check if running unit tests on GitHub Actions
 local_dir = os.path.expanduser("~/github/ContextSV")
 if os.getcwd() == local_dir:
-    test_dir_path = os.path.join(local_dir, 'tests/data')
+    TEST_DATA_DIR = os.path.join(local_dir, 'tests/data')
 else:
-    test_dir_path = os.path.abspath(str("TestData"))
-
-TEST_REF_FILE = os.path.join(test_dir_path, 'GRCh37_21.fa')
-TEST_BAM_FILE = os.path.join(test_dir_path, 'ONT_Kit14_21.bam')
-TEST_SNPS_FILE = os.path.join(test_dir_path, 'snps_21.vcf.gz')
-
-
-def test_run():
-    """Test the run function of the contextsv module with a small dataset."""
-
-    # Set input parameters.
-    input_data = contextsv.InputData()
-    input_data.setLongReadBam(TEST_BAM_FILE)
-    input_data.setRefGenome(TEST_REF_FILE)
-    input_data.setSNPFilepath(TEST_SNPS_FILE)
-    input_data.setChromosome("21")
-    input_data.setRegion("14486099-14515105")
-    input_data.setThreadCount(1)
-    input_data.setAlleleFreqFilepaths("")
-    input_data.setHMMFilepath("")
-    input_data.setOutputDir(TEST_OUTDIR)
-    input_data.saveCNVData(True)
-
-    # Run the analysis.
-    contextsv.run(input_data)
-
-    # Check that the output file exists.
-    output_file = os.path.join(TEST_OUTDIR, 'output.vcf')
-    assert os.path.exists(output_file)
-
-    # Check that the VCF file is not empty.
-    assert os.path.getsize(output_file) > 0
-
-    # Check that the VCF file has the correct number of lines.
-    with open(output_file, 'r', encoding='utf-8') as f:
-        assert len(f.readlines()) == 22
-
-    # Check that the VCF file has the correct header, and the correct
-    # VCF CHROM, POS, and INFO fields in the next 2 lines.
-    header_line = 17
-    with open(output_file, 'r', encoding='utf-8') as f:
-        for i, line in enumerate(f):
-            if i == header_line:
-                assert line.strip() == "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE"
-            elif i == header_line + 1:
-                # Get the first, second, and eighth fields from the line
-                fields = line.strip().split('\t')
-                assert fields[0] == "21"
-                assert fields[1] == "14458394"
-                assert fields[7] == "END=14458394;SVTYPE=INS;SVLEN=1344;SUPPORT=1;SVMETHOD=CONTEXTSVv0.1;ALN=CIGARINS;CLIPSUP=0;REPTYPE=NA;HMM=0.000000"
-            elif i == header_line + 2:
-                fields = line.strip().split('\t')
-                assert fields[0] == "21"
-                assert fields[1] == "14502888"
-                assert fields[7] == "END=14502953;SVTYPE=BOUNDARY;SVLEN=65;SUPPORT=1;SVMETHOD=CONTEXTSVv0.1;ALN=BOUNDARY;CLIPSUP=0;REPTYPE=NA;HMM=-4.606171"
+    TEST_DATA_DIR = os.path.abspath(str("tests/data"))
+
+TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
+BAM_FILE = os.path.join(TEST_DATA_DIR, 'chr3_test.bam')
+REF_FILE = os.path.join(TEST_DATA_DIR, 'GRCh38_noalts_chr3.fa')
+SNPS_FILE = os.path.join(TEST_DATA_DIR, 'chr3_test.snps.vcf.gz')
+GAP_FILE = os.path.join(TEST_DATA_DIR, 'Gaps-HG38-UCSC-chr3.bed')
+HMM_FILE = os.path.join(TEST_DATA_DIR, 'wgs_test.hmm')
+
+# Create a PFB file pointing to the SNP file in the test data directory
+PFB_FILE = os.path.join(TEST_DATA_DIR, 'chr3_pfb.txt')
+# Remove any existing PFB file to avoid conflicts
+if os.path.exists(PFB_FILE):
+    os.remove(PFB_FILE)
+
+PFB_SNP_FILE = os.path.join(TEST_DATA_DIR, 'chr3_gnomad_snps_isec.vcf.gz')
+with open(PFB_FILE, 'w', encoding='utf-8') as pf:
+    pf.write(f"3={PFB_SNP_FILE}\n")
+
+def test_run_help():
+    """Ensure the binary runs with --help and exits cleanly."""
+    result = subprocess.run(
+        ["./build/contextsv", "--help"],
+        capture_output=True,
+        text=True,
+        check=True
+    )
+
+    # Print output for debugging purposes
+    print("STDOUT:", result.stdout)
+
+    # Check process exited successfully
+    assert result.returncode == 0, f"Non-zero exit: {result.returncode}\n{result.stderr}"
+    assert result.stdout, "No output from --help"
+    assert "Usage:" in result.stdout, "Help text missing 'Usage:'"
+    assert "Options:" in result.stdout, "Help text missing 'Options:'"
+
+def test_run_version():
+    """Ensure the binary runs with --version and exits cleanly."""
+    result = subprocess.run(
+        ["./build/contextsv", "--version"],
+        capture_output=True,
+        text=True,
+        check=True
+    )
+
+    # Print output for debugging purposes
+    print("STDOUT:", result.stdout)
+
+    # Check process exited successfully
+    assert result.returncode == 0, f"Non-zero exit: {result.returncode}\n{result.stderr}"
+    assert result.stdout, "No output from --version"
+
+def test_run_basic():
+    """Run ContextSV with basic required parameters."""
+    out_dir = os.path.join(TEST_OUTDIR, 'test_run_basic')
+    os.makedirs(out_dir, exist_ok=True)
+
+    vcf_file = os.path.join(out_dir, 'output.vcf')
+    if os.path.exists(vcf_file):
+        os.remove(vcf_file)
+    json_file = os.path.join(out_dir, 'CNVCalls.json')
+    if os.path.exists(json_file):
+        os.remove(json_file)
+
+    result = subprocess.run(
+        ["./build/contextsv",
+         "--bam", BAM_FILE,
+         "--ref", REF_FILE,
+         "--snp", SNPS_FILE,
+         "--outdir", out_dir,
+         "--hmm", HMM_FILE,
+         "--eth", "nfe",
+         "--pfb", PFB_FILE,
+         "--sample-size", "20",
+         "--min-cnv", "2000",
+         "--eps", "0.1",
+         "--min-pts-pct", "0.1",
+         "--assembly-gaps", GAP_FILE,
+         "--chr", "chr3",
+         "--save-cnv",
+         "--debug"
+        ],
+        capture_output=True,
+        text=True,
+        check=True
+    )
+
+    # Print output for debugging purposes
print("STDOUT:", result.stdout) + print("STDERR:", result.stderr) + + # Check process exited successfully + assert result.returncode == 0, f"Non-zero exit: {result.returncode}\n{result.stderr}" + assert "ContextSV finished successfully!" in result.stdout, "Did not complete successfully" + + # Check for expected output files + expected_files = [ + os.path.join(out_dir, 'CNVCalls.json'), + os.path.join(out_dir, 'output.vcf') + ] + for ef in expected_files: + assert os.path.isfile(ef), f"Expected output file not found: {ef}" + + # Find the large duplication in the VCF output + # chr3 61149366 . N . PASS END=61925600;SVTYPE=DUP;SVLEN=776235;SVMETHOD=ContextSVv1.0.0-1-g4bd038c;ALN=SPLIT,HMM;HMM=-2533.541937;SUPPORT=63;CLUSTER=23;ALNOFFSET=0;CN=6 GT:DP 1/1:63 + found_dup = False + with open(vcf_file, 'r') as vf: + for line in vf: + if line.startswith('#'): + continue + fields = line.strip().split('\t') + if len(fields) < 8: + continue + chrom, pos, id, ref, alt, qual, filter, info = fields[:8] + + if (chrom == 'chr3' and pos == '61149366' and alt == ''): + found_dup = True + info_dict = dict(item.split('=') for item in info.split(';') if '=' in item) + assert info_dict.get('SVTYPE') == 'DUP', "SVTYPE is not DUP" + assert int(info_dict.get('SVLEN', 0)) == 776235, f"SVLEN is not 776235, got {info_dict.get('SVLEN')}" + assert int(info_dict.get('CN', 0)) == 6, f"CN is not 6, got {info_dict.get('CN')}" + break + + assert found_dup, "Expected duplication not found in VCF output" + + # Find the large duplication in the CNVCalls.json output + # [ + # { + # "chromosome": "chr3", + # "start": 61149366, + # "end": 61925600, + # "sv_type": "DUP", + # "likelihood": -2533.54, + # "size": 776235, + + found_dup_json = False + import json + with open(json_file, 'r') as jf: + cnv_data = json.load(jf) + for entry in cnv_data: + if (entry.get('chromosome') == 'chr3' and + entry.get('start') == 61149366 and + entry.get('end') == 61925600 and + entry.get('sv_type') == 'DUP'): + found_dup_json = True + assert entry.get('size') == 776235, f"Size is not 776235, got {entry.get('size')}" break - \ No newline at end of file + + assert found_dup_json, "Expected duplication not found in CNVCalls.json output"