diff --git a/.travis.yml b/.travis.yml index 1464906..1bc5ee8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,7 +4,7 @@ sudo: required dist: xenial python: # We don't actually use the Travis Python, but this keeps it organized. - - "3.7" + - "3.6" install: - sudo apt-get update - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh diff --git a/Makefile b/Makefile index 23fc868..7e6edb7 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,10 @@ install: - python3 setup.py -q install + pip install -e ./ test: - python3 -m pytest + pytest -s +local_test: + barseq -i tests/data/input/sequences -f -e test_barcodes -s tests/data/input/samples.csv -b tests/data/input/barcodes.csv -o tests/dump +local_test_ref_free: + barseq -i tests/data/input/sequences -f -e test -s tests/data/input/samples.csv --reference-free -o tests/dump +clean_dump: + rm -r tests/dump/results \ No newline at end of file diff --git a/README.md b/README.md index 83aaa84..b3be9b7 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ -[![Build Status](https://travis-ci.com/eburgoswisc/barseq.svg?branch=master)](https://travis-ci.com/eburgoswisc/barseq) -![Python 3.7](https://img.shields.io/badge/python-3.7-blue.svg) +![Build Status](https://travis-ci.org/mjmlab/barseq.svg?branch=master) +![Python 3.6](https://img.shields.io/badge/python-3.7-blue.svg) # barseq diff --git a/barseq/main.py b/barseq/main.py index cd98a22..85298c3 100644 --- a/barseq/main.py +++ b/barseq/main.py @@ -12,8 +12,8 @@ from pathlib import Path # Module import -from .utils import write_output, read_barcodes, format_filename, make_barseq_directories -from .process_reads import count_barcodes +from .utils import * +from .process_reads import * __author__ = "Emanuel Burgos" @@ -39,13 +39,40 @@ def __exit__(self, etype, value, traceback): class Run: """ Class that stores settings for barseq processes. """ def __init__(self, args): + # Required parameters self.experiment = args.experiment - self.sequences = Path(args.input) - self.barcodes = Path(args.barcodes) + self.sequence_dir = Path(args.input) + self.sequence_files = Path(self.sequence_dir) + self.barcodes = args.barcodes + self.sample_ids = read_sample_ID(Path(args.samples)) + self.reference_free = args.reference_free + # Settings for sequence search + self.flanking_left = args.flanking_left + self.flanking_right = args.flanking_right + self.barcode_length = args.barcode_length + self.pattern = f"({self.flanking_left}){{e<=1}}([ATGC]{{18}})({self.flanking_right}){{e<=1}}" + # If barcoded provided + if not self.reference_free: + if not Path(self.barcodes).exists(): + logger.error('Reference barcode library provided does not exist. ' + 'If you do not have one, run --reference-free') + self.barcodes = Path(args.barcodes) + # If barcodes provided, grab sample + if self.reference_free: + if not Path(args.samples).exists(): + logger.error("You selected to run barseq using --reference-free " + "but did not provide a file for sample names") + self.min_count = args.min_count # self.barseq_sample_collection = list() self.sample_dict = dict() - self.path = f"results/{self.experiment}/" - self.log = f"{self.path}log.txt" + self.path = Path(args.output).joinpath(f'results/{self.experiment}') + self.log = self.path.joinpath('log.txt') + self.force = args.force + + def get_sample_name(self, seq_tag): + """ Gets sample id given the sequence file tag""" + # TODO: Use grep or regex to find the tag, may be given different ones + return self.sample_ids[seq_tag.name.split('_')[0]] class SampleRecord: @@ -60,46 +87,72 @@ def __init__(self, sample: str, filename: str, barcode_dict): def main(args) -> None: """ - Main pipeline for analyzing barseq data. Will be changed to be more modular, for now - I just need the algorithm worked out. + Main pipeline for analyzing barseq data. Will be changed to be more modular """ # ---- SET UP SETTINGS ---- # runner = Run(args) make_barseq_directories(runner) + # Add file handler fh = logging.FileHandler(runner.log, mode="w") fh.setFormatter(logging.Formatter( "%(asctime)s - %(levelname)s - %(module)s - %(message)s", datefmt="%Y-%m-%d %H:%M")) logger.addHandler(fh) + logger.info("***** Starting barseq *****") - # Read in barcode - logger.info(f"Reading in barcodes from {runner.barcodes.name}") - barcodes = read_barcodes(runner.barcodes) - # Process each sequencing file - seq_files_list = sorted(os.listdir(runner.sequences)) - for seq_file in seq_files_list: - if not seq_file.endswith(".DS_Store"): - sample = format_filename(seq_file) - logger.info(f"Counting Barcodes in {sample}") - runner.sample_dict[sample] = deepcopy(barcodes) - # Change cwd - with Cd(runner.sequences): - count_barcodes(seq_file, runner.sample_dict[sample]) - # TODO: Add basic analysis + # Read in barcode if needed + if runner.reference_free: + logger.info('Will find putative unique barcodes and count without referencing a library') - # Write to output - logger.info(f"Writing results to {runner.path}") - write_output(runner.sample_dict, barcodes, runner) + else: + logger.info(f"Reading in barcodes from {runner.barcodes.name}") + barcodes = read_barcodes(runner.barcodes) + + # Process each sequencing file + data_collection = [] + for seq_file in runner.sequence_files.glob('*R1*'): + # Split seq_file name into its suffixes + if '.fastq' in seq_file.suffixes or '.gz' in seq_file.suffixes: + sample = runner.get_sample_name(seq_file) + logger.info(f"Counting Barcodes in {sample}") + # IF GIVEN BARCODES + if not runner.reference_free: + runner.sample_dict[sample] = deepcopy(barcodes) + count_barcodes(seq_file, runner.sample_dict[sample], runner) + # Write to output + logger.info(f"Writing results to {runner.path}") + # REFERENCE FREE + if runner.reference_free: + # Skip control and undetermined fastq files + if seq_file.stem.startswith('Undetermined_S0') or seq_file.stem.startswith('S1000'): + pass + else: + # Find sample name + sample_name = runner.get_sample_name(seq_file) + barcode_counts_dict, good_barcode_counts = count_barcodes_reference_free(seq_file=seq_file, runner=runner) + # Calculate stats on barcode counts + stats_barcode_dict = calculate_barcode_perc(barcode_counts_dict, + total_barcodes=good_barcode_counts, + min_count=runner.min_count) + + # Get formatted row + row = format_sample_data(stats_barcode_dict, good_barcode_counts, sample_id=sample_name) + data_collection.append(row) + + if data_collection: + save_results(data_collection, runner) + + else: + write_output(runner.sample_dict, runner) # Confirm completion of barseq logger.info("***** barseq is complete! *****") if __name__ == "__main__": - """ # -------- START HERE -------- # """ args = sys.argv[1:] main(args) diff --git a/barseq/process_reads.py b/barseq/process_reads.py index 05c6766..fbc50d0 100644 --- a/barseq/process_reads.py +++ b/barseq/process_reads.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +3 # !/usr/bin/env python3 """ Count barcode frequency in fastq/fasta files given by user. @@ -8,7 +8,9 @@ import screed import logging import regex as re -import sys + +# Module imports +from .utils import compile_regex_patterns __author__ = "Emanuel Burgos" __email__ = "eburgos@wisc.edu" @@ -17,27 +19,24 @@ logger = logging.getLogger("barseq") -def count_barcodes(seq_file, barcode_dict) -> None: +def count_barcodes(seq_file, barcode_dict, runner) -> None: """ Count barcode frequency in sequence file. Returns a DataFrame object :param seq_file: file with reads + :param runner: Run object :param barcode_dict: barcode dictionary of sample :return: """ _other_reads = list() - # Compile regex patterns - flank_regex = re.compile("(GCTCATGCACTTGATTCC){e<=1}([ATGC]{18})(GACTTGACCTGGATGTCT){e<=1}") - barcode_regex = dict() - for b in barcode_dict: - barcode_regex[b] = re.compile("(%s){e<=1}" % b) + flank_regex, barcode_regex = compile_regex_patterns(barcode_dict, runner) # Open sequence file with screed.open(seq_file) as reads: n_reads = 0 for read in reads: try: - putative_barcode = re.search(flank_regex, read.sequence)[2] + putative_barcode = re.search(runner.pattern, read.sequence)[2] for known_barcode in barcode_regex: if re.search(barcode_regex[known_barcode], putative_barcode): barcode_dict[known_barcode]["count"] += 1 @@ -56,10 +55,51 @@ def count_barcodes(seq_file, barcode_dict) -> None: _other_reads = barcode_dict['_other']['count'] logger.info(f"For {seq_file}, {matched_reads} of " - f"{n_reads} ({round((matched_reads/n_reads) * 100, 2)}%) matched known barcodes.") - logger.info(f"Reads without barcode match: {_other_reads} ({round((_other_reads/n_reads)*100, 2)}%) for {seq_file}") + f"{n_reads} ({round((matched_reads / n_reads) * 100, 2)}%) matched known barcodes.") + logger.info( + f"Reads without barcode match: {_other_reads} ({round((_other_reads / n_reads) * 100, 2)}%) for {seq_file}") return +def count_barcodes_reference_free(seq_file, runner): + """ + Count barcode frequency in sequence file. + Returns a DataFrame object + + :param seq_file: file with reads + :param runner: Run object + :return barcode_counts: + """ + # Get regex pattern + # flank_regex = compile_regex_patterns(runner=runner) + # Search for barcodes in sequence file + # Barcode count + barcode_counts = dict() + barcode_counts["no_barcode_found"] = 0 + n_reads = 0 + with screed.open(seq_file) as reads: + for read in reads: + match = re.search(runner.pattern, read.sequence) + if match: + # Add to dictionary and count + barcode = match.groups(0)[1] + if barcode not in barcode_counts.keys(): + barcode_counts[barcode] = 0 + barcode_counts[barcode] += 1 + else: + barcode_counts["no_barcode_found"] += 1 + n_reads += 1 + + # Calculate matched reads + good_barcode_count = sum([barcode_counts[s] for s in barcode_counts if s != "no_barcode_found"]) + _other_reads = barcode_counts['no_barcode_found'] + + logger.info(f"For {seq_file}, {good_barcode_count} of " + f"{n_reads} ({round((good_barcode_count / n_reads) * 100, 2)}%) are good barcodes.") + logger.info( + f"Reads without barcode match: {_other_reads} ({round((_other_reads / n_reads) * 100, 2)}%) for {seq_file}") + return barcode_counts, good_barcode_count + + if __name__ == '__main__': pass diff --git a/barseq/tests/data/input/barcodes.csv b/barseq/tests/data/input/barcodes.csv new file mode 100644 index 0000000..7e5a3fb --- /dev/null +++ b/barseq/tests/data/input/barcodes.csv @@ -0,0 +1,13 @@ +Barcode,Gene +ATGAAGACTGTTGCCGTA,bar1 +CACGACGCCCTCCGCGGA,bar2 +ACTATTACGCAAAATAAT,bar3 +ATGGAAGATATTATTATT,bar4 +CCTCTCCAACCGGGTCTG,bar5 +CCCGGTCGCCTAGCCCCG,bar6 +GGCCCCCCGCCCGTCCCC,bar7 +GGATCACTGCTAGCGTAT,bar8 +CCTGCAGCAGCGGCCCGC,bar9 +ACACATGCAGACATAGAG,bar10 +CGCGCCATCCGCCGCCCA,bar11 +AATATTCAGATGGGACGT,bar12 \ No newline at end of file diff --git a/barseq/tests/data/input/samples.csv b/barseq/tests/data/input/samples.csv index 7e5a3fb..e52268b 100644 --- a/barseq/tests/data/input/samples.csv +++ b/barseq/tests/data/input/samples.csv @@ -1,13 +1,5 @@ -Barcode,Gene -ATGAAGACTGTTGCCGTA,bar1 -CACGACGCCCTCCGCGGA,bar2 -ACTATTACGCAAAATAAT,bar3 -ATGGAAGATATTATTATT,bar4 -CCTCTCCAACCGGGTCTG,bar5 -CCCGGTCGCCTAGCCCCG,bar6 -GGCCCCCCGCCCGTCCCC,bar7 -GGATCACTGCTAGCGTAT,bar8 -CCTGCAGCAGCGGCCCGC,bar9 -ACACATGCAGACATAGAG,bar10 -CGCGCCATCCGCCGCCCA,bar11 -AATATTCAGATGGGACGT,bar12 \ No newline at end of file +ID,Sample_descriptor +data1,sample1 +data2,sample2 +data3,sample3 +Undetermined,Undetermined \ No newline at end of file diff --git a/barseq/tests/data/input/samples_ref_free.csv b/barseq/tests/data/input/samples_ref_free.csv new file mode 100644 index 0000000..fd9e1ae --- /dev/null +++ b/barseq/tests/data/input/samples_ref_free.csv @@ -0,0 +1,4 @@ +ID,Sample_descriptor +S001,WT-bar_1 +S002,WT-bar_2 +S003,WT-bar_3 \ No newline at end of file diff --git a/barseq/tests/data/input/sequences/data1.fastq b/barseq/tests/data/input/sequences/data1_R1.fastq similarity index 100% rename from barseq/tests/data/input/sequences/data1.fastq rename to barseq/tests/data/input/sequences/data1_R1.fastq diff --git a/barseq/tests/data/input/sequences/data2.fastq b/barseq/tests/data/input/sequences/data2_R1.fastq similarity index 100% rename from barseq/tests/data/input/sequences/data2.fastq rename to barseq/tests/data/input/sequences/data2_R1.fastq diff --git a/barseq/tests/data/input/sequences/data3.fastq b/barseq/tests/data/input/sequences/data3_R1.fastq similarity index 100% rename from barseq/tests/data/input/sequences/data3.fastq rename to barseq/tests/data/input/sequences/data3_R1.fastq diff --git a/barseq/tests/data/input/sequences/data4_R2.fastq b/barseq/tests/data/input/sequences/data4_R2.fastq new file mode 100644 index 0000000..e69de29 diff --git a/barseq/tests/data/input/sequences/reference-free/S001_S1_L001_R1_001.fastq.gz b/barseq/tests/data/input/sequences/reference-free/S001_S1_L001_R1_001.fastq.gz new file mode 100644 index 0000000..38b22b3 Binary files /dev/null and b/barseq/tests/data/input/sequences/reference-free/S001_S1_L001_R1_001.fastq.gz differ diff --git a/barseq/tests/data/input/sequences/reference-free/S002_S2_L001_R1_001.fastq.gz b/barseq/tests/data/input/sequences/reference-free/S002_S2_L001_R1_001.fastq.gz new file mode 100644 index 0000000..9ab9933 Binary files /dev/null and b/barseq/tests/data/input/sequences/reference-free/S002_S2_L001_R1_001.fastq.gz differ diff --git a/barseq/tests/data/input/sequences/reference-free/S003_S3_L001_R1_001.fastq.gz b/barseq/tests/data/input/sequences/reference-free/S003_S3_L001_R1_001.fastq.gz new file mode 100644 index 0000000..b975199 Binary files /dev/null and b/barseq/tests/data/input/sequences/reference-free/S003_S3_L001_R1_001.fastq.gz differ diff --git a/barseq/tests/data/output/barcode_counts_table.csv b/barseq/tests/data/output/barcode_counts_table.csv deleted file mode 100644 index ca397f7..0000000 --- a/barseq/tests/data/output/barcode_counts_table.csv +++ /dev/null @@ -1,14 +0,0 @@ -Gene,Barcodes,data1,data2,data3 -bar1,ATGAAGACTGTTGCCGTA,15,7,16 -bar2,CACGACGCCCTCCGCGGA,13,19,12 -bar3,ACTATTACGCAAAATAAT,11,10,13 -bar4,ATGGAAGATATTATTATT,2,7,5 -bar5,CCTCTCCAACCGGGTCTG,9,13,7 -bar6,CCCGGTCGCCTAGCCCCG,8,4,9 -bar7,GGCCCCCCGCCCGTCCCC,3,4,2 -bar8,GGATCACTGCTAGCGTAT,4,2,2 -bar9,CCTGCAGCAGCGGCCCGC,4,4,7 -bar10,ACACATGCAGACATAGAG,4,8,5 -bar11,CGCGCCATCCGCCGCCCA,4,9,3 -bar12,AATATTCAGATGGGACGT,9,10,17 -_other,_other,40,29,28 diff --git a/barseq/tests/data/output/barcode_counts_table.txt b/barseq/tests/data/output/barcode_counts_table.txt new file mode 100644 index 0000000..e4b2406 --- /dev/null +++ b/barseq/tests/data/output/barcode_counts_table.txt @@ -0,0 +1,14 @@ +Gene Barcodes sample1 sample2 sample3 +bar1 ATGAAGACTGTTGCCGTA 15 7 16 +bar2 CACGACGCCCTCCGCGGA 13 19 12 +bar3 ACTATTACGCAAAATAAT 11 10 13 +bar4 ATGGAAGATATTATTATT 2 7 5 +bar5 CCTCTCCAACCGGGTCTG 9 13 7 +bar6 CCCGGTCGCCTAGCCCCG 8 4 9 +bar7 GGCCCCCCGCCCGTCCCC 3 4 2 +bar8 GGATCACTGCTAGCGTAT 4 2 2 +bar9 CCTGCAGCAGCGGCCCGC 4 4 7 +bar10 ACACATGCAGACATAGAG 4 8 5 +bar11 CGCGCCATCCGCCGCCCA 4 9 3 +bar12 AATATTCAGATGGGACGT 9 10 17 +_other _other 40 29 28 diff --git a/barseq/tests/data/output/log.txt b/barseq/tests/data/output/log.txt index 28295e6..3314379 100644 --- a/barseq/tests/data/output/log.txt +++ b/barseq/tests/data/output/log.txt @@ -1,13 +1,15 @@ -2019-06-11 10:35 - INFO - main - ***** Starting barseq ***** -2019-06-11 10:35 - INFO - main - Reading in barcodes from samples.csv -2019-06-11 10:35 - INFO - main - Counting Barcodes in data1 -2019-06-11 10:35 - INFO - process_reads - For data1.fastq, 86 of 126 (68.25%) matched known barcodes. -2019-06-11 10:35 - INFO - process_reads - Reads without barcode match: 40 (31.75%) for data1.fastq -2019-06-11 10:35 - INFO - main - Counting Barcodes in data2 -2019-06-11 10:35 - INFO - process_reads - For data2.fastq, 97 of 126 (76.98%) matched known barcodes. -2019-06-11 10:35 - INFO - process_reads - Reads without barcode match: 29 (23.02%) for data2.fastq -2019-06-11 10:35 - INFO - main - Counting Barcodes in data3 -2019-06-11 10:35 - INFO - process_reads - For data3.fastq, 98 of 126 (77.78%) matched known barcodes. -2019-06-11 10:35 - INFO - process_reads - Reads without barcode match: 28 (22.22%) for data3.fastq -2019-06-11 10:35 - INFO - main - Writing results to results/barseq_pytest -2019-06-11 10:35 - INFO - main - ***** barseq is complete! ***** +2022-10-06 14:07 - INFO - main - ***** Starting barseq ***** +2022-10-06 14:07 - INFO - main - Reading in barcodes from barcodes.csv +2022-10-06 14:07 - INFO - main - Counting Barcodes in sample1 +2022-10-06 14:07 - INFO - process_reads - For tests/data/input/sequences/data1_R1.fastq, 86 of 126 (68.25%) matched known barcodes. +2022-10-06 14:07 - INFO - process_reads - Reads without barcode match: 40 (31.75%) for tests/data/input/sequences/data1_R1.fastq +2022-10-06 14:07 - INFO - main - Writing results to tests/dump/results/test_barcodes +2022-10-06 14:07 - INFO - main - Counting Barcodes in sample3 +2022-10-06 14:07 - INFO - process_reads - For tests/data/input/sequences/data3_R1.fastq, 98 of 126 (77.78%) matched known barcodes. +2022-10-06 14:07 - INFO - process_reads - Reads without barcode match: 28 (22.22%) for tests/data/input/sequences/data3_R1.fastq +2022-10-06 14:07 - INFO - main - Writing results to tests/dump/results/test_barcodes +2022-10-06 14:07 - INFO - main - Counting Barcodes in sample2 +2022-10-06 14:07 - INFO - process_reads - For tests/data/input/sequences/data2_R1.fastq, 97 of 126 (76.98%) matched known barcodes. +2022-10-06 14:07 - INFO - process_reads - Reads without barcode match: 29 (23.02%) for tests/data/input/sequences/data2_R1.fastq +2022-10-06 14:07 - INFO - main - Writing results to tests/dump/results/test_barcodes +2022-10-06 14:07 - INFO - main - ***** barseq is complete! ***** diff --git a/barseq/tests/test_functions.py b/barseq/tests/test_functions.py index 65e1337..b795c51 100644 --- a/barseq/tests/test_functions.py +++ b/barseq/tests/test_functions.py @@ -5,6 +5,7 @@ """ +from pathlib import Path # Module import from barseq.utils import read_barcodes, format_filename from barseq.main import Run @@ -14,7 +15,7 @@ def test_read_tab_delimited_barcodes(): - barcode_file = "barseq/tests/data/input/samples.csv" + barcode_file = "barseq/tests/data/input/barcodes.csv" output_d = {'ATGAAGACTGTTGCCGTA': {'count': 0, 'gene': 'bar1'}, 'CACGACGCCCTCCGCGGA': {'count': 0, 'gene': 'bar2'}, 'ACTATTACGCAAAATAAT': {'count': 0, 'gene': 'bar3'}, @@ -33,9 +34,9 @@ def test_read_tab_delimited_barcodes(): def test_format_filename(): - assert "weird_file-name_" == format_filename("weird:_file-%$name_='][") - assert "weird_file-name_" == format_filename(" weird:_file-%$name_='][") - assert "weird_file-name_" == format_filename("weird:_file-%$name_='][ ") + assert "weird_file-name_" == format_filename(Path("weird:_file-%$name_='][")) + assert "weird_file-name_" == format_filename(Path(" weird:_file-%$name_='][")) + assert "weird_file-name_" == format_filename(Path("weird:_file-%$name_='][ ")) # def test_Run_class(): diff --git a/barseq/tests/test_main.py b/barseq/tests/test_main.py index 9a981d3..4cf1e82 100644 --- a/barseq/tests/test_main.py +++ b/barseq/tests/test_main.py @@ -8,7 +8,8 @@ import subprocess import filecmp import os -import pathlib + +import pandas as pd # Module import from .test_setup import temp_dir @@ -21,7 +22,8 @@ def test_barseq(temp_dir): # Create testing paths input_sequence = temp_dir(["data", "input", "sequences"]) - input_barcodes = temp_dir(["data", "input", "samples.csv"]) + input_barcodes = temp_dir(["data", "input", "barcodes.csv"]) + input_samples = temp_dir(["data", "input", "samples.csv"]) expected_output = temp_dir(["data", "output"]) test_output = temp_dir(["results", "barseq_pytest"]) experiment = "barseq_pytest" @@ -29,23 +31,28 @@ def test_barseq(temp_dir): # Move into temp_dir os.chdir(temp_dir()) # Call main with test data + # TODO: Maybe instead of calling barseq from installation, use local script path subprocess.call(["barseq", "-i", input_sequence, "-b", input_barcodes, - "-e", experiment]) + "-e", experiment, + "-s", input_samples] + ) # Check output assert filecmp.dircmp(expected_output, test_output) - assert filecmp.cmp(expected_output.joinpath("barcode_counts_table.csv"), - test_output.joinpath("barcode_counts_table.csv")) + assert filecmp.cmp(expected_output.joinpath("barcode_counts_table.txt"), + test_output.joinpath("barcode_counts_table.txt")) + + # Check log files - cmp_log_file = expected_output.joinpath("log.txt") - test_log_file = test_output.joinpath("log.txt") - with open(cmp_log_file) as cmp_log, open(test_log_file) as test_log: - for cmp, test in zip(cmp_log, test_log): - test_line = "-".join(test.strip().split("-")[3:]) - cmp_line = "-".join(cmp.strip().split("-")[3:]) - - assert cmp_line == test_line + # cmp_log_file = expected_output.joinpath("log.txt") + # test_log_file = test_output.joinpath("log.txt") + # with open(cmp_log_file) as cmp_log, open(test_log_file) as test_log: + # for cmp, test in zip(cmp_log, test_log): + # test_line = "-".join(test.strip().split("-")[3:]) + # cmp_line = "-".join(cmp.strip().split("-")[3:]) + # + # assert cmp_line == test_line diff --git a/barseq/utils.py b/barseq/utils.py index 6bbab7f..d7233bb 100644 --- a/barseq/utils.py +++ b/barseq/utils.py @@ -5,11 +5,12 @@ """ -import csv -import pandas as pd import re -import logging +import csv import sys +import shutil +import logging +import pandas as pd from pathlib import Path __author__ = "Emanuel Burgos" @@ -58,7 +59,7 @@ def read_barcodes(barcodes_file: Path) -> dict: return barcode_dict -def write_output(sample_dict: dict, barcode_dict: dict, runner) -> None: +def write_output(sample_dict: dict, runner) -> None: """ Convert results file into a pandas dataframe with following structure. @@ -87,19 +88,38 @@ def write_output(sample_dict: dict, barcode_dict: dict, runner) -> None: sample_df = pd.DataFrame.from_dict(counts, orient="index", columns=[sample]) df = pd.concat([df, sample_df], axis=1) # Write to output - df.to_csv(f"{runner.path}/barcode_counts_table.csv") + df.to_csv(f"{runner.path}/barcode_counts_table.txt", sep='\t') return -def format_filename(name: str) -> str: +def compile_regex_patterns(barcode_dict=None, runner=None): + """ + Compies regex patterns for matching sequences in reads. + If no barcode_dict is provided then just return flank_regex + Flanking_regex -> finds flanking sequence and random 18bp barcode + Barcode_regex -> compiles each reference barcode into regex object + """ + # Regex pattern: match flanking sequence and random 18-bp barcode + flank_regex = re.compile(runner.pattern) + barcode_regex = dict() + if barcode_dict: + # Compile each barcode in dictionary as regex object + for b in barcode_dict: + barcode_regex[b] = "(%s){e<=1}" % b + return flank_regex, barcode_regex + else: + return flank_regex + + +def format_filename(path: Path) -> str: """ Converts input into a valid name for file and recording results - :param name: Input name + :param path: Filepath :return output: Formatted name """ # Strip extension - name = "".join(name.split(".")[0]) + name = "".join(path.stem.split(".")[0]) # Taken from Pyinseq software return re.sub(r"(?u)[^-\w]", "", name.strip().replace(" ", "_")) @@ -112,15 +132,90 @@ def make_barseq_directories(runner) -> None: :param runner: Run object given in main :return: """ - error_message =f"Barseq Error: {runner.experiment} directory already exists. Delete or rename {runner.experiment}, or provide new name for barseq run." - results_folder = Path("results", runner.experiment) - if results_folder.is_dir(): - logger.error(error_message) + if runner.force: + logger.info(f"--force option is True, {runner.path} will be overwritten") + shutil.rmtree(runner.path, ignore_errors=True) + elif runner.path.exists(): + logger.error(f"Barseq Error: {runner.experiment} directory already exists. " + f"Delete or rename {runner.experiment}, or provide new name for " + f"barseq run.") sys.exit(1) - results_folder.mkdir(parents=True) - runner.path = results_folder + + runner.path.mkdir(parents=True) return +def calculate_barcode_perc(barcode_dict, total_barcodes=0, min_count=20): + """ + Calculate total number of `unique` barcodes and percentage. + Return modified dictionary + + :param barcode_counts: dictionary where key=sequence, value=counts + :return: barcode_stats + """ + barcode_stats = {} + for seq, count in barcode_dict.items(): + if count >= min_count and seq != 'no_barcode_found': + seq_perc = round(count / total_barcodes * 100, 2) + barcode_stats[seq] = [count, seq_perc] + # quick sorting to ensure order + barcode_stats = {k: v for k, v in sorted(barcode_stats.items(), key=lambda x: x[1][0], reverse=True)} + return barcode_stats + + +def read_sample_ID(sample_file): + """ + Reads csv file where rows = [seq_tag, sample_name] + Returns a dictionary where key=seq_tag, value=sample_name + + :params sample_file: path to sample csv file + :returns sample_ids: dictionary + """ + sample_ids = {} + with open(sample_file, 'r') as f: + reader = csv.reader(f) + # skip header + next(reader) + for seq_tag, sample_name in reader: + sample_ids[seq_tag] = sample_name + return sample_ids + + +def format_sample_data(barcode_dict, barcode_count=0, sample_id=None): + """ + Grabs each unique barcode and formats into row for pandas + :param barcode_dict: dict where key=sample + """ + i = 1 + data = {'Sample_ID': sample_id, 'Total Barcode Count': barcode_count} + for seq, values in barcode_dict.items(): + barcode_column = f"{i}° Barcode" + count_column = f"{i}° Count" + perc_column = f"{i}° %" + + data[barcode_column] = seq + data[count_column] = values[0] + data[perc_column] = values[1] + i += 1 + return data + + +def convert_dict_to_dataframe(dict_collection): + df_collection = [] + for data in dict_collection: + df_collection.append(pd.DataFrame(data, index=[0])) + return pd.concat(df_collection) + + +def save_results(data_collection, runner): + print(data_collection) + df = convert_dict_to_dataframe(data_collection) + df.set_index('Sample_ID', inplace=True) + df.to_csv(f'{runner.path}/barcode_counts_table.csv') + # Filter for samples that have less than 75% on 1st barcode percentage + df_less_75 = df[df['1° %'] <= 75] + df_less_75.to_csv(f'{runner.path}/barcode_counts_less_75.csv') + + if __name__ == '__main__': pass diff --git a/bin/barseq b/bin/barseq index 2caafc3..8bbe4c7 100644 --- a/bin/barseq +++ b/bin/barseq @@ -8,6 +8,7 @@ Bin executable fro command line usage import sys import argparse import logging +from pathlib import Path # Module import from barseq.main import main @@ -28,9 +29,23 @@ def parse_args(args): parser.add_argument("-i", "--input", help="Directory where fastq/fastq.gz files are located.") - parser.add_argument("-b", "--barcodes", help="CSV file with barcodes for genes") + parser.add_argument("-b", "--barcodes", help="CSV file with barcodes for genes", default='') - parser.add_argument("-e", "--experiment", help="Name for experiment") + parser.add_argument("-e", "--experiment", help="Name for experiment", default='barseq-run') + + parser.add_argument('-f', '--force', help="If results/ exists, overwrite it", action="store_true", default=False) + + parser.add_argument("-s", "--samples", help='', default=None) + + parser.add_argument("-o", "--output", help='Path to store output files', default=Path('./')) + + parser.add_argument("--reference-free", help="Infer barcodes sequences from reads. " + "If True, provide a table for sample names.", action="store_true", default=False) + + parser.add_argument("--min-count", help="Minimum barcode counts required to be considered", default=20) + parser.add_argument('-fr', '--flanking-right', default='GACTTGACCTGGATGTCT', help='') + parser.add_argument('-fl', '--flanking-left', default='GCTCATGCACTTGATTCC', help='') + parser.add_argument('--barcode-length', default=18, help='Length of barcode sequence') # Process arguments given args = parser.parse_args(args)