From 561fe7518ebc7f5bdff35c22ff1ad4e58d3dfd74 Mon Sep 17 00:00:00 2001 From: Gabriela Mafra Date: Mon, 20 Nov 2023 10:22:28 +0000 Subject: [PATCH 01/15] updated rule filter (in merge_samples) so it starts from the file with the most samples --- .../workflow/rules/merge_samples_vcf.smk | 25 +++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/Snakemake/workflow/rules/merge_samples_vcf.smk b/Snakemake/workflow/rules/merge_samples_vcf.smk index 40ad916..1fcc7be 100644 --- a/Snakemake/workflow/rules/merge_samples_vcf.smk +++ b/Snakemake/workflow/rules/merge_samples_vcf.smk @@ -25,18 +25,29 @@ if len(allFiles) != 1: expand('/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.{ext}', file=allFiles, ext=['gz', 'gz.csi'])]) params: duplicated = '{chromosome}.ids' - # conda: 'bcftools' resources: cpus=1, mem_mb=64000, time_min=60 run: import os + + for file in input: + shell("wc -l {file} >> files_len.txt") + + shell("sort -s -nr -k 1,1 files_len.txt | cut -d' ' -f 2 > files_reorderd.txt") + + with open('files_reorderd.txt', 'r') as file: + files_reord = [] + for line in file: + line = line.strip() + files_reord.append(line) + filtered = [] - for i in range(len(input)): + for i in range(len(files_reord)): print(i) - for j, file in enumerate(input): - if file != input[i] and j > i: - file1 = input[i] - file2 = input[j] + for j, file in enumerate(files_reord): + if file != files_reord[i] and j > i: + file1 = files_reord[i] + file2 = files_reord[j] print(file1, file2) shell("comm -12 <(sort {file1}) <(sort {file2}) > {params.duplicated}") @@ -57,7 +68,7 @@ if len(allFiles) != 1: shell('bcftools index -f {file2}.filtered.vcf.gz') shell('rm {params.duplicated}') - for file in input: + for file in files_reord: if file not in filtered: file1 = file.split('.')[0] print(f'file {file1} not filtered') From fd7503d3f1f45f01289b23ae7475fe85b7449228 Mon Sep 17 00:00:00 2001 From: Gabriela Mafra Date: Fri, 12 Apr 2024 11:26:54 +0100 Subject: [PATCH 02/15] changed rule all input to be the simplified tree --- Snakemake/workflow/Snakefile | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/Snakemake/workflow/Snakefile b/Snakemake/workflow/Snakefile index 6389682..d22892e 100644 --- a/Snakemake/workflow/Snakefile +++ b/Snakemake/workflow/Snakefile @@ -11,8 +11,8 @@ from os.path import isfile, join ##### setup report ##### -configfile: '../config/ancestral_Eddie.yaml' -configfile: '../config/tsinfer_Eddie.yaml' +#configfile: '../config/ancestral_Eddie.yaml' +configfile: '../config/glob_cattle.yaml' report: "reports/tsinfer.rst" ##### setup singularity ##### @@ -37,22 +37,23 @@ vcfdir=config['vcf_dir'] # from config file, the path to vcf # the final directory name is now editable -- so can be named according to specific project # mapdir indicates where to find the genetic map for phasing (full path in the config) Project = config['PROJECT'] +projdir = config['work_dir'] + '/' + Project mapdir = config['genmap'] # if want to split multiallelic sites set to TRUE: #multiallelic = False -chromosomes = [f'Chr{n}' for n in range(1, config['no_chromosomes'] + 1)] +chromosomes = [f'chr{n}' for n in range(1, config['no_chromosomes'] + 1)] # set a list of 'Chr' + chr# combinations to be used as wildcards ##### load rules ##### -include: "rules/ancestral_inference.smk" +#include: "rules/ancestral_inference.smk" include: "rules/split.smk" +include: "rules/filter.smk" include: "rules/merge_samples_vcf.smk" include: "rules/phasing.smk" -include: 
"rules/prepare_files_for_tsinfer.smk" -#include: "rules/multiallelic.smk" -include: "rules/infer_trees.smk" +include: "rules/ancestral_info_to_vcf.smk" +include: "rules/new_infer_trees.smk" ##### target rules ##### # At the end it should generate a phased vcf file that combines all files for @@ -60,5 +61,5 @@ include: "rules/infer_trees.smk" # configuration file. It also checks that vcfs have been filtered before merged rule all: input: - expand(f'../{Project}/Tsinfer/trees/{{chromosome}}.trees', chromosome=chromosomes) + expand(f'{projdir}/Tsinfer/trees/{{chromosome}}.simplified.trees', chromosome=chromosomes) # tree inference From f8fa59c7404842f92fc3f648cd263a53aa00e454 Mon Sep 17 00:00:00 2001 From: Gabriela Mafra Date: Fri, 12 Apr 2024 11:28:34 +0100 Subject: [PATCH 03/15] no changes --- Snakemake/workflow/rules/split.smk | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/Snakemake/workflow/rules/split.smk b/Snakemake/workflow/rules/split.smk index 5c45b84..139bc49 100644 --- a/Snakemake/workflow/rules/split.smk +++ b/Snakemake/workflow/rules/split.smk @@ -2,9 +2,11 @@ splitFiles = list(set([f.strip("vcf.gz").split("_")[1] for f in os.listdir(f'{vcfdir}/RawVCF') if (f.endswith("vcf.gz") and (f.startswith("Chr") | f.startswith("chr")) )])) + combinedFiles = list(set([f.strip("vcf.gz").split("_")[1] for f in os.listdir(f'{vcfdir}/RawVCF') if (f.endswith("vcf.gz") and f.startswith("Combined"))])) + allFiles = splitFiles + combinedFiles if len(splitFiles) > 0: @@ -21,13 +23,8 @@ if len(splitFiles) > 0: log: 'logs/Move_vcf_{chromosome}_{suffixOne}.log' shell: """ - if [ -h {input.vcf} ]; then - ln -s $( realpath {input.vcf} ) {output.vcf} - ln -s $( realpath {input.vcf} ).csi {output.vcf} - else - ln -s {input.vcf} {output.vcf} - ln -s {input.idx} {output.idx} - fi + ln -s {input.vcf} {output.vcf} + ln -s {input.idx} {output.idx} """ if len(combinedFiles) > 0: From cfa7c01f0b9eaa10e32dc4bc894160ae3056051d Mon Sep 17 00:00:00 2001 From: Gabriela Mafra Date: Fri, 12 Apr 2024 11:29:41 +0100 Subject: [PATCH 04/15] now filtering overlapping samples from vcfs in one step --- Snakemake/workflow/rules/filter.smk | 186 ++++++++++++++++++++++++++++ 1 file changed, 186 insertions(+) create mode 100644 Snakemake/workflow/rules/filter.smk diff --git a/Snakemake/workflow/rules/filter.smk b/Snakemake/workflow/rules/filter.smk new file mode 100644 index 0000000..99ca98f --- /dev/null +++ b/Snakemake/workflow/rules/filter.smk @@ -0,0 +1,186 @@ +if len(allFiles) != 1: + rule compare_and_filter_vcfs: + input: + [f'{vcfdir}' + x + for x in expand('/{{chromosome}}/{{chromosome}}_{file}.vcf.gz', + file=allFiles)] + output: + [f'{vcfdir}' + x + for x in expand('/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.gz', + file=allFiles)] + params: + file_names = allFiles, + path = f'{vcfdir}/{{chromosome}}/{{chromosome}}' + run: + import os + import itertools + import subprocess + from glob import glob + + path_names = [f'{params.path}/{name}' for name in params.file_names] + pairs = list(itertools.combinations(path_names, 2)) + + for i, pair in enumerate(pairs): + print(f'pair {i}') + # for each pair determine the extension of the vcf files + _0, ext0 = [os.path.splitext(file) for file in glob(f'{pair[0]}*') + if os.path.splitext(file)[1] not in ['.csi', '.txt']][0] + + _1, ext1 = [os.path.splitext(file) for file in glob(f'{pair[1]}*') + if os.path.splitext(file)[1] not in ['.csi', '.txt']][0] + + # Run vcftools to find if there are overlapping samples between pairs + command = f""" + 
module load anaconda + conda activate bcftools + vcftools --gzvcf {os.path.join(pair[0] + ext0)} \ + --gzdiff {os.path.join(pair[1] + ext1)} \ + --diff-indv + """ + subprocess.run(command, shell=True, check=True, cwd=path) + + # if vcftools find any overlapping samples, remove them from the second vcf file + if os.path.exists(os.path.join(path, 'out.diff.indv_in_files')): + print(f'Removing samples from {pair[1]} and renaming {pair[0]}') + command = f""" + grep B out.diff.indv_in_files | cut -f1 > samples.remove + rm out.diff.indv_in_files + """ + subprocess.run(command, shell=True, check=True, cwd=path) + command = f""" + module load anaconda + conda activate bcftools + bcftools view -S ^samples.remove {os.path.join(pair[1] + '.vcf' + ext1)} \ + -Oz -o {os.path.join(pair[1] + new_ext)} + rm samples.remove + """ + subprocess.run(command, shell=True, check=True, cwd=path) + + # if there are no overlapping samples, + # just rename the files that still have the original extension + else: + print('No overlapping samples') + if 'filtered' not in _0: + print('Rename') + command = f""" + mv {os.path.join(pair[0] + '.vcf' + ext0)} {os.path.join(pair[0] + new_ext)} + """ + subprocess.run(command, shell=True, check=True, cwd=path) + if 'filtered' not in _1: + print('Rename') + command = f""" + mv {os.path.join(pair[1] + '.vcf' + ext1)} {os.path.join(pair[1] + new_ext)} + """ + subprocess.run(command, shell=True, check=True, cwd=path) + + # rule get_samples: + # input: + # f'{vcfdir}/{{chromosome}}/{{chromosome}}_{{file}}.vcf.gz' + # output: temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_{{file}}.txt') + # conda: "bcftools" + # threads: 1 + # resources: cpus=1, mem_mb=32000, time_min=30 + # log: 'logs/Get_samples_{chromosome}_{file}.log' + # shell: + # """ + # bcftools query -l {input} > {output} + # """ + + # rule filter: + # input: + # [f'{vcfdir}' + x + # for x in expand('/{{chromosome}}/{{chromosome}}_{file}.txt', + # file=allFiles)], + # output: + # temp([f'{vcfdir}' + x for x in + # expand('/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.{ext}', + # file=allFiles, ext=['gz', 'gz.csi'])]) + # params: + # duplicated = '{chromosome}.ids', + # files_len = '{chromosome}_len.txt', + # files_reo = '{chromosome}_reorderd.txt', + # resources: cpus=1, mem_mb=64000, time_min=60 + # run: + # import os + + # for file in input: + # shell("wc -l {file} >> {params.files_len}") + + # shell("sort -us -nr {params.files_len} | cut -d' ' -f 2 > {params.files_reo}") + + # with open(params.files_reo, 'r') as file: + # files_reord = [] + # for line in file: + # line = line.strip() + # files_reord.append(line) + + # filtered = [] + + # for i in range(len(files_reord)): + # print(i) + # for j, file in enumerate(files_reord): + # if file != files_reord[i] and j > i: + # file1 = files_reord[i] + # file2 = files_reord[j] + # print(file1, file2) + + # shell("comm -12 <(sort {file1}) <(sort {file2}) > {params.duplicated}") + + # remove = params.duplicated + # lremove = os.stat(f'{remove}').st_size + + # if lremove == 0: + # print(f'Nothing to remove {lremove}') + # else: + # print(f'{lremove} samples to remove') + # if file2 not in filtered: + # filtered.append(file2) + + # file2 = file2.split('.')[0] + # print(f'file {file2} filtered') + # shell('bcftools view -S ^{params.duplicated} {file2}.vcf.gz -O z -o {file2}.filtered.vcf.gz') + # #shell('gatk SelectVariants -R {params.refgen} -V {file2} -xl-sn -O {file2}.filtered.vcf.gz') + # shell('bcftools index -f {file2}.filtered.vcf.gz') + # shell('rm {params.duplicated} 
{file2}.vcf.gz {file2}.vcf.gz.csi') + + # for file in files_reord: + # if file not in filtered: + # file1 = file.split('.')[0] # file does not have extensions + # print(f'file {file1} renamed') + # shell('mv {file1}.vcf.gz {file1}.filtered.vcf.gz') + # shell('mv {file1}.vcf.gz.csi {file1}.filtered.vcf.gz.csi') + # #shell('scripts/create_symlinks.sh {file1}') + # shell('rm {params}') + +else: + rule rename: + input: + vcf = [[f'{vcfdir}' + x + for x in expand('/{{chromosome}}/{{chromosome}}_{suffixOne}.vcf.gz', + suffixOne = splitFiles)] if len(splitFiles) != 0 else [], + [f'{vcfdir}' + x + for x in expand('/{{chromosome}}/{{chromosome}}_{suffixTwo}.vcf.gz', + suffixTwo = combinedFiles)] if len(combinedFiles) != 0 else []], + idx = [[f'{vcfdir}' + x + for x in expand('/{{chromosome}}/{{chromosome}}_{suffixOne}.vcf.gz.csi', + suffixOne = splitFiles)] if len(splitFiles) != 0 else [], + [f'{vcfdir}' + x + for x in expand('/{{chromosome}}/{{chromosome}}_{suffixTwo}.vcf.gz.csi', + suffixTwo = combinedFiles)] if len(combinedFiles) != 0 else []], + output: + vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_{{file}}.filtered.vcf.gz', + idx = f'{vcfdir}/{{chromosome}}/{{chromosome}}_{{file}}.filtered.vcf.gz.csi' + conda: "bcftools" + log: 'logs/Rename_{chromosome}.log' + resources: cpus=1, mem_mb=32000, time_min=30 + shell: + """ + if [ -h {input} ]; then + ln -s $( realpath {input.vcf} ) {output.vcf} + ln -s $( realpath {input.idx} ) {output.idx} + else + ln -s {input.vcf} {output.vcf} + ln -s {input.idx} {output.idx} + fi + """ + From 4c318c32178aa5e7204d5a16379de9ece902207c Mon Sep 17 00:00:00 2001 From: Gabriela Mafra Date: Fri, 12 Apr 2024 11:30:40 +0100 Subject: [PATCH 05/15] simplified rule --- .../workflow/rules/merge_samples_vcf.smk | 169 +++++++----------- 1 file changed, 61 insertions(+), 108 deletions(-) diff --git a/Snakemake/workflow/rules/merge_samples_vcf.smk b/Snakemake/workflow/rules/merge_samples_vcf.smk index 1fcc7be..86cbc2c 100644 --- a/Snakemake/workflow/rules/merge_samples_vcf.smk +++ b/Snakemake/workflow/rules/merge_samples_vcf.smk @@ -1,119 +1,72 @@ -import yaml - - if len(allFiles) != 1: - rule get_samples: - input: - f'{vcfdir}/{{chromosome}}/{{chromosome}}_{{file}}.vcf.gz' - output: temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_{{file}}.txt') - conda: "bcftools" - threads: 1 - resources: cpus=1, mem_mb=32000, time_min=30 - log: 'logs/Get_samples_{chromosome}_{file}.log' - shell: - """ - bcftools query -l {input} > {output} - """ - - rule filter: - input: - [f'{vcfdir}' + x - for x in expand('/{{chromosome}}/{{chromosome}}_{file}.txt', - file=allFiles)], - output: - temp([f'{vcfdir}' + x for x in - expand('/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.{ext}', file=allFiles, ext=['gz', 'gz.csi'])]) - params: - duplicated = '{chromosome}.ids' - resources: cpus=1, mem_mb=64000, time_min=60 - run: - import os - - for file in input: - shell("wc -l {file} >> files_len.txt") - - shell("sort -s -nr -k 1,1 files_len.txt | cut -d' ' -f 2 > files_reorderd.txt") - - with open('files_reorderd.txt', 'r') as file: - files_reord = [] - for line in file: - line = line.strip() - files_reord.append(line) - - filtered = [] - - for i in range(len(files_reord)): - print(i) - for j, file in enumerate(files_reord): - if file != files_reord[i] and j > i: - file1 = files_reord[i] - file2 = files_reord[j] - print(file1, file2) - - shell("comm -12 <(sort {file1}) <(sort {file2}) > {params.duplicated}") - - remove = params.duplicated - lremove = os.stat(f'{remove}').st_size - - if lremove == 0: - 
print(f'Nothing to remove {lremove}') - else: - print(f'{lremove} samples to remove') - if file2 not in filtered: - filtered.append(file2) - - file2 = file2.split('.')[0] - print(f'file {file2} filtered') - shell('bcftools view -S ^{params.duplicated} {file2}.vcf.gz -O z -o {file2}.filtered.vcf.gz') - shell('bcftools index -f {file2}.filtered.vcf.gz') - shell('rm {params.duplicated}') - - for file in files_reord: - if file not in filtered: - file1 = file.split('.')[0] - print(f'file {file1} not filtered') - shell("cp {file1}.vcf.gz {file1}.filtered.vcf.gz && bcftools index -f {file1}.filtered.vcf.gz") - shell('rm {params.duplicated}') - rule merge: input: - [f'{vcfdir}' + x for x in expand('/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.gz', file=allFiles)] + vcf = [f'{vcfdir}' + x for x in expand('/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.gz', file=allFiles)], + idx = [f'{vcfdir}' + x for x in expand('/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.gz.csi', file=allFiles)], output: - vcf=temp(f'{vcfdir}/{{chromosome}}_final.vcf.gz'), - index=temp(f'{vcfdir}/{{chromosome}}_final.vcf.gz.csi') + vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.vcf.gz', + params: + files_path = f'{vcfdir}/{{chromosome}}/' conda: "bcftools" - threads: 1 - resources: cpus=1, mem_mb=32000, time_min=60 - log: 'logs/Merge_{chromosome}.log' + threads: 32 + resources: cpus=32, mem_mb=2048000, time_min=1200 + benchmark: 'benchmarks/{chromosome}.merge.benchmark.txt' shell: - """ - bcftools merge {input} -O z -o {output.vcf} - bcftools index -f {output.vcf} + r""" + printf '%s\n' {params.files_path}*.gz > files.merge + + bcftools merge \ + -m snps \ + -l files.merge \ + --threads {threads} -O z -o {output.vcf} """ -else: - rule rename: - input: - [f'{vcfdir}' + x - for x in expand('/{{chromosome}}/{{chromosome}}_{suffixOne}.vcf.gz', - suffixOne = splitFiles)] if len(splitFiles) != 0 else [], - [f'{vcfdir}' + x - for x in expand('/{{chromosome}}/{{chromosome}}_{suffixTwo}.vcf.gz', - suffixTwo = combinedFiles)] if len(combinedFiles) != 0 else [] - output: - vcf = (f'{vcfdir}/{{chromosome}}_final.vcf.gz'), - idx = (f'{vcfdir}/{{chromosome}}_final.vcf.gz.csi') - conda: "bcftools" - log: 'logs/Rename_{chromosome}.log' - resources: cpus=1, mem_mb=32000, time_min=30 + rule index_merged: + input: f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.vcf.gz', + output: f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.vcf.gz.csi' + conda: 'bcftools' shell: + r""" + bcftools index -f {input} """ - if [ -h {input} ]; then - ln -s $( realpath {input} ) {output.vcf} - ln -s $( realpath {input} ).csi {output.vcf} - else - ln -s {input} {output.vcf} - ln -s {input}.csi {output.idx} - fi - """ +rule remove_missing_indels: + input: + vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.vcf.gz', + index = f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.vcf.gz.csi' + output: + vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.recode.vcf.gz', + index = f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.recode.vcf.gz.csi' + params: + prefix = f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged' + conda: 'bcftools' + shell: + r""" + vcftools --gzvcf {input.vcf} \ + --remove-indels \ + --max-missing 0.1 \ + --remove-filtered-all \ + --recode \ + --out {params.prefix} + + bgzip {params.prefix}.recode.vcf + bcftools index -f {output.vcf} + + rm {params.prefix}.recode.vcf + """ + +# rule summary: +# input: +# vcf_file = rules.filter_merged_vcf.output.vcf, +# index = rules.filter_merged_vcf.output.index, +# output: +# summary = 
f'{vcfdir}/{{chromosome}}/{{chromosome}}_summary.vchk' +# params: +# plots = f'{vcfdir}/{{chromosome}}/{{chromosome}}_summary/' +# conda: 'bcftools' +# benchmark: 'benchmarks/{chromosome}.summary.benchmark.txt' +# shell: +# r""" +# bcftools stats {input.vcf_file} > {output.summary} +# plot-vcfstats -p {params.plots} {output.summary} +# """ \ No newline at end of file From 4389bfb70660d1327be9747a0e9b1f0470d1d523 Mon Sep 17 00:00:00 2001 From: Gabriela Mafra Date: Fri, 12 Apr 2024 11:31:36 +0100 Subject: [PATCH 06/15] changed options after testing to improve accuracy --- Snakemake/workflow/rules/phasing.smk | 71 +++++++++++++++------------- 1 file changed, 38 insertions(+), 33 deletions(-) diff --git a/Snakemake/workflow/rules/phasing.smk b/Snakemake/workflow/rules/phasing.smk index 67e9f2f..53e59db 100644 --- a/Snakemake/workflow/rules/phasing.smk +++ b/Snakemake/workflow/rules/phasing.smk @@ -1,24 +1,17 @@ -# def getInputVcfFile_phasing(wildcards): -# if os.path.isfile(vcfdir + "/" + wildcards.chromosome + '_final.vcf.gz'): -# print("Final file found ") -# vcf_file = vcfdir + "/" + wildcards.chromosome + '_final.vcf.gz' -# print(vcf_file) -# else: -# print("No final file ") -# file = open(vcfdir + '/Vcf_file_' + wildcards.chromosome + '.txt') -# vcf_file = file.read().strip("\n") -# -# return(vcf_file) +file_to_phase = rules.remove_missing_indels.output.vcf +file_to_phase_idx = rules.remove_missing_indels.output.index +phased_output = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz' +phased_output_idx = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz.csi' if config['ploidy'] == 1: rule rename_phased: input: - vcf = f'{vcfdir}/{{chromosome}}_final.vcf.gz', - idx = f'{vcfdir}/{{chromosome}}_final.vcf.gz.csi' + vcf = file_to_phase, + idx = file_to_phase_idx, output: - vcf = f'{vcfdir}/{{chromosome}}_phased.vcf.gz', - idx = f'{vcfdir}/{{chromosome}}_phased.vcf.gz.csi' + vcf = temp(phased_output), + idx = temp(phased_output_idx) log: 'logs/Rename_phased_{chromosome}.log' resources: cpus=1, mem_mb=32000, time_min=60 shell: @@ -31,32 +24,44 @@ if config['ploidy'] == 1: ln -s {input.idx} {output.idx} fi """ - else: rule phase: input: - vcf = f'{vcfdir}/{{chromosome}}_final.vcf.gz', + vcf = file_to_phase, + idx = file_to_phase_idx, output: - file = f'{vcfdir}/{{chromosome}}_phased.vcf.gz', - idx = f'{vcfdir}/{{chromosome}}_phased.vcf.gz.csi' + vcf = phased_output, params: map = f'{mapdir}/{{chromosome}}.txt', - #log: 'logs/phase_{chromosome}.log' - threads: 25 -# resources: cpus=20, mem_mb=25000, time_min=5 + threads: 32 + resources: cpus = 32, mem_mb = 2048, time_min = 7200 conda: 'shapeit4am' - log: 'logs/Phase_{chromosome}.log' + benchmark: + 'benchmarks/{chromosome}.shapeit.benchmark.txt' shell: - """ + r""" str='{wildcards.chromosome}' chr=$(echo ${{str:3}}) - start=`date +%s` + shapeit4 --input {input.vcf} \ - --map {params.map} \ - --region ${{chr}} \ - --output {output} \ - --thread {threads} - end=`date +%s` - echo Execution time was `expr $end - $start` seconds > shapeit_{wildcards.chromosome}.time - bcftools index -f {output} - """ + --map {params.map} \ + --region $chr \ + --output {output.vcf} \ + --thread {threads} \ + --pbwt-depth 8 \ + --sequencing \ + --log {wildcards.chromosome}_phased.log + """ + + rule index_phased: + input: + rules.phase.output.vcf + output: + idx = phased_output_idx + conda: 'bcftools' + benchmark: + 'benchmarks/{chromosome}.bcftools_index.bechmark.txt' + shell: + r""" + bcftools index -f {input} + """ \ No newline at end of file From 
107192f77f0439ba1acd561c4f13482f8fe95584 Mon Sep 17 00:00:00 2001 From: Gabriela Mafra Date: Fri, 12 Apr 2024 11:32:49 +0100 Subject: [PATCH 07/15] updated script to add aa to vcf keeping all sites --- .../workflow/rules/ancestral_info_to_vcf.smk | 128 ++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 Snakemake/workflow/rules/ancestral_info_to_vcf.smk diff --git a/Snakemake/workflow/rules/ancestral_info_to_vcf.smk b/Snakemake/workflow/rules/ancestral_info_to_vcf.smk new file mode 100644 index 0000000..f8870f0 --- /dev/null +++ b/Snakemake/workflow/rules/ancestral_info_to_vcf.smk @@ -0,0 +1,128 @@ + +# extract all positions in vcf file using bcftools and save to regions file +# for rows in ancestral_alleles file where col 1 match wildcard.chromosome +# for each row in regions file, if value is in ancestral_alleles col 2, add ancestral_alleles col 3 to regions file col 2 +# else add -1 + +ancestral_file = config['ancestral_allele'] + +if "global_cattle" in config['PROJECT']: + # rule list_samples: + # input: + # vcf_file = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz', + # metadata = config['meta'] + # output: + # samples = f'{vcfdir}/{{chromosome}}/samples_to_remove' + # conda: 'bcftools' + # shell: + # r""" + # bcftools query -l {input.vcf_file} > {wildcards.chromosome}.samples + # cut -d',' -f1 {input.metadata} | tail -n +2 > metadata.samples + # grep -v -f metadata.samples vcf.samples > unique.samples + # """ + + rule remove_samples: + input: + vcf_file = rules.phase.output.vcf, + idx_file = rules.index_phased.output.idx, + output: + vcf_file = f'{vcfdir}/{{chromosome}}/{{chromosome}}_correctids.vcf', + params: + samples_to_remove = f'{vcfdir}/samples.remove' + conda: 'bcftools' + threads: 32 + resources: cpus=32, mem_mb=2048000, time_min=1200 + shell: + r""" + bcftools view -S ^{params.samples_to_remove} {input.vcf_file} --threads {threads} -O v -o {output.vcf_file} + """ + input_vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_correctids.vcf' + +else: + rule decompress: + input: rules.phase.output.vcf + output: f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf' + conda: bcftools + shell: + r""" + bcftools view {input} -O v -o {output} + """ + input_vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf' + +rule get_vcf_positions: + input: + vcf_file = input_vcf, + output: + regions_file = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.regions') + conda: 'bcftools' + shell: + r""" + bcftools query -f '%POS\n' {input.vcf_file} | awk '{{print $1, -1}}' > {output.regions_file} + """ + +rule get_ancestral_alleles: + input: + ancestral_alleles_file = ancestral_file + output: + ancestrals = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.anc') + params: + chrnum = lambda wc: wc.get('chromosome')[3:] + shell: + r""" + # filter positions in ancestral that correspond to chr + awk '$1 == {params.chrnum}' {input.ancestral_alleles_file} | cut -d' ' -f2,3 > {output.ancestrals} + """ + +rule update_regions_file: + input: + regions_file = rules.get_vcf_positions.output.regions_file, + ancestrals = rules.get_ancestral_alleles.output.ancestrals, + output: + updated_regions_file = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.updated.regions'), + positions_exclude = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.exclude') + shell: + r""" + awk 'NR==FNR{{a[$1]=$2;next}} ($1 in a){{$2=a[$1]}}1' {input.ancestrals} {input.regions_file} > {output.updated_regions_file} + + awk '$2 == -1' {output.updated_regions_file} | cut -d' ' -f1 > {output.positions_exclude} + """ + +rule 
add_AA_to_regions_file:
+    input:
+        updated_regions_file = rules.update_regions_file.output.updated_regions_file,
+    output:
+        new_regions_file = f'{vcfdir}/{{chromosome}}/{{chromosome}}.final.regions'
+    shell:
+        r"""
+        echo "INFO" > {output.new_regions_file}
+        awk '{{print "AA="$2}}' {input.updated_regions_file} >> {output.new_regions_file}
+        """
+
+rule add_info_to_vcf:
+    input:
+        vcf_file = f'{vcfdir}/{{chromosome}}/{{chromosome}}_correctids.vcf',
+        aa_info = rules.add_AA_to_regions_file.output.new_regions_file,
+    output:
+        large_vcf = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased_ancestral.vcf')
+    conda: 'bcftools'
+    shell:
+        r"""
+        awk 'NR==FNR{{a[NR] = $1; next}} FNR<15{{print}}; \
+        FNR==6{{printf "##INFO=<ID=AA,Number=1,Type=String,Description=\"Ancestral Allele\">\n"}}; \
+        FNR>14{{$8=a[FNR-14]; print}}' OFS="\t" {input.aa_info} {input.vcf_file} > {output.large_vcf}
+
+        rm {input.vcf_file}
+        """
+
+rule compress_index_vcf:
+    input:
+        large_vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased_ancestral.vcf'
+    output:
+        vcf_file = f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf.gz',
+        vcf_indx = f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf.gz.csi'
+    conda: 'bcftools'
+    shell:
+        r"""
+        bgzip -c {input} > {output.vcf_file}
+        bcftools index {output.vcf_file}
+        """

From cdc238c5f140e2b4179124729df12a1b155e25fe Mon Sep 17 00:00:00 2001
From: Gabriela Mafra
Date: Fri, 12 Apr 2024 11:35:57 +0100
Subject: [PATCH 08/15] updated rules and scripts for ts inference. final
 output is now simplified tree

---
 Snakemake/workflow/rules/new_infer_trees.smk | 145 ++++++++++++++++++
 .../workflow/scripts/generate_ancestors.py |  19 +++
 .../workflow/scripts/generate_samples.py | 120 +++++++++++++++
 Snakemake/workflow/scripts/truncate.py |  18 +++
 4 files changed, 302 insertions(+)
 create mode 100644 Snakemake/workflow/rules/new_infer_trees.smk
 create mode 100644 Snakemake/workflow/scripts/generate_ancestors.py
 create mode 100644 Snakemake/workflow/scripts/generate_samples.py
 create mode 100644 Snakemake/workflow/scripts/truncate.py

diff --git a/Snakemake/workflow/rules/new_infer_trees.smk b/Snakemake/workflow/rules/new_infer_trees.smk
new file mode 100644
index 0000000..930c3ca
--- /dev/null
+++ b/Snakemake/workflow/rules/new_infer_trees.smk
@@ -0,0 +1,145 @@
+# Convert vcf to samples file
+# generate ancestors samples based only on sites with ancestral allele
+# information
+# truncate ancestors to remove those spanning too many sites
+# generate ancestors trees
+# match ancestors and samples to infer trees
+# final output is a simplified tree sequence
+
+rule prepare_sample_file:
+    input:
+        vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf.gz',
+        meta = config['meta']
+    output: f'{projdir}/Tsinfer/samples/{{chromosome}}.samples'
+    params:
+        chrLength = lambda wildcards: config['chromosome_length'][int(re.findall(r'\d+', wildcards.chromosome)[0])]
+    conda: "HLab_tsinfer"
+    threads: 32
+    resources: cpus = 32, mem_mb = 768000, time_min = 200
+    benchmark:
+        'benchmarks/{chromosome}_new.prep_sample.benchmark.txt'
+    shell:
+        r"""
+        ulimit -v
+
+        python scripts/generate_samples.py \
+        {input.vcf} {input.meta} {output} {params.chrLength}
+
+        """
+
+rule generate_ancestors:
+    input:
+        samples = f'{projdir}/Tsinfer/samples/{{chromosome}}.samples',
+        sites_to_exclude = f'{vcfdir}/{{chromosome}}/{{chromosome}}.exclude',
+    output:
+        ancestors_samples = f'{projdir}/Tsinfer/samples/{{chromosome}}.ancestors',
+    conda: 'HLab_tsinfer'
+    threads: 64
+    resources: cpus = 64, mem_mb = 2048000, time_min = 1680
+    benchmark:
+        
'benchmarks/{chromosome}_generate_ancestors.benchmark.txt' + shell: + r""" + # Check amount of memory (in kbytes) as seen by the job + ulimit -v + python scripts/generate_ancestors.py {input.samples} {input.sites_to_exclude} {output.ancestors_samples} {threads} + """ +# tsinfer generate-ancestors {input} \ +# --num-threads {threads} \ +# --num-flush-threads {threads} \ +# --progress \ +# --verbosity \ +# --log-section tsinfer.inference + + +rule truncate_ancestors: + input: + ancestors_samples = f'{projdir}/Tsinfer/samples/{{chromosome}}.ancestors', + output: + truncated_ancestors = f'{projdir}/Tsinfer/samples/{{chromosome}}.ancestors.truncated', + # params: + # upper = 0.4, + # lower = 0.2, + conda: 'HLab_tsinfer' + threads: 32 + resources: cpus=32, mem_mb=2048000, time_min=1680 + benchmark: + 'benchmarks/{chromosome}_truncate.benchmark.txt' + shell: + r""" + ulimit -v + python scripts/truncate.py {input} {output} + """ + +rule ancestors_trees: + input: + samples = f'{projdir}/Tsinfer/samples/{{chromosome}}.samples', + truncated_ancestors = f'{projdir}/Tsinfer/samples/{{chromosome}}.ancestors.truncated', + output: + ancestors_trees = f'{projdir}/Tsinfer/trees/{{chromosome}}.atrees', + params: + re = '1.1e-8', + mima = '1' + conda: 'HLab_tsinfer' + threads: 32 + resources: cpus = 32, mem_mb = 2048000, time_min = 1680 + benchmark: + 'benchmarks/{chromosome}_ancestors_trees.benchmark.txt' + shell: + r""" + tsinfer match-ancestors {input.samples} \ + -A {output} \ + --log-section tsinfer.inference \ + --num-threads {threads} \ + --progress \ + --verbosity \ + --ancestors {input.truncated_ancestors} + """ +# --recombination-rate {params.re} \ + # --mismatch-ratio {params.mima} \ + + +rule infer_trees: + input: + samples = f'{projdir}/Tsinfer/samples/{{chromosome}}.samples', + ancestors_trees = f'{projdir}/Tsinfer/trees/{{chromosome}}.atrees', + output: + trees = f'{projdir}/Tsinfer/trees/{{chromosome}}.trees', + params: + re = '1.1e-8', + mima = '1' + conda: 'HLab_tsinfer' + threads: 32 + resources: cpus=32, mem_mb=2048000, time_min=1680 + benchmark: + 'benchmarks/{chromosome}_infer.benchmark.txt' + shell: + r""" + # Check amount of memory (in kbytes) as seen by the job + ulimit -v + tsinfer match-samples {input.samples} \ + -A {input.ancestors_trees} \ + --num-threads {threads} \ + --progress \ + --log-section tsinfer.inference \ + --verbosity \ + -O {output} + """ +# --recombination-rate {params.re} \ +# --mismatch-ratio {params.mima} \ + +rule simplify_trees: + input: + trees = f'{projdir}/Tsinfer/trees/{{chromosome}}.trees', + output: + simplified_trees = f'{projdir}/Tsinfer/trees/{{chromosome}}.simplified.trees', + conda: 'HLab_tsinfer' + threads: 32 + resources: cpus=32, mem_mb=2048000, time_min=1680 + benchmark: + 'benchmarks/{chromosome}_simplify.benchmark.txt' + shell: + r""" + ulimit -v + python scripts/simplify.py {input} {output} + """ \ No newline at end of file diff --git a/Snakemake/workflow/scripts/generate_ancestors.py b/Snakemake/workflow/scripts/generate_ancestors.py new file mode 100644 index 0000000..1e4ce05 --- /dev/null +++ b/Snakemake/workflow/scripts/generate_ancestors.py @@ -0,0 +1,19 @@ +import sys +import tsinfer +import numpy as np + +args = sys.argv + +samples = tsinfer.load(args[1]) +sites_to_exclude = np.loadtxt(args[2]) +output = args[3] +threads = int(args[4]) + + +ancestors = tsinfer.generate_ancestors( + sample_data=samples, + path=output, + exclude_positions=sites_to_exclude, + progress_monitor=True, + num_threads=threads, +) \ No newline at end of file 
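# Usage sketch for generate_ancestors.py (illustrative file names; the
# generate_ancestors rule above passes the samples file, the exclude list,
# the output path and the thread count in exactly this order):
#
#   python scripts/generate_ancestors.py chr1.samples chr1.exclude chr1.ancestors 64
#
# np.loadtxt() reads the exclude list as a float array; tsinfer compares these
# values against its floating-point site positions, so a one-column list of
# POS values (as written by rule update_regions_file) is sufficient.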
diff --git a/Snakemake/workflow/scripts/generate_samples.py b/Snakemake/workflow/scripts/generate_samples.py
new file mode 100644
index 0000000..9db40a9
--- /dev/null
+++ b/Snakemake/workflow/scripts/generate_samples.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+import os
+import sys
+import json
+import cyvcf2
+import tsinfer
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+
+
+def add_populations(vcf, samples, metaData):
+    """
+    This is a modified add_populations function (originally written for drone data from David Wragg).
+    Add tsinfer Population objects for each subspecies, stored in the metaPop object, and return a list of population IDs corresponding to the VCF samples.
+    Input: vcf = cyvcf2 VCF object, samples is a tsinfer SampleData and metaData is a pandas dataframe with ids in the ID column and populations in the Pop/SubPop columns
+    pop_lookup = dictionary holding the name and additional information of each population
+    Return: A list of population indexes
+    """
+    pop_lookup = {}
+    metaPop = metaData.iloc[:, 1:].drop_duplicates().dropna(axis = 0, how = 'all')
+
+    # to account for the same subpop in different countries - creating a 'fake subpop' to be used as key
+    # if a subpop is repeated, the new code is subpop+origin, else subpop
+    d = [a if not (sum(j == a for j in metaPop.SubPop[:i])) else f'{a}{c}' for (i,a),c in zip(enumerate(metaPop.SubPop),metaPop.Origin)]
+    metaPop['code'] = d
+    # add the code column to the full metadata file
+    metaData = pd.merge(metaData, metaPop[['Pop', 'SubPop', 'Origin', 'code']], on = ['Pop', 'SubPop', 'Origin'], how = 'inner')
+
+    # list population per sample based on the recoded subpop
+    sample_subpop = [list(metaData.code[metaData.ID == x]) for x in vcf.samples]
+
+    # the dictionary key is based on the recoding - so it keeps the correct number of populations,
+    # but the metadata takes in the real subpop value
+    for subpop, pop, origin, lat, lon, code in zip(metaPop.SubPop, metaPop.Pop, metaPop.Origin, metaPop.Lat, metaPop.Lon, metaPop.code):
+        pop_lookup[code] = samples.add_population(metadata={"pop": pop, "subpop": subpop, "origin": origin, "coord1": lat, "coord2":lon})
+    return [pop_lookup[code[0]] for code in sample_subpop]
+
+def add_diploid_individuals(vcf, samples, populations):
+    for iid, population in zip(vcf.samples, populations):
+        samples.add_individual(ploidy=2, metadata={'ID': iid}, population=population)
+
+
+def add_diploid_sites(vcf, samples):
+    """
+    Read the sites in the vcf and add them to the samples object.
+    """
+    # You may want to change the following line, e.g. here we allow
+    # "*" (a spanning deletion) to be a valid allele state
+    allele_chars = set("ATGCatgc*")
+
+    pos = 0
+
+    progressbar = tqdm(total=samples.sequence_length, desc="Read VCF", unit='bp')
+
+    for variant in vcf:  # Loop over variants, each assumed at a unique site
+        progressbar.update(variant.POS - pos)
+
+        if pos == variant.POS:
+            print(f"Duplicate entries at position {pos}, ignoring all but the first")
+            continue
+        else:
+            pos = variant.POS
+
+        if any([not phased for _, _, phased in variant.genotypes]):
+            raise ValueError("Unphased genotypes for variant at position", pos)
+
+        alleles = [variant.REF.upper()] + [v.upper() for v in variant.ALT]
+
+        ancestral = variant.INFO.get("AA", ".") # "." means unknown
+
+        # '-1' (missing ancestral data) must be tested first: '-1' is never in alleles
+        if ancestral == '-1':
+            ancestral_allele = -1
+        elif ancestral == 'ambiguous' or ancestral not in alleles: # set ancestral == reference
+            ancestral_allele = 0
+        else:
+            ancestral_allele = alleles.index(ancestral)
+
+        # Check we have only ATGC* alleles; if not, skip the whole site
+        if any(len(set(a) - allele_chars) > 0 for a in alleles):
+            print(f"Ignoring site at pos {pos}: alleles {alleles} not in {allele_chars}")
+            continue
+
+        # Map original allele indexes to their indexes in the new alleles list.
+        genotypes = [g for row in variant.genotypes for g in row[0:2]]
+        samples.add_site(pos, genotypes, alleles, ancestral_allele=ancestral_allele)
+
+# -----------------------------------------------------------------------------------------
+
+args = sys.argv
+vcfFile = args[1]
+meta = args[2]
+sampleFile = args[3]
+chrLength = int(args[4]) # sequence length must be numeric, not the argv string
+
+metaFile = pd.read_csv(meta)
+
+# Create a population (subspecies) list for the samples in the VCF
+vcfD = cyvcf2.VCF(vcfFile, strict_gt=True)
+
+# Create the SampleData file for diploid data
+with tsinfer.SampleData(path=sampleFile,
+                        sequence_length=chrLength,
+                        num_flush_threads=16, max_file_size=2**30) as samples:
+    populations = add_populations(vcf = vcfD, samples = samples, metaData = metaFile)
+    print("populations determined")
+    add_diploid_individuals(vcf = vcfD, samples = samples, populations = populations)
+    print("individuals added")
+    add_diploid_sites(vcf = vcfD, samples = samples)
+    print("sites added")
+
+print(
+    "Sample file created for {} samples ".format(samples.num_samples)
+    + "({} individuals) ".format(samples.num_individuals)
+    + "with {} variable sites.".format(samples.num_sites),
+    flush=True,
+)
diff --git a/Snakemake/workflow/scripts/truncate.py b/Snakemake/workflow/scripts/truncate.py
new file mode 100644
index 0000000..17037bc
--- /dev/null
+++ b/Snakemake/workflow/scripts/truncate.py
@@ -0,0 +1,18 @@
+import sys
+import tsinfer
+
+args = sys.argv
+ancestors = tsinfer.load(args[1])
+#uprtime = args[2]
+#lwertime = args[3]
+output = args[2]
+
+ancestors.truncate_ancestors(
+    path=output,
+    max_file_size=2**38,
+    lower_time_bound=0.2,
+    upper_time_bound=0.4,
+    length_multiplier=2,
+)
+
+
From 4b3b9141e4accdf8b9806212b38ba9021cbfb2a9 Mon Sep 17 00:00:00 2001
From: Gabriela Mafra
Date: Fri, 12 Apr 2024 11:37:15 +0100
Subject: [PATCH 09/15] script to simplify ts

---
 Snakemake/workflow/scripts/simplify.py | 9 +++++++++
 1 file changed, 9 insertions(+)
 create mode 100644 Snakemake/workflow/scripts/simplify.py

diff --git a/Snakemake/workflow/scripts/simplify.py b/Snakemake/workflow/scripts/simplify.py
new file mode 100644
index 0000000..f59ebfa
--- /dev/null
+++ b/Snakemake/workflow/scripts/simplify.py
@@ -0,0 +1,9 @@
+import sys
+import tskit
+
+args = sys.argv
+ts = tskit.load(args[1])
+output = args[2]
+
+ts_simplified = ts.simplify(keep_unary=False)
+ts_simplified.dump(output)
\ No newline at end of file
From 6e700c4dbb2066a0af896f2123476f67f0d5d6aa Mon Sep 17 00:00:00 2001
From: Gabriela Mafra
Date: Fri, 12 Apr 2024 11:39:24 +0100
Subject: [PATCH 10/15] updated default resources and increased for certain
 rules (to run on supernodes)

---
 Snakemake/workflow/profile/cluster.json | 84 ++++++++++++++-----------
 1 file changed, 48 insertions(+), 36 deletions(-)

diff --git a/Snakemake/workflow/profile/cluster.json b/Snakemake/workflow/profile/cluster.json
index f4b023c..5575b57 100644
--- a/Snakemake/workflow/profile/cluster.json
+++ b/Snakemake/workflow/profile/cluster.json
@@ -1,55 +1,49 @@
 {
     "__default__":
     {
-        "mem": "64G",
-        "threads": "1",
+        "mem": "32G",
+        "threads": "16",
         
"time": "10:00:00" }, - "rename": + "merge": { "mem": "64G", + "threads": "32", + "time": "20:00:00" + }, + "rename": + { + "mem": "32G", "threads": "1", "time": "03:00:00" }, "phase": - { - "mem": "350G", - "threads": "25", - "time": "336:00:00" - }, - "rename_phased": { "mem": "64G", - "threads": "1", - "time": "03:00:00" + "threads": "32", + "time": "120:00:00" }, - "get_af": + "filter_positions": { "mem": "64G", - "threads": "1", - "time": "03:00:00" - }, - "get_major": - { - "mem": "64G", - "threads": "1", - "time": "03:00:00" + "threads": "32", + "time": "05:00:00" }, "extract_ancestral_chromosome": { - "mem": "64G", + "mem": "32G", "threads": "1", "time": "02:00:00" }, "join_major_ancestral": { - "mem": "64G", + "mem": "32G", "threads": "1", "time": "02:00:00" }, "determine_ancestral_major": { - "mem": "64G", + "mem": "32G", "threads": "1", "time": "04:00:00" }, @@ -67,43 +61,61 @@ }, "decompress": { - "mem": "64G", + "mem": "8G", "threads": "1", "time": "03:00:00" }, "change_infoAA_vcf": { - "mem": "128G", + "mem": "32G", "threads": "1", "time": "05:00:00" }, "prepare_sample_file": { - "mem": "128G", - "threads": "1", - "time": "10:00:00" + "mem": "64G", + "threads": "32", + "time": "336:00:00" }, - "infer": + "generate_ancestors": { - "mem": "128G", - "threads": "1", - "time": "40:00:00" + "mem": "64G", + "threads": "32", + "time": "336:00:00" + }, + "truncate_ancestors": + { + "mem": "64G", + "threads": "32", + "time": "336:00:00" + }, + "ancestors_trees": + { + "mem": "64G", + "threads": "32", + "time": "336:00:00" + }, + "infer_trees": + { + "mem": "64G", + "threads": "32", + "time": "336:00:00" }, "extract_aligned_alleles_from_vcf": { - "mem": "32G", + "mem": "16G", "threads": "1", "time": "03:00:00" }, "extract_snps_from_info": { - "mem": "32G", + "mem": "16G", "threads": "1", "time": "03:00:00" }, "extract_pos_aligned_alleles_from_vcf": { - "mem": "32G", + "mem": "16G", "threads": "1", "time": "03:00:00" }, @@ -116,7 +128,7 @@ "create_estsfs_dicts": { "mem": "16G", - "threads": 1", + "threads": "1", "time" : "05:00:00" }, "edit_estsfs_dicts": From 0b84d650c1c732835d6b2bea543988e5f7b4211d Mon Sep 17 00:00:00 2001 From: Gabriela Mafra Date: Fri, 12 Apr 2024 11:40:56 +0100 Subject: [PATCH 11/15] config file to run global cattle demography project --- Snakemake/config/glob_cattle.yaml | 40 +++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 Snakemake/config/glob_cattle.yaml diff --git a/Snakemake/config/glob_cattle.yaml b/Snakemake/config/glob_cattle.yaml new file mode 100644 index 0000000..67e2381 --- /dev/null +++ b/Snakemake/config/glob_cattle.yaml @@ -0,0 +1,40 @@ +PROJECT: "global_cattle" +work_dir: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/tree_sequence" +species: "Bos taurus" +ploidy: 2 +no_chromosomes: 29 +vcf_dir: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/genomes/global" +genmap: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/resources/ma2015_shapeit_ARS-UCD1.2" +reference_genome: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/resources/Reference_genomes/ARS-UCD1.2_Btau5.0.1Y.fa" +ancestral_allele: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/resources/ancestral_alleles/Dorji_et_al/data/new_cattle_ancestral.txt" +meta: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/genomes/global/files/metadata_correct.csv" +chromosome_length: + 1 : 158534110 + 2 : 136231102 + 3 : 121005158 + 4 : 120000601 + 5 : 120089316 + 6 : 
117806340 + 7 : 110682743 + 8 : 113319770 + 9 : 105454467 + 10 : 103308737 + 11 : 106982474 + 12 : 87216183 + 13 : 83472345 + 14 : 82403003 + 15 : 85007780 + 16 : 81013979 + 17 : 73167244 + 18 : 65820629 + 19 : 63449741 + 20 : 71974595 + 21 : 69862954 + 22 : 60773035 + 23 : 52498615 + 24 : 62317253 + 25 : 42350435 + 26 : 51992305 + 27 : 45612108 + 28 : 45940150 + 29 : 51098607 \ No newline at end of file From 19e32cfb6cd66836c2137adda719e6be4a75b1b2 Mon Sep 17 00:00:00 2001 From: Gabriela Mafra Date: Thu, 18 Jul 2024 13:06:48 +0100 Subject: [PATCH 12/15] updated version to run cattle inferences --- Snakemake/config/glob_cattle.yaml | 13 + Snakemake/workflow/Snakefile | 21 +- Snakemake/workflow/profile/cluster.json | 2 +- Snakemake/workflow/profile/config.yaml | 8 +- Snakemake/workflow/rules/add_ancestral.smk | 167 +++++++++++ .../workflow/rules/ancestral_info_to_vcf.smk | 47 +-- Snakemake/workflow/rules/date.smk | 19 ++ Snakemake/workflow/rules/filter.smk | 267 ++++++++---------- Snakemake/workflow/rules/infer_trees.smk | 36 +-- Snakemake/workflow/rules/keep_snp.smk | 43 +++ .../workflow/rules/merge_samples_vcf.smk | 72 ++--- Snakemake/workflow/rules/new_infer_trees.smk | 82 +++--- Snakemake/workflow/rules/phasing.smk | 63 ++--- .../rules/prepare_files_for_tsinfer.smk | 53 ++-- Snakemake/workflow/rules/split.smk | 31 +- Snakemake/workflow/scripts/Infer_trees.py | 33 +-- .../scripts/Prepare_tsinfer_sample_file.py | 41 ++- .../workflow/scripts/generate_ancestors.py | 3 + .../workflow/scripts/generate_samples.py | 83 ++++-- Snakemake/workflow/scripts/simplify.py | 26 +- Snakemake/workflow/scripts/truncate.py | 11 +- Snakemake/workflow/scripts/tsparam.py | 2 +- 22 files changed, 713 insertions(+), 410 deletions(-) create mode 100644 Snakemake/workflow/rules/add_ancestral.smk create mode 100644 Snakemake/workflow/rules/date.smk create mode 100644 Snakemake/workflow/rules/keep_snp.smk diff --git a/Snakemake/config/glob_cattle.yaml b/Snakemake/config/glob_cattle.yaml index 67e2381..7c8a537 100644 --- a/Snakemake/config/glob_cattle.yaml +++ b/Snakemake/config/glob_cattle.yaml @@ -3,11 +3,24 @@ work_dir: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/tree species: "Bos taurus" ploidy: 2 no_chromosomes: 29 +start_chromosome: 1 +end_chromosome: 29 vcf_dir: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/genomes/global" genmap: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/resources/ma2015_shapeit_ARS-UCD1.2" reference_genome: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/resources/Reference_genomes/ARS-UCD1.2_Btau5.0.1Y.fa" ancestral_allele: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/resources/ancestral_alleles/Dorji_et_al/data/new_cattle_ancestral.txt" meta: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/genomes/global/files/metadata_correct.csv" +phase_only: False +remove_indel: True +missing_data: "0.2" +truncate_upper: "0.4" +truncate_lower: "0.2" +recombination_rate_tree: False # "1e-8" +recombination_rate_anc: False +mismatch_ratio_anc: False +mismatch_ratio_tree: False # "1" +mutation_rate: "1e-8" +generation_interval: 5 chromosome_length: 1 : 158534110 2 : 136231102 diff --git a/Snakemake/workflow/Snakefile b/Snakemake/workflow/Snakefile index d22892e..611801d 100644 --- a/Snakemake/workflow/Snakefile +++ b/Snakemake/workflow/Snakefile @@ -18,7 +18,6 @@ report: "reports/tsinfer.rst" ##### setup singularity ##### # Can we set conda env here??? 
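+# (per-rule conda environments are set with the `conda:` directive in the rule files)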
- ##### Define some variables ##### # all files should be in the vcfDir/RawVCF folder # files that need spliting are named: Combined_something.vcf.gz @@ -40,12 +39,13 @@ Project = config['PROJECT'] projdir = config['work_dir'] + '/' + Project mapdir = config['genmap'] -# if want to split multiallelic sites set to TRUE: -#multiallelic = False - -chromosomes = [f'chr{n}' for n in range(1, config['no_chromosomes'] + 1)] +chromosomes = [f'chr{n}' for n in range(config['start_chromosome'], config['end_chromosome'] + 1)] # set a list of 'Chr' + chr# combinations to be used as wildcards +wildcard_constraints: + chromosome = r'chr\d+' +# constraining the wildcard to avoid errors + ##### load rules ##### #include: "rules/ancestral_inference.smk" include: "rules/split.smk" @@ -55,11 +55,16 @@ include: "rules/phasing.smk" include: "rules/ancestral_info_to_vcf.smk" include: "rules/new_infer_trees.smk" +##### Conditional inputs for rule all based on the existence of phased VCFs and config['phase_only'] ##### +if not config['phase_only']: + rule_all_input = expand(f'{projdir}/Tsinfer/trees/{{chromosome}}.dated.trees', chromosome=chromosomes) +else: + rule_all_input = expand(f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz.csi', chromosome=chromosomes) + + ##### target rules ##### # At the end it should generate a phased vcf file that combines all files for # each chromosome. These files are stored in the vcf folder declaired in the # configuration file. It also checks that vcfs have been filtered before merged rule all: - input: - expand(f'{projdir}/Tsinfer/trees/{{chromosome}}.simplified.trees', chromosome=chromosomes) - # tree inference + input: rule_all_input \ No newline at end of file diff --git a/Snakemake/workflow/profile/cluster.json b/Snakemake/workflow/profile/cluster.json index 5575b57..574c29c 100644 --- a/Snakemake/workflow/profile/cluster.json +++ b/Snakemake/workflow/profile/cluster.json @@ -21,7 +21,7 @@ { "mem": "64G", "threads": "32", - "time": "120:00:00" + "time": "240:00:00" }, "filter_positions": { diff --git a/Snakemake/workflow/profile/config.yaml b/Snakemake/workflow/profile/config.yaml index 1dc5486..e7a1c31 100644 --- a/Snakemake/workflow/profile/config.yaml +++ b/Snakemake/workflow/profile/config.yaml @@ -3,10 +3,8 @@ use-conda: true jobscript: "jobscript.sh" cluster-config: "cluster.json" jobname: "{rule}.{jobid}" -jobs: 16 -#drmaa: " -cwd -l h_vmem={cluster.mem} -l h_rt={cluster.time} -pe sharedmem {cluster.threads} -P roslin_HighlanderLab" +drmaa: " -P roslin_HighlanderLab -cwd -l h_vmem={cluster.mem} -l h_rt={cluster.time} -pe sharedmem {cluster.threads} -R y" rerun-incomplete: true -rerun-triggers: mtime -cores: 16 # how many jobs you want to submit to your cluster queue -local-cores: 1 +rerun-triggers: [mtime, input] +local-cores: 99 keep-going: false diff --git a/Snakemake/workflow/rules/add_ancestral.smk b/Snakemake/workflow/rules/add_ancestral.smk new file mode 100644 index 0000000..dd404f1 --- /dev/null +++ b/Snakemake/workflow/rules/add_ancestral.smk @@ -0,0 +1,167 @@ + +ancestral_file = config['ancestral_allele'] + +# input and output +infile = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz' +infileidx = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz.csi' +outfile = f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf.gz' +outidx = f'{vcfdir}/{{chromosome}}_ancestral.vcf.gz.csi' + +# temporary files +filtered_samples = f'{vcfdir}/{{chromosome}}/{{chromosome}}_sam.vcf.gz' +filtered_positions = 
f'{vcfdir}/{{chromosome}}/{{chromosome}}_sampos.vcf'
+ancestral_not_compressed = f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf'
+regfile = f'{vcfdir}/{{chromosome}}_regions.tsv'
+aaInfo = f'{vcfdir}/{{chromosome}}_aa.info'
+
+if config['PROJECT'] == "global_cattle":
+    rule remove_samples:
+        input: infile
+        output: filtered_samples
+        params:
+            samples_to_remove = f'{vcfdir}/{{chromosome}}/remove'
+        conda: 'bcftools'
+        threads: 32
+        resources: cpus=32, mem_mb=2048000, time_min=1200
+        shell:
+            r"""
+            bcftools view -S ^{params.samples_to_remove} {input} --threads {threads} -O z -o {output}
+
+            bcftools index {output}
+            """
+
+
+# need some helper files:
+# 1) create a chr pos file for filtering
+# 2) create an info file for adding ancestrals
+# filter vcf positions so that it has the same as the aa file
+# add ancestral allele to INFO
+
+rule make_regions_file:
+    input:
+        anc = ancestral_file,
+        vcf = filtered_samples
+    output: temp(regfile)
+    params:
+        chrnum = lambda wc: wc.get('chromosome')[3:]
+    shell:
+        r"""
+        # filter positions in ancestral that correspond to chr
+        cut -d' ' -f1,2 {input.anc} | awk '$1 == {params.chrnum}' | sed -e $'s/ /\t/g' > chr{params.chrnum}.anc
+
+        cut -f2 chr{params.chrnum}.anc > chr{params.chrnum}.compare
+
+        # get positions in vcf
+        bcftools query -f '%POS\n' {input.vcf} > chr{params.chrnum}.file
+
+        # match positions ancestral and vcf
+        comm -12 <( sort chr{params.chrnum}.file ) <( sort chr{params.chrnum}.compare ) > chr{params.chrnum}.keep
+
+        # number of lines
+        l=($(wc -l chr{params.chrnum}.keep))
+
+        # create final regions file
+        yes {params.chrnum} | head -n $l > chr{params.chrnum}
+        paste chr{params.chrnum} chr{params.chrnum}.keep > {output}
+
+        rm chr{params.chrnum} chr{params.chrnum}.keep chr{params.chrnum}.file chr{params.chrnum}.compare chr{params.chrnum}.anc
+        """
+#grep -vw 4579381 3565339
+
+rule filter_positions:
+    input:
+        vcf = filtered_samples,
+        idx = f'{filtered_samples}.csi',
+        regions = regfile,
+    output: filtered_positions
+    conda: 'bcftools'
+    threads: 32
+    resources: cpus = 32, mem_mb = 768000, time_min = 300
+    shell:
+        r"""
+        # --threads {threads} disabled for now
+        bcftools view -R {input.regions} {input.vcf} \
+            -o {output}
+        """
+
+# take only the rows where col1 == current_chromosome
+# add AA= in front of the aa col
+# use only col pos & aa
+# match with the left pos in filtered vcf
+
+rule make_aaInfo_file:
+    input:
+        aa_file = ancestral_file,
+        vcf = filtered_positions
+    output: temp(aaInfo)
+    params:
+        chrnum = lambda wc: wc.get('chromosome')[3:]
+    shell:
+        r"""
+        # filter by chromosome and add AA= to the allele col
+        awk '$1 == {params.chrnum}' {input.aa_file} | awk '$3="AA="$3' | cut -d' ' -f2,3 > {wildcards.chromosome}.aa
+
+        # get the positions in the vcf
+        bcftools query -f '%POS\n' {input.vcf} > {wildcards.chromosome}.pos
+
+        # add allele col to the pos in the vcf
+        awk 'NR==FNR{{a[$1];next}}($1 in a){{print $0}}' {wildcards.chromosome}.pos {wildcards.chromosome}.aa > {wildcards.chromosome}.pos.aa
+
+        cut -d' ' -f2 {wildcards.chromosome}.pos.aa | sed '1s/^/INFO\n/' > {output}
+
+        rm {wildcards.chromosome}.aa {wildcards.chromosome}.pos {wildcards.chromosome}.pos.aa
+        """
+
+# need to run shapeit first to know line numbers
+rule add_ancestral:
+    input:
+        vcf = filtered_positions,
+        aa_info = aaInfo,
+    output: ancestral_not_compressed
+    conda: 'bcftools'
+    shell:
+        r"""
+        awk 'NR==FNR{{a[NR] = $1; next}} FNR<13{{print}}; \
+        FNR==6{{printf "##INFO=<ID=AA,Number=1,Type=String,Description=\"Ancestral Allele\">\n"}}; \
+        FNR>12{{$8=a[FNR-11]; print}}' OFS="\t" {input.aa_info} {input.vcf} > 
{output} + """ + + +# HEADERNUM="$(( $(bcftools view -h {input.vcf} | wc -l) - 1 ))" + +# INFOLINE=$(( $(bcftools view -h {input.vcf} | awk '/INFO/{{print NR}}' | head -n 1) )) + +# awk -v OFS="\t" -v HEADER=$HEADERNUM -v INFO=$INFOLINE 'NR==FNR{{{{a[FNR] = $2; next}}}} FNR<=HEADER{{{{print}}}}; \ +# FNR==INFO{{{{printf "##INFO=\\n"}}}}; \ +# FNR>HEADER{{{{$8=a[FNR-HEADER]; print}}}}' OFS="\t" {input.aa_info} {input.vcf} > {output} +# All_AAInfo.txt AllFiltered.recode.vcf > AllFiltered.recode.Ancestral.vcf + +# compress and index final vcf +# rule match_samples: +# input: ancestral_not_compressed +# output: outfile +# params: +# samples_to_keep = f'{vcfdir}/{{chromosome}}/keep.samples' +# conda: 'bcftools' +# shell: +# r""" +# bcftools view -S {params.samples_to_keep} {input} -O z -o {output} +# """ + +rule compress_vcf: + input: ancestral_not_compressed + output: outfile, + conda: 'bcftools' + shell: + r""" + bgzip -c {input} > {output} + """ + +rule index_output: + input: outfile + output: outidx + conda: 'bcftools' + shell: + r""" + bcftools index {input} + """ diff --git a/Snakemake/workflow/rules/ancestral_info_to_vcf.smk b/Snakemake/workflow/rules/ancestral_info_to_vcf.smk index f8870f0..f69bee0 100644 --- a/Snakemake/workflow/rules/ancestral_info_to_vcf.smk +++ b/Snakemake/workflow/rules/ancestral_info_to_vcf.smk @@ -1,7 +1,8 @@ # extract all positions in vcf file using bcftools and save to regions file # for rows in ancestral_alleles file where col 1 match wildcard.chromosome -# for each row in regions file, if value is in ancestral_alleles col 2, add ancestral_alleles col 3 to regions file col 2 +# for each row in regions file, if value is in ancestral_alleles col 2, add +# ancestral_alleles col 3 to regions file col 2 # else add -1 ancestral_file = config['ancestral_allele'] @@ -23,38 +24,42 @@ if "global_cattle" in config['PROJECT']: rule remove_samples: input: - vcf_file = rules.phase.output.vcf, - idx_file = rules.index_phased.output.idx, - output: - vcf_file = f'{vcfdir}/{{chromosome}}/{{chromosome}}_correctids.vcf', + vcf_file = input_ancestral, + idx_file = input_ancestral_idx, + output: f'{vcfdir}/{{chromosome}}/{{chromosome}}_analysis.vcf', params: samples_to_remove = f'{vcfdir}/samples.remove' conda: 'bcftools' threads: 32 resources: cpus=32, mem_mb=2048000, time_min=1200 + benchmark: 'benchmarks/{chromosome}.remove.benchmark.txt' shell: r""" - bcftools view -S ^{params.samples_to_remove} {input.vcf_file} --threads {threads} -O v -o {output.vcf_file} + bcftools view -S ^{params.samples_to_remove} {input.vcf_file} --threads {threads} -O v -o {output} """ - input_vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_correctids.vcf' else: rule decompress: - input: rules.phase.output.vcf - output: f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf' - conda: bcftools + input: + vcf_file = input_ancestral, + idx_file = input_ancestral_idx + output: f'{vcfdir}/{{chromosome}}/{{chromosome}}_analysis.vcf' + conda: bcftools + benchmark: 'benchmarks/{chromosome}.decompress.benchmark.txt' shell: r""" - bcftools view {input} -O v -o {output} + bcftools view {input.vcf_file} -O v -o {output} """ - input_vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf' + +input_ancestral = f'{vcfdir}/{{chromosome}}/{{chromosome}}_analysis.vcf' rule get_vcf_positions: input: - vcf_file = input_vcf, + vcf_file = input_ancestral, output: regions_file = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.regions') conda: 'bcftools' + benchmark: 'benchmarks/{chromosome}.positions.benchmark.txt' shell: r""" 
bcftools query -f '%POS\n' {input.vcf_file} | awk '{{print $1, -1}}' > {output.regions_file}
         """
 
 rule get_ancestral_alleles:
     input:
         ancestral_alleles_file = ancestral_file
     output:
         ancestrals = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.anc')
     params:
         chrnum = lambda wc: wc.get('chromosome')[3:]
+    benchmark: 'benchmarks/{chromosome}.getaa.benchmark.txt'
     shell:
         r"""
         # filter positions in ancestral that correspond to chr
         awk '$1 == {params.chrnum}' {input.ancestral_alleles_file} | cut -d' ' -f2,3 > {output.ancestrals}
         """
 
 rule update_regions_file:
     input:
         regions_file = rules.get_vcf_positions.output.regions_file,
         ancestrals = rules.get_ancestral_alleles.output.ancestrals,
     output:
         updated_regions_file = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.updated.regions'),
-        positions_exclude = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.exclude')
+        positions_exclude = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.exclude'),
+    benchmark: 'benchmarks/{chromosome}.regions.benchmark.txt'
     shell:
         r"""
         awk 'NR==FNR{{a[$1]=$2;next}} ($1 in a){{$2=a[$1]}}1' {input.ancestrals} {input.regions_file} > {output.updated_regions_file}
 
         awk '$2 == -1' {output.updated_regions_file} | cut -d' ' -f1 > {output.positions_exclude}
         """
 
 rule add_AA_to_regions_file:
     input:
         updated_regions_file = rules.update_regions_file.output.updated_regions_file,
     output:
         new_regions_file = f'{vcfdir}/{{chromosome}}/{{chromosome}}.final.regions'
+    benchmark: 'benchmarks/{chromosome}.addaaregions.benchmark.txt'
     shell:
         r"""
         echo "INFO" > {output.new_regions_file}
         awk '{{print "AA="$2}}' {input.updated_regions_file} >> {output.new_regions_file}
         """
 
 rule add_info_to_vcf:
     input:
-        vcf_file = f'{vcfdir}/{{chromosome}}/{{chromosome}}_correctids.vcf',
+        vcf_file = input_ancestral,
+        # f'{vcfdir}/{{chromosome}}/{{chromosome}}_correctids.vcf',
         aa_info = rules.add_AA_to_regions_file.output.new_regions_file,
     output:
-        large_vcf = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased_ancestral.vcf')
+        large_vcf = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf')
     conda: 'bcftools'
+    benchmark: 'benchmarks/{chromosome}.aainfo.benchmark.txt'
     shell:
         r"""
         awk 'NR==FNR{{a[NR] = $1; next}} FNR<15{{print}}; \
         FNR==6{{printf "##INFO=<ID=AA,Number=1,Type=String,Description=\"Ancestral Allele\">\n"}}; \
         FNR>14{{$8=a[FNR-14]; print}}' OFS="\t" {input.aa_info} {input.vcf_file} > {output.large_vcf}
 
         rm {input.vcf_file}
         """
 
 rule compress_index_vcf:
     input:
-        large_vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased_ancestral.vcf'
+        large_vcf = rules.add_info_to_vcf.output.large_vcf
+        # f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf'
     output:
         vcf_file = f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf.gz',
         vcf_indx = f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf.gz.csi'
     conda: 'bcftools'
+    benchmark: 'benchmarks/{chromosome}.compress.benchmark.txt'
     shell:
         r"""
         bgzip -c {input} > {output.vcf_file}
         bcftools index {output.vcf_file}
         """
+input_ts = rules.compress_index_vcf.output.vcf_file
\ No newline at end of file
diff --git a/Snakemake/workflow/rules/date.smk b/Snakemake/workflow/rules/date.smk
new file mode 100644
index 0000000..a9afb85
--- /dev/null
+++ b/Snakemake/workflow/rules/date.smk
@@ -0,0 +1,19 @@
+# Script for (iteratively) dating the tree sequence
+# trees must be dated, re-inferred and dated again
+# to get the correct dates (test number of iterations)
+
+# Minimal sketch: date the simplified trees with tsdate, estimating Ne from
+# diversity (pi = 4*Ne*mu). Assumes tsdate is importable by Snakemake itself,
+# since `conda:` does not apply to `run:` blocks; newer tsdate releases use
+# population_size= instead of Ne=.
+rule date_trees:
+    input: rules.simplify_trees.output.simplified_trees
+    output: f'{projdir}/Tsinfer/trees/{{chromosome}}.dated.trees'
+    params: mu = config['mutation_rate']
+    run:
+        import tskit
+        import tsdate
+        ts = tskit.load(input[0])
+        ne = ts.diversity() / (4 * float(params.mu))
+        print('A rough estimate of the effective population size is', ne)
+        tsdate.date(ts, Ne=ne, mutation_rate=float(params.mu)).dump(output[0])
diff --git a/Snakemake/workflow/rules/filter.smk b/Snakemake/workflow/rules/filter.smk
index 99ca98f..54a596a 100644
--- a/Snakemake/workflow/rules/filter.smk
+++ b/Snakemake/workflow/rules/filter.smk
@@ -1,186 
+1,159 @@ -if len(allFiles) != 1: +# if there is only one vcf files, use the output from split_and_move as input +# for the next step samples +# else, compare the files and filter out overlapping then merge the vcfs into +# one and index and use the output from the merge as input for the next step +if len(allFiles) == 1: + input_vcf = rules.split_and_move_vcfs.output + input_index = f'{rules.split_and_move_vcfs.output}.csi' + input_summary = 'None' + +else: + input_to_filter = [f'{vcfdir}/{{chromosome}}/{{chromosome}}_{file}.vcf.gz' for file in allFiles] + + filtered_output = [f'{vcfdir}/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.gz' for file in allFiles] + filtered_output_idx = [f'{vcfdir}/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.gz.csi' + for file in allFiles] + merged_output = f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.vcf.gz' + rule compare_and_filter_vcfs: - input: - [f'{vcfdir}' + x - for x in expand('/{{chromosome}}/{{chromosome}}_{file}.vcf.gz', - file=allFiles)] - output: - [f'{vcfdir}' + x - for x in expand('/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.gz', - file=allFiles)] + input: input_to_filter + output: temp(filtered_output) params: - file_names = allFiles, - path = f'{vcfdir}/{{chromosome}}/{{chromosome}}' + file_names = [f'{{chromosome}}_{file}' for file in allFiles], + path = f'{vcfdir}/{{chromosome}}' + benchmark: 'benchmarks/{chromosome}.sample.benchmark.txt' run: import os + import shlex import itertools import subprocess from glob import glob path_names = [f'{params.path}/{name}' for name in params.file_names] pairs = list(itertools.combinations(path_names, 2)) + new_ext = '.filtered.vcf.gz' + to_rename = path_names.copy() + + # to avoid shell injection, sanitize the file names + # -- ensure speciall shell characters are escaped + sanitized_path = shlex.quote(params.path) + sanitized_new_ext = shlex.quote(new_ext) + sanitized_idx = shlex.quote(new_ext + '.csi') for i, pair in enumerate(pairs): - print(f'pair {i}') - # for each pair determine the extension of the vcf files - _0, ext0 = [os.path.splitext(file) for file in glob(f'{pair[0]}*') - if os.path.splitext(file)[1] not in ['.csi', '.txt']][0] - - _1, ext1 = [os.path.splitext(file) for file in glob(f'{pair[1]}*') - if os.path.splitext(file)[1] not in ['.csi', '.txt']][0] + print(f'pair {i} {pair}') # Run vcftools to find if there are overlapping samples between pairs command = f""" module load anaconda - conda activate bcftools - vcftools --gzvcf {os.path.join(pair[0] + ext0)} \ - --gzdiff {os.path.join(pair[1] + ext1)} \ - --diff-indv + source activate bcftools + vcftools --gzvcf {os.path.join(pair[0] + '.vcf.gz')} \ + --gzdiff {os.path.join(pair[1] + '.vcf.gz')} \ + --diff-indv """ - subprocess.run(command, shell=True, check=True, cwd=path) - + subprocess.run(command, shell=True, check=True, cwd=sanitized_path) + # if vcftools find any overlapping samples, remove them from the second vcf file - if os.path.exists(os.path.join(path, 'out.diff.indv_in_files')): - print(f'Removing samples from {pair[1]} and renaming {pair[0]}') + if "B" in subprocess.run("awk '{if ($2 == \"B\") print $2}' out.diff.indv_in_files", + shell=True, + capture_output=True, + text=True, + cwd=sanitized_path).stdout: + print(f'Overlapping samples found!! 
Removing samples from {pair[1]}') command = f""" grep B out.diff.indv_in_files | cut -f1 > samples.remove rm out.diff.indv_in_files """ - subprocess.run(command, shell=True, check=True, cwd=path) + subprocess.run(command, shell=True, check=True, cwd=sanitized_path) command = f""" - module load anaconda - conda activate bcftools - bcftools view -S ^samples.remove {os.path.join(pair[1] + '.vcf' + ext1)} \ - -Oz -o {os.path.join(pair[1] + new_ext)} + bcftools view -S ^samples.remove {os.path.join(pair[1] + '.vcf.gz')} \ + -Oz -o {os.path.join(pair[1] + sanitized_new_ext)} + + bcftools index -f {os.path.join(pair[1] + sanitized_new_ext)} + rm samples.remove """ - subprocess.run(command, shell=True, check=True, cwd=path) + subprocess.run(command, shell=True, check=True, cwd=sanitized_path) + # remove the file that has been sampled from renaming list + to_rename.remove(pair[1]) + print(f'Files to rename are: {to_rename}') # if there are no overlapping samples, # just rename the files that still have the original extension else: print('No overlapping samples') - if 'filtered' not in _0: - print('Rename') - command = f""" - mv {os.path.join(pair[0] + '.vcf' + ext0)} {os.path.join(pair[0] + new_ext)} - """ - subprocess.run(command, shell=True, check=True, cwd=path) - if 'filtered' not in _1: - print('Rename') - command = f""" - mv {os.path.join(pair[1] + '.vcf' + ext1)} {os.path.join(pair[1] + new_ext)} - """ - subprocess.run(command, shell=True, check=True, cwd=path) - - # rule get_samples: - # input: - # f'{vcfdir}/{{chromosome}}/{{chromosome}}_{{file}}.vcf.gz' - # output: temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_{{file}}.txt') - # conda: "bcftools" - # threads: 1 - # resources: cpus=1, mem_mb=32000, time_min=30 - # log: 'logs/Get_samples_{chromosome}_{file}.log' - # shell: - # """ - # bcftools query -l {input} > {output} - # """ - - # rule filter: - # input: - # [f'{vcfdir}' + x - # for x in expand('/{{chromosome}}/{{chromosome}}_{file}.txt', - # file=allFiles)], - # output: - # temp([f'{vcfdir}' + x for x in - # expand('/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.{ext}', - # file=allFiles, ext=['gz', 'gz.csi'])]) - # params: - # duplicated = '{chromosome}.ids', - # files_len = '{chromosome}_len.txt', - # files_reo = '{chromosome}_reorderd.txt', - # resources: cpus=1, mem_mb=64000, time_min=60 - # run: - # import os - - # for file in input: - # shell("wc -l {file} >> {params.files_len}") - - # shell("sort -us -nr {params.files_len} | cut -d' ' -f 2 > {params.files_reo}") - - # with open(params.files_reo, 'r') as file: - # files_reord = [] - # for line in file: - # line = line.strip() - # files_reord.append(line) - - # filtered = [] - - # for i in range(len(files_reord)): - # print(i) - # for j, file in enumerate(files_reord): - # if file != files_reord[i] and j > i: - # file1 = files_reord[i] - # file2 = files_reord[j] - # print(file1, file2) - # shell("comm -12 <(sort {file1}) <(sort {file2}) > {params.duplicated}") + print(f'Finished comparing and sampling vcfs') - # remove = params.duplicated - # lremove = os.stat(f'{remove}').st_size - - # if lremove == 0: - # print(f'Nothing to remove {lremove}') - # else: - # print(f'{lremove} samples to remove') - # if file2 not in filtered: - # filtered.append(file2) - - # file2 = file2.split('.')[0] - # print(f'file {file2} filtered') - # shell('bcftools view -S ^{params.duplicated} {file2}.vcf.gz -O z -o {file2}.filtered.vcf.gz') - # #shell('gatk SelectVariants -R {params.refgen} -V {file2} -xl-sn -O {file2}.filtered.vcf.gz') - # 
shell('bcftools index -f {file2}.filtered.vcf.gz') - # shell('rm {params.duplicated} {file2}.vcf.gz {file2}.vcf.gz.csi') + # rename the files that have not been sampled + for name in to_rename: + sanitized_name = shlex.quote(name) + print(f'Renaming {sanitized_name}') + command = f""" + mv {os.path.join(sanitized_name + '.vcf.gz')} {os.path.join(sanitized_name + sanitized_new_ext)} + mv {os.path.join(sanitized_name + '.vcf.gz.csi')} {os.path.join(sanitized_name + sanitized_idx)} + """ + subprocess.run(command, shell=True, check=True, cwd=sanitized_path) - # for file in files_reord: - # if file not in filtered: - # file1 = file.split('.')[0] # file does not have extensions - # print(f'file {file1} renamed') - # shell('mv {file1}.vcf.gz {file1}.filtered.vcf.gz') - # shell('mv {file1}.vcf.gz.csi {file1}.filtered.vcf.gz.csi') - # #shell('scripts/create_symlinks.sh {file1}') - # shell('rm {params}') + # Remove all files that do not contain the string new_ext + command = f""" + find {sanitized_path} -type f ! -name '*{sanitized_new_ext}*' -exec rm -f {{}} \; + """ + subprocess.run(command, shell=True, check=True, cwd=sanitized_path) -else: - rule rename: - input: - vcf = [[f'{vcfdir}' + x - for x in expand('/{{chromosome}}/{{chromosome}}_{suffixOne}.vcf.gz', - suffixOne = splitFiles)] if len(splitFiles) != 0 else [], - [f'{vcfdir}' + x - for x in expand('/{{chromosome}}/{{chromosome}}_{suffixTwo}.vcf.gz', - suffixTwo = combinedFiles)] if len(combinedFiles) != 0 else []], - idx = [[f'{vcfdir}' + x - for x in expand('/{{chromosome}}/{{chromosome}}_{suffixOne}.vcf.gz.csi', - suffixOne = splitFiles)] if len(splitFiles) != 0 else [], - [f'{vcfdir}' + x - for x in expand('/{{chromosome}}/{{chromosome}}_{suffixTwo}.vcf.gz.csi', - suffixTwo = combinedFiles)] if len(combinedFiles) != 0 else []], - output: - vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_{{file}}.filtered.vcf.gz', - idx = f'{vcfdir}/{{chromosome}}/{{chromosome}}_{{file}}.filtered.vcf.gz.csi' + rule merge: + input: rules.compare_and_filter_vcfs.output + output: merged_output + params: + files_path = f'{vcfdir}/{{chromosome}}/', + indexes = filtered_output_idx conda: "bcftools" - log: 'logs/Rename_{chromosome}.log' - resources: cpus=1, mem_mb=32000, time_min=30 + threads: 32 + resources: cpus=32, mem_mb=2048000, time_min=1200 + benchmark: 'benchmarks/{chromosome}.merge.benchmark.txt' shell: + r""" + printf '%s\n' {params.files_path}*filtered*.gz > {wildcards.chromosome}.merge + + bcftools merge \ + -m snps \ + -l {wildcards.chromosome}.merge \ + --threads {threads} -O z -o {output} + + rm {wildcards.chromosome}.merge {params.indexes} """ - if [ -h {input} ]; then - ln -s $( realpath {input.vcf} ) {output.vcf} - ln -s $( realpath {input.idx} ) {output.idx} - else - ln -s {input.vcf} {output.vcf} - ln -s {input.idx} {output.idx} - fi + + rule index_merged: + input: rules.merge.output, + output: f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.vcf.gz.csi' + conda: 'bcftools' + benchmark: 'benchmarks/{chromosome}.indxmerged.benchmark.txt' + shell: + r""" + bcftools index -f {input} + """ + + rule vcf_summary: + input: + vcf_file = rules.merge.output, + index = rules.index_merged.output, + output: + summary = f'{vcfdir}/{{chromosome}}/{{chromosome}}_summary.vchk', + # done = touch(f'{vcfdir}/{{chromosome}}/merged_summary.done') + params: + plots = f'{vcfdir}/{{chromosome}}/{{chromosome}}_summary/' + conda: 'bcftools' + benchmark: 'benchmarks/{chromosome}.summary.benchmark.txt' + shell: + r""" + bcftools stats {input.vcf_file} > {output.summary} + 
plot-vcfstats -p {params.plots} {output.summary} """ + input_vcf = rules.merge.output + input_index = rules.index_merged.output + input_summary = rules.vcf_summary.output.summary + +# else, use the output from split_and_move as input for the next step +# else: +# input_vcf = rules.split_and_move_vcfs.output \ No newline at end of file diff --git a/Snakemake/workflow/rules/infer_trees.smk b/Snakemake/workflow/rules/infer_trees.smk index f7c2a76..cf36dcc 100644 --- a/Snakemake/workflow/rules/infer_trees.smk +++ b/Snakemake/workflow/rules/infer_trees.smk @@ -1,36 +1,38 @@ import re -# if multiallelic: -# vcf_4_inference = f'{vcfdir}/{{chromosome}}_allbi.vcf.gz' -# else: -# vcf_4_inference = f'{vcfdir}/{{chromosome}}_ancestral.vcf.gz' - rule prepare_sample_file: input: - #vcf=vcf_4_inference, vcf = f'{vcfdir}/{{chromosome}}_ancestral.vcf.gz', meta = config['meta'] output: - f"../{Project}/Tsinfer/samples/{{chromosome}}.samples" + f'{projdir}/Tsinfer/samples/{{chromosome}}.samples' params: chrLength= lambda wildcards: config['chromosome_length'][int(re.findall(r'\d+', wildcards.chromosome)[0])], ploidy=config['ploidy'] conda: "HLab_tsinfer" - threads: 1 - resources: cpus=1, mem_mb=128000, time_min=200 - log: 'logs/Prepare_sample_file_{chromosome}.log' + threads: 32 + resources: cpus=32, mem_mb=768000, time_min=200 + benchmark: + 'benchmarks/{chromosome}.prep_sample.benchmark.txt' + #log: 'logs/Prepare_sample_file_{chromosome}.log' shell: "python scripts/Prepare_tsinfer_sample_file.py " "{input.vcf} {input.meta} {output} {params.ploidy} {params.chrLength}" rule infer: input: - f"../{Project}/Tsinfer/samples/{{chromosome}}.samples" - conda: "HLab_tsinfer" - threads: 1 - resources: cpus=1, mem_mb=128000, time_min=300 - log: 'logs/Infer_{chromosome}.log' + f'{projdir}/Tsinfer/samples/{{chromosome}}.samples', output: - f"../{Project}/Tsinfer/trees/{{chromosome}}.trees" + f'{projdir}/Tsinfer/trees/{{chromosome}}.trees', + conda: "HLab_tsinfer" + threads: 16 + resources: cpus=16, mem_mb=1024000, time_min=10080 + benchmark: + 'benchmarks/{chromosome}.infer.benchmark.txt' + #log: 'logs/Infer_{chromosome}.log' shell: - "python scripts/Infer_trees.py {input} {output}" + r""" + # Check amount of memory (in kbytes) as seen by the job + ulimit -v + python scripts/Infer_trees.py {input} {output} + """ diff --git a/Snakemake/workflow/rules/keep_snp.smk b/Snakemake/workflow/rules/keep_snp.smk new file mode 100644 index 0000000..323bd1b --- /dev/null +++ b/Snakemake/workflow/rules/keep_snp.smk @@ -0,0 +1,43 @@ +# rule gatk_index: +# input: +# vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz', +# idx = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz.csi' +# output: temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz.tbi'), +# conda: 'gatk-env' +# benchmark: +# 'benchmark/{chromosome}.gatk_index.benchmark.txt' +# shell: +# r""" +# gatk IndexFeatureFile -I {input.vcf} +# """ + +# rule filter_snps: +# input: +# vcf = rules.phase.output, +# idx = rules.index_phased.output, +# output: +# vcf_file = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_snps.vcf.gz'), +# idx_file = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_snps.vcf.gz.csi'), +# params: config['reference_genome'], +# conda: 'gatk-env' +# benchmark: 'benchmarks/{chromosome}.filter_snps.benchmark.txt' +# shell: +# r""" +# gatk SelectVariants \ +# -R {params} \ +# -V {input.vcf} \ +# --select-type-to-include SNP \ +# -O {output.vcf_file} + +# bcftools index {output.vcf_file} +# """ + +# rule index: +# input: 
f'{vcfdir}/{{chromosome}}/{{chromosome}}_snps.vcf.gz', +# output: temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_snps.vcf.gz.csi') +# conda: 'bcftools' +# benchmark: 'benchmarks/{chromosome}.index_filtered_snps.benchmark.txt' +# shell: +# r""" +# bcftools index {input} +# """ diff --git a/Snakemake/workflow/rules/merge_samples_vcf.smk b/Snakemake/workflow/rules/merge_samples_vcf.smk index 86cbc2c..72f74c1 100644 --- a/Snakemake/workflow/rules/merge_samples_vcf.smk +++ b/Snakemake/workflow/rules/merge_samples_vcf.smk @@ -1,50 +1,26 @@ -if len(allFiles) != 1: - rule merge: - input: - vcf = [f'{vcfdir}' + x for x in expand('/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.gz', file=allFiles)], - idx = [f'{vcfdir}' + x for x in expand('/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.gz.csi', file=allFiles)], - output: - vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.vcf.gz', - params: - files_path = f'{vcfdir}/{{chromosome}}/' - conda: "bcftools" - threads: 32 - resources: cpus=32, mem_mb=2048000, time_min=1200 - benchmark: 'benchmarks/{chromosome}.merge.benchmark.txt' - shell: - r""" - printf '%s\n' {params.files_path}*.gz > files.merge - - bcftools merge \ - -m snps \ - -l files.merge \ - --threads {threads} -O z -o {output.vcf} - """ - - rule index_merged: - input: f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.vcf.gz', - output: f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.vcf.gz.csi' - conda: 'bcftools' - shell: - r""" - bcftools index -f {input} - """ +# Filter out INDELS and missing data (>20%) from the vcf file +# The input vcf file is the merged vcf file from the previous step +# or the output from split and move (if only one file is present) rule remove_missing_indels: input: - vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.vcf.gz', - index = f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.vcf.gz.csi' + vcf = input_vcf, + index = input_index, + summary = input_summary, output: - vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.recode.vcf.gz', - index = f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.recode.vcf.gz.csi' + vcf = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.recode.vcf.gz'), + index = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.recode.vcf.gz.csi') params: - prefix = f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged' + prefix = f'{vcfdir}/{{chromosome}}/{{chromosome}}', + missing = config['missing_data'], + indel = '--remove-indels' if config['remove_indel'] else '' conda: 'bcftools' + benchmark: 'benchmarks/{chromosome}.missindel.benchmark.txt' shell: r""" vcftools --gzvcf {input.vcf} \ - --remove-indels \ - --max-missing 0.1 \ + {params.indel} \ + --max-missing {params.missing} \ --remove-filtered-all \ --recode \ --out {params.prefix} @@ -52,21 +28,5 @@ rule remove_missing_indels: bgzip {params.prefix}.recode.vcf bcftools index -f {output.vcf} - rm {params.prefix}.recode.vcf - """ - -# rule summary: -# input: -# vcf_file = rules.filter_merged_vcf.output.vcf, -# index = rules.filter_merged_vcf.output.index, -# output: -# summary = f'{vcfdir}/{{chromosome}}/{{chromosome}}_summary.vchk' -# params: -# plots = f'{vcfdir}/{{chromosome}}/{{chromosome}}_summary/' -# conda: 'bcftools' -# benchmark: 'benchmarks/{chromosome}.summary.benchmark.txt' -# shell: -# r""" -# bcftools stats {input.vcf_file} > {output.summary} -# plot-vcfstats -p {params.plots} {output.summary} -# """ \ No newline at end of file + # rm {params.prefix}.recode.vcf + """ diff --git a/Snakemake/workflow/rules/new_infer_trees.smk b/Snakemake/workflow/rules/new_infer_trees.smk index 
930c3ca..0dec01f 100644 --- a/Snakemake/workflow/rules/new_infer_trees.smk +++ b/Snakemake/workflow/rules/new_infer_trees.smk @@ -8,7 +8,8 @@ rule prepare_sample_file: input: - vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf.gz', + vcf = input_ts, + # f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf.gz', meta = config['meta'] output: f'{projdir}/Tsinfer/samples/{{chromosome}}.samples' params: @@ -16,103 +17,92 @@ rule prepare_sample_file: conda: "HLab_tsinfer" threads: 32 resources: cpus = 32, mem_mb = 768000, time_min = 200 - benchmark: - 'benchmarks/{chromosome}_new.prep_sample.benchmark.txt' + benchmark: 'benchmarks/{chromosome}_new.prep_sample.benchmark.txt' shell: r""" ulimit -v python scripts/generate_samples.py \ - {input.vcf} {input.meta} {output} {params.chrLength} + {input.vcf} {output} {input.meta} {params.chrLength} {threads} """ rule generate_ancestors: input: - samples = f'{projdir}/Tsinfer/samples/{{chromosome}}.samples', + samples = rules.prepare_sample_file.output, sites_to_exclude = f'{vcfdir}/{{chromosome}}/{{chromosome}}.exclude', output: ancestors_samples = f'{projdir}/Tsinfer/samples/{{chromosome}}.ancestors', conda: 'HLab_tsinfer' threads: 64 resources: cpus = 64, mem_mb = 2048000, time_min = 1680 - benchmark: - 'benchmarks/{chromosome}_generate_ancestors.benchmark.txt' + benchmark: 'benchmarks/{chromosome}_generate_ancestors.benchmark.txt' shell: r""" # Check amount of memory (in kbytes) as seen by the job ulimit -v python scripts/generate_ancestors.py {input.samples} {input.sites_to_exclude} {output.ancestors_samples} {threads} """ -# tsinfer generate-ancestors {input} \ -# --num-threads {threads} \ -# --num-flush-threads {threads} \ -# --progress \ -# --verbosity \ -# --log-section tsinfer.inference - rule truncate_ancestors: input: - ancestors_samples = f'{projdir}/Tsinfer/samples/{{chromosome}}.ancestors', + ancestors_samples = rules.generate_ancestors.output.ancestors_samples, output: truncated_ancestors = f'{projdir}/Tsinfer/samples/{{chromosome}}.ancestors.truncated', - # params: - # upper = 0.4, - # lower = 0.2, + params: + upper = config['truncate_upper'], + lower = config['truncate_lower'], conda: 'HLab_tsinfer' threads: 32 resources: cpus=32, mem_mb=2048000, time_min=1680 - benchmark: - 'benchmarks/{chromosome}_truncate.benchmark.txt' + benchmark: 'benchmarks/{chromosome}_truncate.benchmark.txt' shell: r""" ulimit -v - python scripts/truncate.py {input} {output} + python scripts/truncate.py {input} {params.upper} {params.lower} {output} """ rule ancestors_trees: input: - samples = f'{projdir}/Tsinfer/samples/{{chromosome}}.samples', - truncated_ancestors = f'{projdir}/Tsinfer/samples/{{chromosome}}.ancestors.truncated', + samples = rules.prepare_sample_file.output, + truncated_ancestors = rules.truncate_ancestors.output.truncated_ancestors, output: ancestors_trees = f'{projdir}/Tsinfer/trees/{{chromosome}}.atrees', params: - re = '1.1e-8', - mima = '1' + re = f"--recombination-rate {config['recombination_rate_anc']}" if config['recombination_rate_anc'] else '', + misma = f"--mismatch-ratio {config['mismatch_ratio_anc']}" if config['mismatch_ratio_anc'] else '', conda: 'HLab_tsinfer' threads: 32 resources: cpus = 32, mem_mb = 2048000, time_min = 1680 - benchmark: - 'benchmarks/{chromosome}_ancestors_trees.benchmark.txt' + benchmark: 'benchmarks/{chromosome}_ancestors_trees.benchmark.txt' shell: r""" tsinfer match-ancestors {input.samples} \ -A {output} \ + {params.re} \ + {params.misma} \ --log-section tsinfer.inference \ --num-threads 
{threads} \ --progress \ --verbosity \ --ancestors {input.truncated_ancestors} """ -# --recombination-rate {params.re} \ - # --mismatch-ratio {params.mima} \ - rule infer_trees: input: - samples = f'{projdir}/Tsinfer/samples/{{chromosome}}.samples', - ancestors_trees = f'{projdir}/Tsinfer/trees/{{chromosome}}.atrees', + samples = rules.prepare_sample_file.output, + ancestors_trees = rules.ancestors_trees.output.ancestors_trees, output: trees = f'{projdir}/Tsinfer/trees/{{chromosome}}.trees', params: - re = '1.1e-8', - mima = '1' + re = f"--recombination-rate {config['recombination_rate_tree']}" + if config['recombination_rate_tree'] else '', + misma = f"--mismatch-ratio {config['mismatch_ratio_tree']}" + if config['mismatch_ratio_tree'] else '', conda: 'HLab_tsinfer' threads: 32 resources: cpus=32, mem_mb=2048000, time_min=1680 - benchmark: - 'benchmarks/{chromosome}_infer.benchmark.txt' + benchmark: 'benchmarks/{chromosome}_infer.benchmark.txt' shell: r""" # Check amount of memory (in kbytes) as seen by the job @@ -120,26 +110,30 @@ rule infer_trees: tsinfer match-samples {input.samples} \ -A {input.ancestors_trees} \ --num-threads {threads} \ + {params.re} \ + {params.misma} \ --progress \ --log-section tsinfer.inference \ --verbosity \ -O {output} """ -# --recombination-rate {params.re} \ -# --mismatch-ratio {params.mima} \ rule simplify_trees: input: - trees = f'{projdir}/Tsinfer/trees/{{chromosome}}.trees', + trees = rules.infer_trees.output.trees, output: - simplified_trees = f'{projdir}/Tsinfer/trees/{{chromosome}}.simplified.trees', + f'{projdir}/Tsinfer/trees/{{chromosome}}.dated.trees', + params: + genint = config['generation_interval'], + mutrate = config['mutation_rate'], conda: 'HLab_tsinfer' threads: 32 resources: cpus=32, mem_mb=2048000, time_min=1680 - benchmark: - 'benchmarks/{chromosome}_simplify.benchmark.txt' + benchmark: 'benchmarks/{chromosome}_simplify.benchmark.txt' shell: r""" ulimit -v - python scripts/simplify.py {input} {output} - """ \ No newline at end of file + python scripts/simplify.py {input} {params.genint} {params.mutrate} {output} + """ + +# ,[^.] 
\ No newline at end of file
diff --git a/Snakemake/workflow/rules/phasing.smk b/Snakemake/workflow/rules/phasing.smk
index 53e59db..6ea0fad 100644
--- a/Snakemake/workflow/rules/phasing.smk
+++ b/Snakemake/workflow/rules/phasing.smk
@@ -1,29 +1,27 @@
-file_to_phase = rules.remove_missing_indels.output.vcf
-file_to_phase_idx = rules.remove_missing_indels.output.index
+# Phase the files if the species is diploid, using the output from the
+# remove_missing_indels rule
+
+file_to_phase = f'{vcfdir}/{{chromosome}}/{{chromosome}}.recode.vcf.gz'
+file_to_phase_idx = f'{vcfdir}/{{chromosome}}/{{chromosome}}.recode.vcf.gz.csi'
 phased_output = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz'
 phased_output_idx = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz.csi'
 
+checkpoint decide_start_point:
+    output: "workflow_start.txt"
+    shell:
+        """
+        if [ "{config[phase_only]}" = "False" ] && [ -f {phased_output} ] && [ -f {phased_output_idx} ]; then
+            echo "ancestral_info_to_vcf" > {output}
+        else
+            echo "start" > {output}
+        fi
+        """
+
 if config['ploidy'] == 1:
-    rule rename_phased:
-        input:
-            vcf = file_to_phase,
-            idx = file_to_phase_idx,
-        output:
-            vcf = temp(phased_output),
-            idx = temp(phased_output_idx)
-        log: 'logs/Rename_phased_{chromosome}.log'
-        resources: cpus=1, mem_mb=32000, time_min=60
-        shell:
-            """
-            if [ -h {input.vcf} ]; then
-                ln -s $( realpath {input.vcf} ) {output.vcf}
-                ln -s $( realpath {input.vcf} ).csi {output.vcf}
-            else
-                ln -s {input.vcf} {output.vcf}
-                ln -s {input.idx} {output.idx}
-            fi
-            """
+    input_ancestral = file_to_phase
+    input_ancestral_idx = file_to_phase_idx
+
 else:
     rule phase:
         input:
@@ -36,8 +34,7 @@ else:
         threads: 32
         resources: cpus = 32, mem_mb = 2048, time_min = 7200
         conda: 'shapeit4am'
-        benchmark:
-            'benchmarks/{chromosome}.shapeit.benchmark.txt'
+        benchmark: 'benchmarks/{chromosome}.shapeit.benchmark.txt'
         shell:
             r"""
             str='{wildcards.chromosome}'
@@ -53,15 +50,17 @@ else:
             --log {wildcards.chromosome}_phased.log
             """
 
-    rule index_phased:
-        input:
-            rules.phase.output.vcf
-        output:
-            idx = phased_output_idx
+    rule index_phased_cleanup:
+        input: rules.phase.output
+        output: phased_output_idx
+        params:
+            chr = lambda wildcards: wildcards.chromosome[3:],
+            dire = f'{vcfdir}/{{chromosome}}',
         conda: 'bcftools'
-        benchmark:
-            'benchmarks/{chromosome}.bcftools_index.bechmark.txt'
+        benchmark: 'benchmarks/{chromosome}.bcftools_index.benchmark.txt'
         shell:
             r"""
-            bcftools index -f {input}
-            """
+            bcftools index {input}
+            """
+    input_ancestral = phased_output
+    input_ancestral_idx = phased_output_idx
\ No newline at end of file
diff --git a/Snakemake/workflow/rules/prepare_files_for_tsinfer.smk b/Snakemake/workflow/rules/prepare_files_for_tsinfer.smk
index cd855b2..52565f4 100644
--- a/Snakemake/workflow/rules/prepare_files_for_tsinfer.smk
+++ b/Snakemake/workflow/rules/prepare_files_for_tsinfer.smk
@@ -5,19 +5,20 @@
 
 rule get_af:
-    input: f'{vcfdir}/{{chromosome}}_phased.vcf.gz'
+    input: f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz'
     output:
-        info = temp(f'{vcfdir}/Info{{chromosome}}.INFO')
+        vcf = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_info.vcf.gz'),
+        info = temp(f'{vcfdir}/{{chromosome}}/Info{{chromosome}}.INFO')
     params:
-        prefix=f'{vcfdir}/Info{{chromosome}}'
+        prefix=f'{vcfdir}/{{chromosome}}/Info{{chromosome}}'
     conda: "bcftools"
     threads: 1
     resources: cpus=1, mem_mb=32000, time_min=60
     log: 'logs/get_af_{chromosome}.log'
     shell:
         """
-        bcftools +fill-tags {input} -Oz -o {input} -- -t AN,AC,AF
-        vcftools --gzvcf {input} --out 
{params.prefix} --get-INFO AC --get-INFO AF + bcftools +fill-tags {input} -Oz -o {output.vcf} -- -t AN,AC,AF + vcftools --gzvcf {output.vcf} --out {params.prefix} --get-INFO AC --get-INFO AF LOGFILE={params.prefix}.log if test -f "$LOGFILE"; then @@ -27,12 +28,10 @@ rule get_af: rule get_major: input: rules.get_af.output.info - output: temp(f'{vcfdir}/Major{{chromosome}}.txt') + output: temp(f'{vcfdir}/{{chromosome}}/Major{{chromosome}}.txt') threads: 1 resources: cpus=1, mem_mb=32000, time_min=60 log: 'logs/Get_major_{chromosome}.log' -# wildcard_constraints: -# chromosome="^chr | ^Chr" shell: """ awk '{{if (NR!=1 && $5>=0.5) {{print $1"_"$2","$4}} else if (NR!=1 && $5<0.5) {{print $1"_"$2","$3}}}}' {input} > {output} @@ -42,7 +41,7 @@ rule extract_ancestral_chromosome: input: ancestral_file output: - temp(f'{vcfdir}/Ancestral{{chromosome}}.txt') + temp(f'{vcfdir}/{{chromosome}}/Ancestral{{chromosome}}.txt') params: chrNum = lambda wc: wc.get('chromosome')[3:] log: 'logs/Extract_ancestral_chromosome_{chromosome}.log' @@ -57,7 +56,7 @@ rule join_major_ancestral: ancestral = rules.extract_ancestral_chromosome.output, major = rules.get_major.output output: - temp(f'{vcfdir}/MajAAnc_{{chromosome}}.txt') + temp(f'{vcfdir}/{{chromosome}}/MajAAnc_{{chromosome}}.txt') log: 'logs/Join_major_ancestral_{chromosome}.log' resources: cpus=1, mem_mb=32000, time_min=60 shell: @@ -69,7 +68,7 @@ rule determine_ancestral_major: input: rules.join_major_ancestral.output output: - temp(f'{vcfdir}/MajOAnc_{{chromosome}}.txt') + temp(f'{vcfdir}/{{chromosome}}/MajOAnc_{{chromosome}}.txt') log: 'logs/Determine_major_ancestral_{chromosome}.log' shell: """ @@ -78,9 +77,9 @@ rule determine_ancestral_major: rule decompress: input: - vcf=f'{vcfdir}/{{chromosome}}_phased.vcf.gz', - majororaa=rules.determine_ancestral_major.output - output: f'{vcfdir}/{{chromosome}}_phased.vcf' # this is removing both the .gz and the decompressed file + vcf = rules.get_af.output.vcf, + majororaa = rules.determine_ancestral_major.output + output: f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf' threads: 1 resources: cpus=1, mem_mb=32000, time_min=30 log: 'logs/Decompress_{chromosome}.log' @@ -92,9 +91,9 @@ rule decompress: rule change_infoAA_vcf: input: - vcf=rules.decompress.output, - ancestralAllele=rules.determine_ancestral_major.output - output: f'{vcfdir}/{{chromosome}}_ancestral.vcf' + vcf = rules.decompress.output, + ancestralAllele = rules.determine_ancestral_major.output + output: f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf' conda: "bcftools" threads: 1 resources: cpus=1, mem_mb=64000, time_min=120 @@ -109,17 +108,23 @@ rule change_infoAA_vcf: """ rule compress_vcf: - input: rules.change_infoAA_vcf.output, + input: + vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf', output: - file = f'{vcfdir}/{{chromosome}}_ancestral.vcf.gz', - idx = f'{vcfdir}/{{chromosome}}_ancestral.vcf.gz.csi' + vcf = f'{vcfdir}/{{chromosome}}_ancestral.vcf.gz', + idx = f'{vcfdir}/{{chromosome}}_ancestral.vcf.gz.csi', + params: + chrdir = f'{vcfdir}/{{chromosome}}/', + outdir = f'{vcfdir}/' conda: "bcftools" threads: 1 resources: cpus=1, mem_mb=64000, time_min=60 log: 'logs/Compress_{chromosome}.log' shell: + r""" + bgzip {input.vcf} + bcftools index {input.vcf}.gz + + mv {input}.gz {input}.gz.csi {params.outdir} + rm -r {params.chrdir} """ - bgzip {input} - bcftools index -f {output.file} - """ -# keeping the phased files w/o ancestral alleles diff --git a/Snakemake/workflow/rules/split.smk b/Snakemake/workflow/rules/split.smk 
index 139bc49..114a7db 100644 --- a/Snakemake/workflow/rules/split.smk +++ b/Snakemake/workflow/rules/split.smk @@ -1,12 +1,29 @@ +# This script is used for managing VCF files in a directory previous to tree inference +# It identifies two types of VCF files in the 'RawVCF' directory: +# 1. Files that start with either "Chr" or "chr" and end with "vcf.gz" (referred to as 'splitFiles') +# 2. Files that start with "Combined" and end with "vcf.gz" (referred to as 'combinedFiles') +# It then creates regex patterns for these file names for later use in rules. +# The script also combines the names of both types of files into a single list (allFiles). +# If there are any 'splitFiles', it defines a rule (move_vcf) to move these files +# from the 'RawVCF' directory to a directory named after the chromosome in the file name. +# The rule also renames the files to include the chromosome name at the start of the file name. +import os +import re +print(chromosomes) splitFiles = list(set([f.strip("vcf.gz").split("_")[1] for f in os.listdir(f'{vcfdir}/RawVCF') - if (f.endswith("vcf.gz") and (f.startswith("Chr") | f.startswith("chr")) )])) + if (f.endswith("vcf.gz") and (f.startswith("Chr") | f.startswith("chr")) and "_" in f)])) combinedFiles = list(set([f.strip("vcf.gz").split("_")[1] for f in os.listdir(f'{vcfdir}/RawVCF') if (f.endswith("vcf.gz") and f.startswith("Combined"))])) +# Prepare a regex pattern that matches any of the file names in splitFiles and +# combinedFiles so rules only look for them +splitFiles_pattern = "(" + "|".join(re.escape(name) for name in splitFiles) + ")" +combinedFiles_pattern = "(" + "|".join(re.escape(name) for name in combinedFiles) + ")" + allFiles = splitFiles + combinedFiles if len(splitFiles) > 0: @@ -20,7 +37,9 @@ if len(splitFiles) > 0: conda: "bcftools" threads: 1 resources: cpus=1, mem_mb=32000, time_min=60 - log: 'logs/Move_vcf_{chromosome}_{suffixOne}.log' + benchmark: 'benchmarks/{chromosome}.{suffixOne}.move.benchmark.txt' + wildcard_constraints: + suffixOne = splitFiles_pattern shell: """ ln -s {input.vcf} {output.vcf} @@ -32,13 +51,15 @@ if len(combinedFiles) > 0: input: f'{vcfdir}/RawVCF/Combined_{{suffixTwo}}.vcf.gz' output: - [f'{vcfdir}' + x + temp([f'{vcfdir}' + x for x in expand('/{{chromosome}}/{{chromosome}}_{{suffixTwo}}.vcf.gz', - chromosome = chromosomes, suffixTwo = combinedFiles)] + chromosome = chromosomes, suffixTwo = combinedFiles)]) conda: "bcftools" threads: 1 resources: cpus=1, mem_mb=64000, time_min=60 - log: expand('logs/Split_and_move_vcfs_{{chromosome}}_{{suffixTwo}}.log') + benchmark: 'benchmarks/{chromosome}.{suffixTwo}.split.benchmark.txt' + wildcard_constraints: + suffixTwo = combinedFiles_pattern shell: """ str='{wildcards.chromosome}' diff --git a/Snakemake/workflow/scripts/Infer_trees.py b/Snakemake/workflow/scripts/Infer_trees.py index 50810fb..41f6b53 100644 --- a/Snakemake/workflow/scripts/Infer_trees.py +++ b/Snakemake/workflow/scripts/Infer_trees.py @@ -44,6 +44,8 @@ progress_monitor=True, ) +ancestors_ts.dump(f'{outputfile}.ancestors') + ts = tsinfer.match_samples( sampleFile, ancestors_ts, @@ -53,25 +55,24 @@ progress_monitor=True, ).simplify(keep_unary=False) -""" -ts = tsinfer.infer(sampleFile) +#ts = tsinfer.infer(sampleFile) print( "Inferred tree sequence `{}`: {} trees over {} Mb".format( - "drone_ts", ts.num_trees, ts.sequence_length / 1e6 + "ts", ts.num_trees, ts.sequence_length / 1e6 ) ) -""" -# # Check the metadata -# for sample_node_id in ts.samples(): -# individual_id = ts.node(sample_node_id).individual -# 
population_id = ts.node(sample_node_id).population
-#     print(
-#         "Node",
-#         sample_node_id,
-#         "labels genome sampled from",
-#         json.loads(ts.individual(individual_id).metadata),
-#         "in",
-#         json.loads(ts.population(population_id).metadata)["subpop"],
-#     )
+
+# Check the metadata
+for sample_node_id in ts.samples():
+    individual_id = ts.node(sample_node_id).individual
+    population_id = ts.node(sample_node_id).population
+    print(
+        "Node",
+        sample_node_id,
+        "labels genome sampled from",
+        json.loads(ts.individual(individual_id).metadata),
+        "in",
+        json.loads(ts.population(population_id).metadata)["subpop"],
+    )
 
 ts.dump(outputFile)
diff --git a/Snakemake/workflow/scripts/Prepare_tsinfer_sample_file.py b/Snakemake/workflow/scripts/Prepare_tsinfer_sample_file.py
index 84972c3..5d9c45e 100644
--- a/Snakemake/workflow/scripts/Prepare_tsinfer_sample_file.py
+++ b/Snakemake/workflow/scripts/Prepare_tsinfer_sample_file.py
@@ -8,6 +8,8 @@
 import numpy as np
 import pandas as pd
 import pkg_resources
+from itertools import chain
+from tsinfer import MISSING_DATA
 
 print(sys.executable)
 
@@ -70,7 +72,9 @@ def add_sites(vcf, samples, ploidy):
     for variant in vcf:  # Loop over variants, each assumed at a unique site
         progressbar.update(variant.POS - pos)
         if pos == variant.POS:
-            raise ValueError("Duplicate positions for variant at position", pos)
+            print("Duplicate positions for variant at position", pos)
+            continue
+
         else:
             pos = variant.POS
         if ploidy > 1:
@@ -83,10 +87,10 @@ def add_sites(vcf, samples, ploidy):
                 alleles = alleles + [letter for letter in string]
                 #ALT = ALT + [letter for letter in string]
 
-        print(alleles)
+        #print(alleles)
         # ignores non-biallelic sites
         if len(alleles) > 2:
-            print('Ignoring non-biallelic site')
+            # print('Ignoring non-biallelic site')
             continue
 
         # if len([*ancestral]) > 1:
@@ -95,7 +99,7 @@ def add_sites(vcf, samples, ploidy):
         #     #continue
 
         ancestral = variant.INFO.get("AA", variant.REF)
-        print(f'alleles: {alleles} | ancestal: {[*ancestral]}')
+        #print(f'alleles: {alleles} | ancestral: {[*ancestral]}')
 
         try:
             ancestral_allele = alleles.index(ancestral[0])
@@ -111,6 +115,7 @@ def add_sites(vcf, samples, ploidy):
             ancestral_allele=alleles.index(ancestral[0]))
 
 
+
 def add_populations(vcf, samples, metaData):
     """
     This is a modified add populations function for drones from David Wragg data
@@ -121,10 +126,23 @@ def add_populations(vcf, samples, metaData):
     """
     pop_lookup = {}
     metaPop = metaData.iloc[:, 1:].drop_duplicates().dropna(axis = 0, how = 'all')
-    sample_subpop = [list(metaData.SubPop[metaData.ID == x]) for x in vcf.samples]
-    for subpop, pop in zip(metaPop.SubPop, metaPop.Pop):
-        pop_lookup[subpop] = samples.add_population(metadata={"pop": pop, "subpop": subpop})
-    return [pop_lookup[subpop[0]] for subpop in sample_subpop]
+
+    # to account for the same subpop occurring in different countries, create a 'fake subpop' to be used as key
+    # if a subpop is repeated, its new code is subpop+origin; otherwise it stays subpop
+    d = [a if not (sum(j == a for j in metaPop.SubPop[:i])) else f'{a}{c}' for (i,a),c in zip(enumerate(metaPop.SubPop),metaPop.Origin)]
+    metaPop['code'] = d
+    # add the code column to the full metadata file
+    metaData = pd.merge(metaData, metaPop[['Pop', 'SubPop', 'Origin', 'code']], on = ['Pop', 'SubPop', 'Origin'], how = 'inner')
+
+    # list population per sample based on recoded subpop
+    sample_subpop = [list(metaData.code[metaData.ID == x]) for x in vcf.samples]
+
+    # the dictionary key is based on the recoding - so it keeps the correct number of populations,
+    # but the metadata stores the real subpop 
value + for subpop, pop, origin, lat, lon, code in zip(metaPop.SubPop, metaPop.Pop, metaPop.Origin, metaPop.Lat, metaPop.Lon, metaPop.code): + pop_lookup[code] = samples.add_population(metadata={"pop": pop, "subpop": subpop, "origin": origin, "coord1": lat, "coord2":lon}) + return [pop_lookup[code[0]] for code in sample_subpop] + ####################################################################### # Read in the information and prepare the data @@ -138,7 +156,7 @@ def add_populations(vcf, samples, metaData): # Create samples for haploid data with tsinfer.SampleData(path=sampleFile, sequence_length=chrLength, - num_flush_threads=20, max_file_size=2**30) as samples: + num_flush_threads=16, max_file_size=2**30) as samples: populations = add_populations(vcf = vcfD, samples = samples, metaData = metaFile) print("populations determined") add_individuals(vcf = vcfD, samples = samples, populations = populations, metaData = metaFile, ploidy = ploidy) @@ -148,5 +166,6 @@ def add_populations(vcf, samples, metaData): print( "Sample file created for {} samples ".format(samples.num_samples) + "({} individuals) ".format(samples.num_individuals) - + "with {} variable sites.".format(samples.num_sites) -) + + "with {} variable sites.".format(samples.num_sites), + flush=True, +) \ No newline at end of file diff --git a/Snakemake/workflow/scripts/generate_ancestors.py b/Snakemake/workflow/scripts/generate_ancestors.py index 1e4ce05..9a8ef82 100644 --- a/Snakemake/workflow/scripts/generate_ancestors.py +++ b/Snakemake/workflow/scripts/generate_ancestors.py @@ -1,5 +1,6 @@ import sys import tsinfer +import numcodecs import numpy as np args = sys.argv @@ -9,6 +10,7 @@ output = args[3] threads = int(args[4]) +gzip_compressor = numcodecs.GZip(level=1) ancestors = tsinfer.generate_ancestors( sample_data=samples, @@ -16,4 +18,5 @@ exclude_positions=sites_to_exclude, progress_monitor=True, num_threads=threads, + compressor=gzip_compressor, ) \ No newline at end of file diff --git a/Snakemake/workflow/scripts/generate_samples.py b/Snakemake/workflow/scripts/generate_samples.py index 9db40a9..3543fc7 100644 --- a/Snakemake/workflow/scripts/generate_samples.py +++ b/Snakemake/workflow/scripts/generate_samples.py @@ -6,10 +6,13 @@ import json import cyvcf2 import tsinfer +import numcodecs import numpy as np import pandas as pd from tqdm import tqdm +# create a GZip compressor instance with a specific compression level +gzip_compressor = numcodecs.GZip(level=1) def add_populations(vcf, samples, metaData): """ @@ -88,13 +91,33 @@ def add_diploid_sites(vcf, samples): genotypes = [g for row in variant.genotypes for g in row[0:2]] samples.add_site(pos, genotypes, alleles, ancestral_allele=ancestral_allele) +def process_samples(file, metadata, sample_path, chr_length, compressor, threads): + with tsinfer.SampleData(path=sample_path, + sequence_length=chr_length, + compressor=compressor, + num_flush_threads=threads, max_file_size=2**30) as samples: + populations = add_populations(vcf=file, samples=samples, metaData=metadata) + print("populations determined") + add_diploid_individuals(vcf=file, samples=samples, populations=populations) + print("individuals added") + add_diploid_sites(vcf=file, samples=samples) + print("sites added") + + print( + "Sample file created for {} samples ".format(samples.num_samples) + + "({} individuals) ".format(samples.num_individuals) + + "with {} variable sites.".format(samples.num_sites), + flush=True, + ) + # 
----------------------------------------------------------------------------------------- args = sys.argv vcfFile = args[1] -meta = args[2] -sampleFile = args[3] +sampleFile = args[2] +meta = args[3] chrLength = args[4] +nthreads = int(args[5]) metaFile = pd.read_csv(meta) @@ -102,19 +125,45 @@ def add_diploid_sites(vcf, samples): vcfD = cyvcf2.VCF(vcfFile, strict_gt=True) # Create samples for haploid data -with tsinfer.SampleData(path=sampleFile, - sequence_length=chrLength, - num_flush_threads=16, max_file_size=2**30) as samples: - populations = add_populations(vcf = vcfD, samples = samples, metaData = metaFile) - print("populations determined") - add_diploid_individuals(vcf = vcfD, samples = samples, populations = populations) - print("individuals added") - add_diploid_sites(vcf = vcfD, samples = samples) - print("sites added") +try: + process_samples(vcfD, metaFile, sampleFile, chrLength, gzip_compressor, nthreads) +except Exception as e: + print(f"Error encountered: {e}. Retrying with num_flush_threads set to 16.") + process_samples(vcfD, metaFile, sampleFile, chrLength, gzip_compressor, 16) + +# with tsinfer.SampleData(path=sampleFile, +# sequence_length=chrLength, +# num_flush_threads=nthreads, max_file_size=2**30) as samples: +# populations = add_populations(vcf = vcfD, samples = samples, metaData = metaFile) +# print("populations determined") +# add_diploid_individuals(vcf = vcfD, samples = samples, populations = populations) +# print("individuals added") +# add_diploid_sites(vcf = vcfD, samples = samples) +# print("sites added") -print( - "Sample file created for {} samples ".format(samples.num_samples) - + "({} individuals) ".format(samples.num_individuals) - + "with {} variable sites.".format(samples.num_sites), - flush=True, -) +# ----------------------------------------------------------------------------------------- +# def execute_with_threads(nthreads): +# try: +# with tsinfer.SampleData(path=sampleFile, +# sequence_length=chrLength, +# compressor=gzip_compressor, +# num_flush_threads=nthreads, max_file_size=2**30) as samples: +# populations = add_populations(vcf=vcfD, samples=samples, metaData=metaFile) +# print("populations determined") +# add_diploid_individuals(vcf=vcfD, samples=samples, populations=populations) +# print("individuals added") +# add_diploid_sites(vcf=vcfD, samples=samples) +# print("sites added") +# except Exception as e: +# print(f"Error encountered: {e}. 
Retrying with num_flush_threads set to 16.") +# with tsinfer.SampleData(path=sampleFile, +# sequence_length=chrLength, +# compressor=gzip_compressor, +# num_flush_threads=16, max_file_size=2**30) as samples: +# populations = add_populations(vcf=vcfD, samples=samples, metaData=metaFile) +# print("populations determined") +# add_diploid_individuals(vcf=vcfD, samples=samples, populations=populations) +# print("individuals added") +# add_diploid_sites(vcf=vcfD, samples=samples) +# print("sites added") +# execute_with_threads(nthreads) diff --git a/Snakemake/workflow/scripts/simplify.py b/Snakemake/workflow/scripts/simplify.py index f59ebfa..7659671 100644 --- a/Snakemake/workflow/scripts/simplify.py +++ b/Snakemake/workflow/scripts/simplify.py @@ -1,9 +1,29 @@ import sys import tskit +import subprocess +# upgrade to the latest version of tsdate +subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "tsdate"]) +import tsdate args = sys.argv ts = tskit.load(args[1]) -output = args[2] +generation_interval = float(args[2]) +mu = float(args[3]) +output = args[4] -ts_simplified = ts.simplify(keep_unary=False) -ts_simplified.dump(output) \ No newline at end of file +# Ne = ts.diversity()/4/mu + +processed_ts = tsdate.preprocess_ts( + ts, + split_disjoint=True, + keep_unary=False, + ) + +dated_ts = tsdate.variational_gamma(tree_sequence = processed_ts, + mutation_rate = mu, + eps = generation_interval, + max_iterations = 100, + time_units = 'generations', + progress = True,) + +dated_ts.dump(output) \ No newline at end of file diff --git a/Snakemake/workflow/scripts/truncate.py b/Snakemake/workflow/scripts/truncate.py index 17037bc..1fbf0f4 100644 --- a/Snakemake/workflow/scripts/truncate.py +++ b/Snakemake/workflow/scripts/truncate.py @@ -3,16 +3,15 @@ args = sys.argv ancestors = tsinfer.load(args[1]) -#uprtime = args[2] -#lwertime = args[3] -output = args[2] +uprtime = float(args[2]) +lwertime = float(args[3]) +output = args[4] ancestors.truncate_ancestors( path=output, max_file_size=2**38, - lower_time_bound=0.2, - upper_time_bound=0.4, + lower_time_bound=lwertime, + upper_time_bound=uprtime, length_multiplier=2, ) - diff --git a/Snakemake/workflow/scripts/tsparam.py b/Snakemake/workflow/scripts/tsparam.py index 8634e20..676e691 100644 --- a/Snakemake/workflow/scripts/tsparam.py +++ b/Snakemake/workflow/scripts/tsparam.py @@ -1,5 +1,5 @@ # parameters for tsinfer --- change accordingly -threads=20 +threads=16 lwertime=0.4 uprtime=0.6 lenmultiply=2 From 5606ece6beacac61100aad952a46f789b1d88338 Mon Sep 17 00:00:00 2001 From: Gabriela Mafra Fortuna <88657362+gmafrafortuna@users.noreply.github.com> Date: Thu, 25 Jul 2024 06:30:32 +0200 Subject: [PATCH 13/15] Update Snakefile --- Snakemake/workflow/Snakefile | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/Snakemake/workflow/Snakefile b/Snakemake/workflow/Snakefile index 27b38e1..1f6de40 100644 --- a/Snakemake/workflow/Snakefile +++ b/Snakemake/workflow/Snakefile @@ -15,11 +15,6 @@ from os.path import isfile, join configfile: '../config/glob_cattle.yaml' report: "reports/tsinfer.rst" -# specify config files on the command line using --configfile -#configfile: '../config/ancestral_Eddie.yaml' -#configfile: '../config/tsinfer_Eddie.yaml' - - ##### setup singularity ##### # Can we set conda env here??? @@ -31,27 +26,13 @@ report: "reports/tsinfer.rst" # (2) sort into the correct Chr folder in vcfDir. 
# defining variables that will be used throughout the script - -# the final directory name is now editable -- so can be named according to specific project -# mapdir indicates where to find the genetic map for phasing (full path in the config) -mapdir = config['genmap'] -oDir = Path(config['o_dir'], config['PROJECT']) - -vcfdir=config['vcf_dir'] # from config file, the directory containing the VCF(s) to process -# the path where to save intermetdiate and final VCFs -if config['process_vcf_in_original_dir']: - vcfOut = config['vcf_dir'] # same as above -else: - vcfOut = f"{oDir}/VCF" # VCF dir inside the output dir - - +vcfdir=config['vcf_dir'] # from config file, the path to vcf # this folder can be any where but must have the following structure: # vcfDir: # - RawVCF # - Chr1 # -Chr... - # the final directory name is now editable -- so can be named according to specific project # mapdir indicates where to find the genetic map for phasing (full path in the config) Project = config['PROJECT'] @@ -86,4 +67,4 @@ else: # each chromosome. These files are stored in the vcf folder declaired in the # configuration file. It also checks that vcfs have been filtered before merged rule all: - input: rule_all_input \ No newline at end of file + input: rule_all_input From f4cc4b11cf27c0b59c38d60e928e3ac0f1e8e893 Mon Sep 17 00:00:00 2001 From: Gabriela Mafra Fortuna <88657362+gmafrafortuna@users.noreply.github.com> Date: Thu, 25 Jul 2024 16:09:18 +0200 Subject: [PATCH 14/15] Update ancestral_info_to_vcf.smk --- Snakemake/workflow/rules/ancestral_info_to_vcf.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Snakemake/workflow/rules/ancestral_info_to_vcf.smk b/Snakemake/workflow/rules/ancestral_info_to_vcf.smk index f69bee0..1aeb6c0 100644 --- a/Snakemake/workflow/rules/ancestral_info_to_vcf.smk +++ b/Snakemake/workflow/rules/ancestral_info_to_vcf.smk @@ -44,7 +44,7 @@ else: vcf_file = input_ancestral, idx_file = input_ancestral_idx output: f'{vcfdir}/{{chromosome}}/{{chromosome}}_analysis.vcf' - conda: bcftools + conda: 'bcftools' benchmark: 'benchmarks/{chromosome}.decompress.benchmark.txt' shell: r""" @@ -138,4 +138,4 @@ rule compress_index_vcf: bgzip -c {input} > {output.vcf_file} bcftools index {output.vcf_file} """ -input_ts = rules.compress_index_vcf.output.vcf_file \ No newline at end of file +input_ts = rules.compress_index_vcf.output.vcf_file From a911031ec144115d7edfb9484de099b32488d0f8 Mon Sep 17 00:00:00 2001 From: Gabriela Mafra Fortuna <88657362+gmafrafortuna@users.noreply.github.com> Date: Fri, 26 Jul 2024 09:30:29 +0200 Subject: [PATCH 15/15] Update cluster.json --- Snakemake/workflow/profile/cluster.json | 156 +++++------------------- 1 file changed, 33 insertions(+), 123 deletions(-) diff --git a/Snakemake/workflow/profile/cluster.json b/Snakemake/workflow/profile/cluster.json index 574c29c..2670dad 100644 --- a/Snakemake/workflow/profile/cluster.json +++ b/Snakemake/workflow/profile/cluster.json @@ -1,21 +1,27 @@ { "__default__": { - "mem": "32G", - "threads": "16", + "mem": "8G", + "threads": "1", "time": "10:00:00" }, + "compare_and_filter_vcfs": + { + "mem": "16G", + "threads": "8", + "time": "20:00:00" + }, "merge": { - "mem": "64G", - "threads": "32", + "mem": "16G", + "threads": "8", "time": "20:00:00" }, - "rename": + "remove_missing_indels": { - "mem": "32G", - "threads": "1", - "time": "03:00:00" + "mem": "16G", + "threads": "16", + "time": "05:00:00" }, "phase": { @@ -23,25 +29,25 @@ "threads": "32", "time": "240:00:00" }, - "filter_positions": + 
"get_vcf_positions": { - "mem": "64G", + "mem": "16G", "threads": "32", "time": "05:00:00" }, - "extract_ancestral_chromosome": + "get_ancestral_alleles": { "mem": "32G", "threads": "1", "time": "02:00:00" }, - "join_major_ancestral": + "update_regions_file": { "mem": "32G", "threads": "1", "time": "02:00:00" }, - "determine_ancestral_major": + "add_info_to_vcf": { "mem": "32G", "threads": "1", @@ -53,136 +59,40 @@ "threads": "1", "time": "03:00:00" }, - "sort_ancestral_to_vcf": - { - "mem": "64G", - "threads": "1", - "time": "03:00:00" - }, - "decompress": - { - "mem": "8G", - "threads": "1", - "time": "03:00:00" - }, - "change_infoAA_vcf": - { - "mem": "32G", - "threads": "1", - "time": "05:00:00" - }, "prepare_sample_file": { - "mem": "64G", - "threads": "32", + "mem": "32G", + "threads": "16", "time": "336:00:00" }, "generate_ancestors": { - "mem": "64G", - "threads": "32", + "mem": "32G", + "threads": "16", "time": "336:00:00" }, "truncate_ancestors": { - "mem": "64G", - "threads": "32", + "mem": "32G", + "threads": "16", "time": "336:00:00" }, "ancestors_trees": { - "mem": "64G", - "threads": "32", + "mem": "32G", + "threads": "16", "time": "336:00:00" }, "infer_trees": { - "mem": "64G", - "threads": "32", + "mem": "32G", + "threads": "16", "time": "336:00:00" }, - "extract_aligned_alleles_from_vcf": + "simplify_trees": { - "mem": "16G", - "threads": "1", + "mem": "32G", + "threads": "16", "time": "03:00:00" - }, - "extract_snps_from_info": - { - "mem": "16G", - "threads": "1", - "time": "03:00:00" - }, - "extract_pos_aligned_alleles_from_vcf": - { - "mem": "16G", - "threads": "1", - "time": "03:00:00" - }, - "extract_vcf_alleles_from_aligned": - { - "mem": "16G", - "threads": "1", - "time" : "03:00:00" - }, - "create_estsfs_dicts": - { - "mem": "16G", - "threads": "1", - "time" : "05:00:00" - }, - "edit_estsfs_dicts": - { - "mem": "16G", - "threads": "1", - "time": "01:00:00" - }, - "combine_estsfs_dicts": - { - "mem": "16G", - "threads": "1", - "time": "02:00:00" - }, - "run_estsfs": - { - "mem": "32G", - "threads": "1", - "time": "20:00:00" - }, - "extract_major": - { - "mem": "16G", - "threads": "1", - "time": "02:00:00" - }, - "extract_minor": - { - "mem": "16G", - "threads": "1", - "time": "02:00:00" - }, - "extract_major_outgroup": - { - "mem": "16G", - "threads": "1", - "time": "02:00:00" - }, - "extract_estsfs_prob": - { - "mem": "16G", - "threads": "1", - "time": "02:00:00" - }, - "determine_ancestral": - { - "mem": "16G", - "threads": "1", - "time": "02:00:00" - }, - "move_and_rename_aa": - { - "mem": "16G", - "threads": "1", - "time": "05:00:00" - } + } }