From 561fe7518ebc7f5bdff35c22ff1ad4e58d3dfd74 Mon Sep 17 00:00:00 2001 From: Gabriela Mafra Date: Mon, 20 Nov 2023 10:22:28 +0000 Subject: [PATCH 01/15] updated rule filter (in merge_samples) so it starts from the file with the most samples --- .../workflow/rules/merge_samples_vcf.smk | 25 +++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/Snakemake/workflow/rules/merge_samples_vcf.smk b/Snakemake/workflow/rules/merge_samples_vcf.smk index 40ad916..1fcc7be 100644 --- a/Snakemake/workflow/rules/merge_samples_vcf.smk +++ b/Snakemake/workflow/rules/merge_samples_vcf.smk @@ -25,18 +25,29 @@ if len(allFiles) != 1: expand('/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.{ext}', file=allFiles, ext=['gz', 'gz.csi'])]) params: duplicated = '{chromosome}.ids' - # conda: 'bcftools' resources: cpus=1, mem_mb=64000, time_min=60 run: import os + + for file in input: + shell("wc -l {file} >> files_len.txt") + + shell("sort -s -nr -k 1,1 files_len.txt | cut -d' ' -f 2 > files_reorderd.txt") + + with open('files_reorderd.txt', 'r') as file: + files_reord = [] + for line in file: + line = line.strip() + files_reord.append(line) + filtered = [] - for i in range(len(input)): + for i in range(len(files_reord)): print(i) - for j, file in enumerate(input): - if file != input[i] and j > i: - file1 = input[i] - file2 = input[j] + for j, file in enumerate(files_reord): + if file != files_reord[i] and j > i: + file1 = files_reord[i] + file2 = files_reord[j] print(file1, file2) shell("comm -12 <(sort {file1}) <(sort {file2}) > {params.duplicated}") @@ -57,7 +68,7 @@ if len(allFiles) != 1: shell('bcftools index -f {file2}.filtered.vcf.gz') shell('rm {params.duplicated}') - for file in input: + for file in files_reord: if file not in filtered: file1 = file.split('.')[0] print(f'file {file1} not filtered') From fd7503d3f1f45f01289b23ae7475fe85b7449228 Mon Sep 17 00:00:00 2001 From: Gabriela Mafra Date: Fri, 12 Apr 2024 11:26:54 +0100 Subject: [PATCH 02/15] changed rule all input to be the simplified tree --- Snakemake/workflow/Snakefile | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/Snakemake/workflow/Snakefile b/Snakemake/workflow/Snakefile index 6389682..d22892e 100644 --- a/Snakemake/workflow/Snakefile +++ b/Snakemake/workflow/Snakefile @@ -11,8 +11,8 @@ from os.path import isfile, join ##### setup report ##### -configfile: '../config/ancestral_Eddie.yaml' -configfile: '../config/tsinfer_Eddie.yaml' +#configfile: '../config/ancestral_Eddie.yaml' +configfile: '../config/glob_cattle.yaml' report: "reports/tsinfer.rst" ##### setup singularity ##### @@ -37,22 +37,23 @@ vcfdir=config['vcf_dir'] # from config file, the path to vcf # the final directory name is now editable -- so can be named according to specific project # mapdir indicates where to find the genetic map for phasing (full path in the config) Project = config['PROJECT'] +projdir = config['work_dir'] + '/' + Project mapdir = config['genmap'] # if want to split multiallelic sites set to TRUE: #multiallelic = False -chromosomes = [f'Chr{n}' for n in range(1, config['no_chromosomes'] + 1)] +chromosomes = [f'chr{n}' for n in range(1, config['no_chromosomes'] + 1)] # set a list of 'Chr' + chr# combinations to be used as wildcards ##### load rules ##### -include: "rules/ancestral_inference.smk" +#include: "rules/ancestral_inference.smk" include: "rules/split.smk" +include: "rules/filter.smk" include: "rules/merge_samples_vcf.smk" include: "rules/phasing.smk" -include: 
"rules/prepare_files_for_tsinfer.smk" -#include: "rules/multiallelic.smk" -include: "rules/infer_trees.smk" +include: "rules/ancestral_info_to_vcf.smk" +include: "rules/new_infer_trees.smk" ##### target rules ##### # At the end it should generate a phased vcf file that combines all files for @@ -60,5 +61,5 @@ include: "rules/infer_trees.smk" # configuration file. It also checks that vcfs have been filtered before merged rule all: input: - expand(f'../{Project}/Tsinfer/trees/{{chromosome}}.trees', chromosome=chromosomes) + expand(f'{projdir}/Tsinfer/trees/{{chromosome}}.simplified.trees', chromosome=chromosomes) # tree inference From f8fa59c7404842f92fc3f648cd263a53aa00e454 Mon Sep 17 00:00:00 2001 From: Gabriela Mafra Date: Fri, 12 Apr 2024 11:28:34 +0100 Subject: [PATCH 03/15] no changes --- Snakemake/workflow/rules/split.smk | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/Snakemake/workflow/rules/split.smk b/Snakemake/workflow/rules/split.smk index 5c45b84..139bc49 100644 --- a/Snakemake/workflow/rules/split.smk +++ b/Snakemake/workflow/rules/split.smk @@ -2,9 +2,11 @@ splitFiles = list(set([f.strip("vcf.gz").split("_")[1] for f in os.listdir(f'{vcfdir}/RawVCF') if (f.endswith("vcf.gz") and (f.startswith("Chr") | f.startswith("chr")) )])) + combinedFiles = list(set([f.strip("vcf.gz").split("_")[1] for f in os.listdir(f'{vcfdir}/RawVCF') if (f.endswith("vcf.gz") and f.startswith("Combined"))])) + allFiles = splitFiles + combinedFiles if len(splitFiles) > 0: @@ -21,13 +23,8 @@ if len(splitFiles) > 0: log: 'logs/Move_vcf_{chromosome}_{suffixOne}.log' shell: """ - if [ -h {input.vcf} ]; then - ln -s $( realpath {input.vcf} ) {output.vcf} - ln -s $( realpath {input.vcf} ).csi {output.vcf} - else - ln -s {input.vcf} {output.vcf} - ln -s {input.idx} {output.idx} - fi + ln -s {input.vcf} {output.vcf} + ln -s {input.idx} {output.idx} """ if len(combinedFiles) > 0: From cfa7c01f0b9eaa10e32dc4bc894160ae3056051d Mon Sep 17 00:00:00 2001 From: Gabriela Mafra Date: Fri, 12 Apr 2024 11:29:41 +0100 Subject: [PATCH 04/15] now filtering overlapping samples from vcfs in one step --- Snakemake/workflow/rules/filter.smk | 186 ++++++++++++++++++++++++++++ 1 file changed, 186 insertions(+) create mode 100644 Snakemake/workflow/rules/filter.smk diff --git a/Snakemake/workflow/rules/filter.smk b/Snakemake/workflow/rules/filter.smk new file mode 100644 index 0000000..99ca98f --- /dev/null +++ b/Snakemake/workflow/rules/filter.smk @@ -0,0 +1,186 @@ +if len(allFiles) != 1: + rule compare_and_filter_vcfs: + input: + [f'{vcfdir}' + x + for x in expand('/{{chromosome}}/{{chromosome}}_{file}.vcf.gz', + file=allFiles)] + output: + [f'{vcfdir}' + x + for x in expand('/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.gz', + file=allFiles)] + params: + file_names = allFiles, + path = f'{vcfdir}/{{chromosome}}/{{chromosome}}' + run: + import os + import itertools + import subprocess + from glob import glob + + path_names = [f'{params.path}/{name}' for name in params.file_names] + pairs = list(itertools.combinations(path_names, 2)) + + for i, pair in enumerate(pairs): + print(f'pair {i}') + # for each pair determine the extension of the vcf files + _0, ext0 = [os.path.splitext(file) for file in glob(f'{pair[0]}*') + if os.path.splitext(file)[1] not in ['.csi', '.txt']][0] + + _1, ext1 = [os.path.splitext(file) for file in glob(f'{pair[1]}*') + if os.path.splitext(file)[1] not in ['.csi', '.txt']][0] + + # Run vcftools to find if there are overlapping samples between pairs + command = f""" + 
module load anaconda + conda activate bcftools + vcftools --gzvcf {os.path.join(pair[0] + ext0)} \ + --gzdiff {os.path.join(pair[1] + ext1)} \ + --diff-indv + """ + subprocess.run(command, shell=True, check=True, cwd=path) + + # if vcftools find any overlapping samples, remove them from the second vcf file + if os.path.exists(os.path.join(path, 'out.diff.indv_in_files')): + print(f'Removing samples from {pair[1]} and renaming {pair[0]}') + command = f""" + grep B out.diff.indv_in_files | cut -f1 > samples.remove + rm out.diff.indv_in_files + """ + subprocess.run(command, shell=True, check=True, cwd=path) + command = f""" + module load anaconda + conda activate bcftools + bcftools view -S ^samples.remove {os.path.join(pair[1] + '.vcf' + ext1)} \ + -Oz -o {os.path.join(pair[1] + new_ext)} + rm samples.remove + """ + subprocess.run(command, shell=True, check=True, cwd=path) + + # if there are no overlapping samples, + # just rename the files that still have the original extension + else: + print('No overlapping samples') + if 'filtered' not in _0: + print('Rename') + command = f""" + mv {os.path.join(pair[0] + '.vcf' + ext0)} {os.path.join(pair[0] + new_ext)} + """ + subprocess.run(command, shell=True, check=True, cwd=path) + if 'filtered' not in _1: + print('Rename') + command = f""" + mv {os.path.join(pair[1] + '.vcf' + ext1)} {os.path.join(pair[1] + new_ext)} + """ + subprocess.run(command, shell=True, check=True, cwd=path) + + # rule get_samples: + # input: + # f'{vcfdir}/{{chromosome}}/{{chromosome}}_{{file}}.vcf.gz' + # output: temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_{{file}}.txt') + # conda: "bcftools" + # threads: 1 + # resources: cpus=1, mem_mb=32000, time_min=30 + # log: 'logs/Get_samples_{chromosome}_{file}.log' + # shell: + # """ + # bcftools query -l {input} > {output} + # """ + + # rule filter: + # input: + # [f'{vcfdir}' + x + # for x in expand('/{{chromosome}}/{{chromosome}}_{file}.txt', + # file=allFiles)], + # output: + # temp([f'{vcfdir}' + x for x in + # expand('/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.{ext}', + # file=allFiles, ext=['gz', 'gz.csi'])]) + # params: + # duplicated = '{chromosome}.ids', + # files_len = '{chromosome}_len.txt', + # files_reo = '{chromosome}_reorderd.txt', + # resources: cpus=1, mem_mb=64000, time_min=60 + # run: + # import os + + # for file in input: + # shell("wc -l {file} >> {params.files_len}") + + # shell("sort -us -nr {params.files_len} | cut -d' ' -f 2 > {params.files_reo}") + + # with open(params.files_reo, 'r') as file: + # files_reord = [] + # for line in file: + # line = line.strip() + # files_reord.append(line) + + # filtered = [] + + # for i in range(len(files_reord)): + # print(i) + # for j, file in enumerate(files_reord): + # if file != files_reord[i] and j > i: + # file1 = files_reord[i] + # file2 = files_reord[j] + # print(file1, file2) + + # shell("comm -12 <(sort {file1}) <(sort {file2}) > {params.duplicated}") + + # remove = params.duplicated + # lremove = os.stat(f'{remove}').st_size + + # if lremove == 0: + # print(f'Nothing to remove {lremove}') + # else: + # print(f'{lremove} samples to remove') + # if file2 not in filtered: + # filtered.append(file2) + + # file2 = file2.split('.')[0] + # print(f'file {file2} filtered') + # shell('bcftools view -S ^{params.duplicated} {file2}.vcf.gz -O z -o {file2}.filtered.vcf.gz') + # #shell('gatk SelectVariants -R {params.refgen} -V {file2} -xl-sn -O {file2}.filtered.vcf.gz') + # shell('bcftools index -f {file2}.filtered.vcf.gz') + # shell('rm {params.duplicated} 
{file2}.vcf.gz {file2}.vcf.gz.csi') + + # for file in files_reord: + # if file not in filtered: + # file1 = file.split('.')[0] # file does not have extensions + # print(f'file {file1} renamed') + # shell('mv {file1}.vcf.gz {file1}.filtered.vcf.gz') + # shell('mv {file1}.vcf.gz.csi {file1}.filtered.vcf.gz.csi') + # #shell('scripts/create_symlinks.sh {file1}') + # shell('rm {params}') + +else: + rule rename: + input: + vcf = [[f'{vcfdir}' + x + for x in expand('/{{chromosome}}/{{chromosome}}_{suffixOne}.vcf.gz', + suffixOne = splitFiles)] if len(splitFiles) != 0 else [], + [f'{vcfdir}' + x + for x in expand('/{{chromosome}}/{{chromosome}}_{suffixTwo}.vcf.gz', + suffixTwo = combinedFiles)] if len(combinedFiles) != 0 else []], + idx = [[f'{vcfdir}' + x + for x in expand('/{{chromosome}}/{{chromosome}}_{suffixOne}.vcf.gz.csi', + suffixOne = splitFiles)] if len(splitFiles) != 0 else [], + [f'{vcfdir}' + x + for x in expand('/{{chromosome}}/{{chromosome}}_{suffixTwo}.vcf.gz.csi', + suffixTwo = combinedFiles)] if len(combinedFiles) != 0 else []], + output: + vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_{{file}}.filtered.vcf.gz', + idx = f'{vcfdir}/{{chromosome}}/{{chromosome}}_{{file}}.filtered.vcf.gz.csi' + conda: "bcftools" + log: 'logs/Rename_{chromosome}.log' + resources: cpus=1, mem_mb=32000, time_min=30 + shell: + """ + if [ -h {input} ]; then + ln -s $( realpath {input.vcf} ) {output.vcf} + ln -s $( realpath {input.idx} ) {output.idx} + else + ln -s {input.vcf} {output.vcf} + ln -s {input.idx} {output.idx} + fi + """ + From 4c318c32178aa5e7204d5a16379de9ece902207c Mon Sep 17 00:00:00 2001 From: Gabriela Mafra Date: Fri, 12 Apr 2024 11:30:40 +0100 Subject: [PATCH 05/15] simplified rule --- .../workflow/rules/merge_samples_vcf.smk | 169 +++++++----------- 1 file changed, 61 insertions(+), 108 deletions(-) diff --git a/Snakemake/workflow/rules/merge_samples_vcf.smk b/Snakemake/workflow/rules/merge_samples_vcf.smk index 1fcc7be..86cbc2c 100644 --- a/Snakemake/workflow/rules/merge_samples_vcf.smk +++ b/Snakemake/workflow/rules/merge_samples_vcf.smk @@ -1,119 +1,72 @@ -import yaml - - if len(allFiles) != 1: - rule get_samples: - input: - f'{vcfdir}/{{chromosome}}/{{chromosome}}_{{file}}.vcf.gz' - output: temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_{{file}}.txt') - conda: "bcftools" - threads: 1 - resources: cpus=1, mem_mb=32000, time_min=30 - log: 'logs/Get_samples_{chromosome}_{file}.log' - shell: - """ - bcftools query -l {input} > {output} - """ - - rule filter: - input: - [f'{vcfdir}' + x - for x in expand('/{{chromosome}}/{{chromosome}}_{file}.txt', - file=allFiles)], - output: - temp([f'{vcfdir}' + x for x in - expand('/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.{ext}', file=allFiles, ext=['gz', 'gz.csi'])]) - params: - duplicated = '{chromosome}.ids' - resources: cpus=1, mem_mb=64000, time_min=60 - run: - import os - - for file in input: - shell("wc -l {file} >> files_len.txt") - - shell("sort -s -nr -k 1,1 files_len.txt | cut -d' ' -f 2 > files_reorderd.txt") - - with open('files_reorderd.txt', 'r') as file: - files_reord = [] - for line in file: - line = line.strip() - files_reord.append(line) - - filtered = [] - - for i in range(len(files_reord)): - print(i) - for j, file in enumerate(files_reord): - if file != files_reord[i] and j > i: - file1 = files_reord[i] - file2 = files_reord[j] - print(file1, file2) - - shell("comm -12 <(sort {file1}) <(sort {file2}) > {params.duplicated}") - - remove = params.duplicated - lremove = os.stat(f'{remove}').st_size - - if lremove == 0: - 
print(f'Nothing to remove {lremove}') - else: - print(f'{lremove} samples to remove') - if file2 not in filtered: - filtered.append(file2) - - file2 = file2.split('.')[0] - print(f'file {file2} filtered') - shell('bcftools view -S ^{params.duplicated} {file2}.vcf.gz -O z -o {file2}.filtered.vcf.gz') - shell('bcftools index -f {file2}.filtered.vcf.gz') - shell('rm {params.duplicated}') - - for file in files_reord: - if file not in filtered: - file1 = file.split('.')[0] - print(f'file {file1} not filtered') - shell("cp {file1}.vcf.gz {file1}.filtered.vcf.gz && bcftools index -f {file1}.filtered.vcf.gz") - shell('rm {params.duplicated}') - rule merge: input: - [f'{vcfdir}' + x for x in expand('/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.gz', file=allFiles)] + vcf = [f'{vcfdir}' + x for x in expand('/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.gz', file=allFiles)], + idx = [f'{vcfdir}' + x for x in expand('/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.gz.csi', file=allFiles)], output: - vcf=temp(f'{vcfdir}/{{chromosome}}_final.vcf.gz'), - index=temp(f'{vcfdir}/{{chromosome}}_final.vcf.gz.csi') + vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.vcf.gz', + params: + files_path = f'{vcfdir}/{{chromosome}}/' conda: "bcftools" - threads: 1 - resources: cpus=1, mem_mb=32000, time_min=60 - log: 'logs/Merge_{chromosome}.log' + threads: 32 + resources: cpus=32, mem_mb=2048000, time_min=1200 + benchmark: 'benchmarks/{chromosome}.merge.benchmark.txt' shell: - """ - bcftools merge {input} -O z -o {output.vcf} - bcftools index -f {output.vcf} + r""" + printf '%s\n' {params.files_path}*.gz > files.merge + + bcftools merge \ + -m snps \ + -l files.merge \ + --threads {threads} -O z -o {output.vcf} """ -else: - rule rename: - input: - [f'{vcfdir}' + x - for x in expand('/{{chromosome}}/{{chromosome}}_{suffixOne}.vcf.gz', - suffixOne = splitFiles)] if len(splitFiles) != 0 else [], - [f'{vcfdir}' + x - for x in expand('/{{chromosome}}/{{chromosome}}_{suffixTwo}.vcf.gz', - suffixTwo = combinedFiles)] if len(combinedFiles) != 0 else [] - output: - vcf = (f'{vcfdir}/{{chromosome}}_final.vcf.gz'), - idx = (f'{vcfdir}/{{chromosome}}_final.vcf.gz.csi') - conda: "bcftools" - log: 'logs/Rename_{chromosome}.log' - resources: cpus=1, mem_mb=32000, time_min=30 + rule index_merged: + input: f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.vcf.gz', + output: f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.vcf.gz.csi' + conda: 'bcftools' shell: + r""" + bcftools index -f {input} """ - if [ -h {input} ]; then - ln -s $( realpath {input} ) {output.vcf} - ln -s $( realpath {input} ).csi {output.vcf} - else - ln -s {input} {output.vcf} - ln -s {input}.csi {output.idx} - fi - """ +rule remove_missing_indels: + input: + vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.vcf.gz', + index = f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.vcf.gz.csi' + output: + vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.recode.vcf.gz', + index = f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.recode.vcf.gz.csi' + params: + prefix = f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged' + conda: 'bcftools' + shell: + r""" + vcftools --gzvcf {input.vcf} \ + --remove-indels \ + --max-missing 0.1 \ + --remove-filtered-all \ + --recode \ + --out {params.prefix} + + bgzip {params.prefix}.recode.vcf + bcftools index -f {output.vcf} + + rm {params.prefix}.recode.vcf + """ + +# rule summary: +# input: +# vcf_file = rules.filter_merged_vcf.output.vcf, +# index = rules.filter_merged_vcf.output.index, +# output: +# summary = 
f'{vcfdir}/{{chromosome}}/{{chromosome}}_summary.vchk' +# params: +# plots = f'{vcfdir}/{{chromosome}}/{{chromosome}}_summary/' +# conda: 'bcftools' +# benchmark: 'benchmarks/{chromosome}.summary.benchmark.txt' +# shell: +# r""" +# bcftools stats {input.vcf_file} > {output.summary} +# plot-vcfstats -p {params.plots} {output.summary} +# """ \ No newline at end of file From 4389bfb70660d1327be9747a0e9b1f0470d1d523 Mon Sep 17 00:00:00 2001 From: Gabriela Mafra Date: Fri, 12 Apr 2024 11:31:36 +0100 Subject: [PATCH 06/15] changed options after testing to improve accuracy --- Snakemake/workflow/rules/phasing.smk | 71 +++++++++++++++------------- 1 file changed, 38 insertions(+), 33 deletions(-) diff --git a/Snakemake/workflow/rules/phasing.smk b/Snakemake/workflow/rules/phasing.smk index 67e9f2f..53e59db 100644 --- a/Snakemake/workflow/rules/phasing.smk +++ b/Snakemake/workflow/rules/phasing.smk @@ -1,24 +1,17 @@ -# def getInputVcfFile_phasing(wildcards): -# if os.path.isfile(vcfdir + "/" + wildcards.chromosome + '_final.vcf.gz'): -# print("Final file found ") -# vcf_file = vcfdir + "/" + wildcards.chromosome + '_final.vcf.gz' -# print(vcf_file) -# else: -# print("No final file ") -# file = open(vcfdir + '/Vcf_file_' + wildcards.chromosome + '.txt') -# vcf_file = file.read().strip("\n") -# -# return(vcf_file) +file_to_phase = rules.remove_missing_indels.output.vcf +file_to_phase_idx = rules.remove_missing_indels.output.index +phased_output = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz' +phased_output_idx = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz.csi' if config['ploidy'] == 1: rule rename_phased: input: - vcf = f'{vcfdir}/{{chromosome}}_final.vcf.gz', - idx = f'{vcfdir}/{{chromosome}}_final.vcf.gz.csi' + vcf = file_to_phase, + idx = file_to_phase_idx, output: - vcf = f'{vcfdir}/{{chromosome}}_phased.vcf.gz', - idx = f'{vcfdir}/{{chromosome}}_phased.vcf.gz.csi' + vcf = temp(phased_output), + idx = temp(phased_output_idx) log: 'logs/Rename_phased_{chromosome}.log' resources: cpus=1, mem_mb=32000, time_min=60 shell: @@ -31,32 +24,44 @@ if config['ploidy'] == 1: ln -s {input.idx} {output.idx} fi """ - else: rule phase: input: - vcf = f'{vcfdir}/{{chromosome}}_final.vcf.gz', + vcf = file_to_phase, + idx = file_to_phase_idx, output: - file = f'{vcfdir}/{{chromosome}}_phased.vcf.gz', - idx = f'{vcfdir}/{{chromosome}}_phased.vcf.gz.csi' + vcf = phased_output, params: map = f'{mapdir}/{{chromosome}}.txt', - #log: 'logs/phase_{chromosome}.log' - threads: 25 -# resources: cpus=20, mem_mb=25000, time_min=5 + threads: 32 + resources: cpus = 32, mem_mb = 2048, time_min = 7200 conda: 'shapeit4am' - log: 'logs/Phase_{chromosome}.log' + benchmark: + 'benchmarks/{chromosome}.shapeit.benchmark.txt' shell: - """ + r""" str='{wildcards.chromosome}' chr=$(echo ${{str:3}}) - start=`date +%s` + shapeit4 --input {input.vcf} \ - --map {params.map} \ - --region ${{chr}} \ - --output {output} \ - --thread {threads} - end=`date +%s` - echo Execution time was `expr $end - $start` seconds > shapeit_{wildcards.chromosome}.time - bcftools index -f {output} - """ + --map {params.map} \ + --region $chr \ + --output {output.vcf} \ + --thread {threads} \ + --pbwt-depth 8 \ + --sequencing \ + --log {wildcards.chromosome}_phased.log + """ + + rule index_phased: + input: + rules.phase.output.vcf + output: + idx = phased_output_idx + conda: 'bcftools' + benchmark: + 'benchmarks/{chromosome}.bcftools_index.bechmark.txt' + shell: + r""" + bcftools index -f {input} + """ \ No newline at end of file From 
107192f77f0439ba1acd561c4f13482f8fe95584 Mon Sep 17 00:00:00 2001 From: Gabriela Mafra Date: Fri, 12 Apr 2024 11:32:49 +0100 Subject: [PATCH 07/15] updated script to add aa to vcf keeping all sites --- .../workflow/rules/ancestral_info_to_vcf.smk | 128 ++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 Snakemake/workflow/rules/ancestral_info_to_vcf.smk diff --git a/Snakemake/workflow/rules/ancestral_info_to_vcf.smk b/Snakemake/workflow/rules/ancestral_info_to_vcf.smk new file mode 100644 index 0000000..f8870f0 --- /dev/null +++ b/Snakemake/workflow/rules/ancestral_info_to_vcf.smk @@ -0,0 +1,128 @@ + +# extract all positions in vcf file using bcftools and save to regions file +# for rows in ancestral_alleles file where col 1 match wildcard.chromosome +# for each row in regions file, if value is in ancestral_alleles col 2, add ancestral_alleles col 3 to regions file col 2 +# else add -1 + +ancestral_file = config['ancestral_allele'] + +if "global_cattle" in config['PROJECT']: + # rule list_samples: + # input: + # vcf_file = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz', + # metadata = config['meta'] + # output: + # samples = f'{vcfdir}/{{chromosome}}/samples_to_remove' + # conda: 'bcftools' + # shell: + # r""" + # bcftools query -l {input.vcf_file} > {wildcards.chromosome}.samples + # cut -d',' -f1 {input.metadata} | tail -n +2 > metadata.samples + # grep -v -f metadata.samples vcf.samples > unique.samples + # """ + + rule remove_samples: + input: + vcf_file = rules.phase.output.vcf, + idx_file = rules.index_phased.output.idx, + output: + vcf_file = f'{vcfdir}/{{chromosome}}/{{chromosome}}_correctids.vcf', + params: + samples_to_remove = f'{vcfdir}/samples.remove' + conda: 'bcftools' + threads: 32 + resources: cpus=32, mem_mb=2048000, time_min=1200 + shell: + r""" + bcftools view -S ^{params.samples_to_remove} {input.vcf_file} --threads {threads} -O v -o {output.vcf_file} + """ + input_vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_correctids.vcf' + +else: + rule decompress: + input: rules.phase.output.vcf + output: f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf' + conda: bcftools + shell: + r""" + bcftools view {input} -O v -o {output} + """ + input_vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf' + +rule get_vcf_positions: + input: + vcf_file = input_vcf, + output: + regions_file = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.regions') + conda: 'bcftools' + shell: + r""" + bcftools query -f '%POS\n' {input.vcf_file} | awk '{{print $1, -1}}' > {output.regions_file} + """ + +rule get_ancestral_alleles: + input: + ancestral_alleles_file = ancestral_file + output: + ancestrals = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.anc') + params: + chrnum = lambda wc: wc.get('chromosome')[3:] + shell: + r""" + # filter positions in ancestral that correspond to chr + awk '$1 == {params.chrnum}' {input.ancestral_alleles_file} | cut -d' ' -f2,3 > {output.ancestrals} + """ + +rule update_regions_file: + input: + regions_file = rules.get_vcf_positions.output.regions_file, + ancestrals = rules.get_ancestral_alleles.output.ancestrals, + output: + updated_regions_file = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.updated.regions'), + positions_exclude = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.exclude') + shell: + r""" + awk 'NR==FNR{{a[$1]=$2;next}} ($1 in a){{$2=a[$1]}}1' {input.ancestrals} {input.regions_file} > {output.updated_regions_file} + + awk '$2 == -1' {output.updated_regions_file} | cut -d' ' -f1 > {output.positions_exclude} + """ + +rule 
add_AA_to_regions_file:
+    input:
+        updated_regions_file = rules.update_regions_file.output.updated_regions_file,
+    output:
+        new_regions_file = f'{vcfdir}/{{chromosome}}/{{chromosome}}.final.regions'
+    shell:
+        r"""
+        echo "INFO" > {output.new_regions_file}
+        awk '{{print "AA="$2}}' {input.updated_regions_file} >> {output.new_regions_file}
+        """
+
+rule add_info_to_vcf:
+    input:
+        vcf_file = f'{vcfdir}/{{chromosome}}/{{chromosome}}_correctids.vcf',
+        aa_info = rules.add_AA_to_regions_file.output.new_regions_file,
+    output:
+        large_vcf = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased_ancestral.vcf')
+    conda: 'bcftools'
+    shell:
+        r"""
+        awk 'NR==FNR{{a[NR] = $1; next}} FNR<15{{print}}; \
+        FNR==6{{printf "##INFO=<ID=AA,Number=1,Type=String,Description=\"Ancestral Allele\">\n"}}; \
+        FNR>14{{$8=a[FNR-14]; print}}' OFS="\t" {input.aa_info} {input.vcf_file} > {output.large_vcf}
+
+        rm {input.vcf_file}
+        """
+
+rule compress_index_vcf:
+    input:
+        large_vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased_ancestral.vcf'
+    output:
+        vcf_file = f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf.gz',
+        vcf_indx = f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf.gz.csi'
+    conda: 'bcftools'
+    shell:
+        r"""
+        bgzip -c {input} > {output.vcf_file}
+        bcftools index {output.vcf_file}
+        """

From cdc238c5f140e2b4179124729df12a1b155e25fe Mon Sep 17 00:00:00 2001
From: Gabriela Mafra
Date: Fri, 12 Apr 2024 11:35:57 +0100
Subject: [PATCH 08/15] updated rules and scripts for ts inference. final
 output is now simplified tree

---
 Snakemake/workflow/rules/new_infer_trees.smk | 145 ++++++++++++++++++
 .../workflow/scripts/generate_ancestors.py |  19 +++
 .../workflow/scripts/generate_samples.py | 120 +++++++++++++++
 Snakemake/workflow/scripts/truncate.py |  18 +++
 4 files changed, 302 insertions(+)
 create mode 100644 Snakemake/workflow/rules/new_infer_trees.smk
 create mode 100644 Snakemake/workflow/scripts/generate_ancestors.py
 create mode 100644 Snakemake/workflow/scripts/generate_samples.py
 create mode 100644 Snakemake/workflow/scripts/truncate.py

diff --git a/Snakemake/workflow/rules/new_infer_trees.smk b/Snakemake/workflow/rules/new_infer_trees.smk
new file mode 100644
index 0000000..930c3ca
--- /dev/null
+++ b/Snakemake/workflow/rules/new_infer_trees.smk
@@ -0,0 +1,145 @@
+# Convert vcf to samples file
+# generate ancestors samples based only on sites with ancestral allele
+# information
+# truncate ancestors to remove those spanning too many sites
+# generate ancestors trees
+# match ancestors and samples to infer trees
+# final output is a simplified tree sequence
+
+rule prepare_sample_file:
+    input:
+        vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf.gz',
+        meta = config['meta']
+    output: f'{projdir}/Tsinfer/samples/{{chromosome}}.samples'
+    params:
+        chrLength = lambda wildcards: config['chromosome_length'][int(re.findall(r'\d+', wildcards.chromosome)[0])]
+    conda: "HLab_tsinfer"
+    threads: 32
+    resources: cpus = 32, mem_mb = 768000, time_min = 200
+    benchmark:
+        'benchmarks/{chromosome}_new.prep_sample.benchmark.txt'
+    shell:
+        r"""
+        ulimit -v
+
+        python scripts/generate_samples.py \
+        {input.vcf} {input.meta} {output} {params.chrLength}
+
+        """
+
+rule generate_ancestors:
+    input:
+        samples = f'{projdir}/Tsinfer/samples/{{chromosome}}.samples',
+        sites_to_exclude = f'{vcfdir}/{{chromosome}}/{{chromosome}}.exclude',
+    output:
+        ancestors_samples = f'{projdir}/Tsinfer/samples/{{chromosome}}.ancestors',
+    conda: 'HLab_tsinfer'
+    threads: 64
+    resources: cpus = 64, mem_mb = 2048000, time_min = 1680
+    benchmark:
+        
'benchmarks/{chromosome}_generate_ancestors.benchmark.txt' + shell: + r""" + # Check amount of memory (in kbytes) as seen by the job + ulimit -v + python scripts/generate_ancestors.py {input.samples} {input.sites_to_exclude} {output.ancestors_samples} {threads} + """ +# tsinfer generate-ancestors {input} \ +# --num-threads {threads} \ +# --num-flush-threads {threads} \ +# --progress \ +# --verbosity \ +# --log-section tsinfer.inference + + +rule truncate_ancestors: + input: + ancestors_samples = f'{projdir}/Tsinfer/samples/{{chromosome}}.ancestors', + output: + truncated_ancestors = f'{projdir}/Tsinfer/samples/{{chromosome}}.ancestors.truncated', + # params: + # upper = 0.4, + # lower = 0.2, + conda: 'HLab_tsinfer' + threads: 32 + resources: cpus=32, mem_mb=2048000, time_min=1680 + benchmark: + 'benchmarks/{chromosome}_truncate.benchmark.txt' + shell: + r""" + ulimit -v + python scripts/truncate.py {input} {output} + """ + +rule ancestors_trees: + input: + samples = f'{projdir}/Tsinfer/samples/{{chromosome}}.samples', + truncated_ancestors = f'{projdir}/Tsinfer/samples/{{chromosome}}.ancestors.truncated', + output: + ancestors_trees = f'{projdir}/Tsinfer/trees/{{chromosome}}.atrees', + params: + re = '1.1e-8', + mima = '1' + conda: 'HLab_tsinfer' + threads: 32 + resources: cpus = 32, mem_mb = 2048000, time_min = 1680 + benchmark: + 'benchmarks/{chromosome}_ancestors_trees.benchmark.txt' + shell: + r""" + tsinfer match-ancestors {input.samples} \ + -A {output} \ + --log-section tsinfer.inference \ + --num-threads {threads} \ + --progress \ + --verbosity \ + --ancestors {input.truncated_ancestors} + """ +# --recombination-rate {params.re} \ + # --mismatch-ratio {params.mima} \ + + +rule infer_trees: + input: + samples = f'{projdir}/Tsinfer/samples/{{chromosome}}.samples', + ancestors_trees = f'{projdir}/Tsinfer/trees/{{chromosome}}.atrees', + output: + trees = f'{projdir}/Tsinfer/trees/{{chromosome}}.trees', + params: + re = '1.1e-8', + mima = '1' + conda: 'HLab_tsinfer' + threads: 32 + resources: cpus=32, mem_mb=2048000, time_min=1680 + benchmark: + 'benchmarks/{chromosome}_infer.benchmark.txt' + shell: + r""" + # Check amount of memory (in kbytes) as seen by the job + ulimit -v + tsinfer match-samples {input.samples} \ + -A {input.ancestors_trees} \ + --num-threads {threads} \ + --progress \ + --log-section tsinfer.inference \ + --verbosity \ + -O {output} + """ +# --recombination-rate {params.re} \ +# --mismatch-ratio {params.mima} \ + +rule simplify_trees: + input: + trees = f'{projdir}/Tsinfer/trees/{{chromosome}}.trees', + output: + simplified_trees = f'{projdir}/Tsinfer/trees/{{chromosome}}.simplified.trees', + conda: 'HLab_tsinfer' + threads: 32 + resources: cpus=32, mem_mb=2048000, time_min=1680 + benchmark: + 'benchmarks/{chromosome}_simplify.benchmark.txt' + shell: + r""" + ulimit -v + python scripts/simplify.py {input} {output} + """ \ No newline at end of file diff --git a/Snakemake/workflow/scripts/generate_ancestors.py b/Snakemake/workflow/scripts/generate_ancestors.py new file mode 100644 index 0000000..1e4ce05 --- /dev/null +++ b/Snakemake/workflow/scripts/generate_ancestors.py @@ -0,0 +1,19 @@ +import sys +import tsinfer +import numpy as np + +args = sys.argv + +samples = tsinfer.load(args[1]) +sites_to_exclude = np.loadtxt(args[2]) +output = args[3] +threads = int(args[4]) + + +ancestors = tsinfer.generate_ancestors( + sample_data=samples, + path=output, + exclude_positions=sites_to_exclude, + progress_monitor=True, + num_threads=threads, +) \ No newline at end of file 
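# Usage sketch for generate_ancestors.py (illustrative file names; the
# generate_ancestors rule above passes the samples file, the exclude list,
# the output path and the thread count in exactly this order):
#
#   python scripts/generate_ancestors.py chr1.samples chr1.exclude chr1.ancestors 64
#
# np.loadtxt() reads the exclude list as a float array; tsinfer compares these
# values against its floating-point site positions, so a one-column list of
# POS values (as written by rule update_regions_file) is sufficient.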
diff --git a/Snakemake/workflow/scripts/generate_samples.py b/Snakemake/workflow/scripts/generate_samples.py
new file mode 100644
index 0000000..9db40a9
--- /dev/null
+++ b/Snakemake/workflow/scripts/generate_samples.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+import os
+import sys
+import json
+import cyvcf2
+import tsinfer
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+
+
+def add_populations(vcf, samples, metaData):
+    """
+    This is a modified add_populations function (originally written for drone data from David Wragg).
+    Add tsinfer Population objects for each subspecies, stored in the metaPop object, and return a list of population IDs corresponding to the VCF samples.
+    Input: vcf = cyvcf2 VCF object, samples is a tsinfer SampleData and metaData is a pandas dataframe with ids in the ID column and populations in the Pop/SubPop columns
+    pop_lookup = dictionary holding the name and additional information of each population
+    Return: A list of population indexes
+    """
+    pop_lookup = {}
+    metaPop = metaData.iloc[:, 1:].drop_duplicates().dropna(axis = 0, how = 'all')
+
+    # to account for the same subpop in different countries - creating a 'fake subpop' to be used as key
+    # if a subpop is repeated, the new code is subpop+origin, else subpop
+    d = [a if not (sum(j == a for j in metaPop.SubPop[:i])) else f'{a}{c}' for (i,a),c in zip(enumerate(metaPop.SubPop),metaPop.Origin)]
+    metaPop['code'] = d
+    # add the code column to the full metadata file
+    metaData = pd.merge(metaData, metaPop[['Pop', 'SubPop', 'Origin', 'code']], on = ['Pop', 'SubPop', 'Origin'], how = 'inner')
+
+    # list population per sample based on the recoded subpop
+    sample_subpop = [list(metaData.code[metaData.ID == x]) for x in vcf.samples]
+
+    # the dictionary key is based on the recoding - so it keeps the correct number of populations,
+    # but the metadata takes in the real subpop value
+    for subpop, pop, origin, lat, lon, code in zip(metaPop.SubPop, metaPop.Pop, metaPop.Origin, metaPop.Lat, metaPop.Lon, metaPop.code):
+        pop_lookup[code] = samples.add_population(metadata={"pop": pop, "subpop": subpop, "origin": origin, "coord1": lat, "coord2":lon})
+    return [pop_lookup[code[0]] for code in sample_subpop]
+
+def add_diploid_individuals(vcf, samples, populations):
+    for iid, population in zip(vcf.samples, populations):
+        samples.add_individual(ploidy=2, metadata={'ID': iid}, population=population)
+
+
+def add_diploid_sites(vcf, samples):
+    """
+    Read the sites in the vcf and add them to the samples object.
+    """
+    # You may want to change the following line, e.g. here we allow
+    # "*" (a spanning deletion) to be a valid allele state
+    allele_chars = set("ATGCatgc*")
+
+    pos = 0
+
+    progressbar = tqdm(total=samples.sequence_length, desc="Read VCF", unit='bp')
+
+    for variant in vcf:  # Loop over variants, each assumed at a unique site
+        progressbar.update(variant.POS - pos)
+
+        if pos == variant.POS:
+            print(f"Duplicate entries at position {pos}, ignoring all but the first")
+            continue
+        else:
+            pos = variant.POS
+
+        if any([not phased for _, _, phased in variant.genotypes]):
+            raise ValueError("Unphased genotypes for variant at position", pos)
+
+        alleles = [variant.REF.upper()] + [v.upper() for v in variant.ALT]
+
+        ancestral = variant.INFO.get("AA", ".") # "." means unknown
+
+        # '-1' (missing ancestral data) must be tested first: '-1' is never in alleles
+        if ancestral == '-1':
+            ancestral_allele = -1
+        elif ancestral == 'ambiguous' or ancestral not in alleles: # set ancestral == reference
+            ancestral_allele = 0
+        else:
+            ancestral_allele = alleles.index(ancestral)
+
+        # Check we have only ATGC* alleles; if not, skip the whole site
+        if any(len(set(a) - allele_chars) > 0 for a in alleles):
+            print(f"Ignoring site at pos {pos}: alleles {alleles} not in {allele_chars}")
+            continue
+
+        # Map original allele indexes to their indexes in the new alleles list.
+        genotypes = [g for row in variant.genotypes for g in row[0:2]]
+        samples.add_site(pos, genotypes, alleles, ancestral_allele=ancestral_allele)
+
+# -----------------------------------------------------------------------------------------
+
+args = sys.argv
+vcfFile = args[1]
+meta = args[2]
+sampleFile = args[3]
+chrLength = int(args[4]) # sequence length must be numeric, not the argv string
+
+metaFile = pd.read_csv(meta)
+
+# Create a population (subspecies) list for the samples in the VCF
+vcfD = cyvcf2.VCF(vcfFile, strict_gt=True)
+
+# Create the SampleData file for diploid data
+with tsinfer.SampleData(path=sampleFile,
+                        sequence_length=chrLength,
+                        num_flush_threads=16, max_file_size=2**30) as samples:
+    populations = add_populations(vcf = vcfD, samples = samples, metaData = metaFile)
+    print("populations determined")
+    add_diploid_individuals(vcf = vcfD, samples = samples, populations = populations)
+    print("individuals added")
+    add_diploid_sites(vcf = vcfD, samples = samples)
+    print("sites added")
+
+print(
+    "Sample file created for {} samples ".format(samples.num_samples)
+    + "({} individuals) ".format(samples.num_individuals)
+    + "with {} variable sites.".format(samples.num_sites),
+    flush=True,
+)
diff --git a/Snakemake/workflow/scripts/truncate.py b/Snakemake/workflow/scripts/truncate.py
new file mode 100644
index 0000000..17037bc
--- /dev/null
+++ b/Snakemake/workflow/scripts/truncate.py
@@ -0,0 +1,18 @@
+import sys
+import tsinfer
+
+args = sys.argv
+ancestors = tsinfer.load(args[1])
+#uprtime = args[2]
+#lwertime = args[3]
+output = args[2]
+
+ancestors.truncate_ancestors(
+    path=output,
+    max_file_size=2**38,
+    lower_time_bound=0.2,
+    upper_time_bound=0.4,
+    length_multiplier=2,
+)
+
+
From 4b3b9141e4accdf8b9806212b38ba9021cbfb2a9 Mon Sep 17 00:00:00 2001
From: Gabriela Mafra
Date: Fri, 12 Apr 2024 11:37:15 +0100
Subject: [PATCH 09/15] script to simplify ts

---
 Snakemake/workflow/scripts/simplify.py | 9 +++++++++
 1 file changed, 9 insertions(+)
 create mode 100644 Snakemake/workflow/scripts/simplify.py

diff --git a/Snakemake/workflow/scripts/simplify.py b/Snakemake/workflow/scripts/simplify.py
new file mode 100644
index 0000000..f59ebfa
--- /dev/null
+++ b/Snakemake/workflow/scripts/simplify.py
@@ -0,0 +1,9 @@
+import sys
+import tskit
+
+args = sys.argv
+ts = tskit.load(args[1])
+output = args[2]
+
+ts_simplified = ts.simplify(keep_unary=False)
+ts_simplified.dump(output)
\ No newline at end of file
From 6e700c4dbb2066a0af896f2123476f67f0d5d6aa Mon Sep 17 00:00:00 2001
From: Gabriela Mafra
Date: Fri, 12 Apr 2024 11:39:24 +0100
Subject: [PATCH 10/15] updated default resources and increased for certain
 rules (to run on supernodes)

---
 Snakemake/workflow/profile/cluster.json | 84 ++++++++++++++-----------
 1 file changed, 48 insertions(+), 36 deletions(-)

diff --git a/Snakemake/workflow/profile/cluster.json b/Snakemake/workflow/profile/cluster.json
index f4b023c..5575b57 100644
--- a/Snakemake/workflow/profile/cluster.json
+++ b/Snakemake/workflow/profile/cluster.json
@@ -1,55 +1,49 @@
 {
     "__default__":
     {
-        "mem": "64G",
-        "threads": "1",
+        "mem": "32G",
+        "threads": "16",
         
"time": "10:00:00" }, - "rename": + "merge": { "mem": "64G", + "threads": "32", + "time": "20:00:00" + }, + "rename": + { + "mem": "32G", "threads": "1", "time": "03:00:00" }, "phase": - { - "mem": "350G", - "threads": "25", - "time": "336:00:00" - }, - "rename_phased": { "mem": "64G", - "threads": "1", - "time": "03:00:00" + "threads": "32", + "time": "120:00:00" }, - "get_af": + "filter_positions": { "mem": "64G", - "threads": "1", - "time": "03:00:00" - }, - "get_major": - { - "mem": "64G", - "threads": "1", - "time": "03:00:00" + "threads": "32", + "time": "05:00:00" }, "extract_ancestral_chromosome": { - "mem": "64G", + "mem": "32G", "threads": "1", "time": "02:00:00" }, "join_major_ancestral": { - "mem": "64G", + "mem": "32G", "threads": "1", "time": "02:00:00" }, "determine_ancestral_major": { - "mem": "64G", + "mem": "32G", "threads": "1", "time": "04:00:00" }, @@ -67,43 +61,61 @@ }, "decompress": { - "mem": "64G", + "mem": "8G", "threads": "1", "time": "03:00:00" }, "change_infoAA_vcf": { - "mem": "128G", + "mem": "32G", "threads": "1", "time": "05:00:00" }, "prepare_sample_file": { - "mem": "128G", - "threads": "1", - "time": "10:00:00" + "mem": "64G", + "threads": "32", + "time": "336:00:00" }, - "infer": + "generate_ancestors": { - "mem": "128G", - "threads": "1", - "time": "40:00:00" + "mem": "64G", + "threads": "32", + "time": "336:00:00" + }, + "truncate_ancestors": + { + "mem": "64G", + "threads": "32", + "time": "336:00:00" + }, + "ancestors_trees": + { + "mem": "64G", + "threads": "32", + "time": "336:00:00" + }, + "infer_trees": + { + "mem": "64G", + "threads": "32", + "time": "336:00:00" }, "extract_aligned_alleles_from_vcf": { - "mem": "32G", + "mem": "16G", "threads": "1", "time": "03:00:00" }, "extract_snps_from_info": { - "mem": "32G", + "mem": "16G", "threads": "1", "time": "03:00:00" }, "extract_pos_aligned_alleles_from_vcf": { - "mem": "32G", + "mem": "16G", "threads": "1", "time": "03:00:00" }, @@ -116,7 +128,7 @@ "create_estsfs_dicts": { "mem": "16G", - "threads": 1", + "threads": "1", "time" : "05:00:00" }, "edit_estsfs_dicts": From 0b84d650c1c732835d6b2bea543988e5f7b4211d Mon Sep 17 00:00:00 2001 From: Gabriela Mafra Date: Fri, 12 Apr 2024 11:40:56 +0100 Subject: [PATCH 11/15] config file to run global cattle demography project --- Snakemake/config/glob_cattle.yaml | 40 +++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 Snakemake/config/glob_cattle.yaml diff --git a/Snakemake/config/glob_cattle.yaml b/Snakemake/config/glob_cattle.yaml new file mode 100644 index 0000000..67e2381 --- /dev/null +++ b/Snakemake/config/glob_cattle.yaml @@ -0,0 +1,40 @@ +PROJECT: "global_cattle" +work_dir: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/tree_sequence" +species: "Bos taurus" +ploidy: 2 +no_chromosomes: 29 +vcf_dir: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/genomes/global" +genmap: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/resources/ma2015_shapeit_ARS-UCD1.2" +reference_genome: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/resources/Reference_genomes/ARS-UCD1.2_Btau5.0.1Y.fa" +ancestral_allele: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/resources/ancestral_alleles/Dorji_et_al/data/new_cattle_ancestral.txt" +meta: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/genomes/global/files/metadata_correct.csv" +chromosome_length: + 1 : 158534110 + 2 : 136231102 + 3 : 121005158 + 4 : 120000601 + 5 : 120089316 + 6 : 
117806340 + 7 : 110682743 + 8 : 113319770 + 9 : 105454467 + 10 : 103308737 + 11 : 106982474 + 12 : 87216183 + 13 : 83472345 + 14 : 82403003 + 15 : 85007780 + 16 : 81013979 + 17 : 73167244 + 18 : 65820629 + 19 : 63449741 + 20 : 71974595 + 21 : 69862954 + 22 : 60773035 + 23 : 52498615 + 24 : 62317253 + 25 : 42350435 + 26 : 51992305 + 27 : 45612108 + 28 : 45940150 + 29 : 51098607 \ No newline at end of file From 19e32cfb6cd66836c2137adda719e6be4a75b1b2 Mon Sep 17 00:00:00 2001 From: Gabriela Mafra Date: Thu, 18 Jul 2024 13:06:48 +0100 Subject: [PATCH 12/15] updated version to run cattle inferences --- Snakemake/config/glob_cattle.yaml | 13 + Snakemake/workflow/Snakefile | 21 +- Snakemake/workflow/profile/cluster.json | 2 +- Snakemake/workflow/profile/config.yaml | 8 +- Snakemake/workflow/rules/add_ancestral.smk | 167 +++++++++++ .../workflow/rules/ancestral_info_to_vcf.smk | 47 +-- Snakemake/workflow/rules/date.smk | 19 ++ Snakemake/workflow/rules/filter.smk | 267 ++++++++---------- Snakemake/workflow/rules/infer_trees.smk | 36 +-- Snakemake/workflow/rules/keep_snp.smk | 43 +++ .../workflow/rules/merge_samples_vcf.smk | 72 ++--- Snakemake/workflow/rules/new_infer_trees.smk | 82 +++--- Snakemake/workflow/rules/phasing.smk | 63 ++--- .../rules/prepare_files_for_tsinfer.smk | 53 ++-- Snakemake/workflow/rules/split.smk | 31 +- Snakemake/workflow/scripts/Infer_trees.py | 33 +-- .../scripts/Prepare_tsinfer_sample_file.py | 41 ++- .../workflow/scripts/generate_ancestors.py | 3 + .../workflow/scripts/generate_samples.py | 83 ++++-- Snakemake/workflow/scripts/simplify.py | 26 +- Snakemake/workflow/scripts/truncate.py | 11 +- Snakemake/workflow/scripts/tsparam.py | 2 +- 22 files changed, 713 insertions(+), 410 deletions(-) create mode 100644 Snakemake/workflow/rules/add_ancestral.smk create mode 100644 Snakemake/workflow/rules/date.smk create mode 100644 Snakemake/workflow/rules/keep_snp.smk diff --git a/Snakemake/config/glob_cattle.yaml b/Snakemake/config/glob_cattle.yaml index 67e2381..7c8a537 100644 --- a/Snakemake/config/glob_cattle.yaml +++ b/Snakemake/config/glob_cattle.yaml @@ -3,11 +3,24 @@ work_dir: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/tree species: "Bos taurus" ploidy: 2 no_chromosomes: 29 +start_chromosome: 1 +end_chromosome: 29 vcf_dir: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/genomes/global" genmap: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/resources/ma2015_shapeit_ARS-UCD1.2" reference_genome: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/resources/Reference_genomes/ARS-UCD1.2_Btau5.0.1Y.fa" ancestral_allele: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/resources/ancestral_alleles/Dorji_et_al/data/new_cattle_ancestral.txt" meta: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/genomes/global/files/metadata_correct.csv" +phase_only: False +remove_indel: True +missing_data: "0.2" +truncate_upper: "0.4" +truncate_lower: "0.2" +recombination_rate_tree: False # "1e-8" +recombination_rate_anc: False +mismatch_ratio_anc: False +mismatch_ratio_tree: False # "1" +mutation_rate: "1e-8" +generation_interval: 5 chromosome_length: 1 : 158534110 2 : 136231102 diff --git a/Snakemake/workflow/Snakefile b/Snakemake/workflow/Snakefile index d22892e..611801d 100644 --- a/Snakemake/workflow/Snakefile +++ b/Snakemake/workflow/Snakefile @@ -18,7 +18,6 @@ report: "reports/tsinfer.rst" ##### setup singularity ##### # Can we set conda env here??? 
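+# (per-rule conda environments are set with the `conda:` directive in the rule files)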
- ##### Define some variables ##### # all files should be in the vcfDir/RawVCF folder # files that need spliting are named: Combined_something.vcf.gz @@ -40,12 +39,13 @@ Project = config['PROJECT'] projdir = config['work_dir'] + '/' + Project mapdir = config['genmap'] -# if want to split multiallelic sites set to TRUE: -#multiallelic = False - -chromosomes = [f'chr{n}' for n in range(1, config['no_chromosomes'] + 1)] +chromosomes = [f'chr{n}' for n in range(config['start_chromosome'], config['end_chromosome'] + 1)] # set a list of 'Chr' + chr# combinations to be used as wildcards +wildcard_constraints: + chromosome = r'chr\d+' +# constraining the wildcard to avoid errors + ##### load rules ##### #include: "rules/ancestral_inference.smk" include: "rules/split.smk" @@ -55,11 +55,16 @@ include: "rules/phasing.smk" include: "rules/ancestral_info_to_vcf.smk" include: "rules/new_infer_trees.smk" +##### Conditional inputs for rule all based on the existence of phased VCFs and config['phase_only'] ##### +if not config['phase_only']: + rule_all_input = expand(f'{projdir}/Tsinfer/trees/{{chromosome}}.dated.trees', chromosome=chromosomes) +else: + rule_all_input = expand(f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz.csi', chromosome=chromosomes) + + ##### target rules ##### # At the end it should generate a phased vcf file that combines all files for # each chromosome. These files are stored in the vcf folder declaired in the # configuration file. It also checks that vcfs have been filtered before merged rule all: - input: - expand(f'{projdir}/Tsinfer/trees/{{chromosome}}.simplified.trees', chromosome=chromosomes) - # tree inference + input: rule_all_input \ No newline at end of file diff --git a/Snakemake/workflow/profile/cluster.json b/Snakemake/workflow/profile/cluster.json index 5575b57..574c29c 100644 --- a/Snakemake/workflow/profile/cluster.json +++ b/Snakemake/workflow/profile/cluster.json @@ -21,7 +21,7 @@ { "mem": "64G", "threads": "32", - "time": "120:00:00" + "time": "240:00:00" }, "filter_positions": { diff --git a/Snakemake/workflow/profile/config.yaml b/Snakemake/workflow/profile/config.yaml index 1dc5486..e7a1c31 100644 --- a/Snakemake/workflow/profile/config.yaml +++ b/Snakemake/workflow/profile/config.yaml @@ -3,10 +3,8 @@ use-conda: true jobscript: "jobscript.sh" cluster-config: "cluster.json" jobname: "{rule}.{jobid}" -jobs: 16 -#drmaa: " -cwd -l h_vmem={cluster.mem} -l h_rt={cluster.time} -pe sharedmem {cluster.threads} -P roslin_HighlanderLab" +drmaa: " -P roslin_HighlanderLab -cwd -l h_vmem={cluster.mem} -l h_rt={cluster.time} -pe sharedmem {cluster.threads} -R y" rerun-incomplete: true -rerun-triggers: mtime -cores: 16 # how many jobs you want to submit to your cluster queue -local-cores: 1 +rerun-triggers: [mtime, input] +local-cores: 99 keep-going: false diff --git a/Snakemake/workflow/rules/add_ancestral.smk b/Snakemake/workflow/rules/add_ancestral.smk new file mode 100644 index 0000000..dd404f1 --- /dev/null +++ b/Snakemake/workflow/rules/add_ancestral.smk @@ -0,0 +1,167 @@ + +ancestral_file = config['ancestral_allele'] + +# input and output +infile = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz' +infileidx = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz.csi' +outfile = f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf.gz' +outidx = f'{vcfdir}/{{chromosome}}_ancestral.vcf.gz.csi' + +# temporary files +filtered_samples = f'{vcfdir}/{{chromosome}}/{{chromosome}}_sam.vcf.gz' +filtered_positions = 
f'{vcfdir}/{{chromosome}}/{{chromosome}}_sampos.vcf'
+ancestral_not_compressed = f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf'
+regfile = f'{vcfdir}/{{chromosome}}_regions.tsv'
+aaInfo = f'{vcfdir}/{{chromosome}}_aa.info'
+
+if config['PROJECT'] == "global_cattle":
+    rule remove_samples:
+        input: infile
+        output: filtered_samples
+        params:
+            samples_to_remove = f'{vcfdir}/{{chromosome}}/remove'
+        conda: 'bcftools'
+        threads: 32
+        resources: cpus=32, mem_mb=2048000, time_min=1200
+        shell:
+            r"""
+            bcftools view -S ^{params.samples_to_remove} {input} --threads {threads} -O z -o {output}
+
+            bcftools index {output}
+            """
+
+
+# need some helper files:
+# 1) create a chr pos file for filtering
+# 2) create an info file for adding ancestrals
+# filter vcf positions so that it has the same as the aa file
+# add ancestral allele to INFO
+
+rule make_regions_file:
+    input:
+        anc = ancestral_file,
+        vcf = filtered_samples
+    output: temp(regfile)
+    params:
+        chrnum = lambda wc: wc.get('chromosome')[3:]
+    shell:
+        r"""
+        # filter positions in ancestral that correspond to chr
+        cut -d' ' -f1,2 {input.anc} | awk '$1 == {params.chrnum}' | sed -e $'s/ /\t/g' > chr{params.chrnum}.anc
+
+        cut -f2 chr{params.chrnum}.anc > chr{params.chrnum}.compare
+
+        # get positions in vcf
+        bcftools query -f '%POS\n' {input.vcf} > chr{params.chrnum}.file
+
+        # match positions ancestral and vcf
+        comm -12 <( sort chr{params.chrnum}.file ) <( sort chr{params.chrnum}.compare ) > chr{params.chrnum}.keep
+
+        # number of lines
+        l=($(wc -l chr{params.chrnum}.keep))
+
+        # create final regions file
+        yes {params.chrnum} | head -n $l > chr{params.chrnum}
+        paste chr{params.chrnum} chr{params.chrnum}.keep > {output}
+
+        rm chr{params.chrnum} chr{params.chrnum}.keep chr{params.chrnum}.file chr{params.chrnum}.compare chr{params.chrnum}.anc
+        """
+#grep -vw 4579381 3565339
+
+rule filter_positions:
+    input:
+        vcf = filtered_samples,
+        idx = f'{filtered_samples}.csi',
+        regions = regfile,
+    output: filtered_positions
+    conda: 'bcftools'
+    threads: 32
+    resources: cpus = 32, mem_mb = 768000, time_min = 300
+    shell:
+        r"""
+        # --threads {threads} disabled for now
+        bcftools view -R {input.regions} {input.vcf} \
+            -o {output}
+        """
+
+# take only the rows where col1 == current_chromosome
+# add AA= in front of the aa col
+# use only col pos & aa
+# match with the left pos in filtered vcf
+
+rule make_aaInfo_file:
+    input:
+        aa_file = ancestral_file,
+        vcf = filtered_positions
+    output: temp(aaInfo)
+    params:
+        chrnum = lambda wc: wc.get('chromosome')[3:]
+    shell:
+        r"""
+        # filter by chromosome and add AA= to the allele col
+        awk '$1 == {params.chrnum}' {input.aa_file} | awk '$3="AA="$3' | cut -d' ' -f2,3 > {wildcards.chromosome}.aa
+
+        # get the positions in the vcf
+        bcftools query -f '%POS\n' {input.vcf} > {wildcards.chromosome}.pos
+
+        # add allele col to the pos in the vcf
+        awk 'NR==FNR{{a[$1];next}}($1 in a){{print $0}}' {wildcards.chromosome}.pos {wildcards.chromosome}.aa > {wildcards.chromosome}.pos.aa
+
+        cut -d' ' -f2 {wildcards.chromosome}.pos.aa | sed '1s/^/INFO\n/' > {output}
+
+        rm {wildcards.chromosome}.aa {wildcards.chromosome}.pos {wildcards.chromosome}.pos.aa
+        """
+
+# need to run shapeit first to know line numbers
+rule add_ancestral:
+    input:
+        vcf = filtered_positions,
+        aa_info = aaInfo,
+    output: ancestral_not_compressed
+    conda: 'bcftools'
+    shell:
+        r"""
+        awk 'NR==FNR{{a[NR] = $1; next}} FNR<13{{print}}; \
+        FNR==6{{printf "##INFO=<ID=AA,Number=1,Type=String,Description=\"Ancestral Allele\">\n"}}; \
+        FNR>12{{$8=a[FNR-11]; print}}' OFS="\t" {input.aa_info} {input.vcf} > 
{output} + """ + + +# HEADERNUM="$(( $(bcftools view -h {input.vcf} | wc -l) - 1 ))" + +# INFOLINE=$(( $(bcftools view -h {input.vcf} | awk '/INFO/{{print NR}}' | head -n 1) )) + +# awk -v OFS="\t" -v HEADER=$HEADERNUM -v INFO=$INFOLINE 'NR==FNR{{{{a[FNR] = $2; next}}}} FNR<=HEADER{{{{print}}}}; \ +# FNR==INFO{{{{printf "##INFO=\\n"}}}}; \ +# FNR>HEADER{{{{$8=a[FNR-HEADER]; print}}}}' OFS="\t" {input.aa_info} {input.vcf} > {output} +# All_AAInfo.txt AllFiltered.recode.vcf > AllFiltered.recode.Ancestral.vcf + +# compress and index final vcf +# rule match_samples: +# input: ancestral_not_compressed +# output: outfile +# params: +# samples_to_keep = f'{vcfdir}/{{chromosome}}/keep.samples' +# conda: 'bcftools' +# shell: +# r""" +# bcftools view -S {params.samples_to_keep} {input} -O z -o {output} +# """ + +rule compress_vcf: + input: ancestral_not_compressed + output: outfile, + conda: 'bcftools' + shell: + r""" + bgzip -c {input} > {output} + """ + +rule index_output: + input: outfile + output: outidx + conda: 'bcftools' + shell: + r""" + bcftools index {input} + """ diff --git a/Snakemake/workflow/rules/ancestral_info_to_vcf.smk b/Snakemake/workflow/rules/ancestral_info_to_vcf.smk index f8870f0..f69bee0 100644 --- a/Snakemake/workflow/rules/ancestral_info_to_vcf.smk +++ b/Snakemake/workflow/rules/ancestral_info_to_vcf.smk @@ -1,7 +1,8 @@ # extract all positions in vcf file using bcftools and save to regions file # for rows in ancestral_alleles file where col 1 match wildcard.chromosome -# for each row in regions file, if value is in ancestral_alleles col 2, add ancestral_alleles col 3 to regions file col 2 +# for each row in regions file, if value is in ancestral_alleles col 2, add +# ancestral_alleles col 3 to regions file col 2 # else add -1 ancestral_file = config['ancestral_allele'] @@ -23,38 +24,42 @@ if "global_cattle" in config['PROJECT']: rule remove_samples: input: - vcf_file = rules.phase.output.vcf, - idx_file = rules.index_phased.output.idx, - output: - vcf_file = f'{vcfdir}/{{chromosome}}/{{chromosome}}_correctids.vcf', + vcf_file = input_ancestral, + idx_file = input_ancestral_idx, + output: f'{vcfdir}/{{chromosome}}/{{chromosome}}_analysis.vcf', params: samples_to_remove = f'{vcfdir}/samples.remove' conda: 'bcftools' threads: 32 resources: cpus=32, mem_mb=2048000, time_min=1200 + benchmark: 'benchmarks/{chromosome}.remove.benchmark.txt' shell: r""" - bcftools view -S ^{params.samples_to_remove} {input.vcf_file} --threads {threads} -O v -o {output.vcf_file} + bcftools view -S ^{params.samples_to_remove} {input.vcf_file} --threads {threads} -O v -o {output} """ - input_vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_correctids.vcf' else: rule decompress: - input: rules.phase.output.vcf - output: f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf' - conda: bcftools + input: + vcf_file = input_ancestral, + idx_file = input_ancestral_idx + output: f'{vcfdir}/{{chromosome}}/{{chromosome}}_analysis.vcf' + conda: bcftools + benchmark: 'benchmarks/{chromosome}.decompress.benchmark.txt' shell: r""" - bcftools view {input} -O v -o {output} + bcftools view {input.vcf_file} -O v -o {output} """ - input_vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf' + +input_ancestral = f'{vcfdir}/{{chromosome}}/{{chromosome}}_analysis.vcf' rule get_vcf_positions: input: - vcf_file = input_vcf, + vcf_file = input_ancestral, output: regions_file = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.regions') conda: 'bcftools' + benchmark: 'benchmarks/{chromosome}.positions.benchmark.txt' shell: r""" 
bcftools query -f '%POS\n' {input.vcf_file} | awk '{{print $1, -1}}' > {output.regions_file}
         """
 
 rule get_ancestral_alleles:
     input:
         ancestral_alleles_file = ancestral_file
     output:
         ancestrals = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.anc')
     params:
         chrnum = lambda wc: wc.get('chromosome')[3:]
+    benchmark: 'benchmarks/{chromosome}.getaa.benchmark.txt'
     shell:
         r"""
         # filter positions in ancestral that correspond to chr
         awk '$1 == {params.chrnum}' {input.ancestral_alleles_file} | cut -d' ' -f2,3 > {output.ancestrals}
         """
 
 rule update_regions_file:
     input:
         regions_file = rules.get_vcf_positions.output.regions_file,
         ancestrals = rules.get_ancestral_alleles.output.ancestrals,
     output:
         updated_regions_file = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.updated.regions'),
-        positions_exclude = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.exclude')
+        positions_exclude = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.exclude'),
+    benchmark: 'benchmarks/{chromosome}.regions.benchmark.txt'
     shell:
         r"""
         awk 'NR==FNR{{a[$1]=$2;next}} ($1 in a){{$2=a[$1]}}1' {input.ancestrals} {input.regions_file} > {output.updated_regions_file}
 
         awk '$2 == -1' {output.updated_regions_file} | cut -d' ' -f1 > {output.positions_exclude}
         """
 
 rule add_AA_to_regions_file:
     input:
         updated_regions_file = rules.update_regions_file.output.updated_regions_file,
     output:
         new_regions_file = f'{vcfdir}/{{chromosome}}/{{chromosome}}.final.regions'
+    benchmark: 'benchmarks/{chromosome}.addaaregions.benchmark.txt'
     shell:
         r"""
         echo "INFO" > {output.new_regions_file}
         awk '{{print "AA="$2}}' {input.updated_regions_file} >> {output.new_regions_file}
         """
 
 rule add_info_to_vcf:
     input:
-        vcf_file = f'{vcfdir}/{{chromosome}}/{{chromosome}}_correctids.vcf',
+        vcf_file = input_ancestral,
+        # f'{vcfdir}/{{chromosome}}/{{chromosome}}_correctids.vcf',
         aa_info = rules.add_AA_to_regions_file.output.new_regions_file,
     output:
-        large_vcf = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased_ancestral.vcf')
+        large_vcf = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf')
     conda: 'bcftools'
+    benchmark: 'benchmarks/{chromosome}.aainfo.benchmark.txt'
     shell:
         r"""
         awk 'NR==FNR{{a[NR] = $1; next}} FNR<15{{print}}; \
         FNR==6{{printf "##INFO=<ID=AA,Number=1,Type=String,Description=\"Ancestral Allele\">\n"}}; \
         FNR>14{{$8=a[FNR-14]; print}}' OFS="\t" {input.aa_info} {input.vcf_file} > {output.large_vcf}
 
         rm {input.vcf_file}
         """
 
 rule compress_index_vcf:
     input:
-        large_vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased_ancestral.vcf'
+        large_vcf = rules.add_info_to_vcf.output.large_vcf
+        # f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf'
     output:
         vcf_file = f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf.gz',
         vcf_indx = f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf.gz.csi'
     conda: 'bcftools'
+    benchmark: 'benchmarks/{chromosome}.compress.benchmark.txt'
     shell:
         r"""
         bgzip -c {input} > {output.vcf_file}
         bcftools index {output.vcf_file}
         """
+input_ts = rules.compress_index_vcf.output.vcf_file
\ No newline at end of file
diff --git a/Snakemake/workflow/rules/date.smk b/Snakemake/workflow/rules/date.smk
new file mode 100644
index 0000000..a9afb85
--- /dev/null
+++ b/Snakemake/workflow/rules/date.smk
@@ -0,0 +1,19 @@
+# Script for (iteratively) dating the tree sequence
+# trees must be dated, re-inferred and dated again
+# to get the correct dates (test number of iterations)
+
+# Minimal sketch: date the simplified trees with tsdate, estimating Ne from
+# diversity (pi = 4*Ne*mu). Assumes tsdate is importable by Snakemake itself,
+# since `conda:` does not apply to `run:` blocks; newer tsdate releases use
+# population_size= instead of Ne=.
+rule date_trees:
+    input: rules.simplify_trees.output.simplified_trees
+    output: f'{projdir}/Tsinfer/trees/{{chromosome}}.dated.trees'
+    params: mu = config['mutation_rate']
+    run:
+        import tskit
+        import tsdate
+        ts = tskit.load(input[0])
+        ne = ts.diversity() / (4 * float(params.mu))
+        print('A rough estimate of the effective population size is', ne)
+        tsdate.date(ts, Ne=ne, mutation_rate=float(params.mu)).dump(output[0])
diff --git a/Snakemake/workflow/rules/filter.smk b/Snakemake/workflow/rules/filter.smk
index 99ca98f..54a596a 100644
--- a/Snakemake/workflow/rules/filter.smk
+++ b/Snakemake/workflow/rules/filter.smk
@@ -1,186 
+1,159 @@ -if len(allFiles) != 1: +# if there is only one vcf files, use the output from split_and_move as input +# for the next step samples +# else, compare the files and filter out overlapping then merge the vcfs into +# one and index and use the output from the merge as input for the next step +if len(allFiles) == 1: + input_vcf = rules.split_and_move_vcfs.output + input_index = f'{rules.split_and_move_vcfs.output}.csi' + input_summary = 'None' + +else: + input_to_filter = [f'{vcfdir}/{{chromosome}}/{{chromosome}}_{file}.vcf.gz' for file in allFiles] + + filtered_output = [f'{vcfdir}/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.gz' for file in allFiles] + filtered_output_idx = [f'{vcfdir}/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.gz.csi' + for file in allFiles] + merged_output = f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.vcf.gz' + rule compare_and_filter_vcfs: - input: - [f'{vcfdir}' + x - for x in expand('/{{chromosome}}/{{chromosome}}_{file}.vcf.gz', - file=allFiles)] - output: - [f'{vcfdir}' + x - for x in expand('/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.gz', - file=allFiles)] + input: input_to_filter + output: temp(filtered_output) params: - file_names = allFiles, - path = f'{vcfdir}/{{chromosome}}/{{chromosome}}' + file_names = [f'{{chromosome}}_{file}' for file in allFiles], + path = f'{vcfdir}/{{chromosome}}' + benchmark: 'benchmarks/{chromosome}.sample.benchmark.txt' run: import os + import shlex import itertools import subprocess from glob import glob path_names = [f'{params.path}/{name}' for name in params.file_names] pairs = list(itertools.combinations(path_names, 2)) + new_ext = '.filtered.vcf.gz' + to_rename = path_names.copy() + + # to avoid shell injection, sanitize the file names + # -- ensure speciall shell characters are escaped + sanitized_path = shlex.quote(params.path) + sanitized_new_ext = shlex.quote(new_ext) + sanitized_idx = shlex.quote(new_ext + '.csi') for i, pair in enumerate(pairs): - print(f'pair {i}') - # for each pair determine the extension of the vcf files - _0, ext0 = [os.path.splitext(file) for file in glob(f'{pair[0]}*') - if os.path.splitext(file)[1] not in ['.csi', '.txt']][0] - - _1, ext1 = [os.path.splitext(file) for file in glob(f'{pair[1]}*') - if os.path.splitext(file)[1] not in ['.csi', '.txt']][0] + print(f'pair {i} {pair}') # Run vcftools to find if there are overlapping samples between pairs command = f""" module load anaconda - conda activate bcftools - vcftools --gzvcf {os.path.join(pair[0] + ext0)} \ - --gzdiff {os.path.join(pair[1] + ext1)} \ - --diff-indv + source activate bcftools + vcftools --gzvcf {os.path.join(pair[0] + '.vcf.gz')} \ + --gzdiff {os.path.join(pair[1] + '.vcf.gz')} \ + --diff-indv """ - subprocess.run(command, shell=True, check=True, cwd=path) - + subprocess.run(command, shell=True, check=True, cwd=sanitized_path) + # if vcftools find any overlapping samples, remove them from the second vcf file - if os.path.exists(os.path.join(path, 'out.diff.indv_in_files')): - print(f'Removing samples from {pair[1]} and renaming {pair[0]}') + if "B" in subprocess.run("awk '{if ($2 == \"B\") print $2}' out.diff.indv_in_files", + shell=True, + capture_output=True, + text=True, + cwd=sanitized_path).stdout: + print(f'Overlapping samples found!! 
Removing samples from {pair[1]}') command = f""" grep B out.diff.indv_in_files | cut -f1 > samples.remove rm out.diff.indv_in_files """ - subprocess.run(command, shell=True, check=True, cwd=path) + subprocess.run(command, shell=True, check=True, cwd=sanitized_path) command = f""" - module load anaconda - conda activate bcftools - bcftools view -S ^samples.remove {os.path.join(pair[1] + '.vcf' + ext1)} \ - -Oz -o {os.path.join(pair[1] + new_ext)} + bcftools view -S ^samples.remove {os.path.join(pair[1] + '.vcf.gz')} \ + -Oz -o {os.path.join(pair[1] + sanitized_new_ext)} + + bcftools index -f {os.path.join(pair[1] + sanitized_new_ext)} + rm samples.remove """ - subprocess.run(command, shell=True, check=True, cwd=path) + subprocess.run(command, shell=True, check=True, cwd=sanitized_path) + # remove the file that has been sampled from renaming list + to_rename.remove(pair[1]) + print(f'Files to rename are: {to_rename}') # if there are no overlapping samples, # just rename the files that still have the original extension else: print('No overlapping samples') - if 'filtered' not in _0: - print('Rename') - command = f""" - mv {os.path.join(pair[0] + '.vcf' + ext0)} {os.path.join(pair[0] + new_ext)} - """ - subprocess.run(command, shell=True, check=True, cwd=path) - if 'filtered' not in _1: - print('Rename') - command = f""" - mv {os.path.join(pair[1] + '.vcf' + ext1)} {os.path.join(pair[1] + new_ext)} - """ - subprocess.run(command, shell=True, check=True, cwd=path) - - # rule get_samples: - # input: - # f'{vcfdir}/{{chromosome}}/{{chromosome}}_{{file}}.vcf.gz' - # output: temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_{{file}}.txt') - # conda: "bcftools" - # threads: 1 - # resources: cpus=1, mem_mb=32000, time_min=30 - # log: 'logs/Get_samples_{chromosome}_{file}.log' - # shell: - # """ - # bcftools query -l {input} > {output} - # """ - - # rule filter: - # input: - # [f'{vcfdir}' + x - # for x in expand('/{{chromosome}}/{{chromosome}}_{file}.txt', - # file=allFiles)], - # output: - # temp([f'{vcfdir}' + x for x in - # expand('/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.{ext}', - # file=allFiles, ext=['gz', 'gz.csi'])]) - # params: - # duplicated = '{chromosome}.ids', - # files_len = '{chromosome}_len.txt', - # files_reo = '{chromosome}_reorderd.txt', - # resources: cpus=1, mem_mb=64000, time_min=60 - # run: - # import os - - # for file in input: - # shell("wc -l {file} >> {params.files_len}") - - # shell("sort -us -nr {params.files_len} | cut -d' ' -f 2 > {params.files_reo}") - - # with open(params.files_reo, 'r') as file: - # files_reord = [] - # for line in file: - # line = line.strip() - # files_reord.append(line) - - # filtered = [] - - # for i in range(len(files_reord)): - # print(i) - # for j, file in enumerate(files_reord): - # if file != files_reord[i] and j > i: - # file1 = files_reord[i] - # file2 = files_reord[j] - # print(file1, file2) - # shell("comm -12 <(sort {file1}) <(sort {file2}) > {params.duplicated}") + print(f'Finished comparing and sampling vcfs') - # remove = params.duplicated - # lremove = os.stat(f'{remove}').st_size - - # if lremove == 0: - # print(f'Nothing to remove {lremove}') - # else: - # print(f'{lremove} samples to remove') - # if file2 not in filtered: - # filtered.append(file2) - - # file2 = file2.split('.')[0] - # print(f'file {file2} filtered') - # shell('bcftools view -S ^{params.duplicated} {file2}.vcf.gz -O z -o {file2}.filtered.vcf.gz') - # #shell('gatk SelectVariants -R {params.refgen} -V {file2} -xl-sn -O {file2}.filtered.vcf.gz') - # 
shell('bcftools index -f {file2}.filtered.vcf.gz') - # shell('rm {params.duplicated} {file2}.vcf.gz {file2}.vcf.gz.csi') + # rename the files that have not been sampled + for name in to_rename: + sanitized_name = shlex.quote(name) + print(f'Renaming {sanitized_name}') + command = f""" + mv {os.path.join(sanitized_name + '.vcf.gz')} {os.path.join(sanitized_name + sanitized_new_ext)} + mv {os.path.join(sanitized_name + '.vcf.gz.csi')} {os.path.join(sanitized_name + sanitized_idx)} + """ + subprocess.run(command, shell=True, check=True, cwd=sanitized_path) - # for file in files_reord: - # if file not in filtered: - # file1 = file.split('.')[0] # file does not have extensions - # print(f'file {file1} renamed') - # shell('mv {file1}.vcf.gz {file1}.filtered.vcf.gz') - # shell('mv {file1}.vcf.gz.csi {file1}.filtered.vcf.gz.csi') - # #shell('scripts/create_symlinks.sh {file1}') - # shell('rm {params}') + # Remove all files that do not contain the string new_ext + command = f""" + find {sanitized_path} -type f ! -name '*{sanitized_new_ext}*' -exec rm -f {{}} \; + """ + subprocess.run(command, shell=True, check=True, cwd=sanitized_path) -else: - rule rename: - input: - vcf = [[f'{vcfdir}' + x - for x in expand('/{{chromosome}}/{{chromosome}}_{suffixOne}.vcf.gz', - suffixOne = splitFiles)] if len(splitFiles) != 0 else [], - [f'{vcfdir}' + x - for x in expand('/{{chromosome}}/{{chromosome}}_{suffixTwo}.vcf.gz', - suffixTwo = combinedFiles)] if len(combinedFiles) != 0 else []], - idx = [[f'{vcfdir}' + x - for x in expand('/{{chromosome}}/{{chromosome}}_{suffixOne}.vcf.gz.csi', - suffixOne = splitFiles)] if len(splitFiles) != 0 else [], - [f'{vcfdir}' + x - for x in expand('/{{chromosome}}/{{chromosome}}_{suffixTwo}.vcf.gz.csi', - suffixTwo = combinedFiles)] if len(combinedFiles) != 0 else []], - output: - vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_{{file}}.filtered.vcf.gz', - idx = f'{vcfdir}/{{chromosome}}/{{chromosome}}_{{file}}.filtered.vcf.gz.csi' + rule merge: + input: rules.compare_and_filter_vcfs.output + output: merged_output + params: + files_path = f'{vcfdir}/{{chromosome}}/', + indexes = filtered_output_idx conda: "bcftools" - log: 'logs/Rename_{chromosome}.log' - resources: cpus=1, mem_mb=32000, time_min=30 + threads: 32 + resources: cpus=32, mem_mb=2048000, time_min=1200 + benchmark: 'benchmarks/{chromosome}.merge.benchmark.txt' shell: + r""" + printf '%s\n' {params.files_path}*filtered*.gz > {wildcards.chromosome}.merge + + bcftools merge \ + -m snps \ + -l {wildcards.chromosome}.merge \ + --threads {threads} -O z -o {output} + + rm {wildcards.chromosome}.merge {params.indexes} """ - if [ -h {input} ]; then - ln -s $( realpath {input.vcf} ) {output.vcf} - ln -s $( realpath {input.idx} ) {output.idx} - else - ln -s {input.vcf} {output.vcf} - ln -s {input.idx} {output.idx} - fi + + rule index_merged: + input: rules.merge.output, + output: f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.vcf.gz.csi' + conda: 'bcftools' + benchmark: 'benchmarks/{chromosome}.indxmerged.benchmark.txt' + shell: + r""" + bcftools index -f {input} + """ + + rule vcf_summary: + input: + vcf_file = rules.merge.output, + index = rules.index_merged.output, + output: + summary = f'{vcfdir}/{{chromosome}}/{{chromosome}}_summary.vchk', + # done = touch(f'{vcfdir}/{{chromosome}}/merged_summary.done') + params: + plots = f'{vcfdir}/{{chromosome}}/{{chromosome}}_summary/' + conda: 'bcftools' + benchmark: 'benchmarks/{chromosome}.summary.benchmark.txt' + shell: + r""" + bcftools stats {input.vcf_file} > {output.summary} + 
plot-vcfstats -p {params.plots} {output.summary} """ + input_vcf = rules.merge.output + input_index = rules.index_merged.output + input_summary = rules.vcf_summary.output.summary + +# else, use the output from split_and_move as input for the next step +# else: +# input_vcf = rules.split_and_move_vcfs.output \ No newline at end of file diff --git a/Snakemake/workflow/rules/infer_trees.smk b/Snakemake/workflow/rules/infer_trees.smk index f7c2a76..cf36dcc 100644 --- a/Snakemake/workflow/rules/infer_trees.smk +++ b/Snakemake/workflow/rules/infer_trees.smk @@ -1,36 +1,38 @@ import re -# if multiallelic: -# vcf_4_inference = f'{vcfdir}/{{chromosome}}_allbi.vcf.gz' -# else: -# vcf_4_inference = f'{vcfdir}/{{chromosome}}_ancestral.vcf.gz' - rule prepare_sample_file: input: - #vcf=vcf_4_inference, vcf = f'{vcfdir}/{{chromosome}}_ancestral.vcf.gz', meta = config['meta'] output: - f"../{Project}/Tsinfer/samples/{{chromosome}}.samples" + f'{projdir}/Tsinfer/samples/{{chromosome}}.samples' params: chrLength= lambda wildcards: config['chromosome_length'][int(re.findall(r'\d+', wildcards.chromosome)[0])], ploidy=config['ploidy'] conda: "HLab_tsinfer" - threads: 1 - resources: cpus=1, mem_mb=128000, time_min=200 - log: 'logs/Prepare_sample_file_{chromosome}.log' + threads: 32 + resources: cpus=32, mem_mb=768000, time_min=200 + benchmark: + 'benchmarks/{chromosome}.prep_sample.benchmark.txt' + #log: 'logs/Prepare_sample_file_{chromosome}.log' shell: "python scripts/Prepare_tsinfer_sample_file.py " "{input.vcf} {input.meta} {output} {params.ploidy} {params.chrLength}" rule infer: input: - f"../{Project}/Tsinfer/samples/{{chromosome}}.samples" - conda: "HLab_tsinfer" - threads: 1 - resources: cpus=1, mem_mb=128000, time_min=300 - log: 'logs/Infer_{chromosome}.log' + f'{projdir}/Tsinfer/samples/{{chromosome}}.samples', output: - f"../{Project}/Tsinfer/trees/{{chromosome}}.trees" + f'{projdir}/Tsinfer/trees/{{chromosome}}.trees', + conda: "HLab_tsinfer" + threads: 16 + resources: cpus=16, mem_mb=1024000, time_min=10080 + benchmark: + 'benchmarks/{chromosome}.infer.benchmark.txt' + #log: 'logs/Infer_{chromosome}.log' shell: - "python scripts/Infer_trees.py {input} {output}" + r""" + # Check amount of memory (in kbytes) as seen by the job + ulimit -v + python scripts/Infer_trees.py {input} {output} + """ diff --git a/Snakemake/workflow/rules/keep_snp.smk b/Snakemake/workflow/rules/keep_snp.smk new file mode 100644 index 0000000..323bd1b --- /dev/null +++ b/Snakemake/workflow/rules/keep_snp.smk @@ -0,0 +1,43 @@ +# rule gatk_index: +# input: +# vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz', +# idx = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz.csi' +# output: temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz.tbi'), +# conda: 'gatk-env' +# benchmark: +# 'benchmark/{chromosome}.gatk_index.benchmark.txt' +# shell: +# r""" +# gatk IndexFeatureFile -I {input.vcf} +# """ + +# rule filter_snps: +# input: +# vcf = rules.phase.output, +# idx = rules.index_phased.output, +# output: +# vcf_file = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_snps.vcf.gz'), +# idx_file = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_snps.vcf.gz.csi'), +# params: config['reference_genome'], +# conda: 'gatk-env' +# benchmark: 'benchmarks/{chromosome}.filter_snps.benchmark.txt' +# shell: +# r""" +# gatk SelectVariants \ +# -R {params} \ +# -V {input.vcf} \ +# --select-type-to-include SNP \ +# -O {output.vcf_file} + +# bcftools index {output.vcf_file} +# """ + +# rule index: +# input: 
f'{vcfdir}/{{chromosome}}/{{chromosome}}_snps.vcf.gz', +# output: temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_snps.vcf.gz.csi') +# conda: 'bcftools' +# benchmark: 'benchmarks/{chromosome}.index_filtered_snps.benchmark.txt' +# shell: +# r""" +# bcftools index {input} +# """ diff --git a/Snakemake/workflow/rules/merge_samples_vcf.smk b/Snakemake/workflow/rules/merge_samples_vcf.smk index 86cbc2c..72f74c1 100644 --- a/Snakemake/workflow/rules/merge_samples_vcf.smk +++ b/Snakemake/workflow/rules/merge_samples_vcf.smk @@ -1,50 +1,26 @@ -if len(allFiles) != 1: - rule merge: - input: - vcf = [f'{vcfdir}' + x for x in expand('/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.gz', file=allFiles)], - idx = [f'{vcfdir}' + x for x in expand('/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.gz.csi', file=allFiles)], - output: - vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.vcf.gz', - params: - files_path = f'{vcfdir}/{{chromosome}}/' - conda: "bcftools" - threads: 32 - resources: cpus=32, mem_mb=2048000, time_min=1200 - benchmark: 'benchmarks/{chromosome}.merge.benchmark.txt' - shell: - r""" - printf '%s\n' {params.files_path}*.gz > files.merge - - bcftools merge \ - -m snps \ - -l files.merge \ - --threads {threads} -O z -o {output.vcf} - """ - - rule index_merged: - input: f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.vcf.gz', - output: f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.vcf.gz.csi' - conda: 'bcftools' - shell: - r""" - bcftools index -f {input} - """ +# Filter out INDELS and missing data (>20%) from the vcf file +# The input vcf file is the merged vcf file from the previous step +# or the output from split and move (if only one file is present) rule remove_missing_indels: input: - vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.vcf.gz', - index = f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.vcf.gz.csi' + vcf = input_vcf, + index = input_index, + summary = input_summary, output: - vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.recode.vcf.gz', - index = f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.recode.vcf.gz.csi' + vcf = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.recode.vcf.gz'), + index = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.recode.vcf.gz.csi') params: - prefix = f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged' + prefix = f'{vcfdir}/{{chromosome}}/{{chromosome}}', + missing = config['missing_data'], + indel = '--remove-indels' if config['remove_indel'] else '' conda: 'bcftools' + benchmark: 'benchmarks/{chromosome}.missindel.benchmark.txt' shell: r""" vcftools --gzvcf {input.vcf} \ - --remove-indels \ - --max-missing 0.1 \ + {params.indel} \ + --max-missing {params.missing} \ --remove-filtered-all \ --recode \ --out {params.prefix} @@ -52,21 +28,5 @@ rule remove_missing_indels: bgzip {params.prefix}.recode.vcf bcftools index -f {output.vcf} - rm {params.prefix}.recode.vcf - """ - -# rule summary: -# input: -# vcf_file = rules.filter_merged_vcf.output.vcf, -# index = rules.filter_merged_vcf.output.index, -# output: -# summary = f'{vcfdir}/{{chromosome}}/{{chromosome}}_summary.vchk' -# params: -# plots = f'{vcfdir}/{{chromosome}}/{{chromosome}}_summary/' -# conda: 'bcftools' -# benchmark: 'benchmarks/{chromosome}.summary.benchmark.txt' -# shell: -# r""" -# bcftools stats {input.vcf_file} > {output.summary} -# plot-vcfstats -p {params.plots} {output.summary} -# """ \ No newline at end of file + # rm {params.prefix}.recode.vcf + """ diff --git a/Snakemake/workflow/rules/new_infer_trees.smk b/Snakemake/workflow/rules/new_infer_trees.smk index 
930c3ca..0dec01f 100644 --- a/Snakemake/workflow/rules/new_infer_trees.smk +++ b/Snakemake/workflow/rules/new_infer_trees.smk @@ -8,7 +8,8 @@ rule prepare_sample_file: input: - vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf.gz', + vcf = input_ts, + # f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf.gz', meta = config['meta'] output: f'{projdir}/Tsinfer/samples/{{chromosome}}.samples' params: @@ -16,103 +17,92 @@ rule prepare_sample_file: conda: "HLab_tsinfer" threads: 32 resources: cpus = 32, mem_mb = 768000, time_min = 200 - benchmark: - 'benchmarks/{chromosome}_new.prep_sample.benchmark.txt' + benchmark: 'benchmarks/{chromosome}_new.prep_sample.benchmark.txt' shell: r""" ulimit -v python scripts/generate_samples.py \ - {input.vcf} {input.meta} {output} {params.chrLength} + {input.vcf} {output} {input.meta} {params.chrLength} {threads} """ rule generate_ancestors: input: - samples = f'{projdir}/Tsinfer/samples/{{chromosome}}.samples', + samples = rules.prepare_sample_file.output, sites_to_exclude = f'{vcfdir}/{{chromosome}}/{{chromosome}}.exclude', output: ancestors_samples = f'{projdir}/Tsinfer/samples/{{chromosome}}.ancestors', conda: 'HLab_tsinfer' threads: 64 resources: cpus = 64, mem_mb = 2048000, time_min = 1680 - benchmark: - 'benchmarks/{chromosome}_generate_ancestors.benchmark.txt' + benchmark: 'benchmarks/{chromosome}_generate_ancestors.benchmark.txt' shell: r""" # Check amount of memory (in kbytes) as seen by the job ulimit -v python scripts/generate_ancestors.py {input.samples} {input.sites_to_exclude} {output.ancestors_samples} {threads} """ -# tsinfer generate-ancestors {input} \ -# --num-threads {threads} \ -# --num-flush-threads {threads} \ -# --progress \ -# --verbosity \ -# --log-section tsinfer.inference - rule truncate_ancestors: input: - ancestors_samples = f'{projdir}/Tsinfer/samples/{{chromosome}}.ancestors', + ancestors_samples = rules.generate_ancestors.output.ancestors_samples, output: truncated_ancestors = f'{projdir}/Tsinfer/samples/{{chromosome}}.ancestors.truncated', - # params: - # upper = 0.4, - # lower = 0.2, + params: + upper = config['truncate_upper'], + lower = config['truncate_lower'], conda: 'HLab_tsinfer' threads: 32 resources: cpus=32, mem_mb=2048000, time_min=1680 - benchmark: - 'benchmarks/{chromosome}_truncate.benchmark.txt' + benchmark: 'benchmarks/{chromosome}_truncate.benchmark.txt' shell: r""" ulimit -v - python scripts/truncate.py {input} {output} + python scripts/truncate.py {input} {params.upper} {params.lower} {output} """ rule ancestors_trees: input: - samples = f'{projdir}/Tsinfer/samples/{{chromosome}}.samples', - truncated_ancestors = f'{projdir}/Tsinfer/samples/{{chromosome}}.ancestors.truncated', + samples = rules.prepare_sample_file.output, + truncated_ancestors = rules.truncate_ancestors.output.truncated_ancestors, output: ancestors_trees = f'{projdir}/Tsinfer/trees/{{chromosome}}.atrees', params: - re = '1.1e-8', - mima = '1' + re = f"--recombination-rate {config['recombination_rate_anc']}" if config['recombination_rate_anc'] else '', + misma = f"--mismatch-ratio {config['mismatch_ratio_anc']}" if config['mismatch_ratio_anc'] else '', conda: 'HLab_tsinfer' threads: 32 resources: cpus = 32, mem_mb = 2048000, time_min = 1680 - benchmark: - 'benchmarks/{chromosome}_ancestors_trees.benchmark.txt' + benchmark: 'benchmarks/{chromosome}_ancestors_trees.benchmark.txt' shell: r""" tsinfer match-ancestors {input.samples} \ -A {output} \ + {params.re} \ + {params.misma} \ --log-section tsinfer.inference \ --num-threads 
{threads} \ --progress \ --verbosity \ --ancestors {input.truncated_ancestors} """ -# --recombination-rate {params.re} \ - # --mismatch-ratio {params.mima} \ - rule infer_trees: input: - samples = f'{projdir}/Tsinfer/samples/{{chromosome}}.samples', - ancestors_trees = f'{projdir}/Tsinfer/trees/{{chromosome}}.atrees', + samples = rules.prepare_sample_file.output, + ancestors_trees = rules.ancestors_trees.output.ancestors_trees, output: trees = f'{projdir}/Tsinfer/trees/{{chromosome}}.trees', params: - re = '1.1e-8', - mima = '1' + re = f"--recombination-rate {config['recombination_rate_tree']}" + if config['recombination_rate_tree'] else '', + misma = f"--mismatch-ratio {config['mismatch_ratio_tree']}" + if config['mismatch_ratio_tree'] else '', conda: 'HLab_tsinfer' threads: 32 resources: cpus=32, mem_mb=2048000, time_min=1680 - benchmark: - 'benchmarks/{chromosome}_infer.benchmark.txt' + benchmark: 'benchmarks/{chromosome}_infer.benchmark.txt' shell: r""" # Check amount of memory (in kbytes) as seen by the job @@ -120,26 +110,30 @@ rule infer_trees: tsinfer match-samples {input.samples} \ -A {input.ancestors_trees} \ --num-threads {threads} \ + {params.re} \ + {params.misma} \ --progress \ --log-section tsinfer.inference \ --verbosity \ -O {output} """ -# --recombination-rate {params.re} \ -# --mismatch-ratio {params.mima} \ rule simplify_trees: input: - trees = f'{projdir}/Tsinfer/trees/{{chromosome}}.trees', + trees = rules.infer_trees.output.trees, output: - simplified_trees = f'{projdir}/Tsinfer/trees/{{chromosome}}.simplified.trees', + f'{projdir}/Tsinfer/trees/{{chromosome}}.dated.trees', + params: + genint = config['generation_interval'], + mutrate = config['mutation_rate'], conda: 'HLab_tsinfer' threads: 32 resources: cpus=32, mem_mb=2048000, time_min=1680 - benchmark: - 'benchmarks/{chromosome}_simplify.benchmark.txt' + benchmark: 'benchmarks/{chromosome}_simplify.benchmark.txt' shell: r""" ulimit -v - python scripts/simplify.py {input} {output} - """ \ No newline at end of file + python scripts/simplify.py {input} {params.genint} {params.mutrate} {output} + """ + +# ,[^.] 
\ No newline at end of file
diff --git a/Snakemake/workflow/rules/phasing.smk b/Snakemake/workflow/rules/phasing.smk
index 53e59db..6ea0fad 100644
--- a/Snakemake/workflow/rules/phasing.smk
+++ b/Snakemake/workflow/rules/phasing.smk
@@ -1,29 +1,27 @@
-file_to_phase = rules.remove_missing_indels.output.vcf
-file_to_phase_idx = rules.remove_missing_indels.output.index
+# Phase the files if the species is diploid, using the output from the
+# remove_missing_indels rule
+
+file_to_phase = f'{vcfdir}/{{chromosome}}/{{chromosome}}.recode.vcf.gz'
+file_to_phase_idx = f'{vcfdir}/{{chromosome}}/{{chromosome}}.recode.vcf.gz.csi'
 phased_output = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz'
 phased_output_idx = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz.csi'
 
+checkpoint decide_start_point:
+    output: "workflow_start.txt"
+    shell:
+        """
+        if [ "{config[phase_only]}" = "False" ] && [ -f {phased_output} ] && [ -f {phased_output_idx} ]; then
+            echo "ancestral_info_to_vcf" > {output}
+        else
+            echo "start" > {output}
+        fi
+        """
+
 if config['ploidy'] == 1:
-    rule rename_phased:
-        input:
-            vcf = file_to_phase,
-            idx = file_to_phase_idx,
-        output:
-            vcf = temp(phased_output),
-            idx = temp(phased_output_idx)
-        log: 'logs/Rename_phased_{chromosome}.log'
-        resources: cpus=1, mem_mb=32000, time_min=60
-        shell:
-            """
-            if [ -h {input.vcf} ]; then
-                ln -s $( realpath {input.vcf} ) {output.vcf}
-                ln -s $( realpath {input.vcf} ).csi {output.vcf}
-            else
-                ln -s {input.vcf} {output.vcf}
-                ln -s {input.idx} {output.idx}
-            fi
-            """
+    input_ancestral = file_to_phase
+    input_ancestral_idx = file_to_phase_idx
+
 else:
     rule phase:
         input:
@@ -36,8 +34,7 @@ else:
         threads: 32
         resources: cpus = 32, mem_mb = 2048, time_min = 7200
         conda: 'shapeit4am'
-        benchmark:
-            'benchmarks/{chromosome}.shapeit.benchmark.txt'
+        benchmark: 'benchmarks/{chromosome}.shapeit.benchmark.txt'
         shell:
             r"""
             str='{wildcards.chromosome}'
@@ -53,15 +50,17 @@ else:
             --log {wildcards.chromosome}_phased.log
             """
 
-    rule index_phased:
-        input:
-            rules.phase.output.vcf
-        output:
-            idx = phased_output_idx
+    rule index_phased_cleanup:
+        input: rules.phase.output
+        output: phased_output_idx
+        params:
+            chr = lambda wildcards: wildcards.chromosome[3:],
+            dire = f'{vcfdir}/{{chromosome}}',
         conda: 'bcftools'
-        benchmark:
-            'benchmarks/{chromosome}.bcftools_index.bechmark.txt'
+        benchmark: 'benchmarks/{chromosome}.bcftools_index.benchmark.txt'
         shell:
             r"""
-            bcftools index -f {input}
-            """
+            bcftools index {input}
+            """
+    input_ancestral = phased_output
+    input_ancestral_idx = phased_output_idx
\ No newline at end of file
diff --git a/Snakemake/workflow/rules/prepare_files_for_tsinfer.smk b/Snakemake/workflow/rules/prepare_files_for_tsinfer.smk
index cd855b2..52565f4 100644
--- a/Snakemake/workflow/rules/prepare_files_for_tsinfer.smk
+++ b/Snakemake/workflow/rules/prepare_files_for_tsinfer.smk
@@ -5,19 +5,20 @@
 
 rule get_af:
-    input: f'{vcfdir}/{{chromosome}}_phased.vcf.gz'
+    input: f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz'
     output:
-        info = temp(f'{vcfdir}/Info{{chromosome}}.INFO')
+        vcf = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_info.vcf.gz'),
+        info = temp(f'{vcfdir}/{{chromosome}}/Info{{chromosome}}.INFO')
     params:
-        prefix=f'{vcfdir}/Info{{chromosome}}'
+        prefix=f'{vcfdir}/{{chromosome}}/Info{{chromosome}}'
     conda: "bcftools"
     threads: 1
     resources: cpus=1, mem_mb=32000, time_min=60
     log: 'logs/get_af_{chromosome}.log'
     shell:
         """
-        bcftools +fill-tags {input} -Oz -o {input} -- -t AN,AC,AF
-        vcftools --gzvcf {input} --out 
{params.prefix} --get-INFO AC --get-INFO AF + bcftools +fill-tags {input} -Oz -o {output.vcf} -- -t AN,AC,AF + vcftools --gzvcf {output.vcf} --out {params.prefix} --get-INFO AC --get-INFO AF LOGFILE={params.prefix}.log if test -f "$LOGFILE"; then @@ -27,12 +28,10 @@ rule get_af: rule get_major: input: rules.get_af.output.info - output: temp(f'{vcfdir}/Major{{chromosome}}.txt') + output: temp(f'{vcfdir}/{{chromosome}}/Major{{chromosome}}.txt') threads: 1 resources: cpus=1, mem_mb=32000, time_min=60 log: 'logs/Get_major_{chromosome}.log' -# wildcard_constraints: -# chromosome="^chr | ^Chr" shell: """ awk '{{if (NR!=1 && $5>=0.5) {{print $1"_"$2","$4}} else if (NR!=1 && $5<0.5) {{print $1"_"$2","$3}}}}' {input} > {output} @@ -42,7 +41,7 @@ rule extract_ancestral_chromosome: input: ancestral_file output: - temp(f'{vcfdir}/Ancestral{{chromosome}}.txt') + temp(f'{vcfdir}/{{chromosome}}/Ancestral{{chromosome}}.txt') params: chrNum = lambda wc: wc.get('chromosome')[3:] log: 'logs/Extract_ancestral_chromosome_{chromosome}.log' @@ -57,7 +56,7 @@ rule join_major_ancestral: ancestral = rules.extract_ancestral_chromosome.output, major = rules.get_major.output output: - temp(f'{vcfdir}/MajAAnc_{{chromosome}}.txt') + temp(f'{vcfdir}/{{chromosome}}/MajAAnc_{{chromosome}}.txt') log: 'logs/Join_major_ancestral_{chromosome}.log' resources: cpus=1, mem_mb=32000, time_min=60 shell: @@ -69,7 +68,7 @@ rule determine_ancestral_major: input: rules.join_major_ancestral.output output: - temp(f'{vcfdir}/MajOAnc_{{chromosome}}.txt') + temp(f'{vcfdir}/{{chromosome}}/MajOAnc_{{chromosome}}.txt') log: 'logs/Determine_major_ancestral_{chromosome}.log' shell: """ @@ -78,9 +77,9 @@ rule determine_ancestral_major: rule decompress: input: - vcf=f'{vcfdir}/{{chromosome}}_phased.vcf.gz', - majororaa=rules.determine_ancestral_major.output - output: f'{vcfdir}/{{chromosome}}_phased.vcf' # this is removing both the .gz and the decompressed file + vcf = rules.get_af.output.vcf, + majororaa = rules.determine_ancestral_major.output + output: f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf' threads: 1 resources: cpus=1, mem_mb=32000, time_min=30 log: 'logs/Decompress_{chromosome}.log' @@ -92,9 +91,9 @@ rule decompress: rule change_infoAA_vcf: input: - vcf=rules.decompress.output, - ancestralAllele=rules.determine_ancestral_major.output - output: f'{vcfdir}/{{chromosome}}_ancestral.vcf' + vcf = rules.decompress.output, + ancestralAllele = rules.determine_ancestral_major.output + output: f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf' conda: "bcftools" threads: 1 resources: cpus=1, mem_mb=64000, time_min=120 @@ -109,17 +108,23 @@ rule change_infoAA_vcf: """ rule compress_vcf: - input: rules.change_infoAA_vcf.output, + input: + vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf', output: - file = f'{vcfdir}/{{chromosome}}_ancestral.vcf.gz', - idx = f'{vcfdir}/{{chromosome}}_ancestral.vcf.gz.csi' + vcf = f'{vcfdir}/{{chromosome}}_ancestral.vcf.gz', + idx = f'{vcfdir}/{{chromosome}}_ancestral.vcf.gz.csi', + params: + chrdir = f'{vcfdir}/{{chromosome}}/', + outdir = f'{vcfdir}/' conda: "bcftools" threads: 1 resources: cpus=1, mem_mb=64000, time_min=60 log: 'logs/Compress_{chromosome}.log' shell: + r""" + bgzip {input.vcf} + bcftools index {input.vcf}.gz + + mv {input}.gz {input}.gz.csi {params.outdir} + rm -r {params.chrdir} """ - bgzip {input} - bcftools index -f {output.file} - """ -# keeping the phased files w/o ancestral alleles diff --git a/Snakemake/workflow/rules/split.smk b/Snakemake/workflow/rules/split.smk 
index 139bc49..114a7db 100644 --- a/Snakemake/workflow/rules/split.smk +++ b/Snakemake/workflow/rules/split.smk @@ -1,12 +1,29 @@ +# This script is used for managing VCF files in a directory previous to tree inference +# It identifies two types of VCF files in the 'RawVCF' directory: +# 1. Files that start with either "Chr" or "chr" and end with "vcf.gz" (referred to as 'splitFiles') +# 2. Files that start with "Combined" and end with "vcf.gz" (referred to as 'combinedFiles') +# It then creates regex patterns for these file names for later use in rules. +# The script also combines the names of both types of files into a single list (allFiles). +# If there are any 'splitFiles', it defines a rule (move_vcf) to move these files +# from the 'RawVCF' directory to a directory named after the chromosome in the file name. +# The rule also renames the files to include the chromosome name at the start of the file name. +import os +import re +print(chromosomes) splitFiles = list(set([f.strip("vcf.gz").split("_")[1] for f in os.listdir(f'{vcfdir}/RawVCF') - if (f.endswith("vcf.gz") and (f.startswith("Chr") | f.startswith("chr")) )])) + if (f.endswith("vcf.gz") and (f.startswith("Chr") | f.startswith("chr")) and "_" in f)])) combinedFiles = list(set([f.strip("vcf.gz").split("_")[1] for f in os.listdir(f'{vcfdir}/RawVCF') if (f.endswith("vcf.gz") and f.startswith("Combined"))])) +# Prepare a regex pattern that matches any of the file names in splitFiles and +# combinedFiles so rules only look for them +splitFiles_pattern = "(" + "|".join(re.escape(name) for name in splitFiles) + ")" +combinedFiles_pattern = "(" + "|".join(re.escape(name) for name in combinedFiles) + ")" + allFiles = splitFiles + combinedFiles if len(splitFiles) > 0: @@ -20,7 +37,9 @@ if len(splitFiles) > 0: conda: "bcftools" threads: 1 resources: cpus=1, mem_mb=32000, time_min=60 - log: 'logs/Move_vcf_{chromosome}_{suffixOne}.log' + benchmark: 'benchmarks/{chromosome}.{suffixOne}.move.benchmark.txt' + wildcard_constraints: + suffixOne = splitFiles_pattern shell: """ ln -s {input.vcf} {output.vcf} @@ -32,13 +51,15 @@ if len(combinedFiles) > 0: input: f'{vcfdir}/RawVCF/Combined_{{suffixTwo}}.vcf.gz' output: - [f'{vcfdir}' + x + temp([f'{vcfdir}' + x for x in expand('/{{chromosome}}/{{chromosome}}_{{suffixTwo}}.vcf.gz', - chromosome = chromosomes, suffixTwo = combinedFiles)] + chromosome = chromosomes, suffixTwo = combinedFiles)]) conda: "bcftools" threads: 1 resources: cpus=1, mem_mb=64000, time_min=60 - log: expand('logs/Split_and_move_vcfs_{{chromosome}}_{{suffixTwo}}.log') + benchmark: 'benchmarks/{chromosome}.{suffixTwo}.split.benchmark.txt' + wildcard_constraints: + suffixTwo = combinedFiles_pattern shell: """ str='{wildcards.chromosome}' diff --git a/Snakemake/workflow/scripts/Infer_trees.py b/Snakemake/workflow/scripts/Infer_trees.py index 50810fb..41f6b53 100644 --- a/Snakemake/workflow/scripts/Infer_trees.py +++ b/Snakemake/workflow/scripts/Infer_trees.py @@ -44,6 +44,8 @@ progress_monitor=True, ) +ancestors_ts.dump(f'{outputfile}.ancestors') + ts = tsinfer.match_samples( sampleFile, ancestors_ts, @@ -53,25 +55,24 @@ progress_monitor=True, ).simplify(keep_unary=False) -""" -ts = tsinfer.infer(sampleFile) +#ts = tsinfer.infer(sampleFile) print( "Inferred tree sequence `{}`: {} trees over {} Mb".format( - "drone_ts", ts.num_trees, ts.sequence_length / 1e6 + "ts", ts.num_trees, ts.sequence_length / 1e6 ) ) -""" -# # Check the metadata -# for sample_node_id in ts.samples(): -# individual_id = ts.node(sample_node_id).individual -# 
population_id = ts.node(sample_node_id).population
-#     print(
-#         "Node",
-#         sample_node_id,
-#         "labels genome sampled from",
-#         json.loads(ts.individual(individual_id).metadata),
-#         "in",
-#         json.loads(ts.population(population_id).metadata)["subpop"],
-#     )
+
+# Check the metadata
+for sample_node_id in ts.samples():
+    individual_id = ts.node(sample_node_id).individual
+    population_id = ts.node(sample_node_id).population
+    print(
+        "Node",
+        sample_node_id,
+        "labels genome sampled from",
+        json.loads(ts.individual(individual_id).metadata),
+        "in",
+        json.loads(ts.population(population_id).metadata)["subpop"],
+    )
 
 ts.dump(outputFile)
diff --git a/Snakemake/workflow/scripts/Prepare_tsinfer_sample_file.py b/Snakemake/workflow/scripts/Prepare_tsinfer_sample_file.py
index 84972c3..5d9c45e 100644
--- a/Snakemake/workflow/scripts/Prepare_tsinfer_sample_file.py
+++ b/Snakemake/workflow/scripts/Prepare_tsinfer_sample_file.py
@@ -8,6 +8,8 @@
 import numpy as np
 import pandas as pd
 import pkg_resources
+from itertools import chain
+from tsinfer import MISSING_DATA
 
 print(sys.executable)
 
@@ -70,7 +72,9 @@ def add_sites(vcf, samples, ploidy):
     for variant in vcf:  # Loop over variants, each assumed at a unique site
         progressbar.update(variant.POS - pos)
         if pos == variant.POS:
-            raise ValueError("Duplicate positions for variant at position", pos)
+            print("Duplicate positions for variant at position", pos)
+            continue
+
         else:
             pos = variant.POS
         if ploidy > 1:
@@ -83,10 +87,10 @@ def add_sites(vcf, samples, ploidy):
                 alleles = alleles + [letter for letter in string]
                 #ALT = ALT + [letter for letter in string]
 
-        print(alleles)
+        #print(alleles)
         # ignores non-biallelic sites
         if len(alleles) > 2:
-            print('Ignoring non-biallelic site')
+            # print('Ignoring non-biallelic site')
             continue
 
         # if len([*ancestral]) > 1:
@@ -95,7 +99,7 @@ def add_sites(vcf, samples, ploidy):
         #     #continue
 
         ancestral = variant.INFO.get("AA", variant.REF)
-        print(f'alleles: {alleles} | ancestal: {[*ancestral]}')
+        #print(f'alleles: {alleles} | ancestral: {[*ancestral]}')
 
         try:
             ancestral_allele = alleles.index(ancestral[0])
@@ -111,6 +115,7 @@ def add_sites(vcf, samples, ploidy):
             ancestral_allele=alleles.index(ancestral[0]))
 
 
+
 def add_populations(vcf, samples, metaData):
     """
     This is a modified add populations function for drones from David Wragg data
@@ -121,10 +126,23 @@ def add_populations(vcf, samples, metaData):
     """
     pop_lookup = {}
     metaPop = metaData.iloc[:, 1:].drop_duplicates().dropna(axis = 0, how = 'all')
-    sample_subpop = [list(metaData.SubPop[metaData.ID == x]) for x in vcf.samples]
-    for subpop, pop in zip(metaPop.SubPop, metaPop.Pop):
-        pop_lookup[subpop] = samples.add_population(metadata={"pop": pop, "subpop": subpop})
-    return [pop_lookup[subpop[0]] for subpop in sample_subpop]
+
+    # to account for the same subpop occurring in different countries, create a 'fake subpop' to be used as key
+    # if a subpop is repeated, its new code is subpop+origin; otherwise it stays subpop
+    d = [a if not (sum(j == a for j in metaPop.SubPop[:i])) else f'{a}{c}' for (i,a),c in zip(enumerate(metaPop.SubPop),metaPop.Origin)]
+    metaPop['code'] = d
+    # add the code column to the full metadata file
+    metaData = pd.merge(metaData, metaPop[['Pop', 'SubPop', 'Origin', 'code']], on = ['Pop', 'SubPop', 'Origin'], how = 'inner')
+
+    # list population per sample based on recoded subpop
+    sample_subpop = [list(metaData.code[metaData.ID == x]) for x in vcf.samples]
+
+    # the dictionary key is based on the recoding - so it keeps the correct number of populations,
+    # but the metadata stores the real subpop 
value + for subpop, pop, origin, lat, lon, code in zip(metaPop.SubPop, metaPop.Pop, metaPop.Origin, metaPop.Lat, metaPop.Lon, metaPop.code): + pop_lookup[code] = samples.add_population(metadata={"pop": pop, "subpop": subpop, "origin": origin, "coord1": lat, "coord2":lon}) + return [pop_lookup[code[0]] for code in sample_subpop] + ####################################################################### # Read in the information and prepare the data @@ -138,7 +156,7 @@ def add_populations(vcf, samples, metaData): # Create samples for haploid data with tsinfer.SampleData(path=sampleFile, sequence_length=chrLength, - num_flush_threads=20, max_file_size=2**30) as samples: + num_flush_threads=16, max_file_size=2**30) as samples: populations = add_populations(vcf = vcfD, samples = samples, metaData = metaFile) print("populations determined") add_individuals(vcf = vcfD, samples = samples, populations = populations, metaData = metaFile, ploidy = ploidy) @@ -148,5 +166,6 @@ def add_populations(vcf, samples, metaData): print( "Sample file created for {} samples ".format(samples.num_samples) + "({} individuals) ".format(samples.num_individuals) - + "with {} variable sites.".format(samples.num_sites) -) + + "with {} variable sites.".format(samples.num_sites), + flush=True, +) \ No newline at end of file diff --git a/Snakemake/workflow/scripts/generate_ancestors.py b/Snakemake/workflow/scripts/generate_ancestors.py index 1e4ce05..9a8ef82 100644 --- a/Snakemake/workflow/scripts/generate_ancestors.py +++ b/Snakemake/workflow/scripts/generate_ancestors.py @@ -1,5 +1,6 @@ import sys import tsinfer +import numcodecs import numpy as np args = sys.argv @@ -9,6 +10,7 @@ output = args[3] threads = int(args[4]) +gzip_compressor = numcodecs.GZip(level=1) ancestors = tsinfer.generate_ancestors( sample_data=samples, @@ -16,4 +18,5 @@ exclude_positions=sites_to_exclude, progress_monitor=True, num_threads=threads, + compressor=gzip_compressor, ) \ No newline at end of file diff --git a/Snakemake/workflow/scripts/generate_samples.py b/Snakemake/workflow/scripts/generate_samples.py index 9db40a9..3543fc7 100644 --- a/Snakemake/workflow/scripts/generate_samples.py +++ b/Snakemake/workflow/scripts/generate_samples.py @@ -6,10 +6,13 @@ import json import cyvcf2 import tsinfer +import numcodecs import numpy as np import pandas as pd from tqdm import tqdm +# create a GZip compressor instance with a specific compression level +gzip_compressor = numcodecs.GZip(level=1) def add_populations(vcf, samples, metaData): """ @@ -88,13 +91,33 @@ def add_diploid_sites(vcf, samples): genotypes = [g for row in variant.genotypes for g in row[0:2]] samples.add_site(pos, genotypes, alleles, ancestral_allele=ancestral_allele) +def process_samples(file, metadata, sample_path, chr_length, compressor, threads): + with tsinfer.SampleData(path=sample_path, + sequence_length=chr_length, + compressor=compressor, + num_flush_threads=threads, max_file_size=2**30) as samples: + populations = add_populations(vcf=file, samples=samples, metaData=metadata) + print("populations determined") + add_diploid_individuals(vcf=file, samples=samples, populations=populations) + print("individuals added") + add_diploid_sites(vcf=file, samples=samples) + print("sites added") + + print( + "Sample file created for {} samples ".format(samples.num_samples) + + "({} individuals) ".format(samples.num_individuals) + + "with {} variable sites.".format(samples.num_sites), + flush=True, + ) + # 
----------------------------------------------------------------------------------------- args = sys.argv vcfFile = args[1] -meta = args[2] -sampleFile = args[3] +sampleFile = args[2] +meta = args[3] chrLength = args[4] +nthreads = int(args[5]) metaFile = pd.read_csv(meta) @@ -102,19 +125,45 @@ def add_diploid_sites(vcf, samples): vcfD = cyvcf2.VCF(vcfFile, strict_gt=True) # Create samples for haploid data -with tsinfer.SampleData(path=sampleFile, - sequence_length=chrLength, - num_flush_threads=16, max_file_size=2**30) as samples: - populations = add_populations(vcf = vcfD, samples = samples, metaData = metaFile) - print("populations determined") - add_diploid_individuals(vcf = vcfD, samples = samples, populations = populations) - print("individuals added") - add_diploid_sites(vcf = vcfD, samples = samples) - print("sites added") +try: + process_samples(vcfD, metaFile, sampleFile, chrLength, gzip_compressor, nthreads) +except Exception as e: + print(f"Error encountered: {e}. Retrying with num_flush_threads set to 16.") + process_samples(vcfD, metaFile, sampleFile, chrLength, gzip_compressor, 16) + +# with tsinfer.SampleData(path=sampleFile, +# sequence_length=chrLength, +# num_flush_threads=nthreads, max_file_size=2**30) as samples: +# populations = add_populations(vcf = vcfD, samples = samples, metaData = metaFile) +# print("populations determined") +# add_diploid_individuals(vcf = vcfD, samples = samples, populations = populations) +# print("individuals added") +# add_diploid_sites(vcf = vcfD, samples = samples) +# print("sites added") -print( - "Sample file created for {} samples ".format(samples.num_samples) - + "({} individuals) ".format(samples.num_individuals) - + "with {} variable sites.".format(samples.num_sites), - flush=True, -) +# ----------------------------------------------------------------------------------------- +# def execute_with_threads(nthreads): +# try: +# with tsinfer.SampleData(path=sampleFile, +# sequence_length=chrLength, +# compressor=gzip_compressor, +# num_flush_threads=nthreads, max_file_size=2**30) as samples: +# populations = add_populations(vcf=vcfD, samples=samples, metaData=metaFile) +# print("populations determined") +# add_diploid_individuals(vcf=vcfD, samples=samples, populations=populations) +# print("individuals added") +# add_diploid_sites(vcf=vcfD, samples=samples) +# print("sites added") +# except Exception as e: +# print(f"Error encountered: {e}. 
Retrying with num_flush_threads set to 16.") +# with tsinfer.SampleData(path=sampleFile, +# sequence_length=chrLength, +# compressor=gzip_compressor, +# num_flush_threads=16, max_file_size=2**30) as samples: +# populations = add_populations(vcf=vcfD, samples=samples, metaData=metaFile) +# print("populations determined") +# add_diploid_individuals(vcf=vcfD, samples=samples, populations=populations) +# print("individuals added") +# add_diploid_sites(vcf=vcfD, samples=samples) +# print("sites added") +# execute_with_threads(nthreads) diff --git a/Snakemake/workflow/scripts/simplify.py b/Snakemake/workflow/scripts/simplify.py index f59ebfa..7659671 100644 --- a/Snakemake/workflow/scripts/simplify.py +++ b/Snakemake/workflow/scripts/simplify.py @@ -1,9 +1,29 @@ import sys import tskit +import subprocess +# upgrade to the latest version of tsdate +subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "tsdate"]) +import tsdate args = sys.argv ts = tskit.load(args[1]) -output = args[2] +generation_interval = float(args[2]) +mu = float(args[3]) +output = args[4] -ts_simplified = ts.simplify(keep_unary=False) -ts_simplified.dump(output) \ No newline at end of file +# Ne = ts.diversity()/4/mu + +processed_ts = tsdate.preprocess_ts( + ts, + split_disjoint=True, + keep_unary=False, + ) + +dated_ts = tsdate.variational_gamma(tree_sequence = processed_ts, + mutation_rate = mu, + eps = generation_interval, + max_iterations = 100, + time_units = 'generations', + progress = True,) + +dated_ts.dump(output) \ No newline at end of file diff --git a/Snakemake/workflow/scripts/truncate.py b/Snakemake/workflow/scripts/truncate.py index 17037bc..1fbf0f4 100644 --- a/Snakemake/workflow/scripts/truncate.py +++ b/Snakemake/workflow/scripts/truncate.py @@ -3,16 +3,15 @@ args = sys.argv ancestors = tsinfer.load(args[1]) -#uprtime = args[2] -#lwertime = args[3] -output = args[2] +uprtime = float(args[2]) +lwertime = float(args[3]) +output = args[4] ancestors.truncate_ancestors( path=output, max_file_size=2**38, - lower_time_bound=0.2, - upper_time_bound=0.4, + lower_time_bound=lwertime, + upper_time_bound=uprtime, length_multiplier=2, ) - diff --git a/Snakemake/workflow/scripts/tsparam.py b/Snakemake/workflow/scripts/tsparam.py index 8634e20..676e691 100644 --- a/Snakemake/workflow/scripts/tsparam.py +++ b/Snakemake/workflow/scripts/tsparam.py @@ -1,5 +1,5 @@ # parameters for tsinfer --- change accordingly -threads=20 +threads=16 lwertime=0.4 uprtime=0.6 lenmultiply=2 From 5606ece6beacac61100aad952a46f789b1d88338 Mon Sep 17 00:00:00 2001 From: Gabriela Mafra Fortuna <88657362+gmafrafortuna@users.noreply.github.com> Date: Thu, 25 Jul 2024 06:30:32 +0200 Subject: [PATCH 13/15] Update Snakefile --- Snakemake/workflow/Snakefile | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/Snakemake/workflow/Snakefile b/Snakemake/workflow/Snakefile index 27b38e1..1f6de40 100644 --- a/Snakemake/workflow/Snakefile +++ b/Snakemake/workflow/Snakefile @@ -15,11 +15,6 @@ from os.path import isfile, join configfile: '../config/glob_cattle.yaml' report: "reports/tsinfer.rst" -# specify config files on the command line using --configfile -#configfile: '../config/ancestral_Eddie.yaml' -#configfile: '../config/tsinfer_Eddie.yaml' - - ##### setup singularity ##### # Can we set conda env here??? @@ -31,27 +26,13 @@ report: "reports/tsinfer.rst" # (2) sort into the correct Chr folder in vcfDir. 
# defining variables that will be used throughout the script - -# the final directory name is now editable -- so can be named according to specific project -# mapdir indicates where to find the genetic map for phasing (full path in the config) -mapdir = config['genmap'] -oDir = Path(config['o_dir'], config['PROJECT']) - -vcfdir=config['vcf_dir'] # from config file, the directory containing the VCF(s) to process -# the path where to save intermetdiate and final VCFs -if config['process_vcf_in_original_dir']: - vcfOut = config['vcf_dir'] # same as above -else: - vcfOut = f"{oDir}/VCF" # VCF dir inside the output dir - - +vcfdir=config['vcf_dir'] # from config file, the path to vcf # this folder can be any where but must have the following structure: # vcfDir: # - RawVCF # - Chr1 # -Chr... - # the final directory name is now editable -- so can be named according to specific project # mapdir indicates where to find the genetic map for phasing (full path in the config) Project = config['PROJECT'] @@ -86,4 +67,4 @@ else: # each chromosome. These files are stored in the vcf folder declaired in the # configuration file. It also checks that vcfs have been filtered before merged rule all: - input: rule_all_input \ No newline at end of file + input: rule_all_input From f4cc4b11cf27c0b59c38d60e928e3ac0f1e8e893 Mon Sep 17 00:00:00 2001 From: Gabriela Mafra Fortuna <88657362+gmafrafortuna@users.noreply.github.com> Date: Thu, 25 Jul 2024 16:09:18 +0200 Subject: [PATCH 14/15] Update ancestral_info_to_vcf.smk --- Snakemake/workflow/rules/ancestral_info_to_vcf.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Snakemake/workflow/rules/ancestral_info_to_vcf.smk b/Snakemake/workflow/rules/ancestral_info_to_vcf.smk index f69bee0..1aeb6c0 100644 --- a/Snakemake/workflow/rules/ancestral_info_to_vcf.smk +++ b/Snakemake/workflow/rules/ancestral_info_to_vcf.smk @@ -44,7 +44,7 @@ else: vcf_file = input_ancestral, idx_file = input_ancestral_idx output: f'{vcfdir}/{{chromosome}}/{{chromosome}}_analysis.vcf' - conda: bcftools + conda: 'bcftools' benchmark: 'benchmarks/{chromosome}.decompress.benchmark.txt' shell: r""" @@ -138,4 +138,4 @@ rule compress_index_vcf: bgzip -c {input} > {output.vcf_file} bcftools index {output.vcf_file} """ -input_ts = rules.compress_index_vcf.output.vcf_file \ No newline at end of file +input_ts = rules.compress_index_vcf.output.vcf_file From a911031ec144115d7edfb9484de099b32488d0f8 Mon Sep 17 00:00:00 2001 From: Gabriela Mafra Fortuna <88657362+gmafrafortuna@users.noreply.github.com> Date: Fri, 26 Jul 2024 09:30:29 +0200 Subject: [PATCH 15/15] Update cluster.json --- Snakemake/workflow/profile/cluster.json | 156 +++++------------------- 1 file changed, 33 insertions(+), 123 deletions(-) diff --git a/Snakemake/workflow/profile/cluster.json b/Snakemake/workflow/profile/cluster.json index 574c29c..2670dad 100644 --- a/Snakemake/workflow/profile/cluster.json +++ b/Snakemake/workflow/profile/cluster.json @@ -1,21 +1,27 @@ { "__default__": { - "mem": "32G", - "threads": "16", + "mem": "8G", + "threads": "1", "time": "10:00:00" }, + "compare_and_filter_vcfs": + { + "mem": "16G", + "threads": "8", + "time": "20:00:00" + }, "merge": { - "mem": "64G", - "threads": "32", + "mem": "16G", + "threads": "8", "time": "20:00:00" }, - "rename": + "remove_missing_indels": { - "mem": "32G", - "threads": "1", - "time": "03:00:00" + "mem": "16G", + "threads": "16", + "time": "05:00:00" }, "phase": { @@ -23,25 +29,25 @@ "threads": "32", "time": "240:00:00" }, - "filter_positions": + 
"get_vcf_positions": { - "mem": "64G", + "mem": "16G", "threads": "32", "time": "05:00:00" }, - "extract_ancestral_chromosome": + "get_ancestral_alleles": { "mem": "32G", "threads": "1", "time": "02:00:00" }, - "join_major_ancestral": + "update_regions_file": { "mem": "32G", "threads": "1", "time": "02:00:00" }, - "determine_ancestral_major": + "add_info_to_vcf": { "mem": "32G", "threads": "1", @@ -53,136 +59,40 @@ "threads": "1", "time": "03:00:00" }, - "sort_ancestral_to_vcf": - { - "mem": "64G", - "threads": "1", - "time": "03:00:00" - }, - "decompress": - { - "mem": "8G", - "threads": "1", - "time": "03:00:00" - }, - "change_infoAA_vcf": - { - "mem": "32G", - "threads": "1", - "time": "05:00:00" - }, "prepare_sample_file": { - "mem": "64G", - "threads": "32", + "mem": "32G", + "threads": "16", "time": "336:00:00" }, "generate_ancestors": { - "mem": "64G", - "threads": "32", + "mem": "32G", + "threads": "16", "time": "336:00:00" }, "truncate_ancestors": { - "mem": "64G", - "threads": "32", + "mem": "32G", + "threads": "16", "time": "336:00:00" }, "ancestors_trees": { - "mem": "64G", - "threads": "32", + "mem": "32G", + "threads": "16", "time": "336:00:00" }, "infer_trees": { - "mem": "64G", - "threads": "32", + "mem": "32G", + "threads": "16", "time": "336:00:00" }, - "extract_aligned_alleles_from_vcf": + "simplify_trees": { - "mem": "16G", - "threads": "1", + "mem": "32G", + "threads": "16", "time": "03:00:00" - }, - "extract_snps_from_info": - { - "mem": "16G", - "threads": "1", - "time": "03:00:00" - }, - "extract_pos_aligned_alleles_from_vcf": - { - "mem": "16G", - "threads": "1", - "time": "03:00:00" - }, - "extract_vcf_alleles_from_aligned": - { - "mem": "16G", - "threads": "1", - "time" : "03:00:00" - }, - "create_estsfs_dicts": - { - "mem": "16G", - "threads": "1", - "time" : "05:00:00" - }, - "edit_estsfs_dicts": - { - "mem": "16G", - "threads": "1", - "time": "01:00:00" - }, - "combine_estsfs_dicts": - { - "mem": "16G", - "threads": "1", - "time": "02:00:00" - }, - "run_estsfs": - { - "mem": "32G", - "threads": "1", - "time": "20:00:00" - }, - "extract_major": - { - "mem": "16G", - "threads": "1", - "time": "02:00:00" - }, - "extract_minor": - { - "mem": "16G", - "threads": "1", - "time": "02:00:00" - }, - "extract_major_outgroup": - { - "mem": "16G", - "threads": "1", - "time": "02:00:00" - }, - "extract_estsfs_prob": - { - "mem": "16G", - "threads": "1", - "time": "02:00:00" - }, - "determine_ancestral": - { - "mem": "16G", - "threads": "1", - "time": "02:00:00" - }, - "move_and_rename_aa": - { - "mem": "16G", - "threads": "1", - "time": "05:00:00" - } + } }