Changes for different types of chromosome name #3

@conniecl

Description

Some changes may be needed when training on your own ATAC-seq dataset, especially when the chromosomes are named 1, 2, 3, ... instead of chr1, chr2, chr3, ...

1. tutorials/train.py, eval.py, predict_snv_atac.py, export_predictions.py

Touch empty placeholder files for basenji_blacklist.bed, test.vcf.gz, and basenji_unmappable.bed, then change the input paths and chromosome format in those scripts, as shown below.
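
For reference, a minimal sketch of creating those placeholders (the directory below just mirrors the tutorials path used in the configs and is illustrative, not part of ASAP; adjust it to your own layout):

    from pathlib import Path

    # placeholder files expected by the tutorial scripts; the directory is illustrative
    tutorials = Path("/mnt/sda/lchen/software/ASAP-main/tutorials")
    for name in ["basenji_blacklist.bed", "test.vcf.gz", "basenji_unmappable.bed"]:
        (tutorials / name).touch()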

1.1 train.py
    # Data paths
    signal_file = "/mnt/sdb/lchen/05.peak/SRR31036590.bw"
    genome = "/mnt/sda/lchen/epi_con/genome/b73_bwa/Zmays.fa"
    blacklist_file = ["/mnt/sda/lchen/software/ASAP-main/tutorials/basenji_blacklist.bed", "/mnt/sda/lchen/software/ASAP-main/tutorials/test.vcf.gz"]
    unmap_file = "/mnt/sda/lchen/software/ASAP-main/tutorials/basenji_unmappable.bed"
    generated = "/mnt/sda/lchen/software/ASAP-main/tutorials/tmp"
    logs_dir = "/mnt/sda/lchen/software/ASAP-main/tutorials/logs"
    # Training parameters
    test_chroms = [2, 5, 10]
    val_chroms = [1, 4, 7]
    train_chroms = [x for x in range(1, 10) if x not in test_chroms and x not in val_chroms]
    n_gpus = 1
1.2 eval.py
    # Data paths
    #signal_file = "/cluster/work/boeva/lkasak/data/TCGA-A6-A567/TCGA-A6-A567.nodup.no_chrM_MT.tn5.pval.signal.bigwig"
    signal_file = "/mnt/sdb/lchen/05.peak/SRR31036591.bw"
    peak_file = "/mnt/sdb/lchen/05.peak/SRR31036591_peaks.narrowPeak"
    #peak_file = "/cluster/work/boeva/lkasak/data/TCGA-A6-A567/TCGA-A6-A567.nodup.no_chrM_MT.tn5.pval0.01.300K.narrowPeak.gz"
    #genome = "/cluster/work/boeva/lkasak/data/hg38.fa"
    genome = "/mnt/sda/lchen/epi_con/genome/b73_bwa/Zmays.fa"
    #blacklist_file = ["/cluster/work/boeva/lkasak/data/basenji_blacklist.bed", "/cluster/work/boeva/lkasak/data/TCGA-A6-A567/TCGA-A6-A567_pcawg_hg38.vcf.gz"]
    blacklist_file = []
    #unmap_file = "/cluster/work/boeva/lkasak/data/basenji_unmappable.bed"
    unmap_file = "/mnt/sda/lchen/software/ASAP-main/tutorials/basenji_unmappable.bed"
    generated = "/mnt/sda/lchen/software/ASAP-main/tutorials/tmp"
    logs_dir = "/mnt/sda/lchen/software/ASAP-main/tutorials/logs"

    # Model parameters
    model_name = "convnext_dcnn"
    experiment_name = "SRR31036590_convnext_dcnn"

    # Training parameters
    test_chroms = [2, 4, 7]
1.3 predict_snv_atac.py (the BigWig output from this step seems to be wrong; I tried to fix it but it may still not work well. Do not use this BigWig; use the one produced by export_predictions.py instead)
    # Data paths
    #signal_file = "/cluster/work/boeva/lkasak/data/TCGA-A6-A567/TCGA-A6-A567.nodup.no_chrM_MT.tn5.pval.signal.bigwig"
    signal_file = "/mnt/sdb/lchen/05.peak/SRR31036591.bw"
    #genome = "/cluster/work/boeva/lkasak/data/hg38.fa"
    genome = "/mnt/sda/lchen/epi_con/genome/b73_bwa/Zmays.fa"
    logs_dir = "/mnt/sda/lchen/software/ASAP-main/tutorials/logs"
    #snv_file = "/cluster/work/boeva/lkasak/data/TCGA-A6-A567/TCGA-A6-A567_pcawg_hg38.vcf.gz"
    snv_file = "/mnt/sda/lchen/epi_con/13.nbt_dai/01.dna/05.vcf/test2.vcf"

    # Model parameters
    model_name = "convnext_dcnn"
    experiment_name = "SRR31036590_convnext_dcnn"

    # Prediction parameters
    #chroms = [*range(1,11)]
    chroms = [9,10]
1.4 export_predictions.py
    # Data paths
    signal_file = "/mnt/sdb/lchen/05.peak/SRR31036591.bw"
    genome = "/mnt/sda/lchen/epi_con/genome/b73_bwa/Zmays.fa"
    blacklist_file = ["/mnt/sda/lchen/software/ASAP-main/tutorials/basenji_blacklist.bed"]
    generated = "/mnt/sda/lchen/software/ASAP-main/tutorials/tmp"
    logs_dir = "/mnt/sda/lchen/software/ASAP-main/tutorials/logs"

    # Model parameters
    model_name = "convnext_dcnn"
    experiment_name = "SRR31036590_convnext_dcnn"

    # Prediction parameters
    #chroms = [*range(1,20)]
    chroms = [9,10]
2. asap/dataloader/utils/io.py (< marks the old line, > marks the new one)
30c30
<     df = df[df['chrom'] == f'chr{chrom}']
---
>     df = df[df['chrom'].astype(str) == str(chrom)]
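
If you want the loader to accept both naming styles instead of hard-coding one, a hedged alternative is to strip any leading 'chr' on both sides before comparing (the _norm helper below is illustrative, not part of ASAP):

    import re

    def _norm(name):
        # drop a leading 'chr' so that 'chr1' and '1' compare equal
        return re.sub(r'^chr', '', str(name))

    df = df[df['chrom'].map(_norm) == _norm(chrom)]
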
3. asap/dataloader/utils/data_bw.py
9a10
> 
11c12
<     coverage = bw.values(f'chr{chrom}', start, end, numpy=True).astype('float16')
---
>     coverage = bw.values(f'{chrom}', start, end, numpy=True).astype('float16')
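
Alternatively, the name can be resolved against the BigWig header itself, so either convention works without editing the format string. A minimal sketch, assuming bw is an open pyBigWig handle (resolve_chrom_name is a hypothetical helper, not an ASAP function):

    def resolve_chrom_name(bw, chrom):
        # bw.chroms() returns a dict of {chromosome name: length} from the BigWig header
        names = bw.chroms()
        for candidate in (str(chrom), f'chr{chrom}'):
            if candidate in names:
                return candidate
        raise KeyError(f"chromosome {chrom!r} not found in BigWig header")

    coverage = bw.values(resolve_chrom_name(bw, chrom), start, end, numpy=True).astype('float16')
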
4. asap/task.py
> from typing import Union, Dict
176c177,178
< def predict_snv_atac(experiment_name: str, model: str, snv_file: str, signal_file: str, logs_dir: str, out_dir: str, genome: str, chroms: List[int]=[*range(1,23)], use_map: bool=False, export_bigwig: str=None, scale: dict | float=1.0):
---
> #def predict_snv_atac(experiment_name: str, model: str, snv_file: str, signal_file: str, logs_dir: str, out_dir: str, genome: str, chroms: List[int]=[*range(1,10)], use_map: bool=False, export_bigwig: str=None, scale: dict | float=1.0):
> def predict_snv_atac(experiment_name: str, model: str, snv_file: str, signal_file: str, logs_dir: str, out_dir: str, genome: str, chroms: List[int]=[*range(1,10)], use_map: bool=False, export_bigwig: str=None, scale: Union[Dict, float] = 1.0):
240c242,249
<         chroms = results['chr'].unique()
---
>         chroms_raw = results['chr'].unique()
>         chroms = []
>         for chrom in chroms_raw:
>             chrom_int = int(chrom)
>             chrom_str = str(chrom_int)
>             chroms.append(chrom_str)
> 
>         #print(f"orig: {chroms}, type: {[type(c) for c in chroms]}")
263c272,274
<             scale_factor = scale[chrom]
---
>             chrom_key = str(chrom).strip()
>             #chrom_key = int(chrom) if isinstance(next(iter(scale.keys())), int) else chrom
>             scale_factor = scale[chrom_key]
270c281
<             chrom_df = results[results['chr'] == chrom]
---
>             chrom_df = results[results['chr'].astype(str).str.strip() == chrom]
359,364c370,391
<                 if ref_flg:
<                     values1 = [values1[i] for i in sorted_indices]
<                     bw_out_ref.addEntries([chrom] * len(starts), starts, ends=ends, values=values1)    
<                 if alt_flg:
<                     values2 = [values2[i] for i in sorted_indices]
<                     bw_out_alt.addEntries([chrom] * len(starts), starts, ends=ends, values=values2)
---
>                 #print(f"{chrom} {len(starts)} {len(ends)} {starts[:5]} {ends[:5]}")
>                 unique_data = {}
>                 for i, s in enumerate(starts):
>                     if s not in unique_data:
>                         unique_data[s] = (ends[i], values1[i] if ref_flg and i < len(values1) else None,values2[i] if alt_flg and i < len(values2) else None)
>                 if unique_data:
>                     unique_starts = sorted(unique_data.keys())
>                     unique_ends = [unique_data[s][0] for s in unique_starts]
>                     if ref_flg:
>                         unique_values1 = [unique_data[s][1] for s in unique_starts]
>                         bw_out_ref.addEntries([chrom_str] * len(unique_starts), unique_starts, ends=unique_ends, values=unique_values1)
>                     if alt_flg:
>                         unique_values2 = [unique_data[s][2] for s in unique_starts]
>                         bw_out_alt.addEntries([chrom_str] * len(unique_starts), unique_starts, ends=unique_ends, values=unique_values2)
> 
> #                if ref_flg:
> #                    values1 = [values1[i] for i in sorted_indices]
> #                    bw_out_ref.addEntries([chrom] * len(starts), starts, ends=ends, values=values1)    
> #                if alt_flg:
> #                    values2 = [values2[i] for i in sorted_indices]
>                     #print(f"{values2}")
> #                    bw_out_alt.addEntries([chrom] * len(starts), starts, ends=ends, values=values2)
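
The scale[chrom_key] lookup above only works if the dict keys are strings. A small, hedged sketch that normalizes the keys once up front (a hypothetical helper, following the Union[Dict, float] signature used above):

    from typing import Dict, Union

    def normalize_scale(scale: Union[Dict, float]) -> Union[Dict, float]:
        # key per-chromosome scale factors by stripped strings so 9, '9' and ' 9' all match
        if isinstance(scale, dict):
            return {str(k).strip(): v for k, v in scale.items()}
        return scale
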
5. asap/snv/data_loader.py
5c5
<     df = pd.read_csv(snv_file, comment='#', names=vcf_columns, sep='\t', index_col=False)
---
>     df = pd.read_csv(snv_file, comment='#', names=vcf_columns, sep='\t', usecols=range(8), index_col=False)
7,9c7,12
<     df = df[df.chr.isin([f'chr{chrom}' for chrom in range(1, 23)])]
<     df = df[df['filter'] == 'PASS']
<     
---
> 
>     #df = df[df.chr.isin([str(chrom) for chrom in range(1, 11)])]
>     #df = df[df['filter'] == 'PASS']
>     df['id'] = df['chr'].astype(str) + '_' + df['pos'].astype(str)
>     #print(df['id'])
> 
10a14
> 
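
Rather than dropping the chromosome and FILTER checks entirely, a hedged alternative keeps them but matches the un-prefixed maize names (column names follow the diff above; '.' is how VCF records mark that no filter was applied):

    df = df[df['chr'].astype(str).isin([str(c) for c in range(1, 11)])]
    df = df[df['filter'].isin(['PASS', '.'])]
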
6. asap/snv/predict.py (needs to handle the chromosome name type, SNPs that lie near the start of a chromosome, e.g. chr1_450, and the BigWig output)
29c29,38
< 
---
>
>     # === begin debug info ===
>     print("start of debug info:")
>     print(f"SNV DataFrame shape: {snv.shape}")
>     print(f"SNV column names: {snv.columns.tolist()}")
>     print(f"Chromosomes in SNV file: {snv['chr'].unique()[:10]}")
>     print(f"Chromosome dtype: {snv['chr'].dtype}")
>     print(f"Chromosomes to process: {chroms}")
>     # === end debug info ===
>
31c40,42
<         chr_df = snv[(snv.chr == f'chr{chrom}') & (snv[output_columns].isnull().any(axis=1))]
---
>         #chr_df = snv[(snv.chr == f'chr{chrom}') & (snv[output_columns].isnull().any(axis=1))]  # replaced by lchen with the next line
>         chr_df = snv[(snv.chr == chrom) & (snv[output_columns].isnull().any(axis=1))]
>         print(chr_df)
51,52c62,77
<             x = _get_x_by_idx(chrom=chrom, seq=seq, seq_starts=starts, window=window_size, margin=margin_size)
<             y = _get_y_by_idx(signal_files=[bw_file], chrom=chrom, seq_starts=starts, window=window_size, bin_size=bin_size)
---
>             # check for starts < 0: SNVs near the start of a chromosome give negative window starts
>             valid_mask = starts >= 0
>             if not valid_mask.all():
>                 print(f"removing {np.sum(~valid_mask)} SNVs too close to the chromosome start")
>                 chr_df_i = chr_df_i[valid_mask]
>                 starts = starts[valid_mask]
>                 if len(chr_df_i) == 0:
>                     start_idx = end_idx
>                     continue
>             try:
>                 x = _get_x_by_idx(chrom=chrom, seq=seq, seq_starts=starts, window=window_size, margin=margin_size)
>                 y = _get_y_by_idx(signal_files=[bw_file], chrom=chrom, seq_starts=starts, window=window_size, bin_size=bin_size)
>             except Exception as e:
>                 print(f"ERROR: {e}")
>                 start_idx = end_idx
>                 continue
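
To see why the mask is needed: if the input window is centered on the variant (the 1024 bp spacing note below suggests window_size = 1024, which is an assumption here; the exact offset in ASAP may differ), an SNV near the start of a chromosome produces a negative window start:

    window_size = 1024                     # assumed, see the plink note below
    snv_pos = 450                          # e.g. the chr1_450 case mentioned above
    start = snv_pos - window_size // 2     # 450 - 512 = -62, so this SNV must be masked out
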
  1. Since the distance between adjacent SNPs should be longer than 1024 bp, run plink first (a pure-Python equivalent of this spacing filter is sketched after this list):
    ~/software/plink-1.9/plink --vcf test.vcf --bp-space 1200 --recode vcf --out test1 --noweb --keep-allele-order --chr 9,10
    I have no idea why ASAP imposes this restriction; downsampling the SNPs simply works around the bug, and I hope to see this changed in a later version.
  2. All the changed scripts can be found in change_file.zip attached below.
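
For completeness, a rough pandas equivalent of the --bp-space 1200 filter, assuming the SNV table has already been loaded into a DataFrame df with 'chr' and 'pos' columns (e.g. by the data_loader.py shown in section 5); this is a sketch only, and plink remains the tested route:

    import pandas as pd

    # df: VCF body with 'chr' and 'pos' columns (assumption, see lead-in)
    kept = []
    for _, chrom_df in df.sort_values(['chr', 'pos']).groupby('chr'):
        last_pos = None
        for _, row in chrom_df.iterrows():
            # keep a variant only if it lies at least 1200 bp beyond the previously kept one
            if last_pos is None or row['pos'] - last_pos >= 1200:
                kept.append(row)
                last_pos = row['pos']
    filtered = pd.DataFrame(kept)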
