-
Notifications
You must be signed in to change notification settings - Fork 1
Open
Description
Some changes may be needed when training your own ATAC-seq dataset, especially when the chromosomes are formatted like: 1,2,3,..., instead of chr1, chr2,chr3...
1. tutorials/train.py, eval.py, predict_snv_atac.py, export_predictions.py
Touch empty files for basenji_blacklist.bed, test.vcf.gz,basenji_unmappable.bed, and change the input location and chromosome format in those files
1.1 train.py
# Data paths
signal_file = "/mnt/sdb/lchen/05.peak/SRR31036590.bw"
genome = "/mnt/sda/lchen/epi_con/genome/b73_bwa/Zmays.fa"
blacklist_file = ["/mnt/sda/lchen/software/ASAP-main/tutorials/basenji_blacklist.bed", "/mnt/sda/lchen/software/ASAP-main/tutorials/test.vcf.gz"]
unmap_file = "/mnt/sda/lchen/software/ASAP-main/tutorials/basenji_unmappable.bed"
generated = "/mnt/sda/lchen/software/ASAP-main/tutorials/tmp"
logs_dir = "/mnt/sda/lchen/software/ASAP-main/tutorials/logs"
# Training parameters
test_chroms = [2, 5, 10]
val_chroms = [1, 4, 7]
train_chroms = [x for x in range(1, 10) if x not in test_chroms and x not in val_chroms]
n_gpus = 1
1.2 eval.py
# Data paths
#signal_file = "/cluster/work/boeva/lkasak/data/TCGA-A6-A567/TCGA-A6-A567.nodup.no_chrM_MT.tn5.pval.signal.bigwig"
signal_file = "/mnt/sdb/lchen/05.peak/SRR31036591.bw"
peak_file = "/mnt/sdb/lchen/05.peak/SRR31036591_peaks.narrowPeak"
#peak_file = "/cluster/work/boeva/lkasak/data/TCGA-A6-A567/TCGA-A6-A567.nodup.no_chrM_MT.tn5.pval0.01.300K.narrowPeak.gz"
#genome = "/cluster/work/boeva/lkasak/data/hg38.fa"
genome = "/mnt/sda/lchen/epi_con/genome/b73_bwa/Zmays.fa"
#blacklist_file = ["/cluster/work/boeva/lkasak/data/basenji_blacklist.bed", "/cluster/work/boeva/lkasak/data/TCGA-A6-A567/TCGA-A6-A567_pcawg_hg38.vcf.gz"]
blacklist_file = []
#unmap_file = "/cluster/work/boeva/lkasak/data/basenji_unmappable.bed"
unmap_file = "/mnt/sda/lchen/software/ASAP-main/tutorials/basenji_unmappable.bed"
generated = "/mnt/sda/lchen/software/ASAP-main/tutorials/tmp"
logs_dir = "/mnt/sda/lchen/software/ASAP-main/tutorials/logs"
# Model parameters
model_name = "convnext_dcnn"
experiment_name = "SRR31036590_convnext_dcnn"
# Training parameters
test_chroms = [2, 4, 7]
1.3 predict_snv_atac.py (output bw file in this process seems to be wrong, try to fix it but may not work well, do not use this bw file, but bw that in export_predictions.py)
# Data paths
#signal_file = "/cluster/work/boeva/lkasak/data/TCGA-A6-A567/TCGA-A6-A567.nodup.no_chrM_MT.tn5.pval.signal.bigwig"
signal_file = "/mnt/sdb/lchen/05.peak/SRR31036591.bw"
#genome = "/cluster/work/boeva/lkasak/data/hg38.fa"
genome = "/mnt/sda/lchen/epi_con/genome/b73_bwa/Zmays.fa"
logs_dir = "/mnt/sda/lchen/software/ASAP-main/tutorials/logs"
#snv_file = "/cluster/work/boeva/lkasak/data/TCGA-A6-A567/TCGA-A6-A567_pcawg_hg38.vcf.gz"
snv_file = "/mnt/sda/lchen/epi_con/13.nbt_dai/01.dna/05.vcf/test2.vcf"
# Model parameters
model_name = "convnext_dcnn"
experiment_name = "SRR31036590_convnext_dcnn"
# Prediction parameters
#chroms = [*range(1,11)]
chroms = [9,10]
1.4 export_predictions.py
# Data paths
signal_file = "/mnt/sdb/lchen/05.peak/SRR31036591.bw"
genome = "/mnt/sda/lchen/epi_con/genome/b73_bwa/Zmays.fa"
blacklist_file = ["/mnt/sda/lchen/software/ASAP-main/tutorials/basenji_blacklist.bed"]
generated = "/mnt/sda/lchen/software/ASAP-main/tutorials/tmp"
logs_dir = "/mnt/sda/lchen/software/ASAP-main/tutorials/logs"
# Model parameters
model_name = "convnext_dcnn"
experiment_name = "SRR31036590_convnext_dcnn"
# Prediction parameters
#chroms = [*range(1,20)]
chroms = [9,10]
2. asap/dataloader/utils/io.py (< represent the old one, and `>' represent the new one)
30c30
< df = df[df['chrom'] == f'chr{chrom}']
---
> df = df[df['chrom'].astype(str) == str(chrom)]
3. asap/dataloader/utils/data_bw.py
9a10
>
11c12
< coverage = bw.values(f'chr{chrom}', start, end, numpy=True).astype('float16')
---
> coverage = bw.values(f'{chrom}', start, end, numpy=True).astype('float16')
4. asap/task.py
> from typing import Union, Dict
176c177,178
< def predict_snv_atac(experiment_name: str, model: str, snv_file: str, signal_file: str, logs_dir: str, out_dir: str, genome: str, chroms: List[int]=[*range(1,23)], use_map: bool=False, export_bigwig: str=None, scale: dict | float=1.0):
---
> #def predict_snv_atac(experiment_name: str, model: str, snv_file: str, signal_file: str, logs_dir: str, out_dir: str, genome: str, chroms: List[int]=[*range(1,10)], use_map: bool=False, export_bigwig: str=None, scale: dict | float=1.0):
> def predict_snv_atac(experiment_name: str, model: str, snv_file: str, signal_file: str, logs_dir: str, out_dir: str, genome: str, chroms: List[int]=[*range(1,10)], use_map: bool=False, export_bigwig: str=None, scale: Union[Dict, float] = 1.0):
240c242,249
< chroms = results['chr'].unique()
---
> chroms_raw = results['chr'].unique()
> chroms = []
> for chrom in chroms_raw:
> chrom_int = int(chrom)
> chrom_str = str(chrom_int)
> chroms.append(chrom_str)
>
> #print(f"orig: {chroms}, type: {[type(c) for c in chroms]}")
263c272,274
< scale_factor = scale[chrom]
---
> chrom_key = str(chrom).strip()
> #chrom_key = int(chrom) if isinstance(next(iter(scale.keys())), int) else chrom
> scale_factor = scale[chrom_key]
270c281
< chrom_df = results[results['chr'] == chrom]
---
> chrom_df = results[results['chr'].astype(str).str.strip() == chrom]
359,364c370,391
< if ref_flg:
< values1 = [values1[i] for i in sorted_indices]
< bw_out_ref.addEntries([chrom] * len(starts), starts, ends=ends, values=values1)
< if alt_flg:
< values2 = [values2[i] for i in sorted_indices]
< bw_out_alt.addEntries([chrom] * len(starts), starts, ends=ends, values=values2)
---
> #print(f"{chrom} {len(starts)} {len(ends)} {starts[:5]} {ends[:5]}")
> unique_data = {}
> for i, s in enumerate(starts):
> if s not in unique_data:
> unique_data[s] = (ends[i], values1[i] if ref_flg and i < len(values1) else None,values2[i] if alt_flg and i < len(values2) else None)
> if unique_data:
> unique_starts = sorted(unique_data.keys())
> unique_ends = [unique_data[s][0] for s in unique_starts]
> if ref_flg:
> unique_values1 = [unique_data[s][1] for s in unique_starts]
> bw_out_ref.addEntries([chrom_str] * len(unique_starts), unique_starts, ends=unique_ends, values=unique_values1)
> if alt_flg:
> unique_values2 = [unique_data[s][2] for s in unique_starts]
> bw_out_alt.addEntries([chrom_str] * len(unique_starts), unique_starts, ends=unique_ends, values=unique_values2)
>
> # if ref_flg:
> # values1 = [values1[i] for i in sorted_indices]
> # bw_out_ref.addEntries([chrom] * len(starts), starts, ends=ends, values=values1)
> # if alt_flg:
> # values2 = [values2[i] for i in sorted_indices]
> #print(f"{values2}")
> # bw_out_alt.addEntries([chrom] * len(starts), starts, ends=ends, values=values2)
5. asap/snv/data_loader.py
5c5
< df = pd.read_csv(snv_file, comment='#', names=vcf_columns, sep='\t', index_col=False)
---
> df = pd.read_csv(snv_file, comment='#', names=vcf_columns, sep='\t', usecols=range(8), index_col=False)
7,9c7,12
< df = df[df.chr.isin([f'chr{chrom}' for chrom in range(1, 23)])]
< df = df[df['filter'] == 'PASS']
<
---
>
> #df = df[df.chr.isin([str(chrom) for chrom in range(1, 11)])]
> #df = df[df['filter'] == 'PASS']
> df['id'] = df['chr'].astype(str) + '_' + df['pos'].astype(str)
> #print(df['id'])
>
10a14
>
6. asap/snv/predict.py (need to deal with the type of chromosome; the position of SNPs that are near the start of the chromosome, like chr1_450; the BW output)
29c29,38
<
---
>
> # === add the debug info ===
> print(f"start of debug info:")
> print(f"SNV DataFrame shape: {snv.shape}")
> print(f"Name of SNV column : {snv.columns.tolist()}")
> print(f"List of chromosome: {snv['chr'].unique()[:10]}")
> print(f"Type of chromosome: {snv['chr'].dtype}")
> print(f"List of chromosome that need to be dealed: {chroms}")
> # === end of the debug info ===
>
31c40,42
< chr_df = snv[(snv.chr == f'chr{chrom}') & (snv[output_columns].isnull().any(axis=1))]
---
> #chr_df = snv[(snv.chr == f'chr{chrom}') & (snv[output_columns].isnull().any(axis=1))] edit y lchen to the next
> chr_df = snv[(snv.chr == chrom) & (snv[output_columns].isnull().any(axis=1))]
> print(chr_df)
51,52c62,77
< x = _get_x_by_idx(chrom=chrom, seq=seq, seq_starts=starts, window=window_size, margin=margin_size)
< y = _get_y_by_idx(signal_files=[bw_file], chrom=chrom, seq_starts=starts, window=window_size, bin_size=bin_size)
---
> # check the if starts < 0, due to the first SNPs may near the start of chromoe
> valid_mask = starts >=0
> if not valid_mask.all():
> print(f"remove {np.sum(~valid_mask)} SNV that close to the start of chromosome")
> chr_df_i = chr_df_i[valid_mask]
> starts = starts[valid_mask]
> if len(chr_df_i) == 0:
> start_idx = end_idx
> continue
> try:
> x = _get_x_by_idx(chrom=chrom, seq=seq, seq_starts=starts, window=window_size, margin=margin_size)
> y = _get_y_by_idx(signal_files=[bw_file], chrom=chrom, seq_starts=starts, window=window_size, bin_size=bin_size)
> except Exception as e:
> print(f"ERROR: {e}")
> start_idx = end_idx
> continue
- Since the distance between the adjustment SNP should be longer than 1024bp, use
plinkat first:
~/software/plink-1.9/plink --vcf test.vcf --bp-space 1200 --recode vcf --out test1 --noweb --keep-allele-order --chr 9,10
And have no idea why these restrictions were applied inASAP, just downsize the SNP for jumping the bug, hope to see the changes in the later version. - All the changed scripts can be found below change_file.zip
Metadata
Metadata
Assignees
Labels
No labels