From 59a97f2dc67541ddf8f3fe4310831ed6777b382d Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Fri, 12 Aug 2016 16:39:31 -0400 Subject: [PATCH] Speed up reading the reference. --- Makefile | 2 +- parsebam/readfasta.c | 34 ++++++++++++++++++---------------- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/Makefile b/Makefile index 5b9aeda..2eeb5d0 100755 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ #CC=gcc -Wall CC=gcc -D_GNU_SOURCE -CFLAGS=-c -Wall +CFLAGS=-c -Wall -lz SAMTOOLS=parsebam/samtools-0.1.18 all: diff --git a/parsebam/readfasta.c b/parsebam/readfasta.c index af5d971..e880c3d 100644 --- a/parsebam/readfasta.c +++ b/parsebam/readfasta.c @@ -3,9 +3,14 @@ #include #include #include +#include +#include #include "readfasta.h" #define MAX_BUF_SIZE 4096 +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) + // initialize the reflist datastructure REFLIST* init_reflist(char* fastafile,REFLIST* reflist) { @@ -202,25 +207,21 @@ int read_next_chromosome(REFLIST* reflist,int chrom,FILE* fp) } int read_fasta(char* seqfile, REFLIST* reflist) { - FILE* fp = fopen(seqfile,"r"); - if (fp == NULL) { fprintf(stderr,"file %s not found \n",seqfile); return -1; } + clock_t t; + kseq_t *seq; + gzFile fp = gzopen(seqfile, "r"); + seq = kseq_init(fp); fprintf(stderr,"reading reference sequence file %s with %d sequences\n",seqfile,reflist->ns); - int i=-1,j=0; char c; - while (c != EOF) - { - c = fgetc(fp); - if (c == '>') - { - while (c != '\n') c = fgetc(fp); - //if (i >= 0)printf("%c %d %d",c,i,j); - i++; j=0; - } - else if (c != '\n' && c != '\t' && c != ' ') - { - reflist->sequences[i][j] = toupper(c); j++; + t = clock(); + int i=0, j=0; char c; + while (kseq_read(seq) >= 0) { + memcpy(reflist->sequences[i], seq->seq.s, seq->seq.l); + for (j = 0; j < seq->seq.l; j++) { + reflist->sequences[i][j] = toupper(reflist->sequences[i][j]); } + i++; } - fclose(fp); + gzclose(fp); fp=NULL; for (i=0;ins;i++) { reflist->sequences[i][reflist->lengths[i]] = '\0'; @@ -231,6 +232,7 @@ int read_fasta(char* seqfile, REFLIST* reflist) fprintf(stderr,"\n"); } } + fprintf(stderr, "read FASTA in %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); return 1; }