diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..0380c7e --- /dev/null +++ b/LICENSE @@ -0,0 +1,19 @@ +The MIT License +Copyright (c) 20082-2012 by Heng Li +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/Makefile b/Makefile index 0b9f9c2..12bd575 100644 --- a/Makefile +++ b/Makefile @@ -1,13 +1,10 @@ CC=gcc CFLAGS=-g -Wall -O2 -Wno-unused-function -all:seqtk trimadap +all:seqtk seqtk:seqtk.c khash.h kseq.h $(CC) $(CFLAGS) seqtk.c -o $@ -lz -lm -trimadap:trimadap.c kseq.h ksw.h - $(CC) $(CFLAGS) ksw.c trimadap.c -o $@ -lz -lm - clean: rm -fr gmon.out *.o ext/*.o a.out seqtk trimadap *~ *.a *.dSYM session* diff --git a/README.md b/README.md index 9f71109..1642638 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,12 @@ Introduction Seqtk is a fast and lightweight tool for processing sequences in the FASTA or FASTQ format. It seamlessly parses both FASTA and FASTQ files which can also be -optionally compressed by gzip. +optionally compressed by gzip. To install `seqtk`, +```sh +git clone https://github.com/lh3/seqtk.git; +cd seqtk; make +``` +The only library dependency is zlib. Seqtk Examples -------------- @@ -54,3 +59,6 @@ Seqtk Examples seqtk trimfq -b 5 -e 10 in.fa > out.fa +* Keep the 50bp from right end of each read by trimming the rest and if the trimmed read length ends up having less the 20bp then the first 20 bp should be kept only: + + seqtk trimfq -E 50 -s 20 in.fq > out.fq diff --git a/kseq.h b/kseq.h index 84c1fa3..8f9e498 100644 --- a/kseq.h +++ b/kseq.h @@ -23,7 +23,7 @@ SOFTWARE. */ -/* Last Modified: 05MAR2012 */ +/* Last Modified: 2017-02-11 */ #ifndef AC_KSEQ_H #define AC_KSEQ_H @@ -39,44 +39,44 @@ #define __KS_TYPE(type_t) \ typedef struct __kstream_t { \ - int begin, end; \ - int is_eof:2, bufsize:30; \ - type_t f; \ unsigned char *buf; \ + int begin, end, is_eof; \ + type_t f; \ } kstream_t; +#define ks_err(ks) ((ks)->end < 0) #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) -#define __KS_BASIC(SCOPE, type_t, __bufsize) \ - SCOPE kstream_t *ks_init(type_t f) \ +#define __KS_BASIC(type_t, __bufsize) \ + static inline kstream_t *ks_init(type_t f) \ { \ kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ - ks->f = f; ks->bufsize = __bufsize; \ + ks->f = f; \ ks->buf = (unsigned char*)malloc(__bufsize); \ return ks; \ } \ - SCOPE void ks_destroy(kstream_t *ks) \ + static inline void ks_destroy(kstream_t *ks) \ { \ - if (!ks) return; \ - free(ks->buf); \ - free(ks); \ + if (ks) { \ + free(ks->buf); \ + free(ks); \ + } \ } -#define __KS_INLINED(__read) \ +#define __KS_GETC(__read, __bufsize) \ static inline int ks_getc(kstream_t *ks) \ { \ - if (ks->is_eof && ks->begin >= ks->end) return -1; \ + if (ks_err(ks)) return -3; \ + if (ks_eof(ks)) return -1; \ if (ks->begin >= ks->end) { \ ks->begin = 0; \ - ks->end = __read(ks->f, ks->buf, ks->bufsize); \ - if (ks->end < ks->bufsize) ks->is_eof = 1; \ - if (ks->end == 0) return -1; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end == 0) { ks->is_eof = 1; return -1; } \ + else if (ks->end < 0) { ks->is_eof = 1; return -3; } \ } \ return (int)ks->buf[ks->begin++]; \ - } \ - static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ - { return ks_getuntil2(ks, delimiter, str, dret, 0); } + } #ifndef KSTRING_T #define KSTRING_T kstring_t @@ -90,24 +90,25 @@ typedef struct __kstring_t { #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #endif -#define __KS_GETUNTIL(SCOPE, __read) \ - SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ +#define __KS_GETUNTIL(__read, __bufsize) \ + static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ { \ + int gotany = 0; \ if (dret) *dret = 0; \ str->l = append? str->l : 0; \ - if (ks->begin >= ks->end && ks->is_eof) return -1; \ for (;;) { \ int i; \ + if (ks_err(ks)) return -3; \ if (ks->begin >= ks->end) { \ if (!ks->is_eof) { \ ks->begin = 0; \ - ks->end = __read(ks->f, ks->buf, ks->bufsize); \ - if (ks->end < ks->bufsize) ks->is_eof = 1; \ - if (ks->end == 0) break; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end == 0) { ks->is_eof = 1; break; } \ + if (ks->end == -1) { ks->is_eof = 1; return -3; } \ } else break; \ } \ - if (delimiter == KS_SEP_LINE) { \ - for (i = ks->begin; i < ks->end; ++i) \ + if (delimiter == KS_SEP_LINE) { \ + for (i = ks->begin; i < ks->end; ++i) \ if (ks->buf[i] == '\n') break; \ } else if (delimiter > KS_SEP_MAX) { \ for (i = ks->begin; i < ks->end; ++i) \ @@ -117,14 +118,15 @@ typedef struct __kstring_t { if (isspace(ks->buf[i])) break; \ } else if (delimiter == KS_SEP_TAB) { \ for (i = ks->begin; i < ks->end; ++i) \ - if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ + if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ } else i = 0; /* never come to here! */ \ if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ str->m = str->l + (i - ks->begin) + 1; \ kroundup32(str->m); \ str->s = (char*)realloc(str->s, str->m); \ } \ - memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ + gotany = 1; \ + memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ str->l = str->l + (i - ks->begin); \ ks->begin = i + 1; \ if (i < ks->end) { \ @@ -132,108 +134,101 @@ typedef struct __kstring_t { break; \ } \ } \ + if (!gotany && ks_eof(ks)) return -1; \ if (str->s == 0) { \ str->m = 1; \ str->s = (char*)calloc(1, 1); \ } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ - str->s[str->l] = '\0'; \ + str->s[str->l] = '\0'; \ return str->l; \ - } - -#define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \ - __KS_TYPE(type_t) \ - __KS_BASIC(SCOPE, type_t, __bufsize) \ - __KS_GETUNTIL(SCOPE, __read) \ - __KS_INLINED(__read) - -#define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize) + } \ + static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + { return ks_getuntil2(ks, delimiter, str, dret, 0); } -#define KSTREAM_DECLARE(type_t, __read) \ +#define KSTREAM_INIT(type_t, __read, __bufsize) \ __KS_TYPE(type_t) \ - extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \ - extern kstream_t *ks_init(type_t f); \ - extern void ks_destroy(kstream_t *ks); \ - __KS_INLINED(__read) - -/****************** - * FASTA/Q parser * - ******************/ + __KS_BASIC(type_t, __bufsize) \ + __KS_GETC(__read, __bufsize) \ + __KS_GETUNTIL(__read, __bufsize) #define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) -#define __KSEQ_BASIC(SCOPE, type_t) \ - SCOPE kseq_t *kseq_init(type_t fd) \ - { \ - kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ - s->f = ks_init(fd); \ - return s; \ - } \ - SCOPE void kseq_destroy(kseq_t *ks) \ - { \ - if (!ks) return; \ +#define __KSEQ_BASIC(SCOPE, type_t) \ + SCOPE kseq_t *kseq_init(type_t fd) \ + { \ + kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ + s->f = ks_init(fd); \ + return s; \ + } \ + SCOPE void kseq_destroy(kseq_t *ks) \ + { \ + if (!ks) return; \ free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ - ks_destroy(ks->f); \ - free(ks); \ + ks_destroy(ks->f); \ + free(ks); \ } /* Return value: >=0 length of the sequence (normal) -1 end-of-file -2 truncated quality string + -3 error reading stream */ #define __KSEQ_READ(SCOPE) \ SCOPE int kseq_read(kseq_t *seq) \ { \ - int c; \ + int c,r; \ kstream_t *ks = seq->f; \ if (seq->last_char == 0) { /* then jump to the next header line */ \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ - if (c == -1) return -1; /* end of file */ \ + while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '@'); \ + if (c < 0) return c; /* end of file or error*/ \ seq->last_char = c; \ } /* else: the first header char has been read in the previous call */ \ seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ - if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ + if ((r=ks_getuntil(ks, 0, &seq->name, &c)) < 0) return r; /* normal exit: EOF or error */ \ if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ seq->seq.m = 256; \ seq->seq.s = (char*)malloc(seq->seq.m); \ } \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ + while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '+' && c != '@') { \ if (c == '\n') continue; /* skip empty lines */ \ seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ } \ - if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ + if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ seq->seq.m = seq->seq.l + 2; \ kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ } \ seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ - if (c != '+') return seq->seq.l; /* FASTA */ \ + seq->is_fastq = (c == '+'); \ + if (!seq->is_fastq) return seq->seq.l; /* FASTA */ \ if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ seq->qual.m = seq->seq.m; \ seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ } \ - while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ + while ((c = ks_getc(ks)) >= 0 && c != '\n'); /* skip the rest of '+' line */ \ if (c == -1) return -2; /* error: no quality string */ \ - while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ + while ((c = ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l)); \ + if (c == -3) return -3; /* stream error */ \ seq->last_char = 0; /* we have not come to the next header line */ \ if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ return seq->seq.l; \ } -#define __KSEQ_TYPE(type_t) \ - typedef struct { \ - kstring_t name, comment, seq, qual; \ - int last_char; \ - kstream_t *f; \ +#define __KSEQ_TYPE(type_t) \ + typedef struct { \ + kstring_t name, comment, seq, qual; \ + int last_char, is_fastq; \ + kstream_t *f; \ } kseq_t; -#define KSEQ_INIT2(SCOPE, type_t, __read) \ - KSTREAM_INIT2(SCOPE, type_t, __read, 16384) \ - __KSEQ_TYPE(type_t) \ - __KSEQ_BASIC(SCOPE, type_t) \ +#define KSEQ_INIT2(SCOPE, type_t, __read) \ + KSTREAM_INIT(type_t, __read, 16384) \ + __KSEQ_TYPE(type_t) \ + __KSEQ_BASIC(SCOPE, type_t) \ __KSEQ_READ(SCOPE) #define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) diff --git a/ksort.h b/ksort.h deleted file mode 100644 index 4da7a13..0000000 --- a/ksort.h +++ /dev/null @@ -1,298 +0,0 @@ -/* The MIT License - - Copyright (c) 2008, 2011 Attractive Chaos - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* - 2011-04-10 (0.1.6): - - * Added sample - - 2011-03 (0.1.5): - - * Added shuffle/permutation - - 2008-11-16 (0.1.4): - - * Fixed a bug in introsort() that happens in rare cases. - - 2008-11-05 (0.1.3): - - * Fixed a bug in introsort() for complex comparisons. - - * Fixed a bug in mergesort(). The previous version is not stable. - - 2008-09-15 (0.1.2): - - * Accelerated introsort. On my Mac (not on another Linux machine), - my implementation is as fast as std::sort on random input. - - * Added combsort and in introsort, switch to combsort if the - recursion is too deep. - - 2008-09-13 (0.1.1): - - * Added k-small algorithm - - 2008-09-05 (0.1.0): - - * Initial version - -*/ - -#ifndef AC_KSORT_H -#define AC_KSORT_H - -#include -#include - -typedef struct { - void *left, *right; - int depth; -} ks_isort_stack_t; - -#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; } - -#define KSORT_INIT(name, type_t, __sort_lt) \ - void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \ - { \ - type_t *a2[2], *a, *b; \ - int curr, shift; \ - \ - a2[0] = array; \ - a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \ - for (curr = 0, shift = 0; (1ul<> 1) - 1; i != (size_t)(-1); --i) \ - ks_heapadjust_##name(i, lsize, l); \ - } \ - void ks_heapsort_##name(size_t lsize, type_t l[]) \ - { \ - size_t i; \ - for (i = lsize - 1; i > 0; --i) { \ - type_t tmp; \ - tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \ - } \ - } \ - static inline void __ks_insertsort_##name(type_t *s, type_t *t) \ - { \ - type_t *i, *j, swap_tmp; \ - for (i = s + 1; i < t; ++i) \ - for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \ - swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \ - } \ - } \ - void ks_combsort_##name(size_t n, type_t a[]) \ - { \ - const double shrink_factor = 1.2473309501039786540366528676643; \ - int do_swap; \ - size_t gap = n; \ - type_t tmp, *i, *j; \ - do { \ - if (gap > 2) { \ - gap = (size_t)(gap / shrink_factor); \ - if (gap == 9 || gap == 10) gap = 11; \ - } \ - do_swap = 0; \ - for (i = a; i < a + n - gap; ++i) { \ - j = i + gap; \ - if (__sort_lt(*j, *i)) { \ - tmp = *i; *i = *j; *j = tmp; \ - do_swap = 1; \ - } \ - } \ - } while (do_swap || gap > 2); \ - if (gap != 1) __ks_insertsort_##name(a, a + n); \ - } \ - void ks_introsort_##name(size_t n, type_t a[]) \ - { \ - int d; \ - ks_isort_stack_t *top, *stack; \ - type_t rp, swap_tmp; \ - type_t *s, *t, *i, *j, *k; \ - \ - if (n < 1) return; \ - else if (n == 2) { \ - if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \ - return; \ - } \ - for (d = 2; 1ul<>1) + 1; \ - if (__sort_lt(*k, *i)) { \ - if (__sort_lt(*k, *j)) k = j; \ - } else k = __sort_lt(*j, *i)? i : j; \ - rp = *k; \ - if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \ - for (;;) { \ - do ++i; while (__sort_lt(*i, rp)); \ - do --j; while (i <= j && __sort_lt(rp, *j)); \ - if (j <= i) break; \ - swap_tmp = *i; *i = *j; *j = swap_tmp; \ - } \ - swap_tmp = *i; *i = *t; *t = swap_tmp; \ - if (i-s > t-i) { \ - if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \ - s = t-i > 16? i+1 : t; \ - } else { \ - if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \ - t = i-s > 16? i-1 : s; \ - } \ - } else { \ - if (top == stack) { \ - free(stack); \ - __ks_insertsort_##name(a, a+n); \ - return; \ - } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \ - } \ - } \ - } \ - /* This function is adapted from: http://ndevilla.free.fr/median/ */ \ - /* 0 <= kk < n */ \ - type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \ - { \ - type_t *low, *high, *k, *ll, *hh, *mid; \ - low = arr; high = arr + n - 1; k = arr + kk; \ - for (;;) { \ - if (high <= low) return *k; \ - if (high == low + 1) { \ - if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ - return *k; \ - } \ - mid = low + (high - low) / 2; \ - if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \ - if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ - if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \ - KSORT_SWAP(type_t, *mid, *(low+1)); \ - ll = low + 1; hh = high; \ - for (;;) { \ - do ++ll; while (__sort_lt(*ll, *low)); \ - do --hh; while (__sort_lt(*low, *hh)); \ - if (hh < ll) break; \ - KSORT_SWAP(type_t, *ll, *hh); \ - } \ - KSORT_SWAP(type_t, *low, *hh); \ - if (hh <= k) low = ll; \ - if (hh >= k) high = hh - 1; \ - } \ - } \ - void ks_shuffle_##name(size_t n, type_t a[]) \ - { \ - int i, j; \ - for (i = n; i > 1; --i) { \ - type_t tmp; \ - j = (int)(drand48() * i); \ - tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp; \ - } \ - } \ - void ks_sample_##name(size_t n, size_t r, type_t a[]) /* FIXME: NOT TESTED!!! */ \ - { /* reference: http://code.activestate.com/recipes/272884/ */ \ - int i, k, pop = n; \ - for (i = (int)r, k = 0; i >= 0; --i) { \ - double z = 1., x = drand48(); \ - type_t tmp; \ - while (x < z) z -= z * i / (pop--); \ - if (k != n - pop - 1) tmp = a[k], a[k] = a[n-pop-1], a[n-pop-1] = tmp; \ - ++k; \ - } \ - } - -#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t) -#define ks_introsort(name, n, a) ks_introsort_##name(n, a) -#define ks_combsort(name, n, a) ks_combsort_##name(n, a) -#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a) -#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a) -#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a) -#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k) -#define ks_shuffle(name, n, a) ks_shuffle_##name(n, a) - -#define ks_lt_generic(a, b) ((a) < (b)) -#define ks_lt_str(a, b) (strcmp((a), (b)) < 0) - -typedef const char *ksstr_t; - -#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic) -#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str) - -#endif diff --git a/kstring.h b/kstring.h deleted file mode 100644 index 6639bd8..0000000 --- a/kstring.h +++ /dev/null @@ -1,169 +0,0 @@ -/* The MIT License - - Copyright (c) by Attractive Chaos - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -#ifndef KSTRING_H -#define KSTRING_H - -#include -#include -#include - -#ifndef kroundup32 -#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) -#endif - -#ifndef KSTRING_T -#define KSTRING_T kstring_t -typedef struct __kstring_t { - uint32_t l, m; - char *s; -} kstring_t; -#endif - -typedef struct { - uint64_t tab[4]; - int sep, finished; - const char *p; // end of the current token -} ks_tokaux_t; - -#ifdef __cplusplus -extern "C" { -#endif - - int ksprintf(kstring_t *s, const char *fmt, ...); - int ksprintf_fast(kstring_t *s, const char *fmt, ...); - int ksplit_core(char *s, int delimiter, int *_max, int **_offsets); - char *kstrstr(const char *str, const char *pat, int **_prep); - char *kstrnstr(const char *str, const char *pat, int n, int **_prep); - void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep); - - /* kstrtok() is similar to strtok_r() except that str is not - * modified and both str and sep can be NULL. For efficiency, it is - * actually recommended to set both to NULL in the subsequent calls - * if sep is not changed. */ - char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux); - -#ifdef __cplusplus -} -#endif - -static inline void ks_resize(kstring_t *s, size_t size) -{ - if (s->m < size) { - s->m = size; - kroundup32(s->m); - s->s = (char*)realloc(s->s, s->m); - } -} - -static inline int kputsn(const char *p, int l, kstring_t *s) -{ - if (s->l + l + 1 >= s->m) { - s->m = s->l + l + 2; - kroundup32(s->m); - s->s = (char*)realloc(s->s, s->m); - } - memcpy(s->s + s->l, p, l); - s->l += l; - s->s[s->l] = 0; - return l; -} - -static inline int kputs(const char *p, kstring_t *s) -{ - return kputsn(p, strlen(p), s); -} - -static inline int kputc(int c, kstring_t *s) -{ - if (s->l + 1 >= s->m) { - s->m = s->l + 2; - kroundup32(s->m); - s->s = (char*)realloc(s->s, s->m); - } - s->s[s->l++] = c; - s->s[s->l] = 0; - return c; -} - -static inline int kputw(int c, kstring_t *s) -{ - char buf[16]; - int l, x; - if (c == 0) return kputc('0', s); - for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; - if (c < 0) buf[l++] = '-'; - if (s->l + l + 1 >= s->m) { - s->m = s->l + l + 2; - kroundup32(s->m); - s->s = (char*)realloc(s->s, s->m); - } - for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; - s->s[s->l] = 0; - return 0; -} - -static inline int kputuw(unsigned c, kstring_t *s) -{ - char buf[16]; - int l, i; - unsigned x; - if (c == 0) return kputc('0', s); - for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0'; - if (s->l + l + 1 >= s->m) { - s->m = s->l + l + 2; - kroundup32(s->m); - s->s = (char*)realloc(s->s, s->m); - } - for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; - s->s[s->l] = 0; - return 0; -} - -static inline int kputl(long c, kstring_t *s) -{ - char buf[32]; - long l, x; - if (c == 0) return kputc('0', s); - for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; - if (c < 0) buf[l++] = '-'; - if (s->l + l + 1 >= s->m) { - s->m = s->l + l + 2; - kroundup32(s->m); - s->s = (char*)realloc(s->s, s->m); - } - for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; - s->s[s->l] = 0; - return 0; -} - -static inline int *ksplit(kstring_t *s, int delimiter, int *n) -{ - int max = 0, *offsets = 0; - *n = ksplit_core(s->s, delimiter, &max, &offsets); - return offsets; -} - -#endif diff --git a/ksw.c b/ksw.c deleted file mode 100644 index 8aaa7fc..0000000 --- a/ksw.c +++ /dev/null @@ -1,454 +0,0 @@ -/* The MIT License - - Copyright (c) 2011 by Attractive Chaos - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -#include -#include -#include -#include "ksw.h" - -#ifdef __GNUC__ -#define LIKELY(x) __builtin_expect((x),1) -#define UNLIKELY(x) __builtin_expect((x),0) -#else -#define LIKELY(x) (x) -#define UNLIKELY(x) (x) -#endif - -const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 }; - -struct _kswq_t { - int qlen, slen; - uint8_t shift, mdiff, max, size; - __m128i *qp, *H0, *H1, *E, *Hmax; -}; - -/** - * Initialize the query data structure - * - * @param size Number of bytes used to store a score; valid valures are 1 or 2 - * @param qlen Length of the query sequence - * @param query Query sequence - * @param m Size of the alphabet - * @param mat Scoring matrix in a one-dimension array - * - * @return Query data structure - */ -kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat) -{ - kswq_t *q; - int slen, a, tmp, p; - - size = size > 1? 2 : 1; - p = 8 * (3 - size); // # values per __m128i - slen = (qlen + p - 1) / p; // segmented length - q = (kswq_t*)malloc(sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory - q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory - q->H0 = q->qp + slen * m; - q->H1 = q->H0 + slen; - q->E = q->H1 + slen; - q->Hmax = q->E + slen; - q->slen = slen; q->qlen = qlen; q->size = size; - // compute shift - tmp = m * m; - for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score - if (mat[a] < (int8_t)q->shift) q->shift = mat[a]; - if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a]; - } - q->max = q->mdiff; - q->shift = 256 - q->shift; // NB: q->shift is uint8_t - q->mdiff += q->shift; // this is the difference between the min and max scores - // An example: p=8, qlen=19, slen=3 and segmentation: - // {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}} - if (size == 1) { - int8_t *t = (int8_t*)q->qp; - for (a = 0; a < m; ++a) { - int i, k, nlen = slen * p; - const int8_t *ma = mat + a * m; - for (i = 0; i < slen; ++i) - for (k = i; k < nlen; k += slen) // p iterations - *t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift; - } - } else { - int16_t *t = (int16_t*)q->qp; - for (a = 0; a < m; ++a) { - int i, k, nlen = slen * p; - const int8_t *ma = mat + a * m; - for (i = 0; i < slen; ++i) - for (k = i; k < nlen; k += slen) // p iterations - *t++ = (k >= qlen? 0 : ma[query[k]]); - } - } - return q; -} - -kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e) -{ - int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc; - uint64_t *b; - __m128i zero, gapoe, gape, shift, *H0, *H1, *E, *Hmax; - kswr_t r; - -#define __max_16(ret, xx) do { \ - (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \ - (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \ - (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 2)); \ - (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 1)); \ - (ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \ - } while (0) - - // initialization - r = g_defr; - minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000; - endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000; - m_b = n_b = 0; b = 0; - zero = _mm_set1_epi32(0); - gapoe = _mm_set1_epi8(_gapo + _gape); - gape = _mm_set1_epi8(_gape); - shift = _mm_set1_epi8(q->shift); - H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; - slen = q->slen; - for (i = 0; i < slen; ++i) { - _mm_store_si128(E + i, zero); - _mm_store_si128(H0 + i, zero); - _mm_store_si128(Hmax + i, zero); - } - // the core loop - for (i = 0; i < tlen; ++i) { - int j, k, cmp, imax; - __m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector - h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example - h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian - for (j = 0; LIKELY(j < slen); ++j) { - /* SW cells are computed in the following order: - * H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)} - * E(i+1,j) = max{H(i,j)-q, E(i,j)-r} - * F(i,j+1) = max{H(i,j)-q, F(i,j)-r} - */ - // compute H'(i,j); note that at the beginning, h=H'(i-1,j-1) - h = _mm_adds_epu8(h, _mm_load_si128(S + j)); - h = _mm_subs_epu8(h, shift); // h=H'(i-1,j-1)+S(i,j) - e = _mm_load_si128(E + j); // e=E'(i,j) - h = _mm_max_epu8(h, e); - h = _mm_max_epu8(h, f); // h=H'(i,j) - max = _mm_max_epu8(max, h); // set max - _mm_store_si128(H1 + j, h); // save to H'(i,j) - // now compute E'(i+1,j) - h = _mm_subs_epu8(h, gapoe); // h=H'(i,j)-gapo - e = _mm_subs_epu8(e, gape); // e=E'(i,j)-gape - e = _mm_max_epu8(e, h); // e=E'(i+1,j) - _mm_store_si128(E + j, e); // save to E'(i+1,j) - // now compute F'(i,j+1) - f = _mm_subs_epu8(f, gape); - f = _mm_max_epu8(f, h); - // get H'(i-1,j) and prepare for the next j - h = _mm_load_si128(H0 + j); // h=H'(i-1,j) - } - // NB: we do not need to set E(i,j) as we disallow adjecent insertion and then deletion - for (k = 0; LIKELY(k < 16); ++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max - f = _mm_slli_si128(f, 1); - for (j = 0; LIKELY(j < slen); ++j) { - h = _mm_load_si128(H1 + j); - h = _mm_max_epu8(h, f); // h=H'(i,j) - _mm_store_si128(H1 + j, h); - h = _mm_subs_epu8(h, gapoe); - f = _mm_subs_epu8(f, gape); - cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(f, h), zero)); - if (UNLIKELY(cmp == 0xffff)) goto end_loop16; - } - } -end_loop16: - //int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n"); - __max_16(imax, max); // imax is the maximum number in max - if (imax >= minsc) { // write the b array; this condition adds branching unfornately - if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append - if (n_b == m_b) { - m_b = m_b? m_b<<1 : 8; - b = (uint64_t*)realloc(b, 8 * m_b); - } - b[n_b++] = (uint64_t)imax<<32 | i; - } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last - } - if (imax > gmax) { - gmax = imax; te = i; // te is the end position on the target - for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector - _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); - if (gmax + q->shift >= 255 || gmax >= endsc) break; - } - S = H1; H1 = H0; H0 = S; // swap H0 and H1 - } - r.score = gmax + q->shift < 255? gmax : 255; - r.te = te; - if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score - int max = -1, low, high, qlen = slen * 16; - uint8_t *t = (uint8_t*)Hmax; - for (i = 0; i < qlen; ++i, ++t) - if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen; - //printf("%d,%d\n", max, gmax); - if (b) { - i = (r.score + q->max - 1) / q->max; - low = te - i; high = te + i; - for (i = 0; i < n_b; ++i) { - int e = (int32_t)b[i]; - if ((e < low || e > high) && (int)(b[i]>>32) > r.score2) - r.score2 = b[i]>>32, r.te2 = e; - } - } - } - free(b); - return r; -} - -kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e) -{ - int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc; - uint64_t *b; - __m128i zero, gapoe, gape, *H0, *H1, *E, *Hmax; - kswr_t r; - -#define __max_8(ret, xx) do { \ - (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \ - (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \ - (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \ - (ret) = _mm_extract_epi16((xx), 0); \ - } while (0) - - // initialization - r = g_defr; - minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000; - endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000; - m_b = n_b = 0; b = 0; - zero = _mm_set1_epi32(0); - gapoe = _mm_set1_epi16(_gapo + _gape); - gape = _mm_set1_epi16(_gape); - H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; - slen = q->slen; - for (i = 0; i < slen; ++i) { - _mm_store_si128(E + i, zero); - _mm_store_si128(H0 + i, zero); - _mm_store_si128(Hmax + i, zero); - } - // the core loop - for (i = 0; i < tlen; ++i) { - int j, k, imax; - __m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector - h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example - h = _mm_slli_si128(h, 2); - for (j = 0; LIKELY(j < slen); ++j) { - h = _mm_adds_epi16(h, *S++); - e = _mm_load_si128(E + j); - h = _mm_max_epi16(h, e); - h = _mm_max_epi16(h, f); - max = _mm_max_epi16(max, h); - _mm_store_si128(H1 + j, h); - h = _mm_subs_epu16(h, gapoe); - e = _mm_subs_epu16(e, gape); - e = _mm_max_epi16(e, h); - _mm_store_si128(E + j, e); - f = _mm_subs_epu16(f, gape); - f = _mm_max_epi16(f, h); - h = _mm_load_si128(H0 + j); - } - for (k = 0; LIKELY(k < 16); ++k) { - f = _mm_slli_si128(f, 2); - for (j = 0; LIKELY(j < slen); ++j) { - h = _mm_load_si128(H1 + j); - h = _mm_max_epi16(h, f); - _mm_store_si128(H1 + j, h); - h = _mm_subs_epu16(h, gapoe); - f = _mm_subs_epu16(f, gape); - if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop8; - } - } -end_loop8: - __max_8(imax, max); - if (imax >= minsc) { - if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { - if (n_b == m_b) { - m_b = m_b? m_b<<1 : 8; - b = (uint64_t*)realloc(b, 8 * m_b); - } - b[n_b++] = (uint64_t)imax<<32 | i; - } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last - } - if (imax > gmax) { - gmax = imax; te = i; - for (j = 0; LIKELY(j < slen); ++j) - _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); - if (gmax >= endsc) break; - } - S = H1; H1 = H0; H0 = S; - } - r.score = gmax; r.te = te; - { - int max = -1, low, high, qlen = slen * 8; - uint16_t *t = (uint16_t*)Hmax; - for (i = 0, r.qe = -1; i < qlen; ++i, ++t) - if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen; - if (b) { - i = (r.score + q->max - 1) / q->max; - low = te - i; high = te + i; - for (i = 0; i < n_b; ++i) { - int e = (int32_t)b[i]; - if ((e < low || e > high) && (int)(b[i]>>32) > r.score2) - r.score2 = b[i]>>32, r.te2 = e; - } - } - } - free(b); - return r; -} - -static void revseq(int l, uint8_t *s) -{ - int i, t; - for (i = 0; i < l>>1; ++i) - t = s[i], s[i] = s[l - 1 - i], s[l - 1 - i] = t; -} - -kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry) -{ - int size; - kswq_t *q; - kswr_t r, rr; - kswr_t (*func)(kswq_t*, int, const uint8_t*, int, int, int); - - q = (qry && *qry)? *qry : ksw_qinit((xtra&KSW_XBYTE)? 1 : 2, qlen, query, m, mat); - if (qry && *qry == 0) *qry = q; - func = q->size == 2? ksw_i16 : ksw_u8; - size = q->size; - r = func(q, tlen, target, gapo, gape, xtra); - if (qry == 0) free(q); - if ((xtra&KSW_XSTART) == 0 || ((xtra&KSW_XSUBO) && r.score < (xtra&0xffff))) return r; - revseq(r.qe + 1, query); revseq(r.te + 1, target); // +1 because qe/te points to the exact end, not the position after the end - q = ksw_qinit(size, r.qe + 1, query, m, mat); - rr = func(q, tlen, target, gapo, gape, KSW_XSTOP | r.score); - revseq(r.qe + 1, query); revseq(r.te + 1, target); - free(q); - if (r.score == rr.score) - r.tb = r.te - rr.te, r.qb = r.qe - rr.qe; - return r; -} - -/******************************************* - * Main function (not compiled by default) * - *******************************************/ - -#ifdef _KSW_MAIN - -#include -#include -#include -#include "kseq.h" -KSEQ_INIT(gzFile, gzread) - -unsigned char seq_nt4_table[256] = { - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 -}; - -int main(int argc, char *argv[]) -{ - int c, sa = 1, sb = 3, i, j, k, forward_only = 0, max_rseq = 0; - int8_t mat[25]; - int gapo = 5, gape = 2, minsc = 0, xtra = KSW_XSTART; - uint8_t *rseq = 0; - gzFile fpt, fpq; - kseq_t *kst, *ksq; - - // parse command line - while ((c = getopt(argc, argv, "a:b:q:r:ft:1")) >= 0) { - switch (c) { - case 'a': sa = atoi(optarg); break; - case 'b': sb = atoi(optarg); break; - case 'q': gapo = atoi(optarg); break; - case 'r': gape = atoi(optarg); break; - case 't': minsc = atoi(optarg); break; - case 'f': forward_only = 1; break; - case '1': xtra |= KSW_XBYTE; break; - } - } - if (optind + 2 > argc) { - fprintf(stderr, "Usage: ksw [-1] [-f] [-a%d] [-b%d] [-q%d] [-r%d] [-t%d] \n", sa, sb, gapo, gape, minsc); - return 1; - } - if (minsc > 0xffff) minsc = 0xffff; - xtra |= KSW_XSUBO | minsc; - // initialize scoring matrix - for (i = k = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - mat[k++] = i == j? sa : -sb; - mat[k++] = 0; // ambiguous base - } - for (j = 0; j < 5; ++j) mat[k++] = 0; - // open file - fpt = gzopen(argv[optind], "r"); kst = kseq_init(fpt); - fpq = gzopen(argv[optind+1], "r"); ksq = kseq_init(fpq); - // all-pair alignment - while (kseq_read(ksq) > 0) { - kswq_t *q[2] = {0, 0}; - kswr_t r; - for (i = 0; i < (int)ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(int)ksq->seq.s[i]]; - if (!forward_only) { // reverse - if ((int)ksq->seq.m > max_rseq) { - max_rseq = ksq->seq.m; - rseq = (uint8_t*)realloc(rseq, max_rseq); - } - for (i = 0, j = ksq->seq.l - 1; i < (int)ksq->seq.l; ++i, --j) - rseq[j] = ksq->seq.s[i] == 4? 4 : 3 - ksq->seq.s[i]; - } - gzrewind(fpt); kseq_rewind(kst); - while (kseq_read(kst) > 0) { - for (i = 0; i < (int)kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(int)kst->seq.s[i]]; - r = ksw_align(ksq->seq.l, (uint8_t*)ksq->seq.s, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[0]); - if (r.score >= minsc) - printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, r.qb, r.qe+1, r.score, r.score2, r.te2); - if (rseq) { - r = ksw_align(ksq->seq.l, rseq, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[1]); - if (r.score >= minsc) - printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, (int)ksq->seq.l - r.qb, (int)ksq->seq.l - 1 - r.qe, r.score, r.score2, r.te2); - } - } - free(q[0]); free(q[1]); - } - free(rseq); - kseq_destroy(kst); gzclose(fpt); - kseq_destroy(ksq); gzclose(fpq); - return 0; -} -#endif diff --git a/ksw.h b/ksw.h deleted file mode 100644 index e1ecf8d..0000000 --- a/ksw.h +++ /dev/null @@ -1,69 +0,0 @@ -#ifndef __AC_KSW_H -#define __AC_KSW_H - -#include - -#define KSW_XBYTE 0x10000 -#define KSW_XSTOP 0x20000 -#define KSW_XSUBO 0x40000 -#define KSW_XSTART 0x80000 - -struct _kswq_t; -typedef struct _kswq_t kswq_t; - -typedef struct { - int score; // best score - int te, qe; // target end and query end - int score2, te2; // second best score and ending position on the target - int tb, qb; // target start and query start -} kswr_t; - -#ifdef __cplusplus -extern "C" { -#endif - - /** - * Aligning two sequences - * - * @param qlen length of the query sequence (typically - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* - An example: - -#include "kvec.h" -int main() { - kvec_t(int) array; - kv_init(array); - kv_push(int, array, 10); // append - kv_a(int, array, 20) = 5; // dynamic - kv_A(array, 20) = 4; // static - kv_destroy(array); - return 0; -} -*/ - -/* - 2008-09-22 (0.1.0): - - * The initial version. - -*/ - -#ifndef AC_KVEC_H -#define AC_KVEC_H - -#include - -#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) - -#define kvec_t(type) struct { size_t n, m; type *a; } -#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0) -#define kv_destroy(v) free((v).a) -#define kv_A(v, i) ((v).a[(i)]) -#define kv_pop(v) ((v).a[--(v).n]) -#define kv_size(v) ((v).n) -#define kv_max(v) ((v).m) - -#define kv_resize(type, v, s) ((v).m = (s), (v).a = (type*)realloc((v).a, sizeof(type) * (v).m)) - -#define kv_copy(type, v1, v0) do { \ - if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \ - (v1).n = (v0).n; \ - memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \ - } while (0) \ - -#define kv_push(type, v, x) do { \ - if ((v).n == (v).m) { \ - (v).m = (v).m? (v).m<<1 : 2; \ - (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ - } \ - (v).a[(v).n++] = (x); \ - } while (0) - -#define kv_pushp(type, v) (((v).n == (v).m)? \ - ((v).m = ((v).m? (v).m<<1 : 2), \ - (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ - : 0), ((v).a + ((v).n++)) - -#define kv_a(type, v, i) (((v).m <= (size_t)(i)? \ - ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \ - (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ - : (v).n <= (size_t)(i)? (v).n = (i) + 1 \ - : 0), (v).a[(i)]) - -#endif diff --git a/seqtk.c b/seqtk.c index e169c9a..5e59b40 100644 --- a/seqtk.c +++ b/seqtk.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 20082-2012 by Heng Li + Copyright (c) 2008-2016 Broad Institute Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -57,9 +57,10 @@ reghash_t *stk_reg_read(const char *fn) int dret; kstring_t *str; // read the list - str = calloc(1, sizeof(kstring_t)); fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); + if (fp == 0) return 0; ks = ks_init(fp); + str = calloc(1, sizeof(kstring_t)); while (ks_getuntil(ks, 0, str, &dret) >= 0) { int beg = -1, end = -1; reglist_t *p; @@ -183,10 +184,13 @@ static void stk_printstr(const kstring_t *s, unsigned line_len) } } -void stk_printseq(const kseq_t *s, int line_len) +static inline void stk_printseq_renamed(const kseq_t *s, int line_len, const char *prefix, int64_t n) { putchar(s->qual.l? '@' : '>'); - fputs(s->name.s, stdout); + if (n >= 0) { + if (prefix) fputs(prefix, stdout); + printf("%lld", (long long)n); + } else fputs(s->name.s, stdout); if (s->comment.l) { putchar(' '); fputs(s->comment.s, stdout); } @@ -197,6 +201,11 @@ void stk_printseq(const kseq_t *s, int line_len) } } +static inline void stk_printseq(const kseq_t *s, int line_len) +{ + stk_printseq_renamed(s, line_len, 0, -1); +} + /* 64-bit Mersenne Twister pseudorandom number generator. Adapted from: @@ -268,40 +277,75 @@ krint64_t kr_rand(krand_t *kr) /* quality based trimming with Mott's algorithm */ int stk_trimfq(int argc, char *argv[]) -{ // FIXME: when a record with zero length will always be treated as a fasta record +{ gzFile fp; kseq_t *seq; double param = 0.05, q_int2real[128]; - int i, c, min_len = 30, left = 0, right = 0; - while ((c = getopt(argc, argv, "l:q:b:e:")) >= 0) { + int i, c, min_len = 30, shortest_len = 1, left = 0, right = 0, left_keep = 0, right_keep = 0; + while ((c = getopt(argc, argv, "l:s:q:b:e:B:E:")) >= 0) { switch (c) { case 'q': param = atof(optarg); break; case 'l': min_len = atoi(optarg); break; + case 's': shortest_len = atoi(optarg); break; case 'b': left = atoi(optarg); break; case 'e': right = atoi(optarg); break; + case 'B': left_keep = atoi(optarg); break; + case 'E': right_keep = atoi(optarg); break; } } if (optind == argc) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: seqtk trimfq [options] \n\n"); - fprintf(stderr, "Options: -q FLOAT error rate threshold (disabled by -b/-e) [%.2f]\n", param); - fprintf(stderr, " -l INT maximally trim down to INT bp (disabled by -b/-e) [%d]\n", min_len); - fprintf(stderr, " -b INT trim INT bp from left (non-zero to disable -q/-l) [0]\n"); - fprintf(stderr, " -e INT trim INT bp from right (non-zero to disable -q/-l) [0]\n"); + fprintf(stderr, "Options: -q FLOAT error rate threshold (disabled by -b/-e/-B/-E) [%.2f]\n", param); + fprintf(stderr, " -l INT maximally trim down to INT bp (disabled by -b/-e/-B/-E) [%d]\n", min_len); + fprintf(stderr, " -s INT trimming by -b/-e/-B/-E shall not produce reads shorter then INT bp [%d]\n", shortest_len); + fprintf(stderr, " -b INT trim INT bp from left (non-zero to disable -q) [0]\n"); + fprintf(stderr, " -e INT trim INT bp from right (non-zero to disable -q) [0]\n"); + fprintf(stderr, " -B INT keep first INT bp from left (non-zero to disable -q/-e/-E) [%d]\n", left_keep); + fprintf(stderr, " -E INT keep last INT bp from right (non-zero to disable -q/-b/-B) [%d]\n", right_keep); +// fprintf(stderr, " -Q force FASTQ output\n"); fprintf(stderr, "\n"); return 1; } fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } seq = kseq_init(fp); for (i = 0; i < 128; ++i) q_int2real[i] = pow(10., -(i - 33) / 10.); while (kseq_read(seq) >= 0) { int beg, tmp, end; double s, max; - if (left || right) { + if (left_keep) { + beg = left; end = left + left_keep; + if (seq->seq.l < end) end = seq->seq.l; + if (seq->seq.l < beg) beg = seq->seq.l; + if (end - beg < shortest_len) { + beg = 0; + end = shortest_len; + if (end > seq->seq.l) end = seq->seq.l; + } + } else if (right_keep) { + beg = seq->seq.l - right_keep - right; end = seq->seq.l - right; + if (beg < 0) beg = 0; + if (end < 0) end = 0; + if (end - beg < shortest_len) { + beg = 0; + end = shortest_len; + if (end > seq->seq.l) end = seq->seq.l; + } + } else if (left || right) { beg = left; end = seq->seq.l - right; - if (beg >= end) beg = end = 0; - } else if (seq->qual.l > min_len) { + if (end < 0) end = 0; + if (seq->seq.l < beg) beg = seq->seq.l; + if (end - beg < shortest_len) { + beg = 0; + end = shortest_len; + if (end > seq->seq.l) end = seq->seq.l; + } + } else if (seq->qual.l > min_len && param != 0.) { for (i = 0, beg = tmp = 0, end = seq->qual.l, s = max = 0.; i < seq->qual.l; ++i) { int q = seq->qual.s[i]; if (q < 36) q = 36; @@ -325,12 +369,12 @@ int stk_trimfq(int argc, char *argv[]) end = beg + min_len; } } else beg = 0, end = seq->seq.l; - putchar(seq->qual.l? '@' : '>'); fputs(seq->name.s, stdout); + putchar(seq->is_fastq? '@' : '>'); fputs(seq->name.s, stdout); if (seq->comment.l) { putchar(' '); puts(seq->comment.s); } else putchar('\n'); fwrite(seq->seq.s + beg, 1, end - beg, stdout); putchar('\n'); - if (seq->qual.l) { + if (seq->is_fastq) { puts("+"); fwrite(seq->qual.s + beg, 1, end - beg, stdout); putchar('\n'); } @@ -345,24 +389,26 @@ int stk_comp(int argc, char *argv[]) { gzFile fp; kseq_t *seq; - int l, c, upper_only = 0, from_stdin; + int l, c, upper_only = 0; reghash_t *h = 0; reglist_t dummy; + while ((c = getopt(argc, argv, "ur:")) >= 0) { switch (c) { case 'u': upper_only = 1; break; case 'r': h = stk_reg_read(optarg); break; } } - from_stdin = !isatty(fileno(stdin)); - if (argc == optind && !from_stdin) { + if (argc == optind && isatty(fileno(stdin))) { fprintf(stderr, "Usage: seqtk comp [-u] [-r in.bed] \n\n"); fprintf(stderr, "Output format: chr, length, #A, #C, #G, #T, #2, #3, #4, #CpG, #tv, #ts, #CpG-ts\n"); return 1; } - if (from_stdin && strcmp(argv[optind], "-") != 0) - fprintf(stderr, "[W::%s] stdin is available; the input file is ignored!\n", __func__); fp = optind < argc && strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } seq = kseq_init(fp); dummy.n= dummy.m = 1; dummy.a = calloc(1, 8); while ((l = kseq_read(seq)) >= 0) { @@ -428,6 +474,10 @@ int stk_randbase(int argc, char *argv[]) return 1; } fp = (strcmp(argv[1], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } seq = kseq_init(fp); while ((l = kseq_read(seq)) >= 0) { int i; @@ -480,6 +530,10 @@ int stk_hety(int argc, char *argv[]) } } fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } seq = kseq_init(fp); win_step = win_size / n_start; buf = calloc(win_size, 1); @@ -538,8 +592,16 @@ int stk_subseq(int argc, char *argv[]) return 1; } h = stk_reg_read(argv[optind+1]); + if (h == 0) { + fprintf(stderr, "[E::%s] failed to read the list of regions in file '%s'\n", __func__, argv[optind+1]); + return 1; + } // subseq fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream\n", __func__); + return 1; + } seq = kseq_init(fp); while ((l = kseq_read(seq)) >= 0) { reglist_t *p; @@ -560,7 +622,7 @@ int stk_subseq(int argc, char *argv[]) if (beg) printf(":%d", beg+1); } else printf(":%d-%d", beg+1, end); } - if (seq->comment.l) printf("\t%s", seq->comment.s); + if (seq->comment.l) printf(" %s", seq->comment.s); } else printf("%s\t%d\t", seq->name.s, beg + 1); if (end > seq->seq.l) end = seq->seq.l; for (j = 0; j < end - beg; ++j) { @@ -617,6 +679,10 @@ int stk_mergefa(int argc, char *argv[]) fp[i] = strcmp(argv[optind+i], "-")? gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r"); seq[i] = kseq_init(fp[i]); } + if (fp[0] == 0 || fp[1] == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } cnt[0] = cnt[1] = cnt[2] = cnt[3] = cnt[4] = 0; srand48(11); while (kseq_read(seq[0]) >= 0) { @@ -688,8 +754,9 @@ int stk_famask(int argc, char *argv[]) { gzFile fp[2]; kseq_t *seq[2]; - int i, l; - if (argc < 3) { + int i, l, c; + while ((c = getopt(argc, argv, "")) >= 0); + if (argc - optind < 2) { fprintf(stderr, "Usage: seqtk famask \n"); return 1; } @@ -697,6 +764,10 @@ int stk_famask(int argc, char *argv[]) fp[i] = strcmp(argv[optind+i], "-")? gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r"); seq[i] = kseq_init(fp[i]); } + if (fp[0] == 0 || fp[1] == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } while (kseq_read(seq[0]) >= 0) { int min_l, c[2]; kseq_read(seq[1]); @@ -736,6 +807,10 @@ int stk_mutfa(int argc, char *argv[]) // read the list str = calloc(1, sizeof(kstring_t)); fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } ks = ks_init(fp); while (ks_getuntil(ks, 0, str, &dret) >= 0) { char *s = strdup(str->s); @@ -765,6 +840,10 @@ int stk_mutfa(int argc, char *argv[]) free(str->s); free(str); // mutfa fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } seq = kseq_init(fp); while ((l = kseq_read(seq)) >= 0) { reglist_t *p; @@ -807,6 +886,10 @@ int stk_listhet(int argc, char *argv[]) return 1; } fp = (strcmp(argv[1], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } seq = kseq_init(fp); while ((l = kseq_read(seq)) >= 0) { for (i = 0; i < l; ++i) { @@ -892,6 +975,10 @@ int stk_cutN(int argc, char *argv[]) return 1; } fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } ks = kseq_init(fp); while ((l = kseq_read(ks)) >= 0) { int k = 0, begin = 0, end = 0; @@ -920,6 +1007,10 @@ int stk_hrun(int argc, char *argv[]) } if (argc == optind + 2) min_len = atoi(argv[optind+1]); fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } ks = kseq_init(fp); while (kseq_read(ks) >= 0) { c = ks->seq.s[0]; l = 1; beg = 0; @@ -996,6 +1087,10 @@ int stk_sample(int argc, char *argv[]) } fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } seq = kseq_init(fp); n_seqs = 0; while (kseq_read(seq) >= 0) { @@ -1027,6 +1122,10 @@ int stk_sample(int argc, char *argv[]) buf = malloc(num * 8); for (i = 0; i < num; ++i) buf[i] = UINT64_MAX; fp = gzopen(argv[optind], "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } seq = kseq_init(fp); n_seqs = 0; while (kseq_read(seq) >= 0) { @@ -1107,14 +1206,14 @@ int stk_seq(int argc, char *argv[]) { gzFile fp; kseq_t *seq; - int c, qual_thres = 0, flag = 0, qual_shift = 33, mask_chr = 0, min_len = 0, from_stdin, max_q = 255; + int c, qual_thres = 0, flag = 0, qual_shift = 33, mask_chr = 0, min_len = 0, max_q = 255; unsigned i, line_len = 0; int64_t n_seqs = 0; double frac = 1.; khash_t(reg) *h = 0; krand_t *kr = 0; - while ((c = getopt(argc, argv, "N12q:l:Q:aACrn:s:f:M:L:cVX:")) >= 0) { + while ((c = getopt(argc, argv, "N12q:l:Q:aACrn:s:f:M:L:cVUX:S")) >= 0) { switch (c) { case 'a': case 'A': flag |= 1; break; @@ -1125,6 +1224,8 @@ int stk_seq(int argc, char *argv[]) case '2': flag |= 32; break; case 'V': flag |= 64; break; case 'N': flag |= 128; break; + case 'U': flag |= 256; break; + case 'S': flag |= 512; break; case 'M': h = stk_reg_read(optarg); break; case 'n': mask_chr = *optarg; break; case 'Q': qual_shift = atoi(optarg); break; @@ -1137,8 +1238,7 @@ int stk_seq(int argc, char *argv[]) } } if (kr == 0) kr = kr_srand(11); - from_stdin = !isatty(fileno(stdin)); - if (argc == optind && !from_stdin) { + if (argc == optind && isatty(fileno(stdin))) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: seqtk seq [options] |\n\n"); fprintf(stderr, "Options: -q INT mask bases with quality lower than INT [0]\n"); @@ -1158,12 +1258,18 @@ int stk_seq(int argc, char *argv[]) fprintf(stderr, " -1 output the 2n-1 reads only\n"); fprintf(stderr, " -2 output the 2n reads only\n"); fprintf(stderr, " -V shift quality by '(-Q) - 33'\n"); + fprintf(stderr, " -U convert all bases to uppercases\n"); + fprintf(stderr, " -S strip of white spaces in sequences\n"); fprintf(stderr, "\n"); free(kr); return 1; } if (line_len == 0) line_len = UINT_MAX; fp = optind < argc && strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } seq = kseq_init(fp); qual_thres += qual_shift; while (kseq_read(seq) >= 0) { @@ -1174,6 +1280,19 @@ int stk_seq(int argc, char *argv[]) if ((flag&16) && (n_seqs&1) == 0) continue; if ((flag&32) && (n_seqs&1) == 1) continue; } + if (flag & 512) { // option -S: squeeze out white spaces + int k; + if (seq->qual.l) { + for (i = k = 0; i < seq->seq.l; ++i) + if (!isspace(seq->seq.s[i])) + seq->qual.s[k++] = seq->qual.s[i]; + seq->qual.l = k; + } + for (i = k = 0; i < seq->seq.l; ++i) + if (!isspace(seq->seq.s[i])) + seq->seq.s[k++] = seq->seq.s[i]; + seq->seq.l = k; + } if (seq->qual.l && qual_thres > qual_shift) { if (mask_chr) { for (i = 0; i < seq->seq.l; ++i) @@ -1185,10 +1304,13 @@ int stk_seq(int argc, char *argv[]) seq->seq.s[i] = tolower(seq->seq.s[i]); } } - if (flag & 1) seq->qual.l = 0; - if (flag & 2) seq->comment.l = 0; + if (flag & 256) // option -U: convert to uppercases + for (i = 0; i < seq->seq.l; ++i) + seq->seq.s[i] = toupper(seq->seq.s[i]); + if (flag & 1) seq->qual.l = 0; // option -a: fastq -> fasta + if (flag & 2) seq->comment.l = 0; // option -C: drop fasta/q comments if (h) stk_mask(seq, h, flag&8, mask_chr); // masking - if (flag & 4) { // reverse complement + if (flag & 4) { // option -r: reverse complement int c0, c1; for (i = 0; i < seq->seq.l>>1; ++i) { // reverse complement sequence c0 = comp_tab[(int)seq->seq.s[i]]; @@ -1206,7 +1328,7 @@ int stk_seq(int argc, char *argv[]) if ((flag & 64) && seq->qual.l && qual_shift != 33) for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= qual_shift - 33; - if (flag & 128) { + if (flag & 128) { // option -N: drop sequences containing ambiguous bases - Note: this is the last step! for (i = 0; i < seq->seq.l; ++i) if (seq_nt16to4_table[seq_nt16_table[(int)seq->seq.s[i]]] > 3) break; if (i < seq->seq.l) continue; @@ -1220,6 +1342,67 @@ int stk_seq(int argc, char *argv[]) return 0; } +int stk_gc(int argc, char *argv[]) +{ + int c, is_at = 0, min_l = 20; + double frac = 0.6f, xdropoff = 10.0f, q; + gzFile fp; + kseq_t *seq; + + while ((c = getopt(argc, argv, "wx:f:l:")) >= 0) { + if (c == 'x') xdropoff = atof(optarg); + else if (c == 'w') is_at = 1; + else if (c == 'f') frac = atof(optarg); + else if (c == 'l') min_l = atoi(optarg); + } + if (optind + 1 > argc) { + fprintf(stderr, "Usage: seqtk gc [options] \n"); + fprintf(stderr, "Options:\n"); + fprintf(stderr, " -w identify high-AT regions\n"); + fprintf(stderr, " -f FLOAT min GC fraction (or AT fraction for -w) [%.2f]\n", frac); + fprintf(stderr, " -l INT min region length to output [%d]\n", min_l); + fprintf(stderr, " -x FLOAT X-dropoff [%.1f]\n", xdropoff); + return 1; + } + q = (1.0f - frac) / frac; + + fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } + seq = kseq_init(fp); + while (kseq_read(seq) >= 0) { + int i, start = 0, max_i = 0, n_hits = 0, start_hits = 0, max_hits = 0; + double sc = 0., max = 0.; + for (i = 0; i < seq->seq.l; ++i) { + int hit; + c = seq_nt16_table[(int)seq->seq.s[i]]; + if (is_at) hit = (c == 1 || c == 8 || c == 9); + else hit = (c == 2 || c == 4 || c == 6); + n_hits += hit; + if (hit) { + if (sc == 0) start = i, start_hits = n_hits; + sc += q; + if (sc > max) max = sc, max_i = i, max_hits = n_hits; + } else if (sc > 0) { + sc += -1.0f; + if (sc < 0 || max - sc > xdropoff) { + if (max_i + 1 - start >= min_l) + printf("%s\t%d\t%d\t%d\n", seq->name.s, start, max_i + 1, max_hits - start_hits + 1); + sc = max = 0; + i = max_i; + } + } + } + if (max > 0. && max_i + 1 - start >= min_l) + printf("%s\t%d\t%d\t%d\n", seq->name.s, start, max_i + 1, max_hits - start_hits + 1); + } + kseq_destroy(seq); + gzclose(fp); + return 0; +} + int stk_mergepe(int argc, char *argv[]) { gzFile fp1, fp2; @@ -1231,6 +1414,10 @@ int stk_mergepe(int argc, char *argv[]) } fp1 = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); fp2 = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r"); + if (fp1 == 0 || fp2 == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } seq[0] = kseq_init(fp1); seq[1] = kseq_init(fp2); while (kseq_read(seq[0]) >= 0) { @@ -1252,16 +1439,16 @@ int stk_dropse(int argc, char *argv[]) { gzFile fp; kseq_t *seq, last; - int from_stdin; - from_stdin = !isatty(fileno(stdin)); - if (argc == 1 && !from_stdin) { - fprintf(stderr, "Usage: seqtk dropSE \n"); + if (argc == 1 && isatty(fileno(stdin))) { + fprintf(stderr, "Usage: seqtk dropse \n"); return 1; } - if (from_stdin && argc != 1 && strcmp(argv[1], "-") != 0) - fprintf(stderr, "[W::%s] stdin is available; the input file is ignored!\n", __func__); fp = argc > 1 && strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } seq = kseq_init(fp); memset(&last, 0, sizeof(kseq_t)); @@ -1287,6 +1474,53 @@ int stk_dropse(int argc, char *argv[]) return 0; } +int stk_rename(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq, last; + char *prefix = 0; + uint64_t n = 1; + + if (argc == 1 && isatty(fileno(stdin))) { + fprintf(stderr, "Usage: seqtk rename [prefix]\n"); + return 1; + } + fp = argc > 1 && strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } + seq = kseq_init(fp); + if (argc > 2) prefix = argv[2]; + + memset(&last, 0, sizeof(kseq_t)); + while (kseq_read(seq) >= 0) { + if (last.name.l) { + kstring_t *p = &last.name, *q = &seq->name; + int is_diff; + if (p->l == q->l) { + int l = (p->l > 2 && p->s[p->l-2] == '/' && q->s[q->l-2] == '/' && isdigit(p->s[p->l-1]) && isdigit(q->s[q->l-1]))? p->l - 2 : p->l; + is_diff = strncmp(p->s, q->s, l); + } else is_diff = 1; + if (!is_diff) { + stk_printseq_renamed(&last, 0, prefix, n); + stk_printseq_renamed(seq, 0, prefix, n); + last.name.l = 0; + ++n; + } else { + stk_printseq_renamed(&last, 0, prefix, n); + ++n; + cpy_kseq(&last, seq); + } + } else cpy_kseq(&last, seq); + } + if (last.name.l) stk_printseq_renamed(&last, 0, prefix, n); + + kseq_destroy(seq); + gzclose(fp); + // free last! + return 0; +} int stk_kfreq(int argc, char *argv[]) { @@ -1319,6 +1553,10 @@ int stk_kfreq(int argc, char *argv[]) } fp = argc == 2 || strcmp(argv[2], "-") == 0? gzdopen(fileno(stdin), "r") : gzopen(argv[2], "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } ks = kseq_init(fp); while (kseq_read(ks) >= 0) { int k, x[2], cnt[2], cnt_nei[2], which; @@ -1345,22 +1583,129 @@ int stk_kfreq(int argc, char *argv[]) return 0; } +/* fqchk */ + +typedef struct { + int64_t q[94], b[5]; +} posstat_t; + +static void fqc_aux(posstat_t *p, int pos, int64_t allq[94], double perr[94], int qthres) +{ + int k; + int64_t sum = 0, qsum = 0, sum_low = 0; + double psum = 0; + if (pos <= 0) printf("ALL"); + else printf("%d", pos); + for (k = 0; k <= 4; ++k) sum += p->b[k]; + printf("\t%lld", (long long)sum); + for (k = 0; k <= 4; ++k) + printf("\t%.1f", 100. * p->b[k] / sum); + for (k = 0; k <= 93; ++k) { + qsum += p->q[k] * k, psum += p->q[k] * perr[k]; + if (k < qthres) sum_low += p->q[k]; + } + printf("\t%.1f\t%.1f", (double)qsum/sum, -4.343*log((psum+1e-6)/(sum+1e-6))); + if (qthres <= 0) { + for (k = 0; k <= 93; ++k) + if (allq[k] > 0) printf("\t%.2f", 100. * p->q[k] / sum); + } else printf("\t%.1f\t%.1f", 100. * sum_low / sum, 100. * (sum - sum_low) / sum); + putchar('\n'); +} + +int stk_fqchk(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq; + int i, c, k, max_len = 0, min_len = 0x7fffffff, max_alloc = 0, offset = 33, n_diffQ = 0, qthres = 20; + int64_t tot_len = 0, n = 0; + double perr[94]; + posstat_t all, *pos = 0; + + while ((c = getopt(argc, argv, "q:")) >= 0) + if (c == 'q') qthres = atoi(optarg); + + if (optind == argc) { + fprintf(stderr, "Usage: seqtk fqchk [-q %d] \n", qthres); + fprintf(stderr, "Note: use -q0 to get the distribution of all quality values\n"); + return 1; + } + fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } + seq = kseq_init(fp); + for (k = 0; k <= 93; ++k) + perr[k] = pow(10., -.1 * k); + perr[0] = perr[1] = perr[2] = perr[3] = .5; + while (kseq_read(seq) >= 0) { + if (seq->qual.l == 0) continue; + ++n; + tot_len += seq->seq.l; + min_len = min_len < seq->seq.l? min_len : seq->seq.l; + max_len = max_len > seq->seq.l? max_len : seq->seq.l; + if (max_len > max_alloc) { + int old_max = max_alloc; + max_alloc = max_len; + kroundup32(max_alloc); + pos = realloc(pos, max_alloc * sizeof(posstat_t)); + memset(&pos[old_max], 0, (max_alloc - old_max) * sizeof(posstat_t)); + } + for (i = 0; i < seq->qual.l; ++i) { + int q = seq->qual.s[i] - offset; + int b = seq_nt6_table[(int)seq->seq.s[i]]; + b = b? b - 1 : 4; + q = q < 93? q : 93; + ++pos[i].q[q]; + ++pos[i].b[b]; + } + } + kseq_destroy(seq); + gzclose(fp); + + memset(&all, 0, sizeof(posstat_t)); + for (i = 0; i < max_len; ++i) { + for (k = 0; k <= 93; ++k) + all.q[k] += pos[i].q[k]; + for (k = 0; k <= 4; ++k) + all.b[k] += pos[i].b[k]; + } + for (k = n_diffQ = 0; k <= 93; ++k) + if (all.q[k]) ++n_diffQ; + printf("min_len: %d; max_len: %d; avg_len: %.2f; %d distinct quality values\n", min_len, max_len, (double)tot_len/n, n_diffQ); + printf("POS\t#bases\t%%A\t%%C\t%%G\t%%T\t%%N\tavgQ\terrQ"); + if (qthres <= 0) { + for (k = 0; k <= 93; ++k) + if (all.q[k] > 0) printf("\t%%Q%d", k); + } else printf("\t%%low\t%%high"); + putchar('\n'); + fqc_aux(&all, 0, all.q, perr, qthres); + for (i = 0; i < max_len; ++i) + fqc_aux(&pos[i], i + 1, all.q, perr, qthres); + free(pos); + return 0; +} + /* main function */ static int usage() { fprintf(stderr, "\n"); fprintf(stderr, "Usage: seqtk \n"); - fprintf(stderr, "Version: 1.0-r68-dirty\n\n"); + fprintf(stderr, "Version: 1.2-r101b-dirty\n\n"); fprintf(stderr, "Command: seq common transformation of FASTA/Q\n"); fprintf(stderr, " comp get the nucleotide composition of FASTA/Q\n"); fprintf(stderr, " sample subsample sequences\n"); fprintf(stderr, " subseq extract subsequences from FASTA/Q\n"); + fprintf(stderr, " fqchk fastq QC (base/quality summary)\n"); fprintf(stderr, " mergepe interleave two PE FASTA/Q files\n"); fprintf(stderr, " trimfq trim FASTQ using the Phred algorithm\n\n"); fprintf(stderr, " hety regional heterozygosity\n"); + fprintf(stderr, " gc identify high- or low-GC regions\n"); fprintf(stderr, " mutfa point mutate FASTA at specified positions\n"); fprintf(stderr, " mergefa merge two FASTA/Q files\n"); + fprintf(stderr, " famask apply a X-coded FASTA to a source FASTA\n"); fprintf(stderr, " dropse drop unpaired from interleaved PE FASTA/Q\n"); + fprintf(stderr, " rename rename sequence names\n"); fprintf(stderr, " randbase choose a random base from hets\n"); fprintf(stderr, " cutN cut sequence at long N\n"); fprintf(stderr, " listhet extract the position of each het\n"); @@ -1372,7 +1717,9 @@ int main(int argc, char *argv[]) { if (argc == 1) return usage(); if (strcmp(argv[1], "comp") == 0) stk_comp(argc-1, argv+1); + else if (strcmp(argv[1], "fqchk") == 0) stk_fqchk(argc-1, argv+1); else if (strcmp(argv[1], "hety") == 0) stk_hety(argc-1, argv+1); + else if (strcmp(argv[1], "gc") == 0) stk_gc(argc-1, argv+1); else if (strcmp(argv[1], "subseq") == 0) stk_subseq(argc-1, argv+1); else if (strcmp(argv[1], "mutfa") == 0) stk_mutfa(argc-1, argv+1); else if (strcmp(argv[1], "mergefa") == 0) stk_mergefa(argc-1, argv+1); @@ -1387,8 +1734,9 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "sample") == 0) stk_sample(argc-1, argv+1); else if (strcmp(argv[1], "seq") == 0) stk_seq(argc-1, argv+1); else if (strcmp(argv[1], "kfreq") == 0) stk_kfreq(argc-1, argv+1); + else if (strcmp(argv[1], "rename") == 0) stk_rename(argc-1, argv+1); else { - fprintf(stderr, "[main] unrecognized commad '%s'. Abort!\n", argv[1]); + fprintf(stderr, "[main] unrecognized command '%s'. Abort!\n", argv[1]); return 1; } return 0; diff --git a/trimadap.c b/trimadap.c deleted file mode 100644 index b968c81..0000000 --- a/trimadap.c +++ /dev/null @@ -1,184 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "ksw.h" -#include "kseq.h" -KSEQ_INIT(gzFile, gzread) - -unsigned char seq_nt4_table[256] = { - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 -}; - -typedef struct { - int type, len; - uint8_t *seq; - kswq_t *qp; - uint64_t cnt; -} ta_adap_t; - -int main(int argc, char *argv[]) -{ - int n_adaps, m_adaps; - int c, i, j, k, from_stdin; - int sa = 1, sb = 2, go = 1, ge = 3, type = 1; - int min_sc = 15, min_len = 10; - double max_diff = 0.15; - ta_adap_t *adaps; - kseq_t *ks; - gzFile fp; - int8_t mat[25]; - kstring_t str = {0,0,0}; - - n_adaps = m_adaps = 0; adaps = 0; - while ((c = getopt(argc, argv, "5:3:s:t:l:")) >= 0) { - if (c == '5' || c == '3') { - ta_adap_t *p; - if (m_adaps == n_adaps) { - m_adaps = m_adaps? m_adaps<<1 : 4; - adaps = realloc(adaps, m_adaps * sizeof(ta_adap_t)); - } - p = &adaps[n_adaps++]; - p->seq = (uint8_t*)strdup(optarg); - p->type = c - '0'; - } else if (c == 't') { - if (strcmp(optarg, "ilpe") == 0) type = 1; - } else if (c == 's') min_sc = atoi(optarg); - else if (c == 'd') max_diff = atof(optarg); - else if (c == 'l') min_len = atoi(optarg); - } - - // preset - if (type == 1 && n_adaps == 0) { - m_adaps = n_adaps = 3; - adaps = malloc(m_adaps * sizeof(ta_adap_t)); - adaps[0].type = 5; adaps[0].seq = (uint8_t*)strdup("AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT"); - adaps[1].type = 3; adaps[1].seq = (uint8_t*)strdup("AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"); - adaps[2].type = 3; adaps[2].seq = (uint8_t*)strdup("ATCTCGTATGCCGTCTTCTGCTTG"); - } - - // update adapter info - for (j = 0; j < n_adaps; ++j) { - ta_adap_t *p = &adaps[j]; - p->len = strlen((char*)p->seq); - p->qp = 0; - p->cnt = 0; - for (i = 0; i < p->len; ++i) - p->seq[i] = seq_nt4_table[(uint8_t)p->seq[i]]; - } - - from_stdin = !isatty(fileno(stdin)); - if (optind == argc && !from_stdin) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: trimadap [options] \n\n"); - fprintf(stderr, "Options: -5 STR 5'-end adapter\n"); - fprintf(stderr, " -3 STR 3'-end adapter\n"); - fprintf(stderr, " -l INT min length [%d]\n", min_len); - fprintf(stderr, " -s INT min score [%d]\n", min_sc); - fprintf(stderr, " -d FLOAT max difference [%.3f]\n", max_diff); - fprintf(stderr, "\n"); - return 1; // FIXME: memory leak - } - - for (i = k = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - mat[k++] = i == j? sa : -sb; - mat[k++] = 0; // ambiguous base - } - for (j = 0; j < 5; ++j) mat[k++] = 0; - - fp = optind < argc && strcmp(argv[optind], "-")? gzopen(argv[optind], "rb") : gzdopen(fileno(stdin), "rb"); - ks = kseq_init(fp); - while (kseq_read(ks) >= 0) { - if (str.m < ks->seq.m) { - str.m = ks->seq.m; - str.s = realloc(str.s, str.m); - } - str.l = ks->seq.l; - for (i = 0; i < ks->seq.l; ++i) - str.s[i] = seq_nt4_table[(uint8_t)ks->seq.s[i]]; - for (j = 0; j < n_adaps; ++j) { - kswr_t r; - double diff; - int type; - ta_adap_t *p = &adaps[j]; - r = ksw_align(p->len, p->seq, str.l, (uint8_t*)str.s, 5, mat, go, ge, KSW_XBYTE|KSW_XSTART|(min_len*sa), &p->qp); - ++r.te; ++r.qe; // change to 0-based - k = r.qe - r.qb < r.te - r.tb? r.qe - r.qb : r.te - r.tb; - diff = (double)(k * sa - r.score) / sb / k; - //printf("%d:%.3f [%d,%d):%d <=> [%d,%d):%d\n", r.score, diff, r.qb, r.qe, p->len, r.tb, r.te, (int)str.l); - if (r.qb <= r.tb && p->len - r.qe <= str.l - r.te) { // contained - if (r.qb * sa > sa + sb) continue; - if ((p->len - r.qe) * sa > sa + sb) continue; - type = 1; - } else if (r.qb <= r.tb) { // 3' overlap - if (r.qb * sa > sa + sb) continue; - if ((str.l - r.te) * sa > sa + sb) continue; - type = 2; - } else { - if ((p->len - r.qe) * sa > sa + sb) continue; - if (r.tb * sa > sa + sb) continue; - type = 3; - } - if (p->type == 5) { - if (r.tb == 0 && r.qe == p->len && (r.te - r.tb) * sa == r.score) - type = 4; - } else if (p->type == 3) { - if (r.qb == 0 && r.te == str.l && (r.te - r.tb) * sa == r.score) - type = 4; - } - if (type == 4) { - if (r.te - r.tb < min_len) continue; - } else { - if (r.score < min_sc || diff > max_diff) continue; - } - ++p->cnt; - if (p->type == 5) { - k = r.te + (p->len - r.qe); - k = k < str.l? k : str.l; - for (i = 0; i < k; ++i) ks->seq.s[i] = 'X'; - } else if (p->type == 3) { - k = r.tb > r.qb? r.tb - r.qb : 0; - for (i = k; i < str.l; ++i) ks->seq.s[i] = 'X'; - } - } - putchar(ks->qual.l? '@' : '>'); - puts(ks->name.s); - puts(ks->seq.s); - if (ks->qual.l) { - puts("+"); - puts(ks->qual.s); - } - } - free(str.s); - kseq_destroy(ks); - gzclose(fp); - - for (j = 0; j < n_adaps; ++j) { - ta_adap_t *p = &adaps[j]; - fprintf(stderr, "%-15ld ", (long)p->cnt); - for (i = 0; i < p->len; ++i) fputc("ACGTN"[(int)p->seq[i]], stderr); - fputc('\n', stderr); - free(p->seq); - free(p->qp); - } - free(adaps); - return 0; -}