diff --git a/Makefile b/Makefile index 0bbb078d6..b59a443f3 100644 --- a/Makefile +++ b/Makefile @@ -267,6 +267,7 @@ NONCONFIGURE_OBJS = hfile_libcurl.o PLUGIN_EXT = PLUGIN_OBJS = +bgzf_internal_h = bgzf_internal.h $(htslib_bgzf_h) cram_h = cram/cram.h $(cram_samtools_h) $(header_h) $(cram_structs_h) $(cram_io_h) cram/cram_encode.h cram/cram_decode.h cram/cram_stats.h cram/cram_codecs.h cram/cram_index.h $(htslib_cram_h) cram_io_h = cram/cram_io.h $(cram_misc_h) cram_misc_h = cram/misc.h @@ -487,7 +488,7 @@ hts-object-files: $(LIBHTS_OBJS) $(CC) -shared $(LDFLAGS) -o $@ $< hts.dll.a $(LIBS) -bgzf.o bgzf.pico: bgzf.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_thread_pool_h) $(htslib_hts_endian_h) cram/pooled_alloc.h $(hts_internal_h) $(htslib_khash_h) +bgzf.o bgzf.pico: bgzf.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_thread_pool_h) $(htslib_hts_endian_h) cram/pooled_alloc.h $(hts_internal_h) $(bgzf_internal_h) $(htslib_khash_h) errmod.o errmod.pico: errmod.c config.h $(htslib_hts_h) $(htslib_ksort_h) $(htslib_hts_os_h) kstring.o kstring.pico: kstring.c config.h $(htslib_kstring_h) header.o header.pico: header.c config.h $(textutils_internal_h) $(header_h) @@ -499,7 +500,7 @@ hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h hts.o hts.pico: hts.c config.h os/lzma_stub.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h config_vars.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_expr_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) $(htscodecs_htscodecs_h) hts_expr.o hts_expr.pico: hts_expr.c config.h $(htslib_hts_expr_h) $(htslib_hts_log_h) $(textutils_internal_h) hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c -vcf.o vcf.pico: vcf.c config.h $(fuzz_settings_h) $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) +vcf.o vcf.pico: vcf.c config.h $(fuzz_settings_h) $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) $(bgzf_internal_h) sam.o sam.pico: sam.c config.h $(fuzz_settings_h) $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(htslib_hts_expr_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) sam_mods.o sam_mods.pico: sam_mods.c config.h $(htslib_sam_h) $(textutils_internal_h) simd.o simd.pico: simd.c config.h $(htslib_sam_h) $(sam_internal_h) diff --git a/bgzf.c b/bgzf.c index 8cf3b7828..cb280268f 100644 --- a/bgzf.c +++ b/bgzf.c @@ -48,12 +48,13 @@ #include "htslib/hts_endian.h" #include "cram/pooled_alloc.h" #include "hts_internal.h" +#include "bgzf_internal.h" +#include "htslib/khash.h" #ifndef EFTYPE #define EFTYPE ENOEXEC #endif -#define BGZF_CACHE #define BGZF_MT #define BLOCK_HEADER_LENGTH 18 @@ -76,21 +77,15 @@ */ static const uint8_t g_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0"; -#ifdef BGZF_CACHE typedef struct { int size; uint8_t *block; int64_t end_offset; } cache_t; -#include "htslib/khash.h" -KHASH_MAP_INIT_INT64(cache, cache_t) -#endif +KHASH_MAP_INIT_INT64(bgzf_cache, cache_t) -struct bgzf_cache_t { - khash_t(cache) *h; - khint_t last_pos; -}; +// struct bgzf_cache_t is defined in bgzf_internal.h #ifdef BGZF_MT @@ -409,20 +404,21 @@ static BGZF *bgzf_read_init(hFILE *hfpr, const char *filename) errno = EFTYPE; return NULL; } -#ifdef BGZF_CACHE + if (!(fp->cache = malloc(sizeof(*fp->cache)))) { free(fp->uncompressed_block); free(fp); return NULL; } - if (!(fp->cache->h = kh_init(cache))) { + if (!(fp->cache->h = kh_init(bgzf_cache))) { free(fp->uncompressed_block); free(fp->cache); free(fp); return NULL; } fp->cache->last_pos = 0; -#endif + fp->cache->private_data = NULL; + fp->cache->private_data_cleanup = (bgzf_private_data_cleanup_func *) NULL; return fp; } @@ -442,6 +438,15 @@ static BGZF *bgzf_write_init(const char *mode) fp = (BGZF*)calloc(1, sizeof(BGZF)); if (fp == NULL) goto mem_fail; fp->is_write = 1; + + fp->cache = malloc(sizeof(bgzf_cache_t)); + if (!fp->cache) + goto mem_fail; + fp->cache->h = NULL; + fp->cache->last_pos = 0; + fp->cache->private_data = NULL; + fp->cache->private_data_cleanup = (bgzf_private_data_cleanup_func *) NULL; + int compress_level = mode2level(mode); if ( compress_level==-2 ) { @@ -479,6 +484,7 @@ static BGZF *bgzf_write_init(const char *mode) fail: if (fp != NULL) { + free(fp->cache); free(fp->uncompressed_block); free(fp->gz_stream); free(fp); @@ -896,15 +902,16 @@ static int check_header(const uint8_t *header) && unpackInt16((uint8_t*)&header[14]) == 2) ? 0 : -1; } -#ifdef BGZF_CACHE static void free_cache(BGZF *fp) { khint_t k; - if (fp->is_write) return; - khash_t(cache) *h = fp->cache->h; - for (k = kh_begin(h); k < kh_end(h); ++k) - if (kh_exist(h, k)) free(kh_val(h, k).block); - kh_destroy(cache, h); + if (fp->cache->h) { + khash_t(bgzf_cache) *h = fp->cache->h; + for (k = kh_begin(h); k < kh_end(h); ++k) + if (kh_exist(h, k)) free(kh_val(h, k).block); + kh_destroy(bgzf_cache, h); + } + bgzf_clear_private_data(fp); free(fp->cache); } @@ -913,8 +920,8 @@ static int load_block_from_cache(BGZF *fp, int64_t block_address) khint_t k; cache_t *p; - khash_t(cache) *h = fp->cache->h; - k = kh_get(cache, h, block_address); + khash_t(bgzf_cache) *h = fp->cache->h; + k = kh_get(bgzf_cache, h, block_address); if (k == kh_end(h)) return 0; p = &kh_val(h, k); if (fp->block_length != 0) fp->block_offset = 0; @@ -937,7 +944,7 @@ static void cache_block(BGZF *fp, int size) uint8_t *block = NULL; cache_t *p; //fprintf(stderr, "Cache block at %llx\n", (int)fp->block_address); - khash_t(cache) *h = fp->cache->h; + khash_t(bgzf_cache) *h = fp->cache->h; if (BGZF_MAX_BLOCK_SIZE >= fp->cache_size) return; if (fp->block_length < 0 || fp->block_length > BGZF_MAX_BLOCK_SIZE) return; if ((kh_size(h) + 1) * BGZF_MAX_BLOCK_SIZE > (uint32_t)fp->cache_size) { @@ -959,13 +966,13 @@ static void cache_block(BGZF *fp, int size) if (k != k_orig) { block = kh_val(h, k).block; - kh_del(cache, h, k); + kh_del(bgzf_cache, h, k); } } else { block = (uint8_t*)malloc(BGZF_MAX_BLOCK_SIZE); } if (!block) return; - k = kh_put(cache, h, fp->block_address, &ret); + k = kh_put(bgzf_cache, h, fp->block_address, &ret); if (ret <= 0) { // kh_put failed, or in there already (shouldn't happen) free(block); return; @@ -976,11 +983,6 @@ static void cache_block(BGZF *fp, int size) p->block = block; memcpy(p->block, fp->uncompressed_block, p->size); } -#else -static void free_cache(BGZF *fp) {} -static int load_block_from_cache(BGZF *fp, int64_t block_address) {return 0;} -static void cache_block(BGZF *fp, int size) {} -#endif /* * Absolute htell in this compressed file. diff --git a/bgzf_internal.h b/bgzf_internal.h new file mode 100644 index 000000000..b3a790df6 --- /dev/null +++ b/bgzf_internal.h @@ -0,0 +1,70 @@ +/* bgzf_internal.h -- internal bgzf functions; not part of the public API. + + Copyright (C) 2025 Genome Research Ltd. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include +#include "htslib/bgzf.h" + +/* + * BGZF private data interface + * This exists so that we can pass BCF headers into interfaces that have + * traditionally only taken a BGZF pointer without a corresponding bcf_hdr_t *, + * notably the bcf_readrec() function used by BCF iterators. + * + * To preserve the BGZF API and ABI, this is tagged on to the existing + * opaque bgzf_cache_t structure. bgzf_cache_t is now defined here so we can + * inline lookups. + */ + +typedef void bgzf_private_data_cleanup_func(void *private_data); + +struct kh_bgzf_cache_s; + +struct bgzf_cache_t { + struct kh_bgzf_cache_s *h; + unsigned int last_pos; + void *private_data; + bgzf_private_data_cleanup_func *private_data_cleanup; +}; + +// Set private data. cleanup will be called on bgzf_close() or +// bgzf_clear_private_data(); + +static inline void bgzf_set_private_data(BGZF *fp, void *private_data, + bgzf_private_data_cleanup_func *fn) { + assert(fp->cache != NULL); + fp->cache->private_data = private_data; + fp->cache->private_data_cleanup = fn; +} + +static inline void bgzf_clear_private_data(BGZF *fp) { + assert(fp->cache != NULL); + if (fp->cache->private_data) { + if (fp->cache->private_data_cleanup) + fp->cache->private_data_cleanup(fp->cache->private_data); + fp->cache->private_data = NULL; + } +} + +static inline void * bgzf_get_private_data(BGZF *fp) { + assert(fp->cache != NULL); + return fp->cache->private_data; +} diff --git a/vcf.c b/vcf.c index cae3ba97f..3dab369d0 100644 --- a/vcf.c +++ b/vcf.c @@ -51,6 +51,7 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/kstring.h" #include "htslib/sam.h" #include "htslib/khash.h" +#include "bgzf_internal.h" #if 0 // This helps on Intel a bit, often 6-7% faster VCF parsing. @@ -117,6 +118,7 @@ typedef struct hdict_t *gen; // hdict_t dictionary which keeps bcf_hrec_t* pointers for generic and structured fields size_t *key_len;// length of h->id[BCF_DT_ID] strings int version; //cached version + uint32_t ref_count; // reference count, low bit indicates bcf_hdr_destroy() has been called } bcf_hdr_aux_t; @@ -189,6 +191,30 @@ static int bcf_get_version(const bcf_hdr_t *hdr, const char *verstr) return VCF_DEF; } +// Header reference counting + +static void bcf_hdr_incr_ref(bcf_hdr_t *h) +{ + bcf_hdr_aux_t *aux = get_hdr_aux(h); + aux->ref_count += 2; +} + +static void bcf_hdr_decr_ref(bcf_hdr_t *h) +{ + bcf_hdr_aux_t *aux = get_hdr_aux(h); + if (aux->ref_count >= 2) + aux->ref_count -= 2; + + if (aux->ref_count == 0) + bcf_hdr_destroy(h); +} + +static void hdr_bgzf_private_data_cleanup(void *data) +{ + bcf_hdr_t *h = (bcf_hdr_t *) data; + bcf_hdr_decr_ref(h); +} + static char *find_chrom_header_line(char *s) { char *nl; @@ -1498,6 +1524,7 @@ bcf_hdr_t *bcf_hdr_init(const char *mode) aux->key_len = NULL; aux->dict = *((vdict_t*)h->dict[0]); aux->version = 0; + aux->ref_count = 1; free(h->dict[0]); h->dict[0] = aux; @@ -1522,6 +1549,12 @@ void bcf_hdr_destroy(bcf_hdr_t *h) int i; khint_t k; if (!h) return; + bcf_hdr_aux_t *aux = get_hdr_aux(h); + if (aux->ref_count > 1) // Refs still held, so delay destruction + { + aux->ref_count &= ~1; + return; + } for (i = 0; i < 3; ++i) { vdict_t *d = (vdict_t*)h->dict[i]; if (d == 0) continue; @@ -1529,7 +1562,6 @@ void bcf_hdr_destroy(bcf_hdr_t *h) if (kh_exist(d, k)) free((char*)kh_key(d, k)); if ( i==0 ) { - bcf_hdr_aux_t *aux = get_hdr_aux(h); for (k=kh_begin(aux->gen); kgen); k++) if ( kh_exist(aux->gen,k) ) free((char*)kh_key(aux->gen,k)); kh_destroy(hdict, aux->gen); @@ -1597,6 +1629,10 @@ bcf_hdr_t *bcf_hdr_read(htsFile *hfp) htxt[hlen] = '\0'; // Ensure htxt is terminated if ( bcf_hdr_parse(h, htxt) < 0 ) goto fail; free(htxt); + + bcf_hdr_incr_ref(h); + bgzf_set_private_data(fp, h, hdr_bgzf_private_data_cleanup); + return h; fail: hts_log_error("Failed to read BCF header"); @@ -1638,6 +1674,9 @@ int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h) if ( bgzf_write(fp, htxt.s, htxt.l) != htxt.l ) return -1; if ( bgzf_flush(fp) < 0) return -1; + bcf_hdr_incr_ref(h); + bgzf_set_private_data(fp, h, hdr_bgzf_private_data_cleanup); + free(htxt.s); return 0; } @@ -1818,42 +1857,44 @@ static const char *get_type_name(int type) { static int updatephasing(uint8_t *p, uint8_t *end, uint8_t **q, int samples, int ploidy, int type) { int j, k; - size_t bytes; + unsigned int inc = 1 << bcf_type_shift[type]; + ptrdiff_t bytes = samples * ploidy * inc; - #define NOSWITCHFOR(type_t, rawtype_t,convert, reconvert, p, end, samples, ploidy, consumed) \ - for (j = 0; j < samples; ++j) { /*for each sample */ \ - int anyunphased = 0; \ - uint8_t *ptr1 = p; \ - rawtype_t al1 = 0, val; \ - for (k = 0; k < ploidy; ++k) { /*in each ploidy */ \ - val = convert(p); \ - if (!k) al1 = val; \ - else if (!(val & 1)) anyunphased = 1; \ - /*get to next phasing or skip the rest for this sample*/ \ - *consumed = ((anyunphased || al1 & 1) ? ploidy - k : 1) << bcf_type_shift[type_t]; \ - if (end - p < *consumed) return 1; \ - p += *consumed; \ - if (anyunphased || al1 & 1) break; \ - } \ - if (!anyunphased && al1 > 1) { /*no other unphased*/ \ - /*set phased on read or make unphased on write as upto 4.3 1st - phasing is not described explicitly and has to be inferred*/ \ - al1 |= 1; \ - reconvert(convert((uint8_t*)&al1), ptr1); \ - } \ - } - #define i8_to_le(v, buf) *((uint8_t*)buf) = v; - switch(type) { - case BCF_BT_INT8: - NOSWITCHFOR(BCF_BT_INT8, uint8_t, le_to_i8, i8_to_le, p, end, samples, ploidy, &bytes); - break; - case BCF_BT_INT16: - NOSWITCHFOR(BCF_BT_INT16, uint16_t, le_to_i16, i16_to_le, p, end, samples, ploidy, &bytes); + if (samples < 0 || ploidy < 0 || end - p < bytes) + return 1; + + /* + * This works because phasing is stored in the least-significant bit + * of the GT encoding, and the data is always stored little-endian. + * Thus it's possible to get the desired result by doing bit operations + * on the least-significant byte of each value and ignoring the + * higher bytes (for 16-bit and 32-bit values). + */ + + switch (ploidy) { + case 1: + // Trivial case - haploid data is phased by default + for (j = 0; j < samples; ++j) { + *p |= 1; + p += inc; + } break; - case BCF_BT_INT32: - NOSWITCHFOR(BCF_BT_INT32, uint32_t, le_to_i32, i32_to_le, p, end, samples, ploidy, &bytes); + case 2: + // Mostly trivial case - first is phased if second is. + for (j = 0; j < samples; ++j) { + *p |= (p[inc] & 1); + p += 2 * inc; + } break; - //no other types are valid for phasing. + default: + // Generic case - first is phased if all other alleles are. + for (j = 0; j < samples; ++j) { + uint8_t allphased = 1; + for (k = 1; k < ploidy; ++k) + allphased &= (p[inc * k]); + *p |= allphased; + p += ploidy * inc; + } } *q = p; return 0; @@ -1944,6 +1985,7 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) { (1 << BCF_BT_FLOAT) | (1 << BCF_BT_CHAR)); int32_t max_id = hdr ? hdr->n[BCF_DT_ID] : 0; + int idgt = hdr ? bcf_hdr_id2int(hdr, BCF_DT_ID, "GT") : -1; // Check for valid contig ID if (rec->rid < 0 @@ -2053,9 +2095,16 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) { bcf_record_check_err(hdr, rec, "type", &reports, type); err |= BCF_ERR_TAG_INVALID; } - bytes = ((size_t) num << bcf_type_shift[type]) * rec->n_sample; - if (end - ptr < bytes) goto bad_indiv; - ptr += bytes; + if (idgt >= 0 && idgt == key) { + // check first GT phasing bit and fix up if necessary + if (updatephasing(ptr, end, &ptr, rec->n_sample, num, type)) { + err |= BCF_ERR_TAG_INVALID; + } + } else { + bytes = ((size_t) num << bcf_type_shift[type]) * rec->n_sample; + if (end - ptr < bytes) goto bad_indiv; + ptr += bytes; + } } if (!err && rec->rlen < 0) { @@ -2130,7 +2179,9 @@ int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec) int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) { - if (fp->format.format == vcf) return vcf_read(fp,h,v); + if (fp->format.format == vcf) return vcf_read(fp, h, v); + if (!h) + h = (const bcf_hdr_t *) bgzf_get_private_data(fp->fp.bgzf); int ret = bcf_read1_core(fp->fp.bgzf, v); if (ret == 0) ret = bcf_record_check(h, v); if ( ret!=0 || !h->keep_samples ) return ret; @@ -2140,8 +2191,9 @@ int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, hts_pos_t *beg, hts_pos_t *end) { bcf1_t *v = (bcf1_t *) vv; + const bcf_hdr_t *hdr = (const bcf_hdr_t *) bgzf_get_private_data(fp); int ret = bcf_read1_core(fp, v); - if (ret == 0) ret = bcf_record_check(NULL, v); + if (ret == 0) ret = bcf_record_check(hdr, v); if (ret >= 0) *tid = v->rid, *beg = v->pos, *end = v->pos + v->rlen; return ret; @@ -6051,13 +6103,12 @@ int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, c int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type) { - int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag), gt = 0; + int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag); if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1; // no such FORMAT field in the header if ( tag[0]=='G' && tag[1]=='T' && tag[2]==0 ) { // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT. if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2; - gt = 1; } else if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=type ) return -2; // expected different type @@ -6118,38 +6169,6 @@ int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, v } #undef BRANCH - if (gt && bcf_get_version(hdr, NULL) < VCF44) { - //convert to v44 phasing if required - int32_t *v = *dst, *ptr1, val1, anyunphased = 0, end = 0; - for (i=0; in; j++) { - if (*v != bcf_int32_vector_end) { - if (!j) { - val1 = *v; - ptr1 = v; - } else { - if (!(*v & 1)) { //unphased or unkonwn - anyunphased = 1; - } - } - } else { - end = 1; - } - - if (val1 & 1 || anyunphased || end) { - //phased || an unphased found || end of data, skip sample - v += (fmt->n - j); - break; - } - ++v; - } - if (val1 && !(val1 & 1) && !anyunphased) { - //valid unphased one w/o any other unphased, make phased - *ptr1 |= 1; - } - } - } return nsmpl*fmt->n; }