From 705548f2aae9abfd1c524ae935b7d7a024f0cdef Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 26 Aug 2025 15:01:03 +0100 Subject: [PATCH 1/5] Add a way of attaching private data to a BGZF file handle The motivation for this is to enable passing of a pointer to a bcf_hdr_t structure to bcf_readrec(), which currently does not get one. It does always get a pointer for the BGZF handle, so a header struct could be passed in via that if it can be stored somewhere. To enable this while not changing the bgzf API or ABI, extra fields are added to the opaque bgzf_cache_t field. The BGZF_CACHE macro that could be use to disable addition of the cache feature removed as it was always turned on anyway. The cache struct now has to be created for files open for write, although the cache part is not used. The hash type used by the cache is renamed from "cache" to "bgzf_cache" to improve its name-spacing. The interfaces to add, get, and remove private data are put in a new bgzf_internal.h header. The bgzf_cache_t struct definition is also moved there so that the get function can be inlined for faster access to the private data field. The bgzf_cache_t definition is rewritten slightly so that it's not necessary to invoke KHASH_MAP_INIT_INT64() before it in the header file, as doing that would require struct cache_t to be moved from bgzf.c to the new header as well. Instead, typedef kh_bgzf_cache_t is used in place of khash(bgzf_cache), and unsigned int instead of khint_t. --- Makefile | 3 ++- bgzf.c | 58 ++++++++++++++++++++-------------------- bgzf_internal.h | 70 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 102 insertions(+), 29 deletions(-) create mode 100644 bgzf_internal.h diff --git a/Makefile b/Makefile index 0bbb078d6..1a14a86be 100644 --- a/Makefile +++ b/Makefile @@ -267,6 +267,7 @@ NONCONFIGURE_OBJS = hfile_libcurl.o PLUGIN_EXT = PLUGIN_OBJS = +bgzf_internal_h = bgzf_internal.h $(htslib_bgzf_h) cram_h = cram/cram.h $(cram_samtools_h) $(header_h) $(cram_structs_h) $(cram_io_h) cram/cram_encode.h cram/cram_decode.h cram/cram_stats.h cram/cram_codecs.h cram/cram_index.h $(htslib_cram_h) cram_io_h = cram/cram_io.h $(cram_misc_h) cram_misc_h = cram/misc.h @@ -487,7 +488,7 @@ hts-object-files: $(LIBHTS_OBJS) $(CC) -shared $(LDFLAGS) -o $@ $< hts.dll.a $(LIBS) -bgzf.o bgzf.pico: bgzf.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_thread_pool_h) $(htslib_hts_endian_h) cram/pooled_alloc.h $(hts_internal_h) $(htslib_khash_h) +bgzf.o bgzf.pico: bgzf.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_thread_pool_h) $(htslib_hts_endian_h) cram/pooled_alloc.h $(hts_internal_h) $(bgzf_internal_h) $(htslib_khash_h) errmod.o errmod.pico: errmod.c config.h $(htslib_hts_h) $(htslib_ksort_h) $(htslib_hts_os_h) kstring.o kstring.pico: kstring.c config.h $(htslib_kstring_h) header.o header.pico: header.c config.h $(textutils_internal_h) $(header_h) diff --git a/bgzf.c b/bgzf.c index 8cf3b7828..cb280268f 100644 --- a/bgzf.c +++ b/bgzf.c @@ -48,12 +48,13 @@ #include "htslib/hts_endian.h" #include "cram/pooled_alloc.h" #include "hts_internal.h" +#include "bgzf_internal.h" +#include "htslib/khash.h" #ifndef EFTYPE #define EFTYPE ENOEXEC #endif -#define BGZF_CACHE #define BGZF_MT #define BLOCK_HEADER_LENGTH 18 @@ -76,21 +77,15 @@ */ static const uint8_t g_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0"; -#ifdef BGZF_CACHE typedef struct { int size; uint8_t *block; int64_t end_offset; } cache_t; -#include "htslib/khash.h" -KHASH_MAP_INIT_INT64(cache, cache_t) -#endif +KHASH_MAP_INIT_INT64(bgzf_cache, cache_t) -struct bgzf_cache_t { - khash_t(cache) *h; - khint_t last_pos; -}; +// struct bgzf_cache_t is defined in bgzf_internal.h #ifdef BGZF_MT @@ -409,20 +404,21 @@ static BGZF *bgzf_read_init(hFILE *hfpr, const char *filename) errno = EFTYPE; return NULL; } -#ifdef BGZF_CACHE + if (!(fp->cache = malloc(sizeof(*fp->cache)))) { free(fp->uncompressed_block); free(fp); return NULL; } - if (!(fp->cache->h = kh_init(cache))) { + if (!(fp->cache->h = kh_init(bgzf_cache))) { free(fp->uncompressed_block); free(fp->cache); free(fp); return NULL; } fp->cache->last_pos = 0; -#endif + fp->cache->private_data = NULL; + fp->cache->private_data_cleanup = (bgzf_private_data_cleanup_func *) NULL; return fp; } @@ -442,6 +438,15 @@ static BGZF *bgzf_write_init(const char *mode) fp = (BGZF*)calloc(1, sizeof(BGZF)); if (fp == NULL) goto mem_fail; fp->is_write = 1; + + fp->cache = malloc(sizeof(bgzf_cache_t)); + if (!fp->cache) + goto mem_fail; + fp->cache->h = NULL; + fp->cache->last_pos = 0; + fp->cache->private_data = NULL; + fp->cache->private_data_cleanup = (bgzf_private_data_cleanup_func *) NULL; + int compress_level = mode2level(mode); if ( compress_level==-2 ) { @@ -479,6 +484,7 @@ static BGZF *bgzf_write_init(const char *mode) fail: if (fp != NULL) { + free(fp->cache); free(fp->uncompressed_block); free(fp->gz_stream); free(fp); @@ -896,15 +902,16 @@ static int check_header(const uint8_t *header) && unpackInt16((uint8_t*)&header[14]) == 2) ? 0 : -1; } -#ifdef BGZF_CACHE static void free_cache(BGZF *fp) { khint_t k; - if (fp->is_write) return; - khash_t(cache) *h = fp->cache->h; - for (k = kh_begin(h); k < kh_end(h); ++k) - if (kh_exist(h, k)) free(kh_val(h, k).block); - kh_destroy(cache, h); + if (fp->cache->h) { + khash_t(bgzf_cache) *h = fp->cache->h; + for (k = kh_begin(h); k < kh_end(h); ++k) + if (kh_exist(h, k)) free(kh_val(h, k).block); + kh_destroy(bgzf_cache, h); + } + bgzf_clear_private_data(fp); free(fp->cache); } @@ -913,8 +920,8 @@ static int load_block_from_cache(BGZF *fp, int64_t block_address) khint_t k; cache_t *p; - khash_t(cache) *h = fp->cache->h; - k = kh_get(cache, h, block_address); + khash_t(bgzf_cache) *h = fp->cache->h; + k = kh_get(bgzf_cache, h, block_address); if (k == kh_end(h)) return 0; p = &kh_val(h, k); if (fp->block_length != 0) fp->block_offset = 0; @@ -937,7 +944,7 @@ static void cache_block(BGZF *fp, int size) uint8_t *block = NULL; cache_t *p; //fprintf(stderr, "Cache block at %llx\n", (int)fp->block_address); - khash_t(cache) *h = fp->cache->h; + khash_t(bgzf_cache) *h = fp->cache->h; if (BGZF_MAX_BLOCK_SIZE >= fp->cache_size) return; if (fp->block_length < 0 || fp->block_length > BGZF_MAX_BLOCK_SIZE) return; if ((kh_size(h) + 1) * BGZF_MAX_BLOCK_SIZE > (uint32_t)fp->cache_size) { @@ -959,13 +966,13 @@ static void cache_block(BGZF *fp, int size) if (k != k_orig) { block = kh_val(h, k).block; - kh_del(cache, h, k); + kh_del(bgzf_cache, h, k); } } else { block = (uint8_t*)malloc(BGZF_MAX_BLOCK_SIZE); } if (!block) return; - k = kh_put(cache, h, fp->block_address, &ret); + k = kh_put(bgzf_cache, h, fp->block_address, &ret); if (ret <= 0) { // kh_put failed, or in there already (shouldn't happen) free(block); return; @@ -976,11 +983,6 @@ static void cache_block(BGZF *fp, int size) p->block = block; memcpy(p->block, fp->uncompressed_block, p->size); } -#else -static void free_cache(BGZF *fp) {} -static int load_block_from_cache(BGZF *fp, int64_t block_address) {return 0;} -static void cache_block(BGZF *fp, int size) {} -#endif /* * Absolute htell in this compressed file. diff --git a/bgzf_internal.h b/bgzf_internal.h new file mode 100644 index 000000000..b3a790df6 --- /dev/null +++ b/bgzf_internal.h @@ -0,0 +1,70 @@ +/* bgzf_internal.h -- internal bgzf functions; not part of the public API. + + Copyright (C) 2025 Genome Research Ltd. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include +#include "htslib/bgzf.h" + +/* + * BGZF private data interface + * This exists so that we can pass BCF headers into interfaces that have + * traditionally only taken a BGZF pointer without a corresponding bcf_hdr_t *, + * notably the bcf_readrec() function used by BCF iterators. + * + * To preserve the BGZF API and ABI, this is tagged on to the existing + * opaque bgzf_cache_t structure. bgzf_cache_t is now defined here so we can + * inline lookups. + */ + +typedef void bgzf_private_data_cleanup_func(void *private_data); + +struct kh_bgzf_cache_s; + +struct bgzf_cache_t { + struct kh_bgzf_cache_s *h; + unsigned int last_pos; + void *private_data; + bgzf_private_data_cleanup_func *private_data_cleanup; +}; + +// Set private data. cleanup will be called on bgzf_close() or +// bgzf_clear_private_data(); + +static inline void bgzf_set_private_data(BGZF *fp, void *private_data, + bgzf_private_data_cleanup_func *fn) { + assert(fp->cache != NULL); + fp->cache->private_data = private_data; + fp->cache->private_data_cleanup = fn; +} + +static inline void bgzf_clear_private_data(BGZF *fp) { + assert(fp->cache != NULL); + if (fp->cache->private_data) { + if (fp->cache->private_data_cleanup) + fp->cache->private_data_cleanup(fp->cache->private_data); + fp->cache->private_data = NULL; + } +} + +static inline void * bgzf_get_private_data(BGZF *fp) { + assert(fp->cache != NULL); + return fp->cache->private_data; +} From 41c265d5025e402c58c7b7532df3ff0ed8aa4a1c Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 26 Aug 2025 16:25:14 +0100 Subject: [PATCH 2/5] Store ref counted bcf_hdr_t pointer in BGZF private data For bcf files, the header pointer hasn't always been passed into bcf_read(), especially when using iterators. As having it available would be useful for VCF 4.4+ support, this works around its absence by attaching a pointer to the header in BGZF private data, which was previously unused for vcf/bcf. It also adds reference counting to the header so that it can be cleaned up safely irrespective of whether hts_close() or bcf_hdr_destroy() was called first. To avoid ABI breakage, the reference count is stored in the bcf_hdr_aux_t struct. --- Makefile | 2 +- vcf.c | 41 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 1a14a86be..b59a443f3 100644 --- a/Makefile +++ b/Makefile @@ -500,7 +500,7 @@ hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h hts.o hts.pico: hts.c config.h os/lzma_stub.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h config_vars.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_expr_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) $(htscodecs_htscodecs_h) hts_expr.o hts_expr.pico: hts_expr.c config.h $(htslib_hts_expr_h) $(htslib_hts_log_h) $(textutils_internal_h) hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c -vcf.o vcf.pico: vcf.c config.h $(fuzz_settings_h) $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) +vcf.o vcf.pico: vcf.c config.h $(fuzz_settings_h) $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) $(bgzf_internal_h) sam.o sam.pico: sam.c config.h $(fuzz_settings_h) $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(htslib_hts_expr_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) sam_mods.o sam_mods.pico: sam_mods.c config.h $(htslib_sam_h) $(textutils_internal_h) simd.o simd.pico: simd.c config.h $(htslib_sam_h) $(sam_internal_h) diff --git a/vcf.c b/vcf.c index cae3ba97f..2276c996b 100644 --- a/vcf.c +++ b/vcf.c @@ -51,6 +51,7 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/kstring.h" #include "htslib/sam.h" #include "htslib/khash.h" +#include "bgzf_internal.h" #if 0 // This helps on Intel a bit, often 6-7% faster VCF parsing. @@ -117,6 +118,7 @@ typedef struct hdict_t *gen; // hdict_t dictionary which keeps bcf_hrec_t* pointers for generic and structured fields size_t *key_len;// length of h->id[BCF_DT_ID] strings int version; //cached version + uint32_t ref_count; // reference count, low bit indicates bcf_hdr_destroy() has been called } bcf_hdr_aux_t; @@ -189,6 +191,30 @@ static int bcf_get_version(const bcf_hdr_t *hdr, const char *verstr) return VCF_DEF; } +// Header reference counting + +static void bcf_hdr_incr_ref(bcf_hdr_t *h) +{ + bcf_hdr_aux_t *aux = get_hdr_aux(h); + aux->ref_count += 2; +} + +static void bcf_hdr_decr_ref(bcf_hdr_t *h) +{ + bcf_hdr_aux_t *aux = get_hdr_aux(h); + if (aux->ref_count >= 2) + aux->ref_count -= 2; + + if (aux->ref_count == 0) + bcf_hdr_destroy(h); +} + +static void hdr_bgzf_private_data_cleanup(void *data) +{ + bcf_hdr_t *h = (bcf_hdr_t *) data; + bcf_hdr_decr_ref(h); +} + static char *find_chrom_header_line(char *s) { char *nl; @@ -1498,6 +1524,7 @@ bcf_hdr_t *bcf_hdr_init(const char *mode) aux->key_len = NULL; aux->dict = *((vdict_t*)h->dict[0]); aux->version = 0; + aux->ref_count = 1; free(h->dict[0]); h->dict[0] = aux; @@ -1522,6 +1549,12 @@ void bcf_hdr_destroy(bcf_hdr_t *h) int i; khint_t k; if (!h) return; + bcf_hdr_aux_t *aux = get_hdr_aux(h); + if (aux->ref_count > 1) // Refs still held, so delay destruction + { + aux->ref_count &= ~1; + return; + } for (i = 0; i < 3; ++i) { vdict_t *d = (vdict_t*)h->dict[i]; if (d == 0) continue; @@ -1529,7 +1562,6 @@ void bcf_hdr_destroy(bcf_hdr_t *h) if (kh_exist(d, k)) free((char*)kh_key(d, k)); if ( i==0 ) { - bcf_hdr_aux_t *aux = get_hdr_aux(h); for (k=kh_begin(aux->gen); kgen); k++) if ( kh_exist(aux->gen,k) ) free((char*)kh_key(aux->gen,k)); kh_destroy(hdict, aux->gen); @@ -1597,6 +1629,10 @@ bcf_hdr_t *bcf_hdr_read(htsFile *hfp) htxt[hlen] = '\0'; // Ensure htxt is terminated if ( bcf_hdr_parse(h, htxt) < 0 ) goto fail; free(htxt); + + bcf_hdr_incr_ref(h); + bgzf_set_private_data(fp, h, hdr_bgzf_private_data_cleanup); + return h; fail: hts_log_error("Failed to read BCF header"); @@ -1638,6 +1674,9 @@ int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h) if ( bgzf_write(fp, htxt.s, htxt.l) != htxt.l ) return -1; if ( bgzf_flush(fp) < 0) return -1; + bcf_hdr_incr_ref(h); + bgzf_set_private_data(fp, h, hdr_bgzf_private_data_cleanup); + free(htxt.s); return 0; } From e136ad46fceb387ff1ba58da780193b7cf16a136 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 26 Aug 2025 16:47:43 +0100 Subject: [PATCH 3/5] Update first phasing bit when reading BCF BCF saved by versions of HTSlib before 1.22 will always store the first phasing bit as 0. For consistency with the VCF reader, update this bit when reading BCF so that is is set if all other phasing bits are also set. --- vcf.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/vcf.c b/vcf.c index 2276c996b..17ea7896a 100644 --- a/vcf.c +++ b/vcf.c @@ -1983,6 +1983,7 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) { (1 << BCF_BT_FLOAT) | (1 << BCF_BT_CHAR)); int32_t max_id = hdr ? hdr->n[BCF_DT_ID] : 0; + int idgt = hdr ? bcf_hdr_id2int(hdr, BCF_DT_ID, "GT") : -1; // Check for valid contig ID if (rec->rid < 0 @@ -2092,9 +2093,16 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) { bcf_record_check_err(hdr, rec, "type", &reports, type); err |= BCF_ERR_TAG_INVALID; } - bytes = ((size_t) num << bcf_type_shift[type]) * rec->n_sample; - if (end - ptr < bytes) goto bad_indiv; - ptr += bytes; + if (idgt >= 0 && idgt == key) { + // check first GT phasing bit and fix up if necessary + if (updatephasing(ptr, end, &ptr, rec->n_sample, num, type)) { + err |= BCF_ERR_TAG_INVALID; + } + } else { + bytes = ((size_t) num << bcf_type_shift[type]) * rec->n_sample; + if (end - ptr < bytes) goto bad_indiv; + ptr += bytes; + } } if (!err && rec->rlen < 0) { @@ -2169,7 +2177,9 @@ int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec) int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) { - if (fp->format.format == vcf) return vcf_read(fp,h,v); + if (fp->format.format == vcf) return vcf_read(fp, h, v); + if (!h) + h = (const bcf_hdr_t *) bgzf_get_private_data(fp->fp.bgzf); int ret = bcf_read1_core(fp->fp.bgzf, v); if (ret == 0) ret = bcf_record_check(h, v); if ( ret!=0 || !h->keep_samples ) return ret; @@ -2179,8 +2189,9 @@ int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, hts_pos_t *beg, hts_pos_t *end) { bcf1_t *v = (bcf1_t *) vv; + const bcf_hdr_t *hdr = (const bcf_hdr_t *) bgzf_get_private_data(fp); int ret = bcf_read1_core(fp, v); - if (ret == 0) ret = bcf_record_check(NULL, v); + if (ret == 0) ret = bcf_record_check(hdr, v); if (ret >= 0) *tid = v->rid, *beg = v->pos, *end = v->pos + v->rlen; return ret; From 2fdd37e2ae3ac25332fff5a5c06fcca940d98fba Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 26 Aug 2025 16:48:12 +0100 Subject: [PATCH 4/5] Don't try to fix phasing in bcf_get_format_values() Phasing should now be fixed up in bcf_read()/vcf_read(), so there's no need to try again in bcf_get_format_values(). --- vcf.c | 35 +---------------------------------- 1 file changed, 1 insertion(+), 34 deletions(-) diff --git a/vcf.c b/vcf.c index 17ea7896a..ba1072a4d 100644 --- a/vcf.c +++ b/vcf.c @@ -6101,13 +6101,12 @@ int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, c int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type) { - int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag), gt = 0; + int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag); if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1; // no such FORMAT field in the header if ( tag[0]=='G' && tag[1]=='T' && tag[2]==0 ) { // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT. if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2; - gt = 1; } else if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=type ) return -2; // expected different type @@ -6168,38 +6167,6 @@ int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, v } #undef BRANCH - if (gt && bcf_get_version(hdr, NULL) < VCF44) { - //convert to v44 phasing if required - int32_t *v = *dst, *ptr1, val1, anyunphased = 0, end = 0; - for (i=0; in; j++) { - if (*v != bcf_int32_vector_end) { - if (!j) { - val1 = *v; - ptr1 = v; - } else { - if (!(*v & 1)) { //unphased or unkonwn - anyunphased = 1; - } - } - } else { - end = 1; - } - - if (val1 & 1 || anyunphased || end) { - //phased || an unphased found || end of data, skip sample - v += (fmt->n - j); - break; - } - ++v; - } - if (val1 && !(val1 & 1) && !anyunphased) { - //valid unphased one w/o any other unphased, make phased - *ptr1 |= 1; - } - } - } return nsmpl*fmt->n; } From cf1314df7b81133785a70fd2888050fd1085a240 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 29 Aug 2025 17:44:12 +0100 Subject: [PATCH 5/5] Speed up updatephasing By noting that we're only interested in the least-significant bit of each GT value, it's possible to reduce the number of branches in this function by doing bit manipulations on the first byte of each stored value. The common haploid and diploid cases are also specialised so the inner loop on ploidy can be avoided for those cases. --- vcf.c | 68 ++++++++++++++++++++++++++++++----------------------------- 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/vcf.c b/vcf.c index ba1072a4d..3dab369d0 100644 --- a/vcf.c +++ b/vcf.c @@ -1857,42 +1857,44 @@ static const char *get_type_name(int type) { static int updatephasing(uint8_t *p, uint8_t *end, uint8_t **q, int samples, int ploidy, int type) { int j, k; - size_t bytes; + unsigned int inc = 1 << bcf_type_shift[type]; + ptrdiff_t bytes = samples * ploidy * inc; - #define NOSWITCHFOR(type_t, rawtype_t,convert, reconvert, p, end, samples, ploidy, consumed) \ - for (j = 0; j < samples; ++j) { /*for each sample */ \ - int anyunphased = 0; \ - uint8_t *ptr1 = p; \ - rawtype_t al1 = 0, val; \ - for (k = 0; k < ploidy; ++k) { /*in each ploidy */ \ - val = convert(p); \ - if (!k) al1 = val; \ - else if (!(val & 1)) anyunphased = 1; \ - /*get to next phasing or skip the rest for this sample*/ \ - *consumed = ((anyunphased || al1 & 1) ? ploidy - k : 1) << bcf_type_shift[type_t]; \ - if (end - p < *consumed) return 1; \ - p += *consumed; \ - if (anyunphased || al1 & 1) break; \ - } \ - if (!anyunphased && al1 > 1) { /*no other unphased*/ \ - /*set phased on read or make unphased on write as upto 4.3 1st - phasing is not described explicitly and has to be inferred*/ \ - al1 |= 1; \ - reconvert(convert((uint8_t*)&al1), ptr1); \ - } \ - } - #define i8_to_le(v, buf) *((uint8_t*)buf) = v; - switch(type) { - case BCF_BT_INT8: - NOSWITCHFOR(BCF_BT_INT8, uint8_t, le_to_i8, i8_to_le, p, end, samples, ploidy, &bytes); - break; - case BCF_BT_INT16: - NOSWITCHFOR(BCF_BT_INT16, uint16_t, le_to_i16, i16_to_le, p, end, samples, ploidy, &bytes); + if (samples < 0 || ploidy < 0 || end - p < bytes) + return 1; + + /* + * This works because phasing is stored in the least-significant bit + * of the GT encoding, and the data is always stored little-endian. + * Thus it's possible to get the desired result by doing bit operations + * on the least-significant byte of each value and ignoring the + * higher bytes (for 16-bit and 32-bit values). + */ + + switch (ploidy) { + case 1: + // Trivial case - haploid data is phased by default + for (j = 0; j < samples; ++j) { + *p |= 1; + p += inc; + } break; - case BCF_BT_INT32: - NOSWITCHFOR(BCF_BT_INT32, uint32_t, le_to_i32, i32_to_le, p, end, samples, ploidy, &bytes); + case 2: + // Mostly trivial case - first is phased if second is. + for (j = 0; j < samples; ++j) { + *p |= (p[inc] & 1); + p += 2 * inc; + } break; - //no other types are valid for phasing. + default: + // Generic case - first is phased if all other alleles are. + for (j = 0; j < samples; ++j) { + uint8_t allphased = 1; + for (k = 1; k < ploidy; ++k) + allphased &= (p[inc * k]); + *p |= allphased; + p += ploidy * inc; + } } *q = p; return 0;