From aa6f35432cac901476531e3dfe9b1868b408c091 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 9 Apr 2024 17:13:46 +0100 Subject: [PATCH] Increase the input block size for bgzip. Commit e495718 changed bgzip from raw POSIX read() calls to hread(). Unfortunately, hread() gets its buffer size from a stat of the input file descriptor, which can be 4 KiB for a pipe. We're reading 0xff00 bytes, so this mostly ends up being split over two reads, with one or both involving additional memcpys. This makes the buffered I/O perform worse than unbuffered I/O. In the most extreme cases (cat data | bgzip -l0 > /dev/null) this is a two-fold slowdown. The easy solution is just to increase the buffer size to something sensible. It's a little messy as we have to use hfile_internal.h to get hfile_set_blksize(), but it works. I'm not sure why we didn't elect to make that API more public. Probably simply out of caution. Fixes #1767 --- bgzip.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bgzip.c b/bgzip.c index 129343fb5..2e887b028 100644 --- a/bgzip.c +++ b/bgzip.c @@ -39,6 +39,7 @@ #include "htslib/bgzf.h" #include "htslib/hts.h" #include "htslib/hfile.h" +#include "hfile_internal.h" // for hfile_set_blksize #ifdef _WIN32 # define WIN32_LEAN_AND_MEAN @@ -337,6 +338,9 @@ int main(int argc, char **argv) return 1; } + // Increase block size to improve throughput on fast filesystems + hfile_set_blksize(f_src, 256*1024); + if (write_fname) { if (!exp_out_open) { // only open this file once for writing, close at the end if ((fp = bgzf_open(write_fname, out_mode)) == NULL) {