From 4361a3d7dc98bd0edb3e8a75ffb5d8fb6cfdb74e Mon Sep 17 00:00:00 2001 From: John Marshall Date: Wed, 26 Aug 2015 09:23:08 +0100 Subject: [PATCH] Add hts_parse_decimal() and hts_parse_region() flags parameter [DRAFT] Add the first flag, HTS_PARSE_THOUSANDS_SEP. [IN PROGRESS] Need to figure out whether hts_parse_region() is workable with a strend argument and the possibility of colons in chromosome names... --- hts.c | 17 ++++++++++++----- htslib/hts.h | 39 ++++++++++++++++++++++++++------------- regidx.c | 8 ++++---- synced_bcf_reader.c | 14 +++++++------- 4 files changed, 49 insertions(+), 29 deletions(-) diff --git a/hts.c b/hts.c index 5c1473cfc9..cb911c0022 100644 --- a/hts.c +++ b/hts.c @@ -1824,7 +1824,7 @@ static inline long long push_digit(long long i, char c) return 10 * i + digit; } -long long hts_parse_decimal(const char *str, char **end) +long long hts_parse_decimal(const char *str, char **strend, int flags) { long long n = 0; int decimals = 0, e = 0, lost = 0; @@ -1837,7 +1837,7 @@ long long hts_parse_decimal(const char *str, char **end) if (*s == '+' || *s == '-') sign = *s++; while (*s) if (isdigit(*s)) n = push_digit(n, *s++); - else if (*s == ',') s++; + else if (*s == ',' && (flags & HTS_PARSE_THOUSANDS_SEP)) s++; else break; if (*s == '.') { @@ -1860,7 +1860,7 @@ long long hts_parse_decimal(const char *str, char **end) fprintf(stderr, "[W::%s] discarding fractional part of %.*s\n", __func__, (int)(s - str), str); - if (end) *end = (char *) s; + if (strend) *strend = (char *) s; else if (*s && hts_verbose >= 2) fprintf(stderr, "[W::%s] ignoring unknown characters after %.*s[%s]\n", __func__, (int)(s - str), str, s); @@ -1869,6 +1869,12 @@ long long hts_parse_decimal(const char *str, char **end) } const char *hts_parse_reg(const char *s, int *beg, int *end) +{ + return hts_parse_region(s, NULL, beg, end, HTS_PARSE_THOUSANDS_SEP); +} + +const char * +hts_parse_region(const char *s, char **strend, int *beg, int *end, int flags) { char *hyphen; const char *colon = strrchr(s, ':'); @@ -1877,11 +1883,12 @@ const char *hts_parse_reg(const char *s, int *beg, int *end) return s + strlen(s); } - *beg = hts_parse_decimal(colon+1, &hyphen) - 1; + *beg = hts_parse_decimal(colon+1, &hyphen, flags) - 1; if (*beg < 0) *beg = 0; + // FIXME \0 vs. return NULL if (*hyphen == '\0') *end = INT_MAX; - else if (*hyphen == '-') *end = hts_parse_decimal(hyphen+1, NULL); + else if (*hyphen == '-') *end = hts_parse_decimal(hyphen+1, strend, flags); else return NULL; if (*beg >= *end) return NULL; diff --git a/htslib/hts.h b/htslib/hts.h index c9ce386ad3..964cd12597 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -477,27 +477,40 @@ hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx); int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped); uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx); + +#define HTS_PARSE_THOUSANDS_SEP 1 ///< Ignore ',' separators within numbers + /// Parse a numeric string -/** The number may be expressed in scientific notation, and may contain commas - in the integer part (before any decimal point or E notation). - @param str String to be parsed - @param end If non-NULL, set on return to point to the first character - in @a str after those forming the parsed number +/** The number may be expressed in scientific notation, and optionally may + contain commas in the integer part (before any decimal point or E notation). + @param str String to be parsed + @param strend If non-NULL, set on return to point to the first character + in @a str after those forming the parsed number + @param flags Or'ed-together combination of HTS_PARSE_* flags @return Converted value of the parsed number. - When @a end is NULL, a warning will be printed (if hts_verbose is 2 + When @a strend is NULL, a warning will be printed (if hts_verbose is 2 or more) if there are any trailing characters after the number. */ -long long hts_parse_decimal(const char *str, char **end); +long long hts_parse_decimal(const char *str, char **strend, int flags); + +/// Equivalent to hts_parse_region(str, NULL, beg, end, HTS_PARSE_THOUSANDS_SEP) +const char *hts_parse_reg(const char *str, int *beg, int *end); /// Parse a "CHR:START-END"-style region string -/** @param str String to be parsed - @param beg Set on return to the 0-based start of the region - @param end Set on return to the 1-based end of the region - @return Pointer to the colon or '\0' after the reference sequence name, - or NULL if @a str could not be parsed. +/** @param str String to be parsed + @param strend If non-NULL, set on return to point to the first character + in @a str after those forming the parsed region + @param beg Set on return to the 0-based start of the region + @param end Set on return to the 1-based end of the region + @param flags Or'ed-together combination of HTS_PARSE_* flags + @return Pointer to the colon or terminating character after the reference + sequence name, or NULL if @a str could not be parsed. + + When @a strend is NULL, a warning will be printed (if hts_verbose is 2 + or more) if there are any trailing characters after the region string. */ -const char *hts_parse_reg(const char *str, int *beg, int *end); +const char *hts_parse_region(const char *str, char **strend, int *beg, int *end, int flags); hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec); void hts_itr_destroy(hts_itr_t *iter); diff --git a/regidx.c b/regidx.c index 551cde57b0..84f18b41a4 100644 --- a/regidx.c +++ b/regidx.c @@ -297,11 +297,11 @@ int regidx_parse_bed(const char *line, char **chr_beg, char **chr_end, reg_t *re *chr_end = se-1; ss = se+1; - reg->start = hts_parse_decimal(ss, &se); + reg->start = hts_parse_decimal(ss, &se, 0); if ( ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; } ss = se+1; - reg->end = hts_parse_decimal(ss, &se) - 1; + reg->end = hts_parse_decimal(ss, &se, 0) - 1; if ( ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; } return 0; @@ -322,7 +322,7 @@ int regidx_parse_tab(const char *line, char **chr_beg, char **chr_end, reg_t *re *chr_end = se-1; ss = se+1; - reg->start = hts_parse_decimal(ss, &se) - 1; + reg->start = hts_parse_decimal(ss, &se, 0) - 1; if ( ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; } if ( !se[0] || !se[1] ) @@ -330,7 +330,7 @@ int regidx_parse_tab(const char *line, char **chr_beg, char **chr_end, reg_t *re else { ss = se+1; - reg->end = hts_parse_decimal(ss, &se); + reg->end = hts_parse_decimal(ss, &se, 0); if ( ss==se ) reg->end = reg->start; else reg->end--; } diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index 958ec79c68..1d2a3e5ee3 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -887,7 +887,7 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) if ( *ep==':' ) { sp = ep+1; - from = hts_parse_decimal(sp,(char**)&ep); + from = hts_parse_decimal(sp,(char**)&ep,0); if ( sp==ep ) { fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str); @@ -906,7 +906,7 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) } ep++; sp = ep; - to = hts_parse_decimal(sp,(char**)&ep); + to = hts_parse_decimal(sp,(char**)&ep,0); if ( *ep && *ep!=',' ) { fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str); @@ -953,15 +953,15 @@ static int _regions_parse_line(char *line, int ichr,int ifrom,int ito, char **ch if ( i<=k ) return -1; if ( k==l ) { - *from = *to = hts_parse_decimal(ss, &tmp); + *from = *to = hts_parse_decimal(ss, &tmp, 0); if ( tmp==ss ) return -1; } else { if ( k==ifrom ) - *from = hts_parse_decimal(ss, &tmp); + *from = hts_parse_decimal(ss, &tmp, 0); else - *to = hts_parse_decimal(ss, &tmp); + *to = hts_parse_decimal(ss, &tmp, 0); if ( ss==tmp ) return -1; for (i=k; i