From 199ad6feed1c2eab977dee3c5f895ef7ab612cac Mon Sep 17 00:00:00 2001
From: "Yinhe Cheng" <ycheng@us.ibm.com>
Date: Fri, 3 Jun 2016 18:25:34 -0400
Subject: [PATCH 1/4] Made changes to enable PPC64

---
 ksw.c | 421 +++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 403 insertions(+), 18 deletions(-)

diff --git a/ksw.c b/ksw.c
index 9793e5eb..5be0b790 100644
--- a/ksw.c
+++ b/ksw.c
@@ -26,7 +26,9 @@
 #include <stdlib.h>
 #include <stdint.h>
 #include <assert.h>
-#include <emmintrin.h>
+#ifdef __PPC64__
+#include "vec128int.h"
+#endif
 #include "ksw.h"
 
 #ifdef USE_MALLOC_WRAPPERS
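Note that as of this first patch the non-PPC64 build loses its SSE2 header: the #ifdef replaces #include <emmintrin.h> without an #else branch. Patch 3 below restores it, leaving the include dispatch in its final form (shown here for reference):

    #ifdef __PPC64__
    #include "vec128int.h"    /* SSE2-style wrappers over AltiVec/VSX (patch 2) */
    #else
    #include <emmintrin.h>    /* native SSE2 intrinsics on x86 */
    #endif
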
@@ -115,38 +117,111 @@ kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del
 	__m128i zero, oe_del, e_del, oe_ins, e_ins, shift, *H0, *H1, *E, *Hmax;
 	kswr_t r;
 
+#ifdef __PPC64__
+#define __max_16(ret, xx) do { \
+		(xx) = vec_max16ub((xx), vec_shiftrightbytes1q((xx), 8)); \
+		(xx) = vec_max16ub((xx), vec_shiftrightbytes1q((xx), 4)); \
+		(xx) = vec_max16ub((xx), vec_shiftrightbytes1q((xx), 2)); \
+		(xx) = vec_max16ub((xx), vec_shiftrightbytes1q((xx), 1)); \
+		(ret) = vec_extract8sh((xx), 0) & 0x00ff; \
+	} while (0)
+#else
 #define __max_16(ret, xx) do { \
-		(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \
-		(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \
-		(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 2)); \
-		(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 1)); \
-		(ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \
-	} while (0)
+		(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \
+		(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \
+		(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 2)); \
+		(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 1)); \
+		(ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \
+	} while (0)
+#endif
 
 	// initialization
 	r = g_defr;
 	minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
 	endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
 	m_b = n_b = 0; b = 0;
-	zero = _mm_set1_epi32(0);
+#ifdef __PPC64__
+	zero = vec_splat4sw(0);
+#else
+	zero = _mm_set1_epi32(0); /* !!!REP NOT FOUND!!! */
+#endif
+#ifdef __PPC64__
+	oe_del = vec_splat16sb(_o_del + _e_del);
+#else
 	oe_del = _mm_set1_epi8(_o_del + _e_del);
+	/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+	e_del = vec_splat16sb(_e_del);
+#else
 	e_del = _mm_set1_epi8(_e_del);
+	/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+	oe_ins = vec_splat16sb(_o_ins + _e_ins);
+#else
 	oe_ins = _mm_set1_epi8(_o_ins + _e_ins);
+	/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+	e_ins = vec_splat16sb(_e_ins);
+#else
 	e_ins = _mm_set1_epi8(_e_ins);
+	/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+	shift = vec_splat16sb(q->shift);
+#else
 	shift = _mm_set1_epi8(q->shift);
+	/* NEED INSPECTION */
+#endif
 	H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; slen = q->slen;
 	for (i = 0; i < slen; ++i) {
+#ifdef __PPC64__
+		vec_store1q(E + i, zero);
+#else
 		_mm_store_si128(E + i, zero);
+		/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+		vec_store1q(H0 + i, zero);
+#else
 		_mm_store_si128(H0 + i, zero);
+		/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+		vec_store1q(Hmax + i, zero);
+#else
 		_mm_store_si128(Hmax + i, zero);
+		/* NEED INSPECTION */
+#endif
 	}
 	// the core loop
 	for (i = 0; i < tlen; ++i) {
 		int j, k, cmp, imax;
 		__m128i e, h, t, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
+#ifdef __PPC64__
+		h = vec_load1q(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
+#else
 		h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
-		h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian
+		/* NEED INSPECTION */
+#endif
+	#ifdef __BIG_ENDIAN__
+#ifdef __PPC64__
+		h = vec_shiftrightbytes1q(h, 1);
+#else
+		h = _mm_srli_si128(h, 1);
+		/* NEED INSPECTION */
+#endif
+	#else
+#ifdef __PPC64__
+		h = vec_shiftleftbytes1q(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian
+#else
+		h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian
+		/* NEED INSPECTION */
+#endif
+	#endif
 		for (j = 0; LIKELY(j < slen); ++j) {
 			/* SW cells are computed in the following order:
 			 *   H(i,j)   = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
@@ -154,35 +229,154 @@ kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del
 			 *   F(i,j+1) = max{H(i,j)-q, F(i,j)-r}
 			 */
 			// compute H'(i,j); note that at the beginning, h=H'(i-1,j-1)
+#ifdef __PPC64__
+			h = vec_addsaturating16ub(h, vec_load1q(S + j));
+#else
 			h = _mm_adds_epu8(h, _mm_load_si128(S + j));
+			/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+			h = vec_subtractsaturating16ub(h, shift); // h=H'(i-1,j-1)+S(i,j)
+#else
 			h = _mm_subs_epu8(h, shift); // h=H'(i-1,j-1)+S(i,j)
+			/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+			e = vec_load1q(E + j); // e=E'(i,j)
+#else
 			e = _mm_load_si128(E + j); // e=E'(i,j)
+			/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+			h = vec_max16ub(h, e);
+#else
 			h = _mm_max_epu8(h, e);
+			/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+			h = vec_max16ub(h, f); // h=H'(i,j)
+#else
 			h = _mm_max_epu8(h, f); // h=H'(i,j)
+			/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+			max = vec_max16ub(max, h); // set max
+#else
 			max = _mm_max_epu8(max, h); // set max
+			/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+			vec_store1q(H1 + j, h); // save to H'(i,j)
+#else
 			_mm_store_si128(H1 + j, h); // save to H'(i,j)
+			/* NEED INSPECTION */
+#endif
 			// now compute E'(i+1,j)
+#ifdef __PPC64__
+			e = vec_subtractsaturating16ub(e, e_del); // e=E'(i,j) - e_del
+#else
 			e = _mm_subs_epu8(e, e_del); // e=E'(i,j) - e_del
+			/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+			t = vec_subtractsaturating16ub(h, oe_del); // h=H'(i,j) - o_del - e_del
+#else
 			t = _mm_subs_epu8(h, oe_del); // h=H'(i,j) - o_del - e_del
+			/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+			e = vec_max16ub(e, t); // e=E'(i+1,j)
+#else
 			e = _mm_max_epu8(e, t); // e=E'(i+1,j)
+			/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+			vec_store1q(E + j, e); // save to E'(i+1,j)
+#else
 			_mm_store_si128(E + j, e); // save to E'(i+1,j)
+			/* NEED INSPECTION */
+#endif
 			// now compute F'(i,j+1)
+#ifdef __PPC64__
+			f = vec_subtractsaturating16ub(f, e_ins);
+#else
 			f = _mm_subs_epu8(f, e_ins);
+			/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+			t = vec_subtractsaturating16ub(h, oe_ins); // h=H'(i,j) - o_ins - e_ins
+#else
 			t = _mm_subs_epu8(h, oe_ins); // h=H'(i,j) - o_ins - e_ins
+			/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+			f = vec_max16ub(f, t);
+#else
 			f = _mm_max_epu8(f, t);
+			/* NEED INSPECTION */
+#endif
 			// get H'(i-1,j) and prepare for the next j
+#ifdef __PPC64__
+			h = vec_load1q(H0 + j); // h=H'(i-1,j)
+#else
 			h = _mm_load_si128(H0 + j); // h=H'(i-1,j)
+			/* NEED INSPECTION */
+#endif
 		}
 		// NB: we do not need to set E(i,j) as we disallow adjacent insertion and then deletion
 		for (k = 0; LIKELY(k < 16); ++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max
-			f = _mm_slli_si128(f, 1);
+	#ifdef __BIG_ENDIAN__
+#ifdef __PPC64__
+			f = vec_shiftrightbytes1q(f, 1);
+#else
+			f = _mm_srli_si128(f, 1);
+			/* NEED INSPECTION */
+#endif
+	#else
+#ifdef __PPC64__
+			f = vec_shiftleftbytes1q(f, 1);
+#else
+			f = _mm_slli_si128(f, 1);
+			/* NEED INSPECTION */
+#endif
+	#endif
 			for (j = 0; LIKELY(j < slen); ++j) {
+#ifdef __PPC64__
+				h = vec_load1q(H1 + j);
+#else
 				h = _mm_load_si128(H1 + j);
+				/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+				h = vec_max16ub(h, f); // h=H'(i,j)
+#else
 				h = _mm_max_epu8(h, f); // h=H'(i,j)
+				/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+				vec_store1q(H1 + j, h);
+#else
 				_mm_store_si128(H1 + j, h);
+				/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+				h = vec_subtractsaturating16ub(h, oe_ins);
+#else
 				h = _mm_subs_epu8(h, oe_ins);
+				/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+				f = vec_subtractsaturating16ub(f, e_ins);
+#else
 				f = _mm_subs_epu8(f, e_ins);
+				/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+				cmp = vec_extractupperbit16sb(vec_compareeq16sb(vec_subtractsaturating16ub(f, h), zero));
+#else
 				cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(f, h), zero));
+				/* NEED INSPECTION */
+#endif
 				if (UNLIKELY(cmp == 0xffff)) goto end_loop16;
 			}
 		}
@@ -201,7 +395,12 @@ kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del
 		if (imax > gmax) {
 			gmax = imax; te = i; // te is the end position on the target
 			for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector
+#ifdef __PPC64__
+				vec_store1q(Hmax + j, vec_load1q(H1 + j));
+#else
 				_mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
+				/* NEED INSPECTION */
+#endif
 			if (gmax + q->shift >= 255 || gmax >= endsc) break;
 		}
 		S = H1; H1 = H0; H0 = S; // swap H0 and H1
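Everything in the unsigned-byte kernel above is saturating arithmetic: _mm_adds_epu8/vec_addsaturating16ub clamp at 255, and _mm_subs_epu8/vec_subtractsaturating16ub clamp at 0. The clamp at 0 stands in for the explicit max{0, ...} of Smith-Waterman, and the clamp at 255 is why the kernel bails out early when gmax + q->shift >= 255. A one-lane scalar sketch of the H-cell update (illustration only; sat_add/sat_sub are hypothetical stand-ins for the intrinsics):

    #include <stdint.h>

    static uint8_t sat_add(uint8_t a, uint8_t b) { unsigned s = a + b; return s > 255 ? 255 : (uint8_t)s; }
    static uint8_t sat_sub(uint8_t a, uint8_t b) { return a > b ? (uint8_t)(a - b) : 0; }

    /* One cell of the biased unsigned-byte recurrence:
       h_in = H'(i-1,j-1), s = biased score S(i,j), e = E'(i,j), f = F'(i,j). */
    static uint8_t sw_cell_u8(uint8_t h_in, uint8_t s, uint8_t shift, uint8_t e, uint8_t f)
    {
        uint8_t h = sat_sub(sat_add(h_in, s), shift); /* H'(i-1,j-1) + S(i,j) */
        if (e > h) h = e;                             /* max with E'(i,j) */
        if (f > h) h = f;                             /* max with F'(i,j) */
        return h;                                     /* H'(i,j); sat_sub already clamped at 0 */
    }
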
@@ -236,61 +435,242 @@ kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_de
 	__m128i zero, oe_del, e_del, oe_ins, e_ins, *H0, *H1, *E, *Hmax;
 	kswr_t r;
 
+#ifdef __PPC64__
 #define __max_8(ret, xx) do { \
-		(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \
-		(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \
-		(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \
-		(ret) = _mm_extract_epi16((xx), 0); \
-	} while (0)
+		(xx) = vec_max8sh((xx), vec_shiftrightbytes1q((xx), 8)); \
+		(xx) = vec_max8sh((xx), vec_shiftrightbytes1q((xx), 4)); \
+		(xx) = vec_max8sh((xx), vec_shiftrightbytes1q((xx), 2)); \
+		(ret) = vec_extract8sh((xx), 0); \
+	} while (0)
+#else
+#define __max_8(ret, xx) do { \
+		(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \
+		(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \
+		(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \
+		(ret) = _mm_extract_epi16((xx), 0); \
+	} while (0)
+#endif
 
 	// initialization
 	r = g_defr;
 	minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
 	endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
 	m_b = n_b = 0; b = 0;
-	zero = _mm_set1_epi32(0);
+#ifdef __PPC64__
+	zero = vec_splat4sw(0);
+#else
+	zero = _mm_set1_epi32(0); /* !!!REP NOT FOUND!!! */
+#endif
+#ifdef __PPC64__
+	oe_del = vec_splat8sh(_o_del + _e_del);
+#else
 	oe_del = _mm_set1_epi16(_o_del + _e_del);
+	/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+	e_del = vec_splat8sh(_e_del);
+#else
 	e_del = _mm_set1_epi16(_e_del);
+	/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+	oe_ins = vec_splat8sh(_o_ins + _e_ins);
+#else
 	oe_ins = _mm_set1_epi16(_o_ins + _e_ins);
+	/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+	e_ins = vec_splat8sh(_e_ins);
+#else
 	e_ins = _mm_set1_epi16(_e_ins);
+	/* NEED INSPECTION */
+#endif
 	H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; slen = q->slen;
 	for (i = 0; i < slen; ++i) {
+#ifdef __PPC64__
+		vec_store1q(E + i, zero);
+#else
 		_mm_store_si128(E + i, zero);
+		/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+		vec_store1q(H0 + i, zero);
+#else
 		_mm_store_si128(H0 + i, zero);
+		/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+		vec_store1q(Hmax + i, zero);
+#else
 		_mm_store_si128(Hmax + i, zero);
+		/* NEED INSPECTION */
+#endif
 	}
 	// the core loop
 	for (i = 0; i < tlen; ++i) {
 		int j, k, imax;
 		__m128i e, t, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
+#ifdef __PPC64__
+		h = vec_load1q(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
+#else
 		h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
-		h = _mm_slli_si128(h, 2);
+		/* NEED INSPECTION */
+#endif
+	#ifdef __BIG_ENDIAN__
+#ifdef __PPC64__
+		h = vec_shiftrightbytes1q(h, 2);
+#else
+		h = _mm_srli_si128(h, 2);
+		/* NEED INSPECTION */
+#endif
+	#else
+#ifdef __PPC64__
+		h = vec_shiftleftbytes1q(h, 2);
+#else
+		h = _mm_slli_si128(h, 2);
+		/* NEED INSPECTION */
+#endif
+	#endif
 		for (j = 0; LIKELY(j < slen); ++j) {
+#ifdef __PPC64__
+			h = vec_addsaturating8sh(h, *S++);
+#else
 			h = _mm_adds_epi16(h, *S++);
+			/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+			e = vec_load1q(E + j);
+#else
 			e = _mm_load_si128(E + j);
+			/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+			h = vec_max8sh(h, e);
+#else
 			h = _mm_max_epi16(h, e);
+			/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+			h = vec_max8sh(h, f);
+#else
 			h = _mm_max_epi16(h, f);
+			/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+			max = vec_max8sh(max, h);
+#else
 			max = _mm_max_epi16(max, h);
+			/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+			vec_store1q(H1 + j, h);
+#else
 			_mm_store_si128(H1 + j, h);
+			/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+			e = vec_subtractsaturating8uh(e, e_del);
+#else
 			e = _mm_subs_epu16(e, e_del);
+			/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+			t = vec_subtractsaturating8uh(h, oe_del);
+#else
 			t = _mm_subs_epu16(h, oe_del);
+			/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+			e = vec_max8sh(e, t);
+#else
 			e = _mm_max_epi16(e, t);
+			/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+			vec_store1q(E + j, e);
+#else
 			_mm_store_si128(E + j, e);
+			/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+			f = vec_subtractsaturating8uh(f, e_ins);
+#else
 			f = _mm_subs_epu16(f, e_ins);
+			/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+			t = vec_subtractsaturating8uh(h, oe_ins);
+#else
 			t = _mm_subs_epu16(h, oe_ins);
+			/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+			f = vec_max8sh(f, t);
+#else
 			f = _mm_max_epi16(f, t);
+			/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+			h = vec_load1q(H0 + j);
+#else
 			h = _mm_load_si128(H0 + j);
+			/* NEED INSPECTION */
+#endif
 		}
 		for (k = 0; LIKELY(k < 16); ++k) {
-			f = _mm_slli_si128(f, 2);
+	#ifdef __BIG_ENDIAN__
+#ifdef __PPC64__
+			f = vec_shiftrightbytes1q(f, 2);
+#else
+			f = _mm_srli_si128(f, 2);
+			/* NEED INSPECTION */
+#endif
+	#else
+#ifdef __PPC64__
+			f = vec_shiftleftbytes1q(f, 2);
+#else
+			f = _mm_slli_si128(f, 2);
+			/* NEED INSPECTION */
+#endif
+	#endif
 			for (j = 0; LIKELY(j < slen); ++j) {
+#ifdef __PPC64__
+				h = vec_load1q(H1 + j);
+#else
 				h = _mm_load_si128(H1 + j);
+				/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+				h = vec_max8sh(h, f);
+#else
 				h = _mm_max_epi16(h, f);
+				/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+				vec_store1q(H1 + j, h);
+#else
 				_mm_store_si128(H1 + j, h);
+				/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+				h = vec_subtractsaturating8uh(h, oe_ins);
+#else
 				h = _mm_subs_epu16(h, oe_ins);
+				/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+				f = vec_subtractsaturating8uh(f, e_ins);
+#else
 				f = _mm_subs_epu16(f, e_ins);
+				/* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+				if(UNLIKELY(!vec_extractupperbit16sb(vec_comparegt8sh(f, h)))) goto end_loop8;
+#else
 				if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop8;
+				/* NEED INSPECTION */
+#endif
 			}
 		}
 end_loop8:
@@ -307,7 +687,12 @@ kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_de
 		if (imax > gmax) {
 			gmax = imax; te = i;
 			for (j = 0; LIKELY(j < slen); ++j)
+#ifdef __PPC64__
+				vec_store1q(Hmax + j, vec_load1q(H1 + j));
+#else
 				_mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
+				/* NEED INSPECTION */
+#endif
 			if (gmax >= endsc) break;
 		}
 		S = H1; H1 = H0; H0 = S;
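The other recurring change in this patch is the #ifdef __BIG_ENDIAN__ guard around the stripe shifts: _mm_slli_si128 and _mm_srli_si128 move bytes by memory position, so the shift that advances the striped H vector by one lane is a left byte shift under little-endian lane numbering but a right byte shift under big-endian numbering, as the original "<< instead of >> because x64 is little-endian" comment already hints. A scalar model of the little-endian case for the byte kernel (a sketch; lanes modelled as a plain array):

    #include <stdint.h>
    #include <string.h>

    /* Little-endian model of _mm_slli_si128(v, 1) / vec_shiftleftbytes1q(v, 1):
       lane j receives old lane j-1 and lane 0 is zero-filled, so H(i-1,-1)
       enters the stripe as 0. The 16-bit kernel shifts by 2 bytes instead. */
    static void advance_stripe_u8(uint8_t lanes[16])
    {
        memmove(lanes + 1, lanes, 15);
        lanes[0] = 0;
    }
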
From 43bd0da0c8e6160feeaf0f3080561c097d566069 Mon Sep 17 00:00:00 2001
From: "Yinhe Cheng" <ycheng@us.ibm.com>
Date: Fri, 3 Jun 2016 18:29:51 -0400
Subject: [PATCH 2/4] Add a header file for PPC64

---
 vec128int.h | 276 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 276 insertions(+)
 create mode 100755 vec128int.h

diff --git a/vec128int.h b/vec128int.h
new file mode 100755
index 00000000..11e7da89
--- /dev/null
+++ b/vec128int.h
@@ -0,0 +1,276 @@
+/******************************************************************************/
+/*                                                                            */
+/* Licensed Materials - Property of IBM                                       */
+/*                                                                            */
+/* IBM Power Vector Intrinsic Functions version 1.0.3                         */
+/*                                                                            */
+/* Copyright IBM Corp. 2014,2016                                              */
+/* US Government Users Restricted Rights - Use, duplication or                */
+/* disclosure restricted by GSA ADP Schedule Contract with IBM Corp.          */
+/*                                                                            */
+/* See the license in the license subdirectory.                               */
+/*                                                                            */
+/* More information on this software is available on the IBM DeveloperWorks   */
+/* website at                                                                 */
+/* https://www.ibm.com/developerworks/community/groups/community/powerveclib  */
+/*                                                                            */
+/******************************************************************************/
+
+#ifndef _H_VEC128INT
+#define _H_VEC128INT
+
+#include <altivec.h>
+
+#define VECLIB_ALIGNED8 __attribute__ ((__aligned__ (8)))
+#define VECLIB_ALIGNED16 __attribute__ ((__aligned__ (16)))
+
+/* Control inlining */
+#ifdef NOT_ALWAYS_INLINE
+	#define VECLIB_INLINE
+#else
+	#define VECLIB_INLINE static inline __attribute__ ((__always_inline__))
+#endif
+#define VECLIB_NOINLINE static __attribute__ ((__noinline__))
+
+#define VECLIB_ALIGNED16 __attribute__ ((__aligned__ (16)))
+
+typedef
+	VECLIB_ALIGNED8
+	unsigned long long
+__m64;
+
+typedef
+	VECLIB_ALIGNED16
+	vector unsigned char
+__m128i;
+
+typedef
+	VECLIB_ALIGNED16
+	union {
+		__m128i as_m128i;
+		__m64 as_m64 [2];
+		vector signed char as_vector_signed_char;
+		vector unsigned char as_vector_unsigned_char;
+		vector bool char as_vector_bool_char;
+		vector signed short as_vector_signed_short;
+		vector unsigned short as_vector_unsigned_short;
+		vector bool short as_vector_bool_short;
+		vector signed int as_vector_signed_int;
+		vector unsigned int as_vector_unsigned_int;
+		vector bool int as_vector_bool_int;
+		vector signed long long as_vector_signed_long_long;
+		vector unsigned long long as_vector_unsigned_long_long;
+		vector bool long long as_vector_bool_long_long;
+		char as_char [16];
+		short as_short [8];
+		int as_int [4];
+		unsigned int as_unsigned_int [4];
+		long long as_long_long [2];
+	} __m128i_union;
+
+typedef const long intlit3;  /* 3 bit int literal */
+typedef const long intlit8;  /* 8 bit int literal */
+
+/******************************************************** Load ********************************************************/
+
+/* Load 128-bits of integer data, aligned */
+VECLIB_INLINE __m128i vec_load1q (__m128i const* address)
+{
+	return (__m128i) vec_ld (0, (vector unsigned char*) address);
+}
+
+/******************************************************** Set *********************************************************/
+
+/* Splat 8-bit char to 16 8-bit chars */
+VECLIB_INLINE __m128i vec_splat16sb (char scalar)
+{ return (__m128i) vec_splats ((signed char) scalar); }
+
+/* Splat 16-bit short to 8 16-bit shorts */
+VECLIB_INLINE __m128i vec_splat8sh (short scalar)
+{ return (__m128i) vec_splats (scalar); }
+
+/* Splat 32-bit int to 4 32-bit ints */
+VECLIB_INLINE __m128i vec_splat4sw (int scalar)
+{ return (__m128i) vec_splats (scalar); }
+
+/******************************************************** Store *******************************************************/
+
+/* Store 128-bits of integer data, aligned */
+VECLIB_INLINE void vec_store1q (__m128i* address, __m128i v)
+{ vec_st (v, 0, address); }
+
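vec_load1q and vec_store1q carry the aligned-access contract of _mm_load_si128/_mm_store_si128: vec_ld and vec_st ignore the low four bits of the effective address, so the pointer must be 16-byte aligned, as the H0/H1/E/Hmax buffers in ksw.c are assumed to be. A minimal usage sketch mirroring the clearing loops in ksw_u8/ksw_i16 (zero_block is a hypothetical helper, not part of the patch):

    #include "vec128int.h"

    static void zero_block(__m128i *buf, int n) /* buf must be 16-byte aligned */
    {
        __m128i z = vec_splat4sw(0);       /* all four 32-bit lanes = 0 */
        int i;
        for (i = 0; i < n; ++i)
            vec_store1q(buf + i, z);       /* like _mm_store_si128(buf + i, z) */
    }
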
+
+/******************************************************* Extract ******************************************************/
+
+/* Extract upper bit of 16 8-bit chars */
+VECLIB_INLINE int vec_extractupperbit16sb (__m128i v)
+{
+	__m128i_union t;
+	t.as_m128i = v;
+	int result = 0;
+	#ifdef __LITTLE_ENDIAN__
+		result |= (t.as_char[15] & 0x80) << (15-7);
+		result |= (t.as_char[14] & 0x80) << (14-7);
+		result |= (t.as_char[13] & 0x80) << (13-7);
+		result |= (t.as_char[12] & 0x80) << (12-7);
+		result |= (t.as_char[11] & 0x80) << (11-7);
+		result |= (t.as_char[10] & 0x80) << (10-7);
+		result |= (t.as_char[9]  & 0x80) << (9-7);
+		result |= (t.as_char[8]  & 0x80) << (8-7);
+		result |= (t.as_char[7]  & 0x80);
+		result |= (t.as_char[6]  & 0x80) >> (7-6);
+		result |= (t.as_char[5]  & 0x80) >> (7-5);
+		result |= (t.as_char[4]  & 0x80) >> (7-4);
+		result |= (t.as_char[3]  & 0x80) >> (7-3);
+		result |= (t.as_char[2]  & 0x80) >> (7-2);
+		result |= (t.as_char[1]  & 0x80) >> (7-1);
+		result |= (t.as_char[0]  & 0x80) >> 7;
+	#elif __BIG_ENDIAN__
+		result |= (t.as_char[0]  & 0x80) << (15-7);
+		result |= (t.as_char[1]  & 0x80) << (14-7);
+		result |= (t.as_char[2]  & 0x80) << (13-7);
+		result |= (t.as_char[3]  & 0x80) << (12-7);
+		result |= (t.as_char[4]  & 0x80) << (11-7);
+		result |= (t.as_char[5]  & 0x80) << (10-7);
+		result |= (t.as_char[6]  & 0x80) << (9-7);
+		result |= (t.as_char[7]  & 0x80) << (8-7);
+		result |= (t.as_char[8]  & 0x80);
+		result |= (t.as_char[9]  & 0x80) >> (7-6);
+		result |= (t.as_char[10] & 0x80) >> (7-5);
+		result |= (t.as_char[11] & 0x80) >> (7-4);
+		result |= (t.as_char[12] & 0x80) >> (7-3);
+		result |= (t.as_char[13] & 0x80) >> (7-2);
+		result |= (t.as_char[14] & 0x80) >> (7-1);
+		result |= (t.as_char[15] & 0x80) >> 7;
+	#endif
+	return result;
+}
+
+/* Extract 16-bit short from one of 8 16-bit shorts */
+VECLIB_INLINE int vec_extract8sh (__m128i v, intlit3 element_from_right)
+{
+	__m128i_union t;
+	#ifdef __LITTLE_ENDIAN__
+		t.as_m128i = v;
+		return t.as_short[element_from_right & 7];
+	#elif __BIG_ENDIAN__
+		static const vector unsigned char permute_selector[8] = {
+			/* To extract specified halfword element into lowest halfword of the left half with other elements zeroed */
+			{ 0x00,0x01, 0x02,0x03, 0x04,0x05, 0x1E,0x1F, 0x08,0x09, 0x0A,0x0B, 0x0C,0x0D, 0x0E,0x0F },  /* element 0 */
+			{ 0x00,0x01, 0x02,0x03, 0x04,0x05, 0x1C,0x1D, 0x08,0x09, 0x0A,0x0B, 0x0C,0x0D, 0x0E,0x0F },  /* element 1 */
+			{ 0x00,0x01, 0x02,0x03, 0x04,0x05, 0x1A,0x1B, 0x08,0x09, 0x0A,0x0B, 0x0C,0x0D, 0x0E,0x0F },  /* element 2 */
+			{ 0x00,0x01, 0x02,0x03, 0x04,0x05, 0x18,0x19, 0x08,0x09, 0x0A,0x0B, 0x0C,0x0D, 0x0E,0x0F },  /* element 3 */
+			{ 0x00,0x01, 0x02,0x03, 0x04,0x05, 0x16,0x17, 0x08,0x09, 0x0A,0x0B, 0x0C,0x0D, 0x0E,0x0F },  /* element 4 */
+			{ 0x00,0x01, 0x02,0x03, 0x04,0x05, 0x14,0x15, 0x08,0x09, 0x0A,0x0B, 0x0C,0x0D, 0x0E,0x0F },  /* element 5 */
+			{ 0x00,0x01, 0x02,0x03, 0x04,0x05, 0x12,0x13, 0x08,0x09, 0x0A,0x0B, 0x0C,0x0D, 0x0E,0x0F },  /* element 6 */
+			{ 0x00,0x01, 0x02,0x03, 0x04,0x05, 0x10,0x11, 0x08,0x09, 0x0A,0x0B, 0x0C,0x0D, 0x0E,0x0F }   /* element 7 */
+		};
+		t.as_m128i = vec_perm (vec_splats ((unsigned char) 0), v, permute_selector[element_from_right & 7]);
+		return (short) t.as_long_long[__vsr_left_half_long_long_in_memory];
+	#endif
+}
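vec_extractupperbit16sb is the stand-in for _mm_movemask_epi8: bit i of the result is the top bit of byte lane i, which is why the lazy-F tests in ksw.c can keep comparing against 0xffff (all lanes true) or 0 (no lane true). vec_extract8sh likewise replaces _mm_extract_epi16, and __max_16 masks its lane-0 result with 0x00ff because the extract reads a full 16-bit lane. A scalar reference for the mask packing (a sketch for checking lane order; lanes use the little-endian numbering of the code above):

    #include <stdint.h>

    /* Reference for vec_extractupperbit16sb / _mm_movemask_epi8:
       bit i of the result = sign bit of byte lane i. */
    static int movemask16_scalar(const int8_t lanes[16])
    {
        int result = 0, i;
        for (i = 0; i < 16; ++i)
            if (lanes[i] < 0) result |= 1 << i;
        return result;
    }
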
+
+/***************************************************** Arithmetic *****************************************************/
+
+/* Add 16 8-bit chars with signed saturation */
+VECLIB_INLINE __m128i vec_addsaturating16sb (__m128i left, __m128i right)
+{
+	return (__m128i) vec_adds ((vector signed char) left, (vector signed char) right);
+}
+
+/* Add 16 8-bit chars with unsigned saturation */
+VECLIB_INLINE __m128i vec_addsaturating16ub (__m128i left, __m128i right)
+{ return (__m128i) vec_adds ((vector unsigned char) left, (vector unsigned char) right); }
+
+/* Add 8 16-bit shorts with signed saturation */
+VECLIB_INLINE __m128i vec_addsaturating8sh (__m128i left, __m128i right)
+{ return (__m128i) vec_adds ((vector signed short) left, (vector signed short) right); }
+
+/* Subtract 16 8-bit chars with unsigned saturation */
+VECLIB_INLINE __m128i vec_subtractsaturating16ub (__m128i left, __m128i right)
+{ return (__m128i) vec_subs ((vector unsigned char) left, (vector unsigned char) right); }
+
+/* Subtract 8 16-bit shorts with unsigned saturation */
+VECLIB_INLINE __m128i vec_subtractsaturating8uh (__m128i left, __m128i right)
+{ return (__m128i) vec_subs ((vector unsigned short) left, (vector unsigned short) right); }
+
+/* Max 8 16-bit shorts */
+VECLIB_INLINE __m128i vec_max8sh (__m128i left, __m128i right)
+{ return (__m128i) vec_max ((vector signed short) left, (vector signed short) right); }
+
+/* Max 16 8-bit unsigned chars */
+VECLIB_INLINE __m128i vec_max16ub (__m128i left, __m128i right)
+{ return (__m128i) vec_max ((vector unsigned char) left, (vector unsigned char) right); }
+
+#ifdef __LITTLE_ENDIAN__
+	#define LEleft_BEright left
+	#define LEright_BEleft right
+#elif __BIG_ENDIAN__
+	#define LEleft_BEright right
+	#define LEright_BEleft left
+#endif
+
+/****************************************************** Shift *********************************************************/
+
+/*- SSE2 shifts >= 32 produce 0; Altivec/MMX shifts by count%count_size. */
+/*- The Altivec spec says the element shift count vector register must have a shift count in each element */
+/*- and the shift counts may be different for each element. */
+/*- The PowerPC Architecture says all elements must contain the same shift count. That wins. */
+/*- The element shift count_size is: byte shift: 3 bits (0-7), halfword: 4 bits (0-15), word: 5 bits (0-31). */
+/*- For full vector shifts the Altivec/PowerPC bit shift count is in the rightmost 7 bits, */
+/*- with a 4 bit slo/sro byte shift count then a 3 bit sll/srl bit shift count. */
+
+/* Shift left */
+
+/* Shift 128-bits left logical immediate by bytes */
+VECLIB_INLINE __m128i vec_shiftleftbytes1q (__m128i v, intlit8 bytecount)
+{
+	if ((unsigned long) bytecount >= 16)
+	{
+		/* SSE2 shifts >= element_size or < 0 produce 0; Altivec/MMX shifts by bytecount%element_size. */
+		return (__m128i) vec_splats (0);
+	} else if (bytecount == 0) {
+		return v;
+	} else {
+		/* The PowerPC byte shift count must be multiplied by 8. */
+		/* It need not but can be replicated, which handles both LE and BE shift count positioning. */
+		__m128i_union replicated_count;
+		replicated_count.as_m128i = vec_splat16sb (bytecount << 3);
+		return (__m128i) vec_slo (v, replicated_count.as_m128i);
+	}
+}
+
+/* Shift right */
+
+/* Shift 128-bits right logical immediate by bytes */
+VECLIB_INLINE __m128i vec_shiftrightbytes1q (__m128i v, intlit8 bytecount)
+{
+	if ((unsigned long) bytecount >= 16)
+	{
+		/* SSE2 shifts >= element_size or < 0 produce 0; Altivec/MMX shifts by bytecount%element_size. */
+		return (__m128i) vec_splats (0);
+	} else if (bytecount == 0) {
+		return v;
+	} else {
+		/* The PowerPC byte shift count must be multiplied by 8. */
+		/* It need not but can be replicated, which handles both LE and BE shift count positioning. */
+		__m128i_union replicated_count;
+		replicated_count.as_m128i = vec_splat16sb (bytecount << 3);
+		/* AT gcc v7.1 may miscompile vec_sro as vec_slo? */
+		return (__m128i) vec_sro (v, replicated_count.as_m128i);
+	}
+}
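These two byte-shift wrappers reproduce _mm_slli_si128/_mm_srli_si128 semantics on top of vec_slo/vec_sro, including the SSE2 rule that a count of 16 or more yields zero rather than wrapping; because vec_slo/vec_sro take a bit count in a vector register, the byte count is scaled by 8 (bytecount << 3) and splatted across the register. A small dispatch macro one could use to unit-test the two implementations side by side (a sketch, not part of the patch):

    #ifdef __PPC64__
    #include "vec128int.h"
    #define SRLI_BYTES(v, n) vec_shiftrightbytes1q((v), (n))
    #else
    #include <emmintrin.h>
    #define SRLI_BYTES(v, n) _mm_srli_si128((v), (n))
    #endif
    /* On both targets, SRLI_BYTES feeds the same __max_16/__max_8 reductions. */
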
+
+/******************************************************* Compare ******************************************************/
+
+/* Compare eq */
+
+/* Compare 16 8-bit chars for == to vector mask */
+VECLIB_INLINE __m128i vec_compareeq16sb (__m128i left, __m128i right)
+{ return (__m128i) vec_cmpeq ((vector signed char) left, (vector signed char) right); }
+
+/* Compare 8 16-bit shorts for > to vector mask */
+VECLIB_INLINE __m128i vec_comparegt8sh (__m128i left, __m128i right)
+{ return (__m128i) vec_cmpgt ((vector signed short) left, (vector signed short) right); }
+
+#endif

From 85655783df2562d1ad94558dabaad30c93c625ee Mon Sep 17 00:00:00 2001
From: yhcheng
Date: Fri, 10 Jun 2016 16:02:14 -0500
Subject: [PATCH 3/4] Update ksw.c

---
 ksw.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ksw.c b/ksw.c
index 5be0b790..a5e0cb49 100644
--- a/ksw.c
+++ b/ksw.c
@@ -28,6 +28,8 @@
 #include <assert.h>
 #ifdef __PPC64__
 #include "vec128int.h"
+#else
+#include <emmintrin.h>
 #endif
 #include "ksw.h"

From 6d97b9f7f0ebdf586d9c2b300084a615a71694a1 Mon Sep 17 00:00:00 2001
From: yhcheng
Date: Tue, 12 Jul 2016 15:29:26 -0500
Subject: [PATCH 4/4] Update vec128int.h

---
 vec128int.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/vec128int.h b/vec128int.h
index 11e7da89..59cd9ee0 100755
--- a/vec128int.h
+++ b/vec128int.h
@@ -24,6 +24,20 @@
 #define VECLIB_ALIGNED8 __attribute__ ((__aligned__ (8)))
 #define VECLIB_ALIGNED16 __attribute__ ((__aligned__ (16)))
 
+#ifdef __LITTLE_ENDIAN__
+	#define __vsx_lowest_left_half_char_in_memory 8
+	#define __vsr_lowest_left_half_short_in_memory 4
+	#define __vsr_lowest_left_half_int_in_memory 2
+	#define __vsr_left_half_long_long_in_memory 1
+	#define __vsr_right_half_long_long_in_memory 0
+#elif __BIG_ENDIAN__
+	#define __vsx_lowest_left_half_char_in_memory 7
+	#define __vsr_lowest_left_half_short_in_memory 3
+	#define __vsr_lowest_left_half_int_in_memory 1
+	#define __vsr_left_half_long_long_in_memory 0
+	#define __vsr_right_half_long_long_in_memory 1
+#endif
+
 /* Control inlining */
 #ifdef NOT_ALWAYS_INLINE
 	#define VECLIB_INLINE
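
With all four patches applied, the target alone selects the intrinsics: __PPC64__ (predefined by GCC on 64-bit POWER) picks vec128int.h, everything else keeps <emmintrin.h>, and the endian macros added here supply the lane indices that vec_extract8sh's big-endian path was already referencing. Plausible build invocations (illustrative flags, not taken from the patch series):

    # x86-64, unchanged:
    gcc -O2 -c ksw.c

    # 64-bit POWER: -maltivec/-mvsx enable the 'vector' types and the
    # <altivec.h> intrinsics that vec128int.h is built on.
    gcc -O2 -maltivec -mvsx -c ksw.c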