/*
 * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
 * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
 * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
 *
 * gf_w16.c
 *
 * Routines for 16-bit Galois fields
 */

#include "gf_int.h"
#include <stdio.h>
#include <stdlib.h>

#define GF_FIELD_WIDTH (16)
#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH)
#define GF_MULT_GROUP_SIZE GF_FIELD_SIZE-1

#define GF_BASE_FIELD_WIDTH (8)
#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH)

struct gf_w16_logtable_data {
    uint16_t      log_tbl[GF_FIELD_SIZE];
    uint16_t      antilog_tbl[GF_FIELD_SIZE * 2];
    uint16_t      inv_tbl[GF_FIELD_SIZE];
    uint16_t      *d_antilog;
};

struct gf_w16_zero_logtable_data {
    int           log_tbl[GF_FIELD_SIZE];
    uint16_t      _antilog_tbl[GF_FIELD_SIZE * 4];
    uint16_t      *antilog_tbl;
    uint16_t      inv_tbl[GF_FIELD_SIZE];
};

struct gf_w16_lazytable_data {
    uint16_t      log_tbl[GF_FIELD_SIZE];
    uint16_t      antilog_tbl[GF_FIELD_SIZE * 2];
    uint16_t      inv_tbl[GF_FIELD_SIZE];
    uint16_t      *d_antilog;
    uint16_t      lazytable[GF_FIELD_SIZE];
};

struct gf_w16_bytwo_data {
    uint64_t      prim_poly;
    uint64_t      mask1;
    uint64_t      mask2;
};

struct gf_w16_split_8_8_data {
    uint16_t      tables[3][256][256];
};

struct gf_w16_group_4_4_data {
    uint16_t      reduce[16];
    uint16_t      shift[16];
};

struct gf_w16_composite_data {
    uint8_t       *mult_table;
};

#define AB2(ip, am1 ,am2, b, t1, t2) {\
  t1 = (b << 1) & am1;\
  t2 = b & am2; \
  t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \
  b = (t1 ^ (t2 & ip));}

#define SSE_AB2(pp, m1 ,m2, va, t1, t2) {\
  t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \
  t2 = _mm_and_si128(va, m2); \
  t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \
  va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); }

#define MM_PRINT(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 2) printf(" %02x %02x", blah[15-ii], blah[14-ii]); printf("\n"); }

#define GF_FIRST_BIT (1 << 15)
#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ?
(((p) << 1) ^ h->prim_poly) : (p) << 1) static inline gf_val_32_t gf_w16_inverse_from_divide (gf_t *gf, gf_val_32_t a) { return gf->divide.w32(gf, 1, a); } static inline gf_val_32_t gf_w16_divide_from_inverse (gf_t *gf, gf_val_32_t a, gf_val_32_t b) { b = gf->inverse.w32(gf, b); return gf->multiply.w32(gf, a, b); } static void gf_w16_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { gf_region_data rd; uint16_t *s16; uint16_t *d16; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); gf_do_initial_region_alignment(&rd); s16 = (uint16_t *) rd.s_start; d16 = (uint16_t *) rd.d_start; if (xor) { while (d16 < ((uint16_t *) rd.d_top)) { *d16 ^= gf->multiply.w32(gf, val, *s16); d16++; s16++; } } else { while (d16 < ((uint16_t *) rd.d_top)) { *d16 = gf->multiply.w32(gf, val, *s16); d16++; s16++; } } gf_do_final_region_alignment(&rd); } #if defined(INTEL_SSE4_PCLMUL) static void gf_w16_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { gf_region_data rd; uint16_t *s16; uint16_t *d16; __m128i a, b; __m128i result; __m128i prim_poly; __m128i w; gf_internal_t * h = gf->scratch; prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); gf_do_initial_region_alignment(&rd); a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); s16 = (uint16_t *) rd.s_start; d16 = (uint16_t *) rd.d_start; if (xor) { while (d16 < ((uint16_t *) rd.d_top)) { /* see gf_w16_clm_multiply() to see explanation of method */ b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); result = _mm_clmulepi64_si128 (a, b, 0); w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); result = _mm_xor_si128 (result, w); w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); result = _mm_xor_si128 (result, w); *d16 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); d16++; s16++; } } else { while (d16 < ((uint16_t *) rd.d_top)) { /* see gf_w16_clm_multiply() to see explanation of method */ b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); result = _mm_clmulepi64_si128 (a, b, 0); w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); result = _mm_xor_si128 (result, w); w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); result = _mm_xor_si128 (result, w); *d16 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); d16++; s16++; } } gf_do_final_region_alignment(&rd); } #endif #if defined(INTEL_SSE4_PCLMUL) static void gf_w16_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { gf_region_data rd; uint16_t *s16; uint16_t *d16; __m128i a, b; __m128i result; __m128i prim_poly; __m128i w; gf_internal_t * h = gf->scratch; prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); gf_do_initial_region_alignment(&rd); s16 = (uint16_t *) rd.s_start; d16 = (uint16_t *) rd.d_start; if (xor) { while (d16 < ((uint16_t *) rd.d_top)) { /* see gf_w16_clm_multiply() to see explanation of method */ 
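      /* gf_w16_cfm_init() selects this _3 variant when (h->prim_poly & 0xf000) == 0
         but (h->prim_poly & 0xfe00) != 0, so the loop below applies the prim_poly
         reduction three times instead of twice. */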
b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); result = _mm_clmulepi64_si128 (a, b, 0); w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); result = _mm_xor_si128 (result, w); w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); result = _mm_xor_si128 (result, w); w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); result = _mm_xor_si128 (result, w); *d16 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); d16++; s16++; } } else { while (d16 < ((uint16_t *) rd.d_top)) { /* see gf_w16_clm_multiply() to see explanation of method */ b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); result = _mm_clmulepi64_si128 (a, b, 0); w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); result = _mm_xor_si128 (result, w); w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); result = _mm_xor_si128 (result, w); w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); result = _mm_xor_si128 (result, w); *d16 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); d16++; s16++; } } gf_do_final_region_alignment(&rd); } #endif #if defined(INTEL_SSE4_PCLMUL) static void gf_w16_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { gf_region_data rd; uint16_t *s16; uint16_t *d16; __m128i a, b; __m128i result; __m128i prim_poly; __m128i w; gf_internal_t * h = gf->scratch; prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); gf_do_initial_region_alignment(&rd); a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); s16 = (uint16_t *) rd.s_start; d16 = (uint16_t *) rd.d_start; if (xor) { while (d16 < ((uint16_t *) rd.d_top)) { /* see gf_w16_clm_multiply() to see explanation of method */ b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); result = _mm_clmulepi64_si128 (a, b, 0); w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); result = _mm_xor_si128 (result, w); w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); result = _mm_xor_si128 (result, w); w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); result = _mm_xor_si128 (result, w); w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); result = _mm_xor_si128 (result, w); *d16 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); d16++; s16++; } } else { while (d16 < ((uint16_t *) rd.d_top)) { /* see gf_w16_clm_multiply() to see explanation of method */ b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); result = _mm_clmulepi64_si128 (a, b, 0); w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); result = _mm_xor_si128 (result, w); w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); result = _mm_xor_si128 (result, w); w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); result = _mm_xor_si128 (result, w); w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); result = _mm_xor_si128 (result, w); *d16 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); d16++; s16++; } } gf_do_final_region_alignment(&rd); } #endif static inline gf_val_32_t gf_w16_euclid (gf_t *gf, gf_val_32_t b) { gf_val_32_t e_i, e_im1, e_ip1; gf_val_32_t d_i, d_im1, d_ip1; gf_val_32_t y_i, y_im1, y_ip1; gf_val_32_t c_i; if (b == 0) return -1; e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly; e_i = b; d_im1 = 16; for (d_i = d_im1; ((1 << d_i) & e_i) 
== 0; d_i--) ; y_i = 1; y_im1 = 0; while (e_i != 1) { e_ip1 = e_im1; d_ip1 = d_im1; c_i = 0; while (d_ip1 >= d_i) { c_i ^= (1 << (d_ip1 - d_i)); e_ip1 ^= (e_i << (d_ip1 - d_i)); if (e_ip1 == 0) return 0; while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--; } y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i); y_im1 = y_i; y_i = y_ip1; e_im1 = e_i; d_im1 = d_i; e_i = e_ip1; d_i = d_ip1; } return y_i; } static gf_val_32_t gf_w16_extract_word(gf_t *gf, void *start, int bytes, int index) { uint16_t *r16, rv; r16 = (uint16_t *) start; rv = r16[index]; return rv; } static gf_val_32_t gf_w16_composite_extract_word(gf_t *gf, void *start, int bytes, int index) { int sub_size; gf_internal_t *h; uint8_t *r8, *top; uint16_t a, b, *r16; gf_region_data rd; h = (gf_internal_t *) gf->scratch; gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32); r16 = (uint16_t *) start; if (r16 + index < (uint16_t *) rd.d_start) return r16[index]; if (r16 + index >= (uint16_t *) rd.d_top) return r16[index]; index -= (((uint16_t *) rd.d_start) - r16); r8 = (uint8_t *) rd.d_start; top = (uint8_t *) rd.d_top; sub_size = (top-r8)/2; a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index); b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index); return (a | (b << 8)); } static gf_val_32_t gf_w16_split_extract_word(gf_t *gf, void *start, int bytes, int index) { uint16_t *r16, rv; uint8_t *r8; gf_region_data rd; gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32); r16 = (uint16_t *) start; if (r16 + index < (uint16_t *) rd.d_start) return r16[index]; if (r16 + index >= (uint16_t *) rd.d_top) return r16[index]; index -= (((uint16_t *) rd.d_start) - r16); r8 = (uint8_t *) rd.d_start; r8 += ((index & 0xfffffff0)*2); r8 += (index & 0xf); rv = (*r8 << 8); r8 += 16; rv |= *r8; return rv; } static inline gf_val_32_t gf_w16_matrix (gf_t *gf, gf_val_32_t b) { return gf_bitmatrix_inverse(b, 16, ((gf_internal_t *) (gf->scratch))->prim_poly); } /* JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm. I only include it for completeness. It does have the feature that it requires no extra memory. */ static inline gf_val_32_t gf_w16_clm_multiply_2 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) { gf_val_32_t rv = 0; #if defined(INTEL_SSE4_PCLMUL) __m128i a, b; __m128i result; __m128i prim_poly; __m128i w; gf_internal_t * h = gf->scratch; a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0); b = _mm_insert_epi32 (a, b16, 0); prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); /* Do the initial multiply */ result = _mm_clmulepi64_si128 (a, b, 0); /* Ben: Do prim_poly reduction twice. We are guaranteed that we will only have to do the reduction at most twice, because (w-2)/z == 2. Where z is equal to the number of zeros after the leading 1 _mm_clmulepi64_si128 is the carryless multiply operation. Here _mm_srli_si128 shifts the result to the right by 2 bytes. This allows us to multiply the prim_poly by the leading bits of the result. We then xor the result of that operation back with the result.*/ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); result = _mm_xor_si128 (result, w); w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); result = _mm_xor_si128 (result, w); /* Extracts 32 bit value from result. 
*/ rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); #endif return rv; } static inline gf_val_32_t gf_w16_clm_multiply_3 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) { gf_val_32_t rv = 0; #if defined(INTEL_SSE4_PCLMUL) __m128i a, b; __m128i result; __m128i prim_poly; __m128i w; gf_internal_t * h = gf->scratch; a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0); b = _mm_insert_epi32 (a, b16, 0); prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); /* Do the initial multiply */ result = _mm_clmulepi64_si128 (a, b, 0); w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); result = _mm_xor_si128 (result, w); w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); result = _mm_xor_si128 (result, w); w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); result = _mm_xor_si128 (result, w); /* Extracts 32 bit value from result. */ rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); #endif return rv; } static inline gf_val_32_t gf_w16_clm_multiply_4 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) { gf_val_32_t rv = 0; #if defined(INTEL_SSE4_PCLMUL) __m128i a, b; __m128i result; __m128i prim_poly; __m128i w; gf_internal_t * h = gf->scratch; a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0); b = _mm_insert_epi32 (a, b16, 0); prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); /* Do the initial multiply */ result = _mm_clmulepi64_si128 (a, b, 0); w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); result = _mm_xor_si128 (result, w); w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); result = _mm_xor_si128 (result, w); w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); result = _mm_xor_si128 (result, w); w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); result = _mm_xor_si128 (result, w); /* Extracts 32 bit value from result. 
*/ rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); #endif return rv; } static inline gf_val_32_t gf_w16_shift_multiply (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) { gf_val_32_t product, i, pp, a, b; gf_internal_t *h; a = a16; b = b16; h = (gf_internal_t *) gf->scratch; pp = h->prim_poly; product = 0; for (i = 0; i < GF_FIELD_WIDTH; i++) { if (a & (1 << i)) product ^= (b << i); } for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) { if (product & (1 << i)) product ^= (pp << (i-GF_FIELD_WIDTH)); } return product; } static int gf_w16_shift_init(gf_t *gf) { gf->multiply.w32 = gf_w16_shift_multiply; return 1; } static int gf_w16_cfm_init(gf_t *gf) { #if defined(INTEL_SSE4_PCLMUL) gf_internal_t *h; h = (gf_internal_t *) gf->scratch; /*Ben: Determining how many reductions to do */ if ((0xfe00 & h->prim_poly) == 0) { gf->multiply.w32 = gf_w16_clm_multiply_2; gf->multiply_region.w32 = gf_w16_clm_multiply_region_from_single_2; } else if((0xf000 & h->prim_poly) == 0) { gf->multiply.w32 = gf_w16_clm_multiply_3; gf->multiply_region.w32 = gf_w16_clm_multiply_region_from_single_3; } else if ((0xe000 & h->prim_poly) == 0) { gf->multiply.w32 = gf_w16_clm_multiply_4; gf->multiply_region.w32 = gf_w16_clm_multiply_region_from_single_4; } else { return 0; } return 1; #endif return 0; } /* KMG: GF_MULT_LOGTABLE: */ static void gf_w16_log_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { uint16_t *s16, *d16; int lv; struct gf_w16_logtable_data *ltd; gf_region_data rd; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); gf_do_initial_region_alignment(&rd); ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; s16 = (uint16_t *) rd.s_start; d16 = (uint16_t *) rd.d_start; lv = ltd->log_tbl[val]; if (xor) { while (d16 < (uint16_t *) rd.d_top) { *d16 ^= (*s16 == 0 ? 0 : ltd->antilog_tbl[lv + ltd->log_tbl[*s16]]); d16++; s16++; } } else { while (d16 < (uint16_t *) rd.d_top) { *d16 = (*s16 == 0 ? 0 : ltd->antilog_tbl[lv + ltd->log_tbl[*s16]]); d16++; s16++; } } gf_do_final_region_alignment(&rd); } static inline gf_val_32_t gf_w16_log_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { struct gf_w16_logtable_data *ltd; ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; return (a == 0 || b == 0) ? 
0 : ltd->antilog_tbl[(int) ltd->log_tbl[a] + (int) ltd->log_tbl[b]]; } static inline gf_val_32_t gf_w16_log_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { int log_sum = 0; struct gf_w16_logtable_data *ltd; if (a == 0 || b == 0) return 0; ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; log_sum = (int) ltd->log_tbl[a] - (int) ltd->log_tbl[b]; return (ltd->d_antilog[log_sum]); } static gf_val_32_t gf_w16_log_inverse(gf_t *gf, gf_val_32_t a) { struct gf_w16_logtable_data *ltd; ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; return (ltd->inv_tbl[a]); } static int gf_w16_log_init(gf_t *gf) { gf_internal_t *h; struct gf_w16_logtable_data *ltd; int i, b; int check = 0; h = (gf_internal_t *) gf->scratch; ltd = h->private; for (i = 0; i < GF_MULT_GROUP_SIZE+1; i++) ltd->log_tbl[i] = 0; ltd->d_antilog = ltd->antilog_tbl + GF_MULT_GROUP_SIZE; b = 1; for (i = 0; i < GF_MULT_GROUP_SIZE; i++) { if (ltd->log_tbl[b] != 0) check = 1; ltd->log_tbl[b] = i; ltd->antilog_tbl[i] = b; ltd->antilog_tbl[i+GF_MULT_GROUP_SIZE] = b; b <<= 1; if (b & GF_FIELD_SIZE) { b = b ^ h->prim_poly; } } /* If you can't construct the log table, there's a problem. This code is used for some other implementations (e.g. in SPLIT), so if the log table doesn't work in that instance, use CARRY_FREE / SHIFT instead. */ if (check) { if (h->mult_type != GF_MULT_LOG_TABLE) { #if defined(INTEL_SSE4_PCLMUL) return gf_w16_cfm_init(gf); #endif return gf_w16_shift_init(gf); } else { _gf_errno = GF_E_LOGPOLY; return 0; } } ltd->inv_tbl[0] = 0; /* Not really, but we need to fill it with something */ ltd->inv_tbl[1] = 1; for (i = 2; i < GF_FIELD_SIZE; i++) { ltd->inv_tbl[i] = ltd->antilog_tbl[GF_MULT_GROUP_SIZE-ltd->log_tbl[i]]; } gf->inverse.w32 = gf_w16_log_inverse; gf->divide.w32 = gf_w16_log_divide; gf->multiply.w32 = gf_w16_log_multiply; gf->multiply_region.w32 = gf_w16_log_multiply_region; return 1; } /* JSP: GF_MULT_SPLIT_TABLE: Using 8 multiplication tables to leverage SSE instructions. */ /* Ben: Does alternate mapping multiplication using a split table in the lazy method without sse instructions*/ static void gf_w16_split_4_16_lazy_nosse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { uint64_t i, j, c, prod; uint8_t *s8, *d8, *top; uint16_t table[4][16]; gf_region_data rd; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); gf_do_initial_region_alignment(&rd); /*Ben: Constructs lazy multiplication table*/ for (j = 0; j < 16; j++) { for (i = 0; i < 4; i++) { c = (j << (i*4)); table[i][j] = gf->multiply.w32(gf, c, val); } } /*Ben: s8 is the start of source, d8 is the start of dest, top is end of dest region. */ s8 = (uint8_t *) rd.s_start; d8 = (uint8_t *) rd.d_start; top = (uint8_t *) rd.d_top; while (d8 < top) { /*Ben: Multiplies across 16 two byte quantities using alternate mapping high bits are on the left, low bits are on the right. */ for (j=0;j<16;j++) { /*Ben: If the xor flag is set, the product should include what is in dest */ prod = (xor) ? 
((uint16_t)(*d8)<<8) ^ *(d8+16) : 0; /*Ben: xors all 4 table lookups into the product variable*/ prod ^= ((table[0][*(s8+16)&0xf]) ^ (table[1][(*(s8+16)&0xf0)>>4]) ^ (table[2][*(s8)&0xf]) ^ (table[3][(*(s8)&0xf0)>>4])); /*Ben: Stores product in the destination and moves on*/ *d8 = (uint8_t)(prod >> 8); *(d8+16) = (uint8_t)(prod & 0x00ff); s8++; d8++; } s8+=16; d8+=16; } gf_do_final_region_alignment(&rd); } static void gf_w16_split_4_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { uint64_t i, j, a, c, prod; uint16_t *s16, *d16, *top; uint16_t table[4][16]; gf_region_data rd; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); gf_do_initial_region_alignment(&rd); for (j = 0; j < 16; j++) { for (i = 0; i < 4; i++) { c = (j << (i*4)); table[i][j] = gf->multiply.w32(gf, c, val); } } s16 = (uint16_t *) rd.s_start; d16 = (uint16_t *) rd.d_start; top = (uint16_t *) rd.d_top; while (d16 < top) { a = *s16; prod = (xor) ? *d16 : 0; for (i = 0; i < 4; i++) { prod ^= table[i][a&0xf]; a >>= 4; } *d16 = prod; s16++; d16++; } } static void gf_w16_split_8_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { uint64_t j, k, v, a, prod, *s64, *d64, *top64; gf_internal_t *h; uint64_t htable[256], ltable[256]; gf_region_data rd; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); gf_do_initial_region_alignment(&rd); h = (gf_internal_t *) gf->scratch; v = val; ltable[0] = 0; for (j = 1; j < 256; j <<= 1) { for (k = 0; k < j; k++) ltable[k^j] = (v ^ ltable[k]); v = GF_MULTBY_TWO(v); } htable[0] = 0; for (j = 1; j < 256; j <<= 1) { for (k = 0; k < j; k++) htable[k^j] = (v ^ htable[k]); v = GF_MULTBY_TWO(v); } s64 = (uint64_t *) rd.s_start; d64 = (uint64_t *) rd.d_start; top64 = (uint64_t *) rd.d_top; /* Does Unrolling Matter? -- Doesn't seem to. while (d64 != top64) { a = *s64; prod = htable[a >> 56]; a <<= 8; prod ^= ltable[a >> 56]; a <<= 8; prod <<= 16; prod ^= htable[a >> 56]; a <<= 8; prod ^= ltable[a >> 56]; a <<= 8; prod <<= 16; prod ^= htable[a >> 56]; a <<= 8; prod ^= ltable[a >> 56]; a <<= 8; prod <<= 16; prod ^= htable[a >> 56]; a <<= 8; prod ^= ltable[a >> 56]; prod ^= ((xor) ? *d64 : 0); *d64 = prod; s64++; d64++; } */ while (d64 != top64) { a = *s64; prod = 0; for (j = 0; j < 4; j++) { prod <<= 16; prod ^= htable[a >> 56]; a <<= 8; prod ^= ltable[a >> 56]; a <<= 8; } //JSP: We can move the conditional outside the while loop, but we need to fully test it to understand which is better. prod ^= ((xor) ? 
*d64 : 0); *d64 = prod; s64++; d64++; } gf_do_final_region_alignment(&rd); } static void gf_w16_table_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { uint64_t c; gf_internal_t *h; struct gf_w16_lazytable_data *ltd; gf_region_data rd; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); gf_do_initial_region_alignment(&rd); h = (gf_internal_t *) gf->scratch; ltd = (struct gf_w16_lazytable_data *) h->private; ltd->lazytable[0] = 0; /* a = val; c = 1; pp = h->prim_poly; do { ltd->lazytable[c] = a; c <<= 1; if (c & (1 << GF_FIELD_WIDTH)) c ^= pp; a <<= 1; if (a & (1 << GF_FIELD_WIDTH)) a ^= pp; } while (c != 1); */ for (c = 1; c < GF_FIELD_SIZE; c++) { ltd->lazytable[c] = gf_w16_shift_multiply(gf, c, val); } gf_two_byte_region_table_multiply(&rd, ltd->lazytable); gf_do_final_region_alignment(&rd); } static void gf_w16_split_4_16_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { #ifdef INTEL_SSSE3 uint64_t i, j, *s64, *d64, *top64;; uint64_t c, prod; uint8_t low[4][16]; uint8_t high[4][16]; gf_region_data rd; __m128i mask, ta, tb, ti, tpl, tph, tlow[4], thigh[4], tta, ttb, lmask; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); gf_do_initial_region_alignment(&rd); for (j = 0; j < 16; j++) { for (i = 0; i < 4; i++) { c = (j << (i*4)); prod = gf->multiply.w32(gf, c, val); low[i][j] = (prod & 0xff); high[i][j] = (prod >> 8); } } for (i = 0; i < 4; i++) { tlow[i] = _mm_loadu_si128((__m128i *)low[i]); thigh[i] = _mm_loadu_si128((__m128i *)high[i]); } s64 = (uint64_t *) rd.s_start; d64 = (uint64_t *) rd.d_start; top64 = (uint64_t *) rd.d_top; mask = _mm_set1_epi8 (0x0f); lmask = _mm_set1_epi16 (0xff); if (xor) { while (d64 != top64) { ta = _mm_load_si128((__m128i *) s64); tb = _mm_load_si128((__m128i *) (s64+2)); tta = _mm_srli_epi16(ta, 8); ttb = _mm_srli_epi16(tb, 8); tpl = _mm_and_si128(tb, lmask); tph = _mm_and_si128(ta, lmask); tb = _mm_packus_epi16(tpl, tph); ta = _mm_packus_epi16(ttb, tta); ti = _mm_and_si128 (mask, tb); tph = _mm_shuffle_epi8 (thigh[0], ti); tpl = _mm_shuffle_epi8 (tlow[0], ti); tb = _mm_srli_epi16(tb, 4); ti = _mm_and_si128 (mask, tb); tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[1], ti), tpl); tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[1], ti), tph); ti = _mm_and_si128 (mask, ta); tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[2], ti), tpl); tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[2], ti), tph); ta = _mm_srli_epi16(ta, 4); ti = _mm_and_si128 (mask, ta); tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[3], ti), tpl); tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[3], ti), tph); ta = _mm_unpackhi_epi8(tpl, tph); tb = _mm_unpacklo_epi8(tpl, tph); tta = _mm_load_si128((__m128i *) d64); ta = _mm_xor_si128(ta, tta); ttb = _mm_load_si128((__m128i *) (d64+2)); tb = _mm_xor_si128(tb, ttb); _mm_store_si128 ((__m128i *)d64, ta); _mm_store_si128 ((__m128i *)(d64+2), tb); d64 += 4; s64 += 4; } } else { while (d64 != top64) { ta = _mm_load_si128((__m128i *) s64); tb = _mm_load_si128((__m128i *) (s64+2)); tta = _mm_srli_epi16(ta, 8); ttb = _mm_srli_epi16(tb, 8); tpl = _mm_and_si128(tb, lmask); tph = _mm_and_si128(ta, lmask); tb = _mm_packus_epi16(tpl, tph); ta = _mm_packus_epi16(ttb, tta); ti = _mm_and_si128 (mask, tb); tph = _mm_shuffle_epi8 
(thigh[0], ti); tpl = _mm_shuffle_epi8 (tlow[0], ti); tb = _mm_srli_epi16(tb, 4); ti = _mm_and_si128 (mask, tb); tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[1], ti), tpl); tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[1], ti), tph); ti = _mm_and_si128 (mask, ta); tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[2], ti), tpl); tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[2], ti), tph); ta = _mm_srli_epi16(ta, 4); ti = _mm_and_si128 (mask, ta); tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[3], ti), tpl); tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[3], ti), tph); ta = _mm_unpackhi_epi8(tpl, tph); tb = _mm_unpacklo_epi8(tpl, tph); _mm_store_si128 ((__m128i *)d64, ta); _mm_store_si128 ((__m128i *)(d64+2), tb); d64 += 4; s64 += 4; } } gf_do_final_region_alignment(&rd); #endif } static void gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { #ifdef INTEL_SSSE3 uint64_t i, j, *s64, *d64, *top64;; uint64_t c, prod; uint8_t low[4][16]; uint8_t high[4][16]; gf_region_data rd; __m128i mask, ta, tb, ti, tpl, tph, tlow[4], thigh[4]; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); gf_do_initial_region_alignment(&rd); for (j = 0; j < 16; j++) { for (i = 0; i < 4; i++) { c = (j << (i*4)); prod = gf->multiply.w32(gf, c, val); low[i][j] = (prod & 0xff); high[i][j] = (prod >> 8); } } for (i = 0; i < 4; i++) { tlow[i] = _mm_loadu_si128((__m128i *)low[i]); thigh[i] = _mm_loadu_si128((__m128i *)high[i]); } s64 = (uint64_t *) rd.s_start; d64 = (uint64_t *) rd.d_start; top64 = (uint64_t *) rd.d_top; mask = _mm_set1_epi8 (0x0f); if (xor) { while (d64 != top64) { ta = _mm_load_si128((__m128i *) s64); tb = _mm_load_si128((__m128i *) (s64+2)); ti = _mm_and_si128 (mask, tb); tph = _mm_shuffle_epi8 (thigh[0], ti); tpl = _mm_shuffle_epi8 (tlow[0], ti); tb = _mm_srli_epi16(tb, 4); ti = _mm_and_si128 (mask, tb); tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[1], ti), tpl); tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[1], ti), tph); ti = _mm_and_si128 (mask, ta); tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[2], ti), tpl); tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[2], ti), tph); ta = _mm_srli_epi16(ta, 4); ti = _mm_and_si128 (mask, ta); tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[3], ti), tpl); tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[3], ti), tph); ta = _mm_load_si128((__m128i *) d64); tph = _mm_xor_si128(tph, ta); _mm_store_si128 ((__m128i *)d64, tph); tb = _mm_load_si128((__m128i *) (d64+2)); tpl = _mm_xor_si128(tpl, tb); _mm_store_si128 ((__m128i *)(d64+2), tpl); d64 += 4; s64 += 4; } } else { while (d64 != top64) { ta = _mm_load_si128((__m128i *) s64); tb = _mm_load_si128((__m128i *) (s64+2)); ti = _mm_and_si128 (mask, tb); tph = _mm_shuffle_epi8 (thigh[0], ti); tpl = _mm_shuffle_epi8 (tlow[0], ti); tb = _mm_srli_epi16(tb, 4); ti = _mm_and_si128 (mask, tb); tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[1], ti), tpl); tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[1], ti), tph); ti = _mm_and_si128 (mask, ta); tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[2], ti), tpl); tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[2], ti), tph); ta = _mm_srli_epi16(ta, 4); ti = _mm_and_si128 (mask, ta); tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[3], ti), tpl); tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[3], ti), tph); _mm_store_si128 ((__m128i *)d64, tph); _mm_store_si128 ((__m128i *)(d64+2), tpl); d64 += 4; s64 += 4; } } gf_do_final_region_alignment(&rd); 
#endif } uint32_t gf_w16_split_8_8_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { uint32_t alow, blow; struct gf_w16_split_8_8_data *d8; gf_internal_t *h; h = (gf_internal_t *) gf->scratch; d8 = (struct gf_w16_split_8_8_data *) h->private; alow = a & 0xff; blow = b & 0xff; a >>= 8; b >>= 8; return d8->tables[0][alow][blow] ^ d8->tables[1][alow][b] ^ d8->tables[1][a][blow] ^ d8->tables[2][a][b]; } static int gf_w16_split_init(gf_t *gf) { gf_internal_t *h; struct gf_w16_split_8_8_data *d8; int i, j, exp, issse3; uint32_t p, basep; h = (gf_internal_t *) gf->scratch; #ifdef INTEL_SSSE3 issse3 = 1; #else issse3 = 0; #endif if (h->arg1 == 8 && h->arg2 == 8) { d8 = (struct gf_w16_split_8_8_data *) h->private; basep = 1; for (exp = 0; exp < 3; exp++) { for (j = 0; j < 256; j++) d8->tables[exp][0][j] = 0; for (i = 0; i < 256; i++) d8->tables[exp][i][0] = 0; d8->tables[exp][1][1] = basep; for (i = 2; i < 256; i++) { if (i&1) { p = d8->tables[exp][i^1][1]; d8->tables[exp][i][1] = p ^ basep; } else { p = d8->tables[exp][i>>1][1]; d8->tables[exp][i][1] = GF_MULTBY_TWO(p); } } for (i = 1; i < 256; i++) { p = d8->tables[exp][i][1]; for (j = 1; j < 256; j++) { if (j&1) { d8->tables[exp][i][j] = d8->tables[exp][i][j^1] ^ p; } else { d8->tables[exp][i][j] = GF_MULTBY_TWO(d8->tables[exp][i][j>>1]); } } } for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep); } gf->multiply.w32 = gf_w16_split_8_8_multiply; gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region; return 1; } /* We'll be using LOG for multiplication, unless the pp isn't primitive. In that case, we'll be using SHIFT. */ gf_w16_log_init(gf); /* Defaults */ if (issse3) { gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_multiply_region; } else { gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region; } if ((h->arg1 == 8 && h->arg2 == 16) || (h->arg2 == 8 && h->arg1 == 16)) { gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region; } else if ((h->arg1 == 4 && h->arg2 == 16) || (h->arg2 == 4 && h->arg1 == 16)) { if (issse3) { if(h->region_type & GF_REGION_ALTMAP && h->region_type & GF_REGION_NOSSE) gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region; else if(h->region_type & GF_REGION_NOSSE) gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region; else if(h->region_type & GF_REGION_ALTMAP) gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_altmap_multiply_region; } else { if(h->region_type & GF_REGION_SSE) return 0; else if(h->region_type & GF_REGION_ALTMAP) gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region; else gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region; } } return 1; } static int gf_w16_table_init(gf_t *gf) { gf_w16_log_init(gf); gf->multiply_region.w32 = gf_w16_table_lazy_multiply_region; return 1; } static void gf_w16_log_zero_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { uint16_t lv; int i; uint16_t *s16, *d16, *top16; struct gf_w16_zero_logtable_data *ltd; gf_region_data rd; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); gf_do_initial_region_alignment(&rd); ltd = (struct gf_w16_zero_logtable_data*) ((gf_internal_t *) gf->scratch)->private; s16 = (uint16_t *) rd.s_start; d16 = (uint16_t *) rd.d_start; top16 = (uint16_t *) rd.d_top; bytes = top16 - d16; lv = ltd->log_tbl[val]; if (xor) { for (i = 0; i < bytes; i++) { d16[i] ^= (ltd->antilog_tbl[lv + 
ltd->log_tbl[s16[i]]]); } } else { for (i = 0; i < bytes; i++) { d16[i] = (ltd->antilog_tbl[lv + ltd->log_tbl[s16[i]]]); } } /* This isn't necessary. */ gf_do_final_region_alignment(&rd); } /* Here -- double-check Kevin */ static inline gf_val_32_t gf_w16_log_zero_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) { struct gf_w16_zero_logtable_data *ltd; ltd = (struct gf_w16_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private; return ltd->antilog_tbl[ltd->log_tbl[a] + ltd->log_tbl[b]]; } static inline gf_val_32_t gf_w16_log_zero_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b) { int log_sum = 0; struct gf_w16_zero_logtable_data *ltd; if (a == 0 || b == 0) return 0; ltd = (struct gf_w16_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private; log_sum = ltd->log_tbl[a] - ltd->log_tbl[b] + (GF_MULT_GROUP_SIZE); return (ltd->antilog_tbl[log_sum]); } static gf_val_32_t gf_w16_log_zero_inverse (gf_t *gf, gf_val_32_t a) { struct gf_w16_zero_logtable_data *ltd; ltd = (struct gf_w16_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private; return (ltd->inv_tbl[a]); } static inline gf_val_32_t gf_w16_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) { uint32_t prod, pp, pmask, amask; gf_internal_t *h; h = (gf_internal_t *) gf->scratch; pp = h->prim_poly; prod = 0; pmask = 0x8000; amask = 0x8000; while (amask != 0) { if (prod & pmask) { prod = ((prod << 1) ^ pp); } else { prod <<= 1; } if (a & amask) prod ^= b; amask >>= 1; } return prod; } static inline gf_val_32_t gf_w16_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) { uint32_t prod, pp, bmask; gf_internal_t *h; h = (gf_internal_t *) gf->scratch; pp = h->prim_poly; prod = 0; bmask = 0x8000; while (1) { if (a & 1) prod ^= b; a >>= 1; if (a == 0) return prod; if (b & bmask) { b = ((b << 1) ^ pp); } else { b <<= 1; } } } static void gf_w16_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { uint64_t *s64, *d64, t1, t2, ta, prod, amask; gf_region_data rd; struct gf_w16_bytwo_data *btd; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } btd = (struct gf_w16_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); gf_do_initial_region_alignment(&rd); s64 = (uint64_t *) rd.s_start; d64 = (uint64_t *) rd.d_start; if (xor) { while (s64 < (uint64_t *) rd.s_top) { prod = 0; amask = 0x8000; ta = *s64; while (amask != 0) { AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2); if (val & amask) prod ^= ta; amask >>= 1; } *d64 ^= prod; d64++; s64++; } } else { while (s64 < (uint64_t *) rd.s_top) { prod = 0; amask = 0x8000; ta = *s64; while (amask != 0) { AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2); if (val & amask) prod ^= ta; amask >>= 1; } *d64 = prod; d64++; s64++; } } gf_do_final_region_alignment(&rd); } #define BYTWO_P_ONESTEP {\ SSE_AB2(pp, m1 ,m2, prod, t1, t2); \ t1 = _mm_and_si128(v, one); \ t1 = _mm_sub_epi16(t1, one); \ t1 = _mm_and_si128(t1, ta); \ prod = _mm_xor_si128(prod, t1); \ v = _mm_srli_epi64(v, 1); } #ifdef INTEL_SSE2 static void gf_w16_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { int i; uint8_t *s8, *d8; uint32_t vrev; __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v; struct gf_w16_bytwo_data *btd; gf_region_data rd; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } btd = 
(struct gf_w16_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); gf_do_initial_region_alignment(&rd); vrev = 0; for (i = 0; i < 16; i++) { vrev <<= 1; if (!(val & (1 << i))) vrev |= 1; } s8 = (uint8_t *) rd.s_start; d8 = (uint8_t *) rd.d_start; pp = _mm_set1_epi16(btd->prim_poly&0xffff); m1 = _mm_set1_epi16((btd->mask1)&0xffff); m2 = _mm_set1_epi16((btd->mask2)&0xffff); one = _mm_set1_epi16(1); while (d8 < (uint8_t *) rd.d_top) { prod = _mm_setzero_si128(); v = _mm_set1_epi16(vrev); ta = _mm_load_si128((__m128i *) s8); tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8); BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp)); d8 += 16; s8 += 16; } gf_do_final_region_alignment(&rd); } #endif #ifdef INTEL_SSE2 static void gf_w16_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w16_bytwo_data *btd) { uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; pp = _mm_set1_epi16(btd->prim_poly&0xffff); m1 = _mm_set1_epi16((btd->mask1)&0xffff); m2 = _mm_set1_epi16((btd->mask2)&0xffff); while (d8 < (uint8_t *) rd->d_top) { va = _mm_load_si128 ((__m128i *)(s8)); SSE_AB2(pp, m1, m2, va, t1, t2); _mm_store_si128((__m128i *)d8, va); d8 += 16; s8 += 16; } } #endif #ifdef INTEL_SSE2 static void gf_w16_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w16_bytwo_data *btd) { uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va, vb; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; pp = _mm_set1_epi16(btd->prim_poly&0xffff); m1 = _mm_set1_epi16((btd->mask1)&0xffff); m2 = _mm_set1_epi16((btd->mask2)&0xffff); while (d8 < (uint8_t *) rd->d_top) { va = _mm_load_si128 ((__m128i *)(s8)); SSE_AB2(pp, m1, m2, va, t1, t2); vb = _mm_load_si128 ((__m128i *)(d8)); vb = _mm_xor_si128(vb, va); _mm_store_si128((__m128i *)d8, vb); d8 += 16; s8 += 16; } } #endif #ifdef INTEL_SSE2 static void gf_w16_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { int itb; uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va, vb; struct gf_w16_bytwo_data *btd; gf_region_data rd; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); gf_do_initial_region_alignment(&rd); btd = (struct gf_w16_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; if (val == 2) { if (xor) { gf_w16_bytwo_b_sse_region_2_xor(&rd, btd); } else { gf_w16_bytwo_b_sse_region_2_noxor(&rd, btd); } gf_do_final_region_alignment(&rd); return; } s8 = (uint8_t *) rd.s_start; d8 = (uint8_t *) rd.d_start; pp = _mm_set1_epi16(btd->prim_poly&0xffff); m1 = _mm_set1_epi16((btd->mask1)&0xffff); m2 = _mm_set1_epi16((btd->mask2)&0xffff); while (d8 < (uint8_t *) rd.d_top) { va = _mm_load_si128 ((__m128i *)(s8)); vb = (!xor) ? 
_mm_setzero_si128() : _mm_load_si128 ((__m128i *)(d8)); itb = val; while (1) { if (itb & 1) vb = _mm_xor_si128(vb, va); itb >>= 1; if (itb == 0) break; SSE_AB2(pp, m1, m2, va, t1, t2); } _mm_store_si128((__m128i *)d8, vb); d8 += 16; s8 += 16; } gf_do_final_region_alignment(&rd); } #endif static void gf_w16_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { uint64_t *s64, *d64, t1, t2, ta, tb, prod; struct gf_w16_bytwo_data *btd; gf_region_data rd; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); gf_do_initial_region_alignment(&rd); btd = (struct gf_w16_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; s64 = (uint64_t *) rd.s_start; d64 = (uint64_t *) rd.d_start; switch (val) { case 2: if (xor) { while (d64 < (uint64_t *) rd.d_top) { ta = *s64; AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); *d64 ^= ta; d64++; s64++; } } else { while (d64 < (uint64_t *) rd.d_top) { ta = *s64; AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); *d64 = ta; d64++; s64++; } } break; case 3: if (xor) { while (d64 < (uint64_t *) rd.d_top) { ta = *s64; prod = ta; AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); *d64 ^= (ta ^ prod); d64++; s64++; } } else { while (d64 < (uint64_t *) rd.d_top) { ta = *s64; prod = ta; AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); *d64 = (ta ^ prod); d64++; s64++; } } break; case 4: if (xor) { while (d64 < (uint64_t *) rd.d_top) { ta = *s64; AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); *d64 ^= ta; d64++; s64++; } } else { while (d64 < (uint64_t *) rd.d_top) { ta = *s64; AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); *d64 = ta; d64++; s64++; } } break; case 5: if (xor) { while (d64 < (uint64_t *) rd.d_top) { ta = *s64; prod = ta; AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); *d64 ^= (ta ^ prod); d64++; s64++; } } else { while (d64 < (uint64_t *) rd.d_top) { ta = *s64; prod = ta; AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); *d64 = ta ^ prod; d64++; s64++; } } break; default: if (xor) { while (d64 < (uint64_t *) rd.d_top) { prod = *d64 ; ta = *s64; tb = val; while (1) { if (tb & 1) prod ^= ta; tb >>= 1; if (tb == 0) break; AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); } *d64 = prod; d64++; s64++; } } else { while (d64 < (uint64_t *) rd.d_top) { prod = 0 ; ta = *s64; tb = val; while (1) { if (tb & 1) prod ^= ta; tb >>= 1; if (tb == 0) break; AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); } *d64 = prod; d64++; s64++; } } break; } gf_do_final_region_alignment(&rd); } static int gf_w16_bytwo_init(gf_t *gf) { gf_internal_t *h; uint64_t ip, m1, m2; struct gf_w16_bytwo_data *btd; h = (gf_internal_t *) gf->scratch; btd = (struct gf_w16_bytwo_data *) (h->private); ip = h->prim_poly & 0xffff; m1 = 0xfffe; m2 = 0x8000; btd->prim_poly = 0; btd->mask1 = 0; btd->mask2 = 0; while (ip != 0) { btd->prim_poly |= ip; btd->mask1 |= m1; btd->mask2 |= m2; ip <<= GF_FIELD_WIDTH; m1 <<= GF_FIELD_WIDTH; m2 <<= GF_FIELD_WIDTH; } if (h->mult_type == GF_MULT_BYTWO_p) { gf->multiply.w32 = gf_w16_bytwo_p_multiply; #ifdef INTEL_SSE2 if (h->region_type & GF_REGION_NOSSE) gf->multiply_region.w32 = 
gf_w16_bytwo_p_nosse_multiply_region; else gf->multiply_region.w32 = gf_w16_bytwo_p_sse_multiply_region; #else gf->multiply_region.w32 = gf_w16_bytwo_p_nosse_multiply_region; if(h->region_type & GF_REGION_SSE) return 0; #endif } else { gf->multiply.w32 = gf_w16_bytwo_b_multiply; #ifdef INTEL_SSE2 if (h->region_type & GF_REGION_NOSSE) gf->multiply_region.w32 = gf_w16_bytwo_b_nosse_multiply_region; else gf->multiply_region.w32 = gf_w16_bytwo_b_sse_multiply_region; #else gf->multiply_region.w32 = gf_w16_bytwo_b_nosse_multiply_region; if(h->region_type & GF_REGION_SSE) return 0; #endif } return 1; } static int gf_w16_log_zero_init(gf_t *gf) { gf_internal_t *h; struct gf_w16_zero_logtable_data *ltd; int i, b; h = (gf_internal_t *) gf->scratch; ltd = h->private; ltd->log_tbl[0] = (-GF_MULT_GROUP_SIZE) + 1; bzero(&(ltd->_antilog_tbl[0]), sizeof(ltd->_antilog_tbl)); ltd->antilog_tbl = &(ltd->_antilog_tbl[GF_FIELD_SIZE * 2]); b = 1; for (i = 0; i < GF_MULT_GROUP_SIZE; i++) { ltd->log_tbl[b] = (uint16_t)i; ltd->antilog_tbl[i] = (uint16_t)b; ltd->antilog_tbl[i+GF_MULT_GROUP_SIZE] = (uint16_t)b; b <<= 1; if (b & GF_FIELD_SIZE) { b = b ^ h->prim_poly; } } ltd->inv_tbl[0] = 0; /* Not really, but we need to fill it with something */ ltd->inv_tbl[1] = 1; for (i = 2; i < GF_FIELD_SIZE; i++) { ltd->inv_tbl[i] = ltd->antilog_tbl[GF_MULT_GROUP_SIZE-ltd->log_tbl[i]]; } gf->inverse.w32 = gf_w16_log_zero_inverse; gf->divide.w32 = gf_w16_log_zero_divide; gf->multiply.w32 = gf_w16_log_zero_multiply; gf->multiply_region.w32 = gf_w16_log_zero_multiply_region; return 1; } static gf_val_32_t gf_w16_composite_multiply_recursive(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf; uint8_t b0 = b & 0x00ff; uint8_t b1 = (b & 0xff00) >> 8; uint8_t a0 = a & 0x00ff; uint8_t a1 = (a & 0xff00) >> 8; uint8_t a1b1; uint16_t rv; a1b1 = base_gf->multiply.w32(base_gf, a1, b1); rv = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 8)); return rv; } static gf_val_32_t gf_w16_composite_multiply_inline(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { gf_internal_t *h = (gf_internal_t *) gf->scratch; uint8_t b0 = b & 0x00ff; uint8_t b1 = (b & 0xff00) >> 8; uint8_t a0 = a & 0x00ff; uint8_t a1 = (a & 0xff00) >> 8; uint8_t a1b1, *mt; uint16_t rv; struct gf_w16_composite_data *cd; cd = (struct gf_w16_composite_data *) h->private; mt = cd->mult_table; a1b1 = GF_W8_INLINE_MULTDIV(mt, a1, b1); rv = ((GF_W8_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | ((GF_W8_INLINE_MULTDIV(mt, a1, b0) ^ GF_W8_INLINE_MULTDIV(mt, a0, b1) ^ GF_W8_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 8)); return rv; } /* * Composite field division trick (explained in 2007 tech report) * * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1 * * let c = b^-1 * * c*b = (s*b1c1+b1c0+b0c1)x+(b1c1+b0c0) * * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1 * * let d = b1c1 and d+1 = b0c0 * * solve s*b1c1+b1c0+b0c1 = 0 * * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1 * * c0 = (d+1)b0^-1 * c1 = d*b1^-1 * * a / b = a * c */ static gf_val_32_t gf_w16_composite_inverse(gf_t *gf, gf_val_32_t a) { gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf; uint8_t a0 = a & 0x00ff; uint8_t a1 = (a & 0xff00) >> 8; uint8_t c0, c1, d, tmp; uint16_t c; uint8_t a0inv, a1inv; if (a0 == 0) { a1inv = base_gf->inverse.w32(base_gf, a1); c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly); 
c1 = a1inv; } else if (a1 == 0) { c0 = base_gf->inverse.w32(base_gf, a0); c1 = 0; } else { a1inv = base_gf->inverse.w32(base_gf, a1); a0inv = base_gf->inverse.w32(base_gf, a0); d = base_gf->multiply.w32(base_gf, a1, a0inv); tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly); tmp = base_gf->inverse.w32(base_gf, tmp); d = base_gf->multiply.w32(base_gf, d, tmp); c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv); c1 = base_gf->multiply.w32(base_gf, d, a1inv); } c = c0 | (c1 << 8); return c; } static void gf_w16_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf; uint8_t b0 = val & 0x00ff; uint8_t b1 = (val & 0xff00) >> 8; uint16_t *s16, *d16, *top; uint8_t a0, a1, a1b1, *mt; gf_region_data rd; struct gf_w16_composite_data *cd; cd = (struct gf_w16_composite_data *) h->private; mt = cd->mult_table; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); s16 = rd.s_start; d16 = rd.d_start; top = rd.d_top; if (mt == NULL) { if (xor) { while (d16 < top) { a0 = (*s16) & 0x00ff; a1 = ((*s16) & 0xff00) >> 8; a1b1 = base_gf->multiply.w32(base_gf, a1, b1); (*d16) ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 8)); s16++; d16++; } } else { while (d16 < top) { a0 = (*s16) & 0x00ff; a1 = ((*s16) & 0xff00) >> 8; a1b1 = base_gf->multiply.w32(base_gf, a1, b1); (*d16) = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 8)); s16++; d16++; } } } else { if (xor) { while (d16 < top) { a0 = (*s16) & 0x00ff; a1 = ((*s16) & 0xff00) >> 8; a1b1 = GF_W8_INLINE_MULTDIV(mt, a1, b1); (*d16) ^= ((GF_W8_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | ((GF_W8_INLINE_MULTDIV(mt, a1, b0) ^ GF_W8_INLINE_MULTDIV(mt, a0, b1) ^ GF_W8_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 8)); s16++; d16++; } } else { while (d16 < top) { a0 = (*s16) & 0x00ff; a1 = ((*s16) & 0xff00) >> 8; a1b1 = GF_W8_INLINE_MULTDIV(mt, a1, b1); (*d16) = ((GF_W8_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | ((GF_W8_INLINE_MULTDIV(mt, a1, b0) ^ GF_W8_INLINE_MULTDIV(mt, a0, b1) ^ GF_W8_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 8)); s16++; d16++; } } } } static void gf_w16_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf; uint8_t val0 = val & 0x00ff; uint8_t val1 = (val & 0xff00) >> 8; gf_region_data rd; int sub_reg_size; uint8_t *slow, *shigh; uint8_t *dlow, *dhigh, *top;; /* JSP: I want the two pointers aligned wrt each other on 16 byte boundaries. So I'm going to make sure that the area on which the two operate is a multiple of 32. Of course, that junks up the mapping, but so be it -- that's why we have extract_word.... 
*/ gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); gf_do_initial_region_alignment(&rd); slow = (uint8_t *) rd.s_start; dlow = (uint8_t *) rd.d_start; top = (uint8_t *) rd.d_top; sub_reg_size = (top - dlow)/2; shigh = slow + sub_reg_size; dhigh = dlow + sub_reg_size; base_gf->multiply_region.w32(base_gf, slow, dlow, val0, sub_reg_size, xor); base_gf->multiply_region.w32(base_gf, shigh, dlow, val1, sub_reg_size, 1); base_gf->multiply_region.w32(base_gf, slow, dhigh, val1, sub_reg_size, xor); base_gf->multiply_region.w32(base_gf, shigh, dhigh, val0, sub_reg_size, 1); base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1); gf_do_final_region_alignment(&rd); } static int gf_w16_composite_init(gf_t *gf) { gf_internal_t *h = (gf_internal_t *) gf->scratch; struct gf_w16_composite_data *cd; if (h->base_gf == NULL) return 0; cd = (struct gf_w16_composite_data *) h->private; cd->mult_table = gf_w8_get_mult_table(h->base_gf); if (h->region_type & GF_REGION_ALTMAP) { gf->multiply_region.w32 = gf_w16_composite_multiply_region_alt; } else { gf->multiply_region.w32 = gf_w16_composite_multiply_region; } if (cd->mult_table == NULL) { gf->multiply.w32 = gf_w16_composite_multiply_recursive; } else { gf->multiply.w32 = gf_w16_composite_multiply_inline; } gf->divide.w32 = NULL; gf->inverse.w32 = gf_w16_composite_inverse; return 1; } static void gf_w16_group_4_set_shift_tables(uint16_t *shift, uint16_t val, gf_internal_t *h) { int i, j; shift[0] = 0; for (i = 0; i < 16; i += 2) { j = (shift[i>>1] << 1); if (j & (1 << 16)) j ^= h->prim_poly; shift[i] = j; shift[i^1] = j^val; } } static inline gf_val_32_t gf_w16_group_4_4_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { uint16_t p, l, ind, r, a16; struct gf_w16_group_4_4_data *d44; gf_internal_t *h = (gf_internal_t *) gf->scratch; d44 = (struct gf_w16_group_4_4_data *) h->private; gf_w16_group_4_set_shift_tables(d44->shift, b, h); a16 = a; ind = a16 >> 12; a16 <<= 4; p = d44->shift[ind]; r = p & 0xfff; l = p >> 12; ind = a16 >> 12; a16 <<= 4; p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4)); r = p & 0xfff; l = p >> 12; ind = a16 >> 12; a16 <<= 4; p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4)); r = p & 0xfff; l = p >> 12; ind = a16 >> 12; p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4)); return p; } static void gf_w16_group_4_4_region_multiply(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { uint16_t p, l, ind, r, a16, p16; struct gf_w16_group_4_4_data *d44; gf_region_data rd; uint16_t *s16, *d16, *top; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } gf_internal_t *h = (gf_internal_t *) gf->scratch; d44 = (struct gf_w16_group_4_4_data *) h->private; gf_w16_group_4_set_shift_tables(d44->shift, val, h); gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); gf_do_initial_region_alignment(&rd); s16 = (uint16_t *) rd.s_start; d16 = (uint16_t *) rd.d_start; top = (uint16_t *) rd.d_top; while (d16 < top) { a16 = *s16; p16 = (xor) ? 
*d16 : 0; ind = a16 >> 12; a16 <<= 4; p = d44->shift[ind]; r = p & 0xfff; l = p >> 12; ind = a16 >> 12; a16 <<= 4; p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4)); r = p & 0xfff; l = p >> 12; ind = a16 >> 12; a16 <<= 4; p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4)); r = p & 0xfff; l = p >> 12; ind = a16 >> 12; p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4)); p ^= p16; *d16 = p; d16++; s16++; } gf_do_final_region_alignment(&rd); } static int gf_w16_group_init(gf_t *gf) { int i, j, p; struct gf_w16_group_4_4_data *d44; gf_internal_t *h = (gf_internal_t *) gf->scratch; d44 = (struct gf_w16_group_4_4_data *) h->private; d44->reduce[0] = 0; for (i = 0; i < 16; i++) { p = 0; for (j = 0; j < 4; j++) { if (i & (1 << j)) p ^= (h->prim_poly << j); } d44->reduce[p>>16] = (p&0xffff); } gf->multiply.w32 = gf_w16_group_4_4_multiply; gf->divide.w32 = NULL; gf->inverse.w32 = NULL; gf->multiply_region.w32 = gf_w16_group_4_4_region_multiply; return 1; } int gf_w16_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) { switch(mult_type) { case GF_MULT_TABLE: return sizeof(gf_internal_t) + sizeof(struct gf_w16_lazytable_data) + 64; break; case GF_MULT_BYTWO_p: case GF_MULT_BYTWO_b: return sizeof(gf_internal_t) + sizeof(struct gf_w16_bytwo_data); break; case GF_MULT_LOG_ZERO: return sizeof(gf_internal_t) + sizeof(struct gf_w16_zero_logtable_data) + 64; break; case GF_MULT_LOG_TABLE: return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64; break; case GF_MULT_DEFAULT: case GF_MULT_SPLIT_TABLE: if (arg1 == 8 && arg2 == 8) { return sizeof(gf_internal_t) + sizeof(struct gf_w16_split_8_8_data) + 64; } else if ((arg1 == 8 && arg2 == 16) || (arg2 == 8 && arg1 == 16)) { return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64; } else if (mult_type == GF_MULT_DEFAULT || (arg1 == 4 && arg2 == 16) || (arg2 == 4 && arg1 == 16)) { return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64; } return 0; break; case GF_MULT_GROUP: return sizeof(gf_internal_t) + sizeof(struct gf_w16_group_4_4_data) + 64; break; case GF_MULT_CARRY_FREE: return sizeof(gf_internal_t); break; case GF_MULT_SHIFT: return sizeof(gf_internal_t); break; case GF_MULT_COMPOSITE: return sizeof(gf_internal_t) + sizeof(struct gf_w16_composite_data) + 64; break; default: return 0; } return 0; } int gf_w16_init(gf_t *gf) { gf_internal_t *h; h = (gf_internal_t *) gf->scratch; /* Allen: set default primitive polynomial / irreducible polynomial if needed */ if (h->prim_poly == 0) { if (h->mult_type == GF_MULT_COMPOSITE) { h->prim_poly = gf_composite_get_default_poly(h->base_gf); if (h->prim_poly == 0) return 0; } else { /* Allen: use the following primitive polynomial to make carryless multiply work more efficiently for GF(2^16). 
h->prim_poly = 0x1002d; The following is the traditional primitive polynomial for GF(2^16) */ h->prim_poly = 0x1100b; } } if (h->mult_type != GF_MULT_COMPOSITE) h->prim_poly |= (1 << 16); gf->multiply.w32 = NULL; gf->divide.w32 = NULL; gf->inverse.w32 = NULL; gf->multiply_region.w32 = NULL; switch(h->mult_type) { case GF_MULT_LOG_ZERO: if (gf_w16_log_zero_init(gf) == 0) return 0; break; case GF_MULT_LOG_TABLE: if (gf_w16_log_init(gf) == 0) return 0; break; case GF_MULT_DEFAULT: case GF_MULT_SPLIT_TABLE: if (gf_w16_split_init(gf) == 0) return 0; break; case GF_MULT_TABLE: if (gf_w16_table_init(gf) == 0) return 0; break; case GF_MULT_CARRY_FREE: if (gf_w16_cfm_init(gf) == 0) return 0; break; case GF_MULT_SHIFT: if (gf_w16_shift_init(gf) == 0) return 0; break; case GF_MULT_COMPOSITE: if (gf_w16_composite_init(gf) == 0) return 0; break; case GF_MULT_BYTWO_p: case GF_MULT_BYTWO_b: if (gf_w16_bytwo_init(gf) == 0) return 0; break; case GF_MULT_GROUP: if (gf_w16_group_init(gf) == 0) return 0; break; default: return 0; } if (h->divide_type == GF_DIVIDE_EUCLID) { gf->divide.w32 = gf_w16_divide_from_inverse; gf->inverse.w32 = gf_w16_euclid; } else if (h->divide_type == GF_DIVIDE_MATRIX) { gf->divide.w32 = gf_w16_divide_from_inverse; gf->inverse.w32 = gf_w16_matrix; } if (gf->divide.w32 == NULL) { gf->divide.w32 = gf_w16_divide_from_inverse; if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w16_euclid; } if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w16_inverse_from_divide; if (h->region_type & GF_REGION_ALTMAP) { if (h->mult_type == GF_MULT_COMPOSITE) { gf->extract_word.w32 = gf_w16_composite_extract_word; } else { gf->extract_word.w32 = gf_w16_split_extract_word; } } else if (h->region_type == GF_REGION_CAUCHY) { gf->multiply_region.w32 = gf_wgen_cauchy_region; gf->extract_word.w32 = gf_wgen_extract_word; } else { gf->extract_word.w32 = gf_w16_extract_word; } if (gf->multiply_region.w32 == NULL) { gf->multiply_region.w32 = gf_w16_multiply_region_from_single; } return 1; } /* Inline setup functions */ uint16_t *gf_w16_get_log_table(gf_t *gf) { struct gf_w16_logtable_data *ltd; if (gf->multiply.w32 == gf_w16_log_multiply) { ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; return (uint16_t *) ltd->log_tbl; } return NULL; } uint16_t *gf_w16_get_mult_alog_table(gf_t *gf) { gf_internal_t *h; struct gf_w16_logtable_data *ltd; h = (gf_internal_t *) gf->scratch; if (gf->multiply.w32 == gf_w16_log_multiply) { ltd = (struct gf_w16_logtable_data *) h->private; return (uint16_t *) ltd->antilog_tbl; } return NULL; } uint16_t *gf_w16_get_div_alog_table(gf_t *gf) { gf_internal_t *h; struct gf_w16_logtable_data *ltd; h = (gf_internal_t *) gf->scratch; if (gf->multiply.w32 == gf_w16_log_multiply) { ltd = (struct gf_w16_logtable_data *) h->private; return (uint16_t *) ltd->d_antilog; } return NULL; }
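
/*
 * Illustrative usage sketch (not part of the library build): the function
 * pointers installed by gf_w16_init() above are normally reached through the
 * public gf_t interface declared in gf_complete.h.  This sketch assumes the
 * standard gf_init_easy()/gf_free() entry points and the w32 call signatures
 * used throughout this file; the GF_W16_USAGE_EXAMPLE guard is invented here
 * and never defined, so the example is compiled out.
 */
#ifdef GF_W16_USAGE_EXAMPLE
#include <stdio.h>
#include <stdlib.h>
#include "gf_complete.h"

int main(void)
{
  gf_t gf;
  uint16_t *src, *dst;
  int i, n = 1024;

  if (gf_init_easy(&gf, 16) == 0) return 1;   /* default GF(2^16) implementation */

  /* Single multiplications go through multiply.w32
     (e.g. gf_w16_log_multiply or gf_w16_clm_multiply_2 above). */
  printf("0x1234 * 0x5678 = 0x%x\n", gf.multiply.w32(&gf, 0x1234, 0x5678));

  /* Region operations multiply a buffer of 16-bit words by one scalar,
     optionally XORing into dest -- this is what the *_multiply_region
     routines above implement.  The SSE region routines in this file assume
     src and dest are aligned identically modulo 16 (see gf_set_region_data). */
  src = (uint16_t *) malloc(n * sizeof(uint16_t));
  dst = (uint16_t *) malloc(n * sizeof(uint16_t));
  for (i = 0; i < n; i++) { src[i] = (uint16_t) i; dst[i] = 0; }
  gf.multiply_region.w32(&gf, src, dst, 0x1234, n * sizeof(uint16_t), 0);

  free(src);
  free(dst);
  gf_free(&gf, 0);
  return 0;
}
#endif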