summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--crc32.c114
1 files changed, 114 insertions, 0 deletions
diff --git a/crc32.c b/crc32.c
index f6cd52f..2d20829 100644
--- a/crc32.c
+++ b/crc32.c
@@ -617,6 +617,118 @@ const z_crc_t FAR * ZEXPORT get_crc_table()
return (const z_crc_t FAR *)crc_table;
}
+/* =========================================================================
+ * Use ARM machine instructions if requested. This will compute the CRC about
+ * ten times faster than the braided calculation. This code does not check for
+ * the presence of the CRC instruction. Compile with care.
+ */
+#if defined(Z_ARM_CRC32) && defined(__aarch64__) && W == 8
+
+/*
+ Constants empirically determined to maximize speed. These values are from
+ measurements on a Cortex-A57. Your mileage may vary.
+ */
+#define Z_BATCH 3990 /* number of words in a batch */
+#define Z_BATCH_ZEROS 0xa10d3d0c /* computed from Z_BATCH = 3990 */
+#define Z_BATCH_MIN 800 /* fewest words in a final batch */
+
+unsigned long ZEXPORT crc32_z(crc, buf, len)
+ unsigned long crc;
+ const unsigned char FAR *buf;
+ z_size_t len;
+{
+ z_crc_t val;
+ z_word_t crc1, crc2;
+ const z_word_t *word;
+ z_word_t val0, val1, val2;
+ z_size_t last, last2, i;
+ z_size_t num;
+
+ /* Return initial CRC, if requested. */
+ if (buf == Z_NULL) return 0;
+
+#ifdef DYNAMIC_CRC_TABLE
+ once(&made, make_crc_table);
+#endif /* DYNAMIC_CRC_TABLE */
+
+ /* Pre-condition the CRC */
+ crc ^= 0xffffffff;
+
+ /* Compute the CRC up to a word boundary. */
+ while (len && ((z_size_t)buf & 7) != 0) {
+ len--;
+ val = *buf++;
+ __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
+ }
+
+ /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
+ word = (z_word_t const *)buf;
+ num = len >> 3;
+ len &= 7;
+
+ /* Do three interleaved CRCs to realize the throughput of one crc32x
+ instruction per cycle. Each CRC is calcuated on Z_BATCH words. The three
+ CRCs are combined into a single CRC after each set of batches. */
+ while (num >= 3 * Z_BATCH) {
+ crc1 = 0;
+ crc2 = 0;
+ for (i = 0; i < Z_BATCH; i++) {
+ val0 = word[i];
+ val1 = word[i + Z_BATCH];
+ val2 = word[i + 2 * Z_BATCH];
+ __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
+ __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
+ __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
+ }
+ word += 3 * Z_BATCH;
+ num -= 3 * Z_BATCH;
+ crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
+ crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
+ }
+
+ /* Do one last smaller batch with the remaining words, if there are enough
+ to pay for the combination of CRCs. */
+ last = num / 3;
+ if (last >= Z_BATCH_MIN) {
+ last2 = last << 1;
+ crc1 = 0;
+ crc2 = 0;
+ for (i = 0; i < last; i++) {
+ val0 = word[i];
+ val1 = word[i + last];
+ val2 = word[i + last2];
+ __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
+ __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
+ __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
+ }
+ word += 3 * last;
+ num -= 3 * last;
+ val = x2nmodp(last, 6);
+ crc = multmodp(val, crc) ^ crc1;
+ crc = multmodp(val, crc) ^ crc2;
+ }
+
+ /* Compute the CRC on any remaining words. */
+ for (i = 0; i < num; i++) {
+ val0 = word[i];
+ __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
+ }
+ word += num;
+
+ /* Complete the CRC on any remaining bytes. */
+ buf = (const unsigned char FAR *)word;
+ while (len) {
+ len--;
+ val = *buf++;
+ __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
+ }
+
+ /* Return the CRC, post-conditioned. */
+ return crc ^ 0xffffffff;
+}
+
+#else
+
/* ========================================================================= */
unsigned long ZEXPORT crc32_z(crc, buf, len)
unsigned long crc;
@@ -939,6 +1051,8 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
return crc ^ 0xffffffff;
}
+#endif
+
/* ========================================================================= */
unsigned long ZEXPORT crc32(crc, buf, len)
unsigned long crc;