cr50: replace call to __aeabi_uldivmod with better implementation

FIPS module should be self contained. Adding __aeabi_uldivmod is challenging as it comes from compiler builtin and used by other parts. In this CL we implement udiv32() which divides 64-bit number by 32-bit number. Since it doesn't compute reminder and don't have to support real 64-bit by 64-bit division it's faster. Also, we can use ARM instruction to count leading zeros instead of doing it manually. This code is reused from Ti50 cryptolib code as is. Exhaustive bn_div() test is provided by test/tpm_test/bn_test.c which is now can be built with board/cr50/dcrypto version. BUG=b:138578318 TEST=make BOARD=cr50 test/tpm_test/make CR50=1 build/tpm_test/bn_test Signed-off-by: Vadim Sukhomlinov <sukhomlinov@google.com> Change-Id: I19cebc5c11d3a80bc50732350b7c598bfa374348 Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/platform/ec/+/3042138 Reviewed-by: Vadim Sukhomlinov <sukhomlinov@chromium.org> Reviewed-by: Vadim Bendebury <vbendeb@chromium.org> Tested-by: Vadim Sukhomlinov <sukhomlinov@chromium.org> Auto-Submit: Vadim Sukhomlinov <sukhomlinov@chromium.org> Commit-Queue: Vadim Bendebury <vbendeb@chromium.org>
author: Vadim Sukhomlinov <sukhomlinov@google.com> 2021-07-20 09:28:38 -0700
committer: Commit Bot <commit-bot@chromium.org> 2021-07-21 20:11:03 +0000
commit: 7c1e430fa0cfd8a46030c3ef69fd3ab99ebc133a (patch)
tree: 9d324b1d4de82f23a0e5fb86b06ab9b8ba137a01
parent: afa1e4cddef431500ae1763135c4ace60289931d (diff)
download: chrome-ec-7c1e430fa0cfd8a46030c3ef69fd3ab99ebc133a.tar.gz
2 files changed, 128 insertions, 9 deletions
diff --git a/board/cr50/dcrypto/bn.c b/board/cr50/dcrypto/bn.c
index 94aafa1799..7b68c10d89 100644
--- a/board/cr50/dcrypto/bn.c
+++ b/board/cr50/dcrypto/bn.c
@@ -491,6 +491,121 @@ static void bn_mul_ex(struct LITE_BIGNUM *c,
 	BN_DIGIT(c, i + b->dmax - 1) = carry;
 }
 
+/* Functions to convert between uint32_t and uint64_t */
+static inline uint32_t lo32(uint64_t v)
+{
+	return (uint32_t)v;
+}
+static inline uint32_t hi32(uint64_t v)
+{
+	return (uint32_t)(v >> 32);
+}
+static inline uint64_t make64(uint32_t hi, uint32_t lo)
+{
+	return (((uint64_t)hi) << 32) | lo;
+}
+
+static inline uint32_t lo16(uint32_t v)
+{
+	return (uint32_t)(v)&0xffff;
+}
+
+static inline uint32_t hi16(uint32_t v)
+{
+	return (uint32_t)(v >> 16);
+}
+
+/* make Clang's host behavior of clz match Soteria and avoid UBSAN error */
+static inline int clz(unsigned int x)
+{
+	return (x) ? __builtin_clz(x) : 32;
+}
+
+/**
+ * Unsigned division of 64-bit integer with 32-bit divisor, used to implement
+ * Knuth's long division algorithm. For platforms which don't support hardware
+ * 64 by 32 division we have to either rely on compiler builtins (__udivdi3,
+ * __aeabi_uldivmod) or implement this code explicitly.
+ * Due to potential build issues with dependency on compiler run-time libs,
+ * use our own implementation.
+ *
+ * Algorithm is adapted from GNU's libgcc and optimized for the use case.
+ *
+ */
+#define udiv_qrnnd(q, r, n1, n0, d)                               \
+	{							  \
+		uint32_t __d1, __d0, __q1, __q0, __r1, __r0, __m; \
+		__d1 = hi16(d);                                   \
+		__d0 = lo16(d);                                   \
+								  \
+		__q1 = (n1) / __d1;				  \
+		__r1 = (n1) - (__q1 * __d1);			  \
+		__m = __q1 * __d0;                                \
+		__r1 = (__r1 << 16) | hi16(n0);                   \
+		if (__r1 < __m) {                                 \
+			__q1--;                                   \
+			__r1 += (d);                              \
+			if (__r1 >= (d))                          \
+				if (__r1 < __m)                   \
+					__q1--, __r1 += (d);      \
+		}                                                 \
+		__r1 -= __m;                                      \
+		__q0 = __r1 / __d1;                               \
+		__r0 = __r1 - (__q0 * __d1);                      \
+		__m = __q0 * __d0;                                \
+		__r0 = (__r0 << 16) | lo16(n0);                   \
+		if (__r0 < __m) {                                 \
+			__q0--;                                   \
+			__r0 += (d);                              \
+			if (__r0 >= (d))                          \
+				if (__r0 < __m)                   \
+					__q0--, __r0 += (d);      \
+		}                                                 \
+		__r0 -= __m;                                      \
+								  \
+		(q) = (__q1 << 16) | __q0;                        \
+		(r) = __r0;                                       \
+	}
+
+uint64_t udiv32(uint64_t n, uint32_t d0)
+{
+	uint32_t n0, n1, n2, q0, q1, bm;
+
+	n0 = lo32(n);
+	n1 = hi32(n);
+
+	/* if it's 32-bit division or division by zero, use hardware directly */
+	if (d0 == 0 || n1 == 0)
+		return n0 / d0;
+
+	bm = clz(d0);
+	if (d0 > n1) { /* 0q = nn / 0D */
+		/* make the most significant bit of the denominator set. */
+		if (bm != 0) {
+			d0 = d0 << bm;
+			n1 = (n1 << bm) | (n0 >> (32 - bm));
+			n0 = n0 << bm;
+		}
+		q1 = 0;
+	} else {
+		/* qq = NN / 0d */
+		if (bm == 0) {
+			n1 -= d0;
+			q1 = 1;
+		} else {
+			/* Normalize.  */
+			d0 = d0 << bm;
+			n2 = n1 >> (32 - bm);
+			n1 = (n1 << bm) | (n0 >> (32 - bm));
+			n0 = n0 << bm;
+			udiv_qrnnd(q1, n1, n2, n1, d0);
+		}
+	}
+	udiv_qrnnd(q0, n0, n1, n0, d0);
+	/* Remainder in n0 >> bm, but we don't use it  */
+	return make64(q1, q0);
+}
+
 static int bn_div_word_ex(struct LITE_BIGNUM *q,
 		struct LITE_BIGNUM *r,
 		const struct LITE_BIGNUM *u, int m,
@@ -501,7 +616,7 @@ static int bn_div_word_ex(struct LITE_BIGNUM *q,
 
 	for (i = m - 1; i >= 0; --i) {
 		uint64_t tmp = ((uint64_t)rem << 32) + BN_DIGIT(u, i);
-		uint32_t qd = tmp / div;
+		uint32_t qd = udiv32(tmp, div);
 
 		BN_DIGIT(q, i) = qd;
 		rem = tmp - (uint64_t)qd * div;
@@ -544,11 +659,8 @@ static int bn_div_ex(struct LITE_BIGNUM *q,
 		return bn_div_word_ex(q, r, u, m, vtop);
 
 	/* Compute shift factor to make v have high bit set */
-	s = 0;
-	while ((vtop & 0x80000000) == 0) {
-		s = s + 1;
-		vtop = vtop << 1;
-	}
+	s = clz(vtop);
+	vtop <<= s;
 
 	/* Normalize u and v into un and vn.
 	 * Note un always gains a leading digit
@@ -586,7 +698,7 @@ static int bn_div_ex(struct LITE_BIGNUM *q,
 			uint64_t rhat = ((uint64_t)un[j + n] << 32) +
 				un[j + n - 1];
 
-			qd = rhat / vn[n - 1];
+			qd = udiv32(rhat, vn[n - 1]);
 			rhat = rhat - (uint64_t)qd * vn[n - 1];
 			while ((rhat >> 32) == 0 &&
 				(uint64_t)qd * vn[n - 2] >
diff --git a/test/tpm_test/Makefile b/test/tpm_test/Makefile
index 23f66317f9..29d3e229bc 100644
--- a/test/tpm_test/Makefile
+++ b/test/tpm_test/Makefile
@@ -22,14 +22,21 @@ SWIG = /usr/bin/swig
 PYTHON_INCLUDE = $(shell python3 -c 'import sysconfig; \
 print(sysconfig.get_paths().get("include"))')
 
+ifeq ($(CR50),)
 vpath %c $(src) ../../chip/g/dcrypto $(src)/testlib
+CFLAGS += -I../../chip/g/dcrypto
+else
 
-CFLAGS = -fPIC
+# Use BOARD=cr50 specific implementation
+vpath %c $(src) ../../board/cr50/dcrypto $(src)/testlib
+CFLAGS += -I../../board/cr50/dcrypto
+endif
+
+CFLAGS += -fPIC
 CFLAGS += -I ${PYTHON_INCLUDE}
 CFLAGS += -I../../../../third_party/cryptoc/include
 CFLAGS += -I../../board/cr50
 CFLAGS += -I../../chip/g
-CFLAGS += -I../../chip/g/dcrypto
 CFLAGS += -I../../fuzz
 CFLAGS += -I../../include
 CFLAGS += -I..
author	Vadim Sukhomlinov <sukhomlinov@google.com>	2021-07-20 09:28:38 -0700
committer	Commit Bot <commit-bot@chromium.org>	2021-07-21 20:11:03 +0000
commit	7c1e430fa0cfd8a46030c3ef69fd3ab99ebc133a (patch)
tree	9d324b1d4de82f23a0e5fb86b06ab9b8ba137a01
parent	afa1e4cddef431500ae1763135c4ace60289931d (diff)
download	chrome-ec-7c1e430fa0cfd8a46030c3ef69fd3ab99ebc133a.tar.gz