From df28140cc81bc87b4a278a2876db2c52158996c6 Mon Sep 17 00:00:00 2001 From: Randall Spangler Date: Thu, 29 Jan 2015 14:15:40 -0800 Subject: Add 64-bit divide library routines for Cortex-M, Cortex-M0 These are needed for fixed-point math calculations. Taken from the same public repo where we got the 32-bit divide routines for Cortex-M0. With minor modifications to match changes we've made so far for the existing divide routines (put each function in its own section, delete dead code, etc.) BUG=chrome-os-partner:36126 BRANCH=minnie TEST=hack in a debug console command to allow arbitrary int64 divide, see that it works as expected Change-Id: I987dbca7c806c71bc38fa816971ac3a54c1641a5 Signed-off-by: Randall Spangler Reviewed-on: https://chromium-review.googlesource.com/244501 Reviewed-by: Alec Berg --- core/cortex-m/build.mk | 2 +- core/cortex-m/ldivmod.S | 86 ++++++++++++++++++++++ core/cortex-m/uldivmod.S | 177 ++++++++++++++++++++++++++++++++++++++++++++++ core/cortex-m0/build.mk | 2 +- core/cortex-m0/ldivmod.S | 88 +++++++++++++++++++++++ core/cortex-m0/uldivmod.S | 172 ++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 525 insertions(+), 2 deletions(-) create mode 100644 core/cortex-m/ldivmod.S create mode 100644 core/cortex-m/uldivmod.S create mode 100644 core/cortex-m0/ldivmod.S create mode 100644 core/cortex-m0/uldivmod.S diff --git a/core/cortex-m/build.mk b/core/cortex-m/build.mk index 5306865984..522f6dedeb 100644 --- a/core/cortex-m/build.mk +++ b/core/cortex-m/build.mk @@ -17,7 +17,7 @@ CFLAGS_CPU+=-mthumb -Os -mno-sched-prolog CFLAGS_CPU+=-mno-unaligned-access CFLAGS_CPU+=$(CFLAGS_FPU-y) -core-y=cpu.o init.o +core-y=cpu.o init.o ldivmod.o uldivmod.o core-$(CONFIG_COMMON_PANIC_OUTPUT)+=panic.o core-$(CONFIG_COMMON_RUNTIME)+=switch.o task.o core-$(CONFIG_WATCHDOG)+=watchdog.o diff --git a/core/cortex-m/ldivmod.S b/core/cortex-m/ldivmod.S new file mode 100644 index 0000000000..f3709af41a --- /dev/null +++ b/core/cortex-m/ldivmod.S @@ -0,0 +1,86 @@ 
+/* Runtime ABI for the ARM Cortex-M
+ * ldivmod.S: signed 64 bit division (quotient and remainder)
+ *
+ * Copyright (c) 2012 Jörg Mische
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+ * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+	.syntax unified
+	.text
+	.code 16
+
+@ {long long quotient, long long remainder}
+@ __aeabi_ldivmod(long long numerator, long long denominator)
+@
+@ Divide r1:r0 by r3:r2 and return the quotient in r1:r0 and the remainder in
+@ r3:r2 (all signed)
+@
+@ The operands are made non-negative, __aeabi_uldivmod divides them, then the
+@ quotient is negated if the operand signs differ and the remainder is negated
+@ if the numerator was negative.  r4 is scratch (saved together with lr).
+	.thumb_func
+	.section .text.__aeabi_ldivmod
+	.global __aeabi_ldivmod
+__aeabi_ldivmod:
+
+	cmp	r1, #0
+	bge	L_num_pos
+
+	push	{r4, lr}
+	movs	r4, #0			@ num = -num
+	rsbs	r0, r0, #0
+	sbcs	r4, r1
+	mov	r1, r4
+
+	cmp	r3, #0
+	bge	L_neg_both
+
+	movs	r4, #0			@ den = -den
+	rsbs	r2, r2, #0
+	sbcs	r4, r3
+	mov	r3, r4
+	bl	__aeabi_uldivmod
+	movs	r4, #0			@ rem = -rem
+	rsbs	r2, r2, #0
+	sbcs	r4, r3
+	mov	r3, r4
+	pop	{r4, pc}
+
+L_neg_both:				@ num < 0, den >= 0: negate both results
+	bl	__aeabi_uldivmod
+	movs	r4, #0			@ quot = -quot
+	rsbs	r0, r0, #0
+	sbcs	r4, r1
+	mov	r1, r4
+	movs	r4, #0			@ rem = -rem
+	rsbs	r2, r2, #0
+	sbcs	r4, r3
+	mov	r3, r4
+	pop	{r4, pc}
+
+L_num_pos:
+	cmp	r3, #0
+	bge	__aeabi_uldivmod	@ both >= 0: tail-branch, nothing to fix up
+
+	push	{r4, lr}
+	movs	r4, #0			@ den = -den
+	rsbs	r2, r2, #0
+	sbcs	r4, r3
+	mov	r3, r4
+	bl	__aeabi_uldivmod
+	movs	r4, #0			@ quot = -quot
+	rsbs	r0, r0, #0
+	sbcs	r4, r1
+	mov	r1, r4
+	pop	{r4, pc}
diff --git a/core/cortex-m/uldivmod.S b/core/cortex-m/uldivmod.S
new file mode 100644
index 0000000000..a14bdb203f
--- /dev/null
+++ b/core/cortex-m/uldivmod.S
@@ -0,0 +1,177 @@
+/* Runtime ABI for the ARM Cortex-M
+ * uldivmod.S: unsigned 64 bit division
+ *
+ * Copyright (c) 2012 Jörg Mische
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+ * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+	.syntax unified
+	.text
+	.code 16
+
+@ {unsigned long long quotient, unsigned long long remainder}
+@ __aeabi_uldivmod(unsigned long long numerator, unsigned long long denominator)
+@
+@ Divide r1:r0 by r3:r2 and return the quotient in r1:r0 and the remainder
+@ in r3:r2 (all unsigned)
+@
+@ A zero denominator branches to __aeabi_ldiv0 at the end of this file.
+@ r4-r7 are used as scratch on the shift/subtract paths and are saved and
+@ restored there.
+@
+	.thumb_func
+	.section .text.__aeabi_uldivmod
+	.global __aeabi_uldivmod
+__aeabi_uldivmod:
+
+	cmp	r3, #0
+	bne	L_large_denom
+	cmp	r2, #0
+	beq	L_divison_by_0
+	cmp	r1, #0
+	beq	L_fallback_32bits
+
+	@ case 1: num >= 2^32 and denom < 2^32
+	@ Result might be > 2^32, therefore we first calculate the upper 32
+	@ bits of the result. It is done similar to the calculation of the
+	@ lower 32 bits, but with a denominator that is shifted by 32.
+	@ Hence the lower 32 bits of the denominator are always 0 and the
+	@ costly 64 bit shift and sub operations can be replaced by cheap 32
+	@ bit operations.
+	@ Registers: r7 = shifted denominator, r5 = quotient bitmask,
+	@ r3:r2 = running remainder, r1 = upper quotient word.
+
+	push	{r4, r5, r6, r7, lr}
+
+	@ shift left the denominator until it is greater than the numerator
+	@ denom(r7:r6) = r3:r2 << 32
+
+	@ TODO(crosbug.com/p/36128): Loops like this (which occur in several
+	@ places in this file) are inefficient in ARMv6-m.
+
+	movs	r5, #1		@ bitmask
+	adds	r7, r2, #0	@ don't shift if denominator would overflow
+	bmi	L_upper_result
+	cmp	r1, r7
+	blo	L_upper_result
+
+L_denom_shift_loop1:
+	lsls	r5, #1
+	lsls	r7, #1
+	bmi	L_upper_result	@ don't shift if overflow
+	cmp	r1, r7
+	bhs	L_denom_shift_loop1
+
+L_upper_result:
+	mov	r3, r1
+	lsls	r6, r2, #31	@ r6 = (denom & 1) << 31, low word of denom << 31
+	mov	r2, r0
+	movs	r1, #0		@ upper result = 0
+
+L_sub_loop1:
+	cmp	r3, r7
+	bcc	L_dont_sub1	@ skip unless num >= denom
+
+	subs	r3, r7		@ num -= denom
+	orrs	r1, r5		@ upper result(r1) |= bitmask(r5)
+L_dont_sub1:
+
+	lsrs	r7, #1		@ denom(r7) >>= 1; last pass drops bit 0 (kept in r6)
+	lsrs	r5, #1		@ bitmask(r5) >>= 1
+	bne	L_sub_loop1
+
+	movs	r5, #1
+	lsls	r5, #31		@ bitmask = 1 << 31 for the lower word
+	b	L_lower_result	@ denom(r7:r6) is now the denominator << 31
+
+
+
+	@ case 2: division by 0
+	@ call __aeabi_ldiv0
+
+L_divison_by_0:
+	b	__aeabi_ldiv0
+
+
+
+	@ case 3: num < 2^32 and denom < 2^32
+	@ fallback to 32 bit division
+
+L_fallback_32bits:
+	mov	r1, r0		@ keep the numerator for the remainder
+	udiv	r0, r2		@ r0 = quotient
+	mul	r3, r0, r2	@ r3 = quotient * divisor
+	subs	r2, r1, r3	@ r2 = remainder = numerator - quotient * divisor
+	movs	r1, #0
+	movs	r3, #0
+	bx	lr
+
+
+
+	@ case 4: denom >= 2^32
+	@ result is smaller than 2^32
+
+L_large_denom:
+	push	{r4, r5, r6, r7, lr}
+
+	mov	r7, r3
+	mov	r6, r2
+	mov	r3, r1
+	mov	r2, r0
+
+	@ Shift left the denominator until it is greater than the numerator
+
+	movs	r1, #0		@ high word of result is 0
+	movs	r5, #1		@ bitmask
+	adds	r7, #0		@ don't shift if denominator would overflow
+	bmi	L_lower_result
+	cmp	r3, r7
+	blo	L_lower_result
+
+L_denom_shift_loop4:
+	lsls	r5, #1
+	lsls	r7, #1
+	lsls	r6, #1
+	adcs	r7, r1		@ r1=0
+	bmi	L_lower_result	@ don't shift if overflow
+	cmp	r3, r7
+	bhs	L_denom_shift_loop4
+
+
+
+L_lower_result:
+	movs	r0, #0
+
+L_sub_loop4:
+	mov	r4, r3
+	cmp	r2, r6
+	sbcs	r4, r7
+	bcc	L_dont_sub4	@ skip unless num >= denom
+
+	subs	r2, r6		@ numerator -= denom
+	sbcs	r3, r7
+	orrs	r0, r5		@ lower result(r0) |= bitmask(r5)
+L_dont_sub4:
+
+	lsls	r4, r7, #31	@ denom(r7:r6) >>= 1
+	lsrs	r6, #1
+	lsrs	r7, #1
+	orrs	r6, r4
+	lsrs	r5, #1		@ bitmask(r5) >>= 1
+	bne	L_sub_loop4
+
+	pop	{r4, r5, r6, r7, pc}
+
+__aeabi_ldiv0:
+	bl	panic_reboot	@ defined elsewhere; presumably does not return
diff --git a/core/cortex-m0/build.mk b/core/cortex-m0/build.mk
index 13cef2efc2..c87df856e0 100644
--- a/core/cortex-m0/build.mk
+++ b/core/cortex-m0/build.mk
@@ -13,7 +13,7 @@ CROSS_COMPILE?=arm-none-eabi-
 CFLAGS_CPU+=-mthumb -Os -mno-sched-prolog
 CFLAGS_CPU+=-mno-unaligned-access
 
-core-y=cpu.o init.o thumb_case.o div.o lmul.o
+core-y=cpu.o init.o thumb_case.o div.o lmul.o ldivmod.o uldivmod.o
 core-$(CONFIG_COMMON_PANIC_OUTPUT)+=panic.o
 core-$(CONFIG_COMMON_RUNTIME)+=switch.o task.o
 core-$(CONFIG_WATCHDOG)+=watchdog.o
diff --git a/core/cortex-m0/ldivmod.S b/core/cortex-m0/ldivmod.S
new file mode 100644
index 0000000000..48ebdfcc65
--- /dev/null
+++ b/core/cortex-m0/ldivmod.S
@@ -0,0 +1,88 @@
+/* Runtime ABI for the ARM Cortex-M0
+ * ldivmod.S: signed 64 bit division (quotient and remainder)
+ *
+ * Copyright (c) 2012 Jörg Mische
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+ * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+	.syntax unified
+	.text
+	.thumb
+	.cpu cortex-m0
+
+@ {long long quotient, long long remainder}
+@ __aeabi_ldivmod(long long numerator, long long denominator)
+@
+@ Divide r1:r0 by r3:r2 and return the quotient in r1:r0 and the remainder in
+@ r3:r2 (all signed)
+@
+@ The operands are made non-negative, __aeabi_uldivmod divides them, then the
+@ quotient/remainder signs are fixed up.  r4 is scratch (saved with lr).
+	.thumb_func
+	.section .text.__aeabi_ldivmod
+	.global __aeabi_ldivmod
+__aeabi_ldivmod:
+	cmp	r1, #0
+	bge	L_num_pos
+
+	push	{r4, lr}
+	movs	r4, #0			@ num = -num
+	rsbs	r0, r0, #0
+	sbcs	r4, r1
+	mov	r1, r4
+
+	cmp	r3, #0
+	bge	L_neg_both
+
+	movs	r4, #0			@ den = -den
+	rsbs	r2, r2, #0
+	sbcs	r4, r3
+	mov	r3, r4
+	bl	__aeabi_uldivmod
+	movs	r4, #0			@ rem = -rem
+	rsbs	r2, r2, #0
+	sbcs	r4, r3
+	mov	r3, r4
+	pop	{r4, pc}
+
+L_neg_both:				@ num < 0, den >= 0: negate both results
+	bl	__aeabi_uldivmod
+	movs	r4, #0			@ quot = -quot
+	rsbs	r0, r0, #0
+	sbcs	r4, r1
+	mov	r1, r4
+	movs	r4, #0			@ rem = -rem
+	rsbs	r2, r2, #0
+	sbcs	r4, r3
+	mov	r3, r4
+	pop	{r4, pc}
+
+L_num_pos:
+	push	{r4, lr}
+	cmp	r3, #0
+	blt	L_den_neg
+	bl	__aeabi_uldivmod	@ both >= 0; bl because b<cond> cannot reach
+	pop	{r4, pc}		@ another section (Thumb-1: about +/-256 bytes)
+
+L_den_neg:
+	movs	r4, #0			@ den = -den
+	rsbs	r2, r2, #0
+	sbcs	r4, r3
+	mov	r3, r4
+	bl	__aeabi_uldivmod
+	movs	r4, #0			@ quot = -quot
+	rsbs	r0, r0, #0
+	sbcs	r4, r1
+	mov	r1, r4
+	pop	{r4, pc}
diff --git a/core/cortex-m0/uldivmod.S b/core/cortex-m0/uldivmod.S
new file mode 100644
index 0000000000..13d8c3aa10
--- /dev/null
+++ b/core/cortex-m0/uldivmod.S
@@ -0,0 +1,172 @@
+/* Runtime ABI for the ARM Cortex-M0
+ * uldivmod.S: unsigned 64 bit division
+ *
+ * Copyright (c) 2012 Jörg Mische
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+ * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+	.syntax unified
+	.text
+	.thumb
+	.cpu cortex-m0
+
+@ {unsigned long long quotient, unsigned long long remainder}
+@ __aeabi_uldivmod(unsigned long long numerator, unsigned long long denominator)
+@
+@ Divide r1:r0 by r3:r2 and return the quotient in r1:r0 and the remainder
+@ in r3:r2 (all unsigned)
+@
+@ A zero denominator branches to __aeabi_ldiv0, which is not defined in this
+@ file.  r4-r7 are used as scratch on the shift/subtract paths and are saved
+@ and restored there.
+@
+	.thumb_func
+	.section .text.__aeabi_uldivmod
+	.global __aeabi_uldivmod
+__aeabi_uldivmod:
+
+	cmp	r3, #0
+	bne	L_large_denom
+	cmp	r2, #0
+	beq	L_divison_by_0
+	cmp	r1, #0
+	beq	L_fallback_32bits
+
+	@ case 1: num >= 2^32 and denom < 2^32
+	@ Result might be > 2^32, therefore we first calculate the upper 32
+	@ bits of the result. It is done similar to the calculation of the
+	@ lower 32 bits, but with a denominator that is shifted by 32.
+	@ Hence the lower 32 bits of the denominator are always 0 and the
+	@ costly 64 bit shift and sub operations can be replaced by cheap 32
+	@ bit operations.
+	@ Registers: r7 = shifted denominator, r5 = quotient bitmask,
+	@ r3:r2 = running remainder, r1 = upper quotient word.
+
+	push	{r4, r5, r6, r7, lr}
+
+	@ shift left the denominator until it is greater than the numerator
+	@ denom(r7:r6) = r3:r2 << 32
+
+	movs	r5, #1		@ bitmask
+	adds	r7, r2, #0	@ don't shift if denominator would overflow
+	bmi	L_upper_result
+	cmp	r1, r7
+	blo	L_upper_result
+
+L_denom_shift_loop1:
+	lsls	r5, #1
+	lsls	r7, #1
+	bmi	L_upper_result	@ don't shift if overflow
+	cmp	r1, r7
+	bhs	L_denom_shift_loop1
+
+L_upper_result:
+	mov	r3, r1
+	lsls	r6, r2, #31	@ r6 = (denom & 1) << 31, low word of denom << 31
+	mov	r2, r0
+	movs	r1, #0		@ upper result = 0
+
+L_sub_loop1:
+	cmp	r3, r7
+	bcc	L_dont_sub1	@ skip unless num >= denom
+
+	subs	r3, r7		@ num -= denom
+	orrs	r1, r5		@ upper result(r1) |= bitmask(r5)
+L_dont_sub1:
+
+	lsrs	r7, #1		@ denom(r7) >>= 1; last pass drops bit 0 (kept in r6)
+	lsrs	r5, #1		@ bitmask(r5) >>= 1
+	bne	L_sub_loop1
+
+	movs	r5, #1
+	lsls	r5, #31		@ bitmask = 1 << 31 for the lower word
+	b	L_lower_result	@ denom(r7:r6) is now the denominator << 31
+
+	@ case 2: division by 0
+	@ call __aeabi_ldiv0
+	@ NOTE(review): a Thumb-1 `b` reaches only about +/-2 KB and the target
+	@ is not defined in this file - confirm the link layout keeps it in range.
+
+L_divison_by_0:
+	b	__aeabi_ldiv0
+
+
+
+	@ case 3: num < 2^32 and denom < 2^32
+	@ fallback to 32 bit division
+
+L_fallback_32bits:
+	mov	r1, r2		@ r0 = numerator, r1 = denominator
+	push	{lr}
+	bl	__aeabi_uidivmod
+	mov	r2, r1		@ remainder comes back in r1, quotient in r0
+	movs	r1, #0
+	movs	r3, #0
+	pop	{pc}
+
+
+
+	@ case 4: denom >= 2^32
+	@ result is smaller than 2^32
+
+L_large_denom:
+	push	{r4, r5, r6, r7, lr}
+
+	mov	r7, r3
+	mov	r6, r2
+	mov	r3, r1
+	mov	r2, r0
+
+	@ Shift left the denominator until it is greater than the numerator
+
+	movs	r1, #0		@ high word of result is 0
+	movs	r5, #1		@ bitmask
+	adds	r7, #0		@ don't shift if denominator would overflow
+	bmi	L_lower_result
+	cmp	r3, r7
+	blo	L_lower_result
+
+L_denom_shift_loop4:
+	lsls	r5, #1
+	lsls	r7, #1
+	lsls	r6, #1
+	adcs	r7, r1		@ r1=0
+	bmi	L_lower_result	@ don't shift if overflow
+	cmp	r3, r7
+	bhs	L_denom_shift_loop4
+
+
+
+L_lower_result:
+	movs	r0, #0
+
+L_sub_loop4:
+	mov	r4, r3
+	cmp	r2, r6
+	sbcs	r4, r7
+	bcc	L_dont_sub4	@ skip unless num >= denom
+
+	subs	r2, r6		@ numerator -= denom
+	sbcs	r3, r7
+	orrs	r0, r5		@ lower result(r0) |= bitmask(r5)
+L_dont_sub4:
+
+	lsls	r4, r7, #31	@ denom(r7:r6) >>= 1
+	lsrs	r6, #1
+	lsrs	r7, #1
+	orrs	r6, r4
+	lsrs	r5, #1		@ bitmask(r5) >>= 1
+	bne	L_sub_loop4
+
+	pop	{r4, r5, r6, r7, pc}
-- 
cgit v1.2.1