summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorzherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15>2018-03-13 12:12:13 +0000
committerzherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15>2018-03-13 12:12:13 +0000
commitc748e48b1239d2ea60a5f67eaf2f11ffadd2b359 (patch)
treef6eb16469fc3c26545a079ce06f60feb5227c3dd
parent4d590720d0cf0fbba5c21275afbf20ecb78f34a5 (diff)
downloadpcre-c748e48b1239d2ea60a5f67eaf2f11ffadd2b359.tar.gz
JIT compiler update.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1729 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r--sljit/sljitConfigInternal.h27
-rw-r--r--sljit/sljitLir.c10
-rw-r--r--sljit/sljitNativeARM_64.c250
-rw-r--r--sljit/sljitNativeARM_T2_32.c112
-rw-r--r--sljit/sljitNativeX86_32.c79
-rw-r--r--sljit/sljitNativeX86_64.c120
-rw-r--r--sljit/sljitNativeX86_common.c17
7 files changed, 405 insertions, 210 deletions
diff --git a/sljit/sljitConfigInternal.h b/sljit/sljitConfigInternal.h
index e13282c..00525de 100644
--- a/sljit/sljitConfigInternal.h
+++ b/sljit/sljitConfigInternal.h
@@ -147,17 +147,23 @@
#define SLJIT_CONFIG_UNSUPPORTED 1
#endif
-#else /* !_WIN32 */
+#else /* _WIN32 */
#if defined(_M_X64) || defined(__x86_64__)
#define SLJIT_CONFIG_X86_64 1
+#elif (defined(_M_ARM) && _M_ARM >= 7 && defined(_M_ARMT)) || defined(__thumb2__)
+#define SLJIT_CONFIG_ARM_THUMB2 1
+#elif (defined(_M_ARM) && _M_ARM >= 7)
+#define SLJIT_CONFIG_ARM_V7 1
#elif defined(_ARM_)
#define SLJIT_CONFIG_ARM_V5 1
+#elif defined(_M_ARM64) || defined(__aarch64__)
+#define SLJIT_CONFIG_ARM_64 1
#else
#define SLJIT_CONFIG_X86_32 1
#endif
-#endif /* !WIN32 */
+#endif /* !_WIN32 */
#endif /* SLJIT_CONFIG_AUTO */
#if (defined SLJIT_CONFIG_UNSUPPORTED && SLJIT_CONFIG_UNSUPPORTED)
@@ -324,6 +330,11 @@
sparc_cache_flush((from), (to))
#define SLJIT_CACHE_FLUSH_OWN_IMPL 1
+#elif defined _WIN32
+
+#define SLJIT_CACHE_FLUSH(from, to) \
+ FlushInstructionCache(GetCurrentProcess(), (char*)(from), (char*)(to) - (char*)(from))
+
#else
/* Calls __ARM_NR_cacheflush on ARM-Linux. */
@@ -371,12 +382,18 @@ typedef int sljit_sw;
#define SLJIT_64BIT_ARCHITECTURE 1
#define SLJIT_WORD_SHIFT 3
#ifdef _WIN32
+#ifdef __GNUC__
+/* These types do not require windows.h */
+typedef unsigned long long sljit_uw;
+typedef long long sljit_sw;
+#else
typedef unsigned __int64 sljit_uw;
typedef __int64 sljit_sw;
-#else
+#endif
+#else /* !_WIN32 */
typedef unsigned long int sljit_uw;
typedef long int sljit_sw;
-#endif
+#endif /* _WIN32 */
#endif
typedef sljit_uw sljit_p;
@@ -590,7 +607,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr);
#define SLJIT_NUMBER_OF_REGISTERS 26
#define SLJIT_NUMBER_OF_SAVED_REGISTERS 10
-#define SLJIT_LOCALS_OFFSET_BASE (2 * sizeof(sljit_sw))
+#define SLJIT_LOCALS_OFFSET_BASE 0
#elif (defined SLJIT_CONFIG_PPC && SLJIT_CONFIG_PPC)
diff --git a/sljit/sljitLir.c b/sljit/sljitLir.c
index 5e435f0..5bdddc1 100644
--- a/sljit/sljitLir.c
+++ b/sljit/sljitLir.c
@@ -26,6 +26,13 @@
#include "sljitLir.h"
+#ifdef _WIN32
+
+/* For SLJIT_CACHE_FLUSH, which can expand to FlushInstructionCache. */
+#include <windows.h>
+
+#endif /* _WIN32 */
+
#if !(defined SLJIT_STD_MACROS_DEFINED && SLJIT_STD_MACROS_DEFINED)
/* These libraries are needed for the macros below. */
@@ -2178,7 +2185,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compil
#endif
-#if !(defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
+#if !(defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) \
+ && !(defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64)
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_local_base(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw offset)
{
diff --git a/sljit/sljitNativeARM_64.c b/sljit/sljitNativeARM_64.c
index 8a437bd..27af741 100644
--- a/sljit/sljitNativeARM_64.c
+++ b/sljit/sljitNativeARM_64.c
@@ -37,14 +37,14 @@ typedef sljit_u32 sljit_ins;
#define TMP_REG1 (SLJIT_NUMBER_OF_REGISTERS + 2)
#define TMP_REG2 (SLJIT_NUMBER_OF_REGISTERS + 3)
#define TMP_LR (SLJIT_NUMBER_OF_REGISTERS + 4)
-#define TMP_SP (SLJIT_NUMBER_OF_REGISTERS + 5)
+#define TMP_FP (SLJIT_NUMBER_OF_REGISTERS + 5)
#define TMP_FREG1 (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)
#define TMP_FREG2 (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2)
/* r18 - platform register, currently not used */
static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 8] = {
- 31, 0, 1, 2, 3, 4, 5, 6, 7, 11, 12, 13, 14, 15, 16, 17, 8, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 29, 9, 10, 30, 31
+ 31, 0, 1, 2, 3, 4, 5, 6, 7, 11, 12, 13, 14, 15, 16, 17, 8, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 31, 9, 10, 30, 29
};
static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
@@ -68,6 +68,7 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
#define ADC 0x9a000000
#define ADD 0x8b000000
+#define ADDE 0x8b200000
#define ADDI 0x91000000
#define AND 0x8a000000
#define ANDI 0x92000000
@@ -96,7 +97,8 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
#define FSUB 0x1e603800
#define LDRI 0xf9400000
#define LDP 0xa9400000
-#define LDP_PST 0xa8c00000
+#define LDP_PRE 0xa9c00000
+#define LDR_PRE 0xf8400c00
#define LSLV 0x9ac02000
#define LSRV 0x9ac02400
#define MADD 0x9b000000
@@ -873,73 +875,51 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi
CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);
- saved_regs_size = GET_SAVED_REGISTERS_SIZE(scratches, saveds, 0);
- local_size += saved_regs_size + SLJIT_LOCALS_OFFSET;
+ saved_regs_size = GET_SAVED_REGISTERS_SIZE(scratches, saveds, 2);
+ if (saved_regs_size & 0x8)
+ saved_regs_size += sizeof(sljit_sw);
+
local_size = (local_size + 15) & ~0xf;
- compiler->local_size = local_size;
-
- if (local_size <= (63 * sizeof(sljit_sw))) {
- FAIL_IF(push_inst(compiler, STP_PRE | 29 | RT2(TMP_LR)
- | RN(TMP_SP) | ((-(local_size >> 3) & 0x7f) << 15)));
- FAIL_IF(push_inst(compiler, ADDI | RD(SLJIT_SP) | RN(TMP_SP) | (0 << 10)));
- offs = (local_size - saved_regs_size) << (15 - 3);
- } else {
- offs = 0 << 15;
- if (saved_regs_size & 0x8) {
- offs = 1 << 15;
- saved_regs_size += sizeof(sljit_sw);
- }
- local_size -= saved_regs_size + SLJIT_LOCALS_OFFSET;
- if (saved_regs_size > 0)
- FAIL_IF(push_inst(compiler, SUBI | RD(TMP_SP) | RN(TMP_SP) | (saved_regs_size << 10)));
- }
+ compiler->local_size = local_size + saved_regs_size;
+
+ FAIL_IF(push_inst(compiler, STP_PRE | RT(TMP_FP) | RT2(TMP_LR)
+ | RN(SLJIT_SP) | ((-(saved_regs_size >> 3) & 0x7f) << 15)));
+
+#ifdef _WIN32
+ if (local_size >= 4096)
+ FAIL_IF(push_inst(compiler, SUBI | RD(TMP_REG1) | RN(SLJIT_SP) | (1 << 10) | (1 << 22)));
+ else if (local_size > 256)
+ FAIL_IF(push_inst(compiler, SUBI | RD(TMP_REG1) | RN(SLJIT_SP) | (local_size << 10)));
+#endif
tmp = saveds < SLJIT_NUMBER_OF_SAVED_REGISTERS ? (SLJIT_S0 + 1 - saveds) : SLJIT_FIRST_SAVED_REG;
prev = -1;
+ offs = 2 << 15;
for (i = SLJIT_S0; i >= tmp; i--) {
if (prev == -1) {
- if (!(offs & (1 << 15))) {
- prev = i;
- continue;
- }
- FAIL_IF(push_inst(compiler, STRI | RT(i) | RN(TMP_SP) | (offs >> 5)));
- offs += 1 << 15;
+ prev = i;
continue;
}
- FAIL_IF(push_inst(compiler, STP | RT(prev) | RT2(i) | RN(TMP_SP) | offs));
+ FAIL_IF(push_inst(compiler, STP | RT(prev) | RT2(i) | RN(SLJIT_SP) | offs));
offs += 2 << 15;
prev = -1;
}
for (i = scratches; i >= SLJIT_FIRST_SAVED_REG; i--) {
if (prev == -1) {
- if (!(offs & (1 << 15))) {
- prev = i;
- continue;
- }
- FAIL_IF(push_inst(compiler, STRI | RT(i) | RN(TMP_SP) | (offs >> 5)));
- offs += 1 << 15;
+ prev = i;
continue;
}
- FAIL_IF(push_inst(compiler, STP | RT(prev) | RT2(i) | RN(TMP_SP) | offs));
+ FAIL_IF(push_inst(compiler, STP | RT(prev) | RT2(i) | RN(SLJIT_SP) | offs));
offs += 2 << 15;
prev = -1;
}
- SLJIT_ASSERT(prev == -1);
+ if (prev != -1)
+ FAIL_IF(push_inst(compiler, STRI | RT(prev) | RN(SLJIT_SP) | (offs >> 5)));
- if (compiler->local_size > (63 * sizeof(sljit_sw))) {
- /* The local_size is already adjusted by the saved registers. */
- if (local_size > 0xfff) {
- FAIL_IF(push_inst(compiler, SUBI | RD(TMP_SP) | RN(TMP_SP) | ((local_size >> 12) << 10) | (1 << 22)));
- local_size &= 0xfff;
- }
- if (local_size)
- FAIL_IF(push_inst(compiler, SUBI | RD(TMP_SP) | RN(TMP_SP) | (local_size << 10)));
- FAIL_IF(push_inst(compiler, STP_PRE | 29 | RT2(TMP_LR)
- | RN(TMP_SP) | ((-(16 >> 3) & 0x7f) << 15)));
- FAIL_IF(push_inst(compiler, ADDI | RD(SLJIT_SP) | RN(TMP_SP) | (0 << 10)));
- }
+
+ FAIL_IF(push_inst(compiler, ADDI | RD(TMP_FP) | RN(SLJIT_SP) | (0 << 10)));
args = get_arg_count(arg_types);
@@ -950,6 +930,64 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi
if (args >= 3)
FAIL_IF(push_inst(compiler, ORR | RD(SLJIT_S2) | RN(TMP_ZERO) | RM(SLJIT_R2)));
+#ifdef _WIN32
+ if (local_size >= 4096) {
+ if (local_size < 4 * 4096) {
+ /* No need for a loop. */
+ if (local_size >= 2 * 4096) {
+ FAIL_IF(push_inst(compiler, LDRI | RT(TMP_ZERO) | RN(TMP_REG1)));
+ FAIL_IF(push_inst(compiler, SUBI | RD(TMP_REG1) | RN(TMP_REG1) | (1 << 10) | (1 << 22)));
+ local_size -= 4096;
+ }
+
+ if (local_size >= 2 * 4096) {
+ FAIL_IF(push_inst(compiler, LDRI | RT(TMP_ZERO) | RN(TMP_REG1)));
+ FAIL_IF(push_inst(compiler, SUBI | RD(TMP_REG1) | RN(TMP_REG1) | (1 << 10) | (1 << 22)));
+ local_size -= 4096;
+ }
+
+ FAIL_IF(push_inst(compiler, LDRI | RT(TMP_ZERO) | RN(TMP_REG1)));
+ local_size -= 4096;
+ }
+ else {
+ FAIL_IF(push_inst(compiler, MOVZ | RD(TMP_REG2) | (((local_size >> 12) - 1) << 5)));
+ FAIL_IF(push_inst(compiler, LDRI | RT(TMP_ZERO) | RN(TMP_REG1)));
+ FAIL_IF(push_inst(compiler, SUBI | RD(TMP_REG1) | RN(TMP_REG1) | (1 << 10) | (1 << 22)));
+ FAIL_IF(push_inst(compiler, SUBI | (1 << 29) | RD(TMP_REG2) | RN(TMP_REG2) | (1 << 10)));
+ FAIL_IF(push_inst(compiler, B_CC | ((((sljit_ins) -3) & 0x7ffff) << 5) | 0x1 /* not-equal */));
+ FAIL_IF(push_inst(compiler, LDRI | RT(TMP_ZERO) | RN(TMP_REG1)));
+
+ local_size &= 0xfff;
+ }
+
+ if (local_size > 256) {
+ FAIL_IF(push_inst(compiler, SUBI | RD(TMP_REG1) | RN(TMP_REG1) | (local_size << 10)));
+ FAIL_IF(push_inst(compiler, LDRI | RT(TMP_ZERO) | RN(TMP_REG1)));
+ }
+ else if (local_size > 0)
+ FAIL_IF(push_inst(compiler, LDR_PRE | RT(TMP_ZERO) | RN(TMP_REG1) | ((-local_size & 0x1ff) << 12)));
+
+ FAIL_IF(push_inst(compiler, ADDI | RD(SLJIT_SP) | RN(TMP_REG1) | (0 << 10)));
+ }
+ else if (local_size > 256) {
+ FAIL_IF(push_inst(compiler, LDRI | RT(TMP_ZERO) | RN(TMP_REG1)));
+ FAIL_IF(push_inst(compiler, ADDI | RD(SLJIT_SP) | RN(TMP_REG1) | (0 << 10)));
+ }
+ else if (local_size > 0)
+ FAIL_IF(push_inst(compiler, LDR_PRE | RT(TMP_ZERO) | RN(SLJIT_SP) | ((-local_size & 0x1ff) << 12)));
+
+#else /* !_WIN32 */
+
+ /* The local_size does not include saved registers size. */
+ if (local_size > 0xfff) {
+ FAIL_IF(push_inst(compiler, SUBI | RD(SLJIT_SP) | RN(SLJIT_SP) | ((local_size >> 12) << 10) | (1 << 22)));
+ local_size &= 0xfff;
+ }
+ if (local_size != 0)
+ FAIL_IF(push_inst(compiler, SUBI | RD(SLJIT_SP) | RN(SLJIT_SP) | (local_size << 10)));
+
+#endif /* _WIN32 */
+
return SLJIT_SUCCESS;
}
@@ -957,13 +995,17 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *comp
sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
{
+ sljit_s32 saved_regs_size;
+
CHECK_ERROR();
CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
set_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);
- local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds, 0) + SLJIT_LOCALS_OFFSET;
- local_size = (local_size + 15) & ~0xf;
- compiler->local_size = local_size;
+ saved_regs_size = GET_SAVED_REGISTERS_SIZE(scratches, saveds, 2);
+ if (saved_regs_size & 0x8)
+ saved_regs_size += sizeof(sljit_sw);
+
+ compiler->local_size = saved_regs_size + ((local_size + 15) & ~0xf);
return SLJIT_SUCCESS;
}
@@ -977,71 +1019,59 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return(struct sljit_compiler *comp
FAIL_IF(emit_mov_before_return(compiler, op, src, srcw));
- local_size = compiler->local_size;
+ saved_regs_size = GET_SAVED_REGISTERS_SIZE(compiler->scratches, compiler->saveds, 2);
+ if (saved_regs_size & 0x8)
+ saved_regs_size += sizeof(sljit_sw);
- saved_regs_size = GET_SAVED_REGISTERS_SIZE(compiler->scratches, compiler->saveds, 0);
- if (local_size <= (63 * sizeof(sljit_sw)))
- offs = (local_size - saved_regs_size) << (15 - 3);
+ local_size = compiler->local_size - saved_regs_size;
+
+ /* Load LR as early as possible. */
+ if (local_size == 0)
+ FAIL_IF(push_inst(compiler, LDP | RT(TMP_FP) | RT2(TMP_LR) | RN(SLJIT_SP)));
+ else if (local_size < 63 * sizeof(sljit_sw)) {
+ FAIL_IF(push_inst(compiler, LDP_PRE | RT(TMP_FP) | RT2(TMP_LR)
+ | RN(SLJIT_SP) | (local_size << (15 - 3))));
+ }
else {
- FAIL_IF(push_inst(compiler, LDP_PST | 29 | RT2(TMP_LR)
- | RN(TMP_SP) | (((16 >> 3) & 0x7f) << 15)));
- offs = 0 << 15;
- if (saved_regs_size & 0x8) {
- offs = 1 << 15;
- saved_regs_size += sizeof(sljit_sw);
- }
- local_size -= saved_regs_size + SLJIT_LOCALS_OFFSET;
if (local_size > 0xfff) {
- FAIL_IF(push_inst(compiler, ADDI | RD(TMP_SP) | RN(TMP_SP) | ((local_size >> 12) << 10) | (1 << 22)));
+ FAIL_IF(push_inst(compiler, ADDI | RD(SLJIT_SP) | RN(SLJIT_SP) | ((local_size >> 12) << 10) | (1 << 22)));
local_size &= 0xfff;
}
if (local_size)
- FAIL_IF(push_inst(compiler, ADDI | RD(TMP_SP) | RN(TMP_SP) | (local_size << 10)));
+ FAIL_IF(push_inst(compiler, ADDI | RD(SLJIT_SP) | RN(SLJIT_SP) | (local_size << 10)));
+
+ FAIL_IF(push_inst(compiler, LDP | RT(TMP_FP) | RT2(TMP_LR) | RN(SLJIT_SP)));
}
tmp = compiler->saveds < SLJIT_NUMBER_OF_SAVED_REGISTERS ? (SLJIT_S0 + 1 - compiler->saveds) : SLJIT_FIRST_SAVED_REG;
prev = -1;
+ offs = 2 << 15;
for (i = SLJIT_S0; i >= tmp; i--) {
if (prev == -1) {
- if (!(offs & (1 << 15))) {
- prev = i;
- continue;
- }
- FAIL_IF(push_inst(compiler, LDRI | RT(i) | RN(TMP_SP) | (offs >> 5)));
- offs += 1 << 15;
+ prev = i;
continue;
}
- FAIL_IF(push_inst(compiler, LDP | RT(prev) | RT2(i) | RN(TMP_SP) | offs));
+ FAIL_IF(push_inst(compiler, LDP | RT(prev) | RT2(i) | RN(SLJIT_SP) | offs));
offs += 2 << 15;
prev = -1;
}
for (i = compiler->scratches; i >= SLJIT_FIRST_SAVED_REG; i--) {
if (prev == -1) {
- if (!(offs & (1 << 15))) {
- prev = i;
- continue;
- }
- FAIL_IF(push_inst(compiler, LDRI | RT(i) | RN(TMP_SP) | (offs >> 5)));
- offs += 1 << 15;
+ prev = i;
continue;
}
- FAIL_IF(push_inst(compiler, LDP | RT(prev) | RT2(i) | RN(TMP_SP) | offs));
+ FAIL_IF(push_inst(compiler, LDP | RT(prev) | RT2(i) | RN(SLJIT_SP) | offs));
offs += 2 << 15;
prev = -1;
}
- SLJIT_ASSERT(prev == -1);
+ if (prev != -1)
+ FAIL_IF(push_inst(compiler, LDRI | RT(prev) | RN(SLJIT_SP) | (offs >> 5)));
- if (compiler->local_size <= (63 * sizeof(sljit_sw))) {
- FAIL_IF(push_inst(compiler, LDP_PST | 29 | RT2(TMP_LR)
- | RN(TMP_SP) | (((local_size >> 3) & 0x7f) << 15)));
- } else if (saved_regs_size > 0) {
- FAIL_IF(push_inst(compiler, ADDI | RD(TMP_SP) | RN(TMP_SP) | (saved_regs_size << 10)));
- }
-
- FAIL_IF(push_inst(compiler, RET | RN(TMP_LR)));
- return SLJIT_SUCCESS;
+ /* These two can be executed in parallel. */
+ FAIL_IF(push_inst(compiler, ADDI | RD(SLJIT_SP) | RN(SLJIT_SP) | (saved_regs_size << 10)));
+ return push_inst(compiler, RET | RN(TMP_LR));
}
/* --------------------------------------------------------------------- */
@@ -1856,6 +1886,46 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compil
return push_inst(compiler, inst | VT(freg) | RN(mem & REG_MASK) | ((memw & 0x1ff) << 12));
}
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_local_base(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw offset)
+{
+ sljit_s32 dst_reg;
+ sljit_ins ins;
+
+ CHECK_ERROR();
+ CHECK(check_sljit_get_local_base(compiler, dst, dstw, offset));
+
+ SLJIT_ASSERT (SLJIT_LOCALS_OFFSET_BASE == 0);
+
+ dst_reg = FAST_IS_REG(dst) ? dst : TMP_REG1;
+
+ if (offset <= 0xffffff && offset >= -0xffffff) {
+ ins = ADDI;
+ if (offset < 0) {
+ offset = -offset;
+ ins = SUBI;
+ }
+
+ if (offset <= 0xfff)
+ FAIL_IF(push_inst(compiler, ins | RD(dst_reg) | RN(SLJIT_SP) | (offset << 10)));
+ else {
+ FAIL_IF(push_inst(compiler, ins | RD(dst_reg) | RN(SLJIT_SP) | ((offset & 0xfff000) >> (12 - 10)) | (1 << 22)));
+
+ offset &= 0xfff;
+ if (offset != 0)
+ FAIL_IF(push_inst(compiler, ins | RD(dst_reg) | RN(dst_reg) | (offset << 10)));
+ }
+ }
+ else {
+ FAIL_IF(load_immediate (compiler, dst_reg, offset));
+ /* Add extended register form. */
+ FAIL_IF(push_inst(compiler, ADDE | (0x3 << 13) | RD(dst_reg) | RN(SLJIT_SP) | RM(dst_reg)));
+ }
+
+ if (SLJIT_UNLIKELY(dst & SLJIT_MEM))
+ return emit_op_mem(compiler, WORD_SIZE | STORE, dst_reg, dst, dstw, TMP_REG1);
+ return SLJIT_SUCCESS;
+}
+
SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)
{
struct sljit_const *const_;
diff --git a/sljit/sljitNativeARM_T2_32.c b/sljit/sljitNativeARM_T2_32.c
index 75e7a38..d7024b6 100644
--- a/sljit/sljitNativeARM_T2_32.c
+++ b/sljit/sljitNativeARM_T2_32.c
@@ -110,6 +110,7 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
#define ASRSI 0x1000
#define ASR_W 0xfa40f000
#define ASR_WI 0xea4f0020
+#define BCC 0xd000
#define BICI 0xf0200000
#define BKPT 0xbe00
#define BLX 0x4780
@@ -125,6 +126,7 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
#define EORS 0x4040
#define EOR_W 0xea800000
#define IT 0xbf00
+#define LDRI 0xf8500800
#define LSLS 0x4080
#define LSLSI 0x0000
#define LSL_W 0xfa00f000
@@ -158,6 +160,7 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
#define SBCI 0xf1600000
#define SBCS 0x4180
#define SBC_W 0xeb600000
+#define SDIV 0xfb90f0f0
#define SMULL 0xfb800000
#define STR_SP 0x9000
#define SUBS 0x1a00
@@ -172,6 +175,7 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
#define SXTH 0xb200
#define SXTH_W 0xfa0ff080
#define TST 0x4200
+#define UDIV 0xfbb0f0f0
#define UMULL 0xfba00000
#define UXTB 0xb2c0
#define UXTB_W 0xfa5ff080
@@ -339,8 +343,8 @@ static SLJIT_INLINE void set_jump_instruction(struct sljit_jump *jump, sljit_sw
/* Really complex instruction form for branches. */
s = (diff >> 23) & 0x1;
- j1 = (~(diff >> 21) ^ s) & 0x1;
- j2 = (~(diff >> 22) ^ s) & 0x1;
+ j1 = (~(diff >> 22) ^ s) & 0x1;
+ j2 = (~(diff >> 21) ^ s) & 0x1;
jump_inst[0] = 0xf000 | (s << 10) | COPY_BITS(diff, 11, 0, 10);
jump_inst[1] = (j1 << 13) | (j2 << 11) | (diff & 0x7ff);
@@ -520,6 +524,8 @@ static sljit_s32 load_immediate(struct sljit_compiler *compiler, sljit_s32 dst,
{
sljit_uw tmp;
+ /* MOVS cannot be used since it destroy flags. */
+
if (imm >= 0x10000) {
tmp = get_imm(imm);
if (tmp != INVALID_IMM)
@@ -1032,6 +1038,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi
{
sljit_s32 args, size, i, tmp;
sljit_ins push = 0;
+#ifdef _WIN32
+ sljit_uw imm;
+#endif
CHECK_ERROR();
CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
@@ -1052,12 +1061,25 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi
size = GET_SAVED_REGISTERS_SIZE(scratches, saveds, 1);
local_size = ((size + local_size + 7) & ~7) - size;
compiler->local_size = local_size;
+
+#ifdef _WIN32
+ if (local_size >= 256) {
+ if (local_size > 4096)
+ imm = get_imm(4096);
+ else
+ imm = get_imm(local_size & ~0xff);
+
+ SLJIT_ASSERT(imm != INVALID_IMM);
+ FAIL_IF(push_inst32(compiler, SUB_WI | RD4(TMP_REG1) | RN4(SLJIT_SP) | imm));
+ }
+#else
if (local_size > 0) {
if (local_size <= (127 << 2))
FAIL_IF(push_inst16(compiler, SUB_SP | (local_size >> 2)));
else
FAIL_IF(emit_op_imm(compiler, SLJIT_SUB | ARG2_IMM, SLJIT_SP, SLJIT_SP, local_size));
}
+#endif
args = get_arg_count(arg_types);
@@ -1068,6 +1090,61 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi
if (args >= 3)
FAIL_IF(push_inst16(compiler, MOV | SET_REGS44(SLJIT_S2, SLJIT_R2)));
+#ifdef _WIN32
+ if (local_size >= 256) {
+ if (local_size > 4096) {
+ imm = get_imm(4096);
+ SLJIT_ASSERT(imm != INVALID_IMM);
+
+ if (local_size < 4 * 4096) {
+ if (local_size > 2 * 4096) {
+ FAIL_IF(push_inst32(compiler, LDRI | 0x400 | RT4(TMP_REG2) | RN4(TMP_REG1)));
+ FAIL_IF(push_inst32(compiler, SUB_WI | RD4(TMP_REG1) | RN4(TMP_REG1) | imm));
+ local_size -= 4096;
+ }
+
+ if (local_size > 2 * 4096) {
+ FAIL_IF(push_inst32(compiler, LDRI | 0x400 | RT4(TMP_REG2) | RN4(TMP_REG1)));
+ FAIL_IF(push_inst32(compiler, SUB_WI | RD4(TMP_REG1) | RN4(TMP_REG1) | imm));
+ local_size -= 4096;
+ }
+
+ FAIL_IF(push_inst32(compiler, LDRI | 0x400 | RT4(TMP_REG2) | RN4(TMP_REG1)));
+ local_size -= 4096;
+
+ SLJIT_ASSERT(local_size > 0);
+ }
+ else {
+ FAIL_IF(load_immediate(compiler, SLJIT_R3, (local_size >> 12) - 1));
+ FAIL_IF(push_inst32(compiler, LDRI | 0x400 | RT4(TMP_REG2) | RN4(TMP_REG1)));
+ FAIL_IF(push_inst32(compiler, SUB_WI | RD4(TMP_REG1) | RN4(TMP_REG1) | imm));
+ SLJIT_ASSERT(reg_map[SLJIT_R3] < 7);
+ FAIL_IF(push_inst16(compiler, SUBSI8 | RDN3(SLJIT_R3) | 1));
+ FAIL_IF(push_inst16(compiler, BCC | (0x1 << 8) /* not-equal */ | (-7 & 0xff)));
+
+ local_size &= 0xfff;
+
+ if (local_size != 0)
+ FAIL_IF(push_inst32(compiler, LDRI | 0x400 | RT4(TMP_REG2) | RN4(TMP_REG1)));
+ }
+
+ if (local_size >= 256) {
+ imm = get_imm(local_size & ~0xff);
+ SLJIT_ASSERT(imm != INVALID_IMM);
+
+ FAIL_IF(push_inst32(compiler, SUB_WI | RD4(TMP_REG1) | RN4(TMP_REG1) | imm));
+ }
+ }
+
+ local_size &= 0xff;
+ FAIL_IF(push_inst32(compiler, LDRI | 0x400 | (local_size > 0 ? 0x100 : 0) | RT4(TMP_REG2) | RN4(TMP_REG1) | local_size));
+
+ FAIL_IF(push_inst16(compiler, MOV | SET_REGS44(SLJIT_SP, TMP_REG1)));
+ }
+ else if (local_size > 0)
+ FAIL_IF(push_inst32(compiler, LDRI | 0x500 | RT4(TMP_REG1) | RN4(SLJIT_SP) | local_size));
+#endif
+
return SLJIT_SUCCESS;
}
@@ -1119,11 +1196,16 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return(struct sljit_compiler *comp
/* Operators */
/* --------------------------------------------------------------------- */
+#if !(defined __ARM_FEATURE_IDIV) && !(defined __ARM_ARCH_EXT_IDIV__)
+
#ifdef __cplusplus
extern "C" {
#endif
-#if defined(__GNUC__)
+#ifdef _WIN32
+extern unsigned long long __rt_udiv(unsigned int denominator, unsigned int numerator);
+extern long long __rt_sdiv(int denominator, int numerator);
+#elif defined(__GNUC__)
extern unsigned int __aeabi_uidivmod(unsigned int numerator, int unsigned denominator);
extern int __aeabi_idivmod(int numerator, int denominator);
#else
@@ -1134,10 +1216,14 @@ extern int __aeabi_idivmod(int numerator, int denominator);
}
#endif
+#endif /* !__ARM_FEATURE_IDIV && !__ARM_ARCH_EXT_IDIV__ */
+
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compiler, sljit_s32 op)
{
+#if !(defined __ARM_FEATURE_IDIV) && !(defined __ARM_ARCH_EXT_IDIV__)
sljit_sw saved_reg_list[3];
sljit_sw saved_reg_count;
+#endif
CHECK_ERROR();
CHECK(check_sljit_emit_op0(compiler, op));
@@ -1155,6 +1241,17 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compile
| (reg_map[SLJIT_R0] << 12)
| (reg_map[SLJIT_R0] << 16)
| reg_map[SLJIT_R1]);
+#if (defined __ARM_FEATURE_IDIV) || (defined __ARM_ARCH_EXT_IDIV__)
+ case SLJIT_DIVMOD_UW:
+ case SLJIT_DIVMOD_SW:
+ FAIL_IF(push_inst16(compiler, MOV | SET_REGS44(TMP_REG1, SLJIT_R0)));
+ FAIL_IF(push_inst32(compiler, (op == SLJIT_DIVMOD_UW ? UDIV : SDIV) | RD4(SLJIT_R0) | RN4(SLJIT_R0) | RM4(SLJIT_R1)));
+ FAIL_IF(push_inst32(compiler, MUL | RD4(SLJIT_R1) | RN4(SLJIT_R0) | RM4(SLJIT_R1)));
+ return push_inst32(compiler, SUB_W | RD4(SLJIT_R1) | RN4(TMP_REG1) | RM4(SLJIT_R1));
+ case SLJIT_DIV_UW:
+ case SLJIT_DIV_SW:
+ return push_inst32(compiler, (op == SLJIT_DIV_UW ? UDIV : SDIV) | RD4(SLJIT_R0) | RN4(SLJIT_R0) | RM4(SLJIT_R1));
+#else /* !__ARM_FEATURE_IDIV && !__ARM_ARCH_EXT_IDIV__ */
case SLJIT_DIVMOD_UW:
case SLJIT_DIVMOD_SW:
case SLJIT_DIV_UW:
@@ -1183,7 +1280,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compile
}
}
-#if defined(__GNUC__)
+#ifdef _WIN32
+ FAIL_IF(push_inst16(compiler, MOV | SET_REGS44(TMP_REG1, SLJIT_R0)));
+ FAIL_IF(push_inst16(compiler, MOV | SET_REGS44(SLJIT_R0, SLJIT_R1)));
+ FAIL_IF(push_inst16(compiler, MOV | SET_REGS44(SLJIT_R1, TMP_REG1)));
+ FAIL_IF(sljit_emit_ijump(compiler, SLJIT_FAST_CALL, SLJIT_IMM,
+ ((op | 0x2) == SLJIT_DIV_UW ? SLJIT_FUNC_OFFSET(__rt_udiv) : SLJIT_FUNC_OFFSET(__rt_sdiv))));
+#elif defined(__GNUC__)
FAIL_IF(sljit_emit_ijump(compiler, SLJIT_FAST_CALL, SLJIT_IMM,
((op | 0x2) == SLJIT_DIV_UW ? SLJIT_FUNC_OFFSET(__aeabi_uidivmod) : SLJIT_FUNC_OFFSET(__aeabi_idivmod))));
#else
@@ -1203,6 +1306,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compile
| (saved_reg_list[0] << 12) /* ldr rX, [sp], #8/16 */);
}
return SLJIT_SUCCESS;
+#endif /* __ARM_FEATURE_IDIV || __ARM_ARCH_EXT_IDIV__ */
}
return SLJIT_SUCCESS;
diff --git a/sljit/sljitNativeX86_32.c b/sljit/sljitNativeX86_32.c
index 8a83e27..074e64b 100644
--- a/sljit/sljitNativeX86_32.c
+++ b/sljit/sljitNativeX86_32.c
@@ -123,34 +123,38 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi
#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
if (args > 0) {
- *inst++ = MOV_r_rm;
- *inst++ = MOD_REG | (reg_map[SLJIT_S0] << 3) | reg_map[SLJIT_R2];
+ inst[0] = MOV_r_rm;
+ inst[1] = MOD_REG | (reg_map[SLJIT_S0] << 3) | reg_map[SLJIT_R2];
+ inst += 2;
}
if (args > 1) {
- *inst++ = MOV_r_rm;
- *inst++ = MOD_REG | (reg_map[SLJIT_S1] << 3) | reg_map[SLJIT_R1];
+ inst[0] = MOV_r_rm;
+ inst[1] = MOD_REG | (reg_map[SLJIT_S1] << 3) | reg_map[SLJIT_R1];
+ inst += 2;
}
if (args > 2) {
- *inst++ = MOV_r_rm;
- *inst++ = MOD_DISP8 | (reg_map[SLJIT_S2] << 3) | 0x4 /* esp */;
- *inst++ = 0x24;
- *inst++ = sizeof(sljit_sw) * (3 + 2); /* saveds >= 3 as well. */
+ inst[0] = MOV_r_rm;
+ inst[1] = MOD_DISP8 | (reg_map[SLJIT_S2] << 3) | 0x4 /* esp */;
+ inst[2] = 0x24;
+ inst[3] = sizeof(sljit_sw) * (3 + 2); /* saveds >= 3 as well. */
}
#else
if (args > 0) {
- *inst++ = MOV_r_rm;
- *inst++ = MOD_DISP8 | (reg_map[SLJIT_S0] << 3) | reg_map[TMP_REG1];
- *inst++ = sizeof(sljit_sw) * 2;
+ inst[0] = MOV_r_rm;
+ inst[1] = MOD_DISP8 | (reg_map[SLJIT_S0] << 3) | reg_map[TMP_REG1];
+ inst[2] = sizeof(sljit_sw) * 2;
+ inst += 3;
}
if (args > 1) {
- *inst++ = MOV_r_rm;
- *inst++ = MOD_DISP8 | (reg_map[SLJIT_S1] << 3) | reg_map[TMP_REG1];
- *inst++ = sizeof(sljit_sw) * 3;
+ inst[0] = MOV_r_rm;
+ inst[1] = MOD_DISP8 | (reg_map[SLJIT_S1] << 3) | reg_map[TMP_REG1];
+ inst[2] = sizeof(sljit_sw) * 3;
+ inst += 3;
}
if (args > 2) {
- *inst++ = MOV_r_rm;
- *inst++ = MOD_DISP8 | (reg_map[SLJIT_S2] << 3) | reg_map[TMP_REG1];
- *inst++ = sizeof(sljit_sw) * 4;
+ inst[0] = MOV_r_rm;
+ inst[1] = MOD_DISP8 | (reg_map[SLJIT_S2] << 3) | reg_map[TMP_REG1];
+ inst[2] = sizeof(sljit_sw) * 4;
}
#endif
@@ -170,17 +174,36 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi
compiler->local_size = local_size;
#ifdef _WIN32
- if (local_size > 1024) {
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
- FAIL_IF(emit_do_imm(compiler, MOV_r_i32 + reg_map[SLJIT_R0], local_size));
-#else
- /* Space for a single argument. This amount is excluded when the stack is allocated below. */
- local_size -= sizeof(sljit_sw);
- FAIL_IF(emit_do_imm(compiler, MOV_r_i32 + reg_map[SLJIT_R0], local_size));
- FAIL_IF(emit_non_cum_binary(compiler, BINARY_OPCODE(SUB),
- SLJIT_SP, 0, SLJIT_SP, 0, SLJIT_IMM, sizeof(sljit_sw)));
-#endif
- FAIL_IF(sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARG1(SW), SLJIT_IMM, SLJIT_FUNC_OFFSET(sljit_grow_stack)));
+ if (local_size > 0) {
+ if (local_size <= 4 * 4096) {
+ if (local_size > 4096)
+ EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(SLJIT_SP), -4096);
+ if (local_size > 2 * 4096)
+ EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(SLJIT_SP), -4096 * 2);
+ if (local_size > 3 * 4096)
+ EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(SLJIT_SP), -4096 * 3);
+ }
+ else {
+ EMIT_MOV(compiler, SLJIT_R0, 0, SLJIT_SP, 0);
+ EMIT_MOV(compiler, SLJIT_R1, 0, SLJIT_IMM, (local_size - 1) >> 12);
+
+ SLJIT_ASSERT (reg_map[SLJIT_R0] == 0);
+
+ EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(SLJIT_R0), -4096);
+ FAIL_IF(emit_non_cum_binary(compiler, BINARY_OPCODE(SUB),
+ SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, 4096));
+ FAIL_IF(emit_non_cum_binary(compiler, BINARY_OPCODE(SUB),
+ SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_IMM, 1));
+
+ inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
+ FAIL_IF(!inst);
+
+ INC_SIZE(2);
+ inst[0] = JNE_i8;
+ inst[1] = (sljit_s8) -16;
+ }
+
+ EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(SLJIT_SP), -local_size);
}
#endif
diff --git a/sljit/sljitNativeX86_64.c b/sljit/sljitNativeX86_64.c
index 635ebd0..8506565 100644
--- a/sljit/sljitNativeX86_64.c
+++ b/sljit/sljitNativeX86_64.c
@@ -83,6 +83,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi
CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);
+ compiler->mode32 = 0;
+
#ifdef _WIN64
/* Two/four register slots for parameters plus space for xmm6 register if needed. */
if (fscratches >= 6 || fsaveds >= 1)
@@ -126,35 +128,39 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi
#ifndef _WIN64
if (args > 0) {
- *inst++ = REX_W;
- *inst++ = MOV_r_rm;
- *inst++ = MOD_REG | (reg_map[SLJIT_S0] << 3) | 0x7 /* rdi */;
+ inst[0] = REX_W;
+ inst[1] = MOV_r_rm;
+ inst[2] = MOD_REG | (reg_map[SLJIT_S0] << 3) | 0x7 /* rdi */;
+ inst += 3;
}
if (args > 1) {
- *inst++ = REX_W | REX_R;
- *inst++ = MOV_r_rm;
- *inst++ = MOD_REG | (reg_lmap[SLJIT_S1] << 3) | 0x6 /* rsi */;
+ inst[0] = REX_W | REX_R;
+ inst[1] = MOV_r_rm;
+ inst[2] = MOD_REG | (reg_lmap[SLJIT_S1] << 3) | 0x6 /* rsi */;
+ inst += 3;
}
if (args > 2) {
- *inst++ = REX_W | REX_R;
- *inst++ = MOV_r_rm;
- *inst++ = MOD_REG | (reg_lmap[SLJIT_S2] << 3) | 0x2 /* rdx */;
+ inst[0] = REX_W | REX_R;
+ inst[1] = MOV_r_rm;
+ inst[2] = MOD_REG | (reg_lmap[SLJIT_S2] << 3) | 0x2 /* rdx */;
}
#else
if (args > 0) {
- *inst++ = REX_W;
- *inst++ = MOV_r_rm;
- *inst++ = MOD_REG | (reg_map[SLJIT_S0] << 3) | 0x1 /* rcx */;
+ inst[0] = REX_W;
+ inst[1] = MOV_r_rm;
+ inst[2] = MOD_REG | (reg_map[SLJIT_S0] << 3) | 0x1 /* rcx */;
+ inst += 3;
}
if (args > 1) {
- *inst++ = REX_W;
- *inst++ = MOV_r_rm;
- *inst++ = MOD_REG | (reg_map[SLJIT_S1] << 3) | 0x2 /* rdx */;
+ inst[0] = REX_W;
+ inst[1] = MOV_r_rm;
+ inst[2] = MOD_REG | (reg_map[SLJIT_S1] << 3) | 0x2 /* rdx */;
+ inst += 3;
}
if (args > 2) {
- *inst++ = REX_W | REX_B;
- *inst++ = MOV_r_rm;
- *inst++ = MOD_REG | (reg_map[SLJIT_S2] << 3) | 0x0 /* r8 */;
+ inst[0] = REX_W | REX_B;
+ inst[1] = MOV_r_rm;
+ inst[2] = MOD_REG | (reg_map[SLJIT_S2] << 3) | 0x0 /* r8 */;
}
#endif
}
@@ -163,58 +169,42 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi
compiler->local_size = local_size;
#ifdef _WIN64
- if (local_size > 1024) {
- /* Allocate stack for the callback, which grows the stack. */
- inst = (sljit_u8*)ensure_buf(compiler, 1 + 4 + (3 + sizeof(sljit_s32)));
- FAIL_IF(!inst);
- INC_SIZE(4 + (3 + sizeof(sljit_s32)));
- *inst++ = REX_W;
- *inst++ = GROUP_BINARY_83;
- *inst++ = MOD_REG | SUB | reg_map[SLJIT_SP];
- /* Allocated size for registers must be divisible by 8. */
- SLJIT_ASSERT(!(saved_register_size & 0x7));
- /* Aligned to 16 byte. */
- if (saved_register_size & 0x8) {
- *inst++ = 5 * sizeof(sljit_sw);
- local_size -= 5 * sizeof(sljit_sw);
- } else {
- *inst++ = 4 * sizeof(sljit_sw);
- local_size -= 4 * sizeof(sljit_sw);
- }
- /* Second instruction */
- SLJIT_ASSERT(reg_map[SLJIT_R0] < 8);
- *inst++ = REX_W;
- *inst++ = MOV_rm_i32;
- *inst++ = MOD_REG | reg_lmap[SLJIT_R0];
- sljit_unaligned_store_s32(inst, local_size);
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
- || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
- compiler->skip_checks = 1;
-#endif
- FAIL_IF(sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARG1(SW), SLJIT_IMM, SLJIT_FUNC_OFFSET(sljit_grow_stack)));
- }
-#endif
-
if (local_size > 0) {
- if (local_size <= 127) {
- inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
- FAIL_IF(!inst);
- INC_SIZE(4);
- *inst++ = REX_W;
- *inst++ = GROUP_BINARY_83;
- *inst++ = MOD_REG | SUB | reg_map[SLJIT_SP];
- *inst++ = local_size;
+ if (local_size <= 4 * 4096) {
+ if (local_size > 4096)
+ EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(SLJIT_SP), -4096);
+ if (local_size > 2 * 4096)
+ EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(SLJIT_SP), -4096 * 2);
+ if (local_size > 3 * 4096)
+ EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(SLJIT_SP), -4096 * 3);
}
else {
- inst = (sljit_u8*)ensure_buf(compiler, 1 + 7);
+ EMIT_MOV(compiler, SLJIT_R0, 0, SLJIT_SP, 0);
+ EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, (local_size - 1) >> 12);
+
+ SLJIT_ASSERT (reg_map[SLJIT_R0] == 0);
+
+ EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_MEM1(SLJIT_R0), -4096);
+ FAIL_IF(emit_non_cum_binary(compiler, BINARY_OPCODE(SUB),
+ SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, 4096));
+ FAIL_IF(emit_non_cum_binary(compiler, BINARY_OPCODE(SUB),
+ TMP_REG1, 0, TMP_REG1, 0, SLJIT_IMM, 1));
+
+ inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
FAIL_IF(!inst);
- INC_SIZE(7);
- *inst++ = REX_W;
- *inst++ = GROUP_BINARY_81;
- *inst++ = MOD_REG | SUB | reg_map[SLJIT_SP];
- sljit_unaligned_store_s32(inst, local_size);
- inst += sizeof(sljit_s32);
+
+ INC_SIZE(2);
+ inst[0] = JNE_i8;
+ inst[1] = (sljit_s8) -19;
}
+
+ EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(SLJIT_SP), -local_size);
+ }
+#endif
+
+ if (local_size > 0) {
+ FAIL_IF(emit_non_cum_binary(compiler, BINARY_OPCODE(SUB),
+ SLJIT_SP, 0, SLJIT_SP, 0, SLJIT_IMM, local_size));
}
#ifdef _WIN64
diff --git a/sljit/sljitNativeX86_common.c b/sljit/sljitNativeX86_common.c
index ab7b36a..6f02ee3 100644
--- a/sljit/sljitNativeX86_common.c
+++ b/sljit/sljitNativeX86_common.c
@@ -669,23 +669,6 @@ static SLJIT_INLINE sljit_s32 emit_sse2_store(struct sljit_compiler *compiler,
static SLJIT_INLINE sljit_s32 emit_sse2_load(struct sljit_compiler *compiler,
sljit_s32 single, sljit_s32 dst, sljit_s32 src, sljit_sw srcw);
-#ifdef _WIN32
-#include <malloc.h>
-
-static void SLJIT_FUNC sljit_grow_stack(sljit_sw local_size)
-{
- /* Workaround for calling the internal _chkstk() function on Windows.
- This function touches all 4k pages belongs to the requested stack space,
- which size is passed in local_size. This is necessary on Windows where
- the stack can only grow in 4k steps. However, this function just burn
- CPU cycles if the stack is large enough. However, you don't know it in
- advance, so it must always be called. I think this is a bad design in
- general even if it has some reasons. */
- *(volatile sljit_s32*)alloca(local_size) = 0;
-}
-
-#endif
-
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
#include "sljitNativeX86_32.c"
#else