diff options
-rw-r--r-- | Makefile2 | 107 | ||||
-rw-r--r-- | bitwiseARM.s | 496 | ||||
-rw-r--r-- | configure.in | 2 | ||||
-rw-r--r-- | floor1ARM.s | 48 | ||||
-rw-r--r-- | floor1LARM.s | 46 | ||||
-rw-r--r-- | mdctARM.s | 1676 | ||||
-rw-r--r-- | mdctLARM.s | 1618 | ||||
-rw-r--r-- | testtremor.c | 461 |
8 files changed, 2511 insertions, 1943 deletions
diff --git a/Makefile2 b/Makefile2 new file mode 100644 index 0000000..1b29142 --- /dev/null +++ b/Makefile2 @@ -0,0 +1,107 @@ +# Tremolo Makefile for Windows CE port +# Uses the VLC toolchain +# $URL$ +# $Id$ + +srcdir = . +VPATH = $(srcdir) + +CC = arm-none-linux-gnueabi-gcc +CXX = arm-none-linux-gnueabi-g++ +LD = arm-none-linux-gnueabi-g++ +AR = arm-none-linux-gnueabi-ar cru +RANLIB = arm-none-linux-gnueabi-ranlib +STRIP = arm-none-linux-gnueabi-strip +WINDRES= arm-none-linux-gnueabi-windres +MKDIR = mkdir -p +RM = rm -f +RM_REC = rm -rf +ECHO = echo -n +CAT = cat +AS = arm-none-linux-gnueabi-as + +DEFINES := + +CFLAGS := -g -mcpu=cortex-a8 -mfpu=neon -I$(srcdir) -D__ARM__ -D_ARM_ + +CXXFLAGS := $(CFLAGS) + +#LDFLAGS := -L/opt/mingw32ce -Llibs/lib -lmingw32 +LDFLAGS := +LIBS := + +OBJS := +MODULE_DIRS += . + +LIBOBJS := bitwise.o bitwiseARM.o codebook.o dsp.o floor0.o \ + floor1.o floor1ARM.o floor_lookup.o framing.o info.o mapping0.o \ + mdct.o mdctARM.o misc.o res012.o vorbisfile.o +EXEOBJS := testtremor.o + +#LIBOBJS_C := bitwise.oc codebook.oc dsp.oc floor0.oc floor1.oc \ +# floor_lookup.oc framing.oc info.oc mapping0.oc mdct.oc misc.oc \ +# res012.oc vorbisfile.oc +LIBOBJS_C := bitwise.oc codebook.oc dsp.oc floor0.oc \ + floor1.oc floor_lookup.oc framing.oc info.oc mapping0.oc \ + mdct.oc misc.oc res012.oc vorbisfile.oc +EXEOBJS_C := testtremor.oc + +LIBOBJS_L := bitwise.ol bitwiseARM.o codebook.ol dsp.ol floor0.ol \ + floor1.ol floor1LARM.o floor_lookup.ol framing.ol info.ol mapping0.ol \ + mdct.ol mdctLARM.o misc.ol res012.ol vorbisfile.ol +EXEOBJS_L := testtremor.ol + +LIBOBJS_LC := bitwise.olc codebook.olc dsp.olc floor0.olc floor1.olc \ + floor_lookup.olc framing.olc info.olc mapping0.olc mdct.olc misc.olc \ + res012.olc vorbisfile.olc +EXEOBJS_LC := testtremor.olc + +# Rules +.SUFFIXES: .o .oc .ol .olc + +.c.o: + $(CC) $(CFLAGS) -c $(<) -o $*.o -D_ARM_ASSEM_ + +.c.oc: + $(CC) $(CFLAGS) -c $(<) -o $*.oc -DONLY_C + +.c.ol: + $(CC) $(CFLAGS) -c $(<) -o 
$*.ol -D_LOW_ACCURACY_ -D_ARM_ASSEM_ + +.c.olc: + $(CC) $(CFLAGS) -c $(<) -o $*.olc -D_LOW_ACCURACY_ -DONLY_C +.S.s: + ./arm2gnu.pl < $(<) > $*.s + +all: libTremolo.lib testtremor.exe testtremorC.exe testtremorL.exe testtremorLC.exe + +libTremolo.lib: $(LIBOBJS) + $(AR) $@ $^ + $(RANLIB) $@ + +#bitwiseTEST.o: bitwise.c +# $(CC) $(CFLAGS) -c -o bitwiseTEST.o bitwise.c -D_V_BIT_TEST + +#bittest.exe: bitwiseTEST.o bitwiseARM.o +# $(LD) $^ $(LDFLAGS) $(LIBS) -o $@ -Wl,-Map,bittest.exe.map + +testtremor.exe: testtremor.o $(LIBOBJS) + $(LD) $^ $(LDFLAGS) $(LIBS) -o $@ -Wl,-Map,testtremor.exe.map + +testtremorC.exe: testtremor.oc $(LIBOBJS_C) + $(LD) $^ $(LDFLAGS) $(LIBS) -o $@ -Wl,-Map,testtremorC.exe.map + +testtremorL.exe: testtremor.ol $(LIBOBJS_L) + $(LD) $^ $(LDFLAGS) $(LIBS) -o $@ -Wl,-Map,testtremorL.exe.map + +testtremorLC.exe: testtremor.olc $(LIBOBJS_LC) + $(LD) $^ $(LDFLAGS) $(LIBS) -o $@ -Wl,-Map,testtremorLC.exe.map + +annotate.exe: annotate.c + gcc $^ -o $@ + +clean: + rm `find . -name \*.o` + rm `find . -name \*.ol` + rm `find . -name \*.oc` + rm `find . -name \*.olc` diff --git a/bitwiseARM.s b/bitwiseARM.s index 421b4b2..7a24aee 100644 --- a/bitwiseARM.s +++ b/bitwiseARM.s @@ -1,80 +1,80 @@ -; Tremolo library -; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd +@ Tremolo library +@ Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd - AREA |.text|, CODE, READONLY + .text - EXPORT oggpack_look - EXPORT oggpack_adv - EXPORT oggpack_readinit - EXPORT oggpack_read + .global oggpack_look + .global oggpack_adv + .global oggpack_readinit + .global oggpack_read -oggpack_look - ; r0 = oggpack_buffer *b - ; r1 = int bits +oggpack_look: + @ r0 = oggpack_buffer *b + @ r1 = int bits STMFD r13!,{r10,r11,r14} LDMIA r0,{r2,r3,r12} - ; r2 = bitsLeftInSegment - ; r3 = ptr - ; r12= bitsLeftInWord - SUBS r2,r2,r1 ; bitsLeftinSegment -= bits - BLT look_slow ; Not enough bits in this segment for - ; this request. Do it slowly. 
- LDR r10,[r3] ; r10= ptr[0] - RSB r14,r12,#32 ; r14= 32-bitsLeftInWord - SUBS r12,r12,r1 ; r12= bitsLeftInWord -= bits - LDRLT r11,[r3,#4]! ; r11= ptr[1] - MOV r10,r10,LSR r14 ; r10= ptr[0]>>(32-bitsLeftInWord) - ADDLE r12,r12,#32 ; r12= bitsLeftInWord += 32 - RSB r14,r14,#32 ; r14= 32-bitsLeftInWord - ORRLT r10,r10,r11,LSL r14 ; r10= Next 32 bits. + @ r2 = bitsLeftInSegment + @ r3 = ptr + @ r12= bitsLeftInWord + SUBS r2,r2,r1 @ bitsLeftinSegment -= bits + BLT look_slow @ Not enough bits in this segment for + @ this request. Do it slowly. + LDR r10,[r3] @ r10= ptr[0] + RSB r14,r12,#32 @ r14= 32-bitsLeftInWord + SUBS r12,r12,r1 @ r12= bitsLeftInWord -= bits + LDRLT r11,[r3,#4]! @ r11= ptr[1] + MOV r10,r10,LSR r14 @ r10= ptr[0]>>(32-bitsLeftInWord) + ADDLE r12,r12,#32 @ r12= bitsLeftInWord += 32 + RSB r14,r14,#32 @ r14= 32-bitsLeftInWord + ORRLT r10,r10,r11,LSL r14 @ r10= Next 32 bits. MOV r14,#1 RSB r14,r14,r14,LSL r1 AND r0,r10,r14 LDMFD r13!,{r10,r11,PC} -look_slow +look_slow: STMFD r13!,{r5,r6} - ADDS r10,r2,r1 ; r10= bitsLeftInSegment + bits (i.e. - ; the initial value of bitsLeftInSeg) - ; r10 = bitsLeftInSegment (initial) - ; r12 = bitsLeftInWord - RSB r14,r12,#32 ; r14= 32-bitsLeftInWord - MOV r5,r10 ; r5 = bitsLeftInSegment (initial) + ADDS r10,r2,r1 @ r10= bitsLeftInSegment + bits (i.e. 
+ @ the initial value of bitsLeftInSeg) + @ r10 = bitsLeftInSegment (initial) + @ r12 = bitsLeftInWord + RSB r14,r12,#32 @ r14= 32-bitsLeftInWord + MOV r5,r10 @ r5 = bitsLeftInSegment (initial) BLT look_overrun - BEQ look_next_segment ; r10= r12 = 0, if we branch - CMP r12,r10 ; If bitsLeftInWord < bitsLeftInSeg - ; there must be more in the next word - LDR r10,[r3],#4 ; r10= ptr[0] - LDRLT r6,[r3] ; r6 = ptr[1] + BEQ look_next_segment @ r10= r12 = 0, if we branch + CMP r12,r10 @ If bitsLeftInWord < bitsLeftInSeg + @ there must be more in the next word + LDR r10,[r3],#4 @ r10= ptr[0] + LDRLT r6,[r3] @ r6 = ptr[1] MOV r11,#1 - MOV r10,r10,LSR r14 ; r10= first bitsLeftInWord bits - ORRLT r10,r10,r6,LSL r12 ; r10= first bitsLeftInSeg bits+crap - RSB r11,r11,r11,LSL r5 ; r11= mask - AND r10,r10,r11 ; r10= first r5 bits - ; Load the next segments data -look_next_segment - ; At this point, r10 contains the first r5 bits of the result - LDR r11,[r0,#12] ; r11= head = b->head - ; Stall - ; Stall -look_next_segment_2 - LDR r11,[r11,#12] ; r11= head = head->next - ; Stall - ; Stall + MOV r10,r10,LSR r14 @ r10= first bitsLeftInWord bits + ORRLT r10,r10,r6,LSL r12 @ r10= first bitsLeftInSeg bits+crap + RSB r11,r11,r11,LSL r5 @ r11= mask + AND r10,r10,r11 @ r10= first r5 bits + @ Load the next segments data +look_next_segment: + @ At this point, r10 contains the first r5 bits of the result + LDR r11,[r0,#12] @ r11= head = b->head + @ Stall + @ Stall +look_next_segment_2: + LDR r11,[r11,#12] @ r11= head = head->next + @ Stall + @ Stall CMP r11,#0 BEQ look_out_of_data - LDMIA r11,{r6,r12,r14} ; r6 = buffer - ; r12= begin - ; r14= length - LDR r6,[r6] ; r6 = buffer->data + LDMIA r11,{r6,r12,r14} @ r6 = buffer + @ r12= begin + @ r14= length + LDR r6,[r6] @ r6 = buffer->data CMP r14,#0 BEQ look_next_segment_2 - ADD r6,r6,r12 ; r6 = buffer->data+begin -look_slow_loop - LDRB r12,[r6],#1 ; r12= *buffer - SUBS r14,r14,#1 ; r14= length - ; Stall - ORR r10,r10,r12,LSL r5 ; r10= first r5+8 
bits + ADD r6,r6,r12 @ r6 = buffer->data+begin +look_slow_loop: + LDRB r12,[r6],#1 @ r12= *buffer + SUBS r14,r14,#1 @ r14= length + @ Stall + ORR r10,r10,r12,LSL r5 @ r10= first r5+8 bits ADD r5,r5,#8 BLE look_really_slow CMP r5,r1 @@ -85,7 +85,7 @@ look_slow_loop LDMFD r13!,{r5,r6,r10,r11,PC} -look_really_slow +look_really_slow: CMP r5,r1 BLT look_next_segment_2 MOV r14,#1 @@ -93,208 +93,208 @@ look_really_slow AND r0,r10,r14 LDMFD r13!,{r5,r6,r10,r11,PC} -look_out_of_data - ;MVN r0,#0 ; return -1 - MOV r0,#0 +look_out_of_data: + MVN r0,#0 @ return -1 + @MOV r0,#0 LDMFD r13!,{r5,r6,r10,r11,PC} -look_overrun - ; We had overrun when we started, so we need to skip -r10 bits. - LDR r11,[r0,#12] ; r11 = head = b->head - ; stall - ; stall -look_overrun_next_segment - LDR r11,[r11,#12] ; r11 = head->next - ; stall - ; stall +look_overrun: + @ We had overrun when we started, so we need to skip -r10 bits. + LDR r11,[r0,#12] @ r11 = head = b->head + @ stall + @ stall +look_overrun_next_segment: + LDR r11,[r11,#12] @ r11 = head->next + @ stall + @ stall CMP r11,#0 BEQ look_out_of_data - LDMIA r11,{r6,r7,r14} ; r6 = buffer - ; r7 = begin - ; r14= length - LDR r6,[r6] ; r6 = buffer->data - ; stall - ; stall - ADD r6,r6,r7 ; r6 = buffer->data+begin - MOV r14,r14,LSL #3 ; r14= length in bits - ADDS r14,r14,r10 ; r14= length in bits-bits to skip + LDMIA r11,{r6,r7,r14} @ r6 = buffer + @ r7 = begin + @ r14= length + LDR r6,[r6] @ r6 = buffer->data + @ stall + @ stall + ADD r6,r6,r7 @ r6 = buffer->data+begin + MOV r14,r14,LSL #3 @ r14= length in bits + ADDS r14,r14,r10 @ r14= length in bits-bits to skip MOVLE r10,r14 BLE look_overrun_next_segment - RSB r10,r10,#0 ; r10= bits to skip - ADD r6,r10,r10,LSR #3 ; r6 = pointer to data + RSB r10,r10,#0 @ r10= bits to skip + ADD r6,r10,r10,LSR #3 @ r6 = pointer to data MOV r10,#0 B look_slow_loop -oggpack_adv - ; r0 = oggpack_buffer *b - ; r1 = bits +oggpack_adv: + @ r0 = oggpack_buffer *b + @ r1 = bits LDMIA r0,{r2,r3,r12} - ; r2 = 
bitsLeftInSegment - ; r3 = ptr - ; r12= bitsLeftInWord - SUBS r2,r2,r1 ; Does this run us out of bits in the - BLE adv_slow ; segment? If so, do it slowly + @ r2 = bitsLeftInSegment + @ r3 = ptr + @ r12= bitsLeftInWord + SUBS r2,r2,r1 @ Does this run us out of bits in the + BLE adv_slow @ segment? If so, do it slowly SUBS r12,r12,r1 ADDLE r12,r12,#32 ADDLE r3,r3,#4 STMIA r0,{r2,r3,r12} MOV PC,R14 -adv_slow +adv_slow: STMFD r13!,{r10,r14} - LDR r14,[r0,#12] ; r14= head - ; stall -adv_slow_loop - LDR r1,[r0,#20] ; r1 = count - LDR r10,[r14,#8] ; r10= head->length - LDR r14,[r14,#12] ; r14= head->next - ; stall - ADD r1,r1,r10 ; r1 = count += head->length + LDR r14,[r0,#12] @ r14= head + @ stall +adv_slow_loop: + LDR r1,[r0,#20] @ r1 = count + LDR r10,[r14,#8] @ r10= head->length + LDR r14,[r14,#12] @ r14= head->next + @ stall + ADD r1,r1,r10 @ r1 = count += head->length CMP r14,#0 BEQ adv_end - STR r1,[r0,#20] ; b->count = count - STR r14,[r0,#12] ; b->head = head - LDMIA r14,{r3,r10,r12} ; r3 = buffer - ; r10= begin - ; r12= length - LDR r3,[r3] ; r3 = buffer->data - ADD r3,r3,r10 ; r3 = Pointer to start (byte) - AND r10,r3,#3 ; r10= bytes to backtrk to word align - MOV r10,r10,LSL #3 ; r10= bits to backtrk to word align - RSB r10,r10,#32 ; r10= bits left in word - ADDS r10,r10,r2 ; r10= bits left in word after skip + STR r1,[r0,#20] @ b->count = count + STR r14,[r0,#12] @ b->head = head + LDMIA r14,{r3,r10,r12} @ r3 = buffer + @ r10= begin + @ r12= length + LDR r3,[r3] @ r3 = buffer->data + ADD r3,r3,r10 @ r3 = Pointer to start (byte) + AND r10,r3,#3 @ r10= bytes to backtrk to word align + MOV r10,r10,LSL #3 @ r10= bits to backtrk to word align + RSB r10,r10,#32 @ r10= bits left in word + ADDS r10,r10,r2 @ r10= bits left in word after skip ADDLE r10,r10,#32 ADDLE r3,r3,#4 - BIC r3,r3,#3 ; r3 = Pointer to start (word) - ADDS r2,r2,r12,LSL #3 ; r2 = length in bits after advance + BIC r3,r3,#3 @ r3 = Pointer to start (word) + ADDS r2,r2,r12,LSL #3 @ r2 = length in 
bits after advance BLE adv_slow_loop STMIA r0,{r2,r3,r10} LDMFD r13!,{r10,PC} -adv_end +adv_end: MOV r2, #0 MOV r12,#0 STMIA r0,{r2,r3,r12} LDMFD r13!,{r10,PC} -oggpack_readinit - ; r0 = oggpack_buffer *b - ; r1 = oggreference *r - STR r1,[r0,#12] ; b->head = r1 - STR r1,[r0,#16] ; b->tail = r1 - LDMIA r1,{r2,r3,r12} ; r2 = b->head->buffer - ; r3 = b->head->begin - ; r12= b->head->length - LDR r2,[r2] ; r2 = b->head->buffer->data - MOV r1,r12,LSL #3 ; r1 = BitsInSegment +oggpack_readinit: + @ r0 = oggpack_buffer *b + @ r1 = oggreference *r + STR r1,[r0,#12] @ b->head = r1 + STR r1,[r0,#16] @ b->tail = r1 + LDMIA r1,{r2,r3,r12} @ r2 = b->head->buffer + @ r3 = b->head->begin + @ r12= b->head->length + LDR r2,[r2] @ r2 = b->head->buffer->data + MOV r1,r12,LSL #3 @ r1 = BitsInSegment MOV r12,#0 - ADD r3,r2,r3 ; r3 = r2+b->head->begin - BIC r2,r3,#3 ; r2 = b->headptr (word) + ADD r3,r2,r3 @ r3 = r2+b->head->begin + BIC r2,r3,#3 @ r2 = b->headptr (word) AND r3,r3,#3 MOV r3,r3,LSL #3 - RSB r3,r3,#32 ; r3 = BitsInWord + RSB r3,r3,#32 @ r3 = BitsInWord STMIA r0,{r1,r2,r3} STR r12,[r0,#20] MOV PC,R14 -oggpack_read - ; r0 = oggpack_buffer *b - ; r1 = int bits +oggpack_read: + @ r0 = oggpack_buffer *b + @ r1 = int bits STMFD r13!,{r10,r11,r14} LDMIA r0,{r2,r3,r12} - ; r2 = bitsLeftInSegment - ; r3 = ptr - ; r12= bitsLeftInWord - SUBS r2,r2,r1 ; bitsLeftinSegment -= bits - BLT read_slow ; Not enough bits in this segment for - ; this request. Do it slowly. - LDR r10,[r3] ; r10= ptr[0] - RSB r14,r12,#32 ; r14= 32-bitsLeftInWord - SUBS r12,r12,r1 ; r12= bitsLeftInWord -= bits + @ r2 = bitsLeftInSegment + @ r3 = ptr + @ r12= bitsLeftInWord + SUBS r2,r2,r1 @ bitsLeftinSegment -= bits + BLT read_slow @ Not enough bits in this segment for + @ this request. Do it slowly. 
+ LDR r10,[r3] @ r10= ptr[0] + RSB r14,r12,#32 @ r14= 32-bitsLeftInWord + SUBS r12,r12,r1 @ r12= bitsLeftInWord -= bits ADDLE r3,r3,#4 - LDRLT r11,[r3] ; r11= ptr[1] - MOV r10,r10,LSR r14 ; r10= ptr[0]>>(32-bitsLeftInWord) - ADDLE r12,r12,#32 ; r12= bitsLeftInWord += 32 - RSB r14,r14,#32 ; r14= 32-bitsLeftInWord - ORRLT r10,r10,r11,LSL r14 ; r10= Next 32 bits. + LDRLT r11,[r3] @ r11= ptr[1] + MOV r10,r10,LSR r14 @ r10= ptr[0]>>(32-bitsLeftInWord) + ADDLE r12,r12,#32 @ r12= bitsLeftInWord += 32 + RSB r14,r14,#32 @ r14= 32-bitsLeftInWord + ORRLT r10,r10,r11,LSL r14 @ r10= Next 32 bits. STMIA r0,{r2,r3,r12} MOV r14,#1 RSB r14,r14,r14,LSL r1 AND r0,r10,r14 LDMFD r13!,{r10,r11,PC} -read_slow +read_slow: STMFD r13!,{r5,r6} - ADDS r10,r2,r1 ; r10= bitsLeftInSegment + bits (i.e. - ; the initial value of bitsLeftInSeg) - ; r10 = bitsLeftInSegment (initial) - ; r12 = bitsLeftInWord - RSB r14,r12,#32 ; r14= 32-bitsLeftInWord - MOV r5,r10 ; r5 = bitsLeftInSegment (initial) + ADDS r10,r2,r1 @ r10= bitsLeftInSegment + bits (i.e. 
+ @ the initial value of bitsLeftInSeg) + @ r10 = bitsLeftInSegment (initial) + @ r12 = bitsLeftInWord + RSB r14,r12,#32 @ r14= 32-bitsLeftInWord + MOV r5,r10 @ r5 = bitsLeftInSegment (initial) BLT read_overrun - BEQ read_next_segment ; r10= r12 = 0, if we branch - CMP r12,r10 ; If bitsLeftInWord < bitsLeftInSeg - ; there must be more in the next word - LDR r10,[r3],#4 ; r10= ptr[0] - LDRLT r6,[r3] ; r6 = ptr[1] + BEQ read_next_segment @ r10= r12 = 0, if we branch + CMP r12,r10 @ If bitsLeftInWord < bitsLeftInSeg + @ there must be more in the next word + LDR r10,[r3],#4 @ r10= ptr[0] + LDRLT r6,[r3] @ r6 = ptr[1] MOV r11,#1 - MOV r10,r10,LSR r14 ; r10= first bitsLeftInWord bits - ORRLT r10,r10,r6,LSL r12 ; r10= first bitsLeftInSeg bits+crap - RSB r11,r11,r11,LSL r5 ; r11= mask - AND r10,r10,r11 ; r10= first r5 bits - ; Load the next segments data -read_next_segment - ; At this point, r10 contains the first r5 bits of the result - LDR r11,[r0,#12] ; r11= head = b->head - ; Stall -read_next_segment_2 - ; r11 = head - LDR r6,[r0,#20] ; r6 = count - LDR r12,[r11,#8] ; r12= length - LDR r11,[r11,#12] ; r11= head = head->next - ; Stall - ADD r6,r6,r12 ; count += length + MOV r10,r10,LSR r14 @ r10= first bitsLeftInWord bits + ORRLT r10,r10,r6,LSL r12 @ r10= first bitsLeftInSeg bits+crap + RSB r11,r11,r11,LSL r5 @ r11= mask + AND r10,r10,r11 @ r10= first r5 bits + @ Load the next segments data +read_next_segment: + @ At this point, r10 contains the first r5 bits of the result + LDR r11,[r0,#12] @ r11= head = b->head + @ Stall +read_next_segment_2: + @ r11 = head + LDR r6,[r0,#20] @ r6 = count + LDR r12,[r11,#8] @ r12= length + LDR r11,[r11,#12] @ r11= head = head->next + @ Stall + ADD r6,r6,r12 @ count += length CMP r11,#0 BEQ read_out_of_data STR r11,[r0,#12] - STR r6,[r0,#20] ; b->count = count - LDMIA r11,{r6,r12,r14} ; r6 = buffer - ; r12= begin - ; r14= length - LDR r6,[r6] ; r6 = buffer->data + STR r6,[r0,#20] @ b->count = count + LDMIA r11,{r6,r12,r14} @ r6 = buffer 
+ @ r12= begin + @ r14= length + LDR r6,[r6] @ r6 = buffer->data CMP r14,#0 BEQ read_next_segment_2 - ADD r6,r6,r12 ; r6 = buffer->data+begin -read_slow_loop - LDRB r12,[r6],#1 ; r12= *buffer - SUBS r14,r14,#1 ; r14= length - ; Stall - ORR r10,r10,r12,LSL r5 ; r10= first r5+8 bits + ADD r6,r6,r12 @ r6 = buffer->data+begin +read_slow_loop: + LDRB r12,[r6],#1 @ r12= *buffer + SUBS r14,r14,#1 @ r14= length + @ Stall + ORR r10,r10,r12,LSL r5 @ r10= first r5+8 bits ADD r5,r5,#8 BLE read_really_slow CMP r5,r1 BLT read_slow_loop -read_end +read_end: MOV r12,#1 RSB r12,r12,r12,LSL r1 - ; Store back the new position - ; r2 = -number of bits to go from this segment - ; r6 = ptr - ; r14= bytesLeftInSegment - ; r11= New head value - LDMIA r11,{r3,r6,r14} ; r3 = buffer - ; r6 = begin - ; r14= length - LDR r3,[r3] ; r3 = buffer->data - ADD r1,r2,r14,LSL #3 ; r1 = bitsLeftInSegment - ; stall - ADD r6,r3,r6 ; r6 = pointer - AND r3,r6,#3 ; r3 = bytes used in first word - RSB r3,r2,r3,LSL #3 ; r3 = bits used in first word - BIC r2,r6,#3 ; r2 = word ptr - RSBS r3,r3,#32 ; r3 = bitsLeftInWord + @ Store back the new position + @ r2 = -number of bits to go from this segment + @ r6 = ptr + @ r14= bytesLeftInSegment + @ r11= New head value + LDMIA r11,{r3,r6,r14} @ r3 = buffer + @ r6 = begin + @ r14= length + LDR r3,[r3] @ r3 = buffer->data + ADD r1,r2,r14,LSL #3 @ r1 = bitsLeftInSegment + @ stall + ADD r6,r3,r6 @ r6 = pointer + AND r3,r6,#3 @ r3 = bytes used in first word + RSB r3,r2,r3,LSL #3 @ r3 = bits used in first word + BIC r2,r6,#3 @ r2 = word ptr + RSBS r3,r3,#32 @ r3 = bitsLeftInWord ADDLE r3,r3,#32 ADDLE r2,r2,#4 STMIA r0,{r1,r2,r3} @@ -303,66 +303,66 @@ read_end LDMFD r13!,{r5,r6,r10,r11,PC} -read_really_slow +read_really_slow: CMP r5,r1 BGE read_end - LDR r14,[r11,#8] ; r14= length of segment just done - ; stall - ; stall - ADD r2,r2,r14,LSL #3 ; r2 = -bits to use from next seg + LDR r14,[r11,#8] @ r14= length of segment just done + @ stall + @ stall + ADD r2,r2,r14,LSL #3 @ 
r2 = -bits to use from next seg B read_next_segment_2 -read_out_of_data - ; Store back the new position - ; r2 = -number of bits to go from this segment - ; r6 = ptr - ; r14= bytesLeftInSegment - ; RJW: This may be overkill - we leave the buffer empty, with -1 - ; bits left in it. We might get away with just storing the - ; bitsLeftInSegment as -1. - LDR r11,[r0,#12] ; r11=head +read_out_of_data: + @ Store back the new position + @ r2 = -number of bits to go from this segment + @ r6 = ptr + @ r14= bytesLeftInSegment + @ RJW: This may be overkill - we leave the buffer empty, with -1 + @ bits left in it. We might get away with just storing the + @ bitsLeftInSegment as -1. + LDR r11,[r0,#12] @ r11=head - LDMIA r11,{r3,r6,r14} ; r3 = buffer - ; r6 = begin - ; r14= length - LDR r3,[r3] ; r3 = buffer->data - ADD r6,r3,r6 ; r6 = pointer + LDMIA r11,{r3,r6,r14} @ r3 = buffer + @ r6 = begin + @ r14= length + LDR r3,[r3] @ r3 = buffer->data + ADD r6,r3,r6 @ r6 = pointer ADD r6,r6,r14 - AND r3,r6,#3 ; r3 = bytes used in first word - MOV r3,r3,LSL #3 ; r3 = bits used in first word - BIC r2,r6,#3 ; r2 = word ptr - RSBS r3,r3,#32 ; r3 = bitsLeftInWord - MVN r1,#0 ; r1 = -1 = bitsLeftInSegment + AND r3,r6,#3 @ r3 = bytes used in first word + MOV r3,r3,LSL #3 @ r3 = bits used in first word + BIC r2,r6,#3 @ r2 = word ptr + RSBS r3,r3,#32 @ r3 = bitsLeftInWord + MVN r1,#0 @ r1 = -1 = bitsLeftInSegment STMIA r0,{r1,r2,r3} - ;MVN r0,#0 ; return -1 - MOV r0,#0 + MVN r0,#0 @ return -1 + @MOV r0,#0 LDMFD r13!,{r5,r6,r10,r11,PC} -read_overrun - ; We had overrun when we started, so we need to skip -r10 bits. - LDR r11,[r0,#12] ; r11 = head = b->head - ; stall - ; stall -read_overrun_next_segment - LDR r11,[r11,#12] ; r11 = head->next - ; stall - ; stall +read_overrun: + @ We had overrun when we started, so we need to skip -r10 bits. 
+ LDR r11,[r0,#12] @ r11 = head = b->head + @ stall + @ stall +read_overrun_next_segment: + LDR r11,[r11,#12] @ r11 = head->next + @ stall + @ stall CMP r11,#0 BEQ read_out_of_data - LDMIA r11,{r6,r7,r14} ; r6 = buffer - ; r7 = begin - ; r14= length - LDR r6,[r6] ; r6 = buffer->data - ; stall - ; stall - ADD r6,r6,r7 ; r6 = buffer->data+begin - MOV r14,r14,LSL #3 ; r14= length in bits - ADDS r14,r14,r10 ; r14= length in bits-bits to skip + LDMIA r11,{r6,r7,r14} @ r6 = buffer + @ r7 = begin + @ r14= length + LDR r6,[r6] @ r6 = buffer->data + @ stall + @ stall + ADD r6,r6,r7 @ r6 = buffer->data+begin + MOV r14,r14,LSL #3 @ r14= length in bits + ADDS r14,r14,r10 @ r14= length in bits-bits to skip MOVLE r10,r14 BLE read_overrun_next_segment - RSB r10,r10,#0 ; r10= bits to skip - ADD r6,r10,r10,LSR #3 ; r6 = pointer to data + RSB r10,r10,#0 @ r10= bits to skip + ADD r6,r10,r10,LSR #3 @ r6 = pointer to data MOV r10,#0 B read_slow_loop - END + @ END diff --git a/configure.in b/configure.in index 3f60f55..69ed8ce 100644 --- a/configure.in +++ b/configure.in @@ -6,7 +6,7 @@ dnl ------------------------------------------------ AC_INIT(mdct.c) -AC_CANONICAL_HOST +AC_CANONICAL_SYSTEM AC_CANONICAL_TARGET AM_CONFIG_HEADER([config.h]) diff --git a/floor1ARM.s b/floor1ARM.s index 689b5c5..f7f7ae1 100644 --- a/floor1ARM.s +++ b/floor1ARM.s @@ -1,36 +1,36 @@ -; Tremolo library -; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd +@ Tremolo library +@ Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd - AREA |.text|, CODE, READONLY + .text - EXPORT render_line_arm + .global render_line_arm -render_line_arm - ; r0 = n - ; r1 = d - ; r2 = floor - ; r3 = base - ; <> = err - ; <> = adx - ; <> = ady +render_line_arm: + @ r0 = n + @ r1 = d + @ r2 = floor + @ r3 = base + @ <> = err + @ <> = adx + @ <> = ady MOV r12,r13 STMFD r13!,{r4-r6,r11,r14} - LDMFD r12,{r11,r12,r14} ; r11 = err - ; r12 = adx - ; r14 = ady -rl_loop - LDR r4,[r1] ; r4 = *d - LDR r5,[r2],r3,LSL #2 ; 
r5 = *floor r2 = floor+base - SUBS r11,r11,r14 ; err -= ady - ADDLT r11,r11,r12 ; if (err < 0) err+=adx - SMULL r6, r5, r4, r5 ; (r6,r5) = *d * *floor - ADDLT r2, r2, #4 ; floor+=1 + LDMFD r12,{r11,r12,r14} @ r11 = err + @ r12 = adx + @ r14 = ady +rl_loop: + LDR r4,[r1] @ r4 = *d + LDR r5,[r2],r3,LSL #2 @ r5 = *floor r2 = floor+base + SUBS r11,r11,r14 @ err -= ady + ADDLT r11,r11,r12 @ if (err < 0) err+=adx + SMULL r6, r5, r4, r5 @ (r6,r5) = *d * *floor + ADDLT r2, r2, #4 @ floor+=1 MOVS r6, r6, LSR #15 - ADC r5, r6, r5, LSL #17 ; r5 = MULT31_SHIFT15 + ADC r5, r6, r5, LSL #17 @ r5 = MULT31_SHIFT15 STR r5,[r1],#4 SUBS r0, r0, #1 BGT rl_loop LDMFD r13!,{r4-r6,r11,PC} - END + @ END diff --git a/floor1LARM.s b/floor1LARM.s index d7ead1d..21163ae 100644 --- a/floor1LARM.s +++ b/floor1LARM.s @@ -1,35 +1,35 @@ -; Tremolo library -; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd +@ Tremolo library +@ Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd - AREA |.text|, CODE, READONLY + .text - EXPORT render_line_arm_low + .global render_line_arm_low -render_line_arm_low - ; r0 = n - ; r1 = d - ; r2 = floor - ; r3 = base - ; <> = err - ; <> = adx - ; <> = ady +render_line_arm_low: + @ r0 = n + @ r1 = d + @ r2 = floor + @ r3 = base + @ <> = err + @ <> = adx + @ <> = ady MOV r12,r13 STMFD r13!,{r4-r6,r11,r14} - LDMFD r12,{r11,r12,r14} ; r11 = err - ; r12 = adx - ; r14 = ady -rl_loop - LDR r4, [r1] ; r4 = *d - LDR r5, [r2], r3,LSL #2 ; r5 = *floor r2 = floor+base - SUBS r11,r11,r14 ; err -= ady + LDMFD r12,{r11,r12,r14} @ r11 = err + @ r12 = adx + @ r14 = ady +rl_loop: + LDR r4, [r1] @ r4 = *d + LDR r5, [r2], r3,LSL #2 @ r5 = *floor r2 = floor+base + SUBS r11,r11,r14 @ err -= ady MOV r4, r4, ASR #6 - MUL r5, r4, r5 ; r5 = MULT31_SHIFT15 - ADDLT r11,r11,r12 ; if (err < 0) err+=adx - ADDLT r2, r2, #4 ; floor+=1 + MUL r5, r4, r5 @ r5 = MULT31_SHIFT15 + ADDLT r11,r11,r12 @ if (err < 0) err+=adx + ADDLT r2, r2, #4 @ floor+=1 SUBS r0, r0, #1 STR r5, [r1], #4 BGT 
rl_loop LDMFD r13!,{r4-r6,r11,PC} - END + @ END @@ -1,92 +1,92 @@ -; Tremolo library -; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd +@ Tremolo library +@ Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd - AREA |.text|, CODE, READONLY + .text - ; full accuracy version + @ full accuracy version - EXPORT mdct_backward_arm - EXPORT mdct_shift_right_arm - EXPORT mdct_unroll_prelap_arm - EXPORT mdct_unroll_part2_arm - EXPORT mdct_unroll_part3_arm - EXPORT mdct_unroll_postlap_arm + .global mdct_backward_arm + .global mdct_shift_right_arm + .global mdct_unroll_prelap_arm + .global mdct_unroll_part2_arm + .global mdct_unroll_part3_arm + .global mdct_unroll_postlap_arm - IMPORT sincos_lookup0 - IMPORT sincos_lookup1 + .extern sincos_lookup0 + .extern sincos_lookup1 -mdct_unroll_prelap_arm - ; r0 = out - ; r1 = post - ; r2 = r - ; r3 = step +mdct_unroll_prelap_arm: + @ r0 = out + @ r1 = post + @ r2 = r + @ r3 = step STMFD r13!,{r4-r7,r14} MVN r4, #0x8000 MOV r3, r3, LSL #1 - SUB r1, r2, r1 ; r1 = r - post - SUBS r1, r1, #16 ; r1 = r - post - 16 + SUB r1, r2, r1 @ r1 = r - post + SUBS r1, r1, #16 @ r1 = r - post - 16 BLT unroll_over -unroll_loop +unroll_loop: LDMDB r2!,{r5,r6,r7,r12} - MOV r5, r5, ASR #9 ; r5 = (*--r)>>9 - MOV r6, r6, ASR #9 ; r6 = (*--r)>>9 - MOV r7, r7, ASR #9 ; r7 = (*--r)>>9 - MOV r12,r12,ASR #9 ; r12= (*--r)>>9 + MOV r5, r5, ASR #9 @ r5 = (*--r)>>9 + MOV r6, r6, ASR #9 @ r6 = (*--r)>>9 + MOV r7, r7, ASR #9 @ r7 = (*--r)>>9 + MOV r12,r12,ASR #9 @ r12= (*--r)>>9 MOV r14,r12,ASR #15 - TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range EORNE r12,r4, r14,ASR #31 STRH r12,[r0], r3 MOV r14,r7, ASR #15 - TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range EORNE r7, r4, r14,ASR #31 STRH r7, [r0], r3 MOV r14,r6, ASR #15 - TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 @ if 
r14==0 || r14==-1 then in range EORNE r6, r4, r14,ASR #31 STRH r6, [r0], r3 MOV r14,r5, ASR #15 - TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range EORNE r5, r4, r14,ASR #31 STRH r5, [r0], r3 SUBS r1, r1, #16 BGE unroll_loop -unroll_over +unroll_over: ADDS r1, r1, #16 BLE unroll_end -unroll_loop2 +unroll_loop2: LDR r5,[r2,#-4]! - ; stall - ; stall (Xscale) - MOV r5, r5, ASR #9 ; r5 = (*--r)>>9 + @ stall + @ stall (Xscale) + MOV r5, r5, ASR #9 @ r5 = (*--r)>>9 MOV r14,r5, ASR #15 - TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range EORNE r5, r4, r14,ASR #31 STRH r5, [r0], r3 SUBS r1, r1, #4 BGT unroll_loop2 -unroll_end +unroll_end: LDMFD r13!,{r4-r7,PC} -mdct_unroll_postlap_arm - ; r0 = out - ; r1 = post - ; r2 = l - ; r3 = step +mdct_unroll_postlap_arm: + @ r0 = out + @ r1 = post + @ r2 = l + @ r3 = step STMFD r13!,{r4-r7,r14} MVN r4, #0x8000 MOV r3, r3, LSL #1 - SUB r1, r1, r2 ; r1 = post - l - MOV r1, r1, ASR #1 ; r1 = (post - l)>>1 - SUBS r1, r1, #16 ; r1 = ((post - l)>>1) - 4 + SUB r1, r1, r2 @ r1 = post - l + MOV r1, r1, ASR #1 @ r1 = (post - l)>>1 + SUBS r1, r1, #16 @ r1 = ((post - l)>>1) - 4 BLT unroll_over3 -unroll_loop3 +unroll_loop3: LDR r12,[r2],#8 LDR r7, [r2],#8 LDR r6, [r2],#8 @@ -97,142 +97,142 @@ unroll_loop3 RSB r6, r6, #0 RSB r7, r7, #0 - MOV r12, r12,ASR #9 ; r12= (-*l)>>9 - MOV r5, r5, ASR #9 ; r5 = (-*l)>>9 - MOV r6, r6, ASR #9 ; r6 = (-*l)>>9 - MOV r7, r7, ASR #9 ; r7 = (-*l)>>9 + MOV r12, r12,ASR #9 @ r12= (-*l)>>9 + MOV r5, r5, ASR #9 @ r5 = (-*l)>>9 + MOV r6, r6, ASR #9 @ r6 = (-*l)>>9 + MOV r7, r7, ASR #9 @ r7 = (-*l)>>9 MOV r14,r12,ASR #15 - TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range EORNE r12,r4, r14,ASR #31 STRH r12,[r0], r3 MOV r14,r7, ASR #15 - TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 @ if r14==0 
|| r14==-1 then in range EORNE r7, r4, r14,ASR #31 STRH r7, [r0], r3 MOV r14,r6, ASR #15 - TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range EORNE r6, r4, r14,ASR #31 STRH r6, [r0], r3 MOV r14,r5, ASR #15 - TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range EORNE r5, r4, r14,ASR #31 STRH r5, [r0], r3 SUBS r1, r1, #16 BGE unroll_loop3 -unroll_over3 +unroll_over3: ADDS r1, r1, #16 BLE unroll_over4 -unroll_loop4 +unroll_loop4: LDR r5,[r2], #8 - ; stall - ; stall (Xscale) + @ stall + @ stall (Xscale) RSB r5, r5, #0 - MOV r5, r5, ASR #9 ; r5 = (-*l)>>9 + MOV r5, r5, ASR #9 @ r5 = (-*l)>>9 MOV r14,r5, ASR #15 - TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range EORNE r5, r4, r14,ASR #31 STRH r5, [r0], r3 SUBS r1, r1, #4 BGT unroll_loop4 -unroll_over4 +unroll_over4: LDMFD r13!,{r4-r7,PC} -mdct_unroll_part2_arm - ; r0 = out - ; r1 = post - ; r2 = l - ; r3 = r - ; <> = step - ; <> = wL - ; <> = wR +mdct_unroll_part2_arm: + @ r0 = out + @ r1 = post + @ r2 = l + @ r3 = r + @ <> = step + @ <> = wL + @ <> = wR MOV r12,r13 STMFD r13!,{r4,r6-r11,r14} - LDMFD r12,{r8,r9,r10} ; r8 = step - ; r9 = wL - ; r10= wR + LDMFD r12,{r8,r9,r10} @ r8 = step + @ r9 = wL + @ r10= wR MVN r4, #0x8000 MOV r8, r8, LSL #1 - SUBS r1, r3, r1 ; r1 = (r - post) + SUBS r1, r3, r1 @ r1 = (r - post) BLE unroll_over5 -unroll_loop5 - LDR r12,[r2, #-8]! ; r12= *l (but l -= 2 first) - LDR r11,[r9],#4 ; r11= *wL++ - LDR r7, [r3, #-4]! ; r7 = *--r - LDR r6, [r10,#-4]! ; r6 = *--wR - - ; Can save a cycle here, at the cost of 1bit errors in rounding - SMULL r14,r11,r12,r11 ; (r14,r11) = *l * *wL++ - SMULL r14,r6, r7, r6 ; (r14,r6) = *--r * *--wR +unroll_loop5: + LDR r12,[r2, #-8]! @ r12= *l (but l -= 2 first) + LDR r11,[r9],#4 @ r11= *wL++ + LDR r7, [r3, #-4]! @ r7 = *--r + LDR r6, [r10,#-4]! 
@ r6 = *--wR + + @ Can save a cycle here, at the cost of 1bit errors in rounding + SMULL r14,r11,r12,r11 @ (r14,r11) = *l * *wL++ + SMULL r14,r6, r7, r6 @ (r14,r6) = *--r * *--wR ADD r6, r6, r11 MOV r6, r6, ASR #8 MOV r14,r6, ASR #15 - TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range EORNE r6, r4, r14,ASR #31 STRH r6, [r0], r8 SUBS r1, r1, #4 BGT unroll_loop5 -unroll_over5 +unroll_over5: LDMFD r13!,{r4,r6-r11,PC} -mdct_unroll_part3_arm - ; r0 = out - ; r1 = post - ; r2 = l - ; r3 = r - ; <> = step - ; <> = wL - ; <> = wR +mdct_unroll_part3_arm: + @ r0 = out + @ r1 = post + @ r2 = l + @ r3 = r + @ <> = step + @ <> = wL + @ <> = wR MOV r12,r13 STMFD r13!,{r4,r6-r11,r14} - LDMFD r12,{r8,r9,r10} ; r8 = step - ; r9 = wL - ; r10= wR + LDMFD r12,{r8,r9,r10} @ r8 = step + @ r9 = wL + @ r10= wR MVN r4, #0x8000 MOV r8, r8, LSL #1 - SUBS r1, r1, r3 ; r1 = (post - r) + SUBS r1, r1, r3 @ r1 = (post - r) BLE unroll_over6 -unroll_loop6 - LDR r12,[r2],#8 ; r12= *l (but l += 2 first) - LDR r11,[r9],#4 ; r11= *wL++ - LDR r7, [r3],#4 ; r7 = *r++ - LDR r6, [r10,#-4]! ; r6 = *--wR - - ; Can save a cycle here, at the cost of 1bit errors in rounding - SMULL r14,r11,r12,r11 ; (r14,r11) = *l * *wL++ - SMULL r14,r6, r7, r6 ; (r14,r6) = *--r * *--wR +unroll_loop6: + LDR r12,[r2],#8 @ r12= *l (but l += 2 first) + LDR r11,[r9],#4 @ r11= *wL++ + LDR r7, [r3],#4 @ r7 = *r++ + LDR r6, [r10,#-4]! 
@ r6 = *--wR + + @ Can save a cycle here, at the cost of 1bit errors in rounding + SMULL r14,r11,r12,r11 @ (r14,r11) = *l * *wL++ + SMULL r14,r6, r7, r6 @ (r14,r6) = *--r * *--wR SUB r6, r6, r11 MOV r6, r6, ASR #8 MOV r14,r6, ASR #15 - TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range EORNE r6, r4, r14,ASR #31 STRH r6, [r0], r8 SUBS r1, r1, #4 BGT unroll_loop6 -unroll_over6 +unroll_over6: LDMFD r13!,{r4,r6-r11,PC} -mdct_shift_right_arm - ; r0 = n - ; r1 = in - ; r2 = right +mdct_shift_right_arm: + @ r0 = n + @ r1 = in + @ r2 = right STMFD r13!,{r4-r11,r14} - MOV r0, r0, LSR #2 ; n >>= 2 + MOV r0, r0, LSR #2 @ n >>= 2 ADD r1, r1, #4 SUBS r0, r0, #8 BLT sr_less_than_8 -sr_loop +sr_loop: LDR r3, [r1], #8 LDR r4, [r1], #8 LDR r5, [r1], #8 @@ -244,135 +244,135 @@ sr_loop SUBS r0, r0, #8 STMIA r2!,{r3,r4,r5,r6,r7,r8,r12,r14} BGE sr_loop -sr_less_than_8 +sr_less_than_8: ADDS r0, r0, #8 BEQ sr_end -sr_loop2 +sr_loop2: LDR r3, [r1], #8 SUBS r0, r0, #1 STR r3, [r2], #4 BGT sr_loop2 -sr_end +sr_end: LDMFD r13!,{r4-r11,PC} -mdct_backward_arm - ; r0 = n - ; r1 = in +mdct_backward_arm: + @ r0 = n + @ r1 = in STMFD r13!,{r4-r11,r14} - MOV r2,#1<<4 ; r2 = 1<<shift - MOV r3,#13-4 ; r3 = 13-shift -find_shift_loop - TST r0,r2 ; if (n & (1<<shift)) == 0 + MOV r2,#1<<4 @ r2 = 1<<shift + MOV r3,#13-4 @ r3 = 13-shift +find_shift_loop: + TST r0,r2 @ if (n & (1<<shift)) == 0 MOV r2,r2,LSL #1 - SUBEQ r3,r3,#1 ; shift-- + SUBEQ r3,r3,#1 @ shift-- BEQ find_shift_loop MOV r2,#2 - MOV r2,r2,LSL r3 ; r2 = step = 2<<shift - - ; presymmetry - ; r0 = n (a multiple of 4) - ; r1 = in - ; r2 = step - ; r3 = shift - - ADD r4, r1, r0, LSL #1 ; r4 = aX = in+(n>>1) - ADD r14,r1, r0 ; r14= in+(n>>2) - SUB r4, r4, #3*4 ; r4 = aX = in+n2-3 - LDR r5, =sincos_lookup0 ; r5 = T=sincos_lookup0 - -presymmetry_loop1 - LDR r7, [r4,#8] ; r6 = s2 = aX[2] - LDR r11,[r5,#4] ; r11= T[1] - LDR r6, [r4] ; r6 = s0 = aX[0] - LDR r10,[r5],r2,LSL #2 ; r10= T[0] 
T += step - - ; XPROD31(s0, s2, T[0], T[1], &aX[0], &ax[2]) - SMULL r8, r9, r7, r11 ; (r8, r9) = s2*T[1] - ; stall - ; stall ? - SMLAL r8, r9, r6, r10 ; (r8, r9) += s0*T[0] + MOV r2,r2,LSL r3 @ r2 = step = 2<<shift + + @ presymmetry + @ r0 = n (a multiple of 4) + @ r1 = in + @ r2 = step + @ r3 = shift + + ADD r4, r1, r0, LSL #1 @ r4 = aX = in+(n>>1) + ADD r14,r1, r0 @ r14= in+(n>>2) + SUB r4, r4, #3*4 @ r4 = aX = in+n2-3 + LDR r5, =sincos_lookup0 @ r5 = T=sincos_lookup0 + +presymmetry_loop1: + LDR r7, [r4,#8] @ r6 = s2 = aX[2] + LDR r11,[r5,#4] @ r11= T[1] + LDR r6, [r4] @ r6 = s0 = aX[0] + LDR r10,[r5],r2,LSL #2 @ r10= T[0] T += step + + @ XPROD31(s0, s2, T[0], T[1], 0xaX[0], &ax[2]) + SMULL r8, r9, r7, r11 @ (r8, r9) = s2*T[1] + @ stall + @ stall ? + SMLAL r8, r9, r6, r10 @ (r8, r9) += s0*T[0] RSB r6, r6, #0 - ; stall ? - SMULL r8, r12,r7, r10 ; (r8, r12) = s2*T[0] + @ stall ? + SMULL r8, r12,r7, r10 @ (r8, r12) = s2*T[0] MOV r9, r9, LSL #1 - ; stall ? - SMLAL r8, r12,r6, r11 ; (r8, r12) -= s0*T[1] - STR r9, [r4],#-16 ; aX[0] = r9 + @ stall ? + SMLAL r8, r12,r6, r11 @ (r8, r12) -= s0*T[1] + STR r9, [r4],#-16 @ aX[0] = r9 CMP r4,r14 MOV r12,r12,LSL #1 - STR r12,[r4,#8+16] ; aX[2] = r12 + STR r12,[r4,#8+16] @ aX[2] = r12 - BGE presymmetry_loop1 ; while (aX >= in+n4) + BGE presymmetry_loop1 @ while (aX >= in+n4) -presymmetry_loop2 - LDR r6,[r4] ; r6 = s0 = aX[0] - LDR r10,[r5,#4] ; r10= T[1] - LDR r7,[r4,#8] ; r6 = s2 = aX[2] - LDR r11,[r5],-r2,LSL #2 ; r11= T[0] T -= step +presymmetry_loop2: + LDR r6,[r4] @ r6 = s0 = aX[0] + LDR r10,[r5,#4] @ r10= T[1] + LDR r7,[r4,#8] @ r6 = s2 = aX[2] + LDR r11,[r5],-r2,LSL #2 @ r11= T[0] T -= step - ; XPROD31(s0, s2, T[1], T[0], &aX[0], &ax[2]) - SMULL r8, r9, r6, r10 ; (r8, r9) = s0*T[1] - ; stall - ; stall ? - SMLAL r8, r9, r7, r11 ; (r8, r9) += s2*T[0] + @ XPROD31(s0, s2, T[1], T[0], 0xaX[0], &ax[2]) + SMULL r8, r9, r6, r10 @ (r8, r9) = s0*T[1] + @ stall + @ stall ? 
+ SMLAL r8, r9, r7, r11 @ (r8, r9) += s2*T[0] RSB r6, r6, #0 - ; stall ? - SMULL r8, r12,r7, r10 ; (r8, r12) = s2*T[1] + @ stall ? + SMULL r8, r12,r7, r10 @ (r8, r12) = s2*T[1] MOV r9, r9, LSL #1 - ; stall ? - SMLAL r8, r12,r6, r11 ; (r8, r12) -= s0*T[0] - STR r9, [r4],#-16 ; aX[0] = r9 + @ stall ? + SMLAL r8, r12,r6, r11 @ (r8, r12) -= s0*T[0] + STR r9, [r4],#-16 @ aX[0] = r9 CMP r4,r1 MOV r12,r12,LSL #1 - STR r12,[r4,#8+16] ; aX[2] = r12 + STR r12,[r4,#8+16] @ aX[2] = r12 - BGE presymmetry_loop2 ; while (aX >= in) + BGE presymmetry_loop2 @ while (aX >= in) - ; r0 = n - ; r1 = in - ; r2 = step - ; r3 = shift + @ r0 = n + @ r1 = in + @ r2 = step + @ r3 = shift STMFD r13!,{r3} - LDR r5, =sincos_lookup0 ; r5 = T=sincos_lookup0 - ADD r4, r1, r0, LSL #1 ; r4 = aX = in+(n>>1) - SUB r4, r4, #4*4 ; r4 = aX = in+(n>>1)-4 - LDR r11,[r5,#4] ; r11= T[1] - LDR r10,[r5],r2, LSL #2 ; r10= T[0] T += step -presymmetry_loop3 - LDR r8,[r1],#16 ; r8 = ro0 = bX[0] - LDR r9,[r1,#8-16] ; r9 = ro2 = bX[2] - LDR r6,[r4] ; r6 = ri0 = aX[0] - - ; XNPROD31( ro2, ro0, T[1], T[0], &aX[0], &aX[2] ) - ; aX[0] = (ro2*T[1] - ro0*T[0])>>31 aX[2] = (ro0*T[1] + ro2*T[0])>>31 - SMULL r14,r12,r8, r11 ; (r14,r12) = ro0*T[1] - RSB r8,r8,#0 ; r8 = -ro0 - ; Stall ? - SMLAL r14,r12,r9, r10 ; (r14,r12) += ro2*T[0] - LDR r7,[r4,#8] ; r7 = ri2 = aX[2] - ; Stall ? - SMULL r14,r3, r9, r11 ; (r14,r3) = ro2*T[1] + LDR r5, =sincos_lookup0 @ r5 = T=sincos_lookup0 + ADD r4, r1, r0, LSL #1 @ r4 = aX = in+(n>>1) + SUB r4, r4, #4*4 @ r4 = aX = in+(n>>1)-4 + LDR r11,[r5,#4] @ r11= T[1] + LDR r10,[r5],r2, LSL #2 @ r10= T[0] T += step +presymmetry_loop3: + LDR r8,[r1],#16 @ r8 = ro0 = bX[0] + LDR r9,[r1,#8-16] @ r9 = ro2 = bX[2] + LDR r6,[r4] @ r6 = ri0 = aX[0] + + @ XNPROD31( ro2, ro0, T[1], T[0], 0xaX[0], &aX[2] ) + @ aX[0] = (ro2*T[1] - ro0*T[0])>>31 aX[2] = (ro0*T[1] + ro2*T[0])>>31 + SMULL r14,r12,r8, r11 @ (r14,r12) = ro0*T[1] + RSB r8,r8,#0 @ r8 = -ro0 + @ Stall ? 
+ SMLAL r14,r12,r9, r10 @ (r14,r12) += ro2*T[0] + LDR r7,[r4,#8] @ r7 = ri2 = aX[2] + @ Stall ? + SMULL r14,r3, r9, r11 @ (r14,r3) = ro2*T[1] MOV r12,r12,LSL #1 - LDR r11,[r5,#4] ; r11= T[1] - SMLAL r14,r3, r8, r10 ; (r14,r3) -= ro0*T[0] - LDR r10,[r5],r2, LSL #2 ; r10= T[0] T += step + LDR r11,[r5,#4] @ r11= T[1] + SMLAL r14,r3, r8, r10 @ (r14,r3) -= ro0*T[0] + LDR r10,[r5],r2, LSL #2 @ r10= T[0] T += step STR r12,[r4,#8] MOV r3, r3, LSL #1 STR r3, [r4],#-16 - ; XNPROD31( ri2, ri0, T[0], T[1], &bX[0], &bX[2] ) - ; bX[0] = (ri2*T[0] - ri0*T[1])>>31 bX[2] = (ri0*T[0] + ri2*T[1])>>31 - SMULL r14,r12,r6, r10 ; (r14,r12) = ri0*T[0] - RSB r6,r6,#0 ; r6 = -ri0 - ; stall ? - SMLAL r14,r12,r7, r11 ; (r14,r12) += ri2*T[1] - ; stall ? - ; stall ? - SMULL r14,r3, r7, r10 ; (r14,r3) = ri2*T[0] + @ XNPROD31( ri2, ri0, T[0], T[1], 0xbX[0], &bX[2] ) + @ bX[0] = (ri2*T[0] - ri0*T[1])>>31 bX[2] = (ri0*T[0] + ri2*T[1])>>31 + SMULL r14,r12,r6, r10 @ (r14,r12) = ri0*T[0] + RSB r6,r6,#0 @ r6 = -ri0 + @ stall ? + SMLAL r14,r12,r7, r11 @ (r14,r12) += ri2*T[1] + @ stall ? + @ stall ? + SMULL r14,r3, r7, r10 @ (r14,r3) = ri2*T[0] MOV r12,r12,LSL #1 - ; stall ? - SMLAL r14,r3, r6, r11 ; (r14,r3) -= ri0*T[1] + @ stall ? + SMLAL r14,r3, r6, r11 @ (r14,r3) -= ri0*T[1] CMP r4,r1 STR r12,[r1,#8-16] MOV r3, r3, LSL #1 @@ -380,571 +380,571 @@ presymmetry_loop3 BGE presymmetry_loop3 - SUB r1,r1,r0 ; r1 = in -= n>>2 (i.e. restore in) + SUB r1,r1,r0 @ r1 = in -= n>>2 (i.e. restore in) LDR r3,[r13] STR r2,[r13,#-4]! 
- ; mdct_butterflies - ; r0 = n = (points * 2) - ; r1 = in = x - ; r2 = i - ; r3 = shift + @ mdct_butterflies + @ r0 = n = (points * 2) + @ r1 = in = x + @ r2 = i + @ r3 = shift STMFD r13!,{r0-r1} - RSBS r4,r3,#6 ; r4 = stages = 7-shift then --stages + RSBS r4,r3,#6 @ r4 = stages = 7-shift then --stages LDR r5,=sincos_lookup0 BLE no_generics - MOV r14,#4 ; r14= 4 (i=0) - MOV r6, r14,LSL r3 ; r6 = (4<<i)<<shift -mdct_butterflies_loop1 - MOV r0, r0, LSR #1 ; r0 = points>>i = POINTS - MOV r2, r14,LSR #2 ; r2 = (1<<i)-j (j=0) + MOV r14,#4 @ r14= 4 (i=0) + MOV r6, r14,LSL r3 @ r6 = (4<<i)<<shift +mdct_butterflies_loop1: + MOV r0, r0, LSR #1 @ r0 = points>>i = POINTS + MOV r2, r14,LSR #2 @ r2 = (1<<i)-j (j=0) STMFD r13!,{r4,r14} -mdct_butterflies_loop2 - - ; mdct_butterfly_generic(x+POINTS*j, POINTS, 4<<(i+shift)) - ; mdct_butterfly_generic(r1, r0, r6) - ; r0 = points - ; r1 = x - ; preserve r2 (external loop counter) - ; preserve r3 - ; preserve r4 (external loop counter) - ; r5 = T = sincos_lookup0 - ; r6 = step - ; preserve r14 - - STR r2,[r13,#-4]! 
; stack r2 - ADD r1,r1,r0,LSL #1 ; r1 = x2+4 = x + (POINTS>>1) - ADD r7,r1,r0,LSL #1 ; r7 = x1+4 = x + POINTS - ADD r12,r5,#1024*4 ; r12= sincos_lookup0+1024 - -mdct_bufferfly_generic_loop1 - LDMDB r7!,{r2,r3,r8,r11} ; r2 = x1[0] - ; r3 = x1[1] - ; r8 = x1[2] - ; r11= x1[3] x1 -= 4 - LDMDB r1!,{r4,r9,r10,r14} ; r4 = x2[0] - ; r9 = x2[1] - ; r10= x2[2] - ; r14= x2[3] x2 -= 4 - - SUB r2, r2, r3 ; r2 = s0 = x1[0] - x1[1] - ADD r3, r2, r3, LSL #1 ; r3 = x1[0] + x1[1] (-> x1[0]) - SUB r11,r11,r8 ; r11= s1 = x1[3] - x1[2] - ADD r8, r11,r8, LSL #1 ; r8 = x1[3] + x1[2] (-> x1[2]) - SUB r9, r9, r4 ; r9 = s2 = x2[1] - x2[0] - ADD r4, r9, r4, LSL #1 ; r4 = x2[1] + x2[0] (-> x1[1]) - SUB r14,r14,r10 ; r14= s3 = x2[3] - x2[2] - ADD r10,r14,r10,LSL #1 ; r10= x2[3] + x2[2] (-> x1[3]) +mdct_butterflies_loop2: + + @ mdct_butterfly_generic(x+POINTS*j, POINTS, 4<<(i+shift)) + @ mdct_butterfly_generic(r1, r0, r6) + @ r0 = points + @ r1 = x + @ preserve r2 (external loop counter) + @ preserve r3 + @ preserve r4 (external loop counter) + @ r5 = T = sincos_lookup0 + @ r6 = step + @ preserve r14 + + STR r2,[r13,#-4]! 
@ stack r2 + ADD r1,r1,r0,LSL #1 @ r1 = x2+4 = x + (POINTS>>1) + ADD r7,r1,r0,LSL #1 @ r7 = x1+4 = x + POINTS + ADD r12,r5,#1024*4 @ r12= sincos_lookup0+1024 + +mdct_bufferfly_generic_loop1: + LDMDB r7!,{r2,r3,r8,r11} @ r2 = x1[0] + @ r3 = x1[1] + @ r8 = x1[2] + @ r11= x1[3] x1 -= 4 + LDMDB r1!,{r4,r9,r10,r14} @ r4 = x2[0] + @ r9 = x2[1] + @ r10= x2[2] + @ r14= x2[3] x2 -= 4 + + SUB r2, r2, r3 @ r2 = s0 = x1[0] - x1[1] + ADD r3, r2, r3, LSL #1 @ r3 = x1[0] + x1[1] (-> x1[0]) + SUB r11,r11,r8 @ r11= s1 = x1[3] - x1[2] + ADD r8, r11,r8, LSL #1 @ r8 = x1[3] + x1[2] (-> x1[2]) + SUB r9, r9, r4 @ r9 = s2 = x2[1] - x2[0] + ADD r4, r9, r4, LSL #1 @ r4 = x2[1] + x2[0] (-> x1[1]) + SUB r14,r14,r10 @ r14= s3 = x2[3] - x2[2] + ADD r10,r14,r10,LSL #1 @ r10= x2[3] + x2[2] (-> x1[3]) STMIA r7,{r3,r4,r8,r10} - ; r0 = points - ; r1 = x2 - ; r2 = s0 - ; r3 free - ; r4 free - ; r5 = T - ; r6 = step - ; r7 = x1 - ; r8 free - ; r9 = s2 - ; r10 free - ; r11= s1 - ; r12= limit - ; r14= s3 - - LDR r8, [r5,#4] ; r8 = T[1] - LDR r10,[r5],r6,LSL #2 ; r10= T[0] T += step - - ; XPROD31(s1, s0, T[0], T[1], &x2[0], &x2[2]) - ; x2[0] = (s1*T[0] + s0*T[1])>>31 x2[2] = (s0*T[0] - s1*T[1])>>31 - ; stall Xscale - SMULL r4, r3, r2, r8 ; (r4, r3) = s0*T[1] - SMLAL r4, r3, r11,r10 ; (r4, r3) += s1*T[0] + @ r0 = points + @ r1 = x2 + @ r2 = s0 + @ r3 free + @ r4 free + @ r5 = T + @ r6 = step + @ r7 = x1 + @ r8 free + @ r9 = s2 + @ r10 free + @ r11= s1 + @ r12= limit + @ r14= s3 + + LDR r8, [r5,#4] @ r8 = T[1] + LDR r10,[r5],r6,LSL #2 @ r10= T[0] T += step + + @ XPROD31(s1, s0, T[0], T[1], &x2[0], &x2[2]) + @ x2[0] = (s1*T[0] + s0*T[1])>>31 x2[2] = (s0*T[0] - s1*T[1])>>31 + @ stall Xscale + SMULL r4, r3, r2, r8 @ (r4, r3) = s0*T[1] + SMLAL r4, r3, r11,r10 @ (r4, r3) += s1*T[0] RSB r11,r11,#0 - SMULL r11,r4, r8, r11 ; (r11,r4) = -s1*T[1] - SMLAL r11,r4, r2, r10 ; (r11,r4) += s0*T[0] - MOV r2, r3, LSL #1 ; r2 = r3<<1 = Value for x2[0] - - ; XPROD31(s2, s3, T[0], T[1], &x2[1], &x2[3]) - ; x2[1] = (s2*T[0] + 
s3*T[1])>>31 x2[3] = (s3*T[0] - s2*T[1])>>31 - SMULL r11,r3, r9, r10 ; (r11,r3) = s2*T[0] - MOV r4, r4, LSL #1 ; r4 = r4<<1 = Value for x2[2] - SMLAL r11,r3, r14,r8 ; (r11,r3) += s3*T[1] + SMULL r11,r4, r8, r11 @ (r11,r4) = -s1*T[1] + SMLAL r11,r4, r2, r10 @ (r11,r4) += s0*T[0] + MOV r2, r3, LSL #1 @ r2 = r3<<1 = Value for x2[0] + + @ XPROD31(s2, s3, T[0], T[1], &x2[1], &x2[3]) + @ x2[1] = (s2*T[0] + s3*T[1])>>31 x2[3] = (s3*T[0] - s2*T[1])>>31 + SMULL r11,r3, r9, r10 @ (r11,r3) = s2*T[0] + MOV r4, r4, LSL #1 @ r4 = r4<<1 = Value for x2[2] + SMLAL r11,r3, r14,r8 @ (r11,r3) += s3*T[1] RSB r9, r9, #0 - SMULL r10,r11,r14,r10 ; (r10,r11) = s3*T[0] - MOV r3, r3, LSL #1 ; r3 = r3<<1 = Value for x2[1] - SMLAL r10,r11,r9,r8 ; (r10,r11) -= s2*T[1] + SMULL r10,r11,r14,r10 @ (r10,r11) = s3*T[0] + MOV r3, r3, LSL #1 @ r3 = r3<<1 = Value for x2[1] + SMLAL r10,r11,r9,r8 @ (r10,r11) -= s2*T[1] CMP r5, r12 - MOV r11,r11,LSL #1 ; r11= r11<<1 = Value for x2[3] + MOV r11,r11,LSL #1 @ r11= r11<<1 = Value for x2[3] STMIA r1,{r2,r3,r4,r11} BLT mdct_bufferfly_generic_loop1 SUB r12,r12,#1024*4 -mdct_bufferfly_generic_loop2 - LDMDB r7!,{r2,r3,r9,r10} ; r2 = x1[0] - ; r3 = x1[1] - ; r9 = x1[2] - ; r10= x1[3] x1 -= 4 - LDMDB r1!,{r4,r8,r11,r14} ; r4 = x2[0] - ; r8 = x2[1] - ; r11= x2[2] - ; r14= x2[3] x2 -= 4 - - SUB r2, r2, r3 ; r2 = s0 = x1[0] - x1[1] - ADD r3, r2, r3, LSL #1 ; r3 = x1[0] + x1[1] (-> x1[0]) - SUB r9, r9,r10 ; r9 = s1 = x1[2] - x1[3] - ADD r10,r9,r10, LSL #1 ; r10= x1[2] + x1[3] (-> x1[2]) - SUB r4, r4, r8 ; r4 = s2 = x2[0] - x2[1] - ADD r8, r4, r8, LSL #1 ; r8 = x2[0] + x2[1] (-> x1[1]) - SUB r14,r14,r11 ; r14= s3 = x2[3] - x2[2] - ADD r11,r14,r11,LSL #1 ; r11= x2[3] + x2[2] (-> x1[3]) +mdct_bufferfly_generic_loop2: + LDMDB r7!,{r2,r3,r9,r10} @ r2 = x1[0] + @ r3 = x1[1] + @ r9 = x1[2] + @ r10= x1[3] x1 -= 4 + LDMDB r1!,{r4,r8,r11,r14} @ r4 = x2[0] + @ r8 = x2[1] + @ r11= x2[2] + @ r14= x2[3] x2 -= 4 + + SUB r2, r2, r3 @ r2 = s0 = x1[0] - x1[1] + ADD r3, r2, r3, LSL #1 @ r3 
= x1[0] + x1[1] (-> x1[0]) + SUB r9, r9,r10 @ r9 = s1 = x1[2] - x1[3] + ADD r10,r9,r10, LSL #1 @ r10= x1[2] + x1[3] (-> x1[2]) + SUB r4, r4, r8 @ r4 = s2 = x2[0] - x2[1] + ADD r8, r4, r8, LSL #1 @ r8 = x2[0] + x2[1] (-> x1[1]) + SUB r14,r14,r11 @ r14= s3 = x2[3] - x2[2] + ADD r11,r14,r11,LSL #1 @ r11= x2[3] + x2[2] (-> x1[3]) STMIA r7,{r3,r8,r10,r11} - ; r0 = points - ; r1 = x2 - ; r2 = s0 - ; r3 free - ; r4 = s2 - ; r5 = T - ; r6 = step - ; r7 = x1 - ; r8 free - ; r9 = s1 - ; r10 free - ; r11 free - ; r12= limit - ; r14= s3 - - LDR r8, [r5,#4] ; r8 = T[1] - LDR r10,[r5],-r6,LSL #2 ; r10= T[0] T -= step - - ; XNPROD31(s0, s1, T[0], T[1], &x2[0], &x2[2]) - ; x2[0] = (s0*T[0] - s1*T[1])>>31 x2[2] = (s1*T[0] + s0*T[1])>>31 - ; stall Xscale - SMULL r3, r11,r2, r8 ; (r3, r11) = s0*T[1] - SMLAL r3, r11,r9, r10 ; (r3, r11) += s1*T[0] + @ r0 = points + @ r1 = x2 + @ r2 = s0 + @ r3 free + @ r4 = s2 + @ r5 = T + @ r6 = step + @ r7 = x1 + @ r8 free + @ r9 = s1 + @ r10 free + @ r11 free + @ r12= limit + @ r14= s3 + + LDR r8, [r5,#4] @ r8 = T[1] + LDR r10,[r5],-r6,LSL #2 @ r10= T[0] T -= step + + @ XNPROD31(s0, s1, T[0], T[1], &x2[0], &x2[2]) + @ x2[0] = (s0*T[0] - s1*T[1])>>31 x2[2] = (s1*T[0] + s0*T[1])>>31 + @ stall Xscale + SMULL r3, r11,r2, r8 @ (r3, r11) = s0*T[1] + SMLAL r3, r11,r9, r10 @ (r3, r11) += s1*T[0] RSB r9, r9, #0 - SMULL r3, r2, r10,r2 ; (r3, r2) = s0*T[0] - SMLAL r3, r2, r9, r8 ; (r3, r2) += -s1*T[1] - MOV r9, r11,LSL #1 ; r9 = r11<<1 = Value for x2[2] - - ; XNPROD31(s3, s2, T[0], T[1], &x2[1], &x2[3]) - ; x2[1] = (s3*T[0] - s2*T[1])>>31 x2[3] = (s2*T[0] + s3*T[1])>>31 - SMULL r3, r11,r4, r10 ; (r3,r11) = s2*T[0] - MOV r2, r2, LSL #1 ; r2 = r2<<1 = Value for x2[0] - SMLAL r3, r11,r14,r8 ; (r3,r11) += s3*T[1] + SMULL r3, r2, r10,r2 @ (r3, r2) = s0*T[0] + SMLAL r3, r2, r9, r8 @ (r3, r2) += -s1*T[1] + MOV r9, r11,LSL #1 @ r9 = r11<<1 = Value for x2[2] + + @ XNPROD31(s3, s2, T[0], T[1], &x2[1], &x2[3]) + @ x2[1] = (s3*T[0] - s2*T[1])>>31 x2[3] = (s2*T[0] + 
s3*T[1])>>31 + SMULL r3, r11,r4, r10 @ (r3,r11) = s2*T[0] + MOV r2, r2, LSL #1 @ r2 = r2<<1 = Value for x2[0] + SMLAL r3, r11,r14,r8 @ (r3,r11) += s3*T[1] RSB r4, r4, #0 - SMULL r10,r3,r14,r10 ; (r10,r3) = s3*T[0] - MOV r11,r11,LSL #1 ; r11= r11<<1 = Value for x2[3] - SMLAL r10,r3, r4, r8 ; (r10,r3) -= s2*T[1] + SMULL r10,r3,r14,r10 @ (r10,r3) = s3*T[0] + MOV r11,r11,LSL #1 @ r11= r11<<1 = Value for x2[3] + SMLAL r10,r3, r4, r8 @ (r10,r3) -= s2*T[1] CMP r5, r12 - MOV r3, r3, LSL #1 ; r3 = r3<<1 = Value for x2[1] + MOV r3, r3, LSL #1 @ r3 = r3<<1 = Value for x2[1] STMIA r1,{r2,r3,r9,r11} BGT mdct_bufferfly_generic_loop2 - LDR r2,[r13],#4 ; unstack r2 - ADD r1, r1, r0, LSL #2 ; r1 = x+POINTS*j - ; stall Xscale - SUBS r2, r2, #1 ; r2-- (j++) + LDR r2,[r13],#4 @ unstack r2 + ADD r1, r1, r0, LSL #2 @ r1 = x+POINTS*j + @ stall Xscale + SUBS r2, r2, #1 @ r2-- (j++) BGT mdct_butterflies_loop2 LDMFD r13!,{r4,r14} LDR r1,[r13,#4] - SUBS r4, r4, #1 ; stages-- - MOV r14,r14,LSL #1 ; r14= 4<<i (i++) - MOV r6, r6, LSL #1 ; r6 = step <<= 1 (i++) + SUBS r4, r4, #1 @ stages-- + MOV r14,r14,LSL #1 @ r14= 4<<i (i++) + MOV r6, r6, LSL #1 @ r6 = step <<= 1 (i++) BGE mdct_butterflies_loop1 LDMFD r13,{r0-r1} -no_generics - ; mdct_butterflies part2 (loop around mdct_bufferfly_32) - ; r0 = points - ; r1 = in - ; r2 = step - ; r3 = shift - -mdct_bufferflies_loop3 - ; mdct_bufferfly_32 - - ; block1 - ADD r4, r1, #16*4 ; r4 = &in[16] - LDMIA r4,{r5,r6,r9,r10} ; r5 = x[16] - ; r6 = x[17] - ; r9 = x[18] - ; r10= x[19] - LDMIA r1,{r7,r8,r11,r12} ; r7 = x[0] - ; r8 = x[1] - ; r11= x[2] - ; r12= x[3] - SUB r5, r5, r6 ; r5 = s0 = x[16] - x[17] - ADD r6, r5, r6, LSL #1 ; r6 = x[16] + x[17] -> x[16] - SUB r9, r9, r10 ; r9 = s1 = x[18] - x[19] - ADD r10,r9, r10,LSL #1 ; r10= x[18] + x[19] -> x[18] - SUB r8, r8, r7 ; r8 = s2 = x[ 1] - x[ 0] - ADD r7, r8, r7, LSL #1 ; r7 = x[ 1] + x[ 0] -> x[17] - SUB r12,r12,r11 ; r12= s3 = x[ 3] - x[ 2] - ADD r11,r12,r11, LSL #1 ; r11= x[ 3] + x[ 2] -> x[19] 
+no_generics: + @ mdct_butterflies part2 (loop around mdct_bufferfly_32) + @ r0 = points + @ r1 = in + @ r2 = step + @ r3 = shift + +mdct_bufferflies_loop3: + @ mdct_bufferfly_32 + + @ block1 + ADD r4, r1, #16*4 @ r4 = &in[16] + LDMIA r4,{r5,r6,r9,r10} @ r5 = x[16] + @ r6 = x[17] + @ r9 = x[18] + @ r10= x[19] + LDMIA r1,{r7,r8,r11,r12} @ r7 = x[0] + @ r8 = x[1] + @ r11= x[2] + @ r12= x[3] + SUB r5, r5, r6 @ r5 = s0 = x[16] - x[17] + ADD r6, r5, r6, LSL #1 @ r6 = x[16] + x[17] -> x[16] + SUB r9, r9, r10 @ r9 = s1 = x[18] - x[19] + ADD r10,r9, r10,LSL #1 @ r10= x[18] + x[19] -> x[18] + SUB r8, r8, r7 @ r8 = s2 = x[ 1] - x[ 0] + ADD r7, r8, r7, LSL #1 @ r7 = x[ 1] + x[ 0] -> x[17] + SUB r12,r12,r11 @ r12= s3 = x[ 3] - x[ 2] + ADD r11,r12,r11, LSL #1 @ r11= x[ 3] + x[ 2] -> x[19] STMIA r4!,{r6,r7,r10,r11} LDR r6,cPI1_8 LDR r7,cPI3_8 - ; XNPROD31( s0, s1, cPI3_8, cPI1_8, &x[ 0], &x[ 2] ) - ; x[0] = s0*cPI3_8 - s1*cPI1_8 x[2] = s1*cPI3_8 + s0*cPI1_8 - ; stall Xscale - SMULL r14,r11,r5, r6 ; (r14,r11) = s0*cPI1_8 - SMLAL r14,r11,r9, r7 ; (r14,r11) += s1*cPI3_8 + @ XNPROD31( s0, s1, cPI3_8, cPI1_8, &x[ 0], &x[ 2] ) + @ x[0] = s0*cPI3_8 - s1*cPI1_8 x[2] = s1*cPI3_8 + s0*cPI1_8 + @ stall Xscale + SMULL r14,r11,r5, r6 @ (r14,r11) = s0*cPI1_8 + SMLAL r14,r11,r9, r7 @ (r14,r11) += s1*cPI3_8 RSB r9, r9, #0 - SMULL r14,r5, r7, r5 ; (r14,r5) = s0*cPI3_8 - SMLAL r14,r5, r9, r6 ; (r14,r5) -= s1*cPI1_8 + SMULL r14,r5, r7, r5 @ (r14,r5) = s0*cPI3_8 + SMLAL r14,r5, r9, r6 @ (r14,r5) -= s1*cPI1_8 MOV r11,r11,LSL #1 MOV r5, r5, LSL #1 - ; XPROD31 ( s2, s3, cPI1_8, cPI3_8, &x[ 1], &x[ 3] ) - ; x[1] = s2*cPI1_8 + s3*cPI3_8 x[3] = s3*cPI1_8 - s2*cPI3_8 - SMULL r14,r9, r8, r6 ; (r14,r9) = s2*cPI1_8 - SMLAL r14,r9, r12,r7 ; (r14,r9) += s3*cPI3_8 + @ XPROD31 ( s2, s3, cPI1_8, cPI3_8, &x[ 1], &x[ 3] ) + @ x[1] = s2*cPI1_8 + s3*cPI3_8 x[3] = s3*cPI1_8 - s2*cPI3_8 + SMULL r14,r9, r8, r6 @ (r14,r9) = s2*cPI1_8 + SMLAL r14,r9, r12,r7 @ (r14,r9) += s3*cPI3_8 RSB r8,r8,#0 - SMULL r14,r12,r6, r12 ; 
(r14,r12) = s3*cPI1_8 - SMLAL r14,r12,r8, r7 ; (r14,r12) -= s2*cPI3_8 + SMULL r14,r12,r6, r12 @ (r14,r12) = s3*cPI1_8 + SMLAL r14,r12,r8, r7 @ (r14,r12) -= s2*cPI3_8 MOV r9, r9, LSL #1 MOV r12,r12,LSL #1 STMIA r1!,{r5,r9,r11,r12} - ; block2 - LDMIA r4,{r5,r6,r9,r10} ; r5 = x[20] - ; r6 = x[21] - ; r9 = x[22] - ; r10= x[23] - LDMIA r1,{r7,r8,r11,r12} ; r7 = x[4] - ; r8 = x[5] - ; r11= x[6] - ; r12= x[7] - SUB r5, r5, r6 ; r5 = s0 = x[20] - x[21] - ADD r6, r5, r6, LSL #1 ; r6 = x[20] + x[21] -> x[20] - SUB r9, r9, r10 ; r9 = s1 = x[22] - x[23] - ADD r10,r9, r10,LSL #1 ; r10= x[22] + x[23] -> x[22] - SUB r8, r8, r7 ; r8 = s2 = x[ 5] - x[ 4] - ADD r7, r8, r7, LSL #1 ; r7 = x[ 5] + x[ 4] -> x[21] - SUB r12,r12,r11 ; r12= s3 = x[ 7] - x[ 6] - ADD r11,r12,r11, LSL #1 ; r11= x[ 7] + x[ 6] -> x[23] + @ block2 + LDMIA r4,{r5,r6,r9,r10} @ r5 = x[20] + @ r6 = x[21] + @ r9 = x[22] + @ r10= x[23] + LDMIA r1,{r7,r8,r11,r12} @ r7 = x[4] + @ r8 = x[5] + @ r11= x[6] + @ r12= x[7] + SUB r5, r5, r6 @ r5 = s0 = x[20] - x[21] + ADD r6, r5, r6, LSL #1 @ r6 = x[20] + x[21] -> x[20] + SUB r9, r9, r10 @ r9 = s1 = x[22] - x[23] + ADD r10,r9, r10,LSL #1 @ r10= x[22] + x[23] -> x[22] + SUB r8, r8, r7 @ r8 = s2 = x[ 5] - x[ 4] + ADD r7, r8, r7, LSL #1 @ r7 = x[ 5] + x[ 4] -> x[21] + SUB r12,r12,r11 @ r12= s3 = x[ 7] - x[ 6] + ADD r11,r12,r11, LSL #1 @ r11= x[ 7] + x[ 6] -> x[23] LDR r14,cPI2_8 STMIA r4!,{r6,r7,r10,r11} - SUB r5, r5, r9 ; r5 = s0 - s1 - ADD r9, r5, r9, LSL #1 ; r9 = s0 + s1 - SMULL r6, r5, r14,r5 ; (r6,r5) = (s0-s1)*cPI2_8 - SUB r12,r12,r8 ; r12= s3 - s2 - ADD r8, r12,r8, LSL #1 ; r8 = s3 + s2 + SUB r5, r5, r9 @ r5 = s0 - s1 + ADD r9, r5, r9, LSL #1 @ r9 = s0 + s1 + SMULL r6, r5, r14,r5 @ (r6,r5) = (s0-s1)*cPI2_8 + SUB r12,r12,r8 @ r12= s3 - s2 + ADD r8, r12,r8, LSL #1 @ r8 = s3 + s2 - SMULL r6, r8, r14,r8 ; (r6,r8) = (s3+s2)*cPI2_8 + SMULL r6, r8, r14,r8 @ (r6,r8) = (s3+s2)*cPI2_8 MOV r5, r5, LSL #1 - SMULL r6, r9, r14,r9 ; (r6,r9) = (s0+s1)*cPI2_8 + SMULL r6, r9, r14,r9 @ 
(r6,r9) = (s0+s1)*cPI2_8 MOV r8, r8, LSL #1 - SMULL r6, r12,r14,r12 ; (r6,r12) = (s3-s2)*cPI2_8 + SMULL r6, r12,r14,r12 @ (r6,r12) = (s3-s2)*cPI2_8 MOV r9, r9, LSL #1 MOV r12,r12,LSL #1 STMIA r1!,{r5,r8,r9,r12} - ; block3 - LDMIA r4,{r5,r6,r9,r10} ; r5 = x[24] - ; r6 = x[25] - ; r9 = x[25] - ; r10= x[26] - LDMIA r1,{r7,r8,r11,r12} ; r7 = x[8] - ; r8 = x[9] - ; r11= x[10] - ; r12= x[11] - SUB r5, r5, r6 ; r5 = s0 = x[24] - x[25] - ADD r6, r5, r6, LSL #1 ; r6 = x[24] + x[25] -> x[25] - SUB r9, r9, r10 ; r9 = s1 = x[26] - x[27] - ADD r10,r9, r10,LSL #1 ; r10= x[26] + x[27] -> x[26] - SUB r8, r8, r7 ; r8 = s2 = x[ 9] - x[ 8] - ADD r7, r8, r7, LSL #1 ; r7 = x[ 9] + x[ 8] -> x[25] - SUB r12,r12,r11 ; r12= s3 = x[11] - x[10] - ADD r11,r12,r11, LSL #1 ; r11= x[11] + x[10] -> x[27] + @ block3 + LDMIA r4,{r5,r6,r9,r10} @ r5 = x[24] + @ r6 = x[25] + @ r9 = x[25] + @ r10= x[26] + LDMIA r1,{r7,r8,r11,r12} @ r7 = x[8] + @ r8 = x[9] + @ r11= x[10] + @ r12= x[11] + SUB r5, r5, r6 @ r5 = s0 = x[24] - x[25] + ADD r6, r5, r6, LSL #1 @ r6 = x[24] + x[25] -> x[25] + SUB r9, r9, r10 @ r9 = s1 = x[26] - x[27] + ADD r10,r9, r10,LSL #1 @ r10= x[26] + x[27] -> x[26] + SUB r8, r8, r7 @ r8 = s2 = x[ 9] - x[ 8] + ADD r7, r8, r7, LSL #1 @ r7 = x[ 9] + x[ 8] -> x[25] + SUB r12,r12,r11 @ r12= s3 = x[11] - x[10] + ADD r11,r12,r11, LSL #1 @ r11= x[11] + x[10] -> x[27] STMIA r4!,{r6,r7,r10,r11} LDR r6,cPI3_8 LDR r7,cPI1_8 - ; XNPROD31( s0, s1, cPI1_8, cPI3_8, &x[ 8], &x[10] ) - ; x[8] = s0*cPI1_8 - s1*cPI3_8 x[10] = s1*cPI1_8 + s0*cPI3_8 - ; stall Xscale - SMULL r14,r11,r5, r6 ; (r14,r11) = s0*cPI3_8 - SMLAL r14,r11,r9, r7 ; (r14,r11) += s1*cPI1_8 + @ XNPROD31( s0, s1, cPI1_8, cPI3_8, &x[ 8], &x[10] ) + @ x[8] = s0*cPI1_8 - s1*cPI3_8 x[10] = s1*cPI1_8 + s0*cPI3_8 + @ stall Xscale + SMULL r14,r11,r5, r6 @ (r14,r11) = s0*cPI3_8 + SMLAL r14,r11,r9, r7 @ (r14,r11) += s1*cPI1_8 RSB r9, r9, #0 - SMULL r14,r5, r7, r5 ; (r14,r5) = s0*cPI1_8 - SMLAL r14,r5, r9, r6 ; (r14,r5) -= s1*cPI3_8 + SMULL r14,r5, r7, 
r5 @ (r14,r5) = s0*cPI1_8 + SMLAL r14,r5, r9, r6 @ (r14,r5) -= s1*cPI3_8 MOV r11,r11,LSL #1 MOV r5, r5, LSL #1 - ; XPROD31 ( s2, s3, cPI3_8, cPI1_8, &x[ 9], &x[11] ) - ; x[9] = s2*cPI3_8 + s3*cPI1_8 x[11] = s3*cPI3_8 - s2*cPI1_8 - SMULL r14,r9, r8, r6 ; (r14,r9) = s2*cPI3_8 - SMLAL r14,r9, r12,r7 ; (r14,r9) += s3*cPI1_8 + @ XPROD31 ( s2, s3, cPI3_8, cPI1_8, &x[ 9], &x[11] ) + @ x[9] = s2*cPI3_8 + s3*cPI1_8 x[11] = s3*cPI3_8 - s2*cPI1_8 + SMULL r14,r9, r8, r6 @ (r14,r9) = s2*cPI3_8 + SMLAL r14,r9, r12,r7 @ (r14,r9) += s3*cPI1_8 RSB r8,r8,#0 - SMULL r14,r12,r6, r12 ; (r14,r12) = s3*cPI3_8 - SMLAL r14,r12,r8, r7 ; (r14,r12) -= s2*cPI1_8 + SMULL r14,r12,r6, r12 @ (r14,r12) = s3*cPI3_8 + SMLAL r14,r12,r8, r7 @ (r14,r12) -= s2*cPI1_8 MOV r9, r9, LSL #1 MOV r12,r12,LSL #1 STMIA r1!,{r5,r9,r11,r12} - ; block4 - LDMIA r4,{r5,r6,r10,r11} ; r5 = x[28] - ; r6 = x[29] - ; r10= x[30] - ; r11= x[31] - LDMIA r1,{r8,r9,r12,r14} ; r8 = x[12] - ; r9 = x[13] - ; r12= x[14] - ; r14= x[15] - SUB r5, r5, r6 ; r5 = s0 = x[28] - x[29] - ADD r6, r5, r6, LSL #1 ; r6 = x[28] + x[29] -> x[28] - SUB r7, r14,r12 ; r7 = s3 = x[15] - x[14] - ADD r12,r7, r12, LSL #1 ; r12= x[15] + x[14] -> x[31] - SUB r10,r10,r11 ; r10= s1 = x[30] - x[31] - ADD r11,r10,r11,LSL #1 ; r11= x[30] + x[31] -> x[30] - SUB r14, r8, r9 ; r14= s2 = x[12] - x[13] - ADD r9, r14, r9, LSL #1 ; r9 = x[12] + x[13] -> x[29] + @ block4 + LDMIA r4,{r5,r6,r10,r11} @ r5 = x[28] + @ r6 = x[29] + @ r10= x[30] + @ r11= x[31] + LDMIA r1,{r8,r9,r12,r14} @ r8 = x[12] + @ r9 = x[13] + @ r12= x[14] + @ r14= x[15] + SUB r5, r5, r6 @ r5 = s0 = x[28] - x[29] + ADD r6, r5, r6, LSL #1 @ r6 = x[28] + x[29] -> x[28] + SUB r7, r14,r12 @ r7 = s3 = x[15] - x[14] + ADD r12,r7, r12, LSL #1 @ r12= x[15] + x[14] -> x[31] + SUB r10,r10,r11 @ r10= s1 = x[30] - x[31] + ADD r11,r10,r11,LSL #1 @ r11= x[30] + x[31] -> x[30] + SUB r14, r8, r9 @ r14= s2 = x[12] - x[13] + ADD r9, r14, r9, LSL #1 @ r9 = x[12] + x[13] -> x[29] STMIA r4!,{r6,r9,r11,r12} STMIA 
r1!,{r5,r7,r10,r14} - ; mdct_butterfly16 (1st version) - ; block 1 + @ mdct_butterfly16 (1st version) + @ block 1 SUB r1,r1,#16*4 ADD r4,r1,#8*4 - LDMIA r4,{r5,r6,r9,r10} ; r5 = x[ 8] - ; r6 = x[ 9] - ; r9 = x[10] - ; r10= x[11] - LDMIA r1,{r7,r8,r11,r12} ; r7 = x[0] - ; r8 = x[1] - ; r11= x[2] - ; r12= x[3] - SUB r5, r5, r6 ; r5 = s0 = x[ 8] - x[ 9] - ADD r6, r5, r6, LSL #1 ; r6 = x[ 8] + x[ 9] -> x[ 8] - SUB r9, r9, r10 ; r9 = s1 = x[10] - x[11] - ADD r10,r9, r10,LSL #1 ; r10= x[10] + x[11] -> x[10] - SUB r8, r8, r7 ; r8 = s2 = x[ 1] - x[ 0] - ADD r7, r8, r7, LSL #1 ; r7 = x[ 1] + x[ 0] -> x[ 9] - SUB r12,r12,r11 ; r12= s3 = x[ 3] - x[ 2] - ADD r11,r12,r11, LSL #1 ; r11= x[ 3] + x[ 2] -> x[11] + LDMIA r4,{r5,r6,r9,r10} @ r5 = x[ 8] + @ r6 = x[ 9] + @ r9 = x[10] + @ r10= x[11] + LDMIA r1,{r7,r8,r11,r12} @ r7 = x[0] + @ r8 = x[1] + @ r11= x[2] + @ r12= x[3] + SUB r5, r5, r6 @ r5 = s0 = x[ 8] - x[ 9] + ADD r6, r5, r6, LSL #1 @ r6 = x[ 8] + x[ 9] -> x[ 8] + SUB r9, r9, r10 @ r9 = s1 = x[10] - x[11] + ADD r10,r9, r10,LSL #1 @ r10= x[10] + x[11] -> x[10] + SUB r8, r8, r7 @ r8 = s2 = x[ 1] - x[ 0] + ADD r7, r8, r7, LSL #1 @ r7 = x[ 1] + x[ 0] -> x[ 9] + SUB r12,r12,r11 @ r12= s3 = x[ 3] - x[ 2] + ADD r11,r12,r11, LSL #1 @ r11= x[ 3] + x[ 2] -> x[11] LDR r14,cPI2_8 STMIA r4!,{r6,r7,r10,r11} - SUB r5, r5, r9 ; r5 = s0 - s1 - ADD r9, r5, r9, LSL #1 ; r9 = s0 + s1 - SMULL r6, r5, r14,r5 ; (r6,r5) = (s0-s1)*cPI2_8 - SUB r12,r12,r8 ; r12= s3 - s2 - ADD r8, r12,r8, LSL #1 ; r8 = s3 + s2 + SUB r5, r5, r9 @ r5 = s0 - s1 + ADD r9, r5, r9, LSL #1 @ r9 = s0 + s1 + SMULL r6, r5, r14,r5 @ (r6,r5) = (s0-s1)*cPI2_8 + SUB r12,r12,r8 @ r12= s3 - s2 + ADD r8, r12,r8, LSL #1 @ r8 = s3 + s2 - SMULL r6, r8, r14,r8 ; (r6,r8) = (s3+s2)*cPI2_8 + SMULL r6, r8, r14,r8 @ (r6,r8) = (s3+s2)*cPI2_8 MOV r5, r5, LSL #1 - SMULL r6, r9, r14,r9 ; (r6,r9) = (s0+s1)*cPI2_8 + SMULL r6, r9, r14,r9 @ (r6,r9) = (s0+s1)*cPI2_8 MOV r8, r8, LSL #1 - SMULL r6, r12,r14,r12 ; (r6,r12) = (s3-s2)*cPI2_8 + SMULL r6, 
r12,r14,r12 @ (r6,r12) = (s3-s2)*cPI2_8 MOV r9, r9, LSL #1 MOV r12,r12,LSL #1 STMIA r1!,{r5,r8,r9,r12} - ; block4 - LDMIA r4,{r5,r6,r9,r10} ; r5 = x[12] - ; r6 = x[13] - ; r9 = x[14] - ; r10= x[15] - LDMIA r1,{r7,r8,r11,r12} ; r7 = x[ 4] - ; r8 = x[ 5] - ; r11= x[ 6] - ; r12= x[ 7] - SUB r14,r7, r8 ; r14= s0 = x[ 4] - x[ 5] - ADD r8, r14,r8, LSL #1 ; r8 = x[ 4] + x[ 5] -> x[13] - SUB r7, r12,r11 ; r7 = s1 = x[ 7] - x[ 6] - ADD r11,r7, r11, LSL #1 ; r11= x[ 7] + x[ 6] -> x[15] - SUB r5, r5, r6 ; r5 = s2 = x[12] - x[13] - ADD r6, r5, r6, LSL #1 ; r6 = x[12] + x[13] -> x[12] - SUB r12,r9, r10 ; r12= s3 = x[14] - x[15] - ADD r10,r12,r10,LSL #1 ; r10= x[14] + x[15] -> x[14] + @ block4 + LDMIA r4,{r5,r6,r9,r10} @ r5 = x[12] + @ r6 = x[13] + @ r9 = x[14] + @ r10= x[15] + LDMIA r1,{r7,r8,r11,r12} @ r7 = x[ 4] + @ r8 = x[ 5] + @ r11= x[ 6] + @ r12= x[ 7] + SUB r14,r7, r8 @ r14= s0 = x[ 4] - x[ 5] + ADD r8, r14,r8, LSL #1 @ r8 = x[ 4] + x[ 5] -> x[13] + SUB r7, r12,r11 @ r7 = s1 = x[ 7] - x[ 6] + ADD r11,r7, r11, LSL #1 @ r11= x[ 7] + x[ 6] -> x[15] + SUB r5, r5, r6 @ r5 = s2 = x[12] - x[13] + ADD r6, r5, r6, LSL #1 @ r6 = x[12] + x[13] -> x[12] + SUB r12,r9, r10 @ r12= s3 = x[14] - x[15] + ADD r10,r12,r10,LSL #1 @ r10= x[14] + x[15] -> x[14] STMIA r4!,{r6,r8,r10,r11} STMIA r1!,{r5,r7,r12,r14} - ; mdct_butterfly_8 + @ mdct_butterfly_8 LDMDB r1,{r6,r7,r8,r9,r10,r11,r12,r14} - ; r6 = x[0] - ; r7 = x[1] - ; r8 = x[2] - ; r9 = x[3] - ; r10= x[4] - ; r11= x[5] - ; r12= x[6] - ; r14= x[7] - ADD r6, r6, r7 ; r6 = s0 = x[0] + x[1] - SUB r7, r6, r7, LSL #1 ; r7 = s1 = x[0] - x[1] - ADD r8, r8, r9 ; r8 = s2 = x[2] + x[3] - SUB r9, r8, r9, LSL #1 ; r9 = s3 = x[2] - x[3] - ADD r10,r10,r11 ; r10= s4 = x[4] + x[5] - SUB r11,r10,r11,LSL #1 ; r11= s5 = x[4] - x[5] - ADD r12,r12,r14 ; r12= s6 = x[6] + x[7] - SUB r14,r12,r14,LSL #1 ; r14= s7 = x[6] - x[7] - - ADD r2, r11,r9 ; r2 = x[0] = s5 + s3 - SUB r4, r2, r9, LSL #1 ; r4 = x[2] = s5 - s3 - SUB r3, r14,r7 ; r3 = x[1] = s7 - s1 - ADD r5, 
r3, r7, LSL #1 ; r5 = x[3] = s7 + s1 - SUB r10,r10,r6 ; r10= x[4] = s4 - s0 - SUB r11,r12,r8 ; r11= x[5] = s6 - s2 - ADD r12,r10,r6, LSL #1 ; r12= x[6] = s4 + s0 - ADD r14,r11,r8, LSL #1 ; r14= x[7] = s6 + s2 + @ r6 = x[0] + @ r7 = x[1] + @ r8 = x[2] + @ r9 = x[3] + @ r10= x[4] + @ r11= x[5] + @ r12= x[6] + @ r14= x[7] + ADD r6, r6, r7 @ r6 = s0 = x[0] + x[1] + SUB r7, r6, r7, LSL #1 @ r7 = s1 = x[0] - x[1] + ADD r8, r8, r9 @ r8 = s2 = x[2] + x[3] + SUB r9, r8, r9, LSL #1 @ r9 = s3 = x[2] - x[3] + ADD r10,r10,r11 @ r10= s4 = x[4] + x[5] + SUB r11,r10,r11,LSL #1 @ r11= s5 = x[4] - x[5] + ADD r12,r12,r14 @ r12= s6 = x[6] + x[7] + SUB r14,r12,r14,LSL #1 @ r14= s7 = x[6] - x[7] + + ADD r2, r11,r9 @ r2 = x[0] = s5 + s3 + SUB r4, r2, r9, LSL #1 @ r4 = x[2] = s5 - s3 + SUB r3, r14,r7 @ r3 = x[1] = s7 - s1 + ADD r5, r3, r7, LSL #1 @ r5 = x[3] = s7 + s1 + SUB r10,r10,r6 @ r10= x[4] = s4 - s0 + SUB r11,r12,r8 @ r11= x[5] = s6 - s2 + ADD r12,r10,r6, LSL #1 @ r12= x[6] = s4 + s0 + ADD r14,r11,r8, LSL #1 @ r14= x[7] = s6 + s2 STMDB r1,{r2,r3,r4,r5,r10,r11,r12,r14} - ; mdct_butterfly_8 + @ mdct_butterfly_8 LDMIA r1,{r6,r7,r8,r9,r10,r11,r12,r14} - ; r6 = x[0] - ; r7 = x[1] - ; r8 = x[2] - ; r9 = x[3] - ; r10= x[4] - ; r11= x[5] - ; r12= x[6] - ; r14= x[7] - ADD r6, r6, r7 ; r6 = s0 = x[0] + x[1] - SUB r7, r6, r7, LSL #1 ; r7 = s1 = x[0] - x[1] - ADD r8, r8, r9 ; r8 = s2 = x[2] + x[3] - SUB r9, r8, r9, LSL #1 ; r9 = s3 = x[2] - x[3] - ADD r10,r10,r11 ; r10= s4 = x[4] + x[5] - SUB r11,r10,r11,LSL #1 ; r11= s5 = x[4] - x[5] - ADD r12,r12,r14 ; r12= s6 = x[6] + x[7] - SUB r14,r12,r14,LSL #1 ; r14= s7 = x[6] - x[7] - - ADD r2, r11,r9 ; r2 = x[0] = s5 + s3 - SUB r4, r2, r9, LSL #1 ; r4 = x[2] = s5 - s3 - SUB r3, r14,r7 ; r3 = x[1] = s7 - s1 - ADD r5, r3, r7, LSL #1 ; r5 = x[3] = s7 + s1 - SUB r10,r10,r6 ; r10= x[4] = s4 - s0 - SUB r11,r12,r8 ; r11= x[5] = s6 - s2 - ADD r12,r10,r6, LSL #1 ; r12= x[6] = s4 + s0 - ADD r14,r11,r8, LSL #1 ; r14= x[7] = s6 + s2 + @ r6 = x[0] + @ r7 = x[1] + 
@ r8 = x[2] + @ r9 = x[3] + @ r10= x[4] + @ r11= x[5] + @ r12= x[6] + @ r14= x[7] + ADD r6, r6, r7 @ r6 = s0 = x[0] + x[1] + SUB r7, r6, r7, LSL #1 @ r7 = s1 = x[0] - x[1] + ADD r8, r8, r9 @ r8 = s2 = x[2] + x[3] + SUB r9, r8, r9, LSL #1 @ r9 = s3 = x[2] - x[3] + ADD r10,r10,r11 @ r10= s4 = x[4] + x[5] + SUB r11,r10,r11,LSL #1 @ r11= s5 = x[4] - x[5] + ADD r12,r12,r14 @ r12= s6 = x[6] + x[7] + SUB r14,r12,r14,LSL #1 @ r14= s7 = x[6] - x[7] + + ADD r2, r11,r9 @ r2 = x[0] = s5 + s3 + SUB r4, r2, r9, LSL #1 @ r4 = x[2] = s5 - s3 + SUB r3, r14,r7 @ r3 = x[1] = s7 - s1 + ADD r5, r3, r7, LSL #1 @ r5 = x[3] = s7 + s1 + SUB r10,r10,r6 @ r10= x[4] = s4 - s0 + SUB r11,r12,r8 @ r11= x[5] = s6 - s2 + ADD r12,r10,r6, LSL #1 @ r12= x[6] = s4 + s0 + ADD r14,r11,r8, LSL #1 @ r14= x[7] = s6 + s2 STMIA r1,{r2,r3,r4,r5,r10,r11,r12,r14} - ; block 2 + @ block 2 ADD r1,r1,#16*4-8*4 ADD r4,r1,#8*4 - LDMIA r4,{r5,r6,r9,r10} ; r5 = x[ 8] - ; r6 = x[ 9] - ; r9 = x[10] - ; r10= x[11] - LDMIA r1,{r7,r8,r11,r12} ; r7 = x[0] - ; r8 = x[1] - ; r11= x[2] - ; r12= x[3] - SUB r5, r5, r6 ; r5 = s0 = x[ 8] - x[ 9] - ADD r6, r5, r6, LSL #1 ; r6 = x[ 8] + x[ 9] -> x[ 8] - SUB r9, r9, r10 ; r9 = s1 = x[10] - x[11] - ADD r10,r9, r10,LSL #1 ; r10= x[10] + x[11] -> x[10] - SUB r8, r8, r7 ; r8 = s2 = x[ 1] - x[ 0] - ADD r7, r8, r7, LSL #1 ; r7 = x[ 1] + x[ 0] -> x[ 9] - SUB r12,r12,r11 ; r12= s3 = x[ 3] - x[ 2] - ADD r11,r12,r11, LSL #1 ; r11= x[ 3] + x[ 2] -> x[11] + LDMIA r4,{r5,r6,r9,r10} @ r5 = x[ 8] + @ r6 = x[ 9] + @ r9 = x[10] + @ r10= x[11] + LDMIA r1,{r7,r8,r11,r12} @ r7 = x[0] + @ r8 = x[1] + @ r11= x[2] + @ r12= x[3] + SUB r5, r5, r6 @ r5 = s0 = x[ 8] - x[ 9] + ADD r6, r5, r6, LSL #1 @ r6 = x[ 8] + x[ 9] -> x[ 8] + SUB r9, r9, r10 @ r9 = s1 = x[10] - x[11] + ADD r10,r9, r10,LSL #1 @ r10= x[10] + x[11] -> x[10] + SUB r8, r8, r7 @ r8 = s2 = x[ 1] - x[ 0] + ADD r7, r8, r7, LSL #1 @ r7 = x[ 1] + x[ 0] -> x[ 9] + SUB r12,r12,r11 @ r12= s3 = x[ 3] - x[ 2] + ADD r11,r12,r11, LSL #1 @ r11= x[ 3] + x[ 2] 
-> x[11] LDR r14,cPI2_8 STMIA r4!,{r6,r7,r10,r11} - SUB r5, r5, r9 ; r5 = s0 - s1 - ADD r9, r5, r9, LSL #1 ; r9 = s0 + s1 - SMULL r6, r5, r14,r5 ; (r6,r5) = (s0-s1)*cPI2_8 - SUB r12,r12,r8 ; r12= s3 - s2 - ADD r8, r12,r8, LSL #1 ; r8 = s3 + s2 + SUB r5, r5, r9 @ r5 = s0 - s1 + ADD r9, r5, r9, LSL #1 @ r9 = s0 + s1 + SMULL r6, r5, r14,r5 @ (r6,r5) = (s0-s1)*cPI2_8 + SUB r12,r12,r8 @ r12= s3 - s2 + ADD r8, r12,r8, LSL #1 @ r8 = s3 + s2 - SMULL r6, r8, r14,r8 ; (r6,r8) = (s3+s2)*cPI2_8 + SMULL r6, r8, r14,r8 @ (r6,r8) = (s3+s2)*cPI2_8 MOV r5, r5, LSL #1 - SMULL r6, r9, r14,r9 ; (r6,r9) = (s0+s1)*cPI2_8 + SMULL r6, r9, r14,r9 @ (r6,r9) = (s0+s1)*cPI2_8 MOV r8, r8, LSL #1 - SMULL r6, r12,r14,r12 ; (r6,r12) = (s3-s2)*cPI2_8 + SMULL r6, r12,r14,r12 @ (r6,r12) = (s3-s2)*cPI2_8 MOV r9, r9, LSL #1 MOV r12,r12,LSL #1 STMIA r1!,{r5,r8,r9,r12} - ; block4 - LDMIA r4,{r5,r6,r9,r10} ; r5 = x[12] - ; r6 = x[13] - ; r9 = x[14] - ; r10= x[15] - LDMIA r1,{r7,r8,r11,r12} ; r7 = x[ 4] - ; r8 = x[ 5] - ; r11= x[ 6] - ; r12= x[ 7] - SUB r5, r5, r6 ; r5 = s2 = x[12] - x[13] - ADD r6, r5, r6, LSL #1 ; r6 = x[12] + x[13] -> x[12] - SUB r9, r9, r10 ; r9 = s3 = x[14] - x[15] - ADD r10,r9, r10,LSL #1 ; r10= x[14] + x[15] -> x[14] - SUB r14,r7, r8 ; r14= s0 = x[ 4] - x[ 5] - ADD r8, r14,r8, LSL #1 ; r8 = x[ 4] + x[ 5] -> x[13] - SUB r7, r12,r11 ; r7 = s1 = x[ 7] - x[ 6] - ADD r11,r7, r11, LSL #1 ; r11= x[ 7] + x[ 6] -> x[15] + @ block4 + LDMIA r4,{r5,r6,r9,r10} @ r5 = x[12] + @ r6 = x[13] + @ r9 = x[14] + @ r10= x[15] + LDMIA r1,{r7,r8,r11,r12} @ r7 = x[ 4] + @ r8 = x[ 5] + @ r11= x[ 6] + @ r12= x[ 7] + SUB r5, r5, r6 @ r5 = s2 = x[12] - x[13] + ADD r6, r5, r6, LSL #1 @ r6 = x[12] + x[13] -> x[12] + SUB r9, r9, r10 @ r9 = s3 = x[14] - x[15] + ADD r10,r9, r10,LSL #1 @ r10= x[14] + x[15] -> x[14] + SUB r14,r7, r8 @ r14= s0 = x[ 4] - x[ 5] + ADD r8, r14,r8, LSL #1 @ r8 = x[ 4] + x[ 5] -> x[13] + SUB r7, r12,r11 @ r7 = s1 = x[ 7] - x[ 6] + ADD r11,r7, r11, LSL #1 @ r11= x[ 7] + x[ 6] -> x[15] STMIA 
r4!,{r6,r8,r10,r11} STMIA r1!,{r5,r7,r9,r14} - ; mdct_butterfly_8 + @ mdct_butterfly_8 LDMDB r1,{r6,r7,r8,r9,r10,r11,r12,r14} - ; r6 = x[0] - ; r7 = x[1] - ; r8 = x[2] - ; r9 = x[3] - ; r10= x[4] - ; r11= x[5] - ; r12= x[6] - ; r14= x[7] - ADD r6, r6, r7 ; r6 = s0 = x[0] + x[1] - SUB r7, r6, r7, LSL #1 ; r7 = s1 = x[0] - x[1] - ADD r8, r8, r9 ; r8 = s2 = x[2] + x[3] - SUB r9, r8, r9, LSL #1 ; r9 = s3 = x[2] - x[3] - ADD r10,r10,r11 ; r10= s4 = x[4] + x[5] - SUB r11,r10,r11,LSL #1 ; r11= s5 = x[4] - x[5] - ADD r12,r12,r14 ; r12= s6 = x[6] + x[7] - SUB r14,r12,r14,LSL #1 ; r14= s7 = x[6] - x[7] - - ADD r2, r11,r9 ; r2 = x[0] = s5 + s3 - SUB r4, r2, r9, LSL #1 ; r4 = x[2] = s5 - s3 - SUB r3, r14,r7 ; r3 = x[1] = s7 - s1 - ADD r5, r3, r7, LSL #1 ; r5 = x[3] = s7 + s1 - SUB r10,r10,r6 ; r10= x[4] = s4 - s0 - SUB r11,r12,r8 ; r11= x[5] = s6 - s2 - ADD r12,r10,r6, LSL #1 ; r12= x[6] = s4 + s0 - ADD r14,r11,r8, LSL #1 ; r14= x[7] = s6 + s2 + @ r6 = x[0] + @ r7 = x[1] + @ r8 = x[2] + @ r9 = x[3] + @ r10= x[4] + @ r11= x[5] + @ r12= x[6] + @ r14= x[7] + ADD r6, r6, r7 @ r6 = s0 = x[0] + x[1] + SUB r7, r6, r7, LSL #1 @ r7 = s1 = x[0] - x[1] + ADD r8, r8, r9 @ r8 = s2 = x[2] + x[3] + SUB r9, r8, r9, LSL #1 @ r9 = s3 = x[2] - x[3] + ADD r10,r10,r11 @ r10= s4 = x[4] + x[5] + SUB r11,r10,r11,LSL #1 @ r11= s5 = x[4] - x[5] + ADD r12,r12,r14 @ r12= s6 = x[6] + x[7] + SUB r14,r12,r14,LSL #1 @ r14= s7 = x[6] - x[7] + + ADD r2, r11,r9 @ r2 = x[0] = s5 + s3 + SUB r4, r2, r9, LSL #1 @ r4 = x[2] = s5 - s3 + SUB r3, r14,r7 @ r3 = x[1] = s7 - s1 + ADD r5, r3, r7, LSL #1 @ r5 = x[3] = s7 + s1 + SUB r10,r10,r6 @ r10= x[4] = s4 - s0 + SUB r11,r12,r8 @ r11= x[5] = s6 - s2 + ADD r12,r10,r6, LSL #1 @ r12= x[6] = s4 + s0 + ADD r14,r11,r8, LSL #1 @ r14= x[7] = s6 + s2 STMDB r1,{r2,r3,r4,r5,r10,r11,r12,r14} - ; mdct_butterfly_8 + @ mdct_butterfly_8 LDMIA r1,{r6,r7,r8,r9,r10,r11,r12,r14} - ; r6 = x[0] - ; r7 = x[1] - ; r8 = x[2] - ; r9 = x[3] - ; r10= x[4] - ; r11= x[5] - ; r12= x[6] - ; r14= x[7] - 
ADD r6, r6, r7 ; r6 = s0 = x[0] + x[1] - SUB r7, r6, r7, LSL #1 ; r7 = s1 = x[0] - x[1] - ADD r8, r8, r9 ; r8 = s2 = x[2] + x[3] - SUB r9, r8, r9, LSL #1 ; r9 = s3 = x[2] - x[3] - ADD r10,r10,r11 ; r10= s4 = x[4] + x[5] - SUB r11,r10,r11,LSL #1 ; r11= s5 = x[4] - x[5] - ADD r12,r12,r14 ; r12= s6 = x[6] + x[7] - SUB r14,r12,r14,LSL #1 ; r14= s7 = x[6] - x[7] - - ADD r2, r11,r9 ; r2 = x[0] = s5 + s3 - SUB r4, r2, r9, LSL #1 ; r4 = x[2] = s5 - s3 - SUB r3, r14,r7 ; r3 = x[1] = s7 - s1 - ADD r5, r3, r7, LSL #1 ; r5 = x[3] = s7 + s1 - SUB r10,r10,r6 ; r10= x[4] = s4 - s0 - SUB r11,r12,r8 ; r11= x[5] = s6 - s2 - ADD r12,r10,r6, LSL #1 ; r12= x[6] = s4 + s0 - ADD r14,r11,r8, LSL #1 ; r14= x[7] = s6 + s2 + @ r6 = x[0] + @ r7 = x[1] + @ r8 = x[2] + @ r9 = x[3] + @ r10= x[4] + @ r11= x[5] + @ r12= x[6] + @ r14= x[7] + ADD r6, r6, r7 @ r6 = s0 = x[0] + x[1] + SUB r7, r6, r7, LSL #1 @ r7 = s1 = x[0] - x[1] + ADD r8, r8, r9 @ r8 = s2 = x[2] + x[3] + SUB r9, r8, r9, LSL #1 @ r9 = s3 = x[2] - x[3] + ADD r10,r10,r11 @ r10= s4 = x[4] + x[5] + SUB r11,r10,r11,LSL #1 @ r11= s5 = x[4] - x[5] + ADD r12,r12,r14 @ r12= s6 = x[6] + x[7] + SUB r14,r12,r14,LSL #1 @ r14= s7 = x[6] - x[7] + + ADD r2, r11,r9 @ r2 = x[0] = s5 + s3 + SUB r4, r2, r9, LSL #1 @ r4 = x[2] = s5 - s3 + SUB r3, r14,r7 @ r3 = x[1] = s7 - s1 + ADD r5, r3, r7, LSL #1 @ r5 = x[3] = s7 + s1 + SUB r10,r10,r6 @ r10= x[4] = s4 - s0 + SUB r11,r12,r8 @ r11= x[5] = s6 - s2 + ADD r12,r10,r6, LSL #1 @ r12= x[6] = s4 + s0 + ADD r14,r11,r8, LSL #1 @ r14= x[7] = s6 + s2 STMIA r1,{r2,r3,r4,r5,r10,r11,r12,r14} ADD r1,r1,#8*4 @@ -953,117 +953,117 @@ mdct_bufferflies_loop3 LDMFD r13,{r0-r3} -mdct_bitreverse_arm - ; r0 = points = n - ; r1 = in - ; r2 = step - ; r3 = shift +mdct_bitreverse_arm: + @ r0 = points = n + @ r1 = in + @ r2 = step + @ r3 = shift - MOV r4, #0 ; r4 = bit = 0 - ADD r5, r1, r0, LSL #1 ; r5 = w = x + (n>>1) + MOV r4, #0 @ r4 = bit = 0 + ADD r5, r1, r0, LSL #1 @ r5 = w = x + (n>>1) ADR r6, bitrev SUB r5, r5, #8 -brev_lp 
+brev_lp: LDRB r7, [r6, r4, LSR #6] AND r8, r4, #0x3f LDRB r8, [r6, r8] - ADD r4, r4, #1 ; bit++ - ; stall XScale - ORR r7, r7, r8, LSL #6 ; r7 = bitrev[bit] + ADD r4, r4, #1 @ bit++ + @ stall XScale + ORR r7, r7, r8, LSL #6 @ r7 = bitrev[bit] MOV r7, r7, LSR r3 - ADD r9, r1, r7, LSL #2 ; r9 = xx = x + (b>>shift) - CMP r5, r9 ; if (w > xx) - LDR r10,[r5],#-8 ; r10 = w[0] w -= 2 - LDRGT r11,[r5,#12] ; r11 = w[1] - LDRGT r12,[r9] ; r12 = xx[0] - LDRGT r14,[r9,#4] ; r14 = xx[1] - STRGT r10,[r9] ; xx[0]= w[0] - STRGT r11,[r9,#4] ; xx[1]= w[1] - STRGT r12,[r5,#8] ; w[0] = xx[0] - STRGT r14,[r5,#12] ; w[1] = xx[1] + ADD r9, r1, r7, LSL #2 @ r9 = xx = x + (b>>shift) + CMP r5, r9 @ if (w > xx) + LDR r10,[r5],#-8 @ r10 = w[0] w -= 2 + LDRGT r11,[r5,#12] @ r11 = w[1] + LDRGT r12,[r9] @ r12 = xx[0] + LDRGT r14,[r9,#4] @ r14 = xx[1] + STRGT r10,[r9] @ xx[0]= w[0] + STRGT r11,[r9,#4] @ xx[1]= w[1] + STRGT r12,[r5,#8] @ w[0] = xx[0] + STRGT r14,[r5,#12] @ w[1] = xx[1] CMP r5,r1 BGT brev_lp - ; mdct_step7 - ; r0 = points - ; r1 = in - ; r2 = step - ; r3 = shift - - CMP r2, #4 ; r5 = T = (step>=4) ? - LDRGE r5, =sincos_lookup0 ; sincos_lookup0 + - LDRLT r5, =sincos_lookup1 ; sincos_lookup0 + - ADD r7, r1, r0, LSL #1 ; r7 = w1 = x + (n>>1) - ADDGE r5, r5, r2, LSL #1 ; (step>>1) - ADD r8, r5, #1024*4 ; r8 = Ttop -step7_loop1 - LDR r6, [r1] ; r6 = w0[0] - LDR r9, [r1,#4] ; r9 = w0[1] - LDR r10,[r7,#-8]! ; r10= w1[0] w1 -= 2 - LDR r11,[r7,#4] ; r11= w1[1] - LDR r14,[r5,#4] ; r14= T[1] - LDR r12,[r5],r2,LSL #2 ; r12= T[0] T += step - - ADD r6, r6, r10 ; r6 = s0 = w0[0] + w1[0] - SUB r10,r6, r10,LSL #1 ; r10= s1b= w0[0] - w1[0] - SUB r11,r11,r9 ; r11= s1 = w1[1] - w0[1] - ADD r9, r11,r9, LSL #1 ; r9 = s0b= w1[1] + w0[1] - - ; Can save 1 cycle by using SMULL SMLAL - at the cost of being - ; 1 off. 
- SMULL r0, r3, r6, r14 ; (r0,r3) = s0*T[1] - SMULL r0, r4, r11,r12 ; (r0,r4) += s1*T[0] = s2 + @ mdct_step7 + @ r0 = points + @ r1 = in + @ r2 = step + @ r3 = shift + + CMP r2, #4 @ r5 = T = (step>=4) ? + LDRGE r5, =sincos_lookup0 @ sincos_lookup0 + + LDRLT r5, =sincos_lookup1 @ sincos_lookup0 + + ADD r7, r1, r0, LSL #1 @ r7 = w1 = x + (n>>1) + ADDGE r5, r5, r2, LSL #1 @ (step>>1) + ADD r8, r5, #1024*4 @ r8 = Ttop +step7_loop1: + LDR r6, [r1] @ r6 = w0[0] + LDR r9, [r1,#4] @ r9 = w0[1] + LDR r10,[r7,#-8]! @ r10= w1[0] w1 -= 2 + LDR r11,[r7,#4] @ r11= w1[1] + LDR r14,[r5,#4] @ r14= T[1] + LDR r12,[r5],r2,LSL #2 @ r12= T[0] T += step + + ADD r6, r6, r10 @ r6 = s0 = w0[0] + w1[0] + SUB r10,r6, r10,LSL #1 @ r10= s1b= w0[0] - w1[0] + SUB r11,r11,r9 @ r11= s1 = w1[1] - w0[1] + ADD r9, r11,r9, LSL #1 @ r9 = s0b= w1[1] + w0[1] + + @ Can save 1 cycle by using SMULL SMLAL - at the cost of being + @ 1 off. + SMULL r0, r3, r6, r14 @ (r0,r3) = s0*T[1] + SMULL r0, r4, r11,r12 @ (r0,r4) += s1*T[0] = s2 ADD r3, r3, r4 - SMULL r0, r14,r11,r14 ; (r0,r14) = s1*T[1] - SMULL r0, r12,r6, r12 ; (r0,r12) += s0*T[0] = s3 + SMULL r0, r14,r11,r14 @ (r0,r14) = s1*T[1] + SMULL r0, r12,r6, r12 @ (r0,r12) += s0*T[0] = s3 SUB r14,r14,r12 - ; r9 = s0b<<1 - ; r10= s1b<<1 - ADD r9, r3, r9, ASR #1 ; r9 = s0b + s2 - SUB r3, r9, r3, LSL #1 ; r3 = s0b - s2 + @ r9 = s0b<<1 + @ r10= s1b<<1 + ADD r9, r3, r9, ASR #1 @ r9 = s0b + s2 + SUB r3, r9, r3, LSL #1 @ r3 = s0b - s2 - SUB r12,r14,r10,ASR #1 ; r12= s3 - s1b - ADD r10,r14,r10,ASR #1 ; r10= s3 + s1b + SUB r12,r14,r10,ASR #1 @ r12= s3 - s1b + ADD r10,r14,r10,ASR #1 @ r10= s3 + s1b STR r9, [r1],#4 - STR r10,[r1],#4 ; w0 += 2 + STR r10,[r1],#4 @ w0 += 2 STR r3, [r7] STR r12,[r7,#4] CMP r5,r8 BLT step7_loop1 -step7_loop2 - LDR r6, [r1] ; r6 = w0[0] - LDR r9, [r1,#4] ; r9 = w0[1] - LDR r10,[r7,#-8]! ; r10= w1[0] w1 -= 2 - LDR r11,[r7,#4] ; r11= w1[1] - LDR r14,[r5,-r2,LSL #2]! 
; r12= T[1] T -= step - LDR r12,[r5,#4] ; r14= T[0] - - ADD r6, r6, r10 ; r6 = s0 = w0[0] + w1[0] - SUB r10,r6, r10,LSL #1 ; r10= s1b= w0[0] - w1[0] - SUB r11,r11,r9 ; r11= s1 = w1[1] - w0[1] - ADD r9, r11,r9, LSL #1 ; r9 = s0b= w1[1] + w0[1] - - ; Can save 1 cycle by using SMULL SMLAL - at the cost of being - ; 1 off. - SMULL r0, r3, r6, r14 ; (r0,r3) = s0*T[0] - SMULL r0, r4, r11,r12 ; (r0,r4) += s1*T[1] = s2 +step7_loop2: + LDR r6, [r1] @ r6 = w0[0] + LDR r9, [r1,#4] @ r9 = w0[1] + LDR r10,[r7,#-8]! @ r10= w1[0] w1 -= 2 + LDR r11,[r7,#4] @ r11= w1[1] + LDR r14,[r5,-r2,LSL #2]! @ r12= T[1] T -= step + LDR r12,[r5,#4] @ r14= T[0] + + ADD r6, r6, r10 @ r6 = s0 = w0[0] + w1[0] + SUB r10,r6, r10,LSL #1 @ r10= s1b= w0[0] - w1[0] + SUB r11,r11,r9 @ r11= s1 = w1[1] - w0[1] + ADD r9, r11,r9, LSL #1 @ r9 = s0b= w1[1] + w0[1] + + @ Can save 1 cycle by using SMULL SMLAL - at the cost of being + @ 1 off. + SMULL r0, r3, r6, r14 @ (r0,r3) = s0*T[0] + SMULL r0, r4, r11,r12 @ (r0,r4) += s1*T[1] = s2 ADD r3, r3, r4 - SMULL r0, r14,r11,r14 ; (r0,r14) = s1*T[0] - SMULL r0, r12,r6, r12 ; (r0,r12) += s0*T[1] = s3 + SMULL r0, r14,r11,r14 @ (r0,r14) = s1*T[0] + SMULL r0, r12,r6, r12 @ (r0,r12) += s0*T[1] = s3 SUB r14,r14,r12 - ; r9 = s0b<<1 - ; r10= s1b<<1 - ADD r9, r3, r9, ASR #1 ; r9 = s0b + s2 - SUB r3, r9, r3, LSL #1 ; r3 = s0b - s2 + @ r9 = s0b<<1 + @ r10= s1b<<1 + ADD r9, r3, r9, ASR #1 @ r9 = s0b + s2 + SUB r3, r9, r3, LSL #1 @ r3 = s0b - s2 - SUB r12,r14,r10,ASR #1 ; r12= s3 - s1b - ADD r10,r14,r10,ASR #1 ; r10= s3 + s1b + SUB r12,r14,r10,ASR #1 @ r12= s3 - s1b + ADD r10,r14,r10,ASR #1 @ r10= s3 + s1b STR r9, [r1],#4 - STR r10,[r1],#4 ; w0 += 2 + STR r10,[r1],#4 @ w0 += 2 STR r3, [r7] STR r12,[r7,#4] @@ -1072,116 +1072,116 @@ step7_loop2 LDMFD r13!,{r0-r3} - ; r0 = points - ; r1 = in - ; r2 = step - ; r3 = shift - MOV r2, r2, ASR #2 ; r2 = step >>= 2 + @ r0 = points + @ r1 = in + @ r2 = step + @ r3 = shift + MOV r2, r2, ASR #2 @ r2 = step >>= 2 CMP r2, #0 CMPNE r2, #1 BEQ 
mdct_end - ; step > 1 (default case) - CMP r2, #4 ; r5 = T = (step>=4) ? - LDRGE r5, =sincos_lookup0 ; sincos_lookup0 + - LDRLT r5, =sincos_lookup1 ; sincos_lookup1 - ADD r7, r1, r0, LSL #1 ; r7 = iX = x + (n>>1) - ADDGE r5, r5, r2, LSL #1 ; (step>>1) -mdct_step8_default - LDR r6, [r1],#4 ; r6 = s0 = x[0] - LDR r8, [r1],#4 ; r8 = -s1 = x[1] - LDR r12,[r5,#4] ; r12= T[1] - LDR r14,[r5],r2,LSL #2 ; r14= T[0] T += step - RSB r8, r8, #0 ; r8 = s1 - - ; XPROD31(s0, s1, T[0], T[1], x, x+1) - ; x[0] = s0 * T[0] + s1 * T[1] x[1] = s1 * T[0] - s0 * T[1] - SMULL r9, r10, r8, r12 ; (r9,r10) = s1 * T[1] + @ step > 1 (default case) + CMP r2, #4 @ r5 = T = (step>=4) ? + LDRGE r5, =sincos_lookup0 @ sincos_lookup0 + + LDRLT r5, =sincos_lookup1 @ sincos_lookup1 + ADD r7, r1, r0, LSL #1 @ r7 = iX = x + (n>>1) + ADDGE r5, r5, r2, LSL #1 @ (step>>1) +mdct_step8_default: + LDR r6, [r1],#4 @ r6 = s0 = x[0] + LDR r8, [r1],#4 @ r8 = -s1 = x[1] + LDR r12,[r5,#4] @ r12= T[1] + LDR r14,[r5],r2,LSL #2 @ r14= T[0] T += step + RSB r8, r8, #0 @ r8 = s1 + + @ XPROD31(s0, s1, T[0], T[1], x, x+1) + @ x[0] = s0 * T[0] + s1 * T[1] x[1] = s1 * T[0] - s0 * T[1] + SMULL r9, r10, r8, r12 @ (r9,r10) = s1 * T[1] CMP r1, r7 - SMLAL r9, r10, r6, r14 ; (r9,r10) += s0 * T[0] - RSB r6, r6, #0 ; r6 = -s0 - SMULL r9, r11, r8, r14 ; (r9,r11) = s1 * T[0] + SMLAL r9, r10, r6, r14 @ (r9,r10) += s0 * T[0] + RSB r6, r6, #0 @ r6 = -s0 + SMULL r9, r11, r8, r14 @ (r9,r11) = s1 * T[0] MOV r10,r10,LSL #1 - SMLAL r9, r11, r6, r12 ; (r9,r11) -= s0 * T[1] + SMLAL r9, r11, r6, r12 @ (r9,r11) -= s0 * T[1] STR r10,[r1,#-8] MOV r11,r11,LSL #1 STR r11,[r1,#-4] BLT mdct_step8_default -mdct_end +mdct_end: MOV r0, r2 LDMFD r13!,{r4-r11,PC} -cPI1_8 - DCD 0x7641af3d -cPI2_8 - DCD 0x5a82799a -cPI3_8 - DCD 0x30fbc54d -bitrev - DCB 0 - DCB 32 - DCB 16 - DCB 48 - DCB 8 - DCB 40 - DCB 24 - DCB 56 - DCB 4 - DCB 36 - DCB 20 - DCB 52 - DCB 12 - DCB 44 - DCB 28 - DCB 60 - DCB 2 - DCB 34 - DCB 18 - DCB 50 - DCB 10 - DCB 42 - DCB 26 - DCB 58 - DCB 
6 - DCB 38 - DCB 22 - DCB 54 - DCB 14 - DCB 46 - DCB 30 - DCB 62 - DCB 1 - DCB 33 - DCB 17 - DCB 49 - DCB 9 - DCB 41 - DCB 25 - DCB 57 - DCB 5 - DCB 37 - DCB 21 - DCB 53 - DCB 13 - DCB 45 - DCB 29 - DCB 61 - DCB 3 - DCB 35 - DCB 19 - DCB 51 - DCB 11 - DCB 43 - DCB 27 - DCB 59 - DCB 7 - DCB 39 - DCB 23 - DCB 55 - DCB 15 - DCB 47 - DCB 31 - DCB 63 - - END +cPI1_8: + .word 0x7641af3d +cPI2_8: + .word 0x5a82799a +cPI3_8: + .word 0x30fbc54d +bitrev: + .byte 0 + .byte 32 + .byte 16 + .byte 48 + .byte 8 + .byte 40 + .byte 24 + .byte 56 + .byte 4 + .byte 36 + .byte 20 + .byte 52 + .byte 12 + .byte 44 + .byte 28 + .byte 60 + .byte 2 + .byte 34 + .byte 18 + .byte 50 + .byte 10 + .byte 42 + .byte 26 + .byte 58 + .byte 6 + .byte 38 + .byte 22 + .byte 54 + .byte 14 + .byte 46 + .byte 30 + .byte 62 + .byte 1 + .byte 33 + .byte 17 + .byte 49 + .byte 9 + .byte 41 + .byte 25 + .byte 57 + .byte 5 + .byte 37 + .byte 21 + .byte 53 + .byte 13 + .byte 45 + .byte 29 + .byte 61 + .byte 3 + .byte 35 + .byte 19 + .byte 51 + .byte 11 + .byte 43 + .byte 27 + .byte 59 + .byte 7 + .byte 39 + .byte 23 + .byte 55 + .byte 15 + .byte 47 + .byte 31 + .byte 63 + + @ END @@ -1,92 +1,92 @@ -; Tremolo library -; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd +@ Tremolo library +@ Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd - AREA |.text|, CODE, READONLY + .text - ; low accuracy version + @ low accuracy version - EXPORT mdct_backward_arm_low - EXPORT mdct_shift_right_arm_low - EXPORT mdct_unroll_prelap_arm_low - EXPORT mdct_unroll_part2_arm_low - EXPORT mdct_unroll_part3_arm_low - EXPORT mdct_unroll_postlap_arm_low + .global mdct_backward_arm_low + .global mdct_shift_right_arm_low + .global mdct_unroll_prelap_arm_low + .global mdct_unroll_part2_arm_low + .global mdct_unroll_part3_arm_low + .global mdct_unroll_postlap_arm_low - IMPORT sincos_lookup0 - IMPORT sincos_lookup1 + .extern sincos_lookup0 + .extern sincos_lookup1 -mdct_unroll_prelap_arm_low - ; r0 = out - ; r1 = 
post - ; r2 = r - ; r3 = step +mdct_unroll_prelap_arm_low: + @ r0 = out + @ r1 = post + @ r2 = r + @ r3 = step STMFD r13!,{r4-r7,r14} MVN r4, #0x8000 MOV r3, r3, LSL #1 - SUB r1, r2, r1 ; r1 = r - post - SUBS r1, r1, #16 ; r1 = r - post - 16 + SUB r1, r2, r1 @ r1 = r - post + SUBS r1, r1, #16 @ r1 = r - post - 16 BLT unroll_over -unroll_loop +unroll_loop: LDMDB r2!,{r5,r6,r7,r12} - MOV r5, r5, ASR #9 ; r5 = (*--r)>>9 - MOV r6, r6, ASR #9 ; r6 = (*--r)>>9 - MOV r7, r7, ASR #9 ; r7 = (*--r)>>9 - MOV r12,r12,ASR #9 ; r12= (*--r)>>9 + MOV r5, r5, ASR #9 @ r5 = (*--r)>>9 + MOV r6, r6, ASR #9 @ r6 = (*--r)>>9 + MOV r7, r7, ASR #9 @ r7 = (*--r)>>9 + MOV r12,r12,ASR #9 @ r12= (*--r)>>9 MOV r14,r12,ASR #15 - TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range EORNE r12,r4, r14,ASR #31 STRH r12,[r0], r3 MOV r14,r7, ASR #15 - TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range EORNE r7, r4, r14,ASR #31 STRH r7, [r0], r3 MOV r14,r6, ASR #15 - TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range EORNE r6, r4, r14,ASR #31 STRH r6, [r0], r3 MOV r14,r5, ASR #15 - TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range EORNE r5, r4, r14,ASR #31 STRH r5, [r0], r3 SUBS r1, r1, #16 BGE unroll_loop -unroll_over +unroll_over: ADDS r1, r1, #16 BLE unroll_end -unroll_loop2 +unroll_loop2: LDR r5,[r2,#-4]! 
- ; stall - ; stall (Xscale) - MOV r5, r5, ASR #9 ; r5 = (*--r)>>9 + @ stall + @ stall (Xscale) + MOV r5, r5, ASR #9 @ r5 = (*--r)>>9 MOV r14,r5, ASR #15 - TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range EORNE r5, r4, r14,ASR #31 STRH r5, [r0], r3 SUBS r1, r1, #4 BGT unroll_loop2 -unroll_end +unroll_end: LDMFD r13!,{r4-r7,PC} -mdct_unroll_postlap_arm_low - ; r0 = out - ; r1 = post - ; r2 = l - ; r3 = step +mdct_unroll_postlap_arm_low: + @ r0 = out + @ r1 = post + @ r2 = l + @ r3 = step STMFD r13!,{r4-r7,r14} MVN r4, #0x8000 MOV r3, r3, LSL #1 - SUB r1, r1, r2 ; r1 = post - l - MOV r1, r1, ASR #1 ; r1 = (post - l)>>1 - SUBS r1, r1, #16 ; r1 = ((post - l)>>1) - 4 + SUB r1, r1, r2 @ r1 = post - l + MOV r1, r1, ASR #1 @ r1 = (post - l)>>1 + SUBS r1, r1, #16 @ r1 = ((post - l)>>1) - 4 BLT unroll_over3 -unroll_loop3 +unroll_loop3: LDR r12,[r2],#8 LDR r7, [r2],#8 LDR r6, [r2],#8 @@ -97,145 +97,145 @@ unroll_loop3 RSB r6, r6, #0 RSB r7, r7, #0 - MOV r12, r12,ASR #9 ; r12= (-*l)>>9 - MOV r5, r5, ASR #9 ; r5 = (-*l)>>9 - MOV r6, r6, ASR #9 ; r6 = (-*l)>>9 - MOV r7, r7, ASR #9 ; r7 = (-*l)>>9 + MOV r12, r12,ASR #9 @ r12= (-*l)>>9 + MOV r5, r5, ASR #9 @ r5 = (-*l)>>9 + MOV r6, r6, ASR #9 @ r6 = (-*l)>>9 + MOV r7, r7, ASR #9 @ r7 = (-*l)>>9 MOV r14,r12,ASR #15 - TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range EORNE r12,r4, r14,ASR #31 STRH r12,[r0], r3 MOV r14,r7, ASR #15 - TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range EORNE r7, r4, r14,ASR #31 STRH r7, [r0], r3 MOV r14,r6, ASR #15 - TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range EORNE r6, r4, r14,ASR #31 STRH r6, [r0], r3 MOV r14,r5, ASR #15 - TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range 
EORNE r5, r4, r14,ASR #31 STRH r5, [r0], r3 SUBS r1, r1, #16 BGE unroll_loop3 -unroll_over3 +unroll_over3: ADDS r1, r1, #16 BLE unroll_over4 -unroll_loop4 +unroll_loop4: LDR r5,[r2], #8 - ; stall - ; stall (Xscale) + @ stall + @ stall (Xscale) RSB r5, r5, #0 - MOV r5, r5, ASR #9 ; r5 = (-*l)>>9 + MOV r5, r5, ASR #9 @ r5 = (-*l)>>9 MOV r14,r5, ASR #15 - TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range EORNE r5, r4, r14,ASR #31 STRH r5, [r0], r3 SUBS r1, r1, #4 BGT unroll_loop4 -unroll_over4 +unroll_over4: LDMFD r13!,{r4-r7,PC} -mdct_unroll_part2_arm_low - ; r0 = out - ; r1 = post - ; r2 = l - ; r3 = r - ; <> = step - ; <> = wL - ; <> = wR +mdct_unroll_part2_arm_low: + @ r0 = out + @ r1 = post + @ r2 = l + @ r3 = r + @ <> = step + @ <> = wL + @ <> = wR MOV r12,r13 STMFD r13!,{r4,r6-r11,r14} - LDMFD r12,{r8,r9,r10} ; r8 = step - ; r9 = wL - ; r10= wR + LDMFD r12,{r8,r9,r10} @ r8 = step + @ r9 = wL + @ r10= wR MVN r4, #0x8000 MOV r8, r8, LSL #1 - SUBS r1, r3, r1 ; r1 = (r - post) + SUBS r1, r3, r1 @ r1 = (r - post) BLE unroll_over5 -unroll_loop5 - LDR r12,[r2, #-8]! ; r12= *l (but l -= 2 first) - LDR r7, [r3, #-4]! ; r7 = *--r - LDRB r6, [r10,#-1]! ; r6 = *--wR - LDRB r11,[r9],#1 ; r11= *wL++ +unroll_loop5: + LDR r12,[r2, #-8]! @ r12= *l (but l -= 2 first) + LDR r7, [r3, #-4]! @ r7 = *--r + LDRB r6, [r10,#-1]! 
@ r6 = *--wR + LDRB r11,[r9],#1 @ r11= *wL++ MOV r12, r12, ASR #8 - ; Can save a cycle here, at the cost of 1bit errors in rounding - MUL r11,r12,r11 ; r11 = *l * *wL++ + @ Can save a cycle here, at the cost of 1bit errors in rounding + MUL r11,r12,r11 @ r11 = *l * *wL++ MOV r7, r7, ASR #8 - MLA r6, r7, r6, r11 ; r6 = *--r * *--wR + MLA r6, r7, r6, r11 @ r6 = *--r * *--wR MOV r6, r6, ASR #9 MOV r14,r6, ASR #15 - TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range EORNE r6, r4, r14,ASR #31 STRH r6, [r0], r8 SUBS r1, r1, #4 BGT unroll_loop5 -unroll_over5 +unroll_over5: LDMFD r13!,{r4,r6-r11,PC} -mdct_unroll_part3_arm_low - ; r0 = out - ; r1 = post - ; r2 = l - ; r3 = r - ; <> = step - ; <> = wL - ; <> = wR +mdct_unroll_part3_arm_low: + @ r0 = out + @ r1 = post + @ r2 = l + @ r3 = r + @ <> = step + @ <> = wL + @ <> = wR MOV r12,r13 STMFD r13!,{r4,r6-r11,r14} - LDMFD r12,{r8,r9,r10} ; r8 = step - ; r9 = wL - ; r10= wR + LDMFD r12,{r8,r9,r10} @ r8 = step + @ r9 = wL + @ r10= wR MVN r4, #0x8000 MOV r8, r8, LSL #1 - SUBS r1, r1, r3 ; r1 = (post - r) + SUBS r1, r1, r3 @ r1 = (post - r) BLE unroll_over6 -unroll_loop6 - LDR r12,[r2],#8 ; r12= *l (but l += 2 first) - LDR r7, [r3],#4 ; r7 = *r++ - LDRB r11,[r9],#1 ; r11= *wL++ - LDRB r6, [r10,#-1]! ; r6 = *--wR +unroll_loop6: + LDR r12,[r2],#8 @ r12= *l (but l += 2 first) + LDR r7, [r3],#4 @ r7 = *r++ + LDRB r11,[r9],#1 @ r11= *wL++ + LDRB r6, [r10,#-1]! 
@ r6 = *--wR - ; Can save a cycle here, at the cost of 1bit errors in rounding + @ Can save a cycle here, at the cost of 1bit errors in rounding MOV r12,r12,ASR #8 - MUL r11,r12,r11 ; (r14,r11) = *l * *wL++ + MUL r11,r12,r11 @ (r14,r11) = *l * *wL++ MOV r7, r7, ASR #8 - MUL r6, r7, r6 ; (r14,r6) = *--r * *--wR + MUL r6, r7, r6 @ (r14,r6) = *--r * *--wR SUB r6, r6, r11 MOV r6, r6, ASR #9 MOV r14,r6, ASR #15 - TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range EORNE r6, r4, r14,ASR #31 STRH r6, [r0], r8 SUBS r1, r1, #4 BGT unroll_loop6 -unroll_over6 +unroll_over6: LDMFD r13!,{r4,r6-r11,PC} -mdct_shift_right_arm_low - ; r0 = n - ; r1 = in - ; r2 = right +mdct_shift_right_arm_low: + @ r0 = n + @ r1 = in + @ r2 = right STMFD r13!,{r4-r11,r14} - MOV r0, r0, LSR #2 ; n >>= 2 + MOV r0, r0, LSR #2 @ n >>= 2 ADD r1, r1, #4 SUBS r0, r0, #8 BLT sr_less_than_8 -sr_loop +sr_loop: LDR r3, [r1], #8 LDR r4, [r1], #8 LDR r5, [r1], #8 @@ -247,225 +247,225 @@ sr_loop SUBS r0, r0, #8 STMIA r2!,{r3,r4,r5,r6,r7,r8,r12,r14} BGE sr_loop -sr_less_than_8 +sr_less_than_8: ADDS r0, r0, #8 BEQ sr_end -sr_loop2 +sr_loop2: LDR r3, [r1], #8 SUBS r0, r0, #1 STR r3, [r2], #4 BGT sr_loop2 -sr_end +sr_end: LDMFD r13!,{r4-r11,PC} -mdct_backward_arm_low - ; r0 = n - ; r1 = in +mdct_backward_arm_low: + @ r0 = n + @ r1 = in STMFD r13!,{r4-r11,r14} - MOV r2, #1<<4 ; r2 = 1<<shift - MOV r3, #13-4 ; r3 = 13-shift -find_shift_loop - TST r0, r2 ; if (n & (1<<shift)) == 0 + MOV r2, #1<<4 @ r2 = 1<<shift + MOV r3, #13-4 @ r3 = 13-shift +find_shift_loop: + TST r0, r2 @ if (n & (1<<shift)) == 0 MOV r2, r2, LSL #1 - SUBEQ r3, r3, #1 ; shift-- + SUBEQ r3, r3, #1 @ shift-- BEQ find_shift_loop MOV r2, #2 - MOV r2, r2, LSL r3 ; r2 = step = 2<<shift - - ; presymmetry - ; r0 = n (a multiple of 4) - ; r1 = in - ; r2 = step - ; r3 = shift - - ADD r4, r1, r0, LSL #1 ; r4 = aX = in+(n>>1) - ADD r14,r1, r0 ; r14= in+(n>>2) - SUB r4, r4, #3*4 ; r4 = aX = in+n2-3 
- LDR r5, =sincos_lookup0 ; r5 = T=sincos_lookup0 - -presymmetry_loop1 - LDR r7, [r4,#8] ; r6 = s2 = aX[2] - LDRB r11,[r5,#1] ; r11= T[1] - LDR r6, [r4],#-16 ; r6 = s0 = aX[0] - LDRB r10,[r5],r2 ; r10= T[0] T += step + MOV r2, r2, LSL r3 @ r2 = step = 2<<shift + + @ presymmetry + @ r0 = n (a multiple of 4) + @ r1 = in + @ r2 = step + @ r3 = shift + + ADD r4, r1, r0, LSL #1 @ r4 = aX = in+(n>>1) + ADD r14,r1, r0 @ r14= in+(n>>2) + SUB r4, r4, #3*4 @ r4 = aX = in+n2-3 + LDR r5, =sincos_lookup0 @ r5 = T=sincos_lookup0 + +presymmetry_loop1: + LDR r7, [r4,#8] @ r6 = s2 = aX[2] + LDRB r11,[r5,#1] @ r11= T[1] + LDR r6, [r4],#-16 @ r6 = s0 = aX[0] + LDRB r10,[r5],r2 @ r10= T[0] T += step MOV r6, r6, ASR #8 MOV r7, r7, ASR #8 - ; XPROD31(s0, s2, T[0], T[1], &aX[0], &ax[2]) - MUL r9, r6, r10 ; r9 = s0*T[0] + @ XPROD31(s0, s2, T[0], T[1], &aX[0], &ax[2]) + MUL r9, r6, r10 @ r9 = s0*T[0] RSB r6, r6, #0 - MLA r9, r7, r11,r9 ; r9 += s2*T[1] + MLA r9, r7, r11,r9 @ r9 += s2*T[1] CMP r4, r14 - MUL r12,r7, r10 ; r12 = s2*T[0] - STR r9, [r4,#16] ; aX[0] = r9 - MLA r12,r6, r11,r12 ; r12 -= s0*T[1] - STR r12,[r4,#8+16] ; aX[2] = r12 - - BGE presymmetry_loop1 ; while (aX >= in+n4) - -presymmetry_loop2 - LDR r6, [r4],#-16 ; r6 = s0 = aX[0] - LDRB r10,[r5,#1] ; r10= T[1] - LDR r7, [r4,#16+8] ; r6 = s2 = aX[2] - LDRB r11,[r5],-r2 ; r11= T[0] T -= step + MUL r12,r7, r10 @ r12 = s2*T[0] + STR r9, [r4,#16] @ aX[0] = r9 + MLA r12,r6, r11,r12 @ r12 -= s0*T[1] + STR r12,[r4,#8+16] @ aX[2] = r12 + + BGE presymmetry_loop1 @ while (aX >= in+n4) + +presymmetry_loop2: + LDR r6, [r4],#-16 @ r6 = s0 = aX[0] + LDRB r10,[r5,#1] @ r10= T[1] + LDR r7, [r4,#16+8] @ r6 = s2 = aX[2] + LDRB r11,[r5],-r2 @ r11= T[0] T -= step MOV r6, r6, ASR #8 MOV r7, r7, ASR #8 - ; XPROD31(s0, s2, T[1], T[0], &aX[0], &ax[2]) - MUL r9, r6, r10 ; r9 = s0*T[1] + @ XPROD31(s0, s2, T[1], T[0], &aX[0], &ax[2]) + MUL r9, r6, r10 @ r9 = s0*T[1] RSB r6, r6, #0 - MLA r9, r7, r11,r9 ; r9 += s2*T[0] + MLA r9, r7, r11,r9 @ r9 += s2*T[0] 
CMP r4, r1 - MUL r12,r7, r10 ; r12 = s2*T[1] - STR r9, [r4,#16] ; aX[0] = r9 - MLA r12,r6, r11,r12 ; r12 -= s0*T[0] - STR r12,[r4,#8+16] ; aX[2] = r12 + MUL r12,r7, r10 @ r12 = s2*T[1] + STR r9, [r4,#16] @ aX[0] = r9 + MLA r12,r6, r11,r12 @ r12 -= s0*T[0] + STR r12,[r4,#8+16] @ aX[2] = r12 - BGE presymmetry_loop2 ; while (aX >= in) + BGE presymmetry_loop2 @ while (aX >= in) - ; r0 = n - ; r1 = in - ; r2 = step - ; r3 = shift + @ r0 = n + @ r1 = in + @ r2 = step + @ r3 = shift STMFD r13!,{r3} - LDR r5, =sincos_lookup0 ; r5 = T=sincos_lookup0 - ADD r4, r1, r0, LSL #1 ; r4 = aX = in+(n>>1) - SUB r4, r4, #4*4 ; r4 = aX = in+(n>>1)-4 - LDRB r11,[r5,#1] ; r11= T[1] - LDRB r10,[r5],r2 ; r10= T[0] T += step -presymmetry_loop3 - LDR r8, [r1],#16 ; r8 = ro0 = bX[0] - LDR r9, [r1,#8-16] ; r9 = ro2 = bX[2] - LDR r6, [r4],#-16 ; r6 = ri0 = aX[0] - LDR r7, [r4,#8+16] ; r7 = ri2 = aX[2] + LDR r5, =sincos_lookup0 @ r5 = T=sincos_lookup0 + ADD r4, r1, r0, LSL #1 @ r4 = aX = in+(n>>1) + SUB r4, r4, #4*4 @ r4 = aX = in+(n>>1)-4 + LDRB r11,[r5,#1] @ r11= T[1] + LDRB r10,[r5],r2 @ r10= T[0] T += step +presymmetry_loop3: + LDR r8, [r1],#16 @ r8 = ro0 = bX[0] + LDR r9, [r1,#8-16] @ r9 = ro2 = bX[2] + LDR r6, [r4],#-16 @ r6 = ri0 = aX[0] + LDR r7, [r4,#8+16] @ r7 = ri2 = aX[2] MOV r8, r8, ASR #8 MOV r9, r9, ASR #8 MOV r6, r6, ASR #8 - ; XNPROD31( ro2, ro0, T[1], T[0], &aX[0], &aX[2] ) - ; aX[0] = (ro2*T[1] - ro0*T[0])>>31 aX[2] = (ro0*T[1] + ro2*T[0])>>31 - MUL r12,r8, r11 ; r12 = ro0*T[1] + @ XNPROD31( ro2, ro0, T[1], T[0], &aX[0], &aX[2] ) + @ aX[0] = (ro2*T[1] - ro0*T[0])>>31 aX[2] = (ro0*T[1] + ro2*T[0])>>31 + MUL r12,r8, r11 @ r12 = ro0*T[1] MOV r7, r7, ASR #8 - MLA r12,r9, r10,r12 ; r12 += ro2*T[0] - RSB r8, r8, #0 ; r8 = -ro0 - MUL r3, r9, r11 ; r3 = ro2*T[1] - LDRB r11,[r5,#1] ; r11= T[1] - MLA r3, r8, r10,r3 ; r3 -= ro0*T[0] - LDRB r10,[r5],r2 ; r10= T[0] T += step + MLA r12,r9, r10,r12 @ r12 += ro2*T[0] + RSB r8, r8, #0 @ r8 = -ro0 + MUL r3, r9, r11 @ r3 = ro2*T[1] + LDRB 
r11,[r5,#1] @ r11= T[1] + MLA r3, r8, r10,r3 @ r3 -= ro0*T[0] + LDRB r10,[r5],r2 @ r10= T[0] T += step STR r12,[r4,#16+8] STR r3, [r4,#16] - ; XNPROD31( ri2, ri0, T[0], T[1], &bX[0], &bX[2] ) - ; bX[0] = (ri2*T[0] - ri0*T[1])>>31 bX[2] = (ri0*T[0] + ri2*T[1])>>31 - MUL r12,r6, r10 ; r12 = ri0*T[0] - RSB r6, r6, #0 ; r6 = -ri0 - MLA r12,r7, r11,r12 ; r12 += ri2*T[1] + @ XNPROD31( ri2, ri0, T[0], T[1], &bX[0], &bX[2] ) + @ bX[0] = (ri2*T[0] - ri0*T[1])>>31 bX[2] = (ri0*T[0] + ri2*T[1])>>31 + MUL r12,r6, r10 @ r12 = ri0*T[0] + RSB r6, r6, #0 @ r6 = -ri0 + MLA r12,r7, r11,r12 @ r12 += ri2*T[1] CMP r4, r1 - MUL r3, r7, r10 ; r3 = ri2*T[0] + MUL r3, r7, r10 @ r3 = ri2*T[0] STR r12,[r1,#8-16] - MLA r3, r6, r11,r3 ; r3 -= ri0*T[1] + MLA r3, r6, r11,r3 @ r3 -= ri0*T[1] STR r3, [r1,#-16] BGE presymmetry_loop3 - SUB r1,r1,r0 ; r1 = in -= n>>2 (i.e. restore in) + SUB r1,r1,r0 @ r1 = in -= n>>2 (i.e. restore in) LDR r3,[r13] STR r2,[r13,#-4]! - ; mdct_butterflies - ; r0 = n = (points * 2) - ; r1 = in = x - ; r2 = i - ; r3 = shift + @ mdct_butterflies + @ r0 = n = (points * 2) + @ r1 = in = x + @ r2 = i + @ r3 = shift STMFD r13!,{r0-r1} - RSBS r4,r3,#6 ; r4 = stages = 7-shift then --stages + RSBS r4,r3,#6 @ r4 = stages = 7-shift then --stages LDR r5,=sincos_lookup0 BLE no_generics - MOV r14,#4 ; r14= 4 (i=0) - MOV r6, r14,LSL r3 ; r6 = (4<<i)<<shift -mdct_butterflies_loop1 - MOV r0, r0, LSR #1 ; r0 = points>>i = POINTS - MOV r2, r14,LSR #2 ; r2 = (1<<i)-j (j=0) + MOV r14,#4 @ r14= 4 (i=0) + MOV r6, r14,LSL r3 @ r6 = (4<<i)<<shift +mdct_butterflies_loop1: + MOV r0, r0, LSR #1 @ r0 = points>>i = POINTS + MOV r2, r14,LSR #2 @ r2 = (1<<i)-j (j=0) STMFD r13!,{r4,r14} -mdct_butterflies_loop2 - - ; mdct_butterfly_generic(x+POINTS*j, POINTS, 4<<(i+shift)) - ; mdct_butterfly_generic(r1, r0, r6) - ; r0 = points - ; r1 = x - ; preserve r2 (external loop counter) - ; preserve r3 - ; preserve r4 (external loop counter) - ; r5 = T = sincos_lookup0 - ; r6 = step - ; preserve r14 - - STR 
r2,[r13,#-4]! ; stack r2 - ADD r1,r1,r0,LSL #1 ; r1 = x2+4 = x + (POINTS>>1) - ADD r7,r1,r0,LSL #1 ; r7 = x1+4 = x + POINTS - ADD r12,r5,#1024 ; r12= sincos_lookup0+1024 - -mdct_bufferfly_generic_loop1 - LDMDB r7!,{r2,r3,r8,r11} ; r2 = x1[0] - ; r3 = x1[1] - ; r8 = x1[2] - ; r11= x1[3] x1 -= 4 - LDMDB r1!,{r4,r9,r10,r14} ; r4 = x2[0] - ; r9 = x2[1] - ; r10= x2[2] - ; r14= x2[3] x2 -= 4 - - SUB r2, r2, r3 ; r2 = s0 = x1[0] - x1[1] - ADD r3, r2, r3, LSL #1 ; r3 = x1[0] + x1[1] (-> x1[0]) - SUB r11,r11,r8 ; r11= s1 = x1[3] - x1[2] - ADD r8, r11,r8, LSL #1 ; r8 = x1[3] + x1[2] (-> x1[2]) - SUB r9, r9, r4 ; r9 = s2 = x2[1] - x2[0] - ADD r4, r9, r4, LSL #1 ; r4 = x2[1] + x2[0] (-> x1[1]) - SUB r14,r14,r10 ; r14= s3 = x2[3] - x2[2] - ADD r10,r14,r10,LSL #1 ; r10= x2[3] + x2[2] (-> x1[3]) +mdct_butterflies_loop2: + + @ mdct_butterfly_generic(x+POINTS*j, POINTS, 4<<(i+shift)) + @ mdct_butterfly_generic(r1, r0, r6) + @ r0 = points + @ r1 = x + @ preserve r2 (external loop counter) + @ preserve r3 + @ preserve r4 (external loop counter) + @ r5 = T = sincos_lookup0 + @ r6 = step + @ preserve r14 + + STR r2,[r13,#-4]! 
@ stack r2 + ADD r1,r1,r0,LSL #1 @ r1 = x2+4 = x + (POINTS>>1) + ADD r7,r1,r0,LSL #1 @ r7 = x1+4 = x + POINTS + ADD r12,r5,#1024 @ r12= sincos_lookup0+1024 + +mdct_bufferfly_generic_loop1: + LDMDB r7!,{r2,r3,r8,r11} @ r2 = x1[0] + @ r3 = x1[1] + @ r8 = x1[2] + @ r11= x1[3] x1 -= 4 + LDMDB r1!,{r4,r9,r10,r14} @ r4 = x2[0] + @ r9 = x2[1] + @ r10= x2[2] + @ r14= x2[3] x2 -= 4 + + SUB r2, r2, r3 @ r2 = s0 = x1[0] - x1[1] + ADD r3, r2, r3, LSL #1 @ r3 = x1[0] + x1[1] (-> x1[0]) + SUB r11,r11,r8 @ r11= s1 = x1[3] - x1[2] + ADD r8, r11,r8, LSL #1 @ r8 = x1[3] + x1[2] (-> x1[2]) + SUB r9, r9, r4 @ r9 = s2 = x2[1] - x2[0] + ADD r4, r9, r4, LSL #1 @ r4 = x2[1] + x2[0] (-> x1[1]) + SUB r14,r14,r10 @ r14= s3 = x2[3] - x2[2] + ADD r10,r14,r10,LSL #1 @ r10= x2[3] + x2[2] (-> x1[3]) STMIA r7,{r3,r4,r8,r10} - ; r0 = points - ; r1 = x2 - ; r2 = s0 - ; r3 free - ; r4 free - ; r5 = T - ; r6 = step - ; r7 = x1 - ; r8 free - ; r9 = s2 - ; r10 free - ; r11= s1 - ; r12= limit - ; r14= s3 - - LDRB r8, [r5,#1] ; r8 = T[1] - LDRB r10,[r5],r6 ; r10= T[0] T += step + @ r0 = points + @ r1 = x2 + @ r2 = s0 + @ r3 free + @ r4 free + @ r5 = T + @ r6 = step + @ r7 = x1 + @ r8 free + @ r9 = s2 + @ r10 free + @ r11= s1 + @ r12= limit + @ r14= s3 + + LDRB r8, [r5,#1] @ r8 = T[1] + LDRB r10,[r5],r6 @ r10= T[0] T += step MOV r2, r2, ASR #8 MOV r11,r11,ASR #8 MOV r9, r9, ASR #8 MOV r14,r14,ASR #8 - ; XPROD31(s1, s0, T[0], T[1], &x2[0], &x2[2]) - ; x2[0] = (s1*T[0] + s0*T[1])>>31 x2[2] = (s0*T[0] - s1*T[1])>>31 - ; stall Xscale - MUL r3, r2, r8 ; r3 = s0*T[1] - MLA r3, r11,r10,r3 ; r3 += s1*T[0] + @ XPROD31(s1, s0, T[0], T[1], &x2[0], &x2[2]) + @ x2[0] = (s1*T[0] + s0*T[1])>>31 x2[2] = (s0*T[0] - s1*T[1])>>31 + @ stall Xscale + MUL r3, r2, r8 @ r3 = s0*T[1] + MLA r3, r11,r10,r3 @ r3 += s1*T[0] RSB r11,r11,#0 - MUL r4, r8, r11 ; r4 = -s1*T[1] - MLA r4, r2, r10,r4 ; r4 += s0*T[0] = Value for x2[2] - MOV r2, r3 ; r2 = r3 = Value for x2[0] - - ; XPROD31(s2, s3, T[0], T[1], &x2[1], &x2[3]) - ; x2[1] = 
(s2*T[0] + s3*T[1])>>31 x2[3] = (s3*T[0] - s2*T[1])>>31 - MUL r3, r9, r10 ; r3 = s2*T[0] - MLA r3, r14,r8, r3 ; r3 += s3*T[1] = Value for x2[1] + MUL r4, r8, r11 @ r4 = -s1*T[1] + MLA r4, r2, r10,r4 @ r4 += s0*T[0] = Value for x2[2] + MOV r2, r3 @ r2 = r3 = Value for x2[0] + + @ XPROD31(s2, s3, T[0], T[1], &x2[1], &x2[3]) + @ x2[1] = (s2*T[0] + s3*T[1])>>31 x2[3] = (s3*T[0] - s2*T[1])>>31 + MUL r3, r9, r10 @ r3 = s2*T[0] + MLA r3, r14,r8, r3 @ r3 += s3*T[1] = Value for x2[1] RSB r9, r9, #0 - MUL r11,r14,r10 ; r11 = s3*T[0] - MLA r11,r9, r8, r11 ; r11 -= s2*T[1] = Value for x2[3] + MUL r11,r14,r10 @ r11 = s3*T[0] + MLA r11,r9, r8, r11 @ r11 -= s2*T[1] = Value for x2[3] CMP r5, r12 STMIA r1,{r2,r3,r4,r11} @@ -473,472 +473,472 @@ mdct_bufferfly_generic_loop1 BLT mdct_bufferfly_generic_loop1 SUB r12,r12,#1024 -mdct_bufferfly_generic_loop2 - LDMDB r7!,{r2,r3,r9,r10} ; r2 = x1[0] - ; r3 = x1[1] - ; r9 = x1[2] - ; r10= x1[3] x1 -= 4 - LDMDB r1!,{r4,r8,r11,r14} ; r4 = x2[0] - ; r8 = x2[1] - ; r11= x2[2] - ; r14= x2[3] x2 -= 4 - - SUB r2, r2, r3 ; r2 = s0 = x1[0] - x1[1] - ADD r3, r2, r3, LSL #1 ; r3 = x1[0] + x1[1] (-> x1[0]) - SUB r9, r9,r10 ; r9 = s1 = x1[2] - x1[3] - ADD r10,r9,r10, LSL #1 ; r10= x1[2] + x1[3] (-> x1[2]) - SUB r4, r4, r8 ; r4 = s2 = x2[0] - x2[1] - ADD r8, r4, r8, LSL #1 ; r8 = x2[0] + x2[1] (-> x1[1]) - SUB r14,r14,r11 ; r14= s3 = x2[3] - x2[2] - ADD r11,r14,r11,LSL #1 ; r11= x2[3] + x2[2] (-> x1[3]) +mdct_bufferfly_generic_loop2: + LDMDB r7!,{r2,r3,r9,r10} @ r2 = x1[0] + @ r3 = x1[1] + @ r9 = x1[2] + @ r10= x1[3] x1 -= 4 + LDMDB r1!,{r4,r8,r11,r14} @ r4 = x2[0] + @ r8 = x2[1] + @ r11= x2[2] + @ r14= x2[3] x2 -= 4 + + SUB r2, r2, r3 @ r2 = s0 = x1[0] - x1[1] + ADD r3, r2, r3, LSL #1 @ r3 = x1[0] + x1[1] (-> x1[0]) + SUB r9, r9,r10 @ r9 = s1 = x1[2] - x1[3] + ADD r10,r9,r10, LSL #1 @ r10= x1[2] + x1[3] (-> x1[2]) + SUB r4, r4, r8 @ r4 = s2 = x2[0] - x2[1] + ADD r8, r4, r8, LSL #1 @ r8 = x2[0] + x2[1] (-> x1[1]) + SUB r14,r14,r11 @ r14= s3 = x2[3] - 
x2[2] + ADD r11,r14,r11,LSL #1 @ r11= x2[3] + x2[2] (-> x1[3]) STMIA r7,{r3,r8,r10,r11} - ; r0 = points - ; r1 = x2 - ; r2 = s0 - ; r3 free - ; r4 = s2 - ; r5 = T - ; r6 = step - ; r7 = x1 - ; r8 free - ; r9 = s1 - ; r10 free - ; r11 free - ; r12= limit - ; r14= s3 - - LDRB r8, [r5,#1] ; r8 = T[1] - LDRB r10,[r5],-r6 ; r10= T[0] T -= step + @ r0 = points + @ r1 = x2 + @ r2 = s0 + @ r3 free + @ r4 = s2 + @ r5 = T + @ r6 = step + @ r7 = x1 + @ r8 free + @ r9 = s1 + @ r10 free + @ r11 free + @ r12= limit + @ r14= s3 + + LDRB r8, [r5,#1] @ r8 = T[1] + LDRB r10,[r5],-r6 @ r10= T[0] T -= step MOV r2, r2, ASR #8 MOV r9, r9, ASR #8 MOV r4, r4, ASR #8 MOV r14,r14,ASR #8 - ; XNPROD31(s0, s1, T[0], T[1], &x2[0], &x2[2]) - ; x2[0] = (s0*T[0] - s1*T[1])>>31 x2[2] = (s1*T[0] + s0*T[1])>>31 - ; stall Xscale - MUL r11,r2, r8 ; r11 = s0*T[1] - MLA r11,r9, r10,r11 ; r11 += s1*T[0] + @ XNPROD31(s0, s1, T[0], T[1], &x2[0], &x2[2]) + @ x2[0] = (s0*T[0] - s1*T[1])>>31 x2[2] = (s1*T[0] + s0*T[1])>>31 + @ stall Xscale + MUL r11,r2, r8 @ r11 = s0*T[1] + MLA r11,r9, r10,r11 @ r11 += s1*T[0] RSB r9, r9, #0 - MUL r2, r10,r2 ; r2 = s0*T[0] - MLA r2, r9, r8, r2 ; r2 += -s1*T[1] = Value for x2[0] - MOV r9, r11 ; r9 = r11 = Value for x2[2] - - ; XNPROD31(s3, s2, T[0], T[1], &x2[1], &x2[3]) - ; x2[1] = (s3*T[0] - s2*T[1])>>31 x2[3] = (s2*T[0] + s3*T[1])>>31 - MUL r11,r4, r10 ; r11 = s2*T[0] - MLA r11,r14,r8, r11 ; r11 += s3*T[1] = Value for x2[3] + MUL r2, r10,r2 @ r2 = s0*T[0] + MLA r2, r9, r8, r2 @ r2 += -s1*T[1] = Value for x2[0] + MOV r9, r11 @ r9 = r11 = Value for x2[2] + + @ XNPROD31(s3, s2, T[0], T[1], &x2[1], &x2[3]) + @ x2[1] = (s3*T[0] - s2*T[1])>>31 x2[3] = (s2*T[0] + s3*T[1])>>31 + MUL r11,r4, r10 @ r11 = s2*T[0] + MLA r11,r14,r8, r11 @ r11 += s3*T[1] = Value for x2[3] RSB r4, r4, #0 - MUL r3, r14,r10 ; r3 = s3*T[0] - MLA r3, r4, r8, r3 ; r3 -= s2*T[1] = Value for x2[1] + MUL r3, r14,r10 @ r3 = s3*T[0] + MLA r3, r4, r8, r3 @ r3 -= s2*T[1] = Value for x2[1] CMP r5, r12 STMIA 
r1,{r2,r3,r9,r11} BGT mdct_bufferfly_generic_loop2 - LDR r2,[r13],#4 ; unstack r2 - ADD r1, r1, r0, LSL #2 ; r1 = x+POINTS*j - ; stall Xscale - SUBS r2, r2, #1 ; r2-- (j++) + LDR r2,[r13],#4 @ unstack r2 + ADD r1, r1, r0, LSL #2 @ r1 = x+POINTS*j + @ stall Xscale + SUBS r2, r2, #1 @ r2-- (j++) BGT mdct_butterflies_loop2 LDMFD r13!,{r4,r14} LDR r1,[r13,#4] - SUBS r4, r4, #1 ; stages-- - MOV r14,r14,LSL #1 ; r14= 4<<i (i++) - MOV r6, r6, LSL #1 ; r6 = step <<= 1 (i++) + SUBS r4, r4, #1 @ stages-- + MOV r14,r14,LSL #1 @ r14= 4<<i (i++) + MOV r6, r6, LSL #1 @ r6 = step <<= 1 (i++) BGE mdct_butterflies_loop1 LDMFD r13,{r0-r1} -no_generics - ; mdct_butterflies part2 (loop around mdct_bufferfly_32) - ; r0 = points - ; r1 = in - ; r2 = step - ; r3 = shift - -mdct_bufferflies_loop3 - ; mdct_bufferfly_32 - - ; block1 - ADD r4, r1, #16*4 ; r4 = &in[16] - LDMIA r4,{r5,r6,r9,r10} ; r5 = x[16] - ; r6 = x[17] - ; r9 = x[18] - ; r10= x[19] - LDMIA r1,{r7,r8,r11,r12} ; r7 = x[0] - ; r8 = x[1] - ; r11= x[2] - ; r12= x[3] - SUB r5, r5, r6 ; r5 = s0 = x[16] - x[17] - ADD r6, r5, r6, LSL #1 ; r6 = x[16] + x[17] -> x[16] - SUB r9, r9, r10 ; r9 = s1 = x[18] - x[19] - ADD r10,r9, r10,LSL #1 ; r10= x[18] + x[19] -> x[18] - SUB r8, r8, r7 ; r8 = s2 = x[ 1] - x[ 0] - ADD r7, r8, r7, LSL #1 ; r7 = x[ 1] + x[ 0] -> x[17] - SUB r12,r12,r11 ; r12= s3 = x[ 3] - x[ 2] - ADD r11,r12,r11, LSL #1 ; r11= x[ 3] + x[ 2] -> x[19] +no_generics: + @ mdct_butterflies part2 (loop around mdct_bufferfly_32) + @ r0 = points + @ r1 = in + @ r2 = step + @ r3 = shift + +mdct_bufferflies_loop3: + @ mdct_bufferfly_32 + + @ block1 + ADD r4, r1, #16*4 @ r4 = &in[16] + LDMIA r4,{r5,r6,r9,r10} @ r5 = x[16] + @ r6 = x[17] + @ r9 = x[18] + @ r10= x[19] + LDMIA r1,{r7,r8,r11,r12} @ r7 = x[0] + @ r8 = x[1] + @ r11= x[2] + @ r12= x[3] + SUB r5, r5, r6 @ r5 = s0 = x[16] - x[17] + ADD r6, r5, r6, LSL #1 @ r6 = x[16] + x[17] -> x[16] + SUB r9, r9, r10 @ r9 = s1 = x[18] - x[19] + ADD r10,r9, r10,LSL #1 @ r10= x[18] + x[19] -> 
x[18] + SUB r8, r8, r7 @ r8 = s2 = x[ 1] - x[ 0] + ADD r7, r8, r7, LSL #1 @ r7 = x[ 1] + x[ 0] -> x[17] + SUB r12,r12,r11 @ r12= s3 = x[ 3] - x[ 2] + ADD r11,r12,r11, LSL #1 @ r11= x[ 3] + x[ 2] -> x[19] STMIA r4!,{r6,r7,r10,r11} - MOV r6,#0xed ; r6 =cPI1_8 - MOV r7,#0x62 ; r7 =cPI3_8 + MOV r6,#0xed @ r6 =cPI1_8 + MOV r7,#0x62 @ r7 =cPI3_8 MOV r5, r5, ASR #8 MOV r9, r9, ASR #8 MOV r8, r8, ASR #8 MOV r12,r12,ASR #8 - ; XNPROD31( s0, s1, cPI3_8, cPI1_8, &x[ 0], &x[ 2] ) - ; x[0] = s0*cPI3_8 - s1*cPI1_8 x[2] = s1*cPI3_8 + s0*cPI1_8 - ; stall Xscale - MUL r11,r5, r6 ; r11 = s0*cPI1_8 - MLA r11,r9, r7, r11 ; r11 += s1*cPI3_8 + @ XNPROD31( s0, s1, cPI3_8, cPI1_8, &x[ 0], &x[ 2] ) + @ x[0] = s0*cPI3_8 - s1*cPI1_8 x[2] = s1*cPI3_8 + s0*cPI1_8 + @ stall Xscale + MUL r11,r5, r6 @ r11 = s0*cPI1_8 + MLA r11,r9, r7, r11 @ r11 += s1*cPI3_8 RSB r9, r9, #0 - MUL r5, r7, r5 ; r5 = s0*cPI3_8 - MLA r5, r9, r6, r5 ; r5 -= s1*cPI1_8 + MUL r5, r7, r5 @ r5 = s0*cPI3_8 + MLA r5, r9, r6, r5 @ r5 -= s1*cPI1_8 - ; XPROD31 ( s2, s3, cPI1_8, cPI3_8, &x[ 1], &x[ 3] ) - ; x[1] = s2*cPI1_8 + s3*cPI3_8 x[3] = s3*cPI1_8 - s2*cPI3_8 - MUL r9, r8, r6 ; r9 = s2*cPI1_8 - MLA r9, r12,r7, r9 ; r9 += s3*cPI3_8 + @ XPROD31 ( s2, s3, cPI1_8, cPI3_8, &x[ 1], &x[ 3] ) + @ x[1] = s2*cPI1_8 + s3*cPI3_8 x[3] = s3*cPI1_8 - s2*cPI3_8 + MUL r9, r8, r6 @ r9 = s2*cPI1_8 + MLA r9, r12,r7, r9 @ r9 += s3*cPI3_8 RSB r8,r8,#0 - MUL r12,r6, r12 ; r12 = s3*cPI1_8 - MLA r12,r8, r7, r12 ; r12 -= s2*cPI3_8 + MUL r12,r6, r12 @ r12 = s3*cPI1_8 + MLA r12,r8, r7, r12 @ r12 -= s2*cPI3_8 STMIA r1!,{r5,r9,r11,r12} - ; block2 - LDMIA r4,{r5,r6,r9,r10} ; r5 = x[20] - ; r6 = x[21] - ; r9 = x[22] - ; r10= x[23] - LDMIA r1,{r7,r8,r11,r12} ; r7 = x[4] - ; r8 = x[5] - ; r11= x[6] - ; r12= x[7] - SUB r5, r5, r6 ; r5 = s0 = x[20] - x[21] - ADD r6, r5, r6, LSL #1 ; r6 = x[20] + x[21] -> x[20] - SUB r9, r9, r10 ; r9 = s1 = x[22] - x[23] - ADD r10,r9, r10,LSL #1 ; r10= x[22] + x[23] -> x[22] - SUB r8, r8, r7 ; r8 = s2 = x[ 5] - x[ 4] - ADD r7, 
r8, r7, LSL #1 ; r7 = x[ 5] + x[ 4] -> x[21] - SUB r12,r12,r11 ; r12= s3 = x[ 7] - x[ 6] - ADD r11,r12,r11, LSL #1 ; r11= x[ 7] + x[ 6] -> x[23] - MOV r14,#0xb5 ; cPI2_8 + @ block2 + LDMIA r4,{r5,r6,r9,r10} @ r5 = x[20] + @ r6 = x[21] + @ r9 = x[22] + @ r10= x[23] + LDMIA r1,{r7,r8,r11,r12} @ r7 = x[4] + @ r8 = x[5] + @ r11= x[6] + @ r12= x[7] + SUB r5, r5, r6 @ r5 = s0 = x[20] - x[21] + ADD r6, r5, r6, LSL #1 @ r6 = x[20] + x[21] -> x[20] + SUB r9, r9, r10 @ r9 = s1 = x[22] - x[23] + ADD r10,r9, r10,LSL #1 @ r10= x[22] + x[23] -> x[22] + SUB r8, r8, r7 @ r8 = s2 = x[ 5] - x[ 4] + ADD r7, r8, r7, LSL #1 @ r7 = x[ 5] + x[ 4] -> x[21] + SUB r12,r12,r11 @ r12= s3 = x[ 7] - x[ 6] + ADD r11,r12,r11, LSL #1 @ r11= x[ 7] + x[ 6] -> x[23] + MOV r14,#0xb5 @ cPI2_8 STMIA r4!,{r6,r7,r10,r11} - SUB r5, r5, r9 ; r5 = s0 - s1 - ADD r9, r5, r9, LSL #1 ; r9 = s0 + s1 + SUB r5, r5, r9 @ r5 = s0 - s1 + ADD r9, r5, r9, LSL #1 @ r9 = s0 + s1 MOV r5, r5, ASR #8 - MUL r5, r14,r5 ; r5 = (s0-s1)*cPI2_8 - SUB r12,r12,r8 ; r12= s3 - s2 - ADD r8, r12,r8, LSL #1 ; r8 = s3 + s2 + MUL r5, r14,r5 @ r5 = (s0-s1)*cPI2_8 + SUB r12,r12,r8 @ r12= s3 - s2 + ADD r8, r12,r8, LSL #1 @ r8 = s3 + s2 MOV r8, r8, ASR #8 - MUL r8, r14,r8 ; r8 = (s3+s2)*cPI2_8 + MUL r8, r14,r8 @ r8 = (s3+s2)*cPI2_8 MOV r9, r9, ASR #8 - MUL r9, r14,r9 ; r9 = (s0+s1)*cPI2_8 + MUL r9, r14,r9 @ r9 = (s0+s1)*cPI2_8 MOV r12,r12,ASR #8 - MUL r12,r14,r12 ; r12 = (s3-s2)*cPI2_8 + MUL r12,r14,r12 @ r12 = (s3-s2)*cPI2_8 STMIA r1!,{r5,r8,r9,r12} - ; block3 - LDMIA r4,{r5,r6,r9,r10} ; r5 = x[24] - ; r6 = x[25] - ; r9 = x[25] - ; r10= x[26] - LDMIA r1,{r7,r8,r11,r12} ; r7 = x[8] - ; r8 = x[9] - ; r11= x[10] - ; r12= x[11] - SUB r5, r5, r6 ; r5 = s0 = x[24] - x[25] - ADD r6, r5, r6, LSL #1 ; r6 = x[24] + x[25] -> x[25] - SUB r9, r9, r10 ; r9 = s1 = x[26] - x[27] - ADD r10,r9, r10,LSL #1 ; r10= x[26] + x[27] -> x[26] - SUB r8, r8, r7 ; r8 = s2 = x[ 9] - x[ 8] - ADD r7, r8, r7, LSL #1 ; r7 = x[ 9] + x[ 8] -> x[25] - SUB r12,r12,r11 ; r12= s3 = 
x[11] - x[10] - ADD r11,r12,r11, LSL #1 ; r11= x[11] + x[10] -> x[27] + @ block3 + LDMIA r4,{r5,r6,r9,r10} @ r5 = x[24] + @ r6 = x[25] + @ r9 = x[25] + @ r10= x[26] + LDMIA r1,{r7,r8,r11,r12} @ r7 = x[8] + @ r8 = x[9] + @ r11= x[10] + @ r12= x[11] + SUB r5, r5, r6 @ r5 = s0 = x[24] - x[25] + ADD r6, r5, r6, LSL #1 @ r6 = x[24] + x[25] -> x[25] + SUB r9, r9, r10 @ r9 = s1 = x[26] - x[27] + ADD r10,r9, r10,LSL #1 @ r10= x[26] + x[27] -> x[26] + SUB r8, r8, r7 @ r8 = s2 = x[ 9] - x[ 8] + ADD r7, r8, r7, LSL #1 @ r7 = x[ 9] + x[ 8] -> x[25] + SUB r12,r12,r11 @ r12= s3 = x[11] - x[10] + ADD r11,r12,r11, LSL #1 @ r11= x[11] + x[10] -> x[27] STMIA r4!,{r6,r7,r10,r11} - MOV r6,#0x62 ; r6 = cPI3_8 - MOV r7,#0xED ; r7 = cPI1_8 + MOV r6,#0x62 @ r6 = cPI3_8 + MOV r7,#0xED @ r7 = cPI1_8 - ; XNPROD31( s0, s1, cPI1_8, cPI3_8, &x[ 8], &x[10] ) - ; x[8] = s0*cPI1_8 - s1*cPI3_8 x[10] = s1*cPI1_8 + s0*cPI3_8 - ; stall Xscale + @ XNPROD31( s0, s1, cPI1_8, cPI3_8, &x[ 8], &x[10] ) + @ x[8] = s0*cPI1_8 - s1*cPI3_8 x[10] = s1*cPI1_8 + s0*cPI3_8 + @ stall Xscale MOV r5, r5, ASR #8 - MUL r11,r5, r6 ; r11 = s0*cPI3_8 + MUL r11,r5, r6 @ r11 = s0*cPI3_8 MOV r9, r9, ASR #8 - MLA r11,r9, r7, r11 ; r11 += s1*cPI1_8 + MLA r11,r9, r7, r11 @ r11 += s1*cPI1_8 RSB r9, r9, #0 - MUL r5, r7, r5 ; r5 = s0*cPI1_8 - MLA r5, r9, r6, r5 ; r5 -= s1*cPI3_8 + MUL r5, r7, r5 @ r5 = s0*cPI1_8 + MLA r5, r9, r6, r5 @ r5 -= s1*cPI3_8 - ; XPROD31 ( s2, s3, cPI3_8, cPI1_8, &x[ 9], &x[11] ) - ; x[9] = s2*cPI3_8 + s3*cPI1_8 x[11] = s3*cPI3_8 - s2*cPI1_8 + @ XPROD31 ( s2, s3, cPI3_8, cPI1_8, &x[ 9], &x[11] ) + @ x[9] = s2*cPI3_8 + s3*cPI1_8 x[11] = s3*cPI3_8 - s2*cPI1_8 MOV r8, r8, ASR #8 - MUL r9, r8, r6 ; r9 = s2*cPI3_8 + MUL r9, r8, r6 @ r9 = s2*cPI3_8 MOV r12,r12,ASR #8 - MLA r9, r12,r7, r9 ; r9 += s3*cPI1_8 + MLA r9, r12,r7, r9 @ r9 += s3*cPI1_8 RSB r8,r8,#0 - MUL r12,r6, r12 ; r12 = s3*cPI3_8 - MLA r12,r8, r7, r12 ; r12 -= s2*cPI1_8 + MUL r12,r6, r12 @ r12 = s3*cPI3_8 + MLA r12,r8, r7, r12 @ r12 -= s2*cPI1_8 STMIA 
r1!,{r5,r9,r11,r12} - ; block4 - LDMIA r4,{r5,r6,r10,r11} ; r5 = x[28] - ; r6 = x[29] - ; r10= x[30] - ; r11= x[31] - LDMIA r1,{r8,r9,r12,r14} ; r8 = x[12] - ; r9 = x[13] - ; r12= x[14] - ; r14= x[15] - SUB r5, r5, r6 ; r5 = s0 = x[28] - x[29] - ADD r6, r5, r6, LSL #1 ; r6 = x[28] + x[29] -> x[28] - SUB r7, r14,r12 ; r7 = s3 = x[15] - x[14] - ADD r12,r7, r12, LSL #1 ; r12= x[15] + x[14] -> x[31] - SUB r10,r10,r11 ; r10= s1 = x[30] - x[31] - ADD r11,r10,r11,LSL #1 ; r11= x[30] + x[31] -> x[30] - SUB r14, r8, r9 ; r14= s2 = x[12] - x[13] - ADD r9, r14, r9, LSL #1 ; r9 = x[12] + x[13] -> x[29] + @ block4 + LDMIA r4,{r5,r6,r10,r11} @ r5 = x[28] + @ r6 = x[29] + @ r10= x[30] + @ r11= x[31] + LDMIA r1,{r8,r9,r12,r14} @ r8 = x[12] + @ r9 = x[13] + @ r12= x[14] + @ r14= x[15] + SUB r5, r5, r6 @ r5 = s0 = x[28] - x[29] + ADD r6, r5, r6, LSL #1 @ r6 = x[28] + x[29] -> x[28] + SUB r7, r14,r12 @ r7 = s3 = x[15] - x[14] + ADD r12,r7, r12, LSL #1 @ r12= x[15] + x[14] -> x[31] + SUB r10,r10,r11 @ r10= s1 = x[30] - x[31] + ADD r11,r10,r11,LSL #1 @ r11= x[30] + x[31] -> x[30] + SUB r14, r8, r9 @ r14= s2 = x[12] - x[13] + ADD r9, r14, r9, LSL #1 @ r9 = x[12] + x[13] -> x[29] STMIA r4!,{r6,r9,r11,r12} STMIA r1!,{r5,r7,r10,r14} - ; mdct_butterfly16 (1st version) - ; block 1 + @ mdct_butterfly16 (1st version) + @ block 1 SUB r1,r1,#16*4 ADD r4,r1,#8*4 - LDMIA r4,{r5,r6,r9,r10} ; r5 = x[ 8] - ; r6 = x[ 9] - ; r9 = x[10] - ; r10= x[11] - LDMIA r1,{r7,r8,r11,r12} ; r7 = x[0] - ; r8 = x[1] - ; r11= x[2] - ; r12= x[3] - SUB r5, r5, r6 ; r5 = s0 = x[ 8] - x[ 9] - ADD r6, r5, r6, LSL #1 ; r6 = x[ 8] + x[ 9] -> x[ 8] - SUB r9, r9, r10 ; r9 = s1 = x[10] - x[11] - ADD r10,r9, r10,LSL #1 ; r10= x[10] + x[11] -> x[10] - SUB r8, r8, r7 ; r8 = s2 = x[ 1] - x[ 0] - ADD r7, r8, r7, LSL #1 ; r7 = x[ 1] + x[ 0] -> x[ 9] - SUB r12,r12,r11 ; r12= s3 = x[ 3] - x[ 2] - ADD r11,r12,r11, LSL #1 ; r11= x[ 3] + x[ 2] -> x[11] - MOV r14,#0xB5 ; r14= cPI2_8 + LDMIA r4,{r5,r6,r9,r10} @ r5 = x[ 8] + @ r6 = x[ 9] + 
@ r9 = x[10] + @ r10= x[11] + LDMIA r1,{r7,r8,r11,r12} @ r7 = x[0] + @ r8 = x[1] + @ r11= x[2] + @ r12= x[3] + SUB r5, r5, r6 @ r5 = s0 = x[ 8] - x[ 9] + ADD r6, r5, r6, LSL #1 @ r6 = x[ 8] + x[ 9] -> x[ 8] + SUB r9, r9, r10 @ r9 = s1 = x[10] - x[11] + ADD r10,r9, r10,LSL #1 @ r10= x[10] + x[11] -> x[10] + SUB r8, r8, r7 @ r8 = s2 = x[ 1] - x[ 0] + ADD r7, r8, r7, LSL #1 @ r7 = x[ 1] + x[ 0] -> x[ 9] + SUB r12,r12,r11 @ r12= s3 = x[ 3] - x[ 2] + ADD r11,r12,r11, LSL #1 @ r11= x[ 3] + x[ 2] -> x[11] + MOV r14,#0xB5 @ r14= cPI2_8 STMIA r4!,{r6,r7,r10,r11} - SUB r5, r5, r9 ; r5 = s0 - s1 - ADD r9, r5, r9, LSL #1 ; r9 = s0 + s1 + SUB r5, r5, r9 @ r5 = s0 - s1 + ADD r9, r5, r9, LSL #1 @ r9 = s0 + s1 MOV r5, r5, ASR #8 - MUL r5, r14,r5 ; r5 = (s0-s1)*cPI2_8 - SUB r12,r12,r8 ; r12= s3 - s2 - ADD r8, r12,r8, LSL #1 ; r8 = s3 + s2 + MUL r5, r14,r5 @ r5 = (s0-s1)*cPI2_8 + SUB r12,r12,r8 @ r12= s3 - s2 + ADD r8, r12,r8, LSL #1 @ r8 = s3 + s2 MOV r8, r8, ASR #8 - MUL r8, r14,r8 ; r8 = (s3+s2)*cPI2_8 + MUL r8, r14,r8 @ r8 = (s3+s2)*cPI2_8 MOV r9, r9, ASR #8 - MUL r9, r14,r9 ; r9 = (s0+s1)*cPI2_8 + MUL r9, r14,r9 @ r9 = (s0+s1)*cPI2_8 MOV r12,r12,ASR #8 - MUL r12,r14,r12 ; r12 = (s3-s2)*cPI2_8 + MUL r12,r14,r12 @ r12 = (s3-s2)*cPI2_8 STMIA r1!,{r5,r8,r9,r12} - ; block2 - LDMIA r4,{r5,r6,r9,r10} ; r5 = x[12] - ; r6 = x[13] - ; r9 = x[14] - ; r10= x[15] - LDMIA r1,{r7,r8,r11,r12} ; r7 = x[ 4] - ; r8 = x[ 5] - ; r11= x[ 6] - ; r12= x[ 7] - SUB r14,r7, r8 ; r14= s0 = x[ 4] - x[ 5] - ADD r8, r14,r8, LSL #1 ; r8 = x[ 4] + x[ 5] -> x[13] - SUB r7, r12,r11 ; r7 = s1 = x[ 7] - x[ 6] - ADD r11,r7, r11, LSL #1 ; r11= x[ 7] + x[ 6] -> x[15] - SUB r5, r5, r6 ; r5 = s2 = x[12] - x[13] - ADD r6, r5, r6, LSL #1 ; r6 = x[12] + x[13] -> x[12] - SUB r12,r9, r10 ; r12= s3 = x[14] - x[15] - ADD r10,r12,r10,LSL #1 ; r10= x[14] + x[15] -> x[14] + @ block2 + LDMIA r4,{r5,r6,r9,r10} @ r5 = x[12] + @ r6 = x[13] + @ r9 = x[14] + @ r10= x[15] + LDMIA r1,{r7,r8,r11,r12} @ r7 = x[ 4] + @ r8 = x[ 5] + @ r11= 
x[ 6] + @ r12= x[ 7] + SUB r14,r7, r8 @ r14= s0 = x[ 4] - x[ 5] + ADD r8, r14,r8, LSL #1 @ r8 = x[ 4] + x[ 5] -> x[13] + SUB r7, r12,r11 @ r7 = s1 = x[ 7] - x[ 6] + ADD r11,r7, r11, LSL #1 @ r11= x[ 7] + x[ 6] -> x[15] + SUB r5, r5, r6 @ r5 = s2 = x[12] - x[13] + ADD r6, r5, r6, LSL #1 @ r6 = x[12] + x[13] -> x[12] + SUB r12,r9, r10 @ r12= s3 = x[14] - x[15] + ADD r10,r12,r10,LSL #1 @ r10= x[14] + x[15] -> x[14] STMIA r4!,{r6,r8,r10,r11} STMIA r1!,{r5,r7,r12,r14} - ; mdct_butterfly_8 + @ mdct_butterfly_8 LDMDB r1,{r6,r7,r8,r9,r10,r11,r12,r14} - ; r6 = x[0] - ; r7 = x[1] - ; r8 = x[2] - ; r9 = x[3] - ; r10= x[4] - ; r11= x[5] - ; r12= x[6] - ; r14= x[7] - ADD r6, r6, r7 ; r6 = s0 = x[0] + x[1] - SUB r7, r6, r7, LSL #1 ; r7 = s1 = x[0] - x[1] - ADD r8, r8, r9 ; r8 = s2 = x[2] + x[3] - SUB r9, r8, r9, LSL #1 ; r9 = s3 = x[2] - x[3] - ADD r10,r10,r11 ; r10= s4 = x[4] + x[5] - SUB r11,r10,r11,LSL #1 ; r11= s5 = x[4] - x[5] - ADD r12,r12,r14 ; r12= s6 = x[6] + x[7] - SUB r14,r12,r14,LSL #1 ; r14= s7 = x[6] - x[7] - - ADD r2, r11,r9 ; r2 = x[0] = s5 + s3 - SUB r4, r2, r9, LSL #1 ; r4 = x[2] = s5 - s3 - SUB r3, r14,r7 ; r3 = x[1] = s7 - s1 - ADD r5, r3, r7, LSL #1 ; r5 = x[3] = s7 + s1 - SUB r10,r10,r6 ; r10= x[4] = s4 - s0 - SUB r11,r12,r8 ; r11= x[5] = s6 - s2 - ADD r12,r10,r6, LSL #1 ; r12= x[6] = s4 + s0 - ADD r14,r11,r8, LSL #1 ; r14= x[7] = s6 + s2 + @ r6 = x[0] + @ r7 = x[1] + @ r8 = x[2] + @ r9 = x[3] + @ r10= x[4] + @ r11= x[5] + @ r12= x[6] + @ r14= x[7] + ADD r6, r6, r7 @ r6 = s0 = x[0] + x[1] + SUB r7, r6, r7, LSL #1 @ r7 = s1 = x[0] - x[1] + ADD r8, r8, r9 @ r8 = s2 = x[2] + x[3] + SUB r9, r8, r9, LSL #1 @ r9 = s3 = x[2] - x[3] + ADD r10,r10,r11 @ r10= s4 = x[4] + x[5] + SUB r11,r10,r11,LSL #1 @ r11= s5 = x[4] - x[5] + ADD r12,r12,r14 @ r12= s6 = x[6] + x[7] + SUB r14,r12,r14,LSL #1 @ r14= s7 = x[6] - x[7] + + ADD r2, r11,r9 @ r2 = x[0] = s5 + s3 + SUB r4, r2, r9, LSL #1 @ r4 = x[2] = s5 - s3 + SUB r3, r14,r7 @ r3 = x[1] = s7 - s1 + ADD r5, r3, r7, LSL #1 @ r5 
= x[3] = s7 + s1 + SUB r10,r10,r6 @ r10= x[4] = s4 - s0 + SUB r11,r12,r8 @ r11= x[5] = s6 - s2 + ADD r12,r10,r6, LSL #1 @ r12= x[6] = s4 + s0 + ADD r14,r11,r8, LSL #1 @ r14= x[7] = s6 + s2 STMDB r1,{r2,r3,r4,r5,r10,r11,r12,r14} - ; mdct_butterfly_8 + @ mdct_butterfly_8 LDMIA r1,{r6,r7,r8,r9,r10,r11,r12,r14} - ; r6 = x[0] - ; r7 = x[1] - ; r8 = x[2] - ; r9 = x[3] - ; r10= x[4] - ; r11= x[5] - ; r12= x[6] - ; r14= x[7] - ADD r6, r6, r7 ; r6 = s0 = x[0] + x[1] - SUB r7, r6, r7, LSL #1 ; r7 = s1 = x[0] - x[1] - ADD r8, r8, r9 ; r8 = s2 = x[2] + x[3] - SUB r9, r8, r9, LSL #1 ; r9 = s3 = x[2] - x[3] - ADD r10,r10,r11 ; r10= s4 = x[4] + x[5] - SUB r11,r10,r11,LSL #1 ; r11= s5 = x[4] - x[5] - ADD r12,r12,r14 ; r12= s6 = x[6] + x[7] - SUB r14,r12,r14,LSL #1 ; r14= s7 = x[6] - x[7] - - ADD r2, r11,r9 ; r2 = x[0] = s5 + s3 - SUB r4, r2, r9, LSL #1 ; r4 = x[2] = s5 - s3 - SUB r3, r14,r7 ; r3 = x[1] = s7 - s1 - ADD r5, r3, r7, LSL #1 ; r5 = x[3] = s7 + s1 - SUB r10,r10,r6 ; r10= x[4] = s4 - s0 - SUB r11,r12,r8 ; r11= x[5] = s6 - s2 - ADD r12,r10,r6, LSL #1 ; r12= x[6] = s4 + s0 - ADD r14,r11,r8, LSL #1 ; r14= x[7] = s6 + s2 + @ r6 = x[0] + @ r7 = x[1] + @ r8 = x[2] + @ r9 = x[3] + @ r10= x[4] + @ r11= x[5] + @ r12= x[6] + @ r14= x[7] + ADD r6, r6, r7 @ r6 = s0 = x[0] + x[1] + SUB r7, r6, r7, LSL #1 @ r7 = s1 = x[0] - x[1] + ADD r8, r8, r9 @ r8 = s2 = x[2] + x[3] + SUB r9, r8, r9, LSL #1 @ r9 = s3 = x[2] - x[3] + ADD r10,r10,r11 @ r10= s4 = x[4] + x[5] + SUB r11,r10,r11,LSL #1 @ r11= s5 = x[4] - x[5] + ADD r12,r12,r14 @ r12= s6 = x[6] + x[7] + SUB r14,r12,r14,LSL #1 @ r14= s7 = x[6] - x[7] + + ADD r2, r11,r9 @ r2 = x[0] = s5 + s3 + SUB r4, r2, r9, LSL #1 @ r4 = x[2] = s5 - s3 + SUB r3, r14,r7 @ r3 = x[1] = s7 - s1 + ADD r5, r3, r7, LSL #1 @ r5 = x[3] = s7 + s1 + SUB r10,r10,r6 @ r10= x[4] = s4 - s0 + SUB r11,r12,r8 @ r11= x[5] = s6 - s2 + ADD r12,r10,r6, LSL #1 @ r12= x[6] = s4 + s0 + ADD r14,r11,r8, LSL #1 @ r14= x[7] = s6 + s2 STMIA r1,{r2,r3,r4,r5,r10,r11,r12,r14} - ; 
mdct_butterfly16 (2nd version) - ; block 1 + @ mdct_butterfly16 (2nd version) + @ block 1 ADD r1,r1,#16*4-8*4 ADD r4,r1,#8*4 - LDMIA r4,{r5,r6,r9,r10} ; r5 = x[ 8] - ; r6 = x[ 9] - ; r9 = x[10] - ; r10= x[11] - LDMIA r1,{r7,r8,r11,r12} ; r7 = x[0] - ; r8 = x[1] - ; r11= x[2] - ; r12= x[3] - SUB r5, r5, r6 ; r5 = s0 = x[ 8] - x[ 9] - ADD r6, r5, r6, LSL #1 ; r6 = x[ 8] + x[ 9] -> x[ 8] - SUB r9, r9, r10 ; r9 = s1 = x[10] - x[11] - ADD r10,r9, r10,LSL #1 ; r10= x[10] + x[11] -> x[10] - SUB r8, r8, r7 ; r8 = s2 = x[ 1] - x[ 0] - ADD r7, r8, r7, LSL #1 ; r7 = x[ 1] + x[ 0] -> x[ 9] - SUB r12,r12,r11 ; r12= s3 = x[ 3] - x[ 2] - ADD r11,r12,r11, LSL #1 ; r11= x[ 3] + x[ 2] -> x[11] - MOV r14,#0xb5 ; r14= cPI2_8 + LDMIA r4,{r5,r6,r9,r10} @ r5 = x[ 8] + @ r6 = x[ 9] + @ r9 = x[10] + @ r10= x[11] + LDMIA r1,{r7,r8,r11,r12} @ r7 = x[0] + @ r8 = x[1] + @ r11= x[2] + @ r12= x[3] + SUB r5, r5, r6 @ r5 = s0 = x[ 8] - x[ 9] + ADD r6, r5, r6, LSL #1 @ r6 = x[ 8] + x[ 9] -> x[ 8] + SUB r9, r9, r10 @ r9 = s1 = x[10] - x[11] + ADD r10,r9, r10,LSL #1 @ r10= x[10] + x[11] -> x[10] + SUB r8, r8, r7 @ r8 = s2 = x[ 1] - x[ 0] + ADD r7, r8, r7, LSL #1 @ r7 = x[ 1] + x[ 0] -> x[ 9] + SUB r12,r12,r11 @ r12= s3 = x[ 3] - x[ 2] + ADD r11,r12,r11, LSL #1 @ r11= x[ 3] + x[ 2] -> x[11] + MOV r14,#0xb5 @ r14= cPI2_8 STMIA r4!,{r6,r7,r10,r11} - SUB r5, r5, r9 ; r5 = s0 - s1 - ADD r9, r5, r9, LSL #1 ; r9 = s0 + s1 + SUB r5, r5, r9 @ r5 = s0 - s1 + ADD r9, r5, r9, LSL #1 @ r9 = s0 + s1 MOV r5, r5, ASR #8 - MUL r5, r14,r5 ; r5 = (s0-s1)*cPI2_8 - SUB r12,r12,r8 ; r12= s3 - s2 - ADD r8, r12,r8, LSL #1 ; r8 = s3 + s2 + MUL r5, r14,r5 @ r5 = (s0-s1)*cPI2_8 + SUB r12,r12,r8 @ r12= s3 - s2 + ADD r8, r12,r8, LSL #1 @ r8 = s3 + s2 MOV r8, r8, ASR #8 - MUL r8, r14,r8 ; r8 = (s3+s2)*cPI2_8 + MUL r8, r14,r8 @ r8 = (s3+s2)*cPI2_8 MOV r9, r9, ASR #8 - MUL r9, r14,r9 ; r9 = (s0+s1)*cPI2_8 + MUL r9, r14,r9 @ r9 = (s0+s1)*cPI2_8 MOV r12,r12,ASR #8 - MUL r12,r14,r12 ; r12 = (s3-s2)*cPI2_8 + MUL r12,r14,r12 @ r12 = 
(s3-s2)*cPI2_8 STMIA r1!,{r5,r8,r9,r12} - ; block2 - LDMIA r4,{r5,r6,r9,r10} ; r5 = x[12] - ; r6 = x[13] - ; r9 = x[14] - ; r10= x[15] - LDMIA r1,{r7,r8,r11,r12} ; r7 = x[ 4] - ; r8 = x[ 5] - ; r11= x[ 6] - ; r12= x[ 7] - SUB r5, r5, r6 ; r5 = s2 = x[12] - x[13] - ADD r6, r5, r6, LSL #1 ; r6 = x[12] + x[13] -> x[12] - SUB r9, r9, r10 ; r9 = s3 = x[14] - x[15] - ADD r10,r9, r10,LSL #1 ; r10= x[14] + x[15] -> x[14] - SUB r14,r7, r8 ; r14= s0 = x[ 4] - x[ 5] - ADD r8, r14,r8, LSL #1 ; r8 = x[ 4] + x[ 5] -> x[13] - SUB r7, r12,r11 ; r7 = s1 = x[ 7] - x[ 6] - ADD r11,r7, r11, LSL #1 ; r11= x[ 7] + x[ 6] -> x[15] + @ block2 + LDMIA r4,{r5,r6,r9,r10} @ r5 = x[12] + @ r6 = x[13] + @ r9 = x[14] + @ r10= x[15] + LDMIA r1,{r7,r8,r11,r12} @ r7 = x[ 4] + @ r8 = x[ 5] + @ r11= x[ 6] + @ r12= x[ 7] + SUB r5, r5, r6 @ r5 = s2 = x[12] - x[13] + ADD r6, r5, r6, LSL #1 @ r6 = x[12] + x[13] -> x[12] + SUB r9, r9, r10 @ r9 = s3 = x[14] - x[15] + ADD r10,r9, r10,LSL #1 @ r10= x[14] + x[15] -> x[14] + SUB r14,r7, r8 @ r14= s0 = x[ 4] - x[ 5] + ADD r8, r14,r8, LSL #1 @ r8 = x[ 4] + x[ 5] -> x[13] + SUB r7, r12,r11 @ r7 = s1 = x[ 7] - x[ 6] + ADD r11,r7, r11, LSL #1 @ r11= x[ 7] + x[ 6] -> x[15] STMIA r4!,{r6,r8,r10,r11} STMIA r1!,{r5,r7,r9,r14} - ; mdct_butterfly_8 + @ mdct_butterfly_8 LDMDB r1,{r6,r7,r8,r9,r10,r11,r12,r14} - ; r6 = x[0] - ; r7 = x[1] - ; r8 = x[2] - ; r9 = x[3] - ; r10= x[4] - ; r11= x[5] - ; r12= x[6] - ; r14= x[7] - ADD r6, r6, r7 ; r6 = s0 = x[0] + x[1] - SUB r7, r6, r7, LSL #1 ; r7 = s1 = x[0] - x[1] - ADD r8, r8, r9 ; r8 = s2 = x[2] + x[3] - SUB r9, r8, r9, LSL #1 ; r9 = s3 = x[2] - x[3] - ADD r10,r10,r11 ; r10= s4 = x[4] + x[5] - SUB r11,r10,r11,LSL #1 ; r11= s5 = x[4] - x[5] - ADD r12,r12,r14 ; r12= s6 = x[6] + x[7] - SUB r14,r12,r14,LSL #1 ; r14= s7 = x[6] - x[7] - - ADD r2, r11,r9 ; r2 = x[0] = s5 + s3 - SUB r4, r2, r9, LSL #1 ; r4 = x[2] = s5 - s3 - SUB r3, r14,r7 ; r3 = x[1] = s7 - s1 - ADD r5, r3, r7, LSL #1 ; r5 = x[3] = s7 + s1 - SUB r10,r10,r6 ; r10= x[4] 
= s4 - s0 - SUB r11,r12,r8 ; r11= x[5] = s6 - s2 - ADD r12,r10,r6, LSL #1 ; r12= x[6] = s4 + s0 - ADD r14,r11,r8, LSL #1 ; r14= x[7] = s6 + s2 + @ r6 = x[0] + @ r7 = x[1] + @ r8 = x[2] + @ r9 = x[3] + @ r10= x[4] + @ r11= x[5] + @ r12= x[6] + @ r14= x[7] + ADD r6, r6, r7 @ r6 = s0 = x[0] + x[1] + SUB r7, r6, r7, LSL #1 @ r7 = s1 = x[0] - x[1] + ADD r8, r8, r9 @ r8 = s2 = x[2] + x[3] + SUB r9, r8, r9, LSL #1 @ r9 = s3 = x[2] - x[3] + ADD r10,r10,r11 @ r10= s4 = x[4] + x[5] + SUB r11,r10,r11,LSL #1 @ r11= s5 = x[4] - x[5] + ADD r12,r12,r14 @ r12= s6 = x[6] + x[7] + SUB r14,r12,r14,LSL #1 @ r14= s7 = x[6] - x[7] + + ADD r2, r11,r9 @ r2 = x[0] = s5 + s3 + SUB r4, r2, r9, LSL #1 @ r4 = x[2] = s5 - s3 + SUB r3, r14,r7 @ r3 = x[1] = s7 - s1 + ADD r5, r3, r7, LSL #1 @ r5 = x[3] = s7 + s1 + SUB r10,r10,r6 @ r10= x[4] = s4 - s0 + SUB r11,r12,r8 @ r11= x[5] = s6 - s2 + ADD r12,r10,r6, LSL #1 @ r12= x[6] = s4 + s0 + ADD r14,r11,r8, LSL #1 @ r14= x[7] = s6 + s2 STMDB r1,{r2,r3,r4,r5,r10,r11,r12,r14} - ; mdct_butterfly_8 + @ mdct_butterfly_8 LDMIA r1,{r6,r7,r8,r9,r10,r11,r12,r14} - ; r6 = x[0] - ; r7 = x[1] - ; r8 = x[2] - ; r9 = x[3] - ; r10= x[4] - ; r11= x[5] - ; r12= x[6] - ; r14= x[7] - ADD r6, r6, r7 ; r6 = s0 = x[0] + x[1] - SUB r7, r6, r7, LSL #1 ; r7 = s1 = x[0] - x[1] - ADD r8, r8, r9 ; r8 = s2 = x[2] + x[3] - SUB r9, r8, r9, LSL #1 ; r9 = s3 = x[2] - x[3] - ADD r10,r10,r11 ; r10= s4 = x[4] + x[5] - SUB r11,r10,r11,LSL #1 ; r11= s5 = x[4] - x[5] - ADD r12,r12,r14 ; r12= s6 = x[6] + x[7] - SUB r14,r12,r14,LSL #1 ; r14= s7 = x[6] - x[7] - - ADD r2, r11,r9 ; r2 = x[0] = s5 + s3 - SUB r4, r2, r9, LSL #1 ; r4 = x[2] = s5 - s3 - SUB r3, r14,r7 ; r3 = x[1] = s7 - s1 - ADD r5, r3, r7, LSL #1 ; r5 = x[3] = s7 + s1 - SUB r10,r10,r6 ; r10= x[4] = s4 - s0 - SUB r11,r12,r8 ; r11= x[5] = s6 - s2 - ADD r12,r10,r6, LSL #1 ; r12= x[6] = s4 + s0 - ADD r14,r11,r8, LSL #1 ; r14= x[7] = s6 + s2 + @ r6 = x[0] + @ r7 = x[1] + @ r8 = x[2] + @ r9 = x[3] + @ r10= x[4] + @ r11= x[5] + @ r12= 
x[6] + @ r14= x[7] + ADD r6, r6, r7 @ r6 = s0 = x[0] + x[1] + SUB r7, r6, r7, LSL #1 @ r7 = s1 = x[0] - x[1] + ADD r8, r8, r9 @ r8 = s2 = x[2] + x[3] + SUB r9, r8, r9, LSL #1 @ r9 = s3 = x[2] - x[3] + ADD r10,r10,r11 @ r10= s4 = x[4] + x[5] + SUB r11,r10,r11,LSL #1 @ r11= s5 = x[4] - x[5] + ADD r12,r12,r14 @ r12= s6 = x[6] + x[7] + SUB r14,r12,r14,LSL #1 @ r14= s7 = x[6] - x[7] + + ADD r2, r11,r9 @ r2 = x[0] = s5 + s3 + SUB r4, r2, r9, LSL #1 @ r4 = x[2] = s5 - s3 + SUB r3, r14,r7 @ r3 = x[1] = s7 - s1 + ADD r5, r3, r7, LSL #1 @ r5 = x[3] = s7 + s1 + SUB r10,r10,r6 @ r10= x[4] = s4 - s0 + SUB r11,r12,r8 @ r11= x[5] = s6 - s2 + ADD r12,r10,r6, LSL #1 @ r12= x[6] = s4 + s0 + ADD r14,r11,r8, LSL #1 @ r14= x[7] = s6 + s2 STMIA r1,{r2,r3,r4,r5,r10,r11,r12,r14} ADD r1,r1,#8*4 @@ -947,117 +947,117 @@ mdct_bufferflies_loop3 LDMFD r13,{r0-r3} -mdct_bitreverse_arm_low - ; r0 = points - ; r1 = in - ; r2 = step - ; r3 = shift +mdct_bitreverse_arm_low: + @ r0 = points + @ r1 = in + @ r2 = step + @ r3 = shift - MOV r4, #0 ; r4 = bit = 0 - ADD r5, r1, r0, LSL #1 ; r5 = w = x + (n>>1) + MOV r4, #0 @ r4 = bit = 0 + ADD r5, r1, r0, LSL #1 @ r5 = w = x + (n>>1) ADR r6, bitrev SUB r5, r5, #8 -brev_lp +brev_lp: LDRB r7, [r6, r4, LSR #6] AND r8, r4, #0x3f LDRB r8, [r6, r8] - ADD r4, r4, #1 ; bit++ - ; stall XScale - ORR r7, r7, r8, LSL #6 ; r7 = bitrev[bit] + ADD r4, r4, #1 @ bit++ + @ stall XScale + ORR r7, r7, r8, LSL #6 @ r7 = bitrev[bit] MOV r7, r7, LSR r3 - ADD r9, r1, r7, LSL #2 ; r9 = xx = x + (b>>shift) - CMP r5, r9 ; if (w > xx) - LDR r10,[r5],#-8 ; r10 = w[0] w -= 2 - LDRGT r11,[r5,#12] ; r11 = w[1] - LDRGT r12,[r9] ; r12 = xx[0] - LDRGT r14,[r9,#4] ; r14 = xx[1] - STRGT r10,[r9] ; xx[0]= w[0] - STRGT r11,[r9,#4] ; xx[1]= w[1] - STRGT r12,[r5,#8] ; w[0] = xx[0] - STRGT r14,[r5,#12] ; w[1] = xx[1] + ADD r9, r1, r7, LSL #2 @ r9 = xx = x + (b>>shift) + CMP r5, r9 @ if (w > xx) + LDR r10,[r5],#-8 @ r10 = w[0] w -= 2 + LDRGT r11,[r5,#12] @ r11 = w[1] + LDRGT r12,[r9] @ r12 = xx[0] 
+ LDRGT r14,[r9,#4] @ r14 = xx[1] + STRGT r10,[r9] @ xx[0]= w[0] + STRGT r11,[r9,#4] @ xx[1]= w[1] + STRGT r12,[r5,#8] @ w[0] = xx[0] + STRGT r14,[r5,#12] @ w[1] = xx[1] CMP r5,r1 BGT brev_lp - ; mdct_step7 - ; r0 = points - ; r1 = in - ; r2 = step - ; r3 = shift - - CMP r2, #4 ; r5 = T = (step>=4) ? - LDRGE r5, =sincos_lookup0 ; sincos_lookup0 + - LDRLT r5, =sincos_lookup1 ; sincos_lookup0 + - ADD r7, r1, r0, LSL #1 ; r7 = w1 = x + (n>>1) - ADDGE r5, r5, r2, LSR #1 ; (step>>1) - ADD r8, r5, #1024 ; r8 = Ttop -step7_loop1 - LDR r6, [r1] ; r6 = w0[0] - LDR r9, [r1,#4] ; r9 = w0[1] - LDR r10,[r7,#-8]! ; r10= w1[0] w1 -= 2 - LDR r11,[r7,#4] ; r11= w1[1] - LDRB r14,[r5,#1] ; r14= T[1] - LDRB r12,[r5],r2 ; r12= T[0] T += step - - ADD r6, r6, r10 ; r6 = s0 = w0[0] + w1[0] - SUB r10,r6, r10,LSL #1 ; r10= s1b= w0[0] - w1[0] - SUB r11,r11,r9 ; r11= s1 = w1[1] - w0[1] - ADD r9, r11,r9, LSL #1 ; r9 = s0b= w1[1] + w0[1] + @ mdct_step7 + @ r0 = points + @ r1 = in + @ r2 = step + @ r3 = shift + + CMP r2, #4 @ r5 = T = (step>=4) ? + LDRGE r5, =sincos_lookup0 @ sincos_lookup0 + + LDRLT r5, =sincos_lookup1 @ sincos_lookup0 + + ADD r7, r1, r0, LSL #1 @ r7 = w1 = x + (n>>1) + ADDGE r5, r5, r2, LSR #1 @ (step>>1) + ADD r8, r5, #1024 @ r8 = Ttop +step7_loop1: + LDR r6, [r1] @ r6 = w0[0] + LDR r9, [r1,#4] @ r9 = w0[1] + LDR r10,[r7,#-8]! 
@ r10= w1[0] w1 -= 2 + LDR r11,[r7,#4] @ r11= w1[1] + LDRB r14,[r5,#1] @ r14= T[1] + LDRB r12,[r5],r2 @ r12= T[0] T += step + + ADD r6, r6, r10 @ r6 = s0 = w0[0] + w1[0] + SUB r10,r6, r10,LSL #1 @ r10= s1b= w0[0] - w1[0] + SUB r11,r11,r9 @ r11= s1 = w1[1] - w0[1] + ADD r9, r11,r9, LSL #1 @ r9 = s0b= w1[1] + w0[1] MOV r6, r6, ASR #9 - MUL r3, r6, r14 ; r3 = s0*T[1] + MUL r3, r6, r14 @ r3 = s0*T[1] MOV r11,r11,ASR #9 - MUL r4, r11,r12 ; r4 += s1*T[0] = s2 + MUL r4, r11,r12 @ r4 += s1*T[0] = s2 ADD r3, r3, r4 - MUL r14,r11,r14 ; r14 = s1*T[1] - MUL r12,r6, r12 ; r12 += s0*T[0] = s3 + MUL r14,r11,r14 @ r14 = s1*T[1] + MUL r12,r6, r12 @ r12 += s0*T[0] = s3 SUB r14,r14,r12 - ; r9 = s0b<<1 - ; r10= s1b<<1 - ADD r9, r3, r9, ASR #1 ; r9 = s0b + s2 - SUB r3, r9, r3, LSL #1 ; r3 = s0b - s2 + @ r9 = s0b<<1 + @ r10= s1b<<1 + ADD r9, r3, r9, ASR #1 @ r9 = s0b + s2 + SUB r3, r9, r3, LSL #1 @ r3 = s0b - s2 - SUB r12,r14,r10,ASR #1 ; r12= s3 - s1b - ADD r10,r14,r10,ASR #1 ; r10= s3 + s1b + SUB r12,r14,r10,ASR #1 @ r12= s3 - s1b + ADD r10,r14,r10,ASR #1 @ r10= s3 + s1b STR r9, [r1],#4 - STR r10,[r1],#4 ; w0 += 2 + STR r10,[r1],#4 @ w0 += 2 STR r3, [r7] STR r12,[r7,#4] CMP r5,r8 BLT step7_loop1 -step7_loop2 - LDR r6, [r1] ; r6 = w0[0] - LDR r9, [r1,#4] ; r9 = w0[1] - LDR r10,[r7,#-8]! ; r10= w1[0] w1 -= 2 - LDR r11,[r7,#4] ; r11= w1[1] - LDRB r14,[r5,-r2]! ; r12= T[1] T -= step - LDRB r12,[r5,#1] ; r14= T[0] +step7_loop2: + LDR r6, [r1] @ r6 = w0[0] + LDR r9, [r1,#4] @ r9 = w0[1] + LDR r10,[r7,#-8]! @ r10= w1[0] w1 -= 2 + LDR r11,[r7,#4] @ r11= w1[1] + LDRB r14,[r5,-r2]! 
@ r12= T[1] T -= step + LDRB r12,[r5,#1] @ r14= T[0] - ADD r6, r6, r10 ; r6 = s0 = w0[0] + w1[0] - SUB r10,r6, r10,LSL #1 ; r10= s1b= w0[0] - w1[0] - SUB r11,r11,r9 ; r11= s1 = w1[1] - w0[1] - ADD r9, r11,r9, LSL #1 ; r9 = s0b= w1[1] + w0[1] + ADD r6, r6, r10 @ r6 = s0 = w0[0] + w1[0] + SUB r10,r6, r10,LSL #1 @ r10= s1b= w0[0] - w1[0] + SUB r11,r11,r9 @ r11= s1 = w1[1] - w0[1] + ADD r9, r11,r9, LSL #1 @ r9 = s0b= w1[1] + w0[1] MOV r6, r6, ASR #9 - MUL r3, r6, r14 ; r3 = s0*T[0] + MUL r3, r6, r14 @ r3 = s0*T[0] MOV r11,r11,ASR #9 - MUL r4, r11,r12 ; r4 += s1*T[1] = s2 + MUL r4, r11,r12 @ r4 += s1*T[1] = s2 ADD r3, r3, r4 - MUL r14,r11,r14 ; r14 = s1*T[0] - MUL r12,r6, r12 ; r12 += s0*T[1] = s3 + MUL r14,r11,r14 @ r14 = s1*T[0] + MUL r12,r6, r12 @ r12 += s0*T[1] = s3 SUB r14,r14,r12 - ; r9 = s0b<<1 - ; r10= s1b<<1 - ADD r9, r3, r9, ASR #1 ; r9 = s0b + s2 - SUB r3, r9, r3, LSL #1 ; r3 = s0b - s2 + @ r9 = s0b<<1 + @ r10= s1b<<1 + ADD r9, r3, r9, ASR #1 @ r9 = s0b + s2 + SUB r3, r9, r3, LSL #1 @ r3 = s0b - s2 - SUB r12,r14,r10,ASR #1 ; r12= s3 - s1b - ADD r10,r14,r10,ASR #1 ; r10= s3 + s1b + SUB r12,r14,r10,ASR #1 @ r12= s3 - s1b + ADD r10,r14,r10,ASR #1 @ r10= s3 + s1b STR r9, [r1],#4 - STR r10,[r1],#4 ; w0 += 2 + STR r10,[r1],#4 @ w0 += 2 STR r3, [r7] STR r12,[r7,#4] @@ -1066,110 +1066,110 @@ step7_loop2 LDMFD r13!,{r0-r3} - ; r0 = points - ; r1 = in - ; r2 = step - ; r3 = shift - MOV r2, r2, ASR #2 ; r2 = step >>= 2 + @ r0 = points + @ r1 = in + @ r2 = step + @ r3 = shift + MOV r2, r2, ASR #2 @ r2 = step >>= 2 CMP r2, #0 CMPNE r2, #1 BEQ mdct_end - ; step > 1 (default case) - CMP r2, #4 ; r5 = T = (step>=4) ? 
- LDRGE r5, =sincos_lookup0 ; sincos_lookup0 + - LDRLT r5, =sincos_lookup1 ; sincos_lookup1 - ADD r7, r1, r0, LSL #1 ; r7 = iX = x + (n>>1) - ADDGE r5, r5, r2, LSR #1 ; (step>>1) -mdct_step8_default - LDR r6, [r1],#4 ; r6 = s0 = x[0] - LDR r8, [r1],#4 ; r8 = -s1 = x[1] - LDRB r12,[r5,#1] ; r12= T[1] - LDRB r14,[r5],r2 ; r14= T[0] T += step - RSB r8, r8, #0 ; r8 = s1 - - ; XPROD31(s0, s1, T[0], T[1], x, x+1) - ; x[0] = s0 * T[0] + s1 * T[1] x[1] = s1 * T[0] - s0 * T[1] + @ step > 1 (default case) + CMP r2, #4 @ r5 = T = (step>=4) ? + LDRGE r5, =sincos_lookup0 @ sincos_lookup0 + + LDRLT r5, =sincos_lookup1 @ sincos_lookup1 + ADD r7, r1, r0, LSL #1 @ r7 = iX = x + (n>>1) + ADDGE r5, r5, r2, LSR #1 @ (step>>1) +mdct_step8_default: + LDR r6, [r1],#4 @ r6 = s0 = x[0] + LDR r8, [r1],#4 @ r8 = -s1 = x[1] + LDRB r12,[r5,#1] @ r12= T[1] + LDRB r14,[r5],r2 @ r14= T[0] T += step + RSB r8, r8, #0 @ r8 = s1 + + @ XPROD31(s0, s1, T[0], T[1], x, x+1) + @ x[0] = s0 * T[0] + s1 * T[1] x[1] = s1 * T[0] - s0 * T[1] MOV r6, r6, ASR #8 MOV r8, r8, ASR #8 - MUL r10,r8, r12 ; r10 = s1 * T[1] + MUL r10,r8, r12 @ r10 = s1 * T[1] CMP r1, r7 - MLA r10,r6, r14,r10 ; r10 += s0 * T[0] - RSB r6, r6, #0 ; r6 = -s0 - MUL r11,r8, r14 ; r11 = s1 * T[0] - MLA r11,r6, r12,r11 ; r11 -= s0 * T[1] + MLA r10,r6, r14,r10 @ r10 += s0 * T[0] + RSB r6, r6, #0 @ r6 = -s0 + MUL r11,r8, r14 @ r11 = s1 * T[0] + MLA r11,r6, r12,r11 @ r11 -= s0 * T[1] STR r10,[r1,#-8] STR r11,[r1,#-4] BLT mdct_step8_default -mdct_end +mdct_end: MOV r0, r2 LDMFD r13!,{r4-r11,PC} -bitrev - DCB 0 - DCB 32 - DCB 16 - DCB 48 - DCB 8 - DCB 40 - DCB 24 - DCB 56 - DCB 4 - DCB 36 - DCB 20 - DCB 52 - DCB 12 - DCB 44 - DCB 28 - DCB 60 - DCB 2 - DCB 34 - DCB 18 - DCB 50 - DCB 10 - DCB 42 - DCB 26 - DCB 58 - DCB 6 - DCB 38 - DCB 22 - DCB 54 - DCB 14 - DCB 46 - DCB 30 - DCB 62 - DCB 1 - DCB 33 - DCB 17 - DCB 49 - DCB 9 - DCB 41 - DCB 25 - DCB 57 - DCB 5 - DCB 37 - DCB 21 - DCB 53 - DCB 13 - DCB 45 - DCB 29 - DCB 61 - DCB 3 - DCB 35 - DCB 19 - DCB 
51 - DCB 11 - DCB 43 - DCB 27 - DCB 59 - DCB 7 - DCB 39 - DCB 23 - DCB 55 - DCB 15 - DCB 47 - DCB 31 - DCB 63 - - END +bitrev: + .byte 0 + .byte 32 + .byte 16 + .byte 48 + .byte 8 + .byte 40 + .byte 24 + .byte 56 + .byte 4 + .byte 36 + .byte 20 + .byte 52 + .byte 12 + .byte 44 + .byte 28 + .byte 60 + .byte 2 + .byte 34 + .byte 18 + .byte 50 + .byte 10 + .byte 42 + .byte 26 + .byte 58 + .byte 6 + .byte 38 + .byte 22 + .byte 54 + .byte 14 + .byte 46 + .byte 30 + .byte 62 + .byte 1 + .byte 33 + .byte 17 + .byte 49 + .byte 9 + .byte 41 + .byte 25 + .byte 57 + .byte 5 + .byte 37 + .byte 21 + .byte 53 + .byte 13 + .byte 45 + .byte 29 + .byte 61 + .byte 3 + .byte 35 + .byte 19 + .byte 51 + .byte 11 + .byte 43 + .byte 27 + .byte 59 + .byte 7 + .byte 39 + .byte 23 + .byte 55 + .byte 15 + .byte 47 + .byte 31 + .byte 63 + + @ END diff --git a/testtremor.c b/testtremor.c new file mode 100644 index 0000000..734b084 --- /dev/null +++ b/testtremor.c @@ -0,0 +1,461 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggVorbis 'TREMOR' CODEC SOURCE CODE. * + * * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE OggVorbis 'TREMOR' SOURCE CODE IS (C) COPYRIGHT 1994-2002 * + * BY THE Xiph.Org FOUNDATION http://www.xiph.org/ * + * * + ******************************************************************** + + function: simple example decoder using vorbisidec + + ********************************************************************/ + +/* Takes a vorbis bitstream from stdin and writes raw stereo PCM to + stdout using vorbisfile. Using vorbisfile is much simpler than + dealing with libvorbis. 
*/ + +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include "ivorbiscodec.h" +#include "ivorbisfile.h" +#include "time.h" + +//#define PROFILE + +#ifdef _WIN32 /* We need the following two to set stdin/stdout to binary */ +#include <io.h> +#include <fcntl.h> +#include "windows.h" +#else +typedef int DWORD; +#endif + +char pcmout[4096]; /* take 4k out of the data segment, not the stack */ +char ref[4096]; /* take 4k out of the data segment, not the stack */ +char text[4096]; + +void Output(const char *fmt, ...) +{ + va_list ap; +#ifdef _WIN32_WCE + char *t = text; + WCHAR uni[4096]; + WCHAR *u = uni; + + va_start(ap,fmt); + vsprintf(text, fmt, ap); + va_end(ap); + + while (*t != 0) + { + *u++ = (WCHAR)(*t++); + } + *u++ = 0; + OutputDebugString(uni); +#else + va_start(ap,fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); +#endif +} + +typedef struct +{ + FILE *in; + FILE *out; + FILE *refin; + FILE *refout; + int max_samples; +} TestParams; + +static DWORD run_test(void *tp) +{ + TestParams *params = (TestParams *)tp; + FILE *in = params->in; + FILE *out = params->out; + FILE *refin = params->refin; + FILE *refout = params->refout; + int max_samples = params->max_samples; + OggVorbis_File vf; + int eof=0; + int current_section; + int maxdiff = 0; + int countdiffs = 0; + int samples = 0; + + if(ov_open(in, &vf, NULL, 0) < 0) { + Output("Input does not appear to be an Ogg bitstream.\n"); + exit(1); + } + + /* Throw the comments plus a few lines about the bitstream we're + decoding */ + { + char **ptr=ov_comment(&vf,-1)->user_comments; + vorbis_info *vi=ov_info(&vf,-1); + if (out != NULL) + { + while(*ptr){ + Output("%s\n",*ptr); + ++ptr; + } + Output("\nBitstream is %d channel, %ldHz\n",vi->channels,vi->rate); + Output("\nDecoded length: %ld samples\n", + (long)ov_pcm_total(&vf,-1)); + Output("Encoded by: %s\n\n",ov_comment(&vf,-1)->vendor); + } + } + + while((!eof) && (max_samples > 0)){ + long ret=ov_read(&vf,pcmout,sizeof(pcmout),¤t_section); + if 
(ret == 0) { + /* EOF */ + eof=1; + } else if (ret < 0) { + /* error in the stream. Not a problem, just reporting it in + case we (the app) cares. In this case, we don't. */ + } else { + /* we don't bother dealing with sample rate changes, etc, but + you'll have to*/ + if (out != NULL) + { + fwrite(pcmout,1,ret,out); + } + max_samples -= ret>>1; + if (refout != NULL) + { + fwrite(pcmout,1,ret,refout); + samples += ret>>1; + //Output("%d", samples); + } + if (refin != NULL) + { + int i, diff; + + fread(ref,1,ret,refin); + for (i=0; i<(ret>>1);i++) + { + diff = ((short *)pcmout)[i] - ((short *)ref)[i]; + if (diff != 0) + { + if (diff < 0) + diff = -diff; + if (diff > maxdiff) + maxdiff = diff; + countdiffs++; + if (countdiffs < 50) + { + Output("samples differ: %x vs %x\n", + ((unsigned short *)pcmout)[i], + ((unsigned short *)ref)[i]); + } + else if ((countdiffs % 100) == 0) + { + Output("%d differences, maximum = %d\n", + countdiffs, maxdiff); + } + } + } + } + } + } + + /* cleanup */ + ov_clear(&vf); + + return 0; +} + +#ifdef _WIN32 +static int filetimetoms(FILETIME *time) +{ + unsigned long long l; + + l = ((unsigned long long)time->dwLowDateTime) + (((unsigned long long)time->dwHighDateTime)<<32); + + return (int)(l/10000); +} + +char speedblock[32768]; +void speedtest() +{ + int readtime; + FILETIME userStartTime, userStopTime; + FILETIME kernelStartTime, kernelStopTime; + FILETIME exitStartTime, exitStopTime; + FILETIME creationStartTime, creationStopTime; + + Output("Speed test: STMIA speed\n"); + + GetThreadTimes(GetCurrentThread(), + &creationStartTime, + &exitStartTime, + &kernelStartTime, + &userStartTime); + stmiaTest(speedblock, 32768, 65536); + GetThreadTimes(GetCurrentThread(), + &creationStopTime, + &exitStopTime, + &kernelStopTime, + &userStopTime); + readtime = filetimetoms(&userStopTime)-filetimetoms(&userStartTime); + Output("Speed test complete: Timing=%g\n", + ((double)readtime)/1000); + + Output("Speed test: STR speed\n"); + + 
GetThreadTimes(GetCurrentThread(), + &creationStartTime, + &exitStartTime, + &kernelStartTime, + &userStartTime); + strTest(speedblock, 32768, 65536); + GetThreadTimes(GetCurrentThread(), + &creationStopTime, + &exitStopTime, + &kernelStopTime, + &userStopTime); + readtime = filetimetoms(&userStopTime)-filetimetoms(&userStartTime); + Output("Speed test complete: Timing=%g\n", + ((double)readtime)/1000); + + Output("Speed test: SMULL speed\n"); + + GetThreadTimes(GetCurrentThread(), + &creationStartTime, + &exitStartTime, + &kernelStartTime, + &userStartTime); + smullTest(speedblock, 32768, 65536); + GetThreadTimes(GetCurrentThread(), + &creationStopTime, + &exitStopTime, + &kernelStopTime, + &userStopTime); + readtime = filetimetoms(&userStopTime)-filetimetoms(&userStartTime); + Output("Speed test complete: Timing=%g\n", + ((double)readtime)/1000); +} +#endif + +int main(int argc, char *argv[]){ + FILE *in; + FILE *out = NULL; + FILE *refin = NULL; + FILE *refout = NULL; + int dectime, readtime; +#ifdef _WIN32 + FILETIME userStartTime, userStopTime; + FILETIME kernelStartTime, kernelStopTime; + FILETIME exitStartTime, exitStopTime; + FILETIME creationStartTime, creationStopTime; +#else + clock_t startTime, stopTime; +#endif + TestParams params; + + if (argc < 2) + { + Output("Syntax: testtremor <infile> [<outfile>]\n"); + exit(EXIT_FAILURE); + } + +#ifdef PROFILE + in = fopen(argv[1], "rb"); + if (in == NULL) + { + Output("Failed to open '%s' for input\n", argv[1]); + exit(EXIT_FAILURE); + } + + params.in = in; + params.out = NULL; + params.refin = NULL; + params.refout = NULL; + params.max_samples = 0x7FFFFFFF; + Profile_init(184000, 4); + run_test(¶ms); + Profile_dump(); +#else + in = fopen(argv[1], "rb"); + if (in == NULL) + { + Output("Failed to open '%s' for input\n", argv[1]); + exit(EXIT_FAILURE); + } + + if (argc >= 3) + { + out = fopen(argv[2], "wb"); + if (out == NULL) + { + Output("Failed to open '%s' for output\n", argv[2]); + exit(EXIT_FAILURE); + } + 
} + + if (argc >= 4) + { + refin = fopen(argv[3], "rb"); + if (refin == NULL) + { + Output("Can't find reference file. Creating instead.\n"); + refout = fopen(argv[3], "wb"); + if (refout == NULL) + { + Output("Failed to open '%s' as output reference file\n", argv[3]); + exit(EXIT_FAILURE); + } + } + } + + Output("First test: Decode correctness\n"); + params.in = in; + params.out = out; + params.refin = refin; + params.refout = refout; + params.max_samples = 0x7FFFFFFF;//1*1024*1024; + run_test(¶ms); + Output("First test complete\n"); + if (out != NULL) + fclose(out); + if (refin != NULL) + fclose(refin); + if (refout != NULL) + fclose(refout); + Output("Second test: Decode speed\n"); + in = fopen(argv[1], "rb"); + if (in == NULL) + { + Output("Failed to open '%s' for input\n", argv[1]); + exit(EXIT_FAILURE); + } +#ifdef _WIN32 + GetThreadTimes(GetCurrentThread(), + &creationStartTime, + &exitStartTime, + &kernelStartTime, + &userStartTime); +#else + startTime = clock(); +#endif + params.in = in; + params.out = NULL; + params.refin = NULL; + params.refout = NULL; + params.max_samples = 0x7FFFFFFF; + run_test(¶ms); +#ifdef _WIN32 + GetThreadTimes(GetCurrentThread(), + &creationStopTime, + &exitStopTime, + &kernelStopTime, + &userStopTime); + dectime = filetimetoms(&userStopTime)-filetimetoms(&userStartTime); +#else + stopTime = clock(); + dectime = stopTime-startTime; +#endif + Output("Second test complete: Timing=%g\n", + ((double)dectime)/1000); + Output("Third test: File read speed\n"); + + in = fopen(argv[1], "rb"); + if (in == NULL) + { + Output("Failed to open '%s' for input\n", argv[1]); + exit(EXIT_FAILURE); + } +#ifdef _WIN32 + GetThreadTimes(GetCurrentThread(), + &creationStartTime, + &exitStartTime, + &kernelStartTime, + &userStartTime); +#else + startTime = clock(); +#endif + while (!feof(in)) + { + fread(pcmout,1,4096,in); + } +#ifdef _WIN32 + GetThreadTimes(GetCurrentThread(), + &creationStopTime, + &exitStopTime, + &kernelStopTime, + &userStopTime); + 
readtime = filetimetoms(&userStopTime)-filetimetoms(&userStartTime); +#else + stopTime = clock(); + dectime = stopTime-startTime; +#endif + Output("Third test complete: Timing=%g\n", + ((double)readtime)/1000); + Output("Adjusted decode time: Timing=%g\n", + ((double)(dectime-readtime))/1000); +#endif + Output("Done.\n"); + return(0); +} + +#ifdef _WIN32_WCE + +#define TESTFILE 1 + +int WinMain(HINSTANCE h,HINSTANCE i,LPWSTR l,int n) +{ +#if TESTFILE == 9 + char *argv[] = { "testtremor", + "\\Storage Card\\Tremolo\\infile9.ogg", + "\\Storage Card\\Tremolo\\output9.pcm", +#ifdef _LOW_ACCURACY_ + "\\Storage Card\\Tremolo\\outputL9.ref", +#else + "\\Storage Card\\Tremolo\\output9.ref", +#endif /* _LOW_ACCURACY_ */ + NULL }; +#endif +#if TESTFILE == 2 + char *argv[] = { "testtremor", + "\\Storage Card\\Tremolo\\infile2.ogg", + "\\Storage Card\\Tremolo\\output2.pcm", +#ifdef _LOW_ACCURACY_ + "\\Storage Card\\Tremolo\\outputL2.ref", +#else + "\\Storage Card\\Tremolo\\output2.ref", +#endif /* _LOW_ACCURACY_ */ + NULL }; +#endif +#if TESTFILE == 0 + char *argv[] = { "testtremor", + "\\Storage Card\\Tremolo\\infile.ogg", + "\\Storage Card\\Tremolo\\output.pcm", +#ifdef _LOW_ACCURACY_ + "\\Storage Card\\Tremolo\\outputL.ref", +#else + "\\Storage Card\\Tremolo\\output.ref", +#endif /* _LOW_ACCURACY_ */ + NULL }; +#endif +#if TESTFILE == 1 + char *argv[] = { "testtremor", + "\\My Storage\\Tremolo\\Alarm_Classic.ogg", + "\\My Storage\\Tremolo\\output.pcm", +#ifdef _LOW_ACCURACY_ + "\\My Storage\\Tremolo\\outputL.ref", +#else + "\\My Storage\\Tremolo\\output.ref", +#endif /* _LOW_ACCURACY_ */ + NULL }; +#endif + return main(4, argv); +} +#endif + |