diff options
author | Robin Watts <robin@xiph.org> | 2010-05-29 12:52:40 +0000 |
---|---|---|
committer | Robin Watts <robin@xiph.org> | 2010-05-29 12:52:40 +0000 |
commit | d4a99079c96f680aae57af6a49a4c0c82eea0308 (patch) | |
tree | 4e1057c2f5dd4cdaa4855b9682f0e34f4d62281b | |
parent | 34bebf654da319c6204717a2bce09759d65867d4 (diff) | |
download | tremor-d4a99079c96f680aae57af6a49a4c0c82eea0308.tar.gz |
Move back to ARM-format assembler files, and add a script (arm2gnu.pl) that
converts them to GNU assembler format. Replace the temporary makefile
(Makefile2) with a revised one (Makefile.rjw) that invokes the conversion
script as required.
git-svn-id: https://svn.xiph.org/branches/lowmem-branch/Tremolo@17254 0101bb08-14d6-0310-b084-bc0e0c8e3800
-rw-r--r-- | Makefile.rjw | 188 | ||||
-rw-r--r-- | Makefile2 | 107 | ||||
-rwxr-xr-x | arm2gnu.pl | 265 | ||||
-rw-r--r-- | bitwiseARM.s | 496 | ||||
-rw-r--r-- | floor1ARM.s | 48 | ||||
-rw-r--r-- | floor1LARM.s | 46 | ||||
-rw-r--r-- | mdctARM.s | 1676 | ||||
-rw-r--r-- | mdctLARM.s | 1618 |
8 files changed, 2395 insertions, 2049 deletions
diff --git a/Makefile.rjw b/Makefile.rjw new file mode 100644 index 0000000..c8acd47 --- /dev/null +++ b/Makefile.rjw @@ -0,0 +1,188 @@ +# Tremolo Makefile +# +# This is a temporary makefile used to test this branch until the merge +# completes. It doesn't use the config system etc, so should be considered +# a poor relation to doing it properly. +# +# Use: +# +# make -f Makefile.rjw +# +# $URL$ +# $Id$ + +srcdir = . +VPATH = $(srcdir) + +CC = arm-none-linux-gnueabi-gcc +CXX = arm-none-linux-gnueabi-g++ +LD = arm-none-linux-gnueabi-g++ +AR = arm-none-linux-gnueabi-ar cru +RANLIB = arm-none-linux-gnueabi-ranlib +STRIP = arm-none-linux-gnueabi-strip +WINDRES= arm-none-linux-gnueabi-windres +MKDIR = mkdir -p +RM = rm -f +RM_REC = rm -rf +ECHO = echo -n +CAT = cat +AS = arm-none-linux-gnueabi-as + +DEFINES := + +CFLAGS := -g -mcpu=cortex-a8 -mfpu=neon -I$(srcdir) -D__ARM__ -D_ARM_ + +CXXFLAGS := $(CFLAGS) + +LDFLAGS := +LIBS := + +OBJS := +MODULE_DIRS += . + +LIBOBJS := \ + build/bitwise.o \ + build/bitwiseARM.o \ + build/codebook.o \ + build/dsp.o \ + build/floor0.o \ + build/floor1.o \ + build/floor1ARM.o \ + build/floor_lookup.o \ + build/framing.o \ + build/info.o \ + build/mapping0.o \ + build/mdct.o \ + build/mdctARM.o \ + build/misc.o \ + build/res012.o \ + build/vorbisfile.o +EXEOBJS := build/testtremor.o + +LIBOBJS_C := \ + build/bitwise.oc \ + build/codebook.oc \ + build/dsp.oc \ + build/floor0.oc \ + build/floor1.oc \ + build/floor_lookup.oc \ + build/framing.oc \ + build/info.oc \ + build/mapping0.oc \ + build/mdct.oc \ + build/misc.oc \ + build/res012.oc \ + build/vorbisfile.oc +EXEOBJS_C := build/testtremor.oc + +LIBOBJS_L := \ + build/bitwise.ol \ + build/bitwiseARM.o \ + build/codebook.ol \ + build/dsp.ol \ + build/floor0.ol \ + build/floor1.ol \ + build/floor1LARM.o \ + build/floor_lookup.ol \ + build/framing.ol \ + build/info.ol \ + build/mapping0.ol \ + build/mdct.ol \ + build/mdctLARM.o \ + build/misc.ol \ + build/res012.ol \ + build/vorbisfile.ol 
+EXEOBJS_L := build/testtremor.ol + +LIBOBJS_LC := \ + build/bitwise.olc \ + build/codebook.olc \ + build/dsp.olc \ + build/floor0.olc \ + build/floor1.olc \ + build/floor_lookup.olc \ + build/framing.olc \ + build/info.olc \ + build/mapping0.olc \ + build/mdct.olc \ + build/misc.olc \ + build/res012.olc \ + build/vorbisfile.olc +EXEOBJS_LC := build/testtremor.olc + +# Rules +.SUFFIXES: .o .oc .ol .olc + +# Compilation rule +build/%.o: %.c + $(CC) $(CFLAGS) -c $< -o $@ -D_ARM_ASSEM_ + +build/%.oc: %.c + $(CC) $(CFLAGS) -c $< -o $@ + +build/%.ol: %.c + $(CC) $(CFLAGS) -c $< -o $@ -D_LOW_ACCURACY_ -D_ARM_ASSEM_ + +build/%.olc: %.c + $(CC) $(CFLAGS) -c $< -o $@ -D_LOW_ACCURACY_ + +# Assembly rule +build/%.o: %.s + ./arm2gnu.pl < $< > build/$*.S + $(CC) -Ibuild $(CFLAGS) -c build/$*.S -o $@ + $(STRIP) -x $@ + +all: libTremolo.lib libTremoloC.lib libTremoloL.lib libTremoloLC.lib \ + testtremor.exe testtremorC.exe testtremorL.exe testtremorLC.exe + +libTremolo.lib: $(LIBOBJS) + $(AR) $@ $^ + $(RANLIB) $@ + +libTremoloC.lib: $(LIBOBJS_C) + $(AR) $@ $^ + $(RANLIB) $@ + +libTremoloL.lib: $(LIBOBJS_L) + $(AR) $@ $^ + $(RANLIB) $@ + +libTremoloLC.lib: $(LIBOBJS_LC) + $(AR) $@ $^ + $(RANLIB) $@ + +#bitwiseTEST.o: bitwise.c +# $(CC) $(CFLAGS) -c -o bitwiseTEST.o bitwise.c -D_V_BIT_TEST + +#bittest.exe: bitwiseTEST.o bitwiseARM.o +# $(LD) $^ $(LDFLAGS) $(LIBS) -o $@ -Wl,-Map,bittest.exe.map + +testtremor.exe: $(EXEOBJS) $(LIBOBJS) + $(LD) $^ $(LDFLAGS) $(LIBS) -o $@ -Wl,-Map,testtremor.exe.map + +testtremorC.exe: $(EXEOBJS_C) $(LIBOBJS_C) + $(LD) $^ $(LDFLAGS) $(LIBS) -o $@ -Wl,-Map,testtremorC.exe.map + +testtremorL.exe: $(EXEOBJS_L) $(LIBOBJS_L) + $(LD) $^ $(LDFLAGS) $(LIBS) -o $@ -Wl,-Map,testtremorL.exe.map + +testtremorLC.exe: $(EXEOBJS_LC) $(LIBOBJS_LC) + $(LD) $^ $(LDFLAGS) $(LIBS) -o $@ -Wl,-Map,testtremorLC.exe.map + +annotate.exe: annotate.c + gcc $^ -o $@ + +clean: + @-rm build/* >& /dev/null + @-rm libTremolo.lib >& /dev/null + @-rm libTremoloC.lib >& /dev/null + 
@-rm libTremoloL.lib >& /dev/null + @-rm libTremoloLC.lib >& /dev/null + @-rm testtremor.exe >& /dev/null + @-rm testtremorC.exe >& /dev/null + @-rm testtremorL.exe >& /dev/null + @-rm testtremorLC.exe >& /dev/null + @-rm testtremor.exe.map >& /dev/null + @-rm testtremorC.exe.map >& /dev/null + @-rm testtremorL.exe.map >& /dev/null + @-rm testtremorLC.exe.map >& /dev/null diff --git a/Makefile2 b/Makefile2 deleted file mode 100644 index 1b29142..0000000 --- a/Makefile2 +++ /dev/null @@ -1,107 +0,0 @@ -# Tremolo Makefile for Windows CE port -# Uses the VLC toolchain -# $URL$ -# $Id$ - -srcdir = . -VPATH = $(srcdir) - -CC = arm-none-linux-gnueabi-gcc -CXX = arm-none-linux-gnueabi-g++ -LD = arm-none-linux-gnueabi-g++ -AR = arm-none-linux-gnueabi-ar cru -RANLIB = arm-none-linux-gnueabi-ranlib -STRIP = arm-none-linux-gnueabi-strip -WINDRES= arm-none-linux-gnueabi-windres -MKDIR = mkdir -p -RM = rm -f -RM_REC = rm -rf -ECHO = echo -n -CAT = cat -AS = arm-none-linux-gnueabi-as - -DEFINES := - -CFLAGS := -g -mcpu=cortex-a8 -mfpu=neon -I$(srcdir) -D__ARM__ -D_ARM_ - -CXXFLAGS := $(CFLAGS) - -#LDFLAGS := -L/opt/mingw32ce -Llibs/lib -lmingw32 -LDFLAGS := -LIBS := - -OBJS := -MODULE_DIRS += . 
- -LIBOBJS := bitwise.o bitwiseARM.o codebook.o dsp.o floor0.o \ - floor1.o floor1ARM.o floor_lookup.o framing.o info.o mapping0.o \ - mdct.o mdctARM.o misc.o res012.o vorbisfile.o -EXEOBJS := testtremor.o - -#LIBOBJS_C := bitwise.oc codebook.oc dsp.oc floor0.oc floor1.oc \ -# floor_lookup.oc framing.oc info.oc mapping0.oc mdct.oc misc.oc \ -# res012.oc vorbisfile.oc -LIBOBJS_C := bitwise.oc codebook.oc dsp.oc floor0.oc \ - floor1.oc floor_lookup.oc framing.oc info.oc mapping0.oc \ - mdct.oc misc.oc res012.oc vorbisfile.oc -EXEOBJS_C := testtremor.oc - -LIBOBJS_L := bitwise.ol bitwiseARM.o codebook.ol dsp.ol floor0.ol \ - floor1.ol floor1LARM.o floor_lookup.ol framing.ol info.ol mapping0.ol \ - mdct.ol mdctLARM.o misc.ol res012.ol vorbisfile.ol -EXEOBJS_L := testtremor.ol - -LIBOBJS_LC := bitwise.olc codebook.olc dsp.olc floor0.olc floor1.olc \ - floor_lookup.olc framing.olc info.olc mapping0.olc mdct.olc misc.olc \ - res012.olc vorbisfile.olc -EXEOBJS_LC := testtremor.olc - -# Rules -.SUFFIXES: .o .oc .ol .olc - -.c.o: - $(CC) $(CFLAGS) -c $(<) -o $*.o -D_ARM_ASSEM_ - -.c.oc: - $(CC) $(CFLAGS) -c $(<) -o $*.oc -DONLY_C - -.c.ol: - $(CC) $(CFLAGS) -c $(<) -o $*.ol -D_LOW_ACCURACY_ -D_ARM_ASSEM_ - -.c.olc: - $(CC) $(CFLAGS) -c $(<) -o $*.olc -D_LOW_ACCURACY_ -DONLY_C -.S.s: - ./arm2gnu.pl < $(<) > $*.s - -all: libTremolo.lib testtremor.exe testtremorC.exe testtremorL.exe testtremorLC.exe - -libTremolo.lib: $(LIBOBJS) - $(AR) $@ $^ - $(RANLIB) $@ - -#bitwiseTEST.o: bitwise.c -# $(CC) $(CFLAGS) -c -o bitwiseTEST.o bitwise.c -D_V_BIT_TEST - -#bittest.exe: bitwiseTEST.o bitwiseARM.o -# $(LD) $^ $(LDFLAGS) $(LIBS) -o $@ -Wl,-Map,bittest.exe.map - -testtremor.exe: testtremor.o $(LIBOBJS) - $(LD) $^ $(LDFLAGS) $(LIBS) -o $@ -Wl,-Map,testtremor.exe.map - -testtremorC.exe: testtremor.oc $(LIBOBJS_C) - $(LD) $^ $(LDFLAGS) $(LIBS) -o $@ -Wl,-Map,testtremorC.exe.map - -testtremorL.exe: testtremor.ol $(LIBOBJS_L) - $(LD) $^ $(LDFLAGS) $(LIBS) -o $@ -Wl,-Map,testtremorL.exe.map - 
-testtremorLC.exe: testtremor.olc $(LIBOBJS_LC) - $(LD) $^ $(LDFLAGS) $(LIBS) -o $@ -Wl,-Map,testtremorLC.exe.map - -annotate.exe: annotate.c - gcc $^ -o $@ - -clean: - rm `find . -name \*.o` - rm `find . -name \*.ol` - rm `find . -name \*.oc` - rm `find . -name \*.olc` diff --git a/arm2gnu.pl b/arm2gnu.pl new file mode 100755 index 0000000..ad12baa --- /dev/null +++ b/arm2gnu.pl @@ -0,0 +1,265 @@ +#!/usr/bin/perl + +my $bigend; # little/big endian + +eval 'exec /usr/local/bin/perl -S $0 ${1+"$@"}' + if $running_under_some_shell; + +while ($ARGV[0] =~ /^-/) { + $_ = shift; + last if /^--/; + if (/^-n/) { + $nflag++; + next; + } + die "I don't recognize this switch: $_\\n"; +} +$printit++ unless $nflag; + +$\ = "\n"; # automatically add newline on print +$n=0; + +$thumb = 0; # ARM mode by default, not Thumb. + +LINE: +while (<>) { + + # For ADRLs we need to add a new line after the substituted one. + $addPadding = 0; + + # First, we do not dare to touch *anything* inside double quotes, do we? + # Second, if you want a dollar character in the string, + # insert two of them -- that's how ARM C and assembler treat strings. + s/^([A-Za-z_]\w*)[ \t]+DCB[ \t]*\"/$1: .ascii \"/ && do { s/\$\$/\$/g; next }; + s/\bDCB\b[ \t]*\"/.ascii \"/ && do { s/\$\$/\$/g; next }; + s/^(\S+)\s+RN\s+(\S+)/$1 .req r$2/ && do { s/\$\$/\$/g; next }; + # If substituted -- leave immediately ! 
+ + s/@/,:/; + s/;/@/; + while ( /@.*'/ ) { + s/(@.*)'/$1/g; + } + s/\{FALSE\}/0/g; + s/\{TRUE\}/1/g; + s/\{(\w\w\w\w+)\}/$1/g; + s/\bINCLUDE[ \t]*([^ \t\n]+)/.include \"$1\"/; + s/\bGET[ \t]*([^ \t\n]+)/.include \"$1\"/; + s/\bIMPORT\b/.extern/; + s/\bEXPORT\b/.global/; + s/^(\s+)\[/$1IF/; + s/^(\s+)\|/$1ELSE/; + s/^(\s+)\]/$1ENDIF/; + s/IF *:DEF:/ .ifdef/; + s/IF *:LNOT: *:DEF:/ .ifndef/; + s/ELSE/ .else/; + s/ENDIF/ .endif/; + + if( /\bIF\b/ ) { + s/\bIF\b/ .if/; + s/=/==/; + } + if ( $n == 2) { + s/\$/\\/g; + } + if ($n == 1) { + s/\$//g; + s/label//g; + $n = 2; + } + if ( /MACRO/ ) { + s/MACRO *\n/.macro/; + $n=1; + } + if ( /\bMEND\b/ ) { + s/\bMEND\b/.endm/; + $n=0; + } + + # ".rdata" doesn't work in 'as' version 2.13.2, as it is ".rodata" there. + # + if ( /\bAREA\b/ ) { + s/^(.+)CODE(.+)READONLY(.*)/ .text/; + s/^(.+)DATA(.+)READONLY(.*)/ .section .rdata\n .align 2/; + s/^(.+)\|\|\.data\|\|(.+)/ .data\n .align 2/; + s/^(.+)\|\|\.bss\|\|(.+)/ .bss/; + } + + s/\|\|\.constdata\$(\d+)\|\|/.L_CONST$1/; # ||.constdata$3|| + s/\|\|\.bss\$(\d+)\|\|/.L_BSS$1/; # ||.bss$2|| + s/\|\|\.data\$(\d+)\|\|/.L_DATA$1/; # ||.data$2|| + s/\|\|([a-zA-Z0-9_]+)\@([a-zA-Z0-9_]+)\|\|/@ $&/; + s/^(\s+)\%(\s)/ .space $1/; + + s/\|(.+)\.(\d+)\|/\.$1_$2/; # |L80.123| -> .L80_123 + s/\bCODE32\b/.code 32/ && do {$thumb = 0}; + s/\bCODE16\b/.code 16/ && do {$thumb = 1}; + if (/\bPROC\b/) + { + print " .thumb_func" if ($thumb); + s/\bPROC\b/@ $&/; + } + s/\bENDP\b/@ $&/; + s/\bSUBT\b/@ $&/; + s/\bDATA\b/@ $&/; # DATA directive is deprecated -- Asm guide, p.7-25 + s/\bKEEP\b/@ $&/; + s/\bEXPORTAS\b/@ $&/; + s/\|\|(.)+\bEQU\b/@ $&/; + s/\|\|([\w\$]+)\|\|/$1/; + s/\bENTRY\b/@ $&/; + s/\bASSERT\b/@ $&/; + s/\bGBLL\b/@ $&/; + s/\bGBLA\b/@ $&/; + s/^\W+OPT\b/@ $&/; + s/:OR:/|/g; + s/:SHL:/<</g; + s/:SHR:/>>/g; + s/:AND:/&/g; + s/:LAND:/&&/g; + s/CPSR/cpsr/; + s/SPSR/spsr/; + s/ALIGN$/.balign 4/; + s/psr_cxsf/psr_all/; + s/LTORG/.ltorg/; + s/^([A-Za-z_]\w*)[ \t]+EQU/ .set $1,/; + 
s/^([A-Za-z_]\w*)[ \t]+SETL/ .set $1,/; + s/^([A-Za-z_]\w*)[ \t]+SETA/ .set $1,/; + s/^([A-Za-z_]\w*)[ \t]+\*/ .set $1,/; + + # {PC} + 0xdeadfeed --> . + 0xdeadfeed + s/\{PC\} \+/ \. +/; + + # Single hex constant on the line ! + # + # >>> NOTE <<< + # Double-precision floats in gcc are always mixed-endian, which means + # bytes in two words are little-endian, but words are big-endian. + # So, 0x0000deadfeed0000 would be stored as 0x0000dead at low address + # and 0xfeed0000 at high address. + # + s/\bDCFD\b[ \t]+0x([a-fA-F0-9]{8})([a-fA-F0-9]{8})/.long 0x$1, 0x$2/; + # Only decimal constants on the line, no hex ! + s/\bDCFD\b[ \t]+([0-9\.\-]+)/.double $1/; + + # Single hex constant on the line ! +# s/\bDCFS\b[ \t]+0x([a-f0-9]{8})([a-f0-9]{8})/.long 0x$1, 0x$2/; + # Only decimal constants on the line, no hex ! +# s/\bDCFS\b[ \t]+([0-9\.\-]+)/.double $1/; + s/\bDCFS[ \t]+0x/.word 0x/; + s/\bDCFS\b/.float/; + + s/^([A-Za-z_]\w*)[ \t]+DCD/$1 .word/; + s/\bDCD\b/.word/; + s/^([A-Za-z_]\w*)[ \t]+DCW/$1 .short/; + s/\bDCW\b/.short/; + s/^([A-Za-z_]\w*)[ \t]+DCB/$1 .byte/; + s/\bDCB\b/.byte/; + s/^([A-Za-z_]\w*)[ \t]+\%/.comm $1,/; + s/^[A-Za-z_\.]\w+/$&:/; + s/^(\d+)/$1:/; + s/\%(\d+)/$1b_or_f/; + s/\%[Bb](\d+)/$1b/; + s/\%[Ff](\d+)/$1f/; + s/\%[Ff][Tt](\d+)/$1f/; + s/&([\dA-Fa-f]+)/0x$1/; + if ( /\b2_[01]+\b/ ) { + s/\b2_([01]+)\b/conv$1&&&&/g; + while ( /[01][01][01][01]&&&&/ ) { + s/0000&&&&/&&&&0/g; + s/0001&&&&/&&&&1/g; + s/0010&&&&/&&&&2/g; + s/0011&&&&/&&&&3/g; + s/0100&&&&/&&&&4/g; + s/0101&&&&/&&&&5/g; + s/0110&&&&/&&&&6/g; + s/0111&&&&/&&&&7/g; + s/1000&&&&/&&&&8/g; + s/1001&&&&/&&&&9/g; + s/1010&&&&/&&&&A/g; + s/1011&&&&/&&&&B/g; + s/1100&&&&/&&&&C/g; + s/1101&&&&/&&&&D/g; + s/1110&&&&/&&&&E/g; + s/1111&&&&/&&&&F/g; + } + s/000&&&&/&&&&0/g; + s/001&&&&/&&&&1/g; + s/010&&&&/&&&&2/g; + s/011&&&&/&&&&3/g; + s/100&&&&/&&&&4/g; + s/101&&&&/&&&&5/g; + s/110&&&&/&&&&6/g; + s/111&&&&/&&&&7/g; + s/00&&&&/&&&&0/g; + s/01&&&&/&&&&1/g; + s/10&&&&/&&&&2/g; + 
s/11&&&&/&&&&3/g; + s/0&&&&/&&&&0/g; + s/1&&&&/&&&&1/g; + s/conv&&&&/0x/g; + } + + if ( /commandline/) + { + if( /-bigend/) + { + $bigend=1; + } + } + + if ( /\bDCDU\b/ ) + { + my $cmd=$_; + my $value; + my $w1; + my $w2; + my $w3; + my $w4; + + s/\s+DCDU\b/@ $&/; + + $cmd =~ /\bDCDU\b\s+0x(\d+)/; + $value = $1; + $value =~ /(\w\w)(\w\w)(\w\w)(\w\w)/; + $w1 = $1; + $w2 = $2; + $w3 = $3; + $w4 = $4; + + if( $bigend ne "") + { + # big endian + + print " .byte 0x".$w1; + print " .byte 0x".$w2; + print " .byte 0x".$w3; + print " .byte 0x".$w4; + } + else + { + # little endian + + print " .byte 0x".$w4; + print " .byte 0x".$w3; + print " .byte 0x".$w2; + print " .byte 0x".$w1; + } + + } + + + if ( /\badrl\b/i ) + { + s/\badrl\s+(\w+)\s*,\s*(\w+)/ldr $1,=$2/i; + $addPadding = 1; + } + s/\bEND\b/@ END/; +} continue { + printf ("%s", $_) if $printit; + if ($addPadding != 0) + { + printf (" mov r0,r0\n"); + $addPadding = 0; + } +} + diff --git a/bitwiseARM.s b/bitwiseARM.s index 7a24aee..5043a02 100644 --- a/bitwiseARM.s +++ b/bitwiseARM.s @@ -1,80 +1,80 @@ -@ Tremolo library -@ Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd +; Tremolo library +; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd - .text + AREA |.text|, CODE, READONLY - .global oggpack_look - .global oggpack_adv - .global oggpack_readinit - .global oggpack_read + EXPORT oggpack_look + EXPORT oggpack_adv + EXPORT oggpack_readinit + EXPORT oggpack_read -oggpack_look: - @ r0 = oggpack_buffer *b - @ r1 = int bits +oggpack_look + ; r0 = oggpack_buffer *b + ; r1 = int bits STMFD r13!,{r10,r11,r14} LDMIA r0,{r2,r3,r12} - @ r2 = bitsLeftInSegment - @ r3 = ptr - @ r12= bitsLeftInWord - SUBS r2,r2,r1 @ bitsLeftinSegment -= bits - BLT look_slow @ Not enough bits in this segment for - @ this request. Do it slowly. - LDR r10,[r3] @ r10= ptr[0] - RSB r14,r12,#32 @ r14= 32-bitsLeftInWord - SUBS r12,r12,r1 @ r12= bitsLeftInWord -= bits - LDRLT r11,[r3,#4]! 
@ r11= ptr[1] - MOV r10,r10,LSR r14 @ r10= ptr[0]>>(32-bitsLeftInWord) - ADDLE r12,r12,#32 @ r12= bitsLeftInWord += 32 - RSB r14,r14,#32 @ r14= 32-bitsLeftInWord - ORRLT r10,r10,r11,LSL r14 @ r10= Next 32 bits. + ; r2 = bitsLeftInSegment + ; r3 = ptr + ; r12= bitsLeftInWord + SUBS r2,r2,r1 ; bitsLeftinSegment -= bits + BLT look_slow ; Not enough bits in this segment for + ; this request. Do it slowly. + LDR r10,[r3] ; r10= ptr[0] + RSB r14,r12,#32 ; r14= 32-bitsLeftInWord + SUBS r12,r12,r1 ; r12= bitsLeftInWord -= bits + LDRLT r11,[r3,#4]! ; r11= ptr[1] + MOV r10,r10,LSR r14 ; r10= ptr[0]>>(32-bitsLeftInWord) + ADDLE r12,r12,#32 ; r12= bitsLeftInWord += 32 + RSB r14,r14,#32 ; r14= 32-bitsLeftInWord + ORRLT r10,r10,r11,LSL r14 ; r10= Next 32 bits. MOV r14,#1 RSB r14,r14,r14,LSL r1 AND r0,r10,r14 LDMFD r13!,{r10,r11,PC} -look_slow: +look_slow STMFD r13!,{r5,r6} - ADDS r10,r2,r1 @ r10= bitsLeftInSegment + bits (i.e. - @ the initial value of bitsLeftInSeg) - @ r10 = bitsLeftInSegment (initial) - @ r12 = bitsLeftInWord - RSB r14,r12,#32 @ r14= 32-bitsLeftInWord - MOV r5,r10 @ r5 = bitsLeftInSegment (initial) + ADDS r10,r2,r1 ; r10= bitsLeftInSegment + bits (i.e. 
+ ; the initial value of bitsLeftInSeg) + ; r10 = bitsLeftInSegment (initial) + ; r12 = bitsLeftInWord + RSB r14,r12,#32 ; r14= 32-bitsLeftInWord + MOV r5,r10 ; r5 = bitsLeftInSegment (initial) BLT look_overrun - BEQ look_next_segment @ r10= r12 = 0, if we branch - CMP r12,r10 @ If bitsLeftInWord < bitsLeftInSeg - @ there must be more in the next word - LDR r10,[r3],#4 @ r10= ptr[0] - LDRLT r6,[r3] @ r6 = ptr[1] + BEQ look_next_segment ; r10= r12 = 0, if we branch + CMP r12,r10 ; If bitsLeftInWord < bitsLeftInSeg + ; there must be more in the next word + LDR r10,[r3],#4 ; r10= ptr[0] + LDRLT r6,[r3] ; r6 = ptr[1] MOV r11,#1 - MOV r10,r10,LSR r14 @ r10= first bitsLeftInWord bits - ORRLT r10,r10,r6,LSL r12 @ r10= first bitsLeftInSeg bits+crap - RSB r11,r11,r11,LSL r5 @ r11= mask - AND r10,r10,r11 @ r10= first r5 bits - @ Load the next segments data -look_next_segment: - @ At this point, r10 contains the first r5 bits of the result - LDR r11,[r0,#12] @ r11= head = b->head - @ Stall - @ Stall -look_next_segment_2: - LDR r11,[r11,#12] @ r11= head = head->next - @ Stall - @ Stall + MOV r10,r10,LSR r14 ; r10= first bitsLeftInWord bits + ORRLT r10,r10,r6,LSL r12 ; r10= first bitsLeftInSeg bits+crap + RSB r11,r11,r11,LSL r5 ; r11= mask + AND r10,r10,r11 ; r10= first r5 bits + ; Load the next segments data +look_next_segment + ; At this point, r10 contains the first r5 bits of the result + LDR r11,[r0,#12] ; r11= head = b->head + ; Stall + ; Stall +look_next_segment_2 + LDR r11,[r11,#12] ; r11= head = head->next + ; Stall + ; Stall CMP r11,#0 BEQ look_out_of_data - LDMIA r11,{r6,r12,r14} @ r6 = buffer - @ r12= begin - @ r14= length - LDR r6,[r6] @ r6 = buffer->data + LDMIA r11,{r6,r12,r14} ; r6 = buffer + ; r12= begin + ; r14= length + LDR r6,[r6] ; r6 = buffer->data CMP r14,#0 BEQ look_next_segment_2 - ADD r6,r6,r12 @ r6 = buffer->data+begin -look_slow_loop: - LDRB r12,[r6],#1 @ r12= *buffer - SUBS r14,r14,#1 @ r14= length - @ Stall - ORR r10,r10,r12,LSL r5 @ r10= first 
r5+8 bits + ADD r6,r6,r12 ; r6 = buffer->data+begin +look_slow_loop + LDRB r12,[r6],#1 ; r12= *buffer + SUBS r14,r14,#1 ; r14= length + ; Stall + ORR r10,r10,r12,LSL r5 ; r10= first r5+8 bits ADD r5,r5,#8 BLE look_really_slow CMP r5,r1 @@ -85,7 +85,7 @@ look_slow_loop: LDMFD r13!,{r5,r6,r10,r11,PC} -look_really_slow: +look_really_slow CMP r5,r1 BLT look_next_segment_2 MOV r14,#1 @@ -93,208 +93,208 @@ look_really_slow: AND r0,r10,r14 LDMFD r13!,{r5,r6,r10,r11,PC} -look_out_of_data: - MVN r0,#0 @ return -1 - @MOV r0,#0 +look_out_of_data + MVN r0,#0 ; return -1 + ;MOV r0,#0 LDMFD r13!,{r5,r6,r10,r11,PC} -look_overrun: - @ We had overrun when we started, so we need to skip -r10 bits. - LDR r11,[r0,#12] @ r11 = head = b->head - @ stall - @ stall -look_overrun_next_segment: - LDR r11,[r11,#12] @ r11 = head->next - @ stall - @ stall +look_overrun + ; We had overrun when we started, so we need to skip -r10 bits. + LDR r11,[r0,#12] ; r11 = head = b->head + ; stall + ; stall +look_overrun_next_segment + LDR r11,[r11,#12] ; r11 = head->next + ; stall + ; stall CMP r11,#0 BEQ look_out_of_data - LDMIA r11,{r6,r7,r14} @ r6 = buffer - @ r7 = begin - @ r14= length - LDR r6,[r6] @ r6 = buffer->data - @ stall - @ stall - ADD r6,r6,r7 @ r6 = buffer->data+begin - MOV r14,r14,LSL #3 @ r14= length in bits - ADDS r14,r14,r10 @ r14= length in bits-bits to skip + LDMIA r11,{r6,r7,r14} ; r6 = buffer + ; r7 = begin + ; r14= length + LDR r6,[r6] ; r6 = buffer->data + ; stall + ; stall + ADD r6,r6,r7 ; r6 = buffer->data+begin + MOV r14,r14,LSL #3 ; r14= length in bits + ADDS r14,r14,r10 ; r14= length in bits-bits to skip MOVLE r10,r14 BLE look_overrun_next_segment - RSB r10,r10,#0 @ r10= bits to skip - ADD r6,r10,r10,LSR #3 @ r6 = pointer to data + RSB r10,r10,#0 ; r10= bits to skip + ADD r6,r10,r10,LSR #3 ; r6 = pointer to data MOV r10,#0 B look_slow_loop -oggpack_adv: - @ r0 = oggpack_buffer *b - @ r1 = bits +oggpack_adv + ; r0 = oggpack_buffer *b + ; r1 = bits LDMIA r0,{r2,r3,r12} - @ r2 = 
bitsLeftInSegment - @ r3 = ptr - @ r12= bitsLeftInWord - SUBS r2,r2,r1 @ Does this run us out of bits in the - BLE adv_slow @ segment? If so, do it slowly + ; r2 = bitsLeftInSegment + ; r3 = ptr + ; r12= bitsLeftInWord + SUBS r2,r2,r1 ; Does this run us out of bits in the + BLE adv_slow ; segment? If so, do it slowly SUBS r12,r12,r1 ADDLE r12,r12,#32 ADDLE r3,r3,#4 STMIA r0,{r2,r3,r12} MOV PC,R14 -adv_slow: +adv_slow STMFD r13!,{r10,r14} - LDR r14,[r0,#12] @ r14= head - @ stall -adv_slow_loop: - LDR r1,[r0,#20] @ r1 = count - LDR r10,[r14,#8] @ r10= head->length - LDR r14,[r14,#12] @ r14= head->next - @ stall - ADD r1,r1,r10 @ r1 = count += head->length + LDR r14,[r0,#12] ; r14= head + ; stall +adv_slow_loop + LDR r1,[r0,#20] ; r1 = count + LDR r10,[r14,#8] ; r10= head->length + LDR r14,[r14,#12] ; r14= head->next + ; stall + ADD r1,r1,r10 ; r1 = count += head->length CMP r14,#0 BEQ adv_end - STR r1,[r0,#20] @ b->count = count - STR r14,[r0,#12] @ b->head = head - LDMIA r14,{r3,r10,r12} @ r3 = buffer - @ r10= begin - @ r12= length - LDR r3,[r3] @ r3 = buffer->data - ADD r3,r3,r10 @ r3 = Pointer to start (byte) - AND r10,r3,#3 @ r10= bytes to backtrk to word align - MOV r10,r10,LSL #3 @ r10= bits to backtrk to word align - RSB r10,r10,#32 @ r10= bits left in word - ADDS r10,r10,r2 @ r10= bits left in word after skip + STR r1,[r0,#20] ; b->count = count + STR r14,[r0,#12] ; b->head = head + LDMIA r14,{r3,r10,r12} ; r3 = buffer + ; r10= begin + ; r12= length + LDR r3,[r3] ; r3 = buffer->data + ADD r3,r3,r10 ; r3 = Pointer to start (byte) + AND r10,r3,#3 ; r10= bytes to backtrk to word align + MOV r10,r10,LSL #3 ; r10= bits to backtrk to word align + RSB r10,r10,#32 ; r10= bits left in word + ADDS r10,r10,r2 ; r10= bits left in word after skip ADDLE r10,r10,#32 ADDLE r3,r3,#4 - BIC r3,r3,#3 @ r3 = Pointer to start (word) - ADDS r2,r2,r12,LSL #3 @ r2 = length in bits after advance + BIC r3,r3,#3 ; r3 = Pointer to start (word) + ADDS r2,r2,r12,LSL #3 ; r2 = length in 
bits after advance BLE adv_slow_loop STMIA r0,{r2,r3,r10} LDMFD r13!,{r10,PC} -adv_end: +adv_end MOV r2, #0 MOV r12,#0 STMIA r0,{r2,r3,r12} LDMFD r13!,{r10,PC} -oggpack_readinit: - @ r0 = oggpack_buffer *b - @ r1 = oggreference *r - STR r1,[r0,#12] @ b->head = r1 - STR r1,[r0,#16] @ b->tail = r1 - LDMIA r1,{r2,r3,r12} @ r2 = b->head->buffer - @ r3 = b->head->begin - @ r12= b->head->length - LDR r2,[r2] @ r2 = b->head->buffer->data - MOV r1,r12,LSL #3 @ r1 = BitsInSegment +oggpack_readinit + ; r0 = oggpack_buffer *b + ; r1 = oggreference *r + STR r1,[r0,#12] ; b->head = r1 + STR r1,[r0,#16] ; b->tail = r1 + LDMIA r1,{r2,r3,r12} ; r2 = b->head->buffer + ; r3 = b->head->begin + ; r12= b->head->length + LDR r2,[r2] ; r2 = b->head->buffer->data + MOV r1,r12,LSL #3 ; r1 = BitsInSegment MOV r12,#0 - ADD r3,r2,r3 @ r3 = r2+b->head->begin - BIC r2,r3,#3 @ r2 = b->headptr (word) + ADD r3,r2,r3 ; r3 = r2+b->head->begin + BIC r2,r3,#3 ; r2 = b->headptr (word) AND r3,r3,#3 MOV r3,r3,LSL #3 - RSB r3,r3,#32 @ r3 = BitsInWord + RSB r3,r3,#32 ; r3 = BitsInWord STMIA r0,{r1,r2,r3} STR r12,[r0,#20] MOV PC,R14 -oggpack_read: - @ r0 = oggpack_buffer *b - @ r1 = int bits +oggpack_read + ; r0 = oggpack_buffer *b + ; r1 = int bits STMFD r13!,{r10,r11,r14} LDMIA r0,{r2,r3,r12} - @ r2 = bitsLeftInSegment - @ r3 = ptr - @ r12= bitsLeftInWord - SUBS r2,r2,r1 @ bitsLeftinSegment -= bits - BLT read_slow @ Not enough bits in this segment for - @ this request. Do it slowly. - LDR r10,[r3] @ r10= ptr[0] - RSB r14,r12,#32 @ r14= 32-bitsLeftInWord - SUBS r12,r12,r1 @ r12= bitsLeftInWord -= bits + ; r2 = bitsLeftInSegment + ; r3 = ptr + ; r12= bitsLeftInWord + SUBS r2,r2,r1 ; bitsLeftinSegment -= bits + BLT read_slow ; Not enough bits in this segment for + ; this request. Do it slowly. 
+ LDR r10,[r3] ; r10= ptr[0] + RSB r14,r12,#32 ; r14= 32-bitsLeftInWord + SUBS r12,r12,r1 ; r12= bitsLeftInWord -= bits ADDLE r3,r3,#4 - LDRLT r11,[r3] @ r11= ptr[1] - MOV r10,r10,LSR r14 @ r10= ptr[0]>>(32-bitsLeftInWord) - ADDLE r12,r12,#32 @ r12= bitsLeftInWord += 32 - RSB r14,r14,#32 @ r14= 32-bitsLeftInWord - ORRLT r10,r10,r11,LSL r14 @ r10= Next 32 bits. + LDRLT r11,[r3] ; r11= ptr[1] + MOV r10,r10,LSR r14 ; r10= ptr[0]>>(32-bitsLeftInWord) + ADDLE r12,r12,#32 ; r12= bitsLeftInWord += 32 + RSB r14,r14,#32 ; r14= 32-bitsLeftInWord + ORRLT r10,r10,r11,LSL r14 ; r10= Next 32 bits. STMIA r0,{r2,r3,r12} MOV r14,#1 RSB r14,r14,r14,LSL r1 AND r0,r10,r14 LDMFD r13!,{r10,r11,PC} -read_slow: +read_slow STMFD r13!,{r5,r6} - ADDS r10,r2,r1 @ r10= bitsLeftInSegment + bits (i.e. - @ the initial value of bitsLeftInSeg) - @ r10 = bitsLeftInSegment (initial) - @ r12 = bitsLeftInWord - RSB r14,r12,#32 @ r14= 32-bitsLeftInWord - MOV r5,r10 @ r5 = bitsLeftInSegment (initial) + ADDS r10,r2,r1 ; r10= bitsLeftInSegment + bits (i.e. 
+ ; the initial value of bitsLeftInSeg) + ; r10 = bitsLeftInSegment (initial) + ; r12 = bitsLeftInWord + RSB r14,r12,#32 ; r14= 32-bitsLeftInWord + MOV r5,r10 ; r5 = bitsLeftInSegment (initial) BLT read_overrun - BEQ read_next_segment @ r10= r12 = 0, if we branch - CMP r12,r10 @ If bitsLeftInWord < bitsLeftInSeg - @ there must be more in the next word - LDR r10,[r3],#4 @ r10= ptr[0] - LDRLT r6,[r3] @ r6 = ptr[1] + BEQ read_next_segment ; r10= r12 = 0, if we branch + CMP r12,r10 ; If bitsLeftInWord < bitsLeftInSeg + ; there must be more in the next word + LDR r10,[r3],#4 ; r10= ptr[0] + LDRLT r6,[r3] ; r6 = ptr[1] MOV r11,#1 - MOV r10,r10,LSR r14 @ r10= first bitsLeftInWord bits - ORRLT r10,r10,r6,LSL r12 @ r10= first bitsLeftInSeg bits+crap - RSB r11,r11,r11,LSL r5 @ r11= mask - AND r10,r10,r11 @ r10= first r5 bits - @ Load the next segments data -read_next_segment: - @ At this point, r10 contains the first r5 bits of the result - LDR r11,[r0,#12] @ r11= head = b->head - @ Stall -read_next_segment_2: - @ r11 = head - LDR r6,[r0,#20] @ r6 = count - LDR r12,[r11,#8] @ r12= length - LDR r11,[r11,#12] @ r11= head = head->next - @ Stall - ADD r6,r6,r12 @ count += length + MOV r10,r10,LSR r14 ; r10= first bitsLeftInWord bits + ORRLT r10,r10,r6,LSL r12 ; r10= first bitsLeftInSeg bits+crap + RSB r11,r11,r11,LSL r5 ; r11= mask + AND r10,r10,r11 ; r10= first r5 bits + ; Load the next segments data +read_next_segment + ; At this point, r10 contains the first r5 bits of the result + LDR r11,[r0,#12] ; r11= head = b->head + ; Stall +read_next_segment_2 + ; r11 = head + LDR r6,[r0,#20] ; r6 = count + LDR r12,[r11,#8] ; r12= length + LDR r11,[r11,#12] ; r11= head = head->next + ; Stall + ADD r6,r6,r12 ; count += length CMP r11,#0 BEQ read_out_of_data STR r11,[r0,#12] - STR r6,[r0,#20] @ b->count = count - LDMIA r11,{r6,r12,r14} @ r6 = buffer - @ r12= begin - @ r14= length - LDR r6,[r6] @ r6 = buffer->data + STR r6,[r0,#20] ; b->count = count + LDMIA r11,{r6,r12,r14} ; r6 = buffer 
+ ; r12= begin + ; r14= length + LDR r6,[r6] ; r6 = buffer->data CMP r14,#0 BEQ read_next_segment_2 - ADD r6,r6,r12 @ r6 = buffer->data+begin -read_slow_loop: - LDRB r12,[r6],#1 @ r12= *buffer - SUBS r14,r14,#1 @ r14= length - @ Stall - ORR r10,r10,r12,LSL r5 @ r10= first r5+8 bits + ADD r6,r6,r12 ; r6 = buffer->data+begin +read_slow_loop + LDRB r12,[r6],#1 ; r12= *buffer + SUBS r14,r14,#1 ; r14= length + ; Stall + ORR r10,r10,r12,LSL r5 ; r10= first r5+8 bits ADD r5,r5,#8 BLE read_really_slow CMP r5,r1 BLT read_slow_loop -read_end: +read_end MOV r12,#1 RSB r12,r12,r12,LSL r1 - @ Store back the new position - @ r2 = -number of bits to go from this segment - @ r6 = ptr - @ r14= bytesLeftInSegment - @ r11= New head value - LDMIA r11,{r3,r6,r14} @ r3 = buffer - @ r6 = begin - @ r14= length - LDR r3,[r3] @ r3 = buffer->data - ADD r1,r2,r14,LSL #3 @ r1 = bitsLeftInSegment - @ stall - ADD r6,r3,r6 @ r6 = pointer - AND r3,r6,#3 @ r3 = bytes used in first word - RSB r3,r2,r3,LSL #3 @ r3 = bits used in first word - BIC r2,r6,#3 @ r2 = word ptr - RSBS r3,r3,#32 @ r3 = bitsLeftInWord + ; Store back the new position + ; r2 = -number of bits to go from this segment + ; r6 = ptr + ; r14= bytesLeftInSegment + ; r11= New head value + LDMIA r11,{r3,r6,r14} ; r3 = buffer + ; r6 = begin + ; r14= length + LDR r3,[r3] ; r3 = buffer->data + ADD r1,r2,r14,LSL #3 ; r1 = bitsLeftInSegment + ; stall + ADD r6,r3,r6 ; r6 = pointer + AND r3,r6,#3 ; r3 = bytes used in first word + RSB r3,r2,r3,LSL #3 ; r3 = bits used in first word + BIC r2,r6,#3 ; r2 = word ptr + RSBS r3,r3,#32 ; r3 = bitsLeftInWord ADDLE r3,r3,#32 ADDLE r2,r2,#4 STMIA r0,{r1,r2,r3} @@ -303,66 +303,66 @@ read_end: LDMFD r13!,{r5,r6,r10,r11,PC} -read_really_slow: +read_really_slow CMP r5,r1 BGE read_end - LDR r14,[r11,#8] @ r14= length of segment just done - @ stall - @ stall - ADD r2,r2,r14,LSL #3 @ r2 = -bits to use from next seg + LDR r14,[r11,#8] ; r14= length of segment just done + ; stall + ; stall + ADD r2,r2,r14,LSL #3 ; 
r2 = -bits to use from next seg B read_next_segment_2 -read_out_of_data: - @ Store back the new position - @ r2 = -number of bits to go from this segment - @ r6 = ptr - @ r14= bytesLeftInSegment - @ RJW: This may be overkill - we leave the buffer empty, with -1 - @ bits left in it. We might get away with just storing the - @ bitsLeftInSegment as -1. - LDR r11,[r0,#12] @ r11=head +read_out_of_data + ; Store back the new position + ; r2 = -number of bits to go from this segment + ; r6 = ptr + ; r14= bytesLeftInSegment + ; RJW: This may be overkill - we leave the buffer empty, with -1 + ; bits left in it. We might get away with just storing the + ; bitsLeftInSegment as -1. + LDR r11,[r0,#12] ; r11=head - LDMIA r11,{r3,r6,r14} @ r3 = buffer - @ r6 = begin - @ r14= length - LDR r3,[r3] @ r3 = buffer->data - ADD r6,r3,r6 @ r6 = pointer + LDMIA r11,{r3,r6,r14} ; r3 = buffer + ; r6 = begin + ; r14= length + LDR r3,[r3] ; r3 = buffer->data + ADD r6,r3,r6 ; r6 = pointer ADD r6,r6,r14 - AND r3,r6,#3 @ r3 = bytes used in first word - MOV r3,r3,LSL #3 @ r3 = bits used in first word - BIC r2,r6,#3 @ r2 = word ptr - RSBS r3,r3,#32 @ r3 = bitsLeftInWord - MVN r1,#0 @ r1 = -1 = bitsLeftInSegment + AND r3,r6,#3 ; r3 = bytes used in first word + MOV r3,r3,LSL #3 ; r3 = bits used in first word + BIC r2,r6,#3 ; r2 = word ptr + RSBS r3,r3,#32 ; r3 = bitsLeftInWord + MVN r1,#0 ; r1 = -1 = bitsLeftInSegment STMIA r0,{r1,r2,r3} - MVN r0,#0 @ return -1 - @MOV r0,#0 + MVN r0,#0 ; return -1 + ;MOV r0,#0 LDMFD r13!,{r5,r6,r10,r11,PC} -read_overrun: - @ We had overrun when we started, so we need to skip -r10 bits. - LDR r11,[r0,#12] @ r11 = head = b->head - @ stall - @ stall -read_overrun_next_segment: - LDR r11,[r11,#12] @ r11 = head->next - @ stall - @ stall +read_overrun + ; We had overrun when we started, so we need to skip -r10 bits. 
+ LDR r11,[r0,#12] ; r11 = head = b->head + ; stall + ; stall +read_overrun_next_segment + LDR r11,[r11,#12] ; r11 = head->next + ; stall + ; stall CMP r11,#0 BEQ read_out_of_data - LDMIA r11,{r6,r7,r14} @ r6 = buffer - @ r7 = begin - @ r14= length - LDR r6,[r6] @ r6 = buffer->data - @ stall - @ stall - ADD r6,r6,r7 @ r6 = buffer->data+begin - MOV r14,r14,LSL #3 @ r14= length in bits - ADDS r14,r14,r10 @ r14= length in bits-bits to skip + LDMIA r11,{r6,r7,r14} ; r6 = buffer + ; r7 = begin + ; r14= length + LDR r6,[r6] ; r6 = buffer->data + ; stall + ; stall + ADD r6,r6,r7 ; r6 = buffer->data+begin + MOV r14,r14,LSL #3 ; r14= length in bits + ADDS r14,r14,r10 ; r14= length in bits-bits to skip MOVLE r10,r14 BLE read_overrun_next_segment - RSB r10,r10,#0 @ r10= bits to skip - ADD r6,r10,r10,LSR #3 @ r6 = pointer to data + RSB r10,r10,#0 ; r10= bits to skip + ADD r6,r10,r10,LSR #3 ; r6 = pointer to data MOV r10,#0 B read_slow_loop - @ END + END diff --git a/floor1ARM.s b/floor1ARM.s index f7f7ae1..689b5c5 100644 --- a/floor1ARM.s +++ b/floor1ARM.s @@ -1,36 +1,36 @@ -@ Tremolo library -@ Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd +; Tremolo library +; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd - .text + AREA |.text|, CODE, READONLY - .global render_line_arm + EXPORT render_line_arm -render_line_arm: - @ r0 = n - @ r1 = d - @ r2 = floor - @ r3 = base - @ <> = err - @ <> = adx - @ <> = ady +render_line_arm + ; r0 = n + ; r1 = d + ; r2 = floor + ; r3 = base + ; <> = err + ; <> = adx + ; <> = ady MOV r12,r13 STMFD r13!,{r4-r6,r11,r14} - LDMFD r12,{r11,r12,r14} @ r11 = err - @ r12 = adx - @ r14 = ady -rl_loop: - LDR r4,[r1] @ r4 = *d - LDR r5,[r2],r3,LSL #2 @ r5 = *floor r2 = floor+base - SUBS r11,r11,r14 @ err -= ady - ADDLT r11,r11,r12 @ if (err < 0) err+=adx - SMULL r6, r5, r4, r5 @ (r6,r5) = *d * *floor - ADDLT r2, r2, #4 @ floor+=1 + LDMFD r12,{r11,r12,r14} ; r11 = err + ; r12 = adx + ; r14 = ady +rl_loop + LDR r4,[r1] ; r4 = *d + 
LDR r5,[r2],r3,LSL #2 ; r5 = *floor r2 = floor+base + SUBS r11,r11,r14 ; err -= ady + ADDLT r11,r11,r12 ; if (err < 0) err+=adx + SMULL r6, r5, r4, r5 ; (r6,r5) = *d * *floor + ADDLT r2, r2, #4 ; floor+=1 MOVS r6, r6, LSR #15 - ADC r5, r6, r5, LSL #17 @ r5 = MULT31_SHIFT15 + ADC r5, r6, r5, LSL #17 ; r5 = MULT31_SHIFT15 STR r5,[r1],#4 SUBS r0, r0, #1 BGT rl_loop LDMFD r13!,{r4-r6,r11,PC} - @ END + END diff --git a/floor1LARM.s b/floor1LARM.s index 21163ae..d7ead1d 100644 --- a/floor1LARM.s +++ b/floor1LARM.s @@ -1,35 +1,35 @@ -@ Tremolo library -@ Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd +; Tremolo library +; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd - .text + AREA |.text|, CODE, READONLY - .global render_line_arm_low + EXPORT render_line_arm_low -render_line_arm_low: - @ r0 = n - @ r1 = d - @ r2 = floor - @ r3 = base - @ <> = err - @ <> = adx - @ <> = ady +render_line_arm_low + ; r0 = n + ; r1 = d + ; r2 = floor + ; r3 = base + ; <> = err + ; <> = adx + ; <> = ady MOV r12,r13 STMFD r13!,{r4-r6,r11,r14} - LDMFD r12,{r11,r12,r14} @ r11 = err - @ r12 = adx - @ r14 = ady -rl_loop: - LDR r4, [r1] @ r4 = *d - LDR r5, [r2], r3,LSL #2 @ r5 = *floor r2 = floor+base - SUBS r11,r11,r14 @ err -= ady + LDMFD r12,{r11,r12,r14} ; r11 = err + ; r12 = adx + ; r14 = ady +rl_loop + LDR r4, [r1] ; r4 = *d + LDR r5, [r2], r3,LSL #2 ; r5 = *floor r2 = floor+base + SUBS r11,r11,r14 ; err -= ady MOV r4, r4, ASR #6 - MUL r5, r4, r5 @ r5 = MULT31_SHIFT15 - ADDLT r11,r11,r12 @ if (err < 0) err+=adx - ADDLT r2, r2, #4 @ floor+=1 + MUL r5, r4, r5 ; r5 = MULT31_SHIFT15 + ADDLT r11,r11,r12 ; if (err < 0) err+=adx + ADDLT r2, r2, #4 ; floor+=1 SUBS r0, r0, #1 STR r5, [r1], #4 BGT rl_loop LDMFD r13!,{r4-r6,r11,PC} - @ END + END @@ -1,92 +1,92 @@ -@ Tremolo library -@ Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd +; Tremolo library +; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd - .text + AREA |.text|, CODE, READONLY - @ full 
accuracy version + ; full accuracy version - .global mdct_backward_arm - .global mdct_shift_right_arm - .global mdct_unroll_prelap_arm - .global mdct_unroll_part2_arm - .global mdct_unroll_part3_arm - .global mdct_unroll_postlap_arm + EXPORT mdct_backward_arm + EXPORT mdct_shift_right_arm + EXPORT mdct_unroll_prelap_arm + EXPORT mdct_unroll_part2_arm + EXPORT mdct_unroll_part3_arm + EXPORT mdct_unroll_postlap_arm - .extern sincos_lookup0 - .extern sincos_lookup1 + IMPORT sincos_lookup0 + IMPORT sincos_lookup1 -mdct_unroll_prelap_arm: - @ r0 = out - @ r1 = post - @ r2 = r - @ r3 = step +mdct_unroll_prelap_arm + ; r0 = out + ; r1 = post + ; r2 = r + ; r3 = step STMFD r13!,{r4-r7,r14} MVN r4, #0x8000 MOV r3, r3, LSL #1 - SUB r1, r2, r1 @ r1 = r - post - SUBS r1, r1, #16 @ r1 = r - post - 16 + SUB r1, r2, r1 ; r1 = r - post + SUBS r1, r1, #16 ; r1 = r - post - 16 BLT unroll_over -unroll_loop: +unroll_loop LDMDB r2!,{r5,r6,r7,r12} - MOV r5, r5, ASR #9 @ r5 = (*--r)>>9 - MOV r6, r6, ASR #9 @ r6 = (*--r)>>9 - MOV r7, r7, ASR #9 @ r7 = (*--r)>>9 - MOV r12,r12,ASR #9 @ r12= (*--r)>>9 + MOV r5, r5, ASR #9 ; r5 = (*--r)>>9 + MOV r6, r6, ASR #9 ; r6 = (*--r)>>9 + MOV r7, r7, ASR #9 ; r7 = (*--r)>>9 + MOV r12,r12,ASR #9 ; r12= (*--r)>>9 MOV r14,r12,ASR #15 - TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range EORNE r12,r4, r14,ASR #31 STRH r12,[r0], r3 MOV r14,r7, ASR #15 - TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range EORNE r7, r4, r14,ASR #31 STRH r7, [r0], r3 MOV r14,r6, ASR #15 - TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range EORNE r6, r4, r14,ASR #31 STRH r6, [r0], r3 MOV r14,r5, ASR #15 - TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range EORNE r5, r4, r14,ASR #31 STRH r5, [r0], r3 SUBS r1, r1, #16 BGE 
unroll_loop -unroll_over: +unroll_over ADDS r1, r1, #16 BLE unroll_end -unroll_loop2: +unroll_loop2 LDR r5,[r2,#-4]! - @ stall - @ stall (Xscale) - MOV r5, r5, ASR #9 @ r5 = (*--r)>>9 + ; stall + ; stall (Xscale) + MOV r5, r5, ASR #9 ; r5 = (*--r)>>9 MOV r14,r5, ASR #15 - TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range EORNE r5, r4, r14,ASR #31 STRH r5, [r0], r3 SUBS r1, r1, #4 BGT unroll_loop2 -unroll_end: +unroll_end LDMFD r13!,{r4-r7,PC} -mdct_unroll_postlap_arm: - @ r0 = out - @ r1 = post - @ r2 = l - @ r3 = step +mdct_unroll_postlap_arm + ; r0 = out + ; r1 = post + ; r2 = l + ; r3 = step STMFD r13!,{r4-r7,r14} MVN r4, #0x8000 MOV r3, r3, LSL #1 - SUB r1, r1, r2 @ r1 = post - l - MOV r1, r1, ASR #1 @ r1 = (post - l)>>1 - SUBS r1, r1, #16 @ r1 = ((post - l)>>1) - 4 + SUB r1, r1, r2 ; r1 = post - l + MOV r1, r1, ASR #1 ; r1 = (post - l)>>1 + SUBS r1, r1, #16 ; r1 = ((post - l)>>1) - 4 BLT unroll_over3 -unroll_loop3: +unroll_loop3 LDR r12,[r2],#8 LDR r7, [r2],#8 LDR r6, [r2],#8 @@ -97,142 +97,142 @@ unroll_loop3: RSB r6, r6, #0 RSB r7, r7, #0 - MOV r12, r12,ASR #9 @ r12= (-*l)>>9 - MOV r5, r5, ASR #9 @ r5 = (-*l)>>9 - MOV r6, r6, ASR #9 @ r6 = (-*l)>>9 - MOV r7, r7, ASR #9 @ r7 = (-*l)>>9 + MOV r12, r12,ASR #9 ; r12= (-*l)>>9 + MOV r5, r5, ASR #9 ; r5 = (-*l)>>9 + MOV r6, r6, ASR #9 ; r6 = (-*l)>>9 + MOV r7, r7, ASR #9 ; r7 = (-*l)>>9 MOV r14,r12,ASR #15 - TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range EORNE r12,r4, r14,ASR #31 STRH r12,[r0], r3 MOV r14,r7, ASR #15 - TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range EORNE r7, r4, r14,ASR #31 STRH r7, [r0], r3 MOV r14,r6, ASR #15 - TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range EORNE r6, r4, r14,ASR #31 STRH r6, [r0], r3 MOV r14,r5, ASR #15 - TEQ 
r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range EORNE r5, r4, r14,ASR #31 STRH r5, [r0], r3 SUBS r1, r1, #16 BGE unroll_loop3 -unroll_over3: +unroll_over3 ADDS r1, r1, #16 BLE unroll_over4 -unroll_loop4: +unroll_loop4 LDR r5,[r2], #8 - @ stall - @ stall (Xscale) + ; stall + ; stall (Xscale) RSB r5, r5, #0 - MOV r5, r5, ASR #9 @ r5 = (-*l)>>9 + MOV r5, r5, ASR #9 ; r5 = (-*l)>>9 MOV r14,r5, ASR #15 - TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range EORNE r5, r4, r14,ASR #31 STRH r5, [r0], r3 SUBS r1, r1, #4 BGT unroll_loop4 -unroll_over4: +unroll_over4 LDMFD r13!,{r4-r7,PC} -mdct_unroll_part2_arm: - @ r0 = out - @ r1 = post - @ r2 = l - @ r3 = r - @ <> = step - @ <> = wL - @ <> = wR +mdct_unroll_part2_arm + ; r0 = out + ; r1 = post + ; r2 = l + ; r3 = r + ; <> = step + ; <> = wL + ; <> = wR MOV r12,r13 STMFD r13!,{r4,r6-r11,r14} - LDMFD r12,{r8,r9,r10} @ r8 = step - @ r9 = wL - @ r10= wR + LDMFD r12,{r8,r9,r10} ; r8 = step + ; r9 = wL + ; r10= wR MVN r4, #0x8000 MOV r8, r8, LSL #1 - SUBS r1, r3, r1 @ r1 = (r - post) + SUBS r1, r3, r1 ; r1 = (r - post) BLE unroll_over5 -unroll_loop5: - LDR r12,[r2, #-8]! @ r12= *l (but l -= 2 first) - LDR r11,[r9],#4 @ r11= *wL++ - LDR r7, [r3, #-4]! @ r7 = *--r - LDR r6, [r10,#-4]! @ r6 = *--wR - - @ Can save a cycle here, at the cost of 1bit errors in rounding - SMULL r14,r11,r12,r11 @ (r14,r11) = *l * *wL++ - SMULL r14,r6, r7, r6 @ (r14,r6) = *--r * *--wR +unroll_loop5 + LDR r12,[r2, #-8]! ; r12= *l (but l -= 2 first) + LDR r11,[r9],#4 ; r11= *wL++ + LDR r7, [r3, #-4]! ; r7 = *--r + LDR r6, [r10,#-4]! 
; r6 = *--wR + + ; Can save a cycle here, at the cost of 1bit errors in rounding + SMULL r14,r11,r12,r11 ; (r14,r11) = *l * *wL++ + SMULL r14,r6, r7, r6 ; (r14,r6) = *--r * *--wR ADD r6, r6, r11 MOV r6, r6, ASR #8 MOV r14,r6, ASR #15 - TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range EORNE r6, r4, r14,ASR #31 STRH r6, [r0], r8 SUBS r1, r1, #4 BGT unroll_loop5 -unroll_over5: +unroll_over5 LDMFD r13!,{r4,r6-r11,PC} -mdct_unroll_part3_arm: - @ r0 = out - @ r1 = post - @ r2 = l - @ r3 = r - @ <> = step - @ <> = wL - @ <> = wR +mdct_unroll_part3_arm + ; r0 = out + ; r1 = post + ; r2 = l + ; r3 = r + ; <> = step + ; <> = wL + ; <> = wR MOV r12,r13 STMFD r13!,{r4,r6-r11,r14} - LDMFD r12,{r8,r9,r10} @ r8 = step - @ r9 = wL - @ r10= wR + LDMFD r12,{r8,r9,r10} ; r8 = step + ; r9 = wL + ; r10= wR MVN r4, #0x8000 MOV r8, r8, LSL #1 - SUBS r1, r1, r3 @ r1 = (post - r) + SUBS r1, r1, r3 ; r1 = (post - r) BLE unroll_over6 -unroll_loop6: - LDR r12,[r2],#8 @ r12= *l (but l += 2 first) - LDR r11,[r9],#4 @ r11= *wL++ - LDR r7, [r3],#4 @ r7 = *r++ - LDR r6, [r10,#-4]! @ r6 = *--wR - - @ Can save a cycle here, at the cost of 1bit errors in rounding - SMULL r14,r11,r12,r11 @ (r14,r11) = *l * *wL++ - SMULL r14,r6, r7, r6 @ (r14,r6) = *--r * *--wR +unroll_loop6 + LDR r12,[r2],#8 ; r12= *l (but l += 2 first) + LDR r11,[r9],#4 ; r11= *wL++ + LDR r7, [r3],#4 ; r7 = *r++ + LDR r6, [r10,#-4]! 
; r6 = *--wR + + ; Can save a cycle here, at the cost of 1bit errors in rounding + SMULL r14,r11,r12,r11 ; (r14,r11) = *l * *wL++ + SMULL r14,r6, r7, r6 ; (r14,r6) = *--r * *--wR SUB r6, r6, r11 MOV r6, r6, ASR #8 MOV r14,r6, ASR #15 - TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range EORNE r6, r4, r14,ASR #31 STRH r6, [r0], r8 SUBS r1, r1, #4 BGT unroll_loop6 -unroll_over6: +unroll_over6 LDMFD r13!,{r4,r6-r11,PC} -mdct_shift_right_arm: - @ r0 = n - @ r1 = in - @ r2 = right +mdct_shift_right_arm + ; r0 = n + ; r1 = in + ; r2 = right STMFD r13!,{r4-r11,r14} - MOV r0, r0, LSR #2 @ n >>= 2 + MOV r0, r0, LSR #2 ; n >>= 2 ADD r1, r1, #4 SUBS r0, r0, #8 BLT sr_less_than_8 -sr_loop: +sr_loop LDR r3, [r1], #8 LDR r4, [r1], #8 LDR r5, [r1], #8 @@ -244,135 +244,135 @@ sr_loop: SUBS r0, r0, #8 STMIA r2!,{r3,r4,r5,r6,r7,r8,r12,r14} BGE sr_loop -sr_less_than_8: +sr_less_than_8 ADDS r0, r0, #8 BEQ sr_end -sr_loop2: +sr_loop2 LDR r3, [r1], #8 SUBS r0, r0, #1 STR r3, [r2], #4 BGT sr_loop2 -sr_end: +sr_end LDMFD r13!,{r4-r11,PC} -mdct_backward_arm: - @ r0 = n - @ r1 = in +mdct_backward_arm + ; r0 = n + ; r1 = in STMFD r13!,{r4-r11,r14} - MOV r2,#1<<4 @ r2 = 1<<shift - MOV r3,#13-4 @ r3 = 13-shift -find_shift_loop: - TST r0,r2 @ if (n & (1<<shift)) == 0 + MOV r2,#1<<4 ; r2 = 1<<shift + MOV r3,#13-4 ; r3 = 13-shift +find_shift_loop + TST r0,r2 ; if (n & (1<<shift)) == 0 MOV r2,r2,LSL #1 - SUBEQ r3,r3,#1 @ shift-- + SUBEQ r3,r3,#1 ; shift-- BEQ find_shift_loop MOV r2,#2 - MOV r2,r2,LSL r3 @ r2 = step = 2<<shift - - @ presymmetry - @ r0 = n (a multiple of 4) - @ r1 = in - @ r2 = step - @ r3 = shift - - ADD r4, r1, r0, LSL #1 @ r4 = aX = in+(n>>1) - ADD r14,r1, r0 @ r14= in+(n>>2) - SUB r4, r4, #3*4 @ r4 = aX = in+n2-3 - LDR r5, =sincos_lookup0 @ r5 = T=sincos_lookup0 - -presymmetry_loop1: - LDR r7, [r4,#8] @ r6 = s2 = aX[2] - LDR r11,[r5,#4] @ r11= T[1] - LDR r6, [r4] @ r6 = s0 = aX[0] - LDR r10,[r5],r2,LSL #2 @ r10= 
T[0] T += step - - @ XPROD31(s0, s2, T[0], T[1], 0xaX[0], &ax[2]) - SMULL r8, r9, r7, r11 @ (r8, r9) = s2*T[1] - @ stall - @ stall ? - SMLAL r8, r9, r6, r10 @ (r8, r9) += s0*T[0] + MOV r2,r2,LSL r3 ; r2 = step = 2<<shift + + ; presymmetry + ; r0 = n (a multiple of 4) + ; r1 = in + ; r2 = step + ; r3 = shift + + ADD r4, r1, r0, LSL #1 ; r4 = aX = in+(n>>1) + ADD r14,r1, r0 ; r14= in+(n>>2) + SUB r4, r4, #3*4 ; r4 = aX = in+n2-3 + LDR r5, =sincos_lookup0 ; r5 = T=sincos_lookup0 + +presymmetry_loop1 + LDR r7, [r4,#8] ; r6 = s2 = aX[2] + LDR r11,[r5,#4] ; r11= T[1] + LDR r6, [r4] ; r6 = s0 = aX[0] + LDR r10,[r5],r2,LSL #2 ; r10= T[0] T += step + + ; XPROD31(s0, s2, T[0], T[1], &aX[0], &ax[2]) + SMULL r8, r9, r7, r11 ; (r8, r9) = s2*T[1] + ; stall + ; stall ? + SMLAL r8, r9, r6, r10 ; (r8, r9) += s0*T[0] RSB r6, r6, #0 - @ stall ? - SMULL r8, r12,r7, r10 @ (r8, r12) = s2*T[0] + ; stall ? + SMULL r8, r12,r7, r10 ; (r8, r12) = s2*T[0] MOV r9, r9, LSL #1 - @ stall ? - SMLAL r8, r12,r6, r11 @ (r8, r12) -= s0*T[1] - STR r9, [r4],#-16 @ aX[0] = r9 + ; stall ? + SMLAL r8, r12,r6, r11 ; (r8, r12) -= s0*T[1] + STR r9, [r4],#-16 ; aX[0] = r9 CMP r4,r14 MOV r12,r12,LSL #1 - STR r12,[r4,#8+16] @ aX[2] = r12 + STR r12,[r4,#8+16] ; aX[2] = r12 - BGE presymmetry_loop1 @ while (aX >= in+n4) + BGE presymmetry_loop1 ; while (aX >= in+n4) -presymmetry_loop2: - LDR r6,[r4] @ r6 = s0 = aX[0] - LDR r10,[r5,#4] @ r10= T[1] - LDR r7,[r4,#8] @ r6 = s2 = aX[2] - LDR r11,[r5],-r2,LSL #2 @ r11= T[0] T -= step +presymmetry_loop2 + LDR r6,[r4] ; r6 = s0 = aX[0] + LDR r10,[r5,#4] ; r10= T[1] + LDR r7,[r4,#8] ; r6 = s2 = aX[2] + LDR r11,[r5],-r2,LSL #2 ; r11= T[0] T -= step - @ XPROD31(s0, s2, T[1], T[0], 0xaX[0], &ax[2]) - SMULL r8, r9, r6, r10 @ (r8, r9) = s0*T[1] - @ stall - @ stall ? - SMLAL r8, r9, r7, r11 @ (r8, r9) += s2*T[0] + ; XPROD31(s0, s2, T[1], T[0], &aX[0], &ax[2]) + SMULL r8, r9, r6, r10 ; (r8, r9) = s0*T[1] + ; stall + ; stall ? 
+ SMLAL r8, r9, r7, r11 ; (r8, r9) += s2*T[0] RSB r6, r6, #0 - @ stall ? - SMULL r8, r12,r7, r10 @ (r8, r12) = s2*T[1] + ; stall ? + SMULL r8, r12,r7, r10 ; (r8, r12) = s2*T[1] MOV r9, r9, LSL #1 - @ stall ? - SMLAL r8, r12,r6, r11 @ (r8, r12) -= s0*T[0] - STR r9, [r4],#-16 @ aX[0] = r9 + ; stall ? + SMLAL r8, r12,r6, r11 ; (r8, r12) -= s0*T[0] + STR r9, [r4],#-16 ; aX[0] = r9 CMP r4,r1 MOV r12,r12,LSL #1 - STR r12,[r4,#8+16] @ aX[2] = r12 + STR r12,[r4,#8+16] ; aX[2] = r12 - BGE presymmetry_loop2 @ while (aX >= in) + BGE presymmetry_loop2 ; while (aX >= in) - @ r0 = n - @ r1 = in - @ r2 = step - @ r3 = shift + ; r0 = n + ; r1 = in + ; r2 = step + ; r3 = shift STMFD r13!,{r3} - LDR r5, =sincos_lookup0 @ r5 = T=sincos_lookup0 - ADD r4, r1, r0, LSL #1 @ r4 = aX = in+(n>>1) - SUB r4, r4, #4*4 @ r4 = aX = in+(n>>1)-4 - LDR r11,[r5,#4] @ r11= T[1] - LDR r10,[r5],r2, LSL #2 @ r10= T[0] T += step -presymmetry_loop3: - LDR r8,[r1],#16 @ r8 = ro0 = bX[0] - LDR r9,[r1,#8-16] @ r9 = ro2 = bX[2] - LDR r6,[r4] @ r6 = ri0 = aX[0] - - @ XNPROD31( ro2, ro0, T[1], T[0], 0xaX[0], &aX[2] ) - @ aX[0] = (ro2*T[1] - ro0*T[0])>>31 aX[2] = (ro0*T[1] + ro2*T[0])>>31 - SMULL r14,r12,r8, r11 @ (r14,r12) = ro0*T[1] - RSB r8,r8,#0 @ r8 = -ro0 - @ Stall ? - SMLAL r14,r12,r9, r10 @ (r14,r12) += ro2*T[0] - LDR r7,[r4,#8] @ r7 = ri2 = aX[2] - @ Stall ? - SMULL r14,r3, r9, r11 @ (r14,r3) = ro2*T[1] + LDR r5, =sincos_lookup0 ; r5 = T=sincos_lookup0 + ADD r4, r1, r0, LSL #1 ; r4 = aX = in+(n>>1) + SUB r4, r4, #4*4 ; r4 = aX = in+(n>>1)-4 + LDR r11,[r5,#4] ; r11= T[1] + LDR r10,[r5],r2, LSL #2 ; r10= T[0] T += step +presymmetry_loop3 + LDR r8,[r1],#16 ; r8 = ro0 = bX[0] + LDR r9,[r1,#8-16] ; r9 = ro2 = bX[2] + LDR r6,[r4] ; r6 = ri0 = aX[0] + + ; XNPROD31( ro2, ro0, T[1], T[0], &aX[0], &aX[2] ) + ; aX[0] = (ro2*T[1] - ro0*T[0])>>31 aX[2] = (ro0*T[1] + ro2*T[0])>>31 + SMULL r14,r12,r8, r11 ; (r14,r12) = ro0*T[1] + RSB r8,r8,#0 ; r8 = -ro0 + ; Stall ? 
+ SMLAL r14,r12,r9, r10 ; (r14,r12) += ro2*T[0] + LDR r7,[r4,#8] ; r7 = ri2 = aX[2] + ; Stall ? + SMULL r14,r3, r9, r11 ; (r14,r3) = ro2*T[1] MOV r12,r12,LSL #1 - LDR r11,[r5,#4] @ r11= T[1] - SMLAL r14,r3, r8, r10 @ (r14,r3) -= ro0*T[0] - LDR r10,[r5],r2, LSL #2 @ r10= T[0] T += step + LDR r11,[r5,#4] ; r11= T[1] + SMLAL r14,r3, r8, r10 ; (r14,r3) -= ro0*T[0] + LDR r10,[r5],r2, LSL #2 ; r10= T[0] T += step STR r12,[r4,#8] MOV r3, r3, LSL #1 STR r3, [r4],#-16 - @ XNPROD31( ri2, ri0, T[0], T[1], 0xbX[0], &bX[2] ) - @ bX[0] = (ri2*T[0] - ri0*T[1])>>31 bX[2] = (ri0*T[0] + ri2*T[1])>>31 - SMULL r14,r12,r6, r10 @ (r14,r12) = ri0*T[0] - RSB r6,r6,#0 @ r6 = -ri0 - @ stall ? - SMLAL r14,r12,r7, r11 @ (r14,r12) += ri2*T[1] - @ stall ? - @ stall ? - SMULL r14,r3, r7, r10 @ (r14,r3) = ri2*T[0] + ; XNPROD31( ri2, ri0, T[0], T[1], &bX[0], &bX[2] ) + ; bX[0] = (ri2*T[0] - ri0*T[1])>>31 bX[2] = (ri0*T[0] + ri2*T[1])>>31 + SMULL r14,r12,r6, r10 ; (r14,r12) = ri0*T[0] + RSB r6,r6,#0 ; r6 = -ri0 + ; stall ? + SMLAL r14,r12,r7, r11 ; (r14,r12) += ri2*T[1] + ; stall ? + ; stall ? + SMULL r14,r3, r7, r10 ; (r14,r3) = ri2*T[0] MOV r12,r12,LSL #1 - @ stall ? - SMLAL r14,r3, r6, r11 @ (r14,r3) -= ri0*T[1] + ; stall ? + SMLAL r14,r3, r6, r11 ; (r14,r3) -= ri0*T[1] CMP r4,r1 STR r12,[r1,#8-16] MOV r3, r3, LSL #1 @@ -380,571 +380,571 @@ presymmetry_loop3: BGE presymmetry_loop3 - SUB r1,r1,r0 @ r1 = in -= n>>2 (i.e. restore in) + SUB r1,r1,r0 ; r1 = in -= n>>2 (i.e. restore in) LDR r3,[r13] STR r2,[r13,#-4]! 
- @ mdct_butterflies - @ r0 = n = (points * 2) - @ r1 = in = x - @ r2 = i - @ r3 = shift + ; mdct_butterflies + ; r0 = n = (points * 2) + ; r1 = in = x + ; r2 = i + ; r3 = shift STMFD r13!,{r0-r1} - RSBS r4,r3,#6 @ r4 = stages = 7-shift then --stages + RSBS r4,r3,#6 ; r4 = stages = 7-shift then --stages LDR r5,=sincos_lookup0 BLE no_generics - MOV r14,#4 @ r14= 4 (i=0) - MOV r6, r14,LSL r3 @ r6 = (4<<i)<<shift -mdct_butterflies_loop1: - MOV r0, r0, LSR #1 @ r0 = points>>i = POINTS - MOV r2, r14,LSR #2 @ r2 = (1<<i)-j (j=0) + MOV r14,#4 ; r14= 4 (i=0) + MOV r6, r14,LSL r3 ; r6 = (4<<i)<<shift +mdct_butterflies_loop1 + MOV r0, r0, LSR #1 ; r0 = points>>i = POINTS + MOV r2, r14,LSR #2 ; r2 = (1<<i)-j (j=0) STMFD r13!,{r4,r14} -mdct_butterflies_loop2: - - @ mdct_butterfly_generic(x+POINTS*j, POINTS, 4<<(i+shift)) - @ mdct_butterfly_generic(r1, r0, r6) - @ r0 = points - @ r1 = x - @ preserve r2 (external loop counter) - @ preserve r3 - @ preserve r4 (external loop counter) - @ r5 = T = sincos_lookup0 - @ r6 = step - @ preserve r14 - - STR r2,[r13,#-4]! 
@ stack r2 - ADD r1,r1,r0,LSL #1 @ r1 = x2+4 = x + (POINTS>>1) - ADD r7,r1,r0,LSL #1 @ r7 = x1+4 = x + POINTS - ADD r12,r5,#1024*4 @ r12= sincos_lookup0+1024 - -mdct_bufferfly_generic_loop1: - LDMDB r7!,{r2,r3,r8,r11} @ r2 = x1[0] - @ r3 = x1[1] - @ r8 = x1[2] - @ r11= x1[3] x1 -= 4 - LDMDB r1!,{r4,r9,r10,r14} @ r4 = x2[0] - @ r9 = x2[1] - @ r10= x2[2] - @ r14= x2[3] x2 -= 4 - - SUB r2, r2, r3 @ r2 = s0 = x1[0] - x1[1] - ADD r3, r2, r3, LSL #1 @ r3 = x1[0] + x1[1] (-> x1[0]) - SUB r11,r11,r8 @ r11= s1 = x1[3] - x1[2] - ADD r8, r11,r8, LSL #1 @ r8 = x1[3] + x1[2] (-> x1[2]) - SUB r9, r9, r4 @ r9 = s2 = x2[1] - x2[0] - ADD r4, r9, r4, LSL #1 @ r4 = x2[1] + x2[0] (-> x1[1]) - SUB r14,r14,r10 @ r14= s3 = x2[3] - x2[2] - ADD r10,r14,r10,LSL #1 @ r10= x2[3] + x2[2] (-> x1[3]) +mdct_butterflies_loop2 + + ; mdct_butterfly_generic(x+POINTS*j, POINTS, 4<<(i+shift)) + ; mdct_butterfly_generic(r1, r0, r6) + ; r0 = points + ; r1 = x + ; preserve r2 (external loop counter) + ; preserve r3 + ; preserve r4 (external loop counter) + ; r5 = T = sincos_lookup0 + ; r6 = step + ; preserve r14 + + STR r2,[r13,#-4]! 
; stack r2 + ADD r1,r1,r0,LSL #1 ; r1 = x2+4 = x + (POINTS>>1) + ADD r7,r1,r0,LSL #1 ; r7 = x1+4 = x + POINTS + ADD r12,r5,#1024*4 ; r12= sincos_lookup0+1024 + +mdct_bufferfly_generic_loop1 + LDMDB r7!,{r2,r3,r8,r11} ; r2 = x1[0] + ; r3 = x1[1] + ; r8 = x1[2] + ; r11= x1[3] x1 -= 4 + LDMDB r1!,{r4,r9,r10,r14} ; r4 = x2[0] + ; r9 = x2[1] + ; r10= x2[2] + ; r14= x2[3] x2 -= 4 + + SUB r2, r2, r3 ; r2 = s0 = x1[0] - x1[1] + ADD r3, r2, r3, LSL #1 ; r3 = x1[0] + x1[1] (-> x1[0]) + SUB r11,r11,r8 ; r11= s1 = x1[3] - x1[2] + ADD r8, r11,r8, LSL #1 ; r8 = x1[3] + x1[2] (-> x1[2]) + SUB r9, r9, r4 ; r9 = s2 = x2[1] - x2[0] + ADD r4, r9, r4, LSL #1 ; r4 = x2[1] + x2[0] (-> x1[1]) + SUB r14,r14,r10 ; r14= s3 = x2[3] - x2[2] + ADD r10,r14,r10,LSL #1 ; r10= x2[3] + x2[2] (-> x1[3]) STMIA r7,{r3,r4,r8,r10} - @ r0 = points - @ r1 = x2 - @ r2 = s0 - @ r3 free - @ r4 free - @ r5 = T - @ r6 = step - @ r7 = x1 - @ r8 free - @ r9 = s2 - @ r10 free - @ r11= s1 - @ r12= limit - @ r14= s3 - - LDR r8, [r5,#4] @ r8 = T[1] - LDR r10,[r5],r6,LSL #2 @ r10= T[0] T += step - - @ XPROD31(s1, s0, T[0], T[1], &x2[0], &x2[2]) - @ x2[0] = (s1*T[0] + s0*T[1])>>31 x2[2] = (s0*T[0] - s1*T[1])>>31 - @ stall Xscale - SMULL r4, r3, r2, r8 @ (r4, r3) = s0*T[1] - SMLAL r4, r3, r11,r10 @ (r4, r3) += s1*T[0] + ; r0 = points + ; r1 = x2 + ; r2 = s0 + ; r3 free + ; r4 free + ; r5 = T + ; r6 = step + ; r7 = x1 + ; r8 free + ; r9 = s2 + ; r10 free + ; r11= s1 + ; r12= limit + ; r14= s3 + + LDR r8, [r5,#4] ; r8 = T[1] + LDR r10,[r5],r6,LSL #2 ; r10= T[0] T += step + + ; XPROD31(s1, s0, T[0], T[1], &x2[0], &x2[2]) + ; x2[0] = (s1*T[0] + s0*T[1])>>31 x2[2] = (s0*T[0] - s1*T[1])>>31 + ; stall Xscale + SMULL r4, r3, r2, r8 ; (r4, r3) = s0*T[1] + SMLAL r4, r3, r11,r10 ; (r4, r3) += s1*T[0] RSB r11,r11,#0 - SMULL r11,r4, r8, r11 @ (r11,r4) = -s1*T[1] - SMLAL r11,r4, r2, r10 @ (r11,r4) += s0*T[0] - MOV r2, r3, LSL #1 @ r2 = r3<<1 = Value for x2[0] - - @ XPROD31(s2, s3, T[0], T[1], &x2[1], &x2[3]) - @ x2[1] = (s2*T[0] + 
s3*T[1])>>31 x2[3] = (s3*T[0] - s2*T[1])>>31 - SMULL r11,r3, r9, r10 @ (r11,r3) = s2*T[0] - MOV r4, r4, LSL #1 @ r4 = r4<<1 = Value for x2[2] - SMLAL r11,r3, r14,r8 @ (r11,r3) += s3*T[1] + SMULL r11,r4, r8, r11 ; (r11,r4) = -s1*T[1] + SMLAL r11,r4, r2, r10 ; (r11,r4) += s0*T[0] + MOV r2, r3, LSL #1 ; r2 = r3<<1 = Value for x2[0] + + ; XPROD31(s2, s3, T[0], T[1], &x2[1], &x2[3]) + ; x2[1] = (s2*T[0] + s3*T[1])>>31 x2[3] = (s3*T[0] - s2*T[1])>>31 + SMULL r11,r3, r9, r10 ; (r11,r3) = s2*T[0] + MOV r4, r4, LSL #1 ; r4 = r4<<1 = Value for x2[2] + SMLAL r11,r3, r14,r8 ; (r11,r3) += s3*T[1] RSB r9, r9, #0 - SMULL r10,r11,r14,r10 @ (r10,r11) = s3*T[0] - MOV r3, r3, LSL #1 @ r3 = r3<<1 = Value for x2[1] - SMLAL r10,r11,r9,r8 @ (r10,r11) -= s2*T[1] + SMULL r10,r11,r14,r10 ; (r10,r11) = s3*T[0] + MOV r3, r3, LSL #1 ; r3 = r3<<1 = Value for x2[1] + SMLAL r10,r11,r9,r8 ; (r10,r11) -= s2*T[1] CMP r5, r12 - MOV r11,r11,LSL #1 @ r11= r11<<1 = Value for x2[3] + MOV r11,r11,LSL #1 ; r11= r11<<1 = Value for x2[3] STMIA r1,{r2,r3,r4,r11} BLT mdct_bufferfly_generic_loop1 SUB r12,r12,#1024*4 -mdct_bufferfly_generic_loop2: - LDMDB r7!,{r2,r3,r9,r10} @ r2 = x1[0] - @ r3 = x1[1] - @ r9 = x1[2] - @ r10= x1[3] x1 -= 4 - LDMDB r1!,{r4,r8,r11,r14} @ r4 = x2[0] - @ r8 = x2[1] - @ r11= x2[2] - @ r14= x2[3] x2 -= 4 - - SUB r2, r2, r3 @ r2 = s0 = x1[0] - x1[1] - ADD r3, r2, r3, LSL #1 @ r3 = x1[0] + x1[1] (-> x1[0]) - SUB r9, r9,r10 @ r9 = s1 = x1[2] - x1[3] - ADD r10,r9,r10, LSL #1 @ r10= x1[2] + x1[3] (-> x1[2]) - SUB r4, r4, r8 @ r4 = s2 = x2[0] - x2[1] - ADD r8, r4, r8, LSL #1 @ r8 = x2[0] + x2[1] (-> x1[1]) - SUB r14,r14,r11 @ r14= s3 = x2[3] - x2[2] - ADD r11,r14,r11,LSL #1 @ r11= x2[3] + x2[2] (-> x1[3]) +mdct_bufferfly_generic_loop2 + LDMDB r7!,{r2,r3,r9,r10} ; r2 = x1[0] + ; r3 = x1[1] + ; r9 = x1[2] + ; r10= x1[3] x1 -= 4 + LDMDB r1!,{r4,r8,r11,r14} ; r4 = x2[0] + ; r8 = x2[1] + ; r11= x2[2] + ; r14= x2[3] x2 -= 4 + + SUB r2, r2, r3 ; r2 = s0 = x1[0] - x1[1] + ADD r3, r2, r3, LSL #1 ; r3 
= x1[0] + x1[1] (-> x1[0]) + SUB r9, r9,r10 ; r9 = s1 = x1[2] - x1[3] + ADD r10,r9,r10, LSL #1 ; r10= x1[2] + x1[3] (-> x1[2]) + SUB r4, r4, r8 ; r4 = s2 = x2[0] - x2[1] + ADD r8, r4, r8, LSL #1 ; r8 = x2[0] + x2[1] (-> x1[1]) + SUB r14,r14,r11 ; r14= s3 = x2[3] - x2[2] + ADD r11,r14,r11,LSL #1 ; r11= x2[3] + x2[2] (-> x1[3]) STMIA r7,{r3,r8,r10,r11} - @ r0 = points - @ r1 = x2 - @ r2 = s0 - @ r3 free - @ r4 = s2 - @ r5 = T - @ r6 = step - @ r7 = x1 - @ r8 free - @ r9 = s1 - @ r10 free - @ r11 free - @ r12= limit - @ r14= s3 - - LDR r8, [r5,#4] @ r8 = T[1] - LDR r10,[r5],-r6,LSL #2 @ r10= T[0] T -= step - - @ XNPROD31(s0, s1, T[0], T[1], &x2[0], &x2[2]) - @ x2[0] = (s0*T[0] - s1*T[1])>>31 x2[2] = (s1*T[0] + s0*T[1])>>31 - @ stall Xscale - SMULL r3, r11,r2, r8 @ (r3, r11) = s0*T[1] - SMLAL r3, r11,r9, r10 @ (r3, r11) += s1*T[0] + ; r0 = points + ; r1 = x2 + ; r2 = s0 + ; r3 free + ; r4 = s2 + ; r5 = T + ; r6 = step + ; r7 = x1 + ; r8 free + ; r9 = s1 + ; r10 free + ; r11 free + ; r12= limit + ; r14= s3 + + LDR r8, [r5,#4] ; r8 = T[1] + LDR r10,[r5],-r6,LSL #2 ; r10= T[0] T -= step + + ; XNPROD31(s0, s1, T[0], T[1], &x2[0], &x2[2]) + ; x2[0] = (s0*T[0] - s1*T[1])>>31 x2[2] = (s1*T[0] + s0*T[1])>>31 + ; stall Xscale + SMULL r3, r11,r2, r8 ; (r3, r11) = s0*T[1] + SMLAL r3, r11,r9, r10 ; (r3, r11) += s1*T[0] RSB r9, r9, #0 - SMULL r3, r2, r10,r2 @ (r3, r2) = s0*T[0] - SMLAL r3, r2, r9, r8 @ (r3, r2) += -s1*T[1] - MOV r9, r11,LSL #1 @ r9 = r11<<1 = Value for x2[2] - - @ XNPROD31(s3, s2, T[0], T[1], &x2[1], &x2[3]) - @ x2[1] = (s3*T[0] - s2*T[1])>>31 x2[3] = (s2*T[0] + s3*T[1])>>31 - SMULL r3, r11,r4, r10 @ (r3,r11) = s2*T[0] - MOV r2, r2, LSL #1 @ r2 = r2<<1 = Value for x2[0] - SMLAL r3, r11,r14,r8 @ (r3,r11) += s3*T[1] + SMULL r3, r2, r10,r2 ; (r3, r2) = s0*T[0] + SMLAL r3, r2, r9, r8 ; (r3, r2) += -s1*T[1] + MOV r9, r11,LSL #1 ; r9 = r11<<1 = Value for x2[2] + + ; XNPROD31(s3, s2, T[0], T[1], &x2[1], &x2[3]) + ; x2[1] = (s3*T[0] - s2*T[1])>>31 x2[3] = (s2*T[0] + 
s3*T[1])>>31 + SMULL r3, r11,r4, r10 ; (r3,r11) = s2*T[0] + MOV r2, r2, LSL #1 ; r2 = r2<<1 = Value for x2[0] + SMLAL r3, r11,r14,r8 ; (r3,r11) += s3*T[1] RSB r4, r4, #0 - SMULL r10,r3,r14,r10 @ (r10,r3) = s3*T[0] - MOV r11,r11,LSL #1 @ r11= r11<<1 = Value for x2[3] - SMLAL r10,r3, r4, r8 @ (r10,r3) -= s2*T[1] + SMULL r10,r3,r14,r10 ; (r10,r3) = s3*T[0] + MOV r11,r11,LSL #1 ; r11= r11<<1 = Value for x2[3] + SMLAL r10,r3, r4, r8 ; (r10,r3) -= s2*T[1] CMP r5, r12 - MOV r3, r3, LSL #1 @ r3 = r3<<1 = Value for x2[1] + MOV r3, r3, LSL #1 ; r3 = r3<<1 = Value for x2[1] STMIA r1,{r2,r3,r9,r11} BGT mdct_bufferfly_generic_loop2 - LDR r2,[r13],#4 @ unstack r2 - ADD r1, r1, r0, LSL #2 @ r1 = x+POINTS*j - @ stall Xscale - SUBS r2, r2, #1 @ r2-- (j++) + LDR r2,[r13],#4 ; unstack r2 + ADD r1, r1, r0, LSL #2 ; r1 = x+POINTS*j + ; stall Xscale + SUBS r2, r2, #1 ; r2-- (j++) BGT mdct_butterflies_loop2 LDMFD r13!,{r4,r14} LDR r1,[r13,#4] - SUBS r4, r4, #1 @ stages-- - MOV r14,r14,LSL #1 @ r14= 4<<i (i++) - MOV r6, r6, LSL #1 @ r6 = step <<= 1 (i++) + SUBS r4, r4, #1 ; stages-- + MOV r14,r14,LSL #1 ; r14= 4<<i (i++) + MOV r6, r6, LSL #1 ; r6 = step <<= 1 (i++) BGE mdct_butterflies_loop1 LDMFD r13,{r0-r1} -no_generics: - @ mdct_butterflies part2 (loop around mdct_bufferfly_32) - @ r0 = points - @ r1 = in - @ r2 = step - @ r3 = shift - -mdct_bufferflies_loop3: - @ mdct_bufferfly_32 - - @ block1 - ADD r4, r1, #16*4 @ r4 = &in[16] - LDMIA r4,{r5,r6,r9,r10} @ r5 = x[16] - @ r6 = x[17] - @ r9 = x[18] - @ r10= x[19] - LDMIA r1,{r7,r8,r11,r12} @ r7 = x[0] - @ r8 = x[1] - @ r11= x[2] - @ r12= x[3] - SUB r5, r5, r6 @ r5 = s0 = x[16] - x[17] - ADD r6, r5, r6, LSL #1 @ r6 = x[16] + x[17] -> x[16] - SUB r9, r9, r10 @ r9 = s1 = x[18] - x[19] - ADD r10,r9, r10,LSL #1 @ r10= x[18] + x[19] -> x[18] - SUB r8, r8, r7 @ r8 = s2 = x[ 1] - x[ 0] - ADD r7, r8, r7, LSL #1 @ r7 = x[ 1] + x[ 0] -> x[17] - SUB r12,r12,r11 @ r12= s3 = x[ 3] - x[ 2] - ADD r11,r12,r11, LSL #1 @ r11= x[ 3] + x[ 2] -> x[19] 
+no_generics + ; mdct_butterflies part2 (loop around mdct_bufferfly_32) + ; r0 = points + ; r1 = in + ; r2 = step + ; r3 = shift + +mdct_bufferflies_loop3 + ; mdct_bufferfly_32 + + ; block1 + ADD r4, r1, #16*4 ; r4 = &in[16] + LDMIA r4,{r5,r6,r9,r10} ; r5 = x[16] + ; r6 = x[17] + ; r9 = x[18] + ; r10= x[19] + LDMIA r1,{r7,r8,r11,r12} ; r7 = x[0] + ; r8 = x[1] + ; r11= x[2] + ; r12= x[3] + SUB r5, r5, r6 ; r5 = s0 = x[16] - x[17] + ADD r6, r5, r6, LSL #1 ; r6 = x[16] + x[17] -> x[16] + SUB r9, r9, r10 ; r9 = s1 = x[18] - x[19] + ADD r10,r9, r10,LSL #1 ; r10= x[18] + x[19] -> x[18] + SUB r8, r8, r7 ; r8 = s2 = x[ 1] - x[ 0] + ADD r7, r8, r7, LSL #1 ; r7 = x[ 1] + x[ 0] -> x[17] + SUB r12,r12,r11 ; r12= s3 = x[ 3] - x[ 2] + ADD r11,r12,r11, LSL #1 ; r11= x[ 3] + x[ 2] -> x[19] STMIA r4!,{r6,r7,r10,r11} LDR r6,cPI1_8 LDR r7,cPI3_8 - @ XNPROD31( s0, s1, cPI3_8, cPI1_8, &x[ 0], &x[ 2] ) - @ x[0] = s0*cPI3_8 - s1*cPI1_8 x[2] = s1*cPI3_8 + s0*cPI1_8 - @ stall Xscale - SMULL r14,r11,r5, r6 @ (r14,r11) = s0*cPI1_8 - SMLAL r14,r11,r9, r7 @ (r14,r11) += s1*cPI3_8 + ; XNPROD31( s0, s1, cPI3_8, cPI1_8, &x[ 0], &x[ 2] ) + ; x[0] = s0*cPI3_8 - s1*cPI1_8 x[2] = s1*cPI3_8 + s0*cPI1_8 + ; stall Xscale + SMULL r14,r11,r5, r6 ; (r14,r11) = s0*cPI1_8 + SMLAL r14,r11,r9, r7 ; (r14,r11) += s1*cPI3_8 RSB r9, r9, #0 - SMULL r14,r5, r7, r5 @ (r14,r5) = s0*cPI3_8 - SMLAL r14,r5, r9, r6 @ (r14,r5) -= s1*cPI1_8 + SMULL r14,r5, r7, r5 ; (r14,r5) = s0*cPI3_8 + SMLAL r14,r5, r9, r6 ; (r14,r5) -= s1*cPI1_8 MOV r11,r11,LSL #1 MOV r5, r5, LSL #1 - @ XPROD31 ( s2, s3, cPI1_8, cPI3_8, &x[ 1], &x[ 3] ) - @ x[1] = s2*cPI1_8 + s3*cPI3_8 x[3] = s3*cPI1_8 - s2*cPI3_8 - SMULL r14,r9, r8, r6 @ (r14,r9) = s2*cPI1_8 - SMLAL r14,r9, r12,r7 @ (r14,r9) += s3*cPI3_8 + ; XPROD31 ( s2, s3, cPI1_8, cPI3_8, &x[ 1], &x[ 3] ) + ; x[1] = s2*cPI1_8 + s3*cPI3_8 x[3] = s3*cPI1_8 - s2*cPI3_8 + SMULL r14,r9, r8, r6 ; (r14,r9) = s2*cPI1_8 + SMLAL r14,r9, r12,r7 ; (r14,r9) += s3*cPI3_8 RSB r8,r8,#0 - SMULL r14,r12,r6, r12 @ 
(r14,r12) = s3*cPI1_8 - SMLAL r14,r12,r8, r7 @ (r14,r12) -= s2*cPI3_8 + SMULL r14,r12,r6, r12 ; (r14,r12) = s3*cPI1_8 + SMLAL r14,r12,r8, r7 ; (r14,r12) -= s2*cPI3_8 MOV r9, r9, LSL #1 MOV r12,r12,LSL #1 STMIA r1!,{r5,r9,r11,r12} - @ block2 - LDMIA r4,{r5,r6,r9,r10} @ r5 = x[20] - @ r6 = x[21] - @ r9 = x[22] - @ r10= x[23] - LDMIA r1,{r7,r8,r11,r12} @ r7 = x[4] - @ r8 = x[5] - @ r11= x[6] - @ r12= x[7] - SUB r5, r5, r6 @ r5 = s0 = x[20] - x[21] - ADD r6, r5, r6, LSL #1 @ r6 = x[20] + x[21] -> x[20] - SUB r9, r9, r10 @ r9 = s1 = x[22] - x[23] - ADD r10,r9, r10,LSL #1 @ r10= x[22] + x[23] -> x[22] - SUB r8, r8, r7 @ r8 = s2 = x[ 5] - x[ 4] - ADD r7, r8, r7, LSL #1 @ r7 = x[ 5] + x[ 4] -> x[21] - SUB r12,r12,r11 @ r12= s3 = x[ 7] - x[ 6] - ADD r11,r12,r11, LSL #1 @ r11= x[ 7] + x[ 6] -> x[23] + ; block2 + LDMIA r4,{r5,r6,r9,r10} ; r5 = x[20] + ; r6 = x[21] + ; r9 = x[22] + ; r10= x[23] + LDMIA r1,{r7,r8,r11,r12} ; r7 = x[4] + ; r8 = x[5] + ; r11= x[6] + ; r12= x[7] + SUB r5, r5, r6 ; r5 = s0 = x[20] - x[21] + ADD r6, r5, r6, LSL #1 ; r6 = x[20] + x[21] -> x[20] + SUB r9, r9, r10 ; r9 = s1 = x[22] - x[23] + ADD r10,r9, r10,LSL #1 ; r10= x[22] + x[23] -> x[22] + SUB r8, r8, r7 ; r8 = s2 = x[ 5] - x[ 4] + ADD r7, r8, r7, LSL #1 ; r7 = x[ 5] + x[ 4] -> x[21] + SUB r12,r12,r11 ; r12= s3 = x[ 7] - x[ 6] + ADD r11,r12,r11, LSL #1 ; r11= x[ 7] + x[ 6] -> x[23] LDR r14,cPI2_8 STMIA r4!,{r6,r7,r10,r11} - SUB r5, r5, r9 @ r5 = s0 - s1 - ADD r9, r5, r9, LSL #1 @ r9 = s0 + s1 - SMULL r6, r5, r14,r5 @ (r6,r5) = (s0-s1)*cPI2_8 - SUB r12,r12,r8 @ r12= s3 - s2 - ADD r8, r12,r8, LSL #1 @ r8 = s3 + s2 + SUB r5, r5, r9 ; r5 = s0 - s1 + ADD r9, r5, r9, LSL #1 ; r9 = s0 + s1 + SMULL r6, r5, r14,r5 ; (r6,r5) = (s0-s1)*cPI2_8 + SUB r12,r12,r8 ; r12= s3 - s2 + ADD r8, r12,r8, LSL #1 ; r8 = s3 + s2 - SMULL r6, r8, r14,r8 @ (r6,r8) = (s3+s2)*cPI2_8 + SMULL r6, r8, r14,r8 ; (r6,r8) = (s3+s2)*cPI2_8 MOV r5, r5, LSL #1 - SMULL r6, r9, r14,r9 @ (r6,r9) = (s0+s1)*cPI2_8 + SMULL r6, r9, r14,r9 ; 
(r6,r9) = (s0+s1)*cPI2_8 MOV r8, r8, LSL #1 - SMULL r6, r12,r14,r12 @ (r6,r12) = (s3-s2)*cPI2_8 + SMULL r6, r12,r14,r12 ; (r6,r12) = (s3-s2)*cPI2_8 MOV r9, r9, LSL #1 MOV r12,r12,LSL #1 STMIA r1!,{r5,r8,r9,r12} - @ block3 - LDMIA r4,{r5,r6,r9,r10} @ r5 = x[24] - @ r6 = x[25] - @ r9 = x[25] - @ r10= x[26] - LDMIA r1,{r7,r8,r11,r12} @ r7 = x[8] - @ r8 = x[9] - @ r11= x[10] - @ r12= x[11] - SUB r5, r5, r6 @ r5 = s0 = x[24] - x[25] - ADD r6, r5, r6, LSL #1 @ r6 = x[24] + x[25] -> x[25] - SUB r9, r9, r10 @ r9 = s1 = x[26] - x[27] - ADD r10,r9, r10,LSL #1 @ r10= x[26] + x[27] -> x[26] - SUB r8, r8, r7 @ r8 = s2 = x[ 9] - x[ 8] - ADD r7, r8, r7, LSL #1 @ r7 = x[ 9] + x[ 8] -> x[25] - SUB r12,r12,r11 @ r12= s3 = x[11] - x[10] - ADD r11,r12,r11, LSL #1 @ r11= x[11] + x[10] -> x[27] + ; block3 + LDMIA r4,{r5,r6,r9,r10} ; r5 = x[24] + ; r6 = x[25] + ; r9 = x[25] + ; r10= x[26] + LDMIA r1,{r7,r8,r11,r12} ; r7 = x[8] + ; r8 = x[9] + ; r11= x[10] + ; r12= x[11] + SUB r5, r5, r6 ; r5 = s0 = x[24] - x[25] + ADD r6, r5, r6, LSL #1 ; r6 = x[24] + x[25] -> x[25] + SUB r9, r9, r10 ; r9 = s1 = x[26] - x[27] + ADD r10,r9, r10,LSL #1 ; r10= x[26] + x[27] -> x[26] + SUB r8, r8, r7 ; r8 = s2 = x[ 9] - x[ 8] + ADD r7, r8, r7, LSL #1 ; r7 = x[ 9] + x[ 8] -> x[25] + SUB r12,r12,r11 ; r12= s3 = x[11] - x[10] + ADD r11,r12,r11, LSL #1 ; r11= x[11] + x[10] -> x[27] STMIA r4!,{r6,r7,r10,r11} LDR r6,cPI3_8 LDR r7,cPI1_8 - @ XNPROD31( s0, s1, cPI1_8, cPI3_8, &x[ 8], &x[10] ) - @ x[8] = s0*cPI1_8 - s1*cPI3_8 x[10] = s1*cPI1_8 + s0*cPI3_8 - @ stall Xscale - SMULL r14,r11,r5, r6 @ (r14,r11) = s0*cPI3_8 - SMLAL r14,r11,r9, r7 @ (r14,r11) += s1*cPI1_8 + ; XNPROD31( s0, s1, cPI1_8, cPI3_8, &x[ 8], &x[10] ) + ; x[8] = s0*cPI1_8 - s1*cPI3_8 x[10] = s1*cPI1_8 + s0*cPI3_8 + ; stall Xscale + SMULL r14,r11,r5, r6 ; (r14,r11) = s0*cPI3_8 + SMLAL r14,r11,r9, r7 ; (r14,r11) += s1*cPI1_8 RSB r9, r9, #0 - SMULL r14,r5, r7, r5 @ (r14,r5) = s0*cPI1_8 - SMLAL r14,r5, r9, r6 @ (r14,r5) -= s1*cPI3_8 + SMULL r14,r5, r7, 
r5 ; (r14,r5) = s0*cPI1_8 + SMLAL r14,r5, r9, r6 ; (r14,r5) -= s1*cPI3_8 MOV r11,r11,LSL #1 MOV r5, r5, LSL #1 - @ XPROD31 ( s2, s3, cPI3_8, cPI1_8, &x[ 9], &x[11] ) - @ x[9] = s2*cPI3_8 + s3*cPI1_8 x[11] = s3*cPI3_8 - s2*cPI1_8 - SMULL r14,r9, r8, r6 @ (r14,r9) = s2*cPI3_8 - SMLAL r14,r9, r12,r7 @ (r14,r9) += s3*cPI1_8 + ; XPROD31 ( s2, s3, cPI3_8, cPI1_8, &x[ 9], &x[11] ) + ; x[9] = s2*cPI3_8 + s3*cPI1_8 x[11] = s3*cPI3_8 - s2*cPI1_8 + SMULL r14,r9, r8, r6 ; (r14,r9) = s2*cPI3_8 + SMLAL r14,r9, r12,r7 ; (r14,r9) += s3*cPI1_8 RSB r8,r8,#0 - SMULL r14,r12,r6, r12 @ (r14,r12) = s3*cPI3_8 - SMLAL r14,r12,r8, r7 @ (r14,r12) -= s2*cPI1_8 + SMULL r14,r12,r6, r12 ; (r14,r12) = s3*cPI3_8 + SMLAL r14,r12,r8, r7 ; (r14,r12) -= s2*cPI1_8 MOV r9, r9, LSL #1 MOV r12,r12,LSL #1 STMIA r1!,{r5,r9,r11,r12} - @ block4 - LDMIA r4,{r5,r6,r10,r11} @ r5 = x[28] - @ r6 = x[29] - @ r10= x[30] - @ r11= x[31] - LDMIA r1,{r8,r9,r12,r14} @ r8 = x[12] - @ r9 = x[13] - @ r12= x[14] - @ r14= x[15] - SUB r5, r5, r6 @ r5 = s0 = x[28] - x[29] - ADD r6, r5, r6, LSL #1 @ r6 = x[28] + x[29] -> x[28] - SUB r7, r14,r12 @ r7 = s3 = x[15] - x[14] - ADD r12,r7, r12, LSL #1 @ r12= x[15] + x[14] -> x[31] - SUB r10,r10,r11 @ r10= s1 = x[30] - x[31] - ADD r11,r10,r11,LSL #1 @ r11= x[30] + x[31] -> x[30] - SUB r14, r8, r9 @ r14= s2 = x[12] - x[13] - ADD r9, r14, r9, LSL #1 @ r9 = x[12] + x[13] -> x[29] + ; block4 + LDMIA r4,{r5,r6,r10,r11} ; r5 = x[28] + ; r6 = x[29] + ; r10= x[30] + ; r11= x[31] + LDMIA r1,{r8,r9,r12,r14} ; r8 = x[12] + ; r9 = x[13] + ; r12= x[14] + ; r14= x[15] + SUB r5, r5, r6 ; r5 = s0 = x[28] - x[29] + ADD r6, r5, r6, LSL #1 ; r6 = x[28] + x[29] -> x[28] + SUB r7, r14,r12 ; r7 = s3 = x[15] - x[14] + ADD r12,r7, r12, LSL #1 ; r12= x[15] + x[14] -> x[31] + SUB r10,r10,r11 ; r10= s1 = x[30] - x[31] + ADD r11,r10,r11,LSL #1 ; r11= x[30] + x[31] -> x[30] + SUB r14, r8, r9 ; r14= s2 = x[12] - x[13] + ADD r9, r14, r9, LSL #1 ; r9 = x[12] + x[13] -> x[29] STMIA r4!,{r6,r9,r11,r12} STMIA 
r1!,{r5,r7,r10,r14} - @ mdct_butterfly16 (1st version) - @ block 1 + ; mdct_butterfly16 (1st version) + ; block 1 SUB r1,r1,#16*4 ADD r4,r1,#8*4 - LDMIA r4,{r5,r6,r9,r10} @ r5 = x[ 8] - @ r6 = x[ 9] - @ r9 = x[10] - @ r10= x[11] - LDMIA r1,{r7,r8,r11,r12} @ r7 = x[0] - @ r8 = x[1] - @ r11= x[2] - @ r12= x[3] - SUB r5, r5, r6 @ r5 = s0 = x[ 8] - x[ 9] - ADD r6, r5, r6, LSL #1 @ r6 = x[ 8] + x[ 9] -> x[ 8] - SUB r9, r9, r10 @ r9 = s1 = x[10] - x[11] - ADD r10,r9, r10,LSL #1 @ r10= x[10] + x[11] -> x[10] - SUB r8, r8, r7 @ r8 = s2 = x[ 1] - x[ 0] - ADD r7, r8, r7, LSL #1 @ r7 = x[ 1] + x[ 0] -> x[ 9] - SUB r12,r12,r11 @ r12= s3 = x[ 3] - x[ 2] - ADD r11,r12,r11, LSL #1 @ r11= x[ 3] + x[ 2] -> x[11] + LDMIA r4,{r5,r6,r9,r10} ; r5 = x[ 8] + ; r6 = x[ 9] + ; r9 = x[10] + ; r10= x[11] + LDMIA r1,{r7,r8,r11,r12} ; r7 = x[0] + ; r8 = x[1] + ; r11= x[2] + ; r12= x[3] + SUB r5, r5, r6 ; r5 = s0 = x[ 8] - x[ 9] + ADD r6, r5, r6, LSL #1 ; r6 = x[ 8] + x[ 9] -> x[ 8] + SUB r9, r9, r10 ; r9 = s1 = x[10] - x[11] + ADD r10,r9, r10,LSL #1 ; r10= x[10] + x[11] -> x[10] + SUB r8, r8, r7 ; r8 = s2 = x[ 1] - x[ 0] + ADD r7, r8, r7, LSL #1 ; r7 = x[ 1] + x[ 0] -> x[ 9] + SUB r12,r12,r11 ; r12= s3 = x[ 3] - x[ 2] + ADD r11,r12,r11, LSL #1 ; r11= x[ 3] + x[ 2] -> x[11] LDR r14,cPI2_8 STMIA r4!,{r6,r7,r10,r11} - SUB r5, r5, r9 @ r5 = s0 - s1 - ADD r9, r5, r9, LSL #1 @ r9 = s0 + s1 - SMULL r6, r5, r14,r5 @ (r6,r5) = (s0-s1)*cPI2_8 - SUB r12,r12,r8 @ r12= s3 - s2 - ADD r8, r12,r8, LSL #1 @ r8 = s3 + s2 + SUB r5, r5, r9 ; r5 = s0 - s1 + ADD r9, r5, r9, LSL #1 ; r9 = s0 + s1 + SMULL r6, r5, r14,r5 ; (r6,r5) = (s0-s1)*cPI2_8 + SUB r12,r12,r8 ; r12= s3 - s2 + ADD r8, r12,r8, LSL #1 ; r8 = s3 + s2 - SMULL r6, r8, r14,r8 @ (r6,r8) = (s3+s2)*cPI2_8 + SMULL r6, r8, r14,r8 ; (r6,r8) = (s3+s2)*cPI2_8 MOV r5, r5, LSL #1 - SMULL r6, r9, r14,r9 @ (r6,r9) = (s0+s1)*cPI2_8 + SMULL r6, r9, r14,r9 ; (r6,r9) = (s0+s1)*cPI2_8 MOV r8, r8, LSL #1 - SMULL r6, r12,r14,r12 @ (r6,r12) = (s3-s2)*cPI2_8 + SMULL r6, 
r12,r14,r12 ; (r6,r12) = (s3-s2)*cPI2_8 MOV r9, r9, LSL #1 MOV r12,r12,LSL #1 STMIA r1!,{r5,r8,r9,r12} - @ block4 - LDMIA r4,{r5,r6,r9,r10} @ r5 = x[12] - @ r6 = x[13] - @ r9 = x[14] - @ r10= x[15] - LDMIA r1,{r7,r8,r11,r12} @ r7 = x[ 4] - @ r8 = x[ 5] - @ r11= x[ 6] - @ r12= x[ 7] - SUB r14,r7, r8 @ r14= s0 = x[ 4] - x[ 5] - ADD r8, r14,r8, LSL #1 @ r8 = x[ 4] + x[ 5] -> x[13] - SUB r7, r12,r11 @ r7 = s1 = x[ 7] - x[ 6] - ADD r11,r7, r11, LSL #1 @ r11= x[ 7] + x[ 6] -> x[15] - SUB r5, r5, r6 @ r5 = s2 = x[12] - x[13] - ADD r6, r5, r6, LSL #1 @ r6 = x[12] + x[13] -> x[12] - SUB r12,r9, r10 @ r12= s3 = x[14] - x[15] - ADD r10,r12,r10,LSL #1 @ r10= x[14] + x[15] -> x[14] + ; block4 + LDMIA r4,{r5,r6,r9,r10} ; r5 = x[12] + ; r6 = x[13] + ; r9 = x[14] + ; r10= x[15] + LDMIA r1,{r7,r8,r11,r12} ; r7 = x[ 4] + ; r8 = x[ 5] + ; r11= x[ 6] + ; r12= x[ 7] + SUB r14,r7, r8 ; r14= s0 = x[ 4] - x[ 5] + ADD r8, r14,r8, LSL #1 ; r8 = x[ 4] + x[ 5] -> x[13] + SUB r7, r12,r11 ; r7 = s1 = x[ 7] - x[ 6] + ADD r11,r7, r11, LSL #1 ; r11= x[ 7] + x[ 6] -> x[15] + SUB r5, r5, r6 ; r5 = s2 = x[12] - x[13] + ADD r6, r5, r6, LSL #1 ; r6 = x[12] + x[13] -> x[12] + SUB r12,r9, r10 ; r12= s3 = x[14] - x[15] + ADD r10,r12,r10,LSL #1 ; r10= x[14] + x[15] -> x[14] STMIA r4!,{r6,r8,r10,r11} STMIA r1!,{r5,r7,r12,r14} - @ mdct_butterfly_8 + ; mdct_butterfly_8 LDMDB r1,{r6,r7,r8,r9,r10,r11,r12,r14} - @ r6 = x[0] - @ r7 = x[1] - @ r8 = x[2] - @ r9 = x[3] - @ r10= x[4] - @ r11= x[5] - @ r12= x[6] - @ r14= x[7] - ADD r6, r6, r7 @ r6 = s0 = x[0] + x[1] - SUB r7, r6, r7, LSL #1 @ r7 = s1 = x[0] - x[1] - ADD r8, r8, r9 @ r8 = s2 = x[2] + x[3] - SUB r9, r8, r9, LSL #1 @ r9 = s3 = x[2] - x[3] - ADD r10,r10,r11 @ r10= s4 = x[4] + x[5] - SUB r11,r10,r11,LSL #1 @ r11= s5 = x[4] - x[5] - ADD r12,r12,r14 @ r12= s6 = x[6] + x[7] - SUB r14,r12,r14,LSL #1 @ r14= s7 = x[6] - x[7] - - ADD r2, r11,r9 @ r2 = x[0] = s5 + s3 - SUB r4, r2, r9, LSL #1 @ r4 = x[2] = s5 - s3 - SUB r3, r14,r7 @ r3 = x[1] = s7 - s1 - ADD r5, 
r3, r7, LSL #1 @ r5 = x[3] = s7 + s1 - SUB r10,r10,r6 @ r10= x[4] = s4 - s0 - SUB r11,r12,r8 @ r11= x[5] = s6 - s2 - ADD r12,r10,r6, LSL #1 @ r12= x[6] = s4 + s0 - ADD r14,r11,r8, LSL #1 @ r14= x[7] = s6 + s2 + ; r6 = x[0] + ; r7 = x[1] + ; r8 = x[2] + ; r9 = x[3] + ; r10= x[4] + ; r11= x[5] + ; r12= x[6] + ; r14= x[7] + ADD r6, r6, r7 ; r6 = s0 = x[0] + x[1] + SUB r7, r6, r7, LSL #1 ; r7 = s1 = x[0] - x[1] + ADD r8, r8, r9 ; r8 = s2 = x[2] + x[3] + SUB r9, r8, r9, LSL #1 ; r9 = s3 = x[2] - x[3] + ADD r10,r10,r11 ; r10= s4 = x[4] + x[5] + SUB r11,r10,r11,LSL #1 ; r11= s5 = x[4] - x[5] + ADD r12,r12,r14 ; r12= s6 = x[6] + x[7] + SUB r14,r12,r14,LSL #1 ; r14= s7 = x[6] - x[7] + + ADD r2, r11,r9 ; r2 = x[0] = s5 + s3 + SUB r4, r2, r9, LSL #1 ; r4 = x[2] = s5 - s3 + SUB r3, r14,r7 ; r3 = x[1] = s7 - s1 + ADD r5, r3, r7, LSL #1 ; r5 = x[3] = s7 + s1 + SUB r10,r10,r6 ; r10= x[4] = s4 - s0 + SUB r11,r12,r8 ; r11= x[5] = s6 - s2 + ADD r12,r10,r6, LSL #1 ; r12= x[6] = s4 + s0 + ADD r14,r11,r8, LSL #1 ; r14= x[7] = s6 + s2 STMDB r1,{r2,r3,r4,r5,r10,r11,r12,r14} - @ mdct_butterfly_8 + ; mdct_butterfly_8 LDMIA r1,{r6,r7,r8,r9,r10,r11,r12,r14} - @ r6 = x[0] - @ r7 = x[1] - @ r8 = x[2] - @ r9 = x[3] - @ r10= x[4] - @ r11= x[5] - @ r12= x[6] - @ r14= x[7] - ADD r6, r6, r7 @ r6 = s0 = x[0] + x[1] - SUB r7, r6, r7, LSL #1 @ r7 = s1 = x[0] - x[1] - ADD r8, r8, r9 @ r8 = s2 = x[2] + x[3] - SUB r9, r8, r9, LSL #1 @ r9 = s3 = x[2] - x[3] - ADD r10,r10,r11 @ r10= s4 = x[4] + x[5] - SUB r11,r10,r11,LSL #1 @ r11= s5 = x[4] - x[5] - ADD r12,r12,r14 @ r12= s6 = x[6] + x[7] - SUB r14,r12,r14,LSL #1 @ r14= s7 = x[6] - x[7] - - ADD r2, r11,r9 @ r2 = x[0] = s5 + s3 - SUB r4, r2, r9, LSL #1 @ r4 = x[2] = s5 - s3 - SUB r3, r14,r7 @ r3 = x[1] = s7 - s1 - ADD r5, r3, r7, LSL #1 @ r5 = x[3] = s7 + s1 - SUB r10,r10,r6 @ r10= x[4] = s4 - s0 - SUB r11,r12,r8 @ r11= x[5] = s6 - s2 - ADD r12,r10,r6, LSL #1 @ r12= x[6] = s4 + s0 - ADD r14,r11,r8, LSL #1 @ r14= x[7] = s6 + s2 + ; r6 = x[0] + ; r7 = x[1] + 
; r8 = x[2] + ; r9 = x[3] + ; r10= x[4] + ; r11= x[5] + ; r12= x[6] + ; r14= x[7] + ADD r6, r6, r7 ; r6 = s0 = x[0] + x[1] + SUB r7, r6, r7, LSL #1 ; r7 = s1 = x[0] - x[1] + ADD r8, r8, r9 ; r8 = s2 = x[2] + x[3] + SUB r9, r8, r9, LSL #1 ; r9 = s3 = x[2] - x[3] + ADD r10,r10,r11 ; r10= s4 = x[4] + x[5] + SUB r11,r10,r11,LSL #1 ; r11= s5 = x[4] - x[5] + ADD r12,r12,r14 ; r12= s6 = x[6] + x[7] + SUB r14,r12,r14,LSL #1 ; r14= s7 = x[6] - x[7] + + ADD r2, r11,r9 ; r2 = x[0] = s5 + s3 + SUB r4, r2, r9, LSL #1 ; r4 = x[2] = s5 - s3 + SUB r3, r14,r7 ; r3 = x[1] = s7 - s1 + ADD r5, r3, r7, LSL #1 ; r5 = x[3] = s7 + s1 + SUB r10,r10,r6 ; r10= x[4] = s4 - s0 + SUB r11,r12,r8 ; r11= x[5] = s6 - s2 + ADD r12,r10,r6, LSL #1 ; r12= x[6] = s4 + s0 + ADD r14,r11,r8, LSL #1 ; r14= x[7] = s6 + s2 STMIA r1,{r2,r3,r4,r5,r10,r11,r12,r14} - @ block 2 + ; block 2 ADD r1,r1,#16*4-8*4 ADD r4,r1,#8*4 - LDMIA r4,{r5,r6,r9,r10} @ r5 = x[ 8] - @ r6 = x[ 9] - @ r9 = x[10] - @ r10= x[11] - LDMIA r1,{r7,r8,r11,r12} @ r7 = x[0] - @ r8 = x[1] - @ r11= x[2] - @ r12= x[3] - SUB r5, r5, r6 @ r5 = s0 = x[ 8] - x[ 9] - ADD r6, r5, r6, LSL #1 @ r6 = x[ 8] + x[ 9] -> x[ 8] - SUB r9, r9, r10 @ r9 = s1 = x[10] - x[11] - ADD r10,r9, r10,LSL #1 @ r10= x[10] + x[11] -> x[10] - SUB r8, r8, r7 @ r8 = s2 = x[ 1] - x[ 0] - ADD r7, r8, r7, LSL #1 @ r7 = x[ 1] + x[ 0] -> x[ 9] - SUB r12,r12,r11 @ r12= s3 = x[ 3] - x[ 2] - ADD r11,r12,r11, LSL #1 @ r11= x[ 3] + x[ 2] -> x[11] + LDMIA r4,{r5,r6,r9,r10} ; r5 = x[ 8] + ; r6 = x[ 9] + ; r9 = x[10] + ; r10= x[11] + LDMIA r1,{r7,r8,r11,r12} ; r7 = x[0] + ; r8 = x[1] + ; r11= x[2] + ; r12= x[3] + SUB r5, r5, r6 ; r5 = s0 = x[ 8] - x[ 9] + ADD r6, r5, r6, LSL #1 ; r6 = x[ 8] + x[ 9] -> x[ 8] + SUB r9, r9, r10 ; r9 = s1 = x[10] - x[11] + ADD r10,r9, r10,LSL #1 ; r10= x[10] + x[11] -> x[10] + SUB r8, r8, r7 ; r8 = s2 = x[ 1] - x[ 0] + ADD r7, r8, r7, LSL #1 ; r7 = x[ 1] + x[ 0] -> x[ 9] + SUB r12,r12,r11 ; r12= s3 = x[ 3] - x[ 2] + ADD r11,r12,r11, LSL #1 ; r11= x[ 3] + x[ 2] 
-> x[11] LDR r14,cPI2_8 STMIA r4!,{r6,r7,r10,r11} - SUB r5, r5, r9 @ r5 = s0 - s1 - ADD r9, r5, r9, LSL #1 @ r9 = s0 + s1 - SMULL r6, r5, r14,r5 @ (r6,r5) = (s0-s1)*cPI2_8 - SUB r12,r12,r8 @ r12= s3 - s2 - ADD r8, r12,r8, LSL #1 @ r8 = s3 + s2 + SUB r5, r5, r9 ; r5 = s0 - s1 + ADD r9, r5, r9, LSL #1 ; r9 = s0 + s1 + SMULL r6, r5, r14,r5 ; (r6,r5) = (s0-s1)*cPI2_8 + SUB r12,r12,r8 ; r12= s3 - s2 + ADD r8, r12,r8, LSL #1 ; r8 = s3 + s2 - SMULL r6, r8, r14,r8 @ (r6,r8) = (s3+s2)*cPI2_8 + SMULL r6, r8, r14,r8 ; (r6,r8) = (s3+s2)*cPI2_8 MOV r5, r5, LSL #1 - SMULL r6, r9, r14,r9 @ (r6,r9) = (s0+s1)*cPI2_8 + SMULL r6, r9, r14,r9 ; (r6,r9) = (s0+s1)*cPI2_8 MOV r8, r8, LSL #1 - SMULL r6, r12,r14,r12 @ (r6,r12) = (s3-s2)*cPI2_8 + SMULL r6, r12,r14,r12 ; (r6,r12) = (s3-s2)*cPI2_8 MOV r9, r9, LSL #1 MOV r12,r12,LSL #1 STMIA r1!,{r5,r8,r9,r12} - @ block4 - LDMIA r4,{r5,r6,r9,r10} @ r5 = x[12] - @ r6 = x[13] - @ r9 = x[14] - @ r10= x[15] - LDMIA r1,{r7,r8,r11,r12} @ r7 = x[ 4] - @ r8 = x[ 5] - @ r11= x[ 6] - @ r12= x[ 7] - SUB r5, r5, r6 @ r5 = s2 = x[12] - x[13] - ADD r6, r5, r6, LSL #1 @ r6 = x[12] + x[13] -> x[12] - SUB r9, r9, r10 @ r9 = s3 = x[14] - x[15] - ADD r10,r9, r10,LSL #1 @ r10= x[14] + x[15] -> x[14] - SUB r14,r7, r8 @ r14= s0 = x[ 4] - x[ 5] - ADD r8, r14,r8, LSL #1 @ r8 = x[ 4] + x[ 5] -> x[13] - SUB r7, r12,r11 @ r7 = s1 = x[ 7] - x[ 6] - ADD r11,r7, r11, LSL #1 @ r11= x[ 7] + x[ 6] -> x[15] + ; block4 + LDMIA r4,{r5,r6,r9,r10} ; r5 = x[12] + ; r6 = x[13] + ; r9 = x[14] + ; r10= x[15] + LDMIA r1,{r7,r8,r11,r12} ; r7 = x[ 4] + ; r8 = x[ 5] + ; r11= x[ 6] + ; r12= x[ 7] + SUB r5, r5, r6 ; r5 = s2 = x[12] - x[13] + ADD r6, r5, r6, LSL #1 ; r6 = x[12] + x[13] -> x[12] + SUB r9, r9, r10 ; r9 = s3 = x[14] - x[15] + ADD r10,r9, r10,LSL #1 ; r10= x[14] + x[15] -> x[14] + SUB r14,r7, r8 ; r14= s0 = x[ 4] - x[ 5] + ADD r8, r14,r8, LSL #1 ; r8 = x[ 4] + x[ 5] -> x[13] + SUB r7, r12,r11 ; r7 = s1 = x[ 7] - x[ 6] + ADD r11,r7, r11, LSL #1 ; r11= x[ 7] + x[ 6] -> x[15] STMIA 
r4!,{r6,r8,r10,r11} STMIA r1!,{r5,r7,r9,r14} - @ mdct_butterfly_8 + ; mdct_butterfly_8 LDMDB r1,{r6,r7,r8,r9,r10,r11,r12,r14} - @ r6 = x[0] - @ r7 = x[1] - @ r8 = x[2] - @ r9 = x[3] - @ r10= x[4] - @ r11= x[5] - @ r12= x[6] - @ r14= x[7] - ADD r6, r6, r7 @ r6 = s0 = x[0] + x[1] - SUB r7, r6, r7, LSL #1 @ r7 = s1 = x[0] - x[1] - ADD r8, r8, r9 @ r8 = s2 = x[2] + x[3] - SUB r9, r8, r9, LSL #1 @ r9 = s3 = x[2] - x[3] - ADD r10,r10,r11 @ r10= s4 = x[4] + x[5] - SUB r11,r10,r11,LSL #1 @ r11= s5 = x[4] - x[5] - ADD r12,r12,r14 @ r12= s6 = x[6] + x[7] - SUB r14,r12,r14,LSL #1 @ r14= s7 = x[6] - x[7] - - ADD r2, r11,r9 @ r2 = x[0] = s5 + s3 - SUB r4, r2, r9, LSL #1 @ r4 = x[2] = s5 - s3 - SUB r3, r14,r7 @ r3 = x[1] = s7 - s1 - ADD r5, r3, r7, LSL #1 @ r5 = x[3] = s7 + s1 - SUB r10,r10,r6 @ r10= x[4] = s4 - s0 - SUB r11,r12,r8 @ r11= x[5] = s6 - s2 - ADD r12,r10,r6, LSL #1 @ r12= x[6] = s4 + s0 - ADD r14,r11,r8, LSL #1 @ r14= x[7] = s6 + s2 + ; r6 = x[0] + ; r7 = x[1] + ; r8 = x[2] + ; r9 = x[3] + ; r10= x[4] + ; r11= x[5] + ; r12= x[6] + ; r14= x[7] + ADD r6, r6, r7 ; r6 = s0 = x[0] + x[1] + SUB r7, r6, r7, LSL #1 ; r7 = s1 = x[0] - x[1] + ADD r8, r8, r9 ; r8 = s2 = x[2] + x[3] + SUB r9, r8, r9, LSL #1 ; r9 = s3 = x[2] - x[3] + ADD r10,r10,r11 ; r10= s4 = x[4] + x[5] + SUB r11,r10,r11,LSL #1 ; r11= s5 = x[4] - x[5] + ADD r12,r12,r14 ; r12= s6 = x[6] + x[7] + SUB r14,r12,r14,LSL #1 ; r14= s7 = x[6] - x[7] + + ADD r2, r11,r9 ; r2 = x[0] = s5 + s3 + SUB r4, r2, r9, LSL #1 ; r4 = x[2] = s5 - s3 + SUB r3, r14,r7 ; r3 = x[1] = s7 - s1 + ADD r5, r3, r7, LSL #1 ; r5 = x[3] = s7 + s1 + SUB r10,r10,r6 ; r10= x[4] = s4 - s0 + SUB r11,r12,r8 ; r11= x[5] = s6 - s2 + ADD r12,r10,r6, LSL #1 ; r12= x[6] = s4 + s0 + ADD r14,r11,r8, LSL #1 ; r14= x[7] = s6 + s2 STMDB r1,{r2,r3,r4,r5,r10,r11,r12,r14} - @ mdct_butterfly_8 + ; mdct_butterfly_8 LDMIA r1,{r6,r7,r8,r9,r10,r11,r12,r14} - @ r6 = x[0] - @ r7 = x[1] - @ r8 = x[2] - @ r9 = x[3] - @ r10= x[4] - @ r11= x[5] - @ r12= x[6] - @ r14= x[7] - 
ADD r6, r6, r7 @ r6 = s0 = x[0] + x[1] - SUB r7, r6, r7, LSL #1 @ r7 = s1 = x[0] - x[1] - ADD r8, r8, r9 @ r8 = s2 = x[2] + x[3] - SUB r9, r8, r9, LSL #1 @ r9 = s3 = x[2] - x[3] - ADD r10,r10,r11 @ r10= s4 = x[4] + x[5] - SUB r11,r10,r11,LSL #1 @ r11= s5 = x[4] - x[5] - ADD r12,r12,r14 @ r12= s6 = x[6] + x[7] - SUB r14,r12,r14,LSL #1 @ r14= s7 = x[6] - x[7] - - ADD r2, r11,r9 @ r2 = x[0] = s5 + s3 - SUB r4, r2, r9, LSL #1 @ r4 = x[2] = s5 - s3 - SUB r3, r14,r7 @ r3 = x[1] = s7 - s1 - ADD r5, r3, r7, LSL #1 @ r5 = x[3] = s7 + s1 - SUB r10,r10,r6 @ r10= x[4] = s4 - s0 - SUB r11,r12,r8 @ r11= x[5] = s6 - s2 - ADD r12,r10,r6, LSL #1 @ r12= x[6] = s4 + s0 - ADD r14,r11,r8, LSL #1 @ r14= x[7] = s6 + s2 + ; r6 = x[0] + ; r7 = x[1] + ; r8 = x[2] + ; r9 = x[3] + ; r10= x[4] + ; r11= x[5] + ; r12= x[6] + ; r14= x[7] + ADD r6, r6, r7 ; r6 = s0 = x[0] + x[1] + SUB r7, r6, r7, LSL #1 ; r7 = s1 = x[0] - x[1] + ADD r8, r8, r9 ; r8 = s2 = x[2] + x[3] + SUB r9, r8, r9, LSL #1 ; r9 = s3 = x[2] - x[3] + ADD r10,r10,r11 ; r10= s4 = x[4] + x[5] + SUB r11,r10,r11,LSL #1 ; r11= s5 = x[4] - x[5] + ADD r12,r12,r14 ; r12= s6 = x[6] + x[7] + SUB r14,r12,r14,LSL #1 ; r14= s7 = x[6] - x[7] + + ADD r2, r11,r9 ; r2 = x[0] = s5 + s3 + SUB r4, r2, r9, LSL #1 ; r4 = x[2] = s5 - s3 + SUB r3, r14,r7 ; r3 = x[1] = s7 - s1 + ADD r5, r3, r7, LSL #1 ; r5 = x[3] = s7 + s1 + SUB r10,r10,r6 ; r10= x[4] = s4 - s0 + SUB r11,r12,r8 ; r11= x[5] = s6 - s2 + ADD r12,r10,r6, LSL #1 ; r12= x[6] = s4 + s0 + ADD r14,r11,r8, LSL #1 ; r14= x[7] = s6 + s2 STMIA r1,{r2,r3,r4,r5,r10,r11,r12,r14} ADD r1,r1,#8*4 @@ -953,117 +953,117 @@ mdct_bufferflies_loop3: LDMFD r13,{r0-r3} -mdct_bitreverse_arm: - @ r0 = points = n - @ r1 = in - @ r2 = step - @ r3 = shift +mdct_bitreverse_arm + ; r0 = points = n + ; r1 = in + ; r2 = step + ; r3 = shift - MOV r4, #0 @ r4 = bit = 0 - ADD r5, r1, r0, LSL #1 @ r5 = w = x + (n>>1) + MOV r4, #0 ; r4 = bit = 0 + ADD r5, r1, r0, LSL #1 ; r5 = w = x + (n>>1) ADR r6, bitrev SUB r5, r5, #8 
-brev_lp: +brev_lp LDRB r7, [r6, r4, LSR #6] AND r8, r4, #0x3f LDRB r8, [r6, r8] - ADD r4, r4, #1 @ bit++ - @ stall XScale - ORR r7, r7, r8, LSL #6 @ r7 = bitrev[bit] + ADD r4, r4, #1 ; bit++ + ; stall XScale + ORR r7, r7, r8, LSL #6 ; r7 = bitrev[bit] MOV r7, r7, LSR r3 - ADD r9, r1, r7, LSL #2 @ r9 = xx = x + (b>>shift) - CMP r5, r9 @ if (w > xx) - LDR r10,[r5],#-8 @ r10 = w[0] w -= 2 - LDRGT r11,[r5,#12] @ r11 = w[1] - LDRGT r12,[r9] @ r12 = xx[0] - LDRGT r14,[r9,#4] @ r14 = xx[1] - STRGT r10,[r9] @ xx[0]= w[0] - STRGT r11,[r9,#4] @ xx[1]= w[1] - STRGT r12,[r5,#8] @ w[0] = xx[0] - STRGT r14,[r5,#12] @ w[1] = xx[1] + ADD r9, r1, r7, LSL #2 ; r9 = xx = x + (b>>shift) + CMP r5, r9 ; if (w > xx) + LDR r10,[r5],#-8 ; r10 = w[0] w -= 2 + LDRGT r11,[r5,#12] ; r11 = w[1] + LDRGT r12,[r9] ; r12 = xx[0] + LDRGT r14,[r9,#4] ; r14 = xx[1] + STRGT r10,[r9] ; xx[0]= w[0] + STRGT r11,[r9,#4] ; xx[1]= w[1] + STRGT r12,[r5,#8] ; w[0] = xx[0] + STRGT r14,[r5,#12] ; w[1] = xx[1] CMP r5,r1 BGT brev_lp - @ mdct_step7 - @ r0 = points - @ r1 = in - @ r2 = step - @ r3 = shift - - CMP r2, #4 @ r5 = T = (step>=4) ? - LDRGE r5, =sincos_lookup0 @ sincos_lookup0 + - LDRLT r5, =sincos_lookup1 @ sincos_lookup0 + - ADD r7, r1, r0, LSL #1 @ r7 = w1 = x + (n>>1) - ADDGE r5, r5, r2, LSL #1 @ (step>>1) - ADD r8, r5, #1024*4 @ r8 = Ttop -step7_loop1: - LDR r6, [r1] @ r6 = w0[0] - LDR r9, [r1,#4] @ r9 = w0[1] - LDR r10,[r7,#-8]! @ r10= w1[0] w1 -= 2 - LDR r11,[r7,#4] @ r11= w1[1] - LDR r14,[r5,#4] @ r14= T[1] - LDR r12,[r5],r2,LSL #2 @ r12= T[0] T += step - - ADD r6, r6, r10 @ r6 = s0 = w0[0] + w1[0] - SUB r10,r6, r10,LSL #1 @ r10= s1b= w0[0] - w1[0] - SUB r11,r11,r9 @ r11= s1 = w1[1] - w0[1] - ADD r9, r11,r9, LSL #1 @ r9 = s0b= w1[1] + w0[1] - - @ Can save 1 cycle by using SMULL SMLAL - at the cost of being - @ 1 off. 
- SMULL r0, r3, r6, r14 @ (r0,r3) = s0*T[1] - SMULL r0, r4, r11,r12 @ (r0,r4) += s1*T[0] = s2 + ; mdct_step7 + ; r0 = points + ; r1 = in + ; r2 = step + ; r3 = shift + + CMP r2, #4 ; r5 = T = (step>=4) ? + LDRGE r5, =sincos_lookup0 ; sincos_lookup0 + + LDRLT r5, =sincos_lookup1 ; sincos_lookup0 + + ADD r7, r1, r0, LSL #1 ; r7 = w1 = x + (n>>1) + ADDGE r5, r5, r2, LSL #1 ; (step>>1) + ADD r8, r5, #1024*4 ; r8 = Ttop +step7_loop1 + LDR r6, [r1] ; r6 = w0[0] + LDR r9, [r1,#4] ; r9 = w0[1] + LDR r10,[r7,#-8]! ; r10= w1[0] w1 -= 2 + LDR r11,[r7,#4] ; r11= w1[1] + LDR r14,[r5,#4] ; r14= T[1] + LDR r12,[r5],r2,LSL #2 ; r12= T[0] T += step + + ADD r6, r6, r10 ; r6 = s0 = w0[0] + w1[0] + SUB r10,r6, r10,LSL #1 ; r10= s1b= w0[0] - w1[0] + SUB r11,r11,r9 ; r11= s1 = w1[1] - w0[1] + ADD r9, r11,r9, LSL #1 ; r9 = s0b= w1[1] + w0[1] + + ; Can save 1 cycle by using SMULL SMLAL - at the cost of being + ; 1 off. + SMULL r0, r3, r6, r14 ; (r0,r3) = s0*T[1] + SMULL r0, r4, r11,r12 ; (r0,r4) += s1*T[0] = s2 ADD r3, r3, r4 - SMULL r0, r14,r11,r14 @ (r0,r14) = s1*T[1] - SMULL r0, r12,r6, r12 @ (r0,r12) += s0*T[0] = s3 + SMULL r0, r14,r11,r14 ; (r0,r14) = s1*T[1] + SMULL r0, r12,r6, r12 ; (r0,r12) += s0*T[0] = s3 SUB r14,r14,r12 - @ r9 = s0b<<1 - @ r10= s1b<<1 - ADD r9, r3, r9, ASR #1 @ r9 = s0b + s2 - SUB r3, r9, r3, LSL #1 @ r3 = s0b - s2 + ; r9 = s0b<<1 + ; r10= s1b<<1 + ADD r9, r3, r9, ASR #1 ; r9 = s0b + s2 + SUB r3, r9, r3, LSL #1 ; r3 = s0b - s2 - SUB r12,r14,r10,ASR #1 @ r12= s3 - s1b - ADD r10,r14,r10,ASR #1 @ r10= s3 + s1b + SUB r12,r14,r10,ASR #1 ; r12= s3 - s1b + ADD r10,r14,r10,ASR #1 ; r10= s3 + s1b STR r9, [r1],#4 - STR r10,[r1],#4 @ w0 += 2 + STR r10,[r1],#4 ; w0 += 2 STR r3, [r7] STR r12,[r7,#4] CMP r5,r8 BLT step7_loop1 -step7_loop2: - LDR r6, [r1] @ r6 = w0[0] - LDR r9, [r1,#4] @ r9 = w0[1] - LDR r10,[r7,#-8]! @ r10= w1[0] w1 -= 2 - LDR r11,[r7,#4] @ r11= w1[1] - LDR r14,[r5,-r2,LSL #2]! 
@ r12= T[1] T -= step - LDR r12,[r5,#4] @ r14= T[0] - - ADD r6, r6, r10 @ r6 = s0 = w0[0] + w1[0] - SUB r10,r6, r10,LSL #1 @ r10= s1b= w0[0] - w1[0] - SUB r11,r11,r9 @ r11= s1 = w1[1] - w0[1] - ADD r9, r11,r9, LSL #1 @ r9 = s0b= w1[1] + w0[1] - - @ Can save 1 cycle by using SMULL SMLAL - at the cost of being - @ 1 off. - SMULL r0, r3, r6, r14 @ (r0,r3) = s0*T[0] - SMULL r0, r4, r11,r12 @ (r0,r4) += s1*T[1] = s2 +step7_loop2 + LDR r6, [r1] ; r6 = w0[0] + LDR r9, [r1,#4] ; r9 = w0[1] + LDR r10,[r7,#-8]! ; r10= w1[0] w1 -= 2 + LDR r11,[r7,#4] ; r11= w1[1] + LDR r14,[r5,-r2,LSL #2]! ; r12= T[1] T -= step + LDR r12,[r5,#4] ; r14= T[0] + + ADD r6, r6, r10 ; r6 = s0 = w0[0] + w1[0] + SUB r10,r6, r10,LSL #1 ; r10= s1b= w0[0] - w1[0] + SUB r11,r11,r9 ; r11= s1 = w1[1] - w0[1] + ADD r9, r11,r9, LSL #1 ; r9 = s0b= w1[1] + w0[1] + + ; Can save 1 cycle by using SMULL SMLAL - at the cost of being + ; 1 off. + SMULL r0, r3, r6, r14 ; (r0,r3) = s0*T[0] + SMULL r0, r4, r11,r12 ; (r0,r4) += s1*T[1] = s2 ADD r3, r3, r4 - SMULL r0, r14,r11,r14 @ (r0,r14) = s1*T[0] - SMULL r0, r12,r6, r12 @ (r0,r12) += s0*T[1] = s3 + SMULL r0, r14,r11,r14 ; (r0,r14) = s1*T[0] + SMULL r0, r12,r6, r12 ; (r0,r12) += s0*T[1] = s3 SUB r14,r14,r12 - @ r9 = s0b<<1 - @ r10= s1b<<1 - ADD r9, r3, r9, ASR #1 @ r9 = s0b + s2 - SUB r3, r9, r3, LSL #1 @ r3 = s0b - s2 + ; r9 = s0b<<1 + ; r10= s1b<<1 + ADD r9, r3, r9, ASR #1 ; r9 = s0b + s2 + SUB r3, r9, r3, LSL #1 ; r3 = s0b - s2 - SUB r12,r14,r10,ASR #1 @ r12= s3 - s1b - ADD r10,r14,r10,ASR #1 @ r10= s3 + s1b + SUB r12,r14,r10,ASR #1 ; r12= s3 - s1b + ADD r10,r14,r10,ASR #1 ; r10= s3 + s1b STR r9, [r1],#4 - STR r10,[r1],#4 @ w0 += 2 + STR r10,[r1],#4 ; w0 += 2 STR r3, [r7] STR r12,[r7,#4] @@ -1072,116 +1072,116 @@ step7_loop2: LDMFD r13!,{r0-r3} - @ r0 = points - @ r1 = in - @ r2 = step - @ r3 = shift - MOV r2, r2, ASR #2 @ r2 = step >>= 2 + ; r0 = points + ; r1 = in + ; r2 = step + ; r3 = shift + MOV r2, r2, ASR #2 ; r2 = step >>= 2 CMP r2, #0 CMPNE r2, #1 BEQ 
mdct_end - @ step > 1 (default case) - CMP r2, #4 @ r5 = T = (step>=4) ? - LDRGE r5, =sincos_lookup0 @ sincos_lookup0 + - LDRLT r5, =sincos_lookup1 @ sincos_lookup1 - ADD r7, r1, r0, LSL #1 @ r7 = iX = x + (n>>1) - ADDGE r5, r5, r2, LSL #1 @ (step>>1) -mdct_step8_default: - LDR r6, [r1],#4 @ r6 = s0 = x[0] - LDR r8, [r1],#4 @ r8 = -s1 = x[1] - LDR r12,[r5,#4] @ r12= T[1] - LDR r14,[r5],r2,LSL #2 @ r14= T[0] T += step - RSB r8, r8, #0 @ r8 = s1 - - @ XPROD31(s0, s1, T[0], T[1], x, x+1) - @ x[0] = s0 * T[0] + s1 * T[1] x[1] = s1 * T[0] - s0 * T[1] - SMULL r9, r10, r8, r12 @ (r9,r10) = s1 * T[1] + ; step > 1 (default case) + CMP r2, #4 ; r5 = T = (step>=4) ? + LDRGE r5, =sincos_lookup0 ; sincos_lookup0 + + LDRLT r5, =sincos_lookup1 ; sincos_lookup1 + ADD r7, r1, r0, LSL #1 ; r7 = iX = x + (n>>1) + ADDGE r5, r5, r2, LSL #1 ; (step>>1) +mdct_step8_default + LDR r6, [r1],#4 ; r6 = s0 = x[0] + LDR r8, [r1],#4 ; r8 = -s1 = x[1] + LDR r12,[r5,#4] ; r12= T[1] + LDR r14,[r5],r2,LSL #2 ; r14= T[0] T += step + RSB r8, r8, #0 ; r8 = s1 + + ; XPROD31(s0, s1, T[0], T[1], x, x+1) + ; x[0] = s0 * T[0] + s1 * T[1] x[1] = s1 * T[0] - s0 * T[1] + SMULL r9, r10, r8, r12 ; (r9,r10) = s1 * T[1] CMP r1, r7 - SMLAL r9, r10, r6, r14 @ (r9,r10) += s0 * T[0] - RSB r6, r6, #0 @ r6 = -s0 - SMULL r9, r11, r8, r14 @ (r9,r11) = s1 * T[0] + SMLAL r9, r10, r6, r14 ; (r9,r10) += s0 * T[0] + RSB r6, r6, #0 ; r6 = -s0 + SMULL r9, r11, r8, r14 ; (r9,r11) = s1 * T[0] MOV r10,r10,LSL #1 - SMLAL r9, r11, r6, r12 @ (r9,r11) -= s0 * T[1] + SMLAL r9, r11, r6, r12 ; (r9,r11) -= s0 * T[1] STR r10,[r1,#-8] MOV r11,r11,LSL #1 STR r11,[r1,#-4] BLT mdct_step8_default -mdct_end: +mdct_end MOV r0, r2 LDMFD r13!,{r4-r11,PC} -cPI1_8: - .word 0x7641af3d -cPI2_8: - .word 0x5a82799a -cPI3_8: - .word 0x30fbc54d -bitrev: - .byte 0 - .byte 32 - .byte 16 - .byte 48 - .byte 8 - .byte 40 - .byte 24 - .byte 56 - .byte 4 - .byte 36 - .byte 20 - .byte 52 - .byte 12 - .byte 44 - .byte 28 - .byte 60 - .byte 2 - .byte 34 - .byte 18 - 
.byte 50 - .byte 10 - .byte 42 - .byte 26 - .byte 58 - .byte 6 - .byte 38 - .byte 22 - .byte 54 - .byte 14 - .byte 46 - .byte 30 - .byte 62 - .byte 1 - .byte 33 - .byte 17 - .byte 49 - .byte 9 - .byte 41 - .byte 25 - .byte 57 - .byte 5 - .byte 37 - .byte 21 - .byte 53 - .byte 13 - .byte 45 - .byte 29 - .byte 61 - .byte 3 - .byte 35 - .byte 19 - .byte 51 - .byte 11 - .byte 43 - .byte 27 - .byte 59 - .byte 7 - .byte 39 - .byte 23 - .byte 55 - .byte 15 - .byte 47 - .byte 31 - .byte 63 - - @ END +cPI1_8 + DCD 0x7641af3d +cPI2_8 + DCD 0x5a82799a +cPI3_8 + DCD 0x30fbc54d +bitrev + DCB 0 + DCB 32 + DCB 16 + DCB 48 + DCB 8 + DCB 40 + DCB 24 + DCB 56 + DCB 4 + DCB 36 + DCB 20 + DCB 52 + DCB 12 + DCB 44 + DCB 28 + DCB 60 + DCB 2 + DCB 34 + DCB 18 + DCB 50 + DCB 10 + DCB 42 + DCB 26 + DCB 58 + DCB 6 + DCB 38 + DCB 22 + DCB 54 + DCB 14 + DCB 46 + DCB 30 + DCB 62 + DCB 1 + DCB 33 + DCB 17 + DCB 49 + DCB 9 + DCB 41 + DCB 25 + DCB 57 + DCB 5 + DCB 37 + DCB 21 + DCB 53 + DCB 13 + DCB 45 + DCB 29 + DCB 61 + DCB 3 + DCB 35 + DCB 19 + DCB 51 + DCB 11 + DCB 43 + DCB 27 + DCB 59 + DCB 7 + DCB 39 + DCB 23 + DCB 55 + DCB 15 + DCB 47 + DCB 31 + DCB 63 + + END @@ -1,92 +1,92 @@ -@ Tremolo library -@ Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd +; Tremolo library +; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd - .text + AREA |.text|, CODE, READONLY - @ low accuracy version + ; low accuracy version - .global mdct_backward_arm_low - .global mdct_shift_right_arm_low - .global mdct_unroll_prelap_arm_low - .global mdct_unroll_part2_arm_low - .global mdct_unroll_part3_arm_low - .global mdct_unroll_postlap_arm_low + EXPORT mdct_backward_arm_low + EXPORT mdct_shift_right_arm_low + EXPORT mdct_unroll_prelap_arm_low + EXPORT mdct_unroll_part2_arm_low + EXPORT mdct_unroll_part3_arm_low + EXPORT mdct_unroll_postlap_arm_low - .extern sincos_lookup0 - .extern sincos_lookup1 + IMPORT sincos_lookup0 + IMPORT sincos_lookup1 -mdct_unroll_prelap_arm_low: - @ r0 = out - @ r1 = 
post - @ r2 = r - @ r3 = step +mdct_unroll_prelap_arm_low + ; r0 = out + ; r1 = post + ; r2 = r + ; r3 = step STMFD r13!,{r4-r7,r14} MVN r4, #0x8000 MOV r3, r3, LSL #1 - SUB r1, r2, r1 @ r1 = r - post - SUBS r1, r1, #16 @ r1 = r - post - 16 + SUB r1, r2, r1 ; r1 = r - post + SUBS r1, r1, #16 ; r1 = r - post - 16 BLT unroll_over -unroll_loop: +unroll_loop LDMDB r2!,{r5,r6,r7,r12} - MOV r5, r5, ASR #9 @ r5 = (*--r)>>9 - MOV r6, r6, ASR #9 @ r6 = (*--r)>>9 - MOV r7, r7, ASR #9 @ r7 = (*--r)>>9 - MOV r12,r12,ASR #9 @ r12= (*--r)>>9 + MOV r5, r5, ASR #9 ; r5 = (*--r)>>9 + MOV r6, r6, ASR #9 ; r6 = (*--r)>>9 + MOV r7, r7, ASR #9 ; r7 = (*--r)>>9 + MOV r12,r12,ASR #9 ; r12= (*--r)>>9 MOV r14,r12,ASR #15 - TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range EORNE r12,r4, r14,ASR #31 STRH r12,[r0], r3 MOV r14,r7, ASR #15 - TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range EORNE r7, r4, r14,ASR #31 STRH r7, [r0], r3 MOV r14,r6, ASR #15 - TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range EORNE r6, r4, r14,ASR #31 STRH r6, [r0], r3 MOV r14,r5, ASR #15 - TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range EORNE r5, r4, r14,ASR #31 STRH r5, [r0], r3 SUBS r1, r1, #16 BGE unroll_loop -unroll_over: +unroll_over ADDS r1, r1, #16 BLE unroll_end -unroll_loop2: +unroll_loop2 LDR r5,[r2,#-4]! 
- @ stall - @ stall (Xscale) - MOV r5, r5, ASR #9 @ r5 = (*--r)>>9 + ; stall + ; stall (Xscale) + MOV r5, r5, ASR #9 ; r5 = (*--r)>>9 MOV r14,r5, ASR #15 - TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range EORNE r5, r4, r14,ASR #31 STRH r5, [r0], r3 SUBS r1, r1, #4 BGT unroll_loop2 -unroll_end: +unroll_end LDMFD r13!,{r4-r7,PC} -mdct_unroll_postlap_arm_low: - @ r0 = out - @ r1 = post - @ r2 = l - @ r3 = step +mdct_unroll_postlap_arm_low + ; r0 = out + ; r1 = post + ; r2 = l + ; r3 = step STMFD r13!,{r4-r7,r14} MVN r4, #0x8000 MOV r3, r3, LSL #1 - SUB r1, r1, r2 @ r1 = post - l - MOV r1, r1, ASR #1 @ r1 = (post - l)>>1 - SUBS r1, r1, #16 @ r1 = ((post - l)>>1) - 4 + SUB r1, r1, r2 ; r1 = post - l + MOV r1, r1, ASR #1 ; r1 = (post - l)>>1 + SUBS r1, r1, #16 ; r1 = ((post - l)>>1) - 4 BLT unroll_over3 -unroll_loop3: +unroll_loop3 LDR r12,[r2],#8 LDR r7, [r2],#8 LDR r6, [r2],#8 @@ -97,145 +97,145 @@ unroll_loop3: RSB r6, r6, #0 RSB r7, r7, #0 - MOV r12, r12,ASR #9 @ r12= (-*l)>>9 - MOV r5, r5, ASR #9 @ r5 = (-*l)>>9 - MOV r6, r6, ASR #9 @ r6 = (-*l)>>9 - MOV r7, r7, ASR #9 @ r7 = (-*l)>>9 + MOV r12, r12,ASR #9 ; r12= (-*l)>>9 + MOV r5, r5, ASR #9 ; r5 = (-*l)>>9 + MOV r6, r6, ASR #9 ; r6 = (-*l)>>9 + MOV r7, r7, ASR #9 ; r7 = (-*l)>>9 MOV r14,r12,ASR #15 - TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range EORNE r12,r4, r14,ASR #31 STRH r12,[r0], r3 MOV r14,r7, ASR #15 - TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range EORNE r7, r4, r14,ASR #31 STRH r7, [r0], r3 MOV r14,r6, ASR #15 - TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range EORNE r6, r4, r14,ASR #31 STRH r6, [r0], r3 MOV r14,r5, ASR #15 - TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in 
range EORNE r5, r4, r14,ASR #31 STRH r5, [r0], r3 SUBS r1, r1, #16 BGE unroll_loop3 -unroll_over3: +unroll_over3 ADDS r1, r1, #16 BLE unroll_over4 -unroll_loop4: +unroll_loop4 LDR r5,[r2], #8 - @ stall - @ stall (Xscale) + ; stall + ; stall (Xscale) RSB r5, r5, #0 - MOV r5, r5, ASR #9 @ r5 = (-*l)>>9 + MOV r5, r5, ASR #9 ; r5 = (-*l)>>9 MOV r14,r5, ASR #15 - TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range EORNE r5, r4, r14,ASR #31 STRH r5, [r0], r3 SUBS r1, r1, #4 BGT unroll_loop4 -unroll_over4: +unroll_over4 LDMFD r13!,{r4-r7,PC} -mdct_unroll_part2_arm_low: - @ r0 = out - @ r1 = post - @ r2 = l - @ r3 = r - @ <> = step - @ <> = wL - @ <> = wR +mdct_unroll_part2_arm_low + ; r0 = out + ; r1 = post + ; r2 = l + ; r3 = r + ; <> = step + ; <> = wL + ; <> = wR MOV r12,r13 STMFD r13!,{r4,r6-r11,r14} - LDMFD r12,{r8,r9,r10} @ r8 = step - @ r9 = wL - @ r10= wR + LDMFD r12,{r8,r9,r10} ; r8 = step + ; r9 = wL + ; r10= wR MVN r4, #0x8000 MOV r8, r8, LSL #1 - SUBS r1, r3, r1 @ r1 = (r - post) + SUBS r1, r3, r1 ; r1 = (r - post) BLE unroll_over5 -unroll_loop5: - LDR r12,[r2, #-8]! @ r12= *l (but l -= 2 first) - LDR r7, [r3, #-4]! @ r7 = *--r - LDRB r6, [r10,#-1]! @ r6 = *--wR - LDRB r11,[r9],#1 @ r11= *wL++ +unroll_loop5 + LDR r12,[r2, #-8]! ; r12= *l (but l -= 2 first) + LDR r7, [r3, #-4]! ; r7 = *--r + LDRB r6, [r10,#-1]! 
; r6 = *--wR + LDRB r11,[r9],#1 ; r11= *wL++ MOV r12, r12, ASR #8 - @ Can save a cycle here, at the cost of 1bit errors in rounding - MUL r11,r12,r11 @ r11 = *l * *wL++ + ; Can save a cycle here, at the cost of 1bit errors in rounding + MUL r11,r12,r11 ; r11 = *l * *wL++ MOV r7, r7, ASR #8 - MLA r6, r7, r6, r11 @ r6 = *--r * *--wR + MLA r6, r7, r6, r11 ; r6 = *--r * *--wR MOV r6, r6, ASR #9 MOV r14,r6, ASR #15 - TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range EORNE r6, r4, r14,ASR #31 STRH r6, [r0], r8 SUBS r1, r1, #4 BGT unroll_loop5 -unroll_over5: +unroll_over5 LDMFD r13!,{r4,r6-r11,PC} -mdct_unroll_part3_arm_low: - @ r0 = out - @ r1 = post - @ r2 = l - @ r3 = r - @ <> = step - @ <> = wL - @ <> = wR +mdct_unroll_part3_arm_low + ; r0 = out + ; r1 = post + ; r2 = l + ; r3 = r + ; <> = step + ; <> = wL + ; <> = wR MOV r12,r13 STMFD r13!,{r4,r6-r11,r14} - LDMFD r12,{r8,r9,r10} @ r8 = step - @ r9 = wL - @ r10= wR + LDMFD r12,{r8,r9,r10} ; r8 = step + ; r9 = wL + ; r10= wR MVN r4, #0x8000 MOV r8, r8, LSL #1 - SUBS r1, r1, r3 @ r1 = (post - r) + SUBS r1, r1, r3 ; r1 = (post - r) BLE unroll_over6 -unroll_loop6: - LDR r12,[r2],#8 @ r12= *l (but l += 2 first) - LDR r7, [r3],#4 @ r7 = *r++ - LDRB r11,[r9],#1 @ r11= *wL++ - LDRB r6, [r10,#-1]! @ r6 = *--wR +unroll_loop6 + LDR r12,[r2],#8 ; r12= *l (but l += 2 first) + LDR r7, [r3],#4 ; r7 = *r++ + LDRB r11,[r9],#1 ; r11= *wL++ + LDRB r6, [r10,#-1]! 
; r6 = *--wR - @ Can save a cycle here, at the cost of 1bit errors in rounding + ; Can save a cycle here, at the cost of 1bit errors in rounding MOV r12,r12,ASR #8 - MUL r11,r12,r11 @ (r14,r11) = *l * *wL++ + MUL r11,r12,r11 ; (r14,r11) = *l * *wL++ MOV r7, r7, ASR #8 - MUL r6, r7, r6 @ (r14,r6) = *--r * *--wR + MUL r6, r7, r6 ; (r14,r6) = *--r * *--wR SUB r6, r6, r11 MOV r6, r6, ASR #9 MOV r14,r6, ASR #15 - TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range + TEQ r14,r14,ASR #31 ; if r14==0 || r14==-1 then in range EORNE r6, r4, r14,ASR #31 STRH r6, [r0], r8 SUBS r1, r1, #4 BGT unroll_loop6 -unroll_over6: +unroll_over6 LDMFD r13!,{r4,r6-r11,PC} -mdct_shift_right_arm_low: - @ r0 = n - @ r1 = in - @ r2 = right +mdct_shift_right_arm_low + ; r0 = n + ; r1 = in + ; r2 = right STMFD r13!,{r4-r11,r14} - MOV r0, r0, LSR #2 @ n >>= 2 + MOV r0, r0, LSR #2 ; n >>= 2 ADD r1, r1, #4 SUBS r0, r0, #8 BLT sr_less_than_8 -sr_loop: +sr_loop LDR r3, [r1], #8 LDR r4, [r1], #8 LDR r5, [r1], #8 @@ -247,225 +247,225 @@ sr_loop: SUBS r0, r0, #8 STMIA r2!,{r3,r4,r5,r6,r7,r8,r12,r14} BGE sr_loop -sr_less_than_8: +sr_less_than_8 ADDS r0, r0, #8 BEQ sr_end -sr_loop2: +sr_loop2 LDR r3, [r1], #8 SUBS r0, r0, #1 STR r3, [r2], #4 BGT sr_loop2 -sr_end: +sr_end LDMFD r13!,{r4-r11,PC} -mdct_backward_arm_low: - @ r0 = n - @ r1 = in +mdct_backward_arm_low + ; r0 = n + ; r1 = in STMFD r13!,{r4-r11,r14} - MOV r2, #1<<4 @ r2 = 1<<shift - MOV r3, #13-4 @ r3 = 13-shift -find_shift_loop: - TST r0, r2 @ if (n & (1<<shift)) == 0 + MOV r2, #1<<4 ; r2 = 1<<shift + MOV r3, #13-4 ; r3 = 13-shift +find_shift_loop + TST r0, r2 ; if (n & (1<<shift)) == 0 MOV r2, r2, LSL #1 - SUBEQ r3, r3, #1 @ shift-- + SUBEQ r3, r3, #1 ; shift-- BEQ find_shift_loop MOV r2, #2 - MOV r2, r2, LSL r3 @ r2 = step = 2<<shift - - @ presymmetry - @ r0 = n (a multiple of 4) - @ r1 = in - @ r2 = step - @ r3 = shift - - ADD r4, r1, r0, LSL #1 @ r4 = aX = in+(n>>1) - ADD r14,r1, r0 @ r14= in+(n>>2) - SUB r4, r4, #3*4 @ r4 = aX = in+n2-3 
- LDR r5, =sincos_lookup0 @ r5 = T=sincos_lookup0 - -presymmetry_loop1: - LDR r7, [r4,#8] @ r6 = s2 = aX[2] - LDRB r11,[r5,#1] @ r11= T[1] - LDR r6, [r4],#-16 @ r6 = s0 = aX[0] - LDRB r10,[r5],r2 @ r10= T[0] T += step + MOV r2, r2, LSL r3 ; r2 = step = 2<<shift + + ; presymmetry + ; r0 = n (a multiple of 4) + ; r1 = in + ; r2 = step + ; r3 = shift + + ADD r4, r1, r0, LSL #1 ; r4 = aX = in+(n>>1) + ADD r14,r1, r0 ; r14= in+(n>>2) + SUB r4, r4, #3*4 ; r4 = aX = in+n2-3 + LDR r5, =sincos_lookup0 ; r5 = T=sincos_lookup0 + +presymmetry_loop1 + LDR r7, [r4,#8] ; r6 = s2 = aX[2] + LDRB r11,[r5,#1] ; r11= T[1] + LDR r6, [r4],#-16 ; r6 = s0 = aX[0] + LDRB r10,[r5],r2 ; r10= T[0] T += step MOV r6, r6, ASR #8 MOV r7, r7, ASR #8 - @ XPROD31(s0, s2, T[0], T[1], 0xaX[0], &ax[2]) - MUL r9, r6, r10 @ r9 = s0*T[0] + ; XPROD31(s0, s2, T[0], T[1], &aX[0], &ax[2]) + MUL r9, r6, r10 ; r9 = s0*T[0] RSB r6, r6, #0 - MLA r9, r7, r11,r9 @ r9 += s2*T[1] + MLA r9, r7, r11,r9 ; r9 += s2*T[1] CMP r4, r14 - MUL r12,r7, r10 @ r12 = s2*T[0] - STR r9, [r4,#16] @ aX[0] = r9 - MLA r12,r6, r11,r12 @ r12 -= s0*T[1] - STR r12,[r4,#8+16] @ aX[2] = r12 - - BGE presymmetry_loop1 @ while (aX >= in+n4) - -presymmetry_loop2: - LDR r6, [r4],#-16 @ r6 = s0 = aX[0] - LDRB r10,[r5,#1] @ r10= T[1] - LDR r7, [r4,#16+8] @ r6 = s2 = aX[2] - LDRB r11,[r5],-r2 @ r11= T[0] T -= step + MUL r12,r7, r10 ; r12 = s2*T[0] + STR r9, [r4,#16] ; aX[0] = r9 + MLA r12,r6, r11,r12 ; r12 -= s0*T[1] + STR r12,[r4,#8+16] ; aX[2] = r12 + + BGE presymmetry_loop1 ; while (aX >= in+n4) + +presymmetry_loop2 + LDR r6, [r4],#-16 ; r6 = s0 = aX[0] + LDRB r10,[r5,#1] ; r10= T[1] + LDR r7, [r4,#16+8] ; r6 = s2 = aX[2] + LDRB r11,[r5],-r2 ; r11= T[0] T -= step MOV r6, r6, ASR #8 MOV r7, r7, ASR #8 - @ XPROD31(s0, s2, T[1], T[0], 0xaX[0], &ax[2]) - MUL r9, r6, r10 @ r9 = s0*T[1] + ; XPROD31(s0, s2, T[1], T[0], &aX[0], &ax[2]) + MUL r9, r6, r10 ; r9 = s0*T[1] RSB r6, r6, #0 - MLA r9, r7, r11,r9 @ r9 += s2*T[0] + MLA r9, r7, r11,r9 ; r9 += s2*T[0] 
CMP r4, r1 - MUL r12,r7, r10 @ r12 = s2*T[1] - STR r9, [r4,#16] @ aX[0] = r9 - MLA r12,r6, r11,r12 @ r12 -= s0*T[0] - STR r12,[r4,#8+16] @ aX[2] = r12 + MUL r12,r7, r10 ; r12 = s2*T[1] + STR r9, [r4,#16] ; aX[0] = r9 + MLA r12,r6, r11,r12 ; r12 -= s0*T[0] + STR r12,[r4,#8+16] ; aX[2] = r12 - BGE presymmetry_loop2 @ while (aX >= in) + BGE presymmetry_loop2 ; while (aX >= in) - @ r0 = n - @ r1 = in - @ r2 = step - @ r3 = shift + ; r0 = n + ; r1 = in + ; r2 = step + ; r3 = shift STMFD r13!,{r3} - LDR r5, =sincos_lookup0 @ r5 = T=sincos_lookup0 - ADD r4, r1, r0, LSL #1 @ r4 = aX = in+(n>>1) - SUB r4, r4, #4*4 @ r4 = aX = in+(n>>1)-4 - LDRB r11,[r5,#1] @ r11= T[1] - LDRB r10,[r5],r2 @ r10= T[0] T += step -presymmetry_loop3: - LDR r8, [r1],#16 @ r8 = ro0 = bX[0] - LDR r9, [r1,#8-16] @ r9 = ro2 = bX[2] - LDR r6, [r4],#-16 @ r6 = ri0 = aX[0] - LDR r7, [r4,#8+16] @ r7 = ri2 = aX[2] + LDR r5, =sincos_lookup0 ; r5 = T=sincos_lookup0 + ADD r4, r1, r0, LSL #1 ; r4 = aX = in+(n>>1) + SUB r4, r4, #4*4 ; r4 = aX = in+(n>>1)-4 + LDRB r11,[r5,#1] ; r11= T[1] + LDRB r10,[r5],r2 ; r10= T[0] T += step +presymmetry_loop3 + LDR r8, [r1],#16 ; r8 = ro0 = bX[0] + LDR r9, [r1,#8-16] ; r9 = ro2 = bX[2] + LDR r6, [r4],#-16 ; r6 = ri0 = aX[0] + LDR r7, [r4,#8+16] ; r7 = ri2 = aX[2] MOV r8, r8, ASR #8 MOV r9, r9, ASR #8 MOV r6, r6, ASR #8 - @ XNPROD31( ro2, ro0, T[1], T[0], 0xaX[0], &aX[2] ) - @ aX[0] = (ro2*T[1] - ro0*T[0])>>31 aX[2] = (ro0*T[1] + ro2*T[0])>>31 - MUL r12,r8, r11 @ r12 = ro0*T[1] + ; XNPROD31( ro2, ro0, T[1], T[0], &aX[0], &aX[2] ) + ; aX[0] = (ro2*T[1] - ro0*T[0])>>31 aX[2] = (ro0*T[1] + ro2*T[0])>>31 + MUL r12,r8, r11 ; r12 = ro0*T[1] MOV r7, r7, ASR #8 - MLA r12,r9, r10,r12 @ r12 += ro2*T[0] - RSB r8, r8, #0 @ r8 = -ro0 - MUL r3, r9, r11 @ r3 = ro2*T[1] - LDRB r11,[r5,#1] @ r11= T[1] - MLA r3, r8, r10,r3 @ r3 -= ro0*T[0] - LDRB r10,[r5],r2 @ r10= T[0] T += step + MLA r12,r9, r10,r12 ; r12 += ro2*T[0] + RSB r8, r8, #0 ; r8 = -ro0 + MUL r3, r9, r11 ; r3 = ro2*T[1] + LDRB 
r11,[r5,#1] ; r11= T[1] + MLA r3, r8, r10,r3 ; r3 -= ro0*T[0] + LDRB r10,[r5],r2 ; r10= T[0] T += step STR r12,[r4,#16+8] STR r3, [r4,#16] - @ XNPROD31( ri2, ri0, T[0], T[1], 0xbX[0], &bX[2] ) - @ bX[0] = (ri2*T[0] - ri0*T[1])>>31 bX[2] = (ri0*T[0] + ri2*T[1])>>31 - MUL r12,r6, r10 @ r12 = ri0*T[0] - RSB r6, r6, #0 @ r6 = -ri0 - MLA r12,r7, r11,r12 @ r12 += ri2*T[1] + ; XNPROD31( ri2, ri0, T[0], T[1], &bX[0], &bX[2] ) + ; bX[0] = (ri2*T[0] - ri0*T[1])>>31 bX[2] = (ri0*T[0] + ri2*T[1])>>31 + MUL r12,r6, r10 ; r12 = ri0*T[0] + RSB r6, r6, #0 ; r6 = -ri0 + MLA r12,r7, r11,r12 ; r12 += ri2*T[1] CMP r4, r1 - MUL r3, r7, r10 @ r3 = ri2*T[0] + MUL r3, r7, r10 ; r3 = ri2*T[0] STR r12,[r1,#8-16] - MLA r3, r6, r11,r3 @ r3 -= ri0*T[1] + MLA r3, r6, r11,r3 ; r3 -= ri0*T[1] STR r3, [r1,#-16] BGE presymmetry_loop3 - SUB r1,r1,r0 @ r1 = in -= n>>2 (i.e. restore in) + SUB r1,r1,r0 ; r1 = in -= n>>2 (i.e. restore in) LDR r3,[r13] STR r2,[r13,#-4]! - @ mdct_butterflies - @ r0 = n = (points * 2) - @ r1 = in = x - @ r2 = i - @ r3 = shift + ; mdct_butterflies + ; r0 = n = (points * 2) + ; r1 = in = x + ; r2 = i + ; r3 = shift STMFD r13!,{r0-r1} - RSBS r4,r3,#6 @ r4 = stages = 7-shift then --stages + RSBS r4,r3,#6 ; r4 = stages = 7-shift then --stages LDR r5,=sincos_lookup0 BLE no_generics - MOV r14,#4 @ r14= 4 (i=0) - MOV r6, r14,LSL r3 @ r6 = (4<<i)<<shift -mdct_butterflies_loop1: - MOV r0, r0, LSR #1 @ r0 = points>>i = POINTS - MOV r2, r14,LSR #2 @ r2 = (1<<i)-j (j=0) + MOV r14,#4 ; r14= 4 (i=0) + MOV r6, r14,LSL r3 ; r6 = (4<<i)<<shift +mdct_butterflies_loop1 + MOV r0, r0, LSR #1 ; r0 = points>>i = POINTS + MOV r2, r14,LSR #2 ; r2 = (1<<i)-j (j=0) STMFD r13!,{r4,r14} -mdct_butterflies_loop2: - - @ mdct_butterfly_generic(x+POINTS*j, POINTS, 4<<(i+shift)) - @ mdct_butterfly_generic(r1, r0, r6) - @ r0 = points - @ r1 = x - @ preserve r2 (external loop counter) - @ preserve r3 - @ preserve r4 (external loop counter) - @ r5 = T = sincos_lookup0 - @ r6 = step - @ preserve r14 - - STR 
r2,[r13,#-4]! @ stack r2 - ADD r1,r1,r0,LSL #1 @ r1 = x2+4 = x + (POINTS>>1) - ADD r7,r1,r0,LSL #1 @ r7 = x1+4 = x + POINTS - ADD r12,r5,#1024 @ r12= sincos_lookup0+1024 - -mdct_bufferfly_generic_loop1: - LDMDB r7!,{r2,r3,r8,r11} @ r2 = x1[0] - @ r3 = x1[1] - @ r8 = x1[2] - @ r11= x1[3] x1 -= 4 - LDMDB r1!,{r4,r9,r10,r14} @ r4 = x2[0] - @ r9 = x2[1] - @ r10= x2[2] - @ r14= x2[3] x2 -= 4 - - SUB r2, r2, r3 @ r2 = s0 = x1[0] - x1[1] - ADD r3, r2, r3, LSL #1 @ r3 = x1[0] + x1[1] (-> x1[0]) - SUB r11,r11,r8 @ r11= s1 = x1[3] - x1[2] - ADD r8, r11,r8, LSL #1 @ r8 = x1[3] + x1[2] (-> x1[2]) - SUB r9, r9, r4 @ r9 = s2 = x2[1] - x2[0] - ADD r4, r9, r4, LSL #1 @ r4 = x2[1] + x2[0] (-> x1[1]) - SUB r14,r14,r10 @ r14= s3 = x2[3] - x2[2] - ADD r10,r14,r10,LSL #1 @ r10= x2[3] + x2[2] (-> x1[3]) +mdct_butterflies_loop2 + + ; mdct_butterfly_generic(x+POINTS*j, POINTS, 4<<(i+shift)) + ; mdct_butterfly_generic(r1, r0, r6) + ; r0 = points + ; r1 = x + ; preserve r2 (external loop counter) + ; preserve r3 + ; preserve r4 (external loop counter) + ; r5 = T = sincos_lookup0 + ; r6 = step + ; preserve r14 + + STR r2,[r13,#-4]! 
; stack r2 + ADD r1,r1,r0,LSL #1 ; r1 = x2+4 = x + (POINTS>>1) + ADD r7,r1,r0,LSL #1 ; r7 = x1+4 = x + POINTS + ADD r12,r5,#1024 ; r12= sincos_lookup0+1024 + +mdct_bufferfly_generic_loop1 + LDMDB r7!,{r2,r3,r8,r11} ; r2 = x1[0] + ; r3 = x1[1] + ; r8 = x1[2] + ; r11= x1[3] x1 -= 4 + LDMDB r1!,{r4,r9,r10,r14} ; r4 = x2[0] + ; r9 = x2[1] + ; r10= x2[2] + ; r14= x2[3] x2 -= 4 + + SUB r2, r2, r3 ; r2 = s0 = x1[0] - x1[1] + ADD r3, r2, r3, LSL #1 ; r3 = x1[0] + x1[1] (-> x1[0]) + SUB r11,r11,r8 ; r11= s1 = x1[3] - x1[2] + ADD r8, r11,r8, LSL #1 ; r8 = x1[3] + x1[2] (-> x1[2]) + SUB r9, r9, r4 ; r9 = s2 = x2[1] - x2[0] + ADD r4, r9, r4, LSL #1 ; r4 = x2[1] + x2[0] (-> x1[1]) + SUB r14,r14,r10 ; r14= s3 = x2[3] - x2[2] + ADD r10,r14,r10,LSL #1 ; r10= x2[3] + x2[2] (-> x1[3]) STMIA r7,{r3,r4,r8,r10} - @ r0 = points - @ r1 = x2 - @ r2 = s0 - @ r3 free - @ r4 free - @ r5 = T - @ r6 = step - @ r7 = x1 - @ r8 free - @ r9 = s2 - @ r10 free - @ r11= s1 - @ r12= limit - @ r14= s3 - - LDRB r8, [r5,#1] @ r8 = T[1] - LDRB r10,[r5],r6 @ r10= T[0] T += step + ; r0 = points + ; r1 = x2 + ; r2 = s0 + ; r3 free + ; r4 free + ; r5 = T + ; r6 = step + ; r7 = x1 + ; r8 free + ; r9 = s2 + ; r10 free + ; r11= s1 + ; r12= limit + ; r14= s3 + + LDRB r8, [r5,#1] ; r8 = T[1] + LDRB r10,[r5],r6 ; r10= T[0] T += step MOV r2, r2, ASR #8 MOV r11,r11,ASR #8 MOV r9, r9, ASR #8 MOV r14,r14,ASR #8 - @ XPROD31(s1, s0, T[0], T[1], &x2[0], &x2[2]) - @ x2[0] = (s1*T[0] + s0*T[1])>>31 x2[2] = (s0*T[0] - s1*T[1])>>31 - @ stall Xscale - MUL r3, r2, r8 @ r3 = s0*T[1] - MLA r3, r11,r10,r3 @ r3 += s1*T[0] + ; XPROD31(s1, s0, T[0], T[1], &x2[0], &x2[2]) + ; x2[0] = (s1*T[0] + s0*T[1])>>31 x2[2] = (s0*T[0] - s1*T[1])>>31 + ; stall Xscale + MUL r3, r2, r8 ; r3 = s0*T[1] + MLA r3, r11,r10,r3 ; r3 += s1*T[0] RSB r11,r11,#0 - MUL r4, r8, r11 @ r4 = -s1*T[1] - MLA r4, r2, r10,r4 @ r4 += s0*T[0] = Value for x2[2] - MOV r2, r3 @ r2 = r3 = Value for x2[0] - - @ XPROD31(s2, s3, T[0], T[1], &x2[1], &x2[3]) - @ x2[1] = (s2*T[0] 
+ s3*T[1])>>31 x2[3] = (s3*T[0] - s2*T[1])>>31 - MUL r3, r9, r10 @ r3 = s2*T[0] - MLA r3, r14,r8, r3 @ r3 += s3*T[1] = Value for x2[1] + MUL r4, r8, r11 ; r4 = -s1*T[1] + MLA r4, r2, r10,r4 ; r4 += s0*T[0] = Value for x2[2] + MOV r2, r3 ; r2 = r3 = Value for x2[0] + + ; XPROD31(s2, s3, T[0], T[1], &x2[1], &x2[3]) + ; x2[1] = (s2*T[0] + s3*T[1])>>31 x2[3] = (s3*T[0] - s2*T[1])>>31 + MUL r3, r9, r10 ; r3 = s2*T[0] + MLA r3, r14,r8, r3 ; r3 += s3*T[1] = Value for x2[1] RSB r9, r9, #0 - MUL r11,r14,r10 @ r11 = s3*T[0] - MLA r11,r9, r8, r11 @ r11 -= s2*T[1] = Value for x2[3] + MUL r11,r14,r10 ; r11 = s3*T[0] + MLA r11,r9, r8, r11 ; r11 -= s2*T[1] = Value for x2[3] CMP r5, r12 STMIA r1,{r2,r3,r4,r11} @@ -473,472 +473,472 @@ mdct_bufferfly_generic_loop1: BLT mdct_bufferfly_generic_loop1 SUB r12,r12,#1024 -mdct_bufferfly_generic_loop2: - LDMDB r7!,{r2,r3,r9,r10} @ r2 = x1[0] - @ r3 = x1[1] - @ r9 = x1[2] - @ r10= x1[3] x1 -= 4 - LDMDB r1!,{r4,r8,r11,r14} @ r4 = x2[0] - @ r8 = x2[1] - @ r11= x2[2] - @ r14= x2[3] x2 -= 4 - - SUB r2, r2, r3 @ r2 = s0 = x1[0] - x1[1] - ADD r3, r2, r3, LSL #1 @ r3 = x1[0] + x1[1] (-> x1[0]) - SUB r9, r9,r10 @ r9 = s1 = x1[2] - x1[3] - ADD r10,r9,r10, LSL #1 @ r10= x1[2] + x1[3] (-> x1[2]) - SUB r4, r4, r8 @ r4 = s2 = x2[0] - x2[1] - ADD r8, r4, r8, LSL #1 @ r8 = x2[0] + x2[1] (-> x1[1]) - SUB r14,r14,r11 @ r14= s3 = x2[3] - x2[2] - ADD r11,r14,r11,LSL #1 @ r11= x2[3] + x2[2] (-> x1[3]) +mdct_bufferfly_generic_loop2 + LDMDB r7!,{r2,r3,r9,r10} ; r2 = x1[0] + ; r3 = x1[1] + ; r9 = x1[2] + ; r10= x1[3] x1 -= 4 + LDMDB r1!,{r4,r8,r11,r14} ; r4 = x2[0] + ; r8 = x2[1] + ; r11= x2[2] + ; r14= x2[3] x2 -= 4 + + SUB r2, r2, r3 ; r2 = s0 = x1[0] - x1[1] + ADD r3, r2, r3, LSL #1 ; r3 = x1[0] + x1[1] (-> x1[0]) + SUB r9, r9,r10 ; r9 = s1 = x1[2] - x1[3] + ADD r10,r9,r10, LSL #1 ; r10= x1[2] + x1[3] (-> x1[2]) + SUB r4, r4, r8 ; r4 = s2 = x2[0] - x2[1] + ADD r8, r4, r8, LSL #1 ; r8 = x2[0] + x2[1] (-> x1[1]) + SUB r14,r14,r11 ; r14= s3 = x2[3] - x2[2] + ADD 
r11,r14,r11,LSL #1 ; r11= x2[3] + x2[2] (-> x1[3]) STMIA r7,{r3,r8,r10,r11} - @ r0 = points - @ r1 = x2 - @ r2 = s0 - @ r3 free - @ r4 = s2 - @ r5 = T - @ r6 = step - @ r7 = x1 - @ r8 free - @ r9 = s1 - @ r10 free - @ r11 free - @ r12= limit - @ r14= s3 - - LDRB r8, [r5,#1] @ r8 = T[1] - LDRB r10,[r5],-r6 @ r10= T[0] T -= step + ; r0 = points + ; r1 = x2 + ; r2 = s0 + ; r3 free + ; r4 = s2 + ; r5 = T + ; r6 = step + ; r7 = x1 + ; r8 free + ; r9 = s1 + ; r10 free + ; r11 free + ; r12= limit + ; r14= s3 + + LDRB r8, [r5,#1] ; r8 = T[1] + LDRB r10,[r5],-r6 ; r10= T[0] T -= step MOV r2, r2, ASR #8 MOV r9, r9, ASR #8 MOV r4, r4, ASR #8 MOV r14,r14,ASR #8 - @ XNPROD31(s0, s1, T[0], T[1], &x2[0], &x2[2]) - @ x2[0] = (s0*T[0] - s1*T[1])>>31 x2[2] = (s1*T[0] + s0*T[1])>>31 - @ stall Xscale - MUL r11,r2, r8 @ r11 = s0*T[1] - MLA r11,r9, r10,r11 @ r11 += s1*T[0] + ; XNPROD31(s0, s1, T[0], T[1], &x2[0], &x2[2]) + ; x2[0] = (s0*T[0] - s1*T[1])>>31 x2[2] = (s1*T[0] + s0*T[1])>>31 + ; stall Xscale + MUL r11,r2, r8 ; r11 = s0*T[1] + MLA r11,r9, r10,r11 ; r11 += s1*T[0] RSB r9, r9, #0 - MUL r2, r10,r2 @ r2 = s0*T[0] - MLA r2, r9, r8, r2 @ r2 += -s1*T[1] = Value for x2[0] - MOV r9, r11 @ r9 = r11 = Value for x2[2] - - @ XNPROD31(s3, s2, T[0], T[1], &x2[1], &x2[3]) - @ x2[1] = (s3*T[0] - s2*T[1])>>31 x2[3] = (s2*T[0] + s3*T[1])>>31 - MUL r11,r4, r10 @ r11 = s2*T[0] - MLA r11,r14,r8, r11 @ r11 += s3*T[1] = Value for x2[3] + MUL r2, r10,r2 ; r2 = s0*T[0] + MLA r2, r9, r8, r2 ; r2 += -s1*T[1] = Value for x2[0] + MOV r9, r11 ; r9 = r11 = Value for x2[2] + + ; XNPROD31(s3, s2, T[0], T[1], &x2[1], &x2[3]) + ; x2[1] = (s3*T[0] - s2*T[1])>>31 x2[3] = (s2*T[0] + s3*T[1])>>31 + MUL r11,r4, r10 ; r11 = s2*T[0] + MLA r11,r14,r8, r11 ; r11 += s3*T[1] = Value for x2[3] RSB r4, r4, #0 - MUL r3, r14,r10 @ r3 = s3*T[0] - MLA r3, r4, r8, r3 @ r3 -= s2*T[1] = Value for x2[1] + MUL r3, r14,r10 ; r3 = s3*T[0] + MLA r3, r4, r8, r3 ; r3 -= s2*T[1] = Value for x2[1] CMP r5, r12 STMIA r1,{r2,r3,r9,r11} BGT 
mdct_bufferfly_generic_loop2 - LDR r2,[r13],#4 @ unstack r2 - ADD r1, r1, r0, LSL #2 @ r1 = x+POINTS*j - @ stall Xscale - SUBS r2, r2, #1 @ r2-- (j++) + LDR r2,[r13],#4 ; unstack r2 + ADD r1, r1, r0, LSL #2 ; r1 = x+POINTS*j + ; stall Xscale + SUBS r2, r2, #1 ; r2-- (j++) BGT mdct_butterflies_loop2 LDMFD r13!,{r4,r14} LDR r1,[r13,#4] - SUBS r4, r4, #1 @ stages-- - MOV r14,r14,LSL #1 @ r14= 4<<i (i++) - MOV r6, r6, LSL #1 @ r6 = step <<= 1 (i++) + SUBS r4, r4, #1 ; stages-- + MOV r14,r14,LSL #1 ; r14= 4<<i (i++) + MOV r6, r6, LSL #1 ; r6 = step <<= 1 (i++) BGE mdct_butterflies_loop1 LDMFD r13,{r0-r1} -no_generics: - @ mdct_butterflies part2 (loop around mdct_bufferfly_32) - @ r0 = points - @ r1 = in - @ r2 = step - @ r3 = shift - -mdct_bufferflies_loop3: - @ mdct_bufferfly_32 - - @ block1 - ADD r4, r1, #16*4 @ r4 = &in[16] - LDMIA r4,{r5,r6,r9,r10} @ r5 = x[16] - @ r6 = x[17] - @ r9 = x[18] - @ r10= x[19] - LDMIA r1,{r7,r8,r11,r12} @ r7 = x[0] - @ r8 = x[1] - @ r11= x[2] - @ r12= x[3] - SUB r5, r5, r6 @ r5 = s0 = x[16] - x[17] - ADD r6, r5, r6, LSL #1 @ r6 = x[16] + x[17] -> x[16] - SUB r9, r9, r10 @ r9 = s1 = x[18] - x[19] - ADD r10,r9, r10,LSL #1 @ r10= x[18] + x[19] -> x[18] - SUB r8, r8, r7 @ r8 = s2 = x[ 1] - x[ 0] - ADD r7, r8, r7, LSL #1 @ r7 = x[ 1] + x[ 0] -> x[17] - SUB r12,r12,r11 @ r12= s3 = x[ 3] - x[ 2] - ADD r11,r12,r11, LSL #1 @ r11= x[ 3] + x[ 2] -> x[19] +no_generics + ; mdct_butterflies part2 (loop around mdct_bufferfly_32) + ; r0 = points + ; r1 = in + ; r2 = step + ; r3 = shift + +mdct_bufferflies_loop3 + ; mdct_bufferfly_32 + + ; block1 + ADD r4, r1, #16*4 ; r4 = &in[16] + LDMIA r4,{r5,r6,r9,r10} ; r5 = x[16] + ; r6 = x[17] + ; r9 = x[18] + ; r10= x[19] + LDMIA r1,{r7,r8,r11,r12} ; r7 = x[0] + ; r8 = x[1] + ; r11= x[2] + ; r12= x[3] + SUB r5, r5, r6 ; r5 = s0 = x[16] - x[17] + ADD r6, r5, r6, LSL #1 ; r6 = x[16] + x[17] -> x[16] + SUB r9, r9, r10 ; r9 = s1 = x[18] - x[19] + ADD r10,r9, r10,LSL #1 ; r10= x[18] + x[19] -> x[18] + SUB r8, r8, r7 ; 
r8 = s2 = x[ 1] - x[ 0] + ADD r7, r8, r7, LSL #1 ; r7 = x[ 1] + x[ 0] -> x[17] + SUB r12,r12,r11 ; r12= s3 = x[ 3] - x[ 2] + ADD r11,r12,r11, LSL #1 ; r11= x[ 3] + x[ 2] -> x[19] STMIA r4!,{r6,r7,r10,r11} - MOV r6,#0xed @ r6 =cPI1_8 - MOV r7,#0x62 @ r7 =cPI3_8 + MOV r6,#0xed ; r6 =cPI1_8 + MOV r7,#0x62 ; r7 =cPI3_8 MOV r5, r5, ASR #8 MOV r9, r9, ASR #8 MOV r8, r8, ASR #8 MOV r12,r12,ASR #8 - @ XNPROD31( s0, s1, cPI3_8, cPI1_8, &x[ 0], &x[ 2] ) - @ x[0] = s0*cPI3_8 - s1*cPI1_8 x[2] = s1*cPI3_8 + s0*cPI1_8 - @ stall Xscale - MUL r11,r5, r6 @ r11 = s0*cPI1_8 - MLA r11,r9, r7, r11 @ r11 += s1*cPI3_8 + ; XNPROD31( s0, s1, cPI3_8, cPI1_8, &x[ 0], &x[ 2] ) + ; x[0] = s0*cPI3_8 - s1*cPI1_8 x[2] = s1*cPI3_8 + s0*cPI1_8 + ; stall Xscale + MUL r11,r5, r6 ; r11 = s0*cPI1_8 + MLA r11,r9, r7, r11 ; r11 += s1*cPI3_8 RSB r9, r9, #0 - MUL r5, r7, r5 @ r5 = s0*cPI3_8 - MLA r5, r9, r6, r5 @ r5 -= s1*cPI1_8 + MUL r5, r7, r5 ; r5 = s0*cPI3_8 + MLA r5, r9, r6, r5 ; r5 -= s1*cPI1_8 - @ XPROD31 ( s2, s3, cPI1_8, cPI3_8, &x[ 1], &x[ 3] ) - @ x[1] = s2*cPI1_8 + s3*cPI3_8 x[3] = s3*cPI1_8 - s2*cPI3_8 - MUL r9, r8, r6 @ r9 = s2*cPI1_8 - MLA r9, r12,r7, r9 @ r9 += s3*cPI3_8 + ; XPROD31 ( s2, s3, cPI1_8, cPI3_8, &x[ 1], &x[ 3] ) + ; x[1] = s2*cPI1_8 + s3*cPI3_8 x[3] = s3*cPI1_8 - s2*cPI3_8 + MUL r9, r8, r6 ; r9 = s2*cPI1_8 + MLA r9, r12,r7, r9 ; r9 += s3*cPI3_8 RSB r8,r8,#0 - MUL r12,r6, r12 @ r12 = s3*cPI1_8 - MLA r12,r8, r7, r12 @ r12 -= s2*cPI3_8 + MUL r12,r6, r12 ; r12 = s3*cPI1_8 + MLA r12,r8, r7, r12 ; r12 -= s2*cPI3_8 STMIA r1!,{r5,r9,r11,r12} - @ block2 - LDMIA r4,{r5,r6,r9,r10} @ r5 = x[20] - @ r6 = x[21] - @ r9 = x[22] - @ r10= x[23] - LDMIA r1,{r7,r8,r11,r12} @ r7 = x[4] - @ r8 = x[5] - @ r11= x[6] - @ r12= x[7] - SUB r5, r5, r6 @ r5 = s0 = x[20] - x[21] - ADD r6, r5, r6, LSL #1 @ r6 = x[20] + x[21] -> x[20] - SUB r9, r9, r10 @ r9 = s1 = x[22] - x[23] - ADD r10,r9, r10,LSL #1 @ r10= x[22] + x[23] -> x[22] - SUB r8, r8, r7 @ r8 = s2 = x[ 5] - x[ 4] - ADD r7, r8, r7, LSL #1 @ r7 = x[ 
5] + x[ 4] -> x[21] - SUB r12,r12,r11 @ r12= s3 = x[ 7] - x[ 6] - ADD r11,r12,r11, LSL #1 @ r11= x[ 7] + x[ 6] -> x[23] - MOV r14,#0xb5 @ cPI2_8 + ; block2 + LDMIA r4,{r5,r6,r9,r10} ; r5 = x[20] + ; r6 = x[21] + ; r9 = x[22] + ; r10= x[23] + LDMIA r1,{r7,r8,r11,r12} ; r7 = x[4] + ; r8 = x[5] + ; r11= x[6] + ; r12= x[7] + SUB r5, r5, r6 ; r5 = s0 = x[20] - x[21] + ADD r6, r5, r6, LSL #1 ; r6 = x[20] + x[21] -> x[20] + SUB r9, r9, r10 ; r9 = s1 = x[22] - x[23] + ADD r10,r9, r10,LSL #1 ; r10= x[22] + x[23] -> x[22] + SUB r8, r8, r7 ; r8 = s2 = x[ 5] - x[ 4] + ADD r7, r8, r7, LSL #1 ; r7 = x[ 5] + x[ 4] -> x[21] + SUB r12,r12,r11 ; r12= s3 = x[ 7] - x[ 6] + ADD r11,r12,r11, LSL #1 ; r11= x[ 7] + x[ 6] -> x[23] + MOV r14,#0xb5 ; cPI2_8 STMIA r4!,{r6,r7,r10,r11} - SUB r5, r5, r9 @ r5 = s0 - s1 - ADD r9, r5, r9, LSL #1 @ r9 = s0 + s1 + SUB r5, r5, r9 ; r5 = s0 - s1 + ADD r9, r5, r9, LSL #1 ; r9 = s0 + s1 MOV r5, r5, ASR #8 - MUL r5, r14,r5 @ r5 = (s0-s1)*cPI2_8 - SUB r12,r12,r8 @ r12= s3 - s2 - ADD r8, r12,r8, LSL #1 @ r8 = s3 + s2 + MUL r5, r14,r5 ; r5 = (s0-s1)*cPI2_8 + SUB r12,r12,r8 ; r12= s3 - s2 + ADD r8, r12,r8, LSL #1 ; r8 = s3 + s2 MOV r8, r8, ASR #8 - MUL r8, r14,r8 @ r8 = (s3+s2)*cPI2_8 + MUL r8, r14,r8 ; r8 = (s3+s2)*cPI2_8 MOV r9, r9, ASR #8 - MUL r9, r14,r9 @ r9 = (s0+s1)*cPI2_8 + MUL r9, r14,r9 ; r9 = (s0+s1)*cPI2_8 MOV r12,r12,ASR #8 - MUL r12,r14,r12 @ r12 = (s3-s2)*cPI2_8 + MUL r12,r14,r12 ; r12 = (s3-s2)*cPI2_8 STMIA r1!,{r5,r8,r9,r12} - @ block3 - LDMIA r4,{r5,r6,r9,r10} @ r5 = x[24] - @ r6 = x[25] - @ r9 = x[25] - @ r10= x[26] - LDMIA r1,{r7,r8,r11,r12} @ r7 = x[8] - @ r8 = x[9] - @ r11= x[10] - @ r12= x[11] - SUB r5, r5, r6 @ r5 = s0 = x[24] - x[25] - ADD r6, r5, r6, LSL #1 @ r6 = x[24] + x[25] -> x[25] - SUB r9, r9, r10 @ r9 = s1 = x[26] - x[27] - ADD r10,r9, r10,LSL #1 @ r10= x[26] + x[27] -> x[26] - SUB r8, r8, r7 @ r8 = s2 = x[ 9] - x[ 8] - ADD r7, r8, r7, LSL #1 @ r7 = x[ 9] + x[ 8] -> x[25] - SUB r12,r12,r11 @ r12= s3 = x[11] - x[10] - ADD 
r11,r12,r11, LSL #1 @ r11= x[11] + x[10] -> x[27] + ; block3 + LDMIA r4,{r5,r6,r9,r10} ; r5 = x[24] + ; r6 = x[25] + ; r9 = x[25] + ; r10= x[26] + LDMIA r1,{r7,r8,r11,r12} ; r7 = x[8] + ; r8 = x[9] + ; r11= x[10] + ; r12= x[11] + SUB r5, r5, r6 ; r5 = s0 = x[24] - x[25] + ADD r6, r5, r6, LSL #1 ; r6 = x[24] + x[25] -> x[25] + SUB r9, r9, r10 ; r9 = s1 = x[26] - x[27] + ADD r10,r9, r10,LSL #1 ; r10= x[26] + x[27] -> x[26] + SUB r8, r8, r7 ; r8 = s2 = x[ 9] - x[ 8] + ADD r7, r8, r7, LSL #1 ; r7 = x[ 9] + x[ 8] -> x[25] + SUB r12,r12,r11 ; r12= s3 = x[11] - x[10] + ADD r11,r12,r11, LSL #1 ; r11= x[11] + x[10] -> x[27] STMIA r4!,{r6,r7,r10,r11} - MOV r6,#0x62 @ r6 = cPI3_8 - MOV r7,#0xED @ r7 = cPI1_8 + MOV r6,#0x62 ; r6 = cPI3_8 + MOV r7,#0xED ; r7 = cPI1_8 - @ XNPROD31( s0, s1, cPI1_8, cPI3_8, &x[ 8], &x[10] ) - @ x[8] = s0*cPI1_8 - s1*cPI3_8 x[10] = s1*cPI1_8 + s0*cPI3_8 - @ stall Xscale + ; XNPROD31( s0, s1, cPI1_8, cPI3_8, &x[ 8], &x[10] ) + ; x[8] = s0*cPI1_8 - s1*cPI3_8 x[10] = s1*cPI1_8 + s0*cPI3_8 + ; stall Xscale MOV r5, r5, ASR #8 - MUL r11,r5, r6 @ r11 = s0*cPI3_8 + MUL r11,r5, r6 ; r11 = s0*cPI3_8 MOV r9, r9, ASR #8 - MLA r11,r9, r7, r11 @ r11 += s1*cPI1_8 + MLA r11,r9, r7, r11 ; r11 += s1*cPI1_8 RSB r9, r9, #0 - MUL r5, r7, r5 @ r5 = s0*cPI1_8 - MLA r5, r9, r6, r5 @ r5 -= s1*cPI3_8 + MUL r5, r7, r5 ; r5 = s0*cPI1_8 + MLA r5, r9, r6, r5 ; r5 -= s1*cPI3_8 - @ XPROD31 ( s2, s3, cPI3_8, cPI1_8, &x[ 9], &x[11] ) - @ x[9] = s2*cPI3_8 + s3*cPI1_8 x[11] = s3*cPI3_8 - s2*cPI1_8 + ; XPROD31 ( s2, s3, cPI3_8, cPI1_8, &x[ 9], &x[11] ) + ; x[9] = s2*cPI3_8 + s3*cPI1_8 x[11] = s3*cPI3_8 - s2*cPI1_8 MOV r8, r8, ASR #8 - MUL r9, r8, r6 @ r9 = s2*cPI3_8 + MUL r9, r8, r6 ; r9 = s2*cPI3_8 MOV r12,r12,ASR #8 - MLA r9, r12,r7, r9 @ r9 += s3*cPI1_8 + MLA r9, r12,r7, r9 ; r9 += s3*cPI1_8 RSB r8,r8,#0 - MUL r12,r6, r12 @ r12 = s3*cPI3_8 - MLA r12,r8, r7, r12 @ r12 -= s2*cPI1_8 + MUL r12,r6, r12 ; r12 = s3*cPI3_8 + MLA r12,r8, r7, r12 ; r12 -= s2*cPI1_8 STMIA r1!,{r5,r9,r11,r12} 
- @ block4 - LDMIA r4,{r5,r6,r10,r11} @ r5 = x[28] - @ r6 = x[29] - @ r10= x[30] - @ r11= x[31] - LDMIA r1,{r8,r9,r12,r14} @ r8 = x[12] - @ r9 = x[13] - @ r12= x[14] - @ r14= x[15] - SUB r5, r5, r6 @ r5 = s0 = x[28] - x[29] - ADD r6, r5, r6, LSL #1 @ r6 = x[28] + x[29] -> x[28] - SUB r7, r14,r12 @ r7 = s3 = x[15] - x[14] - ADD r12,r7, r12, LSL #1 @ r12= x[15] + x[14] -> x[31] - SUB r10,r10,r11 @ r10= s1 = x[30] - x[31] - ADD r11,r10,r11,LSL #1 @ r11= x[30] + x[31] -> x[30] - SUB r14, r8, r9 @ r14= s2 = x[12] - x[13] - ADD r9, r14, r9, LSL #1 @ r9 = x[12] + x[13] -> x[29] + ; block4 + LDMIA r4,{r5,r6,r10,r11} ; r5 = x[28] + ; r6 = x[29] + ; r10= x[30] + ; r11= x[31] + LDMIA r1,{r8,r9,r12,r14} ; r8 = x[12] + ; r9 = x[13] + ; r12= x[14] + ; r14= x[15] + SUB r5, r5, r6 ; r5 = s0 = x[28] - x[29] + ADD r6, r5, r6, LSL #1 ; r6 = x[28] + x[29] -> x[28] + SUB r7, r14,r12 ; r7 = s3 = x[15] - x[14] + ADD r12,r7, r12, LSL #1 ; r12= x[15] + x[14] -> x[31] + SUB r10,r10,r11 ; r10= s1 = x[30] - x[31] + ADD r11,r10,r11,LSL #1 ; r11= x[30] + x[31] -> x[30] + SUB r14, r8, r9 ; r14= s2 = x[12] - x[13] + ADD r9, r14, r9, LSL #1 ; r9 = x[12] + x[13] -> x[29] STMIA r4!,{r6,r9,r11,r12} STMIA r1!,{r5,r7,r10,r14} - @ mdct_butterfly16 (1st version) - @ block 1 + ; mdct_butterfly16 (1st version) + ; block 1 SUB r1,r1,#16*4 ADD r4,r1,#8*4 - LDMIA r4,{r5,r6,r9,r10} @ r5 = x[ 8] - @ r6 = x[ 9] - @ r9 = x[10] - @ r10= x[11] - LDMIA r1,{r7,r8,r11,r12} @ r7 = x[0] - @ r8 = x[1] - @ r11= x[2] - @ r12= x[3] - SUB r5, r5, r6 @ r5 = s0 = x[ 8] - x[ 9] - ADD r6, r5, r6, LSL #1 @ r6 = x[ 8] + x[ 9] -> x[ 8] - SUB r9, r9, r10 @ r9 = s1 = x[10] - x[11] - ADD r10,r9, r10,LSL #1 @ r10= x[10] + x[11] -> x[10] - SUB r8, r8, r7 @ r8 = s2 = x[ 1] - x[ 0] - ADD r7, r8, r7, LSL #1 @ r7 = x[ 1] + x[ 0] -> x[ 9] - SUB r12,r12,r11 @ r12= s3 = x[ 3] - x[ 2] - ADD r11,r12,r11, LSL #1 @ r11= x[ 3] + x[ 2] -> x[11] - MOV r14,#0xB5 @ r14= cPI2_8 + LDMIA r4,{r5,r6,r9,r10} ; r5 = x[ 8] + ; r6 = x[ 9] + ; r9 = x[10] + ; 
r10= x[11] + LDMIA r1,{r7,r8,r11,r12} ; r7 = x[0] + ; r8 = x[1] + ; r11= x[2] + ; r12= x[3] + SUB r5, r5, r6 ; r5 = s0 = x[ 8] - x[ 9] + ADD r6, r5, r6, LSL #1 ; r6 = x[ 8] + x[ 9] -> x[ 8] + SUB r9, r9, r10 ; r9 = s1 = x[10] - x[11] + ADD r10,r9, r10,LSL #1 ; r10= x[10] + x[11] -> x[10] + SUB r8, r8, r7 ; r8 = s2 = x[ 1] - x[ 0] + ADD r7, r8, r7, LSL #1 ; r7 = x[ 1] + x[ 0] -> x[ 9] + SUB r12,r12,r11 ; r12= s3 = x[ 3] - x[ 2] + ADD r11,r12,r11, LSL #1 ; r11= x[ 3] + x[ 2] -> x[11] + MOV r14,#0xB5 ; r14= cPI2_8 STMIA r4!,{r6,r7,r10,r11} - SUB r5, r5, r9 @ r5 = s0 - s1 - ADD r9, r5, r9, LSL #1 @ r9 = s0 + s1 + SUB r5, r5, r9 ; r5 = s0 - s1 + ADD r9, r5, r9, LSL #1 ; r9 = s0 + s1 MOV r5, r5, ASR #8 - MUL r5, r14,r5 @ r5 = (s0-s1)*cPI2_8 - SUB r12,r12,r8 @ r12= s3 - s2 - ADD r8, r12,r8, LSL #1 @ r8 = s3 + s2 + MUL r5, r14,r5 ; r5 = (s0-s1)*cPI2_8 + SUB r12,r12,r8 ; r12= s3 - s2 + ADD r8, r12,r8, LSL #1 ; r8 = s3 + s2 MOV r8, r8, ASR #8 - MUL r8, r14,r8 @ r8 = (s3+s2)*cPI2_8 + MUL r8, r14,r8 ; r8 = (s3+s2)*cPI2_8 MOV r9, r9, ASR #8 - MUL r9, r14,r9 @ r9 = (s0+s1)*cPI2_8 + MUL r9, r14,r9 ; r9 = (s0+s1)*cPI2_8 MOV r12,r12,ASR #8 - MUL r12,r14,r12 @ r12 = (s3-s2)*cPI2_8 + MUL r12,r14,r12 ; r12 = (s3-s2)*cPI2_8 STMIA r1!,{r5,r8,r9,r12} - @ block2 - LDMIA r4,{r5,r6,r9,r10} @ r5 = x[12] - @ r6 = x[13] - @ r9 = x[14] - @ r10= x[15] - LDMIA r1,{r7,r8,r11,r12} @ r7 = x[ 4] - @ r8 = x[ 5] - @ r11= x[ 6] - @ r12= x[ 7] - SUB r14,r7, r8 @ r14= s0 = x[ 4] - x[ 5] - ADD r8, r14,r8, LSL #1 @ r8 = x[ 4] + x[ 5] -> x[13] - SUB r7, r12,r11 @ r7 = s1 = x[ 7] - x[ 6] - ADD r11,r7, r11, LSL #1 @ r11= x[ 7] + x[ 6] -> x[15] - SUB r5, r5, r6 @ r5 = s2 = x[12] - x[13] - ADD r6, r5, r6, LSL #1 @ r6 = x[12] + x[13] -> x[12] - SUB r12,r9, r10 @ r12= s3 = x[14] - x[15] - ADD r10,r12,r10,LSL #1 @ r10= x[14] + x[15] -> x[14] + ; block2 + LDMIA r4,{r5,r6,r9,r10} ; r5 = x[12] + ; r6 = x[13] + ; r9 = x[14] + ; r10= x[15] + LDMIA r1,{r7,r8,r11,r12} ; r7 = x[ 4] + ; r8 = x[ 5] + ; r11= x[ 6] + ; r12= x[ 
7] + SUB r14,r7, r8 ; r14= s0 = x[ 4] - x[ 5] + ADD r8, r14,r8, LSL #1 ; r8 = x[ 4] + x[ 5] -> x[13] + SUB r7, r12,r11 ; r7 = s1 = x[ 7] - x[ 6] + ADD r11,r7, r11, LSL #1 ; r11= x[ 7] + x[ 6] -> x[15] + SUB r5, r5, r6 ; r5 = s2 = x[12] - x[13] + ADD r6, r5, r6, LSL #1 ; r6 = x[12] + x[13] -> x[12] + SUB r12,r9, r10 ; r12= s3 = x[14] - x[15] + ADD r10,r12,r10,LSL #1 ; r10= x[14] + x[15] -> x[14] STMIA r4!,{r6,r8,r10,r11} STMIA r1!,{r5,r7,r12,r14} - @ mdct_butterfly_8 + ; mdct_butterfly_8 LDMDB r1,{r6,r7,r8,r9,r10,r11,r12,r14} - @ r6 = x[0] - @ r7 = x[1] - @ r8 = x[2] - @ r9 = x[3] - @ r10= x[4] - @ r11= x[5] - @ r12= x[6] - @ r14= x[7] - ADD r6, r6, r7 @ r6 = s0 = x[0] + x[1] - SUB r7, r6, r7, LSL #1 @ r7 = s1 = x[0] - x[1] - ADD r8, r8, r9 @ r8 = s2 = x[2] + x[3] - SUB r9, r8, r9, LSL #1 @ r9 = s3 = x[2] - x[3] - ADD r10,r10,r11 @ r10= s4 = x[4] + x[5] - SUB r11,r10,r11,LSL #1 @ r11= s5 = x[4] - x[5] - ADD r12,r12,r14 @ r12= s6 = x[6] + x[7] - SUB r14,r12,r14,LSL #1 @ r14= s7 = x[6] - x[7] - - ADD r2, r11,r9 @ r2 = x[0] = s5 + s3 - SUB r4, r2, r9, LSL #1 @ r4 = x[2] = s5 - s3 - SUB r3, r14,r7 @ r3 = x[1] = s7 - s1 - ADD r5, r3, r7, LSL #1 @ r5 = x[3] = s7 + s1 - SUB r10,r10,r6 @ r10= x[4] = s4 - s0 - SUB r11,r12,r8 @ r11= x[5] = s6 - s2 - ADD r12,r10,r6, LSL #1 @ r12= x[6] = s4 + s0 - ADD r14,r11,r8, LSL #1 @ r14= x[7] = s6 + s2 + ; r6 = x[0] + ; r7 = x[1] + ; r8 = x[2] + ; r9 = x[3] + ; r10= x[4] + ; r11= x[5] + ; r12= x[6] + ; r14= x[7] + ADD r6, r6, r7 ; r6 = s0 = x[0] + x[1] + SUB r7, r6, r7, LSL #1 ; r7 = s1 = x[0] - x[1] + ADD r8, r8, r9 ; r8 = s2 = x[2] + x[3] + SUB r9, r8, r9, LSL #1 ; r9 = s3 = x[2] - x[3] + ADD r10,r10,r11 ; r10= s4 = x[4] + x[5] + SUB r11,r10,r11,LSL #1 ; r11= s5 = x[4] - x[5] + ADD r12,r12,r14 ; r12= s6 = x[6] + x[7] + SUB r14,r12,r14,LSL #1 ; r14= s7 = x[6] - x[7] + + ADD r2, r11,r9 ; r2 = x[0] = s5 + s3 + SUB r4, r2, r9, LSL #1 ; r4 = x[2] = s5 - s3 + SUB r3, r14,r7 ; r3 = x[1] = s7 - s1 + ADD r5, r3, r7, LSL #1 ; r5 = x[3] = s7 + s1 
+ SUB r10,r10,r6 ; r10= x[4] = s4 - s0 + SUB r11,r12,r8 ; r11= x[5] = s6 - s2 + ADD r12,r10,r6, LSL #1 ; r12= x[6] = s4 + s0 + ADD r14,r11,r8, LSL #1 ; r14= x[7] = s6 + s2 STMDB r1,{r2,r3,r4,r5,r10,r11,r12,r14} - @ mdct_butterfly_8 + ; mdct_butterfly_8 LDMIA r1,{r6,r7,r8,r9,r10,r11,r12,r14} - @ r6 = x[0] - @ r7 = x[1] - @ r8 = x[2] - @ r9 = x[3] - @ r10= x[4] - @ r11= x[5] - @ r12= x[6] - @ r14= x[7] - ADD r6, r6, r7 @ r6 = s0 = x[0] + x[1] - SUB r7, r6, r7, LSL #1 @ r7 = s1 = x[0] - x[1] - ADD r8, r8, r9 @ r8 = s2 = x[2] + x[3] - SUB r9, r8, r9, LSL #1 @ r9 = s3 = x[2] - x[3] - ADD r10,r10,r11 @ r10= s4 = x[4] + x[5] - SUB r11,r10,r11,LSL #1 @ r11= s5 = x[4] - x[5] - ADD r12,r12,r14 @ r12= s6 = x[6] + x[7] - SUB r14,r12,r14,LSL #1 @ r14= s7 = x[6] - x[7] - - ADD r2, r11,r9 @ r2 = x[0] = s5 + s3 - SUB r4, r2, r9, LSL #1 @ r4 = x[2] = s5 - s3 - SUB r3, r14,r7 @ r3 = x[1] = s7 - s1 - ADD r5, r3, r7, LSL #1 @ r5 = x[3] = s7 + s1 - SUB r10,r10,r6 @ r10= x[4] = s4 - s0 - SUB r11,r12,r8 @ r11= x[5] = s6 - s2 - ADD r12,r10,r6, LSL #1 @ r12= x[6] = s4 + s0 - ADD r14,r11,r8, LSL #1 @ r14= x[7] = s6 + s2 + ; r6 = x[0] + ; r7 = x[1] + ; r8 = x[2] + ; r9 = x[3] + ; r10= x[4] + ; r11= x[5] + ; r12= x[6] + ; r14= x[7] + ADD r6, r6, r7 ; r6 = s0 = x[0] + x[1] + SUB r7, r6, r7, LSL #1 ; r7 = s1 = x[0] - x[1] + ADD r8, r8, r9 ; r8 = s2 = x[2] + x[3] + SUB r9, r8, r9, LSL #1 ; r9 = s3 = x[2] - x[3] + ADD r10,r10,r11 ; r10= s4 = x[4] + x[5] + SUB r11,r10,r11,LSL #1 ; r11= s5 = x[4] - x[5] + ADD r12,r12,r14 ; r12= s6 = x[6] + x[7] + SUB r14,r12,r14,LSL #1 ; r14= s7 = x[6] - x[7] + + ADD r2, r11,r9 ; r2 = x[0] = s5 + s3 + SUB r4, r2, r9, LSL #1 ; r4 = x[2] = s5 - s3 + SUB r3, r14,r7 ; r3 = x[1] = s7 - s1 + ADD r5, r3, r7, LSL #1 ; r5 = x[3] = s7 + s1 + SUB r10,r10,r6 ; r10= x[4] = s4 - s0 + SUB r11,r12,r8 ; r11= x[5] = s6 - s2 + ADD r12,r10,r6, LSL #1 ; r12= x[6] = s4 + s0 + ADD r14,r11,r8, LSL #1 ; r14= x[7] = s6 + s2 STMIA r1,{r2,r3,r4,r5,r10,r11,r12,r14} - @ mdct_butterfly16 (2nd 
version) - @ block 1 + ; mdct_butterfly16 (2nd version) + ; block 1 ADD r1,r1,#16*4-8*4 ADD r4,r1,#8*4 - LDMIA r4,{r5,r6,r9,r10} @ r5 = x[ 8] - @ r6 = x[ 9] - @ r9 = x[10] - @ r10= x[11] - LDMIA r1,{r7,r8,r11,r12} @ r7 = x[0] - @ r8 = x[1] - @ r11= x[2] - @ r12= x[3] - SUB r5, r5, r6 @ r5 = s0 = x[ 8] - x[ 9] - ADD r6, r5, r6, LSL #1 @ r6 = x[ 8] + x[ 9] -> x[ 8] - SUB r9, r9, r10 @ r9 = s1 = x[10] - x[11] - ADD r10,r9, r10,LSL #1 @ r10= x[10] + x[11] -> x[10] - SUB r8, r8, r7 @ r8 = s2 = x[ 1] - x[ 0] - ADD r7, r8, r7, LSL #1 @ r7 = x[ 1] + x[ 0] -> x[ 9] - SUB r12,r12,r11 @ r12= s3 = x[ 3] - x[ 2] - ADD r11,r12,r11, LSL #1 @ r11= x[ 3] + x[ 2] -> x[11] - MOV r14,#0xb5 @ r14= cPI2_8 + LDMIA r4,{r5,r6,r9,r10} ; r5 = x[ 8] + ; r6 = x[ 9] + ; r9 = x[10] + ; r10= x[11] + LDMIA r1,{r7,r8,r11,r12} ; r7 = x[0] + ; r8 = x[1] + ; r11= x[2] + ; r12= x[3] + SUB r5, r5, r6 ; r5 = s0 = x[ 8] - x[ 9] + ADD r6, r5, r6, LSL #1 ; r6 = x[ 8] + x[ 9] -> x[ 8] + SUB r9, r9, r10 ; r9 = s1 = x[10] - x[11] + ADD r10,r9, r10,LSL #1 ; r10= x[10] + x[11] -> x[10] + SUB r8, r8, r7 ; r8 = s2 = x[ 1] - x[ 0] + ADD r7, r8, r7, LSL #1 ; r7 = x[ 1] + x[ 0] -> x[ 9] + SUB r12,r12,r11 ; r12= s3 = x[ 3] - x[ 2] + ADD r11,r12,r11, LSL #1 ; r11= x[ 3] + x[ 2] -> x[11] + MOV r14,#0xb5 ; r14= cPI2_8 STMIA r4!,{r6,r7,r10,r11} - SUB r5, r5, r9 @ r5 = s0 - s1 - ADD r9, r5, r9, LSL #1 @ r9 = s0 + s1 + SUB r5, r5, r9 ; r5 = s0 - s1 + ADD r9, r5, r9, LSL #1 ; r9 = s0 + s1 MOV r5, r5, ASR #8 - MUL r5, r14,r5 @ r5 = (s0-s1)*cPI2_8 - SUB r12,r12,r8 @ r12= s3 - s2 - ADD r8, r12,r8, LSL #1 @ r8 = s3 + s2 + MUL r5, r14,r5 ; r5 = (s0-s1)*cPI2_8 + SUB r12,r12,r8 ; r12= s3 - s2 + ADD r8, r12,r8, LSL #1 ; r8 = s3 + s2 MOV r8, r8, ASR #8 - MUL r8, r14,r8 @ r8 = (s3+s2)*cPI2_8 + MUL r8, r14,r8 ; r8 = (s3+s2)*cPI2_8 MOV r9, r9, ASR #8 - MUL r9, r14,r9 @ r9 = (s0+s1)*cPI2_8 + MUL r9, r14,r9 ; r9 = (s0+s1)*cPI2_8 MOV r12,r12,ASR #8 - MUL r12,r14,r12 @ r12 = (s3-s2)*cPI2_8 + MUL r12,r14,r12 ; r12 = (s3-s2)*cPI2_8 STMIA 
r1!,{r5,r8,r9,r12} - @ block2 - LDMIA r4,{r5,r6,r9,r10} @ r5 = x[12] - @ r6 = x[13] - @ r9 = x[14] - @ r10= x[15] - LDMIA r1,{r7,r8,r11,r12} @ r7 = x[ 4] - @ r8 = x[ 5] - @ r11= x[ 6] - @ r12= x[ 7] - SUB r5, r5, r6 @ r5 = s2 = x[12] - x[13] - ADD r6, r5, r6, LSL #1 @ r6 = x[12] + x[13] -> x[12] - SUB r9, r9, r10 @ r9 = s3 = x[14] - x[15] - ADD r10,r9, r10,LSL #1 @ r10= x[14] + x[15] -> x[14] - SUB r14,r7, r8 @ r14= s0 = x[ 4] - x[ 5] - ADD r8, r14,r8, LSL #1 @ r8 = x[ 4] + x[ 5] -> x[13] - SUB r7, r12,r11 @ r7 = s1 = x[ 7] - x[ 6] - ADD r11,r7, r11, LSL #1 @ r11= x[ 7] + x[ 6] -> x[15] + ; block2 + LDMIA r4,{r5,r6,r9,r10} ; r5 = x[12] + ; r6 = x[13] + ; r9 = x[14] + ; r10= x[15] + LDMIA r1,{r7,r8,r11,r12} ; r7 = x[ 4] + ; r8 = x[ 5] + ; r11= x[ 6] + ; r12= x[ 7] + SUB r5, r5, r6 ; r5 = s2 = x[12] - x[13] + ADD r6, r5, r6, LSL #1 ; r6 = x[12] + x[13] -> x[12] + SUB r9, r9, r10 ; r9 = s3 = x[14] - x[15] + ADD r10,r9, r10,LSL #1 ; r10= x[14] + x[15] -> x[14] + SUB r14,r7, r8 ; r14= s0 = x[ 4] - x[ 5] + ADD r8, r14,r8, LSL #1 ; r8 = x[ 4] + x[ 5] -> x[13] + SUB r7, r12,r11 ; r7 = s1 = x[ 7] - x[ 6] + ADD r11,r7, r11, LSL #1 ; r11= x[ 7] + x[ 6] -> x[15] STMIA r4!,{r6,r8,r10,r11} STMIA r1!,{r5,r7,r9,r14} - @ mdct_butterfly_8 + ; mdct_butterfly_8 LDMDB r1,{r6,r7,r8,r9,r10,r11,r12,r14} - @ r6 = x[0] - @ r7 = x[1] - @ r8 = x[2] - @ r9 = x[3] - @ r10= x[4] - @ r11= x[5] - @ r12= x[6] - @ r14= x[7] - ADD r6, r6, r7 @ r6 = s0 = x[0] + x[1] - SUB r7, r6, r7, LSL #1 @ r7 = s1 = x[0] - x[1] - ADD r8, r8, r9 @ r8 = s2 = x[2] + x[3] - SUB r9, r8, r9, LSL #1 @ r9 = s3 = x[2] - x[3] - ADD r10,r10,r11 @ r10= s4 = x[4] + x[5] - SUB r11,r10,r11,LSL #1 @ r11= s5 = x[4] - x[5] - ADD r12,r12,r14 @ r12= s6 = x[6] + x[7] - SUB r14,r12,r14,LSL #1 @ r14= s7 = x[6] - x[7] - - ADD r2, r11,r9 @ r2 = x[0] = s5 + s3 - SUB r4, r2, r9, LSL #1 @ r4 = x[2] = s5 - s3 - SUB r3, r14,r7 @ r3 = x[1] = s7 - s1 - ADD r5, r3, r7, LSL #1 @ r5 = x[3] = s7 + s1 - SUB r10,r10,r6 @ r10= x[4] = s4 - s0 - SUB 
r11,r12,r8 @ r11= x[5] = s6 - s2 - ADD r12,r10,r6, LSL #1 @ r12= x[6] = s4 + s0 - ADD r14,r11,r8, LSL #1 @ r14= x[7] = s6 + s2 + ; r6 = x[0] + ; r7 = x[1] + ; r8 = x[2] + ; r9 = x[3] + ; r10= x[4] + ; r11= x[5] + ; r12= x[6] + ; r14= x[7] + ADD r6, r6, r7 ; r6 = s0 = x[0] + x[1] + SUB r7, r6, r7, LSL #1 ; r7 = s1 = x[0] - x[1] + ADD r8, r8, r9 ; r8 = s2 = x[2] + x[3] + SUB r9, r8, r9, LSL #1 ; r9 = s3 = x[2] - x[3] + ADD r10,r10,r11 ; r10= s4 = x[4] + x[5] + SUB r11,r10,r11,LSL #1 ; r11= s5 = x[4] - x[5] + ADD r12,r12,r14 ; r12= s6 = x[6] + x[7] + SUB r14,r12,r14,LSL #1 ; r14= s7 = x[6] - x[7] + + ADD r2, r11,r9 ; r2 = x[0] = s5 + s3 + SUB r4, r2, r9, LSL #1 ; r4 = x[2] = s5 - s3 + SUB r3, r14,r7 ; r3 = x[1] = s7 - s1 + ADD r5, r3, r7, LSL #1 ; r5 = x[3] = s7 + s1 + SUB r10,r10,r6 ; r10= x[4] = s4 - s0 + SUB r11,r12,r8 ; r11= x[5] = s6 - s2 + ADD r12,r10,r6, LSL #1 ; r12= x[6] = s4 + s0 + ADD r14,r11,r8, LSL #1 ; r14= x[7] = s6 + s2 STMDB r1,{r2,r3,r4,r5,r10,r11,r12,r14} - @ mdct_butterfly_8 + ; mdct_butterfly_8 LDMIA r1,{r6,r7,r8,r9,r10,r11,r12,r14} - @ r6 = x[0] - @ r7 = x[1] - @ r8 = x[2] - @ r9 = x[3] - @ r10= x[4] - @ r11= x[5] - @ r12= x[6] - @ r14= x[7] - ADD r6, r6, r7 @ r6 = s0 = x[0] + x[1] - SUB r7, r6, r7, LSL #1 @ r7 = s1 = x[0] - x[1] - ADD r8, r8, r9 @ r8 = s2 = x[2] + x[3] - SUB r9, r8, r9, LSL #1 @ r9 = s3 = x[2] - x[3] - ADD r10,r10,r11 @ r10= s4 = x[4] + x[5] - SUB r11,r10,r11,LSL #1 @ r11= s5 = x[4] - x[5] - ADD r12,r12,r14 @ r12= s6 = x[6] + x[7] - SUB r14,r12,r14,LSL #1 @ r14= s7 = x[6] - x[7] - - ADD r2, r11,r9 @ r2 = x[0] = s5 + s3 - SUB r4, r2, r9, LSL #1 @ r4 = x[2] = s5 - s3 - SUB r3, r14,r7 @ r3 = x[1] = s7 - s1 - ADD r5, r3, r7, LSL #1 @ r5 = x[3] = s7 + s1 - SUB r10,r10,r6 @ r10= x[4] = s4 - s0 - SUB r11,r12,r8 @ r11= x[5] = s6 - s2 - ADD r12,r10,r6, LSL #1 @ r12= x[6] = s4 + s0 - ADD r14,r11,r8, LSL #1 @ r14= x[7] = s6 + s2 + ; r6 = x[0] + ; r7 = x[1] + ; r8 = x[2] + ; r9 = x[3] + ; r10= x[4] + ; r11= x[5] + ; r12= x[6] + ; r14= x[7] 
+ ADD r6, r6, r7 ; r6 = s0 = x[0] + x[1] + SUB r7, r6, r7, LSL #1 ; r7 = s1 = x[0] - x[1] + ADD r8, r8, r9 ; r8 = s2 = x[2] + x[3] + SUB r9, r8, r9, LSL #1 ; r9 = s3 = x[2] - x[3] + ADD r10,r10,r11 ; r10= s4 = x[4] + x[5] + SUB r11,r10,r11,LSL #1 ; r11= s5 = x[4] - x[5] + ADD r12,r12,r14 ; r12= s6 = x[6] + x[7] + SUB r14,r12,r14,LSL #1 ; r14= s7 = x[6] - x[7] + + ADD r2, r11,r9 ; r2 = x[0] = s5 + s3 + SUB r4, r2, r9, LSL #1 ; r4 = x[2] = s5 - s3 + SUB r3, r14,r7 ; r3 = x[1] = s7 - s1 + ADD r5, r3, r7, LSL #1 ; r5 = x[3] = s7 + s1 + SUB r10,r10,r6 ; r10= x[4] = s4 - s0 + SUB r11,r12,r8 ; r11= x[5] = s6 - s2 + ADD r12,r10,r6, LSL #1 ; r12= x[6] = s4 + s0 + ADD r14,r11,r8, LSL #1 ; r14= x[7] = s6 + s2 STMIA r1,{r2,r3,r4,r5,r10,r11,r12,r14} ADD r1,r1,#8*4 @@ -947,117 +947,117 @@ mdct_bufferflies_loop3: LDMFD r13,{r0-r3} -mdct_bitreverse_arm_low: - @ r0 = points - @ r1 = in - @ r2 = step - @ r3 = shift +mdct_bitreverse_arm_low + ; r0 = points + ; r1 = in + ; r2 = step + ; r3 = shift - MOV r4, #0 @ r4 = bit = 0 - ADD r5, r1, r0, LSL #1 @ r5 = w = x + (n>>1) + MOV r4, #0 ; r4 = bit = 0 + ADD r5, r1, r0, LSL #1 ; r5 = w = x + (n>>1) ADR r6, bitrev SUB r5, r5, #8 -brev_lp: +brev_lp LDRB r7, [r6, r4, LSR #6] AND r8, r4, #0x3f LDRB r8, [r6, r8] - ADD r4, r4, #1 @ bit++ - @ stall XScale - ORR r7, r7, r8, LSL #6 @ r7 = bitrev[bit] + ADD r4, r4, #1 ; bit++ + ; stall XScale + ORR r7, r7, r8, LSL #6 ; r7 = bitrev[bit] MOV r7, r7, LSR r3 - ADD r9, r1, r7, LSL #2 @ r9 = xx = x + (b>>shift) - CMP r5, r9 @ if (w > xx) - LDR r10,[r5],#-8 @ r10 = w[0] w -= 2 - LDRGT r11,[r5,#12] @ r11 = w[1] - LDRGT r12,[r9] @ r12 = xx[0] - LDRGT r14,[r9,#4] @ r14 = xx[1] - STRGT r10,[r9] @ xx[0]= w[0] - STRGT r11,[r9,#4] @ xx[1]= w[1] - STRGT r12,[r5,#8] @ w[0] = xx[0] - STRGT r14,[r5,#12] @ w[1] = xx[1] + ADD r9, r1, r7, LSL #2 ; r9 = xx = x + (b>>shift) + CMP r5, r9 ; if (w > xx) + LDR r10,[r5],#-8 ; r10 = w[0] w -= 2 + LDRGT r11,[r5,#12] ; r11 = w[1] + LDRGT r12,[r9] ; r12 = xx[0] + LDRGT 
r14,[r9,#4] ; r14 = xx[1] + STRGT r10,[r9] ; xx[0]= w[0] + STRGT r11,[r9,#4] ; xx[1]= w[1] + STRGT r12,[r5,#8] ; w[0] = xx[0] + STRGT r14,[r5,#12] ; w[1] = xx[1] CMP r5,r1 BGT brev_lp - @ mdct_step7 - @ r0 = points - @ r1 = in - @ r2 = step - @ r3 = shift - - CMP r2, #4 @ r5 = T = (step>=4) ? - LDRGE r5, =sincos_lookup0 @ sincos_lookup0 + - LDRLT r5, =sincos_lookup1 @ sincos_lookup0 + - ADD r7, r1, r0, LSL #1 @ r7 = w1 = x + (n>>1) - ADDGE r5, r5, r2, LSR #1 @ (step>>1) - ADD r8, r5, #1024 @ r8 = Ttop -step7_loop1: - LDR r6, [r1] @ r6 = w0[0] - LDR r9, [r1,#4] @ r9 = w0[1] - LDR r10,[r7,#-8]! @ r10= w1[0] w1 -= 2 - LDR r11,[r7,#4] @ r11= w1[1] - LDRB r14,[r5,#1] @ r14= T[1] - LDRB r12,[r5],r2 @ r12= T[0] T += step - - ADD r6, r6, r10 @ r6 = s0 = w0[0] + w1[0] - SUB r10,r6, r10,LSL #1 @ r10= s1b= w0[0] - w1[0] - SUB r11,r11,r9 @ r11= s1 = w1[1] - w0[1] - ADD r9, r11,r9, LSL #1 @ r9 = s0b= w1[1] + w0[1] + ; mdct_step7 + ; r0 = points + ; r1 = in + ; r2 = step + ; r3 = shift + + CMP r2, #4 ; r5 = T = (step>=4) ? + LDRGE r5, =sincos_lookup0 ; sincos_lookup0 + + LDRLT r5, =sincos_lookup1 ; sincos_lookup0 + + ADD r7, r1, r0, LSL #1 ; r7 = w1 = x + (n>>1) + ADDGE r5, r5, r2, LSR #1 ; (step>>1) + ADD r8, r5, #1024 ; r8 = Ttop +step7_loop1 + LDR r6, [r1] ; r6 = w0[0] + LDR r9, [r1,#4] ; r9 = w0[1] + LDR r10,[r7,#-8]! 
; r10= w1[0] w1 -= 2 + LDR r11,[r7,#4] ; r11= w1[1] + LDRB r14,[r5,#1] ; r14= T[1] + LDRB r12,[r5],r2 ; r12= T[0] T += step + + ADD r6, r6, r10 ; r6 = s0 = w0[0] + w1[0] + SUB r10,r6, r10,LSL #1 ; r10= s1b= w0[0] - w1[0] + SUB r11,r11,r9 ; r11= s1 = w1[1] - w0[1] + ADD r9, r11,r9, LSL #1 ; r9 = s0b= w1[1] + w0[1] MOV r6, r6, ASR #9 - MUL r3, r6, r14 @ r3 = s0*T[1] + MUL r3, r6, r14 ; r3 = s0*T[1] MOV r11,r11,ASR #9 - MUL r4, r11,r12 @ r4 += s1*T[0] = s2 + MUL r4, r11,r12 ; r4 += s1*T[0] = s2 ADD r3, r3, r4 - MUL r14,r11,r14 @ r14 = s1*T[1] - MUL r12,r6, r12 @ r12 += s0*T[0] = s3 + MUL r14,r11,r14 ; r14 = s1*T[1] + MUL r12,r6, r12 ; r12 += s0*T[0] = s3 SUB r14,r14,r12 - @ r9 = s0b<<1 - @ r10= s1b<<1 - ADD r9, r3, r9, ASR #1 @ r9 = s0b + s2 - SUB r3, r9, r3, LSL #1 @ r3 = s0b - s2 + ; r9 = s0b<<1 + ; r10= s1b<<1 + ADD r9, r3, r9, ASR #1 ; r9 = s0b + s2 + SUB r3, r9, r3, LSL #1 ; r3 = s0b - s2 - SUB r12,r14,r10,ASR #1 @ r12= s3 - s1b - ADD r10,r14,r10,ASR #1 @ r10= s3 + s1b + SUB r12,r14,r10,ASR #1 ; r12= s3 - s1b + ADD r10,r14,r10,ASR #1 ; r10= s3 + s1b STR r9, [r1],#4 - STR r10,[r1],#4 @ w0 += 2 + STR r10,[r1],#4 ; w0 += 2 STR r3, [r7] STR r12,[r7,#4] CMP r5,r8 BLT step7_loop1 -step7_loop2: - LDR r6, [r1] @ r6 = w0[0] - LDR r9, [r1,#4] @ r9 = w0[1] - LDR r10,[r7,#-8]! @ r10= w1[0] w1 -= 2 - LDR r11,[r7,#4] @ r11= w1[1] - LDRB r14,[r5,-r2]! @ r12= T[1] T -= step - LDRB r12,[r5,#1] @ r14= T[0] +step7_loop2 + LDR r6, [r1] ; r6 = w0[0] + LDR r9, [r1,#4] ; r9 = w0[1] + LDR r10,[r7,#-8]! ; r10= w1[0] w1 -= 2 + LDR r11,[r7,#4] ; r11= w1[1] + LDRB r14,[r5,-r2]! 
; r12= T[1] T -= step + LDRB r12,[r5,#1] ; r14= T[0] - ADD r6, r6, r10 @ r6 = s0 = w0[0] + w1[0] - SUB r10,r6, r10,LSL #1 @ r10= s1b= w0[0] - w1[0] - SUB r11,r11,r9 @ r11= s1 = w1[1] - w0[1] - ADD r9, r11,r9, LSL #1 @ r9 = s0b= w1[1] + w0[1] + ADD r6, r6, r10 ; r6 = s0 = w0[0] + w1[0] + SUB r10,r6, r10,LSL #1 ; r10= s1b= w0[0] - w1[0] + SUB r11,r11,r9 ; r11= s1 = w1[1] - w0[1] + ADD r9, r11,r9, LSL #1 ; r9 = s0b= w1[1] + w0[1] MOV r6, r6, ASR #9 - MUL r3, r6, r14 @ r3 = s0*T[0] + MUL r3, r6, r14 ; r3 = s0*T[0] MOV r11,r11,ASR #9 - MUL r4, r11,r12 @ r4 += s1*T[1] = s2 + MUL r4, r11,r12 ; r4 += s1*T[1] = s2 ADD r3, r3, r4 - MUL r14,r11,r14 @ r14 = s1*T[0] - MUL r12,r6, r12 @ r12 += s0*T[1] = s3 + MUL r14,r11,r14 ; r14 = s1*T[0] + MUL r12,r6, r12 ; r12 += s0*T[1] = s3 SUB r14,r14,r12 - @ r9 = s0b<<1 - @ r10= s1b<<1 - ADD r9, r3, r9, ASR #1 @ r9 = s0b + s2 - SUB r3, r9, r3, LSL #1 @ r3 = s0b - s2 + ; r9 = s0b<<1 + ; r10= s1b<<1 + ADD r9, r3, r9, ASR #1 ; r9 = s0b + s2 + SUB r3, r9, r3, LSL #1 ; r3 = s0b - s2 - SUB r12,r14,r10,ASR #1 @ r12= s3 - s1b - ADD r10,r14,r10,ASR #1 @ r10= s3 + s1b + SUB r12,r14,r10,ASR #1 ; r12= s3 - s1b + ADD r10,r14,r10,ASR #1 ; r10= s3 + s1b STR r9, [r1],#4 - STR r10,[r1],#4 @ w0 += 2 + STR r10,[r1],#4 ; w0 += 2 STR r3, [r7] STR r12,[r7,#4] @@ -1066,110 +1066,110 @@ step7_loop2: LDMFD r13!,{r0-r3} - @ r0 = points - @ r1 = in - @ r2 = step - @ r3 = shift - MOV r2, r2, ASR #2 @ r2 = step >>= 2 + ; r0 = points + ; r1 = in + ; r2 = step + ; r3 = shift + MOV r2, r2, ASR #2 ; r2 = step >>= 2 CMP r2, #0 CMPNE r2, #1 BEQ mdct_end - @ step > 1 (default case) - CMP r2, #4 @ r5 = T = (step>=4) ? 
- LDRGE r5, =sincos_lookup0 @ sincos_lookup0 + - LDRLT r5, =sincos_lookup1 @ sincos_lookup1 - ADD r7, r1, r0, LSL #1 @ r7 = iX = x + (n>>1) - ADDGE r5, r5, r2, LSR #1 @ (step>>1) -mdct_step8_default: - LDR r6, [r1],#4 @ r6 = s0 = x[0] - LDR r8, [r1],#4 @ r8 = -s1 = x[1] - LDRB r12,[r5,#1] @ r12= T[1] - LDRB r14,[r5],r2 @ r14= T[0] T += step - RSB r8, r8, #0 @ r8 = s1 - - @ XPROD31(s0, s1, T[0], T[1], x, x+1) - @ x[0] = s0 * T[0] + s1 * T[1] x[1] = s1 * T[0] - s0 * T[1] + ; step > 1 (default case) + CMP r2, #4 ; r5 = T = (step>=4) ? + LDRGE r5, =sincos_lookup0 ; sincos_lookup0 + + LDRLT r5, =sincos_lookup1 ; sincos_lookup1 + ADD r7, r1, r0, LSL #1 ; r7 = iX = x + (n>>1) + ADDGE r5, r5, r2, LSR #1 ; (step>>1) +mdct_step8_default + LDR r6, [r1],#4 ; r6 = s0 = x[0] + LDR r8, [r1],#4 ; r8 = -s1 = x[1] + LDRB r12,[r5,#1] ; r12= T[1] + LDRB r14,[r5],r2 ; r14= T[0] T += step + RSB r8, r8, #0 ; r8 = s1 + + ; XPROD31(s0, s1, T[0], T[1], x, x+1) + ; x[0] = s0 * T[0] + s1 * T[1] x[1] = s1 * T[0] - s0 * T[1] MOV r6, r6, ASR #8 MOV r8, r8, ASR #8 - MUL r10,r8, r12 @ r10 = s1 * T[1] + MUL r10,r8, r12 ; r10 = s1 * T[1] CMP r1, r7 - MLA r10,r6, r14,r10 @ r10 += s0 * T[0] - RSB r6, r6, #0 @ r6 = -s0 - MUL r11,r8, r14 @ r11 = s1 * T[0] - MLA r11,r6, r12,r11 @ r11 -= s0 * T[1] + MLA r10,r6, r14,r10 ; r10 += s0 * T[0] + RSB r6, r6, #0 ; r6 = -s0 + MUL r11,r8, r14 ; r11 = s1 * T[0] + MLA r11,r6, r12,r11 ; r11 -= s0 * T[1] STR r10,[r1,#-8] STR r11,[r1,#-4] BLT mdct_step8_default -mdct_end: +mdct_end MOV r0, r2 LDMFD r13!,{r4-r11,PC} -bitrev: - .byte 0 - .byte 32 - .byte 16 - .byte 48 - .byte 8 - .byte 40 - .byte 24 - .byte 56 - .byte 4 - .byte 36 - .byte 20 - .byte 52 - .byte 12 - .byte 44 - .byte 28 - .byte 60 - .byte 2 - .byte 34 - .byte 18 - .byte 50 - .byte 10 - .byte 42 - .byte 26 - .byte 58 - .byte 6 - .byte 38 - .byte 22 - .byte 54 - .byte 14 - .byte 46 - .byte 30 - .byte 62 - .byte 1 - .byte 33 - .byte 17 - .byte 49 - .byte 9 - .byte 41 - .byte 25 - .byte 57 - .byte 5 - .byte 37 
- .byte 21 - .byte 53 - .byte 13 - .byte 45 - .byte 29 - .byte 61 - .byte 3 - .byte 35 - .byte 19 - .byte 51 - .byte 11 - .byte 43 - .byte 27 - .byte 59 - .byte 7 - .byte 39 - .byte 23 - .byte 55 - .byte 15 - .byte 47 - .byte 31 - .byte 63 - - @ END +bitrev + DCB 0 + DCB 32 + DCB 16 + DCB 48 + DCB 8 + DCB 40 + DCB 24 + DCB 56 + DCB 4 + DCB 36 + DCB 20 + DCB 52 + DCB 12 + DCB 44 + DCB 28 + DCB 60 + DCB 2 + DCB 34 + DCB 18 + DCB 50 + DCB 10 + DCB 42 + DCB 26 + DCB 58 + DCB 6 + DCB 38 + DCB 22 + DCB 54 + DCB 14 + DCB 46 + DCB 30 + DCB 62 + DCB 1 + DCB 33 + DCB 17 + DCB 49 + DCB 9 + DCB 41 + DCB 25 + DCB 57 + DCB 5 + DCB 37 + DCB 21 + DCB 53 + DCB 13 + DCB 45 + DCB 29 + DCB 61 + DCB 3 + DCB 35 + DCB 19 + DCB 51 + DCB 11 + DCB 43 + DCB 27 + DCB 59 + DCB 7 + DCB 39 + DCB 23 + DCB 55 + DCB 15 + DCB 47 + DCB 31 + DCB 63 + + END |