author | Nedeljko Babic <nbabic@mips.com> | 2012-02-25 10:20:52 +0100
---|---|---
committer | Nedeljko Babic <nbabic@mips.com> | 2012-04-03 14:21:48 +0200
commit | dc716e37fafdce4ed7b1d66f39de4617534736ff (patch) |
tree | 1f37558b12518dd7bc3b989c2b8f61bf9ba30bdd |
parent | 546899e66dbe4b08d0472b414193cda619087b5d (diff) |
download | tremor-dc716e37fafdce4ed7b1d66f39de4617534736ff.tar.gz |
Add folder with assembly files for MIPS
File decode_mapMIPS.S can be used on all MIPS32R2 architectures.
Files floor1_inverse2MIPS.S and mdct_backwardMIPS.S can be used on
MIPS32R2 architectures that support MIPS DSP ASE rev 1.
-rw-r--r-- | mips-dspr1/decode_mapMIPS.S | 409
-rw-r--r-- | mips-dspr1/floor1_inverse2MIPS.S | 269
-rw-r--r-- | mips-dspr1/mdct_backwardMIPS.S | 1864
3 files changed, 2542 insertions, 0 deletions
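A note for reading the diff body below: all three files hand-schedule the fixed-point primitives that Tremor's C code wraps in macros: MULT32 (the mult/mfhi pairs), MULT31 (the sll by 1 that follows each mfhi), and the XPROD31/XNPROD31 complex rotations named in the register comments. The C sketch below is reconstructed from those comments, not copied from Tremor's misc.h, so treat it as an approximation of the primitives rather than their canonical definitions:

    #include <stdint.h>

    typedef int32_t ogg_int32_t;

    /* high 32 bits of a signed 32x32->64 multiply (mult/mfhi in the asm) */
    static inline ogg_int32_t MULT32(ogg_int32_t x, ogg_int32_t y) {
      return (ogg_int32_t)(((int64_t)x * (int64_t)y) >> 32);
    }

    /* Q31 multiply: MULT32 promoted by the "<<1" seen after every mfhi */
    static inline ogg_int32_t MULT31(ogg_int32_t x, ogg_int32_t y) {
      return MULT32(x, y) << 1;
    }

    /* complex rotations; the signs match the aX[0]/aX[2] comments below */
    static inline void XPROD31(ogg_int32_t a, ogg_int32_t b,
                               ogg_int32_t t, ogg_int32_t v,
                               ogg_int32_t *x, ogg_int32_t *y) {
      *x = MULT31(a, t) + MULT31(b, v);
      *y = MULT31(b, t) - MULT31(a, v);
    }

    static inline void XNPROD31(ogg_int32_t a, ogg_int32_t b,
                                ogg_int32_t t, ogg_int32_t v,
                                ogg_int32_t *x, ogg_int32_t *y) {
      *x = MULT31(a, t) - MULT31(b, v);
      *y = MULT31(b, t) + MULT31(a, v);
    }

Each mult/madd (or mult/msub) pair in the assembly evaluates one such sum (or difference) of two MULT32 products directly in the HI register, which is why a single mfhi followed by an sll stands in for half of an XPROD31.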
diff --git a/mips-dspr1/decode_mapMIPS.S b/mips-dspr1/decode_mapMIPS.S
new file mode 100644
index 0000000..9ca36c9
--- /dev/null
+++ b/mips-dspr1/decode_mapMIPS.S
@@ -0,0 +1,409 @@
+/*****************************************************************************
+* Copyright (c) 2012
+* MIPS Technologies, Inc., California.
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+* 1. Redistributions of source code must retain the above copyright
+*    notice, this list of conditions and the following disclaimer.
+* 2. Redistributions in binary form must reproduce the above copyright
+*    notice, this list of conditions and the following disclaimer in the
+*    documentation and/or other materials provided with the distribution.
+* 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+*    contributors may be used to endorse or promote products derived from
+*    this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+* ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+* SUCH DAMAGE.
+*
+* Author: Nedeljko Babic (nbabic@mips.com)
+*****************************************************************************/
+#/***************************************************************************
+#*
+#* File: decode_map.S
+#*
+#* Description:
+#* basic codebook pack/unpack/code/decode operations
+#*
+#***************************************************************************/
+
+#/***************************************************************************
+#*
+#* Function: decode_map
+#*
+#* Description:
+#* decode vector / dim granularity guarding is done in the upper layer
+#*
+#* Parameters:
+#*
+#* a0 : pointer to codebook structure
+#* a1 : pointer to packet buffer
+#* a2 : vector array
+#* a3 : point
+#*
+#*
+#* Reference: see codebook.c
+#*
+#*
+#* Notes:
+#*
+#***************************************************************************/
+    .text
+    .align 2
+    .globl decode_map
+    .set nomips16
+    .set nomicromips
+    .ent decode_map
+    .type decode_map, @function
+    #### int decode_map(codebook *s, oggpack_buffer *b, ogg_int32_t *v,
+    ####                int point)
+    # a0: codebook *s
+    # a1: oggpack_buffer *b
+    # a2: ogg_int32_t *v
+    # a3: int point
+decode_map:
+    .frame $sp, 48, $ra
+    .set noreorder
+    .cpload $t9
+    .set reorder
+    addiu $sp, -48
+    .cprestore 16
+    sw $s0, 24($sp)
+    sw $s1, 28($sp)
+    sw $s2, 32($sp)
+    sw $s3, 36($sp)
+    sw $s4, 40($sp)
+    sw $ra, 44($sp)
+    move $s1, $a0
+    move $s2, $a1
+    move $s4, $a2
+    move $s3, $a3
+    jal decode_packed_entry_number
+    lw $s0, 8($s2)                  # s0: b->headend
+    move $a3, $v0                   # a3: entry =
+                                    # decode_packed_entry_number(s,b)
+    .set noreorder
+    .set nomacro
+    bltz $s0, decode_map_end        # if(b->headend<0)return (-1);
+    li $v0, -1                      # v0: return -1
+    .set macro
+    .set reorder
+    lw $s2, 44($s1)                 # s2: s->q_pack
+    lw $t1, 28($s1)                 # t1: s->q_minp
+    lw $t0, 24($s1)                 # t0: s->q_delp
+    mul $t6, $a3, $s2               # t6: entry*s->q_pack
+    lw $t3, 36($s1)                 # t3: s->q_min
+    sub $t1, $s3, $t1               # t1: add = point-s->q_minp
+    sub $t0, $s3, $t0               # t0: shiftM = point-s->q_delp
+    neg $t4, $t1                    # t4: -add
+    neg $t5, $t0                    # t5: -shiftM
+    srav $t9, $t3, $t1              # t9: add= s->q_min >> add
+    sll $t8, $t3, $t4               # t8: add= s->q_min << -add
+    lw $t2, 32($s1)                 # t2: mul = s->q_del
+    slt $t7, $zero, $t1             # if(add>0)
+    movz $t9, $t8, $t7
+    sllv $t8, $t2, $t5              # t8: mul <<= -shiftM
+    slti $t7, $t0, 0                # if (shiftM<0)
+    movn $t2, $t8, $t7              # mul <<= -shiftM
+    movn $t0, $zero, $t7            # shiftM = 0
+    lw $s0, 12($s1)                 # s0: s->dec_type
+    lw $a0, 20($s1)                 # a0: s->dim
+    lw $t4, 16($s1)                 # t4: s->q_bits
+    li $t1, 1
+    #### 1 used by test file 0 ####
+    #### according to decode type ####
+    sll $a0, 2                      # a0: s->dim * 4
+    addiu $s0, -1                   # s0: s->dec_type - 1
+    sllv $t9, $t9, $t0              # t9: add <<= shiftM
+    # switch(s->dec_type)
+    .set noreorder
+    .set nomacro
+    lw $t7, 40($s1)                 # t7: s->q_seq
+    move $s3, $s4                   # s3 -> &v[0]
+    beqz $s0, decode_map_case_1
+    li $v1, 0                       # v1: i=0
+    addiu $s0, -1                   # s0: s->dec_type - 2
+    slti $t3, $t4, 9                # if(s->q_bits<=8)
+    beqz $s0, decode_map_case_2
+    lw $t8, 48($s1)                 # t8 -> s->q_val
+    addiu $s0, -1                   # s0: s->dec_type - 3
+    bnez $s0, decode_map_end        # default case of switch statement
+    li $v0, -1
+decode_map_case_3:
+    #### offset into array ####
+    beqz $a0, decode_map_end_case
+    nop
+    beqz $t3, decode_map_case3_else_lp   # if(s->q_bits<=8)
+    add $t8, $t6                    # void *ptr= s->q_val+
+                                    # entry*s->q_pack;
+decode_map_case3_lp:
+    lbu $t5, 0($t8)                 # t5: ((unsigned char *)ptr)[i]
+    addiu $t8, 1
+    addiu $v1, 4                    # i++
+    mul $t5, $t2                    # t5: v[i] * mul
+    add $t5, $t9                    # t5: add + v[i] * mul
+    addiu $s3, 4
+    srav $t5, $t5, $t0              # t5:((add + 
v[i] * mul) >> shiftM) + bne $v1, $a0, decode_map_case3_lp # for(i=0;i<s->dim;i++) + sw $t5, -4($s3) # v[i]=((unsigned char *)ptr)[i]; + + b decode_map_end_case + nop + # s->q_bits>8 +decode_map_case3_else_lp: + lhu $t5, 0($t8) # t5: ((ogg_uint16_t *)ptr)[i] + addiu $t8, 2 + addiu $v1, 4 # i++ + mul $t5, $t2 # t5: v[i] * mul + add $t5, $t9 # t5: add + v[i] * mul + addiu $s3, 4 + srav $t5, $t5, $t0 # t5:((add + v[i] * mul) >> shiftM) + bne $v1, $a0, decode_map_case3_else_lp # for(i=0;i<s->dim;i++) + sw $t5, -4($s3) # v[i]=((ogg_uint16_t *)ptr)[i]; + + b decode_map_end_case + nop +decode_map_case_2: + #### packed vector of column offsets #### + beqz $a0, decode_map_end_case + sllv $t1, $t1, $s2 # t1: 1<<s->q_pack + beqz $t3, decode_map_case2_else_lp # if(s->q_bits<=8) + addiu $t1, -1 # t1: mask=(1<<s->q_pack)-1 + +decode_map_case2_lp: + and $t3, $a3, $t1 # t3: entry&mask +#if __mips_dsp__ + lbux $t5, $t3($t8) # t5:(unsigned char*)(s->q_val))[entry&mask] +#else + addu $t5,$t3,$t8 + lbu $t5,($t5) +#endif + srav $a3, $a3, $s2 # a3: entry>>=s->q_pack + addiu $v1, 4 # i++ + mul $t5, $t2 # t5: v[i] * mul + add $t5, $t9 # t5: add + v[i] * mul + addiu $s3, 4 + srav $t5, $t5, $t0 # t5:((add + v[i] * mul) >> shiftM) + bne $v1, $a0, decode_map_case2_lp + sw $t5, -4($s3) + + b decode_map_end_case + nop + # s->q_bits>8 +decode_map_case2_else_lp: + and $t3, $a3, $t1 # t3: entry&mask + sll $t3, 1 # t3: (entry&mask) * 2 + addu $t5,$t3,$t8 + lhu $t5,($t5) # t5:(ogg_uint16_t*)(s->q_val))[entry&mask] + srav $a3, $a3, $s2 # a3: entry>>=s->q_pack + addiu $v1, 4 # i++ + mul $t5, $t2 # t5: v[i] * mul + add $t5, $t9 # t5: add + v[i] * mul + addiu $s3, 4 + srav $t5, $t5, $t0 # t5:((add + v[i] * mul) >> shiftM) + bne $v1, $a0, decode_map_case2_else_lp + sw $t5, -4($s3) + + b decode_map_end_case + nop +decode_map_case_1: + #### packed vector of values #### + sllv $t1, $t1, $t4 # t1: 1<<s->q_bits + beqz $a0, decode_map_end_case + addiu $t1, -1 # t1: mask=(1<<s->q_bits)-1 + +decode_map_case_1_lp: + and $t3, $a3, $t1 # t3: entry&mask + mul $t3, $t2 # t3: (entry&mask)*mul + srav $a3, $a3, $t4 # a3: entry>>=s->q_bits + addiu $v1, 4 # i++ + addiu $s3, 4 # s3 -> &v[i+1] + add $t5, $t9, $t3 # add + (entry&mask) * mul + srav $t5, $t5, $t0 # ((add+(entry&mask)*mul)>>shiftM) + bne $v1, $a0, decode_map_case_1_lp + sw $t5, -4($s3) # v[i]= ((add + + # (entry&mask) * mul) >> shiftM) +decode_map_end_case: + beqz $t7, decode_map_end # if(s->q_seq) + li $v0, 0 # return 0 + addiu $a0, -4 # a0: s->dim -- + blez $a0, decode_map_end + lw $t0, 0($s4) # v[0] + +decode_map_finall_lp: + lw $t1, 4($s4) # v[i] + addiu $s4, 4 + addiu $a0, -4 # s->dim - 1 + add $t0, $t1, $t0 # v[i]+=v[i-1] + bnez $a0, decode_map_finall_lp # for(i=1;i<s->dim;i++) + sw $t0, 0($s4) + .set macro + .set reorder + +decode_map_end: + lw $s0, 24($sp) + lw $s1, 28($sp) + lw $s2, 32($sp) + lw $s3, 36($sp) + lw $s4, 40($sp) + lw $ra, 44($sp) + addiu $sp, 48 + jr $ra + .end decode_map +#/*************************************************************************** +#* +#* Function: vorbis_book_decodevv_add +#* +#* Description: +#* decode residual values +#* +#* Parameters: +#* +#* a0 : pointer to codebook structure +#* a1 : pointer to arrays of input buffer +#* a2 : offset in buffers +#* a3 : number of channels +#* : pointer to oggpack buffer +#* : samples per partition +#* : -8 +#* +#* +#* Reference: see codebook.c +#* +#* +#* Notes: +#* +#***************************************************************************/ + .text + .align 2 + .globl vorbis_book_decodevv_add + .set 
nomips16 + .set nomicromips + .ent vorbis_book_decodevv_add + .type vorbis_book_decodevv_add, @function + #### long vorbis_book_decodevv_add(codebook *book,ogg_int32_t **a, + #### long offset,int ch,oggpack_buffer *b, + #### int n,int point) + # a0: codebook *book + # a1: ogg_int32_t **a + # a2: long offset + # a3: int ch + # 16($sp): oggpack_buffer *b + # 20($sp): int n + # 24($sp): int point +vorbis_book_decodevv_add: + .frame $sp, 64, $ra + .set noreorder + .cpload $t9 + .set reorder + addiu $sp, -64 + lw $t0, 52($a0) # t0: book->used_entries + .cprestore 16 + sw $s0, 24($sp) + sw $s1, 28($sp) + sw $s2, 32($sp) + sw $s3, 36($sp) + sw $s4, 40($sp) + sll $s2, $a2, 2 # s2: i = offset * 4 + li $s1, 0 # s1: chptr = 0 + sll $s4, $a3, 2 # s4: ch * 4 + slt $t1, $zero, $t0 + sw $s5, 44($sp) + sw $s6, 48($sp) + sw $s7, 52($sp) + sw $s8, 56($sp) + sw $ra, 60($sp) + .set noreorder + .set nomacro + beqz $t1, vorbis_book_decodevv_add_end # if(book->used_entries>0) + li $v0, 0 # return 0 + .set macro + .set reorder + + lw $s5, 56($a0) # s5: ogg_int32_t *v = + # book->dec_buf; + lw $s7, 84($sp) # s7: n + move $s0, $a1 # s0: a + .set noreorder + .set nomacro + beqz $s5, vorbis_book_decodevv_add_end # if (!v) return -1 + li $v0, -1 # return -1 + .set macro + .set reorder + lw $s3, 20($a0) # s3: book -> dim + sll $s7, 2 # n * 4 + addu $s7, $s2 # s7: offset + n + lw $s6, 0($s0) # s6: &a[0][0] + sll $s3, 2 # s3: (book -> dim) * 4 + add $s6, $s2 # s6: &a[0][offset] + + move $s8, $a0 # s8: book +vorbis_book_decodevv_add_i_lp: + lw $a1, 80($sp) # a1: b + lw $a3, 88($sp) # a3: point + move $a2, $s5 # a2: v + jal decode_map # decode_map returns 0 or -1 + .set noreorder + .set nomacro + bnez $v0, vorbis_book_decodevv_add_end # if(decode_map(book,b,v,point)) + # return -1 + li $t9, 0 # t9: j = 0 + +vorbis_book_decodevv_add_j_lp: + .set macro + .set reorder + addiu $s1, 4 # s1: chptr++ + lw $t4, 0($s6) # t4: a[chptr][i] +#if __mips_dsp__ + lwx $t0, $t9($s5) # t0: v[j] +#else + addu $t0, $t9, $s5 + lw $t0,($t0) +#endif + addiu $t9, 4 # j++ + add $t0, $t4 # t0: a[chptr++][i]+=v[j] + sw $t0, 0($s6) +#if __mips_dsp__ + lwx $s6, $s1($s0) # s6: &a[chptr++][i] +#else + addu $s6, $s1, $s0 + lw $s6,($s6) +#endif + bne $s1, $s4, vorbis_book_decodevv_add_j_end + lw $s6, 0($s0) # s6: &a[0][0] + li $s1, 0 # s1: chptr = 0 + addiu $s2, 4 # s2: i++ +vorbis_book_decodevv_add_j_end: + .set noreorder + .set nomacro + bne $t9, $s3, vorbis_book_decodevv_add_j_lp + add $s6, $s2 # s6: next &a[chptr][i] + bne $s2, $s7, vorbis_book_decodevv_add_i_lp + move $a0, $s8 # a0: book + .set macro + .set reorder + +vorbis_book_decodevv_add_end: + lw $ra, 60($sp) + lw $s0, 24($sp) + lw $s1, 28($sp) + lw $s2, 32($sp) + lw $s3, 36($sp) + lw $s4, 40($sp) + lw $s5, 44($sp) + lw $s6, 48($sp) + lw $s7, 52($sp) + lw $s8, 56($sp) + addiu $sp, 64 + jr $ra + .end vorbis_book_decodevv_add
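\ No newline at end of file

A reading aid before the next file: the chptr/i/j bookkeeping implemented by vorbis_book_decodevv_add above follows the C reference that its header comment points at (codebook.c). Roughly, as a from-memory sketch that assumes Tremor's codebook and oggpack_buffer types rather than quoting the verbatim source:

    /* Accumulate decoded residue vectors across ch channels, one value
     * per channel per sample index, starting at a[*][offset]. */
    long vorbis_book_decodevv_add(codebook *book, ogg_int32_t **a,
                                  long offset, int ch,
                                  oggpack_buffer *b, int n, int point) {
      if (book->used_entries > 0) {
        long i, j;
        int chptr = 0;
        ogg_int32_t *v = book->dec_buf;  /* scratch vector, book->dim entries */
        if (!v) return -1;
        for (i = offset; i < offset + n;) {
          /* decode_map also reports b->headend going negative */
          if (decode_map(book, b, v, point)) return -1;
          for (j = 0; j < book->dim; j++) {
            a[chptr++][i] += v[j];       /* interleave across channels */
            if (chptr == ch) {           /* wrap to channel 0, next sample */
              chptr = 0;
              i++;
            }
          }
        }
      }
      return 0;
    }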
diff --git a/mips-dspr1/floor1_inverse2MIPS.S b/mips-dspr1/floor1_inverse2MIPS.S
new file mode 100644
index 0000000..569bcfc
--- /dev/null
+++ b/mips-dspr1/floor1_inverse2MIPS.S
@@ -0,0 +1,269 @@
+/*****************************************************************************
+* Copyright (c) 2012
+* MIPS Technologies, Inc., California.
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+* 1. Redistributions of source code must retain the above copyright
+*    notice, this list of conditions and the following disclaimer.
+* 2. Redistributions in binary form must reproduce the above copyright
+*    notice, this list of conditions and the following disclaimer in the
+*    documentation and/or other materials provided with the distribution.
+* 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+*    contributors may be used to endorse or promote products derived from
+*    this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+* ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+* SUCH DAMAGE.
+*
+* Author: Nedeljko Babic (nbabic@mips.com)
+*****************************************************************************/
+#/***************************************************************************
+#*
+#* File: floor1_inverse2_asm.S
+#*
+#* Description:
+#* one of the floor backend 1 implementation functions
+#*
+#***************************************************************************/
+
+#/***************************************************************************
+#*
+#* Function: floor1_inverse2
+#*
+#* Description:
+#*
+#* Parameters:
+#*
+#* a0 : pointer to vorbis_dsp_state; the current vorbis audio
+#*      analysis/synthesis state
+#* a1 : pointer to floor parameters
+#* a2 : floor memory (the fit_value array)
+#* a3 : pointer to out pcm buffer +#* +#* +#* Reference: see floor1.c +#* +#* +#* Notes: +#* +#***************************************************************************/ + .text + .align 2 + .globl floor1_inverse2 + .set nomips16 + .set nomicromips + .ent floor1_inverse2 + .type floor1_inverse2, @function + + #### int floor1_inverse2(vorbis_dsp_state *vd,vorbis_info_floor *in, + #### ogg_int32_t *fit_value,ogg_int32_t *out) #### +floor1_inverse2: + .frame $sp, 24, $ra + .set noreorder + .cpload $t9 + .set reorder + addiu $sp, -24 + lw $t1, 0($a0) # *(vd -> vi) + lw $t0, 48($a0) # vd -> W + lw $t2, 28($t1) # ci->blocksizes + sll $t0, 2 + lwx $a0, $t0($t2) # ci->blocksizes[vd->W] + + .set noreorder + .set nomacro + bnez $a2, render_lines + sra $a0, 1 # n=ci->blocksizes[vd->W]/2 + +floor1_inverse2_memset: # memset(out,0,sizeof(*out)*n); + addiu $a0, -8 + sw $zero, 0($a3) + sw $zero, 4($a3) + sw $zero, 8($a3) + sw $zero, 12($a3) + sw $zero, 16($a3) + sw $zero, 20($a3) + sw $zero, 24($a3) + sw $zero, 28($a3) + bgtz $a0, floor1_inverse2_memset + addiu $a3, 32 + .set macro + .set reorder + + addiu $sp, 24 + li $v0, 0 # v0: return (0) + jr $ra + +render_lines: + sw $s0, 0($sp) + sw $s1, 4($sp) + sw $s2, 8($sp) + sw $s3, 12($sp) + sw $s4, 16($sp) + sw $s5, 20($sp) + move $s4, $a0 # store n + #### render the lines #### + lw $t1, 32($a1) # t1 = info->mult /* 1 2 3 or 4 */ + lw $t2, 0($a2) # t2 = fit_value[0] + li $s2, 0 # s2: int hx=0 + srl $v0, $t1, 1 # v0 = info->mult / 2 + andi $v1, $t1, 1 # v1 = info->mult % 2 + movz $v1, $zero, $v0 + sllv $s5, $t2, $v0 # s5: fit_value[0]*info->mult + add $t2, $s5 # t2: fit_value[0]*info->mult + movn $s5, $t2, $v1 # s5: ly = fit_value[0]*info->mult + li $t8, 1 # t8: j=1 + lw $s0, 12($a1) # s0: &(info->forward_index[0]) + lw $t9, 28($a1) # t9: info->posts + lbux $t5, $t8($s0) # t5: int current= + # info->forward_index[j] + # //(forward_index[j]>0) + lw $s1, 8($a1) # s1: &(info->postlist[0]) + li $a1, 0 # a1: int lx=0 + beq $t8, $t9, floor1_inverse2_no_lp # if (j==info->posts) + +floor1_inverse2_lp: + sll $t5, 2 # t5 = current * 4 + lwx $t2, $t5($a2) # t2: fit_value[current] + sra $t5, 1 # t5 = current * 2 + andi $s3, $t2, 0x7FFF # s3: int hy=fit_value[current]&0x7fff + addu $s2, $t5, $s1 + lhu $s2, ($s2) # s2: hx=info->postlist[current] + .set noreorder + .set nomacro + bne $s3, $t2, floor1_inverse2_16 + addiu $t8, 1 + .set macro + .set reorder + sllv $t2, $s3, $v0 # t2: hy*=info->mult + add $s3, $t2 # + movz $s3, $t2, $v1 # s3: hy *= info->mult + #### inlined static void render_line(int n,int x0,int x1,int y0, + #### int y1,ogg_int32_t *d) #### + sub $t2, $s3, $s5 # t2: dy = y1 - y0 + sub $t3, $s2, $a1 # t3: adx = x1 - x0 + absq_s.w $t4, $t2 # t4: ady=abs(dy) + div $zero, $t2, $t3 + teq $t3, $zero, 0x7 + slt $t6, $s2, $a0 # if(n>x1) + movn $a0, $s2, $t6 # n = x1 + addiu $t5, $t3, -1 # t5: err = adx - 1 + la $t6, FLOOR_fromdB_LOOKUP + sll $s5, 2 # y0 * 4 + sub $a0, $a1 # n -= x0 + .set noreorder + .set nomacro + blez $a0, render_line_end + add $t6, $s5 # t6: floor=&FLOOR_fromdB_LOOKUP[y0]; + .set macro + .set reorder + sll $a1, 2 # a1: x0*4 + add $t0, $a3, $a1 # t0: d += x0 + mflo $t1 # t1: base =dy/adx; + mul $t7, $t3, $t1 # t7: base*adx + sll $t1, 2 # base * 4 + absq_s.w $t7, $t7 # abs(base*adx); + .set noreorder + .set nomacro + bgez $t2, render_line_lp_start # if (dy < 0) + sub $t4, $t7 # ady-=abs(base*adx); + .set macro + .set reorder + addiu $t1, -4 # (base * 4)-- + sub $t4, $t3, $t4 # ady = adx-ady + li $t5, 0 # err = 0 + + # one lap unrolled to fill up 
the stalls +render_line_lp_start: + lw $t2, 0($t0) # t2: *d + lw $t7, 0($t6) # t7: *floor + addiu $a0, -1 + addiu $t0, 4 # d++ + mult $t2, $t7 # MULT32(*d,*floor) + .set noreorder + .set nomacro + beqz $a0, render_line_lp_end + add $t6, $t1 # floor += base + .set macro + .set reorder +render_line_lp: + sub $t5, $t4 # err -= ady + lw $t2, 0($t0) # t2: *d + addiu $t0, 4 # d++ + .set noreorder + .set nomacro + bgez $t5, render_line_err + extr.w $s5, $ac0, 15 # MULT31_SHIFT15(*d,*floor); + .set macro + .set reorder + add $t5, $t3 # err += adx + addiu $t6, 4 # floor += 1 +render_line_err: + lw $t7, 0($t6) # t7: *floor + add $t6, $t1 # floor += base + addiu $a0, -1 # n-- + mult $t2, $t7 # MULT32(*d,*floor) + .set noreorder + .set nomacro + bgtz $a0, render_line_lp + sw $s5, -8($t0) +render_line_lp_end: + sub $t5, $t4 # err -= ady + bgez $t5, render_line_err_1 + extr.w $s5, $ac0, 15 # MULT31_SHIFT15(*d,*floor); + .set macro + .set reorder + add $t5, $t3 # err += adx + addiu $t6, 4 # floor += 1 +render_line_err_1: + sw $s5, -4($t0) + +render_line_end: + move $a0, $s4 # a0 = n + move $a1, $s2 # lx = hx + move $s5, $s3 # ly = hy +floor1_inverse2_16: + .set noreorder + .set nomacro + bne $t8, $t9, floor1_inverse2_lp # for(j=1;j<info->posts;j++) + lbux $t5, $t8($s0) # t5: int current= + # info->forward_index[j] + +floor1_inverse2_no_lp: + move $s0, $a3 # s0: &out[0] + beq $s2, $a0, floor1_inverse2_end + li $v0, 1 + sll $s2, 2 # hx*4 + sll $a0, 2 # n * 4 + addu $s0, $s2 # s0: &out[hx] + +floor1_inverse2_lp2: + lw $t0, 0($s0) # out[j] + addiu $s0, 4 + mul $t0, $s5 # out[j]*=ly /* be certain */ + addiu $s2, 4 + bne $s2, $a0, floor1_inverse2_lp2 # for(j=hx;j<n;j++) + sw $t0, -4($s0) + .set macro + .set reorder + +floor1_inverse2_end: + lw $s0, 0($sp) + lw $s1, 4($sp) + lw $s2, 8($sp) + lw $s3, 12($sp) + lw $s4, 16($sp) + lw $s5, 20($sp) + addiu $sp, 24 + jr $ra + .end floor1_inverse2
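\ No newline at end of file

One more reading aid: floor1_inverse2 above inlines render_line instead of calling it. In C, the slope walk looks approximately like the sketch below, written from memory against floor1.c's conventions; the assembly seeds err with adx-1 and counts down rather than accumulating up, but it visits the same y values. FLOOR_fromdB_LOOKUP and MULT31_SHIFT15 are Tremor's dB lookup table and Q-format multiply:

    #include <stdlib.h>  /* abs */

    /* Scale d[x0..n) by the floor curve running from (x0,y0) to (x1,y1),
     * stepping y with a Bresenham-style integer slope. */
    static void render_line(int n, int x0, int x1, int y0, int y1,
                            ogg_int32_t *d) {
      int dy   = y1 - y0;
      int adx  = x1 - x0;
      int ady  = abs(dy);
      int base = dy / adx;              /* the div/mflo pair above */
      int sy   = (dy < 0 ? base - 1 : base + 1);
      int x = x0, y = y0, err = 0;

      ady -= abs(base * adx);
      if (n > x1) n = x1;

      if (x < n)
        d[x] = MULT31_SHIFT15(d[x], FLOOR_fromdB_LOOKUP[y]);

      while (++x < n) {
        err += ady;
        if (err >= adx) { err -= adx; y += sy; }
        else            {             y += base; }
        d[x] = MULT31_SHIFT15(d[x], FLOOR_fromdB_LOOKUP[y]);
      }
    }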
diff --git a/mips-dspr1/mdct_backwardMIPS.S b/mips-dspr1/mdct_backwardMIPS.S
new file mode 100644
index 0000000..a97e193
--- /dev/null
+++ b/mips-dspr1/mdct_backwardMIPS.S
@@ -0,0 +1,1864 @@
+/*****************************************************************************
+* Copyright (c) 2012
+* MIPS Technologies, Inc., California.
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+* 1. Redistributions of source code must retain the above copyright
+*    notice, this list of conditions and the following disclaimer.
+* 2. Redistributions in binary form must reproduce the above copyright
+*    notice, this list of conditions and the following disclaimer in the
+*    documentation and/or other materials provided with the distribution.
+* 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+*    contributors may be used to endorse or promote products derived from
+*    this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+* ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+* SUCH DAMAGE.
+*
+* Author: Nedeljko Babic (nbabic@mips.com)
+*****************************************************************************/
+#/***************************************************************************
+#*
+#* File: mdct_backward.S
+#*
+#* Description:
+#*
+#***************************************************************************/
+
+#/***************************************************************************
+#*
+#* Function: mdct_backward
+#*
+#* Parameters:
+#*
+#* a0 : n, the size of the inverse MDCT
+#* a1 : pointer to the input block, transformed in place
+#*
+#*
+#* Reference: see mdct.c
+#*
+#*
+#* Notes:
+#* partial; doesn't perform last-step deinterleave/unrolling. 
That +#* can be done more efficiently during pcm output +#* +#***************************************************************************/ +#ifdef _LOW_ACCURACY_ +#define cPI3_8 (0x0062) +#define cPI2_8 (0x00b5) +#define cPI1_8 (0x00ed) +#else +#define cPI3_8 (0x30fbc54d) +#define cPI2_8 (0x5a82799a) +#define cPI1_8 (0x7641af3d) +#endif + +#******************************** mdct_backward ******************** + .text + .align 2 + .globl mdct_backward + .set nomips16 + .set nomicromips + .ent mdct_backward + .type mdct_backward, @function + #### void mdct_backward(int n, DATA_TYPE *in) #### + # a0 = n + # a1 = &in +mdct_backward: + .frame $sp, 56, $ra + .set noreorder + .cpload $t9 + .set reorder + + addiu $sp, -56 + li $a3, -1 + li $t0, 4 # t0: shift = 4 + sw $a0, 56($sp) # n + sw $a1, 60($sp) # &in + sw $s0, 16($sp) + sw $s1, 20($sp) + sw $s2, 24($sp) + sw $s5, 36($sp) + sw $s6, 40($sp) + sw $s7, 44($sp) + sw $s8, 48($sp) + sw $ra, 52($sp) + sll $ra, $a0, 1 # ra: (points >> i) * 4 + sw $s3, 28($sp) + sw $s4, 32($sp) + li $t1, 1 + +mdct_backward_shift: + sllv $t3, $t1, $t0 # 1<<shift + and $t3, $t3, $a0 # (n&(1<<shift)) + .set noreorder + .set nomacro + beqz $t3, mdct_backward_shift # for (shift=4;!(n&(1<<shift)); + # shift++) + addiu $t0, 1 # shift++ + .set macro + .set reorder + + addiu $t0, -1 # t0: shift + li $t1, 13 # t1: 13 + sub $t0, $t1, $t0 # shift=13-shift; + li $t1, 2 + sllv $a2, $t1, $t0 # a2: step=2<<shift + sw $t0, 0($sp) # shift + +#******************************** presymmetry ******************** + addu $t9, $a1, $a0 # t9 = in + n/4 + sll $a0, 1 # a0: (n/2)*4 + addu $s1, $a1, $a0 + addiu $s1, -3*4 # s1: aX = in + n2 - 3 + la $s0, sincos_lookup0 # s0: T = sincos_lookup0 + sw $a2, 4($sp) + sll $a2, 2 # step * 4 + + lw $t3, 0($s1) # t3: r0= aX[0]; + lw $t0, 0($s0) # t0: T[0] + lw $t2, 2*4($s1) # t2: r2= aX[2]; + lw $t1, 4($s0) # t1: T[1] + # XPROD31( r0, r2, T[0], T[1], &aX[0], &aX[2] ); + # pipeline warm-up before loop + mult $t3, $t0 # MULT32(r0, T[0]) + mtlo $zero + madd $t2, $t1 # MULT32(r0, T[0]) + + # MULT32(r2, T[1]) + addiu $s1, -4*4 # aX -= 4 +presymmetry_lp1: + add $s0, $a2 # T += step + slt $t8, $s1, $t9 # (aX>=in+n4); + mfhi $t5 + mult $t2, $t0 # MULT32(r2, T[0]) + mtlo $a3 + msub $t3, $t1 # MULT32(r2, T[0]) - + # MULT32(r0, T[1]) + lw $t3, 0($s1) # t3: r0= aX[0]; + lw $t0, 0($s0) # t0: T[0] + lw $t2, 2*4($s1) # t2: r2= aX[2]; + sll $t5, 1 # (MULT32(r0,T[0]) + + # MULT32(r2,T[1]))<<1 + mfhi $t6 + sw $t5, 4*4($s1) # aX[0] = (MULT32(r0, T[0]) + + # MULT32(r2, T[1]))<<1 + lw $t1, 4($s0) # t1: T[1] + # XPROD31( r0, r2, T[0], T[1], &aX[0], &aX[2] ); + mult $t3, $t0 # MULT32(r0, T[0]) + mtlo $zero + addiu $s1, -4*4 # aX -= 4 + madd $t2, $t1 # MULT32(r0, T[0]) + + # MULT32(r2, T[1]) + sll $t6, 1 # (MULT32(r2,T[0]) - + # MULT32(r0,T[1])) << 1 + .set noreorder + .set nomacro + beqz $t8, presymmetry_lp1 # while(aX>=in+n4); + sw $t6, 4*10($s1) # aX[2] = (MULT32(r2, T[0]) - + # MULT32(r0, T[1]))<<1 + + # pipeline warm-up before loop + lw $t3, 4*4($s1) # t3: r0= aX[0]; + lw $t0, 4($s0) # t0: T[1] + lw $t2, 6*4($s1) # t2: r2= aX[2]; + lw $t1, 0($s0) # t1: T[0] + # XPROD31( r0, r2, T[1], T[0], &aX[0], &aX[2] ); + mult $t3, $t0 # MULT32(r0, T[1]) + mtlo $zero + madd $t2, $t1 # MULT32(r0, T[1]) + + # MULT32(r2, T[0]) +presymmetry_lp2: + sub $s0, $a2 # T -= step + slt $t8, $s1, $a1 # (aX>=in); + mfhi $t5 + mult $t2, $t0 # MULT32(r2, T[1]) + mtlo $a3 + msub $t3, $t1 # MULT32(r2, T[1]) - + # MULT32(r0, T[0]) + lw $t3, 0($s1) # t3: r0= aX[0]; + lw $t0, 4($s0) # t0: T[1] + lw $t2, 
2*4($s1) # t2: r2= aX[2]; + sll $t5, 1 # (MULT32(r0, T[1]) + + # MULT32(r2, T[0]))<<1 + mfhi $t6 + sw $t5, 4*4($s1) # aX[0] = (MULT32(r0, T[1]) + + # MULT32(r2, T[0]))<<1 + lw $t1, 0($s0) # t1: T[0] + # XPROD31( r0, r2, T[1], T[0], &aX[0], &aX[2] ); + mult $t3, $t0 # MULT32(r0, T[1]) + mtlo $zero + addiu $s1, -4*4 # aX -= 4 + madd $t2, $t1 # MULT32(r0, T[1]) + + # MULT32(r2, T[0]) + sll $t6, 1 # (MULT32(r2, T[1]) - + # MULT32(r0, T[0])) << 1 + beqz $t8, presymmetry_lp2 # while(aX>=in) + sw $t6, 4*10($s1) # aX[2] = (MULT32(r2, T[1]) - + # MULT32(r0, T[0]))<<1 + .set macro + .set reorder + + addu $s1, $a1, $a0 # + addiu $s1, -4*4 # s1: aX = in + n2 - 4 + move $s2, $a1 # s2: bX = in + la $s0, sincos_lookup0 # s0: T = sincos_lookup0 + + # pipeline warm-up before loop + lw $v0, 0($s2) # v0: ro0= bX[0]; + lw $v1, 8($s2) # v1: ro2= bX[2]; + lw $t0, 0($s0) # t0 = T[0] + lw $t1, 4($s0) # t1 = T[1] + lw $t3, 0($s1) # t3: ri0= aX[0]; + lw $t2, 8($s1) # t2: ri2= aX[2]; + # XNPROD31( ro2, ro0, T[1], T[0], &aX[0], &aX[2] ) + mult $v1, $t1 # MULT32(ro2, T[1]) + mtlo $a3 + msub $v0, $t0 # MULT32(ro2, T[1]) - + # MULT32(ro0, T[0]) + addu $s0, $a2 # T += step +presymmetry_lp3: + addiu $s1, -4*4 # aX -= 4 + addiu $s2, 4*4 # bX += 4 + lw $t7, 0($s0) # t7 = T[0] + mfhi $t5 + mult $v0, $t1 # MULT32(ro0, T[1]) + mtlo $zero + madd $v1, $t0 # MULT32(ro0, T[1]) + + # MULT32(ro2, T[0]) + slt $t8, $s1, $s2 # (aX>=bX); + lw $t4, 4($s0) # t4 = T[1] + lw $v0, 0($s2) # v0: ro0= bX[0]; + lw $v1, 8($s2) # v1: ro2= bX[2]; + mfhi $t6 + # XNPROD31( ri2, ri0, T[0], T[1], &bX[0], &bX[2] ) + mult $t2, $t7 # MULT32(ri2, T[0]) + mtlo $a3 + msub $t3, $t4 # MULT32(ri2, T[0]) - + # MULT32(ri0, T[1]) + sll $t5, 1 # (MULT32(ro2, T[1]) - + # MULT32(ro0, T[0]))<<1 + sw $t5, 4*4($s1) # aX[0] = (MULT32(ro2, T[1]) - + # MULT32(ro0, T[0]))<<1 + lw $t0, 0($s0) # t0 = T[0] + sll $t6, 1 # (MULT32(ro0, T[1]) + + # MULT32(ro2, T[0]))<<1 + mfhi $t5 + mult $t3, $t7 # MULT32(ri0, T[0]) + mtlo $zero + madd $t2, $t4 # MULT32(ri2, T[0]) + + # MULT32(ri0, T[1]) + sw $t6, 6*4($s1) # aX[2] = (MULT32(ro0, T[1]) + + # MULT32(ro2, T[0]))<<1 + lw $t1, 4($s0) # t1 = T[1] + lw $t3, 0($s1) # t3: ri0= aX[0]; + sll $t5, 1 # (MULT32(ri2, T[0]) - + # MULT32(ri0, T[1]))<<1 + mfhi $t6 + sw $t5, -4*4($s2) # bX[0] = (MULT32(ri2, T[0]) - + # MULT32(ri0, T[1]))<<1 + lw $t2, 8($s1) # t2: ri2= aX[2]; + # XNPROD31( ro2, ro0, T[1], T[0], &aX[0], &aX[2] ) + mult $v1, $t1 # MULT32(ro2, T[1]) + mtlo $a3 + msub $v0, $t0 # MULT32(ro2, T[1]) - + # MULT32(ro0, T[0]) + addu $s0, $a2 # T += step + sll $t6, 1 # (MULT32(ri0, T[0]) + + # MULT32(ri2, T[1]))<<1 + .set noreorder + .set nomacro + beqz $t8, presymmetry_lp3 # while(aX>=bX); + sw $t6, -2*4($s2) # bX[2] = (MULT32(ri0, T[0]) + + # MULT32(ri2, T[1]))<<1 + +#******************************** mdct_butterflies ******************** + lw $a2, 0($sp) # a2: shift + li $t0, 6 + sub $s5, $t0, $a2 # s5: --stages=6-shift; + bltz $s5, mdct_butterfly_32 + li $s6, 0 # s6: i = 0 + .set macro + .set reorder + addiu $s5, 1 # to simplify for loop test + li $s7, 0 # s7: j = 0 + li $s8, 1 # s8: 1<<i + +mdct_butterflies_lp_1: + lw $t2, 0($sp) # t2: shift + move $a0, $ra # a0: (points>>i) * 4 + addu $t0, $a1, $ra # x+(points>>i)*(j+1)*4 + sw $t0, 8($sp) + li $a2, 16 # a2: 4 * 4 + add $t2, $s6 # t2: i+shift + +mdct_butterflies_lp_2: + sllv $a2, $a2, $t2 # a2: 4 * 4 << (i + shift) + # from this point variable names from mdct_butterfly_generic + addu $s1, $a1, $a0 # s1: x1 = x + points + sra $a0, 1 # a0: (points * 4) >> 1 + addiu $s1, -4*4 # x + points - 4 
+ addu $s2, $a1, $a0 # s2: x + (points >> 1) + addiu $s2, -4*4 # s2: x2 = x + (points >> 1) - 4 + la $s0, sincos_lookup0 # s0: LOOKUP_T *T = sincos_lookup0; + move $t8, $a2 # counter + slti $t9, $a2, 4*1024 + # pipeline warm-up before loop + lw $t0, 0($s1) # t0: x1[0] + lw $t2, 8($s1) # t2: x1[2] + .set noreorder + .set nomacro + beqz $t9, mdct_butterfly_generic_lp1_end # jump if only one iteration + lw $t1, 4($s1) # t1: x1[1] + +mdct_butterfly_generic_lp1: + lw $t3, 12($s1) # t3: x1[3] + lw $v0, 0($s0) # v0: T[0] + lw $v1, 4($s0) # v1: T[1] + sub $s4, $t3, $t2 # s4: r1 = x1[3] - x1[2]; + sub $s3, $t0, $t1 # s3: r0 = x1[0] - x1[1]; + # XPROD31( r1, r0, T[0], T[1], &x2[0], &x2[2] ); + mult $s4, $v0 # MULT32(r1, T[0]) + mtlo $zero + madd $s3, $v1 # (MULT32(r1,T[0]) + + # MULT32(r0,T[1])) + add $t0, $t1 # x1[0] += x1[1]; + lw $t4, 0($s2) # t4: x2[0] + lw $t5, 4($s2) # t5: x2[1] + lw $t6, 8($s2) # t6: x2[2] + mfhi $t1 + mult $s3, $v0 # MULT32(r0, T[0]) + mtlo $a3 + msub $s4, $v1 # (MULT32(r0,T[0]) - + # MULT32(r1,T[1])) + lw $t7, 12($s2) # t7: x2[3] + sub $s3, $t5, $t4 # s3: r2 = x2[1] - x2[0]; + sub $s4, $t7, $t6 # s4: r3 = x2[3] - x2[2]; + sll $t1, 1 # (MULT32(r1,T[0]) + + # MULT32(r0,T[1]))<<1 + sw $t1, 0($s2) + add $t2, $t3 # x1[2] += x1[3]; + mfhi $t3 + # XPROD31( r2, r3, T[0], T[1], &x2[1], &x2[3] ); + mult $s3, $v0 # MULT32(r2, T[0]) + mtlo $zero + madd $s4, $v1 # (MULT32(r2,T[0]) + + # MULT32(r3,T[1])) + add $t5, $t4 # x1[1] = x2[1] + x2[0]; + add $t6, $t7 # x1[3] = x2[3] + x2[2]; + sw $t0, 0($s1) + sll $t3, 1 # (MULT32(r1,T[0]) + + # MULT32(r0,T[1]))<<1 + sw $t3, 8($s2) + sw $t2, 8($s1) + mfhi $t1 + mult $s4, $v0 # MULT32(r3, T[0]) + mtlo $a3 + msub $s3, $v1 # (MULT32(r3,T[0]) - + # MULT32(r2,T[1])) + sw $t5, 4($s1) + sw $t6, 12($s1) + addu $t8, $a2 # counter += 4* step + sll $t1, 1 # (MULT32(r2,T[0]) + + # MULT32(r3,T[1]))<<1 + sw $t1, 4($s2) + slti $t9, $t8, 4 * 1024 # while(T<sincos_lookup0+1024); + mfhi $t3 + addiu $s1, -4*4 # x1 -= 4; + addu $s0, $a2 # T += step; + addiu $s2, -4*4 # x2 -= 4; + lw $t0, 0($s1) # t0: x1[0] + lw $t2, 8($s1) # t2: x1[2] + lw $t1, 4($s1) # t1: x1[1] + sll $t3, 1 # (MULT32(r1,T[0]) + + # MULT32(r0,T[1]))<<1 + bnez $t9, mdct_butterfly_generic_lp1 + sw $t3, 7*4($s2) + .set macro + .set reorder + + # pipeline drained after mdct_butterfly_generic_lp1 +mdct_butterfly_generic_lp1_end: + lw $t3, 12($s1) # t3: x1[3] + lw $v0, 0($s0) # v0: T[0] + lw $v1, 4($s0) # v1: T[1] + sub $s4, $t3, $t2 # s4: r1 = x1[3] - x1[2]; + sub $s3, $t0, $t1 # s3: r0 = x1[0] - x1[1]; + # XPROD31( r1, r0, T[0], T[1], &x2[0], &x2[2] ); + mult $s4, $v0 # MULT32(r1, T[0]) + mtlo $zero + madd $s3, $v1 # (MULT32(r1,T[0]) + + # MULT32(r0,T[1])) + add $t0, $t1 # x1[0] += x1[1]; + lw $t4, 0($s2) # t4: x2[0] + lw $t5, 4($s2) # t5: x2[1] + lw $t6, 8($s2) # t6: x2[2] + mfhi $t1 + mult $s3, $v0 # MULT32(r0, T[0]) + mtlo $a3 + msub $s4, $v1 # (MULT32(r0,T[0]) - + # MULT32(r1,T[1])) + lw $t7, 12($s2) # t7: x2[3] + sub $s3, $t5, $t4 # s3: r2 = x2[1] - x2[0]; + sub $s4, $t7, $t6 # s4: r3 = x2[3] - x2[2]; + sll $t1, 1 # (MULT32(r1,T[0]) + + # MULT32(r0,T[1]))<<1 + sw $t1, 0($s2) + add $t2, $t3 # x1[2] += x1[3]; + mfhi $t3 + # XPROD31( r2, r3, T[0], T[1], &x2[1], &x2[3] ); + mult $s3, $v0 # MULT32(r2, T[0]) + mtlo $zero + madd $s4, $v1 # (MULT32(r2,T[0]) + + # MULT32(r3,T[1])) + add $t5, $t4 # x1[1] = x2[1] + x2[0]; + add $t6, $t7 # x1[3] = x2[3] + x2[2]; + sw $t0, 0($s1) + sll $t3, 1 # (MULT32(r1,T[0]) + + # MULT32(r0,T[1]))<<1 + sw $t3, 8($s2) + sw $t2, 8($s1) + mfhi $t1 + mult $s4, $v0 # MULT32(r3, 
T[0]) + mtlo $a3 + msub $s3, $v1 # (MULT32(r3,T[0]) - + # MULT32(r2,T[1])) + sw $t5, 4($s1) + sw $t6, 12($s1) + sra $t0, $a0, 1 # t0: (points>>2) + sll $t1, 1 # (MULT32(r2,T[0]) + + # MULT32(r3,T[1]))<<1 + sw $t1, 4($s2) + mfhi $t3 + addiu $s1, -4*4 # x1 = x + (points>>1) - 4 + addiu $s2, -4*4 # x2 -= 4; + addu $s0, $a2 # T += step; + move $t8, $a2 # counter + slti $t9, $a2, 4*1024 + sll $t3, 1 # (MULT32(r1,T[0]) + + # MULT32(r0,T[1]))<<1 + sw $t3, 7*4($s2) + + # pipeline warm-up before loop + lw $t0, 0($s1) # t0: x1[0] + lw $t2, 8($s1) # t2: x1[2] + .set noreorder + .set nomacro + beqz $t9, mdct_butterfly_generic_lp2_end # jump if only one iteration + lw $t1, 4($s1) # t1: x1[1] + +mdct_butterfly_generic_lp2: + lw $t3, 12($s1) # t3: x1[3] + lw $v0, 0($s0) # v0: T[0] + lw $v1, 4($s0) # v1: T[1] + sub $s3, $t0, $t1 # s3: r0 = x1[0] - x1[1]; + sub $s4, $t2, $t3 # s4: r1 = x1[2] - x1[3]; + # XNPROD31( r0, r1, T[0], T[1], &x2[0], &x2[2] ); + mult $s3, $v0 # MULT32(r0, T[0]) + mtlo $a3 + msub $s4, $v1 # (MULT32(r0,T[0]) - + # MULT32(r1,T[1])) + add $t0, $t1 # x1[0] += x1[1]; + lw $t4, 0($s2) # t4: x2[0] + lw $t5, 4($s2) # t5: x2[1] + lw $t6, 8($s2) # t6: x2[2] + mfhi $t1 + mult $s4, $v0 # MULT32(r1, T[0]) + mtlo $zero + madd $s3, $v1 # (MULT32(r1,T[0]) + + # MULT32(r0,T[1])) + lw $t7, 12($s2) # t7: x2[3] + add $t2, $t3 # x1[2] += x1[3]; + sub $s4, $t7, $t6 # s4: r3 = x2[3] - x2[2]; + sll $t1, 1 # (MULT32(r0,T[0]) - + # MULT32(r1,T[1]))<<1 + sw $t1, 0($s2) + sub $s3, $t4, $t5 # s3: r2 = x2[0] - x2[1]; + mfhi $t3 + # XNPROD31( r3, r2, T[0], T[1], &x2[1], &x2[3] ); + mult $s4, $v0 # MULT32(r3, T[0]) + mtlo $a3 + msub $s3, $v1 # (MULT32(r3,T[0]) - + # MULT32(r2,T[1])) + add $t5, $t4 # x1[1] = x2[1] + x2[0]; + add $t6, $t7 # x1[3] = x2[3] + x2[2]; + sw $t0, 0($s1) + sll $t3, 1 # (MULT32(r1,T[0]) - + # MULT32(r0,T[1]))<<1 + sw $t3, 8($s2) + sw $t2, 8($s1) + mfhi $t1 + mult $s3, $v0 # MULT32(r2, T[0]) + mtlo $zero + madd $s4, $v1 # (MULT32(r2,T[0]) + + # MULT32(r3,T[1])) + sw $t5, 4($s1) + sw $t6, 12($s1) + addu $t8, $a2 # counter += 4* step + sll $t1, 1 # (MULT32(r3,T[0]) - + # MULT32(r2,T[1]))<<1 + sw $t1, 4($s2) + slti $t9, $t8, 4 * 1024 # while(T<sincos_lookup0); + mfhi $t3 + sub $s0, $a2 # T -= step; + addiu $s1, -4*4 # x1 -= 4; + addiu $s2, -4*4 # x2 -= 4; + lw $t0, 0($s1) # t0: x1[0] + lw $t2, 8($s1) # t2: x1[2] + lw $t1, 4($s1) # t1: x1[1] + sll $t3, 1 # (MULT32(r2,T[0]) + + # MULT32(r3,T[1]))<<1 + bnez $t9, mdct_butterfly_generic_lp2 + sw $t3, 7*4($s2) + + # pipeline drained after mdct_butterfly_generic_lp2 +mdct_butterfly_generic_lp2_end: + lw $t3, 12($s1) # t3: x1[3] + lw $v0, 0($s0) # v0: T[0] + lw $v1, 4($s0) # v1: T[1] + sub $s3, $t0, $t1 # s3: r0 = x1[0] - x1[1]; + sub $s4, $t2, $t3 # s4: r1 = x1[2] - x1[3]; + # XNPROD31( r0, r1, T[0], T[1], &x2[0], &x2[2] ); + mult $s3, $v0 # MULT32(r0, T[0]) + mtlo $a3 + msub $s4, $v1 # (MULT32(r0,T[0]) - + # MULT32(r1,T[1])) + add $t0, $t1 # x1[0] += x1[1]; + lw $t4, 0($s2) # t4: x2[0] + lw $t5, 4($s2) # t5: x2[1] + lw $t6, 8($s2) # t6: x2[2] + mfhi $t1 + mult $s4, $v0 # MULT32(r1, T[0]) + mtlo $zero + madd $s3, $v1 # (MULT32(r1,T[0]) + + # MULT32(r0,T[1])) + lw $t7, 12($s2) # t7: x2[3] + add $t2, $t3 # x1[2] += x1[3]; + sub $s4, $t7, $t6 # s4: r3 = x2[3] - x2[2]; + sll $t1, 1 # (MULT32(r0,T[0]) - + # MULT32(r1,T[1]))<<1 + sw $t1, 0($s2) + sub $s3, $t4, $t5 # s3: r2 = x2[0] - x2[1]; + mfhi $t3 + # XNPROD31( r3, r2, T[0], T[1], &x2[1], &x2[3] ); + mult $s4, $v0 # MULT32(r3, T[0]) + mtlo $a3 + msub $s3, $v1 # (MULT32(r3,T[0]) - + # MULT32(r2,T[1])) + 
add $t5, $t4 # x1[1] = x2[1] + x2[0]; + add $t6, $t7 # x1[3] = x2[3] + x2[2]; + sw $t0, 0($s1) + sll $t3, 1 # (MULT32(r1,T[0]) - + # MULT32(r0,T[1]))<<1 + sw $t3, 8($s2) + sw $t2, 8($s1) + mfhi $t1 + mult $s3, $v0 # MULT32(r2, T[0]) + mtlo $zero + madd $s4, $v1 # (MULT32(r2,T[0]) + + # MULT32(r3,T[1])) + sw $t5, 4($s1) + sw $t6, 12($s1) + lw $a1, 8($sp) # a1: x+(points>>i)*j + sll $t1, 1 # (MULT32(r3,T[0]) - + # MULT32(r2,T[1]))<<1 + sw $t1, 4($s2) + lw $t2, 0($sp) # t2: shift + mfhi $t3 + move $a0, $ra # a0: (points>>i) + addu $t0, $a1, $ra # x+(points>>i)*(j+1) + sw $t0, 8($sp) + li $a2, 16 # a2: 4 * 4 + add $t2, $s6 # t2: i+shift + addiu $s7, 1 # s7: j++ + sll $t3, 1 # (MULT32(r2,T[0]) + + # MULT32(r3,T[1]))<<1 + bne $s7, $s8, mdct_butterflies_lp_2 # for(j=0;j<(1<<i);j++) + sw $t3, 3*4($s2) + + lw $a1, 60($sp) # a1: x + addiu $s6, 1 # i++ + sll $s8, 1 # s8: 1<<i + sra $ra, 1 # ra: points >> i + bne $s5, $s6, mdct_butterflies_lp_1 # for(i=0;--stages>=0;i++) + li $s7, 0 # s7: j = 0 + .set macro + .set reorder + + #### 32 point butterfly (in place, 4 register) #### + #### STIN void mdct_butterfly_32(DATA_TYPE *x) #### +mdct_butterfly_32: + lw $a0, 56($sp) # a0: points + li $s0, cPI3_8 + sra $a0, 1 + li $s1, cPI1_8 + li $s2, cPI2_8 + beqz $a0, mdct_butterfly_32_end + +mdct_butterfly_32_lp: + lw $t8, 16*4($a1) # t8: x[16] + lw $t9, 17*4($a1) # t9: x[17] + lw $v0, 18*4($a1) # v0: x[18] + lw $v1, 19*4($a1) # v1: x[19] + sub $t0, $t8, $t9 # t0: r0 = x[16] - x[17] + sub $t1, $v0, $v1 # t1: r1 = x[18] - x[19] + # XNPROD31( r0, r1, cPI3_8, cPI1_8, &x[ 0], &x[ 2] ); + mult $t0, $s0 # MULT32(r0, cPI3_8) + mtlo $a3 + msub $t1, $s1 # MULT32(r0, cPI3_8) - + # MULT32(r1, cPI1_8) + lw $t4, 0($a1) # t4: x[0] + lw $t5, 4($a1) # t5: x[1] + lw $t6, 2*4($a1) # t6: x[2] + lw $t7, 3*4($a1) # t7: x[3] + mfhi $s3 + mult $t1, $s0 # MULT32(r1, cPI3_8) + mtlo $zero + madd $t0, $s1 # MULT32(r1, cPI3_8) + + # MULT32(r0, cPI1_8) + sub $t2, $t5, $t4 # t2: r2 = x[1] - x[0] + sub $t3, $t7, $t6 # t3: r3 = x[3] - x[2] + add $t8, $t9 # x[16] += x[17] + sll $s3, 1 # (MULT32(r0, cPI3_8) - + # MULT32(r1, cPI1_8)) << 1 + sw $s3, 0($a1) + add $v0, $v1 # x[18] += x[19] + mfhi $s4 + # XPROD31 ( r2, r3, cPI1_8, cPI3_8, &x[ 1], &x[ 3] ); + mult $t2, $s1 # MULT32(r2, cPI1_8) + mtlo $zero + madd $t3, $s0 # MULT32(r2, cPI1_8) + + # MULT32(r3, cPI3_8) + add $t9, $t5, $t4 # x[17] = x[ 1] + x[ 0] + add $v1, $t7, $t6 # x[19] = x[ 3] + x[ 2] + sw $t8, 16*4($a1) # t8: x[16] + sll $s4, 1 # (MULT32(r0, cPI3_8) - + # MULT32(r1, cPI1_8)) << 1 + sw $s4, 2*4($a1) + sw $t9, 17*4($a1) # t9: x[17] + mfhi $t0 + mult $t3, $s1 # MULT32(r3, cPI1_8) + mtlo $a3 + msub $t2, $s0 # MULT32(r3, cPI1_8) - + # MULT32(r2, cPI3_8) + sw $v0, 18*4($a1) # v0: x[18] + sw $v1, 19*4($a1) # v1: x[19] + lw $t8, 20*4($a1) # t8: x[20] + sll $t0, 1 # (MULT32(r2, cPI1_8) + + # MULT32(r3, cPI3_8)) << 1 + sw $t0, 4($a1) + lw $t9, 21*4($a1) # t9: x[21] + mfhi $s4 + lw $v0, 22*4($a1) # v0: x[22] + lw $v1, 23*4($a1) # v1: x[23] + sub $t0, $t8, $t9 # t0: r0 = x[20] - x[21] + lw $t4, 4*4($a1) # t4: x[4] + sub $t1, $v0, $v1 # t1: r1 = x[22] - x[23] + sub $s3, $t0, $t1 # s3: r0 - r1 + sll $s4, 1 # (MULT32(r3, cPI1_8) - + # MULT32(r2, cPI3_8)) << 1 + sw $s4, 3*4($a1) + lw $t5, 5*4($a1) # t5: x[5] + lw $t6, 6*4($a1) # t6: x[6] + lw $t7, 7*4($a1) # t7: x[7] + mult $s3, $s2 # MULT32((r0-r1), cPI2_8) + add $t8, $t9 # x[20] += x[21] + add $v0, $v1 # x[22] += x[23] + sub $t2, $t5, $t4 # t2: r2 = x[5] - x[4] + sub $t3, $t7, $t6 # t3: r3 = x[7] - x[6] + mfhi $s3 + add $s4, $t2, $t3 # s4: r3 + r2 
+ mult $s4, $s2 # MULT32((r3+r2), cPI2_8) + add $t9, $t5, $t4 # x[21] = x[ 5] + x[ 4] + add $v1, $t7, $t6 # x[23] = x[ 7] + x[ 6] + add $t0, $t1 # t0: r0 + r1 + sub $t3, $t2 # t3: r3 - r2 + mfhi $s4 + mult $t0, $s2 # MULT32((r0+r1), cPI2_8) + sll $s3, 1 # MULT31((r0-r1), cPI2_8) + sw $s3, 4*4($a1) # x[ 4] = MULT31((r0 - r1) , cPI2_8) + sw $t8, 20*4($a1) # t8: x[20] + sw $t9, 21*4($a1) # t9: x[21] + mfhi $s3 + mult $t3, $s2 # MULT32((r3-r2), cPI2_8) + sll $s4, 1 # MULT31((r3+r2), cPI2_8) + sw $s4, 5*4($a1) # x[ 5] = MULT31((r3 + r2) , cPI2_8) + sw $v0, 22*4($a1) # v0: x[22] + sw $v1, 23*4($a1) # v1: x[23] + mfhi $s4 + sll $s3, 1 # MULT31((r0+r1), cPI2_8) + sw $s3, 6*4($a1) # x[ 6] = MULT31((r0 + r1) , cPI2_8) + lw $t8, 24*4($a1) # t8: x[24] + lw $t9, 25*4($a1) # t9: x[25] + lw $v0, 26*4($a1) # v0: x[26] + lw $v1, 27*4($a1) # v1: x[27] + sll $s4, 1 # MULT31((r3-r2), cPI2_8) + sw $s4, 7*4($a1) # x[ 7] = MULT31((r3 - r2) , cPI2_8) + lw $t4, 8*4($a1) # t4: x[8] + lw $t5, 9*4($a1) # t5: x[9] + lw $t6, 10*4($a1) # t6: x[10] + lw $t7, 11*4($a1) # t7: x[11] + sub $t0, $t8, $t9 # t0: r0 = x[24] - x[25] + sub $t1, $v0, $v1 # t1: r1 = x[26] - x[27] + sub $t2, $t5, $t4 # t2: r2 = x[9] - x[8] + sub $t3, $t7, $t6 # t3: r3 = x[11] - x[10] + add $t8, $t9 # x[24] += x[25] + add $v0, $v1 # x[26] += x[27] + add $t9, $t5, $t4 # x[25] = x[ 9] + x[ 8] + add $v1, $t7, $t6 # x[27] = x[ 11] + x[ 10] + sw $t8, 24*4($a1) # t8: x[24] + sw $t9, 25*4($a1) # t9: x[25] + sw $v0, 26*4($a1) # v0: x[26] + sw $v1, 27*4($a1) # v1: x[27] + # XNPROD31( r0, r1, cPI1_8, cPI3_8, &x[ 8], &x[ 10] ); + mult $t0, $s1 # MULT32(r0, cPI1_8) + mtlo $a3 + msub $t1, $s0 # MULT32(r0, cPI1_8) - + # MULT32(r1, cPI3_8) + lw $t8, 28*4($a1) # t8: x[28] + lw $t9, 29*4($a1) # t9: x[29] + lw $v0, 30*4($a1) # v0: x[30] + lw $v1, 31*4($a1) # v1: x[31] + mfhi $s3 + mult $t1, $s1 # MULT32(r1, cPI1_8) + mtlo $zero + madd $t0, $s0 # MULT32(r1, cPI1_8) + + # MULT32(r0, cPI3_8) + lw $t4, 12*4($a1) # t4: x[12] + lw $t5, 13*4($a1) # t5: x[13] + lw $t6, 14*4($a1) # t6: x[14] + sll $s3, 1 # (MULT32(r0, cPI1_8) - + # MULT32(r1, cPI3_8)) << 1 + mfhi $s4 + # XPROD31 ( r2, r3, cPI3_8, cPI1_8, &x[ 9], &x[ 11] ); + mult $t2, $s0 # MULT32(r2, cPI3_8) + mtlo $zero + madd $t3, $s1 # MULT32(r2, cPI3_8) + + # MULT32(r3, cPI1_8) + sw $s3, 8*4($a1) + lw $t7, 15*4($a1) # t7: x[15] + sub $t0, $t8, $t9 # t0: r0 = x[28] - x[29] + sll $s4, 1 # (MULT32(r1, cPI1_8) + + # MULT32(r0, cPI3_8)) << 1 + mfhi $s3 + mult $t3, $s0 # MULT32(r3, cPI3_8) + mtlo $a3 + msub $t2, $s1 # MULT32(r3, cPI3_8) - + # MULT32(r2, cPI1_8) + sw $s4, 10*4($a1) + sub $t1, $v0, $v1 # t1: r1 = x[30] - x[31] + sub $t2, $t4, $t5 # t2: r2 = x[12] - x[13] + sub $t3, $t7, $t6 # t3: r3 = x[15] - x[14] + mfhi $s4 + add $t8, $t9 # x[28] += x[29] + sll $s3, 1 # (MULT32(r2, cPI3_8) + + # MULT32(r3, cPI1_8)) << 1 + sw $s3, 9*4($a1) + add $v0, $v1 # x[30] += x[31] + add $t9, $t5, $t4 # x[29] = x[ 13] + x[ 12] + add $v1, $t7, $t6 # x[31] = x[ 15] + x[ 14] + sll $s4, 1 # (MULT32(r3, cPI3_8) - + # MULT32(r2, cPI1_8)) << 1 + sw $s4, 11*4($a1) + sw $t8, 28*4($a1) # t8: x[28] + sw $t9, 29*4($a1) # t9: x[29] + sw $v0, 30*4($a1) # v0: x[30] + sw $v1, 31*4($a1) # v1: x[31] + sw $t0, 12*4($a1) # x[12] = r0 + sw $t3, 13*4($a1) # x[13] = r3 + sw $t1, 14*4($a1) # x[14] = r1 + sw $t2, 15*4($a1) # x[15] = r2 + + #### 16 point butterfly (in place, 4 register) #### + #### STIN void mdct_butterfly_16(DATA_TYPE *x) #### + lw $t4, 0($a1) # t4: x[ 0] + lw $t5, 4($a1) # t5: x[ 1] + lw $t6, 2*4($a1) # t6: x[ 2] + lw $t7, 3*4($a1) # t7: x[ 3] + 
lw $t8, 8*4($a1) # t8: x[ 8] + lw $t9, 9*4($a1) # t9: x[ 9] + lw $v0, 10*4($a1) # v0: x[ 10] + lw $v1, 11*4($a1) # v1: x[ 11] + sub $t0, $t8, $t9 # t0: r0 = x[ 8] - x[ 9] + sub $t1, $v0, $v1 # t1: r1 = x[ 10] - x[ 11] + sub $t2, $t5, $t4 # t2: r2 = x[ 1] - x[ 0] + sub $t3, $t7, $t6 # t3: r3 = x[ 3] - x[ 2] + sub $s3, $t0, $t1 # s3: r0 - r1 + add $s4, $t2, $t3 # s4: r2 + r3 + add $t0, $t1 # t0: r0 + r1 + sub $t3, $t2 # t3: r3 - r2 + mult $s3, $s2 # MULT32((r0-r1), cPI2_8) + add $t8, $t9 # x[ 8] += x[ 9] + add $v0, $v1 # x[10] += x[11] + add $t9, $t5, $t4 # x[ 9] = x[ 1] + x[0] + add $v1, $t7, $t6 # x[11] = x[ 3] + x[2] + mfhi $s3 + mult $s4, $s2 # MULT32((r3+r2), cPI2_8) + sw $t8, 8*4($a1) + sw $v0, 10*4($a1) + sw $t9, 9*4($a1) + sw $v1, 11*4($a1) + mfhi $s4 + mult $t0, $s2 # MULT32((r0+r1), cPI2_8) + sll $s3, 1 # MULT31((r0-r1), cPI2_8) + sw $s3, 0($a1) # x[0] = MULT32((r0-r1), cPI2_8) + lw $t4, 4*4($a1) # t4: x[ 4] + lw $t5, 5*4($a1) # t5: x[ 5] + mfhi $t0 + mult $t3, $s2 # MULT32((r3-r2), cPI2_8) + sll $s4, 1 # MULT31((r3+r2), cPI2_8) + sw $s4, 4($a1) # x[1] = MULT31((r3+r2), cPI2_8) + lw $t6, 6*4($a1) # t6: x[ 6] + lw $t7, 7*4($a1) # t7: x[ 7] + mfhi $t3 + sll $t0, 1 # MULT31((r0+r1), cPI2_8) + sw $t0, 2*4($a1) # x[2] = MULT31((r0+r1), cPI2_8) + lw $t8, 12*4($a1) # t8: x[ 12] + lw $t9, 13*4($a1) # t9: x[ 13] + lw $v0, 14*4($a1) # v0: x[ 14] + lw $v1, 15*4($a1) # v1: x[ 15] + sll $t3, 1 # MULT31((r3-r2), cPI2_8) + sw $t3, 3*4($a1) # x[3] = MULT31((r3-r2), cPI2_8) + sub $t2, $t8, $t9 # t2: r0 = x[12] - x[13] + sub $t3, $v0, $v1 # t3: r1 = x[14] - x[15] + sub $t0, $t4, $t5 # t0: r2 = x[ 4] - x[ 5] + sub $t1, $t7, $t6 # t1: r3 = x[ 7] - x[ 6] + add $t8, $t9 # x[12] += x[13] + add $v0, $v1 # x[14] += x[15] + add $t9, $t5, $t4 # x[13] = x[ 5] + x[4] + add $v1, $t7, $t6 # x[15] = x[ 7] + x[6] + sw $t8, 12*4($a1) + sw $v0, 14*4($a1) + sw $t9, 13*4($a1) + sw $v1, 15*4($a1) + sw $t2, 4*4($a1) # x[ 4] = r2 + sw $t1, 5*4($a1) # x[ 5] = r1 + sw $t3, 6*4($a1) # x[ 6] = r3 + sw $t0, 7*4($a1) # x[ 7] = r0 + + #### mdct_butterfly_8(x) #### + lw $t4, 0($a1) # t4: x[0] + lw $t5, 4($a1) # t5: x[1] + lw $t6, 2*4($a1) # t6: x[2] + lw $t7, 3*4($a1) # t7: x[3] + lw $t8, 4*4($a1) # t8: x[4] + lw $t9, 5*4($a1) # t9: x[5] + lw $t0, 6*4($a1) # t0: x[6] + lw $t1, 7*4($a1) # t1: x[7] + add $t2, $t4, $t5 # t2: r0 = x[0] + x[1] + sub $t4, $t5 # t4: r1 = x[0] - x[1] + add $t3, $t6, $t7 # t3: r2 = x[2] + x[3] + sub $t6, $t7 # t6: r3 = x[2] - x[3] + add $t5, $t8, $t9 # t5: r4 = x[4] + x[5] + sub $t8, $t9 # t8: r5 = x[4] - x[5] + add $t7, $t0, $t1 # t7: r6 = x[6] + x[7] + sub $t0, $t1 # t0: r7 = x[6] - x[7] + add $t1, $t8, $t6 # x[0] = r5 + r3 + sub $v0, $t0, $t4 # x[1] = r7 - r1 + sub $t8, $t6 # x[2] = r5 - r3 + add $t0, $t4 # x[3] = r7 + r1 + sub $t6, $t5, $t2 # x[4] = r4 - r0 + sub $t4, $t7, $t3 # x[5] = r6 - r2 + add $t5, $t2 # x[6] = r4 + r0 + add $t7, $t3 # x[7] = r6 + r2 + sw $t1, 0($a1) + sw $v0, 4($a1) + sw $t8, 2*4($a1) + sw $t0, 3*4($a1) + sw $t6, 4*4($a1) + sw $t4, 5*4($a1) + sw $t5, 6*4($a1) + sw $t7, 7*4($a1) + + #### mdct_butterfly_8(x + 8) #### + lw $t4, 8*4($a1) + lw $t5, 9*4($a1) + lw $t6, 10*4($a1) + lw $t7, 11*4($a1) + lw $t8, 12*4($a1) + lw $t9, 13*4($a1) + lw $t0, 14*4($a1) + lw $t1, 15*4($a1) + add $t2, $t4, $t5 # t2: r0 = x[8] + x[9] + sub $t4, $t5 # t4: r1 = x[8] - x[9] + add $t3, $t6, $t7 # t3: r2 = x[10] + x[11] + sub $t6, $t7 # t6: r3 = x[10] - x[11] + add $t5, $t8, $t9 # t5: r4 = x[12] + x[13] + sub $t8, $t9 # t8: r5 = x[12] - x[13] + add $t7, $t0, $t1 # t7: r6 = x[14] + x[15] + sub $t0, $t1 # 
t0: r7 = x[14] - x[15] + add $t1, $t8, $t6 # x[8] = r5 + r3 + sub $v0, $t0, $t4 # x[9] = r7 - r1 + sub $t8, $t6 # x[10] = r5 - r3 + add $t0, $t4 # x[11] = r7 + r1 + sub $t6, $t5, $t2 # x[12] = r4 - r0 + sub $t4, $t7, $t3 # x[13] = r6 - r2 + add $t5, $t2 # x[14] = r4 + r0 + add $t7, $t3 # x[15] = r6 + r2 + sw $t1, 8*4($a1) + sw $v0, 9*4($a1) + sw $t8, 10*4($a1) + sw $t0, 11*4($a1) + sw $t6, 12*4($a1) + sw $t4, 13*4($a1) + sw $t5, 14*4($a1) + sw $t7, 15*4($a1) + + + #### 16 point butterfly (in place, 4 register) #### + #### mdct_butterfly_16(x + 16) #### + lw $t4, 0+16*4($a1) # t4: x[ 0] + lw $t5, 4+16*4($a1) # t5: x[ 1] + lw $t6, 2*4+16*4($a1) # t6: x[ 2] + lw $t7, 3*4+16*4($a1) # t7: x[ 3] + lw $t8, 8*4+16*4($a1) # t8: x[ 8] + lw $t9, 9*4+16*4($a1) # t9: x[ 9] + lw $v0, 10*4+16*4($a1) # v0: x[ 10] + lw $v1, 11*4+16*4($a1) # v1: x[ 11] + sub $t0, $t8, $t9 # t0: r0 = x[ 8] - x[ 9] + sub $t1, $v0, $v1 # t1: r1 = x[ 10] - x[ 11] + sub $t2, $t5, $t4 # t2: r2 = x[ 1] - x[ 0] + sub $t3, $t7, $t6 # t3: r3 = x[ 3] - x[ 2] + sub $s3, $t0, $t1 # s3: r0 - r1 + add $s4, $t2, $t3 # s4: r2 + r3 + add $t0, $t1 # t0: r0 + r1 + sub $t3, $t2 # t3: r3 - r2 + mult $s3, $s2 # MULT32((r0-r1), cPI2_8) + add $t8, $t9 # x[ 8] += x[ 9] + add $v0, $v1 # x[10] += x[11] + add $t9, $t5, $t4 # x[ 9] = x[ 1] + x[0] + add $v1, $t7, $t6 # x[11] = x[ 3] + x[2] + mfhi $s3 + mult $s4, $s2 # MULT32((r3+r2), cPI2_8) + sw $t8, 8*4+16*4($a1) + sw $v0, 10*4+16*4($a1) + sw $t9, 9*4+16*4($a1) + sw $v1, 11*4+16*4($a1) + mfhi $s4 + mult $t0, $s2 # MULT32((r0+r1), cPI2_8) + sll $s3, 1 # MULT31((r0-r1), cPI2_8) + sw $s3, 0+16*4($a1) # x[0] = MULT32((r0-r1), cPI2_8) + lw $t4, 4*4+16*4($a1) # t4: x[ 4] + lw $t5, 5*4+16*4($a1) # t5: x[ 5] + mfhi $t0 + mult $t3, $s2 # MULT32((r3-r2), cPI2_8) + sll $s4, 1 # MULT31((r3+r2), cPI2_8) + sw $s4, 4+16*4($a1) # x[1] = MULT31((r3+r2), cPI2_8) + lw $t6, 6*4+16*4($a1) # t6: x[ 6] + lw $t7, 7*4+16*4($a1) # t7: x[ 7] + mfhi $t3 + sll $t0, 1 # MULT31((r0+r1), cPI2_8) + sw $t0, 2*4+16*4($a1) # x[2] = MULT31((r0+r1), cPI2_8) + lw $t8, 12*4+16*4($a1) # t8: x[ 12] + lw $t9, 13*4+16*4($a1) # t9: x[ 13] + lw $v0, 14*4+16*4($a1) # v0: x[ 14] + lw $v1, 15*4+16*4($a1) # v1: x[ 15] + sll $t3, 1 # MULT31((r3-r2), cPI2_8) + sw $t3, 3*4+16*4($a1) # x[3] = MULT31((r3-r2), cPI2_8) + sub $t2, $t8, $t9 # t2: r2 = x[12] - x[13] + sub $t3, $v0, $v1 # t3: r3 = x[14] - x[15] + sub $t0, $t4, $t5 # t0: r0 = x[ 4] - x[ 5] + sub $t1, $t7, $t6 # t1: r1 = x[ 7] - x[ 6] + add $t8, $t9 # x[12] += x[13] + add $v0, $v1 # x[14] += x[15] + add $t9, $t5, $t4 # x[13] = x[ 5] + x[4] + add $v1, $t7, $t6 # x[15] = x[ 7] + x[6] + sw $t8, 12*4+16*4($a1) + sw $v0, 14*4+16*4($a1) + sw $t9, 13*4+16*4($a1) + sw $v1, 15*4+16*4($a1) + sw $t2, 4*4+16*4($a1) # x[ 4] = r2 + sw $t1, 5*4+16*4($a1) # x[ 5] = r1 + sw $t3, 6*4+16*4($a1) # x[ 6] = r3 + sw $t0, 7*4+16*4($a1) # x[ 7] = r0 + + #### mdct_butterfly_8(x) #### + lw $t4, 0+16*4($a1) # t4: x[0] + lw $t5, 4+16*4($a1) # t5: x[1] + lw $t6, 2*4+16*4($a1) # t6: x[2] + lw $t7, 3*4+16*4($a1) # t7: x[3] + lw $t8, 4*4+16*4($a1) # t8: x[4] + lw $t9, 5*4+16*4($a1) # t9: x[5] + lw $t0, 6*4+16*4($a1) # t0: x[6] + lw $t1, 7*4+16*4($a1) # t1: x[7] + add $t2, $t4, $t5 # t2: r0 = x[0] + x[1] + sub $t4, $t5 # t4: r1 = x[0] - x[1] + add $t3, $t6, $t7 # t3: r2 = x[2] + x[3] + sub $t6, $t7 # t6: r3 = x[2] - x[3] + add $t5, $t8, $t9 # t5: r4 = x[4] + x[5] + sub $t8, $t9 # t8: r5 = x[4] - x[5] + add $t7, $t0, $t1 # t7: r6 = x[6] + x[7] + sub $t0, $t1 # t0: r7 = x[6] - x[7] + add $t1, $t8, $t6 # x[0] = r5 + r3 + sub $v0, 
$t0, $t4 # x[1] = r7 - r1 + sub $t8, $t6 # x[2] = r5 - r3 + add $t0, $t4 # x[3] = r7 + r1 + sub $t6, $t5, $t2 # x[4] = r4 - r0 + sub $t4, $t7, $t3 # x[5] = r6 - r2 + add $t5, $t2 # x[6] = r4 + r0 + add $t7, $t3 # x[7] = r6 + r2 + sw $t1, 0+16*4($a1) + sw $v0, 4+16*4($a1) + sw $t8, 2*4+16*4($a1) + sw $t0, 3*4+16*4($a1) + sw $t6, 4*4+16*4($a1) + sw $t4, 5*4+16*4($a1) + sw $t5, 6*4+16*4($a1) + sw $t7, 7*4+16*4($a1) + + #### mdct_butterfly_8(x + 8) #### + lw $t4, 8*4+16*4($a1) + lw $t5, 9*4+16*4($a1) + lw $t6, 10*4+16*4($a1) + lw $t7, 11*4+16*4($a1) + lw $t8, 12*4+16*4($a1) + lw $t9, 13*4+16*4($a1) + lw $t0, 14*4+16*4($a1) + lw $t1, 15*4+16*4($a1) + add $t2, $t4, $t5 # t2: r0 = x[8] + x[9] + sub $t4, $t5 # t4: r1 = x[8] - x[9] + add $t3, $t6, $t7 # t3: r2 = x[10] + x[11] + sub $t6, $t7 # t6: r3 = x[10] - x[11] + add $t5, $t8, $t9 # t5: r4 = x[12] + x[13] + sub $t8, $t9 # t8: r5 = x[12] - x[13] + add $t7, $t0, $t1 # t7: r6 = x[14] + x[15] + sub $t0, $t1 # t0: r7 = x[14] - x[15] + add $t1, $t8, $t6 # x[8] = r5 + r3 + sub $v0, $t0, $t4 # x[9] = r7 - r1 + sub $t8, $t6 # x[10] = r5 - r3 + add $t0, $t4 # x[11] = r7 + r1 + sub $t6, $t5, $t2 # x[12] = r4 - r0 + sub $t4, $t7, $t3 # x[13] = r6 - r2 + add $t5, $t2 # x[14] = r4 + r0 + add $t7, $t3 # x[15] = r6 + r2 + sw $t1, 8*4+16*4($a1) + sw $v0, 9*4+16*4($a1) + sw $t8, 10*4+16*4($a1) + sw $t0, 11*4+16*4($a1) + sw $t6, 12*4+16*4($a1) + sw $t4, 13*4+16*4($a1) + sw $t5, 14*4+16*4($a1) + sw $t7, 15*4+16*4($a1) + addiu $a0, -32 + .set noreorder + .set nomacro + bgtz $a0, mdct_butterfly_32_lp # for(j=0;j<points;j+=32) + addiu $a1, 32 * 4 + +mdct_butterfly_32_end: + lw $a1, 60($sp) + lw $a0, 56($sp) + lw $a2, 0($sp) # shift + li $t0, 0 # int bit = 0; + sll $a0, 1 # a0: (n >> 1) * 4 + addu $t2, $a1, $a0 # DATA_TYPE *w = x+(n>>1); + +mdct_bitreverse_lp: + bitrev $t1, $t0 # bitrev12(bit); + srav $t1, $t1, $a2 # (b>>shift) + addiu $t2, -8 # w -= 2; + sll $t1, 2 # (b>>shift)*4 + add $t3, $a1, $t1 # DATA_TYPE *xx = x + (b>>shift); + slt $t1, $t3, $t2 # if(w>xx){ + beqz $t1, mdct_bitreverse_lp_end + addiu $t0, 16 # bit++ << 4 + .set macro + .set reorder + lw $t1, 0($t3) # r1 = xx[0]; + lw $t4, 0($t2) # w[0] + lw $t5, 4($t3) # r2 = xx[1]; + lw $t6, 4($t2) # w[1] + sw $t4, 0($t3) # xx[0] = w[0]; + sw $t1, 0($t2) # w[0] = r1; + sw $t6, 4($t3) # xx[1] = w[1]; + sw $t5, 4($t2) # w[1] = r2; +mdct_bitreverse_lp_end: + slt $t1, $a1, $t2 + bnez $t1, mdct_bitreverse_lp + + #******************************** mdct_step7 ******************** + lw $a2, 4($sp) # step + la $s0, sincos_lookup0 + la $t5, sincos_lookup1 + sll $t1, $a2, 1 # t1: (step >> 1) * 4 + addu $s0, $t1 # s0: *T = sincos_lookup0+(step>>1) + slti $t2, $a2, 4 + movn $s0, $t5, $t2 # LOOKUP_T *T = (step>=4)? 
+ # (sincos_lookup0+(step>>1)): + # sincos_lookup1; + move $s1, $a1 # DATA_TYPE *w0 = x; + addu $s2, $a1, $a0 # DATA_TYPE *w1 = x+(n>>1); + addiu $s3, $s0, 4*1024 # LOOKUP_T *Ttop = T+1024; + sll $a2, 2 # step * 4 + + # pipeline warm-up before loop + lw $t1, 4($s1) # t1: w0[1] + lw $t0, 0($s1) # t0: w0[0] +mdct_step7_lp1: + lw $t2, -8($s2) # t2: w1[0] + lw $t3, -4($s2) # t3: w1[1] + lw $v0, 0($s0) # v0: T[0] + lw $v1, 4($s0) # v1: T[1] + addu $t5, $t0, $t2 # t5: r0 = w0[0] + w1[0]; + sub $t9, $t3, $t1 # t9: r1 = w1[1] - w0[1]; + mult $t5, $v1 # MULT32(r0, T[1]) + mtlo $zero + madd $t9, $v0 # MULT32(r0, T[1]) + + # MULT32(r1, T[0]); + add $s0, $a2 # T+=step; + add $t1, $t3 # w0[1] + w1[1] + sub $t0, $t2 # w0[0] - w1[0] + sra $t1, 1 # r0 = (w0[1] + w1[1])>>1; + mfhi $t4 # r2 = MULT32(r0, T[1]) + + # MULT32(r1, T[0]); + mult $t9, $v1 # MULT32(r1, T[1]) + mtlo $a3 + msub $t5, $v0 # MULT32(r1, T[1]) - + # MULT32(r0, T[0]); + sra $t6, $t0, 1 # r1 = (w0[0] - w1[0])>>1; + add $t3, $t1, $t4 # r0 + r2; + sub $t1, $t4 # r0 - r2 + addiu $s2, -8; + mfhi $t5 # r3 = MULT32(r1, T[1]) - + # MULT32(r0, T[0]); + sw $t3, 0($s1) # w0[0] = r0 + r2; + sw $t1, 0($s2) # w1[0] = r0 - r2; + slt $t9, $s0, $s3 # while(T<Ttop) + addiu $s1, 8 # w0 += 2; + lw $t1, 4($s1) # t1: w0[1] + lw $t0, 0($s1) # t0: w0[0] + add $t2, $t6, $t5 # r1 + r3 + sub $t6, $t5, $t6 # r3 - r1 + sw $t2, -4($s1) # w0[1] = r1 + r3; + .set noreorder + .set nomacro + bnez $t9, mdct_step7_lp1 + sw $t6, 4($s2) # w1[1] = r3 - r1; + + # pipeline warm-up before loop + sub $s0, $a2 # T-=step; + lw $t1, 4($s1) # t1: w0[1] + lw $t0, 0($s1) # t0: w0[0] +mdct_step7_lp2: + lw $t2, -8($s2) # t2: w1[0] + lw $t3, -4($s2) # t3: w1[1] + lw $v0, 0($s0) # v0: T[0] + lw $v1, 4($s0) # v1: T[1] + add $t5, $t0, $t2 # t5: r0 = w0[0] + w1[0]; + sub $t9, $t3, $t1 # t9: r1 = w1[1] - w0[1]; + mult $t5, $v0 # MULT32(r0, T[0]) + mtlo $zero + madd $t9, $v1 # MULT32(r0, T[0]) + + # MULT32(r1, T[1]); + addiu $s2, -8; + add $t1, $t3 # w0[1] + w1[1] + sub $t0, $t2 # w0[0] - w1[0] + sra $t1, 1 # r0 = (w0[1] + w1[1])>>1; + mfhi $t4 # r2 = MULT32(r0, T[1]) + + # MULT32(r1, T[0]); + mult $t9, $v0 # MULT32(r1, T[0]) + mtlo $a3 + msub $t5, $v1 # MULT32(r1, T[0]) - + # MULT32(r0, T[1]); + sra $t6, $t0, 1 # r1 = (w0[0] - w1[0])>>1; + add $t3, $t1, $t4 # r0 + r2; + sub $t1, $t4 # r0 - r2 + sw $t3, 0($s1) # w0[0] = r0 + r2; + mfhi $t5 # r3 = MULT32(r1, T[0]) - + # MULT32(r0, T[1]); + sw $t1, 0($s2) # w1[0] = r0 - r2; + addiu $s1, 8 # w0 += 2; + sub $s0, $a2 # T-=step; + slt $t9, $s1, $s2 # while(w0<w1); + lw $t1, 4($s1) # t1: w0[1] + lw $t0, 0($s1) # t0: w0[0] + add $t2, $t6, $t5 # r1 + r3 + sub $t6, $t5, $t6 # r3 - r1 + sw $t2, -4($s1) # w0[1] = r1 + r3; + bnez $t9, mdct_step7_lp2 + sw $t6, 4($s2) # w1[1] = r3 - r1; + + .set macro + .set reorder +#******************************** mdct_step8 ******************** + lw $a2, 4($sp) # step (always >=0) + la $s0, sincos_lookup0 # s0: T = sincos_lookup0; + sra $a2, 2 # step>>=2; + addu $s2, $a1, $a0 # DATA_TYPE *iX = x+(n>>1); + addiu $t1, $a2, -1 # step - 1 + bgtz $t1, mdct_step8_default # switch(step) { + la $s1, sincos_lookup1 # s1: V = sincos_lookup1; + beqz $t1, mdct_step8_case1 + #### linear interpolation between table values: offset=0.25, step=0.5 #### + lw $t0, 0($s0) # t0 = *T + lw $t1, 4($s0) # t1 = *(T+1) + addiu $s0, 8 # T+=2 + + # pipeline warm-up before loop + lw $v0, 0($s1) # v0 = *V + lw $v1, 4($s1) # v1 = *(V+1) + lw $t4, 0($a1) # r0 = x[0] + lw $t5, 4($a1) # r1 = x[1] + sub $t2, $v0, $t0 # q0 = (v0-t0) + sub $t3, $v1, $t1 # q1 = 
+#******************************** mdct_step8 ********************
+ lw $a2, 4($sp) # step (always >=0)
+ la $s0, sincos_lookup0 # s0: T = sincos_lookup0;
+ sra $a2, 2 # step>>=2;
+ addu $s2, $a1, $a0 # DATA_TYPE *iX = x+(n>>1);
+ addiu $t1, $a2, -1 # step - 1
+ bgtz $t1, mdct_step8_default # switch(step) {
+ la $s1, sincos_lookup1 # s1: V = sincos_lookup1;
+ beqz $t1, mdct_step8_case1
+ #### linear interpolation between table values: offset=0.25, step=0.5 ####
+ lw $t0, 0($s0) # t0 = *T
+ lw $t1, 4($s0) # t1 = *(T+1)
+ addiu $s0, 8 # T+=2
+
+ # pipeline warm-up before loop
+ lw $v0, 0($s1) # v0 = *V
+ lw $v1, 4($s1) # v1 = *(V+1)
+ lw $t4, 0($a1) # r0 = x[0]
+ lw $t5, 4($a1) # r1 = x[1]
+ sub $t2, $v0, $t0 # q0 = (v0-t0)
+ sub $t3, $v1, $t1 # q1 = (v1-t1)
+ sra $t2, 2 # q0 = (v0-t0)>>2
+ sra $t3, 2 # q1 = (v1-t1)>>2
+ add $t0, $t2 # t0 += (q0 = (v0-t0)>>2)
+mdct_step8_case0_lp:
+ add $t1, $t3 # t1 += (q1 = (v1-t1)>>2)
+ negu $t5, $t5 # r1 = -x[1]
+ # XPROD31( r0, r1, t0, t1, x, x+1 );
+ mult $t4, $t0 # MULT32(r0,t0)
+ mtlo $zero
+ madd $t5, $t1 # MULT32(r0,t0) + MULT32(r1,t1)
+ lw $t7, 3*4($a1) # r1 = x[3]
+ lw $t6, 2*4($a1) # r0 = x[2]
+ sub $t8, $v0, $t2 # t8: new t0 = v0 - q0
+ sub $s5, $v1, $t3 # s5: new t1 = v1 - q1
+ mfhi $s3
+ mult $t5, $t0 # MULT32(r1,t0)
+ mtlo $a3
+ msub $t4, $t1 # MULT32(r1,t0) - MULT32(r0,t1)
+ negu $t7, $t7 # r1 = -x[3]
+ sll $s3, 1 # MULT31(r0,t0) + MULT31(r1,t1)
+ sw $s3, 0($a1)
+ mfhi $s3
+ # XPROD31( r0, r1, t0, t1, x+2, x+3 )
+ mult $t6, $t8 # MULT32(r0,t0)
+ mtlo $zero
+ madd $t7, $s5 # MULT32(r0,t0) + MULT32(r1,t1)
+ lw $t0, 0($s0) # t0 = *T
+ lw $t1, 4($s0) # t1 = *(T+1)
+ sll $s3, 1 # MULT31(r1,t0) - MULT31(r0,t1)
+ sw $s3, 4($a1)
+ mfhi $s3
+ mult $t7, $t8 # MULT32(r1,t0)
+ mtlo $a3
+ msub $t6, $s5 # MULT32(r1,t0) - MULT32(r0,t1)
+ lw $t4, 4*4($a1) # r0 = x[4]
+ lw $t5, 5*4($a1) # r1 = x[5]
+ sub $t2, $t0, $v0 # q0 = (t0-v0)
+ sll $s3, 1 # MULT31(r0,t0) + MULT31(r1,t1)
+ sw $s3, 2*4($a1)
+ mfhi $s3
+ sub $t3, $t1, $v1 # q1 = (t1-v1)
+ sra $t2, 2 # q0 = (t0-v0)>>2
+ sra $t3, 2 # q1 = (t1-v1)>>2
+ add $v0, $t2 # v0 += (q0 = (t0-v0)>>2)
+ add $v1, $t3 # v1 += (q1 = (t1-v1)>>2)
+ negu $t5, $t5 # r1 = -x[5]
+ sll $s3, 1 # MULT31(r1,t0) - MULT31(r0,t1)
+ sw $s3, 3*4($a1)
+ # XPROD31( r0, r1, v0, v1, x+4, x+5 );
+ mult $t4, $v0 # MULT32(r0,v0)
+ mtlo $zero
+ madd $t5, $v1 # MULT32(r0,v0) + MULT32(r1,v1)
+ lw $t7, 7*4($a1) # r1 = x[7]
+ lw $t6, 6*4($a1) # r0 = x[6]
+ sub $t8, $t0, $t2 # t8: new v0 = t0 - q0
+ sub $s5, $t1, $t3 # s5: new v1 = t1 - q1
+ mfhi $s3
+ mult $t5, $v0 # MULT32(r1,v0)
+ mtlo $a3
+ msub $t4, $v1 # MULT32(r1,v0) - MULT32(r0,v1)
+ negu $t7, $t7 # r1 = -x[7]
+ lw $v0, 2*4($s1) # v0 = *(V+2)
+ lw $v1, 3*4($s1) # v1 = *(V+3)
+ sll $s3, 1 # MULT31(r0,v0) + MULT31(r1,v1)
+ sw $s3, 4*4($a1)
+ mfhi $s3
+ # XPROD31( r0, r1, v0, v1, x+6, x+7 )
+ mult $t6, $t8 # MULT32(r0,v0)
+ mtlo $zero
+ madd $t7, $s5 # MULT32(r0,v0) + MULT32(r1,v1)
+ addiu $a1, 8*4 # x += 8
+ addiu $s0, 2*4 # T += 2
+ slt $t9, $a1, $s2 # x<iX
+ sll $s3, 1 # MULT31(r1,v0) - MULT31(r0,v1)
+ sw $s3, -3*4($a1)
+ mfhi $s3
+ mult $t7, $t8 # MULT32(r1,v0)
+ mtlo $a3
+ msub $t6, $s5 # MULT32(r1,v0) - MULT32(r0,v1)
+ addiu $s1, 2*4 # V += 2
+ lw $t4, 0($a1) # r0 = x[0]
+ lw $t5, 4($a1) # r1 = x[1]
+ sll $s3, 1 # MULT31(r0,v0) + MULT31(r1,v1)
+ mfhi $s4
+ sw $s3, -2*4($a1)
+ sub $t2, $v0, $t0 # q0 = (v0-t0)
+ sub $t3, $v1, $t1 # q1 = (v1-t1)
+ sra $t2, 2 # q0 = (v0-t0)>>2
+ sra $t3, 2 # q1 = (v1-t1)>>2
+ add $t0, $t2 # t0 += (q0 = (v0-t0)>>2)
+ sll $s4, 1 # MULT31(r1,v0) - MULT31(r0,v1)
+ .set noreorder
+ .set nomacro
+ bnez $t9, mdct_step8_case0_lp # while(x<iX)
+ sw $s4, -4($a1)
+
+ b mdct_step8_end
+ lw $s0, 16($sp)
+ .set macro
+ .set reorder
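+
+#/***************************************************************************
+#* Editor's note (hedged): the unrolled case-0 loop above corresponds
+#* roughly to this reference C, assembled from the inline comments;
+#* V is assumed to point midway between successive T pairs, and
+#* XPROD31(a,b,t,v,px,py) computes *px = MULT31(a,t)+MULT31(b,v) and
+#* *py = MULT31(b,t)-MULT31(a,v) as in mdct.c.
+#*
+#* do{
+#* t0 += (q0 = (v0-t0)>>2); t1 += (q1 = (v1-t1)>>2);
+#* XPROD31( x[0], -x[1], t0, t1, x , x+1 );
+#* t0 = v0-q0; t1 = v1-q1;
+#* XPROD31( x[2], -x[3], t0, t1, x+2, x+3 );
+#* t0 = T[0]; t1 = T[1];
+#* v0 += (q0 = (t0-v0)>>2); v1 += (q1 = (t1-v1)>>2);
+#* XPROD31( x[4], -x[5], v0, v1, x+4, x+5 );
+#* v0 = t0-q0; v1 = t1-q1;
+#* XPROD31( x[6], -x[7], v0, v1, x+6, x+7 );
+#* v0 = V[2]; v1 = V[3];
+#* x += 8; T += 2; V += 2;
+#* }while(x<iX);
+#***************************************************************************/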
+
+mdct_step8_case1:
+ #### linear interpolation between table values: offset=0.5, step=1 ####
+ lw $t2, 0($s0) # t2: t0 = *T
+ lw $s5, 4($s0) # s5: t1 = *(T+1)
+ addiu $s0, 8 # T+=2
+ sra $t2, 1
+ sra $s5, 1
+
+ # pipeline warm-up before loop
+ lw $t1, 4($a1) # t1: x[1]
+ lw $t0, 0($a1) # t0: r0 = x[0]
+ lw $v0, 0($s1) # v0
+ lw $v1, 4($s1) # v1
+ sra $v0, 1
+ sra $v1, 1
+ negu $t1, $t1 # t1: r1 = -x[1]
+ add $t2, $v0 # t2: t0 += (v0 = (*V++)>>1);
+mdct_step8_case1_lp:
+ add $t3, $v1, $s5 # t3: t1 += (v1 = (*V++)>>1);
+ #XPROD31( r0, r1, t0, t1, x, x+1);
+ mult $t0, $t2 # MULT32(r0,t0)
+ mtlo $zero
+ madd $t1, $t3 # MULT32(r0,t0) + MULT32(r1,t1)
+ addiu $s1, 8 # V+=2
+ lw $t6, 12($a1) # t6: x[3]
+ lw $t7, 8($a1) # t7: r0 = x[2]
+ lw $t8, 0($s0) # t8: t0
+ mfhi $t4
+ mult $t1, $t2 # MULT32(r1,t0)
+ mtlo $a3
+ msub $t0, $t3 # MULT32(r1,t0) - MULT32(r0,t1)
+ lw $s5, 4($s0) # s5: t1
+ sra $t8, 1
+ sll $t4, 1 # MULT31(r0,t0) + MULT31(r1,t1)
+ sw $t4, 0($a1)
+ mfhi $t5
+ sra $s5, 1
+ negu $t6, $t6 # t6: r1 = -x[3]
+ add $v0, $t8 # v0: v0 += (t0 = (*T++)>>1);
+ add $v1, $s5 # v1: v1 += (t1 = (*T++)>>1);
+ #XPROD31( r0, r1, v0, v1, x+2, x+3);
+ mult $t7, $v0 # MULT32(r0,v0)
+ mtlo $zero
+ madd $t6, $v1 # MULT32(r0,v0) + MULT32(r1,v1)
+ sll $t5, 1 # MULT31(r1,t0) - MULT31(r0,t1)
+ sw $t5, 4($a1)
+ addiu $s0, 2*4 # T+=2
+ addiu $a1, 4*4 # x+=4
+ mfhi $t4
+ mult $t6, $v0 # MULT32(r1,v0)
+ mtlo $a3
+ msub $t7, $v1 # MULT32(r1,v0) - MULT32(r0,v1)
+ slt $t9, $a1, $s2 # x<iX
+ lw $t1, 4($a1) # t1: x[1]
+ lw $t0, 0($a1) # t0: r0 = x[0]
+ sll $t4, 1 # MULT31(r0,v0) + MULT31(r1,v1)
+ sw $t4, -2*4($a1)
+ mfhi $t5
+ lw $v0, 0($s1) # v0
+ lw $v1, 4($s1) # v1
+ sra $v0, 1
+ sra $v1, 1
+ negu $t1, $t1 # t1: r1 = -x[1]
+ add $t2, $v0, $t8 # t2: t0 += (v0 = (*V++)>>1);
+ sll $t5, 1 # MULT31(r1,v0) - MULT31(r0,v1)
+ .set noreorder
+ .set nomacro
+ bnez $t9, mdct_step8_case1_lp # while(x<iX)
+ sw $t5, -4($a1)
+
+ b mdct_step8_end
+ lw $s0, 16($sp)
+ .set macro
+ .set reorder
+
+mdct_step8_default: # most common case
+ la $t5, sincos_lookup1
+ sll $t1, $a2, 1 # t1: (step >> 1) * 4
+ slti $t2, $a2, 4
+ addu $s0, $t1 # s0: T = sincos_lookup0+(step>>1)
+ movn $s0, $t5, $t2 # LOOKUP_T *T = (step>=4)?
+ # (sincos_lookup0+(step>>1)):
+ # sincos_lookup1;
+ sll $a2, 2 # step * 4
+
+ lw $t1, 4($a1) # t1: x[1]
+ lw $t0, 0($a1) # t0: r0 = x[0]
+ lw $t2, 0($s0) # t2: T[0]
+ negu $t1, $t1 # t1: r1 = -x[1]
+ lw $t3, 4($s0) # t3: T[1]
+ #XPROD31( r0, r1, T[0], T[1], x, x+1);
+ mult $t0, $t2 # MULT32(r0,T[0])
+mdct_step8_default_lp:
+ mtlo $zero
+ madd $t1, $t3 # MULT32(r0,T[0]) + MULT32(r1,T[1])
+ addu $s0, $a2 # T+=step
+ addiu $a1, 2*4 # x+=2
+ mfhi $t4
+ mult $t1, $t2 # MULT32(r1,T[0])
+ mtlo $a3
+ msub $t0, $t3 # MULT32(r1,T[0]) - MULT32(r0,T[1])
+ lw $t1, 4($a1) # t1: x[1]
+ lw $t0, 0($a1) # t0: r0 = x[0]
+ sll $t4, 1 # MULT31(r0,T[0]) + MULT31(r1,T[1])
+ sw $t4, -2*4($a1)
+ mfhi $t5
+ lw $t2, 0($s0) # t2: T[0]
+ negu $t1, $t1 # t1: r1 = -x[1]
+ lw $t3, 4($s0) # t3: T[1]
+ #XPROD31( r0, r1, T[0], T[1], x, x+1);
+ mult $t0, $t2 # MULT32(r0,T[0])
+ slt $t9, $a1, $s2 # x<iX
+ sll $t5, 1 # MULT31(r1,T[0]) - MULT31(r0,T[1])
+ .set noreorder
+ .set nomacro
+ bnez $t9, mdct_step8_default_lp # while(x<iX)
+ sw $t5, -4($a1)
+ .set macro
+ .set reorder
+
+ lw $s0, 16($sp)
+mdct_step8_end:
+ lw $s1, 20($sp)
+ lw $s2, 24($sp)
+ lw $s3, 28($sp)
+ lw $s4, 32($sp)
+ lw $s5, 36($sp)
+ lw $s6, 40($sp)
+ lw $s7, 44($sp)
+ lw $s8, 48($sp)
+ lw $ra, 52($sp)
+ addiu $sp, 56
+ jr $ra
+ .end mdct_backward
+#****************************************************************************
+
+#/***************************************************************************
+#*
+#* Function: mdct_unroll_lap
+#*
+#* Description:
+#*
+#* Parameters:
+#*
+#* a0 : n0
+#* a1 : n1
+#* a2 : lW
+#* a3 : W
+#* t0 : in
+#* t1 : right
+#* t2 : w0
+#* t3 : w1
+#* t4 : out
+#* t5 : step
+#* t6 : start
+#* t7 : end
+#* s0 : l
+#* s1 : r
+#* s2 : wR
+#* s3 : wL
+#* s4 : preLap
+#* s5 : halfLap
+#* s6 : postLap
+#*
+#*
+#* Reference: see mdct.c
+#*
+#*
+#* Notes:
+#*
+#***************************************************************************/
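+
+#/***************************************************************************
+#* Editor's note (hedged): in reference-C terms (see mdct_unroll_lap in
+#* mdct.c), once l/r/wR/wL and the preLap/halfLap/postLap extents have
+#* been chosen from W and lW, the code below produces, per sample and
+#* bounded by the start/end window:
+#*
+#* preLap : *out = CLIP_TO_15((*--r)>>9); out += step;
+#* cross 1 : *out = CLIP_TO_15((MULT31(*--r,*--wR) +
+#* MULT31(*l,*wL++))>>9); l -= 2; out += step;
+#* cross 2 : *out = CLIP_TO_15((MULT31(*r++,*--wR) -
+#* MULT31(*l,*wL++))>>9); l += 2; out += step;
+#* postLap : *out = CLIP_TO_15((-*l)>>9); l += 2; out += step;
+#***************************************************************************/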
+
+ .text
+ .align 2
+ .globl mdct_unroll_lap
+ .set nomips16
+ .set nomicromips
+ .ent mdct_unroll_lap
+ .type mdct_unroll_lap, @function
+
+ #### void mdct_unroll_lap(int n0,int n1,
+ # int lW,int W,
+ # DATA_TYPE *in,
+ # DATA_TYPE *right,
+ # LOOKUP_T *w0,
+ # LOOKUP_T *w1,
+ # ogg_int16_t *out,
+ # int step,
+ # int start, /* samples, this frame */
+ # int end /* samples, this frame */) ####
+
+mdct_unroll_lap:
+ .frame $sp, 32, $ra
+ .set noreorder
+ .cpload $t9
+ .set reorder
+ addiu $sp, -32
+ sll $t8, $a0, 1 # (n0>>1) * 4
+ sra $v0, $a0, 2 # n0>>2
+ sw $s0, 0($sp)
+ sw $s1, 4($sp)
+ sw $s2, 8($sp)
+ sw $s3, 12($sp)
+ sw $s4, 16($sp)
+ sw $s5, 20($sp)
+ sw $s6, 24($sp)
+ lw $t0, 48($sp) # *in
+ lw $t1, 52($sp) # *right
+ lw $t2, 56($sp) # *w0
+ lw $t3, 60($sp) # *w1
+ lw $t4, 64($sp) # *out
+ lw $t5, 68($sp) # step
+ lw $t6, 72($sp) # start
+ lw $t7, 76($sp) # end
+ sll $t9, $a1, 1 # (n1>>1) * 4
+ sra $v1, $a1, 2 # n1>>2
+ sll $t5, $t5, 1
+ .set noreorder
+ .set nomacro
+
+ beqz $a3, mdct_unroll_lap_W_zero # if W == 0
+ sw $s7, 28($sp)
+ #W=1
+ beqz $a2, mdct_unroll_lap_lW_zero # if lW == 0
+ li $s4, 0 # preLap=0
+ .set macro
+ .set reorder
+ #W=1,lW=1
+ move $s5, $v1 # halfLap= n1>>2
+ sll $v1, $v1, 2
+ add $s0, $t0, $t9 # *l= in+(n1>>1);
+ add $s1, $t1, $v1 # *r= right+(n1>>2);
+ add $s2, $t3, $t9 # *wR= w1+(n1>>1);
+ move $s3, $t3 # *wL= w1
+ .set noreorder
+ .set nomacro
+ b mdct_unroll_lap_lW_W_end
+ li $s6, 0 # postLap=0
+ .set macro
+ .set reorder
+
+mdct_unroll_lap_lW_zero:
+ #W=1,lW=0
+ move $s5, $v0 # halfLap= n0>>2
+ sub $s6, $v1, $v0 # postLap= (n1>>2) - (n0>>2)
+ sll $v0, $v0, 2
+ add $s0, $t0, $t8 # *l= in+(n0>>1);
+ add $s1, $t1, $v0 # *r= right+(n0>>2);
+ add $s2, $t2, $t8 # *wR= w0+(n0>>1);
+ .set noreorder
+ .set nomacro
+ b mdct_unroll_lap_lW_W_end
+ move $s3, $t2 # *wL= w0
+
+mdct_unroll_lap_W_zero:
+ #W=0
+ add $s0, $t0, $t8 # *l= in+(n0>>1);
+ add $s2, $t2, $t8 # *wR= w0+(n0>>1);
+ move $s3, $t2 # *wL= w0
+ move $s5, $v0 # halfLap= n0>>2
+ beqz $a2, mdct_unroll_lap_lW_zero1 # if lW == 0
+ li $s6, 0 # postLap= 0
+ .set macro
+ .set reorder
+ #W=0, lW=1
+ sub $s4, $v1, $v0 # preLap= (n1>>2) - (n0>>2)
+ move $s5, $v0 # halfLap= n0>>2
+ sll $v0, $v0, 2
+ sll $v1, $v1, 2
+ .set noreorder
+ .set nomacro
+ b mdct_unroll_lap_lW_W_end
+ add $s1, $t1, $v1 # *r= right+(n1>>2);
+ .set macro
+ .set reorder
+
+mdct_unroll_lap_lW_zero1:
+ #W=0, lW=0
+ sll $v0, $v0, 2
+ add $s1, $t1, $v0 # *r= right+(n0>>2);
+ li $s4, 0 # preLap=0
+
+mdct_unroll_lap_lW_W_end:
+ .set noreorder
+ .set nomacro
+# preLap: preceding direct-copy lapping from previous frame, if any
+ beqz $s4, mdct_unroll_lap_cross_lap # if(preLap)
+ move $t1, $t6 # t1: off = start
+ .set macro
+ .set reorder
+ slt $a1, $t6, $s4 # off= (start<preLap?start:preLap);
+ movz $t1, $s4, $a1 # t1: off
+ move $t0, $t7 # t0: n= end
+ slt $s7, $t7, $s4 # n= (end<preLap?end:preLap);
+ movz $t0, $s4, $s7 # t0: n
+ sll $a1, $t1, 2 # a1: off*4
+ sub $t7, $t7, $t0 # end-= n;
+ sub $t6, $t6, $t1 # start-= off;
+ sub $t0, $t1 # t0: n - off
+ .set noreorder
+ .set nomacro
+ #while(r>post) <=> while(r-off>r-n) <=> while(off<n) <=> while((n-off)>0)
+ beqz $t0, mdct_unroll_lap_cross_lap
+ sub $s1, $s1, $a1 # s1: r-= off;
+ lw $a1, -4($s1) # a1: (*--r)
+ addiu $t0, $t0, -1
+ addi $s1, $s1, -4
+ beqz $t0, mdct_unroll_lap_prelap_while_loop_end
+ sra $a2, $a1, 9 # a2: (*--r)>>9
+ .set macro
+ .set reorder
+mdct_unroll_lap_prelap_while_loop:
+ shll_s.w $a2, $a2, 16
+ lw $a1, -4($s1) # a1: (*--r)
+ addiu $t0, $t0, -1
+ sra $a2, $a2, 16 # CLIP_TO_15((*--r)>>9)
+ addi $s1, $s1, -4
+ sh $a2, 0($t4) # *out = CLIP_TO_15((*--r)>>9)
+ sra $a2, $a1, 9 # a2: (*--r)>>9
+ .set noreorder
+ .set nomacro
+ bnez $t0, mdct_unroll_lap_prelap_while_loop
+ add $t4, $t4, $t5 # out+=step
+ .set macro
+ .set reorder
+mdct_unroll_lap_prelap_while_loop_end:
+ sra $a2, $a1, 9
+ shll_s.w $a2, $a2, 16
+ sra $a2, $a2, 16 # CLIP_TO_15((*--r)>>9)
+ sh $a2, 0($t4) # *out = CLIP_TO_15((*--r)>>9)
+ add $t4, $t4, $t5 # out+=step
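+
+#/***************************************************************************
+#* Editor's note (hedged): the sra/shll_s.w/sra sequence above is the
+#* DSP ASE idiom for CLIP_TO_15: shll_s.w saturates the 32-bit result
+#* of the left shift by 16, so shifting back down leaves the value
+#* clamped to the 16-bit output range. Approximate C equivalent:
+#*
+#* ogg_int32_t s = v >> 9;
+#* if(s > 32767) s = 32767;
+#* else if(s < -32768) s = -32768;
+#* *out = (ogg_int16_t)s;
+#***************************************************************************/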
+
+# cross-lap; two halves due to wrap-around
+mdct_unroll_lap_cross_lap:
+ move $t1, $t6 # t1: off = start
+ slt $a1, $t6, $s5 # start<halfLap?start:halfLap;
+ movz $t1, $s5, $a1 # t1: off
+ move $t0, $t7 # t0: n = end
+ slt $s7, $t7, $s5 # end<halfLap?end:halfLap;
+ movz $t0, $s5, $s7 # t0: n
+ sub $t6, $t6, $t1 # start-= off;
+ sll $a3, $t1, 2
+ sll $a2, $t1, 3 # a2= (off*2)*4
+ sub $s1, $s1, $a3 # r-= off;
+ sub $s2, $s2, $a3 # wR -= off;
+ add $s3, $s3, $a3 # wL += off;
+ sub $t7, $t7, $t0 # end-= n;
+ sub $t0, $t1 # t0: n - off
+ .set noreorder
+ .set nomacro
+ #while(r>post) <=> while(r-off>r-n) <=> while(off<n) <=> while((n-off)>0)
+ beqz $t0, mdct_unroll_lap_cross_lap_1
+ sub $s0, $s0, $a2 # l -= off*2;
+ .set macro
+ .set reorder
+ addiu $s0, $s0, -8 # l-=2
+ lw $a1, -4($s1) # *--r
+ addi $s1, $s1, -4
+ lw $a0, -4($s2) # *--wR
+ addi $s2, $s2, -4
+ lw $t8, 0($s0) # *l
+ lw $t9, 0($s3) # *wL
+ addiu $t0, $t0, -1
+ .set noreorder
+ .set nomacro
+ beqz $t0, mdct_unroll_lap_cross_while_loop_end
+ addi $s3, $s3, 4 # wL++
+ .set macro
+ .set reorder
+mdct_unroll_lap_cross_while_loop:
+ mult $a1, $a0 # MULT32(*--r,*--wR)
+ mtlo $zero
+ addiu $s0, $s0, -8
+ shilo $ac0, -1 # ac0: MULT31(*--r,*--wR)
+ lw $a1, -4($s1)
+ addi $s1, $s1, -4
+ lw $a0, -4($s2)
+ dpaq_sa.l.w $ac0, $t8, $t9 # MULT31(*--r,*--wR) + MULT31(*l,*wL++)
+ mfhi $s7 # s7=(MULT31(*--r,*--wR) +
+ # MULT31(*l,*wL++))
+ addi $s2, $s2, -4
+ lw $t8, 0($s0)
+ lw $t9, 0($s3)
+ addi $s3, $s3, 4
+ sra $s7, $s7, 9 # s7=(MULT31(*--r,*--wR) +
+ # MULT31(*l,*wL++))>>9
+ shll_s.w $s7, $s7, 16 # *out = CLIP_TO_15((MULT31(*--r,*--wR)
+ # + MULT31(*l,*wL++))>>9);
+ sra $s7, $s7, 16
+ sh $s7, 0($t4)
+ addiu $t0, $t0, -1
+ .set noreorder
+ .set nomacro
+ bnez $t0, mdct_unroll_lap_cross_while_loop
+ add $t4, $t4, $t5 # out+=step
+ .set macro
+ .set reorder
+mdct_unroll_lap_cross_while_loop_end:
+ mult $a1, $a0 # MULT32(*--r,*--wR)
+ mtlo $zero
+ shilo $ac0, -1 # ac0: MULT31(*--r,*--wR)
+ dpaq_sa.l.w $ac0, $t8, $t9 # MULT31(*--r,*--wR) +
+ # MULT31(*l,*wL++)
+ mfhi $s7
+ sra $s7, $s7, 9 # s7=(MULT31(*--r,*--wR) +
+ # MULT31(*l,*wL++))>>9
+ shll_s.w $s7, $s7, 16 # *out = CLIP_TO_15((MULT31(*--r,*--wR)
+ # + MULT31(*l,*wL++))>>9);
+ sra $s7, $s7, 16
+ sh $s7, 0($t4)
+ add $t4, $t4, $t5 # out+=step
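+
+#/***************************************************************************
+#* Editor's note (hedged): the mult/mtlo/shilo/dpaq_sa.l.w pattern above
+#* forms both MULT31 terms in one accumulator pass: mult leaves the
+#* 64-bit product in $ac0, mtlo discards its low word, shilo $ac0,-1
+#* shifts the accumulator one bit left so the high word holds the Q31
+#* product, and dpaq_sa.l.w accumulates the second Q31 product with
+#* saturation. Roughly, in C:
+#*
+#* s = (ogg_int32_t)((((ogg_int64_t)r[-1] * wR[-1]) >> 31)
+#* + (((ogg_int64_t)l[0] * wL[0]) >> 31));
+#*
+#* The mirrored loop that follows (second cross-lap half) uses
+#* dpsq_sa.l.w to subtract the second product instead.
+#***************************************************************************/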
+
+# cross-lap, second half (wrap-around)
+mdct_unroll_lap_cross_lap_1:
+ move $t1, $t6 # t1: off= start
+ slt $a1, $t6, $s5 # start<halfLap?start:halfLap
+ movz $t1, $s5, $a1 # t1: off
+ move $t0, $t7 # t0: n= end
+ slt $s7, $t7, $s5 # end<halfLap?end:halfLap
+ movz $t0, $s5, $s7 # t0: n
+ sub $t6, $t6, $t1 # start-= off;
+ sll $a3, $t1, 2
+ sll $a2, $t1, 3 # a2= (off*2)*4
+ sub $t7, $t7, $t0 # end-= n;
+ sub $s2, $s2, $a3 # wR -= off;
+ sub $t0, $t1 # t0: n-off
+ add $s1, $s1, $a3 # r+= off;
+ add $s3, $s3, $a3 # wL += off;
+ .set noreorder
+ .set nomacro
+ #while(r<post) <=> while(r+off<r+n) <=> while(off<n) <=> while((n-off)>0)
+ beqz $t0, mdct_unroll_lap_post_lap
+ add $s0, $s0, $a2 # l += off*2;
+ .set macro
+ .set reorder
+ li $v1, 0xFFFFFFFF # rounding constant for mtlo
+ lw $a1, 0($s1) # *r
+ addi $s1, $s1, 4 # r++
+ lw $a0, -4($s2) # *--wR
+ addi $s2, $s2, -4
+ lw $t8, 0($s0) # *l
+ addiu $s0, $s0, 8 # l+=2
+ lw $t9, 0($s3) # *wL
+ addiu $t0, $t0, -1
+ .set noreorder
+ .set nomacro
+ beqz $t0, mdct_unroll_lap_cross_while_loop1_end
+ addi $s3, $s3, 4 # wL++
+ .set macro
+ .set reorder
+mdct_unroll_lap_cross_while_loop1:
+ mult $a1, $a0 # MULT32(*r++,*--wR)
+ mtlo $v1
+ lw $a1, 0($s1) # *r
+ shilo $ac0, -1 # ac0: MULT31(*r++,*--wR)
+ dpsq_sa.l.w $ac0, $t8, $t9 # (MULT31(*r++,*--wR) -
+ # MULT31(*l,*wL++))
+ addi $s1, $s1, 4 # r++
+ lw $a0, -4($s2) # *--wR
+ addi $s2, $s2, -4
+ mfhi $s7
+ lw $t8, 0($s0) # *l
+ addiu $s0, $s0, 8 # l+=2
+ lw $t9, 0($s3) # *wL
+ addi $s3, $s3, 4 # wL++
+ sra $s7, $s7, 9 # s7=(MULT31(*r++,*--wR) -
+ # MULT31(*l,*wL++))>>9
+ shll_s.w $s7, $s7, 16 # *out = CLIP_TO_15((MULT31(*r++,*--wR)
+ # - MULT31(*l,*wL++))>>9);
+ sra $s7, $s7, 16
+ addiu $t0, $t0, -1
+ sh $s7, 0($t4)
+ .set noreorder
+ .set nomacro
+ bnez $t0, mdct_unroll_lap_cross_while_loop1
+ add $t4, $t4, $t5 # out+=step
+ .set macro
+ .set reorder
+mdct_unroll_lap_cross_while_loop1_end:
+ mult $a1, $a0 # MULT32(*r++,*--wR)
+ mtlo $v1
+ shilo $ac0, -1 # ac0: MULT31(*r++,*--wR)
+ dpsq_sa.l.w $ac0, $t8, $t9 # (MULT31(*r++,*--wR) -
+ # MULT31(*l,*wL++))
+ mfhi $s7
+ sra $s7, $s7, 9 # s7=(MULT31(*r++,*--wR) -
+ # MULT31(*l,*wL++))>>9
+ shll_s.w $s7, $s7, 16 # *out = CLIP_TO_15((MULT31(*r++,*--wR)
+ # - MULT31(*l,*wL++))>>9);
+ sra $s7, $s7, 16
+ sh $s7, 0($t4)
+ add $t4, $t4, $t5 # out+=step
+
+# postLap: following direct-copy lapping to next frame, if any
+mdct_unroll_lap_post_lap:
+ beqz $s6, mdct_unroll_lap_end # if(postLap)
+ move $t1, $t6 # t1: off = start;
+ slt $a1, $t6, $s6 # start<postLap?start:postLap
+ movz $t1, $s6, $a1 # t1: off
+ move $t0, $t7 # t0: n= end
+ slt $s7, $t7, $s6 # end<postLap?end:postLap
+ movz $t0, $s6, $s7 # t0: n
+ sll $a2, $t1, 3 # (off*2)*4
+ sub $t0, $t1 # t0: n - off
+ .set noreorder
+ .set nomacro
+ #while(l<post) <=> while(l+off*2<l+n*2) <=> while(off<n) <=> while(0<(n-off))
+ beqz $t0, mdct_unroll_lap_end
+ add $s0, $s0, $a2 # l+= off*2;
+ lw $a1, 0($s0) # *l
+ addiu $t0, $t0, -1
+ addiu $s0, $s0, 8 # l+=2
+ negu $a1, $a1 # -*l
+ beqz $t0, mdct_unroll_lap_postlap_while_loop_end
+ sra $a2, $a1, 9 # (-*l)>>9
+ .set macro
+ .set reorder
+mdct_unroll_lap_postlap_while_loop:
+ lw $a1, 0($s0) # *l
+ shll_s.w $a2, $a2, 16
+ addiu $t0, $t0, -1
+ negu $a1, $a1 # -*l
+ sra $a2, $a2, 16 # CLIP_TO_15((-*l)>>9)
+ addiu $s0, $s0, 8 # l+=2
+ sh $a2, 0($t4) # *out = CLIP_TO_15((-*l)>>9)
+ add $t4, $t4, $t5 # out+=step
+ .set noreorder
+ .set nomacro
+ bnez $t0, mdct_unroll_lap_postlap_while_loop
+ sra $a2, $a1, 9
+ .set macro
+ .set reorder
+mdct_unroll_lap_postlap_while_loop_end:
+ shll_s.w $a2, $a2, 16
+ sra $a2, $a2, 16 # CLIP_TO_15((-*l)>>9)
+ sh $a2, 0($t4) # *out = CLIP_TO_15((-*l)>>9)
+ add $t4, $t4, $t5 # out+=step
+
+mdct_unroll_lap_end:
+ lw $s0, 0($sp)
+ lw $s1, 4($sp)
+ lw $s2, 8($sp)
+ lw $s3, 12($sp)
+ lw $s4, 16($sp)
+ lw $s5, 20($sp)
+ lw $s6, 24($sp)
+ lw $s7, 28($sp)
+ addiu $sp, 32
+ jr $ra
+ .end mdct_unroll_lap
\ No newline at end of file