diff options
Diffstat (limited to 'REORG.TODO/sysdeps/powerpc/powerpc64/power7/strncpy.S')
-rw-r--r-- | REORG.TODO/sysdeps/powerpc/powerpc64/power7/strncpy.S | 722 |
1 files changed, 722 insertions, 0 deletions
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strncpy.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strncpy.S new file mode 100644 index 0000000000..0224f74898 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strncpy.S @@ -0,0 +1,722 @@ +/* Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* Implements the functions + + char * [r3] strncpy (char *dst [r3], const char *src [r4], size_t n [r5]) + + AND + + char * [r3] stpncpy (char *dst [r3], const char *src [r4], size_t n [r5]) + + The algorithm is as follows: + > if src and dest are 8 byte aligned, perform double word copy + else + > copy byte by byte on unaligned addresses. + + The aligned comparison are made using cmpb instructions. */ + +/* The focus on optimization for performance improvements are as follows: + 1. data alignment [gain from aligned memory access on read/write] + 2. POWER7 gains performance with loop unrolling/unwinding + [gain by reduction of branch penalty]. + 3. The final pad with null bytes is done by calling an optimized + memset. */ + +#ifdef USE_AS_STPNCPY +# ifndef STPNCPY +# define FUNC_NAME __stpncpy +# else +# define FUNC_NAME STPNCPY +# endif +#else +# ifndef STRNCPY +# define FUNC_NAME strncpy +# else +# define FUNC_NAME STRNCPY +# endif +#endif /* !USE_AS_STPNCPY */ + +#define FRAMESIZE (FRAME_MIN_SIZE+32) + +#ifndef MEMSET +/* For builds with no IFUNC support, local calls should be made to internal + GLIBC symbol (created by libc_hidden_builtin_def). */ +# ifdef SHARED +# define MEMSET __GI_memset +# else +# define MEMSET memset +# endif +#endif + + .machine power7 +EALIGN(FUNC_NAME, 4, 0) + CALL_MCOUNT 3 + + mflr r0 /* load link register LR to r0 */ + or r10, r3, r4 /* to verify source and destination */ + rldicl. r8, r10, 0, 61 /* is double word aligned .. ? */ + + std r19, -8(r1) /* save callers register , r19 */ + std r18, -16(r1) /* save callers register , r18 */ + std r0, 16(r1) /* store the link register */ + stdu r1, -FRAMESIZE(r1) /* create the stack frame */ + + mr r9, r3 /* save r3 into r9 for use */ + mr r18, r3 /* save r3 for retCode of strncpy */ + bne 0, L(unaligned) + +L(aligned): + srdi r11, r5, 3 /* compute count for CTR ; count = n/8 */ + cmpldi cr7, r11, 3 /* if count > 4 ; perform unrolling 4 times */ + ble 7, L(update1) + + ld r10, 0(r4) /* load doubleWord from src */ + cmpb r8, r10, r8 /* compare src with NULL ,we read just now */ + cmpdi cr7, r8, 0 /* if cmpb returned NULL ; we continue */ + bne cr7, L(update3) + + std r10, 0(r3) /* copy doubleword at offset=0 */ + ld r10, 8(r4) /* load next doubleword from offset=8 */ + cmpb r8, r10, r8 /* compare src with NULL , we read just now */ + cmpdi cr7, r8, 0 /* if cmpb returned NULL ; we continue */ + bne 7,L(HopBy8) + + addi r8, r11, -4 + mr r7, r3 + srdi r8, r8, 2 + mr r6, r4 + addi r8, r8, 1 + li r12, 0 + mtctr r8 + b L(dwordCopy) + + .p2align 4 +L(dWordUnroll): + std r8, 16(r9) + ld r8, 24(r4) /* load dword,perform loop unrolling again */ + cmpb r10, r8, r10 + cmpdi cr7, r10, 0 + bne cr7, L(HopBy24) + + std r8, 24(r7) /* copy dword at offset=24 */ + addi r9, r9, 32 + addi r4, r4, 32 + bdz L(leftDwords) /* continue with loop on counter */ + + ld r3, 32(r6) + cmpb r8, r3, r10 + cmpdi cr7, r8, 0 + bne cr7, L(update2) + + std r3, 32(r7) + ld r10, 40(r6) + cmpb r8, r10, r8 + cmpdi cr7, r8, 0 + bne cr7, L(HopBy40) + + mr r6, r4 /* update values */ + mr r7, r9 + mr r11, r0 + mr r5, r19 + +L(dwordCopy): + std r10, 8(r9) /* copy dword at offset=8 */ + addi r19, r5, -32 + addi r0, r11, -4 + ld r8, 16(r4) + cmpb r10, r8, r12 + cmpdi cr7, r10, 0 + beq cr7, L(dWordUnroll) + + addi r9, r9, 16 /* increment dst by 16 */ + addi r4, r4, 16 /* increment src by 16 */ + addi r5, r5, -16 /* decrement length 'n' by 16 */ + addi r0, r11, -2 /* decrement loop counter */ + +L(dWordUnrollOFF): + ld r10, 0(r4) /* load first dword */ + li r8, 0 /* load mask */ + cmpb r8, r10, r8 + cmpdi cr7, r8, 0 + bne cr7, L(byte_by_byte) + mtctr r0 + li r7, 0 + b L(CopyDword) + + .p2align 4 +L(loadDWordandCompare): + ld r10, 0(r4) + cmpb r8, r10, r7 + cmpdi cr7, r8, 0 + bne cr7, L(byte_by_byte) + +L(CopyDword): + addi r9, r9, 8 + std r10, -8(r9) + addi r4, r4, 8 + addi r5, r5, -8 + bdnz L(loadDWordandCompare) + +L(byte_by_byte): + cmpldi cr7, r5, 3 + ble cr7, L(verifyByte) + srdi r10, r5, 2 + mr r19, r9 + mtctr r10 + b L(firstByteUnroll) + + .p2align 4 +L(bytes_unroll): + lbz r10, 1(r4) /* load byte from src */ + cmpdi cr7, r10, 0 /* compare for NULL */ + stb r10, 1(r19) /* store byte to dst */ + beq cr7, L(updtDestComputeN2ndByte) + + addi r4, r4, 4 /* advance src */ + + lbz r10, -2(r4) /* perform loop unrolling for byte r/w */ + cmpdi cr7, r10, 0 + stb r10, 2(r19) + beq cr7, L(updtDestComputeN3rdByte) + + lbz r10, -1(r4) /* perform loop unrolling for byte r/w */ + addi r19, r19, 4 + cmpdi cr7, r10, 0 + stb r10, -1(r19) + beq cr7, L(ComputeNByte) + + bdz L(update0) + +L(firstByteUnroll): + lbz r10, 0(r4) /* perform loop unrolling for byte r/w */ + cmpdi cr7, 10, 0 + stb r10, 0(r19) + bne cr7, L(bytes_unroll) + addi r19, r19, 1 + +L(ComputeNByte): + subf r9, r19, r9 /* compute 'n'n bytes to fill */ + add r8, r9, r5 + +L(zeroFill): + cmpdi cr7, r8, 0 /* compare if length is zero */ + beq cr7, L(update3return) + + mr r3, r19 /* fill buffer with */ + li r4, 0 /* zero fill buffer */ + mr r5, r8 /* how many bytes to fill buffer with */ + bl MEMSET /* call optimized memset */ + nop + +L(update3return): +#ifdef USE_AS_STPNCPY + addi r3, r19, -1 /* update return value */ +#endif + +L(hop2return): +#ifndef USE_AS_STPNCPY + mr r3, r18 /* set return value */ +#endif + addi r1, r1, FRAMESIZE /* restore stack pointer */ + ld r0, 16(r1) /* read the saved link register */ + ld r18, -16(r1) /* restore callers save register, r18 */ + ld r19, -8(r1) /* restore callers save register, r19 */ + mtlr r0 /* branch to link register */ + blr /* return */ + + .p2align 4 +L(update0): + mr r9, r19 + + .p2align 4 +L(verifyByte): + rldicl. r8, r5, 0, 62 +#ifdef USE_AS_STPNCPY + mr r3, r9 +#endif + beq cr0, L(hop2return) + mtctr r8 + addi r4, r4, -1 + mr r19, r9 + b L(oneBYone) + + .p2align 4 +L(proceed): + bdz L(done) + +L(oneBYone): + lbzu r10, 1(r4) /* copy byte */ + addi r19, r19, 1 + addi r8, r8, -1 + cmpdi cr7, r10, 0 + stb r10, -1(r19) + bne cr7, L(proceed) + b L(zeroFill) + + .p2align 4 +L(done): + addi r1, r1, FRAMESIZE /* restore stack pointer */ +#ifdef USE_AS_STPNCPY + mr r3, r19 /* set the return value */ +#else + mr r3, r18 /* set the return value */ +#endif + ld r0, 16(r1) /* read the saved link register */ + ld r18, -16(r1) /* restore callers save register, r18 */ + ld r19, -8(r1) /* restore callers save register, r19 */ + mtlr r0 /* branch to link register */ + blr /* return */ + +L(update1): + mr r0, r11 + mr r19, r5 + + .p2align 4 +L(leftDwords): + cmpdi cr7, r0, 0 + mr r5, r19 + bne cr7, L(dWordUnrollOFF) + b L(byte_by_byte) + + .p2align 4 +L(updtDestComputeN2ndByte): + addi r19, r19, 2 /* update dst by 2 */ + subf r9, r19, r9 /* compute distance covered */ + add r8, r9, r5 + b L(zeroFill) + + .p2align 4 +L(updtDestComputeN3rdByte): + addi r19, r19, 3 /* update dst by 3 */ + subf r9, r19, r9 /* compute distance covered */ + add r8, r9, r5 + b L(zeroFill) + + .p2align 4 +L(HopBy24): + addi r9, r9, 24 /* increment dst by 24 */ + addi r4, r4, 24 /* increment src by 24 */ + addi r5, r5, -24 /* decrement length 'n' by 24 */ + addi r0, r11, -3 /* decrement loop counter */ + b L(dWordUnrollOFF) + + .p2align 4 +L(update2): + mr r5, r19 + b L(dWordUnrollOFF) + + .p2align 4 +L(HopBy40): + addi r9, r7, 40 /* increment dst by 40 */ + addi r4, r6, 40 /* increment src by 40 */ + addi r5, r5, -40 /* decrement length 'n' by 40 */ + addi r0, r11, -5 /* decrement loop counter */ + b L(dWordUnrollOFF) + +L(update3): + mr r0, r11 + b L(dWordUnrollOFF) + +L(HopBy8): + addi r9, r3, 8 /* increment dst by 8 */ + addi r4, r4, 8 /* increment src by 8 */ + addi r5, r5, -8 /* decrement length 'n' by 8 */ + addi r0, r11, -1 /* decrement loop counter */ + b L(dWordUnrollOFF) + +L(unaligned): + cmpdi r5, 16 /* Proceed byte by byte for less than 16 */ + ble L(byte_by_byte) + rldicl r7, r3, 0, 61 + rldicl r6, r4, 0, 61 + cmpdi r6, 0 /* Check src alignment */ + beq L(srcaligndstunalign) + /* src is unaligned */ + rlwinm r10, r4, 3,26,28 /* Calculate padding. */ + clrrdi r4, r4, 3 /* Align the addr to dw boundary */ + ld r8, 0(r4) /* Load doubleword from memory. */ + li r0, 0 + /* Discard bits not part of the string */ +#ifdef __LITTLE_ENDIAN__ + srd r7, r8, r10 +#else + sld r7, r8, r10 +#endif + cmpb r0, r7, r0 /* Compare each byte against null */ + /* Discard bits not part of the string */ +#ifdef __LITTLE_ENDIAN__ + sld r0, r0, r10 +#else + srd r0, r0, r10 +#endif + cmpdi r0, 0 + bne L(bytebybyte) /* if it has null, copy byte by byte */ + subfic r6, r6, 8 + rlwinm r12, r3, 3,26,28 /* Calculate padding in bits. */ + rldicl r9, r3, 0, 61 /* Calculate padding in bytes. */ + addi r3, r3, -1 + + cmpdi r12, 0 /* check dest alignment */ + beq L(srcunaligndstalign) + + /* both src and dst unaligned */ +#ifdef __LITTLE_ENDIAN__ + sld r8, r7, r10 + mr r11, r10 + addi r11, r11, -8 /* Adjust byte pointer on loaded dw */ +#else + srd r8, r7, r10 + subfic r11, r10, 64 +#endif + /* dst alignment is greater then src alignment? */ + cmpd cr7, r12, r10 + ble cr7, L(dst_align_small) + /* src alignment is less than dst */ + + /* Calculate the dst alignment difference */ + subfic r7, r9, 8 + mtctr r7 + + /* Write until dst is aligned */ + cmpdi r0, r7, 4 + blt L(storebyte1) /* less than 4, store byte by byte */ + beq L(equal1) /* if its 4, store word */ + addi r0, r7, -4 /* greater than 4, so stb and stw */ + mtctr r0 +L(storebyte1): +#ifdef __LITTLE_ENDIAN__ + addi r11, r11, 8 /* Adjust byte pointer on loaded dw */ +#else + addi r11, r11, -8 +#endif + srd r7, r8, r11 + stbu r7, 1(r3) + addi r5, r5, -1 + bdnz L(storebyte1) + + subfic r7, r9, 8 /* Check the remaining bytes */ + cmpdi r0, r7, 4 + blt L(proceed1) + + .align 4 +L(equal1): +#ifdef __LITTLE_ENDIAN__ + addi r11, r11, 8 /* Adjust byte pointer on loaded dw */ + srd r7, r8, r11 +#else + subfic r11, r11, 64 + sld r7, r8, r11 + srdi r7, r7, 32 +#endif + stw r7, 1(r3) + addi r3, r3, 4 + addi r5, r5, -4 + +L(proceed1): + mr r7, r8 + /* calculate the Left over bytes to be written */ + subfic r11, r10, 64 + subfic r12, r12, 64 + subf r12, r12, r11 /* remaining bytes on second dw */ + subfic r10, r12, 64 /* remaining bytes on first dw */ + subfic r9, r9, 8 + subf r6, r9, r6 /* recalculate padding */ +L(srcunaligndstalign): + addi r3, r3, 1 + subfic r12, r10, 64 /* remaining bytes on second dw */ + addi r4, r4, 8 + li r0,0 + b L(storedouble) + + .align 4 +L(dst_align_small): + mtctr r6 + /* Write until src is aligned */ +L(storebyte2): +#ifdef __LITTLE_ENDIAN__ + addi r11, r11, 8 /* Adjust byte pointer on dw */ +#else + addi r11, r11, -8 +#endif + srd r7, r8, r11 + stbu r7, 1(r3) + addi r5, r5, -1 + bdnz L(storebyte2) + + addi r4, r4, 8 /* Increment src pointer */ + addi r3, r3, 1 /* Increment dst pointer */ + mr r9, r3 + li r8, 0 + cmpd cr7, r12, r10 + beq cr7, L(aligned) + rldicl r6, r3, 0, 61 /* Recalculate padding */ + mr r7, r6 + + /* src is algined */ +L(srcaligndstunalign): + mr r9, r3 + mr r6, r7 + ld r8, 0(r4) + subfic r10, r7, 8 + mr r7, r8 + li r0, 0 /* Check null */ + cmpb r0, r8, r0 + cmpdi r0, 0 + bne L(byte_by_byte) /* Do byte by byte if there is NULL */ + rlwinm r12, r3, 3,26,28 /* Calculate padding */ + addi r3, r3, -1 + /* write byte by byte until aligned */ +#ifdef __LITTLE_ENDIAN__ + li r11, -8 +#else + li r11, 64 +#endif + mtctr r10 + cmpdi r0, r10, 4 + blt L(storebyte) + beq L(equal) + addi r0, r10, -4 + mtctr r0 +L(storebyte): +#ifdef __LITTLE_ENDIAN__ + addi r11, r11, 8 /* Adjust byte pointer on dw */ +#else + addi r11, r11, -8 +#endif + srd r7, r8, r11 + stbu r7, 1(r3) + addi r5, r5, -1 + bdnz L(storebyte) + + cmpdi r0, r10, 4 + blt L(align) + + .align 4 +L(equal): +#ifdef __LITTLE_ENDIAN__ + addi r11, r11, 8 + srd r7, r8, r11 +#else + subfic r11, r11, 64 + sld r7, r8, r11 + srdi r7, r7, 32 +#endif + stw r7, 1(r3) + addi r5, r5, -4 + addi r3, r3, 4 +L(align): + addi r3, r3, 1 + addi r4, r4, 8 /* Increment src pointer */ + subfic r10, r12, 64 + li r0, 0 + /* dst addr aligned to 8 */ +L(storedouble): + cmpdi r5, 8 + ble L(null1) + ld r7, 0(r4) /* load next dw */ + cmpb r0, r7, r0 + cmpdi r0, 0 /* check for null on each new dw */ + bne L(null) +#ifdef __LITTLE_ENDIAN__ + srd r9, r8, r10 /* bytes from first dw */ + sld r11, r7, r12 /* bytes from second dw */ +#else + sld r9, r8, r10 + srd r11, r7, r12 +#endif + or r11, r9, r11 /* make as a single dw */ + std r11, 0(r3) /* store as std on aligned addr */ + mr r8, r7 /* still few bytes left to be written */ + addi r3, r3, 8 /* increment dst addr */ + addi r4, r4, 8 /* increment src addr */ + addi r5, r5, -8 + b L(storedouble) /* Loop until NULL */ + + .align 4 + +/* We've hit the end of the string. Do the rest byte-by-byte. */ +L(null): + addi r3, r3, -1 + mr r10, r12 + mtctr r6 +#ifdef __LITTLE_ENDIAN__ + subfic r10, r10, 64 + addi r10, r10, -8 +#endif + cmpdi r0, r5, 4 + blt L(loop) + cmpdi r0, r6, 4 + blt L(loop) + + /* we can still use stw if leftover >= 4 */ +#ifdef __LITTLE_ENDIAN__ + addi r10, r10, 8 + srd r11, r8, r10 +#else + subfic r10, r10, 64 + sld r11, r8, r10 + srdi r11, r11, 32 +#endif + stw r11, 1(r3) + addi r5, r5, -4 + addi r3, r3, 4 + cmpdi r0, r5, 0 + beq L(g1) + cmpdi r0, r6, 4 + beq L(bytebybyte1) + addi r10, r10, 32 +#ifdef __LITTLE_ENDIAN__ + addi r10, r10, -8 +#else + subfic r10, r10, 64 +#endif + addi r0, r6, -4 + mtctr r0 + /* remaining byte by byte part of first dw */ +L(loop): +#ifdef __LITTLE_ENDIAN__ + addi r10, r10, 8 +#else + addi r10, r10, -8 +#endif + srd r0, r8, r10 + stbu r0, 1(r3) + addi r5, r5, -1 + cmpdi r0, r5, 0 + beq L(g1) + bdnz L(loop) +L(bytebybyte1): + addi r3, r3, 1 + /* remaining byte by byte part of second dw */ +L(bytebybyte): + addi r3, r3, -8 + addi r4, r4, -1 + +#ifdef __LITTLE_ENDIAN__ + extrdi. r0, r7, 8, 56 + stbu r7, 8(r3) + addi r5, r5, -1 + beq L(g2) + cmpdi r5, 0 + beq L(g1) + extrdi. r0, r7, 8, 48 + stbu r0, 1(r3) + addi r5, r5, -1 + beq L(g2) + cmpdi r5, 0 + beq L(g1) + extrdi. r0, r7, 8, 40 + stbu r0, 1(r3) + addi r5, r5, -1 + beq L(g2) + cmpdi r5, 0 + beq L(g1) + extrdi. r0, r7, 8, 32 + stbu r0, 1(r3) + addi r5, r5, -1 + beq L(g2) + cmpdi r5, 0 + beq L(g1) + extrdi. r0, r7, 8, 24 + stbu r0, 1(r3) + addi r5, r5, -1 + beq L(g2) + cmpdi r5, 0 + beq L(g1) + extrdi. r0, r7, 8, 16 + stbu r0, 1(r3) + addi r5, r5, -1 + beq L(g2) + cmpdi r5, 0 + beq L(g1) + extrdi. r0, r7, 8, 8 + stbu r0, 1(r3) + addi r5, r5, -1 + beq L(g2) + cmpdi r5, 0 + beq L(g1) + extrdi r0, r7, 8, 0 + stbu r0, 1(r3) + addi r5, r5, -1 + b L(g2) +#else + extrdi. r0, r7, 8, 0 + stbu r0, 8(r3) + addi r5, r5, -1 + beq L(g2) + cmpdi r5, 0 + beq L(g1) + extrdi. r0, r7, 8, 8 + stbu r0, 1(r3) + addi r5, r5, -1 + beq L(g2) + cmpdi r5, 0 + beq L(g1) + extrdi. r0, r7, 8, 16 + stbu r0, 1(r3) + addi r5, r5, -1 + beq L(g2) + cmpdi r5, 0 + beq L(g1) + extrdi. r0, r7, 8, 24 + stbu r0, 1(r3) + addi r5, r5, -1 + beq L(g2) + cmpdi r5, 0 + beq L(g1) + extrdi. r0, r7, 8, 32 + stbu r0, 1(r3) + addi r5, r5, -1 + beq L(g2) + cmpdi r5, 0 + beq L(g1) + extrdi. r0, r7, 8, 40 + stbu r0, 1(r3) + addi r5, r5, -1 + beq L(g2) + cmpdi r5, 0 + beq L(g1) + extrdi. r0, r7, 8, 48 + stbu r0, 1(r3) + addi r5, r5, -1 + beq L(g2) + cmpdi r5, 0 + beq L(g1) + stbu r7, 1(r3) + addi r5, r5, -1 + b L(g2) +#endif +L(g1): +#ifdef USE_AS_STPNCPY + addi r3, r3, 1 +#endif +L(g2): + addi r3, r3, 1 + mr r19, r3 + mr r8, r5 + b L(zeroFill) +L(null1): + mr r9, r3 + subf r4, r6, r4 + b L(byte_by_byte) +END(FUNC_NAME) +#ifndef USE_AS_STPNCPY +libc_hidden_builtin_def (strncpy) +#endif |