path: root/libjack/simd.c
Diffstat (limited to 'libjack/simd.c')
-rw-r--r--   libjack/simd.c   195
1 file changed, 91 insertions, 104 deletions
diff --git a/libjack/simd.c b/libjack/simd.c
index 0c2c558..139ba2c 100644
--- a/libjack/simd.c
+++ b/libjack/simd.c
@@ -1,22 +1,22 @@
/* -*- mode: c; c-file-style: "bsd"; -*- */
/*
Copyright (C) 2005-2008 Jussi Laako
-
+
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or
(at your option) any later version.
-
+
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public License
- along with this program; if not, write to the Free Software
+ along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-*/
+ */
#include <config.h>
@@ -45,23 +45,23 @@ have_3dnow ()
"movl $0x80000001, %%eax\n\t" \
"cpuid\n\t" \
\
- "xorl %%eax, %%eax\n\t" \
+ "xorl %%eax, %%eax\n\t" \
\
"movl $1, %%ecx\n\t" \
"shll $31, %%ecx\n\t" \
"testl %%ecx, %%edx\n\t" \
- "jz tdnow_testexit\n\t" \
+ "jz tdnow_testexit\n\t" \
"movl $1, %%eax\n\t" \
\
"movl $1, %%ecx\n\t" \
"shll $30, %%ecx\n\t" \
"testl %%ecx, %%edx\n\t" \
- "jz tdnow_testexit\n\t" \
+ "jz tdnow_testexit\n\t" \
"movl $2, %%eax\n\t" \
"jmp tdnow_testexit\n\t" \
\
"tdnow_prexit:\n\t" \
- "xorl %%eax, %%eax\n\t" \
+ "xorl %%eax, %%eax\n\t" \
"tdnow_testexit:\n\t"
: "=a" (res)
:
@@ -88,7 +88,7 @@ have_sse ()
"movl $1, %%eax\n\t" \
"cpuid\n\t" \
\
- "xorl %%eax, %%eax\n\t" \
+ "xorl %%eax, %%eax\n\t" \
\
"movl $1, %%ebx\n\t" \
"shll $25, %%ebx\n\t" \
@@ -123,49 +123,47 @@ void
x86_3dnow_copyf (float *dest, const float *src, int length)
{
int i, n1, n2;
- pv2sf m64p_src = (pv2sf) src;
- pv2sf m64p_dest = (pv2sf) dest;
+ pv2sf m64p_src = (pv2sf)src;
+ pv2sf m64p_dest = (pv2sf)dest;
n1 = (length >> 4);
n2 = ((length & 0xf) >> 1);
- for (i = 0; i < n1; i++)
- {
+ for (i = 0; i < n1; i++) {
asm volatile ("movq %0, %%mm0\n\t"
- : : "m" (*m64p_src++) : "mm0", "memory");
+ : : "m" (*m64p_src++) : "mm0", "memory");
asm volatile ("movq %0, %%mm1\n\t"
- : : "m" (*m64p_src++) : "mm1", "memory");
+ : : "m" (*m64p_src++) : "mm1", "memory");
asm volatile ("movq %0, %%mm2\n\t"
- : : "m" (*m64p_src++) : "mm2", "memory");
+ : : "m" (*m64p_src++) : "mm2", "memory");
asm volatile ("movq %0, %%mm3\n\t"
- : : "m" (*m64p_src++) : "mm3", "memory");
+ : : "m" (*m64p_src++) : "mm3", "memory");
asm volatile ("movq %0, %%mm4\n\t"
- : : "m" (*m64p_src++) : "mm4", "memory");
+ : : "m" (*m64p_src++) : "mm4", "memory");
asm volatile ("movq %0, %%mm5\n\t"
- : : "m" (*m64p_src++) : "mm5", "memory");
+ : : "m" (*m64p_src++) : "mm5", "memory");
asm volatile ("movq %0, %%mm6\n\t"
- : : "m" (*m64p_src++) : "mm6", "memory");
+ : : "m" (*m64p_src++) : "mm6", "memory");
asm volatile ("movq %0, %%mm7\n\t"
- : : "m" (*m64p_src++) : "mm7", "memory");
+ : : "m" (*m64p_src++) : "mm7", "memory");
asm volatile ("movq %%mm0, %0\n\t"
- : "=m" (*m64p_dest++) : : "mm0", "memory");
+ : "=m" (*m64p_dest++) : : "mm0", "memory");
asm volatile ("movq %%mm1, %0\n\t"
- : "=m" (*m64p_dest++) : : "mm1", "memory");
+ : "=m" (*m64p_dest++) : : "mm1", "memory");
asm volatile ("movq %%mm2, %0\n\t"
- : "=m" (*m64p_dest++) : : "mm2", "memory");
+ : "=m" (*m64p_dest++) : : "mm2", "memory");
asm volatile ("movq %%mm3, %0\n\t"
- : "=m" (*m64p_dest++) : : "mm3", "memory");
+ : "=m" (*m64p_dest++) : : "mm3", "memory");
asm volatile ("movq %%mm4, %0\n\t"
- : "=m" (*m64p_dest++) : : "mm4", "memory");
+ : "=m" (*m64p_dest++) : : "mm4", "memory");
asm volatile ("movq %%mm5, %0\n\t"
- : "=m" (*m64p_dest++) : : "mm5", "memory");
+ : "=m" (*m64p_dest++) : : "mm5", "memory");
asm volatile ("movq %%mm6, %0\n\t"
- : "=m" (*m64p_dest++) : : "mm6", "memory");
+ : "=m" (*m64p_dest++) : : "mm6", "memory");
asm volatile ("movq %%mm7, %0\n\t"
- : "=m" (*m64p_dest++) : : "mm7", "memory");
+ : "=m" (*m64p_dest++) : : "mm7", "memory");
}
- for (i = 0; i < n2; i++)
- {
+ for (i = 0; i < n2; i++) {
asm volatile (
"movq %1, %%mm0\n\t" \
"movq %%mm0, %0\n\t"
@@ -173,8 +171,7 @@ x86_3dnow_copyf (float *dest, const float *src, int length)
: "m" (*m64p_src++)
: "mm0", "memory");
}
- if (length & 0x1)
- {
+ if (length & 0x1) {
asm volatile (
"movd %1, %%mm0\n\t" \
"movd %%mm0, %0\n\t"
@@ -191,23 +188,21 @@ void
x86_3dnow_add2f (float *dest, const float *src, int length)
{
int i, n;
- pv2sf m64p_dest = (pv2sf) dest;
- pv2sf m64p_src = (pv2sf) src;
+ pv2sf m64p_dest = (pv2sf)dest;
+ pv2sf m64p_src = (pv2sf)src;
n = (length >> 1);
- for (i = 0; i < n; i++)
- {
+ for (i = 0; i < n; i++) {
asm volatile (
"movq %1, %%mm0\n\t" \
"pfadd %2, %%mm0\n\t" \
"movq %%mm0, %0\n\t"
: "=m" (m64p_dest[i])
: "m0" (m64p_dest[i]),
- "m" (m64p_src[i])
+ "m" (m64p_src[i])
: "mm0", "memory");
}
- if (n & 0x1)
- {
+ if (n & 0x1) {
asm volatile (
"movd %1, %%mm0\n\t" \
"movd %2, %%mm1\n\t" \
@@ -215,7 +210,7 @@ x86_3dnow_add2f (float *dest, const float *src, int length)
"movd %%mm0, %0\n\t"
: "=m" (dest[length - 1])
: "m0" (dest[length - 1]),
- "m" (src[length - 1])
+ "m" (src[length - 1])
: "mm0", "mm1", "memory");
}
asm volatile (
@@ -227,59 +222,56 @@ void
x86_sse_copyf (float *dest, const float *src, int length)
{
int i, n1, n2, si3;
- pv4sf m128p_src = (pv4sf) src;
- pv4sf m128p_dest = (pv4sf) dest;
+ pv4sf m128p_src = (pv4sf)src;
+ pv4sf m128p_dest = (pv4sf)dest;
n1 = (length >> 5);
n2 = ((length & 0x1f) >> 2);
si3 = (length & ~0x3);
- for (i = 0; i < n1; i++)
- {
+ for (i = 0; i < n1; i++) {
asm volatile ("movaps %0, %%xmm0\n\t"
- : : "m" (*m128p_src++) : "xmm0", "memory");
+ : : "m" (*m128p_src++) : "xmm0", "memory");
asm volatile ("movaps %0, %%xmm1\n\t"
- : : "m" (*m128p_src++) : "xmm1", "memory");
+ : : "m" (*m128p_src++) : "xmm1", "memory");
asm volatile ("movaps %0, %%xmm2\n\t"
- : : "m" (*m128p_src++) : "xmm2", "memory");
+ : : "m" (*m128p_src++) : "xmm2", "memory");
asm volatile ("movaps %0, %%xmm3\n\t"
- : : "m" (*m128p_src++) : "xmm3", "memory");
+ : : "m" (*m128p_src++) : "xmm3", "memory");
asm volatile ("movaps %0, %%xmm4\n\t"
- : : "m" (*m128p_src++) : "xmm4", "memory");
+ : : "m" (*m128p_src++) : "xmm4", "memory");
asm volatile ("movaps %0, %%xmm5\n\t"
- : : "m" (*m128p_src++) : "xmm5", "memory");
+ : : "m" (*m128p_src++) : "xmm5", "memory");
asm volatile ("movaps %0, %%xmm6\n\t"
- : : "m" (*m128p_src++) : "xmm6", "memory");
+ : : "m" (*m128p_src++) : "xmm6", "memory");
asm volatile ("movaps %0, %%xmm7\n\t"
- : : "m" (*m128p_src++) : "xmm7", "memory");
+ : : "m" (*m128p_src++) : "xmm7", "memory");
asm volatile ("movaps %%xmm0, %0\n\t"
- : "=m" (*m128p_dest++) : : "xmm0", "memory");
+ : "=m" (*m128p_dest++) : : "xmm0", "memory");
asm volatile ("movaps %%xmm1, %0\n\t"
- : "=m" (*m128p_dest++) : : "xmm1", "memory");
+ : "=m" (*m128p_dest++) : : "xmm1", "memory");
asm volatile ("movaps %%xmm2, %0\n\t"
- : "=m" (*m128p_dest++) : : "xmm2", "memory");
+ : "=m" (*m128p_dest++) : : "xmm2", "memory");
asm volatile ("movaps %%xmm3, %0\n\t"
- : "=m" (*m128p_dest++) : : "xmm3", "memory");
+ : "=m" (*m128p_dest++) : : "xmm3", "memory");
asm volatile ("movaps %%xmm4, %0\n\t"
- : "=m" (*m128p_dest++) : : "xmm4", "memory");
+ : "=m" (*m128p_dest++) : : "xmm4", "memory");
asm volatile ("movaps %%xmm5, %0\n\t"
- : "=m" (*m128p_dest++) : : "xmm5", "memory");
+ : "=m" (*m128p_dest++) : : "xmm5", "memory");
asm volatile ("movaps %%xmm6, %0\n\t"
- : "=m" (*m128p_dest++) : : "xmm6", "memory");
+ : "=m" (*m128p_dest++) : : "xmm6", "memory");
asm volatile ("movaps %%xmm7, %0\n\t"
- : "=m" (*m128p_dest++) : : "xmm7", "memory");
+ : "=m" (*m128p_dest++) : : "xmm7", "memory");
}
- for (i = 0; i < n2; i++)
- {
+ for (i = 0; i < n2; i++) {
asm volatile (
- "movaps %1, %%xmm0\n\t" \
+ "movaps %1, %%xmm0\n\t" \
"movaps %%xmm0, %0\n\t"
: "=m" (*m128p_dest++)
: "m" (*m128p_src++)
: "xmm0", "memory");
}
- for (i = si3; i < length; i++)
- {
+ for (i = si3; i < length; i++) {
asm volatile (
"movss %1, %%xmm0\n\t" \
"movss %%xmm0, %0\n\t"
@@ -293,38 +285,35 @@ void
x86_sse_add2f (float *dest, const float *src, int length)
{
int i, n, si2;
- pv4sf m128p_src = (pv4sf) src;
- pv4sf m128p_dest = (pv4sf) dest;
+ pv4sf m128p_src = (pv4sf)src;
+ pv4sf m128p_dest = (pv4sf)dest;
- if (__builtin_expect(((long) src & 0xf) || ((long) dest & 0xf), 0))
- {
+ if (__builtin_expect (((long)src & 0xf) || ((long)dest & 0xf), 0)) {
/*jack_error("x86_sse_add2f(): non aligned pointers!");*/
si2 = 0;
goto sse_nonalign;
}
si2 = (length & ~0x3);
n = (length >> 2);
- for (i = 0; i < n; i++)
- {
+ for (i = 0; i < n; i++) {
asm volatile (
- "movaps %1, %%xmm0\n\t" \
+ "movaps %1, %%xmm0\n\t" \
"addps %2, %%xmm0\n\t" \
"movaps %%xmm0, %0\n\t"
: "=m" (m128p_dest[i])
: "m0" (m128p_dest[i]),
- "m" (m128p_src[i])
+ "m" (m128p_src[i])
: "xmm0", "memory");
}
sse_nonalign:
- for (i = si2; i < length; i++)
- {
+ for (i = si2; i < length; i++) {
asm volatile (
"movss %1, %%xmm0\n\t" \
"addss %2, %%xmm0\n\t" \
"movss %%xmm0, %0\n\t"
: "=m" (dest[i])
: "m0" (dest[i]),
- "m" (src[i])
+ "m" (src[i])
: "xmm0", "memory");
}
}
@@ -332,29 +321,29 @@ sse_nonalign:
void x86_sse_f2i (int *dest, const float *src, int length, float scale)
{
int i;
- static const float max[4] __attribute__((aligned(16))) =
- { -1.0F, -1.0F, -1.0F, -1.0F };
- static const float min[4] __attribute__((aligned(16))) =
- { 1.0F, 1.0F, 1.0F, 1.0F };
- float s[4] __attribute__((aligned(16)));
+ static const float max[4] __attribute__((aligned (16))) =
+ { -1.0F, -1.0F, -1.0F, -1.0F };
+ static const float min[4] __attribute__((aligned (16))) =
+ { 1.0F, 1.0F, 1.0F, 1.0F };
+ float s[4] __attribute__((aligned (16)));
s[0] = s[1] = s[2] = s[3] = scale;
asm volatile (
- "movaps %0, %%xmm4\n\t" \
- "movaps %1, %%xmm5\n\t" \
+ "movaps %0, %%xmm4\n\t" \
+ "movaps %1, %%xmm5\n\t" \
"movaps %2, %%xmm6\n\t"
:
: "m" (*max),
- "m" (*min),
- "m" (*s)
+ "m" (*min),
+ "m" (*s)
: "xmm4", "xmm5", "xmm6");
- if (__builtin_expect((((long) dest & 0xf) || ((long) src & 0xf)), 0))
+ if (__builtin_expect ((((long)dest & 0xf) || ((long)src & 0xf)), 0)) {
goto sse_nonalign;
- for (i = 0; i < length; i += 4)
- {
+ }
+ for (i = 0; i < length; i += 4) {
asm volatile (
- "movaps %1, %%xmm1\n\t" \
+ "movaps %1, %%xmm1\n\t" \
"maxps %%xmm4, %%xmm1\n\t" \
"minps %%xmm5, %%xmm1\n\t" \
"mulps %%xmm6, %%xmm1\n\t" \
@@ -367,10 +356,9 @@ void x86_sse_f2i (int *dest, const float *src, int length, float scale)
return;
sse_nonalign:
- for (i = 0; i < length; i += 4)
- {
+ for (i = 0; i < length; i += 4) {
asm volatile (
- "movups %1, %%xmm1\n\t" \
+ "movups %1, %%xmm1\n\t" \
"maxps %%xmm4, %%xmm1\n\t" \
"minps %%xmm5, %%xmm1\n\t" \
"mulps %%xmm6, %%xmm1\n\t" \
@@ -386,7 +374,7 @@ sse_nonalign:
void x86_sse_i2f (float *dest, const int *src, int length, float scale)
{
int i;
- float s[4] __attribute__((aligned(16)));
+ float s[4] __attribute__((aligned (16)));
s[0] = s[1] = s[2] = s[3] = scale;
asm volatile (
@@ -395,10 +383,10 @@ void x86_sse_i2f (float *dest, const int *src, int length, float scale)
: "m" (*s)
: "xmm4" );
- if (__builtin_expect((((long) dest & 0xf) || ((long) src & 0xf)), 0))
- goto sse_nonalign;
- for (i = 0; i < length; i += 4)
- {
+ if (__builtin_expect ((((long)dest & 0xf) || ((long)src & 0xf)), 0)) {
+ goto sse_nonalign;
+ }
+ for (i = 0; i < length; i += 4) {
asm volatile (
"cvtdq2ps %1, %%xmm0\n\t" \
"mulps %%xmm4, %%xmm0\n\t" \
@@ -410,10 +398,9 @@ void x86_sse_i2f (float *dest, const int *src, int length, float scale)
return;
sse_nonalign:
- for (i = 0; i < length; i += 4)
- {
+ for (i = 0; i < length; i += 4) {
asm volatile (
- "movdqu %1, %%xmm1\n\t" \
+ "movdqu %1, %%xmm1\n\t" \
"cvtdq2ps %%xmm1, %%xmm0\n\t" \
"mulps %%xmm4, %%xmm0\n\t" \
"movups %%xmm0, %0\n\t"
@@ -423,7 +410,7 @@ sse_nonalign:
}
}
-#endif /* ARCH_X86 */
+#endif /* ARCH_X86 */
-#endif /* USE_DYNSIMD */
+#endif /* USE_DYNSIMD */
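
Note (not part of the patch): the hunks above only reindent the inline-assembly SIMD helpers; their behaviour is unchanged. For readers comparing that style with compiler intrinsics, here is a minimal sketch of the operation x86_sse_add2f() performs (dest[i] += src[i]) written with the standard SSE intrinsics from <xmmintrin.h>. The function name sse_add2f_sketch is hypothetical and does not exist in libjack/simd.c; the aligned fast path mirrors the patch's "addps" loop and the scalar loop plays the role of the sse_nonalign fallback and tail handling.

#include <stdint.h>
#include <xmmintrin.h>

/* Illustrative sketch only: dest[i] += src[i] for i in [0, length). */
static void
sse_add2f_sketch (float *dest, const float *src, int length)
{
	int i = 0;

	/* Use the vector path only when both pointers are 16-byte aligned,
	 * matching the alignment test in x86_sse_add2f() above. */
	if ((((uintptr_t) dest | (uintptr_t) src) & 0xf) == 0) {
		for (; i + 4 <= length; i += 4) {
			__m128 d = _mm_load_ps (dest + i);   /* aligned load  */
			__m128 s = _mm_load_ps (src + i);    /* aligned load  */
			_mm_store_ps (dest + i, _mm_add_ps (d, s));
		}
	}

	/* Scalar fallback: unaligned buffers and the remaining tail. */
	for (; i < length; i++)
		dest[i] += src[i];
}

As in the original code, a caller would be expected to gate this behind a runtime CPU-feature check (the have_sse() probe shown in the diff, or an equivalent) before dispatching to it.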