From 018ef8b85510566d031e3b6cec1709d41221da87 Mon Sep 17 00:00:00 2001
From: rsandifo <rsandifo@138bc75d-0d04-0410-961f-82ee72b054a4>
Date: Tue, 13 Dec 2005 05:23:12 +0000
Subject: gcc/fortran/ 	* Make-lang.in (fortran/trans-resolve.o): Depend on 
 fortran/dependency.h. 	* gfortran.h (gfc_expr): Add an
 "inline_noncopying_intrinsic" flag. 	* dependency.h
 (gfc_get_noncopying_intrinsic_argument): Declare. 
 (gfc_check_fncall_dependency): Change prototype. 	* dependency.c
 (gfc_get_noncopying_intrinsic_argument): New function. 
 (gfc_check_argument_var_dependency): New function, split from 
 gfc_check_fncall_dependency. 	(gfc_check_argument_dependency): New function.
 	(gfc_check_fncall_dependency): Replace the expression parameter with 
 separate symbol and argument list parameters.  Generalize the function 
 to handle dependencies for any type of expression, not just variables. 
 Accept a further argument giving the intent of the expression being 
 tested.  Ignore	intent(in) arguments if that expression is also 
 intent(in). 	* resolve.c: Include dependency.h. 
 (find_noncopying_intrinsics): New function. 	(resolve_function,
 resolve_call): Call it on success. 	* trans-array.h
 (gfc_conv_array_transpose): Declare. 	(gfc_check_fncall_dependency): Remove
 prototype. 	* trans-array.c (gfc_conv_array_transpose): New function. 
 * trans-intrinsic.c (gfc_conv_intrinsic_function): Don't use the 
 libcall handling if the expression is to be evaluated inline. 	Add a case for
 handling inline transpose()s. 	* trans-expr.c (gfc_trans_arrayfunc_assign):
 Adjust for the new 	interface provided by gfc_check_fncall_dependency.

libgfortran/
	* m4/matmul.m4: Use a different order in the special case of a
	transposed first argument.
	* generated/matmul_c4.c, generated/matmul_c8.c, generated/matmul_c10.c,
	* generated/matmul_c16.c, generated/matmul_i4.c, generated/matmul_i8.c,
	* generated/matmul_i10.c, generated/matmul_r4.c, generated/matmul_r8.c
	* generated/matmul_r10.c, generated/matmul_r16.c: Regenerated.


git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@108459 138bc75d-0d04-0410-961f-82ee72b054a4
---
 libgfortran/m4/matmul.m4 | 71 ++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 63 insertions(+), 8 deletions(-)

(limited to 'libgfortran/m4/matmul.m4')

diff --git a/libgfortran/m4/matmul.m4 b/libgfortran/m4/matmul.m4
index 730e4d78fd3..f488f5ed38e 100644
--- a/libgfortran/m4/matmul.m4
+++ b/libgfortran/m4/matmul.m4
@@ -37,16 +37,29 @@ include(iparm.m4)dnl
 
 `#if defined (HAVE_'rtype_name`)'
 
-/* This is a C version of the following fortran pseudo-code. The key
-   point is the loop order -- we access all arrays column-first, which
-   improves the performance enough to boost galgel spec score by 50%.
+/* The order of loops is different in the case of plain matrix
+   multiplication C=MATMUL(A,B), and in the frequent special case where
+   the argument A is the temporary result of a TRANSPOSE intrinsic:
+   C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
+   looking at their strides.
+
+   The equivalent Fortran pseudo-code is:
 
    DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
-   C = 0
-   DO J=1,N
-     DO K=1,COUNT
+   IF (.NOT.IS_TRANSPOSED(A)) THEN
+     C = 0
+     DO J=1,N
+       DO K=1,COUNT
+         DO I=1,M
+           C(I,J) = C(I,J)+A(I,K)*B(K,J)
+   ELSE
+     DO J=1,N
        DO I=1,M
-         C(I,J) = C(I,J)+A(I,K)*B(K,J)
+         S = 0
+         DO K=1,COUNT
+           S = S+A(I,K)+B(K,J)
+         C(I,J) = S
+   ENDIF
 */
 
 extern void matmul_`'rtype_code (rtype * const restrict retarray, 
@@ -206,7 +219,28 @@ sinclude(`matmul_asm_'rtype_code`.m4')dnl
 	    }
 	}
     }
-  else
+  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+    {
+      const rtype_name *restrict abase_x;
+      const rtype_name *restrict bbase_y;
+      rtype_name *restrict dest_y;
+      rtype_name s;
+
+      for (y = 0; y < ycount; y++)
+	{
+	  bbase_y = &bbase[y*bystride];
+	  dest_y = &dest[y*rystride];
+	  for (x = 0; x < xcount; x++)
+	    {
+	      abase_x = &abase[x*axstride];
+	      s = (rtype_name) 0;
+	      for (n = 0; n < count; n++)
+		s += abase_x[n] * bbase_y[n];
+	      dest_y[x] = s;
+	    }
+	}
+    }
+  else if (axstride < aystride)
     {
       for (y = 0; y < ycount; y++)
 	for (x = 0; x < xcount; x++)
@@ -218,6 +252,27 @@ sinclude(`matmul_asm_'rtype_code`.m4')dnl
 	    /* dest[x,y] += a[x,n] * b[n,y] */
 	    dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
     }
+  else
+    {
+      const rtype_name *restrict abase_x;
+      const rtype_name *restrict bbase_y;
+      rtype_name *restrict dest_y;
+      rtype_name s;
+
+      for (y = 0; y < ycount; y++)
+	{
+	  bbase_y = &bbase[y*bystride];
+	  dest_y = &dest[y*rystride];
+	  for (x = 0; x < xcount; x++)
+	    {
+	      abase_x = &abase[x*axstride];
+	      s = (rtype_name) 0;
+	      for (n = 0; n < count; n++)
+		s += abase_x[n*aystride] * bbase_y[n*bxstride];
+	      dest_y[x*rxstride] = s;
+	    }
+	}
+    }
 }
 
 #endif
-- 
cgit v1.2.1