summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMarti Maria <marti.maria@littlecms.com>2020-05-25 19:44:47 +0200
committerMarti Maria <marti.maria@littlecms.com>2020-05-25 19:44:47 +0200
commitd881cc6791858f67e1bfd6fa306c2184a99f99e9 (patch)
treeb097bdc641d4e8bec80ecd070ef3c481c5258bd7
parentf9e2e80ab45a91dc99a9ca8eee39aa1986900b64 (diff)
downloadlcms2-d881cc6791858f67e1bfd6fa306c2184a99f99e9.tar.gz
Merged SSE matrix-shaper optimizer
It makes a small difference, but here you go! On my computer, the SSE matrix shaper runs about 2.4x faster than the core lcms2; without SSE the speedup is about 2x.
-rw-r--r--plugins/fast_float/Projects/VC2019/lcms2_fast_float_plugin.vcxproj1
-rw-r--r--plugins/fast_float/Projects/VC2019/lcms2_fast_float_plugin.vcxproj.filters9
-rw-r--r--plugins/fast_float/src/Makefile.am2
-rw-r--r--plugins/fast_float/src/fast_8_matsh.c67
-rw-r--r--plugins/fast_float/src/fast_8_matsh_sse.c413
-rw-r--r--plugins/fast_float/src/fast_float_internal.h9
-rw-r--r--plugins/fast_float/src/fast_float_sup.c3
-rw-r--r--plugins/fast_float/testbed/fast_float_testbed.c2
-rw-r--r--src/cmsalpha.c10
9 files changed, 461 insertions, 55 deletions
diff --git a/plugins/fast_float/Projects/VC2019/lcms2_fast_float_plugin.vcxproj b/plugins/fast_float/Projects/VC2019/lcms2_fast_float_plugin.vcxproj
index 1dfe131..40e42b9 100644
--- a/plugins/fast_float/Projects/VC2019/lcms2_fast_float_plugin.vcxproj
+++ b/plugins/fast_float/Projects/VC2019/lcms2_fast_float_plugin.vcxproj
@@ -26,6 +26,7 @@
<ClCompile Include="..\..\src\fast_16_tethra.c" />
<ClCompile Include="..\..\src\fast_8_curves.c" />
<ClCompile Include="..\..\src\fast_8_matsh.c" />
+ <ClCompile Include="..\..\src\fast_8_matsh_sse.c" />
<ClCompile Include="..\..\src\fast_8_tethra.c" />
<ClCompile Include="..\..\src\fast_float_15bits.c" />
<ClCompile Include="..\..\src\fast_float_15mats.c" />
diff --git a/plugins/fast_float/Projects/VC2019/lcms2_fast_float_plugin.vcxproj.filters b/plugins/fast_float/Projects/VC2019/lcms2_fast_float_plugin.vcxproj.filters
index ca46a2e..c2761d1 100644
--- a/plugins/fast_float/Projects/VC2019/lcms2_fast_float_plugin.vcxproj.filters
+++ b/plugins/fast_float/Projects/VC2019/lcms2_fast_float_plugin.vcxproj.filters
@@ -45,9 +45,6 @@
<ClCompile Include="..\..\src\fast_float_cmyk.c">
<Filter>Source Files</Filter>
</ClCompile>
- <ClCompile Include="..\..\src\fast_8_matsh.c">
- <Filter>Source Files</Filter>
- </ClCompile>
<ClCompile Include="..\..\src\fast_8_curves.c">
<Filter>Source Files</Filter>
</ClCompile>
@@ -57,6 +54,12 @@
<ClCompile Include="..\..\src\fast_16_tethra.c">
<Filter>Source Files</Filter>
</ClCompile>
+ <ClCompile Include="..\..\src\fast_8_matsh_sse.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\fast_8_matsh.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
</ItemGroup>
<ItemGroup>
<None Include="..\..\COPYING.GPL3">
diff --git a/plugins/fast_float/src/Makefile.am b/plugins/fast_float/src/Makefile.am
index 4244cf7..9b597f2 100644
--- a/plugins/fast_float/src/Makefile.am
+++ b/plugins/fast_float/src/Makefile.am
@@ -24,5 +24,5 @@ liblcms2_fast_float_la_LIBADD = $(LCMS_LIB_DEPLIBS) $(top_builddir)/src/liblcms2
liblcms2_fast_float_la_SOURCES = \
fast_float_15bits.c fast_float_15mats.c fast_float_curves.c fast_float_matsh.c fast_float_separate.c \
fast_float_sup.c fast_float_tethra.c fast_float_cmyk.c fast_float_internal.h \
- fast_8_curves.c fast_8_matsh.c fast_8_tethra.c
+ fast_8_curves.c fast_8_matsh.c fast_8_matsh_sse.c fast_8_tethra.c
diff --git a/plugins/fast_float/src/fast_8_matsh.c b/plugins/fast_float/src/fast_8_matsh.c
index 22e7f2b..6a126f3 100644
--- a/plugins/fast_float/src/fast_8_matsh.c
+++ b/plugins/fast_float/src/fast_8_matsh.c
@@ -30,10 +30,9 @@ typedef cmsInt32Number cmsS1Fixed14Number; // Note that this may hold more tha
// This is the private data container used by this optimization
typedef struct {
- // This is for SSE2, MUST be aligned at 16 bit boundary
+ // Alignment makes it faster
- cmsFloat32Number fMatrix[4][4];
- cmsFloat32Number fShaper1[256 * 3];
+ cmsS1Fixed14Number Mat[4][4]; // n.14 to n.14 (needs a saturation after that)
void * real_ptr;
@@ -42,10 +41,7 @@ typedef struct {
cmsS1Fixed14Number Shaper1R[256]; // from 0..255 to 1.14 (0.0...1.0)
cmsS1Fixed14Number Shaper1G[256];
cmsS1Fixed14Number Shaper1B[256];
-
- cmsS1Fixed14Number Mat[3][3]; // n.14 to n.14 (needs a saturation after that)
- cmsS1Fixed14Number Off[3];
-
+
cmsUInt8Number Shaper2R[0x4001]; // 1.14 to 0..255
cmsUInt8Number Shaper2G[0x4001];
cmsUInt8Number Shaper2B[0x4001];
@@ -97,20 +93,6 @@ void FillFirstShaper(cmsS1Fixed14Number* Table, cmsToneCurve* Curve)
}
}
-static
-void FillFirstShaperFloat(cmsFloat32Number* Table, cmsToneCurve* Curve)
-{
- int i;
- cmsFloat32Number R;
-
- for (i=0; i < 256; i++) {
-
- R = (cmsFloat32Number) (i / 255.0);
-
- Table[i] = cmsEvalToneCurveFloat(Curve, R);
- }
-}
-
// This table converts form 1.14 (being 0x4000 the last entry) to 8 bits after applying the curve
static
@@ -118,15 +100,17 @@ void FillSecondShaper(cmsUInt8Number* Table, cmsToneCurve* Curve)
{
int i;
cmsFloat32Number R, Val;
- cmsUInt16Number w;
+ cmsInt32Number w;
for (i=0; i < 0x4001; i++) {
- R = (cmsFloat32Number) (i / 16384.0);
+ R = (cmsFloat32Number) (i / 16384.0f);
Val = cmsEvalToneCurveFloat(Curve, R);
- w = _cmsSaturateWord(Val * 65535.0 + 0.5);
+ w = (cmsInt32Number) (Val * 255.0f + 0.5f);
+ if (w < 0) w = 0;
+ if (w > 255) w = 255;
- Table[i] = FROM_16_TO_8(w);
+ Table[i] = (cmsInt8Number) w;
}
}
@@ -153,30 +137,22 @@ XMatShaper8Data* SetMatShaper(cmsContext ContextID, cmsToneCurve* Curve1[3], cms
FillSecondShaper(p ->Shaper2G, Curve2[1]);
FillSecondShaper(p ->Shaper2B, Curve2[2]);
-
- FillFirstShaperFloat(p ->fShaper1, Curve1[0]);
- FillFirstShaperFloat(p ->fShaper1 + 256, Curve1[1]);
- FillFirstShaperFloat(p ->fShaper1 + 256*2, Curve1[2]);
-
+
// Convert matrix to nFixed14. Note that those values may take more than 16 bits as
for (i=0; i < 3; i++) {
for (j=0; j < 3; j++) {
- p ->Mat[i][j] = DOUBLE_TO_1FIXED14(Mat->v[i].n[j]);
- p ->fMatrix[j][i] = (cmsFloat32Number) Mat ->v[i].n[j];
+ p ->Mat[j][i] = DOUBLE_TO_1FIXED14(Mat->v[i].n[j]);
}
}
-
-
+
for (i=0; i < 3; i++) {
if (Off == NULL) {
-
- p ->Off[i] = 0x2000;
- p ->fMatrix[3][i] = 0.0f;
+
+ p->Mat[3][i] = DOUBLE_TO_1FIXED14(0.5);
}
- else {
- p ->Off[i] = DOUBLE_TO_1FIXED14(Off->n[i]) + 0x2000;
- p ->fMatrix[3][i] = (cmsFloat32Number) Off->n[i];
+ else {
+ p->Mat[3][i] = DOUBLE_TO_1FIXED14(Off->n[i] + 0.5);
}
}
@@ -237,20 +213,19 @@ void MatShaperXform8(struct _cmstransform_struct *CMMcargo,
gout = (cmsUInt8Number*)Output + DestStartingOrder[1] + strideOut;
bout = (cmsUInt8Number*)Output + DestStartingOrder[2] + strideOut;
if (nalpha)
- aout = (cmsUInt8Number*)Output + DestStartingOrder[3] + strideOut;
-
+ aout = (cmsUInt8Number*)Output + DestStartingOrder[3] + strideOut;
for (ii = 0; ii < PixelsPerLine; ii++) {
-
+
// Across first shaper, which also converts to 1.14 fixed point. 16 bits guaranteed.
r = p->Shaper1R[*rin];
g = p->Shaper1G[*gin];
b = p->Shaper1B[*bin];
// Evaluate the matrix in 1.14 fixed point
- l1 = (p->Mat[0][0] * r + p->Mat[0][1] * g + p->Mat[0][2] * b + p->Off[0]) >> 14;
- l2 = (p->Mat[1][0] * r + p->Mat[1][1] * g + p->Mat[1][2] * b + p->Off[1]) >> 14;
- l3 = (p->Mat[2][0] * r + p->Mat[2][1] * g + p->Mat[2][2] * b + p->Off[2]) >> 14;
+ l1 = (p->Mat[0][0] * r + p->Mat[1][0] * g + p->Mat[2][0] * b + p->Mat[3][0]) >> 14;
+ l2 = (p->Mat[0][1] * r + p->Mat[1][1] * g + p->Mat[2][1] * b + p->Mat[3][1]) >> 14;
+ l3 = (p->Mat[0][2] * r + p->Mat[1][2] * g + p->Mat[2][2] * b + p->Mat[3][2]) >> 14;
// Now we have to clip to 0..1.0 range
diff --git a/plugins/fast_float/src/fast_8_matsh_sse.c b/plugins/fast_float/src/fast_8_matsh_sse.c
new file mode 100644
index 0000000..e6a9990
--- /dev/null
+++ b/plugins/fast_float/src/fast_8_matsh_sse.c
@@ -0,0 +1,413 @@
+//---------------------------------------------------------------------------------
+//
+// Little Color Management System, fast floating point extensions
+// Copyright (c) 1998-2020 Marti Maria Saguer, all rights reserved
+//
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+//---------------------------------------------------------------------------------
+
+// Optimization for matrix-shaper in 8 bits using SSE2 intrinsics
+
+#include "fast_float_internal.h"
+
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+
+#include <emmintrin.h>
+
+
+// This is the private data container used by this optimization.
+// Instances are created by malloc_aligned() and released by free_aligned().
+typedef struct {
+
+ // This is for SSE, MUST be aligned at a 16-byte boundary since it is read with _mm_load_ps.
+ // Stored transposed: rows 0..2 hold the matrix columns, row 3 holds the offset.
+
+ cmsFloat32Number Mat[4][4]; // floating point matrix plus offset (result is clamped to 0..1 afterwards)
+
+ void * real_ptr; // pointer as returned by the allocator; this is the one passed to _cmsFree
+
+ cmsContext ContextID; // context the block was allocated on, used again when freeing
+
+ cmsFloat32Number Shaper1R[256]; // from 0..255 to float: input curve evaluated at i / 255
+ cmsFloat32Number Shaper1G[256];
+ cmsFloat32Number Shaper1B[256];
+
+ cmsUInt8Number Shaper2R[0x4001]; // index 0..0x4000 (value scaled by 0x4000) to 0..255 across the output curve
+ cmsUInt8Number Shaper2G[0x4001];
+ cmsUInt8Number Shaper2B[0x4001];
+
+} XMatShaper8Data;
+
+
+// Allocates a zero-filled XMatShaper8Data placed at a 16-byte aligned address, as
+// needed by the aligned SSE loads done on its Mat member. The pointer obtained from
+// the allocator is saved in real_ptr, because that is the one that has to be freed.
+// Returns NULL if the allocation fails.
+static
+XMatShaper8Data* malloc_aligned(cmsContext ContextID)
+{
+    cmsUInt8Number* real_ptr = (cmsUInt8Number*) _cmsMallocZero(ContextID, sizeof(XMatShaper8Data) + 32);
+    cmsUInt8Number* aligned;
+    XMatShaper8Data* p;
+
+    // Without this check a failed allocation would turn into the bogus address 16
+    // and the store on real_ptr below would write through it
+    if (real_ptr == NULL) return NULL;
+
+    // Advance 1..16 bytes up to the next 16-byte boundary; the 32 extra bytes leave room for that
+    aligned = (cmsUInt8Number*) (((uintptr_t)real_ptr + 16) & ~(uintptr_t)0xf);
+    p = (XMatShaper8Data*) aligned;
+
+    p ->real_ptr = real_ptr;
+    return p;
+}
+
+// Releases a container obtained from malloc_aligned(). What is handed back to the
+// allocator is the original pointer kept in real_ptr, not the aligned address.
+static
+void free_aligned(XMatShaper8Data* a)
+{
+    cmsContext ctx = a->ContextID;
+    void* block = a->real_ptr;
+
+    _cmsFree(ctx, block);
+}
+
+
+// Release callback for the private data container (installed as FreeUserData).
+// The ContextID argument is not used, the context stored in the container is.
+// A NULL container is silently ignored.
+static
+void FreeMatShaper(cmsContext ContextID, void* Data)
+{
+    UNUSED_PARAMETER(ContextID);
+
+    if (Data == NULL) return;
+
+    free_aligned((XMatShaper8Data*) Data);
+}
+
+
+// Tabulates the input curve on 8 bits: entry n of the table holds the curve
+// evaluated at n / 255, kept as floating point
+static
+void FillFirstShaper(cmsFloat32Number* Table, cmsToneCurve* Curve)
+{
+    cmsInt32Number n;
+
+    for (n = 0; n < 256; n++) {
+
+        const cmsFloat32Number x = (cmsFloat32Number)(n / 255.0);
+
+        Table[n] = cmsEvalToneCurveFloat(Curve, x);
+    }
+}
+
+
+// This table converts from 1.14 (0x4000 being the last entry) to 8 bits after applying the curve.
+// Entry i holds the curve evaluated at i / 16384, rounded and saturated to 0..255.
+static
+void FillSecondShaper(cmsUInt8Number* Table, cmsToneCurve* Curve)
+{
+    int i;
+    cmsFloat32Number R, Val;
+    cmsInt32Number w;
+
+    for (i=0; i < 0x4001; i++) {
+
+        R = (cmsFloat32Number) (i / 16384.0f);
+        Val = cmsEvalToneCurveFloat(Curve, R);
+
+        // Round to nearest, then saturate to the 8 bits range
+        w = (cmsInt32Number) (Val * 255.0f + 0.5f);
+        if (w < 0) w = 0;
+        if (w > 255) w = 255;
+
+        // The table holds unsigned bytes; going through a signed 8 bits type
+        // would be implementation-defined for values 128..255
+        Table[i] = (cmsUInt8Number) w;
+    }
+}
+
+// Compute the matrix-shaper structure.
+//   Curve1  the three input curves, tabulated into Shaper1R/G/B
+//   Mat     the 3x3 matrix
+//   Off     the offset to add after the matrix, NULL means no offset
+//   Curve2  the three output curves, tabulated into Shaper2R/G/B
+// Returns the new container, or NULL if malloc_aligned() returns NULL.
+static
+XMatShaper8Data* SetMatShaper(cmsContext ContextID, cmsToneCurve* Curve1[3], cmsMAT3* Mat, cmsVEC3* Off, cmsToneCurve* Curve2[3])
+{
+    XMatShaper8Data* p;
+    int i, j;
+
+    // Allocate a big chunk of memory to store precomputed tables
+    p = malloc_aligned(ContextID);
+    if (p == NULL) return NULL;     // this function returns a pointer, not a cmsBool
+
+    p -> ContextID = ContextID;
+
+    // Precompute tables
+    FillFirstShaper(p ->Shaper1R, Curve1[0]);
+    FillFirstShaper(p ->Shaper1G, Curve1[1]);
+    FillFirstShaper(p ->Shaper1B, Curve1[2]);
+
+    FillSecondShaper(p ->Shaper2R, Curve2[0]);
+    FillSecondShaper(p ->Shaper2G, Curve2[1]);
+    FillSecondShaper(p ->Shaper2B, Curve2[2]);
+
+
+    // Convert matrix to float. It is stored transposed, so each row of p->Mat holds the
+    // coefficients that multiply one input channel and can be loaded as one SSE vector.
+    for (i=0; i < 3; i++) {
+        for (j=0; j < 3; j++) {
+            p ->Mat[j][i] = (cmsFloat32Number) Mat->v[i].n[j];
+        }
+    }
+
+    // The offset goes in the fourth row. The fourth column is never written here,
+    // it stays at zero because the container was zero-allocated.
+    for (i=0; i < 3; i++) {
+
+        if (Off == NULL) {
+
+            p->Mat[3][i] = 0.0f;
+        }
+        else {
+            p->Mat[3][i] = (cmsFloat32Number)Off->n[i];
+        }
+    }
+
+
+    return p;
+}
+
+// A fast matrix-shaper evaluator for 8 bits, using SSE.
+//
+// For every pixel the three input bytes go across the first shaper tables, which give
+// floats. Those are multiplied by the matrix columns and added to the offset, the sum is
+// clamped to 0..1, scaled to 0..0x4000 and rounded to integer. The three resulting numbers
+// index the second shaper tables, which give the output bytes. If the formats carry an
+// extra channel, it is copied from input to output unchanged.
+//
+//   CMMcargo       the transform; its user data is the XMatShaper8Data container
+//   Input, Output  pixel buffers
+//   PixelsPerLine  pixels to process on each line
+//   LineCount      number of lines
+//   Stride         distance in bytes between lines and between planes
+static
+void MatShaperXform8SSE(struct _cmstransform_struct *CMMcargo,
+                        const void* Input,
+                        void* Output,
+                        cmsUInt32Number PixelsPerLine,
+                        cmsUInt32Number LineCount,
+                        const cmsStride* Stride)
+{
+    XMatShaper8Data* p = (XMatShaper8Data*) _cmsGetTransformUserData(CMMcargo);
+
+    cmsUInt32Number i, ii;
+
+    cmsUInt32Number SourceStartingOrder[cmsMAXCHANNELS];
+    cmsUInt32Number SourceIncrements[cmsMAXCHANNELS];
+    cmsUInt32Number DestStartingOrder[cmsMAXCHANNELS];
+    cmsUInt32Number DestIncrements[cmsMAXCHANNELS];
+
+    const cmsUInt8Number* rin;
+    const cmsUInt8Number* gin;
+    const cmsUInt8Number* bin;
+    const cmsUInt8Number* ain = NULL;
+
+    cmsUInt8Number* rout;
+    cmsUInt8Number* gout;
+    cmsUInt8Number* bout;
+    cmsUInt8Number* aout = NULL;
+
+    cmsUInt32Number nalpha, strideIn, strideOut;
+
+    // Matrix columns and offset. Aligned loads: Mat is the first member of a 16-byte aligned container
+    __m128 mat0 = _mm_load_ps(p->Mat[0]);
+    __m128 mat1 = _mm_load_ps(p->Mat[1]);
+    __m128 mat2 = _mm_load_ps(p->Mat[2]);
+    __m128 mat3 = _mm_load_ps(p->Mat[3]);
+
+    __m128 zero = _mm_setzero_ps();
+    __m128 one = _mm_set1_ps(1.0f);
+    __m128 scale = _mm_set1_ps((cmsFloat32Number)0x4000);
+
+    // Scratch area for the four converted indexes, aligned by hand to 16 bytes for _mm_store_si128
+    cmsUInt8Number buffer[32];
+    cmsUInt32Number* output_index = (cmsUInt32Number*)(((uintptr_t)buffer + 16) & ~0xf);
+
+    // Nothing to do on empty lines. This also prevents the prefetch below
+    // from reading a first pixel that does not exist.
+    if (PixelsPerLine == 0) return;
+
+    _cmsComputeComponentIncrements(cmsGetTransformInputFormat((cmsHTRANSFORM)CMMcargo), Stride->BytesPerPlaneIn, NULL, &nalpha, SourceStartingOrder, SourceIncrements);
+    _cmsComputeComponentIncrements(cmsGetTransformOutputFormat((cmsHTRANSFORM)CMMcargo), Stride->BytesPerPlaneOut, NULL, &nalpha, DestStartingOrder, DestIncrements);
+
+    strideIn = strideOut = 0;
+    for (i = 0; i < LineCount; i++) {
+
+        rin = (const cmsUInt8Number*)Input + SourceStartingOrder[0] + strideIn;
+        gin = (const cmsUInt8Number*)Input + SourceStartingOrder[1] + strideIn;
+        bin = (const cmsUInt8Number*)Input + SourceStartingOrder[2] + strideIn;
+        if (nalpha)
+            ain = (const cmsUInt8Number*)Input + SourceStartingOrder[3] + strideIn;
+
+
+        rout = (cmsUInt8Number*)Output + DestStartingOrder[0] + strideOut;
+        gout = (cmsUInt8Number*)Output + DestStartingOrder[1] + strideOut;
+        bout = (cmsUInt8Number*)Output + DestStartingOrder[2] + strideOut;
+        if (nalpha)
+            aout = (cmsUInt8Number*)Output + DestStartingOrder[3] + strideOut;
+
+        /**
+         * Prefetch the first pixel of the line
+         */
+        __m128 rvector = _mm_set1_ps(p->Shaper1R[*rin]);
+        __m128 gvector = _mm_set1_ps(p->Shaper1G[*gin]);
+        __m128 bvector = _mm_set1_ps(p->Shaper1B[*bin]);
+
+        for (ii = 0; ii < PixelsPerLine; ii++) {
+
+            __m128 el1 = _mm_mul_ps(rvector, mat0);
+            __m128 el2 = _mm_mul_ps(gvector, mat1);
+            __m128 el3 = _mm_mul_ps(bvector, mat2);
+
+            __m128 sum = _mm_add_ps(el1, _mm_add_ps(el2, _mm_add_ps(el3, mat3)));
+
+            // Clip to 0..1.0 range, then scale to the index range of the second shaper
+            __m128 out = _mm_min_ps(_mm_max_ps(sum, zero), one);
+
+            out = _mm_mul_ps(out, scale);
+
+            /**
+             * Rounding and converting to index.
+             * Actually this is a costly instruction that may be blocking performance
+             */
+            _mm_store_si128((__m128i*)output_index, _mm_cvtps_epi32(out));
+
+
+            // Handle alpha
+            if (ain) {
+                *aout = *ain;
+            }
+
+            rin += SourceIncrements[0];
+            gin += SourceIncrements[1];
+            bin += SourceIncrements[2];
+            if (ain) ain += SourceIncrements[3];
+
+            /**
+             * Take next value whilst store is being performed.
+             * The guard has to be on the pixel counter (ii), not on the line counter:
+             * the next pixel is fetched for every pixel but the last one of the line,
+             * so the read never goes beyond the end of the line.
+             */
+            if (ii + 1 < PixelsPerLine)
+            {
+                rvector = _mm_set1_ps(p->Shaper1R[*rin]);
+                gvector = _mm_set1_ps(p->Shaper1G[*gin]);
+                bvector = _mm_set1_ps(p->Shaper1B[*bin]);
+            }
+
+            // Across second shaper, which also converts to 8 bits
+            *rout = p->Shaper2R[output_index[0]];
+            *gout = p->Shaper2G[output_index[1]];
+            *bout = p->Shaper2B[output_index[2]];
+
+            rout += DestIncrements[0];
+            gout += DestIncrements[1];
+            bout += DestIncrements[2];
+            if (aout) aout += DestIncrements[3];
+        }
+
+        strideIn += Stride->BytesPerLineIn;
+        strideOut += Stride->BytesPerLineOut;
+    }
+}
+
+
+
+// Tells whether the CPU this code is running on supports SSE2
+static
+cmsBool IsSSE2Available(void)
+{
+#ifdef _MSC_VER
+    int cpuinfo[4];
+
+    // CPUID leaf 1: bit 26 of EDX is the SSE2 feature flag
+    __cpuid(cpuinfo, 1);
+    return (cpuinfo[3] & (1 << 26)) ? TRUE : FALSE;
+#else
+    // The MSVC flavor of __cpuid is not available on GCC / clang, use the compiler builtin instead
+    return __builtin_cpu_supports("sse2") ? TRUE : FALSE;
+#endif
+}
+
+// 8 bits on input allows matrix-shaper boost up a little bit.
+//
+// Recognizes a curves - matrix - matrix - curves pipeline working on 8 bits, collapses both
+// matrices into one and installs MatShaperXform8SSE as the transform routine. If the resulting
+// matrix is an identity, the matrix is dropped and the curves are handed to the join curves
+// optimization instead.
+//
+//   TransformFn    receives the transform routine
+//   UserData       receives the private data of the transform routine
+//   FreeUserData   receives the function that releases UserData
+//   Lut            pipeline to optimize; replaced (and the old one freed) on success
+//   InputFormat, OutputFormat   formatters of the transform
+//   dwFlags        transform flags; cmsFLAGS_NOCACHE may be added, cmsFLAGS_CAN_CHANGE_FORMATTER is removed
+//
+// Returns TRUE if the optimization was applied. Returns FALSE, leaving *Lut untouched,
+// if it does not apply or if memory could not be allocated.
+cmsBool Optimize8MatrixShaperSSE(_cmsTransformFn* TransformFn,
+                                  void** UserData,
+                                  _cmsFreeUserDataFn* FreeUserData,
+                                  cmsPipeline** Lut,
+                                  cmsUInt32Number* InputFormat,
+                                  cmsUInt32Number* OutputFormat,
+                                  cmsUInt32Number* dwFlags)
+{
+    cmsStage* Curve1, *Curve2;
+    cmsStage* Matrix1, *Matrix2;
+    _cmsStageMatrixData* Data1;
+    _cmsStageMatrixData* Data2;
+    cmsMAT3 res;
+    cmsBool IdentityMat = FALSE;
+    cmsPipeline* Dest, *Src;
+    cmsContext ContextID;
+    cmsUInt32Number nChans;
+    cmsFloat64Number factor = 1.0;
+
+    // Check for SSE2 support
+    if (!IsSSE2Available()) return FALSE;
+
+    // Only works on RGB to RGB and gray to gray
+    if ( !( (T_CHANNELS(*InputFormat) == 3 && T_CHANNELS(*OutputFormat) == 3) ||
+            (T_CHANNELS(*InputFormat) == 1 && T_CHANNELS(*OutputFormat) == 1) )) return FALSE;
+
+    // Only works on 8 bit input
+    if (T_BYTES(*InputFormat) != 1 || T_BYTES(*OutputFormat) != 1) return FALSE;
+
+    // Seems suitable, proceed
+    Src = *Lut;
+
+    // Check for shaper-matrix-matrix-shaper structure, that is what this optimizer stands for
+    if (!cmsPipelineCheckAndRetreiveStages(Src, 4,
+        cmsSigCurveSetElemType, cmsSigMatrixElemType, cmsSigMatrixElemType, cmsSigCurveSetElemType,
+        &Curve1, &Matrix1, &Matrix2, &Curve2)) return FALSE;
+
+    ContextID = cmsGetPipelineContextID(Src);
+    nChans = T_CHANNELS(*InputFormat);
+
+    // Get both matrices, which are 3x3
+    Data1 = (_cmsStageMatrixData*) cmsStageData(Matrix1);
+    Data2 = (_cmsStageMatrixData*) cmsStageData(Matrix2);
+
+    // Input offset should be zero
+    if (Data1 ->Offset != NULL) return FALSE;
+
+    if (cmsStageInputChannels(Matrix1) == 1 && cmsStageOutputChannels(Matrix2) == 1)
+    {
+        // This is a gray to gray. Just multiply
+        factor = Data1->Double[0]*Data2->Double[0] +
+                 Data1->Double[1]*Data2->Double[1] +
+                 Data1->Double[2]*Data2->Double[2];
+
+        if (fabs(1 - factor) < (1.0 / 65535.0)) IdentityMat = TRUE;
+    }
+    else
+    {
+        // Multiply both matrices to get the result
+        _cmsMAT3per(&res, (cmsMAT3*) Data2 ->Double, (cmsMAT3*) Data1 ->Double);
+
+        // Now the result is in res + Data2 -> Offset. Maybe is a plain identity?
+        IdentityMat = FALSE;
+        if (_cmsMAT3isIdentity(&res) && Data2 ->Offset == NULL) {
+
+            // We can get rid of full matrix
+            IdentityMat = TRUE;
+        }
+    }
+
+    // The SSE evaluator and its tables deal with 3 channels only: it needs three curves on
+    // each side and the 3x3 matrix in res, which is not computed on gray. So a gray pipeline
+    // can only be taken when the matrix goes away and the curves get joined.
+    if (!IdentityMat && nChans != 3) return FALSE;
+
+    // Allocate an empty LUT
+    Dest = cmsPipelineAlloc(ContextID, nChans, nChans);
+    if (!Dest) return FALSE;
+
+    // Assemble the new LUT
+    if (!cmsPipelineInsertStage(Dest, cmsAT_BEGIN, cmsStageDup(Curve1)))
+        goto Error;
+
+    if (!IdentityMat) {
+
+        if (!cmsPipelineInsertStage(Dest, cmsAT_END,
+                cmsStageAllocMatrix(ContextID, 3, 3, (const cmsFloat64Number*) &res, Data2 ->Offset)))
+            goto Error;
+    }
+
+    if (!cmsPipelineInsertStage(Dest, cmsAT_END, cmsStageDup(Curve2)))
+        goto Error;
+
+    // If identity on matrix, we can further optimize the curves, so call the join curves routine
+    if (IdentityMat) {
+
+        // NOTE(review): the result of this call is not checked. If it can fail without setting
+        // TransformFn, TRUE is still returned below -- confirm against Optimize8ByJoiningCurves.
+        Optimize8ByJoiningCurves(TransformFn, UserData, FreeUserData, &Dest, InputFormat, OutputFormat, dwFlags);
+    }
+    else {
+        _cmsStageToneCurvesData* mpeC1 = (_cmsStageToneCurvesData*) cmsStageData(Curve1);
+        _cmsStageToneCurvesData* mpeC2 = (_cmsStageToneCurvesData*) cmsStageData(Curve2);
+
+        // Build the tables first, so nothing of the caller is modified if this fails
+        XMatShaper8Data* Shaper = SetMatShaper(ContextID, mpeC1 ->TheCurves, &res, (cmsVEC3*) Data2 ->Offset, mpeC2->TheCurves);
+        if (Shaper == NULL) goto Error;
+
+        // In this particular optimization, cache does not help as it takes more time to deal with
+        // the cache than with the pixel handling
+        *dwFlags |= cmsFLAGS_NOCACHE;
+
+        // Setup the optimization routines
+        *UserData = Shaper;
+        *FreeUserData = FreeMatShaper;
+
+        *TransformFn = (_cmsTransformFn) MatShaperXform8SSE;
+    }
+
+    *dwFlags &= ~cmsFLAGS_CAN_CHANGE_FORMATTER;
+    cmsPipelineFree(Src);
+    *Lut = Dest;
+    return TRUE;
+
+Error:
+    // The original pipeline is still in place, just discard what was built so far
+    cmsPipelineFree(Dest);
+    return FALSE;
+}
+
+
diff --git a/plugins/fast_float/src/fast_float_internal.h b/plugins/fast_float/src/fast_float_internal.h
index 8499483..92b377a 100644
--- a/plugins/fast_float/src/fast_float_internal.h
+++ b/plugins/fast_float/src/fast_float_internal.h
@@ -171,6 +171,15 @@ cmsBool Optimize8MatrixShaper(_cmsTransformFn* TransformFn,
cmsUInt32Number* OutputFormat,
cmsUInt32Number* dwFlags);
+// Matrix-shaper optimization for 8 bits using SSE2, implemented in fast_8_matsh_sse.c.
+// Returns TRUE if the pipeline in *Lut has been replaced by an optimized one,
+// FALSE if this optimization does not apply (in which case *Lut is left as is).
+cmsBool Optimize8MatrixShaperSSE(_cmsTransformFn* TransformFn,
void** UserData,
_cmsFreeUserDataFn* FreeUserData,
cmsPipeline** Lut,
cmsUInt32Number* InputFormat,
cmsUInt32Number* OutputFormat,
cmsUInt32Number* dwFlags);
+
cmsBool OptimizeMatrixShaper15(_cmsTransformFn* TransformFn,
void** UserData,
_cmsFreeUserDataFn* FreeUserData,
diff --git a/plugins/fast_float/src/fast_float_sup.c b/plugins/fast_float/src/fast_float_sup.c
index 67c4e90..4b25705 100644
--- a/plugins/fast_float/src/fast_float_sup.c
+++ b/plugins/fast_float/src/fast_float_sup.c
@@ -40,6 +40,9 @@ cmsBool Floating_Point_Transforms_Dispatcher(_cmsTransformFn* TransformFn,
// Try to optimize by joining curves
if (Optimize8ByJoiningCurves(TransformFn, UserData, FreeUserData, Lut, InputFormat, OutputFormat, dwFlags)) return TRUE;
+ // Try to use SSE2 to optimize as a set of curves plus a matrix plus a set of curves
+ if (Optimize8MatrixShaperSSE(TransformFn, UserData, FreeUserData, Lut, InputFormat, OutputFormat, dwFlags)) return TRUE;
+
// Try to optimize as a set of curves plus a matrix plus a set of curves
if (Optimize8MatrixShaper(TransformFn, UserData, FreeUserData, Lut, InputFormat, OutputFormat, dwFlags)) return TRUE;
diff --git a/plugins/fast_float/testbed/fast_float_testbed.c b/plugins/fast_float/testbed/fast_float_testbed.c
index 1852bf2..ae7fb25 100644
--- a/plugins/fast_float/testbed/fast_float_testbed.c
+++ b/plugins/fast_float/testbed/fast_float_testbed.c
@@ -1809,7 +1809,7 @@ int main()
printf("Installing plug-in ... ");
cmsPlugin(cmsFastFloatExtensions());
printf("done.\n\n");
-
+
CheckComputeIncrements();
diff --git a/src/cmsalpha.c b/src/cmsalpha.c
index f747bc6..28cf2c5 100644
--- a/src/cmsalpha.c
+++ b/src/cmsalpha.c
@@ -362,10 +362,12 @@ int FormatterPos(cmsUInt32Number frm)
if (b == 4 && T_FLOAT(frm))
return 4; // FLT
if (b == 2 && !T_FLOAT(frm))
- if (T_ENDIAN16(frm))
- return 2; // 16SE
- else
- return 1; // 16
+ {
+ if (T_ENDIAN16(frm))
+ return 2; // 16SE
+ else
+ return 1; // 16
+ }
if (b == 1 && !T_FLOAT(frm))
return 0; // 8
return -1; // not recognized