From d881cc6791858f67e1bfd6fa306c2184a99f99e9 Mon Sep 17 00:00:00 2001 From: Marti Maria Date: Mon, 25 May 2020 19:44:47 +0200 Subject: Merged SSE matrix-shaper optimizer It makes small difference, but here you go! In my computer, matrix shaper runs times x 2.4 faster the core lcms2 without SSE is about times x 2 --- .../VC2019/lcms2_fast_float_plugin.vcxproj | 1 + .../VC2019/lcms2_fast_float_plugin.vcxproj.filters | 9 +- plugins/fast_float/src/Makefile.am | 2 +- plugins/fast_float/src/fast_8_matsh.c | 67 ++-- plugins/fast_float/src/fast_8_matsh_sse.c | 413 +++++++++++++++++++++ plugins/fast_float/src/fast_float_internal.h | 9 + plugins/fast_float/src/fast_float_sup.c | 3 + plugins/fast_float/testbed/fast_float_testbed.c | 2 +- src/cmsalpha.c | 10 +- 9 files changed, 461 insertions(+), 55 deletions(-) create mode 100644 plugins/fast_float/src/fast_8_matsh_sse.c diff --git a/plugins/fast_float/Projects/VC2019/lcms2_fast_float_plugin.vcxproj b/plugins/fast_float/Projects/VC2019/lcms2_fast_float_plugin.vcxproj index 1dfe131..40e42b9 100644 --- a/plugins/fast_float/Projects/VC2019/lcms2_fast_float_plugin.vcxproj +++ b/plugins/fast_float/Projects/VC2019/lcms2_fast_float_plugin.vcxproj @@ -26,6 +26,7 @@ + diff --git a/plugins/fast_float/Projects/VC2019/lcms2_fast_float_plugin.vcxproj.filters b/plugins/fast_float/Projects/VC2019/lcms2_fast_float_plugin.vcxproj.filters index ca46a2e..c2761d1 100644 --- a/plugins/fast_float/Projects/VC2019/lcms2_fast_float_plugin.vcxproj.filters +++ b/plugins/fast_float/Projects/VC2019/lcms2_fast_float_plugin.vcxproj.filters @@ -45,9 +45,6 @@ Source Files - - Source Files - Source Files @@ -57,6 +54,12 @@ Source Files + + Source Files + + + Source Files + diff --git a/plugins/fast_float/src/Makefile.am b/plugins/fast_float/src/Makefile.am index 4244cf7..9b597f2 100644 --- a/plugins/fast_float/src/Makefile.am +++ b/plugins/fast_float/src/Makefile.am @@ -24,5 +24,5 @@ liblcms2_fast_float_la_LIBADD = $(LCMS_LIB_DEPLIBS) 
$(top_builddir)/src/liblcms2 liblcms2_fast_float_la_SOURCES = \ fast_float_15bits.c fast_float_15mats.c fast_float_curves.c fast_float_matsh.c fast_float_separate.c \ fast_float_sup.c fast_float_tethra.c fast_float_cmyk.c fast_float_internal.h \ - fast_8_curves.c fast_8_matsh.c fast_8_tethra.c + fast_8_curves.c fast_8_matsh.c fast_8_matsh_sse.c fast_8_tethra.c diff --git a/plugins/fast_float/src/fast_8_matsh.c b/plugins/fast_float/src/fast_8_matsh.c index 22e7f2b..6a126f3 100644 --- a/plugins/fast_float/src/fast_8_matsh.c +++ b/plugins/fast_float/src/fast_8_matsh.c @@ -30,10 +30,9 @@ typedef cmsInt32Number cmsS1Fixed14Number; // Note that this may hold more tha // This is the private data container used by this optimization typedef struct { - // This is for SSE2, MUST be aligned at 16 bit boundary + // Alignment makes it faster - cmsFloat32Number fMatrix[4][4]; - cmsFloat32Number fShaper1[256 * 3]; + cmsS1Fixed14Number Mat[4][4]; // n.14 to n.14 (needs a saturation after that) void * real_ptr; @@ -42,10 +41,7 @@ typedef struct { cmsS1Fixed14Number Shaper1R[256]; // from 0..255 to 1.14 (0.0...1.0) cmsS1Fixed14Number Shaper1G[256]; cmsS1Fixed14Number Shaper1B[256]; - - cmsS1Fixed14Number Mat[3][3]; // n.14 to n.14 (needs a saturation after that) - cmsS1Fixed14Number Off[3]; - + cmsUInt8Number Shaper2R[0x4001]; // 1.14 to 0..255 cmsUInt8Number Shaper2G[0x4001]; cmsUInt8Number Shaper2B[0x4001]; @@ -97,20 +93,6 @@ void FillFirstShaper(cmsS1Fixed14Number* Table, cmsToneCurve* Curve) } } -static -void FillFirstShaperFloat(cmsFloat32Number* Table, cmsToneCurve* Curve) -{ - int i; - cmsFloat32Number R; - - for (i=0; i < 256; i++) { - - R = (cmsFloat32Number) (i / 255.0); - - Table[i] = cmsEvalToneCurveFloat(Curve, R); - } -} - // This table converts form 1.14 (being 0x4000 the last entry) to 8 bits after applying the curve static @@ -118,15 +100,17 @@ void FillSecondShaper(cmsUInt8Number* Table, cmsToneCurve* Curve) { int i; cmsFloat32Number R, Val; - cmsUInt16Number w; + 
cmsInt32Number w; for (i=0; i < 0x4001; i++) { - R = (cmsFloat32Number) (i / 16384.0); + R = (cmsFloat32Number) (i / 16384.0f); Val = cmsEvalToneCurveFloat(Curve, R); - w = _cmsSaturateWord(Val * 65535.0 + 0.5); + w = (cmsInt32Number) (Val * 255.0f + 0.5f); + if (w < 0) w = 0; + if (w > 255) w = 255; - Table[i] = FROM_16_TO_8(w); + Table[i] = (cmsInt8Number) w; } } @@ -153,30 +137,22 @@ XMatShaper8Data* SetMatShaper(cmsContext ContextID, cmsToneCurve* Curve1[3], cms FillSecondShaper(p ->Shaper2G, Curve2[1]); FillSecondShaper(p ->Shaper2B, Curve2[2]); - - FillFirstShaperFloat(p ->fShaper1, Curve1[0]); - FillFirstShaperFloat(p ->fShaper1 + 256, Curve1[1]); - FillFirstShaperFloat(p ->fShaper1 + 256*2, Curve1[2]); - + // Convert matrix to nFixed14. Note that those values may take more than 16 bits as for (i=0; i < 3; i++) { for (j=0; j < 3; j++) { - p ->Mat[i][j] = DOUBLE_TO_1FIXED14(Mat->v[i].n[j]); - p ->fMatrix[j][i] = (cmsFloat32Number) Mat ->v[i].n[j]; + p ->Mat[j][i] = DOUBLE_TO_1FIXED14(Mat->v[i].n[j]); } } - - + for (i=0; i < 3; i++) { if (Off == NULL) { - - p ->Off[i] = 0x2000; - p ->fMatrix[3][i] = 0.0f; + + p->Mat[3][i] = DOUBLE_TO_1FIXED14(0.5); } - else { - p ->Off[i] = DOUBLE_TO_1FIXED14(Off->n[i]) + 0x2000; - p ->fMatrix[3][i] = (cmsFloat32Number) Off->n[i]; + else { + p->Mat[3][i] = DOUBLE_TO_1FIXED14(Off->n[i] + 0.5); } } @@ -237,20 +213,19 @@ void MatShaperXform8(struct _cmstransform_struct *CMMcargo, gout = (cmsUInt8Number*)Output + DestStartingOrder[1] + strideOut; bout = (cmsUInt8Number*)Output + DestStartingOrder[2] + strideOut; if (nalpha) - aout = (cmsUInt8Number*)Output + DestStartingOrder[3] + strideOut; - + aout = (cmsUInt8Number*)Output + DestStartingOrder[3] + strideOut; for (ii = 0; ii < PixelsPerLine; ii++) { - + // Across first shaper, which also converts to 1.14 fixed point. 16 bits guaranteed. 
r = p->Shaper1R[*rin]; g = p->Shaper1G[*gin]; b = p->Shaper1B[*bin]; // Evaluate the matrix in 1.14 fixed point - l1 = (p->Mat[0][0] * r + p->Mat[0][1] * g + p->Mat[0][2] * b + p->Off[0]) >> 14; - l2 = (p->Mat[1][0] * r + p->Mat[1][1] * g + p->Mat[1][2] * b + p->Off[1]) >> 14; - l3 = (p->Mat[2][0] * r + p->Mat[2][1] * g + p->Mat[2][2] * b + p->Off[2]) >> 14; + l1 = (p->Mat[0][0] * r + p->Mat[1][0] * g + p->Mat[2][0] * b + p->Mat[3][0]) >> 14; + l2 = (p->Mat[0][1] * r + p->Mat[1][1] * g + p->Mat[2][1] * b + p->Mat[3][1]) >> 14; + l3 = (p->Mat[0][2] * r + p->Mat[1][2] * g + p->Mat[2][2] * b + p->Mat[3][2]) >> 14; // Now we have to clip to 0..1.0 range diff --git a/plugins/fast_float/src/fast_8_matsh_sse.c b/plugins/fast_float/src/fast_8_matsh_sse.c new file mode 100644 index 0000000..e6a9990 --- /dev/null +++ b/plugins/fast_float/src/fast_8_matsh_sse.c @@ -0,0 +1,413 @@ +//--------------------------------------------------------------------------------- +// +// Little Color Management System, fast floating point extensions +// Copyright (c) 1998-2020 Marti Maria Saguer, all rights reserved +// +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . 
+//
+//---------------------------------------------------------------------------------
+
+// Optimization for matrix-shaper in 8 bits using SSE2 intrinsics
+
+#include "fast_float_internal.h"
+
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+
+#include <emmintrin.h>
+
+
+// This is the private data container used by this optimization
+typedef struct {
+
+    // This is for SSE, MUST be aligned at 16 byte boundary: the rows are fetched with
+    // aligned 128-bit loads. Rows 0..2 hold the matrix transposed (one row per input
+    // channel) and row 3 holds the offset. The 4th element of every row is never
+    // written, so it keeps the zero it got from the zero-filled allocation.
+
+    cmsFloat32Number Mat[4][4];
+
+    void * real_ptr;                  // Pointer as returned by the allocator, the one to free
+
+    cmsContext ContextID;
+
+    cmsFloat32Number Shaper1R[256];   // from 0..255 to float, input curve already applied
+    cmsFloat32Number Shaper1G[256];
+    cmsFloat32Number Shaper1B[256];
+
+    cmsUInt8Number Shaper2R[0x4001];  // from 0..0x4000 (0.0...1.0) to 0..255, output curve already applied
+    cmsUInt8Number Shaper2G[0x4001];
+    cmsUInt8Number Shaper2B[0x4001];
+
+} XMatShaper8Data;
+
+
+// Allocates a zero-filled container whose address is a multiple of 16.
+// The block is over-allocated by 32 bytes and the returned pointer is moved forward
+// (by 1 to 16 bytes) to the next 16-byte boundary. The original pointer is kept in
+// real_ptr so free_aligned() can release it. Returns NULL if the allocation fails.
+static
+XMatShaper8Data* malloc_aligned(cmsContext ContextID)
+{
+    cmsUInt8Number* real_ptr = (cmsUInt8Number*) _cmsMallocZero(ContextID, sizeof(XMatShaper8Data) + 32);
+    cmsUInt8Number* aligned;
+    XMatShaper8Data* p;
+
+    // Without this check a failed allocation would be "aligned" to address 16 and written to
+    if (real_ptr == NULL) return NULL;
+
+    aligned = (cmsUInt8Number*) (((uintptr_t)real_ptr + 16) & ~(uintptr_t)0xf);
+    p = (XMatShaper8Data*) aligned;
+
+    p ->real_ptr = real_ptr;
+    return p;
+}
+
+// Releases a container obtained from malloc_aligned(), using the context stored in it
+static
+void free_aligned(XMatShaper8Data* a)
+{
+    _cmsFree(a->ContextID, a->real_ptr);
+}
+
+
+// Free the private data container. The context is taken from the container itself,
+// so the ContextID parameter is not used. A NULL container is ignored.
+static
+void FreeMatShaper(cmsContext ContextID, void* Data)
+{
+    UNUSED_PARAMETER(ContextID);
+
+    if (Data != NULL) free_aligned((XMatShaper8Data*) Data);
+}
+
+
+// This table converts from 8 bits to float after applying the curve.
+// Table must have room for 256 entries; entry i is the curve evaluated at i / 255.
+static
+void FillFirstShaper(cmsFloat32Number* Table, cmsToneCurve* Curve)
+{
+    cmsInt32Number i;
+    cmsFloat32Number R;
+
+    for (i = 0; i < 256; i++) {
+
+        R = (cmsFloat32Number)(i / 255.0);
+        Table[i] = cmsEvalToneCurveFloat(Curve, R);
+    }
+}
+
+
+// This table converts from 1.14 (being 0x4000 the last entry) to 8 bits after applying the curve.
+// Table must have room for 0x4001 entries; entry i is the curve evaluated at i / 16384,
+// scaled to 0..255, rounded and saturated.
+static
+void FillSecondShaper(cmsUInt8Number* Table, cmsToneCurve* Curve)
+{
+    int i;
+    cmsFloat32Number R, Val;
+    cmsInt32Number w;
+
+    for (i=0; i < 0x4001; i++) {
+
+        R = (cmsFloat32Number) (i / 16384.0f);
+        Val = cmsEvalToneCurveFloat(Curve, R);
+        w = (cmsInt32Number) (Val * 255.0f + 0.5f);
+        if (w < 0) w = 0;
+        if (w > 255) w = 255;
+
+        // w is already in 0..255, store it with the type of the table
+        Table[i] = (cmsUInt8Number) w;
+
+    }
+}
+
+// Compute the matrix-shaper structure.
+//   Curve1 - the 3 input curves, tabulated into Shaper1R/G/B
+//   Mat    - the 3x3 matrix, stored transposed in rows 0..2 of the container
+//   Off    - the offset vector, or NULL for no offset (stored as zeros)
+//   Curve2 - the 3 output curves, tabulated into Shaper2R/G/B
+// Returns the new container, to be released with FreeMatShaper(), or NULL if out of memory.
+static
+XMatShaper8Data* SetMatShaper(cmsContext ContextID, cmsToneCurve* Curve1[3], cmsMAT3* Mat, cmsVEC3* Off, cmsToneCurve* Curve2[3])
+{
+    XMatShaper8Data* p;
+    int i, j;
+
+    // Allocate a big chunk of memory to store precomputed tables
+    p = malloc_aligned(ContextID);
+    if (p == NULL) return NULL;
+
+    p -> ContextID = ContextID;
+
+    // Precompute tables
+    FillFirstShaper(p ->Shaper1R, Curve1[0]);
+    FillFirstShaper(p ->Shaper1G, Curve1[1]);
+    FillFirstShaper(p ->Shaper1B, Curve1[2]);
+
+    FillSecondShaper(p ->Shaper2R, Curve2[0]);
+    FillSecondShaper(p ->Shaper2G, Curve2[1]);
+    FillSecondShaper(p ->Shaper2B, Curve2[2]);
+
+
+    // Convert matrix to float. It is stored transposed, so each row of p->Mat holds
+    // the coefficients that multiply one input channel.
+    for (i=0; i < 3; i++) {
+        for (j=0; j < 3; j++) {
+            p ->Mat[j][i] = (cmsFloat32Number) Mat->v[i].n[j];
+        }
+    }
+
+    // Offset goes in the last row
+    for (i=0; i < 3; i++) {
+
+        if (Off == NULL) {
+
+            p->Mat[3][i] = 0.0f;
+        }
+        else {
+            p->Mat[3][i] = (cmsFloat32Number)Off->n[i];
+        }
+    }
+
+
+    return p;
+}
+
+// A fast matrix-shaper evaluator for 8 bits.
+// Every pixel goes across the first shaper (8 bits to float), then the matrix plus
+// offset is evaluated on a SSE register, the result is clipped to 0..1, scaled to
+// 0..0x4000 and used as index in the second shaper, which gives the 8 bits output.
+// The user data of the transform has to be a XMatShaper8Data container.
+static
+void MatShaperXform8SSE(struct _cmstransform_struct *CMMcargo,
+                        const void* Input,
+                        void* Output,
+                        cmsUInt32Number PixelsPerLine,
+                        cmsUInt32Number LineCount,
+                        const cmsStride* Stride)
+{
+    XMatShaper8Data* p = (XMatShaper8Data*) _cmsGetTransformUserData(CMMcargo);
+
+    cmsUInt32Number i, ii;
+
+    cmsUInt32Number SourceStartingOrder[cmsMAXCHANNELS];
+    cmsUInt32Number SourceIncrements[cmsMAXCHANNELS];
+    cmsUInt32Number DestStartingOrder[cmsMAXCHANNELS];
+    cmsUInt32Number DestIncrements[cmsMAXCHANNELS];
+
+    const cmsUInt8Number* rin;
+    const cmsUInt8Number* gin;
+    const cmsUInt8Number* bin;
+    const cmsUInt8Number* ain = NULL;
+
+    cmsUInt8Number* rout;
+    cmsUInt8Number* gout;
+    cmsUInt8Number* bout;
+    cmsUInt8Number* aout = NULL;
+
+    cmsUInt32Number nalpha, strideIn, strideOut;
+
+    __m128 mat0 = _mm_load_ps(p->Mat[0]);
+    __m128 mat1 = _mm_load_ps(p->Mat[1]);
+    __m128 mat2 = _mm_load_ps(p->Mat[2]);
+    __m128 mat3 = _mm_load_ps(p->Mat[3]);
+
+    __m128 zero = _mm_setzero_ps();
+    __m128 one = _mm_set1_ps(1.0f);
+    __m128 scale = _mm_set1_ps((cmsFloat32Number)0x4000);
+
+    __m128 rvector, gvector, bvector;
+
+    // Room for 4 indices, aligned by hand to 16 bytes for the aligned store below
+    cmsUInt8Number buffer[32];
+    cmsUInt32Number* output_index = (cmsUInt32Number*)(((uintptr_t)buffer + 16) & ~(uintptr_t)0xf);
+
+    // Nothing to convert. This also prevents the prefetch below from reading a pixel that is not there
+    if (PixelsPerLine == 0) return;
+
+    _cmsComputeComponentIncrements(cmsGetTransformInputFormat((cmsHTRANSFORM)CMMcargo), Stride->BytesPerPlaneIn, NULL, &nalpha, SourceStartingOrder, SourceIncrements);
+    _cmsComputeComponentIncrements(cmsGetTransformOutputFormat((cmsHTRANSFORM)CMMcargo), Stride->BytesPerPlaneOut, NULL, &nalpha, DestStartingOrder, DestIncrements);
+
+    strideIn = strideOut = 0;
+    for (i = 0; i < LineCount; i++) {
+
+        rin = (const cmsUInt8Number*)Input + SourceStartingOrder[0] + strideIn;
+        gin = (const cmsUInt8Number*)Input + SourceStartingOrder[1] + strideIn;
+        bin = (const cmsUInt8Number*)Input + SourceStartingOrder[2] + strideIn;
+        if (nalpha)
+            ain = (const cmsUInt8Number*)Input + SourceStartingOrder[3] + strideIn;
+
+
+        rout = (cmsUInt8Number*)Output + DestStartingOrder[0] + strideOut;
+        gout = (cmsUInt8Number*)Output + DestStartingOrder[1] + strideOut;
+        bout = (cmsUInt8Number*)Output + DestStartingOrder[2] + strideOut;
+        if (nalpha)
+            aout = (cmsUInt8Number*)Output + DestStartingOrder[3] + strideOut;
+
+        /**
+        * Prefetch the first pixel of the line
+        */
+        rvector = _mm_set1_ps(p->Shaper1R[*rin]);
+        gvector = _mm_set1_ps(p->Shaper1G[*gin]);
+        bvector = _mm_set1_ps(p->Shaper1B[*bin]);
+
+        for (ii = 0; ii < PixelsPerLine; ii++) {
+
+            __m128 el1 = _mm_mul_ps(rvector, mat0);
+            __m128 el2 = _mm_mul_ps(gvector, mat1);
+            __m128 el3 = _mm_mul_ps(bvector, mat2);
+
+            __m128 sum = _mm_add_ps(el1, _mm_add_ps(el2, _mm_add_ps(el3, mat3)));
+
+            __m128 out = _mm_min_ps(_mm_max_ps(sum, zero), one);
+
+            out = _mm_mul_ps(out, scale);
+
+            /**
+            * Rounding and converting to index.
+            * Actually this is a costly instruction that may be blocking performance
+            */
+            _mm_store_si128((__m128i*)output_index, _mm_cvtps_epi32(out));
+
+
+            // Handle alpha
+            if (ain) {
+                *aout = *ain;
+            }
+
+            rin += SourceIncrements[0];
+            gin += SourceIncrements[1];
+            bin += SourceIncrements[2];
+            if (ain) ain += SourceIncrements[3];
+
+            /**
+            * Take next value whilst store is being performed. The test is on the pixel
+            * counter: the last pixel of the line has no successor to read.
+            */
+            if (ii + 1 < PixelsPerLine)
+            {
+                rvector = _mm_set1_ps(p->Shaper1R[*rin]);
+                gvector = _mm_set1_ps(p->Shaper1G[*gin]);
+                bvector = _mm_set1_ps(p->Shaper1B[*bin]);
+            }
+
+            *rout = p->Shaper2R[output_index[0]];
+            *gout = p->Shaper2G[output_index[1]];
+            *bout = p->Shaper2B[output_index[2]];
+
+            rout += DestIncrements[0];
+            gout += DestIncrements[1];
+            bout += DestIncrements[2];
+            if (aout) aout += DestIncrements[3];
+        }
+
+        strideIn += Stride->BytesPerLineIn;
+        strideOut += Stride->BytesPerLineOut;
+    }
+}
+
+
+// Returns TRUE if the CPU reports SSE2 (bit 26 of EDX on CPUID leaf 1).
+// The two-argument __cpuid() is the Microsoft intrinsic; GCC and clang get the
+// same answer from a compiler builtin, which needs no extra header.
+static
+cmsBool IsSSE2Available(void)
+{
+#if defined(__GNUC__) && !defined(_MSC_VER)
+    return __builtin_cpu_supports("sse2") ? TRUE : FALSE;
+#else
+    int cpuinfo[4];
+
+    __cpuid(cpuinfo, 1);
+    return (cpuinfo[3] & (1 << 26)) ? TRUE : FALSE;
+#endif
+}
+
+
+// 8 bits on input allows matrix-shaper boost up a little bit.
+// Looks for a curves-matrix-matrix-curves pipeline on 8 bit formats and collapses both
+// matrices into one. If the product is the identity, the remaining curves are handed to
+// Optimize8ByJoiningCurves; otherwise the SSE evaluator is installed for 3 channels.
+// Returns TRUE if the transform was optimized, in which case the pipeline in *Lut is
+// freed and replaced. Returns FALSE, leaving *Lut as it was, if the optimization does
+// not apply or something fails.
+cmsBool Optimize8MatrixShaperSSE(_cmsTransformFn* TransformFn,
+                                 void** UserData,
+                                 _cmsFreeUserDataFn* FreeUserData,
+                                 cmsPipeline** Lut,
+                                 cmsUInt32Number* InputFormat,
+                                 cmsUInt32Number* OutputFormat,
+                                 cmsUInt32Number* dwFlags)
+{
+    cmsStage* Curve1, *Curve2;
+    cmsStage* Matrix1, *Matrix2;
+    _cmsStageMatrixData* Data1;
+    _cmsStageMatrixData* Data2;
+    cmsMAT3 res;
+    cmsBool IdentityMat = FALSE;
+    cmsBool GrayToGray = FALSE;
+    cmsPipeline* Dest, *Src;
+    cmsContext ContextID;
+    cmsUInt32Number nChans;
+    cmsFloat64Number factor = 1.0;
+
+    // Check for SSE2 support
+    if (!IsSSE2Available()) return FALSE;
+
+    // Only works on RGB to RGB and gray to gray
+    if ( !( (T_CHANNELS(*InputFormat) == 3 && T_CHANNELS(*OutputFormat) == 3) ||
+            (T_CHANNELS(*InputFormat) == 1 && T_CHANNELS(*OutputFormat) == 1) )) return FALSE;
+
+    // Only works on 8 bit input
+    if (T_BYTES(*InputFormat) != 1 || T_BYTES(*OutputFormat) != 1) return FALSE;
+
+    // Seems suitable, proceed
+    Src = *Lut;
+
+    // Check for shaper-matrix-matrix-shaper structure, that is what this optimizer stands for
+    if (!cmsPipelineCheckAndRetreiveStages(Src, 4,
+        cmsSigCurveSetElemType, cmsSigMatrixElemType, cmsSigMatrixElemType, cmsSigCurveSetElemType,
+        &Curve1, &Matrix1, &Matrix2, &Curve2)) return FALSE;
+
+    ContextID = cmsGetPipelineContextID(Src);
+    nChans    = T_CHANNELS(*InputFormat);
+
+    // Get both matrices, which are 3x3
+    Data1 = (_cmsStageMatrixData*) cmsStageData(Matrix1);
+    Data2 = (_cmsStageMatrixData*) cmsStageData(Matrix2);
+
+    // Input offset should be zero
+    if (Data1 ->Offset != NULL) return FALSE;
+
+    if (cmsStageInputChannels(Matrix1) == 1 && cmsStageOutputChannels(Matrix2) == 1)
+    {
+        // This is a gray to gray. Just multiply
+        GrayToGray = TRUE;
+
+        factor = Data1->Double[0]*Data2->Double[0] +
+                 Data1->Double[1]*Data2->Double[1] +
+                 Data1->Double[2]*Data2->Double[2];
+
+        if (fabs(1 - factor) < (1.0 / 65535.0)) IdentityMat = TRUE;
+    }
+    else
+    {
+        // Multiply both matrices to get the result
+        _cmsMAT3per(&res, (cmsMAT3*) Data2 ->Double, (cmsMAT3*) Data1 ->Double);
+
+        // Now the result is in res + Data2 -> Offset. Maybe is a plain identity?
+        IdentityMat = FALSE;
+        if (_cmsMAT3isIdentity(&res) && Data2 ->Offset == NULL) {
+
+            // We can get rid of full matrix
+            IdentityMat = TRUE;
+        }
+    }
+
+    // The SSE evaluator reads 3 channels, 3 curves per shaper and the 3x3 matrix in res.
+    // None of that exists for a gray scaling that is not the identity, so that case is
+    // not taken here.
+    if (!IdentityMat && (GrayToGray || nChans != 3)) return FALSE;
+
+    // Allocate an empty LUT
+    Dest = cmsPipelineAlloc(ContextID, nChans, nChans);
+    if (!Dest) return FALSE;
+
+    // Assemble the new LUT. A stage that could not be created makes the insertion fail.
+    if (!cmsPipelineInsertStage(Dest, cmsAT_BEGIN, cmsStageDup(Curve1))) goto Error;
+
+    if (!IdentityMat) {
+
+        if (!cmsPipelineInsertStage(Dest, cmsAT_END,
+                cmsStageAllocMatrix(ContextID, 3, 3, (const cmsFloat64Number*) &res, Data2 ->Offset))) goto Error;
+    }
+
+    if (!cmsPipelineInsertStage(Dest, cmsAT_END, cmsStageDup(Curve2))) goto Error;
+
+    // If identity on matrix, we can further optimize the curves, so call the join curves routine
+    if (IdentityMat) {
+
+        // NOTE(review): assumes Optimize8ByJoiningCurves leaves the pipeline valid when it returns FALSE - confirm
+        if (!Optimize8ByJoiningCurves(TransformFn, UserData, FreeUserData, &Dest, InputFormat, OutputFormat, dwFlags)) goto Error;
+    }
+    else {
+        _cmsStageToneCurvesData* mpeC1 = (_cmsStageToneCurvesData*) cmsStageData(Curve1);
+        _cmsStageToneCurvesData* mpeC2 = (_cmsStageToneCurvesData*) cmsStageData(Curve2);
+        XMatShaper8Data* Data;
+
+        // Build the tables first, so nothing is touched if memory runs out
+        Data = SetMatShaper(ContextID, mpeC1 ->TheCurves, &res, (cmsVEC3*) Data2 ->Offset, mpeC2->TheCurves);
+        if (Data == NULL) goto Error;
+
+        // In this particular optimization, cache does not help as it takes more time to deal with
+        // the cache than with the pixel handling
+        *dwFlags |= cmsFLAGS_NOCACHE;
+
+        // Setup the optimization routines
+        *UserData = Data;
+        *FreeUserData = FreeMatShaper;
+
+        *TransformFn = (_cmsTransformFn) MatShaperXform8SSE;
+    }
+
+    *dwFlags &= ~cmsFLAGS_CAN_CHANGE_FORMATTER;
+    cmsPipelineFree(Src);
+    *Lut = Dest;
+    return TRUE;
+
+Error:
+    // The original pipeline is still in *Lut, only the new one has to go
+    cmsPipelineFree(Dest);
+    return FALSE;
+}
+
+
diff --git a/plugins/fast_float/src/fast_float_internal.h b/plugins/fast_float/src/fast_float_internal.h
index 8499483..92b377a 100644
--- a/plugins/fast_float/src/fast_float_internal.h
+++ b/plugins/fast_float/src/fast_float_internal.h
@@ -171,6 +171,15 @@ cmsBool Optimize8MatrixShaper(_cmsTransformFn* TransformFn,
cmsUInt32Number* OutputFormat, cmsUInt32Number* dwFlags); +// 8 bits using SSE +cmsBool Optimize8MatrixShaperSSE(_cmsTransformFn* TransformFn, + void** UserData, + _cmsFreeUserDataFn* FreeUserData, + cmsPipeline** Lut, + cmsUInt32Number* InputFormat, + cmsUInt32Number* OutputFormat, + cmsUInt32Number* dwFlags); + cmsBool OptimizeMatrixShaper15(_cmsTransformFn* TransformFn, void** UserData, _cmsFreeUserDataFn* FreeUserData, diff --git a/plugins/fast_float/src/fast_float_sup.c b/plugins/fast_float/src/fast_float_sup.c index 67c4e90..4b25705 100644 --- a/plugins/fast_float/src/fast_float_sup.c +++ b/plugins/fast_float/src/fast_float_sup.c @@ -40,6 +40,9 @@ cmsBool Floating_Point_Transforms_Dispatcher(_cmsTransformFn* TransformFn, // Try to optimize by joining curves if (Optimize8ByJoiningCurves(TransformFn, UserData, FreeUserData, Lut, InputFormat, OutputFormat, dwFlags)) return TRUE; + // Try to use SSE2 to optimize as a set of curves plus a matrix plus a set of curves + if (Optimize8MatrixShaperSSE(TransformFn, UserData, FreeUserData, Lut, InputFormat, OutputFormat, dwFlags)) return TRUE; + // Try to optimize as a set of curves plus a matrix plus a set of curves if (Optimize8MatrixShaper(TransformFn, UserData, FreeUserData, Lut, InputFormat, OutputFormat, dwFlags)) return TRUE; diff --git a/plugins/fast_float/testbed/fast_float_testbed.c b/plugins/fast_float/testbed/fast_float_testbed.c index 1852bf2..ae7fb25 100644 --- a/plugins/fast_float/testbed/fast_float_testbed.c +++ b/plugins/fast_float/testbed/fast_float_testbed.c @@ -1809,7 +1809,7 @@ int main() printf("Installing plug-in ... 
"); cmsPlugin(cmsFastFloatExtensions()); printf("done.\n\n"); - + CheckComputeIncrements(); diff --git a/src/cmsalpha.c b/src/cmsalpha.c index f747bc6..28cf2c5 100644 --- a/src/cmsalpha.c +++ b/src/cmsalpha.c @@ -362,10 +362,12 @@ int FormatterPos(cmsUInt32Number frm) if (b == 4 && T_FLOAT(frm)) return 4; // FLT if (b == 2 && !T_FLOAT(frm)) - if (T_ENDIAN16(frm)) - return 2; // 16SE - else - return 1; // 16 + { + if (T_ENDIAN16(frm)) + return 2; // 16SE + else + return 1; // 16 + } if (b == 1 && !T_FLOAT(frm)) return 0; // 8 return -1; // not recognized -- cgit v1.2.1