summaryrefslogtreecommitdiff
path: root/plugins/fast_float/src/fast_float_15mats.c
diff options
context:
space:
mode:
authorMarti Maria <marti.maria@littlecms.com>2020-04-23 09:36:03 +0200
committerMarti Maria <marti.maria@littlecms.com>2020-04-23 09:36:03 +0200
commitaa64fa73c26bfb61b087ac1dab5cf44b53f57b24 (patch)
tree1a80398743fbff6d4e17836d90c31463786a5327 /plugins/fast_float/src/fast_float_15mats.c
parent0030b066b2eec47da5f987319bd4e8d5e92449fe (diff)
downloadlcms2-aa64fa73c26bfb61b087ac1dab5cf44b53f57b24.tar.gz
Release of fast floating point plug-in to open source
Release of fast floating point plug-in to open source. This plug-in greatly increases the throughput in some situations. It is released under GPL3, which is different from the core library, released under MIT.
Diffstat (limited to 'plugins/fast_float/src/fast_float_15mats.c')
-rw-r--r--plugins/fast_float/src/fast_float_15mats.c353
1 files changed, 353 insertions, 0 deletions
diff --git a/plugins/fast_float/src/fast_float_15mats.c b/plugins/fast_float/src/fast_float_15mats.c
new file mode 100644
index 0000000..3e5d29d
--- /dev/null
+++ b/plugins/fast_float/src/fast_float_15mats.c
@@ -0,0 +1,353 @@
+//---------------------------------------------------------------------------------
+//
+// Little Color Management System, fast floating point extensions
+// Copyright (c) 1998-2020 Marti Maria Saguer, all rights reserved
+//
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+//---------------------------------------------------------------------------------
+
+// Optimization for matrix-shaper in 15 bits. Numbers are operated in 1.15 unsigned fixed point.
+
+#include "fast_float_internal.h"
+
+// A storage type able to keep 1.15 signed fixed point values plus some extra precision.
+// It is actually a signed 32 bit integer.
+typedef cmsInt32Number cmsS1Fixed15Number;
+
+// Conversion to fixed point: scales by 0x8000 (which stands for 1.0), adds 0.5 and lets the cast truncate. Note that floor() is deliberately not used.
+#define DOUBLE_TO_1FIXED15(x) ((cmsS1Fixed15Number) ((double) (x) * 0x8000 + 0.5))
+
+// This is the private data container used by this optimization. It is filled by SetMatShaper and read by the transform routine.
+typedef struct {
+ // Matrix coefficients and per-row additive terms, in fixed point as stored by SetMatShaper
+ cmsS1Fixed15Number Mat[3][3];
+ cmsS1Fixed15Number Off[3];
+
+ // Precalculated tables for first shaper, one per channel (375 Kb in total of both shapers)
+ cmsUInt16Number Shaper1R[MAX_NODES_IN_CURVE];
+ cmsUInt16Number Shaper1G[MAX_NODES_IN_CURVE];
+ cmsUInt16Number Shaper1B[MAX_NODES_IN_CURVE];
+
+ // Second shaper, one table per channel
+ cmsUInt16Number Shaper2R[MAX_NODES_IN_CURVE];
+ cmsUInt16Number Shaper2G[MAX_NODES_IN_CURVE];
+ cmsUInt16Number Shaper2B[MAX_NODES_IN_CURVE];
+
+ // A flag for fast operation if identity: when set, the evaluator skips the matrix
+ cmsBool IdentityMat;
+
+ // The context the block was allocated with
+ cmsContext ContextID;
+
+ // Points to the raw, unaligned memory block; this is the pointer that has to be freed
+ void * real_ptr;
+
+
+} XMatShaperData;
+
+// A special malloc that returns zeroed memory aligned to a 16-byte boundary. Aligned memory access is
+// way faster than unaligned. A reference to the real block is kept in the structure for later free.
+// Returns NULL if the underlying allocation fails.
+static XMatShaperData* malloc_aligned(cmsContext ContextID)
+{
+    // The 32 extra bytes leave room to move the start up to the next 16-byte boundary
+    cmsUInt8Number* real_ptr = (cmsUInt8Number*)_cmsMallocZero(ContextID, sizeof(XMatShaperData) + 32);
+    cmsUInt8Number* aligned;
+    XMatShaperData* p;
+
+    // Without this check the rounding below would turn NULL into a bogus non-NULL
+    // address, which is then written to and can never be detected by the caller
+    if (real_ptr == NULL) return NULL;
+
+    aligned = (cmsUInt8Number*)(((uintptr_t)real_ptr + 16) & ~(uintptr_t)0xf);
+    p = (XMatShaperData*)aligned;
+
+    p->real_ptr = real_ptr;
+    p->ContextID = ContextID;
+    return p;
+}
+
+
+// Releases the private data container. The memory is given back through the raw pointer
+// kept inside the container, because the container itself sits at an aligned offset of it.
+// A NULL container is silently ignored.
+static
+void FreeMatShaper(cmsContext ContextID, void* Data)
+{
+    XMatShaperData* container = (XMatShaperData*)Data;
+
+    if (container == NULL)
+        return;
+
+    _cmsFree(ContextID, container->real_ptr);
+}
+
+
+// Fills a table with the curve sampled at MAX_NODES_IN_CURVE evenly spaced points across 0..1.0.
+// Entries are stored as 1.15 fixed point (0x8000 stands for 1.0), saturated to the range an
+// unsigned 16 bit entry can hold.
+static
+void FillShaper(cmsUInt16Number* Table, cmsToneCurve* Curve)
+{
+    // Largest value an entry can hold, expressed in the 1.15 scale
+    const cmsFloat32Number MaxValue = 65535.0f / 32768.0f;
+    int i;
+
+    for (i = 0; i < MAX_NODES_IN_CURVE; i++) {
+
+        cmsFloat32Number R = (cmsFloat32Number)i / (cmsFloat32Number)(MAX_NODES_IN_CURVE - 1);
+        cmsFloat32Number y = cmsEvalToneCurveFloat(Curve, R);
+
+        // Results below zero or beyond the entry range would otherwise wrap around in
+        // the cast to 16 bits and land in the table as unrelated values
+        if (y < 0.0f)
+            y = 0.0f;
+        else if (y > MaxValue)
+            y = MaxValue;
+
+        Table[i] = (cmsUInt16Number) DOUBLE_TO_1FIXED15(y);
+    }
+}
+
+
+// Compute the matrix-shaper structure: both curve sets are sampled into tables and the matrix
+// and offset are converted to fixed point. Off may be NULL, which means no offset at all.
+// Returns NULL if the container cannot be allocated. The result is released by FreeMatShaper.
+static
+XMatShaperData* SetMatShaper(cmsContext ContextID, cmsToneCurve* Curve1[3], cmsMAT3* Mat, cmsVEC3* Off, cmsToneCurve* Curve2[3], cmsBool IdentityMat)
+{
+    XMatShaperData* p;
+    int i, j;
+
+    // Allocate a big chunk of memory to store precomputed tables
+    p = malloc_aligned(ContextID);
+    if (p == NULL) return NULL;
+
+    p->ContextID = ContextID;
+    p->IdentityMat = IdentityMat;
+
+    // Precompute tables
+    FillShaper(p->Shaper1R, Curve1[0]);
+    FillShaper(p->Shaper1G, Curve1[1]);
+    FillShaper(p->Shaper1B, Curve1[2]);
+
+    FillShaper(p->Shaper2R, Curve2[0]);
+    FillShaper(p->Shaper2G, Curve2[1]);
+    FillShaper(p->Shaper2B, Curve2[2]);
+
+    // Convert matrix to 1.15 fixed point. Note that those values may take more than 16 bits if negative
+    for (i = 0; i < 3; i++) {
+        for (j = 0; j < 3; j++) {
+
+            p->Mat[i][j] = DOUBLE_TO_1FIXED15(Mat->v[i].n[j]);
+        }
+    }
+
+    // The evaluator adds Off[] to the sum of coefficient * channel products and then shifts
+    // right by 15. Those products are 1.15 x 1.15, so the offset has to be scaled by a
+    // further 0x8000 to be in the same scale; a plain 1.15 offset would vanish in the shift.
+    // The extra 0x4000 is one half in that scale and rounds the shifted result.
+    for (i = 0; i < 3; i++) {
+
+        cmsS1Fixed15Number o = 0;
+
+        if (Off != NULL) {
+
+            o = DOUBLE_TO_1FIXED15(Off->n[i]);
+
+            // Keep the scaled offset inside 32 bits
+            if (o > 0xFFFF)
+                o = 0xFFFF;
+            else if (o < -0xFFFF)
+                o = -0xFFFF;
+        }
+
+        p->Off[i] = o * 0x8000 + 0x4000;
+    }
+
+    return p;
+}
+
+// A fast matrix-shaper evaluator for 15 bits. This is a bit tricky since I'm using 1.15 signed fixed point.
+// Each pixel goes across the first shaper tables, then across the matrix plus offset (skipped when the
+// matrix is flagged as identity), is clipped to 0..1.0 and finally goes across the second shaper tables.
+// One 16 bit alpha word per pixel is copied verbatim when both input and output formats have extra channels.
+static
+void MatShaperXform(struct _cmstransform_struct *CMMcargo,
+                    const void* Input,
+                    void* Output,
+                    cmsUInt32Number PixelsPerLine,
+                    cmsUInt32Number LineCount,
+                    const cmsStride* Stride)
+{
+    XMatShaperData* p = (XMatShaperData*)_cmsGetTransformUserData(CMMcargo);
+
+    cmsS1Fixed15Number l1, l2, l3;
+
+    cmsS1Fixed15Number r, g, b;
+    cmsUInt32Number ri, gi, bi;
+    cmsUInt32Number i, ii;
+    cmsUInt32Number SourceStartingOrder[cmsMAXCHANNELS];
+    cmsUInt32Number SourceIncrements[cmsMAXCHANNELS];
+    cmsUInt32Number DestStartingOrder[cmsMAXCHANNELS];
+    cmsUInt32Number DestIncrements[cmsMAXCHANNELS];
+
+    const cmsUInt8Number* rin;
+    const cmsUInt8Number* gin;
+    const cmsUInt8Number* bin;
+    const cmsUInt8Number* ain = NULL;
+
+    cmsUInt8Number* rout;
+    cmsUInt8Number* gout;
+    cmsUInt8Number* bout;
+    cmsUInt8Number* aout = NULL;
+
+    cmsUInt32Number nalphaIn, nalphaOut, strideIn, strideOut;
+    cmsBool CopyAlpha;
+
+    // Input and output get their own extra channel count. Sharing one variable would let the
+    // output count overwrite the input one.
+    _cmsComputeComponentIncrements(cmsGetTransformInputFormat((cmsHTRANSFORM)CMMcargo), Stride->BytesPerPlaneIn, NULL, &nalphaIn, SourceStartingOrder, SourceIncrements);
+    _cmsComputeComponentIncrements(cmsGetTransformOutputFormat((cmsHTRANSFORM)CMMcargo), Stride->BytesPerPlaneOut, NULL, &nalphaOut, DestStartingOrder, DestIncrements);
+
+    // Alpha can only be copied if it exists on both sides. The start offset of a channel that a
+    // format does not have is presumably left unset, so it must not be used.
+    CopyAlpha = (nalphaIn != 0 && nalphaOut != 0);
+
+    strideIn = strideOut = 0;
+    for (i = 0; i < LineCount; i++) {
+
+        rin = (const cmsUInt8Number*)Input + SourceStartingOrder[0] + strideIn;
+        gin = (const cmsUInt8Number*)Input + SourceStartingOrder[1] + strideIn;
+        bin = (const cmsUInt8Number*)Input + SourceStartingOrder[2] + strideIn;
+
+        rout = (cmsUInt8Number*)Output + DestStartingOrder[0] + strideOut;
+        gout = (cmsUInt8Number*)Output + DestStartingOrder[1] + strideOut;
+        bout = (cmsUInt8Number*)Output + DestStartingOrder[2] + strideOut;
+
+        if (CopyAlpha) {
+            ain = (const cmsUInt8Number*)Input + SourceStartingOrder[3] + strideIn;
+            aout = (cmsUInt8Number*)Output + DestStartingOrder[3] + strideOut;
+        }
+
+        for (ii = 0; ii < PixelsPerLine; ii++) {
+
+            // Across first shaper, which also converts to 1.15 fixed point.
+            // NOTE(review): the input words are used as table indices without a range check;
+            // this presumably relies on well-formed 15 bit samples (0..0x8000) -- confirm.
+            r = p->Shaper1R[*(cmsUInt16Number*)rin];
+            g = p->Shaper1G[*(cmsUInt16Number*)gin];
+            b = p->Shaper1B[*(cmsUInt16Number*)bin];
+
+            if (p->IdentityMat)
+            {
+                l1 = r; l2 = g; l3 = b;
+            }
+            else
+            {
+                // Evaluate the matrix in fixed point; the shift brings the result back to 1.15.
+                // NOTE(review): the products are accumulated in 32 bits, so rows with large
+                // coefficients might overflow on bright inputs -- confirm the expected range.
+                l1 = (p->Mat[0][0] * r + p->Mat[0][1] * g + p->Mat[0][2] * b + p->Off[0]) >> 15;
+                l2 = (p->Mat[1][0] * r + p->Mat[1][1] * g + p->Mat[1][2] * b + p->Off[1]) >> 15;
+                l3 = (p->Mat[2][0] * r + p->Mat[2][1] * g + p->Mat[2][2] * b + p->Off[2]) >> 15;
+            }
+
+            // Now we have to clip to 0..1.0 range, which also keeps the indices below inside the tables
+            ri = (l1 < 0) ? 0 : ((l1 > 0x8000) ? 0x8000 : l1);
+            gi = (l2 < 0) ? 0 : ((l2 > 0x8000) ? 0x8000 : l2);
+            bi = (l3 < 0) ? 0 : ((l3 > 0x8000) ? 0x8000 : l3);
+
+            // And across second shaper,
+            *(cmsUInt16Number*)rout = p->Shaper2R[ri];
+            *(cmsUInt16Number*)gout = p->Shaper2G[gi];
+            *(cmsUInt16Number*)bout = p->Shaper2B[bi];
+
+            // Handle alpha: one 16 bit word, copied untouched
+            if (CopyAlpha) {
+                memmove(aout, ain, 2);
+                ain += SourceIncrements[3];
+                aout += DestIncrements[3];
+            }
+
+            rin += SourceIncrements[0];
+            gin += SourceIncrements[1];
+            bin += SourceIncrements[2];
+
+            rout += DestIncrements[0];
+            gout += DestIncrements[1];
+            bout += DestIncrements[2];
+        }
+
+        strideIn += Stride->BytesPerLineIn;
+        strideOut += Stride->BytesPerLineOut;
+    }
+}
+
+
+
+// 15 bits on input allows matrix-shaper boost up a little bit.
+// Replaces a curves-matrix-matrix-curves pipeline working from 15 bit RGB to 15 bit RGB by a
+// table driven evaluator. On success the pipeline in *Lut is freed and replaced by an equivalent
+// one with the two matrices merged, *TransformFn, *UserData and *FreeUserData are set, caching is
+// turned off in *dwFlags and TRUE is returned. On any failure FALSE is returned and none of the
+// outputs is touched, so the caller can fall back to another code path.
+cmsBool OptimizeMatrixShaper15(_cmsTransformFn* TransformFn,
+                               void** UserData,
+                               _cmsFreeUserDataFn* FreeUserData,
+                               cmsPipeline** Lut,
+                               cmsUInt32Number* InputFormat,
+                               cmsUInt32Number* OutputFormat,
+                               cmsUInt32Number* dwFlags)
+{
+    cmsStage* Curve1, *Curve2;
+    cmsStage* Matrix1, *Matrix2;
+    _cmsStageMatrixData* Data1;
+    _cmsStageMatrixData* Data2;
+    _cmsStageToneCurvesData* mpeC1;
+    _cmsStageToneCurvesData* mpeC2;
+    XMatShaperData* ShaperData;
+    cmsMAT3 res;
+    cmsBool IdentityMat;
+    cmsPipeline* Dest, *Src;
+    cmsContext ContextID;
+    cmsUInt32Number nChans;
+
+    // Only works on RGB to RGB
+    if (!(T_CHANNELS(*InputFormat) == 3 && T_CHANNELS(*OutputFormat) == 3)) return FALSE;
+
+    // Only works on 15 bit to 15 bit
+    if (T_BYTES(*InputFormat) != 2 || T_BYTES(*OutputFormat) != 2 ||
+        T_BIT15(*InputFormat) == 0 || T_BIT15(*OutputFormat) == 0) return FALSE;
+
+    // Seems suitable, proceed
+    Src = *Lut;
+
+    // Check for shaper-matrix-matrix-shaper structure, that is what this optimizer stands for
+    if (!cmsPipelineCheckAndRetreiveStages(Src, 4,
+        cmsSigCurveSetElemType, cmsSigMatrixElemType, cmsSigMatrixElemType, cmsSigCurveSetElemType,
+        &Curve1, &Matrix1, &Matrix2, &Curve2)) return FALSE;
+
+    // Both matrices are going to be handled as 3x3, so make sure they really are
+    if (cmsStageInputChannels(Matrix1) != 3 || cmsStageOutputChannels(Matrix1) != 3 ||
+        cmsStageInputChannels(Matrix2) != 3 || cmsStageOutputChannels(Matrix2) != 3) return FALSE;
+
+    ContextID = cmsGetPipelineContextID(Src);
+    nChans = T_CHANNELS(*InputFormat);
+
+    // Get both matrices, which are 3x3
+    Data1 = (_cmsStageMatrixData*)cmsStageData(Matrix1);
+    Data2 = (_cmsStageMatrixData*)cmsStageData(Matrix2);
+
+    // Input offset should be zero
+    if (Data1->Offset != NULL) return FALSE;
+
+    // Multiply both matrices to get the result
+    _cmsMAT3per(&res, (cmsMAT3*)Data2->Double, (cmsMAT3*)Data1->Double);
+
+    // Now the result is in res + Data2 -> Offset. Maybe it is a plain identity?
+    // In that case we can get rid of the full matrix
+    IdentityMat = (_cmsMAT3isIdentity(&res) && Data2->Offset == NULL) ? TRUE : FALSE;
+
+    // Allocate an empty LUT
+    Dest = cmsPipelineAlloc(ContextID, nChans, nChans);
+    if (!Dest) return FALSE;
+
+    // Assemble the new LUT. A stage that could not be created makes the insertion fail
+    if (!cmsPipelineInsertStage(Dest, cmsAT_BEGIN, cmsStageDup(Curve1)))
+        goto Error;
+
+    if (!IdentityMat) {
+
+        if (!cmsPipelineInsertStage(Dest, cmsAT_END,
+                cmsStageAllocMatrix(ContextID, 3, 3, (const cmsFloat64Number*)&res, Data2->Offset)))
+            goto Error;
+    }
+
+    if (!cmsPipelineInsertStage(Dest, cmsAT_END, cmsStageDup(Curve2)))
+        goto Error;
+
+    mpeC1 = (_cmsStageToneCurvesData*)cmsStageData(Curve1);
+    mpeC2 = (_cmsStageToneCurvesData*)cmsStageData(Curve2);
+
+    // Setup the optimization routines. Without private data the evaluator cannot work
+    ShaperData = SetMatShaper(ContextID, mpeC1->TheCurves, &res, (cmsVEC3*)Data2->Offset, mpeC2->TheCurves, IdentityMat);
+    if (ShaperData == NULL)
+        goto Error;
+
+    // Everything is in place, so now it is safe to touch the outputs.
+    // In this particular optimization, cache does not help as it takes more time to deal with
+    // the cache than with the pixel handling
+    *dwFlags |= cmsFLAGS_NOCACHE;
+
+    *UserData = ShaperData;
+    *FreeUserData = FreeMatShaper;
+    *TransformFn = (_cmsTransformFn)MatShaperXform;
+
+    cmsPipelineFree(Src);
+    *Lut = Dest;
+    return TRUE;
+
+Error:
+    // The original pipeline is left untouched, only the partially built one is discarded
+    cmsPipelineFree(Dest);
+    return FALSE;
+}
+
+