Release of fast floating point plug-in to open source

Release of fast floating point plug-in to open source. This plug-in greatly increments the throughput in some situations. It is released under GPL3, which is different from the core library, released under MIT.
author: Marti Maria <marti.maria@littlecms.com> 2020-04-23 09:36:03 +0200
committer: Marti Maria <marti.maria@littlecms.com> 2020-04-23 09:36:03 +0200
commit: aa64fa73c26bfb61b087ac1dab5cf44b53f57b24 (patch)
tree: 1a80398743fbff6d4e17836d90c31463786a5327 /plugins/fast_float/src/fast_float_15mats.c
parent: 0030b066b2eec47da5f987319bd4e8d5e92449fe (diff)
download: lcms2-aa64fa73c26bfb61b087ac1dab5cf44b53f57b24.tar.gz
1 files changed, 353 insertions, 0 deletions
diff --git a/plugins/fast_float/src/fast_float_15mats.c b/plugins/fast_float/src/fast_float_15mats.c
new file mode 100644
index 0000000..3e5d29d
--- /dev/null
+++ b/plugins/fast_float/src/fast_float_15mats.c
@@ -0,0 +1,353 @@
+//---------------------------------------------------------------------------------
+//
+//  Little Color Management System, fast floating point extensions
+//  Copyright (c) 1998-2020 Marti Maria Saguer, all rights reserved
+//
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+// 
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+// 
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+//
+//---------------------------------------------------------------------------------
+
+// Optimization for matrix-shaper in 15 bits. Numbers are operated in 1.15 usigned, 
+
+#include "fast_float_internal.h"
+
+// An storage capable to keep 1.15 signed and some extra precission. 
+// Actually I use 32 bits integer (signed)
+typedef cmsInt32Number cmsS1Fixed15Number;   
+
+// Conversion to fixed. Note we don't use floor to get proper sign roundoff
+#define DOUBLE_TO_1FIXED15(x) ((cmsS1Fixed15Number) ((double) (x) * 0x8000 + 0.5))
+
+// This is the private data container used by this optimization
+typedef struct {
+
+       cmsS1Fixed15Number Mat[3][3];
+       cmsS1Fixed15Number Off[3];
+       
+       // Precalculated tables for first shaper (375 Kb in total of both shapers)
+       cmsUInt16Number Shaper1R[MAX_NODES_IN_CURVE];
+       cmsUInt16Number Shaper1G[MAX_NODES_IN_CURVE];
+       cmsUInt16Number Shaper1B[MAX_NODES_IN_CURVE];
+
+       // Second shaper
+       cmsUInt16Number Shaper2R[MAX_NODES_IN_CURVE];
+       cmsUInt16Number Shaper2G[MAX_NODES_IN_CURVE];
+       cmsUInt16Number Shaper2B[MAX_NODES_IN_CURVE];
+
+       // A flag for fast operation if identity
+       cmsBool IdentityMat;
+
+       // The context
+       cmsContext ContextID;
+
+       // Poits to the raw, unaligned memory
+       void * real_ptr;
+
+
+} XMatShaperData;
+
+// A special malloc that returns memory aligned to DWORD boundary. Aligned memory access is way faster than unaligned
+// reference to the real block is kept for later free
+static XMatShaperData* malloc_aligned(cmsContext ContextID)
+{
+       cmsUInt8Number* real_ptr = (cmsUInt8Number*)_cmsMallocZero(ContextID, sizeof(XMatShaperData) + 32);
+       cmsUInt8Number* aligned = (cmsUInt8Number*)(((uintptr_t)real_ptr + 16) & ~0xf);
+       XMatShaperData* p = (XMatShaperData*)aligned;
+
+       p->real_ptr = real_ptr;
+       p->ContextID = ContextID;
+       return p;
+}
+
+
+// Free the private data container
+static
+void  FreeMatShaper(cmsContext ContextID, void* Data)
+{
+
+       XMatShaperData* p = (XMatShaperData*)Data;
+       if (p != NULL)
+              _cmsFree(ContextID, p->real_ptr);	
+}
+
+
+// This table converts from 8 bits to 1.14 after applying the curve
+static
+void FillShaper(cmsUInt16Number* Table, cmsToneCurve* Curve)
+{
+       int i;
+       cmsFloat32Number R, y;
+
+       for (i = 0; i < MAX_NODES_IN_CURVE; i++) {
+
+              R = (cmsFloat32Number)i / (cmsFloat32Number) (MAX_NODES_IN_CURVE - 1);
+              y = cmsEvalToneCurveFloat(Curve, R);
+
+              Table[i] = (cmsUInt16Number) DOUBLE_TO_1FIXED15(y);
+       }
+}
+
+
+// Compute the matrix-shaper structure
+static
+XMatShaperData* SetMatShaper(cmsContext ContextID, cmsToneCurve* Curve1[3], cmsMAT3* Mat, cmsVEC3* Off, cmsToneCurve* Curve2[3], cmsBool IdentityMat)
+{
+       XMatShaperData* p;
+       int i, j;
+
+       // Allocate a big chuck of memory to store precomputed tables
+       p = malloc_aligned(ContextID);
+       if (p == NULL) return FALSE;
+
+       p->ContextID = ContextID;
+
+       p->IdentityMat = IdentityMat;
+      
+       // Precompute tables
+       FillShaper(p->Shaper1R, Curve1[0]);
+       FillShaper(p->Shaper1G, Curve1[1]);
+       FillShaper(p->Shaper1B, Curve1[2]);
+
+       FillShaper(p->Shaper2R, Curve2[0]);
+       FillShaper(p->Shaper2G, Curve2[1]);
+       FillShaper(p->Shaper2B, Curve2[2]);
+
+       // Convert matrix to nFixed14. Note that those values may take more than 16 bits if negative
+       for (i = 0; i < 3; i++) {
+              for (j = 0; j < 3; j++) {
+
+                     p->Mat[i][j] = DOUBLE_TO_1FIXED15(Mat->v[i].n[j]);
+              }
+       }
+
+
+       for (i = 0; i < 3; i++) {
+
+              if (Off == NULL) {
+
+                     p->Off[i] = 0x4000;
+
+              }
+              else {
+                     p->Off[i] = DOUBLE_TO_1FIXED15(Off->n[i]) + 0x4000;
+
+              }
+       }
+
+
+       return p;
+}
+
+// A fast matrix-shaper evaluator for 15 bits. This is a bit ticky since I'm using 1.15 signed fixed point.
+static
+void MatShaperXform(struct _cmstransform_struct *CMMcargo,
+                    const void* Input,
+                    void* Output,
+                    cmsUInt32Number PixelsPerLine,
+                    cmsUInt32Number LineCount,
+                    const cmsStride* Stride)
+{
+       XMatShaperData* p = (XMatShaperData*)_cmsGetTransformUserData(CMMcargo);
+
+       cmsS1Fixed15Number l1, l2, l3;
+
+       cmsS1Fixed15Number r, g, b;
+       cmsUInt32Number ri, gi, bi;
+       cmsUInt32Number i, ii;
+       cmsUInt32Number SourceStartingOrder[cmsMAXCHANNELS];
+       cmsUInt32Number SourceIncrements[cmsMAXCHANNELS];
+       cmsUInt32Number DestStartingOrder[cmsMAXCHANNELS];
+       cmsUInt32Number DestIncrements[cmsMAXCHANNELS];
+
+       const cmsUInt8Number* rin;
+       const cmsUInt8Number* gin;
+       const cmsUInt8Number* bin;
+       const cmsUInt8Number* ain = NULL;
+
+       cmsUInt8Number* rout;
+       cmsUInt8Number* gout;
+       cmsUInt8Number* bout;
+       cmsUInt8Number* aout = NULL;
+
+       cmsUInt32Number nalpha, strideIn, strideOut;
+
+       _cmsComputeComponentIncrements(cmsGetTransformInputFormat((cmsHTRANSFORM)CMMcargo), Stride->BytesPerPlaneIn, NULL, &nalpha, SourceStartingOrder, SourceIncrements);
+       _cmsComputeComponentIncrements(cmsGetTransformOutputFormat((cmsHTRANSFORM)CMMcargo), Stride->BytesPerPlaneOut, NULL, &nalpha, DestStartingOrder, DestIncrements);
+
+       strideIn = strideOut = 0;
+       for (i = 0; i < LineCount; i++) {
+
+              rin = (const cmsUInt8Number*)Input + SourceStartingOrder[0] + strideIn;
+              gin = (const cmsUInt8Number*)Input + SourceStartingOrder[1] + strideIn;
+              bin = (const cmsUInt8Number*)Input + SourceStartingOrder[2] + strideIn;
+              if (nalpha)
+                     ain = (const cmsUInt8Number*)Input + SourceStartingOrder[3] + strideIn;
+
+
+              rout = (cmsUInt8Number*)Output + DestStartingOrder[0] + strideOut;
+              gout = (cmsUInt8Number*)Output + DestStartingOrder[1] + strideOut;
+              bout = (cmsUInt8Number*)Output + DestStartingOrder[2] + strideOut;
+              if (nalpha)
+                     aout = (cmsUInt8Number*)Output + DestStartingOrder[3] + strideOut;
+
+
+              for (ii = 0; ii < PixelsPerLine; ii++) {
+
+                     // Across first shaper, which also converts to 1.15 fixed point. 
+                     r = p->Shaper1R[*(cmsUInt16Number*)rin];
+                     g = p->Shaper1G[*(cmsUInt16Number*)gin];
+                     b = p->Shaper1B[*(cmsUInt16Number*)bin];
+
+                     if (p->IdentityMat)
+                     {
+                            l1 = r; l2 = g; l3 = b;
+                     }
+                     else
+                     {
+                            // Evaluate the matrix in 1.14 fixed point
+                            l1 = (p->Mat[0][0] * r + p->Mat[0][1] * g + p->Mat[0][2] * b + p->Off[0]) >> 15;
+                            l2 = (p->Mat[1][0] * r + p->Mat[1][1] * g + p->Mat[1][2] * b + p->Off[1]) >> 15;
+                            l3 = (p->Mat[2][0] * r + p->Mat[2][1] * g + p->Mat[2][2] * b + p->Off[2]) >> 15;
+                     }
+
+                     // Now we have to clip to 0..1.0 range 
+                     ri = (l1 < 0) ? 0 : ((l1 > 0x8000) ? 0x8000 : l1);
+                     gi = (l2 < 0) ? 0 : ((l2 > 0x8000) ? 0x8000 : l2);
+                     bi = (l3 < 0) ? 0 : ((l3 > 0x8000) ? 0x8000 : l3);
+
+
+                     // And across second shaper, 
+                     *(cmsUInt16Number*)rout = p->Shaper2R[ri];
+                     *(cmsUInt16Number*)gout = p->Shaper2G[gi];
+                     *(cmsUInt16Number*)bout = p->Shaper2B[bi];
+
+
+                     // Handle alpha
+                     if (ain) {
+                            memmove(aout, ain, 2);                            
+                     }
+
+                     rin += SourceIncrements[0];
+                     gin += SourceIncrements[1];
+                     bin += SourceIncrements[2];
+                     if (ain) ain += SourceIncrements[3];
+
+                     rout += DestIncrements[0];
+                     gout += DestIncrements[1];
+                     bout += DestIncrements[2];
+                     if (aout) aout += DestIncrements[3];
+              }
+
+              strideIn += Stride->BytesPerLineIn;
+              strideOut += Stride->BytesPerLineOut;
+       }
+}
+
+
+
+//  15 bits on input allows matrix-shaper boost up a little bit
+cmsBool OptimizeMatrixShaper15(_cmsTransformFn* TransformFn,
+                                   void** UserData,
+                                   _cmsFreeUserDataFn* FreeUserData,
+                                   cmsPipeline** Lut,
+                                   cmsUInt32Number* InputFormat,
+                                   cmsUInt32Number* OutputFormat,
+                                   cmsUInt32Number* dwFlags)
+{
+       cmsStage* Curve1, *Curve2;
+       cmsStage* Matrix1, *Matrix2;
+       _cmsStageMatrixData* Data1;
+       _cmsStageMatrixData* Data2;
+       cmsMAT3 res;
+       cmsBool IdentityMat = FALSE;
+       cmsPipeline* Dest, *Src;
+       cmsContext ContextID;
+       cmsUInt32Number nChans;
+       
+       // Only works on RGB to RGB and gray
+
+       if (!(T_CHANNELS(*InputFormat) == 3 && T_CHANNELS(*OutputFormat) == 3)) return FALSE;
+
+       // Only works on 15 bit to 15 bit
+       if (T_BYTES(*InputFormat) != 2 || T_BYTES(*OutputFormat) != 2 || 
+              T_BIT15(*InputFormat) == 0 || T_BIT15(*OutputFormat) == 0) return FALSE;
+
+       // Seems suitable, proceed
+       Src = *Lut;
+
+       // Check for shaper-matrix-matrix-shaper structure, that is what this optimizer stands for
+       if (!cmsPipelineCheckAndRetreiveStages(Src, 4,
+              cmsSigCurveSetElemType, cmsSigMatrixElemType, cmsSigMatrixElemType, cmsSigCurveSetElemType,
+              &Curve1, &Matrix1, &Matrix2, &Curve2)) return FALSE;
+
+       ContextID = cmsGetPipelineContextID(Src);
+       nChans = T_CHANNELS(*InputFormat);
+
+       // Get both matrices, which are 3x3
+       Data1 = (_cmsStageMatrixData*)cmsStageData(Matrix1);
+       Data2 = (_cmsStageMatrixData*)cmsStageData(Matrix2);
+
+       // Input offset should be zero
+       if (Data1->Offset != NULL) return FALSE;
+
+       // Multiply both matrices to get the result
+       _cmsMAT3per(&res, (cmsMAT3*)Data2->Double, (cmsMAT3*)Data1->Double);
+
+       // Now the result is in res + Data2 -> Offset. Maybe is a plain identity?
+       IdentityMat = FALSE;
+       if (_cmsMAT3isIdentity(&res) && Data2->Offset == NULL) {
+
+              // We can get rid of full matrix
+              IdentityMat = TRUE;
+       }
+
+
+       // Allocate an empty LUT 
+       Dest = cmsPipelineAlloc(ContextID, nChans, nChans);
+       if (!Dest) return FALSE;
+
+       // Assamble the new LUT
+       cmsPipelineInsertStage(Dest, cmsAT_BEGIN, cmsStageDup(Curve1));
+
+       if (!IdentityMat) {
+
+              cmsPipelineInsertStage(Dest, cmsAT_END,
+                     cmsStageAllocMatrix(ContextID, 3, 3, (const cmsFloat64Number*)&res, Data2->Offset));
+       }
+
+       cmsPipelineInsertStage(Dest, cmsAT_END, cmsStageDup(Curve2));
+       
+       {
+              _cmsStageToneCurvesData* mpeC1 = (_cmsStageToneCurvesData*)cmsStageData(Curve1);
+              _cmsStageToneCurvesData* mpeC2 = (_cmsStageToneCurvesData*)cmsStageData(Curve2);
+
+              // In this particular optimization, cach� does not help as it takes more time to deal with 
+              // the cach� that with the pixel handling
+              *dwFlags |= cmsFLAGS_NOCACHE;
+
+              // Setup the optimizarion routines
+              *UserData = SetMatShaper(ContextID, mpeC1->TheCurves, &res, (cmsVEC3*)Data2->Offset, mpeC2->TheCurves, IdentityMat);
+              *FreeUserData = FreeMatShaper;
+
+              *TransformFn = (_cmsTransformFn)MatShaperXform;
+       }
+       
+
+       cmsPipelineFree(Src);
+       *Lut = Dest;
+       return TRUE;
+}
+
+
author	Marti Maria <marti.maria@littlecms.com>	2020-04-23 09:36:03 +0200
committer	Marti Maria <marti.maria@littlecms.com>	2020-04-23 09:36:03 +0200
commit	aa64fa73c26bfb61b087ac1dab5cf44b53f57b24 (patch)
tree	1a80398743fbff6d4e17836d90c31463786a5327 /plugins/fast_float/src/fast_float_15mats.c
parent	0030b066b2eec47da5f987319bd4e8d5e92449fe (diff)
download	lcms2-aa64fa73c26bfb61b087ac1dab5cf44b53f57b24.tar.gz