From aa64fa73c26bfb61b087ac1dab5cf44b53f57b24 Mon Sep 17 00:00:00 2001 From: Marti Maria Date: Thu, 23 Apr 2020 09:36:03 +0200 Subject: Release of fast floating point plug-in to open source Release of fast floating point plug-in to open source. This plug-in greatly increments the throughput in some situations. It is released under GPL3, which is different from the core library, released under MIT. --- plugins/fast_float/src/Makefile.am | 28 ++ plugins/fast_float/src/fast_16_tethra.c | 374 ++++++++++++++++++ plugins/fast_float/src/fast_8_curves.c | 405 +++++++++++++++++++ plugins/fast_float/src/fast_8_matsh.c | 403 +++++++++++++++++++ plugins/fast_float/src/fast_8_tethra.c | 504 ++++++++++++++++++++++++ plugins/fast_float/src/fast_float_15bits.c | 568 +++++++++++++++++++++++++++ plugins/fast_float/src/fast_float_15mats.c | 353 +++++++++++++++++ plugins/fast_float/src/fast_float_cmyk.c | 382 ++++++++++++++++++ plugins/fast_float/src/fast_float_curves.c | 378 ++++++++++++++++++ plugins/fast_float/src/fast_float_internal.h | 237 +++++++++++ plugins/fast_float/src/fast_float_matsh.c | 325 +++++++++++++++ plugins/fast_float/src/fast_float_separate.c | 199 ++++++++++ plugins/fast_float/src/fast_float_sup.c | 89 +++++ plugins/fast_float/src/fast_float_tethra.c | 295 ++++++++++++++ 14 files changed, 4540 insertions(+) create mode 100644 plugins/fast_float/src/Makefile.am create mode 100644 plugins/fast_float/src/fast_16_tethra.c create mode 100644 plugins/fast_float/src/fast_8_curves.c create mode 100644 plugins/fast_float/src/fast_8_matsh.c create mode 100644 plugins/fast_float/src/fast_8_tethra.c create mode 100644 plugins/fast_float/src/fast_float_15bits.c create mode 100644 plugins/fast_float/src/fast_float_15mats.c create mode 100644 plugins/fast_float/src/fast_float_cmyk.c create mode 100644 plugins/fast_float/src/fast_float_curves.c create mode 100644 plugins/fast_float/src/fast_float_internal.h create mode 100644 plugins/fast_float/src/fast_float_matsh.c create mode 
100644 plugins/fast_float/src/fast_float_separate.c create mode 100644 plugins/fast_float/src/fast_float_sup.c create mode 100644 plugins/fast_float/src/fast_float_tethra.c (limited to 'plugins/fast_float/src') diff --git a/plugins/fast_float/src/Makefile.am b/plugins/fast_float/src/Makefile.am new file mode 100644 index 0000000..4244cf7 --- /dev/null +++ b/plugins/fast_float/src/Makefile.am @@ -0,0 +1,28 @@ +# +# Makefile for building lcms2_fast_float library +# + +# Don't require all the GNU mandated files +AUTOMAKE_OPTIONS = 1.7 foreign + +includedir = ${prefix}/include + +# Shared libraries built in this directory +lib_LTLIBRARIES = liblcms2_fast_float.la + +LIBRARY_CURRENT = 1 +LIBRARY_REVISION = 1 +LIBRARY_AGE = 0 + +INCLUDES = -I$(top_builddir)/include -I$(top_srcdir)/include -I$(srcdir)/../include -I$(builddir)/../include + +liblcms2_fast_float_la_LDFLAGS = -no-undefined \ + -version-info $(LIBRARY_CURRENT):$(LIBRARY_REVISION):$(LIBRARY_AGE) + +liblcms2_fast_float_la_LIBADD = $(LCMS_LIB_DEPLIBS) $(top_builddir)/src/liblcms2.la + +liblcms2_fast_float_la_SOURCES = \ + fast_float_15bits.c fast_float_15mats.c fast_float_curves.c fast_float_matsh.c fast_float_separate.c \ + fast_float_sup.c fast_float_tethra.c fast_float_cmyk.c fast_float_internal.h \ + fast_8_curves.c fast_8_matsh.c fast_8_tethra.c + diff --git a/plugins/fast_float/src/fast_16_tethra.c b/plugins/fast_float/src/fast_16_tethra.c new file mode 100644 index 0000000..5c01646 --- /dev/null +++ b/plugins/fast_float/src/fast_16_tethra.c @@ -0,0 +1,374 @@ +//--------------------------------------------------------------------------------- +// +// Little Color Management System, fast floating point extensions +// Copyright (c) 1998-2020 Marti Maria Saguer, all rights reserved +// +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at 
your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . +// +//--------------------------------------------------------------------------------- + +#include "fast_float_internal.h" + +// lcms internal +cmsBool _cmsOptimizePipeline(cmsContext ContextID, + cmsPipeline** Lut, + cmsUInt32Number Intent, + cmsUInt32Number* InputFormat, + cmsUInt32Number* OutputFormat, + cmsUInt32Number* dwFlags); + + +// Optimization for 16 bits, 3 inputs only +typedef struct { + + cmsContext ContextID; + + const cmsInterpParams* p; // Tetrahedrical interpolation parameters. This is a not-owned pointer. + +} Performance16Data; + + +// Precomputes tables for 16-bit on input devicelink. +static +Performance16Data* Performance16alloc(cmsContext ContextID, const cmsInterpParams* p) +{ + Performance16Data* p16; + + p16 = (Performance16Data*) _cmsMallocZero(ContextID, sizeof(Performance16Data)); + if (p16 == NULL) return NULL; + + p16 ->ContextID = ContextID; + p16 ->p = p; + + return p16; +} + +static +void Performance16free(cmsContext ContextID, void* ptr) +{ + _cmsFree(ContextID, ptr); +} + +/** +* Because cmsChangeBuffersFormat, we have to allow this code to output data in either 8 or 16 bits. +* The increments are already computed correctly, but the data may change. So, we use a macro to +* increase xput +*/ +#define TO_OUTPUT_16(d,v) do { *(cmsUInt16Number*) (d) = v; } while(0) +#define TO_OUTPUT_8(d,v) do { *(cmsUInt8Number*) (d) = FROM_16_TO_8(v); } while(0) + +#define TO_OUTPUT(d,v) do { if (out16) TO_OUTPUT_16(d,v); else TO_OUTPUT_8(d,v); } while(0) + +#define FROM_INPUT(v) in16 ? 
(*((const cmsUInt16Number*)v)) : *((const cmsUInt8Number*)v); + +static +void PerformanceEval16(struct _cmstransform_struct *CMMcargo, + const void* Input, + void* Output, + cmsUInt32Number PixelsPerLine, + cmsUInt32Number LineCount, + const cmsStride* Stride) +{ + + cmsUInt16Number r, g, b; + int x0, y0, z0; + cmsS15Fixed16Number rx, ry, rz; + cmsS15Fixed16Number fx, fy, fz; + cmsS15Fixed16Number c0, c1, c2, c3, Rest; + cmsUInt32Number OutChan, TotalPlusAlpha; + cmsS15Fixed16Number X0, X1, Y0, Y1, Z0, Z1; + Performance16Data* p16 = (Performance16Data*)_cmsGetTransformUserData(CMMcargo); + const cmsInterpParams* p = p16->p; + cmsUInt32Number TotalOut = p->nOutputs; + const cmsUInt16Number* BaseTable = (const cmsUInt16Number*)p->Table; + const cmsUInt16Number* LutTable; + + cmsUInt8Number* out[cmsMAXCHANNELS]; + cmsUInt16Number res16; + + cmsUInt32Number i, ii; + + cmsUInt32Number SourceStartingOrder[cmsMAXCHANNELS]; + cmsUInt32Number SourceIncrements[cmsMAXCHANNELS]; + cmsUInt32Number DestStartingOrder[cmsMAXCHANNELS]; + cmsUInt32Number DestIncrements[cmsMAXCHANNELS]; + + const cmsUInt8Number* rin; + const cmsUInt8Number* gin; + const cmsUInt8Number* bin; + const cmsUInt8Number* ain = NULL; + + int in16, out16; // Used by macros! 
+ + cmsUInt32Number nalpha, strideIn, strideOut; + + cmsUInt32Number dwInFormat = cmsGetTransformInputFormat((cmsHTRANSFORM)CMMcargo); + cmsUInt32Number dwOutFormat = cmsGetTransformOutputFormat((cmsHTRANSFORM)CMMcargo); + + _cmsComputeComponentIncrements(dwInFormat, Stride->BytesPerPlaneIn, NULL, &nalpha, SourceStartingOrder, SourceIncrements); + _cmsComputeComponentIncrements(dwOutFormat, Stride->BytesPerPlaneOut, NULL, &nalpha, DestStartingOrder, DestIncrements); + + in16 = (T_BYTES(dwInFormat) == 2); + out16 = (T_BYTES(dwOutFormat) == 2); + + strideIn = strideOut = 0; + for (i = 0; i < LineCount; i++) { + + rin = (const cmsUInt8Number*)Input + SourceStartingOrder[0] + strideIn; + gin = (const cmsUInt8Number*)Input + SourceStartingOrder[1] + strideIn; + bin = (const cmsUInt8Number*)Input + SourceStartingOrder[2] + strideIn; + if (nalpha) + ain = (const cmsUInt8Number*)Input + SourceStartingOrder[3] + strideIn; + + TotalPlusAlpha = TotalOut; + if (ain) TotalPlusAlpha++; + + for (OutChan = 0; OutChan < TotalPlusAlpha; OutChan++) { + out[OutChan] = (cmsUInt8Number*)Output + DestStartingOrder[OutChan] + strideOut; + } + + + for (ii = 0; ii < PixelsPerLine; ii++) { + + r = FROM_INPUT(rin); + g = FROM_INPUT(gin); + b = FROM_INPUT(bin); + + rin += SourceIncrements[0]; + gin += SourceIncrements[1]; + bin += SourceIncrements[2]; + + fx = _cmsToFixedDomain((int)r * p->Domain[0]); + fy = _cmsToFixedDomain((int)g * p->Domain[1]); + fz = _cmsToFixedDomain((int)b * p->Domain[2]); + + x0 = FIXED_TO_INT(fx); + y0 = FIXED_TO_INT(fy); + z0 = FIXED_TO_INT(fz); + + rx = FIXED_REST_TO_INT(fx); + ry = FIXED_REST_TO_INT(fy); + rz = FIXED_REST_TO_INT(fz); + + X0 = p->opta[2] * x0; + X1 = (r == 0xFFFFU ? 0 : p->opta[2]); + + Y0 = p->opta[1] * y0; + Y1 = (g == 0xFFFFU ? 0 : p->opta[1]); + + Z0 = p->opta[0] * z0; + Z1 = (b == 0xFFFFU ? 
0 : p->opta[0]); + + + LutTable = &BaseTable[X0 + Y0 + Z0]; + + // Output should be computed as x = ROUND_FIXED_TO_INT(_cmsToFixedDomain(Rest)) + // which expands as: x = (Rest + ((Rest+0x7fff)/0xFFFF) + 0x8000)>>16 + // This can be replaced by: t = Rest+0x8001, x = (t + (t>>16))>>16 + // at the cost of being off by one at 7fff and 17ffe. + + if (rx >= ry) { + if (ry >= rz) { + Y1 += X1; + Z1 += Y1; + for (OutChan = 0; OutChan < TotalOut; OutChan++) { + c1 = LutTable[X1]; + c2 = LutTable[Y1]; + c3 = LutTable[Z1]; + c0 = *LutTable++; + c3 -= c2; + c2 -= c1; + c1 -= c0; + Rest = c1 * rx + c2 * ry + c3 * rz + 0x8001; + res16 = (cmsUInt16Number)c0 + ((Rest + (Rest >> 16)) >> 16); + TO_OUTPUT(out[OutChan], res16); + out[OutChan] += DestIncrements[OutChan]; + } + } + else if (rz >= rx) { + X1 += Z1; + Y1 += X1; + for (OutChan = 0; OutChan < TotalOut; OutChan++) { + c1 = LutTable[X1]; + c2 = LutTable[Y1]; + c3 = LutTable[Z1]; + c0 = *LutTable++; + c2 -= c1; + c1 -= c3; + c3 -= c0; + Rest = c1 * rx + c2 * ry + c3 * rz + 0x8001; + res16 = (cmsUInt16Number)c0 + ((Rest + (Rest >> 16)) >> 16); + TO_OUTPUT(out[OutChan], res16); + out[OutChan] += DestIncrements[OutChan]; + } + } + else { + Z1 += X1; + Y1 += Z1; + for (OutChan = 0; OutChan < TotalOut; OutChan++) { + c1 = LutTable[X1]; + c2 = LutTable[Y1]; + c3 = LutTable[Z1]; + c0 = *LutTable++; + c2 -= c3; + c3 -= c1; + c1 -= c0; + Rest = c1 * rx + c2 * ry + c3 * rz + 0x8001; + res16 = (cmsUInt16Number)c0 + ((Rest + (Rest >> 16)) >> 16); + TO_OUTPUT(out[OutChan], res16); + out[OutChan] += DestIncrements[OutChan]; + } + } + } + else { + if (rx >= rz) { + X1 += Y1; + Z1 += X1; + for (OutChan = 0; OutChan < TotalOut; OutChan++) { + c1 = LutTable[X1]; + c2 = LutTable[Y1]; + c3 = LutTable[Z1]; + c0 = *LutTable++; + c3 -= c1; + c1 -= c2; + c2 -= c0; + Rest = c1 * rx + c2 * ry + c3 * rz + 0x8001; + res16 = (cmsUInt16Number)c0 + ((Rest + (Rest >> 16)) >> 16); + TO_OUTPUT(out[OutChan], res16); + out[OutChan] += DestIncrements[OutChan]; + 
} + } + else if (ry >= rz) { + Z1 += Y1; + X1 += Z1; + for (OutChan = 0; OutChan < TotalOut; OutChan++) { + c1 = LutTable[X1]; + c2 = LutTable[Y1]; + c3 = LutTable[Z1]; + c0 = *LutTable++; + c1 -= c3; + c3 -= c2; + c2 -= c0; + Rest = c1 * rx + c2 * ry + c3 * rz + 0x8001; + res16 = (cmsUInt16Number)c0 + ((Rest + (Rest >> 16)) >> 16); + TO_OUTPUT(out[OutChan], res16); + out[OutChan] += DestIncrements[OutChan]; + } + } + else { + Y1 += Z1; + X1 += Y1; + for (OutChan = 0; OutChan < TotalOut; OutChan++) { + c1 = LutTable[X1]; + c2 = LutTable[Y1]; + c3 = LutTable[Z1]; + c0 = *LutTable++; + c1 -= c2; + c2 -= c3; + c3 -= c0; + Rest = c1 * rx + c2 * ry + c3 * rz + 0x8001; + res16 = (cmsUInt16Number)c0 + ((Rest + (Rest >> 16)) >> 16); + TO_OUTPUT(out[OutChan], res16); + out[OutChan] += DestIncrements[OutChan]; + } + } + } + + if (ain) + { + res16 = *(const cmsUInt16Number*)ain; + TO_OUTPUT(out[OutChan], res16); + out[TotalOut] += DestIncrements[TotalOut]; + } + + } + + strideIn += Stride->BytesPerLineIn; + strideOut += Stride->BytesPerLineOut; + } +} + +#undef DENS + + + +// -------------------------------------------------------------------------------------------------------------- + +cmsBool Optimize16BitRGBTransform(_cmsTransformFn* TransformFn, + void** UserData, + _cmsFreeUserDataFn* FreeDataFn, + cmsPipeline** Lut, + cmsUInt32Number* InputFormat, + cmsUInt32Number* OutputFormat, + cmsUInt32Number* dwFlags) +{ + cmsStage* mpe; + Performance16Data* p16; + cmsContext ContextID; + _cmsStageCLutData* data; + cmsUInt32Number newFlags; + cmsStage* OptimizedCLUTmpe; + + + // For empty transforms, do nothing + if (*Lut == NULL) return FALSE; + + // This is a loosy optimization! 
does not apply in floating-point cases + if (T_FLOAT(*InputFormat) || T_FLOAT(*OutputFormat)) return FALSE; + + // Only on 16-bit + if (T_BYTES(*InputFormat) != 2 || T_BYTES(*OutputFormat) != 2) return FALSE; + + // Only real 16 bits + if (T_BIT15(*InputFormat) != 0 || T_BIT15(*OutputFormat) != 0) return FALSE; + + // Only on input RGB + if (T_COLORSPACE(*InputFormat) != PT_RGB) return FALSE; + + // Named color pipelines cannot be optimized either + for (mpe = cmsPipelineGetPtrToFirstStage(*Lut); + mpe != NULL; + mpe = cmsStageNext(mpe)) { + if (cmsStageType(mpe) == cmsSigNamedColorElemType) return FALSE; + } + + ContextID = cmsGetPipelineContextID(*Lut); + newFlags = *dwFlags | cmsFLAGS_FORCE_CLUT; + + if (!_cmsOptimizePipeline(ContextID, + Lut, + INTENT_PERCEPTUAL, // Dont care + InputFormat, + OutputFormat, + &newFlags)) return FALSE; + + OptimizedCLUTmpe = cmsPipelineGetPtrToFirstStage(*Lut); + + // Set the evaluator + data = (_cmsStageCLutData*)cmsStageData(OptimizedCLUTmpe); + + p16 = Performance16alloc(ContextID, data->Params); + if (p16 == NULL) return FALSE; + + *TransformFn = (_cmsTransformFn) PerformanceEval16; + *UserData = p16; + *FreeDataFn = Performance16free; + *InputFormat |= 0x02000000; + *OutputFormat |= 0x02000000; + + + return TRUE; +} + diff --git a/plugins/fast_float/src/fast_8_curves.c b/plugins/fast_float/src/fast_8_curves.c new file mode 100644 index 0000000..8f3c5bf --- /dev/null +++ b/plugins/fast_float/src/fast_8_curves.c @@ -0,0 +1,405 @@ +//--------------------------------------------------------------------------------- +// +// Little Color Management System, fast floating point extensions +// Copyright (c) 1998-2020 Marti Maria Saguer, all rights reserved +// +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. 
+// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . +// +//--------------------------------------------------------------------------------- + +#include "fast_float_internal.h" + +// Curves, optimization is valid for 8 bits only +typedef struct { + + cmsContext ContextID; + int nCurves; + cmsUInt8Number Curves[cmsMAXCHANNELS][256]; + +} Curves8Data; + + +// Evaluator for RGB 8-bit curves. This are just 1D tables +static void FastEvaluateRGBCurves8(struct _cmstransform_struct *CMMcargo, + const void* Input, + void* Output, + cmsUInt32Number PixelsPerLine, + cmsUInt32Number LineCount, + const cmsStride* Stride) +{ + cmsUInt32Number i, ii; + + cmsUInt32Number SourceStartingOrder[cmsMAXCHANNELS]; + cmsUInt32Number SourceIncrements[cmsMAXCHANNELS]; + cmsUInt32Number DestStartingOrder[cmsMAXCHANNELS]; + cmsUInt32Number DestIncrements[cmsMAXCHANNELS]; + + const cmsUInt8Number* rin; + const cmsUInt8Number* gin; + const cmsUInt8Number* bin; + const cmsUInt8Number* ain = NULL; + + cmsUInt8Number* rout; + cmsUInt8Number* gout; + cmsUInt8Number* bout; + cmsUInt8Number* aout = NULL; + + cmsUInt32Number nalpha, strideIn, strideOut; + + Curves8Data* Data = (Curves8Data*)_cmsGetTransformUserData(CMMcargo); + + _cmsComputeComponentIncrements(cmsGetTransformInputFormat((cmsHTRANSFORM)CMMcargo), Stride->BytesPerPlaneIn, NULL, &nalpha, SourceStartingOrder, SourceIncrements); + _cmsComputeComponentIncrements(cmsGetTransformOutputFormat((cmsHTRANSFORM)CMMcargo), Stride->BytesPerPlaneOut, NULL, &nalpha, DestStartingOrder, DestIncrements); + + strideIn = strideOut = 0; + for (i = 0; i < LineCount; i++) { + + rin = (const cmsUInt8Number*)Input + SourceStartingOrder[0] + 
strideIn; + gin = (const cmsUInt8Number*)Input + SourceStartingOrder[1] + strideIn; + bin = (const cmsUInt8Number*)Input + SourceStartingOrder[2] + strideIn; + if (nalpha) + ain = (const cmsUInt8Number*)Input + SourceStartingOrder[3] + strideIn; + + rout = (cmsUInt8Number*)Output + DestStartingOrder[0] + strideOut; + gout = (cmsUInt8Number*)Output + DestStartingOrder[1] + strideOut; + bout = (cmsUInt8Number*)Output + DestStartingOrder[2] + strideOut; + if (nalpha) + aout = (cmsUInt8Number*)Output + DestStartingOrder[3] + strideOut; + + for (ii = 0; ii < PixelsPerLine; ii++) { + + + *rout = Data->Curves[0][*rin]; + *gout = Data->Curves[1][*gin]; + *bout = Data->Curves[2][*bin]; + + // Handle alpha + if (ain) { + *aout = *ain; + } + + rin += SourceIncrements[0]; + gin += SourceIncrements[1]; + bin += SourceIncrements[2]; + if (ain) ain += SourceIncrements[3]; + + rout += DestIncrements[0]; + gout += DestIncrements[1]; + bout += DestIncrements[2]; + if (aout) aout += DestIncrements[3]; + } + + strideIn += Stride->BytesPerLineIn; + strideOut += Stride->BytesPerLineOut; + } +} + + +// Do nothing but arrange the format. 
RGB +static void FastRGBIdentity8(struct _cmstransform_struct *CMMcargo, + const void* Input, + void* Output, + cmsUInt32Number PixelsPerLine, + cmsUInt32Number LineCount, + const cmsStride* Stride) +{ + cmsUInt32Number i, ii; + + cmsUInt32Number SourceStartingOrder[cmsMAXCHANNELS]; + cmsUInt32Number SourceIncrements[cmsMAXCHANNELS]; + cmsUInt32Number DestStartingOrder[cmsMAXCHANNELS]; + cmsUInt32Number DestIncrements[cmsMAXCHANNELS]; + + const cmsUInt8Number* rin; + const cmsUInt8Number* gin; + const cmsUInt8Number* bin; + const cmsUInt8Number* ain = NULL; + + cmsUInt8Number* rout; + cmsUInt8Number* gout; + cmsUInt8Number* bout; + cmsUInt8Number* aout = NULL; + + cmsUInt32Number nalpha, strideIn, strideOut; + + _cmsComputeComponentIncrements(cmsGetTransformInputFormat((cmsHTRANSFORM)CMMcargo), Stride->BytesPerPlaneIn, NULL, &nalpha, SourceStartingOrder, SourceIncrements); + _cmsComputeComponentIncrements(cmsGetTransformOutputFormat((cmsHTRANSFORM)CMMcargo), Stride->BytesPerPlaneOut, NULL, &nalpha, DestStartingOrder, DestIncrements); + + strideIn = strideOut = 0; + for (i = 0; i < LineCount; i++) { + + rin = (const cmsUInt8Number*)Input + SourceStartingOrder[0] + strideIn; + gin = (const cmsUInt8Number*)Input + SourceStartingOrder[1] + strideIn; + bin = (const cmsUInt8Number*)Input + SourceStartingOrder[2] + strideIn; + if (nalpha) + ain = (const cmsUInt8Number*)Input + SourceStartingOrder[3] + strideIn; + + rout = (cmsUInt8Number*)Output + DestStartingOrder[0] + strideOut; + gout = (cmsUInt8Number*)Output + DestStartingOrder[1] + strideOut; + bout = (cmsUInt8Number*)Output + DestStartingOrder[2] + strideOut; + if (nalpha) + aout = (cmsUInt8Number*)Output + DestStartingOrder[3] + strideOut; + + for (ii = 0; ii < PixelsPerLine; ii++) { + + + *rout = *rin; + *gout = *gin; + *bout = *bin; + + // Handle alpha + if (ain) { + *aout = *ain; + } + + rin += SourceIncrements[0]; + gin += SourceIncrements[1]; + bin += SourceIncrements[2]; + if (ain) ain += 
SourceIncrements[3]; + + rout += DestIncrements[0]; + gout += DestIncrements[1]; + bout += DestIncrements[2]; + if (aout) aout += DestIncrements[3]; + } + + strideIn += Stride->BytesPerLineIn; + strideOut += Stride->BytesPerLineOut; + } +} + + + +// Evaluate 1 channel only +static void FastEvaluateGrayCurves8(struct _cmstransform_struct *CMMcargo, + const void* Input, + void* Output, + cmsUInt32Number PixelsPerLine, + cmsUInt32Number LineCount, + const cmsStride* Stride) +{ + cmsUInt32Number i, ii; + + cmsUInt32Number SourceStartingOrder[cmsMAXCHANNELS]; + cmsUInt32Number SourceIncrements[cmsMAXCHANNELS]; + cmsUInt32Number DestStartingOrder[cmsMAXCHANNELS]; + cmsUInt32Number DestIncrements[cmsMAXCHANNELS]; + + const cmsUInt8Number* gin; + const cmsUInt8Number* ain = NULL; + + cmsUInt8Number* gout; + cmsUInt8Number* aout = NULL; + + cmsUInt32Number nalpha, strideIn, strideOut; + + Curves8Data* Data = (Curves8Data*)_cmsGetTransformUserData(CMMcargo); + + _cmsComputeComponentIncrements(cmsGetTransformInputFormat((cmsHTRANSFORM)CMMcargo), Stride->BytesPerPlaneIn, NULL, &nalpha, SourceStartingOrder, SourceIncrements); + _cmsComputeComponentIncrements(cmsGetTransformOutputFormat((cmsHTRANSFORM)CMMcargo), Stride->BytesPerPlaneOut, NULL, &nalpha, DestStartingOrder, DestIncrements); + + strideIn = strideOut = 0; + for (i = 0; i < LineCount; i++) { + + gin = (const cmsUInt8Number*)Input + SourceStartingOrder[0] + strideIn; + if (nalpha) + ain = (const cmsUInt8Number*)Input + SourceStartingOrder[1] + strideIn; + + gout = (cmsUInt8Number*)Output + DestStartingOrder[0] + strideOut; + if (nalpha) + aout = (cmsUInt8Number*)Output + DestStartingOrder[1] + strideOut; + + for (ii = 0; ii < PixelsPerLine; ii++) { + + *gout = Data->Curves[0][*gin]; + + // Handle alpha + if (ain) { + *aout = *ain; + } + + gin += SourceIncrements[0]; + + if (ain) ain += SourceIncrements[1]; + + gout += DestIncrements[0]; + + if (aout) aout += DestIncrements[1]; + } + + strideIn += 
Stride->BytesPerLineIn; + strideOut += Stride->BytesPerLineOut; + } +} + + +static void FastGrayIdentity8(struct _cmstransform_struct *CMMcargo, + const void* Input, + void* Output, + cmsUInt32Number PixelsPerLine, + cmsUInt32Number LineCount, + const cmsStride* Stride) +{ + cmsUInt32Number i, ii; + + cmsUInt32Number SourceStartingOrder[cmsMAXCHANNELS]; + cmsUInt32Number SourceIncrements[cmsMAXCHANNELS]; + cmsUInt32Number DestStartingOrder[cmsMAXCHANNELS]; + cmsUInt32Number DestIncrements[cmsMAXCHANNELS]; + + const cmsUInt8Number* gin; + const cmsUInt8Number* ain = NULL; + + cmsUInt8Number* gout; + cmsUInt8Number* aout = NULL; + + cmsUInt32Number nalpha, strideIn, strideOut; + + _cmsComputeComponentIncrements(cmsGetTransformInputFormat((cmsHTRANSFORM)CMMcargo), Stride->BytesPerPlaneIn, NULL, &nalpha, SourceStartingOrder, SourceIncrements); + _cmsComputeComponentIncrements(cmsGetTransformOutputFormat((cmsHTRANSFORM)CMMcargo), Stride->BytesPerPlaneOut, NULL, &nalpha, DestStartingOrder, DestIncrements); + + strideIn = strideOut = 0; + for (i = 0; i < LineCount; i++) { + + gin = (const cmsUInt8Number*)Input + SourceStartingOrder[0] + strideIn; + if (nalpha) + ain = (const cmsUInt8Number*)Input + SourceStartingOrder[1] + strideIn; + + gout = (cmsUInt8Number*)Output + DestStartingOrder[0] + strideOut; + if (nalpha) + aout = (cmsUInt8Number*)Output + DestStartingOrder[1] + strideOut; + + for (ii = 0; ii < PixelsPerLine; ii++) { + + *gout = *gin; + + // Handle alpha + if (ain) { + *aout = *ain; + } + + gin += SourceIncrements[0]; + + if (ain) ain += SourceIncrements[1]; + + gout += DestIncrements[0]; + + if (aout) aout += DestIncrements[1]; + } + + strideIn += Stride->BytesPerLineIn; + strideOut += Stride->BytesPerLineOut; + } +} + + + + + +// Try to see if the curves are linear +static +cmsBool AllCurvesAreLinear(Curves8Data* data) +{ + int i, j; + + for (i=0; i < 3; i++) { + for (j = 0; j < 256; j++) { + if (data ->Curves[i][j] != j) return FALSE; + } + } + + return 
TRUE; +} + + +static +Curves8Data* ComputeCompositeCurves(cmsUInt32Number nChan, cmsPipeline* Src) +{ + cmsUInt32Number i, j; + cmsFloat32Number InFloat[3], OutFloat[3]; + + Curves8Data* Data = (Curves8Data*) _cmsMallocZero(cmsGetPipelineContextID(Src), sizeof(Curves8Data)); + if (Data == NULL) return NULL; + + // Create target curves + for (i=0; i < 256; i++) { + + for (j=0; j Curves[j][i] = FROM_16_TO_8(_cmsSaturateWord(OutFloat[j] * 65535.0)); + } + + return Data; +} + + +// If the target LUT holds only curves, the optimization procedure is to join all those +// curves together. That only works on curves and does not work on matrices. +// Any number of channels up to 16 +cmsBool Optimize8ByJoiningCurves(_cmsTransformFn* TransformFn, + void** UserData, + _cmsFreeUserDataFn* FreeUserData, + cmsPipeline** Lut, + cmsUInt32Number* InputFormat, + cmsUInt32Number* OutputFormat, + cmsUInt32Number* dwFlags) +{ + + cmsPipeline* Src = *Lut; + cmsStage* mpe; + Curves8Data* Data; + cmsUInt32Number nChans; + + // This is a loosy optimization! does not apply in floating-point cases + if (T_FLOAT(*InputFormat) || T_FLOAT(*OutputFormat)) return FALSE; + + // Only on 8-bit + if (T_BYTES(*InputFormat) != 1 || T_BYTES(*OutputFormat) != 1) return FALSE; + + // Curves need same channels on input and output (despite extra channels may differ) + nChans = T_CHANNELS(*InputFormat); + if (nChans != T_CHANNELS(*OutputFormat)) return FALSE; + + // gray and RGB + if (nChans != 1 && nChans != 3) return FALSE; + + // Only curves in this LUT? + for (mpe = cmsPipelineGetPtrToFirstStage(Src); + mpe != NULL; + mpe = cmsStageNext(mpe)) { + + if (cmsStageType(mpe) != cmsSigCurveSetElemType) return FALSE; + } + + Data = ComputeCompositeCurves(nChans, Src); + + *dwFlags |= cmsFLAGS_NOCACHE; + *UserData = Data; + *FreeUserData = _cmsFree; + + // Maybe the curves are linear at the end + if (nChans == 1) + *TransformFn = (_cmsTransformFn) (AllCurvesAreLinear(Data) ? 
FastGrayIdentity8 : FastEvaluateGrayCurves8); + else + *TransformFn = (_cmsTransformFn) (AllCurvesAreLinear(Data) ? FastRGBIdentity8 : FastEvaluateRGBCurves8); + + return TRUE; + +} + diff --git a/plugins/fast_float/src/fast_8_matsh.c b/plugins/fast_float/src/fast_8_matsh.c new file mode 100644 index 0000000..3f3af3c --- /dev/null +++ b/plugins/fast_float/src/fast_8_matsh.c @@ -0,0 +1,403 @@ +//--------------------------------------------------------------------------------- +// +// Little Color Management System, fast floating point extensions +// Copyright (c) 1998-2020 Marti Maria Saguer, all rights reserved +// +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . +// +//--------------------------------------------------------------------------------- + +// Optimization for matrix-shaper in 8 bits. Numbers are operated in n.14 signed, tables are stored in 1.14 fixed + +#include "fast_float_internal.h" + +typedef cmsInt32Number cmsS1Fixed14Number; // Note that this may hold more than 16 bits! 
+ +#define DOUBLE_TO_1FIXED14(x) ((cmsS1Fixed14Number) floor((x) * 16384.0 + 0.5)) + +// This is the private data container used by this optimization +typedef struct { + + // This is for SSE2, MUST be aligned at 16 bit boundary + + cmsFloat32Number fMatrix[4][4]; + cmsFloat32Number fShaper1[256 * 3]; + + void * real_ptr; + + cmsContext ContextID; + + cmsS1Fixed14Number Shaper1R[256]; // from 0..255 to 1.14 (0.0...1.0) + cmsS1Fixed14Number Shaper1G[256]; + cmsS1Fixed14Number Shaper1B[256]; + + cmsS1Fixed14Number Mat[3][3]; // n.14 to n.14 (needs a saturation after that) + cmsS1Fixed14Number Off[3]; + + cmsUInt8Number Shaper2R[0x4001]; // 1.14 to 0..255 + cmsUInt8Number Shaper2G[0x4001]; + cmsUInt8Number Shaper2B[0x4001]; + +} XMatShaper8Data; + + +static +XMatShaper8Data* malloc_aligned(cmsContext ContextID) +{ + cmsUInt8Number* real_ptr = (cmsUInt8Number*) _cmsMallocZero(ContextID, sizeof(XMatShaper8Data) + 32); + cmsUInt8Number* aligned = (cmsUInt8Number*) (((uintptr_t)real_ptr + 16) & ~0xf); + XMatShaper8Data* p = (XMatShaper8Data*) aligned; + + p ->real_ptr = real_ptr; + return p; +} + +static +void free_aligned(XMatShaper8Data* a) +{ + _cmsFree(a->ContextID, a->real_ptr); +} + + +// Free the private data container +static +void FreeMatShaper(cmsContext ContextID, void* Data) +{ + UNUSED_PARAMETER(ContextID); + + if (Data != NULL) free_aligned((XMatShaper8Data*) Data); +} + + +// This table converts from 8 bits to 1.14 after applying the curve +static +void FillFirstShaper(cmsS1Fixed14Number* Table, cmsToneCurve* Curve) +{ + int i; + cmsFloat32Number R, y; + + for (i=0; i < 256; i++) { + + R = (cmsFloat32Number) (i / 255.0); + y = cmsEvalToneCurveFloat(Curve, R); + + Table[i] = DOUBLE_TO_1FIXED14(y); + } +} + +static +void FillFirstShaperFloat(cmsFloat32Number* Table, cmsToneCurve* Curve) +{ + int i; + cmsFloat32Number R; + + for (i=0; i < 256; i++) { + + R = (cmsFloat32Number) (i / 255.0); + + Table[i] = cmsEvalToneCurveFloat(Curve, R); + } +} + + +// This 
table converts form 1.14 (being 0x4000 the last entry) to 8 bits after applying the curve +static +void FillSecondShaper(cmsUInt8Number* Table, cmsToneCurve* Curve) +{ + int i; + cmsFloat32Number R, Val; + cmsUInt16Number w; + + for (i=0; i < 0x4001; i++) { + + R = (cmsFloat32Number) (i / 16384.0); + Val = cmsEvalToneCurveFloat(Curve, R); + w = _cmsSaturateWord(Val * 65535.0 + 0.5); + + Table[i] = FROM_16_TO_8(w); + + } +} + +// Compute the matrix-shaper structure +static +XMatShaper8Data* SetMatShaper(cmsContext ContextID, cmsToneCurve* Curve1[3], cmsMAT3* Mat, cmsVEC3* Off, cmsToneCurve* Curve2[3]) +{ + XMatShaper8Data* p; + int i, j; + + // Allocate a big chuck of memory to store precomputed tables + p = malloc_aligned(ContextID); + if (p == NULL) return FALSE; + + p -> ContextID = ContextID; + + // Precompute tables + FillFirstShaper(p ->Shaper1R, Curve1[0]); + FillFirstShaper(p ->Shaper1G, Curve1[1]); + FillFirstShaper(p ->Shaper1B, Curve1[2]); + + FillSecondShaper(p ->Shaper2R, Curve2[0]); + FillSecondShaper(p ->Shaper2G, Curve2[1]); + FillSecondShaper(p ->Shaper2B, Curve2[2]); + + + FillFirstShaperFloat(p ->fShaper1, Curve1[0]); + FillFirstShaperFloat(p ->fShaper1 + 256, Curve1[1]); + FillFirstShaperFloat(p ->fShaper1 + 256*2, Curve1[2]); + + // Convert matrix to nFixed14. Note that those values may take more than 16 bits as + for (i=0; i < 3; i++) { + for (j=0; j < 3; j++) { + p ->Mat[i][j] = DOUBLE_TO_1FIXED14(Mat->v[i].n[j]); + p ->fMatrix[j][i] = (cmsFloat32Number) Mat ->v[i].n[j]; + } + } + + + for (i=0; i < 3; i++) { + + if (Off == NULL) { + + p ->Off[i] = 0x2000; + p ->fMatrix[3][i] = 0.0f; + } + else { + p ->Off[i] = DOUBLE_TO_1FIXED14(Off->n[i]) + 0x2000; + p ->fMatrix[3][i] = (cmsFloat32Number) Off->n[i]; + } + } + + + return p; +} + +// A fast matrix-shaper evaluator for 8 bits. This is a bit ticky since I'm using 1.14 signed fixed point +// to accomplish some performance. 
Actually it takes 256x3 16 bits tables and 16385 x 3 tables of 8 bits, +// in total about 50K, and the performance boost is huge! + +static +void MatShaperXform8(struct _cmstransform_struct *CMMcargo, + const void* Input, + void* Output, + cmsUInt32Number PixelsPerLine, + cmsUInt32Number LineCount, + const cmsStride* Stride) +{ + XMatShaper8Data* p = (XMatShaper8Data*) _cmsGetTransformUserData(CMMcargo); + + register cmsS1Fixed14Number l1, l2, l3; + cmsS1Fixed14Number r, g, b; + cmsUInt32Number ri, gi, bi; + cmsUInt32Number i, ii; + + cmsUInt32Number SourceStartingOrder[cmsMAXCHANNELS]; + cmsUInt32Number SourceIncrements[cmsMAXCHANNELS]; + cmsUInt32Number DestStartingOrder[cmsMAXCHANNELS]; + cmsUInt32Number DestIncrements[cmsMAXCHANNELS]; + + const cmsUInt8Number* rin; + const cmsUInt8Number* gin; + const cmsUInt8Number* bin; + const cmsUInt8Number* ain = NULL; + + cmsUInt8Number* rout; + cmsUInt8Number* gout; + cmsUInt8Number* bout; + cmsUInt8Number* aout = NULL; + + cmsUInt32Number nalpha, strideIn, strideOut; + + _cmsComputeComponentIncrements(cmsGetTransformInputFormat((cmsHTRANSFORM)CMMcargo), Stride->BytesPerPlaneIn, NULL, &nalpha, SourceStartingOrder, SourceIncrements); + _cmsComputeComponentIncrements(cmsGetTransformOutputFormat((cmsHTRANSFORM)CMMcargo), Stride->BytesPerPlaneOut, NULL, &nalpha, DestStartingOrder, DestIncrements); + + strideIn = strideOut = 0; + for (i = 0; i < LineCount; i++) { + + rin = (const cmsUInt8Number*)Input + SourceStartingOrder[0] + strideIn; + gin = (const cmsUInt8Number*)Input + SourceStartingOrder[1] + strideIn; + bin = (const cmsUInt8Number*)Input + SourceStartingOrder[2] + strideIn; + if (nalpha) + ain = (const cmsUInt8Number*)Input + SourceStartingOrder[3] + strideIn; + + + rout = (cmsUInt8Number*)Output + DestStartingOrder[0] + strideOut; + gout = (cmsUInt8Number*)Output + DestStartingOrder[1] + strideOut; + bout = (cmsUInt8Number*)Output + DestStartingOrder[2] + strideOut; + if (nalpha) + aout = (cmsUInt8Number*)Output + 
DestStartingOrder[3] + strideOut; + + + for (ii = 0; ii < PixelsPerLine; ii++) { + + // Across first shaper, which also converts to 1.14 fixed point. 16 bits guaranteed. + r = p->Shaper1R[*rin]; + g = p->Shaper1G[*gin]; + b = p->Shaper1B[*bin]; + + // Evaluate the matrix in 1.14 fixed point + l1 = (p->Mat[0][0] * r + p->Mat[0][1] * g + p->Mat[0][2] * b + p->Off[0]) >> 14; + l2 = (p->Mat[1][0] * r + p->Mat[1][1] * g + p->Mat[1][2] * b + p->Off[1]) >> 14; + l3 = (p->Mat[2][0] * r + p->Mat[2][1] * g + p->Mat[2][2] * b + p->Off[2]) >> 14; + + + // Now we have to clip to 0..1.0 range + ri = (l1 < 0) ? 0 : ((l1 > 0x4000) ? 0x4000 : l1); + gi = (l2 < 0) ? 0 : ((l2 > 0x4000) ? 0x4000 : l2); + bi = (l3 < 0) ? 0 : ((l3 > 0x4000) ? 0x4000 : l3); + + + // And across second shaper, + *rout = p->Shaper2R[ri]; + *gout = p->Shaper2G[gi]; + *bout = p->Shaper2B[bi]; + + // Handle alpha + if (ain) { + *aout = *ain; + } + + rin += SourceIncrements[0]; + gin += SourceIncrements[1]; + bin += SourceIncrements[2]; + if (ain) ain += SourceIncrements[3]; + + rout += DestIncrements[0]; + gout += DestIncrements[1]; + bout += DestIncrements[2]; + if (aout) aout += DestIncrements[3]; + } + + strideIn += Stride->BytesPerLineIn; + strideOut += Stride->BytesPerLineOut; + } +} + + +// 8 bits on input allows matrix-shaper boost up a little bit +cmsBool Optimize8MatrixShaper(_cmsTransformFn* TransformFn, + void** UserData, + _cmsFreeUserDataFn* FreeUserData, + cmsPipeline** Lut, + cmsUInt32Number* InputFormat, + cmsUInt32Number* OutputFormat, + cmsUInt32Number* dwFlags) +{ + cmsStage* Curve1, *Curve2; + cmsStage* Matrix1, *Matrix2; + _cmsStageMatrixData* Data1; + _cmsStageMatrixData* Data2; + cmsMAT3 res; + cmsBool IdentityMat = FALSE; + cmsPipeline* Dest, *Src; + cmsContext ContextID; + cmsUInt32Number nChans; + cmsFloat64Number factor = 1.0; + + // Only works on RGB to RGB and gray to gray + + if ( !( (T_CHANNELS(*InputFormat) == 3 && T_CHANNELS(*OutputFormat) == 3) || + (T_CHANNELS(*InputFormat) 
== 1 && T_CHANNELS(*OutputFormat) == 1) )) return FALSE; + + // Only works on 8 bit input + if (T_BYTES(*InputFormat) != 1 || T_BYTES(*OutputFormat) != 1) return FALSE; + + // Seems suitable, proceed + Src = *Lut; + + // Check for shaper-matrix-matrix-shaper structure, that is what this optimizer stands for + if (!cmsPipelineCheckAndRetreiveStages(Src, 4, + cmsSigCurveSetElemType, cmsSigMatrixElemType, cmsSigMatrixElemType, cmsSigCurveSetElemType, + &Curve1, &Matrix1, &Matrix2, &Curve2)) return FALSE; + + ContextID = cmsGetPipelineContextID(Src); + nChans = T_CHANNELS(*InputFormat); + + // Get both matrices, which are 3x3 + Data1 = (_cmsStageMatrixData*) cmsStageData(Matrix1); + Data2 = (_cmsStageMatrixData*) cmsStageData(Matrix2); + + // Input offset should be zero + if (Data1 ->Offset != NULL) return FALSE; + + if (cmsStageInputChannels(Matrix1) == 1 && cmsStageOutputChannels(Matrix2) == 1) + { + // This is a gray to gray. Just multiply + factor = Data1->Double[0]*Data2->Double[0] + + Data1->Double[1]*Data2->Double[1] + + Data1->Double[2]*Data2->Double[2]; + + if (fabs(1 - factor) < (1.0 / 65535.0)) IdentityMat = TRUE; + } + else + { + // Multiply both matrices to get the result + _cmsMAT3per(&res, (cmsMAT3*) Data2 ->Double, (cmsMAT3*) Data1 ->Double); + + // Now the result is in res + Data2 -> Offset. Maybe is a plain identity? 
+ IdentityMat = FALSE; + if (_cmsMAT3isIdentity(&res) && Data2 ->Offset == NULL) { + + // We can get rid of full matrix + IdentityMat = TRUE; + } + } + + // Allocate an empty LUT + Dest = cmsPipelineAlloc(ContextID, nChans, nChans); + if (!Dest) return FALSE; + + // Assamble the new LUT + cmsPipelineInsertStage(Dest, cmsAT_BEGIN, cmsStageDup(Curve1)); + + if (!IdentityMat) { + + if (nChans == 1) + cmsPipelineInsertStage(Dest, cmsAT_END, + cmsStageAllocMatrix(ContextID, 1, 1, (const cmsFloat64Number*) &factor, Data2->Offset)); + else + cmsPipelineInsertStage(Dest, cmsAT_END, + cmsStageAllocMatrix(ContextID, 3, 3, (const cmsFloat64Number*) &res, Data2 ->Offset)); + } + + + cmsPipelineInsertStage(Dest, cmsAT_END, cmsStageDup(Curve2)); + + // If identity on matrix, we can further optimize the curves, so call the join curves routine + if (IdentityMat) { + + Optimize8ByJoiningCurves(TransformFn, UserData, FreeUserData, &Dest, InputFormat, OutputFormat, dwFlags); + } + else { + _cmsStageToneCurvesData* mpeC1 = (_cmsStageToneCurvesData*) cmsStageData(Curve1); + _cmsStageToneCurvesData* mpeC2 = (_cmsStageToneCurvesData*) cmsStageData(Curve2); + + // In this particular optimization, caché does not help as it takes more time to deal with + // the caché that with the pixel handling + *dwFlags |= cmsFLAGS_NOCACHE; + + // Setup the optimizarion routines + *UserData = SetMatShaper(ContextID, mpeC1 ->TheCurves, &res, (cmsVEC3*) Data2 ->Offset, mpeC2->TheCurves); + *FreeUserData = FreeMatShaper; + + *TransformFn = (_cmsTransformFn) MatShaperXform8; + } + + cmsPipelineFree(Src); + *Lut = Dest; + return TRUE; +} + + diff --git a/plugins/fast_float/src/fast_8_tethra.c b/plugins/fast_float/src/fast_8_tethra.c new file mode 100644 index 0000000..8fdea0a --- /dev/null +++ b/plugins/fast_float/src/fast_8_tethra.c @@ -0,0 +1,504 @@ +//--------------------------------------------------------------------------------- +// +// Little Color Management System, fast floating point extensions +// 
//  Copyright (c) 1998-2020 Marti Maria Saguer, all rights reserved
//
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.
//
//---------------------------------------------------------------------------------

#include "fast_float_internal.h"

// Number of entries of the tone curves sampled from the pipeline for prelinearization
#define PRELINEARIZATION_POINTS 4096

// Optimization for 8 bits, 3 inputs only.
// User data of the 8-bit evaluator: for each of the 256 possible values of every input
// channel it holds the precomputed CLUT node offset and the interpolation remainder.
typedef struct {

    cmsContext ContextID;

    const cmsInterpParams* p;   // Tetrahedrical interpolation parameters. This is a not-owned pointer.

    cmsUInt16Number rx[256], ry[256], rz[256];  // Fractional part (interpolation weight) per input value
    cmsUInt32Number X0[256], Y0[256], Z0[256];  // Precomputed nodes and offsets for 8-bit input data


} Performance8Data;


// Precomputes tables for 8-bit on input devicelink.
+static +Performance8Data* Performance8alloc(cmsContext ContextID, const cmsInterpParams* p, cmsToneCurve* G[3]) +{ + int i; + cmsUInt16Number Input[3]; + cmsS15Fixed16Number v1, v2, v3; + Performance8Data* p8; + + p8 = (Performance8Data*) _cmsMallocZero(ContextID, sizeof(Performance8Data)); + if (p8 == NULL) return NULL; + + // Since this only works for 8 bit input, values comes always as x * 257, + // we can safely take msb byte (x << 8 + x) + for (i=0; i < 256; i++) { + + if (G != NULL) { + + // Get 16-bit representation + Input[0] = cmsEvalToneCurve16(G[0], FROM_8_TO_16(i)); + Input[1] = cmsEvalToneCurve16(G[1], FROM_8_TO_16(i)); + Input[2] = cmsEvalToneCurve16(G[2], FROM_8_TO_16(i)); + } + else { + Input[0] = FROM_8_TO_16(i); + Input[1] = FROM_8_TO_16(i); + Input[2] = FROM_8_TO_16(i); + } + + // Move to 0..1.0 in fixed domain + v1 = _cmsToFixedDomain(Input[0] * p -> Domain[0]); + v2 = _cmsToFixedDomain(Input[1] * p -> Domain[1]); + v3 = _cmsToFixedDomain(Input[2] * p -> Domain[2]); + + // Store the precalculated table of nodes + p8 ->X0[i] = (p->opta[2] * FIXED_TO_INT(v1)); + p8 ->Y0[i] = (p->opta[1] * FIXED_TO_INT(v2)); + p8 ->Z0[i] = (p->opta[0] * FIXED_TO_INT(v3)); + + // Store the precalculated table of offsets + p8 ->rx[i] = (cmsUInt16Number) FIXED_REST_TO_INT(v1); + p8 ->ry[i] = (cmsUInt16Number) FIXED_REST_TO_INT(v2); + p8 ->rz[i] = (cmsUInt16Number) FIXED_REST_TO_INT(v3); + } + + + p8 ->ContextID = ContextID; + p8 ->p = p; + + return p8; +} + +static +void Performance8free(cmsContext ContextID, void* ptr) +{ + _cmsFree(ContextID, ptr); +} + + +// Sampler implemented by another LUT. This is a clean way to precalculate the devicelink 3D CLUT for +// almost any transform. We use floating point precision and then convert from floating point to 16 bits. 
+static +int XFormSampler16(register const cmsUInt16Number In[], register cmsUInt16Number Out[], register void* Cargo) +{ + // Evaluate in 16 bits + cmsPipelineEval16(In, Out, (cmsPipeline*) Cargo); + + // Always succeed + return TRUE; +} + + +// A optimized interpolation for 8-bit input. +#define DENS(i,j,k) (LutTable[(i)+(j)+(k)+OutChan]) + +static +void PerformanceEval8(struct _cmstransform_struct *CMMcargo, + const void* Input, + void* Output, + cmsUInt32Number PixelsPerLine, + cmsUInt32Number LineCount, + const cmsStride* Stride) +{ + + cmsUInt8Number r, g, b; + cmsS15Fixed16Number rx, ry, rz; + cmsS15Fixed16Number c0, c1, c2, c3, Rest; + cmsUInt32Number OutChan, TotalPlusAlpha; + cmsS15Fixed16Number X0, X1, Y0, Y1, Z0, Z1; + Performance8Data* p8 = (Performance8Data*)_cmsGetTransformUserData(CMMcargo); + const cmsInterpParams* p = p8->p; + cmsUInt32Number TotalOut = p->nOutputs; + const cmsUInt16Number* LutTable = (const cmsUInt16Number*)p->Table; + + cmsUInt8Number* out[cmsMAXCHANNELS]; + cmsUInt16Number res16; + + cmsUInt32Number i, ii; + + cmsUInt32Number SourceStartingOrder[cmsMAXCHANNELS]; + cmsUInt32Number SourceIncrements[cmsMAXCHANNELS]; + cmsUInt32Number DestStartingOrder[cmsMAXCHANNELS]; + cmsUInt32Number DestIncrements[cmsMAXCHANNELS]; + + const cmsUInt8Number* rin; + const cmsUInt8Number* gin; + const cmsUInt8Number* bin; + const cmsUInt8Number* ain = NULL; + + cmsUInt32Number nalpha, strideIn, strideOut; + + + _cmsComputeComponentIncrements(cmsGetTransformInputFormat((cmsHTRANSFORM)CMMcargo), Stride->BytesPerPlaneIn, NULL, &nalpha, SourceStartingOrder, SourceIncrements); + _cmsComputeComponentIncrements(cmsGetTransformOutputFormat((cmsHTRANSFORM)CMMcargo), Stride->BytesPerPlaneOut, NULL, &nalpha, DestStartingOrder, DestIncrements); + + strideIn = strideOut = 0; + for (i = 0; i < LineCount; i++) { + + rin = (const cmsUInt8Number*)Input + SourceStartingOrder[0] + strideIn; + gin = (const cmsUInt8Number*)Input + SourceStartingOrder[1] + strideIn; + 
bin = (const cmsUInt8Number*)Input + SourceStartingOrder[2] + strideIn; + if (nalpha) + ain = (const cmsUInt8Number*)Input + SourceStartingOrder[3] + strideIn; + + TotalPlusAlpha = TotalOut; + if (ain) TotalPlusAlpha++; + + for (OutChan = 0; OutChan < TotalPlusAlpha; OutChan++) { + out[OutChan] = (cmsUInt8Number*)Output + DestStartingOrder[OutChan] + strideOut; + } + + + for (ii = 0; ii < PixelsPerLine; ii++) { + + r = *rin; g = *gin; b = *bin; + + rin += SourceIncrements[0]; + gin += SourceIncrements[1]; + bin += SourceIncrements[2]; + + X0 = X1 = p8->X0[r]; + Y0 = Y1 = p8->Y0[g]; + Z0 = Z1 = p8->Z0[b]; + + rx = p8->rx[r]; + ry = p8->ry[g]; + rz = p8->rz[b]; + + X1 = X0 + ((rx == 0) ? 0 : p->opta[2]); + Y1 = Y0 + ((ry == 0) ? 0 : p->opta[1]); + Z1 = Z0 + ((rz == 0) ? 0 : p->opta[0]); + + + // These are the 6 Tetrahedral + for (OutChan = 0; OutChan < TotalOut; OutChan++) { + + c0 = DENS(X0, Y0, Z0); + + if (rx >= ry && ry >= rz) + { + c1 = DENS(X1, Y0, Z0) - c0; + c2 = DENS(X1, Y1, Z0) - DENS(X1, Y0, Z0); + c3 = DENS(X1, Y1, Z1) - DENS(X1, Y1, Z0); + } + else + if (rx >= rz && rz >= ry) + { + c1 = DENS(X1, Y0, Z0) - c0; + c2 = DENS(X1, Y1, Z1) - DENS(X1, Y0, Z1); + c3 = DENS(X1, Y0, Z1) - DENS(X1, Y0, Z0); + } + else + if (rz >= rx && rx >= ry) + { + c1 = DENS(X1, Y0, Z1) - DENS(X0, Y0, Z1); + c2 = DENS(X1, Y1, Z1) - DENS(X1, Y0, Z1); + c3 = DENS(X0, Y0, Z1) - c0; + } + else + if (ry >= rx && rx >= rz) + { + c1 = DENS(X1, Y1, Z0) - DENS(X0, Y1, Z0); + c2 = DENS(X0, Y1, Z0) - c0; + c3 = DENS(X1, Y1, Z1) - DENS(X1, Y1, Z0); + } + else + if (ry >= rz && rz >= rx) + { + c1 = DENS(X1, Y1, Z1) - DENS(X0, Y1, Z1); + c2 = DENS(X0, Y1, Z0) - c0; + c3 = DENS(X0, Y1, Z1) - DENS(X0, Y1, Z0); + } + else + if (rz >= ry && ry >= rx) + { + c1 = DENS(X1, Y1, Z1) - DENS(X0, Y1, Z1); + c2 = DENS(X0, Y1, Z1) - DENS(X0, Y0, Z1); + c3 = DENS(X0, Y0, Z1) - c0; + } + else { + c1 = c2 = c3 = 0; + } + + + Rest = c1 * rx + c2 * ry + c3 * rz + 0x8001; + res16 = (cmsUInt16Number)c0 + ((Rest + 
(Rest >> 16)) >> 16); + + *out[OutChan] = FROM_16_TO_8(res16); + out[OutChan] += DestIncrements[OutChan]; + + if (ain) + *out[TotalOut] = *ain; + + } + + + } + + strideIn += Stride->BytesPerLineIn; + strideOut += Stride->BytesPerLineOut; + } +} + +#undef DENS + + +// Curves that contain wide empty areas are not optimizeable +static +cmsBool IsDegenerated(const cmsToneCurve* g) +{ + int i, Zeros = 0, Poles = 0; + int nEntries = cmsGetToneCurveEstimatedTableEntries(g); + const cmsUInt16Number* Table16 = cmsGetToneCurveEstimatedTable(g); + + for (i=0; i < nEntries; i++) { + + if (Table16[i] == 0x0000) Zeros++; + if (Table16[i] == 0xffff) Poles++; + } + + if (Zeros == 1 && Poles == 1) return FALSE; // For linear tables + if (Zeros > (nEntries / 4)) return TRUE; // Degenerated, mostly zeros + if (Poles > (nEntries / 4)) return TRUE; // Degenerated, mostly poles + + return FALSE; +} + + + +// Normalize endpoints by slope limiting max and min. This assures endpoints as well. +// Descending curves are handled as well. 
+static +void SlopeLimiting(cmsUInt16Number* Table16, int nEntries) +{ + int BeginVal, EndVal; + + int AtBegin = (int) floor((cmsFloat64Number)nEntries * 0.02 + 0.5); // Cutoff at 2% + int AtEnd = nEntries - AtBegin - 1; // And 98% + cmsFloat64Number Val, Slope, beta; + int i; + + + if (Table16[0] > Table16[nEntries-1]) { + BeginVal = 0xffff; EndVal = 0; + } + else { + BeginVal = 0; EndVal = 0xffff; + } + + // Compute slope and offset for begin of curve + Val = Table16[AtBegin]; + Slope = (Val - BeginVal) / AtBegin; + beta = Val - Slope * AtBegin; + + for (i=0; i < AtBegin; i++) + Table16[i] = _cmsSaturateWord(i * Slope + beta); + + // Compute slope and offset for the end + Val = Table16[AtEnd]; + Slope = (EndVal - Val) / AtBegin; // AtBegin holds the X interval, which is same in both cases + beta = Val - Slope * AtEnd; + + for (i = AtEnd; i < (int) nEntries; i++) + Table16[i] = _cmsSaturateWord(i * Slope + beta); +} + + +// -------------------------------------------------------------------------------------------------------------- + +cmsBool Optimize8BitRGBTransform(_cmsTransformFn* TransformFn, + void** UserData, + _cmsFreeUserDataFn* FreeDataFn, + cmsPipeline** Lut, + cmsUInt32Number* InputFormat, + cmsUInt32Number* OutputFormat, + cmsUInt32Number* dwFlags) +{ + cmsPipeline* OriginalLut; + int nGridPoints; + cmsToneCurve *Trans[cmsMAXCHANNELS], *TransReverse[cmsMAXCHANNELS]; + cmsUInt32Number t, i, j; + cmsFloat32Number v, In[cmsMAXCHANNELS], Out[cmsMAXCHANNELS]; + cmsBool lIsSuitable, lIsLinear; + cmsPipeline* OptimizedLUT = NULL, *LutPlusCurves = NULL; + cmsStage* OptimizedCLUTmpe; + cmsColorSpaceSignature OutputColorSpace; + cmsStage* OptimizedPrelinMpe; + cmsStage* mpe; + Performance8Data* p8; + cmsUInt16Number* MyTable[3]; + cmsContext ContextID; + _cmsStageCLutData* data; + + // For empty transforms, do nothing + if (*Lut == NULL) return FALSE; + + // This is a loosy optimization! 
does not apply in floating-point cases + if (T_FLOAT(*InputFormat) || T_FLOAT(*OutputFormat)) return FALSE; + + // Only on 8-bit + if (T_BYTES(*InputFormat) != 1 || T_BYTES(*OutputFormat) != 1) return FALSE; + + // Only on RGB + if (T_COLORSPACE(*InputFormat) != PT_RGB) return FALSE; + + OriginalLut = *Lut; + + // Named color pipelines cannot be optimized either + for (mpe = cmsPipelineGetPtrToFirstStage(OriginalLut); + mpe != NULL; + mpe = cmsStageNext(mpe)) { + if (cmsStageType(mpe) == cmsSigNamedColorElemType) return FALSE; + } + + ContextID = cmsGetPipelineContextID(OriginalLut); + OutputColorSpace = _cmsICCcolorSpace(T_COLORSPACE(*OutputFormat)); + nGridPoints = _cmsReasonableGridpointsByColorspace(cmsSigRgbData, *dwFlags); + + // Empty gamma containers + memset(Trans, 0, sizeof(Trans)); + memset(TransReverse, 0, sizeof(TransReverse)); + + MyTable[0] = (cmsUInt16Number*) _cmsMallocZero(ContextID, sizeof(cmsUInt16Number) * PRELINEARIZATION_POINTS); + MyTable[1] = (cmsUInt16Number*) _cmsMallocZero(ContextID, sizeof(cmsUInt16Number) * PRELINEARIZATION_POINTS); + MyTable[2] = (cmsUInt16Number*) _cmsMallocZero(ContextID, sizeof(cmsUInt16Number) * PRELINEARIZATION_POINTS); + + if (MyTable[0] == NULL || MyTable[1] == NULL || MyTable[2] == NULL) goto Error; + + // Populate the curves + + for (i=0; i < PRELINEARIZATION_POINTS; i++) { + + v = (cmsFloat32Number) ((cmsFloat64Number) i / (PRELINEARIZATION_POINTS - 1)); + + // Feed input with a gray ramp + for (j=0; j < 3; j++) + In[j] = v; + + // Evaluate the gray value + cmsPipelineEvalFloat(In, Out, OriginalLut); + + // Store result in curve + for (j=0; j < 3; j++) + MyTable[j][i] = _cmsSaturateWord(Out[j] * 65535.0); + } + + for (t=0; t < 3; t++) { + + SlopeLimiting(MyTable[t], PRELINEARIZATION_POINTS); + + Trans[t] = cmsBuildTabulatedToneCurve16(ContextID, PRELINEARIZATION_POINTS, MyTable[t]); + if (Trans[t] == NULL) goto Error; + + _cmsFree(cmsGetPipelineContextID(OriginalLut), MyTable[t]); + } + + // Check for 
validity + lIsSuitable = TRUE; + lIsLinear = TRUE; + for (t=0; (lIsSuitable && (t < 3)); t++) { + + // Exclude if already linear + if (!cmsIsToneCurveLinear(Trans[t])) + lIsLinear = FALSE; + + // Exclude if non-monotonic + if (!cmsIsToneCurveMonotonic(Trans[t])) + lIsSuitable = FALSE; + + if (IsDegenerated(Trans[t])) + lIsSuitable = FALSE; + } + + // If it is not suitable, just quit + if (!lIsSuitable) goto Error; + + // Invert curves if possible + for (t = 0; t < cmsPipelineInputChannels(OriginalLut); t++) { + TransReverse[t] = cmsReverseToneCurveEx(PRELINEARIZATION_POINTS, Trans[t]); + if (TransReverse[t] == NULL) goto Error; + } + + // Now inset the reversed curves at the begin of transform + LutPlusCurves = cmsPipelineDup(OriginalLut); + if (LutPlusCurves == NULL) goto Error; + + cmsPipelineInsertStage(LutPlusCurves, cmsAT_BEGIN, cmsStageAllocToneCurves(ContextID, 3, TransReverse)); + + // Create the result LUT + OptimizedLUT = cmsPipelineAlloc(cmsGetPipelineContextID(OriginalLut), 3, cmsPipelineOutputChannels(OriginalLut)); + if (OptimizedLUT == NULL) goto Error; + + OptimizedPrelinMpe = cmsStageAllocToneCurves(ContextID, 3, Trans); + + // Create and insert the curves at the beginning + cmsPipelineInsertStage(OptimizedLUT, cmsAT_BEGIN, OptimizedPrelinMpe); + + // Allocate the CLUT for result + OptimizedCLUTmpe = cmsStageAllocCLut16bit(ContextID, nGridPoints, 3, cmsPipelineOutputChannels(OriginalLut), NULL); + + // Add the CLUT to the destination LUT + cmsPipelineInsertStage(OptimizedLUT, cmsAT_END, OptimizedCLUTmpe); + + // Resample the LUT + if (!cmsStageSampleCLut16bit(OptimizedCLUTmpe, XFormSampler16, (void*) LutPlusCurves, 0)) goto Error; + + // Set the evaluator + data = (_cmsStageCLutData*) cmsStageData(OptimizedCLUTmpe); + + p8 = Performance8alloc(ContextID, data ->Params, Trans); + if (p8 == NULL) return FALSE; + + // Free resources + for (t = 0; t <3; t++) { + + if (Trans[t]) cmsFreeToneCurve(Trans[t]); + if (TransReverse[t]) 
cmsFreeToneCurve(TransReverse[t]); + } + + cmsPipelineFree(LutPlusCurves); + + // And return the obtained LUT + cmsPipelineFree(OriginalLut); + + *Lut = OptimizedLUT; + *TransformFn = (_cmsTransformFn) PerformanceEval8; + *UserData = p8; + *FreeDataFn = Performance8free; + + return TRUE; + +Error: + + for (t = 0; t < 3; t++) { + + if (Trans[t]) cmsFreeToneCurve(Trans[t]); + if (TransReverse[t]) cmsFreeToneCurve(TransReverse[t]); + } + + if (LutPlusCurves != NULL) cmsPipelineFree(LutPlusCurves); + if (OptimizedLUT != NULL) cmsPipelineFree(OptimizedLUT); + + return FALSE; +} + diff --git a/plugins/fast_float/src/fast_float_15bits.c b/plugins/fast_float/src/fast_float_15bits.c new file mode 100644 index 0000000..5186466 --- /dev/null +++ b/plugins/fast_float/src/fast_float_15bits.c @@ -0,0 +1,568 @@ +//--------------------------------------------------------------------------------- +// +// Little Color Management System, fast floating point extensions +// Copyright (c) 1998-2020 Marti Maria Saguer, all rights reserved +// +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . 
+// +//--------------------------------------------------------------------------------- + +#include "fast_float_internal.h" + + +//--------------------------------------------------------------------------------- + +// The internal photoshop 16 bit format range is 1.15 fixed point, which goes 0..32768 +// (NOT 32767) that means: +// +// 16 bits encoding 15 bit Photoshop encoding +// ================ ========================= +// +// 0x0000 0x0000 +// 0xFFFF 0x8000 +// +// A nice (and fast) way to implement conversions is by using 64 bit values, which are +// native CPU word size in most today architectures. +// In CMYK, internal Photoshop format comes inverted, and this inversion happens after +// the resizing, so values 32769 to 65535 are never used in PhotoShop. + +//--------------------------------------------------------------------------------- + +// This macro converts 16 bits to 15 bits by using a 64 bits value +cmsINLINE cmsUInt16Number From16To15(cmsUInt16Number x16) +{ + cmsUInt64Number r64 = (((cmsUInt64Number)x16 << 15)) / 0xFFFFL; + return (cmsUInt16Number)r64; +} + +// This macro converts 15 bits to 16 bits by using a 64 bit value. 
It is based in fixed 1.15 math +cmsINLINE cmsUInt16Number From15To16(cmsUInt16Number x15) +{ + cmsUInt64Number r64 = ((cmsUInt64Number) x15 * 0xFFFF + 0x4000L) >> 15; + return (cmsUInt16Number)r64; +} + +// Specialized 1-channel formatters +static +cmsUInt8Number* Unroll15bitsGray(register struct _cmstransform_struct* CMMcargo, + register cmsUInt16Number Values[], + register cmsUInt8Number* Buffer, + register cmsUInt32Number Stride) +{ + UNUSED_PARAMETER(CMMcargo); + UNUSED_PARAMETER(Stride); + + Values[0] = From15To16(*(cmsUInt16Number*)Buffer); + + return Buffer + 2; +} + + +static +cmsUInt8Number* Pack15bitsGray(register struct _cmstransform_struct* CMMcargo, + register cmsUInt16Number Values[], + register cmsUInt8Number* Buffer, + register cmsUInt32Number Stride) +{ + UNUSED_PARAMETER(CMMcargo); + UNUSED_PARAMETER(Stride); + + *(cmsUInt16Number*)Buffer = From16To15(Values[0]); + return Buffer + 2; +} + +// Specialized 3-channels formatters +static +cmsUInt8Number* Unroll15bitsRGB(register struct _cmstransform_struct* CMMcargo, + register cmsUInt16Number Values[], + register cmsUInt8Number* Buffer, + register cmsUInt32Number Stride) +{ + UNUSED_PARAMETER(CMMcargo); + UNUSED_PARAMETER(Stride); + + Values[0] = From15To16(*(cmsUInt16Number*)Buffer); + Buffer += 2; + Values[1] = From15To16(*(cmsUInt16Number*)Buffer); + Buffer += 2; + Values[2] = From15To16(*(cmsUInt16Number*)Buffer); + + return Buffer + 2; +} + + +static +cmsUInt8Number* Pack15bitsRGB(register struct _cmstransform_struct* CMMcargo, + register cmsUInt16Number Values[], + register cmsUInt8Number* Buffer, + register cmsUInt32Number Stride) +{ + UNUSED_PARAMETER(CMMcargo); + UNUSED_PARAMETER(Stride); + + *(cmsUInt16Number*)Buffer = From16To15(Values[0]); + Buffer += 2; + *(cmsUInt16Number*)Buffer = From16To15(Values[1]); + Buffer += 2; + *(cmsUInt16Number*)Buffer = From16To15(Values[2]); + + return Buffer + 2; +} + + +static +cmsUInt8Number* Unroll15bitsRGBA(register struct _cmstransform_struct* 
CMMcargo, + register cmsUInt16Number Values[], + register cmsUInt8Number* Buffer, + register cmsUInt32Number Stride) +{ + UNUSED_PARAMETER(CMMcargo); + UNUSED_PARAMETER(Stride); + + Values[0] = From15To16(*(cmsUInt16Number*)Buffer); + Buffer += 2; + Values[1] = From15To16(*(cmsUInt16Number*)Buffer); + Buffer += 2; + Values[2] = From15To16(*(cmsUInt16Number*)Buffer); + + return Buffer + 4; +} + + +static +cmsUInt8Number* Pack15bitsRGBA(register struct _cmstransform_struct* CMMcargo, + register cmsUInt16Number Values[], + register cmsUInt8Number* Buffer, + register cmsUInt32Number Stride) +{ + UNUSED_PARAMETER(CMMcargo); + UNUSED_PARAMETER(Stride); + + *(cmsUInt16Number*)Buffer = From16To15(Values[0]); + Buffer += 2; + *(cmsUInt16Number*)Buffer = From16To15(Values[1]); + Buffer += 2; + *(cmsUInt16Number*)Buffer = From16To15(Values[2]); + + return Buffer + 4; +} + + +// Specialized 3 channels reversed formatters +static +cmsUInt8Number* Unroll15bitsBGR(register struct _cmstransform_struct* CMMcargo, + register cmsUInt16Number Values[], + register cmsUInt8Number* Buffer, + register cmsUInt32Number Stride) +{ + UNUSED_PARAMETER(CMMcargo); + UNUSED_PARAMETER(Stride); + + Values[2] = From15To16(*(cmsUInt16Number*)Buffer); + Buffer += 2; + Values[1] = From15To16(*(cmsUInt16Number*)Buffer); + Buffer += 2; + Values[0] = From15To16(*(cmsUInt16Number*)Buffer); + + return Buffer + 2; +} + + +static +cmsUInt8Number* Pack15bitsBGR(register struct _cmstransform_struct* CMMcargo, + register cmsUInt16Number Values[], + register cmsUInt8Number* Buffer, + register cmsUInt32Number Stride) +{ + UNUSED_PARAMETER(CMMcargo); + UNUSED_PARAMETER(Stride); + + *(cmsUInt16Number*)Buffer = From16To15(Values[2]); + Buffer += 2; + *(cmsUInt16Number*)Buffer = From16To15(Values[1]); + Buffer += 2; + *(cmsUInt16Number*)Buffer = From16To15(Values[0]); + + return Buffer+2; +} + +// Specialized 4 channels CMYK formatters. 
Note Photoshop stores CMYK reversed +static +cmsUInt8Number* Unroll15bitsCMYK(register struct _cmstransform_struct* CMMcargo, + register cmsUInt16Number Values[], + register cmsUInt8Number* Buffer, + register cmsUInt32Number Stride) +{ + UNUSED_PARAMETER(CMMcargo); + UNUSED_PARAMETER(Stride); + + Values[0] = From15To16(0x8000 - *(cmsUInt16Number*)Buffer); + Buffer += 2; + Values[1] = From15To16(0x8000 - *(cmsUInt16Number*)Buffer); + Buffer += 2; + Values[2] = From15To16(0x8000 - *(cmsUInt16Number*)Buffer); + Buffer += 2; + Values[3] = From15To16(0x8000 - *(cmsUInt16Number*)Buffer); + + return Buffer + 2; +} + +static +cmsUInt8Number* Pack15bitsCMYK(register struct _cmstransform_struct* CMMcargo, + register cmsUInt16Number Values[], + register cmsUInt8Number* Buffer, + register cmsUInt32Number Stride) +{ + UNUSED_PARAMETER(CMMcargo); + UNUSED_PARAMETER(Stride); + + *(cmsUInt16Number*)Buffer = 0x8000U - From16To15(Values[0]); + Buffer += 2; + *(cmsUInt16Number*)Buffer = 0x8000U - From16To15(Values[1]); + Buffer += 2; + *(cmsUInt16Number*)Buffer = 0x8000U - From16To15(Values[2]); + Buffer += 2; + *(cmsUInt16Number*)Buffer = 0x8000U - From16To15(Values[3]); + + return Buffer + 2; +} + + +// This macros does all handling for fallthrough cases +cmsINLINE cmsUInt16Number UnrollOne(cmsUInt16Number x, cmsBool Reverse, cmsBool SwapEndian) +{ + if (SwapEndian) + x = (x << 8) | (x >> 8); + + if (Reverse) + x = 0xffff - x; + + return From15To16(x); +} + +cmsINLINE cmsUInt16Number PackOne(cmsUInt16Number x, cmsBool Reverse, cmsBool SwapEndian) +{ + x = From16To15(x); + + if (Reverse) + x = 0xffff - x; + + if (SwapEndian) + x = (x << 8) | (x >> 8); + + return x; +} + +// Generic planar support +static +cmsUInt8Number* Unroll15bitsPlanar(register struct _cmstransform_struct* CMMcargo, + register cmsUInt16Number wIn[], + register cmsUInt8Number* accum, + register cmsUInt32Number Stride) +{ + _xform_head* head = (_xform_head*) CMMcargo; + int nChan = T_CHANNELS(head->InputFormat); + 
int DoSwap = T_DOSWAP(head->InputFormat); + int Reverse = T_FLAVOR(head->InputFormat); + int SwapEndian = T_ENDIAN16(head->InputFormat); + int i; + cmsUInt8Number* Init = accum; + + UNUSED_PARAMETER(Stride); + + if (DoSwap) { + accum += T_EXTRA(head->InputFormat) * Stride * 2; + } + + for (i = 0; i < nChan; i++) { + + int index = DoSwap ? (nChan - i - 1) : i; + + wIn[index] = UnrollOne(*(cmsUInt16Number*)accum, Reverse, SwapEndian); + + accum += Stride * 2; + } + + return (Init + 2); +} + + +static +cmsUInt8Number* Pack15bitsPlanar(register struct _cmstransform_struct* CMMcargo, + register cmsUInt16Number wOut[], + register cmsUInt8Number* output, + register cmsUInt32Number Stride) +{ + _xform_head* head = (_xform_head*)CMMcargo; + int nChan = T_CHANNELS(head->OutputFormat); + int DoSwap = T_DOSWAP(head->OutputFormat); + int Reverse = T_FLAVOR(head->OutputFormat); + int SwapEndian = T_ENDIAN16(head->OutputFormat); + register int i; + cmsUInt8Number* Init = output; + + + if (DoSwap) { + output += T_EXTRA(head->OutputFormat) * Stride * 2; + } + + for (i = 0; i < nChan; i++) { + + int index = DoSwap ? (nChan - i - 1) : i; + + *(cmsUInt16Number*)output = PackOne(wOut[index], Reverse, SwapEndian); + output += (Stride * sizeof(cmsUInt16Number)); + } + + return (Init + sizeof(cmsUInt16Number)); +} + + + +// Generic falltrough +static +cmsUInt8Number* Unroll15bitsChunky(register struct _cmstransform_struct* CMMcargo, + register cmsUInt16Number Values[], + register cmsUInt8Number* Buffer, + register cmsUInt32Number Stride) +{ + _xform_head* head = (_xform_head*) CMMcargo; + + int nChan = T_CHANNELS(head->InputFormat); + int DoSwap = T_DOSWAP(head->InputFormat); + int Reverse = T_FLAVOR(head->InputFormat); + int SwapEndian = T_ENDIAN16(head->InputFormat); + + register int i; + + UNUSED_PARAMETER(Stride); + + if (DoSwap) { + Buffer += T_EXTRA(head->OutputFormat) * 2; + } + + for (i = 0; i < nChan; i++) { + + int index = DoSwap ? 
(nChan - i - 1) : i; + + Values[index] = UnrollOne(*(cmsUInt16Number*)Buffer, Reverse, SwapEndian); + + Buffer += 2; + } + + + return Buffer; +} + + +static +cmsUInt8Number* Pack15bitsChunky(register struct _cmstransform_struct* CMMcargo, + register cmsUInt16Number Values[], + register cmsUInt8Number* Buffer, + register cmsUInt32Number Stride) +{ + _xform_head* head = (_xform_head*)CMMcargo; + + int nChan = T_CHANNELS(head->OutputFormat); + int DoSwap = T_DOSWAP(head->OutputFormat); + int Reverse = T_FLAVOR(head->OutputFormat); + int SwapEndian = T_ENDIAN16(head->OutputFormat); + + register int i; + + UNUSED_PARAMETER(Stride); + + if (DoSwap) { + Buffer += T_EXTRA(head->OutputFormat) * 2; + } + + for (i = 0; i < nChan; i++) { + + int index = DoSwap ? (nChan - i - 1) : i; + + *(cmsUInt16Number*)Buffer = PackOne(Values[index], Reverse, SwapEndian); + + Buffer += 2; + } + + return Buffer; +} + + + +// Generic N-bytes plus dither 16-to-8 conversion. +static int err[cmsMAXCHANNELS]; + +static +cmsUInt8Number* PackNBytesDither(register struct _cmstransform_struct* CMMcargo, + register cmsUInt16Number Values[], + register cmsUInt8Number* Buffer, + register cmsUInt32Number Stride) +{ + _xform_head* info = (_xform_head*)CMMcargo; + + int nChan = T_CHANNELS(info->OutputFormat); + register int i; + unsigned int n, pe, pf; + + UNUSED_PARAMETER(Stride); + + for (i = 0; i < nChan; i++) { + + n = Values[i] + err[i]; // Value + + pe = (n / 257); // Whole part + pf = (n % 257); // Fractional part + + err[i] = pf; // Store it for next pixel + + *Buffer++ = (cmsUInt8Number) pe; + } + + return Buffer + T_EXTRA(info->OutputFormat); +} + + +static +cmsUInt8Number* PackNBytesSwapDither(register struct _cmstransform_struct* CMMcargo, + register cmsUInt16Number Values[], + register cmsUInt8Number* Buffer, + register cmsUInt32Number Stride) +{ + _xform_head* info = (_xform_head*)CMMcargo; + + int nChan = T_CHANNELS(info->OutputFormat); + register int i; + unsigned int n, pe, pf; + + 
UNUSED_PARAMETER(Stride); + + for (i = nChan - 1; i >= 0; --i) { + + n = Values[i] + err[i]; // Value + + pe = (n / 257); // Whole part + pf = (n % 257); // Fractional part + + err[i] = pf; // Store it for next pixel + + *Buffer++ = (cmsUInt8Number)pe; + } + + + return Buffer + T_EXTRA(info->OutputFormat); +} + + +// The factory for 15 bits. This function returns a pointer to specialized function +// that would deal with the asked format. It return a pointer to NULL if the format +// is not supported. This is tha basis of formatter plug-in for 15 bit formats. +cmsFormatter Formatter_15Bit_Factory(cmsUInt32Number Type, + cmsFormatterDirection Dir, + cmsUInt32Number dwFlags) +{ + cmsFormatter Result = { NULL }; + + UNUSED_PARAMETER(dwFlags); + + switch (Type) { + + // Simple Gray + case TYPE_GRAY_15: + Result.Fmt16 = (Dir == cmsFormatterInput) ? Unroll15bitsGray : Pack15bitsGray; + break; + + // 3 channels + case TYPE_CMY_15: + case TYPE_RGB_15: + Result.Fmt16 = (Dir == cmsFormatterInput) ? Unroll15bitsRGB : Pack15bitsRGB; + break; + + // 3 channels reversed + case TYPE_YMC_15: + case TYPE_BGR_15: + Result.Fmt16 = (Dir == cmsFormatterInput) ? Unroll15bitsBGR : Pack15bitsBGR; + break; + + // 3 Channels plus one alpha + case TYPE_RGBA_15: + Result.Fmt16 = (Dir == cmsFormatterInput) ? Unroll15bitsRGBA : Pack15bitsRGBA; + break; + + // 4 channels + case TYPE_CMYK_15: + Result.Fmt16 = (Dir == cmsFormatterInput) ? Unroll15bitsCMYK : Pack15bitsCMYK; + break; + + // Planar versions + case TYPE_GRAYA_15_PLANAR: + case TYPE_RGB_15_PLANAR: + case TYPE_BGR_15_PLANAR: + case TYPE_RGBA_15_PLANAR: + case TYPE_ABGR_15_PLANAR: + case TYPE_CMY_15_PLANAR: + case TYPE_CMYK_15_PLANAR: + Result.Fmt16 = (Dir == cmsFormatterInput) ? 
Unroll15bitsPlanar : Pack15bitsPlanar; + break; + + // Falltrough for remaining (corner) cases + case TYPE_GRAY_15_REV: + case TYPE_GRAY_15_SE: + case TYPE_GRAYA_15: + case TYPE_GRAYA_15_SE: + case TYPE_RGB_15_SE: + case TYPE_BGR_15_SE: + case TYPE_RGBA_15_SE: + case TYPE_ARGB_15: + case TYPE_ABGR_15: + case TYPE_ABGR_15_SE: + case TYPE_BGRA_15: + case TYPE_BGRA_15_SE: + case TYPE_CMY_15_SE: + case TYPE_CMYK_15_REV: + case TYPE_CMYK_15_SE: + case TYPE_KYMC_15: + case TYPE_KYMC_15_SE: + case TYPE_KCMY_15: + case TYPE_KCMY_15_REV: + case TYPE_KCMY_15_SE: + Result.Fmt16 = (Dir == cmsFormatterInput) ? Unroll15bitsChunky : Pack15bitsChunky; + break; + + case TYPE_GRAY_8_DITHER: + case TYPE_RGB_8_DITHER: + case TYPE_RGBA_8_DITHER: + case TYPE_CMYK_8_DITHER: + if (Dir == cmsFormatterOutput) { + Result.Fmt16 = PackNBytesDither; + } + break; + + case TYPE_ABGR_8_DITHER: + case TYPE_BGR_8_DITHER: + case TYPE_KYMC_8_DITHER: + if (Dir == cmsFormatterOutput) { + Result.Fmt16 = PackNBytesSwapDither; + } + break; + + default:; + } + + return Result; +} + + + diff --git a/plugins/fast_float/src/fast_float_15mats.c b/plugins/fast_float/src/fast_float_15mats.c new file mode 100644 index 0000000..3e5d29d --- /dev/null +++ b/plugins/fast_float/src/fast_float_15mats.c @@ -0,0 +1,353 @@ +//--------------------------------------------------------------------------------- +// +// Little Color Management System, fast floating point extensions +// Copyright (c) 1998-2020 Marti Maria Saguer, all rights reserved +// +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . +// +//--------------------------------------------------------------------------------- + +// Optimization for matrix-shaper in 15 bits. Numbers are operated in 1.15 usigned, + +#include "fast_float_internal.h" + +// An storage capable to keep 1.15 signed and some extra precission. +// Actually I use 32 bits integer (signed) +typedef cmsInt32Number cmsS1Fixed15Number; + +// Conversion to fixed. Note we don't use floor to get proper sign roundoff +#define DOUBLE_TO_1FIXED15(x) ((cmsS1Fixed15Number) ((double) (x) * 0x8000 + 0.5)) + +// This is the private data container used by this optimization +typedef struct { + + cmsS1Fixed15Number Mat[3][3]; + cmsS1Fixed15Number Off[3]; + + // Precalculated tables for first shaper (375 Kb in total of both shapers) + cmsUInt16Number Shaper1R[MAX_NODES_IN_CURVE]; + cmsUInt16Number Shaper1G[MAX_NODES_IN_CURVE]; + cmsUInt16Number Shaper1B[MAX_NODES_IN_CURVE]; + + // Second shaper + cmsUInt16Number Shaper2R[MAX_NODES_IN_CURVE]; + cmsUInt16Number Shaper2G[MAX_NODES_IN_CURVE]; + cmsUInt16Number Shaper2B[MAX_NODES_IN_CURVE]; + + // A flag for fast operation if identity + cmsBool IdentityMat; + + // The context + cmsContext ContextID; + + // Poits to the raw, unaligned memory + void * real_ptr; + + +} XMatShaperData; + +// A special malloc that returns memory aligned to DWORD boundary. 
Aligned memory access is way faster than unaligned +// reference to the real block is kept for later free +static XMatShaperData* malloc_aligned(cmsContext ContextID) +{ + cmsUInt8Number* real_ptr = (cmsUInt8Number*)_cmsMallocZero(ContextID, sizeof(XMatShaperData) + 32); + cmsUInt8Number* aligned = (cmsUInt8Number*)(((uintptr_t)real_ptr + 16) & ~0xf); + XMatShaperData* p = (XMatShaperData*)aligned; + + p->real_ptr = real_ptr; + p->ContextID = ContextID; + return p; +} + + +// Free the private data container +static +void FreeMatShaper(cmsContext ContextID, void* Data) +{ + + XMatShaperData* p = (XMatShaperData*)Data; + if (p != NULL) + _cmsFree(ContextID, p->real_ptr); +} + + +// This table converts from 8 bits to 1.14 after applying the curve +static +void FillShaper(cmsUInt16Number* Table, cmsToneCurve* Curve) +{ + int i; + cmsFloat32Number R, y; + + for (i = 0; i < MAX_NODES_IN_CURVE; i++) { + + R = (cmsFloat32Number)i / (cmsFloat32Number) (MAX_NODES_IN_CURVE - 1); + y = cmsEvalToneCurveFloat(Curve, R); + + Table[i] = (cmsUInt16Number) DOUBLE_TO_1FIXED15(y); + } +} + + +// Compute the matrix-shaper structure +static +XMatShaperData* SetMatShaper(cmsContext ContextID, cmsToneCurve* Curve1[3], cmsMAT3* Mat, cmsVEC3* Off, cmsToneCurve* Curve2[3], cmsBool IdentityMat) +{ + XMatShaperData* p; + int i, j; + + // Allocate a big chuck of memory to store precomputed tables + p = malloc_aligned(ContextID); + if (p == NULL) return FALSE; + + p->ContextID = ContextID; + + p->IdentityMat = IdentityMat; + + // Precompute tables + FillShaper(p->Shaper1R, Curve1[0]); + FillShaper(p->Shaper1G, Curve1[1]); + FillShaper(p->Shaper1B, Curve1[2]); + + FillShaper(p->Shaper2R, Curve2[0]); + FillShaper(p->Shaper2G, Curve2[1]); + FillShaper(p->Shaper2B, Curve2[2]); + + // Convert matrix to nFixed14. 
Note that those values may take more than 16 bits if negative + for (i = 0; i < 3; i++) { + for (j = 0; j < 3; j++) { + + p->Mat[i][j] = DOUBLE_TO_1FIXED15(Mat->v[i].n[j]); + } + } + + + for (i = 0; i < 3; i++) { + + if (Off == NULL) { + + p->Off[i] = 0x4000; + + } + else { + p->Off[i] = DOUBLE_TO_1FIXED15(Off->n[i]) + 0x4000; + + } + } + + + return p; +} + +// A fast matrix-shaper evaluator for 15 bits. This is a bit ticky since I'm using 1.15 signed fixed point. +static +void MatShaperXform(struct _cmstransform_struct *CMMcargo, + const void* Input, + void* Output, + cmsUInt32Number PixelsPerLine, + cmsUInt32Number LineCount, + const cmsStride* Stride) +{ + XMatShaperData* p = (XMatShaperData*)_cmsGetTransformUserData(CMMcargo); + + cmsS1Fixed15Number l1, l2, l3; + + cmsS1Fixed15Number r, g, b; + cmsUInt32Number ri, gi, bi; + cmsUInt32Number i, ii; + cmsUInt32Number SourceStartingOrder[cmsMAXCHANNELS]; + cmsUInt32Number SourceIncrements[cmsMAXCHANNELS]; + cmsUInt32Number DestStartingOrder[cmsMAXCHANNELS]; + cmsUInt32Number DestIncrements[cmsMAXCHANNELS]; + + const cmsUInt8Number* rin; + const cmsUInt8Number* gin; + const cmsUInt8Number* bin; + const cmsUInt8Number* ain = NULL; + + cmsUInt8Number* rout; + cmsUInt8Number* gout; + cmsUInt8Number* bout; + cmsUInt8Number* aout = NULL; + + cmsUInt32Number nalpha, strideIn, strideOut; + + _cmsComputeComponentIncrements(cmsGetTransformInputFormat((cmsHTRANSFORM)CMMcargo), Stride->BytesPerPlaneIn, NULL, &nalpha, SourceStartingOrder, SourceIncrements); + _cmsComputeComponentIncrements(cmsGetTransformOutputFormat((cmsHTRANSFORM)CMMcargo), Stride->BytesPerPlaneOut, NULL, &nalpha, DestStartingOrder, DestIncrements); + + strideIn = strideOut = 0; + for (i = 0; i < LineCount; i++) { + + rin = (const cmsUInt8Number*)Input + SourceStartingOrder[0] + strideIn; + gin = (const cmsUInt8Number*)Input + SourceStartingOrder[1] + strideIn; + bin = (const cmsUInt8Number*)Input + SourceStartingOrder[2] + strideIn; + if (nalpha) + ain = 
(const cmsUInt8Number*)Input + SourceStartingOrder[3] + strideIn; + + + rout = (cmsUInt8Number*)Output + DestStartingOrder[0] + strideOut; + gout = (cmsUInt8Number*)Output + DestStartingOrder[1] + strideOut; + bout = (cmsUInt8Number*)Output + DestStartingOrder[2] + strideOut; + if (nalpha) + aout = (cmsUInt8Number*)Output + DestStartingOrder[3] + strideOut; + + + for (ii = 0; ii < PixelsPerLine; ii++) { + + // Across first shaper, which also converts to 1.15 fixed point. + r = p->Shaper1R[*(cmsUInt16Number*)rin]; + g = p->Shaper1G[*(cmsUInt16Number*)gin]; + b = p->Shaper1B[*(cmsUInt16Number*)bin]; + + if (p->IdentityMat) + { + l1 = r; l2 = g; l3 = b; + } + else + { + // Evaluate the matrix in 1.14 fixed point + l1 = (p->Mat[0][0] * r + p->Mat[0][1] * g + p->Mat[0][2] * b + p->Off[0]) >> 15; + l2 = (p->Mat[1][0] * r + p->Mat[1][1] * g + p->Mat[1][2] * b + p->Off[1]) >> 15; + l3 = (p->Mat[2][0] * r + p->Mat[2][1] * g + p->Mat[2][2] * b + p->Off[2]) >> 15; + } + + // Now we have to clip to 0..1.0 range + ri = (l1 < 0) ? 0 : ((l1 > 0x8000) ? 0x8000 : l1); + gi = (l2 < 0) ? 0 : ((l2 > 0x8000) ? 0x8000 : l2); + bi = (l3 < 0) ? 0 : ((l3 > 0x8000) ? 
0x8000 : l3); + + + // And across second shaper, + *(cmsUInt16Number*)rout = p->Shaper2R[ri]; + *(cmsUInt16Number*)gout = p->Shaper2G[gi]; + *(cmsUInt16Number*)bout = p->Shaper2B[bi]; + + + // Handle alpha + if (ain) { + memmove(aout, ain, 2); + } + + rin += SourceIncrements[0]; + gin += SourceIncrements[1]; + bin += SourceIncrements[2]; + if (ain) ain += SourceIncrements[3]; + + rout += DestIncrements[0]; + gout += DestIncrements[1]; + bout += DestIncrements[2]; + if (aout) aout += DestIncrements[3]; + } + + strideIn += Stride->BytesPerLineIn; + strideOut += Stride->BytesPerLineOut; + } +} + + + +// 15 bits on input allows matrix-shaper boost up a little bit +cmsBool OptimizeMatrixShaper15(_cmsTransformFn* TransformFn, + void** UserData, + _cmsFreeUserDataFn* FreeUserData, + cmsPipeline** Lut, + cmsUInt32Number* InputFormat, + cmsUInt32Number* OutputFormat, + cmsUInt32Number* dwFlags) +{ + cmsStage* Curve1, *Curve2; + cmsStage* Matrix1, *Matrix2; + _cmsStageMatrixData* Data1; + _cmsStageMatrixData* Data2; + cmsMAT3 res; + cmsBool IdentityMat = FALSE; + cmsPipeline* Dest, *Src; + cmsContext ContextID; + cmsUInt32Number nChans; + + // Only works on RGB to RGB and gray + + if (!(T_CHANNELS(*InputFormat) == 3 && T_CHANNELS(*OutputFormat) == 3)) return FALSE; + + // Only works on 15 bit to 15 bit + if (T_BYTES(*InputFormat) != 2 || T_BYTES(*OutputFormat) != 2 || + T_BIT15(*InputFormat) == 0 || T_BIT15(*OutputFormat) == 0) return FALSE; + + // Seems suitable, proceed + Src = *Lut; + + // Check for shaper-matrix-matrix-shaper structure, that is what this optimizer stands for + if (!cmsPipelineCheckAndRetreiveStages(Src, 4, + cmsSigCurveSetElemType, cmsSigMatrixElemType, cmsSigMatrixElemType, cmsSigCurveSetElemType, + &Curve1, &Matrix1, &Matrix2, &Curve2)) return FALSE; + + ContextID = cmsGetPipelineContextID(Src); + nChans = T_CHANNELS(*InputFormat); + + // Get both matrices, which are 3x3 + Data1 = (_cmsStageMatrixData*)cmsStageData(Matrix1); + Data2 = 
(_cmsStageMatrixData*)cmsStageData(Matrix2); + + // Input offset should be zero + if (Data1->Offset != NULL) return FALSE; + + // Multiply both matrices to get the result + _cmsMAT3per(&res, (cmsMAT3*)Data2->Double, (cmsMAT3*)Data1->Double); + + // Now the result is in res + Data2 -> Offset. Maybe is a plain identity? + IdentityMat = FALSE; + if (_cmsMAT3isIdentity(&res) && Data2->Offset == NULL) { + + // We can get rid of full matrix + IdentityMat = TRUE; + } + + + // Allocate an empty LUT + Dest = cmsPipelineAlloc(ContextID, nChans, nChans); + if (!Dest) return FALSE; + + // Assamble the new LUT + cmsPipelineInsertStage(Dest, cmsAT_BEGIN, cmsStageDup(Curve1)); + + if (!IdentityMat) { + + cmsPipelineInsertStage(Dest, cmsAT_END, + cmsStageAllocMatrix(ContextID, 3, 3, (const cmsFloat64Number*)&res, Data2->Offset)); + } + + cmsPipelineInsertStage(Dest, cmsAT_END, cmsStageDup(Curve2)); + + { + _cmsStageToneCurvesData* mpeC1 = (_cmsStageToneCurvesData*)cmsStageData(Curve1); + _cmsStageToneCurvesData* mpeC2 = (_cmsStageToneCurvesData*)cmsStageData(Curve2); + + // In this particular optimization, caché does not help as it takes more time to deal with + // the caché that with the pixel handling + *dwFlags |= cmsFLAGS_NOCACHE; + + // Setup the optimizarion routines + *UserData = SetMatShaper(ContextID, mpeC1->TheCurves, &res, (cmsVEC3*)Data2->Offset, mpeC2->TheCurves, IdentityMat); + *FreeUserData = FreeMatShaper; + + *TransformFn = (_cmsTransformFn)MatShaperXform; + } + + + cmsPipelineFree(Src); + *Lut = Dest; + return TRUE; +} + + diff --git a/plugins/fast_float/src/fast_float_cmyk.c b/plugins/fast_float/src/fast_float_cmyk.c new file mode 100644 index 0000000..bd35b96 --- /dev/null +++ b/plugins/fast_float/src/fast_float_cmyk.c @@ -0,0 +1,382 @@ +//--------------------------------------------------------------------------------- +// +// Little Color Management System, fast floating point extensions +// Copyright (c) 1998-2020 Marti Maria Saguer, all rights reserved +// 
+// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . +// +//--------------------------------------------------------------------------------- + +#include "fast_float_internal.h" + +// Optimization for floating point tetrahedral interpolation +typedef struct { + + cmsContext ContextID; + const cmsInterpParams* p; // Tetrahedrical interpolation parameters. This is a not-owned pointer. + +} FloatCMYKData; + + + +// Precomputes tables on input devicelink. +static +FloatCMYKData* FloatCMYKAlloc(cmsContext ContextID, const cmsInterpParams* p) +{ + FloatCMYKData* fd; + + fd = (FloatCMYKData*) _cmsMallocZero(ContextID, sizeof(FloatCMYKData)); + if (fd == NULL) return NULL; + + fd ->ContextID = ContextID; + fd ->p = p; + return fd; +} + + +static +int XFormSampler(register const cmsFloat32Number In[], register cmsFloat32Number Out[], register void* Cargo) +{ + // Evaluate in 16 bits + cmsPipelineEvalFloat(In, Out, (cmsPipeline*) Cargo); + + // Always succeed + return TRUE; +} + +cmsINLINE cmsFloat32Number LinearInterpInt(cmsFloat32Number a, cmsFloat32Number l, cmsFloat32Number h) +{ + return (h - l) * a + l; +} + +// To prevent out of bounds indexing +cmsINLINE cmsFloat32Number fclamp100(cmsFloat32Number v) +{ + return v < 0.0f ? 0.0f : (v > 100.0f ? 100.0f : v); +} + + +// A optimized interpolation for 8-bit input. 
+#define DENS(i,j,k) (LutTable[(i)+(j)+(k)+OutChan]) + +static +void FloatCMYKCLUTEval(struct _cmstransform_struct *CMMcargo, + const cmsFloat32Number* Input, + cmsFloat32Number* Output, + cmsUInt32Number len, + cmsUInt32Number Stride) +{ + + cmsFloat32Number c, m, y, k; + cmsFloat32Number px, py, pz, pk; + int x0, y0, z0, k0; + int X0, Y0, Z0, K0, X1, Y1, Z1, K1; + cmsFloat32Number rx, ry, rz, rk; + cmsFloat32Number c0, c1 = 0, c2 = 0, c3 = 0; + + cmsUInt32Number OutChan; + FloatCMYKData* p8 = (FloatCMYKData*) _cmsGetTransformUserData(CMMcargo); + + const cmsInterpParams* p = p8 ->p; + cmsUInt32Number TotalOut = p -> nOutputs; + const cmsFloat32Number* LutTable = (const cmsFloat32Number*)p->Table; + cmsUInt32Number ii; + const cmsUInt8Number* cin; + const cmsUInt8Number* min; + const cmsUInt8Number* yin; + const cmsUInt8Number* kin; + + cmsFloat32Number Tmp1[cmsMAXCHANNELS], Tmp2[cmsMAXCHANNELS]; + + cmsUInt8Number* out[cmsMAXCHANNELS]; + cmsUInt32Number SourceStartingOrder[cmsMAXCHANNELS]; + cmsUInt32Number SourceIncrements[cmsMAXCHANNELS]; + cmsUInt32Number DestStartingOrder[cmsMAXCHANNELS]; + cmsUInt32Number DestIncrements[cmsMAXCHANNELS]; + + cmsUInt32Number InputFormat = cmsGetTransformInputFormat((cmsHTRANSFORM) CMMcargo); + cmsUInt32Number OutputFormat = cmsGetTransformOutputFormat((cmsHTRANSFORM) CMMcargo); + + cmsUInt32Number nchans, nalpha; + + _cmsComputeComponentIncrements(InputFormat, Stride, &nchans, &nalpha, SourceStartingOrder, SourceIncrements); + _cmsComputeComponentIncrements(OutputFormat, Stride, &nchans, &nalpha, DestStartingOrder, DestIncrements); + + // SeparateCMYK(InputFormat, Stride, SourceStartingOrder, SourceIncrements); + // SeparateCMYK(OutputFormat, Stride, DestStartingOrder, DestIncrements); + + cin = (const cmsUInt8Number*)Input + SourceStartingOrder[0]; + min = (const cmsUInt8Number*)Input + SourceStartingOrder[1]; + yin = (const cmsUInt8Number*)Input + SourceStartingOrder[2]; + kin = (const cmsUInt8Number*)Input + 
SourceStartingOrder[3]; + + for (ii=0; ii < TotalOut; ii++) + out[ii] = (cmsUInt8Number*)Output + DestStartingOrder[ii]; + + for (ii=0; ii < len; ii++) { + + c = fclamp100(*(cmsFloat32Number*)cin) / 100.0f; + m = fclamp100(*(cmsFloat32Number*)min) / 100.0f; + y = fclamp100(*(cmsFloat32Number*)yin) / 100.0f; + k = fclamp100(*(cmsFloat32Number*)kin) / 100.0f; + + cin += SourceIncrements[0]; + min += SourceIncrements[1]; + yin += SourceIncrements[2]; + kin += SourceIncrements[3]; + + pk = c * p->Domain[0]; // C + px = m * p->Domain[1]; // M + py = y * p->Domain[2]; // Y + pz = k * p->Domain[3]; // K + + + k0 = (int)_cmsQuickFloor(pk); rk = (pk - (cmsFloat32Number)k0); + x0 = (int)_cmsQuickFloor(px); rx = (px - (cmsFloat32Number)x0); + y0 = (int)_cmsQuickFloor(py); ry = (py - (cmsFloat32Number)y0); + z0 = (int)_cmsQuickFloor(pz); rz = (pz - (cmsFloat32Number)z0); + + + K0 = p->opta[3] * k0; + K1 = K0 + (c >= 1.0 ? 0 : p->opta[3]); + + X0 = p->opta[2] * x0; + X1 = X0 + (m >= 1.0 ? 0 : p->opta[2]); + + Y0 = p->opta[1] * y0; + Y1 = Y0 + (y >= 1.0 ? 0 : p->opta[1]); + + Z0 = p->opta[0] * z0; + Z1 = Z0 + (k >= 1.0 ? 
0 : p->opta[0]); + + for (OutChan = 0; OutChan < TotalOut; OutChan++) { + + c0 = DENS(X0, Y0, Z0); + + if (rx >= ry && ry >= rz) { + + c1 = DENS(X1, Y0, Z0) - c0; + c2 = DENS(X1, Y1, Z0) - DENS(X1, Y0, Z0); + c3 = DENS(X1, Y1, Z1) - DENS(X1, Y1, Z0); + + } + else + if (rx >= rz && rz >= ry) { + + c1 = DENS(X1, Y0, Z0) - c0; + c2 = DENS(X1, Y1, Z1) - DENS(X1, Y0, Z1); + c3 = DENS(X1, Y0, Z1) - DENS(X1, Y0, Z0); + + } + else + if (rz >= rx && rx >= ry) { + + c1 = DENS(X1, Y0, Z1) - DENS(X0, Y0, Z1); + c2 = DENS(X1, Y1, Z1) - DENS(X1, Y0, Z1); + c3 = DENS(X0, Y0, Z1) - c0; + + } + else + if (ry >= rx && rx >= rz) { + + c1 = DENS(X1, Y1, Z0) - DENS(X0, Y1, Z0); + c2 = DENS(X0, Y1, Z0) - c0; + c3 = DENS(X1, Y1, Z1) - DENS(X1, Y1, Z0); + + } + else + if (ry >= rz && rz >= rx) { + + c1 = DENS(X1, Y1, Z1) - DENS(X0, Y1, Z1); + c2 = DENS(X0, Y1, Z0) - c0; + c3 = DENS(X0, Y1, Z1) - DENS(X0, Y1, Z0); + + } + else + if (rz >= ry && ry >= rx) { + + c1 = DENS(X1, Y1, Z1) - DENS(X0, Y1, Z1); + c2 = DENS(X0, Y1, Z1) - DENS(X0, Y0, Z1); + c3 = DENS(X0, Y0, Z1) - c0; + + } + else { + c1 = c2 = c3 = 0; + } + + + Tmp1[OutChan] = c0 + c1 * rx + c2 * ry + c3 * rz; + + } + + + LutTable = (cmsFloat32Number*)p->Table; + LutTable += K1; + + for (OutChan = 0; OutChan < p->nOutputs; OutChan++) { + + c0 = DENS(X0, Y0, Z0); + + if (rx >= ry && ry >= rz) { + + c1 = DENS(X1, Y0, Z0) - c0; + c2 = DENS(X1, Y1, Z0) - DENS(X1, Y0, Z0); + c3 = DENS(X1, Y1, Z1) - DENS(X1, Y1, Z0); + + } + else + if (rx >= rz && rz >= ry) { + + c1 = DENS(X1, Y0, Z0) - c0; + c2 = DENS(X1, Y1, Z1) - DENS(X1, Y0, Z1); + c3 = DENS(X1, Y0, Z1) - DENS(X1, Y0, Z0); + + } + else + if (rz >= rx && rx >= ry) { + + c1 = DENS(X1, Y0, Z1) - DENS(X0, Y0, Z1); + c2 = DENS(X1, Y1, Z1) - DENS(X1, Y0, Z1); + c3 = DENS(X0, Y0, Z1) - c0; + + } + else + if (ry >= rx && rx >= rz) { + + c1 = DENS(X1, Y1, Z0) - DENS(X0, Y1, Z0); + c2 = DENS(X0, Y1, Z0) - c0; + c3 = DENS(X1, Y1, Z1) - DENS(X1, Y1, Z0); + + } + else + if (ry >= rz && rz >= rx) { 
+ + c1 = DENS(X1, Y1, Z1) - DENS(X0, Y1, Z1); + c2 = DENS(X0, Y1, Z0) - c0; + c3 = DENS(X0, Y1, Z1) - DENS(X0, Y1, Z0); + + } + else + if (rz >= ry && ry >= rx) { + + c1 = DENS(X1, Y1, Z1) - DENS(X0, Y1, Z1); + c2 = DENS(X0, Y1, Z1) - DENS(X0, Y0, Z1); + c3 = DENS(X0, Y0, Z1) - c0; + + } + else { + c1 = c2 = c3 = 0; + } + + Tmp2[OutChan] = c0 + c1 * rx + c2 * ry + c3 * rz; + } + + + for (OutChan = 0; OutChan < p->nOutputs; OutChan++) { + + *(cmsFloat32Number*)(out[OutChan]) = LinearInterpInt(rk, Tmp1[OutChan], Tmp2[OutChan]); + out[OutChan] += DestIncrements[OutChan]; + } + + + } +} + +#undef DENS + + + +// -------------------------------------------------------------------------------------------------------------- + +cmsBool OptimizeCLUTCMYKTransform(_cmsTransformFn* TransformFn, + void** UserData, + _cmsFreeUserDataFn* FreeDataFn, + cmsPipeline** Lut, + cmsUInt32Number* InputFormat, + cmsUInt32Number* OutputFormat, + cmsUInt32Number* dwFlags) +{ + cmsPipeline* OriginalLut; + int nGridPoints; + cmsPipeline* OptimizedLUT = NULL; + cmsStage* OptimizedCLUTmpe; + cmsColorSpaceSignature OutputColorSpace; + cmsStage* mpe; + FloatCMYKData* p8; + cmsContext ContextID; + _cmsStageCLutData* data; + + // For empty transforms, do nothing + if (*Lut == NULL) return FALSE; + + // This is a loosy optimization! 
does not apply in floating-point cases + if (!T_FLOAT(*InputFormat) || !T_FLOAT(*OutputFormat)) return FALSE; + + // Only on 8-bit + if (T_BYTES(*InputFormat) != 4 || T_BYTES(*OutputFormat) != 4) return FALSE; + + // Only on CMYK + if (T_COLORSPACE(*InputFormat) != PT_CMYK) return FALSE; + + OriginalLut = *Lut; + + // Named color pipelines cannot be optimized either + for (mpe = cmsPipelineGetPtrToFirstStage(OriginalLut); + mpe != NULL; + mpe = cmsStageNext(mpe)) { + if (cmsStageType(mpe) == cmsSigNamedColorElemType) return FALSE; + } + + ContextID = cmsGetPipelineContextID(OriginalLut); + OutputColorSpace = _cmsICCcolorSpace(T_COLORSPACE(*OutputFormat)); + nGridPoints = _cmsReasonableGridpointsByColorspace(cmsSigRgbData, *dwFlags); + + // Create the result LUT + OptimizedLUT = cmsPipelineAlloc(cmsGetPipelineContextID(OriginalLut), 4, cmsPipelineOutputChannels(OriginalLut)); + if (OptimizedLUT == NULL) goto Error; + + + // Allocate the CLUT for result + OptimizedCLUTmpe = cmsStageAllocCLutFloat(ContextID, nGridPoints, 4, cmsPipelineOutputChannels(OriginalLut), NULL); + + // Add the CLUT to the destination LUT + cmsPipelineInsertStage(OptimizedLUT, cmsAT_BEGIN, OptimizedCLUTmpe); + + // Resample the LUT + if (!cmsStageSampleCLutFloat(OptimizedCLUTmpe, XFormSampler, (void*)OriginalLut, 0)) goto Error; + + // Set the evaluator, copy parameters + data = (_cmsStageCLutData*) cmsStageData(OptimizedCLUTmpe); + + p8 = FloatCMYKAlloc(ContextID, data ->Params); + if (p8 == NULL) return FALSE; + + // And return the obtained LUT + cmsPipelineFree(OriginalLut); + + *Lut = OptimizedLUT; + *TransformFn = (_cmsTransformFn) FloatCMYKCLUTEval; + *UserData = p8; + *FreeDataFn = _cmsFree; + + return TRUE; + +Error: + + if (OptimizedLUT != NULL) cmsPipelineFree(OptimizedLUT); + + return FALSE; +} + diff --git a/plugins/fast_float/src/fast_float_curves.c b/plugins/fast_float/src/fast_float_curves.c new file mode 100644 index 0000000..9842d5f --- /dev/null +++ 
b/plugins/fast_float/src/fast_float_curves.c @@ -0,0 +1,378 @@ +//--------------------------------------------------------------------------------- +// +// Little Color Management System, fast floating point extensions +// Copyright (c) 1998-2020 Marti Maria Saguer, all rights reserved +// +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . +// +//--------------------------------------------------------------------------------- + +#include "fast_float_internal.h" + +// Curves, optimization is valid for floating point curves +typedef struct { + + cmsFloat32Number CurveR[MAX_NODES_IN_CURVE]; + cmsFloat32Number CurveG[MAX_NODES_IN_CURVE]; + cmsFloat32Number CurveB[MAX_NODES_IN_CURVE]; + + void* real_ptr; + +} CurvesFloatData; + + + +// A special malloc that returns memory aligned to DWORD boundary. 
Aligned memory access is way faster than unaligned +// reference to the real block is kept for later free +static CurvesFloatData* malloc_aligned(cmsContext ContextID) +{ + cmsUInt8Number* real_ptr = (cmsUInt8Number*)_cmsMallocZero(ContextID, sizeof(CurvesFloatData) + 32); + cmsUInt8Number* aligned = (cmsUInt8Number*)(((uintptr_t)real_ptr + 16) & ~0xf); + CurvesFloatData* p = (CurvesFloatData*)aligned; + + p->real_ptr = real_ptr; + + return p; +} + +// Free the private data container +static void free_aligned(cmsContext ContextID, void* Data) +{ + CurvesFloatData* p = (CurvesFloatData*)Data; + if (p != NULL) + _cmsFree(ContextID, p->real_ptr); +} + +// Evaluator for float curves. This are just 1D tables + +static void FastEvaluateFloatRGBCurves(struct _cmstransform_struct *CMMcargo, + const cmsFloat32Number* Input, + cmsFloat32Number* Output, + cmsUInt32Number len, + cmsUInt32Number Stride) +{ + cmsUInt32Number ii; + cmsUInt32Number SourceStartingOrder[cmsMAXCHANNELS]; + cmsUInt32Number SourceIncrements[cmsMAXCHANNELS]; + cmsUInt32Number DestStartingOrder[cmsMAXCHANNELS]; + cmsUInt32Number DestIncrements[cmsMAXCHANNELS]; + + const cmsUInt8Number* rin; + const cmsUInt8Number* gin; + const cmsUInt8Number* bin; + + cmsUInt8Number* rout; + cmsUInt8Number* gout; + cmsUInt8Number* bout; + + cmsUInt32Number InputFormat = cmsGetTransformInputFormat((cmsHTRANSFORM) CMMcargo); + cmsUInt32Number OutputFormat = cmsGetTransformOutputFormat((cmsHTRANSFORM) CMMcargo); + + CurvesFloatData* Data = (CurvesFloatData*) _cmsGetTransformUserData(CMMcargo); + + cmsUInt32Number nchans, nalpha; + + _cmsComputeComponentIncrements(InputFormat, Stride, &nchans, &nalpha, SourceStartingOrder, SourceIncrements); + _cmsComputeComponentIncrements(OutputFormat, Stride, &nchans, &nalpha, DestStartingOrder, DestIncrements); + + // SeparateRGB(InputFormat, Stride, SourceStartingOrder, SourceIncrements); + // SeparateRGB(OutputFormat, Stride, DestStartingOrder, DestIncrements); + + rin = (const 
cmsUInt8Number*)Input + SourceStartingOrder[0]; + gin = (const cmsUInt8Number*)Input + SourceStartingOrder[1]; + bin = (const cmsUInt8Number*)Input + SourceStartingOrder[2]; + + rout = (cmsUInt8Number*)Output + DestStartingOrder[0]; + gout = (cmsUInt8Number*)Output + DestStartingOrder[1]; + bout = (cmsUInt8Number*)Output + DestStartingOrder[2]; + + for (ii = 0; ii < len; ii++) { + + *(cmsFloat32Number*)rout = flerp(Data->CurveR, *(cmsFloat32Number*)rin); + *(cmsFloat32Number*)gout = flerp(Data->CurveG, *(cmsFloat32Number*)gin); + *(cmsFloat32Number*)bout = flerp(Data->CurveB, *(cmsFloat32Number*)bin); + + rin += SourceIncrements[0]; + gin += SourceIncrements[1]; + bin += SourceIncrements[2]; + + rout += DestIncrements[0]; + gout += DestIncrements[1]; + bout += DestIncrements[2]; + } +} + +// Do nothing but arrange the RGB format. +static void FastFloatRGBIdentity(struct _cmstransform_struct *CMMcargo, + const cmsFloat32Number* Input, + cmsFloat32Number* Output, + cmsUInt32Number len, + cmsUInt32Number Stride) +{ + cmsUInt32Number ii; + cmsUInt32Number SourceStartingOrder[cmsMAXCHANNELS]; + cmsUInt32Number SourceIncrements[cmsMAXCHANNELS]; + cmsUInt32Number DestStartingOrder[cmsMAXCHANNELS]; + cmsUInt32Number DestIncrements[cmsMAXCHANNELS]; + const cmsUInt8Number* rin; + const cmsUInt8Number* gin; + const cmsUInt8Number* bin; + cmsUInt8Number* rout; + cmsUInt8Number* gout; + cmsUInt8Number* bout; + + + cmsUInt32Number InputFormat = cmsGetTransformInputFormat((cmsHTRANSFORM) CMMcargo); + cmsUInt32Number OutputFormat = cmsGetTransformOutputFormat((cmsHTRANSFORM) CMMcargo); + + + cmsUInt32Number nchans, nalpha; + + _cmsComputeComponentIncrements(InputFormat, Stride, &nchans, &nalpha, SourceStartingOrder, SourceIncrements); + _cmsComputeComponentIncrements(OutputFormat, Stride, &nchans, &nalpha, DestStartingOrder, DestIncrements); + + // SeparateRGB(InputFormat, Stride, SourceStartingOrder, SourceIncrements); + // SeparateRGB(OutputFormat, Stride, DestStartingOrder, 
DestIncrements); + + rin = (const cmsUInt8Number*)Input + SourceStartingOrder[0]; + gin = (const cmsUInt8Number*)Input + SourceStartingOrder[1]; + bin = (const cmsUInt8Number*)Input + SourceStartingOrder[2]; + + rout = (cmsUInt8Number*)Output + DestStartingOrder[0]; + gout = (cmsUInt8Number*)Output + DestStartingOrder[1]; + bout = (cmsUInt8Number*)Output + DestStartingOrder[2]; + + for (ii=0; ii < len; ii++) { + + memmove(rout, rin, 4); + memmove(gout, gin, 4); + memmove(bout, bin, 4); + + rin += SourceIncrements[0]; + gin += SourceIncrements[1]; + bin += SourceIncrements[2]; + + rout += DestIncrements[0]; + gout += DestIncrements[1]; + bout += DestIncrements[2]; + } +} + +// Evaluate 1 channel only +static void FastEvaluateFloatGrayCurves(struct _cmstransform_struct *CMMcargo, + const cmsFloat32Number* Input, + cmsFloat32Number* Output, + cmsUInt32Number len, + cmsUInt32Number Stride) +{ + cmsUInt32Number ii; + cmsUInt32Number SourceStartingOrder[cmsMAXCHANNELS]; + cmsUInt32Number SourceIncrements[cmsMAXCHANNELS]; + cmsUInt32Number DestStartingOrder[cmsMAXCHANNELS]; + cmsUInt32Number DestIncrements[cmsMAXCHANNELS]; + const cmsUInt8Number* kin; + cmsUInt8Number* kout; + + cmsUInt32Number InputFormat = cmsGetTransformInputFormat((cmsHTRANSFORM) CMMcargo); + cmsUInt32Number OutputFormat = cmsGetTransformOutputFormat((cmsHTRANSFORM) CMMcargo); + + CurvesFloatData* Data = (CurvesFloatData*) _cmsGetTransformUserData(CMMcargo); + + cmsUInt32Number nchans, nalpha; + + _cmsComputeComponentIncrements(InputFormat, Stride, &nchans, &nalpha, SourceStartingOrder, SourceIncrements); + _cmsComputeComponentIncrements(OutputFormat, Stride, &nchans, &nalpha, DestStartingOrder, DestIncrements); + + // SeparateGray(InputFormat, Stride, &SourceStartingOrder, &SourceIncrement); + // SeparateGray(OutputFormat, Stride, &DestStartingOrder, &DestIncrement); + + kin = (const cmsUInt8Number*)Input + SourceStartingOrder[0]; + kout = (cmsUInt8Number*)Output + DestStartingOrder[0]; + + for (ii = 
0; ii < len; ii++) { + + *(cmsFloat32Number*)kout = flerp(Data->CurveR, *(cmsFloat32Number*)kin); + + kin += SourceIncrements[0]; + kout += DestIncrements[0]; + } +} + + +static void FastFloatGrayIdentity(struct _cmstransform_struct *CMMcargo, + const cmsFloat32Number* Input, + cmsFloat32Number* Output, + cmsUInt32Number len, + cmsUInt32Number Stride) +{ + cmsUInt32Number ii; + cmsUInt32Number SourceStartingOrder[cmsMAXCHANNELS]; + cmsUInt32Number SourceIncrements[cmsMAXCHANNELS]; + cmsUInt32Number DestStartingOrder[cmsMAXCHANNELS]; + cmsUInt32Number DestIncrements[cmsMAXCHANNELS]; + const cmsUInt8Number* kin; + cmsUInt8Number* kout; + + cmsUInt32Number InputFormat = cmsGetTransformInputFormat((cmsHTRANSFORM) CMMcargo); + cmsUInt32Number OutputFormat = cmsGetTransformOutputFormat((cmsHTRANSFORM) CMMcargo); + + cmsUInt32Number nchans, nalpha; + + _cmsComputeComponentIncrements(InputFormat, Stride, &nchans, &nalpha, SourceStartingOrder, SourceIncrements); + _cmsComputeComponentIncrements(OutputFormat, Stride, &nchans, &nalpha, DestStartingOrder, DestIncrements); + + // SeparateGray(InputFormat, Stride, &SourceStartingOrder, &SourceIncrement); + // SeparateGray(OutputFormat, Stride, &DestStartingOrder, &DestIncrement); + + kin = (const cmsUInt8Number*) Input + SourceStartingOrder[0]; + kout = (cmsUInt8Number*)Output + DestStartingOrder[0]; + + for (ii=0; ii < len; ii++) { + + memmove(kout, kin, 4); + + kin += SourceIncrements[0]; + kout += DestIncrements[0]; + } +} + + +#define LINEAR_CURVES_EPSILON 0.00001 + +// Try to see if the curves are linear +static +cmsBool AllRGBCurvesAreLinear(CurvesFloatData* data) +{ + int j; + cmsFloat32Number expected; + + for (j = 0; j < MAX_NODES_IN_CURVE; j++) { + + expected = (cmsFloat32Number)j / (cmsFloat32Number)(MAX_NODES_IN_CURVE - 1); + + if (fabsf(data->CurveR[j] - expected) > LINEAR_CURVES_EPSILON || + fabsf(data->CurveG[j] - expected) > LINEAR_CURVES_EPSILON || + fabsf(data->CurveB[j] - expected) > LINEAR_CURVES_EPSILON) { + 
return FALSE; + } + } + + return TRUE; +} + +static +cmsBool KCurveIsLinear(CurvesFloatData* data) +{ + int j; + cmsFloat32Number expected; + + for (j = 0; j < MAX_NODES_IN_CURVE; j++) { + expected = (cmsFloat32Number)j / (cmsFloat32Number)(MAX_NODES_IN_CURVE - 1); + + if (fabs(data->CurveR[j] - expected) > LINEAR_CURVES_EPSILON) return FALSE; + } + + + return TRUE; +} + + +// Create linearization tables with a reasonable number of entries. Precission is about 32 bits. +static +CurvesFloatData* ComputeCompositeCurves(cmsUInt32Number nChan, cmsPipeline* Src) +{ + cmsUInt32Number i, j; + cmsFloat32Number InFloat[3], OutFloat[3]; + + CurvesFloatData* Data = malloc_aligned(cmsGetPipelineContextID(Src)); + if (Data == NULL) return NULL; + + // Create target curves + for (i = 0; i < MAX_NODES_IN_CURVE; i++) { + + for (j=0; j CurveR[i] = OutFloat[0]; + } + else { + Data->CurveR[i] = OutFloat[0]; + Data->CurveG[i] = OutFloat[1]; + Data->CurveB[i] = OutFloat[2]; + } + + } + + return Data; +} + + +// If the target LUT holds only curves, the optimization procedure is to join all those +// curves together. That only works on curves and does not work on matrices. 
+cmsBool OptimizeFloatByJoiningCurves(_cmsTransformFn* TransformFn, + void** UserData, + _cmsFreeUserDataFn* FreeUserData, + cmsPipeline** Lut, + cmsUInt32Number* InputFormat, + cmsUInt32Number* OutputFormat, + cmsUInt32Number* dwFlags) +{ + + cmsPipeline* Src = *Lut; + cmsStage* mpe; + CurvesFloatData* Data; + cmsUInt32Number nChans; + + // Apply only to floating-point cases + if (!T_FLOAT(*InputFormat) || !T_FLOAT(*OutputFormat)) return FALSE; + + // Only on 8-bit + if (T_BYTES(*InputFormat) != 4 || T_BYTES(*OutputFormat) != 4) return FALSE; + + // Curves need same channels on input and output (despite extra channels may differ) + nChans = T_CHANNELS(*InputFormat); + if (nChans != T_CHANNELS(*OutputFormat)) return FALSE; + + // gray and RGB + if (nChans != 1 && nChans != 3) return FALSE; + + // Only curves in this LUT? + for (mpe = cmsPipelineGetPtrToFirstStage(Src); + mpe != NULL; + mpe = cmsStageNext(mpe)) { + + if (cmsStageType(mpe) != cmsSigCurveSetElemType) return FALSE; + } + + Data = ComputeCompositeCurves(nChans, Src); + + *dwFlags |= cmsFLAGS_NOCACHE; + *UserData = Data; + *FreeUserData = free_aligned; + + // Maybe the curves are linear at the end + if (nChans == 1) + *TransformFn = (_cmsTransformFn) (KCurveIsLinear(Data) ? FastFloatGrayIdentity : FastEvaluateFloatGrayCurves); + else + *TransformFn = (_cmsTransformFn) (AllRGBCurvesAreLinear(Data) ? 
FastFloatRGBIdentity : FastEvaluateFloatRGBCurves); + + return TRUE; + +} + diff --git a/plugins/fast_float/src/fast_float_internal.h b/plugins/fast_float/src/fast_float_internal.h new file mode 100644 index 0000000..fd38d40 --- /dev/null +++ b/plugins/fast_float/src/fast_float_internal.h @@ -0,0 +1,237 @@ +//--------------------------------------------------------------------------------- +// +// Little Color Management System, fast floating point extensions +// Copyright (c) 1998-2020 Marti Maria Saguer, all rights reserved +// +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . +// +//--------------------------------------------------------------------------------- + +#ifndef _FAST_INTERNAL_H +#define _FAST_INTERNAL_H + +#include "lcms2_fast_float.h" +#include + +#define REQUIRED_LCMS_VERSION 2100 + +// Unused parameter warning supression +#define UNUSED_PARAMETER(x) ((void)x) + +// The specification for "inline" is section 6.7.4 of the C99 standard (ISO/IEC 9899:1999). 
+// unfortunately VisualC++ does not conform that
+#if defined(_MSC_VER) || defined(__BORLANDC__)
+# define cmsINLINE __inline
+#else
+# define cmsINLINE static inline
+#endif
+
+// A fast way to convert from/to 16 <-> 8 bits
+// FROM_8_TO_16 replicates the byte into both halves of the word (0xAB -> 0xABAB); note that
+// it expands its argument twice. FROM_16_TO_8 is a rounded multiply-and-shift: 8388608 is
+// 2^23, half of the 2^24 divisor implied by the shift, and 65281 is about 2^24 / 257.
+#define FROM_8_TO_16(rgb) (cmsUInt16Number) ((((cmsUInt16Number) (rgb)) << 8)|(rgb))
+#define FROM_16_TO_8(rgb) (cmsUInt8Number) ((((rgb) * 65281 + 8388608) >> 24) & 0xFF)
+
+
+// This macro returns the word with its two bytes exchanged (big endian <-> little endian).
+// The argument is expanded twice.
+#define CHANGE_ENDIAN(w) (cmsUInt16Number) ((cmsUInt16Number) ((w)<<8)|((w)>>8))
+
+// This macro changes the polarity of a word (x becomes 0xffff - x)
+#define REVERSE_FLAVOR_16(x) ((cmsUInt16Number)(0xffff-(x)))
+
+// Fixed point: integer part (upper bits) and fractional part (low 16 bits) of a value
+#define FIXED_TO_INT(x) ((x)>>16)
+#define FIXED_REST_TO_INT(x) ((x)&0xFFFFU)
+
+// Utility functions to convert between 0..0xffff as integer and 0...1.0 in the 15.16 fixed
+// domain: _cmsToFixedDomain(0xffff) yields 0x10000 and _cmsFromFixedDomain(0x10000) yields 0xffff.
+cmsINLINE cmsS15Fixed16Number _cmsToFixedDomain(int a) { return a + ((a + 0x7fff) / 0xffff); }
+cmsINLINE int _cmsFromFixedDomain(cmsS15Fixed16Number a) { return a - ((a + 0x7fff) >> 16); }
+
+// This is the upper part of internal transform structure. Only format specifiers are used
+// NOTE(review): presumably this has to mirror the first members of the library's own
+// transform structure -- confirm against the lcms2 version in use.
+typedef struct {
+
+    cmsUInt32Number InputFormat, OutputFormat; // Keep formats for further reference
+
+} _xform_head;
+
+
+// Number of nodes of every precomputed float curve (0x8001 = 32769), so that the last
+// valid index is 0x8000.
+#define MAX_NODES_IN_CURVE 0x8001
+
+
+// To prevent out of bounds indexing: limits v to the range 0..1.
+// A NaN fails both comparisons and is therefore returned unchanged.
+cmsINLINE cmsFloat32Number fclamp(cmsFloat32Number v)
+{
+    return v < 0.0f ? 0.0f : (v > 1.0f ? 1.0f : v);
+}
+
+// Fast floor conversion logic.
+// Unless CMS_DONT_USE_FAST_FLOOR is defined, a magic constant (1.5 * 2^36) is added to the
+// value and one 32-bit half of the resulting double is read back through a union and shifted
+// right by 16. CMS_USE_BIG_ENDIAN selects which half holds the low-order bits.
+// NOTE(review): this looks like the usual trick of aligning the mantissa so that the low word
+// holds the value in 16.16 fixed point; it presumably requires IEEE-754 doubles, a 32-bit int
+// and a small-magnitude input -- confirm the valid range before using it elsewhere.
+cmsINLINE int _cmsQuickFloor(cmsFloat64Number val)
+{
+#ifdef CMS_DONT_USE_FAST_FLOOR
+    return (int)floor(val);
+#else
+#define _lcms_double2fixmagic (68719476736.0 * 1.5)
+
+    union {
+        cmsFloat64Number val;
+        int halves[2];
+    } temp;
+
+    temp.val = val + _lcms_double2fixmagic;
+
+#ifdef CMS_USE_BIG_ENDIAN
+    return temp.halves[1] >> 16;
+#else
+    return temp.halves[0] >> 16;
+#endif
+#endif
+}
+
+// Floor to word, taking care of saturation.
This is not critical in terms of performance +cmsINLINE cmsUInt16Number _cmsSaturateWord(cmsFloat64Number d) +{ + d += 0.5; + + if (d <= 0) return 0; + if (d >= 65535.0) return 0xffff; + + return (cmsUInt16Number)floor(d); +} + + +cmsINLINE cmsFloat32Number flerp(const cmsFloat32Number LutTable[], cmsFloat32Number v) +{ + cmsFloat32Number y1, y0; + cmsFloat32Number rest; + int cell0, cell1; + + if (v <= 0.0) { + return LutTable[0]; + } + else + if (v >= 1.0) { + return LutTable[MAX_NODES_IN_CURVE - 1]; + } + + v *= (MAX_NODES_IN_CURVE - 1); + + cell0 = _cmsQuickFloor(v); + cell1 = (int)ceilf(v); + + // Rest is 16 LSB bits + rest = v - cell0; + + y0 = LutTable[cell0]; + y1 = LutTable[cell1]; + + return y0 + (y1 - y0) * rest; +} + + + + +// Some secret sauce from lcms +int _cmsReasonableGridpointsByColorspace(cmsColorSpaceSignature Colorspace, cmsUInt32Number dwFlags); + + + +// Compute the increments to be used by the transform functions +void _cmsComputeComponentIncrements(cmsUInt32Number Format, + cmsUInt32Number BytesPerPlane, + cmsUInt32Number* nChannels, + cmsUInt32Number* nAlpha, + cmsUInt32Number ComponentStartingOrder[], + cmsUInt32Number ComponentPointerIncrements[]); + +// 15 bits formatters +cmsFormatter Formatter_15Bit_Factory(cmsUInt32Number Type, + cmsFormatterDirection Dir, + cmsUInt32Number dwFlags); + +// Optimizers + +// 8 bits on input allows matrix-shaper boost up a little bit +cmsBool Optimize8MatrixShaper(_cmsTransformFn* TransformFn, + void** UserData, + _cmsFreeUserDataFn* FreeUserData, + cmsPipeline** Lut, + cmsUInt32Number* InputFormat, + cmsUInt32Number* OutputFormat, + cmsUInt32Number* dwFlags); + +cmsBool OptimizeMatrixShaper15(_cmsTransformFn* TransformFn, + void** UserData, + _cmsFreeUserDataFn* FreeUserData, + cmsPipeline** Lut, + cmsUInt32Number* InputFormat, + cmsUInt32Number* OutputFormat, + cmsUInt32Number* dwFlags); + + +cmsBool Optimize8ByJoiningCurves(_cmsTransformFn* TransformFn, + void** UserData, + _cmsFreeUserDataFn* 
FreeUserData, + cmsPipeline** Lut, + cmsUInt32Number* InputFormat, + cmsUInt32Number* OutputFormat, + cmsUInt32Number* dwFlags); + +cmsBool OptimizeFloatByJoiningCurves(_cmsTransformFn* TransformFn, + void** UserData, + _cmsFreeUserDataFn* FreeUserData, + cmsPipeline** Lut, + cmsUInt32Number* InputFormat, + cmsUInt32Number* OutputFormat, + cmsUInt32Number* dwFlags); + +cmsBool OptimizeFloatMatrixShaper(_cmsTransformFn* TransformFn, + void** UserData, + _cmsFreeUserDataFn* FreeUserData, + cmsPipeline** Lut, + cmsUInt32Number* InputFormat, + cmsUInt32Number* OutputFormat, + cmsUInt32Number* dwFlags); + +cmsBool Optimize8BitRGBTransform(_cmsTransformFn* TransformFn, + void** UserData, + _cmsFreeUserDataFn* FreeDataFn, + cmsPipeline** Lut, + cmsUInt32Number* InputFormat, + cmsUInt32Number* OutputFormat, + cmsUInt32Number* dwFlags); + +cmsBool Optimize16BitRGBTransform(_cmsTransformFn* TransformFn, + void** UserData, + _cmsFreeUserDataFn* FreeDataFn, + cmsPipeline** Lut, + cmsUInt32Number* InputFormat, + cmsUInt32Number* OutputFormat, + cmsUInt32Number* dwFlags); + +cmsBool OptimizeCLUTRGBTransform(_cmsTransformFn* TransformFn, + void** UserData, + _cmsFreeUserDataFn* FreeDataFn, + cmsPipeline** Lut, + cmsUInt32Number* InputFormat, + cmsUInt32Number* OutputFormat, + cmsUInt32Number* dwFlags); + +cmsBool OptimizeCLUTCMYKTransform(_cmsTransformFn* TransformFn, + void** UserData, + _cmsFreeUserDataFn* FreeDataFn, + cmsPipeline** Lut, + cmsUInt32Number* InputFormat, + cmsUInt32Number* OutputFormat, + cmsUInt32Number* dwFlags); + +#endif diff --git a/plugins/fast_float/src/fast_float_matsh.c b/plugins/fast_float/src/fast_float_matsh.c new file mode 100644 index 0000000..848f665 --- /dev/null +++ b/plugins/fast_float/src/fast_float_matsh.c @@ -0,0 +1,325 @@ +//--------------------------------------------------------------------------------- +// +// Little Color Management System, fast floating point extensions +// Copyright (c) 1998-2020 Marti Maria Saguer, all rights 
reserved +// +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . +// +//--------------------------------------------------------------------------------- + +// Optimization for matrix-shaper in float + +#include "fast_float_internal.h" + + +// This is the private data container used by this optimization +typedef struct { + + + cmsFloat32Number Mat[3][3]; + cmsFloat32Number Off[3]; + + cmsFloat32Number Shaper1R[MAX_NODES_IN_CURVE]; + cmsFloat32Number Shaper1G[MAX_NODES_IN_CURVE]; + cmsFloat32Number Shaper1B[MAX_NODES_IN_CURVE]; + + cmsFloat32Number Shaper2R[MAX_NODES_IN_CURVE]; + cmsFloat32Number Shaper2G[MAX_NODES_IN_CURVE]; + cmsFloat32Number Shaper2B[MAX_NODES_IN_CURVE]; + + cmsBool UseOff; + + void * real_ptr; + +} VXMatShaperFloatData; + + +static +VXMatShaperFloatData* malloc_aligned(cmsContext ContextID) +{ + cmsUInt8Number* real_ptr = (cmsUInt8Number*) _cmsMallocZero(ContextID, sizeof(VXMatShaperFloatData) + 32); + cmsUInt8Number* aligned = (cmsUInt8Number*) (((uintptr_t)real_ptr + 16) & ~0xf); + VXMatShaperFloatData* p = (VXMatShaperFloatData*) aligned; + + p ->real_ptr = real_ptr; + return p; +} + + + +// Free the private data container +static +void FreeMatShaper(cmsContext ContextID, void* Data) +{ + VXMatShaperFloatData* d = (VXMatShaperFloatData*)Data; + + if (d != NULL) + _cmsFree(ContextID, d->real_ptr); +} + + +static +void FillShaper(cmsFloat32Number* Table, cmsToneCurve* Curve) +{ 
+ int i; + cmsFloat32Number R; + + for (i = 0; i < MAX_NODES_IN_CURVE; i++) { + + R = (cmsFloat32Number) i / (cmsFloat32Number) (MAX_NODES_IN_CURVE - 1); + + Table[i] = cmsEvalToneCurveFloat(Curve, R); + } +} + + +// Compute the matrix-shaper structure +static +VXMatShaperFloatData* SetMatShaper(cmsContext ContextID, cmsToneCurve* Curve1[3], cmsMAT3* Mat, cmsVEC3* Off, cmsToneCurve* Curve2[3]) +{ + VXMatShaperFloatData* p; + int i, j; + + // Allocate a big chuck of memory to store precomputed tables + p = malloc_aligned(ContextID); + if (p == NULL) return FALSE; + + + // Precompute tables + FillShaper(p->Shaper1R, Curve1[0]); + FillShaper(p->Shaper1G, Curve1[1]); + FillShaper(p->Shaper1B, Curve1[2]); + + FillShaper(p->Shaper2R, Curve2[0]); + FillShaper(p->Shaper2G, Curve2[1]); + FillShaper(p->Shaper2B, Curve2[2]); + + + for (i=0; i < 3; i++) { + for (j=0; j < 3; j++) { + p->Mat[i][j] = (cmsFloat32Number) Mat->v[i].n[j]; + } + } + + + for (i = 0; i < 3; i++) { + + if (Off == NULL) { + + p->UseOff = FALSE; + p->Off[i] = 0.0; + } + else { + p->UseOff = TRUE; + p->Off[i] = (cmsFloat32Number)Off->n[i]; + + } + } + + + return p; +} + + + +// A fast matrix-shaper evaluator for floating point +static +void MatShaperFloat(struct _cmstransform_struct *CMMcargo, + const cmsFloat32Number* Input, + cmsFloat32Number* Output, + cmsUInt32Number len, + cmsUInt32Number Stride) +{ + VXMatShaperFloatData* p = (VXMatShaperFloatData*) _cmsGetTransformUserData(CMMcargo); + cmsFloat32Number l1, l2, l3; + cmsFloat32Number r, g, b; + cmsUInt32Number ii; + cmsUInt32Number SourceStartingOrder[cmsMAXCHANNELS]; + cmsUInt32Number SourceIncrements[cmsMAXCHANNELS]; + cmsUInt32Number DestStartingOrder[cmsMAXCHANNELS]; + cmsUInt32Number DestIncrements[cmsMAXCHANNELS]; + + const cmsUInt8Number* rin; + const cmsUInt8Number* gin; + const cmsUInt8Number* bin; + + cmsUInt8Number* rout; + cmsUInt8Number* gout; + cmsUInt8Number* bout; + + cmsUInt32Number nchans, nalpha; + + 
_cmsComputeComponentIncrements(cmsGetTransformInputFormat((cmsHTRANSFORM)CMMcargo), Stride, &nchans, &nalpha, SourceStartingOrder, SourceIncrements); + _cmsComputeComponentIncrements(cmsGetTransformOutputFormat((cmsHTRANSFORM)CMMcargo), Stride, &nchans, &nalpha, DestStartingOrder, DestIncrements); + + rin = (const cmsUInt8Number*)Input + SourceStartingOrder[0]; + gin = (const cmsUInt8Number*)Input + SourceStartingOrder[1]; + bin = (const cmsUInt8Number*)Input + SourceStartingOrder[2]; + + rout = (cmsUInt8Number*)Output + DestStartingOrder[0]; + gout = (cmsUInt8Number*)Output + DestStartingOrder[1]; + bout = (cmsUInt8Number*)Output + DestStartingOrder[2]; + + for (ii=0; ii < len; ii++) { + + r = flerp(p->Shaper1R, *(cmsFloat32Number*)rin); + g = flerp(p->Shaper1G, *(cmsFloat32Number*)gin); + b = flerp(p->Shaper1B, *(cmsFloat32Number*)bin); + + l1 = p->Mat[0][0] * r + p->Mat[0][1] * g + p->Mat[0][2] * b ; + l2 = p->Mat[1][0] * r + p->Mat[1][1] * g + p->Mat[1][2] * b ; + l3 = p->Mat[2][0] * r + p->Mat[2][1] * g + p->Mat[2][2] * b ; + + if (p->UseOff) { + + l1 += p->Off[0]; + l2 += p->Off[1]; + l3 += p->Off[2]; + } + + *(cmsFloat32Number*)rout = flerp(p->Shaper2R, l1); + *(cmsFloat32Number*)gout = flerp(p->Shaper2G, l2); + *(cmsFloat32Number*)bout = flerp(p->Shaper2B, l3); + + rin += SourceIncrements[0]; + gin += SourceIncrements[1]; + bin += SourceIncrements[2]; + + rout += DestIncrements[0]; + gout += DestIncrements[1]; + bout += DestIncrements[2]; + } + +} + + + +cmsBool OptimizeFloatMatrixShaper(_cmsTransformFn* TransformFn, + void** UserData, + _cmsFreeUserDataFn* FreeUserData, + cmsPipeline** Lut, + cmsUInt32Number* InputFormat, + cmsUInt32Number* OutputFormat, + cmsUInt32Number* dwFlags) +{ + cmsStage* Curve1, *Curve2; + cmsStage* Matrix1, *Matrix2; + _cmsStageMatrixData* Data1; + _cmsStageMatrixData* Data2; + cmsMAT3 res; + cmsBool IdentityMat = FALSE; + cmsPipeline* Dest, *Src; + cmsContext ContextID; + cmsUInt32Number nChans; + cmsFloat64Number factor = 1.0; 
+ + + // Apply only to floating-point cases + if (!T_FLOAT(*InputFormat) || !T_FLOAT(*OutputFormat)) return FALSE; + + // Only works on RGB to RGB and gray to gray + if ( !( (T_CHANNELS(*InputFormat) == 3 && T_CHANNELS(*OutputFormat) == 3)) && + !( (T_CHANNELS(*InputFormat) == 1 && T_CHANNELS(*OutputFormat) == 1))) return FALSE; + + // Only works on float + if (T_BYTES(*InputFormat) != 4 || T_BYTES(*OutputFormat) != 4) return FALSE; + + // Seems suitable, proceed + Src = *Lut; + + // Check for shaper-matrix-matrix-shaper structure, that is what this optimizer stands for + if (!cmsPipelineCheckAndRetreiveStages(Src, 4, + cmsSigCurveSetElemType, cmsSigMatrixElemType, cmsSigMatrixElemType, cmsSigCurveSetElemType, + &Curve1, &Matrix1, &Matrix2, &Curve2)) return FALSE; + + ContextID = cmsGetPipelineContextID(Src); + nChans = T_CHANNELS(*InputFormat); + + // Get both matrices, which are 3x3 + Data1 = (_cmsStageMatrixData*) cmsStageData(Matrix1); + Data2 = (_cmsStageMatrixData*) cmsStageData(Matrix2); + + // Input offset should be zero + if (Data1 ->Offset != NULL) return FALSE; + + if (cmsStageInputChannels(Matrix1) == 1 && cmsStageOutputChannels(Matrix2) == 1) + { + // This is a gray to gray. Just multiply + factor = Data1->Double[0]*Data2->Double[0] + + Data1->Double[1]*Data2->Double[1] + + Data1->Double[2]*Data2->Double[2]; + + if (fabs(1 - factor) < (1.0 / 65535.0)) IdentityMat = TRUE; + } + else + { + // Multiply both matrices to get the result + _cmsMAT3per(&res, (cmsMAT3*) Data2 ->Double, (cmsMAT3*) Data1 ->Double); + + // Now the result is in res + Data2 -> Offset. Maybe is a plain identity? 
+ IdentityMat = FALSE; + if (_cmsMAT3isIdentity(&res) && Data2 ->Offset == NULL) { + + // We can get rid of full matrix + IdentityMat = TRUE; + } + } + + // Allocate an empty LUT + Dest = cmsPipelineAlloc(ContextID, nChans, nChans); + if (!Dest) return FALSE; + + // Assamble the new LUT + cmsPipelineInsertStage(Dest, cmsAT_BEGIN, cmsStageDup(Curve1)); + + if (!IdentityMat) { + + if (nChans == 1) + cmsPipelineInsertStage(Dest, cmsAT_END, + cmsStageAllocMatrix(ContextID, 1, 1, (const cmsFloat64Number*) &factor, Data2->Offset)); + else + cmsPipelineInsertStage(Dest, cmsAT_END, + cmsStageAllocMatrix(ContextID, 3, 3, (const cmsFloat64Number*) &res, Data2 ->Offset)); + } + + + cmsPipelineInsertStage(Dest, cmsAT_END, cmsStageDup(Curve2)); + + // If identity on matrix, we can further optimize the curves, so call the join curves routine + if (IdentityMat) { + + OptimizeFloatByJoiningCurves(TransformFn, UserData, FreeUserData, &Dest, InputFormat, OutputFormat, dwFlags); + } + else { + _cmsStageToneCurvesData* mpeC1 = (_cmsStageToneCurvesData*) cmsStageData(Curve1); + _cmsStageToneCurvesData* mpeC2 = (_cmsStageToneCurvesData*) cmsStageData(Curve2); + + // In this particular optimization, caché does not help as it takes more time to deal with + // the cachthat with the pixel handling + *dwFlags |= cmsFLAGS_NOCACHE; + + // Setup the optimizarion routines + *UserData = SetMatShaper(ContextID, mpeC1 ->TheCurves, &res, (cmsVEC3*) Data2 ->Offset, mpeC2->TheCurves); + *FreeUserData = FreeMatShaper; + + *TransformFn = (_cmsTransformFn) MatShaperFloat; + } + + cmsPipelineFree(Src); + *Lut = Dest; + return TRUE; +} + + diff --git a/plugins/fast_float/src/fast_float_separate.c b/plugins/fast_float/src/fast_float_separate.c new file mode 100644 index 0000000..c4186a9 --- /dev/null +++ b/plugins/fast_float/src/fast_float_separate.c @@ -0,0 +1,199 @@ +//--------------------------------------------------------------------------------- +// +// Little Color Management System, fast floating 
point extensions +// Copyright (c) 1998-2020 Marti Maria Saguer, all rights reserved +// +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . +// +//--------------------------------------------------------------------------------- + +#include "fast_float_internal.h" + +// Separable input. It just computes the distance from +// each component to the next one in bytes. It gives components RGB in this order +// +// Encoding Starting Increment DoSwap Swapfirst Extra +// RGB, 012 333 0 0 0 +// RGBA, 012 444 0 0 1 +// ARGB, 123 444 0 1 1 +// BGR, 210 333 1 0 0 +// BGRA, 210 444 1 1 1 +// ABGR 321 444 1 0 1 +// +// +// On planar configurations, the distance is the stride added to any non-negative +// +// RGB 0, S, 2*S 111 +// RGBA 0, S, 2*S 111 (fourth plane is safely ignored) +// ARGB S, 2*S, 3*S 111 +// BGR 2*S, S, 0 111 +// BGRA 2*S, S, 0, 111 (fourth plane is safely ignored) +// ABGR 3*S, 2*S, S 111 +// +//---------------------------------------------------------------------------------------- + + +// Return the size in bytes of a given formatter +static +int trueBytesSize(cmsUInt32Number Format) +{ + int fmt_bytes = T_BYTES(Format); + + // For double, the T_BYTES field returns zero + if (fmt_bytes == 0) + return sizeof(double); + + // Otherwise, it is already correct for all formats + return fmt_bytes; +} + +// RGBA -> normal +// ARGB -> swap first +// ABGR -> doSwap +// BGRA -> doSwap swapFirst + +// 
This function computes the distance from each component to the next one in bytes. +static +void ComputeIncrementsForChunky(cmsUInt32Number Format, + cmsUInt32Number BytesPerPlane, + cmsUInt32Number* nChannels, + cmsUInt32Number* nAlpha, + cmsUInt32Number ComponentStartingOrder[], + cmsUInt32Number ComponentPointerIncrements[]) +{ + int extra = T_EXTRA(Format); + int channels = T_CHANNELS(Format); + int total_chans = channels + extra; + int i; + int channelSize = trueBytesSize(Format); + int pixelSize = channelSize * total_chans; + + UNUSED_PARAMETER(BytesPerPlane); + + // Setup the counts + if (nChannels != NULL) + *nChannels = channels; + + if (nAlpha != NULL) + *nAlpha = extra; + + // Separation is independent of starting point and only depends on channel size + for (i = 0; i < total_chans; i++) + ComponentPointerIncrements[i] = pixelSize; + + // Handle do swap + for (i = 0; i < total_chans; i++) + { + if (T_DOSWAP(Format)) { + ComponentStartingOrder[i] = total_chans - i - 1; + } + else { + ComponentStartingOrder[i] = i; + } + } + + // Handle swap first (ROL of positions), example CMYK -> KCMY | 0123 -> 3012 + if (T_SWAPFIRST(Format)) { + + cmsUInt32Number tmp = ComponentStartingOrder[0]; + for (i = 0; i < total_chans-1; i++) + ComponentStartingOrder[i] = ComponentStartingOrder[i + 1]; + + ComponentStartingOrder[total_chans - 1] = tmp; + } + + // Handle size + if (channelSize > 1) + for (i = 0; i < total_chans; i++) { + ComponentStartingOrder[i] *= channelSize; + } +} + + + +// On planar configurations, the distance is the stride added to any non-negative +static +void ComputeIncrementsForPlanar(cmsUInt32Number Format, + cmsUInt32Number BytesPerPlane, + cmsUInt32Number* nChannels, + cmsUInt32Number* nAlpha, + cmsUInt32Number ComponentStartingOrder[], + cmsUInt32Number ComponentPointerIncrements[]) +{ + int extra = T_EXTRA(Format); + int channels = T_CHANNELS(Format); + int total_chans = channels + extra; + int i; + int channelSize = trueBytesSize(Format); + + // 
Setup the counts + if (nChannels != NULL) + *nChannels = channels; + + if (nAlpha != NULL) + *nAlpha = extra; + + // Separation is independent of starting point and only depends on channel size + for (i = 0; i < total_chans; i++) + ComponentPointerIncrements[i] = channelSize; + + // Handle do swap + for (i = 0; i < total_chans; i++) + { + if (T_DOSWAP(Format)) { + ComponentStartingOrder[i] = total_chans - i - 1; + } + else { + ComponentStartingOrder[i] = i; + } + } + + // Handle swap first (ROL of positions), example CMYK -> KCMY | 0123 -> 3012 + if (T_SWAPFIRST(Format)) { + + cmsUInt32Number tmp = ComponentStartingOrder[0]; + for (i = 0; i < total_chans - 1; i++) + ComponentStartingOrder[i] = ComponentStartingOrder[i + 1]; + + ComponentStartingOrder[total_chans - 1] = tmp; + } + + // Handle size + for (i = 0; i < total_chans; i++) { + ComponentStartingOrder[i] *= BytesPerPlane; + } +} + + + +// Dispatcher por chunky and planar RGB +void _cmsComputeComponentIncrements(cmsUInt32Number Format, + cmsUInt32Number BytesPerPlane, + cmsUInt32Number* nChannels, + cmsUInt32Number* nAlpha, + cmsUInt32Number ComponentStartingOrder[], + cmsUInt32Number ComponentPointerIncrements[]) +{ + if (T_PLANAR(Format)) { + + ComputeIncrementsForPlanar(Format, BytesPerPlane, nChannels, nAlpha, ComponentStartingOrder, ComponentPointerIncrements); + } + else { + ComputeIncrementsForChunky(Format, BytesPerPlane, nChannels, nAlpha, ComponentStartingOrder, ComponentPointerIncrements); + } + +} + + diff --git a/plugins/fast_float/src/fast_float_sup.c b/plugins/fast_float/src/fast_float_sup.c new file mode 100644 index 0000000..67c4e90 --- /dev/null +++ b/plugins/fast_float/src/fast_float_sup.c @@ -0,0 +1,89 @@ +//--------------------------------------------------------------------------------- +// +// Little Color Management System, fast floating point extensions +// Copyright (c) 1998-2020 Marti Maria Saguer, all rights reserved +// +// +// This program is free software: you can redistribute 
it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . +// +//--------------------------------------------------------------------------------- + + +#include "fast_float_internal.h" + + +// This is the main dispatcher +static +cmsBool Floating_Point_Transforms_Dispatcher(_cmsTransformFn* TransformFn, + void** UserData, + _cmsFreeUserDataFn* FreeUserData, + cmsPipeline** Lut, + cmsUInt32Number* InputFormat, + cmsUInt32Number* OutputFormat, + cmsUInt32Number* dwFlags) +{ + + // Try to optimize as a set of curves plus a matrix plus a set of curves + if (OptimizeMatrixShaper15(TransformFn, UserData, FreeUserData, Lut, InputFormat, OutputFormat, dwFlags)) return TRUE; + + // Try to optimize by joining curves + if (Optimize8ByJoiningCurves(TransformFn, UserData, FreeUserData, Lut, InputFormat, OutputFormat, dwFlags)) return TRUE; + + // Try to optimize as a set of curves plus a matrix plus a set of curves + if (Optimize8MatrixShaper(TransformFn, UserData, FreeUserData, Lut, InputFormat, OutputFormat, dwFlags)) return TRUE; + + // Try to optimize by joining curves + if (OptimizeFloatByJoiningCurves(TransformFn, UserData, FreeUserData, Lut, InputFormat, OutputFormat, dwFlags)) return TRUE; + + // Try to optimize as a set of curves plus a matrix plus a set of curves + if (OptimizeFloatMatrixShaper(TransformFn, UserData, FreeUserData, Lut, InputFormat, OutputFormat, dwFlags)) return TRUE; + + // Try to optimize using prelinearization plus tetrahedral + if 
(Optimize8BitRGBTransform(TransformFn, UserData, FreeUserData, Lut, InputFormat, OutputFormat, dwFlags)) return TRUE; + + // Try to optimize using prelinearization plus tetrahedral + if (Optimize16BitRGBTransform(TransformFn, UserData, FreeUserData, Lut, InputFormat, OutputFormat, dwFlags)) return TRUE; + + // Try to optimize using prelinearization plus tetrahedral + if (OptimizeCLUTRGBTransform(TransformFn, UserData, FreeUserData, Lut, InputFormat, OutputFormat, dwFlags)) return TRUE; + + // Try to optimize using prelinearization plus tetrahedral + if (OptimizeCLUTCMYKTransform(TransformFn, UserData, FreeUserData, Lut, InputFormat, OutputFormat, dwFlags)) return TRUE; + + + // Cannot optimize, use lcms normal process + return FALSE; +} + +// The Plug-in entry points +static cmsPluginFormatters PluginFastFloat = { + { cmsPluginMagicNumber, REQUIRED_LCMS_VERSION, cmsPluginFormattersSig, NULL }, + + Formatter_15Bit_Factory +}; + +static cmsPluginTransform PluginList = { + + { cmsPluginMagicNumber, REQUIRED_LCMS_VERSION, cmsPluginTransformSig, (cmsPluginBase *) &PluginFastFloat }, + + Floating_Point_Transforms_Dispatcher +}; + +// This is the main plug-in installer. +// Using a function to retrieve the plug-in entry point allows us to execute initialization data. 
+void* cmsFastFloatExtensions(void) +{ + return (void*)&PluginList; +} + diff --git a/plugins/fast_float/src/fast_float_tethra.c b/plugins/fast_float/src/fast_float_tethra.c new file mode 100644 index 0000000..0897568 --- /dev/null +++ b/plugins/fast_float/src/fast_float_tethra.c @@ -0,0 +1,295 @@ +//--------------------------------------------------------------------------------- +// +// Little Color Management System, fast floating point extensions +// Copyright (c) 1998-2020 Marti Maria Saguer, all rights reserved +// +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. +// +//--------------------------------------------------------------------------------- + +#include "fast_float_internal.h" + +// Optimization for floating point tetrahedral interpolation. +// Private data attached to the optimized transform as its user data. +typedef struct { + + cmsContext ContextID; // Context this block was allocated from + const cmsInterpParams* p; // Tetrahedral interpolation parameters. This is a not-owned pointer. + +} FloatCLUTData; + +// Allocates a FloatCLUTData block with _cmsMallocZero and stores the context and the +// interpolation parameters in it. The parameters pointer is stored as-is, not copied. +// Returns NULL if the allocation fails. +static +FloatCLUTData* FloatCLUTAlloc(cmsContext ContextID, const cmsInterpParams* p) +{ + FloatCLUTData* fd; + + fd = (FloatCLUTData*) _cmsMallocZero(ContextID, sizeof(FloatCLUTData)); + if (fd == NULL) return NULL; + + fd ->ContextID = ContextID; + fd ->p = p; + return fd; +} + + +// Sampler implemented by another LUT. This is a clean way to precalculate the devicelink 3D CLUT for +// almost any transform. 
We use floating point precision and then convert from floating point to 16 bits. +static +int XFormSampler(register const cmsFloat32Number In[], register cmsFloat32Number Out[], register void* Cargo) +{ + // Evaluate in 16 bits + cmsPipelineEvalFloat(In, Out, (cmsPipeline*) Cargo); + + // Always succeed + return TRUE; +} + + + +// A optimized interpolation for 8-bit input. +#define DENS(i,j,k) (LutTable[(i)+(j)+(k)+OutChan]) + +static +void FloatCLUTEval(struct _cmstransform_struct *CMMcargo, + const cmsFloat32Number* Input, + cmsFloat32Number* Output, + cmsUInt32Number len, + cmsUInt32Number Stride) +{ + + cmsFloat32Number r, g, b; + cmsFloat32Number px, py, pz; + int x0, y0, z0; + int X0, Y0, Z0, X1, Y1, Z1; + cmsFloat32Number rx, ry, rz; + cmsFloat32Number c0, c1 = 0, c2 = 0, c3 = 0; + + cmsUInt32Number OutChan; + FloatCLUTData* p8 = (FloatCLUTData*) _cmsGetTransformUserData(CMMcargo); + + const cmsInterpParams* p = p8 ->p; + cmsUInt32Number TotalOut = p -> nOutputs; + const cmsFloat32Number* LutTable = (const cmsFloat32Number*)p->Table; + cmsUInt32Number ii; + const cmsUInt8Number* rin; + const cmsUInt8Number* gin; + const cmsUInt8Number* bin; + + cmsUInt8Number* out[cmsMAXCHANNELS]; + cmsUInt32Number SourceStartingOrder[cmsMAXCHANNELS]; + cmsUInt32Number SourceIncrements[cmsMAXCHANNELS]; + cmsUInt32Number DestStartingOrder[cmsMAXCHANNELS]; + cmsUInt32Number DestIncrements[cmsMAXCHANNELS]; + + cmsUInt32Number InputFormat = cmsGetTransformInputFormat((cmsHTRANSFORM) CMMcargo); + cmsUInt32Number OutputFormat = cmsGetTransformOutputFormat((cmsHTRANSFORM) CMMcargo); + + cmsUInt32Number nchans, nalpha; + + _cmsComputeComponentIncrements(InputFormat, Stride, &nchans, &nalpha, SourceStartingOrder, SourceIncrements); + _cmsComputeComponentIncrements(OutputFormat, Stride, &nchans, &nalpha, DestStartingOrder, DestIncrements); + + // SeparateRGB(InputFormat, Stride, SourceStartingOrder, SourceIncrements); + // SeparateRGB(OutputFormat, Stride, DestStartingOrder, 
DestIncrements); + + rin = (const cmsUInt8Number*)Input + SourceStartingOrder[0]; + gin = (const cmsUInt8Number*)Input + SourceStartingOrder[1]; + bin = (const cmsUInt8Number*)Input + SourceStartingOrder[2]; + + for (ii=0; ii < TotalOut; ii++) + out[ii] = (cmsUInt8Number*) Output + DestStartingOrder[ii]; + + for (ii=0; ii < len; ii++) { + + r = fclamp(*(cmsFloat32Number*)rin); + g = fclamp(*(cmsFloat32Number*)gin); + b = fclamp(*(cmsFloat32Number*)bin); + + rin += SourceIncrements[0]; + gin += SourceIncrements[1]; + bin += SourceIncrements[2]; + + px = r * p->Domain[0]; + py = g * p->Domain[1]; + pz = b * p->Domain[2]; + + + x0 = (int)_cmsQuickFloor(px); rx = (px - (cmsFloat32Number)x0); + y0 = (int)_cmsQuickFloor(py); ry = (py - (cmsFloat32Number)y0); + z0 = (int)_cmsQuickFloor(pz); rz = (pz - (cmsFloat32Number)z0); + + X0 = p->opta[2] * x0; + X1 = X0 + (r >= 1.0 ? 0 : p->opta[2]); + + Y0 = p->opta[1] * y0; + Y1 = Y0 + (g >= 1.0 ? 0 : p->opta[1]); + + Z0 = p->opta[0] * z0; + Z1 = Z0 + (b >= 1.0 ? 
0 : p->opta[0]); + + for (OutChan = 0; OutChan < TotalOut; OutChan++) { + + // These are the 6 Tetrahedral + + c0 = DENS(X0, Y0, Z0); + + if (rx >= ry && ry >= rz) { + + c1 = DENS(X1, Y0, Z0) - c0; + c2 = DENS(X1, Y1, Z0) - DENS(X1, Y0, Z0); + c3 = DENS(X1, Y1, Z1) - DENS(X1, Y1, Z0); + + } + else + if (rx >= rz && rz >= ry) { + + c1 = DENS(X1, Y0, Z0) - c0; + c2 = DENS(X1, Y1, Z1) - DENS(X1, Y0, Z1); + c3 = DENS(X1, Y0, Z1) - DENS(X1, Y0, Z0); + + } + else + if (rz >= rx && rx >= ry) { + + c1 = DENS(X1, Y0, Z1) - DENS(X0, Y0, Z1); + c2 = DENS(X1, Y1, Z1) - DENS(X1, Y0, Z1); + c3 = DENS(X0, Y0, Z1) - c0; + + } + else + if (ry >= rx && rx >= rz) { + + c1 = DENS(X1, Y1, Z0) - DENS(X0, Y1, Z0); + c2 = DENS(X0, Y1, Z0) - c0; + c3 = DENS(X1, Y1, Z1) - DENS(X1, Y1, Z0); + + } + else + if (ry >= rz && rz >= rx) { + + c1 = DENS(X1, Y1, Z1) - DENS(X0, Y1, Z1); + c2 = DENS(X0, Y1, Z0) - c0; + c3 = DENS(X0, Y1, Z1) - DENS(X0, Y1, Z0); + + } + else + if (rz >= ry && ry >= rx) { + + c1 = DENS(X1, Y1, Z1) - DENS(X0, Y1, Z1); + c2 = DENS(X0, Y1, Z1) - DENS(X0, Y0, Z1); + c3 = DENS(X0, Y0, Z1) - c0; + + } + else { + c1 = c2 = c3 = 0; + } + + *(cmsFloat32Number*) (out[OutChan]) = c0 + c1 * rx + c2 * ry + c3 * rz; + + out[OutChan] += DestIncrements[OutChan]; + + } + + + } +} + +#undef DENS + + + +// -------------------------------------------------------------------------------------------------------------- + +cmsBool OptimizeCLUTRGBTransform(_cmsTransformFn* TransformFn, + void** UserData, + _cmsFreeUserDataFn* FreeDataFn, + cmsPipeline** Lut, + cmsUInt32Number* InputFormat, + cmsUInt32Number* OutputFormat, + cmsUInt32Number* dwFlags) +{ + cmsPipeline* OriginalLut; + int nGridPoints; + cmsPipeline* OptimizedLUT = NULL; + cmsStage* OptimizedCLUTmpe; + cmsColorSpaceSignature OutputColorSpace; + cmsStage* mpe; + FloatCLUTData* p8; + cmsContext ContextID; + _cmsStageCLutData* data; + + // For empty transforms, do nothing + if (*Lut == NULL) return FALSE; + + // This is a loosy 
optimization! does not apply in floating-point cases + if (!T_FLOAT(*InputFormat) || !T_FLOAT(*OutputFormat)) return FALSE; + + // Only on 8-bit + if (T_BYTES(*InputFormat) != 4 || T_BYTES(*OutputFormat) != 4) return FALSE; + + // Only on RGB + if (T_COLORSPACE(*InputFormat) != PT_RGB) return FALSE; + if (T_COLORSPACE(*OutputFormat) != PT_RGB) return FALSE; + + OriginalLut = *Lut; + + // Named color pipelines cannot be optimized either + for (mpe = cmsPipelineGetPtrToFirstStage(OriginalLut); + mpe != NULL; + mpe = cmsStageNext(mpe)) { + if (cmsStageType(mpe) == cmsSigNamedColorElemType) return FALSE; + } + + ContextID = cmsGetPipelineContextID(OriginalLut); + OutputColorSpace = _cmsICCcolorSpace(T_COLORSPACE(*OutputFormat)); + nGridPoints = _cmsReasonableGridpointsByColorspace(cmsSigRgbData, *dwFlags); + + // Create the result LUT + OptimizedLUT = cmsPipelineAlloc(cmsGetPipelineContextID(OriginalLut), 3, cmsPipelineOutputChannels(OriginalLut)); + if (OptimizedLUT == NULL) goto Error; + + + // Allocate the CLUT for result + OptimizedCLUTmpe = cmsStageAllocCLutFloat(ContextID, nGridPoints, 3, cmsPipelineOutputChannels(OriginalLut), NULL); + + // Add the CLUT to the destination LUT + cmsPipelineInsertStage(OptimizedLUT, cmsAT_BEGIN, OptimizedCLUTmpe); + + // Resample the LUT + if (!cmsStageSampleCLutFloat(OptimizedCLUTmpe, XFormSampler, (void*)OriginalLut, 0)) goto Error; + + // Set the evaluator, copy parameters + data = (_cmsStageCLutData*) cmsStageData(OptimizedCLUTmpe); + + p8 = FloatCLUTAlloc(ContextID, data ->Params); + if (p8 == NULL) return FALSE; + + // And return the obtained LUT + cmsPipelineFree(OriginalLut); + + *Lut = OptimizedLUT; + *TransformFn = (_cmsTransformFn) FloatCLUTEval; + *UserData = p8; + *FreeDataFn = _cmsFree; + + return TRUE; + +Error: + + if (OptimizedLUT != NULL) cmsPipelineFree(OptimizedLUT); + + return FALSE; +} + -- cgit v1.2.1