1/* vim: set ts=8 sw=8 noexpandtab: */
2//  qcms
3//  Copyright (C) 2009 Mozilla Corporation
4//  Copyright (C) 1998-2007 Marti Maria
5//
6// Permission is hereby granted, free of charge, to any person obtaining
7// a copy of this software and associated documentation files (the "Software"),
8// to deal in the Software without restriction, including without limitation
9// the rights to use, copy, modify, merge, publish, distribute, sublicense,
10// and/or sell copies of the Software, and to permit persons to whom the Software
11// is furnished to do so, subject to the following conditions:
12//
13// The above copyright notice and this permission notice shall be included in
14// all copies or substantial portions of the Software.
15//
16// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
18// THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
24#include <stdlib.h>
25#include <math.h>
26#include <assert.h>
27#include <string.h> //memcpy
28#include "qcmsint.h"
29#include "chain.h"
30#include "matrix.h"
31#include "transform_util.h"
32
33/* for MSVC, GCC, Intel, and Sun compilers */
34#if defined(_M_IX86) || defined(__i386__) || defined(__i386) || defined(_M_AMD64) || defined(__x86_64__) || defined(__x86_64)
35#define X86
36#endif /* _M_IX86 || __i386__ || __i386 || _M_AMD64 || __x86_64__ || __x86_64 */
37
38// Build a White point, primary chromas transfer matrix from RGB to CIE XYZ
39// This is just an approximation, I am not handling all the non-linear
40// aspects of the RGB to XYZ process, and assumming that the gamma correction
41// has transitive property in the tranformation chain.
42//
43// the alghoritm:
44//
45//            - First I build the absolute conversion matrix using
46//              primaries in XYZ. This matrix is next inverted
47//            - Then I eval the source white point across this matrix
48//              obtaining the coeficients of the transformation
49//            - Then, I apply these coeficients to the original matrix
50static struct matrix build_RGB_to_XYZ_transfer_matrix(qcms_CIE_xyY white, qcms_CIE_xyYTRIPLE primrs)
51{
52	struct matrix primaries;
53	struct matrix primaries_invert;
54	struct matrix result;
55	struct vector white_point;
56	struct vector coefs;
57
58	double xn, yn;
59	double xr, yr;
60	double xg, yg;
61	double xb, yb;
62
63	xn = white.x;
64	yn = white.y;
65
66	if (yn == 0.0)
67		return matrix_invalid();
68
69	xr = primrs.red.x;
70	yr = primrs.red.y;
71	xg = primrs.green.x;
72	yg = primrs.green.y;
73	xb = primrs.blue.x;
74	yb = primrs.blue.y;
75
76	primaries.m[0][0] = xr;
77	primaries.m[0][1] = xg;
78	primaries.m[0][2] = xb;
79
80	primaries.m[1][0] = yr;
81	primaries.m[1][1] = yg;
82	primaries.m[1][2] = yb;
83
84	primaries.m[2][0] = 1 - xr - yr;
85	primaries.m[2][1] = 1 - xg - yg;
86	primaries.m[2][2] = 1 - xb - yb;
87	primaries.invalid = false;
88
89	white_point.v[0] = xn/yn;
90	white_point.v[1] = 1.;
91	white_point.v[2] = (1.0-xn-yn)/yn;
92
93	primaries_invert = matrix_invert(primaries);
94
95	coefs = matrix_eval(primaries_invert, white_point);
96
97	result.m[0][0] = coefs.v[0]*xr;
98	result.m[0][1] = coefs.v[1]*xg;
99	result.m[0][2] = coefs.v[2]*xb;
100
101	result.m[1][0] = coefs.v[0]*yr;
102	result.m[1][1] = coefs.v[1]*yg;
103	result.m[1][2] = coefs.v[2]*yb;
104
105	result.m[2][0] = coefs.v[0]*(1.-xr-yr);
106	result.m[2][1] = coefs.v[1]*(1.-xg-yg);
107	result.m[2][2] = coefs.v[2]*(1.-xb-yb);
108	result.invalid = primaries_invert.invalid;
109
110	return result;
111}
112
113struct CIE_XYZ {
114	double X;
115	double Y;
116	double Z;
117};
118
119/* CIE Illuminant D50 */
120static const struct CIE_XYZ D50_XYZ = {
121	0.9642,
122	1.0000,
123	0.8249
124};
125
126/* from lcms: xyY2XYZ()
127 * corresponds to argyll: icmYxy2XYZ() */
128static struct CIE_XYZ xyY2XYZ(qcms_CIE_xyY source)
129{
130	struct CIE_XYZ dest;
131	dest.X = (source.x / source.y) * source.Y;
132	dest.Y = source.Y;
133	dest.Z = ((1 - source.x - source.y) / source.y) * source.Y;
134	return dest;
135}
136
137/* from lcms: ComputeChromaticAdaption */
138// Compute chromatic adaption matrix using chad as cone matrix
139static struct matrix
140compute_chromatic_adaption(struct CIE_XYZ source_white_point,
141                           struct CIE_XYZ dest_white_point,
142                           struct matrix chad)
143{
144	struct matrix chad_inv;
145	struct vector cone_source_XYZ, cone_source_rgb;
146	struct vector cone_dest_XYZ, cone_dest_rgb;
147	struct matrix cone, tmp;
148
149	tmp = chad;
150	chad_inv = matrix_invert(tmp);
151
152	cone_source_XYZ.v[0] = source_white_point.X;
153	cone_source_XYZ.v[1] = source_white_point.Y;
154	cone_source_XYZ.v[2] = source_white_point.Z;
155
156	cone_dest_XYZ.v[0] = dest_white_point.X;
157	cone_dest_XYZ.v[1] = dest_white_point.Y;
158	cone_dest_XYZ.v[2] = dest_white_point.Z;
159
160	cone_source_rgb = matrix_eval(chad, cone_source_XYZ);
161	cone_dest_rgb   = matrix_eval(chad, cone_dest_XYZ);
162
163	cone.m[0][0] = cone_dest_rgb.v[0]/cone_source_rgb.v[0];
164	cone.m[0][1] = 0;
165	cone.m[0][2] = 0;
166	cone.m[1][0] = 0;
167	cone.m[1][1] = cone_dest_rgb.v[1]/cone_source_rgb.v[1];
168	cone.m[1][2] = 0;
169	cone.m[2][0] = 0;
170	cone.m[2][1] = 0;
171	cone.m[2][2] = cone_dest_rgb.v[2]/cone_source_rgb.v[2];
172	cone.invalid = false;
173
174	// Normalize
175	return matrix_multiply(chad_inv, matrix_multiply(cone, chad));
176}
177
178/* from lcms: cmsAdaptionMatrix */
179// Returns the final chrmatic adaptation from illuminant FromIll to Illuminant ToIll
180// Bradford is assumed
181static struct matrix
182adaption_matrix(struct CIE_XYZ source_illumination, struct CIE_XYZ target_illumination)
183{
184#if defined (_MSC_VER)
185#pragma warning(push)
186/* Disable double to float truncation warning 4305 */
187#pragma warning(disable:4305)
188#endif
189	struct matrix lam_rigg = {{ // Bradford matrix
190	                         {  0.8951,  0.2664, -0.1614 },
191	                         { -0.7502,  1.7135,  0.0367 },
192	                         {  0.0389, -0.0685,  1.0296 }
193	                         }};
194#if defined (_MSC_VER)
195/* Restore warnings */
196#pragma warning(pop)
197#endif
198	return compute_chromatic_adaption(source_illumination, target_illumination, lam_rigg);
199}
200
201/* from lcms: cmsAdaptMatrixToD50 */
202static struct matrix adapt_matrix_to_D50(struct matrix r, qcms_CIE_xyY source_white_pt)
203{
204	struct CIE_XYZ Dn;
205	struct matrix Bradford;
206
207	if (source_white_pt.y == 0.0)
208		return matrix_invalid();
209
210	Dn = xyY2XYZ(source_white_pt);
211
212	Bradford = adaption_matrix(Dn, D50_XYZ);
213	return matrix_multiply(Bradford, r);
214}
215
216qcms_bool set_rgb_colorants(qcms_profile *profile, qcms_CIE_xyY white_point, qcms_CIE_xyYTRIPLE primaries)
217{
218	struct matrix colorants;
219	colorants = build_RGB_to_XYZ_transfer_matrix(white_point, primaries);
220	colorants = adapt_matrix_to_D50(colorants, white_point);
221
222	if (colorants.invalid)
223		return false;
224
225	/* note: there's a transpose type of operation going on here */
226	profile->redColorant.X = double_to_s15Fixed16Number(colorants.m[0][0]);
227	profile->redColorant.Y = double_to_s15Fixed16Number(colorants.m[1][0]);
228	profile->redColorant.Z = double_to_s15Fixed16Number(colorants.m[2][0]);
229
230	profile->greenColorant.X = double_to_s15Fixed16Number(colorants.m[0][1]);
231	profile->greenColorant.Y = double_to_s15Fixed16Number(colorants.m[1][1]);
232	profile->greenColorant.Z = double_to_s15Fixed16Number(colorants.m[2][1]);
233
234	profile->blueColorant.X = double_to_s15Fixed16Number(colorants.m[0][2]);
235	profile->blueColorant.Y = double_to_s15Fixed16Number(colorants.m[1][2]);
236	profile->blueColorant.Z = double_to_s15Fixed16Number(colorants.m[2][2]);
237
238	return true;
239}
240
241#if 0
242static void qcms_transform_data_rgb_out_pow(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
243{
244	const int r_out = output_format.r;
245	const int b_out = output_format.b;
246
247	int i;
248	float (*mat)[4] = transform->matrix;
249	for (i=0; i<length; i++) {
250		unsigned char device_r = *src++;
251		unsigned char device_g = *src++;
252		unsigned char device_b = *src++;
253
254		float linear_r = transform->input_gamma_table_r[device_r];
255		float linear_g = transform->input_gamma_table_g[device_g];
256		float linear_b = transform->input_gamma_table_b[device_b];
257
258		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
259		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
260		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
261
262		float out_device_r = pow(out_linear_r, transform->out_gamma_r);
263		float out_device_g = pow(out_linear_g, transform->out_gamma_g);
264		float out_device_b = pow(out_linear_b, transform->out_gamma_b);
265
266		dest[r_out] = clamp_u8(out_device_r*255);
267		dest[1]     = clamp_u8(out_device_g*255);
268		dest[b_out] = clamp_u8(out_device_b*255);
269		dest += 3;
270	}
271}
272#endif
273
274static void qcms_transform_data_gray_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
275{
276	const int r_out = output_format.r;
277	const int b_out = output_format.b;
278
279	unsigned int i;
280	for (i = 0; i < length; i++) {
281		float out_device_r, out_device_g, out_device_b;
282		unsigned char device = *src++;
283
284		float linear = transform->input_gamma_table_gray[device];
285
286		out_device_r = lut_interp_linear(linear, transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
287		out_device_g = lut_interp_linear(linear, transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
288		out_device_b = lut_interp_linear(linear, transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
289
290		dest[r_out] = clamp_u8(out_device_r*255);
291		dest[1]     = clamp_u8(out_device_g*255);
292		dest[b_out] = clamp_u8(out_device_b*255);
293		dest += 3;
294	}
295}
296
297/* Alpha is not corrected.
298   A rationale for this is found in Alvy Ray's "Should Alpha Be Nonlinear If
299   RGB Is?" Tech Memo 17 (December 14, 1998).
300	See: ftp://ftp.alvyray.com/Acrobat/17_Nonln.pdf
301*/
302
303static void qcms_transform_data_graya_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
304{
305	const int r_out = output_format.r;
306	const int b_out = output_format.b;
307
308	unsigned int i;
309	for (i = 0; i < length; i++) {
310		float out_device_r, out_device_g, out_device_b;
311		unsigned char device = *src++;
312		unsigned char alpha = *src++;
313
314		float linear = transform->input_gamma_table_gray[device];
315
316		out_device_r = lut_interp_linear(linear, transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
317		out_device_g = lut_interp_linear(linear, transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
318		out_device_b = lut_interp_linear(linear, transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
319
320		dest[r_out] = clamp_u8(out_device_r*255);
321		dest[1]     = clamp_u8(out_device_g*255);
322		dest[b_out] = clamp_u8(out_device_b*255);
323		dest[3]     = alpha;
324		dest += 4;
325	}
326}
327
328
329static void qcms_transform_data_gray_out_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
330{
331	const int r_out = output_format.r;
332	const int b_out = output_format.b;
333
334	unsigned int i;
335	for (i = 0; i < length; i++) {
336		unsigned char device = *src++;
337		uint16_t gray;
338
339		float linear = transform->input_gamma_table_gray[device];
340
341		/* we could round here... */
342		gray = linear * PRECACHE_OUTPUT_MAX;
343
344		dest[r_out] = transform->output_table_r->data[gray];
345		dest[1]     = transform->output_table_g->data[gray];
346		dest[b_out] = transform->output_table_b->data[gray];
347		dest += 3;
348	}
349}
350
351
352static void qcms_transform_data_graya_out_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
353{
354	const int r_out = output_format.r;
355	const int b_out = output_format.b;
356
357	unsigned int i;
358	for (i = 0; i < length; i++) {
359		unsigned char device = *src++;
360		unsigned char alpha = *src++;
361		uint16_t gray;
362
363		float linear = transform->input_gamma_table_gray[device];
364
365		/* we could round here... */
366		gray = linear * PRECACHE_OUTPUT_MAX;
367
368		dest[r_out] = transform->output_table_r->data[gray];
369		dest[1]     = transform->output_table_g->data[gray];
370		dest[b_out] = transform->output_table_b->data[gray];
371		dest[3]     = alpha;
372		dest += 4;
373	}
374}
375
376static void qcms_transform_data_rgb_out_lut_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
377{
378	const int r_out = output_format.r;
379	const int b_out = output_format.b;
380
381	unsigned int i;
382	float (*mat)[4] = transform->matrix;
383	for (i = 0; i < length; i++) {
384		unsigned char device_r = *src++;
385		unsigned char device_g = *src++;
386		unsigned char device_b = *src++;
387		uint16_t r, g, b;
388
389		float linear_r = transform->input_gamma_table_r[device_r];
390		float linear_g = transform->input_gamma_table_g[device_g];
391		float linear_b = transform->input_gamma_table_b[device_b];
392
393		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
394		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
395		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
396
397		out_linear_r = clamp_float(out_linear_r);
398		out_linear_g = clamp_float(out_linear_g);
399		out_linear_b = clamp_float(out_linear_b);
400
401		/* we could round here... */
402		r = out_linear_r * PRECACHE_OUTPUT_MAX;
403		g = out_linear_g * PRECACHE_OUTPUT_MAX;
404		b = out_linear_b * PRECACHE_OUTPUT_MAX;
405
406		dest[r_out] = transform->output_table_r->data[r];
407		dest[1]     = transform->output_table_g->data[g];
408		dest[b_out] = transform->output_table_b->data[b];
409		dest += 3;
410	}
411}
412
413static void qcms_transform_data_rgba_out_lut_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
414{
415	const int r_out = output_format.r;
416	const int b_out = output_format.b;
417
418	unsigned int i;
419	float (*mat)[4] = transform->matrix;
420	for (i = 0; i < length; i++) {
421		unsigned char device_r = *src++;
422		unsigned char device_g = *src++;
423		unsigned char device_b = *src++;
424		unsigned char alpha = *src++;
425		uint16_t r, g, b;
426
427		float linear_r = transform->input_gamma_table_r[device_r];
428		float linear_g = transform->input_gamma_table_g[device_g];
429		float linear_b = transform->input_gamma_table_b[device_b];
430
431		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
432		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
433		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
434
435		out_linear_r = clamp_float(out_linear_r);
436		out_linear_g = clamp_float(out_linear_g);
437		out_linear_b = clamp_float(out_linear_b);
438
439		/* we could round here... */
440		r = out_linear_r * PRECACHE_OUTPUT_MAX;
441		g = out_linear_g * PRECACHE_OUTPUT_MAX;
442		b = out_linear_b * PRECACHE_OUTPUT_MAX;
443
444		dest[r_out] = transform->output_table_r->data[r];
445		dest[1]     = transform->output_table_g->data[g];
446		dest[b_out] = transform->output_table_b->data[b];
447		dest[3]     = alpha;
448		dest += 4;
449	}
450}
451
452// Not used
453/*
454static void qcms_transform_data_clut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
455{
456	const int r_out = output_format.r;
457	const int b_out = output_format.b;
458
459	unsigned int i;
460	int xy_len = 1;
461	int x_len = transform->grid_size;
462	int len = x_len * x_len;
463	float* r_table = transform->r_clut;
464	float* g_table = transform->g_clut;
465	float* b_table = transform->b_clut;
466
467	for (i = 0; i < length; i++) {
468		unsigned char in_r = *src++;
469		unsigned char in_g = *src++;
470		unsigned char in_b = *src++;
471		float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;
472
473		int x = floor(linear_r * (transform->grid_size-1));
474		int y = floor(linear_g * (transform->grid_size-1));
475		int z = floor(linear_b * (transform->grid_size-1));
476		int x_n = ceil(linear_r * (transform->grid_size-1));
477		int y_n = ceil(linear_g * (transform->grid_size-1));
478		int z_n = ceil(linear_b * (transform->grid_size-1));
479		float x_d = linear_r * (transform->grid_size-1) - x;
480		float y_d = linear_g * (transform->grid_size-1) - y;
481		float z_d = linear_b * (transform->grid_size-1) - z;
482
483		float r_x1 = lerp(CLU(r_table,x,y,z), CLU(r_table,x_n,y,z), x_d);
484		float r_x2 = lerp(CLU(r_table,x,y_n,z), CLU(r_table,x_n,y_n,z), x_d);
485		float r_y1 = lerp(r_x1, r_x2, y_d);
486		float r_x3 = lerp(CLU(r_table,x,y,z_n), CLU(r_table,x_n,y,z_n), x_d);
487		float r_x4 = lerp(CLU(r_table,x,y_n,z_n), CLU(r_table,x_n,y_n,z_n), x_d);
488		float r_y2 = lerp(r_x3, r_x4, y_d);
489		float clut_r = lerp(r_y1, r_y2, z_d);
490
491		float g_x1 = lerp(CLU(g_table,x,y,z), CLU(g_table,x_n,y,z), x_d);
492		float g_x2 = lerp(CLU(g_table,x,y_n,z), CLU(g_table,x_n,y_n,z), x_d);
493		float g_y1 = lerp(g_x1, g_x2, y_d);
494		float g_x3 = lerp(CLU(g_table,x,y,z_n), CLU(g_table,x_n,y,z_n), x_d);
495		float g_x4 = lerp(CLU(g_table,x,y_n,z_n), CLU(g_table,x_n,y_n,z_n), x_d);
496		float g_y2 = lerp(g_x3, g_x4, y_d);
497		float clut_g = lerp(g_y1, g_y2, z_d);
498
499		float b_x1 = lerp(CLU(b_table,x,y,z), CLU(b_table,x_n,y,z), x_d);
500		float b_x2 = lerp(CLU(b_table,x,y_n,z), CLU(b_table,x_n,y_n,z), x_d);
501		float b_y1 = lerp(b_x1, b_x2, y_d);
502		float b_x3 = lerp(CLU(b_table,x,y,z_n), CLU(b_table,x_n,y,z_n), x_d);
503		float b_x4 = lerp(CLU(b_table,x,y_n,z_n), CLU(b_table,x_n,y_n,z_n), x_d);
504		float b_y2 = lerp(b_x3, b_x4, y_d);
505		float clut_b = lerp(b_y1, b_y2, z_d);
506
507		dest[r_out] = clamp_u8(clut_r*255.0f);
508		dest[1]     = clamp_u8(clut_g*255.0f);
509		dest[b_out] = clamp_u8(clut_b*255.0f);
510		dest += 3;
511	}
512}
513*/
514
515// Using lcms' tetra interpolation algorithm.
516static void qcms_transform_data_tetra_clut_rgba(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
517{
518	const int r_out = output_format.r;
519	const int b_out = output_format.b;
520
521	unsigned int i;
522	int xy_len = 1;
523	int x_len = transform->grid_size;
524	int len = x_len * x_len;
525	float* r_table = transform->r_clut;
526	float* g_table = transform->g_clut;
527	float* b_table = transform->b_clut;
528	float c0_r, c1_r, c2_r, c3_r;
529	float c0_g, c1_g, c2_g, c3_g;
530	float c0_b, c1_b, c2_b, c3_b;
531	float clut_r, clut_g, clut_b;
532	for (i = 0; i < length; i++) {
533		unsigned char in_r = *src++;
534		unsigned char in_g = *src++;
535		unsigned char in_b = *src++;
536		unsigned char in_a = *src++;
537		float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;
538
539		int x = floor(linear_r * (transform->grid_size-1));
540		int y = floor(linear_g * (transform->grid_size-1));
541		int z = floor(linear_b * (transform->grid_size-1));
542		int x_n = ceil(linear_r * (transform->grid_size-1));
543		int y_n = ceil(linear_g * (transform->grid_size-1));
544		int z_n = ceil(linear_b * (transform->grid_size-1));
545		float rx = linear_r * (transform->grid_size-1) - x;
546		float ry = linear_g * (transform->grid_size-1) - y;
547		float rz = linear_b * (transform->grid_size-1) - z;
548
549		c0_r = CLU(r_table, x, y, z);
550		c0_g = CLU(g_table, x, y, z);
551		c0_b = CLU(b_table, x, y, z);
552
553		if( rx >= ry ) {
554			if (ry >= rz) { //rx >= ry && ry >= rz
555				c1_r = CLU(r_table, x_n, y, z) - c0_r;
556				c2_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x_n, y, z);
557				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
558				c1_g = CLU(g_table, x_n, y, z) - c0_g;
559				c2_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x_n, y, z);
560				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
561				c1_b = CLU(b_table, x_n, y, z) - c0_b;
562				c2_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x_n, y, z);
563				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
564			} else {
565				if (rx >= rz) { //rx >= rz && rz >= ry
566					c1_r = CLU(r_table, x_n, y, z) - c0_r;
567					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
568					c3_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x_n, y, z);
569					c1_g = CLU(g_table, x_n, y, z) - c0_g;
570					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
571					c3_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x_n, y, z);
572					c1_b = CLU(b_table, x_n, y, z) - c0_b;
573					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
574					c3_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x_n, y, z);
575				} else { //rz > rx && rx >= ry
576					c1_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x, y, z_n);
577					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
578					c3_r = CLU(r_table, x, y, z_n) - c0_r;
579					c1_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x, y, z_n);
580					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
581					c3_g = CLU(g_table, x, y, z_n) - c0_g;
582					c1_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x, y, z_n);
583					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
584					c3_b = CLU(b_table, x, y, z_n) - c0_b;
585				}
586			}
587		} else {
588			if (rx >= rz) { //ry > rx && rx >= rz
589				c1_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x, y_n, z);
590				c2_r = CLU(r_table, x, y_n, z) - c0_r;
591				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
592				c1_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x, y_n, z);
593				c2_g = CLU(g_table, x, y_n, z) - c0_g;
594				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
595				c1_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x, y_n, z);
596				c2_b = CLU(b_table, x, y_n, z) - c0_b;
597				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
598			} else {
599				if (ry >= rz) { //ry >= rz && rz > rx
600					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
601					c2_r = CLU(r_table, x, y_n, z) - c0_r;
602					c3_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y_n, z);
603					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
604					c2_g = CLU(g_table, x, y_n, z) - c0_g;
605					c3_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y_n, z);
606					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
607					c2_b = CLU(b_table, x, y_n, z) - c0_b;
608					c3_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y_n, z);
609				} else { //rz > ry && ry > rx
610					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
611					c2_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y, z_n);
612					c3_r = CLU(r_table, x, y, z_n) - c0_r;
613					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
614					c2_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y, z_n);
615					c3_g = CLU(g_table, x, y, z_n) - c0_g;
616					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
617					c2_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y, z_n);
618					c3_b = CLU(b_table, x, y, z_n) - c0_b;
619				}
620			}
621		}
622
623		clut_r = c0_r + c1_r*rx + c2_r*ry + c3_r*rz;
624		clut_g = c0_g + c1_g*rx + c2_g*ry + c3_g*rz;
625		clut_b = c0_b + c1_b*rx + c2_b*ry + c3_b*rz;
626
627		dest[r_out] = clamp_u8(clut_r*255.0f);
628		dest[1]     = clamp_u8(clut_g*255.0f);
629		dest[b_out] = clamp_u8(clut_b*255.0f);
630		dest[3]     = in_a;
631		dest += 4;
632	}
633}
634
635// Using lcms' tetra interpolation code.
636static void qcms_transform_data_tetra_clut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
637{
638	const int r_out = output_format.r;
639	const int b_out = output_format.b;
640
641	unsigned int i;
642	int xy_len = 1;
643	int x_len = transform->grid_size;
644	int len = x_len * x_len;
645	float* r_table = transform->r_clut;
646	float* g_table = transform->g_clut;
647	float* b_table = transform->b_clut;
648	float c0_r, c1_r, c2_r, c3_r;
649	float c0_g, c1_g, c2_g, c3_g;
650	float c0_b, c1_b, c2_b, c3_b;
651	float clut_r, clut_g, clut_b;
652	for (i = 0; i < length; i++) {
653		unsigned char in_r = *src++;
654		unsigned char in_g = *src++;
655		unsigned char in_b = *src++;
656		float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;
657
658		int x = floor(linear_r * (transform->grid_size-1));
659		int y = floor(linear_g * (transform->grid_size-1));
660		int z = floor(linear_b * (transform->grid_size-1));
661		int x_n = ceil(linear_r * (transform->grid_size-1));
662		int y_n = ceil(linear_g * (transform->grid_size-1));
663		int z_n = ceil(linear_b * (transform->grid_size-1));
664		float rx = linear_r * (transform->grid_size-1) - x;
665		float ry = linear_g * (transform->grid_size-1) - y;
666		float rz = linear_b * (transform->grid_size-1) - z;
667
668		c0_r = CLU(r_table, x, y, z);
669		c0_g = CLU(g_table, x, y, z);
670		c0_b = CLU(b_table, x, y, z);
671
672		if( rx >= ry ) {
673			if (ry >= rz) { //rx >= ry && ry >= rz
674				c1_r = CLU(r_table, x_n, y, z) - c0_r;
675				c2_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x_n, y, z);
676				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
677				c1_g = CLU(g_table, x_n, y, z) - c0_g;
678				c2_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x_n, y, z);
679				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
680				c1_b = CLU(b_table, x_n, y, z) - c0_b;
681				c2_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x_n, y, z);
682				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
683			} else {
684				if (rx >= rz) { //rx >= rz && rz >= ry
685					c1_r = CLU(r_table, x_n, y, z) - c0_r;
686					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
687					c3_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x_n, y, z);
688					c1_g = CLU(g_table, x_n, y, z) - c0_g;
689					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
690					c3_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x_n, y, z);
691					c1_b = CLU(b_table, x_n, y, z) - c0_b;
692					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
693					c3_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x_n, y, z);
694				} else { //rz > rx && rx >= ry
695					c1_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x, y, z_n);
696					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
697					c3_r = CLU(r_table, x, y, z_n) - c0_r;
698					c1_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x, y, z_n);
699					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
700					c3_g = CLU(g_table, x, y, z_n) - c0_g;
701					c1_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x, y, z_n);
702					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
703					c3_b = CLU(b_table, x, y, z_n) - c0_b;
704				}
705			}
706		} else {
707			if (rx >= rz) { //ry > rx && rx >= rz
708				c1_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x, y_n, z);
709				c2_r = CLU(r_table, x, y_n, z) - c0_r;
710				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
711				c1_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x, y_n, z);
712				c2_g = CLU(g_table, x, y_n, z) - c0_g;
713				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
714				c1_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x, y_n, z);
715				c2_b = CLU(b_table, x, y_n, z) - c0_b;
716				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
717			} else {
718				if (ry >= rz) { //ry >= rz && rz > rx
719					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
720					c2_r = CLU(r_table, x, y_n, z) - c0_r;
721					c3_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y_n, z);
722					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
723					c2_g = CLU(g_table, x, y_n, z) - c0_g;
724					c3_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y_n, z);
725					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
726					c2_b = CLU(b_table, x, y_n, z) - c0_b;
727					c3_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y_n, z);
728				} else { //rz > ry && ry > rx
729					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
730					c2_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y, z_n);
731					c3_r = CLU(r_table, x, y, z_n) - c0_r;
732					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
733					c2_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y, z_n);
734					c3_g = CLU(g_table, x, y, z_n) - c0_g;
735					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
736					c2_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y, z_n);
737					c3_b = CLU(b_table, x, y, z_n) - c0_b;
738				}
739			}
740		}
741
742		clut_r = c0_r + c1_r*rx + c2_r*ry + c3_r*rz;
743		clut_g = c0_g + c1_g*rx + c2_g*ry + c3_g*rz;
744		clut_b = c0_b + c1_b*rx + c2_b*ry + c3_b*rz;
745
746		dest[r_out] = clamp_u8(clut_r*255.0f);
747		dest[1]     = clamp_u8(clut_g*255.0f);
748		dest[b_out] = clamp_u8(clut_b*255.0f);
749		dest += 3;
750	}
751}
752
753static void qcms_transform_data_rgb_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
754{
755	const int r_out = output_format.r;
756	const int b_out = output_format.b;
757
758	unsigned int i;
759	float (*mat)[4] = transform->matrix;
760	for (i = 0; i < length; i++) {
761		unsigned char device_r = *src++;
762		unsigned char device_g = *src++;
763		unsigned char device_b = *src++;
764		float out_device_r, out_device_g, out_device_b;
765
766		float linear_r = transform->input_gamma_table_r[device_r];
767		float linear_g = transform->input_gamma_table_g[device_g];
768		float linear_b = transform->input_gamma_table_b[device_b];
769
770		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
771		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
772		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
773
774		out_linear_r = clamp_float(out_linear_r);
775		out_linear_g = clamp_float(out_linear_g);
776		out_linear_b = clamp_float(out_linear_b);
777
778		out_device_r = lut_interp_linear(out_linear_r,
779				transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
780		out_device_g = lut_interp_linear(out_linear_g,
781				transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
782		out_device_b = lut_interp_linear(out_linear_b,
783				transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
784
785		dest[r_out] = clamp_u8(out_device_r*255);
786		dest[1]     = clamp_u8(out_device_g*255);
787		dest[b_out] = clamp_u8(out_device_b*255);
788		dest += 3;
789	}
790}
791
792static void qcms_transform_data_rgba_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
793{
794	const int r_out = output_format.r;
795	const int b_out = output_format.b;
796
797	unsigned int i;
798	float (*mat)[4] = transform->matrix;
799	for (i = 0; i < length; i++) {
800		unsigned char device_r = *src++;
801		unsigned char device_g = *src++;
802		unsigned char device_b = *src++;
803		unsigned char alpha = *src++;
804		float out_device_r, out_device_g, out_device_b;
805
806		float linear_r = transform->input_gamma_table_r[device_r];
807		float linear_g = transform->input_gamma_table_g[device_g];
808		float linear_b = transform->input_gamma_table_b[device_b];
809
810		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
811		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
812		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
813
814		out_linear_r = clamp_float(out_linear_r);
815		out_linear_g = clamp_float(out_linear_g);
816		out_linear_b = clamp_float(out_linear_b);
817
818		out_device_r = lut_interp_linear(out_linear_r,
819				transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
820		out_device_g = lut_interp_linear(out_linear_g,
821				transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
822		out_device_b = lut_interp_linear(out_linear_b,
823				transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
824
825		dest[r_out] = clamp_u8(out_device_r*255);
826		dest[1]     = clamp_u8(out_device_g*255);
827		dest[b_out] = clamp_u8(out_device_b*255);
828		dest[3]     = alpha;
829		dest += 4;
830	}
831}
832
833#if 0
834static void qcms_transform_data_rgb_out_linear(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
835{
836	const int r_out = output_format.r;
837	const int b_out = output_format.b;
838
839	int i;
840	float (*mat)[4] = transform->matrix;
841	for (i = 0; i < length; i++) {
842		unsigned char device_r = *src++;
843		unsigned char device_g = *src++;
844		unsigned char device_b = *src++;
845
846		float linear_r = transform->input_gamma_table_r[device_r];
847		float linear_g = transform->input_gamma_table_g[device_g];
848		float linear_b = transform->input_gamma_table_b[device_b];
849
850		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
851		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
852		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
853
854		dest[r_out] = clamp_u8(out_linear_r*255);
855		dest[1]     = clamp_u8(out_linear_g*255);
856		dest[b_out] = clamp_u8(out_linear_b*255);
857		dest += 3;
858	}
859}
860#endif
861
862/*
863 * If users create and destroy objects on different threads, even if the same
864 * objects aren't used on different threads at the same time, we can still run
865 * in to trouble with refcounts if they aren't atomic.
866 *
867 * This can lead to us prematurely deleting the precache if threads get unlucky
868 * and write the wrong value to the ref count.
869 */
870static struct precache_output *precache_reference(struct precache_output *p)
871{
872	qcms_atomic_increment(p->ref_count);
873	return p;
874}
875
876static struct precache_output *precache_create()
877{
878	struct precache_output *p = malloc(sizeof(struct precache_output));
879	if (p)
880		p->ref_count = 1;
881	return p;
882}
883
884void precache_release(struct precache_output *p)
885{
886	if (qcms_atomic_decrement(p->ref_count) == 0) {
887		free(p);
888	}
889}
890
891#ifdef HAVE_POSIX_MEMALIGN
892static qcms_transform *transform_alloc(void)
893{
894	qcms_transform *t;
895	if (!posix_memalign(&t, 16, sizeof(*t))) {
896		return t;
897	} else {
898		return NULL;
899	}
900}
901static void transform_free(qcms_transform *t)
902{
903	free(t);
904}
905#else
906static qcms_transform *transform_alloc(void)
907{
908	/* transform needs to be aligned on a 16byte boundrary */
909	char *original_block = calloc(sizeof(qcms_transform) + sizeof(void*) + 16, 1);
910	/* make room for a pointer to the block returned by calloc */
911	void *transform_start = original_block + sizeof(void*);
912	/* align transform_start */
913	qcms_transform *transform_aligned = (qcms_transform*)(((uintptr_t)transform_start + 15) & ~0xf);
914
915	/* store a pointer to the block returned by calloc so that we can free it later */
916	void **(original_block_ptr) = (void**)transform_aligned;
917	if (!original_block)
918		return NULL;
919	original_block_ptr--;
920	*original_block_ptr = original_block;
921
922	return transform_aligned;
923}
924static void transform_free(qcms_transform *t)
925{
926	/* get at the pointer to the unaligned block returned by calloc */
927	void **p = (void**)t;
928	p--;
929	free(*p);
930}
931#endif
932
933void qcms_transform_release(qcms_transform *t)
934{
935	/* ensure we only free the gamma tables once even if there are
936	 * multiple references to the same data */
937
938	if (t->output_table_r)
939		precache_release(t->output_table_r);
940	if (t->output_table_g)
941		precache_release(t->output_table_g);
942	if (t->output_table_b)
943		precache_release(t->output_table_b);
944
945	free(t->input_gamma_table_r);
946	if (t->input_gamma_table_g != t->input_gamma_table_r)
947		free(t->input_gamma_table_g);
948	if (t->input_gamma_table_g != t->input_gamma_table_r &&
949	    t->input_gamma_table_g != t->input_gamma_table_b)
950		free(t->input_gamma_table_b);
951
952	free(t->input_gamma_table_gray);
953
954	free(t->output_gamma_lut_r);
955	free(t->output_gamma_lut_g);
956	free(t->output_gamma_lut_b);
957
958	transform_free(t);
959}
960
961#ifdef X86
962// Determine if we can build with SSE2 (this was partly copied from jmorecfg.h in
963// mozilla/jpeg)
964 // -------------------------------------------------------------------------
965#if defined(_M_IX86) && defined(_MSC_VER)
966#define HAS_CPUID
967/* Get us a CPUID function. Avoid clobbering EBX because sometimes it's the PIC
968   register - I'm not sure if that ever happens on windows, but cpuid isn't
969   on the critical path so we just preserve the register to be safe and to be
970   consistent with the non-windows version. */
971static void cpuid(uint32_t fxn, uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
972       uint32_t a_, b_, c_, d_;
973       __asm {
974              xchg   ebx, esi
975              mov    eax, fxn
976              cpuid
977              mov    a_, eax
978              mov    b_, ebx
979              mov    c_, ecx
980              mov    d_, edx
981              xchg   ebx, esi
982       }
983       *a = a_;
984       *b = b_;
985       *c = c_;
986       *d = d_;
987}
988#elif (defined(__GNUC__) || defined(__SUNPRO_C)) && (defined(__i386__) || defined(__i386))
989#define HAS_CPUID
990/* Get us a CPUID function. We can't use ebx because it's the PIC register on
991   some platforms, so we use ESI instead and save ebx to avoid clobbering it. */
992static void cpuid(uint32_t fxn, uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
993
994	uint32_t a_, b_, c_, d_;
995       __asm__ __volatile__ ("xchgl %%ebx, %%esi; cpuid; xchgl %%ebx, %%esi;"
996                             : "=a" (a_), "=S" (b_), "=c" (c_), "=d" (d_) : "a" (fxn));
997	   *a = a_;
998	   *b = b_;
999	   *c = c_;
1000	   *d = d_;
1001}
1002#endif
1003
1004// -------------------------Runtime SSEx Detection-----------------------------
1005
1006/* MMX is always supported per
1007 *  Gecko v1.9.1 minimum CPU requirements */
1008#define SSE1_EDX_MASK (1UL << 25)
1009#define SSE2_EDX_MASK (1UL << 26)
1010#define SSE3_ECX_MASK (1UL <<  0)
1011
1012static int sse_version_available(void)
1013{
1014#if defined(__x86_64__) || defined(__x86_64) || defined(_M_AMD64)
1015	/* we know at build time that 64-bit CPUs always have SSE2
1016	 * this tells the compiler that non-SSE2 branches will never be
1017	 * taken (i.e. OK to optimze away the SSE1 and non-SIMD code */
1018	return 2;
1019#elif defined(HAS_CPUID)
1020	static int sse_version = -1;
1021	uint32_t a, b, c, d;
1022	uint32_t function = 0x00000001;
1023
1024	if (sse_version == -1) {
1025		sse_version = 0;
1026		cpuid(function, &a, &b, &c, &d);
1027		if (c & SSE3_ECX_MASK)
1028			sse_version = 3;
1029		else if (d & SSE2_EDX_MASK)
1030			sse_version = 2;
1031		else if (d & SSE1_EDX_MASK)
1032			sse_version = 1;
1033	}
1034
1035	return sse_version;
1036#else
1037	return 0;
1038#endif
1039}
1040#endif
1041
1042static const struct matrix bradford_matrix = {{	{ 0.8951f, 0.2664f,-0.1614f},
1043						{-0.7502f, 1.7135f, 0.0367f},
1044						{ 0.0389f,-0.0685f, 1.0296f}},
1045						false};
1046
1047static const struct matrix bradford_matrix_inv = {{ { 0.9869929f,-0.1470543f, 0.1599627f},
1048						    { 0.4323053f, 0.5183603f, 0.0492912f},
1049						    {-0.0085287f, 0.0400428f, 0.9684867f}},
1050						    false};
1051
1052// See ICCv4 E.3
1053struct matrix compute_whitepoint_adaption(float X, float Y, float Z) {
1054	float p = (0.96422f*bradford_matrix.m[0][0] + 1.000f*bradford_matrix.m[1][0] + 0.82521f*bradford_matrix.m[2][0]) /
1055		  (X*bradford_matrix.m[0][0]      + Y*bradford_matrix.m[1][0]      + Z*bradford_matrix.m[2][0]     );
1056	float y = (0.96422f*bradford_matrix.m[0][1] + 1.000f*bradford_matrix.m[1][1] + 0.82521f*bradford_matrix.m[2][1]) /
1057		  (X*bradford_matrix.m[0][1]      + Y*bradford_matrix.m[1][1]      + Z*bradford_matrix.m[2][1]     );
1058	float b = (0.96422f*bradford_matrix.m[0][2] + 1.000f*bradford_matrix.m[1][2] + 0.82521f*bradford_matrix.m[2][2]) /
1059		  (X*bradford_matrix.m[0][2]      + Y*bradford_matrix.m[1][2]      + Z*bradford_matrix.m[2][2]     );
1060	struct matrix white_adaption = {{ {p,0,0}, {0,y,0}, {0,0,b}}, false};
1061	return matrix_multiply( bradford_matrix_inv, matrix_multiply(white_adaption, bradford_matrix) );
1062}
1063
1064void qcms_profile_precache_output_transform(qcms_profile *profile)
1065{
1066	/* we only support precaching on rgb profiles */
1067	if (profile->color_space != RGB_SIGNATURE)
1068		return;
1069
1070	if (qcms_supports_iccv4) {
1071		/* don't precache since we will use the B2A LUT */
1072		if (profile->B2A0)
1073			return;
1074
1075		/* don't precache since we will use the mBA LUT */
1076		if (profile->mBA)
1077			return;
1078	}
1079
1080	/* don't precache if we do not have the TRC curves */
1081	if (!profile->redTRC || !profile->greenTRC || !profile->blueTRC)
1082		return;
1083
1084	if (!profile->output_table_r) {
1085		profile->output_table_r = precache_create();
1086		if (profile->output_table_r &&
1087				!compute_precache(profile->redTRC, profile->output_table_r->data)) {
1088			precache_release(profile->output_table_r);
1089			profile->output_table_r = NULL;
1090		}
1091	}
1092	if (!profile->output_table_g) {
1093		profile->output_table_g = precache_create();
1094		if (profile->output_table_g &&
1095				!compute_precache(profile->greenTRC, profile->output_table_g->data)) {
1096			precache_release(profile->output_table_g);
1097			profile->output_table_g = NULL;
1098		}
1099	}
1100	if (!profile->output_table_b) {
1101		profile->output_table_b = precache_create();
1102		if (profile->output_table_b &&
1103				!compute_precache(profile->blueTRC, profile->output_table_b->data)) {
1104			precache_release(profile->output_table_b);
1105			profile->output_table_b = NULL;
1106		}
1107	}
1108}
1109
1110/* Replace the current transformation with a LUT transformation using a given number of sample points */
1111qcms_transform* qcms_transform_precacheLUT_float(qcms_transform *transform, qcms_profile *in, qcms_profile *out,
1112                                                 int samples, qcms_data_type in_type)
1113{
1114	/* The range between which 2 consecutive sample points can be used to interpolate */
1115	uint16_t x,y,z;
1116	uint32_t l;
1117	uint32_t lutSize = 3 * samples * samples * samples;
1118	float* src = NULL;
1119	float* dest = NULL;
1120	float* lut = NULL;
1121
1122	src = malloc(lutSize*sizeof(float));
1123	dest = malloc(lutSize*sizeof(float));
1124
1125	if (src && dest) {
1126		/* Prepare a list of points we want to sample */
1127		l = 0;
1128		for (x = 0; x < samples; x++) {
1129			for (y = 0; y < samples; y++) {
1130				for (z = 0; z < samples; z++) {
1131					src[l++] = x / (float)(samples-1);
1132					src[l++] = y / (float)(samples-1);
1133					src[l++] = z / (float)(samples-1);
1134				}
1135			}
1136		}
1137
1138		lut = qcms_chain_transform(in, out, src, dest, lutSize);
1139		if (lut) {
1140			transform->r_clut = &lut[0];
1141			transform->g_clut = &lut[1];
1142			transform->b_clut = &lut[2];
1143			transform->grid_size = samples;
1144			if (in_type == QCMS_DATA_RGBA_8) {
1145				transform->transform_fn = qcms_transform_data_tetra_clut_rgba;
1146			} else {
1147				transform->transform_fn = qcms_transform_data_tetra_clut;
1148			}
1149		}
1150	}
1151
1152
1153	//XXX: qcms_modular_transform_data may return either the src or dest buffer. If so it must not be free-ed
1154	if (src && lut != src) {
1155		free(src);
1156	}
1157	if (dest && lut != dest) {
1158		free(dest);
1159	}
1160
1161	if (lut == NULL) {
1162		return NULL;
1163	}
1164	return transform;
1165}
1166
1167#define NO_MEM_TRANSFORM NULL
1168
1169qcms_transform* qcms_transform_create(
1170		qcms_profile *in, qcms_data_type in_type,
1171		qcms_profile *out, qcms_data_type out_type,
1172		qcms_intent intent)
1173{
1174	bool precache = false;
1175
1176        qcms_transform *transform = transform_alloc();
1177        if (!transform) {
1178		return NULL;
1179	}
1180	if (out_type != QCMS_DATA_RGB_8 &&
1181                out_type != QCMS_DATA_RGBA_8) {
1182            assert(0 && "output type");
1183	    transform_free(transform);
1184            return NULL;
1185        }
1186
1187	if (out->output_table_r &&
1188			out->output_table_g &&
1189			out->output_table_b) {
1190		precache = true;
1191	}
1192
1193	if (qcms_supports_iccv4 && (in->A2B0 || out->B2A0 || in->mAB || out->mAB)) {
1194		// Precache the transformation to a CLUT 33x33x33 in size.
1195		// 33 is used by many profiles and works well in pratice.
1196		// This evenly divides 256 into blocks of 8x8x8.
1197		// TODO For transforming small data sets of about 200x200 or less
1198		// precaching should be avoided.
1199		qcms_transform *result = qcms_transform_precacheLUT_float(transform, in, out, 33, in_type);
1200		if (!result) {
1201            		assert(0 && "precacheLUT failed");
1202			transform_free(transform);
1203			return NULL;
1204		}
1205		return result;
1206	}
1207
1208	if (precache) {
1209		transform->output_table_r = precache_reference(out->output_table_r);
1210		transform->output_table_g = precache_reference(out->output_table_g);
1211		transform->output_table_b = precache_reference(out->output_table_b);
1212	} else {
1213		if (!out->redTRC || !out->greenTRC || !out->blueTRC) {
1214			qcms_transform_release(transform);
1215			return NO_MEM_TRANSFORM;
1216		}
1217		build_output_lut(out->redTRC, &transform->output_gamma_lut_r, &transform->output_gamma_lut_r_length);
1218		build_output_lut(out->greenTRC, &transform->output_gamma_lut_g, &transform->output_gamma_lut_g_length);
1219		build_output_lut(out->blueTRC, &transform->output_gamma_lut_b, &transform->output_gamma_lut_b_length);
1220		if (!transform->output_gamma_lut_r || !transform->output_gamma_lut_g || !transform->output_gamma_lut_b) {
1221			qcms_transform_release(transform);
1222			return NO_MEM_TRANSFORM;
1223		}
1224	}
1225
1226        if (in->color_space == RGB_SIGNATURE) {
1227		struct matrix in_matrix, out_matrix, result;
1228
1229		if (in_type != QCMS_DATA_RGB_8 &&
1230                    in_type != QCMS_DATA_RGBA_8){
1231                	assert(0 && "input type");
1232			transform_free(transform);
1233                	return NULL;
1234            	}
1235		if (precache) {
1236#if defined(SSE2_ENABLE) && defined(X86)
1237		    if (sse_version_available() >= 2) {
1238			    if (in_type == QCMS_DATA_RGB_8)
1239				    transform->transform_fn = qcms_transform_data_rgb_out_lut_sse2;
1240			    else
1241				    transform->transform_fn = qcms_transform_data_rgba_out_lut_sse2;
1242
1243#if defined(SSE2_ENABLE) && !(defined(_MSC_VER) && defined(_M_AMD64))
1244                    /* Microsoft Compiler for x64 doesn't support MMX.
1245                     * SSE code uses MMX so that we disable on x64 */
1246		    } else
1247		    if (sse_version_available() >= 1) {
1248			    if (in_type == QCMS_DATA_RGB_8)
1249				    transform->transform_fn = qcms_transform_data_rgb_out_lut_sse1;
1250			    else
1251				    transform->transform_fn = qcms_transform_data_rgba_out_lut_sse1;
1252#endif
1253		    } else
1254#endif
1255			{
1256				if (in_type == QCMS_DATA_RGB_8)
1257					transform->transform_fn = qcms_transform_data_rgb_out_lut_precache;
1258				else
1259					transform->transform_fn = qcms_transform_data_rgba_out_lut_precache;
1260			}
1261		} else {
1262			if (in_type == QCMS_DATA_RGB_8)
1263				transform->transform_fn = qcms_transform_data_rgb_out_lut;
1264			else
1265				transform->transform_fn = qcms_transform_data_rgba_out_lut;
1266		}
1267
1268		//XXX: avoid duplicating tables if we can
1269		transform->input_gamma_table_r = build_input_gamma_table(in->redTRC);
1270		transform->input_gamma_table_g = build_input_gamma_table(in->greenTRC);
1271		transform->input_gamma_table_b = build_input_gamma_table(in->blueTRC);
1272		if (!transform->input_gamma_table_r || !transform->input_gamma_table_g || !transform->input_gamma_table_b) {
1273			qcms_transform_release(transform);
1274			return NO_MEM_TRANSFORM;
1275		}
1276
1277
1278		/* build combined colorant matrix */
1279		in_matrix = build_colorant_matrix(in);
1280		out_matrix = build_colorant_matrix(out);
1281		out_matrix = matrix_invert(out_matrix);
1282		if (out_matrix.invalid) {
1283			qcms_transform_release(transform);
1284			return NULL;
1285		}
1286		result = matrix_multiply(out_matrix, in_matrix);
1287
1288		/* store the results in column major mode
1289		 * this makes doing the multiplication with sse easier */
1290		transform->matrix[0][0] = result.m[0][0];
1291		transform->matrix[1][0] = result.m[0][1];
1292		transform->matrix[2][0] = result.m[0][2];
1293		transform->matrix[0][1] = result.m[1][0];
1294		transform->matrix[1][1] = result.m[1][1];
1295		transform->matrix[2][1] = result.m[1][2];
1296		transform->matrix[0][2] = result.m[2][0];
1297		transform->matrix[1][2] = result.m[2][1];
1298		transform->matrix[2][2] = result.m[2][2];
1299
1300	} else if (in->color_space == GRAY_SIGNATURE) {
1301		if (in_type != QCMS_DATA_GRAY_8 &&
1302				in_type != QCMS_DATA_GRAYA_8){
1303			assert(0 && "input type");
1304			transform_free(transform);
1305			return NULL;
1306		}
1307
1308		transform->input_gamma_table_gray = build_input_gamma_table(in->grayTRC);
1309		if (!transform->input_gamma_table_gray) {
1310			qcms_transform_release(transform);
1311			return NO_MEM_TRANSFORM;
1312		}
1313
1314		if (precache) {
1315			if (in_type == QCMS_DATA_GRAY_8) {
1316				transform->transform_fn = qcms_transform_data_gray_out_precache;
1317			} else {
1318				transform->transform_fn = qcms_transform_data_graya_out_precache;
1319			}
1320		} else {
1321			if (in_type == QCMS_DATA_GRAY_8) {
1322				transform->transform_fn = qcms_transform_data_gray_out_lut;
1323			} else {
1324				transform->transform_fn = qcms_transform_data_graya_out_lut;
1325			}
1326		}
1327	} else {
1328		assert(0 && "unexpected colorspace");
1329		transform_free(transform);
1330		return NULL;
1331	}
1332	return transform;
1333}
1334
1335/* __force_align_arg_pointer__ is an x86-only attribute, and gcc/clang warns on unused
1336 * attributes. Don't use this on ARM or AMD64. __has_attribute can detect the presence
1337 * of the attribute but is currently only supported by clang */
1338#if defined(__has_attribute)
1339#define HAS_FORCE_ALIGN_ARG_POINTER __has_attribute(__force_align_arg_pointer__)
1340#elif defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) && !defined(__arm__) && !defined(__mips__)
1341#define HAS_FORCE_ALIGN_ARG_POINTER 1
1342#else
1343#define HAS_FORCE_ALIGN_ARG_POINTER 0
1344#endif
1345
1346#if HAS_FORCE_ALIGN_ARG_POINTER
1347/* we need this to avoid crashes when gcc assumes the stack is 128bit aligned */
1348__attribute__((__force_align_arg_pointer__))
1349#endif
1350void qcms_transform_data(qcms_transform *transform, void *src, void *dest, size_t length)
1351{
1352	static const struct _qcms_format_type output_rgbx = { 0, 2 };
1353
1354	transform->transform_fn(transform, src, dest, length, output_rgbx);
1355}
1356
1357void qcms_transform_data_type(qcms_transform *transform, void *src, void *dest, size_t length, qcms_output_type type)
1358{
1359	static const struct _qcms_format_type output_rgbx = { 0, 2 };
1360	static const struct _qcms_format_type output_bgrx = { 2, 0 };
1361
1362	transform->transform_fn(transform, src, dest, length, type == QCMS_OUTPUT_BGRX ? output_bgrx : output_rgbx);
1363}
1364
1365qcms_bool qcms_supports_iccv4;
1366void qcms_enable_iccv4()
1367{
1368	qcms_supports_iccv4 = true;
1369}
1370