1/*-------------------------------------------------------------------------
2 * drawElements Quality Program OpenGL ES 2.0 Module
3 * -------------------------------------------------
4 *
5 * Copyright 2014 The Android Open Source Project
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 * you may not use this file except in compliance with the License.
9 * You may obtain a copy of the License at
10 *
11 *      http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 *
19 *//*!
20 * \file
21 * \brief Shader operator performance tests.
22 *//*--------------------------------------------------------------------*/
23
24#include "es2pShaderOperatorTests.hpp"
25#include "glsCalibration.hpp"
26#include "gluShaderUtil.hpp"
27#include "gluShaderProgram.hpp"
28#include "gluPixelTransfer.hpp"
29#include "tcuTestLog.hpp"
30#include "tcuRenderTarget.hpp"
31#include "tcuCommandLine.hpp"
32#include "tcuSurface.hpp"
33#include "deStringUtil.hpp"
34#include "deSharedPtr.hpp"
35#include "deClock.h"
36#include "deMath.h"
37
38#include "glwEnums.hpp"
39#include "glwFunctions.hpp"
40
41#include <map>
42#include <algorithm>
43#include <limits>
44#include <set>
45
46namespace deqp
47{
48namespace gles2
49{
50namespace Performance
51{
52
53using namespace gls;
54using namespace glu;
55using tcu::Vec2;
56using tcu::Vec4;
57using tcu::TestLog;
58using de::SharedPtr;
59
60using std::string;
61using std::vector;
62
// Aborts the current measurement by throwing tcu::InternalError, tagged with the file and line of the expansion site.
#define MEASUREMENT_FAIL() throw tcu::InternalError("Unable to get sensible measurements for estimation", DE_NULL, __FILE__, __LINE__)

// Number of measurements in OperatorPerformanceCase for each workload size, unless specified otherwise by a command line argument.
static const int	DEFAULT_NUM_MEASUREMENTS_PER_WORKLOAD	= 3;
// How many different workload sizes are used by OperatorPerformanceCase.
static const int	NUM_WORKLOADS							= 8;
// Maximum workload size that can be attempted. In a sensible case, this most likely won't be reached.
// \note Workload sizes are doubled while searching for a high workload, so this bounds the largest doubled value accepted.
static const int	MAX_WORKLOAD_SIZE						= 1<<29;

// BinaryOpCase-specific constants for shader generation.
static const int	BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS	= 4;
static const int	BINARY_OPERATOR_CASE_SMALL_PROGRAM_UNROLL_AMOUNT	= 2;
static const int	BINARY_OPERATOR_CASE_BIG_PROGRAM_UNROLL_AMOUNT		= 4;

// FunctionCase-specific constants for shader generation.
static const int	FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS			= 4;

// Table of swizzle suffixes: four rows, where column i holds a swizzle with i+1 components.
// NOTE(review): the users of this table are not visible in this part of the file; presumably the row
// selects a variation and the column is (vector size - 1) - confirm against the shader generators.
static const char* const s_swizzles[][4] =
{
	{ "x", "yx", "yzx", "wzyx" },
	{ "y", "zy", "wyz", "xwzy" },
	{ "z", "wy", "zxy", "yzwx" },
	{ "w", "xw", "yxw", "zyxw" }
};
87
88template <int N>
89static tcu::Vector<float, N> mean (const vector<tcu::Vector<float, N> >& data)
90{
91	tcu::Vector<float, N> sum(0.0f);
92	for (int i = 0; i < (int)data.size(); i++)
93		sum += data[i];
94	return sum / tcu::Vector<float, N>((float)data.size());
95}
96
97static void uniformNfv (const glw::Functions& gl, int n, int location, int count, const float* data)
98{
99	switch (n)
100	{
101		case 1: gl.uniform1fv(location, count, data); break;
102		case 2: gl.uniform2fv(location, count, data); break;
103		case 3: gl.uniform3fv(location, count, data); break;
104		case 4: gl.uniform4fv(location, count, data); break;
105		default: DE_ASSERT(false);
106	}
107}
108
109static void uniformNiv (const glw::Functions& gl, int n, int location, int count, const int* data)
110{
111	switch (n)
112	{
113		case 1: gl.uniform1iv(location, count, data); break;
114		case 2: gl.uniform2iv(location, count, data); break;
115		case 3: gl.uniform3iv(location, count, data); break;
116		case 4: gl.uniform4iv(location, count, data); break;
117		default: DE_ASSERT(false);
118	}
119}
120
121static void uniformMatrixNfv (const glw::Functions& gl, int n, int location, int count, const float* data)
122{
123	switch (n)
124	{
125		case 2: gl.uniformMatrix2fv(location, count, GL_FALSE, &data[0]); break;
126		case 3: gl.uniformMatrix3fv(location, count, GL_FALSE, &data[0]); break;
127		case 4: gl.uniformMatrix4fv(location, count, GL_FALSE, &data[0]); break;
128		default: DE_ASSERT(false);
129	}
130}
131
132static glu::DataType getDataTypeFloatOrVec (int size)
133{
134	return size == 1 ? glu::TYPE_FLOAT : glu::getDataTypeFloatVec(size);
135}
136
137static int getIterationCountOrDefault (const tcu::CommandLine& cmdLine, int def)
138{
139	const int cmdLineVal = cmdLine.getTestIterationCount();
140	return cmdLineVal > 0 ? cmdLineVal : def;
141}
142
143static string lineParamsString (const LineParameters& params)
144{
145	return "y = " + de::toString(params.offset) + " + " + de::toString(params.coefficient) + "*x";
146}
147
148namespace
149{
150
151/*--------------------------------------------------------------------*//*!
152 * \brief Abstract class for measuring shader operator performance.
153 *
154 * This class draws multiple times with different workload sizes (set
155 * via a uniform, by subclass). Time for each frame is measured, and the
156 * slope of the workload size vs frame time data is estimated. This slope
157 * tells us the estimated increase in frame time caused by a workload
158 * increase of 1 unit (what 1 workload unit means is up to subclass).
159 *
160 * Generally, the shaders contain not just the operation we're interested
161 * in (e.g. addition) but also some other stuff (e.g. loop overhead). To
162 * eliminate this cost, we actually do the stuff described in the above
163 * paragraph with multiple programs (usually two), which contain different
164 * kinds of workload (e.g. different loop contents). Then we can (in
165 * theory) compute the cost of just one operation in a subclass-dependent
166 * manner.
167 *
168 * At this point, the result tells us the increase in frame time caused
169 * by the addition of one operation. Dividing this by the amount of
170 * draw calls in a frame, and further by the amount of vertices or
171 * fragments in a draw call, we get the time cost of one operation.
172 *
173 * In reality, there sometimes isn't just a trivial linear dependence
174 * between workload size and frame time. Instead, there tends to be some
175 * amount of initial "free" operations. That is, it may be that all
176 * workload sizes below some positive integer C yield the same frame time,
177 * and only workload sizes beyond C increase the frame time in a supposedly
 * linear manner. Graphically, this means that the graph consists of two
 * parts: a horizontal left part, and a linearly increasing right part; the
 * right part starts where the left part ends. The principal task of these
181 * tests is to look at the slope of the increasing right part. Additionally
182 * an estimate for the amount of initial free operations is calculated.
183 * Note that it is also normal to get graphs where the horizontal left part
184 * is of zero width, i.e. there are no free operations.
185 *//*--------------------------------------------------------------------*/
class OperatorPerformanceCase : public tcu::TestCase
{
public:
	//! Selects the default viewport and grid sizes chosen by the constructor.
	enum CaseType
	{
		CASETYPE_VERTEX = 0,
		CASETYPE_FRAGMENT,

		CASETYPE_LAST
	};

	//! Calibration seed; initialNumCalls is handed to the calibrator parameters in init().
	struct InitialCalibration
	{
		int initialNumCalls;
		InitialCalibration (void) : initialNumCalls(1) {}
	};

	//! Shared handle to an InitialCalibration; the case keeps its own copy of the pointer.
	typedef SharedPtr<InitialCalibration> InitialCalibrationStorage;

								OperatorPerformanceCase		(tcu::TestContext& testCtx, glu::RenderContext& renderCtx, const char* name, const char* description,
															 CaseType caseType, int numWorkloads, const InitialCalibrationStorage& initialCalibrationStorage);
								~OperatorPerformanceCase	(void);

	void						init						(void);
	void						deinit						(void);

	IterateResult				iterate						(void);

	//! A named vertex attribute described by its values at the four corners of the grid.
	struct AttribSpec
	{
		AttribSpec (const char* name_, const tcu::Vec4& p00_, const tcu::Vec4& p01_, const tcu::Vec4& p10_, const tcu::Vec4& p11_)
			: name		(name_)
			, p00		(p00_)
			, p01		(p01_)
			, p10		(p10_)
			, p11		(p11_)
		{
		}

		AttribSpec (void) {}

		std::string		name;
		tcu::Vec4		p00;	//!< Bottom left.
		tcu::Vec4		p01;	//!< Bottom right.
		tcu::Vec4		p10;	//!< Top left.
		tcu::Vec4		p11;	//!< Top right.
	};

protected:
	//! Everything needed to build and describe one measured program: shader sources, extra attributes and a log description.
	struct ProgramContext
	{
		string				vertShaderSource;
		string				fragShaderSource;
		vector<AttribSpec>	attributes;		//!< Attributes besides a_position, which prepareProgram() appends itself.

		string				description;	//!< Logged next to the program in init().

		ProgramContext (void) {}
		ProgramContext (const string& vs, const string& fs, const vector<AttribSpec>& attrs, const string& desc)
			: vertShaderSource(vs), fragShaderSource(fs), attributes(attrs), description(desc) {}
	};

	//! Produces the shader sources and attributes of every program to measure; called once from init().
	virtual vector<ProgramContext>	generateProgramData					(void) const = 0;
	//! Sets program-specific uniforms that don't depend on the workload size.
	virtual void					setGeneralUniforms					(deUint32 program) const = 0;
	//! Sets the uniform(s) that specifies the workload size in the shader.
	virtual void					setWorkloadSizeUniform				(deUint32 program, int workload) const = 0;
	//! Computes the cost of a single operation, given the workload costs per program.
	virtual float					computeSingleOperationTime			(const vector<float>& perProgramWorkloadCosts) const = 0;
	//! Logs a human-readable description of what computeSingleOperationTime does.
	virtual void					logSingleOperationCalculationInfo	(void) const = 0;

	glu::RenderContext&				m_renderCtx;

	CaseType						m_caseType;

private:
	enum State
	{
		STATE_CALIBRATING = 0,		//!< Calibrate draw call count, using first program in m_programs, with workload size 1.
		STATE_FIND_HIGH_WORKLOAD,	//!< Find an appropriate lower bound for the highest workload size we intend to use (one with high-enough frame time compared to workload size 1) for each program.
		STATE_MEASURING,			//!< Do actual measurements, for each program in m_programs.
		STATE_REPORTING,			//!< Measurements are done; calculate results and log.
		STATE_FINISHED,				//!< All done.

		STATE_LAST
	};

	//! Frame times collected for one workload size.
	struct WorkloadRecord
	{
		int				workloadSize;
		vector<float>	frameTimes; //!< In microseconds.

				WorkloadRecord	(int workloadSize_)						: workloadSize(workloadSize_) {}
		bool	operator<		(const WorkloadRecord& other) const		{ return this->workloadSize < other.workloadSize; }
		void	addFrameTime	(float time)							{ frameTimes.push_back(time); }
		//! Median of frameTimes (mean of the two middle values for even counts).
		//! \note Indexes into the sorted copy unconditionally, so at least one frame time must have been added.
		float	getMedianTime	(void) const
		{
			vector<float> times = frameTimes;
			std::sort(times.begin(), times.end());
			return times.size() % 2 == 0 ?
					(times[times.size()/2-1] + times[times.size()/2])*0.5f :
					times[times.size()/2];
		}
	};

	void								prepareProgram				(int progNdx);					//!< Sets attributes and uniforms for m_programs[progNdx].
	void								prepareWorkload				(int progNdx, int workload);	//!< Calls setWorkloadSizeUniform and draws, in case the implementation does some draw-time compilation.
	void								prepareNextRound			(void);							//!< Increases workload and/or updates m_state.
	void								render						(int numDrawCalls);
	deUint64							renderAndMeasure			(int numDrawCalls);
	void								adjustAndLogGridAndViewport	(void);							//!< Log grid and viewport sizes, after possibly reducing them to reduce draw time.

	vector<Vec2>						getWorkloadMedianDataPoints	(int progNdx) const; //!< [ Vec2(r.workloadSize, r.getMedianTime()) for r in m_workloadRecords[progNdx] ]

	const int							m_numMeasurementsPerWorkload;
	const int							m_numWorkloads;				//!< How many different workload sizes are used for measurement for each program.

	int									m_workloadNdx;				//!< Runs from 0 to m_numWorkloads-1.

	int									m_workloadMeasurementNdx;
	vector<vector<WorkloadRecord> >		m_workloadRecordsFindHigh;	//!< The measurements done during STATE_FIND_HIGH_WORKLOAD.
	vector<vector<WorkloadRecord> >		m_workloadRecords;			//!< The measurements of each program in m_programs. Generated during STATE_MEASURING, into index specified by m_measureProgramNdx.

	State								m_state;
	int									m_measureProgramNdx;		//!< When m_state is STATE_FIND_HIGH_WORKLOAD or STATE_MEASURING, this tells which program in m_programs is being measured.

	vector<int>							m_highWorkloadSizes;		//!< The first workload size encountered during STATE_FIND_HIGH_WORKLOAD that was determined suitable, for each program.

	TheilSenCalibrator					m_calibrator;
	InitialCalibrationStorage			m_initialCalibrationStorage;

	// Viewport and grid dimensions; initial values depend on the case type and may later be reduced by adjustAndLogGridAndViewport().
	int									m_viewportWidth;
	int									m_viewportHeight;
	int									m_gridSizeX;
	int									m_gridSizeY;

	vector<ProgramContext>				m_programData;				//!< Result of generateProgramData(); same indexing as m_programs.
	vector<SharedPtr<ShaderProgram> >	m_programs;

	std::vector<deUint32>				m_attribBuffers;			//!< GL buffer names created by prepareProgram(); deleted in deinit() and the destructor.
};
328
//! Linear interpolation over a triangle: starts from \a v0 and adds (v2-v0) scaled by \a x and (v1-v0) scaled by \a y.
static inline float triangleInterpolate (float v0, float v1, float v2, float x, float y)
{
	const float xTerm = (v2 - v0) * x;
	const float yTerm = (v1 - v0) * y;

	return v0 + xTerm + yTerm;
}
333
334static inline float triQuadInterpolate (float x, float y, const tcu::Vec4& quad)
335{
336	// \note Top left fill rule.
337	if (x + y < 1.0f)
338		return triangleInterpolate(quad.x(), quad.y(), quad.z(), x, y);
339	else
340		return triangleInterpolate(quad.w(), quad.z(), quad.y(), 1.0f-x, 1.0f-y);
341}
342
//! Number of vertices needed to draw the grid as separate triangles: two triangles of three vertices per cell.
static inline int getNumVertices (int gridSizeX, int gridSizeY)
{
	const int trianglesPerCell		= 2;
	const int verticesPerTriangle	= 3;

	return gridSizeX * gridSizeY * trianglesPerCell * verticesPerTriangle;
}
347
348static void generateVertices (std::vector<float>& dst, int gridSizeX, int gridSizeY, const OperatorPerformanceCase::AttribSpec& spec)
349{
350	const int numComponents = 4;
351
352	DE_ASSERT(gridSizeX >= 1 && gridSizeY >= 1);
353	dst.resize(getNumVertices(gridSizeX, gridSizeY) * numComponents);
354
355	{
356		int dstNdx = 0;
357
358		for (int baseY = 0; baseY < gridSizeY; baseY++)
359		for (int baseX = 0; baseX < gridSizeX; baseX++)
360		{
361			const float xf0 = (float)(baseX + 0) / (float)gridSizeX;
362			const float yf0 = (float)(baseY + 0) / (float)gridSizeY;
363			const float xf1 = (float)(baseX + 1) / (float)gridSizeX;
364			const float yf1 = (float)(baseY + 1) / (float)gridSizeY;
365
366#define ADD_VERTEX(XF, YF)										\
367	for (int compNdx = 0; compNdx < numComponents; compNdx++)	\
368		dst[dstNdx++] = triQuadInterpolate((XF), (YF), tcu::Vec4(spec.p00[compNdx], spec.p01[compNdx], spec.p10[compNdx], spec.p11[compNdx]))
369
370			ADD_VERTEX(xf0, yf0);
371			ADD_VERTEX(xf1, yf0);
372			ADD_VERTEX(xf0, yf1);
373
374			ADD_VERTEX(xf1, yf0);
375			ADD_VERTEX(xf1, yf1);
376			ADD_VERTEX(xf0, yf1);
377
378#undef ADD_VERTEX
379		}
380	}
381}
382
383static float intersectionX (const gls::LineParameters& a, const gls::LineParameters& b)
384{
385	return (a.offset - b.offset) / (b.coefficient - a.coefficient);
386}
387
388static int numDistinctX (const vector<Vec2>& data)
389{
390	std::set<float> xs;
391	for (int i = 0; i < (int)data.size(); i++)
392		xs.insert(data[i].x());
393	return (int)xs.size();
394}
395
396static gls::LineParameters simpleLinearRegression (const vector<Vec2>& data)
397{
398	const Vec2	mid					= mean(data);
399
400	float		slopeNumerator		= 0.0f;
401	float		slopeDenominator	= 0.0f;
402
403	for (int i = 0; i < (int)data.size(); i++)
404	{
405		const Vec2 diff = data[i] - mid;
406
407		slopeNumerator		+= diff.x()*diff.y();
408		slopeDenominator	+= diff.x()*diff.x();
409	}
410
411	const float slope	= slopeNumerator / slopeDenominator;
412	const float offset	= mid.y() - slope*mid.x();
413
414	return gls::LineParameters(offset, slope);
415}
416
417static float simpleLinearRegressionError (const vector<Vec2>& data)
418{
419	if (numDistinctX(data) <= 2)
420		return 0.0f;
421	else
422	{
423		const gls::LineParameters	estimator	= simpleLinearRegression(data);
424		float						error		= 0.0f;
425
426		for (int i = 0; i < (int)data.size(); i++)
427		{
428			const float estY = estimator.offset + estimator.coefficient*data[i].x();
429			const float diff = estY - data[i].y();
430			error += diff*diff;
431		}
432
433		return error / (float)data.size();
434	}
435}
436
437static float verticalVariance (const vector<Vec2>& data)
438{
439	if (numDistinctX(data) <= 2)
440		return 0.0f;
441	else
442	{
443		const float		meanY = mean(data).y();
444		float			error = 0.0f;
445
446		for (int i = 0; i < (int)data.size(); i++)
447		{
448			const float diff = meanY - data[i].y();
449			error += diff*diff;
450		}
451
452		return error / (float)data.size();
453	}
454}
455
456/*--------------------------------------------------------------------*//*!
457 * \brief Find the x coord that divides the input data into two slopes.
458 *
459 * The operator performance measurements tend to produce results where
460 * we get small operation counts "for free" (e.g. because the operations
461 * are performed during some memory transfer overhead or something),
462 * resulting in a curve with two parts: an initial horizontal line segment,
463 * and a rising line.
464 *
465 * This function finds the x coordinate that divides the input data into
466 * two parts such that the sum of the mean square errors for the
467 * least-squares estimated lines for the two parts is minimized, under the
468 * additional condition that the left line is horizontal.
469 *
470 * This function returns a number X s.t. { pt | pt is in data, pt.x >= X }
471 * is the right line, and the rest of data is the left line.
472 *//*--------------------------------------------------------------------*/
473static float findSlopePivotX (const vector<Vec2>& data)
474{
475	std::set<float> xCoords;
476	for (int i = 0; i < (int)data.size(); i++)
477		xCoords.insert(data[i].x());
478
479	float			lowestError		= std::numeric_limits<float>::infinity();
480	float			bestPivotX		= -std::numeric_limits<float>::infinity();
481
482	for (std::set<float>::const_iterator pivotX = xCoords.begin(); pivotX != xCoords.end(); ++pivotX)
483	{
484		vector<Vec2> leftData;
485		vector<Vec2> rightData;
486		for (int i = 0; i < (int)data.size(); i++)
487		{
488			if (data[i].x() < *pivotX)
489				leftData.push_back(data[i]);
490			else
491				rightData.push_back(data[i]);
492		}
493
494		if (numDistinctX(rightData) < 3) // We don't trust the right data if there's too little of it.
495			break;
496
497		{
498			const float totalError = verticalVariance(leftData) + simpleLinearRegressionError(rightData);
499
500			if (totalError < lowestError)
501			{
502				lowestError = totalError;
503				bestPivotX = *pivotX;
504			}
505		}
506	}
507
508	DE_ASSERT(lowestError < std::numeric_limits<float>::infinity());
509
510	return bestPivotX;
511}
512
513struct SegmentedEstimator
514{
515	float					pivotX; //!< Value returned by findSlopePivotX, or -infinity if only single line.
516	gls::LineParameters		left;
517	gls::LineParameters		right;
518	SegmentedEstimator (const gls::LineParameters& l, const gls::LineParameters& r, float pivotX_) : pivotX(pivotX_), left(l), right(r) {}
519};
520
521/*--------------------------------------------------------------------*//*!
522 * \brief Compute line estimators for (potentially) two-segment data.
523 *
524 * Splits the given data into left and right parts (using findSlopePivotX)
525 * and returns the line estimates for them.
526 *
527 * Sometimes, however (especially in fragment shader cases) the data is
528 * in fact not segmented, but a straight line. This function attempts to
 * detect if this is the case, and if so, sets left.offset = right.offset and
 * left.coefficient = 0, meaning essentially that the initial "flat" part of the
531 * data has zero width.
532 *//*--------------------------------------------------------------------*/
533static SegmentedEstimator computeSegmentedEstimator (const vector<Vec2>& data)
534{
535	const float		pivotX = findSlopePivotX(data);
536	vector<Vec2>	leftData;
537	vector<Vec2>	rightData;
538
539	for (int i = 0; i < (int)data.size(); i++)
540	{
541		if (data[i].x() < pivotX)
542			leftData.push_back(data[i]);
543		else
544			rightData.push_back(data[i]);
545	}
546
547	{
548		const gls::LineParameters leftLine		= gls::theilSenLinearRegression(leftData);
549		const gls::LineParameters rightLine		= gls::theilSenLinearRegression(rightData);
550
551		if (numDistinctX(leftData) < 2 || leftLine.coefficient > rightLine.coefficient*0.5f)
552		{
553			// Left data doesn't seem credible; assume the data is just a single line.
554			const gls::LineParameters entireLine = gls::theilSenLinearRegression(data);
555			return SegmentedEstimator(gls::LineParameters(entireLine.offset, 0.0f), entireLine, -std::numeric_limits<float>::infinity());
556		}
557		else
558			return SegmentedEstimator(leftLine, rightLine, pivotX);
559	}
560}
561
/*--------------------------------------------------------------------*//*!
 * \brief Construct the case; no GL calls are made here.
 *
 * The measurement count per workload comes from the command line's test
 * iteration count when positive, else from
 * DEFAULT_NUM_MEASUREMENTS_PER_WORKLOAD. Vertex cases start with a 32x32
 * viewport and a 100x100 grid; fragment cases start with a viewport
 * covering the whole render target and a 1x1 grid. Progress indices start
 * at -1 and the state at STATE_LAST until init() runs.
 *
 * \param numWorkloads Number of workload sizes measured per program; must be positive.
 *//*--------------------------------------------------------------------*/
OperatorPerformanceCase::OperatorPerformanceCase (tcu::TestContext& testCtx, glu::RenderContext& renderCtx, const char* name, const char* description,
												  CaseType caseType, int numWorkloads, const InitialCalibrationStorage& initialCalibrationStorage)
	: tcu::TestCase					(testCtx, tcu::NODETYPE_PERFORMANCE, name, description)
	, m_renderCtx					(renderCtx)
	, m_caseType					(caseType)
	, m_numMeasurementsPerWorkload	(getIterationCountOrDefault(m_testCtx.getCommandLine(), DEFAULT_NUM_MEASUREMENTS_PER_WORKLOAD))
	, m_numWorkloads				(numWorkloads)
	, m_workloadNdx					(-1)
	, m_workloadMeasurementNdx		(-1)
	, m_state						(STATE_LAST)
	, m_measureProgramNdx			(-1)
	, m_initialCalibrationStorage	(initialCalibrationStorage)
	, m_viewportWidth				(caseType == CASETYPE_VERTEX	? 32	: renderCtx.getRenderTarget().getWidth())
	, m_viewportHeight				(caseType == CASETYPE_VERTEX	? 32	: renderCtx.getRenderTarget().getHeight())
	, m_gridSizeX					(caseType == CASETYPE_FRAGMENT	? 1		: 100)
	, m_gridSizeY					(caseType == CASETYPE_FRAGMENT	? 1		: 100)
{
	DE_ASSERT(m_numWorkloads > 0);
}
581
582OperatorPerformanceCase::~OperatorPerformanceCase (void)
583{
584	if (!m_attribBuffers.empty())
585	{
586		m_renderCtx.getFunctions().deleteBuffers((glw::GLsizei)m_attribBuffers.size(), &m_attribBuffers[0]);
587		m_attribBuffers.clear();
588	}
589}
590
591static void logRenderTargetInfo (TestLog& log, const tcu::RenderTarget& renderTarget)
592{
593	log << TestLog::Section("RenderTarget", "Render target")
594		<< TestLog::Message << "size: " << renderTarget.getWidth() << "x" << renderTarget.getHeight() << TestLog::EndMessage
595		<< TestLog::Message << "bits:"
596							<< " R" << renderTarget.getPixelFormat().redBits
597							<< " G" << renderTarget.getPixelFormat().greenBits
598							<< " B" << renderTarget.getPixelFormat().blueBits
599							<< " A" << renderTarget.getPixelFormat().alphaBits
600							<< " D" << renderTarget.getDepthBits()
601							<< " S" << renderTarget.getStencilBits()
602							<< TestLog::EndMessage;
603
604	if (renderTarget.getNumSamples() != 0)
605		log << TestLog::Message << renderTarget.getNumSamples() << "x MSAA" << TestLog::EndMessage;
606	else
607		log << TestLog::Message << "No MSAA" << TestLog::EndMessage;
608
609	log << TestLog::EndSection;
610}
611
612vector<Vec2> OperatorPerformanceCase::getWorkloadMedianDataPoints (int progNdx) const
613{
614	const vector<WorkloadRecord>&	records = m_workloadRecords[progNdx];
615	vector<Vec2>					result;
616
617	for (int i = 0; i < (int)records.size(); i++)
618		result.push_back(Vec2((float)records[i].workloadSize, records[i].getMedianTime()));
619
620	return result;
621}
622
623void OperatorPerformanceCase::prepareProgram (int progNdx)
624{
625	DE_ASSERT(progNdx < (int)m_programs.size());
626	DE_ASSERT(m_programData.size() == m_programs.size());
627
628	const glw::Functions&	gl			= m_renderCtx.getFunctions();
629	const ShaderProgram&	program		= *m_programs[progNdx];
630
631	vector<AttribSpec>		attributes	= m_programData[progNdx].attributes;
632
633	attributes.push_back(AttribSpec("a_position",
634									Vec4(-1.0f, -1.0f, 0.0f, 1.0f),
635									Vec4( 1.0f, -1.0f, 0.0f, 1.0f),
636									Vec4(-1.0f,  1.0f, 0.0f, 1.0f),
637									Vec4( 1.0f,  1.0f, 0.0f, 1.0f)));
638
639	DE_ASSERT(program.isOk());
640
641	// Generate vertices.
642	if (!m_attribBuffers.empty())
643		gl.deleteBuffers((glw::GLsizei)m_attribBuffers.size(), &m_attribBuffers[0]);
644	m_attribBuffers.resize(attributes.size(), 0);
645	gl.genBuffers((glw::GLsizei)m_attribBuffers.size(), &m_attribBuffers[0]);
646	GLU_EXPECT_NO_ERROR(gl.getError(), "glGenBuffers()");
647
648	for (int attribNdx = 0; attribNdx < (int)attributes.size(); attribNdx++)
649	{
650		std::vector<float> vertices;
651		generateVertices(vertices, m_gridSizeX, m_gridSizeY, attributes[attribNdx]);
652
653		gl.bindBuffer(GL_ARRAY_BUFFER, m_attribBuffers[attribNdx]);
654		gl.bufferData(GL_ARRAY_BUFFER, (glw::GLsizeiptr)(vertices.size()*sizeof(float)), &vertices[0], GL_STATIC_DRAW);
655		GLU_EXPECT_NO_ERROR(gl.getError(), "Upload buffer data");
656	}
657
658	// Setup attribute bindings.
659	for (int attribNdx = 0; attribNdx < (int)attributes.size(); attribNdx++)
660	{
661		int location = gl.getAttribLocation(program.getProgram(), attributes[attribNdx].name.c_str());
662
663		if (location >= 0)
664		{
665			gl.enableVertexAttribArray(location);
666			gl.bindBuffer(GL_ARRAY_BUFFER, m_attribBuffers[attribNdx]);
667			gl.vertexAttribPointer(location, 4, GL_FLOAT, GL_FALSE, 0, DE_NULL);
668		}
669	}
670	GLU_EXPECT_NO_ERROR(gl.getError(), "Setup vertex input state");
671
672	gl.useProgram(program.getProgram());
673	setGeneralUniforms(program.getProgram());
674	gl.viewport(0, 0, m_viewportWidth, m_viewportHeight);
675}
676
677void OperatorPerformanceCase::prepareWorkload (int progNdx, int workload)
678{
679	setWorkloadSizeUniform(m_programs[progNdx]->getProgram(), workload);
680	render(m_calibrator.getCallCount());
681}
682
683void OperatorPerformanceCase::prepareNextRound (void)
684{
685	DE_ASSERT(m_state == STATE_CALIBRATING			||
686			  m_state == STATE_FIND_HIGH_WORKLOAD	||
687			  m_state == STATE_MEASURING);
688
689	TestLog& log = m_testCtx.getLog();
690
691	if (m_state == STATE_CALIBRATING && m_calibrator.getState() == TheilSenCalibrator::STATE_FINISHED)
692	{
693		m_measureProgramNdx = 0;
694		m_state = STATE_FIND_HIGH_WORKLOAD;
695	}
696
697	if (m_state == STATE_CALIBRATING)
698		prepareWorkload(0, 1);
699	else if (m_state == STATE_FIND_HIGH_WORKLOAD)
700	{
701		vector<WorkloadRecord>& records = m_workloadRecordsFindHigh[m_measureProgramNdx];
702
703		if (records.empty() || records.back().getMedianTime() < 2.0f*records[0].getMedianTime())
704		{
705			int workloadSize;
706
707			if (records.empty())
708				workloadSize = 1;
709			else
710			{
711				workloadSize = records.back().workloadSize*2;
712
713				if (workloadSize > MAX_WORKLOAD_SIZE)
714				{
715					log << TestLog::Message << "Even workload size " << records.back().workloadSize
716											<< " doesn't give high enough frame time for program " << m_measureProgramNdx
717											<< ". Can't get sensible result." << TestLog::EndMessage;
718					MEASUREMENT_FAIL();
719				}
720			}
721
722			records.push_back(WorkloadRecord(workloadSize));
723			prepareWorkload(0, workloadSize);
724			m_workloadMeasurementNdx = 0;
725		}
726		else
727		{
728			m_highWorkloadSizes[m_measureProgramNdx] = records.back().workloadSize;
729			m_measureProgramNdx++;
730
731			if (m_measureProgramNdx >= (int)m_programs.size())
732			{
733				m_state = STATE_MEASURING;
734				m_workloadNdx = -1;
735				m_measureProgramNdx = 0;
736			}
737
738			prepareProgram(m_measureProgramNdx);
739			prepareNextRound();
740		}
741	}
742	else
743	{
744		m_workloadNdx++;
745
746		if (m_workloadNdx < m_numWorkloads)
747		{
748			DE_ASSERT(m_numWorkloads > 1);
749			const int highWorkload	= m_highWorkloadSizes[m_measureProgramNdx];
750			const int workload		= highWorkload > m_numWorkloads ?
751										1 + m_workloadNdx*(highWorkload-1)/(m_numWorkloads-1) :
752										1 + m_workloadNdx;
753
754			prepareWorkload(m_measureProgramNdx, workload);
755
756			m_workloadMeasurementNdx = 0;
757
758			m_workloadRecords[m_measureProgramNdx].push_back(WorkloadRecord(workload));
759		}
760		else
761		{
762			m_measureProgramNdx++;
763
764			if (m_measureProgramNdx < (int)m_programs.size())
765			{
766				m_workloadNdx = -1;
767				m_workloadMeasurementNdx = 0;
768				prepareProgram(m_measureProgramNdx);
769				prepareNextRound();
770			}
771			else
772				m_state = STATE_REPORTING;
773		}
774	}
775}
776
/*--------------------------------------------------------------------*//*!
 * \brief Set up GL state and programs, and start calibration.
 *
 * Checks the grid and viewport sizes, logs the render target, enables
 * additive blending, builds and logs every program returned by
 * generateProgramData() (failing the test if one doesn't compile),
 * sizes the per-program record containers, resets the calibrator and
 * enters STATE_CALIBRATING with program 0 prepared.
 *//*--------------------------------------------------------------------*/
void OperatorPerformanceCase::init (void)
{
	TestLog&				log		= m_testCtx.getLog();
	const glw::Functions&	gl		= m_renderCtx.getFunctions();

	// Validate that we have sane grid and viewport setup.
	DE_ASSERT(de::inBounds(m_gridSizeX, 1, 256) && de::inBounds(m_gridSizeY, 1, 256));
	TCU_CHECK(de::inRange(m_viewportWidth,	1, m_renderCtx.getRenderTarget().getWidth()) &&
			  de::inRange(m_viewportHeight,	1, m_renderCtx.getRenderTarget().getHeight()));

	logRenderTargetInfo(log, m_renderCtx.getRenderTarget());

	log << TestLog::Message << "Using additive blending." << TestLog::EndMessage;
	gl.enable(GL_BLEND);
	gl.blendEquation(GL_FUNC_ADD);
	gl.blendFunc(GL_ONE, GL_ONE);

	// Generate programs.
	DE_ASSERT(m_programs.empty());
	m_programData = generateProgramData();
	DE_ASSERT(!m_programData.empty());

	for (int progNdx = 0; progNdx < (int)m_programData.size(); progNdx++)
	{
		const string& vert = m_programData[progNdx].vertShaderSource;
		const string& frag = m_programData[progNdx].fragShaderSource;

		m_programs.push_back(SharedPtr<ShaderProgram>(new ShaderProgram(m_renderCtx, glu::makeVtxFragSources(vert, frag))));

		// Log the failing program before bailing out, since the "log all programs" step below won't be reached.
		if (!m_programs.back()->isOk())
		{
			log << *m_programs.back();
			TCU_FAIL("Compile failed");
		}
	}

	// Log all programs.
	for (int progNdx = 0; progNdx < (int)m_programs.size(); progNdx++)
		log << TestLog::Section("Program" + de::toString(progNdx), "Program " + de::toString(progNdx))
				<< TestLog::Message << m_programData[progNdx].description << TestLog::EndMessage
				<< *m_programs[progNdx]
			<< TestLog::EndSection;

	// One entry per program in each per-program container.
	m_highWorkloadSizes.resize(m_programData.size());
	m_workloadRecordsFindHigh.resize(m_programData.size());
	m_workloadRecords.resize(m_programData.size());

	m_calibrator.clear(CalibratorParameters(m_initialCalibrationStorage->initialNumCalls, 10 /* calibrate iteration frames */, 2000.0f /* calibrate iteration shortcut threshold (ms) */, 16 /* max calibrate iterations */,
											1000.0f/30.0f /* frame time (ms) */, 1000.0f/60.0f /* frame time cap (ms) */, 1000.0f /* target measure duration (ms) */));
	m_state = STATE_CALIBRATING;

	// Calibration runs on the first program with workload size 1.
	prepareProgram(0);
	prepareNextRound();
}
831
832void OperatorPerformanceCase::deinit (void)
833{
834	if (!m_attribBuffers.empty())
835	{
836		m_renderCtx.getFunctions().deleteBuffers((glw::GLsizei)m_attribBuffers.size(), &m_attribBuffers[0]);
837		m_attribBuffers.clear();
838	}
839
840	m_programs.clear();
841}
842
843void OperatorPerformanceCase::render (int numDrawCalls)
844{
845	const glw::Functions&	gl				= m_renderCtx.getFunctions();
846	const int				numVertices		= getNumVertices(m_gridSizeX, m_gridSizeY);
847
848	for (int callNdx = 0; callNdx < numDrawCalls; callNdx++)
849		gl.drawArrays(GL_TRIANGLES, 0, numVertices);
850
851	glu::readPixels(m_renderCtx, 0, 0, tcu::Surface(1, 1).getAccess()); // \note Serves as a more reliable replacement for glFinish().
852}
853
854deUint64 OperatorPerformanceCase::renderAndMeasure (int numDrawCalls)
855{
856	const deUint64 startTime = deGetMicroseconds();
857	render(numDrawCalls);
858	return deGetMicroseconds() - startTime;
859}
860
861void OperatorPerformanceCase::adjustAndLogGridAndViewport (void)
862{
863	TestLog& log = m_testCtx.getLog();
864
865	// If call count is just 1, and the target frame time still wasn't reached, reduce grid or viewport size.
866	if (m_calibrator.getCallCount() == 1)
867	{
868		const gls::MeasureState&	calibratorMeasure	= m_calibrator.getMeasureState();
869		const float					drawCallTime		= (float)calibratorMeasure.getTotalTime() / (float)calibratorMeasure.frameTimes.size();
870		const float					targetDrawCallTime	= m_calibrator.getParameters().targetFrameTimeUs;
871		const float					targetRatio			= targetDrawCallTime / drawCallTime;
872
873		if (targetRatio < 0.95f)
874		{
875			// Reduce grid or viewport size assuming draw call time scales proportionally.
876			if (m_caseType == CASETYPE_VERTEX)
877			{
878				const float targetRatioSqrt = deFloatSqrt(targetRatio);
879				m_gridSizeX = (int)(targetRatioSqrt * (float)m_gridSizeX);
880				m_gridSizeY = (int)(targetRatioSqrt * (float)m_gridSizeY);
881				TCU_CHECK_MSG(m_gridSizeX >= 1 && m_gridSizeY >= 1, "Can't decrease grid size enough to achieve low-enough draw times");
882				log << TestLog::Message << "Note: triangle grid size reduced from original; it's now smaller than during calibration." << TestLog::EndMessage;
883			}
884			else
885			{
886				const float targetRatioSqrt = deFloatSqrt(targetRatio);
887				m_viewportWidth  = (int)(targetRatioSqrt * (float)m_viewportWidth);
888				m_viewportHeight = (int)(targetRatioSqrt * (float)m_viewportHeight);
889				TCU_CHECK_MSG(m_viewportWidth >= 1 && m_viewportHeight >= 1, "Can't decrease viewport size enough to achieve low-enough draw times");
890				log << TestLog::Message << "Note: viewport size reduced from original; it's now smaller than during calibration." << TestLog::EndMessage;
891			}
892		}
893	}
894
895	prepareProgram(0);
896
897	// Log grid and viewport sizes.
898	log << TestLog::Message << "Grid size: " << m_gridSizeX << "x" << m_gridSizeY << TestLog::EndMessage;
899	log << TestLog::Message << "Viewport: " << m_viewportWidth << "x" << m_viewportHeight << TestLog::EndMessage;
900}
901
902OperatorPerformanceCase::IterateResult OperatorPerformanceCase::iterate (void)
903{
904	const TheilSenCalibrator::State calibratorState = m_calibrator.getState();
905
906	if (calibratorState != TheilSenCalibrator::STATE_FINISHED)
907	{
908		if (calibratorState == TheilSenCalibrator::STATE_RECOMPUTE_PARAMS)
909			m_calibrator.recomputeParameters();
910		else if (calibratorState == TheilSenCalibrator::STATE_MEASURE)
911			m_calibrator.recordIteration(renderAndMeasure(m_calibrator.getCallCount()));
912		else
913			DE_ASSERT(false);
914
915		if (m_calibrator.getState() == TheilSenCalibrator::STATE_FINISHED)
916		{
917			logCalibrationInfo(m_testCtx.getLog(), m_calibrator);
918			adjustAndLogGridAndViewport();
919			prepareNextRound();
920			m_initialCalibrationStorage->initialNumCalls = m_calibrator.getCallCount();
921		}
922	}
923	else if (m_state == STATE_FIND_HIGH_WORKLOAD || m_state == STATE_MEASURING)
924	{
925		if (m_workloadMeasurementNdx < m_numMeasurementsPerWorkload)
926		{
927			vector<WorkloadRecord>& records = m_state == STATE_FIND_HIGH_WORKLOAD ? m_workloadRecordsFindHigh[m_measureProgramNdx] : m_workloadRecords[m_measureProgramNdx];
928			records.back().addFrameTime((float)renderAndMeasure(m_calibrator.getCallCount()));
929			m_workloadMeasurementNdx++;
930		}
931		else
932			prepareNextRound();
933	}
934	else
935	{
936		DE_ASSERT(m_state == STATE_REPORTING);
937
938		TestLog&	log				= m_testCtx.getLog();
939		const int	drawCallCount	= m_calibrator.getCallCount();
940
941		{
942			// Compute per-program estimators for measurements.
943			vector<SegmentedEstimator> estimators;
944			for (int progNdx = 0; progNdx < (int)m_programs.size(); progNdx++)
945				estimators.push_back(computeSegmentedEstimator(getWorkloadMedianDataPoints(progNdx)));
946
947			// Log measurements and their estimators for all programs.
948			for (int progNdx = 0; progNdx < (int)m_programs.size(); progNdx++)
949			{
950				const SegmentedEstimator&	estimator	= estimators[progNdx];
951				const string				progNdxStr	= de::toString(progNdx);
952				vector<WorkloadRecord>		records		= m_workloadRecords[progNdx];
953				std::sort(records.begin(), records.end());
954
955				{
956					const tcu::ScopedLogSection section(log,
957														"Program" + progNdxStr + "Measurements",
958														"Measurements for program " + progNdxStr);
959
960					// Sample list of individual frame times.
961
962					log << TestLog::SampleList("Program" + progNdxStr + "IndividualFrameTimes", "Individual frame times")
963						<< TestLog::SampleInfo << TestLog::ValueInfo("Workload",	"Workload",		"",		QP_SAMPLE_VALUE_TAG_PREDICTOR)
964											   << TestLog::ValueInfo("FrameTime",	"Frame time",	"us",	QP_SAMPLE_VALUE_TAG_RESPONSE)
965						<< TestLog::EndSampleInfo;
966
967					for (int i = 0; i < (int)records.size(); i++)
968						for (int j = 0; j < (int)records[i].frameTimes.size(); j++)
969							log << TestLog::Sample << records[i].workloadSize << records[i].frameTimes[j] << TestLog::EndSample;
970
971					log << TestLog::EndSampleList;
972
973					// Sample list of median frame times.
974
975					log << TestLog::SampleList("Program" + progNdxStr + "MedianFrameTimes", "Median frame times")
976						<< TestLog::SampleInfo << TestLog::ValueInfo("Workload",		"Workload",				"",		QP_SAMPLE_VALUE_TAG_PREDICTOR)
977											   << TestLog::ValueInfo("MedianFrameTime",	"Median frame time",	"us",	QP_SAMPLE_VALUE_TAG_RESPONSE)
978						<< TestLog::EndSampleInfo;
979
980					for (int i = 0; i < (int)records.size(); i++)
981						log << TestLog::Sample << records[i].workloadSize << records[i].getMedianTime() << TestLog::EndSample;
982
983					log << TestLog::EndSampleList;
984
985					log << TestLog::Float("Program" + progNdxStr + "WorkloadCostEstimate", "Workload cost estimate", "us / workload", QP_KEY_TAG_TIME, estimator.right.coefficient);
986
987					if (estimator.pivotX > -std::numeric_limits<float>::infinity())
988						log << TestLog::Message << "Note: the data points with x coordinate greater than or equal to " << estimator.pivotX
989												<< " seem to form a rising line, and the rest of data points seem to form a near-horizontal line" << TestLog::EndMessage
990							<< TestLog::Message << "Note: the left line is estimated to be " << lineParamsString(estimator.left)
991												<< " and the right line " << lineParamsString(estimator.right) << TestLog::EndMessage;
992					else
993						log << TestLog::Message << "Note: the data seem to form a single line: " << lineParamsString(estimator.right) << TestLog::EndMessage;
994				}
995			}
996
997			for (int progNdx = 0; progNdx < (int)m_programs.size(); progNdx++)
998			{
999				if (estimators[progNdx].right.coefficient <= 0.0f)
1000				{
1001					log << TestLog::Message << "Slope of measurements for program " << progNdx << " isn't positive. Can't get sensible result." << TestLog::EndMessage;
1002					MEASUREMENT_FAIL();
1003				}
1004			}
1005
1006			// \note For each estimator, .right.coefficient is the increase in draw time (in microseconds) when
1007			// incrementing shader workload size by 1, when D draw calls are done, with a vertex/fragment count
1008			// of R.
1009			//
1010			// The measurements of any single program can't tell us the final result (time of single operation),
1011			// so we use computeSingleOperationTime to compute it from multiple programs' measurements in a
1012			// subclass-defined manner.
1013			//
1014			// After that, microseconds per operation can be calculated as singleOperationTime / (D * R).
1015
1016			{
1017				vector<float>	perProgramSlopes;
1018				for (int i = 0; i < (int)m_programs.size(); i++)
1019					perProgramSlopes.push_back(estimators[i].right.coefficient);
1020
1021				logSingleOperationCalculationInfo();
1022
1023				const float		maxSlope				= *std::max_element(perProgramSlopes.begin(), perProgramSlopes.end());
1024				const float		usecsPerFramePerOp		= computeSingleOperationTime(perProgramSlopes);
1025				const int		vertexOrFragmentCount	= m_caseType == CASETYPE_VERTEX ?
1026															getNumVertices(m_gridSizeX, m_gridSizeY) :
1027															m_viewportWidth*m_viewportHeight;
1028				const double	usecsPerDrawCallPerOp	= usecsPerFramePerOp / (double)drawCallCount;
1029				const double	usecsPerSingleOp		= usecsPerDrawCallPerOp / (double)vertexOrFragmentCount;
1030				const double	megaOpsPerSecond		= (double)(drawCallCount*vertexOrFragmentCount) / usecsPerFramePerOp;
1031				const int		numFreeOps				= de::max(0, (int)deFloatFloor(intersectionX(estimators[0].left,
1032																									 LineParameters(estimators[0].right.offset,
1033																													usecsPerFramePerOp))));
1034
1035				log << TestLog::Integer("VertexOrFragmentCount",
1036										"R = " + string(m_caseType == CASETYPE_VERTEX ? "Vertex" : "Fragment") + " count",
1037										"", QP_KEY_TAG_NONE, vertexOrFragmentCount)
1038
1039					<< TestLog::Integer("DrawCallsPerFrame", "D = Draw calls per frame", "", QP_KEY_TAG_NONE, drawCallCount)
1040
1041					<< TestLog::Integer("VerticesOrFragmentsPerFrame",
1042										"R*D = " + string(m_caseType == CASETYPE_VERTEX ? "Vertices" : "Fragments") + " per frame",
1043										"", QP_KEY_TAG_NONE, vertexOrFragmentCount*drawCallCount)
1044
1045					<< TestLog::Float("TimePerFramePerOp",
1046									  "Estimated cost of R*D " + string(m_caseType == CASETYPE_VERTEX ? "vertices" : "fragments")
1047									  + " (i.e. one frame) with one shader operation",
1048									  "us", QP_KEY_TAG_TIME, (float)usecsPerFramePerOp)
1049
1050					<< TestLog::Float("TimePerDrawcallPerOp",
1051									  "Estimated cost of one draw call with one shader operation",
1052									  "us", QP_KEY_TAG_TIME, (float)usecsPerDrawCallPerOp)
1053
1054					<< TestLog::Float("TimePerSingleOp",
1055									  "Estimated cost of a single shader operation",
1056									  "us", QP_KEY_TAG_TIME, (float)usecsPerSingleOp);
1057
1058				// \note Sometimes, when the operation is free or very cheap, it can happen that the shader with the operation runs,
1059				//		 for some reason, a bit faster than the shader without the operation, and thus we get a negative result. The
1060				//		 following threshold values for accepting a negative or almost-zero result are rather quick and dirty.
1061				if (usecsPerFramePerOp <= -0.1f*maxSlope)
1062				{
1063					log << TestLog::Message << "Got strongly negative result." << TestLog::EndMessage;
1064					MEASUREMENT_FAIL();
1065				}
1066				else if (usecsPerFramePerOp <= 0.001*maxSlope)
1067				{
1068					log << TestLog::Message << "Cost of operation seems to be approximately zero." << TestLog::EndMessage;
1069					m_testCtx.setTestResult(QP_TEST_RESULT_PASS, "Pass");
1070				}
1071				else
1072				{
1073					log << TestLog::Float("OpsPerSecond",
1074										  "Operations per second",
1075										  "Million/s", QP_KEY_TAG_PERFORMANCE, (float)megaOpsPerSecond)
1076
1077						<< TestLog::Integer("NumFreeOps",
1078											"Estimated number of \"free\" operations",
1079											"", QP_KEY_TAG_PERFORMANCE, numFreeOps);
1080
1081					m_testCtx.setTestResult(QP_TEST_RESULT_PASS, de::floatToString((float)megaOpsPerSecond, 2).c_str());
1082				}
1083
1084				m_state = STATE_FINISHED;
1085			}
1086		}
1087
1088		return STOP;
1089	}
1090
1091	return CONTINUE;
1092}
1093
1094// Binary operator case.
1095class BinaryOpCase : public OperatorPerformanceCase
1096{
1097public:
1098						BinaryOpCase				(Context& context, const char* name, const char* description, const char* op,
1099													 glu::DataType type, glu::Precision precision, bool useSwizzle, bool isVertex, const InitialCalibrationStorage& initialCalibration);
1100
1101protected:
1102	vector<ProgramContext>	generateProgramData					(void) const;
1103	void					setGeneralUniforms					(deUint32 program) const;
1104	void					setWorkloadSizeUniform				(deUint32 program, int numOperations) const;
1105	float					computeSingleOperationTime			(const vector<float>& perProgramOperationCosts) const;
1106	void					logSingleOperationCalculationInfo	(void) const;
1107
1108private:
1109	enum ProgramID
1110	{
1111		// \note 0-based sequential numbering is relevant, because these are also used as vector indices.
1112		// \note The first program should be the heaviest, because OperatorPerformanceCase uses it to reduce grid/viewport size when going too slow.
1113		PROGRAM_WITH_BIGGER_LOOP = 0,
1114		PROGRAM_WITH_SMALLER_LOOP,
1115
1116		PROGRAM_LAST
1117	};
1118
1119	ProgramContext			generateSingleProgramData		(ProgramID) const;
1120
1121	const string			m_op;
1122	const glu::DataType		m_type;
1123	const glu::Precision	m_precision;
1124	const bool				m_useSwizzle;
1125};
1126
1127BinaryOpCase::BinaryOpCase (Context& context, const char* name, const char* description, const char* op,
1128							glu::DataType type, glu::Precision precision, bool useSwizzle, bool isVertex, const InitialCalibrationStorage& initialCalibration)
1129	: OperatorPerformanceCase	(context.getTestContext(), context.getRenderContext(), name, description,
1130								 isVertex ? CASETYPE_VERTEX : CASETYPE_FRAGMENT, NUM_WORKLOADS, initialCalibration)
1131	, m_op						(op)
1132	, m_type					(type)
1133	, m_precision				(precision)
1134	, m_useSwizzle				(useSwizzle)
1135{
1136}
1137
1138BinaryOpCase::ProgramContext BinaryOpCase::generateSingleProgramData (ProgramID programID) const
1139{
1140	DE_ASSERT(glu::isDataTypeFloatOrVec(m_type) || glu::isDataTypeIntOrIVec(m_type));
1141
1142	const bool			isVertexCase	= m_caseType == CASETYPE_VERTEX;
1143	const char* const	precision		= glu::getPrecisionName(m_precision);
1144	const char* const	inputPrecision	= glu::isDataTypeIntOrIVec(m_type) && m_precision == glu::PRECISION_LOWP ? "mediump" : precision;
1145	const char* const	typeName		= getDataTypeName(m_type);
1146
1147	std::ostringstream	vtx;
1148	std::ostringstream	frag;
1149	std::ostringstream&	op				= isVertexCase ? vtx : frag;
1150
1151	// Attributes.
1152	vtx << "attribute highp vec4 a_position;\n";
1153	for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS+1; i++)
1154		vtx << "attribute " << inputPrecision << " vec4 a_in" << i << ";\n";
1155
1156	if (isVertexCase)
1157	{
1158		vtx << "varying mediump vec4 v_color;\n";
1159		frag << "varying mediump vec4 v_color;\n";
1160	}
1161	else
1162	{
1163		for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS+1; i++)
1164		{
1165			vtx << "varying " << inputPrecision << " vec4 v_in" << i << ";\n";
1166			frag << "varying " << inputPrecision << " vec4 v_in" << i << ";\n";
1167		}
1168	}
1169
1170	op << "uniform mediump int u_numLoopIterations;\n";
1171	if (isVertexCase)
1172		op << "uniform mediump float u_zero;\n";
1173
1174	vtx << "\n";
1175	vtx << "void main()\n";
1176	vtx << "{\n";
1177
1178	if (!isVertexCase)
1179		vtx << "\tgl_Position = a_position;\n";
1180
1181	frag << "\n";
1182	frag << "void main()\n";
1183	frag << "{\n";
1184
1185	// Expression inputs.
1186	const char* const prefix = isVertexCase ? "a_" : "v_";
1187	for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS+1; i++)
1188	{
1189		const int	inSize		= getDataTypeScalarSize(m_type);
1190		const bool	isInt		= de::inRange<int>(m_type, TYPE_INT, TYPE_INT_VEC4);
1191		const bool	cast		= isInt || (!m_useSwizzle && m_type != TYPE_FLOAT_VEC4);
1192
1193		op << "\t" << precision << " " << typeName << " in" << i << " = ";
1194
1195		if (cast)
1196			op << typeName << "(";
1197
1198		op << prefix << "in" << i;
1199
1200		if (m_useSwizzle)
1201			op << "." << s_swizzles[i % DE_LENGTH_OF_ARRAY(s_swizzles)][inSize-1];
1202
1203		if (cast)
1204			op << ")";
1205
1206		op << ";\n";
1207	}
1208
1209	// Operation accumulation variables.
1210	for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS; i++)
1211	{
1212		op << "\t" << precision << " " << typeName << " acc" << i << "a" << " = in" << i+0 << ";\n";
1213		op << "\t" << precision << " " << typeName << " acc" << i << "b" << " = in" << i+1 << ";\n";
1214	}
1215
1216	// Loop, with expressions in it.
1217	op << "\tfor (int i = 0; i < u_numLoopIterations; i++)\n";
1218	op << "\t{\n";
1219	{
1220		const int unrollAmount = programID == PROGRAM_WITH_SMALLER_LOOP ? BINARY_OPERATOR_CASE_SMALL_PROGRAM_UNROLL_AMOUNT : BINARY_OPERATOR_CASE_BIG_PROGRAM_UNROLL_AMOUNT;
1221		for (int unrollNdx = 0; unrollNdx < unrollAmount; unrollNdx++)
1222		{
1223			for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS; i++)
1224			{
1225				if (i > 0 || unrollNdx > 0)
1226					op << "\n";
1227				op << "\t\tacc" << i << "a = acc" << i << "b " << m_op << " acc" << i << "a" << ";\n";
1228				op << "\t\tacc" << i << "b = acc" << i << "a " << m_op << " acc" << i << "b" << ";\n";
1229			}
1230		}
1231	}
1232	op << "\t}\n";
1233	op << "\n";
1234
1235	// Result variable (sum of accumulation variables).
1236	op << "\t" << precision << " " << typeName << " res =";
1237	for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS; i++)
1238		op << (i > 0 ? " "+m_op : "") << " acc" << i << "b";
1239	op << ";\n";
1240
1241	// Convert to color.
1242	op << "\tmediump vec4 color = ";
1243	if (m_type == TYPE_FLOAT_VEC4)
1244		op << "res";
1245	else
1246	{
1247		int size = getDataTypeScalarSize(m_type);
1248		op << "vec4(res";
1249
1250		for (int i = size; i < 4; i++)
1251			op << ", " << (i == 3 ? "1.0" : "0.0");
1252
1253		op << ")";
1254	}
1255	op << ";\n";
1256	op << "\t" << (isVertexCase ? "v_color" : "gl_FragColor") << " = color;\n";
1257
1258	if (isVertexCase)
1259	{
1260		vtx << "	gl_Position = a_position + u_zero*color;\n";
1261		frag << "	gl_FragColor = v_color;\n";
1262	}
1263	else
1264	{
1265		for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS+1; i++)
1266			vtx << "	v_in" << i << " = a_in" << i << ";\n";
1267	}
1268
1269	vtx << "}\n";
1270	frag << "}\n";
1271
1272	{
1273		vector<AttribSpec> attributes;
1274		for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS+1; i++)
1275			attributes.push_back(AttribSpec(("a_in" + de::toString(i)).c_str(),
1276											Vec4(2.0f, 2.0f, 2.0f, 1.0f).swizzle((i+0)%4, (i+1)%4, (i+2)%4, (i+3)%4),
1277											Vec4(1.0f, 2.0f, 1.0f, 2.0f).swizzle((i+0)%4, (i+1)%4, (i+2)%4, (i+3)%4),
1278											Vec4(2.0f, 1.0f, 2.0f, 2.0f).swizzle((i+0)%4, (i+1)%4, (i+2)%4, (i+3)%4),
1279											Vec4(1.0f, 1.0f, 2.0f, 1.0f).swizzle((i+0)%4, (i+1)%4, (i+2)%4, (i+3)%4)));
1280
1281		{
1282			string description = "This is the program with the ";
1283
1284			description += programID == PROGRAM_WITH_SMALLER_LOOP	? "smaller"
1285						 : programID == PROGRAM_WITH_BIGGER_LOOP	? "bigger"
1286						 : DE_NULL;
1287
1288			description += " loop.\n"
1289						   "Note: workload size for this program means the number of loop iterations.";
1290
1291			return ProgramContext(vtx.str(), frag.str(), attributes, description);
1292		}
1293	}
1294}
1295
1296vector<BinaryOpCase::ProgramContext> BinaryOpCase::generateProgramData (void) const
1297{
1298	vector<ProgramContext> progData;
1299	for (int i = 0; i < PROGRAM_LAST; i++)
1300		progData.push_back(generateSingleProgramData((ProgramID)i));
1301	return progData;
1302}
1303
1304void BinaryOpCase::setGeneralUniforms (deUint32 program) const
1305{
1306	const glw::Functions& gl = m_renderCtx.getFunctions();
1307	gl.uniform1f(gl.getUniformLocation(program, "u_zero"), 0.0f);
1308}
1309
1310void BinaryOpCase::setWorkloadSizeUniform (deUint32 program, int numLoopIterations) const
1311{
1312	const glw::Functions& gl = m_renderCtx.getFunctions();
1313	gl.uniform1i(gl.getUniformLocation(program, "u_numLoopIterations"), numLoopIterations);
1314}
1315
1316float BinaryOpCase::computeSingleOperationTime (const vector<float>& perProgramOperationCosts) const
1317{
1318	DE_ASSERT(perProgramOperationCosts.size() == PROGRAM_LAST);
1319
1320	const int		baseNumOpsInsideLoop				= 2 * BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS;
1321	const int		numOpsInsideLoopInSmallProgram		= baseNumOpsInsideLoop * BINARY_OPERATOR_CASE_SMALL_PROGRAM_UNROLL_AMOUNT;
1322	const int		numOpsInsideLoopInBigProgram		= baseNumOpsInsideLoop * BINARY_OPERATOR_CASE_BIG_PROGRAM_UNROLL_AMOUNT;
1323	DE_STATIC_ASSERT(numOpsInsideLoopInBigProgram > numOpsInsideLoopInSmallProgram);
1324	const int		opDiff								= numOpsInsideLoopInBigProgram - numOpsInsideLoopInSmallProgram;
1325	const float		programOperationCostDiff			= perProgramOperationCosts[PROGRAM_WITH_BIGGER_LOOP] - perProgramOperationCosts[PROGRAM_WITH_SMALLER_LOOP];
1326
1327	return programOperationCostDiff / (float)opDiff;
1328}
1329
1330void BinaryOpCase::logSingleOperationCalculationInfo (void) const
1331{
1332	const int			baseNumOpsInsideLoop			= 2 * BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS;
1333	const int			numOpsInsideLoopInSmallProgram	= baseNumOpsInsideLoop * BINARY_OPERATOR_CASE_SMALL_PROGRAM_UNROLL_AMOUNT;
1334	const int			numOpsInsideLoopInBigProgram	= baseNumOpsInsideLoop * BINARY_OPERATOR_CASE_BIG_PROGRAM_UNROLL_AMOUNT;
1335	const int			opDiff							= numOpsInsideLoopInBigProgram - numOpsInsideLoopInSmallProgram;
1336	const char* const	opName							= m_op == "+" ? "addition"
1337														: m_op == "-" ? "subtraction"
1338														: m_op == "*" ? "multiplication"
1339														: m_op == "/" ? "division"
1340														: DE_NULL;
1341	DE_ASSERT(opName != DE_NULL);
1342
1343	m_testCtx.getLog() << TestLog::Message << "Note: the bigger program contains " << opDiff << " more "
1344										   << opName << " operations in one loop iteration than the small program; "
1345										   << "cost of one operation is calculated as (cost_of_bigger_workload - cost_of_smaller_workload) / " << opDiff
1346										   << TestLog::EndMessage;
1347}
1348
1349// Built-in function case.
1350class FunctionCase : public OperatorPerformanceCase
1351{
1352public:
1353	enum
1354	{
1355		MAX_PARAMS = 3
1356	};
1357
1358						FunctionCase			(Context&							context,
1359												 const char*						name,
1360												 const char*						description,
1361												 const char*						func,
1362												 glu::DataType						returnType,
1363												 const glu::DataType				paramTypes[MAX_PARAMS],
1364												 const Vec4&						attribute,
1365												 int								modifyParamNdx, //!< Add a compile-time constant (2.0) to the parameter at this index. This is ignored if negative.
1366												 bool								useNearlyConstantINputs, //!< Function inputs shouldn't be much bigger than 'attribute'.
1367												 glu::Precision						precision,
1368												 bool								isVertex,
1369												 const InitialCalibrationStorage&	initialCalibration);
1370
1371protected:
1372	vector<ProgramContext>	generateProgramData					(void) const;
1373	void					setGeneralUniforms					(deUint32 program) const;
1374	void					setWorkloadSizeUniform				(deUint32 program, int numOperations) const;
1375	float					computeSingleOperationTime			(const vector<float>& perProgramOperationCosts) const;
1376	void					logSingleOperationCalculationInfo	(void) const;
1377
1378private:
1379	enum ProgramID
1380	{
1381		// \note 0-based sequential numbering is relevant, because these are also used as vector indices.
1382		// \note The first program should be the heaviest, because OperatorPerformanceCase uses it to reduce grid/viewport size when going too slow.
1383		PROGRAM_WITH_FUNCTION_CALLS = 0,
1384		PROGRAM_WITHOUT_FUNCTION_CALLS,
1385
1386		PROGRAM_LAST
1387	};
1388
1389	//! Forms a "sum" expression from aExpr and bExpr; for booleans, this is "equal(a,b)", otherwise actual sum.
1390	static string		sumExpr						(const string& aExpr, const string& bExpr, glu::DataType type);
1391	//! Forms an expression used to increment an input value in the shader. If type is boolean, this is just
1392	//! baseExpr; otherwise, baseExpr is modified by multiplication or division by a loop index,
1393	//! to prevent simple compiler optimizations. See m_useNearlyConstantInputs for more explanation.
1394	static string		incrementExpr				(const string& baseExpr, glu::DataType type, bool divide);
1395
1396	ProgramContext		generateSingleProgramData	(ProgramID) const;
1397
1398	const string			m_func;
1399	const glu::DataType		m_returnType;
1400	glu::DataType			m_paramTypes[MAX_PARAMS];
1401	// \note m_modifyParamNdx, if not negative, specifies the index of the parameter to which a
1402	//		 compile-time constant (2.0) is added. This is a quick and dirty way to deal with
1403	//		 functions like clamp or smoothstep that require that a certain parameter is
1404	//		 greater than a certain other parameter.
1405	const int				m_modifyParamNdx;
1406	// \note m_useNearlyConstantInputs determines whether the inputs given to the function
1407	//		 should increase (w.r.t m_attribute) only by very small amounts. This is relevant
1408	//		 for functions like asin, which requires its inputs to be in a specific range.
1409	//		 In practice, this affects whether expressions used to increment the input
1410	//		 variables use division instead of multiplication; normally, multiplication is used,
1411	//		 but it's hard to keep the increments very small that way, and division shouldn't
1412	//		 be the default, since for many functions (probably not asin, luckily), division
1413	//		 is too heavy and dominates time-wise.
1414	const bool				m_useNearlyConstantInputs;
1415	const Vec4				m_attribute;
1416	const glu::Precision	m_precision;
1417};
1418
1419FunctionCase::FunctionCase (Context&							context,
1420							const char*							name,
1421							const char*							description,
1422							const char*							func,
1423							glu::DataType						returnType,
1424							const glu::DataType					paramTypes[MAX_PARAMS],
1425							const Vec4&							attribute,
1426							int									modifyParamNdx,
1427							bool								useNearlyConstantInputs,
1428							glu::Precision						precision,
1429							bool								isVertex,
1430							const InitialCalibrationStorage&	initialCalibration)
1431	: OperatorPerformanceCase	(context.getTestContext(), context.getRenderContext(), name, description,
1432								 isVertex ? CASETYPE_VERTEX : CASETYPE_FRAGMENT, NUM_WORKLOADS, initialCalibration)
1433	, m_func					(func)
1434	, m_returnType				(returnType)
1435	, m_modifyParamNdx			(modifyParamNdx)
1436	, m_useNearlyConstantInputs	(useNearlyConstantInputs)
1437	, m_attribute				(attribute)
1438	, m_precision				(precision)
1439{
1440	for (int i = 0; i < MAX_PARAMS; i++)
1441		m_paramTypes[i] = paramTypes[i];
1442}
1443
1444string FunctionCase::sumExpr (const string& aExpr, const string& bExpr, glu::DataType type)
1445{
1446	if (glu::isDataTypeBoolOrBVec(type))
1447	{
1448		if (type == glu::TYPE_BOOL)
1449			return "(" + aExpr + " == " + bExpr + ")";
1450		else
1451			return "equal(" + aExpr + ", " + bExpr + ")";
1452	}
1453	else
1454		return "(" + aExpr + " + " + bExpr + ")";
1455}
1456
1457string FunctionCase::incrementExpr (const string& baseExpr, glu::DataType type, bool divide)
1458{
1459	const string mulOrDiv = divide ? "/" : "*";
1460
1461	return glu::isDataTypeBoolOrBVec(type)	? baseExpr
1462		 : glu::isDataTypeIntOrIVec(type)	? "(" + baseExpr + mulOrDiv + "(i+1))"
1463		 :									  "(" + baseExpr + mulOrDiv + "float(i+1))";
1464}
1465
1466FunctionCase::ProgramContext FunctionCase::generateSingleProgramData (ProgramID programID) const
1467{
1468	const bool			isVertexCase			= m_caseType == CASETYPE_VERTEX;
1469	const char* const	precision				= glu::getPrecisionName(m_precision);
1470	const char* const	returnTypeName			= getDataTypeName(m_returnType);
1471	const string		returnPrecisionMaybe	= glu::isDataTypeBoolOrBVec(m_returnType) ? "" : string() + precision + " ";
1472	const char*			inputPrecision			= DE_NULL;
1473	const bool			isMatrixReturn			= isDataTypeMatrix(m_returnType);
1474	int					numParams				= 0;
1475	const char*			paramTypeNames[MAX_PARAMS];
1476	string				paramPrecisionsMaybe[MAX_PARAMS];
1477
1478	for (int i = 0; i < MAX_PARAMS; i++)
1479	{
1480		paramTypeNames[i]			= getDataTypeName(m_paramTypes[i]);
1481		paramPrecisionsMaybe[i]		= glu::isDataTypeBoolOrBVec(m_paramTypes[i]) ? "" : string() + precision + " ";
1482
1483		if (inputPrecision == DE_NULL && isDataTypeIntOrIVec(m_paramTypes[i]) && m_precision == glu::PRECISION_LOWP)
1484			inputPrecision = "mediump";
1485
1486		if (m_paramTypes[i] != TYPE_INVALID)
1487			numParams = i+1;
1488	}
1489
1490	DE_ASSERT(numParams > 0);
1491
1492	if (inputPrecision == DE_NULL)
1493		inputPrecision = precision;
1494
1495	int						numAttributes	= FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS + numParams - 1;
1496	std::ostringstream		vtx;
1497	std::ostringstream		frag;
1498	std::ostringstream&		op				= isVertexCase ? vtx : frag;
1499
1500	// Attributes.
1501	vtx << "attribute highp vec4 a_position;\n";
1502	for (int i = 0; i < numAttributes; i++)
1503		vtx << "attribute " << inputPrecision << " vec4 a_in" << i << ";\n";
1504
1505	if (isVertexCase)
1506	{
1507		vtx << "varying mediump vec4 v_color;\n";
1508		frag << "varying mediump vec4 v_color;\n";
1509	}
1510	else
1511	{
1512		for (int i = 0; i < numAttributes; i++)
1513		{
1514			vtx << "varying " << inputPrecision << " vec4 v_in" << i << ";\n";
1515			frag << "varying " << inputPrecision << " vec4 v_in" << i << ";\n";
1516		}
1517	}
1518
1519	op << "uniform mediump int u_numLoopIterations;\n";
1520	if (isVertexCase)
1521		op << "uniform mediump float u_zero;\n";
1522
1523	for (int paramNdx = 0; paramNdx < numParams; paramNdx++)
1524		op << "uniform " << paramPrecisionsMaybe[paramNdx] << paramTypeNames[paramNdx] << " u_inc" << (char)('A'+paramNdx) << ";\n";
1525
1526	vtx << "\n";
1527	vtx << "void main()\n";
1528	vtx << "{\n";
1529
1530	if (!isVertexCase)
1531		vtx << "\tgl_Position = a_position;\n";
1532
1533	frag << "\n";
1534	frag << "void main()\n";
1535	frag << "{\n";
1536
1537	// Function call input and return value accumulation variables.
1538	{
1539		const char* const inPrefix = isVertexCase ? "a_" : "v_";
1540
1541		for (int calcNdx = 0; calcNdx < FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS; calcNdx++)
1542		{
1543			for (int paramNdx = 0; paramNdx < numParams; paramNdx++)
1544			{
1545				const glu::DataType		paramType	= m_paramTypes[paramNdx];
1546				const bool				mustCast	= paramType != glu::TYPE_FLOAT_VEC4;
1547
1548				op << "\t" << paramPrecisionsMaybe[paramNdx] << paramTypeNames[paramNdx] << " in" << calcNdx << (char)('a'+paramNdx) << " = ";
1549
1550				if (mustCast)
1551					op << paramTypeNames[paramNdx] << "(";
1552
1553				if (glu::isDataTypeMatrix(paramType))
1554				{
1555					static const char* const	swizzles[3]		= { "x", "xy", "xyz" };
1556					const int					numRows			= glu::getDataTypeMatrixNumRows(paramType);
1557					const int					numCols			= glu::getDataTypeMatrixNumColumns(paramType);
1558					const string				swizzle			= numRows < 4 ? string() + "." + swizzles[numRows-1] : "";
1559
1560					for (int i = 0; i < numCols; i++)
1561						op << (i > 0 ? ", " : "") << inPrefix << "in" << calcNdx+paramNdx << swizzle;
1562				}
1563				else
1564				{
1565					op << inPrefix << "in" << calcNdx+paramNdx;
1566
1567					if (paramNdx == m_modifyParamNdx)
1568					{
1569						DE_ASSERT(glu::isDataTypeFloatOrVec(paramType));
1570						op << " + 2.0";
1571					}
1572				}
1573
1574				if (mustCast)
1575					op << ")";
1576
1577				op << ";\n";
1578			}
1579
1580			op << "\t" << returnPrecisionMaybe << returnTypeName << " res" << calcNdx << " = " << returnTypeName << "(0);\n";
1581		}
1582	}
1583
1584	// Loop with expressions in it.
1585	op << "\tfor (int i = 0; i < u_numLoopIterations; i++)\n";
1586	op << "\t{\n";
1587	for (int calcNdx = 0; calcNdx < FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS; calcNdx++)
1588	{
1589		if (calcNdx > 0)
1590			op << "\n";
1591
1592		op << "\t\t{\n";
1593
1594		for (int inputNdx = 0; inputNdx < numParams; inputNdx++)
1595		{
1596			const string inputName	= "in" + de::toString(calcNdx) + (char)('a'+inputNdx);
1597			const string incName	= string() + "u_inc" + (char)('A'+inputNdx);
1598			const string incExpr	= incrementExpr(incName, m_paramTypes[inputNdx], m_useNearlyConstantInputs);
1599
1600			op << "\t\t\t" << inputName << " = " << sumExpr(inputName, incExpr, m_paramTypes[inputNdx]) << ";\n";
1601		}
1602
1603		op << "\t\t\t" << returnPrecisionMaybe << returnTypeName << " eval" << calcNdx << " = ";
1604
1605		if (programID == PROGRAM_WITH_FUNCTION_CALLS)
1606		{
1607			op << m_func << "(";
1608
1609			for (int paramNdx = 0; paramNdx < numParams; paramNdx++)
1610			{
1611				if (paramNdx > 0)
1612					op << ", ";
1613
1614				op << "in" << calcNdx << (char)('a'+paramNdx);
1615			}
1616
1617			op << ")";
1618		}
1619		else
1620		{
1621			DE_ASSERT(programID == PROGRAM_WITHOUT_FUNCTION_CALLS);
1622			op << returnTypeName << "(1)";
1623		}
1624
1625		op << ";\n";
1626
1627		{
1628			const string resName	= "res" + de::toString(calcNdx);
1629			const string evalName	= "eval" + de::toString(calcNdx);
1630			const string incExpr	= incrementExpr(evalName, m_returnType, m_useNearlyConstantInputs);
1631
1632			op << "\t\t\tres" << calcNdx << " = " << sumExpr(resName, incExpr, m_returnType) << ";\n";
1633		}
1634
1635		op << "\t\t}\n";
1636	}
1637	op << "\t}\n";
1638	op << "\n";
1639
1640	// Result variables.
1641	for (int inputNdx = 0; inputNdx < numParams; inputNdx++)
1642	{
1643		op << "\t" << paramPrecisionsMaybe[inputNdx] << paramTypeNames[inputNdx] << " sumIn" << (char)('A'+inputNdx) << " = ";
1644		{
1645			string expr = string() + "in0" + (char)('a'+inputNdx);
1646			for (int i = 1; i < FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS; i++)
1647				expr = sumExpr(expr, string() + "in" + de::toString(i) + (char)('a'+inputNdx), m_paramTypes[inputNdx]);
1648			op << expr;
1649		}
1650		op << ";\n";
1651	}
1652
1653	op << "\t" << returnPrecisionMaybe << returnTypeName << " sumRes = ";
1654	{
1655		string expr = "res0";
1656		for (int i = 1; i < FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS; i++)
1657			expr = sumExpr(expr, "res" + de::toString(i), m_returnType);
1658		op << expr;
1659	}
1660	op << ";\n";
1661
1662	{
1663		glu::DataType finalResultDataType = glu::TYPE_LAST;
1664
1665		if (glu::isDataTypeMatrix(m_returnType))
1666		{
1667			finalResultDataType = m_returnType;
1668
1669			op << "\t" << precision << " " << returnTypeName << " finalRes = ";
1670
1671			for (int inputNdx = 0; inputNdx < numParams; inputNdx++)
1672			{
1673				DE_ASSERT(m_paramTypes[inputNdx] == m_returnType);
1674				op << "sumIn" << (char)('A'+inputNdx) << " + ";
1675			}
1676			op << "sumRes;\n";
1677		}
1678		else
1679		{
1680			int numFinalResComponents = glu::getDataTypeScalarSize(m_returnType);
1681			for (int inputNdx = 0; inputNdx < numParams; inputNdx++)
1682				numFinalResComponents = de::max(numFinalResComponents, glu::getDataTypeScalarSize(m_paramTypes[inputNdx]));
1683
1684			finalResultDataType = getDataTypeFloatOrVec(numFinalResComponents);
1685
1686			{
1687				const string finalResType = glu::getDataTypeName(finalResultDataType);
1688				op << "\t" << precision << " " << finalResType << " finalRes = ";
1689				for (int inputNdx = 0; inputNdx < numParams; inputNdx++)
1690					op << finalResType << "(sumIn" << (char)('A'+inputNdx) << ") + ";
1691				op << finalResType << "(sumRes);\n";
1692			}
1693		}
1694
1695		// Convert to color.
1696		op << "\tmediump vec4 color = ";
1697		if (finalResultDataType == TYPE_FLOAT_VEC4)
1698			op << "finalRes";
1699		else
1700		{
1701			int size = isMatrixReturn ? getDataTypeMatrixNumRows(finalResultDataType) : getDataTypeScalarSize(finalResultDataType);
1702
1703			op << "vec4(";
1704
1705			if (isMatrixReturn)
1706			{
1707				for (int i = 0; i < getDataTypeMatrixNumColumns(finalResultDataType); i++)
1708				{
1709					if (i > 0)
1710						op << " + ";
1711					op << "finalRes[" << i << "]";
1712				}
1713			}
1714			else
1715				op << "finalRes";
1716
1717			for (int i = size; i < 4; i++)
1718				op << ", " << (i == 3 ? "1.0" : "0.0");
1719
1720			op << ")";
1721		}
1722		op << ";\n";
1723		op << "\t" << (isVertexCase ? "v_color" : "gl_FragColor") << " = color;\n";
1724
1725		if (isVertexCase)
1726		{
1727			vtx << "	gl_Position = a_position + u_zero*color;\n";
1728			frag << "	gl_FragColor = v_color;\n";
1729		}
1730		else
1731		{
1732			for (int i = 0; i < numAttributes; i++)
1733				vtx << "	v_in" << i << " = a_in" << i << ";\n";
1734		}
1735
1736		vtx << "}\n";
1737		frag << "}\n";
1738	}
1739
1740	{
1741		vector<AttribSpec> attributes;
1742		for (int i = 0; i < numAttributes; i++)
1743			attributes.push_back(AttribSpec(("a_in" + de::toString(i)).c_str(),
1744											m_attribute.swizzle((i+0)%4, (i+1)%4, (i+2)%4, (i+3)%4),
1745											m_attribute.swizzle((i+1)%4, (i+2)%4, (i+3)%4, (i+0)%4),
1746											m_attribute.swizzle((i+2)%4, (i+3)%4, (i+0)%4, (i+1)%4),
1747											m_attribute.swizzle((i+3)%4, (i+0)%4, (i+1)%4, (i+2)%4)));
1748
1749		{
1750			string description = "This is the program ";
1751
1752			description += programID == PROGRAM_WITHOUT_FUNCTION_CALLS	? "without"
1753						 : programID == PROGRAM_WITH_FUNCTION_CALLS		? "with"
1754						 : DE_NULL;
1755
1756			description += " '" + m_func + "' function calls.\n"
1757						   "Note: workload size for this program means the number of loop iterations.";
1758
1759			return ProgramContext(vtx.str(), frag.str(), attributes, description);
1760		}
1761	}
1762}
1763
1764vector<FunctionCase::ProgramContext> FunctionCase::generateProgramData (void) const
1765{
1766	vector<ProgramContext> progData;
1767	for (int i = 0; i < PROGRAM_LAST; i++)
1768		progData.push_back(generateSingleProgramData((ProgramID)i));
1769	return progData;
1770}
1771
1772void FunctionCase::setGeneralUniforms (deUint32 program) const
1773{
1774	const glw::Functions& gl = m_renderCtx.getFunctions();
1775
1776	gl.uniform1f(gl.getUniformLocation(program, "u_zero"), 0.0f);
1777
1778	for (int paramNdx = 0; paramNdx < MAX_PARAMS; paramNdx++)
1779	{
1780		if (m_paramTypes[paramNdx] != glu::TYPE_INVALID)
1781		{
1782			const glu::DataType		paramType	= m_paramTypes[paramNdx];
1783			const int				scalarSize	= glu::getDataTypeScalarSize(paramType);
1784			const int				location	= gl.getUniformLocation(program, (string() + "u_inc" + (char)('A'+paramNdx)).c_str());
1785
1786			if (glu::isDataTypeFloatOrVec(paramType))
1787			{
1788				float values[4];
1789				for (int i = 0; i < DE_LENGTH_OF_ARRAY(values); i++)
1790					values[i] = (float)paramNdx*0.01f + (float)i*0.001f; // Arbitrary small values.
1791				uniformNfv(gl, scalarSize, location, 1, &values[0]);
1792			}
1793			else if (glu::isDataTypeIntOrIVec(paramType))
1794			{
1795				int values[4];
1796				for (int i = 0; i < DE_LENGTH_OF_ARRAY(values); i++)
1797					values[i] = paramNdx*100 + i; // Arbitrary values.
1798				uniformNiv(gl, scalarSize, location, 1, &values[0]);
1799			}
1800			else if (glu::isDataTypeBoolOrBVec(paramType))
1801			{
1802				int values[4];
1803				for (int i = 0; i < DE_LENGTH_OF_ARRAY(values); i++)
1804					values[i] = (paramNdx >> i) & 1; // Arbitrary values.
1805				uniformNiv(gl, scalarSize, location, 1, &values[0]);
1806			}
1807			else if (glu::isDataTypeMatrix(paramType))
1808			{
1809				const int size = glu::getDataTypeMatrixNumRows(paramType);
1810				DE_ASSERT(size == glu::getDataTypeMatrixNumColumns(paramType));
1811				float values[4*4];
1812				for (int i = 0; i < DE_LENGTH_OF_ARRAY(values); i++)
1813					values[i] = (float)paramNdx*0.01f + (float)i*0.001f; // Arbitrary values.
1814				uniformMatrixNfv(gl, size, location, 1, &values[0]);
1815			}
1816			else
1817				DE_ASSERT(false);
1818		}
1819	}
1820}
1821
1822void FunctionCase::setWorkloadSizeUniform (deUint32 program, int numLoopIterations) const
1823{
1824	const glw::Functions&	gl		= m_renderCtx.getFunctions();
1825	const int				loc		= gl.getUniformLocation(program, "u_numLoopIterations");
1826
1827	gl.uniform1i(loc, numLoopIterations);
1828}
1829
1830float FunctionCase::computeSingleOperationTime (const vector<float>& perProgramOperationCosts) const
1831{
1832	DE_ASSERT(perProgramOperationCosts.size() == PROGRAM_LAST);
1833	const int		numFunctionCalls			= FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS;
1834	const float		programOperationCostDiff	= perProgramOperationCosts[PROGRAM_WITH_FUNCTION_CALLS] - perProgramOperationCosts[PROGRAM_WITHOUT_FUNCTION_CALLS];
1835
1836	return programOperationCostDiff / (float)numFunctionCalls;
1837}
1838
1839void FunctionCase::logSingleOperationCalculationInfo (void) const
1840{
1841	const int numFunctionCalls = FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS;
1842
1843	m_testCtx.getLog() << TestLog::Message << "Note: program " << (int)PROGRAM_WITH_FUNCTION_CALLS << " contains "
1844										   << numFunctionCalls << " calls to '" << m_func << "' in one loop iteration; "
1845										   << "cost of one operation is calculated as "
1846										   << "(cost_of_workload_with_calls - cost_of_workload_without_calls) / " << numFunctionCalls << TestLog::EndMessage;
1847}
1848
1849} // anonymous
1850
// Constructs the "operator" test group. No child cases are created here; they are added in init().
ShaderOperatorTests::ShaderOperatorTests (Context& context)
	: TestCaseGroup(context, "operator", "Operator Performance Tests")
{
}
1855
// Destructor; this class performs no explicit cleanup of its own.
ShaderOperatorTests::~ShaderOperatorTests (void)
{
}
1859
1860void ShaderOperatorTests::init (void)
1861{
1862	// Binary operator cases
1863
1864	static const DataType binaryOpTypes[] =
1865	{
1866		TYPE_FLOAT,
1867		TYPE_FLOAT_VEC2,
1868		TYPE_FLOAT_VEC3,
1869		TYPE_FLOAT_VEC4,
1870		TYPE_INT,
1871		TYPE_INT_VEC2,
1872		TYPE_INT_VEC3,
1873		TYPE_INT_VEC4,
1874	};
1875	static const Precision precisions[] =
1876	{
1877		PRECISION_LOWP,
1878		PRECISION_MEDIUMP,
1879		PRECISION_HIGHP
1880	};
1881	static const struct
1882	{
1883		const char*		name;
1884		const char*		op;
1885		bool			swizzle;
1886	} binaryOps[] =
1887	{
1888		{ "add",		"+",		false	},
1889		{ "sub",		"-",		true	},
1890		{ "mul",		"*",		false	},
1891		{ "div",		"/",		true	}
1892	};
1893
1894	tcu::TestCaseGroup* const binaryOpsGroup = new tcu::TestCaseGroup(m_testCtx, "binary_operator", "Binary Operator Performance Tests");
1895	addChild(binaryOpsGroup);
1896
1897	for (int opNdx = 0; opNdx < DE_LENGTH_OF_ARRAY(binaryOps); opNdx++)
1898	{
1899		tcu::TestCaseGroup* const opGroup = new tcu::TestCaseGroup(m_testCtx, binaryOps[opNdx].name, "");
1900		binaryOpsGroup->addChild(opGroup);
1901
1902		for (int isFrag = 0; isFrag <= 1; isFrag++)
1903		{
1904			const BinaryOpCase::InitialCalibrationStorage	shaderGroupCalibrationStorage	(new BinaryOpCase::InitialCalibration);
1905			const bool										isVertex						= isFrag == 0;
1906			tcu::TestCaseGroup* const						shaderGroup						= new tcu::TestCaseGroup(m_testCtx, isVertex ? "vertex" : "fragment", "");
1907			opGroup->addChild(shaderGroup);
1908
1909			for (int typeNdx = 0; typeNdx < DE_LENGTH_OF_ARRAY(binaryOpTypes); typeNdx++)
1910			{
1911				for (int precNdx = 0; precNdx < DE_LENGTH_OF_ARRAY(precisions); precNdx++)
1912				{
1913					const DataType		type			= binaryOpTypes[typeNdx];
1914					const Precision		precision		= precisions[precNdx];
1915					const char* const	op				= binaryOps[opNdx].op;
1916					const bool			useSwizzle		= binaryOps[opNdx].swizzle;
1917					std::ostringstream	name;
1918
1919					name << getPrecisionName(precision) << "_" << getDataTypeName(type);
1920
1921					shaderGroup->addChild(new BinaryOpCase(m_context, name.str().c_str(), "", op, type, precision, useSwizzle, isVertex, shaderGroupCalibrationStorage));
1922				}
1923			}
1924		}
1925	}
1926
1927	// Built-in function cases.
1928
1929	// Non-specific (i.e. includes gentypes) parameter types for the functions.
1930	enum ValueType
1931	{
1932		VALUE_NONE			= 0,
1933		VALUE_FLOAT			= (1<<0),	// float scalar
1934		VALUE_FLOAT_VEC		= (1<<1),	// float vector
1935		VALUE_FLOAT_VEC34	= (1<<2),	// float vector of size 3 or 4
1936		VALUE_FLOAT_GENTYPE	= (1<<3),	// float scalar/vector
1937		VALUE_VEC3			= (1<<4),	// vec3 only
1938		VALUE_VEC4			= (1<<5),	// vec4 only
1939		VALUE_MATRIX		= (1<<6),	// matrix
1940		VALUE_BOOL			= (1<<7),	// boolean scalar
1941		VALUE_BOOL_VEC		= (1<<8),	// boolean vector
1942		VALUE_BOOL_GENTYPE	= (1<<9),	// boolean scalar/vector
1943		VALUE_INT			= (1<<10),	// int scalar
1944		VALUE_INT_VEC		= (1<<11),	// int vector
1945		VALUE_INT_GENTYPE	= (1<<12),	// int scalar/vector
1946
1947		// Shorthands.
1948		N				= VALUE_NONE,
1949		F				= VALUE_FLOAT,
1950		FV				= VALUE_FLOAT_VEC,
1951		VL				= VALUE_FLOAT_VEC34, // L for "large"
1952		GT				= VALUE_FLOAT_GENTYPE,
1953		V3				= VALUE_VEC3,
1954		V4				= VALUE_VEC4,
1955		M				= VALUE_MATRIX,
1956		B				= VALUE_BOOL,
1957		BV				= VALUE_BOOL_VEC,
1958		BGT				= VALUE_BOOL_GENTYPE,
1959		I				= VALUE_INT,
1960		IV				= VALUE_INT_VEC,
1961		IGT				= VALUE_INT_GENTYPE,
1962
1963		VALUE_ANY_FLOAT			= VALUE_FLOAT		|	VALUE_FLOAT_VEC		|	VALUE_FLOAT_GENTYPE	| VALUE_VEC3 | VALUE_VEC4 | VALUE_FLOAT_VEC34,
1964		VALUE_ANY_INT			= VALUE_INT			|	VALUE_INT_VEC		|	VALUE_INT_GENTYPE,
1965		VALUE_ANY_BOOL			= VALUE_BOOL		|	VALUE_BOOL_VEC		|	VALUE_BOOL_GENTYPE,
1966
1967		VALUE_ANY_GENTYPE		= VALUE_FLOAT_VEC	|	VALUE_FLOAT_GENTYPE	|	VALUE_FLOAT_VEC34	|
1968								  VALUE_BOOL_VEC	|	VALUE_BOOL_GENTYPE	|
1969								  VALUE_INT_VEC		|	VALUE_INT_GENTYPE	|
1970								  VALUE_MATRIX
1971	};
1972	enum PrecisionMask
1973	{
1974		PRECMASK_NA				= 0,						//!< Precision not applicable (booleans)
1975		PRECMASK_LOWP			= (1<<PRECISION_LOWP),
1976		PRECMASK_MEDIUMP		= (1<<PRECISION_MEDIUMP),
1977		PRECMASK_HIGHP			= (1<<PRECISION_HIGHP),
1978
1979		PRECMASK_MEDIUMP_HIGHP	= (1<<PRECISION_MEDIUMP) | (1<<PRECISION_HIGHP),
1980		PRECMASK_ALL			= (1<<PRECISION_LOWP) | (1<<PRECISION_MEDIUMP) | (1<<PRECISION_HIGHP)
1981	};
1982
1983	static const DataType floatTypes[] =
1984	{
1985		TYPE_FLOAT,
1986		TYPE_FLOAT_VEC2,
1987		TYPE_FLOAT_VEC3,
1988		TYPE_FLOAT_VEC4
1989	};
1990	static const DataType intTypes[] =
1991	{
1992		TYPE_INT,
1993		TYPE_INT_VEC2,
1994		TYPE_INT_VEC3,
1995		TYPE_INT_VEC4
1996	};
1997	static const DataType boolTypes[] =
1998	{
1999		TYPE_BOOL,
2000		TYPE_BOOL_VEC2,
2001		TYPE_BOOL_VEC3,
2002		TYPE_BOOL_VEC4
2003	};
2004	static const DataType matrixTypes[] =
2005	{
2006		TYPE_FLOAT_MAT2,
2007		TYPE_FLOAT_MAT3,
2008		TYPE_FLOAT_MAT4
2009	};
2010
2011	tcu::TestCaseGroup* const angleAndTrigonometryGroup		= new tcu::TestCaseGroup(m_testCtx, "angle_and_trigonometry",	"Built-In Angle and Trigonometry Function Performance Tests");
2012	tcu::TestCaseGroup* const exponentialGroup				= new tcu::TestCaseGroup(m_testCtx, "exponential",				"Built-In Exponential Function Performance Tests");
2013	tcu::TestCaseGroup* const commonFunctionsGroup			= new tcu::TestCaseGroup(m_testCtx, "common_functions",			"Built-In Common Function Performance Tests");
2014	tcu::TestCaseGroup* const geometricFunctionsGroup		= new tcu::TestCaseGroup(m_testCtx, "geometric",				"Built-In Geometric Function Performance Tests");
2015	tcu::TestCaseGroup* const matrixFunctionsGroup			= new tcu::TestCaseGroup(m_testCtx, "matrix",					"Built-In Matrix Function Performance Tests");
2016	tcu::TestCaseGroup* const floatCompareGroup				= new tcu::TestCaseGroup(m_testCtx, "float_compare",			"Built-In Floating Point Comparison Function Performance Tests");
2017	tcu::TestCaseGroup* const intCompareGroup				= new tcu::TestCaseGroup(m_testCtx, "int_compare",				"Built-In Integer Comparison Function Performance Tests");
2018	tcu::TestCaseGroup* const boolCompareGroup				= new tcu::TestCaseGroup(m_testCtx, "bool_compare",				"Built-In Boolean Comparison Function Performance Tests");
2019
2020	addChild(angleAndTrigonometryGroup);
2021	addChild(exponentialGroup);
2022	addChild(commonFunctionsGroup);
2023	addChild(geometricFunctionsGroup);
2024	addChild(matrixFunctionsGroup);
2025	addChild(floatCompareGroup);
2026	addChild(intCompareGroup);
2027	addChild(boolCompareGroup);
2028
2029	// Some attributes to be used as parameters for the functions.
2030	const Vec4 attrPos		= Vec4( 2.3f,  1.9f,  0.8f,  0.7f);
2031	const Vec4 attrNegPos	= Vec4(-1.3f,  2.5f, -3.5f,	 4.3f);
2032	const Vec4 attrSmall	= Vec4(-0.9f,  0.8f, -0.4f,	 0.2f);
2033
2034	// Function name, return type and parameter type information; also, what attribute should be used in the test.
2035	// \note Different versions of the same function (i.e. with the same group name) can be defined by putting them successively in this array.
2036	// \note In order to reduce case count and thus total execution time, we don't test all input type combinations for every function.
2037	static const struct
2038	{
2039		tcu::TestCaseGroup*					parentGroup;
2040		const char*							groupName;
2041		const char*							func;
2042		const ValueType						types[FunctionCase::MAX_PARAMS + 1]; // Return type and parameter types, in that order.
2043		const Vec4&							attribute;
2044		int									modifyParamNdx;
2045		bool								useNearlyConstantInputs;
2046		bool								booleanCase;
2047		PrecisionMask						precMask;
2048	} functionCaseGroups[] =
2049	{
2050		{ angleAndTrigonometryGroup,	"radians",			"radians",			{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2051		{ angleAndTrigonometryGroup,	"degrees",			"degrees",			{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2052		{ angleAndTrigonometryGroup,	"sin",				"sin",				{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2053		{ angleAndTrigonometryGroup,	"cos",				"cos",				{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2054		{ angleAndTrigonometryGroup,	"tan",				"tan",				{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2055		{ angleAndTrigonometryGroup,	"asin",				"asin",				{ F,  F,  N,  N  }, attrSmall,		-1, true,	false,	PRECMASK_ALL			},
2056		{ angleAndTrigonometryGroup,	"acos",				"acos",				{ F,  F,  N,  N  }, attrSmall,		-1, true,	false,	PRECMASK_ALL			},
2057		{ angleAndTrigonometryGroup,	"atan2",			"atan",				{ F,  F,  F,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2058		{ angleAndTrigonometryGroup,	"atan",				"atan",				{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2059
2060		{ exponentialGroup,				"pow",				"pow",				{ F,  F,  F,  N  }, attrPos,		-1, false,	false,	PRECMASK_ALL			},
2061		{ exponentialGroup,				"exp",				"exp",				{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2062		{ exponentialGroup,				"log",				"log",				{ F,  F,  N,  N  }, attrPos,		-1, false,	false,	PRECMASK_ALL			},
2063		{ exponentialGroup,				"exp2",				"exp2",				{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2064		{ exponentialGroup,				"log2",				"log2",				{ F,  F,  N,  N  }, attrPos,		-1, false,	false,	PRECMASK_ALL			},
2065		{ exponentialGroup,				"sqrt",				"sqrt",				{ F,  F,  N,  N  }, attrPos,		-1, false,	false,	PRECMASK_ALL			},
2066		{ exponentialGroup,				"inversesqrt",		"inversesqrt",		{ F,  F,  N,  N  }, attrPos,		-1, false,	false,	PRECMASK_ALL			},
2067
2068		{ commonFunctionsGroup,			"abs",				"abs",				{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
2069		{ commonFunctionsGroup,			"abs",				"abs",				{ V4, V4, N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2070		{ commonFunctionsGroup,			"sign",				"sign",				{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
2071		{ commonFunctionsGroup,			"sign",				"sign",				{ V4, V4, N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2072		{ commonFunctionsGroup,			"floor",			"floor",			{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
2073		{ commonFunctionsGroup,			"floor",			"floor",			{ V4, V4, N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2074		{ commonFunctionsGroup,			"ceil",				"ceil",				{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
2075		{ commonFunctionsGroup,			"ceil",				"ceil",				{ V4, V4, N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2076		{ commonFunctionsGroup,			"fract",			"fract",			{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
2077		{ commonFunctionsGroup,			"fract",			"fract",			{ V4, V4, N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2078		{ commonFunctionsGroup,			"mod",				"mod",				{ GT, GT, GT, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2079		{ commonFunctionsGroup,			"min",				"min",				{ F,  F,  F,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
2080		{ commonFunctionsGroup,			"min",				"min",				{ V4, V4, V4, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2081		{ commonFunctionsGroup,			"max",				"max",				{ F,  F,  F,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
2082		{ commonFunctionsGroup,			"max",				"max",				{ V4, V4, V4, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2083		{ commonFunctionsGroup,			"clamp",			"clamp",			{ F,  F,  F,  F  }, attrSmall,		 2, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
2084		{ commonFunctionsGroup,			"clamp",			"clamp",			{ V4, V4, V4, V4 }, attrSmall,		 2, false,	false,	PRECMASK_ALL			},
2085		{ commonFunctionsGroup,			"mix",				"mix",				{ F,  F,  F,  F  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
2086		{ commonFunctionsGroup,			"mix",				"mix",				{ V4, V4, V4, V4 }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2087		{ commonFunctionsGroup,			"step",				"step",				{ F,  F,  F,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
2088		{ commonFunctionsGroup,			"step",				"step",				{ V4, V4, V4, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2089		{ commonFunctionsGroup,			"smoothstep",		"smoothstep",		{ F,  F,  F,  F  }, attrSmall,		 1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
2090		{ commonFunctionsGroup,			"smoothstep",		"smoothstep",		{ V4, V4, V4, V4 }, attrSmall,		 1, false,	false,	PRECMASK_ALL			},
2091
2092		{ geometricFunctionsGroup,		"length",			"length",			{ F,  VL, N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2093		{ geometricFunctionsGroup,		"distance",			"distance",			{ F,  VL, VL, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2094		{ geometricFunctionsGroup,		"dot",				"dot",				{ F,  VL, VL, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2095		{ geometricFunctionsGroup,		"cross",			"cross",			{ V3, V3, V3, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2096		{ geometricFunctionsGroup,		"normalize",		"normalize",		{ VL, VL, N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2097		{ geometricFunctionsGroup,		"faceforward",		"faceforward",		{ VL, VL, VL, VL }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2098		{ geometricFunctionsGroup,		"reflect",			"reflect",			{ VL, VL, VL, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2099		{ geometricFunctionsGroup,		"refract",			"refract",			{ VL, VL, VL, F  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2100
2101		{ matrixFunctionsGroup,			"matrixCompMult",	"matrixCompMult",	{ M,  M,  M,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2102
2103		{ floatCompareGroup,			"lessThan",			"lessThan",			{ BV, FV, FV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2104		{ floatCompareGroup,			"lessThanEqual",	"lessThanEqual",	{ BV, FV, FV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2105		{ floatCompareGroup,			"greaterThan",		"greaterThan",		{ BV, FV, FV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2106		{ floatCompareGroup,			"greaterThanEqual",	"greaterThanEqual",	{ BV, FV, FV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2107		{ floatCompareGroup,			"equal",			"equal",			{ BV, FV, FV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2108		{ floatCompareGroup,			"notEqual",			"notEqual",			{ BV, FV, FV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2109
2110		{ intCompareGroup,				"lessThan",			"lessThan",			{ BV, IV, IV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2111		{ intCompareGroup,				"lessThanEqual",	"lessThanEqual",	{ BV, IV, IV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2112		{ intCompareGroup,				"greaterThan",		"greaterThan",		{ BV, IV, IV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2113		{ intCompareGroup,				"greaterThanEqual",	"greaterThanEqual",	{ BV, IV, IV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2114		{ intCompareGroup,				"equal",			"equal",			{ BV, IV, IV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2115		{ intCompareGroup,				"notEqual",			"notEqual",			{ BV, IV, IV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2116
2117		{ boolCompareGroup,				"equal",			"equal",			{ BV, BV, BV, N  }, attrNegPos,		-1, false,	true,	PRECMASK_MEDIUMP		},
2118		{ boolCompareGroup,				"notEqual",			"notEqual",			{ BV, BV, BV, N  }, attrNegPos,		-1, false,	true,	PRECMASK_MEDIUMP		},
2119		{ boolCompareGroup,				"any",				"any",				{ B,  BV, N,  N  }, attrNegPos,		-1, false,	true,	PRECMASK_MEDIUMP		},
2120		{ boolCompareGroup,				"all",				"all",				{ B,  BV, N,  N  }, attrNegPos,		-1, false,	true,	PRECMASK_MEDIUMP		},
2121		{ boolCompareGroup,				"not",				"not",				{ BV, BV, N,  N  }, attrNegPos,		-1, false,	true,	PRECMASK_MEDIUMP		}
2122	};
2123
2124	// vertexSubGroup and fragmentSubGroup are the groups where the various vertex/fragment cases of a single function are added.
2125	// \note These are defined here so that different versions (different entries in the functionCaseGroups array) of the same function can be put in the same group.
2126	tcu::TestCaseGroup*							vertexSubGroup		= DE_NULL;
2127	tcu::TestCaseGroup*							fragmentSubGroup	= DE_NULL;
2128	FunctionCase::InitialCalibrationStorage		vertexSubGroupCalibrationStorage;
2129	FunctionCase::InitialCalibrationStorage		fragmentSubGroupCalibrationStorage;
2130	for (int funcNdx = 0; funcNdx < DE_LENGTH_OF_ARRAY(functionCaseGroups); funcNdx++)
2131	{
2132		tcu::TestCaseGroup* const	parentGroup					= functionCaseGroups[funcNdx].parentGroup;
2133		const char* const			groupName					= functionCaseGroups[funcNdx].groupName;
2134		const char* const			groupFunc					= functionCaseGroups[funcNdx].func;
2135		const ValueType* const		funcTypes					= functionCaseGroups[funcNdx].types;
2136		const Vec4&					groupAttribute				= functionCaseGroups[funcNdx].attribute;
2137		const int					modifyParamNdx				= functionCaseGroups[funcNdx].modifyParamNdx;
2138		const bool					useNearlyConstantInputs		= functionCaseGroups[funcNdx].useNearlyConstantInputs;
2139		const bool					booleanCase					= functionCaseGroups[funcNdx].booleanCase;
2140		const PrecisionMask			precMask					= functionCaseGroups[funcNdx].precMask;
2141
2142		// If this is a new function and not just a different version of the previously defined function, create a new group.
2143		if (funcNdx == 0 || parentGroup != functionCaseGroups[funcNdx-1].parentGroup || string(groupName) != functionCaseGroups[funcNdx-1].groupName)
2144		{
2145			tcu::TestCaseGroup* const funcGroup = new tcu::TestCaseGroup(m_testCtx, groupName, "");
2146			functionCaseGroups[funcNdx].parentGroup->addChild(funcGroup);
2147
2148			vertexSubGroup		= new tcu::TestCaseGroup(m_testCtx, "vertex", "");
2149			fragmentSubGroup	= new tcu::TestCaseGroup(m_testCtx, "fragment", "");
2150
2151			funcGroup->addChild(vertexSubGroup);
2152			funcGroup->addChild(fragmentSubGroup);
2153
2154			vertexSubGroupCalibrationStorage	= FunctionCase::InitialCalibrationStorage(new FunctionCase::InitialCalibration);
2155			fragmentSubGroupCalibrationStorage	= FunctionCase::InitialCalibrationStorage(new FunctionCase::InitialCalibration);
2156		}
2157
2158		DE_ASSERT(vertexSubGroup != DE_NULL);
2159		DE_ASSERT(fragmentSubGroup != DE_NULL);
2160
2161		// Find the type size range of parameters (e.g. from 2 to 4 in case of vectors).
2162		int genTypeFirstSize	= 1;
2163		int genTypeLastSize		= 1;
2164
2165		// Find the first return value or parameter with a gentype (if any) and set sizes accordingly.
2166		// \note Assumes only matching sizes gentypes are to be found, e.g. no "genType func (vec param)"
2167		for (int i = 0; i < FunctionCase::MAX_PARAMS + 1 && genTypeLastSize == 1; i++)
2168		{
2169			switch (funcTypes[i])
2170			{
2171				case VALUE_FLOAT_VEC:
2172				case VALUE_BOOL_VEC:
2173				case VALUE_INT_VEC:			// \note Fall-through.
2174					genTypeFirstSize = 2;
2175					genTypeLastSize = 4;
2176					break;
2177				case VALUE_FLOAT_VEC34:
2178					genTypeFirstSize = 3;
2179					genTypeLastSize = 4;
2180					break;
2181				case VALUE_FLOAT_GENTYPE:
2182				case VALUE_BOOL_GENTYPE:
2183				case VALUE_INT_GENTYPE:		// \note Fall-through.
2184					genTypeFirstSize = 1;
2185					genTypeLastSize = 4;
2186					break;
2187				case VALUE_MATRIX:
2188					genTypeFirstSize = 2;
2189					genTypeLastSize = 4;
2190					break;
2191				// If none of the above, keep looping.
2192				default:
2193					break;
2194			}
2195		}
2196
2197		// Create a case for each possible size of the gentype.
2198		for (int curSize = genTypeFirstSize; curSize <= genTypeLastSize; curSize++)
2199		{
2200			// Determine specific types for return value and the parameters, according to curSize. Non-gentypes not affected by curSize.
2201			DataType types[FunctionCase::MAX_PARAMS + 1];
2202			for (int i = 0; i < FunctionCase::MAX_PARAMS + 1; i++)
2203			{
2204				if (funcTypes[i] == VALUE_NONE)
2205					types[i] = TYPE_INVALID;
2206				else
2207				{
2208					int isFloat	= funcTypes[i] & VALUE_ANY_FLOAT;
2209					int isBool	= funcTypes[i] & VALUE_ANY_BOOL;
2210					int isInt	= funcTypes[i] & VALUE_ANY_INT;
2211					int isMat	= funcTypes[i] == VALUE_MATRIX;
2212					int inSize	= (funcTypes[i] & VALUE_ANY_GENTYPE)	? curSize
2213								: funcTypes[i] == VALUE_VEC3			? 3
2214								: funcTypes[i] == VALUE_VEC4			? 4
2215								: 1;
2216					int			typeArrayNdx = isMat ? inSize - 2 : inSize - 1; // \note No matrices of size 1.
2217
2218					types[i]	= isFloat	? floatTypes[typeArrayNdx]
2219								: isBool	? boolTypes[typeArrayNdx]
2220								: isInt		? intTypes[typeArrayNdx]
2221								: isMat		? matrixTypes[typeArrayNdx]
2222								: TYPE_LAST;
2223				}
2224
2225				DE_ASSERT(types[i] != TYPE_LAST);
2226			}
2227
2228			// Array for just the parameter types.
2229			DataType paramTypes[FunctionCase::MAX_PARAMS];
2230			for (int i = 0; i < FunctionCase::MAX_PARAMS; i++)
2231				paramTypes[i] = types[i+1];
2232
2233			for (int prec = (int)PRECISION_LOWP; prec < (int)PRECISION_LAST; prec++)
2234			{
2235				if ((precMask & (1 << prec)) == 0)
2236					continue;
2237
2238				const string		precisionPrefix = booleanCase ? "" : (string(getPrecisionName((Precision)prec)) + "_");
2239				std::ostringstream	caseName;
2240
2241				caseName << precisionPrefix;
2242
2243				// Write the name of each distinct parameter data type into the test case name.
2244				for (int i = 1; i < FunctionCase::MAX_PARAMS + 1 && types[i] != TYPE_INVALID; i++)
2245				{
2246					if (i == 1 || types[i] != types[i-1])
2247					{
2248						if (i > 1)
2249							caseName << "_";
2250
2251						caseName << getDataTypeName(types[i]);
2252					}
2253				}
2254
2255				for (int fragI = 0; fragI <= 1; fragI++)
2256				{
2257					const bool					vert	= fragI == 0;
2258					tcu::TestCaseGroup* const	group	= vert ? vertexSubGroup : fragmentSubGroup;
2259					group->addChild	(new FunctionCase(m_context,
2260													  caseName.str().c_str(), "",
2261													  groupFunc,
2262													  types[0], paramTypes,
2263													  groupAttribute, modifyParamNdx, useNearlyConstantInputs,
2264													  (Precision)prec, vert,
2265													  vert ? vertexSubGroupCalibrationStorage : fragmentSubGroupCalibrationStorage));
2266				}
2267			}
2268		}
2269	}
2270}
2271
2272} // Performance
2273} // gles2
2274} // deqp
2275