1 /*-------------------------------------------------------------------------
2  * drawElements Quality Program OpenGL ES 3.0 Module
3  * -------------------------------------------------
4  *
5  * Copyright 2014 The Android Open Source Project
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  *      http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  *
19  *//*!
20  * \file
21  * \brief Shader operator performance tests.
22  *//*--------------------------------------------------------------------*/
23 
24 #include "es3pShaderOperatorTests.hpp"
25 #include "glsCalibration.hpp"
26 #include "gluShaderUtil.hpp"
27 #include "gluShaderProgram.hpp"
28 #include "gluPixelTransfer.hpp"
29 #include "tcuTestLog.hpp"
30 #include "tcuRenderTarget.hpp"
31 #include "tcuCommandLine.hpp"
32 #include "tcuSurface.hpp"
33 #include "deStringUtil.hpp"
34 #include "deSharedPtr.hpp"
35 #include "deClock.h"
36 #include "deMath.h"
37 
38 #include "glwEnums.hpp"
39 #include "glwFunctions.hpp"
40 
41 #include <map>
42 #include <algorithm>
43 #include <limits>
44 #include <set>
45 
46 namespace deqp
47 {
48 namespace gles3
49 {
50 namespace Performance
51 {
52 
53 using namespace gls;
54 using namespace glu;
55 using tcu::Vec2;
56 using tcu::Vec4;
57 using tcu::TestLog;
58 using de::SharedPtr;
59 
60 using std::string;
61 using std::vector;
62 
63 #define MEASUREMENT_FAIL() throw tcu::InternalError("Unable to get sensible measurements for estimation", DE_NULL, __FILE__, __LINE__)
64 
65 // Number of measurements in OperatorPerformanceCase for each workload size, unless specified otherwise by a command line argument.
66 static const int	DEFAULT_NUM_MEASUREMENTS_PER_WORKLOAD	= 3;
67 // How many different workload sizes are used by OperatorPerformanceCase.
68 static const int	NUM_WORKLOADS							= 8;
69 // Maximum workload size that can be attempted. In a sensible case, this most likely won't be reached.
70 static const int	MAX_WORKLOAD_SIZE						= 1<<29;
71 
72 // BinaryOpCase-specific constants for shader generation.
73 static const int	BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS	= 4;
74 static const int	BINARY_OPERATOR_CASE_SMALL_PROGRAM_UNROLL_AMOUNT	= 2;
75 static const int	BINARY_OPERATOR_CASE_BIG_PROGRAM_UNROLL_AMOUNT		= 4;
76 
77 // FunctionCase-specific constants for shader generation.
78 static const int	FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS			= 4;
79 
80 static const char* const s_swizzles[][4] =
81 {
82 	{ "x", "yx", "yzx", "wzyx" },
83 	{ "y", "zy", "wyz", "xwzy" },
84 	{ "z", "wy", "zxy", "yzwx" },
85 	{ "w", "xw", "yxw", "zyxw" }
86 };
87 
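// Computes the component-wise arithmetic mean of a list of vectors.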
88 template <int N>
89 static tcu::Vector<float, N> mean (const vector<tcu::Vector<float, N> >& data)
90 {
91 	tcu::Vector<float, N> sum(0.0f);
92 	for (int i = 0; i < (int)data.size(); i++)
93 		sum += data[i];
94 	return sum / tcu::Vector<float, N>((float)data.size());
95 }
96 
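// Sets a float scalar/vector uniform, dispatching to glUniform{1,2,3,4}fv based on the component count n.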
97 static void uniformNfv (const glw::Functions& gl, int n, int location, int count, const float* data)
98 {
99 	switch (n)
100 	{
101 		case 1: gl.uniform1fv(location, count, data); break;
102 		case 2: gl.uniform2fv(location, count, data); break;
103 		case 3: gl.uniform3fv(location, count, data); break;
104 		case 4: gl.uniform4fv(location, count, data); break;
105 		default: DE_ASSERT(false);
106 	}
107 }
108 
109 static void uniformNiv (const glw::Functions& gl, int n, int location, int count, const int* data)
110 {
111 	switch (n)
112 	{
113 		case 1: gl.uniform1iv(location, count, data); break;
114 		case 2: gl.uniform2iv(location, count, data); break;
115 		case 3: gl.uniform3iv(location, count, data); break;
116 		case 4: gl.uniform4iv(location, count, data); break;
117 		default: DE_ASSERT(false);
118 	}
119 }
120 
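// Sets an n x n float matrix uniform (n = 2, 3 or 4), without transposing.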
121 static void uniformMatrixNfv (const glw::Functions& gl, int n, int location, int count, const float* data)
122 {
123 	switch (n)
124 	{
125 		case 2: gl.uniformMatrix2fv(location, count, GL_FALSE, &data[0]); break;
126 		case 3: gl.uniformMatrix3fv(location, count, GL_FALSE, &data[0]); break;
127 		case 4: gl.uniformMatrix4fv(location, count, GL_FALSE, &data[0]); break;
128 		default: DE_ASSERT(false);
129 	}
130 }
131 
132 static glu::DataType getDataTypeFloatOrVec (int size)
133 {
134 	return size == 1 ? glu::TYPE_FLOAT : glu::getDataTypeFloatVec(size);
135 }
136 
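// Returns the test iteration count given on the command line, or def if none was specified.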
137 static int getIterationCountOrDefault (const tcu::CommandLine& cmdLine, int def)
138 {
139 	const int cmdLineVal = cmdLine.getTestIterationCount();
140 	return cmdLineVal > 0 ? cmdLineVal : def;
141 }
142 
143 static string lineParamsString (const LineParameters& params)
144 {
145 	return "y = " + de::toString(params.offset) + " + " + de::toString(params.coefficient) + "*x";
146 }
147 
148 namespace
149 {
150 
151 /*--------------------------------------------------------------------*//*!
152  * \brief Abstract class for measuring shader operator performance.
153  *
154  * This class draws multiple times with different workload sizes (set
155  * via a uniform, by subclass). Time for each frame is measured, and the
156  * slope of the workload size vs frame time data is estimated. This slope
157  * tells us the estimated increase in frame time caused by a workload
158  * increase of 1 unit (what 1 workload unit means is up to the subclass).
159  *
160  * Generally, the shaders contain not just the operation we're interested
161  * in (e.g. addition) but also some other stuff (e.g. loop overhead). To
162  * eliminate this cost, we actually do the stuff described in the above
163  * paragraph with multiple programs (usually two), which contain different
164  * kinds of workload (e.g. different loop contents). Then we can (in
165  * theory) compute the cost of just one operation in a subclass-dependent
166  * manner.
167  *
168  * At this point, the result tells us the increase in frame time caused
169  * by the addition of one operation. Dividing this by the amount of
170  * draw calls in a frame, and further by the amount of vertices or
171  * fragments in a draw call, we get the time cost of one operation.
172  *
173  * In reality, there sometimes isn't just a trivial linear dependence
174  * between workload size and frame time. Instead, there tends to be some
175  * amount of initial "free" operations. That is, it may be that all
176  * workload sizes below some positive integer C yield the same frame time,
177  * and only workload sizes beyond C increase the frame time in a supposedly
178  * linear manner. Graphically, this means that there graph consists of two
179  * parts: a horizontal left part, and a linearly increasing right part; the
180  * right part starts where the left parts ends. The principal task of these
181  * tests is to look at the slope of the increasing right part. Additionally
182  * an estimate for the amount of initial free operations is calculated.
183  * Note that it is also normal to get graphs where the horizontal left part
184  * is of zero width, i.e. there are no free operations.
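 *
 * For example (hypothetical numbers, for illustration only): if the estimated
 * increase in frame time per added operation is 50 us, and each frame consists
 * of 10 draw calls covering 1024 fragments each, then the estimated cost of a
 * single operation is 50 / (10 * 1024) us, i.e. roughly 4.9 ns.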
185  *//*--------------------------------------------------------------------*/
186 class OperatorPerformanceCase : public tcu::TestCase
187 {
188 public:
189 	enum CaseType
190 	{
191 		CASETYPE_VERTEX = 0,
192 		CASETYPE_FRAGMENT,
193 
194 		CASETYPE_LAST
195 	};
196 
197 	struct InitialCalibration
198 	{
199 		int initialNumCalls;
200 		InitialCalibration (void) : initialNumCalls(1) {}
201 	};
202 
203 	typedef SharedPtr<InitialCalibration> InitialCalibrationStorage;
204 
205 								OperatorPerformanceCase		(tcu::TestContext& testCtx, glu::RenderContext& renderCtx, const char* name, const char* description,
206 															 CaseType caseType, int numWorkloads, const InitialCalibrationStorage& initialCalibrationStorage);
207 								~OperatorPerformanceCase	(void);
208 
209 	void						init						(void);
210 	void						deinit						(void);
211 
212 	IterateResult				iterate						(void);
213 
214 	struct AttribSpec
215 	{
216 		AttribSpec (const char* name_, const tcu::Vec4& p00_, const tcu::Vec4& p01_, const tcu::Vec4& p10_, const tcu::Vec4& p11_)
217 			: name		(name_)
218 			, p00		(p00_)
219 			, p01		(p01_)
220 			, p10		(p10_)
221 			, p11		(p11_)
222 		{
223 		}
224 
225 		AttribSpec (void) {}
226 
227 		std::string		name;
228 		tcu::Vec4		p00;	//!< Bottom left.
229 		tcu::Vec4		p01;	//!< Bottom right.
230 		tcu::Vec4		p10;	//!< Top left.
231 		tcu::Vec4		p11;	//!< Top right.
232 	};
233 
234 protected:
235 	struct ProgramContext
236 	{
237 		string				vertShaderSource;
238 		string				fragShaderSource;
239 		vector<AttribSpec>	attributes;
240 
241 		string				description;
242 
243 		ProgramContext (void) {}
244 		ProgramContext (const string& vs, const string& fs, const vector<AttribSpec>& attrs, const string& desc)
245 			: vertShaderSource(vs), fragShaderSource(fs), attributes(attrs), description(desc) {}
246 	};
247 
248 	virtual vector<ProgramContext>	generateProgramData					(void) const = 0;
249 	//! Sets program-specific uniforms that don't depend on the workload size.
250 	virtual void					setGeneralUniforms					(deUint32 program) const = 0;
251 	//! Sets the uniform(s) that specifies the workload size in the shader.
252 	virtual void					setWorkloadSizeUniform				(deUint32 program, int workload) const = 0;
253 	//! Computes the cost of a single operation, given the workload costs per program.
254 	virtual float					computeSingleOperationTime			(const vector<float>& perProgramWorkloadCosts) const = 0;
255 	//! Logs a human-readable description of what computeSingleOperationTime does.
256 	virtual void					logSingleOperationCalculationInfo	(void) const = 0;
257 
258 	glu::RenderContext&				m_renderCtx;
259 
260 	CaseType						m_caseType;
261 
262 private:
263 	enum State
264 	{
265 		STATE_CALIBRATING = 0,		//!< Calibrate draw call count, using first program in m_programs, with workload size 1.
266 		STATE_FIND_HIGH_WORKLOAD,	//!< Find an appropriate lower bound for the highest workload size we intend to use (one with high-enough frame time compared to workload size 1) for each program.
267 		STATE_MEASURING,			//!< Do actual measurements, for each program in m_programs.
268 		STATE_REPORTING,			//!< Measurements are done; calculate results and log.
269 		STATE_FINISHED,				//!< All done.
270 
271 		STATE_LAST
272 	};
273 
274 	struct WorkloadRecord
275 	{
276 		int				workloadSize;
277 		vector<float>	frameTimes; //!< In microseconds.
278 
279 				WorkloadRecord	(int workloadSize_)						: workloadSize(workloadSize_) {}
280 		bool	operator<		(const WorkloadRecord& other) const		{ return this->workloadSize < other.workloadSize; }
281 		void	addFrameTime	(float time)							{ frameTimes.push_back(time); }
282 		float	getMedianTime	(void) const
283 		{
284 			vector<float> times = frameTimes;
285 			std::sort(times.begin(), times.end());
286 			return times.size() % 2 == 0 ?
287 					(times[times.size()/2-1] + times[times.size()/2])*0.5f :
288 					times[times.size()/2];
289 		}
290 	};
291 
292 	void								prepareProgram				(int progNdx);					//!< Sets attributes and uniforms for m_programs[progNdx].
293 	void								prepareWorkload				(int progNdx, int workload);	//!< Calls setWorkloadSizeUniform and draws, in case the implementation does some draw-time compilation.
294 	void								prepareNextRound			(void);							//!< Increases workload and/or updates m_state.
295 	void								render						(int numDrawCalls);
296 	deUint64							renderAndMeasure			(int numDrawCalls);
297 	void								adjustAndLogGridAndViewport	(void);							//!< Log grid and viewport sizes, after possibly reducing them to reduce draw time.
298 
299 	vector<Vec2>						getWorkloadMedianDataPoints	(int progNdx) const; //!< [ Vec2(r.workloadSize, r.getMedianTime()) for r in m_workloadRecords[progNdx] ]
300 
301 	const int							m_numMeasurementsPerWorkload;
302 	const int							m_numWorkloads;				//!< How many different workload sizes are used for measurement for each program.
303 
304 	int									m_workloadNdx;				//!< Runs from 0 to m_numWorkloads-1.
305 
306 	int									m_workloadMeasurementNdx;
307 	vector<vector<WorkloadRecord> >		m_workloadRecordsFindHigh;	//!< The measurements done during STATE_FIND_HIGH_WORKLOAD.
308 	vector<vector<WorkloadRecord> >		m_workloadRecords;			//!< The measurements of each program in m_programs. Generated during STATE_MEASURING, into index specified by m_measureProgramNdx.
309 
310 	State								m_state;
311 	int									m_measureProgramNdx;		//!< When m_state is STATE_FIND_HIGH_WORKLOAD or STATE_MEASURING, this tells which program in m_programs is being measured.
312 
313 	vector<int>							m_highWorkloadSizes;		//!< The first workload size encountered during STATE_FIND_HIGH_WORKLOAD that was determined suitable, for each program.
314 
315 	TheilSenCalibrator					m_calibrator;
316 	InitialCalibrationStorage			m_initialCalibrationStorage;
317 
318 	int									m_viewportWidth;
319 	int									m_viewportHeight;
320 	int									m_gridSizeX;
321 	int									m_gridSizeY;
322 
323 	vector<ProgramContext>				m_programData;
324 	vector<SharedPtr<ShaderProgram> >	m_programs;
325 
326 	std::vector<deUint32>				m_attribBuffers;
327 };
328 
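// Interpolates a scalar over a triangle whose corner values are v0 at (0,0), v1 at (0,1) and v2 at (1,0).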
329 static inline float triangleInterpolate (float v0, float v1, float v2, float x, float y)
330 {
331 	return v0 + (v2-v0)*x + (v1-v0)*y;
332 }
333 
334 static inline float triQuadInterpolate (float x, float y, const tcu::Vec4& quad)
335 {
336 	// \note Top left fill rule.
337 	if (x + y < 1.0f)
338 		return triangleInterpolate(quad.x(), quad.y(), quad.z(), x, y);
339 	else
340 		return triangleInterpolate(quad.w(), quad.z(), quad.y(), 1.0f-x, 1.0f-y);
341 }
342 
343 static inline int getNumVertices (int gridSizeX, int gridSizeY)
344 {
345 	return gridSizeX * gridSizeY * 2 * 3;
346 }
347 
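// Fills dst with per-vertex attribute data for a gridSizeX x gridSizeY grid of quads (two triangles each),
// interpolating the attribute's four corner values (p00..p11) over the grid.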
348 static void generateVertices (std::vector<float>& dst, int gridSizeX, int gridSizeY, const OperatorPerformanceCase::AttribSpec& spec)
349 {
350 	const int numComponents = 4;
351 
352 	DE_ASSERT(gridSizeX >= 1 && gridSizeY >= 1);
353 	dst.resize(getNumVertices(gridSizeX, gridSizeY) * numComponents);
354 
355 	{
356 		int dstNdx = 0;
357 
358 		for (int baseY = 0; baseY < gridSizeY; baseY++)
359 		for (int baseX = 0; baseX < gridSizeX; baseX++)
360 		{
361 			const float xf0 = (float)(baseX + 0) / (float)gridSizeX;
362 			const float yf0 = (float)(baseY + 0) / (float)gridSizeY;
363 			const float xf1 = (float)(baseX + 1) / (float)gridSizeX;
364 			const float yf1 = (float)(baseY + 1) / (float)gridSizeY;
365 
366 #define ADD_VERTEX(XF, YF)										\
367 	for (int compNdx = 0; compNdx < numComponents; compNdx++)	\
368 		dst[dstNdx++] = triQuadInterpolate((XF), (YF), tcu::Vec4(spec.p00[compNdx], spec.p01[compNdx], spec.p10[compNdx], spec.p11[compNdx]))
369 
370 			ADD_VERTEX(xf0, yf0);
371 			ADD_VERTEX(xf1, yf0);
372 			ADD_VERTEX(xf0, yf1);
373 
374 			ADD_VERTEX(xf1, yf0);
375 			ADD_VERTEX(xf1, yf1);
376 			ADD_VERTEX(xf0, yf1);
377 
378 #undef ADD_VERTEX
379 		}
380 	}
381 }
382 
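// Returns the x coordinate at which lines a and b intersect.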
383 static float intersectionX (const gls::LineParameters& a, const gls::LineParameters& b)
384 {
385 	return (a.offset - b.offset) / (b.coefficient - a.coefficient);
386 }
387 
388 static int numDistinctX (const vector<Vec2>& data)
389 {
390 	std::set<float> xs;
391 	for (int i = 0; i < (int)data.size(); i++)
392 		xs.insert(data[i].x());
393 	return (int)xs.size();
394 }
395 
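// Ordinary least-squares fit of a line (offset + coefficient*x) through the given points.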
396 static gls::LineParameters simpleLinearRegression (const vector<Vec2>& data)
397 {
398 	const Vec2	mid					= mean(data);
399 
400 	float		slopeNumerator		= 0.0f;
401 	float		slopeDenominator	= 0.0f;
402 
403 	for (int i = 0; i < (int)data.size(); i++)
404 	{
405 		const Vec2 diff = data[i] - mid;
406 
407 		slopeNumerator		+= diff.x()*diff.y();
408 		slopeDenominator	+= diff.x()*diff.x();
409 	}
410 
411 	const float slope	= slopeNumerator / slopeDenominator;
412 	const float offset	= mid.y() - slope*mid.x();
413 
414 	return gls::LineParameters(offset, slope);
415 }
416 
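// Mean squared error of the least-squares line fit; zero if there are too few distinct x coordinates.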
417 static float simpleLinearRegressionError (const vector<Vec2>& data)
418 {
419 	if (numDistinctX(data) <= 2)
420 		return 0.0f;
421 	else
422 	{
423 		const gls::LineParameters	estimator	= simpleLinearRegression(data);
424 		float						error		= 0.0f;
425 
426 		for (int i = 0; i < (int)data.size(); i++)
427 		{
428 			const float estY = estimator.offset + estimator.coefficient*data[i].x();
429 			const float diff = estY - data[i].y();
430 			error += diff*diff;
431 		}
432 
433 		return error / (float)data.size();
434 	}
435 }
436 
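// Mean squared deviation of the y values from their mean; zero if there are too few distinct x coordinates.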
437 static float verticalVariance (const vector<Vec2>& data)
438 {
439 	if (numDistinctX(data) <= 2)
440 		return 0.0f;
441 	else
442 	{
443 		const float		meanY = mean(data).y();
444 		float			error = 0.0f;
445 
446 		for (int i = 0; i < (int)data.size(); i++)
447 		{
448 			const float diff = meanY - data[i].y();
449 			error += diff*diff;
450 		}
451 
452 		return error / (float)data.size();
453 	}
454 }
455 
456 /*--------------------------------------------------------------------*//*!
457  * \brief Find the x coord that divides the input data into two slopes.
458  *
459  * The operator performance measurements tend to produce results where
460  * we get small operation counts "for free" (e.g. because the operations
461  * are performed during some memory transfer overhead or something),
462  * resulting in a curve with two parts: an initial horizontal line segment,
463  * and a rising line.
464  *
465  * This function finds the x coordinate that divides the input data into
466  * two parts such that the sum of the mean square errors for the
467  * least-squares estimated lines for the two parts is minimized, under the
468  * additional condition that the left line is horizontal.
469  *
470  * This function returns a number X s.t. { pt | pt is in data, pt.x >= X }
471  * is the right line, and the rest of data is the left line.
472  *//*--------------------------------------------------------------------*/
473 static float findSlopePivotX (const vector<Vec2>& data)
474 {
475 	std::set<float> xCoords;
476 	for (int i = 0; i < (int)data.size(); i++)
477 		xCoords.insert(data[i].x());
478 
479 	float			lowestError		= std::numeric_limits<float>::infinity();
480 	float			bestPivotX		= -std::numeric_limits<float>::infinity();
481 
482 	for (std::set<float>::const_iterator pivotX = xCoords.begin(); pivotX != xCoords.end(); ++pivotX)
483 	{
484 		vector<Vec2> leftData;
485 		vector<Vec2> rightData;
486 		for (int i = 0; i < (int)data.size(); i++)
487 		{
488 			if (data[i].x() < *pivotX)
489 				leftData.push_back(data[i]);
490 			else
491 				rightData.push_back(data[i]);
492 		}
493 
494 		if (numDistinctX(rightData) < 3) // We don't trust the right data if there's too little of it.
495 			break;
496 
497 		{
498 			const float totalError = verticalVariance(leftData) + simpleLinearRegressionError(rightData);
499 
500 			if (totalError < lowestError)
501 			{
502 				lowestError = totalError;
503 				bestPivotX = *pivotX;
504 			}
505 		}
506 	}
507 
508 	DE_ASSERT(lowestError < std::numeric_limits<float>::infinity());
509 
510 	return bestPivotX;
511 }
512 
513 struct SegmentedEstimator
514 {
515 	float					pivotX; //!< Value returned by findSlopePivotX, or -infinity if only single line.
516 	gls::LineParameters		left;
517 	gls::LineParameters		right;
518 	SegmentedEstimator (const gls::LineParameters& l, const gls::LineParameters& r, float pivotX_) : pivotX(pivotX_), left(l), right(r) {}
519 };
520 
521 /*--------------------------------------------------------------------*//*!
522  * \brief Compute line estimators for (potentially) two-segment data.
523  *
524  * Splits the given data into left and right parts (using findSlopePivotX)
525  * and returns the line estimates for them.
526  *
527  * Sometimes, however (especially in fragment shader cases) the data is
528  * in fact not segmented, but a straight line. This function attempts to
529  * detect if this is the case, and if so, sets left.offset = right.offset and
530  * left.slope = 0, meaning essentially that the initial "flat" part of the
531  * data has zero width.
532  *//*--------------------------------------------------------------------*/
533 static SegmentedEstimator computeSegmentedEstimator (const vector<Vec2>& data)
534 {
535 	const float		pivotX = findSlopePivotX(data);
536 	vector<Vec2>	leftData;
537 	vector<Vec2>	rightData;
538 
539 	for (int i = 0; i < (int)data.size(); i++)
540 	{
541 		if (data[i].x() < pivotX)
542 			leftData.push_back(data[i]);
543 		else
544 			rightData.push_back(data[i]);
545 	}
546 
547 	{
548 		const gls::LineParameters leftLine		= gls::theilSenLinearRegression(leftData);
549 		const gls::LineParameters rightLine		= gls::theilSenLinearRegression(rightData);
550 
551 		if (numDistinctX(leftData) < 2 || leftLine.coefficient > rightLine.coefficient*0.5f)
552 		{
553 			// Left data doesn't seem credible; assume the data is just a single line.
554 			const gls::LineParameters entireLine = gls::theilSenLinearRegression(data);
555 			return SegmentedEstimator(gls::LineParameters(entireLine.offset, 0.0f), entireLine, -std::numeric_limits<float>::infinity());
556 		}
557 		else
558 			return SegmentedEstimator(leftLine, rightLine, pivotX);
559 	}
560 }
561 
562 OperatorPerformanceCase::OperatorPerformanceCase (tcu::TestContext& testCtx, glu::RenderContext& renderCtx, const char* name, const char* description,
563 												  CaseType caseType, int numWorkloads, const InitialCalibrationStorage& initialCalibrationStorage)
564 	: tcu::TestCase					(testCtx, tcu::NODETYPE_PERFORMANCE, name, description)
565 	, m_renderCtx					(renderCtx)
566 	, m_caseType					(caseType)
567 	, m_numMeasurementsPerWorkload	(getIterationCountOrDefault(m_testCtx.getCommandLine(), DEFAULT_NUM_MEASUREMENTS_PER_WORKLOAD))
568 	, m_numWorkloads				(numWorkloads)
569 	, m_workloadNdx					(-1)
570 	, m_workloadMeasurementNdx		(-1)
571 	, m_state						(STATE_LAST)
572 	, m_measureProgramNdx			(-1)
573 	, m_initialCalibrationStorage	(initialCalibrationStorage)
574 	, m_viewportWidth				(caseType == CASETYPE_VERTEX	? 32	: renderCtx.getRenderTarget().getWidth())
575 	, m_viewportHeight				(caseType == CASETYPE_VERTEX	? 32	: renderCtx.getRenderTarget().getHeight())
576 	, m_gridSizeX					(caseType == CASETYPE_FRAGMENT	? 1		: 100)
577 	, m_gridSizeY					(caseType == CASETYPE_FRAGMENT	? 1		: 100)
578 {
579 	DE_ASSERT(m_numWorkloads > 0);
580 }
581 
582 OperatorPerformanceCase::~OperatorPerformanceCase (void)
583 {
584 	if (!m_attribBuffers.empty())
585 	{
586 		m_renderCtx.getFunctions().deleteBuffers((glw::GLsizei)m_attribBuffers.size(), &m_attribBuffers[0]);
587 		m_attribBuffers.clear();
588 	}
589 }
590 
591 static void logRenderTargetInfo (TestLog& log, const tcu::RenderTarget& renderTarget)
592 {
593 	log << TestLog::Section("RenderTarget", "Render target")
594 		<< TestLog::Message << "size: " << renderTarget.getWidth() << "x" << renderTarget.getHeight() << TestLog::EndMessage
595 		<< TestLog::Message << "bits:"
596 							<< " R" << renderTarget.getPixelFormat().redBits
597 							<< " G" << renderTarget.getPixelFormat().greenBits
598 							<< " B" << renderTarget.getPixelFormat().blueBits
599 							<< " A" << renderTarget.getPixelFormat().alphaBits
600 							<< " D" << renderTarget.getDepthBits()
601 							<< " S" << renderTarget.getStencilBits()
602 							<< TestLog::EndMessage;
603 
604 	if (renderTarget.getNumSamples() != 0)
605 		log << TestLog::Message << renderTarget.getNumSamples() << "x MSAA" << TestLog::EndMessage;
606 	else
607 		log << TestLog::Message << "No MSAA" << TestLog::EndMessage;
608 
609 	log << TestLog::EndSection;
610 }
611 
612 vector<Vec2> OperatorPerformanceCase::getWorkloadMedianDataPoints (int progNdx) const
613 {
614 	const vector<WorkloadRecord>&	records = m_workloadRecords[progNdx];
615 	vector<Vec2>					result;
616 
617 	for (int i = 0; i < (int)records.size(); i++)
618 		result.push_back(Vec2((float)records[i].workloadSize, records[i].getMedianTime()));
619 
620 	return result;
621 }
622 
623 void OperatorPerformanceCase::prepareProgram (int progNdx)
624 {
625 	DE_ASSERT(progNdx < (int)m_programs.size());
626 	DE_ASSERT(m_programData.size() == m_programs.size());
627 
628 	const glw::Functions&	gl			= m_renderCtx.getFunctions();
629 	const ShaderProgram&	program		= *m_programs[progNdx];
630 
631 	vector<AttribSpec>		attributes	= m_programData[progNdx].attributes;
632 
633 	attributes.push_back(AttribSpec("a_position",
634 									Vec4(-1.0f, -1.0f, 0.0f, 1.0f),
635 									Vec4( 1.0f, -1.0f, 0.0f, 1.0f),
636 									Vec4(-1.0f,  1.0f, 0.0f, 1.0f),
637 									Vec4( 1.0f,  1.0f, 0.0f, 1.0f)));
638 
639 	DE_ASSERT(program.isOk());
640 
641 	// Generate vertices.
642 	if (!m_attribBuffers.empty())
643 		gl.deleteBuffers((glw::GLsizei)m_attribBuffers.size(), &m_attribBuffers[0]);
644 	m_attribBuffers.resize(attributes.size(), 0);
645 	gl.genBuffers((glw::GLsizei)m_attribBuffers.size(), &m_attribBuffers[0]);
646 	GLU_EXPECT_NO_ERROR(gl.getError(), "glGenBuffers()");
647 
648 	for (int attribNdx = 0; attribNdx < (int)attributes.size(); attribNdx++)
649 	{
650 		std::vector<float> vertices;
651 		generateVertices(vertices, m_gridSizeX, m_gridSizeY, attributes[attribNdx]);
652 
653 		gl.bindBuffer(GL_ARRAY_BUFFER, m_attribBuffers[attribNdx]);
654 		gl.bufferData(GL_ARRAY_BUFFER, (glw::GLsizeiptr)(vertices.size()*sizeof(float)), &vertices[0], GL_STATIC_DRAW);
655 		GLU_EXPECT_NO_ERROR(gl.getError(), "Upload buffer data");
656 	}
657 
658 	// Setup attribute bindings.
659 	for (int attribNdx = 0; attribNdx < (int)attributes.size(); attribNdx++)
660 	{
661 		int location = gl.getAttribLocation(program.getProgram(), attributes[attribNdx].name.c_str());
662 
663 		if (location >= 0)
664 		{
665 			gl.enableVertexAttribArray(location);
666 			gl.bindBuffer(GL_ARRAY_BUFFER, m_attribBuffers[attribNdx]);
667 			gl.vertexAttribPointer(location, 4, GL_FLOAT, GL_FALSE, 0, DE_NULL);
668 		}
669 	}
670 	GLU_EXPECT_NO_ERROR(gl.getError(), "Setup vertex input state");
671 
672 	gl.useProgram(program.getProgram());
673 	setGeneralUniforms(program.getProgram());
674 	gl.viewport(0, 0, m_viewportWidth, m_viewportHeight);
675 }
676 
677 void OperatorPerformanceCase::prepareWorkload (int progNdx, int workload)
678 {
679 	setWorkloadSizeUniform(m_programs[progNdx]->getProgram(), workload);
680 	render(m_calibrator.getCallCount());
681 }
682 
683 void OperatorPerformanceCase::prepareNextRound (void)
684 {
685 	DE_ASSERT(m_state == STATE_CALIBRATING			||
686 			  m_state == STATE_FIND_HIGH_WORKLOAD	||
687 			  m_state == STATE_MEASURING);
688 
689 	TestLog& log = m_testCtx.getLog();
690 
691 	if (m_state == STATE_CALIBRATING && m_calibrator.getState() == TheilSenCalibrator::STATE_FINISHED)
692 	{
693 		m_measureProgramNdx = 0;
694 		m_state = STATE_FIND_HIGH_WORKLOAD;
695 	}
696 
697 	if (m_state == STATE_CALIBRATING)
698 		prepareWorkload(0, 1);
699 	else if (m_state == STATE_FIND_HIGH_WORKLOAD)
700 	{
701 		vector<WorkloadRecord>& records = m_workloadRecordsFindHigh[m_measureProgramNdx];
702 
703 		if (records.empty() || records.back().getMedianTime() < 2.0f*records[0].getMedianTime())
704 		{
705 			int workloadSize;
706 
707 			if (records.empty())
708 				workloadSize = 1;
709 			else
710 			{
711 				workloadSize = records.back().workloadSize*2;
712 
713 				if (workloadSize > MAX_WORKLOAD_SIZE)
714 				{
715 					log << TestLog::Message << "Even workload size " << records.back().workloadSize
716 											<< " doesn't give high enough frame time for program " << m_measureProgramNdx
717 											<< ". Can't get sensible result." << TestLog::EndMessage;
718 					MEASUREMENT_FAIL();
719 				}
720 			}
721 
722 			records.push_back(WorkloadRecord(workloadSize));
723 			prepareWorkload(0, workloadSize);
724 			m_workloadMeasurementNdx = 0;
725 		}
726 		else
727 		{
728 			m_highWorkloadSizes[m_measureProgramNdx] = records.back().workloadSize;
729 			m_measureProgramNdx++;
730 
731 			if (m_measureProgramNdx >= (int)m_programs.size())
732 			{
733 				m_state = STATE_MEASURING;
734 				m_workloadNdx = -1;
735 				m_measureProgramNdx = 0;
736 			}
737 
738 			prepareProgram(m_measureProgramNdx);
739 			prepareNextRound();
740 		}
741 	}
742 	else
743 	{
744 		m_workloadNdx++;
745 
746 		if (m_workloadNdx < m_numWorkloads)
747 		{
748 			DE_ASSERT(m_numWorkloads > 1);
749 			const int highWorkload	= m_highWorkloadSizes[m_measureProgramNdx];
750 			const int workload		= highWorkload > m_numWorkloads ?
751 										1 + m_workloadNdx*(highWorkload-1)/(m_numWorkloads-1) :
752 										1 + m_workloadNdx;
753 
754 			prepareWorkload(m_measureProgramNdx, workload);
755 
756 			m_workloadMeasurementNdx = 0;
757 
758 			m_workloadRecords[m_measureProgramNdx].push_back(WorkloadRecord(workload));
759 		}
760 		else
761 		{
762 			m_measureProgramNdx++;
763 
764 			if (m_measureProgramNdx < (int)m_programs.size())
765 			{
766 				m_workloadNdx = -1;
767 				m_workloadMeasurementNdx = 0;
768 				prepareProgram(m_measureProgramNdx);
769 				prepareNextRound();
770 			}
771 			else
772 				m_state = STATE_REPORTING;
773 		}
774 	}
775 }
776 
777 void OperatorPerformanceCase::init (void)
778 {
779 	TestLog&				log		= m_testCtx.getLog();
780 	const glw::Functions&	gl		= m_renderCtx.getFunctions();
781 
782 	// Validate that we have sane grid and viewport setup.
783 	DE_ASSERT(de::inBounds(m_gridSizeX, 1, 256) && de::inBounds(m_gridSizeY, 1, 256));
784 	TCU_CHECK(de::inRange(m_viewportWidth,	1, m_renderCtx.getRenderTarget().getWidth()) &&
785 			  de::inRange(m_viewportHeight,	1, m_renderCtx.getRenderTarget().getHeight()));
786 
787 	logRenderTargetInfo(log, m_renderCtx.getRenderTarget());
788 
789 	log << TestLog::Message << "Using additive blending." << TestLog::EndMessage;
790 	gl.enable(GL_BLEND);
791 	gl.blendEquation(GL_FUNC_ADD);
792 	gl.blendFunc(GL_ONE, GL_ONE);
793 
794 	// Generate programs.
795 	DE_ASSERT(m_programs.empty());
796 	m_programData = generateProgramData();
797 	DE_ASSERT(!m_programData.empty());
798 
799 	for (int progNdx = 0; progNdx < (int)m_programData.size(); progNdx++)
800 	{
801 		const string& vert = m_programData[progNdx].vertShaderSource;
802 		const string& frag = m_programData[progNdx].fragShaderSource;
803 
804 		m_programs.push_back(SharedPtr<ShaderProgram>(new ShaderProgram(m_renderCtx, glu::makeVtxFragSources(vert, frag))));
805 
806 		if (!m_programs.back()->isOk())
807 		{
808 			log << *m_programs.back();
809 			TCU_FAIL("Compile failed");
810 		}
811 	}
812 
813 	// Log all programs.
814 	for (int progNdx = 0; progNdx < (int)m_programs.size(); progNdx++)
815 		log << TestLog::Section("Program" + de::toString(progNdx), "Program " + de::toString(progNdx))
816 				<< TestLog::Message << m_programData[progNdx].description << TestLog::EndMessage
817 				<< *m_programs[progNdx]
818 			<< TestLog::EndSection;
819 
820 	m_highWorkloadSizes.resize(m_programData.size());
821 	m_workloadRecordsFindHigh.resize(m_programData.size());
822 	m_workloadRecords.resize(m_programData.size());
823 
824 	m_calibrator.clear(CalibratorParameters(m_initialCalibrationStorage->initialNumCalls, 10 /* calibrate iteration frames */, 2000.0f /* calibrate iteration shortcut threshold (ms) */, 16 /* max calibrate iterations */,
825 											1000.0f/30.0f /* frame time (ms) */, 1000.0f/60.0f /* frame time cap (ms) */, 1000.0f /* target measure duration (ms) */));
826 	m_state = STATE_CALIBRATING;
827 
828 	prepareProgram(0);
829 	prepareNextRound();
830 }
831 
832 void OperatorPerformanceCase::deinit (void)
833 {
834 	if (!m_attribBuffers.empty())
835 	{
836 		m_renderCtx.getFunctions().deleteBuffers((glw::GLsizei)m_attribBuffers.size(), &m_attribBuffers[0]);
837 		m_attribBuffers.clear();
838 	}
839 
840 	m_programs.clear();
841 }
842 
843 void OperatorPerformanceCase::render (int numDrawCalls)
844 {
845 	const glw::Functions&	gl				= m_renderCtx.getFunctions();
846 	const int				numVertices		= getNumVertices(m_gridSizeX, m_gridSizeY);
847 
848 	for (int callNdx = 0; callNdx < numDrawCalls; callNdx++)
849 		gl.drawArrays(GL_TRIANGLES, 0, numVertices);
850 
851 	glu::readPixels(m_renderCtx, 0, 0, tcu::Surface(1, 1).getAccess()); // \note Serves as a more reliable replacement for glFinish().
852 }
853 
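// Renders with the given number of draw calls and returns the elapsed time in microseconds.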
854 deUint64 OperatorPerformanceCase::renderAndMeasure (int numDrawCalls)
855 {
856 	const deUint64 startTime = deGetMicroseconds();
857 	render(numDrawCalls);
858 	return deGetMicroseconds() - startTime;
859 }
860 
861 void OperatorPerformanceCase::adjustAndLogGridAndViewport (void)
862 {
863 	TestLog& log = m_testCtx.getLog();
864 
865 	// If call count is just 1, and the target frame time still wasn't reached, reduce grid or viewport size.
866 	if (m_calibrator.getCallCount() == 1)
867 	{
868 		const gls::MeasureState&	calibratorMeasure	= m_calibrator.getMeasureState();
869 		const float					drawCallTime		= (float)calibratorMeasure.getTotalTime() / (float)calibratorMeasure.frameTimes.size();
870 		const float					targetDrawCallTime	= m_calibrator.getParameters().targetFrameTimeUs;
871 		const float					targetRatio			= targetDrawCallTime / drawCallTime;
872 
873 		if (targetRatio < 0.95f)
874 		{
875 			// Reduce grid or viewport size assuming draw call time scales proportionally.
876 			if (m_caseType == CASETYPE_VERTEX)
877 			{
878 				const float targetRatioSqrt = deFloatSqrt(targetRatio);
879 				m_gridSizeX = (int)(targetRatioSqrt * (float)m_gridSizeX);
880 				m_gridSizeY = (int)(targetRatioSqrt * (float)m_gridSizeY);
881 				TCU_CHECK_MSG(m_gridSizeX >= 1 && m_gridSizeY >= 1, "Can't decrease grid size enough to achieve low-enough draw times");
882 				log << TestLog::Message << "Note: triangle grid size reduced from original; it's now smaller than during calibration." << TestLog::EndMessage;
883 			}
884 			else
885 			{
886 				const float targetRatioSqrt = deFloatSqrt(targetRatio);
887 				m_viewportWidth  = (int)(targetRatioSqrt * (float)m_viewportWidth);
888 				m_viewportHeight = (int)(targetRatioSqrt * (float)m_viewportHeight);
889 				TCU_CHECK_MSG(m_viewportWidth >= 1 && m_viewportHeight >= 1, "Can't decrease viewport size enough to achieve low-enough draw times");
890 				log << TestLog::Message << "Note: viewport size reduced from original; it's now smaller than during calibration." << TestLog::EndMessage;
891 			}
892 		}
893 	}
894 
895 	prepareProgram(0);
896 
897 	// Log grid and viewport sizes.
898 	log << TestLog::Message << "Grid size: " << m_gridSizeX << "x" << m_gridSizeY << TestLog::EndMessage;
899 	log << TestLog::Message << "Viewport: " << m_viewportWidth << "x" << m_viewportHeight << TestLog::EndMessage;
900 }
901 
902 OperatorPerformanceCase::IterateResult OperatorPerformanceCase::iterate (void)
903 {
904 	const TheilSenCalibrator::State calibratorState = m_calibrator.getState();
905 
906 	if (calibratorState != TheilSenCalibrator::STATE_FINISHED)
907 	{
908 		if (calibratorState == TheilSenCalibrator::STATE_RECOMPUTE_PARAMS)
909 			m_calibrator.recomputeParameters();
910 		else if (calibratorState == TheilSenCalibrator::STATE_MEASURE)
911 			m_calibrator.recordIteration(renderAndMeasure(m_calibrator.getCallCount()));
912 		else
913 			DE_ASSERT(false);
914 
915 		if (m_calibrator.getState() == TheilSenCalibrator::STATE_FINISHED)
916 		{
917 			logCalibrationInfo(m_testCtx.getLog(), m_calibrator);
918 			adjustAndLogGridAndViewport();
919 			prepareNextRound();
920 			m_initialCalibrationStorage->initialNumCalls = m_calibrator.getCallCount();
921 		}
922 	}
923 	else if (m_state == STATE_FIND_HIGH_WORKLOAD || m_state == STATE_MEASURING)
924 	{
925 		if (m_workloadMeasurementNdx < m_numMeasurementsPerWorkload)
926 		{
927 			vector<WorkloadRecord>& records = m_state == STATE_FIND_HIGH_WORKLOAD ? m_workloadRecordsFindHigh[m_measureProgramNdx] : m_workloadRecords[m_measureProgramNdx];
928 			records.back().addFrameTime((float)renderAndMeasure(m_calibrator.getCallCount()));
929 			m_workloadMeasurementNdx++;
930 		}
931 		else
932 			prepareNextRound();
933 	}
934 	else
935 	{
936 		DE_ASSERT(m_state == STATE_REPORTING);
937 
938 		TestLog&	log				= m_testCtx.getLog();
939 		const int	drawCallCount	= m_calibrator.getCallCount();
940 
941 		{
942 			// Compute per-program estimators for measurements.
943 			vector<SegmentedEstimator> estimators;
944 			for (int progNdx = 0; progNdx < (int)m_programs.size(); progNdx++)
945 				estimators.push_back(computeSegmentedEstimator(getWorkloadMedianDataPoints(progNdx)));
946 
947 			// Log measurements and their estimators for all programs.
948 			for (int progNdx = 0; progNdx < (int)m_programs.size(); progNdx++)
949 			{
950 				const SegmentedEstimator&	estimator	= estimators[progNdx];
951 				const string				progNdxStr	= de::toString(progNdx);
952 				vector<WorkloadRecord>		records		= m_workloadRecords[progNdx];
953 				std::sort(records.begin(), records.end());
954 
955 				{
956 					const tcu::ScopedLogSection section(log,
957 														"Program" + progNdxStr + "Measurements",
958 														"Measurements for program " + progNdxStr);
959 
960 					// Sample list of individual frame times.
961 
962 					log << TestLog::SampleList("Program" + progNdxStr + "IndividualFrameTimes", "Individual frame times")
963 						<< TestLog::SampleInfo << TestLog::ValueInfo("Workload",	"Workload",		"",		QP_SAMPLE_VALUE_TAG_PREDICTOR)
964 											   << TestLog::ValueInfo("FrameTime",	"Frame time",	"us",	QP_SAMPLE_VALUE_TAG_RESPONSE)
965 						<< TestLog::EndSampleInfo;
966 
967 					for (int i = 0; i < (int)records.size(); i++)
968 						for (int j = 0; j < (int)records[i].frameTimes.size(); j++)
969 							log << TestLog::Sample << records[i].workloadSize << records[i].frameTimes[j] << TestLog::EndSample;
970 
971 					log << TestLog::EndSampleList;
972 
973 					// Sample list of median frame times.
974 
975 					log << TestLog::SampleList("Program" + progNdxStr + "MedianFrameTimes", "Median frame times")
976 						<< TestLog::SampleInfo << TestLog::ValueInfo("Workload",		"Workload",				"",		QP_SAMPLE_VALUE_TAG_PREDICTOR)
977 											   << TestLog::ValueInfo("MedianFrameTime",	"Median frame time",	"us",	QP_SAMPLE_VALUE_TAG_RESPONSE)
978 						<< TestLog::EndSampleInfo;
979 
980 					for (int i = 0; i < (int)records.size(); i++)
981 						log << TestLog::Sample << records[i].workloadSize << records[i].getMedianTime() << TestLog::EndSample;
982 
983 					log << TestLog::EndSampleList;
984 
985 					log << TestLog::Float("Program" + progNdxStr + "WorkloadCostEstimate", "Workload cost estimate", "us / workload", QP_KEY_TAG_TIME, estimator.right.coefficient);
986 
987 					if (estimator.pivotX > -std::numeric_limits<float>::infinity())
988 						log << TestLog::Message << "Note: the data points with x coordinate greater than or equal to " << estimator.pivotX
989 												<< " seem to form a rising line, and the rest of the data points seem to form a near-horizontal line" << TestLog::EndMessage
990 							<< TestLog::Message << "Note: the left line is estimated to be " << lineParamsString(estimator.left)
991 												<< " and the right line " << lineParamsString(estimator.right) << TestLog::EndMessage;
992 					else
993 						log << TestLog::Message << "Note: the data seem to form a single line: " << lineParamsString(estimator.right) << TestLog::EndMessage;
994 				}
995 			}
996 
997 			for (int progNdx = 0; progNdx < (int)m_programs.size(); progNdx++)
998 			{
999 				if (estimators[progNdx].right.coefficient <= 0.0f)
1000 				{
1001 					log << TestLog::Message << "Slope of measurements for program " << progNdx << " isn't positive. Can't get sensible result." << TestLog::EndMessage;
1002 					MEASUREMENT_FAIL();
1003 				}
1004 			}
1005 
1006 			// \note For each estimator, .right.coefficient is the increase in draw time (in microseconds) when
1007 			// incrementing shader workload size by 1, when D draw calls are done, with a vertex/fragment count
1008 			// of R.
1009 			//
1010 			// The measurements of any single program can't tell us the final result (time of single operation),
1011 			// so we use computeSingleOperationTime to compute it from multiple programs' measurements in a
1012 			// subclass-defined manner.
1013 			//
1014 			// After that, microseconds per operation can be calculated as singleOperationTime / (D * R).
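			//
			// For example (hypothetical numbers, for illustration only): singleOperationTime = 50 us with
			// D = 10 and R = 1024 would give 50 / (10 * 1024) us ≈ 4.9 ns per operation, i.e. about
			// 205 million operations per second.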
1015 
1016 			{
1017 				vector<float>	perProgramSlopes;
1018 				for (int i = 0; i < (int)m_programs.size(); i++)
1019 					perProgramSlopes.push_back(estimators[i].right.coefficient);
1020 
1021 				logSingleOperationCalculationInfo();
1022 
1023 				const float		maxSlope				= *std::max_element(perProgramSlopes.begin(), perProgramSlopes.end());
1024 				const float		usecsPerFramePerOp		= computeSingleOperationTime(perProgramSlopes);
1025 				const int		vertexOrFragmentCount	= m_caseType == CASETYPE_VERTEX ?
1026 															getNumVertices(m_gridSizeX, m_gridSizeY) :
1027 															m_viewportWidth*m_viewportHeight;
1028 				const double	usecsPerDrawCallPerOp	= usecsPerFramePerOp / (double)drawCallCount;
1029 				const double	usecsPerSingleOp		= usecsPerDrawCallPerOp / (double)vertexOrFragmentCount;
1030 				const double	megaOpsPerSecond		= (double)(drawCallCount*vertexOrFragmentCount) / usecsPerFramePerOp;
1031 				const int		numFreeOps				= de::max(0, (int)deFloatFloor(intersectionX(estimators[0].left,
1032 																									 LineParameters(estimators[0].right.offset,
1033 																													usecsPerFramePerOp))));
1034 
1035 				log << TestLog::Integer("VertexOrFragmentCount",
1036 										"R = " + string(m_caseType == CASETYPE_VERTEX ? "Vertex" : "Fragment") + " count",
1037 										"", QP_KEY_TAG_NONE, vertexOrFragmentCount)
1038 
1039 					<< TestLog::Integer("DrawCallsPerFrame", "D = Draw calls per frame", "", QP_KEY_TAG_NONE, drawCallCount)
1040 
1041 					<< TestLog::Integer("VerticesOrFragmentsPerFrame",
1042 										"R*D = " + string(m_caseType == CASETYPE_VERTEX ? "Vertices" : "Fragments") + " per frame",
1043 										"", QP_KEY_TAG_NONE, vertexOrFragmentCount*drawCallCount)
1044 
1045 					<< TestLog::Float("TimePerFramePerOp",
1046 									  "Estimated cost of R*D " + string(m_caseType == CASETYPE_VERTEX ? "vertices" : "fragments")
1047 									  + " (i.e. one frame) with one shader operation",
1048 									  "us", QP_KEY_TAG_TIME, (float)usecsPerFramePerOp)
1049 
1050 					<< TestLog::Float("TimePerDrawcallPerOp",
1051 									  "Estimated cost of one draw call with one shader operation",
1052 									  "us", QP_KEY_TAG_TIME, (float)usecsPerDrawCallPerOp)
1053 
1054 					<< TestLog::Float("TimePerSingleOp",
1055 									  "Estimated cost of a single shader operation",
1056 									  "us", QP_KEY_TAG_TIME, (float)usecsPerSingleOp);
1057 
1058 				// \note Sometimes, when the operation is free or very cheap, it can happen that the shader with the operation runs,
1059 				//		 for some reason, a bit faster than the shader without the operation, and thus we get a negative result. The
1060 				//		 following threshold values for accepting a negative or almost-zero result are rather quick and dirty.
1061 				if (usecsPerFramePerOp <= -0.1f*maxSlope)
1062 				{
1063 					log << TestLog::Message << "Got strongly negative result." << TestLog::EndMessage;
1064 					MEASUREMENT_FAIL();
1065 				}
1066 				else if (usecsPerFramePerOp <= 0.001*maxSlope)
1067 				{
1068 					log << TestLog::Message << "Cost of operation seems to be approximately zero." << TestLog::EndMessage;
1069 					m_testCtx.setTestResult(QP_TEST_RESULT_PASS, "Pass");
1070 				}
1071 				else
1072 				{
1073 					log << TestLog::Float("OpsPerSecond",
1074 										  "Operations per second",
1075 										  "Million/s", QP_KEY_TAG_PERFORMANCE, (float)megaOpsPerSecond)
1076 
1077 						<< TestLog::Integer("NumFreeOps",
1078 											"Estimated number of \"free\" operations",
1079 											"", QP_KEY_TAG_PERFORMANCE, numFreeOps);
1080 
1081 					m_testCtx.setTestResult(QP_TEST_RESULT_PASS, de::floatToString((float)megaOpsPerSecond, 2).c_str());
1082 				}
1083 
1084 				m_state = STATE_FINISHED;
1085 			}
1086 		}
1087 
1088 		return STOP;
1089 	}
1090 
1091 	return CONTINUE;
1092 }
1093 
1094 // Binary operator case.
1095 class BinaryOpCase : public OperatorPerformanceCase
1096 {
1097 public:
1098 						BinaryOpCase				(Context& context, const char* name, const char* description, const char* op,
1099 													 glu::DataType type, glu::Precision precision, bool useSwizzle, bool isVertex, const InitialCalibrationStorage& initialCalibration);
1100 
1101 protected:
1102 	vector<ProgramContext>	generateProgramData					(void) const;
1103 	void					setGeneralUniforms					(deUint32 program) const;
1104 	void					setWorkloadSizeUniform				(deUint32 program, int numOperations) const;
1105 	float					computeSingleOperationTime			(const vector<float>& perProgramOperationCosts) const;
1106 	void					logSingleOperationCalculationInfo	(void) const;
1107 
1108 private:
1109 	enum ProgramID
1110 	{
1111 		// \note 0-based sequential numbering is relevant, because these are also used as vector indices.
1112 		// \note The first program should be the heaviest, because OperatorPerformanceCase uses it to reduce grid/viewport size when going too slow.
1113 		PROGRAM_WITH_BIGGER_LOOP = 0,
1114 		PROGRAM_WITH_SMALLER_LOOP,
1115 
1116 		PROGRAM_LAST
1117 	};
1118 
1119 	ProgramContext			generateSingleProgramData		(ProgramID) const;
1120 
1121 	const string			m_op;
1122 	const glu::DataType		m_type;
1123 	const glu::Precision	m_precision;
1124 	const bool				m_useSwizzle;
1125 };
1126 
1127 BinaryOpCase::BinaryOpCase (Context& context, const char* name, const char* description, const char* op,
1128 							glu::DataType type, glu::Precision precision, bool useSwizzle, bool isVertex, const InitialCalibrationStorage& initialCalibration)
1129 	: OperatorPerformanceCase	(context.getTestContext(), context.getRenderContext(), name, description,
1130 								 isVertex ? CASETYPE_VERTEX : CASETYPE_FRAGMENT, NUM_WORKLOADS, initialCalibration)
1131 	, m_op						(op)
1132 	, m_type					(type)
1133 	, m_precision				(precision)
1134 	, m_useSwizzle				(useSwizzle)
1135 {
1136 }
1137 
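// Builds the vertex and fragment shader sources for one program variant. The measured shader runs a
// uniform-controlled loop whose body applies the binary operator to independent accumulator pairs;
// the loop body is unrolled 2x in the smaller-loop program and 4x in the bigger-loop program.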
1138 BinaryOpCase::ProgramContext BinaryOpCase::generateSingleProgramData (ProgramID programID) const
1139 {
1140 	DE_ASSERT(glu::isDataTypeFloatOrVec(m_type) || glu::isDataTypeIntOrIVec(m_type));
1141 
1142 	const bool			isVertexCase	= m_caseType == CASETYPE_VERTEX;
1143 	const char* const	precision		= glu::getPrecisionName(m_precision);
1144 	const char* const	inputPrecision	= glu::isDataTypeIntOrIVec(m_type) && m_precision == glu::PRECISION_LOWP ? "mediump" : precision;
1145 	const char* const	typeName		= getDataTypeName(m_type);
1146 
1147 	std::ostringstream	vtx;
1148 	std::ostringstream	frag;
1149 	std::ostringstream&	op				= isVertexCase ? vtx : frag;
1150 
1151 	vtx << "#version 300 es\n";
1152 	frag << "#version 300 es\n"
1153 		 << "layout (location = 0) out mediump vec4 o_color;\n";
1154 
1155 	// Attributes.
1156 	vtx << "in highp vec4 a_position;\n";
1157 	for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS+1; i++)
1158 		vtx << "in " << inputPrecision << " vec4 a_in" << i << ";\n";
1159 
1160 	if (isVertexCase)
1161 	{
1162 		vtx << "out mediump vec4 v_color;\n";
1163 		frag << "in mediump vec4 v_color;\n";
1164 	}
1165 	else
1166 	{
1167 		for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS+1; i++)
1168 		{
1169 			vtx << "out " << inputPrecision << " vec4 v_in" << i << ";\n";
1170 			frag << "in " << inputPrecision << " vec4 v_in" << i << ";\n";
1171 		}
1172 	}
1173 
1174 	op << "uniform mediump int u_numLoopIterations;\n";
1175 	if (isVertexCase)
1176 		op << "uniform mediump float u_zero;\n";
1177 
1178 	vtx << "\n";
1179 	vtx << "void main()\n";
1180 	vtx << "{\n";
1181 
1182 	if (!isVertexCase)
1183 		vtx << "\tgl_Position = a_position;\n";
1184 
1185 	frag << "\n";
1186 	frag << "void main()\n";
1187 	frag << "{\n";
1188 
1189 	// Expression inputs.
1190 	const char* const prefix = isVertexCase ? "a_" : "v_";
1191 	for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS+1; i++)
1192 	{
1193 		const int	inSize		= getDataTypeScalarSize(m_type);
1194 		const bool	isInt		= de::inRange<int>(m_type, TYPE_INT, TYPE_INT_VEC4);
1195 		const bool	cast		= isInt || (!m_useSwizzle && m_type != TYPE_FLOAT_VEC4);
1196 
1197 		op << "\t" << precision << " " << typeName << " in" << i << " = ";
1198 
1199 		if (cast)
1200 			op << typeName << "(";
1201 
1202 		op << prefix << "in" << i;
1203 
1204 		if (m_useSwizzle)
1205 			op << "." << s_swizzles[i % DE_LENGTH_OF_ARRAY(s_swizzles)][inSize-1];
1206 
1207 		if (cast)
1208 			op << ")";
1209 
1210 		op << ";\n";
1211 	}
1212 
1213 	// Operation accumulation variables.
1214 	for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS; i++)
1215 	{
1216 		op << "\t" << precision << " " << typeName << " acc" << i << "a" << " = in" << i+0 << ";\n";
1217 		op << "\t" << precision << " " << typeName << " acc" << i << "b" << " = in" << i+1 << ";\n";
1218 	}
1219 
1220 	// Loop, with expressions in it.
1221 	op << "\tfor (int i = 0; i < u_numLoopIterations; i++)\n";
1222 	op << "\t{\n";
1223 	{
1224 		const int unrollAmount = programID == PROGRAM_WITH_SMALLER_LOOP ? BINARY_OPERATOR_CASE_SMALL_PROGRAM_UNROLL_AMOUNT : BINARY_OPERATOR_CASE_BIG_PROGRAM_UNROLL_AMOUNT;
1225 		for (int unrollNdx = 0; unrollNdx < unrollAmount; unrollNdx++)
1226 		{
1227 			for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS; i++)
1228 			{
1229 				if (i > 0 || unrollNdx > 0)
1230 					op << "\n";
1231 				op << "\t\tacc" << i << "a = acc" << i << "b " << m_op << " acc" << i << "a" << ";\n";
1232 				op << "\t\tacc" << i << "b = acc" << i << "a " << m_op << " acc" << i << "b" << ";\n";
1233 			}
1234 		}
1235 	}
1236 	op << "\t}\n";
1237 	op << "\n";
1238 
1239 	// Result variable (sum of accumulation variables).
1240 	op << "\t" << precision << " " << typeName << " res =";
1241 	for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS; i++)
1242 		op << (i > 0 ? " "+m_op : "") << " acc" << i << "b";
1243 	op << ";\n";
1244 
1245 	// Convert to color.
1246 	op << "\tmediump vec4 color = ";
1247 	if (m_type == TYPE_FLOAT_VEC4)
1248 		op << "res";
1249 	else
1250 	{
1251 		int size = getDataTypeScalarSize(m_type);
1252 		op << "vec4(res";
1253 
1254 		for (int i = size; i < 4; i++)
1255 			op << ", " << (i == 3 ? "1.0" : "0.0");
1256 
1257 		op << ")";
1258 	}
1259 	op << ";\n";
1260 	op << "\t" << (isVertexCase ? "v_color" : "o_color") << " = color;\n";
1261 
1262 	if (isVertexCase)
1263 	{
1264 		vtx << "	gl_Position = a_position + u_zero*color;\n";
1265 		frag << "	o_color = v_color;\n";
1266 	}
1267 	else
1268 	{
1269 		for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS+1; i++)
1270 			vtx << "	v_in" << i << " = a_in" << i << ";\n";
1271 	}
1272 
1273 	vtx << "}\n";
1274 	frag << "}\n";
1275 
1276 	{
1277 		vector<AttribSpec> attributes;
1278 		for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS+1; i++)
1279 			attributes.push_back(AttribSpec(("a_in" + de::toString(i)).c_str(),
1280 											Vec4(2.0f, 2.0f, 2.0f, 1.0f).swizzle((i+0)%4, (i+1)%4, (i+2)%4, (i+3)%4),
1281 											Vec4(1.0f, 2.0f, 1.0f, 2.0f).swizzle((i+0)%4, (i+1)%4, (i+2)%4, (i+3)%4),
1282 											Vec4(2.0f, 1.0f, 2.0f, 2.0f).swizzle((i+0)%4, (i+1)%4, (i+2)%4, (i+3)%4),
1283 											Vec4(1.0f, 1.0f, 2.0f, 1.0f).swizzle((i+0)%4, (i+1)%4, (i+2)%4, (i+3)%4)));
1284 
1285 		{
1286 			string description = "This is the program with the ";
1287 
1288 			description += programID == PROGRAM_WITH_SMALLER_LOOP	? "smaller"
1289 						 : programID == PROGRAM_WITH_BIGGER_LOOP	? "bigger"
1290 						 : DE_NULL;
1291 
1292 			description += " loop.\n"
1293 						   "Note: workload size for this program means the number of loop iterations.";
1294 
1295 			return ProgramContext(vtx.str(), frag.str(), attributes, description);
1296 		}
1297 	}
1298 }
1299 
1300 vector<BinaryOpCase::ProgramContext> BinaryOpCase::generateProgramData (void) const
1301 {
1302 	vector<ProgramContext> progData;
1303 	for (int i = 0; i < PROGRAM_LAST; i++)
1304 		progData.push_back(generateSingleProgramData((ProgramID)i));
1305 	return progData;
1306 }
1307 
1308 void BinaryOpCase::setGeneralUniforms (deUint32 program) const
1309 {
1310 	const glw::Functions& gl = m_renderCtx.getFunctions();
1311 	gl.uniform1f(gl.getUniformLocation(program, "u_zero"), 0.0f);
1312 }
1313 
1314 void BinaryOpCase::setWorkloadSizeUniform (deUint32 program, int numLoopIterations) const
1315 {
1316 	const glw::Functions& gl = m_renderCtx.getFunctions();
1317 	gl.uniform1i(gl.getUniformLocation(program, "u_numLoopIterations"), numLoopIterations);
1318 }
1319 
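// Derives the cost of one operation from the two programs' per-workload-unit cost estimates: per loop
// iteration the bigger-loop program performs 2*4*4 = 32 operations and the smaller-loop program 2*4*2 = 16,
// so a single operation costs (biggerLoopCost - smallerLoopCost) / 16.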
1320 float BinaryOpCase::computeSingleOperationTime (const vector<float>& perProgramOperationCosts) const
1321 {
1322 	DE_ASSERT(perProgramOperationCosts.size() == PROGRAM_LAST);
1323 
1324 	const int		baseNumOpsInsideLoop				= 2 * BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS;
1325 	const int		numOpsInsideLoopInSmallProgram		= baseNumOpsInsideLoop * BINARY_OPERATOR_CASE_SMALL_PROGRAM_UNROLL_AMOUNT;
1326 	const int		numOpsInsideLoopInBigProgram		= baseNumOpsInsideLoop * BINARY_OPERATOR_CASE_BIG_PROGRAM_UNROLL_AMOUNT;
1327 	DE_STATIC_ASSERT(numOpsInsideLoopInBigProgram > numOpsInsideLoopInSmallProgram);
1328 	const int		opDiff								= numOpsInsideLoopInBigProgram - numOpsInsideLoopInSmallProgram;
1329 	const float		programOperationCostDiff			= perProgramOperationCosts[PROGRAM_WITH_BIGGER_LOOP] - perProgramOperationCosts[PROGRAM_WITH_SMALLER_LOOP];
1330 
1331 	return programOperationCostDiff / (float)opDiff;
1332 }
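// Worked example (derived from the constants above, not from measured data): with
// BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS = 4 the base count is 2*4 = 8 operations,
// so the small program performs 8*2 = 16 and the big program 8*4 = 32 operations per loop
// iteration, and the reported per-operation cost is (bigCost - smallCost) / 16.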
1333 
1334 void BinaryOpCase::logSingleOperationCalculationInfo (void) const
1335 {
1336 	const int			baseNumOpsInsideLoop			= 2 * BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS;
1337 	const int			numOpsInsideLoopInSmallProgram	= baseNumOpsInsideLoop * BINARY_OPERATOR_CASE_SMALL_PROGRAM_UNROLL_AMOUNT;
1338 	const int			numOpsInsideLoopInBigProgram	= baseNumOpsInsideLoop * BINARY_OPERATOR_CASE_BIG_PROGRAM_UNROLL_AMOUNT;
1339 	const int			opDiff							= numOpsInsideLoopInBigProgram - numOpsInsideLoopInSmallProgram;
1340 	const char* const	opName							= m_op == "+" ? "addition"
1341 														: m_op == "-" ? "subtraction"
1342 														: m_op == "*" ? "multiplication"
1343 														: m_op == "/" ? "division"
1344 														: DE_NULL;
1345 	DE_ASSERT(opName != DE_NULL);
1346 
1347 	m_testCtx.getLog() << TestLog::Message << "Note: the bigger program contains " << opDiff << " more "
1348 										   << opName << " operations in one loop iteration than the small program; "
1349 										   << "cost of one operation is calculated as (cost_of_bigger_workload - cost_of_smaller_workload) / " << opDiff
1350 										   << TestLog::EndMessage;
1351 }
1352 
1353 // Built-in function case.
1354 class FunctionCase : public OperatorPerformanceCase
1355 {
1356 public:
1357 	enum
1358 	{
1359 		MAX_PARAMS = 3
1360 	};
1361 
1362 						FunctionCase			(Context&							context,
1363 												 const char*						name,
1364 												 const char*						description,
1365 												 const char*						func,
1366 												 glu::DataType						returnType,
1367 												 const glu::DataType				paramTypes[MAX_PARAMS],
1368 												 const Vec4&						attribute,
1369 												 int								modifyParamNdx, //!< Add a compile-time constant (2.0) to the parameter at this index. This is ignored if negative.
1370 												 bool								useNearlyConstantInputs, //!< Function inputs shouldn't be much bigger than 'attribute'.
1371 												 glu::Precision						precision,
1372 												 bool								isVertex,
1373 												 const InitialCalibrationStorage&	initialCalibration);
1374 
1375 protected:
1376 	vector<ProgramContext>	generateProgramData					(void) const;
1377 	void					setGeneralUniforms					(deUint32 program) const;
1378 	void					setWorkloadSizeUniform				(deUint32 program, int numOperations) const;
1379 	float					computeSingleOperationTime			(const vector<float>& perProgramOperationCosts) const;
1380 	void					logSingleOperationCalculationInfo	(void) const;
1381 
1382 private:
1383 	enum ProgramID
1384 	{
1385 		// \note 0-based sequential numbering is relevant, because these are also used as vector indices.
1386 		// \note The first program should be the heaviest, because OperatorPerformanceCase uses it to reduce grid/viewport size when going too slow.
1387 		PROGRAM_WITH_FUNCTION_CALLS = 0,
1388 		PROGRAM_WITHOUT_FUNCTION_CALLS,
1389 
1390 		PROGRAM_LAST
1391 	};
1392 
1393 	//! Forms a "sum" expression from aExpr and bExpr; for booleans, this is "equal(a,b)", otherwise actual sum.
1394 	static string		sumExpr						(const string& aExpr, const string& bExpr, glu::DataType type);
1395 	//! Forms an expression used to increment an input value in the shader. If type is boolean, this is just
1396 	//! baseExpr; otherwise, baseExpr is modified by multiplication or division by a loop index,
1397 	//! to prevent simple compiler optimizations. See m_useNearlyConstantInputs for more explanation.
1398 	static string		incrementExpr				(const string& baseExpr, glu::DataType type, bool divide);
1399 
1400 	ProgramContext		generateSingleProgramData	(ProgramID) const;
1401 
1402 	const string			m_func;
1403 	const glu::DataType		m_returnType;
1404 	glu::DataType			m_paramTypes[MAX_PARAMS];
1405 	// \note m_modifyParamNdx, if not negative, specifies the index of the parameter to which a
1406 	//		 compile-time constant (2.0) is added. This is a quick and dirty way to deal with
1407 	//		 functions like clamp or smoothstep that require that a certain parameter is
1408 	//		 greater than a certain other parameter.
1409 	const int				m_modifyParamNdx;
1410 	// \note m_useNearlyConstantInputs determines whether the inputs given to the function
1411 	//		 should increase (w.r.t m_attribute) only by very small amounts. This is relevant
1412 	//		 for functions like asin, which requires its inputs to be in a specific range.
1413 	//		 In practice, this affects whether expressions used to increment the input
1414 	//		 variables use division instead of multiplication; normally, multiplication is used,
1415 	//		 but it's hard to keep the increments very small that way, and division shouldn't
1416 	//		 be the default, since for many functions (probably not asin, luckily), division
1417 	//		 is too heavy and dominates time-wise.
1418 	const bool				m_useNearlyConstantInputs;
1419 	const Vec4				m_attribute;
1420 	const glu::Precision	m_precision;
1421 };
1422 
1423 FunctionCase::FunctionCase (Context&							context,
1424 							const char*							name,
1425 							const char*							description,
1426 							const char*							func,
1427 							glu::DataType						returnType,
1428 							const glu::DataType					paramTypes[MAX_PARAMS],
1429 							const Vec4&							attribute,
1430 							int									modifyParamNdx,
1431 							bool								useNearlyConstantInputs,
1432 							glu::Precision						precision,
1433 							bool								isVertex,
1434 							const InitialCalibrationStorage&	initialCalibration)
1435 	: OperatorPerformanceCase	(context.getTestContext(), context.getRenderContext(), name, description,
1436 								 isVertex ? CASETYPE_VERTEX : CASETYPE_FRAGMENT, NUM_WORKLOADS, initialCalibration)
1437 	, m_func					(func)
1438 	, m_returnType				(returnType)
1439 	, m_modifyParamNdx			(modifyParamNdx)
1440 	, m_useNearlyConstantInputs	(useNearlyConstantInputs)
1441 	, m_attribute				(attribute)
1442 	, m_precision				(precision)
1443 {
1444 	for (int i = 0; i < MAX_PARAMS; i++)
1445 		m_paramTypes[i] = paramTypes[i];
1446 }
1447 
1448 string FunctionCase::sumExpr (const string& aExpr, const string& bExpr, glu::DataType type)
1449 {
1450 	if (glu::isDataTypeBoolOrBVec(type))
1451 	{
1452 		if (type == glu::TYPE_BOOL)
1453 			return "(" + aExpr + " == " + bExpr + ")";
1454 		else
1455 			return "equal(" + aExpr + ", " + bExpr + ")";
1456 	}
1457 	else
1458 		return "(" + aExpr + " + " + bExpr + ")";
1459 }
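// For example (illustrative only): sumExpr("a", "b", glu::TYPE_FLOAT_VEC4) yields "(a + b)",
// sumExpr("a", "b", glu::TYPE_BOOL) yields "(a == b)", and sumExpr("a", "b", glu::TYPE_BOOL_VEC3)
// yields "equal(a, b)".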
1460 
1461 string FunctionCase::incrementExpr (const string& baseExpr, glu::DataType type, bool divide)
1462 {
1463 	const string mulOrDiv = divide ? "/" : "*";
1464 
1465 	return glu::isDataTypeBoolOrBVec(type)	? baseExpr
1466 		 : glu::isDataTypeIntOrIVec(type)	? "(" + baseExpr + mulOrDiv + "(i+1))"
1467 		 :									  "(" + baseExpr + mulOrDiv + "float(i+1))";
1468 }
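// For example (illustrative only): incrementExpr("u_incA", glu::TYPE_FLOAT_VEC4, false) yields
// "(u_incA*float(i+1))", the divide variant yields "(u_incA/float(i+1))", and boolean types
// return the base expression unchanged.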
1469 
1470 FunctionCase::ProgramContext FunctionCase::generateSingleProgramData (ProgramID programID) const
1471 {
1472 	const bool			isVertexCase			= m_caseType == CASETYPE_VERTEX;
1473 	const char* const	precision				= glu::getPrecisionName(m_precision);
1474 	const char* const	returnTypeName			= getDataTypeName(m_returnType);
1475 	const string		returnPrecisionMaybe	= glu::isDataTypeBoolOrBVec(m_returnType) ? "" : string() + precision + " ";
1476 	const char*			inputPrecision			= DE_NULL;
1477 	const bool			isMatrixReturn			= isDataTypeMatrix(m_returnType);
1478 	int					numParams				= 0;
1479 	const char*			paramTypeNames[MAX_PARAMS];
1480 	string				paramPrecisionsMaybe[MAX_PARAMS];
1481 
1482 	for (int i = 0; i < MAX_PARAMS; i++)
1483 	{
1484 		paramTypeNames[i]			= getDataTypeName(m_paramTypes[i]);
1485 		paramPrecisionsMaybe[i]		= glu::isDataTypeBoolOrBVec(m_paramTypes[i]) ? "" : string() + precision + " ";
1486 
1487 		if (inputPrecision == DE_NULL && isDataTypeIntOrIVec(m_paramTypes[i]) && m_precision == glu::PRECISION_LOWP)
1488 			inputPrecision = "mediump";
1489 
1490 		if (m_paramTypes[i] != TYPE_INVALID)
1491 			numParams = i+1;
1492 	}
1493 
1494 	DE_ASSERT(numParams > 0);
1495 
1496 	if (inputPrecision == DE_NULL)
1497 		inputPrecision = precision;
1498 
1499 	int						numAttributes	= FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS + numParams - 1;
1500 	std::ostringstream		vtx;
1501 	std::ostringstream		frag;
1502 	std::ostringstream&		op				= isVertexCase ? vtx : frag;
1503 
1504 	vtx << "#version 300 es\n";
1505 	frag << "#version 300 es\n"
1506 		 << "layout (location = 0) out mediump vec4 o_color;\n";
1507 
1508 	// Attributes.
1509 	vtx << "in highp vec4 a_position;\n";
1510 	for (int i = 0; i < numAttributes; i++)
1511 		vtx << "in " << inputPrecision << " vec4 a_in" << i << ";\n";
1512 
1513 	if (isVertexCase)
1514 	{
1515 		vtx << "out mediump vec4 v_color;\n";
1516 		frag << "in mediump vec4 v_color;\n";
1517 	}
1518 	else
1519 	{
1520 		for (int i = 0; i < numAttributes; i++)
1521 		{
1522 			vtx << "out " << inputPrecision << " vec4 v_in" << i << ";\n";
1523 			frag << "in " << inputPrecision << " vec4 v_in" << i << ";\n";
1524 		}
1525 	}
1526 
1527 	op << "uniform mediump int u_numLoopIterations;\n";
1528 	if (isVertexCase)
1529 		op << "uniform mediump float u_zero;\n";
1530 
1531 	for (int paramNdx = 0; paramNdx < numParams; paramNdx++)
1532 		op << "uniform " << paramPrecisionsMaybe[paramNdx] << paramTypeNames[paramNdx] << " u_inc" << (char)('A'+paramNdx) << ";\n";
1533 
1534 	vtx << "\n";
1535 	vtx << "void main()\n";
1536 	vtx << "{\n";
1537 
1538 	if (!isVertexCase)
1539 		vtx << "\tgl_Position = a_position;\n";
1540 
1541 	frag << "\n";
1542 	frag << "void main()\n";
1543 	frag << "{\n";
1544 
1545 	// Function call input and return value accumulation variables.
1546 	{
1547 		const char* const inPrefix = isVertexCase ? "a_" : "v_";
1548 
1549 		for (int calcNdx = 0; calcNdx < FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS; calcNdx++)
1550 		{
1551 			for (int paramNdx = 0; paramNdx < numParams; paramNdx++)
1552 			{
1553 				const glu::DataType		paramType	= m_paramTypes[paramNdx];
1554 				const bool				mustCast	= paramType != glu::TYPE_FLOAT_VEC4;
1555 
1556 				op << "\t" << paramPrecisionsMaybe[paramNdx] << paramTypeNames[paramNdx] << " in" << calcNdx << (char)('a'+paramNdx) << " = ";
1557 
1558 				if (mustCast)
1559 					op << paramTypeNames[paramNdx] << "(";
1560 
1561 				if (glu::isDataTypeMatrix(paramType))
1562 				{
1563 					static const char* const	swizzles[3]		= { "x", "xy", "xyz" };
1564 					const int					numRows			= glu::getDataTypeMatrixNumRows(paramType);
1565 					const int					numCols			= glu::getDataTypeMatrixNumColumns(paramType);
1566 					const string				swizzle			= numRows < 4 ? string() + "." + swizzles[numRows-1] : "";
1567 
1568 					for (int i = 0; i < numCols; i++)
1569 						op << (i > 0 ? ", " : "") << inPrefix << "in" << calcNdx+paramNdx << swizzle;
1570 				}
1571 				else
1572 				{
1573 					op << inPrefix << "in" << calcNdx+paramNdx;
1574 
1575 					if (paramNdx == m_modifyParamNdx)
1576 					{
1577 						DE_ASSERT(glu::isDataTypeFloatOrVec(paramType));
1578 						op << " + 2.0";
1579 					}
1580 				}
1581 
1582 				if (mustCast)
1583 					op << ")";
1584 
1585 				op << ";\n";
1586 			}
1587 
1588 			op << "\t" << returnPrecisionMaybe << returnTypeName << " res" << calcNdx << " = " << returnTypeName << "(0);\n";
1589 		}
1590 	}
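	// Sketch of the declarations emitted above (an assumed example for the vertex case with a
	// unary mediump float function and no modified parameter):
	//   mediump float in0a = float(a_in0);
	//   mediump float res0 = float(0);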
1591 
1592 	// Loop with expressions in it.
1593 	op << "\tfor (int i = 0; i < u_numLoopIterations; i++)\n";
1594 	op << "\t{\n";
1595 	for (int calcNdx = 0; calcNdx < FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS; calcNdx++)
1596 	{
1597 		if (calcNdx > 0)
1598 			op << "\n";
1599 
1600 		op << "\t\t{\n";
1601 
1602 		for (int inputNdx = 0; inputNdx < numParams; inputNdx++)
1603 		{
1604 			const string inputName	= "in" + de::toString(calcNdx) + (char)('a'+inputNdx);
1605 			const string incName	= string() + "u_inc" + (char)('A'+inputNdx);
1606 			const string incExpr	= incrementExpr(incName, m_paramTypes[inputNdx], m_useNearlyConstantInputs);
1607 
1608 			op << "\t\t\t" << inputName << " = " << sumExpr(inputName, incExpr, m_paramTypes[inputNdx]) << ";\n";
1609 		}
1610 
1611 		op << "\t\t\t" << returnPrecisionMaybe << returnTypeName << " eval" << calcNdx << " = ";
1612 
1613 		if (programID == PROGRAM_WITH_FUNCTION_CALLS)
1614 		{
1615 			op << m_func << "(";
1616 
1617 			for (int paramNdx = 0; paramNdx < numParams; paramNdx++)
1618 			{
1619 				if (paramNdx > 0)
1620 					op << ", ";
1621 
1622 				op << "in" << calcNdx << (char)('a'+paramNdx);
1623 			}
1624 
1625 			op << ")";
1626 		}
1627 		else
1628 		{
1629 			DE_ASSERT(programID == PROGRAM_WITHOUT_FUNCTION_CALLS);
1630 			op << returnTypeName << "(1)";
1631 		}
1632 
1633 		op << ";\n";
1634 
1635 		{
1636 			const string resName	= "res" + de::toString(calcNdx);
1637 			const string evalName	= "eval" + de::toString(calcNdx);
1638 			const string incExpr	= incrementExpr(evalName, m_returnType, m_useNearlyConstantInputs);
1639 
1640 			op << "\t\t\tres" << calcNdx << " = " << sumExpr(resName, incExpr, m_returnType) << ";\n";
1641 		}
1642 
1643 		op << "\t\t}\n";
1644 	}
1645 	op << "\t}\n";
1646 	op << "\n";
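	// Sketch of one generated iteration block for a unary mediump float function such as 'sin'
	// (assumed example; PROGRAM_WITHOUT_FUNCTION_CALLS replaces the call with "float(1)"):
	//   in0a = (in0a + (u_incA*float(i+1)));
	//   mediump float eval0 = sin(in0a);
	//   res0 = (res0 + (eval0*float(i+1)));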
1647 
1648 	// Result variables.
1649 	for (int inputNdx = 0; inputNdx < numParams; inputNdx++)
1650 	{
1651 		op << "\t" << paramPrecisionsMaybe[inputNdx] << paramTypeNames[inputNdx] << " sumIn" << (char)('A'+inputNdx) << " = ";
1652 		{
1653 			string expr = string() + "in0" + (char)('a'+inputNdx);
1654 			for (int i = 1; i < FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS; i++)
1655 				expr = sumExpr(expr, string() + "in" + de::toString(i) + (char)('a'+inputNdx), m_paramTypes[inputNdx]);
1656 			op << expr;
1657 		}
1658 		op << ";\n";
1659 	}
1660 
1661 	op << "\t" << returnPrecisionMaybe << returnTypeName << " sumRes = ";
1662 	{
1663 		string expr = "res0";
1664 		for (int i = 1; i < FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS; i++)
1665 			expr = sumExpr(expr, "res" + de::toString(i), m_returnType);
1666 		op << expr;
1667 	}
1668 	op << ";\n";
1669 
1670 	{
1671 		glu::DataType finalResultDataType = glu::TYPE_LAST;
1672 
1673 		if (glu::isDataTypeMatrix(m_returnType))
1674 		{
1675 			finalResultDataType = m_returnType;
1676 
1677 			op << "\t" << precision << " " << returnTypeName << " finalRes = ";
1678 
1679 			for (int inputNdx = 0; inputNdx < numParams; inputNdx++)
1680 			{
1681 				DE_ASSERT(m_paramTypes[inputNdx] == m_returnType);
1682 				op << "sumIn" << (char)('A'+inputNdx) << " + ";
1683 			}
1684 			op << "sumRes;\n";
1685 		}
1686 		else
1687 		{
1688 			int numFinalResComponents = glu::getDataTypeScalarSize(m_returnType);
1689 			for (int inputNdx = 0; inputNdx < numParams; inputNdx++)
1690 				numFinalResComponents = de::max(numFinalResComponents, glu::getDataTypeScalarSize(m_paramTypes[inputNdx]));
1691 
1692 			finalResultDataType = getDataTypeFloatOrVec(numFinalResComponents);
1693 
1694 			{
1695 				const string finalResType = glu::getDataTypeName(finalResultDataType);
1696 				op << "\t" << precision << " " << finalResType << " finalRes = ";
1697 				for (int inputNdx = 0; inputNdx < numParams; inputNdx++)
1698 					op << finalResType << "(sumIn" << (char)('A'+inputNdx) << ") + ";
1699 				op << finalResType << "(sumRes);\n";
1700 			}
1701 		}
1702 
1703 		// Convert to color.
1704 		op << "\tmediump vec4 color = ";
1705 		if (finalResultDataType == TYPE_FLOAT_VEC4)
1706 			op << "finalRes";
1707 		else
1708 		{
1709 			int size = isMatrixReturn ? getDataTypeMatrixNumRows(finalResultDataType) : getDataTypeScalarSize(finalResultDataType);
1710 
1711 			op << "vec4(";
1712 
1713 			if (isMatrixReturn)
1714 			{
1715 				for (int i = 0; i < getDataTypeMatrixNumColumns(finalResultDataType); i++)
1716 				{
1717 					if (i > 0)
1718 						op << " + ";
1719 					op << "finalRes[" << i << "]";
1720 				}
1721 			}
1722 			else
1723 				op << "finalRes";
1724 
1725 			for (int i = size; i < 4; i++)
1726 				op << ", " << (i == 3 ? "1.0" : "0.0");
1727 
1728 			op << ")";
1729 		}
1730 		op << ";\n";
1731 		op << "\t" << (isVertexCase ? "v_color" : "o_color") << " = color;\n";
1732 
1733 		if (isVertexCase)
1734 		{
1735 			vtx << "	gl_Position = a_position + u_zero*color;\n";
1736 			frag << "	o_color = v_color;\n";
1737 		}
1738 		else
1739 		{
1740 			for (int i = 0; i < numAttributes; i++)
1741 				vtx << "	v_in" << i << " = a_in" << i << ";\n";
1742 		}
1743 
1744 		vtx << "}\n";
1745 		frag << "}\n";
1746 	}
1747 
1748 	{
1749 		vector<AttribSpec> attributes;
1750 		for (int i = 0; i < numAttributes; i++)
1751 			attributes.push_back(AttribSpec(("a_in" + de::toString(i)).c_str(),
1752 											m_attribute.swizzle((i+0)%4, (i+1)%4, (i+2)%4, (i+3)%4),
1753 											m_attribute.swizzle((i+1)%4, (i+2)%4, (i+3)%4, (i+0)%4),
1754 											m_attribute.swizzle((i+2)%4, (i+3)%4, (i+0)%4, (i+1)%4),
1755 											m_attribute.swizzle((i+3)%4, (i+0)%4, (i+1)%4, (i+2)%4)));
1756 
1757 		{
1758 			string description = "This is the program ";
1759 
1760 			description += programID == PROGRAM_WITHOUT_FUNCTION_CALLS	? "without"
1761 						 : programID == PROGRAM_WITH_FUNCTION_CALLS		? "with"
1762 						 : DE_NULL;
1763 
1764 			description += " '" + m_func + "' function calls.\n"
1765 						   "Note: workload size for this program means the number of loop iterations.";
1766 
1767 			return ProgramContext(vtx.str(), frag.str(), attributes, description);
1768 		}
1769 	}
1770 }
1771 
1772 vector<FunctionCase::ProgramContext> FunctionCase::generateProgramData (void) const
1773 {
1774 	vector<ProgramContext> progData;
1775 	for (int i = 0; i < PROGRAM_LAST; i++)
1776 		progData.push_back(generateSingleProgramData((ProgramID)i));
1777 	return progData;
1778 }
1779 
1780 void FunctionCase::setGeneralUniforms (deUint32 program) const
1781 {
1782 	const glw::Functions& gl = m_renderCtx.getFunctions();
1783 
1784 	gl.uniform1f(gl.getUniformLocation(program, "u_zero"), 0.0f);
1785 
1786 	for (int paramNdx = 0; paramNdx < MAX_PARAMS; paramNdx++)
1787 	{
1788 		if (m_paramTypes[paramNdx] != glu::TYPE_INVALID)
1789 		{
1790 			const glu::DataType		paramType	= m_paramTypes[paramNdx];
1791 			const int				scalarSize	= glu::getDataTypeScalarSize(paramType);
1792 			const int				location	= gl.getUniformLocation(program, (string() + "u_inc" + (char)('A'+paramNdx)).c_str());
1793 
1794 			if (glu::isDataTypeFloatOrVec(paramType))
1795 			{
1796 				float values[4];
1797 				for (int i = 0; i < DE_LENGTH_OF_ARRAY(values); i++)
1798 					values[i] = (float)paramNdx*0.01f + (float)i*0.001f; // Arbitrary small values.
1799 				uniformNfv(gl, scalarSize, location, 1, &values[0]);
1800 			}
1801 			else if (glu::isDataTypeIntOrIVec(paramType))
1802 			{
1803 				int values[4];
1804 				for (int i = 0; i < DE_LENGTH_OF_ARRAY(values); i++)
1805 					values[i] = paramNdx*100 + i; // Arbitrary values.
1806 				uniformNiv(gl, scalarSize, location, 1, &values[0]);
1807 			}
1808 			else if (glu::isDataTypeBoolOrBVec(paramType))
1809 			{
1810 				int values[4];
1811 				for (int i = 0; i < DE_LENGTH_OF_ARRAY(values); i++)
1812 					values[i] = (paramNdx >> i) & 1; // Arbitrary values.
1813 				uniformNiv(gl, scalarSize, location, 1, &values[0]);
1814 			}
1815 			else if (glu::isDataTypeMatrix(paramType))
1816 			{
1817 				const int size = glu::getDataTypeMatrixNumRows(paramType);
1818 				DE_ASSERT(size == glu::getDataTypeMatrixNumColumns(paramType));
1819 				float values[4*4];
1820 				for (int i = 0; i < DE_LENGTH_OF_ARRAY(values); i++)
1821 					values[i] = (float)paramNdx*0.01f + (float)i*0.001f; // Arbitrary values.
1822 				uniformMatrixNfv(gl, size, location, 1, &values[0]);
1823 			}
1824 			else
1825 				DE_ASSERT(false);
1826 		}
1827 	}
1828 }
1829 
1830 void FunctionCase::setWorkloadSizeUniform (deUint32 program, int numLoopIterations) const
1831 {
1832 	const glw::Functions&	gl		= m_renderCtx.getFunctions();
1833 	const int				loc		= gl.getUniformLocation(program, "u_numLoopIterations");
1834 
1835 	gl.uniform1i(loc, numLoopIterations);
1836 }
1837 
1838 float FunctionCase::computeSingleOperationTime (const vector<float>& perProgramOperationCosts) const
1839 {
1840 	DE_ASSERT(perProgramOperationCosts.size() == PROGRAM_LAST);
1841 	const int		numFunctionCalls			= FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS;
1842 	const float		programOperationCostDiff	= perProgramOperationCosts[PROGRAM_WITH_FUNCTION_CALLS] - perProgramOperationCosts[PROGRAM_WITHOUT_FUNCTION_CALLS];
1843 
1844 	return programOperationCostDiff / (float)numFunctionCalls;
1845 }
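// For example, with FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS = 4 the reported per-call cost
// is (costWithCalls - costWithoutCalls) / 4.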
1846 
1847 void FunctionCase::logSingleOperationCalculationInfo (void) const
1848 {
1849 	const int numFunctionCalls = FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS;
1850 
1851 	m_testCtx.getLog() << TestLog::Message << "Note: program " << (int)PROGRAM_WITH_FUNCTION_CALLS << " contains "
1852 										   << numFunctionCalls << " calls to '" << m_func << "' in one loop iteration; "
1853 										   << "cost of one operation is calculated as "
1854 										   << "(cost_of_workload_with_calls - cost_of_workload_without_calls) / " << numFunctionCalls << TestLog::EndMessage;
1855 }
1856 
1857 } // anonymous
1858 
1859 ShaderOperatorTests::ShaderOperatorTests (Context& context)
1860 	: TestCaseGroup(context, "operator", "Operator Performance Tests")
1861 {
1862 }
1863 
1864 ShaderOperatorTests::~ShaderOperatorTests (void)
1865 {
1866 }
1867 
1868 void ShaderOperatorTests::init (void)
1869 {
1870 	// Binary operator cases
1871 
1872 	static const DataType binaryOpTypes[] =
1873 	{
1874 		TYPE_FLOAT,
1875 		TYPE_FLOAT_VEC2,
1876 		TYPE_FLOAT_VEC3,
1877 		TYPE_FLOAT_VEC4,
1878 		TYPE_INT,
1879 		TYPE_INT_VEC2,
1880 		TYPE_INT_VEC3,
1881 		TYPE_INT_VEC4,
1882 	};
1883 	static const Precision precisions[] =
1884 	{
1885 		PRECISION_LOWP,
1886 		PRECISION_MEDIUMP,
1887 		PRECISION_HIGHP
1888 	};
1889 	static const struct
1890 	{
1891 		const char*		name;
1892 		const char*		op;
1893 		bool			swizzle;
1894 	} binaryOps[] =
1895 	{
1896 		{ "add",		"+",		false	},
1897 		{ "sub",		"-",		true	},
1898 		{ "mul",		"*",		false	},
1899 		{ "div",		"/",		true	}
1900 	};
1901 
1902 	tcu::TestCaseGroup* const binaryOpsGroup = new tcu::TestCaseGroup(m_testCtx, "binary_operator", "Binary Operator Performance Tests");
1903 	addChild(binaryOpsGroup);
1904 
1905 	for (int opNdx = 0; opNdx < DE_LENGTH_OF_ARRAY(binaryOps); opNdx++)
1906 	{
1907 		tcu::TestCaseGroup* const opGroup = new tcu::TestCaseGroup(m_testCtx, binaryOps[opNdx].name, "");
1908 		binaryOpsGroup->addChild(opGroup);
1909 
1910 		for (int isFrag = 0; isFrag <= 1; isFrag++)
1911 		{
1912 			const BinaryOpCase::InitialCalibrationStorage	shaderGroupCalibrationStorage	(new BinaryOpCase::InitialCalibration);
1913 			const bool										isVertex						= isFrag == 0;
1914 			tcu::TestCaseGroup* const						shaderGroup						= new tcu::TestCaseGroup(m_testCtx, isVertex ? "vertex" : "fragment", "");
1915 			opGroup->addChild(shaderGroup);
1916 
1917 			for (int typeNdx = 0; typeNdx < DE_LENGTH_OF_ARRAY(binaryOpTypes); typeNdx++)
1918 			{
1919 				for (int precNdx = 0; precNdx < DE_LENGTH_OF_ARRAY(precisions); precNdx++)
1920 				{
1921 					const DataType		type			= binaryOpTypes[typeNdx];
1922 					const Precision		precision		= precisions[precNdx];
1923 					const char* const	op				= binaryOps[opNdx].op;
1924 					const bool			useSwizzle		= binaryOps[opNdx].swizzle;
1925 					std::ostringstream	name;
1926 
1927 					name << getPrecisionName(precision) << "_" << getDataTypeName(type);
1928 
1929 					shaderGroup->addChild(new BinaryOpCase(m_context, name.str().c_str(), "", op, type, precision, useSwizzle, isVertex, shaderGroupCalibrationStorage));
1930 				}
1931 			}
1932 		}
1933 	}
1934 
1935 	// Built-in function cases.
1936 
1937 	// Non-specific (i.e. includes gentypes) parameter types for the functions.
1938 	enum ValueType
1939 	{
1940 		VALUE_NONE			= 0,
1941 		VALUE_FLOAT			= (1<<0),	// float scalar
1942 		VALUE_FLOAT_VEC		= (1<<1),	// float vector
1943 		VALUE_FLOAT_VEC34	= (1<<2),	// float vector of size 3 or 4
1944 		VALUE_FLOAT_GENTYPE	= (1<<3),	// float scalar/vector
1945 		VALUE_VEC3			= (1<<4),	// vec3 only
1946 		VALUE_VEC4			= (1<<5),	// vec4 only
1947 		VALUE_MATRIX		= (1<<6),	// matrix
1948 		VALUE_BOOL			= (1<<7),	// boolean scalar
1949 		VALUE_BOOL_VEC		= (1<<8),	// boolean vector
1950 		VALUE_BOOL_VEC4		= (1<<9),	// bvec4 only
1951 		VALUE_BOOL_GENTYPE	= (1<<10),	// boolean scalar/vector
1952 		VALUE_INT			= (1<<11),	// int scalar
1953 		VALUE_INT_VEC		= (1<<12),	// int vector
1954 		VALUE_INT_VEC4		= (1<<13),	// ivec4 only
1955 		VALUE_INT_GENTYPE	= (1<<14),	// int scalar/vector
1956 
1957 		// Shorthands.
1958 		N				= VALUE_NONE,
1959 		F				= VALUE_FLOAT,
1960 		FV				= VALUE_FLOAT_VEC,
1961 		VL				= VALUE_FLOAT_VEC34, // L for "large"
1962 		GT				= VALUE_FLOAT_GENTYPE,
1963 		V3				= VALUE_VEC3,
1964 		V4				= VALUE_VEC4,
1965 		M				= VALUE_MATRIX,
1966 		B				= VALUE_BOOL,
1967 		BV				= VALUE_BOOL_VEC,
1968 		B4				= VALUE_BOOL_VEC4,
1969 		BGT				= VALUE_BOOL_GENTYPE,
1970 		I				= VALUE_INT,
1971 		IV				= VALUE_INT_VEC,
1972 		I4				= VALUE_INT_VEC4,
1973 		IGT				= VALUE_INT_GENTYPE,
1974 
1975 		VALUE_ANY_FLOAT			= VALUE_FLOAT		|	VALUE_FLOAT_VEC		|	VALUE_FLOAT_GENTYPE		| VALUE_VEC3 | VALUE_VEC4 | VALUE_FLOAT_VEC34,
1976 		VALUE_ANY_INT			= VALUE_INT			|	VALUE_INT_VEC		|	VALUE_INT_GENTYPE		| VALUE_INT_VEC4,
1977 		VALUE_ANY_BOOL			= VALUE_BOOL		|	VALUE_BOOL_VEC		|	VALUE_BOOL_GENTYPE		| VALUE_BOOL_VEC4,
1978 
1979 		VALUE_ANY_GENTYPE		= VALUE_FLOAT_VEC	|	VALUE_FLOAT_GENTYPE	|	VALUE_FLOAT_VEC34	|
1980 								  VALUE_BOOL_VEC	|	VALUE_BOOL_GENTYPE	|
1981 								  VALUE_INT_VEC		|	VALUE_INT_GENTYPE	|
1982 								  VALUE_MATRIX
1983 	};
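	// For example, an entry with types { F, F, F, N } in the table below describes
	// "float func (float, float)" (e.g. the atan2 case), while { BV, FV, FV, N } describes
	// "bvecN func (vecN, vecN)" for vector sizes 2..4.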
1984 	enum PrecisionMask
1985 	{
1986 		PRECMASK_NA				= 0,						//!< Precision not applicable (booleans)
1987 		PRECMASK_LOWP			= (1<<PRECISION_LOWP),
1988 		PRECMASK_MEDIUMP		= (1<<PRECISION_MEDIUMP),
1989 		PRECMASK_HIGHP			= (1<<PRECISION_HIGHP),
1990 
1991 		PRECMASK_MEDIUMP_HIGHP	= (1<<PRECISION_MEDIUMP) | (1<<PRECISION_HIGHP),
1992 		PRECMASK_ALL			= (1<<PRECISION_LOWP) | (1<<PRECISION_MEDIUMP) | (1<<PRECISION_HIGHP)
1993 	};
1994 
1995 	static const DataType floatTypes[] =
1996 	{
1997 		TYPE_FLOAT,
1998 		TYPE_FLOAT_VEC2,
1999 		TYPE_FLOAT_VEC3,
2000 		TYPE_FLOAT_VEC4
2001 	};
2002 	static const DataType intTypes[] =
2003 	{
2004 		TYPE_INT,
2005 		TYPE_INT_VEC2,
2006 		TYPE_INT_VEC3,
2007 		TYPE_INT_VEC4
2008 	};
2009 	static const DataType boolTypes[] =
2010 	{
2011 		TYPE_BOOL,
2012 		TYPE_BOOL_VEC2,
2013 		TYPE_BOOL_VEC3,
2014 		TYPE_BOOL_VEC4
2015 	};
2016 	static const DataType matrixTypes[] =
2017 	{
2018 		TYPE_FLOAT_MAT2,
2019 		TYPE_FLOAT_MAT3,
2020 		TYPE_FLOAT_MAT4
2021 	};
2022 
2023 	tcu::TestCaseGroup* const angleAndTrigonometryGroup		= new tcu::TestCaseGroup(m_testCtx, "angle_and_trigonometry",	"Built-In Angle and Trigonometry Function Performance Tests");
2024 	tcu::TestCaseGroup* const exponentialGroup				= new tcu::TestCaseGroup(m_testCtx, "exponential",				"Built-In Exponential Function Performance Tests");
2025 	tcu::TestCaseGroup* const commonFunctionsGroup			= new tcu::TestCaseGroup(m_testCtx, "common_functions",			"Built-In Common Function Performance Tests");
2026 	tcu::TestCaseGroup* const geometricFunctionsGroup		= new tcu::TestCaseGroup(m_testCtx, "geometric",				"Built-In Geometric Function Performance Tests");
2027 	tcu::TestCaseGroup* const matrixFunctionsGroup			= new tcu::TestCaseGroup(m_testCtx, "matrix",					"Built-In Matrix Function Performance Tests");
2028 	tcu::TestCaseGroup* const floatCompareGroup				= new tcu::TestCaseGroup(m_testCtx, "float_compare",			"Built-In Floating Point Comparison Function Performance Tests");
2029 	tcu::TestCaseGroup* const intCompareGroup				= new tcu::TestCaseGroup(m_testCtx, "int_compare",				"Built-In Integer Comparison Function Performance Tests");
2030 	tcu::TestCaseGroup* const boolCompareGroup				= new tcu::TestCaseGroup(m_testCtx, "bool_compare",				"Built-In Boolean Comparison Function Performance Tests");
2031 
2032 	addChild(angleAndTrigonometryGroup);
2033 	addChild(exponentialGroup);
2034 	addChild(commonFunctionsGroup);
2035 	addChild(geometricFunctionsGroup);
2036 	addChild(matrixFunctionsGroup);
2037 	addChild(floatCompareGroup);
2038 	addChild(intCompareGroup);
2039 	addChild(boolCompareGroup);
2040 
2041 	// Some attributes to be used as parameters for the functions.
2042 	const Vec4 attrPos		= Vec4( 2.3f,  1.9f,  0.8f,  0.7f);
2043 	const Vec4 attrNegPos	= Vec4(-1.3f,  2.5f, -3.5f,	 4.3f);
2044 	const Vec4 attrSmall	= Vec4(-0.9f,  0.8f, -0.4f,	 0.2f);
2045 	const Vec4 attrBig		= Vec4( 1.3f,  2.4f,  3.0f,	 4.0f);
2046 
2047 	// \todo The following functions and variants are missing, and should be added in the future:
2048 	//		 - modf (has an output parameter, not currently handled by test code)
2049 	//		 - functions with uint/uvec* return or parameter types
2050 	//		 - non-matrix <-> matrix functions (outerProduct etc.)
2051 	// \note Remember to update test spec when these are added.
2052 
2053 	// Function name, return type and parameter type information; also, what attribute should be used in the test.
2054 	// \note Different versions of the same function (i.e. with the same group name) can be defined by putting them successively in this array.
2055 	// \note In order to reduce case count and thus total execution time, we don't test all input type combinations for every function.
2056 	static const struct
2057 	{
2058 		tcu::TestCaseGroup*					parentGroup;
2059 		const char*							groupName;
2060 		const char*							func;
2061 		const ValueType						types[FunctionCase::MAX_PARAMS + 1]; // Return type and parameter types, in that order.
2062 		const Vec4&							attribute;
2063 		int									modifyParamNdx;
2064 		bool								useNearlyConstantInputs;
2065 		bool								booleanCase;
2066 		PrecisionMask						precMask;
2067 	} functionCaseGroups[] =
2068 	{
2069 		{ angleAndTrigonometryGroup,	"radians",			"radians",			{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2070 		{ angleAndTrigonometryGroup,	"degrees",			"degrees",			{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2071 		{ angleAndTrigonometryGroup,	"sin",				"sin",				{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2072 		{ angleAndTrigonometryGroup,	"cos",				"cos",				{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2073 		{ angleAndTrigonometryGroup,	"tan",				"tan",				{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2074 		{ angleAndTrigonometryGroup,	"asin",				"asin",				{ F,  F,  N,  N  }, attrSmall,		-1, true,	false,	PRECMASK_ALL			},
2075 		{ angleAndTrigonometryGroup,	"acos",				"acos",				{ F,  F,  N,  N  }, attrSmall,		-1, true,	false,	PRECMASK_ALL			},
2076 		{ angleAndTrigonometryGroup,	"atan2",			"atan",				{ F,  F,  F,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2077 		{ angleAndTrigonometryGroup,	"atan",				"atan",				{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2078 		{ angleAndTrigonometryGroup,	"sinh",				"sinh",				{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2079 		{ angleAndTrigonometryGroup,	"cosh",				"cosh",				{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2080 		{ angleAndTrigonometryGroup,	"tanh",				"tanh",				{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2081 		{ angleAndTrigonometryGroup,	"asinh",			"asinh",			{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2082 		{ angleAndTrigonometryGroup,	"acosh",			"acosh",			{ F,  F,  N,  N  }, attrBig,		-1, false,	false,	PRECMASK_ALL			},
2083 		{ angleAndTrigonometryGroup,	"atanh",			"atanh",			{ F,  F,  N,  N  }, attrSmall,		-1, true,	false,	PRECMASK_ALL			},
2084 
2085 		{ exponentialGroup,				"pow",				"pow",				{ F,  F,  F,  N  }, attrPos,		-1, false,	false,	PRECMASK_ALL			},
2086 		{ exponentialGroup,				"exp",				"exp",				{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2087 		{ exponentialGroup,				"log",				"log",				{ F,  F,  N,  N  }, attrPos,		-1, false,	false,	PRECMASK_ALL			},
2088 		{ exponentialGroup,				"exp2",				"exp2",				{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2089 		{ exponentialGroup,				"log2",				"log2",				{ F,  F,  N,  N  }, attrPos,		-1, false,	false,	PRECMASK_ALL			},
2090 		{ exponentialGroup,				"sqrt",				"sqrt",				{ F,  F,  N,  N  }, attrPos,		-1, false,	false,	PRECMASK_ALL			},
2091 		{ exponentialGroup,				"inversesqrt",		"inversesqrt",		{ F,  F,  N,  N  }, attrPos,		-1, false,	false,	PRECMASK_ALL			},
2092 
2093 		{ commonFunctionsGroup,			"abs",				"abs",				{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
2094 		{ commonFunctionsGroup,			"abs",				"abs",				{ V4, V4, N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2095 		{ commonFunctionsGroup,			"sign",				"sign",				{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
2096 		{ commonFunctionsGroup,			"sign",				"sign",				{ V4, V4, N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2097 		{ commonFunctionsGroup,			"floor",			"floor",			{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
2098 		{ commonFunctionsGroup,			"floor",			"floor",			{ V4, V4, N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2099 		{ commonFunctionsGroup,			"trunc",			"trunc",			{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
2100 		{ commonFunctionsGroup,			"trunc",			"trunc",			{ V4, V4, N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2101 		{ commonFunctionsGroup,			"round",			"round",			{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
2102 		{ commonFunctionsGroup,			"round",			"round",			{ V4, V4, N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2103 		{ commonFunctionsGroup,			"roundEven",		"roundEven",		{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
2104 		{ commonFunctionsGroup,			"roundEven",		"roundEven",		{ V4, V4, N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2105 		{ commonFunctionsGroup,			"ceil",				"ceil",				{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
2106 		{ commonFunctionsGroup,			"ceil",				"ceil",				{ V4, V4, N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2107 		{ commonFunctionsGroup,			"fract",			"fract",			{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
2108 		{ commonFunctionsGroup,			"fract",			"fract",			{ V4, V4, N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2109 		{ commonFunctionsGroup,			"mod",				"mod",				{ GT, GT, GT, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2110 		{ commonFunctionsGroup,			"min",				"min",				{ F,  F,  F,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
2111 		{ commonFunctionsGroup,			"min",				"min",				{ V4, V4, V4, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2112 		{ commonFunctionsGroup,			"max",				"max",				{ F,  F,  F,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
2113 		{ commonFunctionsGroup,			"max",				"max",				{ V4, V4, V4, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2114 		{ commonFunctionsGroup,			"clamp",			"clamp",			{ F,  F,  F,  F  }, attrSmall,		 2, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
2115 		{ commonFunctionsGroup,			"clamp",			"clamp",			{ V4, V4, V4, V4 }, attrSmall,		 2, false,	false,	PRECMASK_ALL			},
2116 		{ commonFunctionsGroup,			"mix",				"mix",				{ F,  F,  F,  F  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
2117 		{ commonFunctionsGroup,			"mix",				"mix",				{ V4, V4, V4, V4 }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2118 		{ commonFunctionsGroup,			"mix",				"mix",				{ F,  F,  F,  B  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
2119 		{ commonFunctionsGroup,			"mix",				"mix",				{ V4, V4, V4, B4 }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2120 		{ commonFunctionsGroup,			"step",				"step",				{ F,  F,  F,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
2121 		{ commonFunctionsGroup,			"step",				"step",				{ V4, V4, V4, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2122 		{ commonFunctionsGroup,			"smoothstep",		"smoothstep",		{ F,  F,  F,  F  }, attrSmall,		 1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
2123 		{ commonFunctionsGroup,			"smoothstep",		"smoothstep",		{ V4, V4, V4, V4 }, attrSmall,		 1, false,	false,	PRECMASK_ALL			},
2124 		{ commonFunctionsGroup,			"isnan",			"isnan",			{ B,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
2125 		{ commonFunctionsGroup,			"isnan",			"isnan",			{ B4, V4, N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2126 		{ commonFunctionsGroup,			"isinf",			"isinf",			{ B,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
2127 		{ commonFunctionsGroup,			"isinf",			"isinf",			{ B4, V4, N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2128 		{ commonFunctionsGroup,			"floatBitsToInt",	"floatBitsToInt",	{ I,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
2129 		{ commonFunctionsGroup,			"floatBitsToInt",	"floatBitsToInt",	{ I4, V4, N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2130 		{ commonFunctionsGroup,			"intBitsToFloat",	"intBitsToFloat",	{ F,  I,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
2131 		{ commonFunctionsGroup,			"intBitsToFloat",	"intBitsToFloat",	{ V4, I4, N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2132 
2133 		{ geometricFunctionsGroup,		"length",			"length",			{ F,  VL, N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2134 		{ geometricFunctionsGroup,		"distance",			"distance",			{ F,  VL, VL, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2135 		{ geometricFunctionsGroup,		"dot",				"dot",				{ F,  VL, VL, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2136 		{ geometricFunctionsGroup,		"cross",			"cross",			{ V3, V3, V3, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2137 		{ geometricFunctionsGroup,		"normalize",		"normalize",		{ VL, VL, N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2138 		{ geometricFunctionsGroup,		"faceforward",		"faceforward",		{ VL, VL, VL, VL }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2139 		{ geometricFunctionsGroup,		"reflect",			"reflect",			{ VL, VL, VL, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2140 		{ geometricFunctionsGroup,		"refract",			"refract",			{ VL, VL, VL, F  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2141 
2142 		{ matrixFunctionsGroup,			"matrixCompMult",	"matrixCompMult",	{ M,  M,  M,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2143 		{ matrixFunctionsGroup,			"transpose",		"transpose",		{ M,  M,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2144 		{ matrixFunctionsGroup,			"inverse",			"inverse",			{ M,  M,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2145 
2146 		{ floatCompareGroup,			"lessThan",			"lessThan",			{ BV, FV, FV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2147 		{ floatCompareGroup,			"lessThanEqual",	"lessThanEqual",	{ BV, FV, FV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2148 		{ floatCompareGroup,			"greaterThan",		"greaterThan",		{ BV, FV, FV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2149 		{ floatCompareGroup,			"greaterThanEqual",	"greaterThanEqual",	{ BV, FV, FV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2150 		{ floatCompareGroup,			"equal",			"equal",			{ BV, FV, FV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2151 		{ floatCompareGroup,			"notEqual",			"notEqual",			{ BV, FV, FV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2152 
2153 		{ intCompareGroup,				"lessThan",			"lessThan",			{ BV, IV, IV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2154 		{ intCompareGroup,				"lessThanEqual",	"lessThanEqual",	{ BV, IV, IV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2155 		{ intCompareGroup,				"greaterThan",		"greaterThan",		{ BV, IV, IV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2156 		{ intCompareGroup,				"greaterThanEqual",	"greaterThanEqual",	{ BV, IV, IV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2157 		{ intCompareGroup,				"equal",			"equal",			{ BV, IV, IV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2158 		{ intCompareGroup,				"notEqual",			"notEqual",			{ BV, IV, IV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
2159 
2160 		{ boolCompareGroup,				"equal",			"equal",			{ BV, BV, BV, N  }, attrNegPos,		-1, false,	true,	PRECMASK_MEDIUMP		},
2161 		{ boolCompareGroup,				"notEqual",			"notEqual",			{ BV, BV, BV, N  }, attrNegPos,		-1, false,	true,	PRECMASK_MEDIUMP		},
2162 		{ boolCompareGroup,				"any",				"any",				{ B,  BV, N,  N  }, attrNegPos,		-1, false,	true,	PRECMASK_MEDIUMP		},
2163 		{ boolCompareGroup,				"all",				"all",				{ B,  BV, N,  N  }, attrNegPos,		-1, false,	true,	PRECMASK_MEDIUMP		},
2164 		{ boolCompareGroup,				"not",				"not",				{ BV, BV, N,  N  }, attrNegPos,		-1, false,	true,	PRECMASK_MEDIUMP		}
2165 	};
2166 
2167 	// vertexSubGroup and fragmentSubGroup are the groups where the various vertex/fragment cases of a single function are added.
2168 	// \note These are defined here so that different versions (different entries in the functionCaseGroups array) of the same function can be put in the same group.
2169 	tcu::TestCaseGroup*							vertexSubGroup		= DE_NULL;
2170 	tcu::TestCaseGroup*							fragmentSubGroup	= DE_NULL;
2171 	FunctionCase::InitialCalibrationStorage		vertexSubGroupCalibrationStorage;
2172 	FunctionCase::InitialCalibrationStorage		fragmentSubGroupCalibrationStorage;
2173 	for (int funcNdx = 0; funcNdx < DE_LENGTH_OF_ARRAY(functionCaseGroups); funcNdx++)
2174 	{
2175 		tcu::TestCaseGroup* const	parentGroup					= functionCaseGroups[funcNdx].parentGroup;
2176 		const char* const			groupName					= functionCaseGroups[funcNdx].groupName;
2177 		const char* const			groupFunc					= functionCaseGroups[funcNdx].func;
2178 		const ValueType* const		funcTypes					= functionCaseGroups[funcNdx].types;
2179 		const Vec4&					groupAttribute				= functionCaseGroups[funcNdx].attribute;
2180 		const int					modifyParamNdx				= functionCaseGroups[funcNdx].modifyParamNdx;
2181 		const bool					useNearlyConstantInputs		= functionCaseGroups[funcNdx].useNearlyConstantInputs;
2182 		const bool					booleanCase					= functionCaseGroups[funcNdx].booleanCase;
2183 		const PrecisionMask			precMask					= functionCaseGroups[funcNdx].precMask;
2184 
2185 		// If this is a new function and not just a different version of the previously defined function, create a new group.
2186 		if (funcNdx == 0 || parentGroup != functionCaseGroups[funcNdx-1].parentGroup || string(groupName) != functionCaseGroups[funcNdx-1].groupName)
2187 		{
2188 			tcu::TestCaseGroup* const funcGroup = new tcu::TestCaseGroup(m_testCtx, groupName, "");
2189 			functionCaseGroups[funcNdx].parentGroup->addChild(funcGroup);
2190 
2191 			vertexSubGroup		= new tcu::TestCaseGroup(m_testCtx, "vertex", "");
2192 			fragmentSubGroup	= new tcu::TestCaseGroup(m_testCtx, "fragment", "");
2193 
2194 			funcGroup->addChild(vertexSubGroup);
2195 			funcGroup->addChild(fragmentSubGroup);
2196 
2197 			vertexSubGroupCalibrationStorage	= FunctionCase::InitialCalibrationStorage(new FunctionCase::InitialCalibration);
2198 			fragmentSubGroupCalibrationStorage	= FunctionCase::InitialCalibrationStorage(new FunctionCase::InitialCalibration);
2199 		}
2200 
2201 		DE_ASSERT(vertexSubGroup != DE_NULL);
2202 		DE_ASSERT(fragmentSubGroup != DE_NULL);
2203 
2204 		// Find the type size range of parameters (e.g. from 2 to 4 in case of vectors).
2205 		int genTypeFirstSize	= 1;
2206 		int genTypeLastSize		= 1;
2207 
2208 		// Find the first return value or parameter with a gentype (if any) and set sizes accordingly.
2209 		// \note Assumes only matching-size gentypes are to be found, e.g. no "genType func (vec param)".
2210 		for (int i = 0; i < FunctionCase::MAX_PARAMS + 1 && genTypeLastSize == 1; i++)
2211 		{
2212 			switch (funcTypes[i])
2213 			{
2214 				case VALUE_FLOAT_VEC:
2215 				case VALUE_BOOL_VEC:
2216 				case VALUE_INT_VEC:			// \note Fall-through.
2217 					genTypeFirstSize = 2;
2218 					genTypeLastSize = 4;
2219 					break;
2220 				case VALUE_FLOAT_VEC34:
2221 					genTypeFirstSize = 3;
2222 					genTypeLastSize = 4;
2223 					break;
2224 				case VALUE_FLOAT_GENTYPE:
2225 				case VALUE_BOOL_GENTYPE:
2226 				case VALUE_INT_GENTYPE:		// \note Fall-through.
2227 					genTypeFirstSize = 1;
2228 					genTypeLastSize = 4;
2229 					break;
2230 				case VALUE_MATRIX:
2231 					genTypeFirstSize = 2;
2232 					genTypeLastSize = 4;
2233 					break;
2234 				// If none of the above, keep looping.
2235 				default:
2236 					break;
2237 			}
2238 		}
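		// For example, for the "dot" entry { F, VL, VL, N } the loop above yields
		// genTypeFirstSize = 3 and genTypeLastSize = 4, so vec3 and vec4 variants are generated,
		// while the scalar float return type is unaffected by curSize.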
2239 
2240 		// Create a case for each possible size of the gentype.
2241 		for (int curSize = genTypeFirstSize; curSize <= genTypeLastSize; curSize++)
2242 		{
2243 			// Determine specific types for return value and the parameters, according to curSize. Non-gentypes not affected by curSize.
2244 			DataType types[FunctionCase::MAX_PARAMS + 1];
2245 			for (int i = 0; i < FunctionCase::MAX_PARAMS + 1; i++)
2246 			{
2247 				if (funcTypes[i] == VALUE_NONE)
2248 					types[i] = TYPE_INVALID;
2249 				else
2250 				{
2251 					int isFloat	= funcTypes[i] & VALUE_ANY_FLOAT;
2252 					int isBool	= funcTypes[i] & VALUE_ANY_BOOL;
2253 					int isInt	= funcTypes[i] & VALUE_ANY_INT;
2254 					int isMat	= funcTypes[i] == VALUE_MATRIX;
2255 					int inSize	= (funcTypes[i] & VALUE_ANY_GENTYPE)	? curSize
2256 								: funcTypes[i] == VALUE_VEC3			? 3
2257 								: funcTypes[i] == VALUE_VEC4			? 4
2258 								: funcTypes[i] == VALUE_BOOL_VEC4		? 4
2259 								: funcTypes[i] == VALUE_INT_VEC4		? 4
2260 								: 1;
2261 					int			typeArrayNdx = isMat ? inSize - 2 : inSize - 1; // \note No matrices of size 1.
2262 
2263 					types[i]	= isFloat	? floatTypes[typeArrayNdx]
2264 								: isBool	? boolTypes[typeArrayNdx]
2265 								: isInt		? intTypes[typeArrayNdx]
2266 								: isMat		? matrixTypes[typeArrayNdx]
2267 								: TYPE_LAST;
2268 				}
2269 
2270 				DE_ASSERT(types[i] != TYPE_LAST);
2271 			}
2272 
2273 			// Array for just the parameter types.
2274 			DataType paramTypes[FunctionCase::MAX_PARAMS];
2275 			for (int i = 0; i < FunctionCase::MAX_PARAMS; i++)
2276 				paramTypes[i] = types[i+1];
2277 
2278 			for (int prec = (int)PRECISION_LOWP; prec < (int)PRECISION_LAST; prec++)
2279 			{
2280 				if ((precMask & (1 << prec)) == 0)
2281 					continue;
2282 
2283 				const string		precisionPrefix = booleanCase ? "" : (string(getPrecisionName((Precision)prec)) + "_");
2284 				std::ostringstream	caseName;
2285 
2286 				caseName << precisionPrefix;
2287 
2288 				// Write the name of each distinct parameter data type into the test case name.
2289 				for (int i = 1; i < FunctionCase::MAX_PARAMS + 1 && types[i] != TYPE_INVALID; i++)
2290 				{
2291 					if (i == 1 || types[i] != types[i-1])
2292 					{
2293 						if (i > 1)
2294 							caseName << "_";
2295 
2296 						caseName << getDataTypeName(types[i]);
2297 					}
2298 				}
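				// For example, a mediump "mix" case with parameter types vec4, vec4, bvec4 is
				// named "mediump_vec4_bvec4"; consecutive identical parameter types are written
				// only once.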
2299 
2300 				for (int fragI = 0; fragI <= 1; fragI++)
2301 				{
2302 					const bool					vert	= fragI == 0;
2303 					tcu::TestCaseGroup* const	group	= vert ? vertexSubGroup : fragmentSubGroup;
2304 					group->addChild	(new FunctionCase(m_context,
2305 													  caseName.str().c_str(), "",
2306 													  groupFunc,
2307 													  types[0], paramTypes,
2308 													  groupAttribute, modifyParamNdx, useNearlyConstantInputs,
2309 													  (Precision)prec, vert,
2310 													  vert ? vertexSubGroupCalibrationStorage : fragmentSubGroupCalibrationStorage));
2311 				}
2312 			}
2313 		}
2314 	}
2315 }
2316 
2317 } // Performance
2318 } // gles3
2319 } // deqp
2320