/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
11#include <math.h>
12#include <stdlib.h>
13#include <string.h>
14
15#include "third_party/googletest/src/include/gtest/gtest.h"
16
17#include "./vp9_rtcd.h"
18#include "./vpx_dsp_rtcd.h"
19#include "test/acm_random.h"
20#include "test/clear_system_state.h"
21#include "test/register_state_check.h"
22#include "test/util.h"
23#include "vp9/common/vp9_entropy.h"
24#include "vp9/common/vp9_scan.h"
25#include "vpx/vpx_codec.h"
26#include "vpx/vpx_integer.h"
27#include "vpx_ports/mem.h"
28#include "vpx_ports/msvc.h"  // for round()
29
30using libvpx_test::ACMRandom;
31
32namespace {
33
// Number of coefficients in one 16x16 transform block.
const int kNumCoeffs = 16 * 16;

// Cosine table for the floating-point reference DCT: Ck == cos(k * pi / 32).
const double C1 = 0.995184726672197;   // k = 1
const double C2 = 0.98078528040323;    // k = 2
const double C3 = 0.956940335732209;   // k = 3
const double C4 = 0.923879532511287;   // k = 4
const double C5 = 0.881921264348355;   // k = 5
const double C6 = 0.831469612302545;   // k = 6
const double C7 = 0.773010453362737;   // k = 7
const double C8 = 0.707106781186548;   // k = 8, i.e. 1 / sqrt(2)
const double C9 = 0.634393284163646;   // k = 9
const double C10 = 0.555570233019602;  // k = 10
const double C11 = 0.471396736825998;  // k = 11
const double C12 = 0.38268343236509;   // k = 12
const double C13 = 0.290284677254462;  // k = 13
const double C14 = 0.195090322016128;  // k = 14
const double C15 = 0.098017140329561;  // k = 15
50
51void butterfly_16x16_dct_1d(double input[16], double output[16]) {
52  double step[16];
53  double intermediate[16];
54  double temp1, temp2;
55
56  // step 1
57  step[0] = input[0] + input[15];
58  step[1] = input[1] + input[14];
59  step[2] = input[2] + input[13];
60  step[3] = input[3] + input[12];
61  step[4] = input[4] + input[11];
62  step[5] = input[5] + input[10];
63  step[6] = input[6] + input[9];
64  step[7] = input[7] + input[8];
65  step[8] = input[7] - input[8];
66  step[9] = input[6] - input[9];
67  step[10] = input[5] - input[10];
68  step[11] = input[4] - input[11];
69  step[12] = input[3] - input[12];
70  step[13] = input[2] - input[13];
71  step[14] = input[1] - input[14];
72  step[15] = input[0] - input[15];
73
74  // step 2
75  output[0] = step[0] + step[7];
76  output[1] = step[1] + step[6];
77  output[2] = step[2] + step[5];
78  output[3] = step[3] + step[4];
79  output[4] = step[3] - step[4];
80  output[5] = step[2] - step[5];
81  output[6] = step[1] - step[6];
82  output[7] = step[0] - step[7];
83
84  temp1 = step[8] * C7;
85  temp2 = step[15] * C9;
86  output[8] = temp1 + temp2;
87
88  temp1 = step[9] * C11;
89  temp2 = step[14] * C5;
90  output[9] = temp1 - temp2;
91
92  temp1 = step[10] * C3;
93  temp2 = step[13] * C13;
94  output[10] = temp1 + temp2;
95
96  temp1 = step[11] * C15;
97  temp2 = step[12] * C1;
98  output[11] = temp1 - temp2;
99
100  temp1 = step[11] * C1;
101  temp2 = step[12] * C15;
102  output[12] = temp2 + temp1;
103
104  temp1 = step[10] * C13;
105  temp2 = step[13] * C3;
106  output[13] = temp2 - temp1;
107
108  temp1 = step[9] * C5;
109  temp2 = step[14] * C11;
110  output[14] = temp2 + temp1;
111
112  temp1 = step[8] * C9;
113  temp2 = step[15] * C7;
114  output[15] = temp2 - temp1;
115
116  // step 3
117  step[0] = output[0] + output[3];
118  step[1] = output[1] + output[2];
119  step[2] = output[1] - output[2];
120  step[3] = output[0] - output[3];
121
122  temp1 = output[4] * C14;
123  temp2 = output[7] * C2;
124  step[4] = temp1 + temp2;
125
126  temp1 = output[5] * C10;
127  temp2 = output[6] * C6;
128  step[5] = temp1 + temp2;
129
130  temp1 = output[5] * C6;
131  temp2 = output[6] * C10;
132  step[6] = temp2 - temp1;
133
134  temp1 = output[4] * C2;
135  temp2 = output[7] * C14;
136  step[7] = temp2 - temp1;
137
138  step[8] = output[8] + output[11];
139  step[9] = output[9] + output[10];
140  step[10] = output[9] - output[10];
141  step[11] = output[8] - output[11];
142
143  step[12] = output[12] + output[15];
144  step[13] = output[13] + output[14];
145  step[14] = output[13] - output[14];
146  step[15] = output[12] - output[15];
147
148  // step 4
149  output[0] = (step[0] + step[1]);
150  output[8] = (step[0] - step[1]);
151
152  temp1 = step[2] * C12;
153  temp2 = step[3] * C4;
154  temp1 = temp1 + temp2;
155  output[4] = 2 * (temp1 * C8);
156
157  temp1 = step[2] * C4;
158  temp2 = step[3] * C12;
159  temp1 = temp2 - temp1;
160  output[12] = 2 * (temp1 * C8);
161
162  output[2] = 2 * ((step[4] + step[5]) * C8);
163  output[14] = 2 * ((step[7] - step[6]) * C8);
164
165  temp1 = step[4] - step[5];
166  temp2 = step[6] + step[7];
167  output[6] = (temp1 + temp2);
168  output[10] = (temp1 - temp2);
169
170  intermediate[8] = step[8] + step[14];
171  intermediate[9] = step[9] + step[15];
172
173  temp1 = intermediate[8] * C12;
174  temp2 = intermediate[9] * C4;
175  temp1 = temp1 - temp2;
176  output[3] = 2 * (temp1 * C8);
177
178  temp1 = intermediate[8] * C4;
179  temp2 = intermediate[9] * C12;
180  temp1 = temp2 + temp1;
181  output[13] = 2 * (temp1 * C8);
182
183  output[9] = 2 * ((step[10] + step[11]) * C8);
184
185  intermediate[11] = step[10] - step[11];
186  intermediate[12] = step[12] + step[13];
187  intermediate[13] = step[12] - step[13];
188  intermediate[14] = step[8] - step[14];
189  intermediate[15] = step[9] - step[15];
190
191  output[15] = (intermediate[11] + intermediate[12]);
192  output[1] = -(intermediate[11] - intermediate[12]);
193
194  output[7] = 2 * (intermediate[13] * C8);
195
196  temp1 = intermediate[14] * C12;
197  temp2 = intermediate[15] * C4;
198  temp1 = temp1 - temp2;
199  output[11] = -2 * (temp1 * C8);
200
201  temp1 = intermediate[14] * C4;
202  temp2 = intermediate[15] * C12;
203  temp1 = temp2 + temp1;
204  output[5] = 2 * (temp1 * C8);
205}
206
207void reference_16x16_dct_2d(int16_t input[256], double output[256]) {
208  // First transform columns
209  for (int i = 0; i < 16; ++i) {
210    double temp_in[16], temp_out[16];
211    for (int j = 0; j < 16; ++j) temp_in[j] = input[j * 16 + i];
212    butterfly_16x16_dct_1d(temp_in, temp_out);
213    for (int j = 0; j < 16; ++j) output[j * 16 + i] = temp_out[j];
214  }
215  // Then transform rows
216  for (int i = 0; i < 16; ++i) {
217    double temp_in[16], temp_out[16];
218    for (int j = 0; j < 16; ++j) temp_in[j] = output[j + i * 16];
219    butterfly_16x16_dct_1d(temp_in, temp_out);
220    // Scale by some magic number
221    for (int j = 0; j < 16; ++j) output[j + i * 16] = temp_out[j] / 2;
222  }
223}
224
225typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
226typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
227typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
228                        int tx_type);
229typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
230                        int tx_type);
231
232typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct16x16Param;
233typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht16x16Param;
234typedef std::tr1::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t>
235    Idct16x16Param;
236
237void fdct16x16_ref(const int16_t *in, tran_low_t *out, int stride,
238                   int /*tx_type*/) {
239  vpx_fdct16x16_c(in, out, stride);
240}
241
242void idct16x16_ref(const tran_low_t *in, uint8_t *dest, int stride,
243                   int /*tx_type*/) {
244  vpx_idct16x16_256_add_c(in, dest, stride);
245}
246
247void fht16x16_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
248  vp9_fht16x16_c(in, out, stride, tx_type);
249}
250
251void iht16x16_ref(const tran_low_t *in, uint8_t *dest, int stride,
252                  int tx_type) {
253  vp9_iht16x16_256_add_c(in, dest, stride, tx_type);
254}
255
#if CONFIG_VP9_HIGHBITDEPTH
// High-bit-depth adapters. The highbd transforms take an extra bit-depth
// argument, so each wrapper below pins it to 10 or 12 to fit the IdctFunc /
// IhtFunc signatures. |out| is passed through CAST_TO_SHORTPTR(); the tests in
// this file hand these wrappers a uint16_t buffer via CAST_TO_BYTEPTR().

// Full (256-coefficient) C inverse DCT at 10 bits.
void idct16x16_10(const tran_low_t *in, uint8_t *out, int stride) {
  const int bd = 10;
  vpx_highbd_idct16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, bd);
}

// Full (256-coefficient) C inverse DCT at 12 bits.
void idct16x16_12(const tran_low_t *in, uint8_t *out, int stride) {
  const int bd = 12;
  vpx_highbd_idct16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, bd);
}

// IhtFunc-shaped form of the 10-bit inverse DCT; |tx_type| is ignored.
void idct16x16_10_ref(const tran_low_t *in, uint8_t *out, int stride,
                      int tx_type) {
  (void)tx_type;
  vpx_highbd_idct16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
}

// IhtFunc-shaped form of the 12-bit inverse DCT; |tx_type| is ignored.
void idct16x16_12_ref(const tran_low_t *in, uint8_t *out, int stride,
                      int tx_type) {
  (void)tx_type;
  vpx_highbd_idct16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
}

// C inverse hybrid transform at 10 bits.
void iht16x16_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
  const int bd = 10;
  vp9_highbd_iht16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, bd);
}

// C inverse hybrid transform at 12 bits.
void iht16x16_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
  const int bd = 12;
  vp9_highbd_iht16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, bd);
}

#if HAVE_SSE2
// C "_10_add" inverse DCT variant at 10 bits.
void idct16x16_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
  const int bd = 10;
  vpx_highbd_idct16x16_10_add_c(in, CAST_TO_SHORTPTR(out), stride, bd);
}

// C "_10_add" inverse DCT variant at 12 bits.
void idct16x16_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
  const int bd = 12;
  vpx_highbd_idct16x16_10_add_c(in, CAST_TO_SHORTPTR(out), stride, bd);
}

// SSE2 full inverse DCT at 10 bits.
void idct16x16_256_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
  const int bd = 10;
  vpx_highbd_idct16x16_256_add_sse2(in, CAST_TO_SHORTPTR(out), stride, bd);
}

// SSE2 full inverse DCT at 12 bits.
void idct16x16_256_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
  const int bd = 12;
  vpx_highbd_idct16x16_256_add_sse2(in, CAST_TO_SHORTPTR(out), stride, bd);
}

// SSE2 "_10_add" inverse DCT variant at 10 bits.
void idct16x16_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
  const int bd = 10;
  vpx_highbd_idct16x16_10_add_sse2(in, CAST_TO_SHORTPTR(out), stride, bd);
}

// SSE2 "_10_add" inverse DCT variant at 12 bits.
void idct16x16_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
  const int bd = 12;
  vpx_highbd_idct16x16_10_add_sse2(in, CAST_TO_SHORTPTR(out), stride, bd);
}
#endif  // HAVE_SSE2
#endif  // CONFIG_VP9_HIGHBITDEPTH
309
310class Trans16x16TestBase {
311 public:
312  virtual ~Trans16x16TestBase() {}
313
314 protected:
315  virtual void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) = 0;
316
317  virtual void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) = 0;
318
319  void RunAccuracyCheck() {
320    ACMRandom rnd(ACMRandom::DeterministicSeed());
321    uint32_t max_error = 0;
322    int64_t total_error = 0;
323    const int count_test_block = 10000;
324    for (int i = 0; i < count_test_block; ++i) {
325      DECLARE_ALIGNED(16, int16_t, test_input_block[kNumCoeffs]);
326      DECLARE_ALIGNED(16, tran_low_t, test_temp_block[kNumCoeffs]);
327      DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
328      DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
329#if CONFIG_VP9_HIGHBITDEPTH
330      DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
331      DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
332#endif
333
334      // Initialize a test block with input range [-mask_, mask_].
335      for (int j = 0; j < kNumCoeffs; ++j) {
336        if (bit_depth_ == VPX_BITS_8) {
337          src[j] = rnd.Rand8();
338          dst[j] = rnd.Rand8();
339          test_input_block[j] = src[j] - dst[j];
340#if CONFIG_VP9_HIGHBITDEPTH
341        } else {
342          src16[j] = rnd.Rand16() & mask_;
343          dst16[j] = rnd.Rand16() & mask_;
344          test_input_block[j] = src16[j] - dst16[j];
345#endif
346        }
347      }
348
349      ASM_REGISTER_STATE_CHECK(
350          RunFwdTxfm(test_input_block, test_temp_block, pitch_));
351      if (bit_depth_ == VPX_BITS_8) {
352        ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
353#if CONFIG_VP9_HIGHBITDEPTH
354      } else {
355        ASM_REGISTER_STATE_CHECK(
356            RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_));
357#endif
358      }
359
360      for (int j = 0; j < kNumCoeffs; ++j) {
361#if CONFIG_VP9_HIGHBITDEPTH
362        const int32_t diff =
363            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
364#else
365        const int32_t diff = dst[j] - src[j];
366#endif
367        const uint32_t error = diff * diff;
368        if (max_error < error) max_error = error;
369        total_error += error;
370      }
371    }
372
373    EXPECT_GE(1u << 2 * (bit_depth_ - 8), max_error)
374        << "Error: 16x16 FHT/IHT has an individual round trip error > 1";
375
376    EXPECT_GE(count_test_block << 2 * (bit_depth_ - 8), total_error)
377        << "Error: 16x16 FHT/IHT has average round trip error > 1 per block";
378  }
379
380  void RunCoeffCheck() {
381    ACMRandom rnd(ACMRandom::DeterministicSeed());
382    const int count_test_block = 1000;
383    DECLARE_ALIGNED(16, int16_t, input_block[kNumCoeffs]);
384    DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
385    DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
386
387    for (int i = 0; i < count_test_block; ++i) {
388      // Initialize a test block with input range [-mask_, mask_].
389      for (int j = 0; j < kNumCoeffs; ++j) {
390        input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
391      }
392
393      fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);
394      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
395
396      // The minimum quant value is 4.
397      for (int j = 0; j < kNumCoeffs; ++j)
398        EXPECT_EQ(output_block[j], output_ref_block[j]);
399    }
400  }
401
402  void RunMemCheck() {
403    ACMRandom rnd(ACMRandom::DeterministicSeed());
404    const int count_test_block = 1000;
405    DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]);
406    DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
407    DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
408
409    for (int i = 0; i < count_test_block; ++i) {
410      // Initialize a test block with input range [-mask_, mask_].
411      for (int j = 0; j < kNumCoeffs; ++j) {
412        input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
413      }
414      if (i == 0) {
415        for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = mask_;
416      } else if (i == 1) {
417        for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = -mask_;
418      }
419
420      fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
421      ASM_REGISTER_STATE_CHECK(
422          RunFwdTxfm(input_extreme_block, output_block, pitch_));
423
424      // The minimum quant value is 4.
425      for (int j = 0; j < kNumCoeffs; ++j) {
426        EXPECT_EQ(output_block[j], output_ref_block[j]);
427        EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j]))
428            << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
429      }
430    }
431  }
432
433  void RunQuantCheck(int dc_thred, int ac_thred) {
434    ACMRandom rnd(ACMRandom::DeterministicSeed());
435    const int count_test_block = 100000;
436    DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]);
437    DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
438
439    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
440    DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]);
441#if CONFIG_VP9_HIGHBITDEPTH
442    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
443    DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]);
444#endif
445
446    for (int i = 0; i < count_test_block; ++i) {
447      // Initialize a test block with input range [-mask_, mask_].
448      for (int j = 0; j < kNumCoeffs; ++j) {
449        input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
450      }
451      if (i == 0) {
452        for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = mask_;
453      }
454      if (i == 1) {
455        for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = -mask_;
456      }
457
458      fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
459
460      // clear reconstructed pixel buffers
461      memset(dst, 0, kNumCoeffs * sizeof(uint8_t));
462      memset(ref, 0, kNumCoeffs * sizeof(uint8_t));
463#if CONFIG_VP9_HIGHBITDEPTH
464      memset(dst16, 0, kNumCoeffs * sizeof(uint16_t));
465      memset(ref16, 0, kNumCoeffs * sizeof(uint16_t));
466#endif
467
468      // quantization with maximum allowed step sizes
469      output_ref_block[0] = (output_ref_block[0] / dc_thred) * dc_thred;
470      for (int j = 1; j < kNumCoeffs; ++j) {
471        output_ref_block[j] = (output_ref_block[j] / ac_thred) * ac_thred;
472      }
473      if (bit_depth_ == VPX_BITS_8) {
474        inv_txfm_ref(output_ref_block, ref, pitch_, tx_type_);
475        ASM_REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block, dst, pitch_));
476#if CONFIG_VP9_HIGHBITDEPTH
477      } else {
478        inv_txfm_ref(output_ref_block, CAST_TO_BYTEPTR(ref16), pitch_,
479                     tx_type_);
480        ASM_REGISTER_STATE_CHECK(
481            RunInvTxfm(output_ref_block, CAST_TO_BYTEPTR(dst16), pitch_));
482#endif
483      }
484      if (bit_depth_ == VPX_BITS_8) {
485        for (int j = 0; j < kNumCoeffs; ++j) EXPECT_EQ(ref[j], dst[j]);
486#if CONFIG_VP9_HIGHBITDEPTH
487      } else {
488        for (int j = 0; j < kNumCoeffs; ++j) EXPECT_EQ(ref16[j], dst16[j]);
489#endif
490      }
491    }
492  }
493
494  void RunInvAccuracyCheck() {
495    ACMRandom rnd(ACMRandom::DeterministicSeed());
496    const int count_test_block = 1000;
497    DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]);
498    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
499    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
500    DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
501#if CONFIG_VP9_HIGHBITDEPTH
502    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
503    DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
504#endif  // CONFIG_VP9_HIGHBITDEPTH
505
506    for (int i = 0; i < count_test_block; ++i) {
507      double out_r[kNumCoeffs];
508
509      // Initialize a test block with input range [-255, 255].
510      for (int j = 0; j < kNumCoeffs; ++j) {
511        if (bit_depth_ == VPX_BITS_8) {
512          src[j] = rnd.Rand8();
513          dst[j] = rnd.Rand8();
514          in[j] = src[j] - dst[j];
515#if CONFIG_VP9_HIGHBITDEPTH
516        } else {
517          src16[j] = rnd.Rand16() & mask_;
518          dst16[j] = rnd.Rand16() & mask_;
519          in[j] = src16[j] - dst16[j];
520#endif  // CONFIG_VP9_HIGHBITDEPTH
521        }
522      }
523
524      reference_16x16_dct_2d(in, out_r);
525      for (int j = 0; j < kNumCoeffs; ++j) {
526        coeff[j] = static_cast<tran_low_t>(round(out_r[j]));
527      }
528
529      if (bit_depth_ == VPX_BITS_8) {
530        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16));
531#if CONFIG_VP9_HIGHBITDEPTH
532      } else {
533        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), 16));
534#endif  // CONFIG_VP9_HIGHBITDEPTH
535      }
536
537      for (int j = 0; j < kNumCoeffs; ++j) {
538#if CONFIG_VP9_HIGHBITDEPTH
539        const uint32_t diff =
540            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
541#else
542        const uint32_t diff = dst[j] - src[j];
543#endif  // CONFIG_VP9_HIGHBITDEPTH
544        const uint32_t error = diff * diff;
545        EXPECT_GE(1u, error)
546            << "Error: 16x16 IDCT has error " << error << " at index " << j;
547      }
548    }
549  }
550
551  void CompareInvReference(IdctFunc ref_txfm, int thresh) {
552    ACMRandom rnd(ACMRandom::DeterministicSeed());
553    const int count_test_block = 10000;
554    const int eob = 10;
555    const int16_t *scan = vp9_default_scan_orders[TX_16X16].scan;
556    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
557    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
558    DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]);
559#if CONFIG_VP9_HIGHBITDEPTH
560    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
561    DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]);
562#endif  // CONFIG_VP9_HIGHBITDEPTH
563
564    for (int i = 0; i < count_test_block; ++i) {
565      for (int j = 0; j < kNumCoeffs; ++j) {
566        if (j < eob) {
567          // Random values less than the threshold, either positive or negative
568          coeff[scan[j]] = rnd(thresh) * (1 - 2 * (i % 2));
569        } else {
570          coeff[scan[j]] = 0;
571        }
572        if (bit_depth_ == VPX_BITS_8) {
573          dst[j] = 0;
574          ref[j] = 0;
575#if CONFIG_VP9_HIGHBITDEPTH
576        } else {
577          dst16[j] = 0;
578          ref16[j] = 0;
579#endif  // CONFIG_VP9_HIGHBITDEPTH
580        }
581      }
582      if (bit_depth_ == VPX_BITS_8) {
583        ref_txfm(coeff, ref, pitch_);
584        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
585      } else {
586#if CONFIG_VP9_HIGHBITDEPTH
587        ref_txfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_);
588        ASM_REGISTER_STATE_CHECK(
589            RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_));
590#endif  // CONFIG_VP9_HIGHBITDEPTH
591      }
592
593      for (int j = 0; j < kNumCoeffs; ++j) {
594#if CONFIG_VP9_HIGHBITDEPTH
595        const uint32_t diff =
596            bit_depth_ == VPX_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j];
597#else
598        const uint32_t diff = dst[j] - ref[j];
599#endif  // CONFIG_VP9_HIGHBITDEPTH
600        const uint32_t error = diff * diff;
601        EXPECT_EQ(0u, error) << "Error: 16x16 IDCT Comparison has error "
602                             << error << " at index " << j;
603      }
604    }
605  }
606
607  int pitch_;
608  int tx_type_;
609  vpx_bit_depth_t bit_depth_;
610  int mask_;
611  FhtFunc fwd_txfm_ref;
612  IhtFunc inv_txfm_ref;
613};
614
615class Trans16x16DCT : public Trans16x16TestBase,
616                      public ::testing::TestWithParam<Dct16x16Param> {
617 public:
618  virtual ~Trans16x16DCT() {}
619
620  virtual void SetUp() {
621    fwd_txfm_ = GET_PARAM(0);
622    inv_txfm_ = GET_PARAM(1);
623    tx_type_ = GET_PARAM(2);
624    bit_depth_ = GET_PARAM(3);
625    pitch_ = 16;
626    fwd_txfm_ref = fdct16x16_ref;
627    inv_txfm_ref = idct16x16_ref;
628    mask_ = (1 << bit_depth_) - 1;
629#if CONFIG_VP9_HIGHBITDEPTH
630    switch (bit_depth_) {
631      case VPX_BITS_10: inv_txfm_ref = idct16x16_10_ref; break;
632      case VPX_BITS_12: inv_txfm_ref = idct16x16_12_ref; break;
633      default: inv_txfm_ref = idct16x16_ref; break;
634    }
635#else
636    inv_txfm_ref = idct16x16_ref;
637#endif
638  }
639  virtual void TearDown() { libvpx_test::ClearSystemState(); }
640
641 protected:
642  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
643    fwd_txfm_(in, out, stride);
644  }
645  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
646    inv_txfm_(out, dst, stride);
647  }
648
649  FdctFunc fwd_txfm_;
650  IdctFunc inv_txfm_;
651};
652
653TEST_P(Trans16x16DCT, AccuracyCheck) { RunAccuracyCheck(); }
654
655TEST_P(Trans16x16DCT, CoeffCheck) { RunCoeffCheck(); }
656
657TEST_P(Trans16x16DCT, MemCheck) { RunMemCheck(); }
658
659TEST_P(Trans16x16DCT, QuantCheck) {
660  // Use maximally allowed quantization step sizes for DC and AC
661  // coefficients respectively.
662  RunQuantCheck(1336, 1828);
663}
664
665TEST_P(Trans16x16DCT, InvAccuracyCheck) { RunInvAccuracyCheck(); }
666
667class Trans16x16HT : public Trans16x16TestBase,
668                     public ::testing::TestWithParam<Ht16x16Param> {
669 public:
670  virtual ~Trans16x16HT() {}
671
672  virtual void SetUp() {
673    fwd_txfm_ = GET_PARAM(0);
674    inv_txfm_ = GET_PARAM(1);
675    tx_type_ = GET_PARAM(2);
676    bit_depth_ = GET_PARAM(3);
677    pitch_ = 16;
678    fwd_txfm_ref = fht16x16_ref;
679    inv_txfm_ref = iht16x16_ref;
680    mask_ = (1 << bit_depth_) - 1;
681#if CONFIG_VP9_HIGHBITDEPTH
682    switch (bit_depth_) {
683      case VPX_BITS_10: inv_txfm_ref = iht16x16_10; break;
684      case VPX_BITS_12: inv_txfm_ref = iht16x16_12; break;
685      default: inv_txfm_ref = iht16x16_ref; break;
686    }
687#else
688    inv_txfm_ref = iht16x16_ref;
689#endif
690  }
691  virtual void TearDown() { libvpx_test::ClearSystemState(); }
692
693 protected:
694  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
695    fwd_txfm_(in, out, stride, tx_type_);
696  }
697  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
698    inv_txfm_(out, dst, stride, tx_type_);
699  }
700
701  FhtFunc fwd_txfm_;
702  IhtFunc inv_txfm_;
703};
704
705TEST_P(Trans16x16HT, AccuracyCheck) { RunAccuracyCheck(); }
706
707TEST_P(Trans16x16HT, CoeffCheck) { RunCoeffCheck(); }
708
709TEST_P(Trans16x16HT, MemCheck) { RunMemCheck(); }
710
711TEST_P(Trans16x16HT, QuantCheck) {
712  // The encoder skips any non-DC intra prediction modes,
713  // when the quantization step size goes beyond 988.
714  RunQuantCheck(429, 729);
715}
716
717class InvTrans16x16DCT : public Trans16x16TestBase,
718                         public ::testing::TestWithParam<Idct16x16Param> {
719 public:
720  virtual ~InvTrans16x16DCT() {}
721
722  virtual void SetUp() {
723    ref_txfm_ = GET_PARAM(0);
724    inv_txfm_ = GET_PARAM(1);
725    thresh_ = GET_PARAM(2);
726    bit_depth_ = GET_PARAM(3);
727    pitch_ = 16;
728    mask_ = (1 << bit_depth_) - 1;
729  }
730  virtual void TearDown() { libvpx_test::ClearSystemState(); }
731
732 protected:
733  void RunFwdTxfm(int16_t * /*in*/, tran_low_t * /*out*/, int /*stride*/) {}
734  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
735    inv_txfm_(out, dst, stride);
736  }
737
738  IdctFunc ref_txfm_;
739  IdctFunc inv_txfm_;
740  int thresh_;
741};
742
743TEST_P(InvTrans16x16DCT, CompareReference) {
744  CompareInvReference(ref_txfm_, thresh_);
745}
746
747using std::tr1::make_tuple;
748
749#if CONFIG_VP9_HIGHBITDEPTH
750INSTANTIATE_TEST_CASE_P(
751    C, Trans16x16DCT,
752    ::testing::Values(
753        make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_10, 0, VPX_BITS_10),
754        make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_12, 0, VPX_BITS_12),
755        make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, 0, VPX_BITS_8)));
756#else
757INSTANTIATE_TEST_CASE_P(C, Trans16x16DCT,
758                        ::testing::Values(make_tuple(&vpx_fdct16x16_c,
759                                                     &vpx_idct16x16_256_add_c,
760                                                     0, VPX_BITS_8)));
761#endif  // CONFIG_VP9_HIGHBITDEPTH
762
763#if CONFIG_VP9_HIGHBITDEPTH
764INSTANTIATE_TEST_CASE_P(
765    C, Trans16x16HT,
766    ::testing::Values(
767        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 0, VPX_BITS_10),
768        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 1, VPX_BITS_10),
769        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 2, VPX_BITS_10),
770        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 3, VPX_BITS_10),
771        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 0, VPX_BITS_12),
772        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 1, VPX_BITS_12),
773        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 2, VPX_BITS_12),
774        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 3, VPX_BITS_12),
775        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8),
776        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
777        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
778        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
779#else
780INSTANTIATE_TEST_CASE_P(
781    C, Trans16x16HT,
782    ::testing::Values(
783        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8),
784        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
785        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
786        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
787#endif  // CONFIG_VP9_HIGHBITDEPTH
788
// NEON forward and inverse DCT, 8-bit only.
#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(NEON, Trans16x16DCT,
                        ::testing::Values(make_tuple(
                            &vpx_fdct16x16_neon, &vpx_idct16x16_256_add_neon,
                            0, VPX_BITS_8)));
#endif  // HAVE_NEON && !CONFIG_EMULATE_HARDWARE
795
// SSE2 transforms in builds without high bit depth support: both directions
// use the SSE2 implementation.
#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(SSE2, Trans16x16DCT,
                        ::testing::Values(make_tuple(
                            &vpx_fdct16x16_sse2, &vpx_idct16x16_256_add_sse2,
                            0, VPX_BITS_8)));
// One entry per tx_type (0..3).
INSTANTIATE_TEST_CASE_P(
    SSE2, Trans16x16HT,
    ::testing::Values(
        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 0,
                   VPX_BITS_8),
        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 1,
                   VPX_BITS_8),
        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 2,
                   VPX_BITS_8),
        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 3,
                   VPX_BITS_8)));
#endif  // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
812
// SSE2 transforms in high bit depth builds. Each DCT entry pairs one SSE2
// transform with a C counterpart for the other direction.
#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
    SSE2, Trans16x16DCT,
    ::testing::Values(
        // 10-bit: SSE2 forward, then SSE2 inverse.
        make_tuple(&vpx_highbd_fdct16x16_sse2, &idct16x16_10, 0, VPX_BITS_10),
        make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_256_add_10_sse2, 0,
                   VPX_BITS_10),
        // 12-bit: SSE2 forward, then SSE2 inverse.
        make_tuple(&vpx_highbd_fdct16x16_sse2, &idct16x16_12, 0, VPX_BITS_12),
        make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_256_add_12_sse2, 0,
                   VPX_BITS_12),
        // 8-bit: SSE2 forward with the C inverse.
        make_tuple(&vpx_fdct16x16_sse2, &vpx_idct16x16_256_add_c, 0,
                   VPX_BITS_8)));
// 8-bit SSE2 forward hybrid transform with the C inverse, one entry per
// tx_type (0..3).
INSTANTIATE_TEST_CASE_P(
    SSE2, Trans16x16HT,
    ::testing::Values(
        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8),
        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 3,
                   VPX_BITS_8)));
// Optimizations take effect at a threshold of 3155, so we use a value close to
// that to test both branches.
// Tuple fields are <reference inverse, inverse under test, threshold, bit
// depth>.
INSTANTIATE_TEST_CASE_P(
    SSE2, InvTrans16x16DCT,
    ::testing::Values(
        // 10-bit
        make_tuple(&idct16x16_10_add_10_c, &idct16x16_10_add_10_sse2, 3167,
                   VPX_BITS_10),
        make_tuple(&idct16x16_10, &idct16x16_256_add_10_sse2, 3167,
                   VPX_BITS_10),
        // 12-bit
        make_tuple(&idct16x16_10_add_12_c, &idct16x16_10_add_12_sse2, 3167,
                   VPX_BITS_12),
        make_tuple(&idct16x16_12, &idct16x16_256_add_12_sse2, 3167,
                   VPX_BITS_12)));
#endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
846
// MSA transforms, 8-bit only.
#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
    MSA, Trans16x16DCT,
    ::testing::Values(make_tuple(&vpx_fdct16x16_msa,
                                 &vpx_idct16x16_256_add_msa, 0, VPX_BITS_8)));
// One entry per tx_type (0..3).
INSTANTIATE_TEST_CASE_P(
    MSA, Trans16x16HT,
    ::testing::Values(
        make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 0,
                   VPX_BITS_8),
        make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 1,
                   VPX_BITS_8),
        make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 2,
                   VPX_BITS_8),
        make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 3,
                   VPX_BITS_8)));
#endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
861
// VSX inverse DCT, paired with the C forward DCT; 8-bit only.
#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
    VSX, Trans16x16DCT,
    ::testing::Values(make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_vsx,
                                 0, VPX_BITS_8)));
#endif  // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
868}  // namespace
869