// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// kernel_reference.h: a reference kernel for CPU architectures where we don't
// have optimized kernels yet. Also useful for testing, as it's templatized
// to have any arbitrary format, allowing tests to cover all sorts of corner
// cases.
1975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
2075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#ifndef GEMMLOWP_INTERNAL_KERNEL_REFERENCE_H_
2175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#define GEMMLOWP_INTERNAL_KERNEL_REFERENCE_H_
2275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
23544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang#include "kernel.h"
2475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
2575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#include <cstdio>
267b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#include <cstring>
2775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
2875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobnamespace gemmlowp {
2975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
3075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// This kernel is templatized in an arbitrary Format template parameter,
3175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// allowing it to have any arbitrary format.
3275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobtemplate <typename tFormat>
3375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobstruct ReferenceKernel : KernelBase {
3475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  typedef tFormat Format;
3575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
3675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  const char* Name() const override {
3775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    static char buf[256];
3875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    snprintf(buf, sizeof(buf),
3975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob             "reference(Lhs: %d cells %dx%d %s, Rhs: %d cells %dx%d %s)",
4075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob             Format::Lhs::kCells, Format::Lhs::Cell::kWidth,
4175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob             Format::Lhs::Cell::kDepth,
4275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob             CellOrderName(Format::Lhs::Cell::kOrder), Format::Rhs::kCells,
4375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob             Format::Rhs::Cell::kDepth, Format::Rhs::Cell::kWidth,
4475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob             CellOrderName(Format::Rhs::Cell::kOrder));
4575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    return buf;
4675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  }
4775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
487b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride,
497b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang           std::size_t dst_col_stride, const std::uint8_t* lhs_ptr,
507b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang           const std::uint8_t* rhs_ptr, std::size_t start_depth,
517b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang           std::size_t run_depth) const override {
5275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    std::int32_t accumulator[Format::kRows * Format::kCols];
5375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    memset(accumulator, 0, sizeof(accumulator));
5475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
557b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    const int run_depth_cells = static_cast<int>(run_depth / Format::kDepth);
5675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
5775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    // The outer loop is over the depth dimension.
5875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    for (int dc = 0; dc < run_depth_cells; dc++) {
5975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      // The next two loops are over cells of the Lhs (stacked vertically),
6075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      // and over cells of the Rhs (stacked horizontally).
6175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      for (int rc = 0; rc < Format::Lhs::kCells; rc++) {
6275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        const std::uint8_t* lhs_cell_ptr = lhs_ptr +
6375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob                                           (dc * Format::Lhs::kCells + rc) *
6475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob                                               Format::Lhs::Cell::kWidth *
6575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob                                               Format::kDepth;
6675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        for (int cc = 0; cc < Format::Rhs::kCells; cc++) {
6775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob          const std::uint8_t* rhs_cell_ptr = rhs_ptr +
6875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob                                             (dc * Format::Rhs::kCells + cc) *
6975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob                                                 Format::Rhs::Cell::kWidth *
7075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob                                                 Format::kDepth;
7175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
7275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob          // Now we are inside one cell of the Lhs and inside one cell
7375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob          // of the Rhs, so the remaining inner loops are just
7475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob          // traditional three loops of matrix multiplication.
7575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob          for (int di = 0; di < Format::kDepth; di++) {
7675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob            for (int ri = 0; ri < Format::Lhs::Cell::kWidth; ri++) {
7775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob              for (int ci = 0; ci < Format::Rhs::Cell::kWidth; ci++) {
7875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob                const std::uint8_t* lhs_coeff_ptr =
7975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob                    lhs_cell_ptr +
8075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob                    OffsetIntoCell<typename Format::Lhs::Cell>(ri, di);
8175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob                const std::uint8_t* rhs_coeff_ptr =
8275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob                    rhs_cell_ptr +
8375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob                    OffsetIntoCell<typename Format::Rhs::Cell>(ci, di);
8475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob                std::int32_t* accumulator_coeff_ptr =
8575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob                    accumulator + (ri + rc * Format::Lhs::Cell::kWidth) +
8675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob                    (ci + cc * Format::Rhs::Cell::kWidth) * Format::kRows;
8775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob                *accumulator_coeff_ptr +=
8875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob                    std::int32_t(*lhs_coeff_ptr) * std::int32_t(*rhs_coeff_ptr);
8975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob              }
9075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob            }
9175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob          }
9275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        }
9375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      }
9475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    }
9575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
9675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    if (start_depth == 0) {
9775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      // start_depth == 0 means we haven't accumulated anything yet, so we need
9875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      // to overwrite the accumulator, as it hasn't been initialized to zero.
9975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      for (int r = 0; r < Format::kRows; r++) {
10075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        for (int c = 0; c < Format::kCols; c++) {
10175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob          dst_ptr[r * dst_row_stride + c * dst_col_stride] =
10275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob              accumulator[r + c * Format::kRows];
10375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        }
10475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      }
10575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    } else {
10675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      // We have already accumulated stuff, so we need to continue accumulating
10775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      // instead of just overwriting.
10875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      for (int r = 0; r < Format::kRows; r++) {
10975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        for (int c = 0; c < Format::kCols; c++) {
11075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob          dst_ptr[r * dst_row_stride + c * dst_col_stride] +=
11175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob              accumulator[r + c * Format::kRows];
11275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        }
11375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      }
11475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    }
11575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  }
11675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob};
11775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
11875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob}  // namespace gemmlowp
11975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
12075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#endif  // GEMMLOWP_INTERNAL_KERNEL_REFERENCE_H_
121