// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// kernel_reference.h: a reference kernel for CPU architectures where we don't
// have optimized kernels yet. Also useful for testing, as it's templatized
// to have any arbitrary format, allowing tests to cover all sorts of corner
// cases.
1975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 2075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#ifndef GEMMLOWP_INTERNAL_KERNEL_REFERENCE_H_ 2175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#define GEMMLOWP_INTERNAL_KERNEL_REFERENCE_H_ 2275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 23544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang#include "kernel.h" 2475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 2575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#include <cstdio> 267b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#include <cstring> 2775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 2875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobnamespace gemmlowp { 2975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 3075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// This kernel is templatized in an arbitrary Format template parameter, 3175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// allowing it to have any arbitrary format. 3275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobtemplate <typename tFormat> 3375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobstruct ReferenceKernel : KernelBase { 3475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob typedef tFormat Format; 3575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 3675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob const char* Name() const override { 3775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob static char buf[256]; 3875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob snprintf(buf, sizeof(buf), 3975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob "reference(Lhs: %d cells %dx%d %s, Rhs: %d cells %dx%d %s)", 4075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob Format::Lhs::kCells, Format::Lhs::Cell::kWidth, 4175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob Format::Lhs::Cell::kDepth, 4275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob CellOrderName(Format::Lhs::Cell::kOrder), Format::Rhs::kCells, 4375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 
Format::Rhs::Cell::kDepth, Format::Rhs::Cell::kWidth, 4475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob CellOrderName(Format::Rhs::Cell::kOrder)); 4575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob return buf; 4675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 4775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 487b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride, 497b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang std::size_t dst_col_stride, const std::uint8_t* lhs_ptr, 507b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang const std::uint8_t* rhs_ptr, std::size_t start_depth, 517b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang std::size_t run_depth) const override { 5275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob std::int32_t accumulator[Format::kRows * Format::kCols]; 5375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob memset(accumulator, 0, sizeof(accumulator)); 5475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 557b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang const int run_depth_cells = static_cast<int>(run_depth / Format::kDepth); 5675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 5775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // The outer loop is over the depth dimension. 5875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob for (int dc = 0; dc < run_depth_cells; dc++) { 5975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // The next two loops are over cells of the Lhs (stacked vertically), 6075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // and over cells of the Rhs (stacked horizontally). 
6175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob for (int rc = 0; rc < Format::Lhs::kCells; rc++) { 6275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob const std::uint8_t* lhs_cell_ptr = lhs_ptr + 6375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob (dc * Format::Lhs::kCells + rc) * 6475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob Format::Lhs::Cell::kWidth * 6575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob Format::kDepth; 6675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob for (int cc = 0; cc < Format::Rhs::kCells; cc++) { 6775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob const std::uint8_t* rhs_cell_ptr = rhs_ptr + 6875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob (dc * Format::Rhs::kCells + cc) * 6975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob Format::Rhs::Cell::kWidth * 7075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob Format::kDepth; 7175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 7275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // Now we are inside one cell of the Lhs and inside one cell 7375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // of the Rhs, so the remaining inner loops are just 7475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // traditional three loops of matrix multiplication. 
7575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob for (int di = 0; di < Format::kDepth; di++) { 7675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob for (int ri = 0; ri < Format::Lhs::Cell::kWidth; ri++) { 7775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob for (int ci = 0; ci < Format::Rhs::Cell::kWidth; ci++) { 7875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob const std::uint8_t* lhs_coeff_ptr = 7975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob lhs_cell_ptr + 8075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob OffsetIntoCell<typename Format::Lhs::Cell>(ri, di); 8175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob const std::uint8_t* rhs_coeff_ptr = 8275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob rhs_cell_ptr + 8375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob OffsetIntoCell<typename Format::Rhs::Cell>(ci, di); 8475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob std::int32_t* accumulator_coeff_ptr = 8575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob accumulator + (ri + rc * Format::Lhs::Cell::kWidth) + 8675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob (ci + cc * Format::Rhs::Cell::kWidth) * Format::kRows; 8775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob *accumulator_coeff_ptr += 8875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob std::int32_t(*lhs_coeff_ptr) * std::int32_t(*rhs_coeff_ptr); 8975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 9075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 9175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 9275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 9375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 9475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 9575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 9675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob if (start_depth == 0) { 9775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // start_depth == 0 means we haven't accumulated anything yet, so we need 9875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit 
Jacob // to overwrite the accumulator, as it hasn't been initialized to zero. 9975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob for (int r = 0; r < Format::kRows; r++) { 10075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob for (int c = 0; c < Format::kCols; c++) { 10175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob dst_ptr[r * dst_row_stride + c * dst_col_stride] = 10275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob accumulator[r + c * Format::kRows]; 10375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 10475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 10575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } else { 10675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // We have already accumulated stuff, so we need to continue accumulating 10775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // instead of just overwriting. 10875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob for (int r = 0; r < Format::kRows; r++) { 10975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob for (int c = 0; c < Format::kCols; c++) { 11075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob dst_ptr[r * dst_row_stride + c * dst_col_stride] += 11175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob accumulator[r + c * Format::kRows]; 11275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 11375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 11475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 11575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 11675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob}; 11775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 11875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob} // namespace gemmlowp 11975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 12075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#endif // GEMMLOWP_INTERNAL_KERNEL_REFERENCE_H_ 121