1a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Copyright 2016 The Gemmlowp Authors. All Rights Reserved. 2a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 3a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Licensed under the Apache License, Version 2.0 (the "License"); 4a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// you may not use this file except in compliance with the License. 5a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// You may obtain a copy of the License at 6a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 7a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// http://www.apache.org/licenses/LICENSE-2.0 8a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 9a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Unless required by applicable law or agreed to in writing, software 10a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// distributed under the License is distributed on an "AS IS" BASIS, 11a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// See the License for the specific language governing permissions and 13a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// limitations under the License. 14a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 15a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// This is a standalone testbed and benchmark for gemmlowp-style GEMM kernels, 16a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// either doing integer or float arithmetic. 17a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// It verifies that a kernel produces correct results, then benchmarks it. 
18a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 19a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Some benchmark results are recorded in this spreadsheet: 20a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 21a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// https://docs.google.com/spreadsheets/d/1UPbzbp9rdsD6RXxOr5q6AZ0n1omgEknLYO2ogiw6Kqk/edit?usp=sharing 22a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 23a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// This program is entirely self-contained, and can be compiled manually 24a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// such as suggested in the command lines below. 25a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// It currently supports only Android/ARM but would trivially generalize to 26a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// other OSes (it's mostly standard POSIX) or architectures (each kernel 27a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// targets a specific architecture, one may simply add more). 
28a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 29a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang/* 30a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Build and run this benchmark on Android/ARM/32bit: 31a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang ~/android/toolchains/arm-linux-androideabi/bin/arm-linux-androideabi-clang++ \ 32a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang -fPIE -pie -O3 --std=c++11 standalone/neon-gemm-kernel-benchmark.cc -o \ 33a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang /tmp/benchmark -mfloat-abi=softfp -mfpu=neon-vfpv4 && adb push /tmp/benchmark \ 34a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang /data/local/tmp && adb shell /data/local/tmp/benchmark 35a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Build and run this benchmark on Android/ARM/64bit: 36a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang ~/android/toolchains/aarch64-linux-android/bin/aarch64-linux-android-clang++ \ 37a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang -fPIE -static -O3 --std=c++11 standalone/neon-gemm-kernel-benchmark.cc -o \ 38a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang /tmp/benchmark && adb push /tmp/benchmark /data/local/tmp && adb shell \ 39a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang /data/local/tmp/benchmark 40a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang */ 41a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 42a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// For big.LITTLE devices, use 'taskset' to select which cores to benchmark. 43a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 44a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// The syntax is: taskset <mask> <commandline> 45a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// where mask is a binary mask where each bit corresponds to a core, 46a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// and low bits are little cores. 
47a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 48a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Examples: 49a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Nexus 5X big cores: taskset 30 50a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Nexus 5X little cores: taskset 0f 51a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Pixel XL big cores: taskset 0c 52a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Pixel XL little cores: taskset 03 53a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 54a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Full example: 55a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// adb shell taskset 0c /data/local/tmp/benchmark 56a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 57a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#include <sched.h> 58a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#include <unistd.h> 59a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 60a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#include <algorithm> 61a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#include <cassert> 62a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#include <cstdint> 63a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#include <cstdlib> 647d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#include <cstring> 65a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#include <iostream> 66a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#include <random> 67a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#include <type_traits> 68a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 697d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#if !defined(__arm__) && !defined(__aarch64__) && \ 707d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang !(defined(__mips) && (__mips_isa_rev >= 5) && defined(__mips_msa)) 717d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#error This benchmark assumes ARM or MIPS (for intrinsics and inline assembly sections). 
72a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif 73a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 747d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#if defined(__arm__) || defined(__aarch64__) 75a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#include <arm_neon.h> 767d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#endif 777d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 787d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#if defined(__mips) 797d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#include <msa.h> 807d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 817d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// Some convenience macros to hide differences between MIPS32 and MIPS64. 827d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#ifdef __LP64__ 837d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#define GEMMLOWP_MIPS_XADDIU "daddiu" 847d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#else 857d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#define GEMMLOWP_MIPS_XADDIU "addiu" 867d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#endif 877d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#endif 88a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 89a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Typically one wants to fit in L1 cache, and GEMM implementations 90a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// are carefully optimized to tune their access patterns to that effect. 91a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Most devices have at least 16k of L1 cache. The Kraits have exactly 16k. 92a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangconst int kDefaultCacheSizeK = 16; 93a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 94a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangconst int kCacheLineSize = 64; 95a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 96a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// These definitions are used for labels within assembly code. 
Required for 97a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// iOS toolchain compatibility. 98a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#define GEMMLOWP_LABEL_AFTER_LOOP "1" 99a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#define GEMMLOWP_LABEL_LOOP "2" 100a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#define GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES "3" 101a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#define GEMMLOWP_LABEL_STORE "4" 102a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 103a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// BEGIN code copied from gemmlowp/internal/kernel.h 104a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 105a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Explanation of general gemmlowp terminology 106a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// =========================================== 107a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 108a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// We use the following abbreviations: 109a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// LHS = "left-hand side" 110a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// RHS = "right-hand side" 111a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Sometimes when referring to either LHS or RHS, we just say a "Side". 112a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 113a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// In a matrix product of a MxK matrix times a KxN matrix, 114a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// we call K the 'depth'. Note that M is the number of rows 115a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// of the result (and of the LHS), and N is the number of columns 116a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// of the result (and of the RHS). 
117a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 118a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// In each of the LHS and RHS matrices, we call 'width' the 119a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// other dimension, besides the depth. So in the LHS, 'width' 120a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// is the number of rows, while in the RHS, 'width' is the number 121a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// of columns. 122a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 123a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// So in the LHS MxK matrix, the depth is K and the width is M. 124a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// And in the RHS KxN matrix, the depth is K and the width is N. 125a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 126a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// This is illustrated in this picture: 127a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 128a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// RHS width 129a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// <-----------------> 130a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// +-----------------+ ^ 131a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// | RHS | | Depth 132a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// +-----------------+ v 133a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// ^ +--+ +-----------------+ 134a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// | |L | | | 135a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// LHS width | |H | | Result | 136a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// | |S | | | 137a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// v +--+ +-----------------+ 138a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// <--> 139a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Depth 140a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 141a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Explanation of gemmlowp kernel formats and "cells" 
142a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// ================================================== 143a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 144a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Kernels operate on small LHS and RHS blocks that fit in registers. 145a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// These blocks are stored contiguously in memory, but not always 146a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// in a traditional column-major or row-major order; instead, 147a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// they consist of a number of sub-blocks, which we call "cells", 148a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// that are stored in column-major or row-major order. However, 149a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// what really matters to us is not so much rows vs columns, but 150a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// rather width vs depth. So we refer to "width-major" and "depth-major" 151a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// storage orders. In the LHS, width-major means row-major, 152a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// while in the RHS, width-major means column-major. 153a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// There is also a third possibility, "diagonal order", 154a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// which is unused at the moment. 155a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 156a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// We aim to treat both sides, LHS and RHS, on an equal footing, 157a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// so we call them both 'sides'. 
A KernelFormat thus is just a pair 158a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// of KernelSideFormat's, one for LHS and one for RHS; each KernelSideFormat 159a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// contains a CellFormat and a number of cells; cells are only ever 160a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// stacked in the width dimension, which means stacked vertically in the 161a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// LHS and stacked horizontally in the RHS. 162a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 163a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Example 164a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// ======= 165a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 166a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Let's work out the data layout expected by a kernel having the 167a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// following format (the struct names here are defined below in this file): 168a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 169a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// KernelFormat< 170a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// KernelSideFormat<CellFormat<3, 4>, 3>, 171a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// KernelSideFormat<CellFormat<5, 4>, 2> 172a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// > 173a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 174a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// The LHS format, KernelSideFormat<CellFormat<3, 4>, 3>, means: 175a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 3 cells, each cell having dimensions (width=3, depth=4), laid out in 176a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// DepthMajor order (the default value, see CellFormat). 
In the LHS, 177a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// DepthMajor means column-major, so the LHS cells are of size 3x4 in 178a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// column-major order, so the LHS layout is: 179a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 180a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 0 3 6 9 181a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 1 4 7 10 182a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 2 5 8 11 183a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 12 15 18 21 184a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 13 16 19 22 185a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 14 17 20 23 186a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 24 27 30 33 187a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 25 28 31 34 188a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 26 29 32 35 189a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 190a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// The RHS format, KernelSideFormat<CellFormat<5, 4>, 2>, means: 191a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 2 cells each having dimensions (width=5, depth=4), laid out in 192a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// DepthMajor order (the default value, see CellFormat). 
//   In the RHS, DepthMajor means row-major, so the RHS cells are of size 4x5
//   in row-major order, so the RHS layout is:
//
//   0  1  2  3  4  20 21 22 23 24
//   5  6  7  8  9  25 26 27 28 29
//   10 11 12 13 14 30 31 32 33 34
//   15 16 17 18 19 35 36 37 38 39

// CellOrder enumerates the possible storage orders (=layouts) for
// a cell (see explanation above).
enum class CellOrder { DepthMajor, WidthMajor, Diagonal };

// CellFormat describes how data is laid out in a cell. That is, a CellOrder
// together with actual dimensions.
//
// Template parameters:
//   tWidth - extent of the cell along the width dimension.
//   tDepth - extent of the cell along the depth dimension.
//   tOrder - storage order of the coefficients inside the cell. It defaults
//            to CellOrder::DepthMajor so that the two-argument spelling used
//            in the format walkthrough above (e.g. CellFormat<3, 4>, described
//            there as "the default value") is valid. Every existing
//            three-argument use is unaffected.
//
// Exposed constants:
//   kWidth, kDepth, kOrder - the template arguments, republished.
//   kSize                  - number of coefficients in one cell
//                            (kWidth * kDepth).
template <int tWidth, int tDepth, CellOrder tOrder = CellOrder::DepthMajor>
struct CellFormat {
  static const int kWidth = tWidth;
  static const int kDepth = tDepth;
  static const CellOrder kOrder = tOrder;

  static const int kSize = kWidth * kDepth;
};

// KernelSideFormat describes how data is laid out in a kernel side
// (i.e. LHS or RHS). That is, a CellFormat together with a number of
// cells. These cells are always stacked in the Width dimension.
// For example, in the LHS case, the Width dimension is the rows dimension,
// so we're saying that in the LHS, cells are stacked vertically.
// We never stack cells in the Depth dimension.
222a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <typename tCellFormat, int tCells> 223a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct KernelSideFormat { 224a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef tCellFormat Cell; 225a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static const int kCells = tCells; 226a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static const int kWidth = kCells * Cell::kWidth; 227a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static const int kDepth = Cell::kDepth; 228a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}; 229a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 230a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// KernelFormat describes fully the input data layout that a kernel expects. 231a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// It consists of two KernelSideFormat's, one for LHS and one for RHS. 232a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <typename tLhs, typename tRhs> 233a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct KernelFormat { 234a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef tLhs Lhs; 235a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef tRhs Rhs; 236a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 237a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static_assert(Lhs::Cell::kDepth == Rhs::Cell::kDepth, ""); 238a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static const int kDepth = Lhs::Cell::kDepth; 239a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static const int kRows = Lhs::Cell::kWidth * Lhs::kCells; 240a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static const int kCols = Rhs::Cell::kWidth * Rhs::kCells; 241a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}; 242a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 243a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline const char* CellOrderName(CellOrder o) { 244a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang switch (o) { 
245a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang case CellOrder::DepthMajor: 246a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang return "DepthMajor"; 247a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang case CellOrder::WidthMajor: 248a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang return "WidthMajor"; 249a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang case CellOrder::Diagonal: 250a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang return "Diagonal"; 251a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang default: 252a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang assert(false); 253a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang return nullptr; 254a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 255a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang} 256a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 257a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Returns the offset into a cell, at which a given coefficient is stored. 258a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <typename CellFormat> 259a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline int OffsetIntoCell(int w, int d) { 260a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang switch (CellFormat::kOrder) { 261a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang case CellOrder::DepthMajor: 262a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang return w + d * CellFormat::kWidth; 263a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang case CellOrder::WidthMajor: 264a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang return d + w * CellFormat::kDepth; 265a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang case CellOrder::Diagonal: 266a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang assert(CellFormat::kWidth == CellFormat::kDepth); 267a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static const int size = CellFormat::kWidth; 268a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang return ((size + w - d) * size + d) % (size * size); 269a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang default: 
270a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang assert(false); 271a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang return 0; 272a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 273a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang} 274a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 275a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// END code copied from gemmlowp/internal/kernel.h 276a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 277a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef __arm__ 278a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 279a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// This is the current standard kernel in gemmlowp, see: 280a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// https://github.com/google/gemmlowp/blob/b1e2a29ff866680028f3080efc244e10e8dd7f46/internal/kernel_neon.h#L33 281a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_32bit_GEMM_Uint8Operands_Uint32Accumulators { 282a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef std::uint8_t OperandType; 283a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef std::uint32_t AccumulatorType; 284a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef KernelFormat< 285a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 3>, 286a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 1> > 287a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Format; 288a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr, 289a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang AccumulatorType* accum_ptr, int depth) { 290a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang asm volatile( 291a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Load 1 Rhs cell of size 2x4 292a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.8 {d0}, [%[rhs_ptr]]!\n" 293a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao 
Wang // Load 3 Lhs cells of size 4x2 each 294a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.8 {d2}, [%[lhs_ptr]]!\n" 295a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.8 {d4}, [%[lhs_ptr]]!\n" 296a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.8 {d6}, [%[lhs_ptr]]!\n" 297a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Load accumulators 298a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "mov r0, %[accum_ptr]\n" 299a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d8, d9}, [r0]!\n" 300a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d16, d17}, [r0]!\n" 301a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d24, d25}, [r0]!\n" 302a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d10, d11}, [r0]!\n" 303a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d18, d19}, [r0]!\n" 304a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d26, d27}, [r0]!\n" 305a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d12, d13}, [r0]!\n" 306a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d20, d21}, [r0]!\n" 307a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d28, d29}, [r0]!\n" 308a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d14, d15}, [r0]!\n" 309a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d22, d23}, [r0]!\n" 310a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d30, d31}, [r0]!\n" 311a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 312a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "subs %[depth], #2\n" 313a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 314a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "beq " GEMMLOWP_LABEL_AFTER_LOOP "f\n" 315a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 316a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang GEMMLOWP_LABEL_LOOP 317a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang ":\n" 318a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Overview of register layout: 
319a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 320a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // A 2x4 cell of Rhs is stored in 16bit in d0--d1 (q0). 321a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in d2--d7 322a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // (q1--q3). 323a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // A 12x4 block of accumulators is stored in 32bit in q4--q15. 324a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 325a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // +-----+-----+-----+-----+ 326a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |d0[0]|d0[1]|d0[2]|d0[3]| 327a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Rhs +-----+-----+-----+-----+ 328a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |d1[0]|d1[1]|d1[2]|d1[3]| 329a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // +-----+-----+-----+-----+ 330a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 331a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // | | | | | 332a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 333a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Lhs | | | | | 334a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 335a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // +--+--+ - - - - +-----+-----+-----+-----+ 336a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |d2|d3| | q4 | q5 | q6 | q7 | 337a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |d2|d3| | q4 | q5 | q6 | q7 | 338a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |d2|d3| | q4 | q5 | q6 | q7 | 339a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |d2|d3| | q4 | q5 | q6 | q7 | 340a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // +--+--+ - - - - +-----+-----+-----+-----+ 341a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |d4|d5| | q8 | q9 | q10 | q11 | 342a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |d4|d5| | q8 | q9 | q10 | q11 | 
343a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |d4|d5| | q8 | q9 | q10 | q11 | 344a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |d4|d5| | q8 | q9 | q10 | q11 | 345a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // +--+--+ - - - - +-----+-----+-----+-----+ 346a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |d6|d7| | q12 | q13 | q14 | q15 | 347a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |d6|d7| | q12 | q13 | q14 | q15 | 348a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |d6|d7| | q12 | q13 | q14 | q15 | 349a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |d6|d7| | q12 | q13 | q14 | q15 | 350a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // +--+--+ - - - - +-----+-----+-----+-----+ 351a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 352a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Accumulator 353a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 354a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Expand Lhs/Rhs cells to 16 bit. 355a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Note: moving theses vmovls further down to allow for 356a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // longer data pipelining helps a little on A57 but is 357a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // harmful on A53 --- It looks as if A53 doesn't like 358a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // interleaving vmovl's into the vmlal's. 
359a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmovl.u8 q0, d0\n" 360a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmovl.u8 q1, d2\n" 361a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmovl.u8 q2, d4\n" 362a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmovl.u8 q3, d6\n" 363a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 364a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Multiply-accumulate, level of depth 0 365a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q4, d2, d0[0]\n" 366a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q5, d2, d0[1]\n" 367a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q6, d2, d0[2]\n" 368a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q7, d2, d0[3]\n" 369a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vldr d2, [%[lhs_ptr]]\n" 370a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q8, d4, d0[0]\n" 371a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q9, d4, d0[1]\n" 372a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q10, d4, d0[2]\n" 373a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q11, d4, d0[3]\n" 374a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vldr d4, [%[lhs_ptr], #8]\n" 375a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q12, d6, d0[0]\n" 376a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q13, d6, d0[1]\n" 377a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q14, d6, d0[2]\n" 378a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q15, d6, d0[3]\n" 379a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vldr d6, [%[lhs_ptr], #16]\n" 380a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vldr d0, [%[rhs_ptr]]\n" 381a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 382a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Multiply-accumulate, level of depth 1 383a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q4, d3, d1[0]\n" 
384a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q5, d3, d1[1]\n" 385a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "add %[lhs_ptr], #24\n" 386a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q6, d3, d1[2]\n" 387a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q7, d3, d1[3]\n" 388a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "add %[rhs_ptr], #8\n" 389a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q8, d5, d1[0]\n" 390a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q9, d5, d1[1]\n" 391a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "subs %[depth], #2\n" 392a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q10, d5, d1[2]\n" 393a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q11, d5, d1[3]\n" 394a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q12, d7, d1[0]\n" 395a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q13, d7, d1[1]\n" 396a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q14, d7, d1[2]\n" 397a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q15, d7, d1[3]\n" 398a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 399a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "bne " GEMMLOWP_LABEL_LOOP "b\n" 400a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 401a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang GEMMLOWP_LABEL_AFTER_LOOP 402a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang ":\n" 403a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 404a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Expand Lhs/Rhs cells to 16 bit. 
405a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmovl.u8 q0, d0\n" 406a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmovl.u8 q1, d2\n" 407a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmovl.u8 q2, d4\n" 408a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmovl.u8 q3, d6\n" 409a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 410a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Multiply-accumulate, level of depth 0 411a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q4, d2, d0[0]\n" 412a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q5, d2, d0[1]\n" 413a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q6, d2, d0[2]\n" 414a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q7, d2, d0[3]\n" 415a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q8, d4, d0[0]\n" 416a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q9, d4, d0[1]\n" 417a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q10, d4, d0[2]\n" 418a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q11, d4, d0[3]\n" 419a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q12, d6, d0[0]\n" 420a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q13, d6, d0[1]\n" 421a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q14, d6, d0[2]\n" 422a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q15, d6, d0[3]\n" 423a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 424a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Multiply-accumulate, level of depth 1 425a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q4, d3, d1[0]\n" 426a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q5, d3, d1[1]\n" 427a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q6, d3, d1[2]\n" 428a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q7, d3, d1[3]\n" 429a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q8, d5, d1[0]\n" 
430a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q9, d5, d1[1]\n" 431a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q10, d5, d1[2]\n" 432a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q11, d5, d1[3]\n" 433a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q12, d7, d1[0]\n" 434a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q13, d7, d1[1]\n" 435a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q14, d7, d1[2]\n" 436a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.u16 q15, d7, d1[3]\n" 437a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 438a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Store accumulators 439a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "mov r0, %[accum_ptr]\n" 440a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d8, d9}, [r0]!\n" 441a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d16, d17}, [r0]!\n" 442a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d24, d25}, [r0]!\n" 443a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d10, d11}, [r0]!\n" 444a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d18, d19}, [r0]!\n" 445a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d26, d27}, [r0]!\n" 446a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d12, d13}, [r0]!\n" 447a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d20, d21}, [r0]!\n" 448a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d28, d29}, [r0]!\n" 449a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d14, d15}, [r0]!\n" 450a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d22, d23}, [r0]!\n" 451a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d30, d31}, [r0]!\n" 452a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // outputs 453a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 454a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 
[depth] "+r"(depth) 455a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // inputs 456a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [accum_ptr] "r"(accum_ptr) 457a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // clobbers 458a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", 459a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", 460a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", 461a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "d28", "d29", "d30", "d31"); 462a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 463a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}; 464a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 465a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// This is Maciek Chociej's fast kernel not expanding operands, 466a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// from gemmlowp/meta/. 
Search for 467a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// mul_3x8_3x8_int32_lhsadd_rhsadd 468a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// in this file: 469a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// https://raw.githubusercontent.com/google/gemmlowp/e4b9d858b6637d5d0058bfa3d869d2b95864251b/meta/single_thread_gemm.h 470a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_32bit_GEMM_Uint8Operands_Uint32Accumulators_noexpand { 471a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef std::uint8_t OperandType; 472a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef std::uint32_t AccumulatorType; 473a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef KernelFormat< 474a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<3, 8, CellOrder::WidthMajor>, 1>, 475a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<3, 8, CellOrder::WidthMajor>, 1> > 476a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Format; 477a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr, 478a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang AccumulatorType* accum_ptr, int depth) { 479a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang asm volatile( 480a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Clear aggregators. 
481a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmov.i32 q0, #0\n" 482a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmov.i32 q1, #0\n" 483a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmov.i32 q2, #0\n" 484a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmov.i32 q3, q0\n" 485a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmov.i32 q4, q1\n" 486a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmov.i32 q5, q2\n" 487a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmov.i32 q6, q3\n" 488a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmov.i32 q7, q4\n" 489a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmov.i32 q8, q5\n" 490a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 491a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Loop head 492a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang GEMMLOWP_LABEL_LOOP 493a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang ":\n" 494a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 495a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Subtract counter. 
496a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "subs %[depth], %[depth], #8\n" 497a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 498a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.8 {d18, d19, d20}, [%[rhs_ptr]]!\n" 499a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.8 {d21, d22, d23}, [%[lhs_ptr]]!\n" 500a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmull.u8 q12, d18, d21\n" 501a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmull.u8 q13, d18, d22\n" 502a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmull.u8 q14, d18, d23\n" 503a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmull.u8 q15, d19, d21\n" 504a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadal.u16 q0, q12\n" 505a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadal.u16 q1, q13\n" 506a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadal.u16 q2, q14\n" 507a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadal.u16 q3, q15\n" 508a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmull.u8 q12, d19, d22\n" 509a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmull.u8 q13, d19, d23\n" 510a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmull.u8 q14, d20, d21\n" 511a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmull.u8 q15, d20, d22\n" 512a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmull.u8 q9, d20, d23\n" 513a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadal.u16 q4, q12\n" 514a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadal.u16 q5, q13\n" 515a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadal.u16 q6, q14\n" 516a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadal.u16 q7, q15\n" 517a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadal.u16 q8, q9\n" 518a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 519a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Loop branch 520a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "bne " GEMMLOWP_LABEL_LOOP 521a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "b\n" 
522a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 523a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Horizontal reduce aggregators, step 1 524a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadd.u32 d0, d0, d1\n" 525a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadd.u32 d2, d2, d3\n" 526a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadd.u32 d4, d4, d5\n" 527a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadd.u32 d6, d6, d7\n" 528a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadd.u32 d8, d8, d9\n" 529a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadd.u32 d10, d10, d11\n" 530a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadd.u32 d12, d12, d13\n" 531a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadd.u32 d14, d14, d15\n" 532a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadd.u32 d16, d16, d17\n" 533a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 534a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Horizontal reduce aggregators, step 2 535a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadd.u32 d0, d0, d2\n" 536a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadd.u32 d1, d4, d4\n" 537a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadd.u32 d6, d6, d8\n" 538a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadd.u32 d7, d10, d10\n" 539a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadd.u32 d12, d12, d14\n" 540a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadd.u32 d13, d16, d16\n" 541a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 542a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Load accumulators 543a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "mov r0, %[accum_ptr]\n" 544a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d2}, [r0]!\n" 545a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d3[0]}, [r0]!\n" 546a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 547a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d8}, [r0]!\n" 
548a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d9[0]}, [r0]!\n" 549a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 550a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d14}, [r0]!\n" 551a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d15[0]}, [r0]!\n" 552a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 553a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Accumulate 554a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vadd.s32 q0, q0, q1\n" 555a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vadd.s32 q3, q3, q4\n" 556a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vadd.s32 q6, q6, q7\n" 557a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 558a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Store accumulators 559a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "mov r0, %[accum_ptr]\n" 560a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d0}, [r0]!\n" 561a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d1[0]}, [r0]!\n" 562a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 563a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d6}, [r0]!\n" 564a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d7[0]}, [r0]!\n" 565a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 566a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d12}, [r0]!\n" 567a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d13[0]}, [r0]!\n" 568a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // outputs 569a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 570a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [depth] "+r"(depth) 571a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // inputs 572a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [accum_ptr] "r"(accum_ptr) 573a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // clobbers 574a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", 
"d7", 575a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", 576a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", 577a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "d28", "d29", "d30", "d31"); 578a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 579a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}; 580a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 581a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Fast kernel operating on int8 operands. 582a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// It is assumed that one of the two int8 operands only takes values 583a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// in [-127, 127], while the other may freely range in [-128, 127]. 584a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// The issue with both operands taking the value -128 is that: 585a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// -128*-128 + -128*-128 == -32768 overflows int16. 586a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Every other expression a*b + c*d, for any int8 a,b,c,d, fits in int16 587a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// range. That is the basic idea of this kernel. 
588a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_32bit_GEMM_Int8Operands_AccumTwoWithin16Bits { 589a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef std::int8_t OperandType; 590a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef std::int32_t AccumulatorType; 591a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef KernelFormat< 592a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<4, 16, CellOrder::WidthMajor>, 1>, 593a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<2, 16, CellOrder::WidthMajor>, 1> > 594a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Format; 595a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr, 596a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang AccumulatorType* accum_ptr, int depth) { 597a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang std::size_t start_depth = 123; 598a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang std::size_t run_depth = depth; 599a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang std::size_t dst_col_stride = 4; 600a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang AccumulatorType* dst_ptr = accum_ptr; 601a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang asm volatile( 602a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 603a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Overview of register layout: 604a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 605a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // A 2x16 block of Rhs is stored in 8 bit in d0--d3. 606a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // A 4x16 block of Lhs is stored in 8 bit in d4--d7. That is only 607a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // half of the register space required, so we loop over these registers 608a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // twice. 
Only half of it, a 2x16 block, is stored in d4--d7 at 609a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // any given time. 610a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 611a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // A 4x2 block of accumulators is stored in q8--q15 (as 4x32 bit 612a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // components which need to be horizontally-added at the end) 613a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 614a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // The Lhs vectors are multiplied by the Rhs vectors with a widening 615a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // multiply over the 8 first levels of depth, producing int16x8 616a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // vectors of products for each position in the accumulator matrix. 617a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Here comes the special trick: since the operands are signed int8, 618a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // their range being [ -2^7 , 2^7 ), their products are in range 619a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // [ -2^14 , 2^14 - 1 ), meaning that we can add two such values 620a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // without any risk of overflowing int16. 621a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // We thus proceed with the 8 next levels of depth, multiplying 622a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // again Lhs by Rhs, accumulating into this existing int16x8 vector. 623a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 624a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Only then, having processed 16 levels of depth, do we need to 625a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // horizontally add these int16x8 accumulators into the final 626a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // int32x4 accumulators. 
627a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 628a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // As we do not have enough registers to store all 16 int16x8 629a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // temporary-16bit-accumulators, we have them cycle through q4--q7. 630a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 631a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 632a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Register layout (ignoring the q4--q7 temporary 16bit accumulators): 633a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 634a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // +----+----+ 635a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // | d0 | d2 | 636a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // | . | . | 637a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // | . | . | 638a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // | . | . | 639a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Rhs +----+----+ 640a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // | d1 | d3 | 641a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // | . | . | 642a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // | . | . | 643a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // | . | . | 644a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // +----+----+ 645a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 646a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // | | | 647a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 648a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Lhs | | | 649a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 650a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // +--------+--------+ - - - - +----+----+ 651a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // | d4 ... | d5 ... | | q8 | q9 | 652a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // | d6 ... | d7 ... | | q10| q11| 653a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // | d4 ... | d5 ... 
| | q12| q13| 654a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // | d6 ... | d7 ... | | q14| q15| 655a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // +--------+--------+ - - - - +----+----+ 656a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 657a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Accumulator 658a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 659a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 660a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Clear accumulators, and, interleaved with it, 661a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // initial loads of the first loop iteration, 662a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // taken out of the loop so that in the loop itself we have 663a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // optimal streaming of data from memory. 664a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vldr d0, [%[rhs_ptr], #0]\n" 665a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmov.i32 q8, #0\n" 666a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vldr d4, [%[lhs_ptr], #0]\n" 667a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmov.i32 q9, #0\n" 668a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vldr d2, [%[rhs_ptr], #16]\n" 669a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmov.i32 q10, q8\n" 670a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vldr d6, [%[lhs_ptr], #16]\n" 671a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmov.i32 q11, q8\n" 672a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vldr d1, [%[rhs_ptr], #8]\n" 673a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmov.i32 q12, q8\n" 674a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vldr d5, [%[lhs_ptr], #8]\n" 675a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmov.i32 q13, q8\n" 676a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vldr d3, [%[rhs_ptr], #24]\n" 677a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmov.i32 q14, q8\n" 678a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vldr 
d7, [%[lhs_ptr], #24]\n" 679a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmov.i32 q15, q8\n" 680a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 681a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // General loop. 682a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang GEMMLOWP_LABEL_LOOP 683a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang ":\n" 684a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 685a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Multiply 8 first levels of depth. 686a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmull.s8 q4, d0, d4\n" 687a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "add %[rhs_ptr], %[rhs_ptr], #32\n" 688a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmull.s8 q5, d2, d4\n" 689a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vldr d4, [%[lhs_ptr], #32]\n" 690a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmull.s8 q6, d0, d6\n" 691a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmull.s8 q7, d2, d6\n" 692a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vldr d6, [%[lhs_ptr], #48]\n" 693a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 694a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Multiply-accumulate second-half, again into the same 695a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 16bit local accumulator registers. This is where we 696a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // take advantage of having int8 instead of uint8 and therefore 697a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // being able to accumulate two products into int16. 
698a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.s8 q4, d1, d5\n" 699a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.s8 q5, d3, d5\n" 700a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vldr d5, [%[lhs_ptr], #40]\n" 701a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.s8 q6, d1, d7\n" 702a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.s8 q7, d3, d7\n" 703a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vldr d7, [%[lhs_ptr], #56]\n" 704a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 705a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Add pairwise, accumulate into 32-bit accumulators. 706a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadal.s16 q8, q4\n" 707a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "add %[lhs_ptr], %[lhs_ptr], #64\n" 708a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadal.s16 q9, q5\n" 709a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "subs %[run_depth], %[run_depth], #16\n" 710a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadal.s16 q10, q6\n" 711a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadal.s16 q11, q7\n" 712a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 713a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "beq " GEMMLOWP_LABEL_AFTER_LOOP 714a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "f\n" 715a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 716a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Multiply first half. 
717a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmull.s8 q4, d0, d4\n" 718a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmull.s8 q5, d2, d4\n" 719a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vldr d4, [%[lhs_ptr], #0]\n" 720a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmull.s8 q6, d0, d6\n" 721a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vldr d0, [%[rhs_ptr], #0]\n" 722a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmull.s8 q7, d2, d6\n" 723a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vldr d2, [%[rhs_ptr], #16]\n" 724a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 725a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Multiply-accumulate second-half, again into the same 726a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 16bit local accumulator registers. This is where we 727a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // take advantage of having int8 instead of uint8 and therefore 728a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // being able to accumulate two products into int16. 729a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.s8 q4, d1, d5\n" 730a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vldr d6, [%[lhs_ptr], #16]\n" 731a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.s8 q5, d3, d5\n" 732a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vldr d5, [%[lhs_ptr], #8]\n" 733a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.s8 q6, d1, d7\n" 734a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vldr d1, [%[rhs_ptr], #8]\n" 735a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.s8 q7, d3, d7\n" 736a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vldr d3, [%[rhs_ptr], #24]\n" 737a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 738a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Add pairwise, accumulate into 32-bit accumulators. 
739a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadal.s16 q12, q4\n" 740a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vldr d7, [%[lhs_ptr], #24]\n" 741a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadal.s16 q13, q5\n" 742a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadal.s16 q14, q6\n" 743a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadal.s16 q15, q7\n" 744a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 745a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "b " GEMMLOWP_LABEL_LOOP "b\n" 746a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 747a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang GEMMLOWP_LABEL_AFTER_LOOP 748a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang ":\n" 749a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 750a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Multiply first half. 751a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmull.s8 q4, d0, d4\n" 752a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmull.s8 q5, d2, d4\n" 753a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmull.s8 q6, d0, d6\n" 754a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmull.s8 q7, d2, d6\n" 755a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 756a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Multiply-accumulate second-half, again into the same 757a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 16bit local accumulator registers. This is where we 758a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // take advantage of having int8 instead of uint8 and therefore 759a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // being able to accumulate two products into int16. 
760a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.s8 q4, d1, d5\n" 761a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.s8 q5, d3, d5\n" 762a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.s8 q6, d1, d7\n" 763a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmlal.s8 q7, d3, d7\n" 764a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 765a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Add pairwise, accumulate into 32-bit accumulators. 766a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadal.s16 q12, q4\n" 767a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadal.s16 q13, q5\n" 768a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadal.s16 q14, q6\n" 769a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadal.s16 q15, q7\n" 770a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "cmp %[start_depth], #0\n" 771a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 772a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Reduce 32bit accumulators horizontally. 773a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadd.s32 d0, d16, d17\n" 774a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadd.s32 d1, d18, d19\n" 775a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadd.s32 d2, d20, d21\n" 776a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadd.s32 d3, d22, d23\n" 777a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadd.s32 d4, d24, d25\n" 778a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadd.s32 d5, d26, d27\n" 779a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadd.s32 d6, d28, d29\n" 780a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadd.s32 d7, d30, d31\n" 781a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 782a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "bne " GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES 783a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "f\n" 784a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 785a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Reduce 32bit accumulators 
horizontally, second pass 786a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // (each pass adds pairwise. we need to add 4-wise). 787a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadd.s32 d8, d0, d2\n" 788a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadd.s32 d9, d4, d6\n" 789a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadd.s32 d10, d1, d3\n" 790a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadd.s32 d11, d5, d7\n" 791a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 792a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "b " GEMMLOWP_LABEL_STORE "f\n" 793a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 794a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES 795a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang ":\n" 796a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 797a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Reduce 32bit accumulators horizontally, second pass 798a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // (each pass adds pairwise. we need to add 4-wise), 799a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // and load destination values from memory. 
800a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "mov r0, %[dst_ptr]\n" 801a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d16, d17}, [r0]!\n" 802a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadd.s32 d8, d0, d2\n" 803a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadd.s32 d9, d4, d6\n" 804a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d18, d19}, [r0]\n" 805a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadd.s32 d10, d1, d3\n" 806a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vpadd.s32 d11, d5, d7\n" 807a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 808a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Add horizontally-reduced accumulators into 809a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // the values loaded from memory 810a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vadd.s32 q4, q8, q4\n" 811a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vadd.s32 q5, q9, q5\n" 812a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 813a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang GEMMLOWP_LABEL_STORE 814a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang ":\n" 815a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Store back into memory 816a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "mov r0, %[dst_ptr]\n" 817a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d8, d9}, [r0]!\n" 818a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d10, d11}, [r0]\n" 819a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // outputs 820a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 821a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [dst_ptr] "+r"(dst_ptr), [run_depth] "+r"(run_depth) 822a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // inputs 823a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [start_depth] "r"(start_depth) 824a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // clobbers 825a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 
"cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", 826a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", 827a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", 828a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "d28", "d29", "d30", "d31"); 829a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 830a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}; 831a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 832a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// We don't actually use int32*int32 in production. This is just an 833a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// experiment to help dissociate the effect of integer-vs-float, from the 834a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// effect of operands width. 835a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_32bit_GEMM_Int32_WithScalar { 836a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef std::int32_t OperandType; 837a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef std::int32_t AccumulatorType; 838a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef KernelFormat< 839a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>, 840a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 1> > 841a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Format; 842a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr, 843a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang AccumulatorType* accum_ptr, int depth) { 844a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang asm volatile( 845a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Load accumulators 846a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "mov r0, %[accum_ptr]\n" 
847a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d8, d9}, [r0]!\n" 848a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d16, d17}, [r0]!\n" 849a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d24, d25}, [r0]!\n" 850a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d10, d11}, [r0]!\n" 851a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d18, d19}, [r0]!\n" 852a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d26, d27}, [r0]!\n" 853a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d12, d13}, [r0]!\n" 854a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d20, d21}, [r0]!\n" 855a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d28, d29}, [r0]!\n" 856a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d14, d15}, [r0]!\n" 857a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d22, d23}, [r0]!\n" 858a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d30, d31}, [r0]!\n" 859a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 860a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang GEMMLOWP_LABEL_LOOP 861a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang ":\n" 862a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 863a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Load 1 Rhs cell of size 1x4 864a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d0, d1}, [%[rhs_ptr]]!\n" 865a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 866a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Load 3 Lhs cells of size 4x1 each 867a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d2, d3}, [%[lhs_ptr]]!\n" 868a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d4, d5}, [%[lhs_ptr]]!\n" 869a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d6, d7}, [%[lhs_ptr]]!\n" 870a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 871a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Multiply-accumulate 872a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 
"vmla.s32 q4, q1, d0[0]\n" 873a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.s32 q5, q1, d0[1]\n" 874a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.s32 q6, q1, d1[0]\n" 875a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.s32 q7, q1, d1[1]\n" 876a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.s32 q8, q2, d0[0]\n" 877a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.s32 q9, q2, d0[1]\n" 878a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.s32 q10, q2, d1[0]\n" 879a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.s32 q11, q2, d1[1]\n" 880a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.s32 q12, q3, d0[0]\n" 881a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.s32 q13, q3, d0[1]\n" 882a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.s32 q14, q3, d1[0]\n" 883a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.s32 q15, q3, d1[1]\n" 884a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 885a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Loop. Decrement loop index (depth) by 1, since we just handled 1 886a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // level of depth. 
887a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "subs %[depth], #1\n" 888a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "bne " GEMMLOWP_LABEL_LOOP 889a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "b\n" 890a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 891a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Store accumulators 892a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "mov r0, %[accum_ptr]\n" 893a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d8, d9}, [r0]!\n" 894a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d16, d17}, [r0]!\n" 895a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d24, d25}, [r0]!\n" 896a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d10, d11}, [r0]!\n" 897a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d18, d19}, [r0]!\n" 898a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d26, d27}, [r0]!\n" 899a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d12, d13}, [r0]!\n" 900a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d20, d21}, [r0]!\n" 901a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d28, d29}, [r0]!\n" 902a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d14, d15}, [r0]!\n" 903a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d22, d23}, [r0]!\n" 904a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d30, d31}, [r0]!\n" 905a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // outputs 906a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 907a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [depth] "+r"(depth) 908a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // inputs 909a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [accum_ptr] "r"(accum_ptr) 910a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // clobbers 911a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", 
912a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", 913a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", 914a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "d28", "d29", "d30", "d31"); 915a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 916a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}; 917a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 918a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Not very efficient kernel, just an experiment to see what we can do 919a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// without using NEON multiply-with-scalar instructions. 920a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_32bit_GEMM_Float32_MLA_WithVectorDuplicatingScalar { 921a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef float OperandType; 922a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef float AccumulatorType; 923a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef KernelFormat< 924a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>, 925a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 1> > 926a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Format; 927a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr, 928a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang AccumulatorType* accum_ptr, int depth) { 929a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang asm volatile( 930a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Load accumulators 931a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "mov r0, %[accum_ptr]\n" 932a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d8, d9}, [r0]!\n" 933a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d16, d17}, [r0]!\n" 
934a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d24, d25}, [r0]!\n" 935a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d10, d11}, [r0]!\n" 936a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d18, d19}, [r0]!\n" 937a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d26, d27}, [r0]!\n" 938a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d12, d13}, [r0]!\n" 939a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d20, d21}, [r0]!\n" 940a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d28, d29}, [r0]!\n" 941a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d14, d15}, [r0]!\n" 942a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d22, d23}, [r0]!\n" 943a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d30, d31}, [r0]!\n" 944a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 945a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang GEMMLOWP_LABEL_LOOP 946a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang ":\n" 947a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 948a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Load 3 Lhs cells of size 4x1 each 949a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d2, d3}, [%[lhs_ptr]]!\n" 950a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d4, d5}, [%[lhs_ptr]]!\n" 951a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d6, d7}, [%[lhs_ptr]]!\n" 952a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 953a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Multiply-accumulate 954a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n" 955a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q4, q1, q0\n" 956a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q8, q2, q0\n" 957a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q12, q3, q0\n" 958a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n" 
959a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q5, q1, q0\n" 960a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q9, q2, q0\n" 961a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q13, q3, q0\n" 962a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n" 963a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q6, q1, q0\n" 964a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q10, q2, q0\n" 965a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q14, q3, q0\n" 966a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n" 967a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q7, q1, q0\n" 968a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q11, q2, q0\n" 969a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q15, q3, q0\n" 970a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 971a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Loop. Decrement loop index (depth) by 1, since we just handled 1 972a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // level of depth. 
973a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "subs %[depth], #1\n" 974a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "bne " GEMMLOWP_LABEL_LOOP 975a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "b\n" 976a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 977a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Store accumulators 978a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "mov r0, %[accum_ptr]\n" 979a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d8, d9}, [r0]!\n" 980a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d16, d17}, [r0]!\n" 981a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d24, d25}, [r0]!\n" 982a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d10, d11}, [r0]!\n" 983a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d18, d19}, [r0]!\n" 984a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d26, d27}, [r0]!\n" 985a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d12, d13}, [r0]!\n" 986a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d20, d21}, [r0]!\n" 987a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d28, d29}, [r0]!\n" 988a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d14, d15}, [r0]!\n" 989a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d22, d23}, [r0]!\n" 990a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d30, d31}, [r0]!\n" 991a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // outputs 992a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 993a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [depth] "+r"(depth) 994a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // inputs 995a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [accum_ptr] "r"(accum_ptr) 996a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // clobbers 997a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", 
998a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", 999a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", 1000a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "d28", "d29", "d30", "d31"); 1001a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 1002a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}; 1003a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1004a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Not very efficient kernel, just an experiment to see what we can do 1005a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// without using NEON multiply-with-scalar instructions. 1006a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// This variant is relevant as on ARMv7 FMA does not have a with-scalar variant. 1007a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_32bit_GEMM_Float32_FMA_WithVectorDuplicatingScalar { 1008a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef float OperandType; 1009a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef float AccumulatorType; 1010a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef KernelFormat< 1011a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>, 1012a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 1> > 1013a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Format; 1014a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr, 1015a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang AccumulatorType* accum_ptr, int depth) { 1016a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang asm volatile( 1017a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Load accumulators 1018a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "mov r0, %[accum_ptr]\n" 
1019a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d8, d9}, [r0]!\n" 1020a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d16, d17}, [r0]!\n" 1021a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d24, d25}, [r0]!\n" 1022a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d10, d11}, [r0]!\n" 1023a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d18, d19}, [r0]!\n" 1024a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d26, d27}, [r0]!\n" 1025a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d12, d13}, [r0]!\n" 1026a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d20, d21}, [r0]!\n" 1027a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d28, d29}, [r0]!\n" 1028a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d14, d15}, [r0]!\n" 1029a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d22, d23}, [r0]!\n" 1030a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d30, d31}, [r0]!\n" 1031a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1032a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang GEMMLOWP_LABEL_LOOP 1033a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang ":\n" 1034a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1035a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Load 3 Lhs cells of size 4x1 each 1036a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d2, d3}, [%[lhs_ptr]]!\n" 1037a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d4, d5}, [%[lhs_ptr]]!\n" 1038a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d6, d7}, [%[lhs_ptr]]!\n" 1039a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1040a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Multiply-accumulate 1041a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n" 1042a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vfma.f32 q4, q1, q0\n" 1043a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vfma.f32 q8, q2, q0\n" 
1044a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vfma.f32 q12, q3, q0\n" 1045a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n" 1046a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vfma.f32 q5, q1, q0\n" 1047a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vfma.f32 q9, q2, q0\n" 1048a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vfma.f32 q13, q3, q0\n" 1049a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n" 1050a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vfma.f32 q6, q1, q0\n" 1051a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vfma.f32 q10, q2, q0\n" 1052a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vfma.f32 q14, q3, q0\n" 1053a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n" 1054a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vfma.f32 q7, q1, q0\n" 1055a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vfma.f32 q11, q2, q0\n" 1056a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vfma.f32 q15, q3, q0\n" 1057a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1058a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Loop. Decrement loop index (depth) by 1, since we just handled 1 1059a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // level of depth. 
1060a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "subs %[depth], #1\n" 1061a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "bne " GEMMLOWP_LABEL_LOOP 1062a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "b\n" 1063a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1064a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Store accumulators 1065a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "mov r0, %[accum_ptr]\n" 1066a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d8, d9}, [r0]!\n" 1067a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d16, d17}, [r0]!\n" 1068a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d24, d25}, [r0]!\n" 1069a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d10, d11}, [r0]!\n" 1070a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d18, d19}, [r0]!\n" 1071a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d26, d27}, [r0]!\n" 1072a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d12, d13}, [r0]!\n" 1073a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d20, d21}, [r0]!\n" 1074a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d28, d29}, [r0]!\n" 1075a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d14, d15}, [r0]!\n" 1076a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d22, d23}, [r0]!\n" 1077a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d30, d31}, [r0]!\n" 1078a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // outputs 1079a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 1080a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [depth] "+r"(depth) 1081a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // inputs 1082a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [accum_ptr] "r"(accum_ptr) 1083a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // clobbers 1084a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", 
"d5", "d6", "d7", 1085a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", 1086a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", 1087a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "d28", "d29", "d30", "d31"); 1088a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 1089a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}; 1090a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1091a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// This is the "most natural" kernel, using NEON multiply-with-scalar 1092a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// instructions. 1093a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_32bit_GEMM_Float32_MLA_WithScalar { 1094a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef float OperandType; 1095a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef float AccumulatorType; 1096a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef KernelFormat< 1097a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>, 1098a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 1> > 1099a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Format; 1100a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr, 1101a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang AccumulatorType* accum_ptr, int depth) { 1102a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang asm volatile( 1103a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Load accumulators 1104a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "mov r0, %[accum_ptr]\n" 1105a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d8, d9}, [r0]!\n" 1106a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d16, d17}, [r0]!\n" 
1107a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d24, d25}, [r0]!\n" 1108a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d10, d11}, [r0]!\n" 1109a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d18, d19}, [r0]!\n" 1110a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d26, d27}, [r0]!\n" 1111a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d12, d13}, [r0]!\n" 1112a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d20, d21}, [r0]!\n" 1113a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d28, d29}, [r0]!\n" 1114a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d14, d15}, [r0]!\n" 1115a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d22, d23}, [r0]!\n" 1116a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d30, d31}, [r0]!\n" 1117a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1118a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang GEMMLOWP_LABEL_LOOP 1119a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang ":\n" 1120a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1121a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Load 1 Rhs cell of size 1x4 1122a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d0, d1}, [%[rhs_ptr]]!\n" 1123a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1124a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Load 3 Lhs cells of size 4x1 each 1125a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d2, d3}, [%[lhs_ptr]]!\n" 1126a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d4, d5}, [%[lhs_ptr]]!\n" 1127a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d6, d7}, [%[lhs_ptr]]!\n" 1128a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1129a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Multiply-accumulate 1130a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q4, q1, d0[0]\n" 1131a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q5, q1, d0[1]\n" 
1132a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q6, q1, d1[0]\n" 1133a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q7, q1, d1[1]\n" 1134a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q8, q2, d0[0]\n" 1135a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q9, q2, d0[1]\n" 1136a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q10, q2, d1[0]\n" 1137a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q11, q2, d1[1]\n" 1138a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q12, q3, d0[0]\n" 1139a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q13, q3, d0[1]\n" 1140a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q14, q3, d1[0]\n" 1141a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q15, q3, d1[1]\n" 1142a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1143a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Loop. Decrement loop index (depth) by 1, since we just handled 1 1144a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // level of depth. 
1145a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "subs %[depth], #1\n" 1146a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "bne " GEMMLOWP_LABEL_LOOP 1147a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "b\n" 1148a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1149a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Store accumulators 1150a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "mov r0, %[accum_ptr]\n" 1151a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d8, d9}, [r0]!\n" 1152a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d16, d17}, [r0]!\n" 1153a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d24, d25}, [r0]!\n" 1154a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d10, d11}, [r0]!\n" 1155a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d18, d19}, [r0]!\n" 1156a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d26, d27}, [r0]!\n" 1157a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d12, d13}, [r0]!\n" 1158a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d20, d21}, [r0]!\n" 1159a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d28, d29}, [r0]!\n" 1160a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d14, d15}, [r0]!\n" 1161a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d22, d23}, [r0]!\n" 1162a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d30, d31}, [r0]!\n" 1163a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // outputs 1164a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 1165a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [depth] "+r"(depth) 1166a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // inputs 1167a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [accum_ptr] "r"(accum_ptr) 1168a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // clobbers 1169a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", 
"d5", "d6", "d7",
        "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17",
        "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
        "d28", "d29", "d30", "d31");
  }
};

// Faster kernel contributed by ARM in 64bit form
// (see NEON_64bit_GEMM_Float32_WithScalar_A53) then ported to 32bit code.
// Tuned for A53.
//
// Accumulates the product of a 12x1 Lhs block (3 cells of 4 floats) and a
// 1x4 Rhs cell into a 12x4 float accumulator block, once per level of depth.
//
// Run() arguments:
//   lhs_ptr   - Lhs data; each loop iteration consumes 48 bytes (3 cells of
//               4 floats).
//   rhs_ptr   - Rhs data; each loop iteration consumes 16 bytes (1 cell of
//               4 floats).
//   accum_ptr - 48 floats, loaded before the loop and stored back after it.
//               They are transferred in the register order q4, q8, q12, q5,
//               q9, q13, ..., i.e. one 12-entry column of the diagram below
//               at a time.
//   depth     - loop trip count. The counter is only tested after the loop
//               body has run, so it is expected to be >= 1.
//
// The loop is software-pipelined: the Rhs cell and the 1st Lhs cell of
// iteration N+1 are loaded during iteration N. The last iteration still
// performs these loads, so the kernel reads 16 bytes past the last Lhs data
// and 16 bytes past the last Rhs data that it actually uses.
// NOTE(review): whether callers pad the buffers for this is not visible
// here - confirm.
struct NEON_32bit_GEMM_Float32_WithScalar_A53 {
  typedef float OperandType;
  typedef float AccumulatorType;
  typedef KernelFormat<
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 1> >
      Format;
  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
                  AccumulatorType* accum_ptr, int depth) {
    asm volatile(
        // Load accumulators. r0 is a scratch copy of accum_ptr so that the
        // post-incrementing loads leave the %[accum_ptr] operand untouched
        // for the stores at the end.
        "mov r0, %[accum_ptr]\n"
        "vld1.32 {d8, d9},   [r0]!\n"
        "vld1.32 {d16, d17}, [r0]!\n"
        "vld1.32 {d24, d25}, [r0]!\n"
        "vld1.32 {d10, d11}, [r0]!\n"
        "vld1.32 {d18, d19}, [r0]!\n"
        "vld1.32 {d26, d27}, [r0]!\n"
        "vld1.32 {d12, d13}, [r0]!\n"
        "vld1.32 {d20, d21}, [r0]!\n"
        "vld1.32 {d28, d29}, [r0]!\n"
        "vld1.32 {d14, d15}, [r0]!\n"
        "vld1.32 {d22, d23}, [r0]!\n"
        "vld1.32 {d30, d31}, [r0]!\n"

        // Overview of register layout:
        //
        // A 1x4 cell of Rhs is stored in d0--d1 (q0).
        // A 12x1 block of 3 4x1 cells Lhs is stored in d2--d7
        // (q1--q3).
        // A 12x4 block of accumulators is stored in q4--q15.
        //
        //                   +-----+-----+-----+-----+
        //             Rhs   |d0[0]|d0[1]|d1[0]|d1[1]|
        //                   +-----+-----+-----+-----+
        //
        //                   |     |     |     |     |
        //
        //  Lhs              |     |     |     |     |
        //
        //  +--+- - - - - -  +-----+-----+-----+-----+
        //  |d2|             | q4  | q5  | q6  | q7  |
        //  |d2|             | q4  | q5  | q6  | q7  |
        //  |d3|             | q4  | q5  | q6  | q7  |
        //  |d3|             | q4  | q5  | q6  | q7  |
        //  +--+- - - - - -  +-----+-----+-----+-----+
        //  |d4|             | q8  | q9  | q10 | q11 |
        //  |d4|             | q8  | q9  | q10 | q11 |
        //  |d5|             | q8  | q9  | q10 | q11 |
        //  |d5|             | q8  | q9  | q10 | q11 |
        //  +--+ - - - - - - +-----+-----+-----+-----+
        //  |d6|             | q12 | q13 | q14 | q15 |
        //  |d6|             | q12 | q13 | q14 | q15 |
        //  |d7|             | q12 | q13 | q14 | q15 |
        //  |d7|             | q12 | q13 | q14 | q15 |
        //  +--+- - - - - -  +-----+-----+-----+-----+
        //
        //                            Accumulator

        // Throughout this kernel, the upper 64 bits of each 128-bit operand
        // are fetched with two 32-bit core-register loads (r2, r3) and then
        // moved into the NEON register with vmov, while the lower 64 bits
        // are loaded directly with vldr.
        // NOTE(review): presumably this split is what lets the loads issue
        // alongside the vmla instructions on A53 - confirm against the
        // 64-bit kernel this was ported from.

        // Load Rhs cell (lower half into d0, upper half staged in r2/r3)
        "vldr d0, [%[rhs_ptr]]\n"
        "ldr r2, [%[rhs_ptr], #8]\n"
        "ldr r3, [%[rhs_ptr], #12]\n"

        // Load 1st Lhs Cell
        "vld1.32 {d2, d3}, [%[lhs_ptr]]\n"

        // Loop head. GEMMLOWP_LABEL_LOOP is defined elsewhere in the file;
        // presumably a numeric local label, given the "b" (backward)
        // suffix appended at the branch below.
        GEMMLOWP_LABEL_LOOP
        ":\n"

        "vldr d4, [%[lhs_ptr], #16]\n"  // Load 1st half of 2nd Lhs cell
        "vmov d1, r2, r3\n"             // Prepare 2nd half of Rhs cell
        "vmla.f32 q4, q1, d0[0]\n"      // Multiply 1st Lhs cell with column 0
        "ldr r2, [%[lhs_ptr], #24]\n"   // Load 2nd half of 2nd Lhs cell, part 1
        "vmla.f32 q5, q1, d0[1]\n"      // Multiply 1st Lhs cell with column 1
        "ldr r3, [%[lhs_ptr], #28]\n"   // Load 2nd half of 2nd Lhs cell, part 2
        "vmla.f32 q6, q1, d1[0]\n"      // Multiply 1st Lhs cell with column 2
        "subs %[depth], #1\n"           // Decrement depth counter; the flags
                                        // set here are consumed by the bne
                                        // at the end of the loop body

        "vldr d6, [%[lhs_ptr], #32]\n"  // Load 1st half of 3rd Lhs cell
        "vmov d5, r2, r3\n"             // Prepare 2nd half of 2nd Lhs cell
        "vmla.f32 q7, q1, d1[1]\n"      // Multiply 1st Lhs cell with column 3
        "ldr r2, [%[lhs_ptr], #40]\n"   // Load 2nd half of 3rd Lhs cell, part 1
        "vmla.f32 q8, q2, d0[0]\n"      // Multiply 2nd Lhs cell with column 0
        "ldr r3, [%[lhs_ptr], #44]\n"   // Load 2nd half of 3rd Lhs cell, part 2
        "vmla.f32 q9, q2, d0[1]\n"      // Multiply 2nd Lhs cell with column 1
        "add %[rhs_ptr], %[rhs_ptr], #16\n"  // Move forward by 1 Rhs cell

        "vldr d2, [%[lhs_ptr], #48]\n"  // Load 1st half of 1st Lhs cell of next
                                        // iteration
        "vmov d7, r2, r3\n"             // Prepare 2nd half of 3rd Lhs cell
        "vmla.f32 q10, q2, d1[0]\n"     // Multiply 2nd Lhs cell with column 2
        "ldr r2, [%[lhs_ptr], #56]\n"   // Load 2nd half of 1st Lhs cell of next
                                        // iter, part 1
        "vmla.f32 q12, q3, d0[0]\n"     // Multiply 3rd Lhs cell with column 0
        "ldr r3, [%[lhs_ptr], #60]\n"   // Load 2nd half of 1st Lhs cell of next
                                        // iter, part 2
        "vmla.f32 q13, q3, d0[1]\n"     // Multiply 3rd Lhs cell with column 1
        "add %[lhs_ptr], %[lhs_ptr], #48\n"  // Move forward by 3 Lhs cells

        // d0 may be overwritten from here on: the remaining multiplies of
        // this iteration only read d1.
        "vldr d0, [%[rhs_ptr]]\n"       // Load 1st half of Rhs cell of next
                                        // iteration
        "vmov d3, r2, r3\n"             // Prepare 2nd half of 1st Lhs cell of next
                                        // iteration
        "vmla.f32 q11, q2, d1[1]\n"     // Multiply 2nd Lhs cell with column 3
        "ldr r2, [%[rhs_ptr], #8]\n"    // Load 2nd half of Rhs cell of next
                                        // iteration, part 1
        "vmla.f32 q14, q3, d1[0]\n"     // Multiply 3rd Lhs cell with column 2
        "ldr r3, [%[rhs_ptr], #12]\n"   // Load 2nd half of Rhs cell of next
                                        // iteration, part 2
        "vmla.f32 q15, q3, d1[1]\n"     // Multiply 3rd Lhs cell with column 3

        // Loop branch. This will dual issue in fmla cycle 3 of the 4th block.
        // (The note says "fmla", the AArch64 mnemonic, so it was presumably
        // carried over from the 64-bit kernel; this 32-bit code uses vmla.)
        "bne " GEMMLOWP_LABEL_LOOP
        "b\n"

        // Store accumulators, in the same order they were loaded
        "mov r0, %[accum_ptr]\n"
        "vst1.32 {d8, d9},   [r0]!\n"
        "vst1.32 {d16, d17}, [r0]!\n"
        "vst1.32 {d24, d25}, [r0]!\n"
        "vst1.32 {d10, d11}, [r0]!\n"
        "vst1.32 {d18, d19}, [r0]!\n"
        "vst1.32 {d26, d27}, [r0]!\n"
        "vst1.32 {d12, d13}, [r0]!\n"
        "vst1.32 {d20, d21}, [r0]!\n"
        "vst1.32 {d28, d29}, [r0]!\n"
        "vst1.32 {d14, d15}, [r0]!\n"
        "vst1.32 {d22, d23}, [r0]!\n"
        "vst1.32 {d30, d31}, [r0]!\n"
        :  // outputs (read-write: the asm advances both pointers and counts
           // depth down)
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [depth] "+r"(depth)
        :  // inputs
        [accum_ptr] "r"(accum_ptr)
        :  // clobbers (r0, r2, r3 are the scratch core registers used above)
        "cc", "memory", "r0", "r2", "r3", "d0", "d1", "d2", "d3", "d4", "d5",
        "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16",
        "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26",
        "d27", "d28", "d29", "d30", "d31");
  }
};

// Variant of NEON_32bit_GEMM_Float32_WithScalar_A53 whose loop body handles
// 2 levels of depth per iteration.
//
// The instruction sequence per level of depth is the same as in the depth-1
// kernel; only the load offsets differ. As read by the code below, every
// cell occupies 32 bytes: 4 floats for the first level of depth followed by
// 4 floats for the second.
//
// Run() arguments:
//   lhs_ptr   - Lhs data; each loop iteration consumes 96 bytes (3 cells of
//               32 bytes).
//   rhs_ptr   - Rhs data; each loop iteration consumes 32 bytes (1 cell).
//   accum_ptr - 48 floats, loaded before the loop and stored back after it,
//               in the register order q4, q8, q12, q5, q9, q13, ...
//   depth     - number of depth levels. The counter is decremented by 2 per
//               iteration and the loop exits only when it reaches exactly
//               zero, so depth is expected to be a positive multiple of 2.
//
// As in the depth-1 kernel, the loads for the next iteration are issued
// unconditionally, so the last iteration reads 16 bytes past the last Lhs
// data and 16 bytes past the last Rhs data that it actually uses.
// NOTE(review): whether callers pad the buffers for this is not visible
// here - confirm.
struct NEON_32bit_GEMM_Float32_WithScalar_A53_depth2 {
  typedef float OperandType;
  typedef float AccumulatorType;
  typedef KernelFormat<
      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 3>,
      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 1> >
      Format;
  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
                  AccumulatorType* accum_ptr, int depth) {
    asm volatile(
        // Load accumulators. r0 is a scratch copy of accum_ptr so that the
        // post-incrementing loads leave the %[accum_ptr] operand untouched
        // for the stores at the end.
        "mov r0, %[accum_ptr]\n"
        "vld1.32 {d8, d9},   [r0]!\n"
        "vld1.32 {d16, d17}, [r0]!\n"
        "vld1.32 {d24, d25}, [r0]!\n"
        "vld1.32 {d10, d11}, [r0]!\n"
        "vld1.32 {d18, d19}, [r0]!\n"
        "vld1.32 {d26, d27}, [r0]!\n"
        "vld1.32 {d12, d13}, [r0]!\n"
        "vld1.32 {d20, d21}, [r0]!\n"
        "vld1.32 {d28, d29}, [r0]!\n"
        "vld1.32 {d14, d15}, [r0]!\n"
        "vld1.32 {d22, d23}, [r0]!\n"
        "vld1.32 {d30, d31}, [r0]!\n"

        // Overview of register layout (for one level of depth at a time;
        // the same registers are reused for the second level):
        //
        // A 1x4 slice of the Rhs cell is stored in d0--d1 (q0).
        // A 12x1 slice of the 3 Lhs cells is stored in d2--d7
        // (q1--q3).
        // A 12x4 block of accumulators is stored in q4--q15.
        //
        //                   +-----+-----+-----+-----+
        //             Rhs   |d0[0]|d0[1]|d1[0]|d1[1]|
        //                   +-----+-----+-----+-----+
        //
        //                   |     |     |     |     |
        //
        //  Lhs              |     |     |     |     |
        //
        //  +--+- - - - - -  +-----+-----+-----+-----+
        //  |d2|             | q4  | q5  | q6  | q7  |
        //  |d2|             | q4  | q5  | q6  | q7  |
        //  |d3|             | q4  | q5  | q6  | q7  |
        //  |d3|             | q4  | q5  | q6  | q7  |
        //  +--+- - - - - -  +-----+-----+-----+-----+
        //  |d4|             | q8  | q9  | q10 | q11 |
        //  |d4|             | q8  | q9  | q10 | q11 |
        //  |d5|             | q8  | q9  | q10 | q11 |
        //  |d5|             | q8  | q9  | q10 | q11 |
        //  +--+ - - - - - - +-----+-----+-----+-----+
        //  |d6|             | q12 | q13 | q14 | q15 |
        //  |d6|             | q12 | q13 | q14 | q15 |
        //  |d7|             | q12 | q13 | q14 | q15 |
        //  |d7|             | q12 | q13 | q14 | q15 |
        //  +--+- - - - - -  +-----+-----+-----+-----+
        //
        //                            Accumulator

        // Load depth level 1 of the Rhs cell (lower half into d0, upper
        // half staged in r2/r3 for the vmov at the top of the loop)
        "vldr d0, [%[rhs_ptr]]\n"
        "ldr r2, [%[rhs_ptr], #8]\n"
        "ldr r3, [%[rhs_ptr], #12]\n"

        // Load depth level 1 of the 1st Lhs cell
        "vld1.32 {d2, d3}, [%[lhs_ptr]]\n"

        // Loop head - handling 2 levels of depth at once.
        // GEMMLOWP_LABEL_LOOP is defined elsewhere in the file; presumably
        // a numeric local label, given the "b" (backward) suffix appended
        // at the branch below.
        GEMMLOWP_LABEL_LOOP
        ":\n"

        // Level of depth 1
        // (the 2nd and 3rd Lhs cells start at byte offsets 32 and 64)

        "vldr d4, [%[lhs_ptr], #32]\n"  // Load 1st half of 2nd Lhs cell
        "vmov d1, r2, r3\n"             // Prepare 2nd half of Rhs cell
        "vmla.f32 q4, q1, d0[0]\n"      // Multiply 1st Lhs cell with column 0
        "ldr r2, [%[lhs_ptr], #40]\n"   // Load 2nd half of 2nd Lhs cell, part 1
        "vmla.f32 q5, q1, d0[1]\n"      // Multiply 1st Lhs cell with column 1
        "ldr r3, [%[lhs_ptr], #44]\n"   // Load 2nd half of 2nd Lhs cell, part 2
        "vmla.f32 q6, q1, d1[0]\n"      // Multiply 1st Lhs cell with column 2

        "vldr d6, [%[lhs_ptr], #64]\n"  // Load 1st half of 3rd Lhs cell
        "vmov d5, r2, r3\n"             // Prepare 2nd half of 2nd Lhs cell
        "vmla.f32 q7, q1, d1[1]\n"      // Multiply 1st Lhs cell with column 3
        "ldr r2, [%[lhs_ptr], #72]\n"   // Load 2nd half of 3rd Lhs cell, part 1
        "vmla.f32 q8, q2, d0[0]\n"      // Multiply 2nd Lhs cell with column 0
        "ldr r3, [%[lhs_ptr], #76]\n"   // Load 2nd half of 3rd Lhs cell, part 2
        "vmla.f32 q9, q2, d0[1]\n"      // Multiply 2nd Lhs cell with column 1

        "vldr d2, [%[lhs_ptr], #16]\n"  // Load 1st half of 1st Lhs cell for
                                        // depth level 2
        "vmov d7, r2, r3\n"             // Prepare 2nd half of 3rd Lhs cell
        "vmla.f32 q10, q2, d1[0]\n"     // Multiply 2nd Lhs cell with column 2
        "ldr r2, [%[lhs_ptr], #24]\n"   // Load 2nd half of 1st Lhs cell for
                                        // depth level 2, part 1
        "vmla.f32 q12, q3, d0[0]\n"     // Multiply 3rd Lhs cell with column 0
        "ldr r3, [%[lhs_ptr], #28]\n"   // Load 2nd half of 1st Lhs cell for
                                        // depth level 2, part 2
        "vmla.f32 q13, q3, d0[1]\n"     // Multiply 3rd Lhs cell with column 1

        // d0 may be overwritten from here on: the remaining multiplies of
        // this depth level only read d1.
        "vldr d0, [%[rhs_ptr], #16]\n"  // Load 1st half of Rhs cell for depth
                                        // level 2
        "vmov d3, r2, r3\n"             // Prepare 2nd half of 1st Lhs cell for
                                        // depth level 2
        "vmla.f32 q11, q2, d1[1]\n"     // Multiply 2nd Lhs cell with column 3
        "ldr r2, [%[rhs_ptr], #24]\n"   // Load 2nd half of Rhs cell for depth
                                        // level 2, part 1
        "vmla.f32 q14, q3, d1[0]\n"     // Multiply 3rd Lhs cell with column 2
        "ldr r3, [%[rhs_ptr], #28]\n"   // Load 2nd half of Rhs cell for depth
                                        // level 2, part 2
        "vmla.f32 q15, q3, d1[1]\n"     // Multiply 3rd Lhs cell with column 3

        // Level of depth 2
        // (same sequence; the Lhs offsets are those of level 1 plus 16)
        "vldr d4, [%[lhs_ptr], #48]\n"  // Load 1st half of 2nd Lhs cell
        "vmov d1, r2, r3\n"             // Prepare 2nd half of Rhs cell
        "vmla.f32 q4, q1, d0[0]\n"      // Multiply 1st Lhs cell with column 0
        "ldr r2, [%[lhs_ptr], #56]\n"   // Load 2nd half of 2nd Lhs cell, part 1
        "vmla.f32 q5, q1, d0[1]\n"      // Multiply 1st Lhs cell with column 1
        "ldr r3, [%[lhs_ptr], #60]\n"   // Load 2nd half of 2nd Lhs cell, part 2
        "vmla.f32 q6, q1, d1[0]\n"      // Multiply 1st Lhs cell with column 2
        "subs %[depth], #2\n"           // Decrement depth counter; the flags
                                        // set here are consumed by the bne
                                        // at the end of the loop body

        "vldr d6, [%[lhs_ptr], #80]\n"  // Load 1st half of 3rd Lhs cell
        "vmov d5, r2, r3\n"             // Prepare 2nd half of 2nd Lhs cell
        "vmla.f32 q7, q1, d1[1]\n"      // Multiply 1st Lhs cell with column 3
        "ldr r2, [%[lhs_ptr], #88]\n"   // Load 2nd half of 3rd Lhs cell, part 1
        "vmla.f32 q8, q2, d0[0]\n"      // Multiply 2nd Lhs cell with column 0
        "ldr r3, [%[lhs_ptr], #92]\n"   // Load 2nd half of 3rd Lhs cell, part 2
        "vmla.f32 q9, q2, d0[1]\n"      // Multiply 2nd Lhs cell with column 1
        "add %[rhs_ptr], %[rhs_ptr], #32\n"  // Move forward by 1 Rhs cell

        "vldr d2, [%[lhs_ptr], #96]\n"  // Load 1st half of 1st Lhs cell of next
                                        // iteration
        "vmov d7, r2, r3\n"             // Prepare 2nd half of 3rd Lhs cell
        "vmla.f32 q10, q2, d1[0]\n"     // Multiply 2nd Lhs cell with column 2
        "ldr r2, [%[lhs_ptr], #104]\n"  // Load 2nd half of 1st Lhs cell of next
                                        // iter, part 1
        "vmla.f32 q12, q3, d0[0]\n"     // Multiply 3rd Lhs cell with column 0
        "ldr r3, [%[lhs_ptr], #108]\n"  // Load 2nd half of 1st Lhs cell of next
                                        // iter, part 2
        "vmla.f32 q13, q3, d0[1]\n"     // Multiply 3rd Lhs cell with column 1
        "add %[lhs_ptr], %[lhs_ptr], #96\n"  // Move forward by 3 Lhs cells

        "vldr d0, [%[rhs_ptr]]\n"       // Load 1st half of Rhs cell of next
                                        // iteration
        "vmov d3, r2, r3\n"             // Prepare 2nd half of 1st Lhs cell of next
                                        // iteration
        "vmla.f32 q11, q2, d1[1]\n"     // Multiply 2nd Lhs cell with column 3
        "ldr r2, [%[rhs_ptr], #8]\n"    // Load 2nd half of Rhs cell of next
                                        // iteration, part 1
        "vmla.f32 q14, q3, d1[0]\n"     // Multiply 3rd Lhs cell with column 2
        "ldr r3, [%[rhs_ptr], #12]\n"   // Load 2nd half of Rhs cell of next
                                        // iteration, part 2
        "vmla.f32 q15, q3, d1[1]\n"     // Multiply 3rd Lhs cell with column 3

        // Loop branch. This will dual issue in fmla cycle 3 of the 4th block.
        // (The note says "fmla", the AArch64 mnemonic, so it was presumably
        // carried over from the 64-bit kernel; this 32-bit code uses vmla.)
        "bne " GEMMLOWP_LABEL_LOOP
        "b\n"

        // Store accumulators, in the same order they were loaded
        "mov r0, %[accum_ptr]\n"
        "vst1.32 {d8, d9},   [r0]!\n"
        "vst1.32 {d16, d17}, [r0]!\n"
        "vst1.32 {d24, d25}, [r0]!\n"
        "vst1.32 {d10, d11}, [r0]!\n"
        "vst1.32 {d18, d19}, [r0]!\n"
        "vst1.32 {d26, d27}, [r0]!\n"
        "vst1.32 {d12, d13}, [r0]!\n"
        "vst1.32 {d20, d21}, [r0]!\n"
        "vst1.32 {d28, d29}, [r0]!\n"
        "vst1.32 {d14, d15}, [r0]!\n"
        "vst1.32 {d22, d23}, [r0]!\n"
        "vst1.32 {d30, d31}, [r0]!\n"
        :  // outputs (read-write: the asm advances both pointers and counts
           // depth down)
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [depth] "+r"(depth)
        :  // inputs
        [accum_ptr] "r"(accum_ptr)
        :  // clobbers (r0, r2, r3 are the scratch core registers used above)
        "cc", "memory", "r0", "r2", "r3", "d0", "d1", "d2", "d3", "d4", "d5",
        "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16",
        "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26",
        "d27", "d28", "d29", "d30", "d31");
  }
};

// This rotating variant performs well when permutations (vext) can be
// dual-issued with arithmetic instructions.
struct NEON_32bit_GEMM_Float32_MLA_Rotating {
  typedef float OperandType;
  typedef float AccumulatorType;
  typedef KernelFormat<
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 1> >
      Format;
  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
                  AccumulatorType* accum_ptr, int depth) {
    asm volatile(
        // Load accumulators
        "mov r0, %[accum_ptr]\n"
        "vld1.32 {d8, d9}, [r0]!\n"
        "vld1.32 {d16, d17}, [r0]!\n"
1525a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d24, d25}, [r0]!\n" 1526a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d10, d11}, [r0]!\n" 1527a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d18, d19}, [r0]!\n" 1528a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d26, d27}, [r0]!\n" 1529a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d12, d13}, [r0]!\n" 1530a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d20, d21}, [r0]!\n" 1531a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d28, d29}, [r0]!\n" 1532a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d14, d15}, [r0]!\n" 1533a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d22, d23}, [r0]!\n" 1534a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d30, d31}, [r0]!\n" 1535a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1536a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#define NEON_32BIT_ROTATING_FLOAT_KERNEL_TRANSPOSE_ACCUMULATOR_CELLS \ 1537a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vtrn.32 q4, q5\n" \ 1538a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vtrn.32 q6, q7\n" \ 1539a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vswp d9, d12\n" \ 1540a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vswp d11, d14\n" \ 1541a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vtrn.32 q8, q9\n" \ 1542a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vtrn.32 q10, q11\n" \ 1543a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vswp d17, d20\n" \ 1544a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vswp d19, d22\n" \ 1545a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vtrn.32 q12, q13\n" \ 1546a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vtrn.32 q14, q15\n" \ 1547a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vswp d25, d28\n" \ 1548a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vswp d27, d30\n" 1549a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 
1550a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#define NEON_32BIT_ROTATING_FLOAT_KERNEL_ROTATE_ACCUMULATOR_CELLS(a, b, c) \ 1551a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang NEON_32BIT_ROTATING_FLOAT_KERNEL_TRANSPOSE_ACCUMULATOR_CELLS \ 1552a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vext.32 q5, q5, q5, #" #a \ 1553a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "\n" \ 1554a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vext.32 q6, q6, q6, #" #b \ 1555a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "\n" \ 1556a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vext.32 q7, q7, q7, #" #c \ 1557a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "\n" \ 1558a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vext.32 q9, q9, q9, #" #a \ 1559a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "\n" \ 1560a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vext.32 q10, q10, q10, #" #b \ 1561a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "\n" \ 1562a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vext.32 q11, q11, q11, #" #c \ 1563a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "\n" \ 1564a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vext.32 q13, q13, q13, #" #a \ 1565a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "\n" \ 1566a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vext.32 q14, q14, q14, #" #b \ 1567a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "\n" \ 1568a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vext.32 q15, q15, q15, #" #c \ 1569a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "\n" NEON_32BIT_ROTATING_FLOAT_KERNEL_TRANSPOSE_ACCUMULATOR_CELLS 1570a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1571a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang NEON_32BIT_ROTATING_FLOAT_KERNEL_ROTATE_ACCUMULATOR_CELLS(1, 2, 3) 1572a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1573a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang //"loop_%=:\n" 1574a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang GEMMLOWP_LABEL_LOOP 
1575a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang ":\n" 1576a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1577a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Load 1 Rhs cell of size 1x4 1578a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d0, d1}, [%[rhs_ptr]]!\n" 1579a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1580a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Load 3 Lhs cells of size 4x1 each 1581a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d2, d3}, [%[lhs_ptr]]!\n" 1582a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d4, d5}, [%[lhs_ptr]]!\n" 1583a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d6, d7}, [%[lhs_ptr]]!\n" 1584a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1585a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Multiply-accumulate 1586a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q4, q1, q0\n" 1587a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q8, q2, q0\n" 1588a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q12, q3, q0\n" 1589a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vext.f32 q0, q0, q0, #1\n" 1590a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q5, q1, q0\n" 1591a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q9, q2, q0\n" 1592a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q13, q3, q0\n" 1593a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vext.f32 q0, q0, q0, #1\n" 1594a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q6, q1, q0\n" 1595a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q10, q2, q0\n" 1596a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q14, q3, q0\n" 1597a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vext.f32 q0, q0, q0, #1\n" 1598a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q7, q1, q0\n" 1599a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vmla.f32 q11, q2, q0\n" 1600a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 
"vmla.f32 q15, q3, q0\n" 1601a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1602a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Loop. Decrement loop index (depth) by 1, since we just handled 1 1603a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // level of depth. 1604a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "subs %[depth], #1\n" 1605a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang //"bne loop_%=\n" 1606a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "bne " GEMMLOWP_LABEL_LOOP 1607a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "b\n" 1608a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1609a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Store accumulators 1610a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "mov r0, %[accum_ptr]\n" 1611a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1612a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang NEON_32BIT_ROTATING_FLOAT_KERNEL_ROTATE_ACCUMULATOR_CELLS(3, 2, 1) 1613a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1614a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d8, d9}, [r0]!\n" 1615a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d16, d17}, [r0]!\n" 1616a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d24, d25}, [r0]!\n" 1617a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d10, d11}, [r0]!\n" 1618a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d18, d19}, [r0]!\n" 1619a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d26, d27}, [r0]!\n" 1620a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d12, d13}, [r0]!\n" 1621a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d20, d21}, [r0]!\n" 1622a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d28, d29}, [r0]!\n" 1623a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d14, d15}, [r0]!\n" 1624a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d22, d23}, [r0]!\n" 1625a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d30, d31}, 
[r0]!\n" 1626a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // outputs 1627a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 1628a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [depth] "+r"(depth) 1629a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // inputs 1630a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [accum_ptr] "r"(accum_ptr) 1631a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // clobbers 1632a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", 1633a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", 1634a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", 1635a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "d28", "d29", "d30", "d31"); 1636a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 1637a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}; 1638a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1639a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// This rotating variant performs well when permutations (vext) can be 1640a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// dual-issued with arithmetic instructions. It is relevant as the rotating 1641a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// approach removes the need for multiply-with-scalar instructions, and ARMv7 1642a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// FMA does not have a with-scalar variant. 
1643a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_32bit_GEMM_Float32_FMA_Rotating { 1644a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef float OperandType; 1645a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef float AccumulatorType; 1646a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef KernelFormat< 1647a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>, 1648a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 1> > 1649a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Format; 1650a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr, 1651a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang AccumulatorType* accum_ptr, int depth) { 1652a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang asm volatile( 1653a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Load accumulators 1654a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "mov r0, %[accum_ptr]\n" 1655a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d8, d9}, [r0]!\n" 1656a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d16, d17}, [r0]!\n" 1657a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d24, d25}, [r0]!\n" 1658a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d10, d11}, [r0]!\n" 1659a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d18, d19}, [r0]!\n" 1660a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d26, d27}, [r0]!\n" 1661a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d12, d13}, [r0]!\n" 1662a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d20, d21}, [r0]!\n" 1663a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d28, d29}, [r0]!\n" 1664a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d14, d15}, [r0]!\n" 1665a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d22, d23}, 
[r0]!\n" 1666a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d30, d31}, [r0]!\n" 1667a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1668a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang NEON_32BIT_ROTATING_FLOAT_KERNEL_ROTATE_ACCUMULATOR_CELLS(1, 2, 3) 1669a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1670a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang //"loop_%=:\n" 1671a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang GEMMLOWP_LABEL_LOOP 1672a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang ":\n" 1673a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1674a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Load 1 Rhs cell of size 1x4 1675a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d0, d1}, [%[rhs_ptr]]!\n" 1676a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1677a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Load 3 Lhs cells of size 4x1 each 1678a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d2, d3}, [%[lhs_ptr]]!\n" 1679a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d4, d5}, [%[lhs_ptr]]!\n" 1680a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vld1.32 {d6, d7}, [%[lhs_ptr]]!\n" 1681a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1682a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Multiply-accumulate 1683a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vfma.f32 q4, q1, q0\n" 1684a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vfma.f32 q8, q2, q0\n" 1685a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vfma.f32 q12, q3, q0\n" 1686a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vext.f32 q0, q0, q0, #1\n" 1687a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vfma.f32 q5, q1, q0\n" 1688a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vfma.f32 q9, q2, q0\n" 1689a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vfma.f32 q13, q3, q0\n" 1690a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vext.f32 q0, q0, q0, #1\n" 1691a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vfma.f32 
q6, q1, q0\n" 1692a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vfma.f32 q10, q2, q0\n" 1693a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vfma.f32 q14, q3, q0\n" 1694a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vext.f32 q0, q0, q0, #1\n" 1695a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vfma.f32 q7, q1, q0\n" 1696a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vfma.f32 q11, q2, q0\n" 1697a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vfma.f32 q15, q3, q0\n" 1698a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1699a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Loop. Decrement loop index (depth) by 1, since we just handled 1 1700a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // level of depth. 1701a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "subs %[depth], #1\n" 1702a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang //"bne loop_%=\n" 1703a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "bne " GEMMLOWP_LABEL_LOOP "b\n" 1704a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1705a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang NEON_32BIT_ROTATING_FLOAT_KERNEL_ROTATE_ACCUMULATOR_CELLS(3, 2, 1) 1706a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1707a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Store accumulators 1708a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "mov r0, %[accum_ptr]\n" 1709a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d8, d9}, [r0]!\n" 1710a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d16, d17}, [r0]!\n" 1711a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d24, d25}, [r0]!\n" 1712a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d10, d11}, [r0]!\n" 1713a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d18, d19}, [r0]!\n" 1714a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d26, d27}, [r0]!\n" 1715a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d12, d13}, [r0]!\n" 1716a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 
"vst1.32 {d20, d21}, [r0]!\n" 1717a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d28, d29}, [r0]!\n" 1718a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d14, d15}, [r0]!\n" 1719a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d22, d23}, [r0]!\n" 1720a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "vst1.32 {d30, d31}, [r0]!\n" 1721a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // outputs 1722a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 1723a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [depth] "+r"(depth) 1724a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // inputs 1725a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [accum_ptr] "r"(accum_ptr) 1726a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // clobbers 1727a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", 1728a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", 1729a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", 1730a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "d28", "d29", "d30", "d31"); 1731a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 1732a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}; 1733a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1734a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif // __arm__ 1735a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1736a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef __aarch64__ 1737a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1738a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// This is the current standard kernel in gemmlowp, see: 1739a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// https://github.com/google/gemmlowp/blob/b1e2a29ff866680028f3080efc244e10e8dd7f46/internal/kernel_neon.h#L646 
1740a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_64bit_GEMM_Uint8Operands_Uint32Accumulators { 1741a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef std::uint8_t OperandType; 1742a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef std::uint32_t AccumulatorType; 1743a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef KernelFormat< 1744a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 3>, 1745a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 2> > 1746a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Format; 1747a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr, 1748a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang AccumulatorType* accum_ptr, int depth) { 1749a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang asm volatile( 1750a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Load 1 Rhs cell of size 2x8 1751a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v5.8b}, [%[rhs_ptr]], #8\n" 1752a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v6.8b}, [%[rhs_ptr]], #8\n" 1753a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1754a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Load 3 Lhs cells of size 4x2 each 1755a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v2.8b}, [%[lhs_ptr]], #8\n" 1756a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v3.8b}, [%[lhs_ptr]], #8\n" 1757a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v4.8b}, [%[lhs_ptr]], #8\n" 1758a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1759a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "subs %w[depth], %w[depth], #2\n" 1760a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1761a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Load accumulators 1762a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "mov x0, %[accum_ptr]\n" 
1763a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v8.16b}, [x0], #16\n" 1764a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v16.16b}, [x0], #16\n" 1765a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v24.16b}, [x0], #16\n" 1766a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v9.16b}, [x0], #16\n" 1767a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v17.16b}, [x0], #16\n" 1768a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v25.16b}, [x0], #16\n" 1769a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v10.16b}, [x0], #16\n" 1770a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v18.16b}, [x0], #16\n" 1771a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v26.16b}, [x0], #16\n" 1772a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v11.16b}, [x0], #16\n" 1773a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v19.16b}, [x0], #16\n" 1774a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v27.16b}, [x0], #16\n" 1775a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v12.16b}, [x0], #16\n" 1776a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v20.16b}, [x0], #16\n" 1777a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v28.16b}, [x0], #16\n" 1778a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v13.16b}, [x0], #16\n" 1779a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v21.16b}, [x0], #16\n" 1780a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v29.16b}, [x0], #16\n" 1781a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v14.16b}, [x0], #16\n" 1782a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v22.16b}, [x0], #16\n" 1783a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v30.16b}, [x0], #16\n" 1784a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v15.16b}, [x0], #16\n" 1785a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v23.16b}, [x0], #16\n" 1786a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v31.16b}, [x0], #16\n" 
1787a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1788a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "beq " GEMMLOWP_LABEL_AFTER_LOOP "f\n" 1789a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1790a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang //"loop_%=:\n" 1791a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang GEMMLOWP_LABEL_LOOP 1792a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang ":\n" 1793a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1794a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Overview of register layout: 1795a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 1796a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // A 2x8 block of 2 2x4 cells of Rhs is stored in 16bit in v0--v1. 1797a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in v2--v4. 1798a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // A 12x8 block of accumulators is stored in 32bit in v8--v31. 1799a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 1800a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // +--------+--------+-----+--------+--------+ 1801a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |v0.h[0] |v0.h[1] | ... |v1.h[2] |v1.h[3] | 1802a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Rhs +--------+--------+-----+--------+--------+ 1803a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |v0.h[4] |v0.h[5] | ... 
|v1.h[6] |v1.h[7] | 1804a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // +--------+--------+-----+--------+--------+ 1805a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 1806a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // | | | | | | 1807a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 1808a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Lhs | | | | | | 1809a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 1810a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // +-------+-------+ - - +--------+--------+-----+--------+--------+ 1811a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |v2.h[0]|v2.h[4]| |v8.s[0] |v9.s[0] | ... |v14.s[0]|v15.s[0]| 1812a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |v2.h[1]|v2.h[5]| |v8.s[1] |v9.s[1] | ... |v14.s[1]|v15.s[1]| 1813a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |v2.h[2]|v2.h[6]| |v8.s[2] |v9.s[2] | ... |v14.s[2]|v15.s[2]| 1814a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |v2.h[3]|v2.h[7]| |v8.s[3] |v9.s[3] | ... |v14.s[3]|v15.s[3]| 1815a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // +-------+-------+ - - +--------+--------+-----+--------+--------+ 1816a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |v3.h[0]|v3.h[4]| |v16.s[0]|v17.s[0]| ... |v22.s[0]|v23.s[0]| 1817a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |v3.h[1]|v3.h[5]| |v16.s[1]|v17.s[1]| ... |v22.s[1]|v23.s[1]| 1818a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |v3.h[2]|v3.h[6]| |v16.s[2]|v17.s[2]| ... |v22.s[2]|v23.s[2]| 1819a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |v3.h[3]|v3.h[7]| |v16.s[3]|v17.s[3]| ... |v22.s[3]|v23.s[3]| 1820a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // +-------+-------+ - - +--------+--------+-----+--------+--------+ 1821a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |v4.h[0]|v4.h[4]| |v24.s[0]|v25.s[0]| ... |v30.s[0]|v31.s[0]| 1822a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |v4.h[1]|v4.h[5]| |v24.s[1]|v25.s[1]| ... 
|v30.s[1]|v31.s[1]| 1823a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |v4.h[2]|v4.h[6]| |v24.s[2]|v25.s[2]| ... |v30.s[2]|v31.s[2]| 1824a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |v4.h[3]|v4.h[7]| |v24.s[3]|v25.s[3]| ... |v30.s[3]|v31.s[3]| 1825a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // +-------+-------+ - - +--------+--------+-----+--------+--------+ 1826a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 1827a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Accumulator 1828a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1829a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Expand Lhs/Rhs cells to 16 bit. 1830a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uxtl v0.8h, v5.8b\n" 1831a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v5.8b}, [%[rhs_ptr]], #8\n" 1832a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uxtl v1.8h, v6.8b\n" 1833a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v6.8b}, [%[rhs_ptr]], #8\n" 1834a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uxtl v2.8h, v2.8b\n" 1835a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uxtl v3.8h, v3.8b\n" 1836a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uxtl v4.8h, v4.8b\n" 1837a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1838a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Multiply-accumulate, top third 1839a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v8.4s, v2.4h, v0.h[0]\n" 1840a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v9.4s, v2.4h, v0.h[1]\n" 1841a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v10.4s, v2.4h, v0.h[2]\n" 1842a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v11.4s, v2.4h, v0.h[3]\n" 1843a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v12.4s, v2.4h, v1.h[0]\n" 1844a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v13.4s, v2.4h, v1.h[1]\n" 1845a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v14.4s, v2.4h, v1.h[2]\n" 
1846a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v15.4s, v2.4h, v1.h[3]\n" 1847a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v8.4s, v2.8h, v0.h[4]\n" 1848a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v9.4s, v2.8h, v0.h[5]\n" 1849a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v10.4s, v2.8h, v0.h[6]\n" 1850a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v11.4s, v2.8h, v0.h[7]\n" 1851a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v12.4s, v2.8h, v1.h[4]\n" 1852a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v13.4s, v2.8h, v1.h[5]\n" 1853a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v14.4s, v2.8h, v1.h[6]\n" 1854a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v15.4s, v2.8h, v1.h[7]\n" 1855a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v2.8b}, [%[lhs_ptr]], #8\n" 1856a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1857a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Multiply-accumulate, middle third 1858a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v16.4s, v3.4h, v0.h[0]\n" 1859a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v17.4s, v3.4h, v0.h[1]\n" 1860a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v18.4s, v3.4h, v0.h[2]\n" 1861a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v19.4s, v3.4h, v0.h[3]\n" 1862a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v20.4s, v3.4h, v1.h[0]\n" 1863a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v21.4s, v3.4h, v1.h[1]\n" 1864a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v22.4s, v3.4h, v1.h[2]\n" 1865a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v23.4s, v3.4h, v1.h[3]\n" 1866a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v16.4s, v3.8h, v0.h[4]\n" 1867a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v17.4s, v3.8h, v0.h[5]\n" 1868a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v18.4s, v3.8h, v0.h[6]\n" 
1869a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v19.4s, v3.8h, v0.h[7]\n" 1870a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v20.4s, v3.8h, v1.h[4]\n" 1871a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v21.4s, v3.8h, v1.h[5]\n" 1872a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v22.4s, v3.8h, v1.h[6]\n" 1873a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v23.4s, v3.8h, v1.h[7]\n" 1874a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v3.8b}, [%[lhs_ptr]], #8\n" 1875a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1876a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "subs %w[depth], %w[depth], #2\n" 1877a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1878a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Multiply-accumulate, bottom third 1879a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v24.4s, v4.4h, v0.h[0]\n" 1880a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v25.4s, v4.4h, v0.h[1]\n" 1881a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v26.4s, v4.4h, v0.h[2]\n" 1882a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v27.4s, v4.4h, v0.h[3]\n" 1883a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v28.4s, v4.4h, v1.h[0]\n" 1884a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v29.4s, v4.4h, v1.h[1]\n" 1885a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v30.4s, v4.4h, v1.h[2]\n" 1886a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v31.4s, v4.4h, v1.h[3]\n" 1887a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v24.4s, v4.8h, v0.h[4]\n" 1888a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v25.4s, v4.8h, v0.h[5]\n" 1889a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v26.4s, v4.8h, v0.h[6]\n" 1890a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v27.4s, v4.8h, v0.h[7]\n" 1891a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v28.4s, v4.8h, v1.h[4]\n" 1892a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao 
Wang "umlal2 v29.4s, v4.8h, v1.h[5]\n" 1893a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v30.4s, v4.8h, v1.h[6]\n" 1894a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v31.4s, v4.8h, v1.h[7]\n" 1895a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v4.8b}, [%[lhs_ptr]], #8\n" 1896a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1897a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "bne " GEMMLOWP_LABEL_LOOP "b\n" 1898a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1899a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang GEMMLOWP_LABEL_AFTER_LOOP 1900a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang ":\n" 1901a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1902a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Expand Lhs/Rhs cells to 16 bit. 1903a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uxtl v0.8h, v5.8b\n" 1904a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uxtl v1.8h, v6.8b\n" 1905a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uxtl v2.8h, v2.8b\n" 1906a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uxtl v3.8h, v3.8b\n" 1907a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uxtl v4.8h, v4.8b\n" 1908a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1909a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Multiply-accumulate, level of depth 0 1910a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v8.4s, v2.4h, v0.h[0]\n" 1911a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v9.4s, v2.4h, v0.h[1]\n" 1912a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v10.4s, v2.4h, v0.h[2]\n" 1913a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v11.4s, v2.4h, v0.h[3]\n" 1914a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v12.4s, v2.4h, v1.h[0]\n" 1915a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v13.4s, v2.4h, v1.h[1]\n" 1916a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v14.4s, v2.4h, v1.h[2]\n" 1917a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v15.4s, v2.4h, 
v1.h[3]\n" 1918a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v16.4s, v3.4h, v0.h[0]\n" 1919a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v17.4s, v3.4h, v0.h[1]\n" 1920a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v18.4s, v3.4h, v0.h[2]\n" 1921a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v19.4s, v3.4h, v0.h[3]\n" 1922a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v20.4s, v3.4h, v1.h[0]\n" 1923a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v21.4s, v3.4h, v1.h[1]\n" 1924a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v22.4s, v3.4h, v1.h[2]\n" 1925a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v23.4s, v3.4h, v1.h[3]\n" 1926a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v24.4s, v4.4h, v0.h[0]\n" 1927a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v25.4s, v4.4h, v0.h[1]\n" 1928a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v26.4s, v4.4h, v0.h[2]\n" 1929a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v27.4s, v4.4h, v0.h[3]\n" 1930a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v28.4s, v4.4h, v1.h[0]\n" 1931a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v29.4s, v4.4h, v1.h[1]\n" 1932a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v30.4s, v4.4h, v1.h[2]\n" 1933a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal v31.4s, v4.4h, v1.h[3]\n" 1934a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1935a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Multiply-accumulate, level of depth 1 1936a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v8.4s, v2.8h, v0.h[4]\n" 1937a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v9.4s, v2.8h, v0.h[5]\n" 1938a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v10.4s, v2.8h, v0.h[6]\n" 1939a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v11.4s, v2.8h, v0.h[7]\n" 1940a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v12.4s, v2.8h, v1.h[4]\n" 
1941a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v13.4s, v2.8h, v1.h[5]\n" 1942a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v14.4s, v2.8h, v1.h[6]\n" 1943a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v15.4s, v2.8h, v1.h[7]\n" 1944a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v16.4s, v3.8h, v0.h[4]\n" 1945a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v17.4s, v3.8h, v0.h[5]\n" 1946a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v18.4s, v3.8h, v0.h[6]\n" 1947a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v19.4s, v3.8h, v0.h[7]\n" 1948a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v20.4s, v3.8h, v1.h[4]\n" 1949a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v21.4s, v3.8h, v1.h[5]\n" 1950a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v22.4s, v3.8h, v1.h[6]\n" 1951a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v23.4s, v3.8h, v1.h[7]\n" 1952a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v24.4s, v4.8h, v0.h[4]\n" 1953a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v25.4s, v4.8h, v0.h[5]\n" 1954a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v26.4s, v4.8h, v0.h[6]\n" 1955a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v27.4s, v4.8h, v0.h[7]\n" 1956a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v28.4s, v4.8h, v1.h[4]\n" 1957a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v29.4s, v4.8h, v1.h[5]\n" 1958a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v30.4s, v4.8h, v1.h[6]\n" 1959a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umlal2 v31.4s, v4.8h, v1.h[7]\n" 1960a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 1961a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Store accumulators 1962a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "mov x0, %[accum_ptr]\n" 1963a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v8.16b}, [x0], #16\n" 
1964a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v16.16b}, [x0], #16\n" 1965a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v24.16b}, [x0], #16\n" 1966a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v9.16b}, [x0], #16\n" 1967a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v17.16b}, [x0], #16\n" 1968a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v25.16b}, [x0], #16\n" 1969a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v10.16b}, [x0], #16\n" 1970a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v18.16b}, [x0], #16\n" 1971a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v26.16b}, [x0], #16\n" 1972a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v11.16b}, [x0], #16\n" 1973a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v19.16b}, [x0], #16\n" 1974a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v27.16b}, [x0], #16\n" 1975a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v12.16b}, [x0], #16\n" 1976a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v20.16b}, [x0], #16\n" 1977a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v28.16b}, [x0], #16\n" 1978a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v13.16b}, [x0], #16\n" 1979a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v21.16b}, [x0], #16\n" 1980a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v29.16b}, [x0], #16\n" 1981a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v14.16b}, [x0], #16\n" 1982a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v22.16b}, [x0], #16\n" 1983a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v30.16b}, [x0], #16\n" 1984a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v15.16b}, [x0], #16\n" 1985a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v23.16b}, [x0], #16\n" 1986a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v31.16b}, [x0], #16\n" 1987a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // outputs 
1988a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 1989a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [depth] "+r"(depth) 1990a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // inputs 1991a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [accum_ptr] "r"(accum_ptr) 1992a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // clobbers 1993a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 1994a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", 1995a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", 1996a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "v28", "v29", "v30", "v31"); 1997a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 1998a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}; 1999a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 2000a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Faster kernel by ARM. Not expanding operands before multiplication. 2001a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Tuned for A57. 
Compare to 2002a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// NEON_32bit_GEMM_Uint8Operands_Uint32Accumulators_noexpand 2003a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_64bit_GEMM_Uint8Operands_Uint32Accumulators_noexpand_A57 { 2004a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef std::uint8_t OperandType; 2005a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef std::uint32_t AccumulatorType; 2006a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef KernelFormat< 2007a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<5, 16, CellOrder::WidthMajor>, 1>, 2008a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<4, 16, CellOrder::WidthMajor>, 1> > 2009a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Format; 2010a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr, 2011a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang AccumulatorType* accum_ptr, int depth) { 2012a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static const int kLhsWidth = Format::Lhs::kWidth; 2013a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static const int kRhsWidth = Format::Rhs::kWidth; 2014a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang AccumulatorType rowmajor_accumulator_buffer[kLhsWidth * kRhsWidth]; 2015a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang asm volatile( 2016a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Clear aggregators 2017a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v12.4s, wzr\n" 2018a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v13.4s, wzr\n" 2019a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v14.4s, wzr\n" 2020a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v15.4s, wzr\n" 2021a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v16.4s, wzr\n" 2022a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v17.4s, wzr\n" 2023a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup 
v18.4s, wzr\n" 2024a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v19.4s, wzr\n" 2025a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v20.4s, wzr\n" 2026a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v21.4s, wzr\n" 2027a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v22.4s, wzr\n" 2028a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v23.4s, wzr\n" 2029a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v24.4s, wzr\n" 2030a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v25.4s, wzr\n" 2031a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v26.4s, wzr\n" 2032a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v27.4s, wzr\n" 2033a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v28.4s, wzr\n" 2034a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v29.4s, wzr\n" 2035a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v30.4s, wzr\n" 2036a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v31.4s, wzr\n" 2037a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 2038a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang GEMMLOWP_LABEL_LOOP 2039a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang ":\n" 2040a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 2041a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Overview of register layout: 2042a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 2043a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // A 4x16 block of Rhs is stored in 8 bit in v0--v3. 2044a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // A 5x16 block of Lhs is cycled through v4 and v5 in 8 bit. 
2045a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 2046a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // A 4x5 block of aggregators is stored in v12-v31 (as 4x32 bit 2047a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // components which would need to be added at the end) 2048a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 2049a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // The Lhs vectors are multiplied by the Rhs vectors with a widening 2050a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // multiply to produce an intermediate result which is stored in 2051a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // v6-v11. Each intermediate result is 8x16 bits so this happens 2052a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // twice for each Lhs/Rhs combination (once with UMULL for elements 2053a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 0-7 and once with UMULL2 for elements 8-15). 2054a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 2055a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // UADALP is used to accumulate these intermediate results into the 2056a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // result aggregators. 2057a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 2058a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 2059a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 2060a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // +--------+--------+--------+--------+ 2061a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |v0.b[0] |v1.b[0] |v2.b[0] |v3.b[0] | 2062a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Rhs +--------+--------+--------+--------+ 2063a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // | ... | ... | ... | ... 
| 2064a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // +--------+--------+--------+--------| 2065a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |v0.b[15]|v1.b[15]|v2.b[15]|v3.b[15]| 2066a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // +--------+--------+--------+--------+ 2067a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 2068a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // | | | | | 2069a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 2070a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Lhs | | | | | 2071a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 2072a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // +-------+-----+--------+ - - +--------+--------+--------+--------+ 2073a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |v4.b[0]| ... |v4.b[15]| | v12.4s | v13.4s | v14.4s | v15.4s | 2074a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |v5.b[0]| ... |v5.b[15]| | v16.4s | v17.4s | v18.4s | v19.4s | 2075a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |v4.b[0]| ... |v4.b[15]| | v20.4s | v21.4s | v22.4s | v23.4s | 2076a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |v5.b[0]| ... |v5.b[15]| | v24.4s | v25.4s | v26.4s | v27.4s | 2077a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |v4.b[0]| ... |v4.b[15]| | v28.4s | v29.4s | v30.4s | v31.4s | 2078a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // +-------+--------------+ - - +--------+--------+--------+--------+ 2079a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 2080a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Accumulator 2081a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 2082a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 2083a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Further possible optimisations (not tried): 2084a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // - Move early loads into previous iteration (see Float32_WithScalar 2085a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // for example). 
- Unroll loop 2x to alternate more smoothly between 2086a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // v4 and v5. - A different number of temporary registers might work 2087a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // better. - Pairing umull with corresponding umull2 might allow 2088a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // better 2089a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // register loading (e.g. at the start of the loop) 2090a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // - Interleaving umull{2} and uadalp even more aggressively might 2091a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // help, (not sure about latency vs. dispatch rate). 2092a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 2093a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 2094a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Start loading Rhs - further loads are interleaved amongst the 2095a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // multiplies for better dispatch on A57. 
2096a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v0.16b}, [%[rhs_ptr]], #16\n" 2097a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 2098a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Load first Lhs vector - further loads are interleaved amongst the 2099a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // multiplies 2100a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v4.16b}, [%[lhs_ptr]], #16\n" 2101a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 2102a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull v6.8h, v0.8b, v4.8b\n" 2103a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v1.16b}, [%[rhs_ptr]], #16\n" // 2nd RHS element 2104a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull v7.8h, v1.8b, v4.8b\n" 2105a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v2.16b}, [%[rhs_ptr]], #16\n" // 3rd RHS element 2106a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull v8.8h, v2.8b, v4.8b\n" 2107a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v3.16b}, [%[rhs_ptr]], #16\n" // 4th RHS element 2108a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull v9.8h, v3.8b, v4.8b\n" 2109a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull2 v10.8h, v0.16b, v4.16b\n" 2110a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull2 v11.8h, v1.16b, v4.16b\n" 2111a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v5.16b}, [%[lhs_ptr]], #16\n" // 2nd LHS element 2112a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 2113a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v12.4s, v6.8h\n" 2114a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull2 v6.8h, v2.16b, v4.16b\n" 2115a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v13.4s, v7.8h\n" 2116a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull2 v7.8h, v3.16b, v4.16b\n" 2117a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v4.16b}, [%[lhs_ptr]], #16\n" // 1st LHS element done - Reuse v4 2118a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // for 
3rd LHS element 2119a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v14.4s, v8.8h\n" 2120a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull v8.8h, v0.8b, v5.8b\n" 2121a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v15.4s, v9.8h\n" 2122a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull v9.8h, v1.8b, v5.8b\n" 2123a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v12.4s, v10.8h\n" 2124a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull v10.8h, v2.8b, v5.8b\n" 2125a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v13.4s, v11.8h\n" 2126a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull v11.8h, v3.8b, v5.8b\n" 2127a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 2128a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v14.4s, v6.8h\n" 2129a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull2 v6.8h, v0.16b, v5.16b\n" 2130a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v15.4s, v7.8h\n" 2131a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull2 v7.8h, v1.16b, v5.16b\n" 2132a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v16.4s, v8.8h\n" 2133a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull2 v8.8h, v2.16b, v5.16b\n" 2134a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v17.4s, v9.8h\n" 2135a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull2 v9.8h, v3.16b, v5.16b\n" 2136a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v5.16b}, [%[lhs_ptr]], #16\n" // 2nd LHS element done - Reuse v5 2137a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // for 4th LHS element 2138a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v18.4s, v10.8h\n" 2139a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull v10.8h, v0.8b, v4.8b\n" 2140a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v19.4s, v11.8h\n" 2141a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull v11.8h, v1.8b, v4.8b\n" 2142a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 
2143a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v16.4s, v6.8h\n" 2144a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull v6.8h, v2.8b, v4.8b\n" 2145a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v17.4s, v7.8h\n" 2146a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull v7.8h, v3.8b, v4.8b\n" 2147a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v18.4s, v8.8h\n" 2148a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull2 v8.8h, v0.16b, v4.16b\n" 2149a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v19.4s, v9.8h\n" 2150a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull2 v9.8h, v1.16b, v4.16b\n" 2151a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v20.4s, v10.8h\n" 2152a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull2 v10.8h, v2.16b, v4.16b\n" 2153a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v21.4s, v11.8h\n" 2154a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull2 v11.8h, v3.16b, v4.16b\n" 2155a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v4.16b}, [%[lhs_ptr]], #16\n" // 3rd LHS element done - Reuse v4 2156a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // for 5th LHS element 2157a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 2158a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v22.4s, v6.8h\n" 2159a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull v6.8h, v0.8b, v5.8b\n" 2160a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v23.4s, v7.8h\n" 2161a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull v7.8h, v1.8b, v5.8b\n" 2162a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v20.4s, v8.8h\n" 2163a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull v8.8h, v2.8b, v5.8b\n" 2164a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v21.4s, v9.8h\n" 2165a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull v9.8h, v3.8b, v5.8b\n" 2166a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v22.4s, v10.8h\n" 
2167a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull2 v10.8h, v0.16b, v5.16b\n" 2168a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v23.4s, v11.8h\n" 2169a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull2 v11.8h, v1.16b, v5.16b\n" 2170a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 2171a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v24.4s, v6.8h\n" 2172a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull2 v6.8h, v2.16b, v5.16b\n" 2173a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v25.4s, v7.8h\n" 2174a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull2 v7.8h, v3.16b, v5.16b\n" 2175a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v26.4s, v8.8h\n" 2176a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull v8.8h, v0.8b, v4.8b\n" 2177a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v27.4s, v9.8h\n" 2178a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull v9.8h, v1.8b, v4.8b\n" 2179a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v24.4s, v10.8h\n" 2180a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull v10.8h, v2.8b, v4.8b\n" 2181a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v25.4s, v11.8h\n" 2182a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull v11.8h, v3.8b, v4.8b\n" 2183a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 2184a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v26.4s, v6.8h\n" 2185a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull2 v6.8h, v0.16b, v4.16b\n" 2186a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v27.4s, v7.8h\n" 2187a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull2 v7.8h, v1.16b, v4.16b\n" 2188a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v28.4s, v8.8h\n" 2189a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull2 v8.8h, v2.16b, v4.16b\n" 2190a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v29.4s, v9.8h\n" 2191a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "umull2 v9.8h, 
v3.16b, v4.16b\n" 2192a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v30.4s, v10.8h\n" 2193a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v31.4s, v11.8h\n" 2194a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 2195a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v28.4s, v6.8h\n" 2196a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v29.4s, v7.8h\n" 2197a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Loop. Decrement loop index (depth) by 16, since we just handled 2198a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 16 levels of depth. Do this subs a bit before the end of the loop 2199a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // for better dispatch on A57. 2200a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "subs %w[depth], %w[depth], #16\n" 2201a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v30.4s, v8.8h\n" 2202a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "uadalp v31.4s, v9.8h\n" 2203a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 2204a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "bne " GEMMLOWP_LABEL_LOOP 2205a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "b\n" 2206a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 2207a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Reduce aggregators horizontally 2208a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "addp v0.4s, v12.4s, v13.4s\n" 2209a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "addp v1.4s, v14.4s, v15.4s\n" 2210a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "addp v2.4s, v16.4s, v17.4s\n" 2211a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "addp v3.4s, v18.4s, v19.4s\n" 2212a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "addp v4.4s, v20.4s, v21.4s\n" 2213a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "addp v5.4s, v22.4s, v23.4s\n" 2214a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "addp v6.4s, v24.4s, v25.4s\n" 2215a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "addp v7.4s, v26.4s, v27.4s\n" 
2216a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "addp v8.4s, v28.4s, v29.4s\n" 2217a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "addp v9.4s, v30.4s, v31.4s\n" 2218a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 2219a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "addp v10.4s, v0.4s, v1.4s\n" 2220a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "addp v11.4s, v2.4s, v3.4s\n" 2221a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "addp v12.4s, v4.4s, v5.4s\n" 2222a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "addp v13.4s, v6.4s, v7.4s\n" 2223a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "addp v14.4s, v8.4s, v9.4s\n" 2224a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 2225a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "mov x0, %[rowmajor_accumulator_buffer]\n" 2226a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v10.16b}, [x0], #16\n" 2227a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v11.16b}, [x0], #16\n" 2228a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v12.16b}, [x0], #16\n" 2229a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v13.16b}, [x0], #16\n" 2230a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v14.16b}, [x0], #16\n" 2231a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // outputs 2232a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 2233a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [depth] "+r"(depth) 2234a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // inputs 2235a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [rowmajor_accumulator_buffer] "r"(rowmajor_accumulator_buffer) 2236a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // clobbers 2237a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 2238a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", 
2239a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", 2240a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "v28", "v29", "v30", "v31"); 2241a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 2242a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // accumulate row-major accumulators into global (column-major) accumulators 2243a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int l = 0; l < kLhsWidth; l++) { 2244a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int r = 0; r < kRhsWidth; r++) { 2245a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang accum_ptr[l + kLhsWidth * r] += 2246a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang rowmajor_accumulator_buffer[r + l * kRhsWidth]; 2247a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 2248a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 2249a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 2250a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}; 2251a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 2252a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Fast kernel operating on int8 operands. 2253a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// It is assumed that one of the two int8 operands only takes values 2254a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// in [-127, 127], while the other may freely range in [-128, 127]. 2255a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// The issue with both operands taking the value -128 is that: 2256a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// -128*-128 + -128*-128 == -32768 overflows int16. 2257a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Every other expression a*b + c*d, for any int8 a,b,c,d, fits in int16 2258a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// range. That is the basic idea of this kernel. 
2259a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_64bit_GEMM_Int8Operands_AccumTwoWithin16Bits { 2260a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef std::int8_t OperandType; 2261a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef std::int32_t AccumulatorType; 2262a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef KernelFormat< 2263a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<4, 16, CellOrder::WidthMajor>, 1>, 2264a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<4, 16, CellOrder::WidthMajor>, 1> > 2265a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Format; 2266a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr, 2267a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang AccumulatorType* accum_ptr, int depth) { 2268a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang std::size_t start_depth = 123; 2269a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang std::size_t run_depth = depth; 2270a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang std::size_t dst_col_stride = 4; 2271a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang AccumulatorType* dst_ptr = accum_ptr; 2272a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang asm volatile( 2273a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Overview of register layout: 2274a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 2275a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // A 4x16 block of Rhs is stored in 8 bit in v0--v3. 2276a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // A 4x16 block of Lhs is stored in 8 bit in v4--v7. 
2277a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 2278a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // A 4x4 block of accumulators is stored in v16-v31 (as 4x32 bit 2279a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // components which need to be horizontally-added at the end) 2280a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 2281a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // The Lhs vectors are multiplied by the Rhs vectors with a widening 2282a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // multiply over the 8 first levels of depth, producing int16x8 2283a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // vectors of products for each position in the accumulator matrix. 2284a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Here comes the special trick: since the operands are signed int8, 2285a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // their range being [ -2^7 , 2^7 ), their products are in range 2286a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // [ -2^14 , 2^14 - 1 ), meaning that we can add two such values 2287a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // without any risk of overflowing int16. 2288a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // We thus proceed with the 8 next levels of depth, multiplying 2289a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // again Lhs by Rhs, accumulating into this existing int16x8 vector. 2290a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 2291a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Only then, having processed 16 levels of depth, do we need to 2292a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // horizontally add these int16x8 accumulators into the final 2293a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // int32x4 accumulators. 
2294a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 2295a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // As we do not have enough registers to store all 16 int16x8 2296a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // temporary-16bit-accumulators, we have them cycle through v8--v15. 2297a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 2298a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 2299a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Register layout (ignoring the v8--v15 temporary 16bit accumulators): 2300a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 2301a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // +--------+--------+--------+--------+ 2302a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |v0.b[0] |v1.b[0] |v2.b[0] |v3.b[0] | 2303a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Rhs +--------+--------+--------+--------+ 2304a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // | ... | ... | ... | ... | 2305a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // +--------+--------+--------+--------| 2306a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |v0.b[15]|v1.b[15]|v2.b[15]|v3.b[15]| 2307a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // +--------+--------+--------+--------+ 2308a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 2309a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // | | | | | 2310a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 2311a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Lhs | | | | | 2312a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 2313a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // +-------+-----+--------+ - - +--------+--------+--------+--------+ 2314a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |v4.b[0]| ... |v4.b[15]| | v16.4s | v17.4s | v18.4s | v19.4s | 2315a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |v5.b[0]| ... |v5.b[15]| | v20.4s | v21.4s | v22.4s | v23.4s | 2316a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |v6.b[0]| ... 
|v6.b[15]| | v24.4s | v25.4s | v26.4s | v27.4s | 2317a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // |v7.b[0]| ... |v7.b[15]| | v28.4s | v29.4s | v30.4s | v31.4s | 2318a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // +-------+--------------+ - - +--------+--------+--------+--------+ 2319a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 2320a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Accumulator 2321a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 2322a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 2323a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Clear accumulators 2324a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v0.16b}, [%[rhs_ptr]], #16\n" 2325a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v16.4s, wzr\n" 2326a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v1.16b}, [%[rhs_ptr]], #16\n" 2327a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v17.4s, wzr\n" 2328a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v4.16b}, [%[lhs_ptr]], #16\n" 2329a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v18.4s, wzr\n" 2330a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v5.16b}, [%[lhs_ptr]], #16\n" 2331a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v19.4s, wzr\n" 2332a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v6.16b}, [%[lhs_ptr]], #16\n" 2333a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v20.4s, wzr\n" 2334a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v7.16b}, [%[lhs_ptr]], #16\n" 2335a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v21.4s, wzr\n" 2336a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v2.16b}, [%[rhs_ptr]], #16\n" 2337a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v22.4s, wzr\n" 2338a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v3.16b}, [%[rhs_ptr]], #16\n" 2339a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v23.4s, wzr\n" 2340a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "subs %[run_depth], 
%[run_depth], #16\n" 2341a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v24.4s, wzr\n" 2342a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "mov x0, %[dst_ptr]\n" 2343a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v25.4s, wzr\n" 2344a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v26.4s, wzr\n" 2345a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v27.4s, wzr\n" 2346a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v28.4s, wzr\n" 2347a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v29.4s, wzr\n" 2348a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v30.4s, wzr\n" 2349a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "dup v31.4s, wzr\n" 2350a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 2351a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v12.8h, v0.8b, v4.8b\n" 2352a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v13.8h, v1.8b, v4.8b\n" 2353a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v14.8h, v0.8b, v5.8b\n" 2354a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v15.8h, v1.8b, v5.8b\n" 2355a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smlal2 v12.8h, v0.16b, v4.16b\n" 2356a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smlal2 v13.8h, v1.16b, v4.16b\n" 2357a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smlal2 v14.8h, v0.16b, v5.16b\n" 2358a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smlal2 v15.8h, v1.16b, v5.16b\n" 2359a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 2360a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "beq " GEMMLOWP_LABEL_AFTER_LOOP "f\n" 2361a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 2362a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang GEMMLOWP_LABEL_LOOP 2363a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang ":\n" 2364a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 2365a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "subs %[run_depth], %[run_depth], #16\n" 2366a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 
2367a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v16.4s, v12.8h\n" 2368a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v12.8h, v0.8b, v6.8b\n" 2369a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v17.4s, v13.8h\n" 2370a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v13.8h, v0.8b, v7.8b\n" 2371a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v20.4s, v14.8h\n" 2372a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v14.8h, v1.8b, v6.8b\n" 2373a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v21.4s, v15.8h\n" 2374a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v15.8h, v1.8b, v7.8b\n" 2375a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smlal2 v12.8h, v0.16b, v6.16b\n" 2376a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smlal2 v13.8h, v0.16b, v7.16b\n" 2377a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v0.16b}, [%[rhs_ptr]], #16\n" 2378a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smlal2 v14.8h, v1.16b, v6.16b\n" 2379a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smlal2 v15.8h, v1.16b, v7.16b\n" 2380a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v1.16b}, [%[rhs_ptr]], #16\n" 2381a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v24.4s, v12.8h\n" 2382a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v12.8h, v2.8b, v4.8b\n" 2383a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v28.4s, v13.8h\n" 2384a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v13.8h, v3.8b, v4.8b\n" 2385a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v25.4s, v14.8h\n" 2386a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v14.8h, v2.8b, v5.8b\n" 2387a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v29.4s, v15.8h\n" 2388a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v15.8h, v3.8b, v5.8b\n" 2389a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smlal2 v12.8h, v2.16b, v4.16b\n" 2390a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 
"smlal2 v13.8h, v3.16b, v4.16b\n" 2391a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v4.16b}, [%[lhs_ptr]], #16\n" 2392a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smlal2 v14.8h, v2.16b, v5.16b\n" 2393a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smlal2 v15.8h, v3.16b, v5.16b\n" 2394a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v5.16b}, [%[lhs_ptr]], #16\n" 2395a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v18.4s, v12.8h\n" 2396a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v12.8h, v2.8b, v6.8b\n" 2397a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v19.4s, v13.8h\n" 2398a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v13.8h, v2.8b, v7.8b\n" 2399a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v22.4s, v14.8h\n" 2400a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v14.8h, v3.8b, v6.8b\n" 2401a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v23.4s, v15.8h\n" 2402a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v15.8h, v3.8b, v7.8b\n" 2403a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smlal2 v12.8h, v2.16b, v6.16b\n" 2404a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smlal2 v13.8h, v2.16b, v7.16b\n" 2405a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v2.16b}, [%[rhs_ptr]], #16\n" 2406a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smlal2 v14.8h, v3.16b, v6.16b\n" 2407a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v6.16b}, [%[lhs_ptr]], #16\n" 2408a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smlal2 v15.8h, v3.16b, v7.16b\n" 2409a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v7.16b}, [%[lhs_ptr]], #16\n" 2410a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v26.4s, v12.8h\n" 2411a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v3.16b}, [%[rhs_ptr]], #16\n" 2412a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v12.8h, v0.8b, v4.8b\n" 2413a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v30.4s, 
v13.8h\n" 2414a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v13.8h, v1.8b, v4.8b\n" 2415a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v27.4s, v14.8h\n" 2416a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v14.8h, v0.8b, v5.8b\n" 2417a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v31.4s, v15.8h\n" 2418a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v15.8h, v1.8b, v5.8b\n" 2419a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smlal2 v12.8h, v0.16b, v4.16b\n" 2420a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smlal2 v13.8h, v1.16b, v4.16b\n" 2421a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smlal2 v14.8h, v0.16b, v5.16b\n" 2422a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smlal2 v15.8h, v1.16b, v5.16b\n" 2423a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 2424a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "bne " GEMMLOWP_LABEL_LOOP "b\n" 2425a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 2426a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang GEMMLOWP_LABEL_AFTER_LOOP 2427a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang ":\n" 2428a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 2429a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Load accumulators from memory 2430a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v8.16b}, [x0], #16\n" 2431a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v9.16b}, [x0], #16\n" 2432a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v10.16b}, [x0], #16\n" 2433a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v11.16b}, [x0], #16\n" 2434a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "mov x0, %[dst_ptr]\n" 2435a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 2436a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Do the remaining arithmetic for the 16 last levels of depths. 2437a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // All the operands are already loaded. 
2438a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v16.4s, v12.8h\n" 2439a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v12.8h, v0.8b, v6.8b\n" 2440a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v17.4s, v13.8h\n" 2441a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v13.8h, v0.8b, v7.8b\n" 2442a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v20.4s, v14.8h\n" 2443a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v14.8h, v1.8b, v6.8b\n" 2444a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v21.4s, v15.8h\n" 2445a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v15.8h, v1.8b, v7.8b\n" 2446a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smlal2 v12.8h, v0.16b, v6.16b\n" 2447a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smlal2 v13.8h, v0.16b, v7.16b\n" 2448a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smlal2 v14.8h, v1.16b, v6.16b\n" 2449a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smlal2 v15.8h, v1.16b, v7.16b\n" 2450a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v24.4s, v12.8h\n" 2451a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v12.8h, v2.8b, v4.8b\n" 2452a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v28.4s, v13.8h\n" 2453a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v13.8h, v3.8b, v4.8b\n" 2454a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v25.4s, v14.8h\n" 2455a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v14.8h, v2.8b, v5.8b\n" 2456a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v29.4s, v15.8h\n" 2457a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v15.8h, v3.8b, v5.8b\n" 2458a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smlal2 v12.8h, v2.16b, v4.16b\n" 2459a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smlal2 v13.8h, v3.16b, v4.16b\n" 2460a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smlal2 v14.8h, v2.16b, v5.16b\n" 2461a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smlal2 
v15.8h, v3.16b, v5.16b\n" 2462a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v18.4s, v12.8h\n" 2463a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v12.8h, v2.8b, v6.8b\n" 2464a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v19.4s, v13.8h\n" 2465a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v13.8h, v2.8b, v7.8b\n" 2466a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v22.4s, v14.8h\n" 2467a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v14.8h, v3.8b, v6.8b\n" 2468a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v23.4s, v15.8h\n" 2469a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smull v15.8h, v3.8b, v7.8b\n" 2470a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smlal2 v12.8h, v2.16b, v6.16b\n" 2471a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smlal2 v13.8h, v2.16b, v7.16b\n" 2472a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smlal2 v14.8h, v3.16b, v6.16b\n" 2473a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "smlal2 v15.8h, v3.16b, v7.16b\n" 2474a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v26.4s, v12.8h\n" 2475a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v30.4s, v13.8h\n" 2476a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v27.4s, v14.8h\n" 2477a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "sadalp v31.4s, v15.8h\n" 2478a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 2479a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Reduce aggregators horizontally 2480a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "addp v0.4s, v16.4s, v20.4s\n" 2481a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "addp v1.4s, v17.4s, v21.4s\n" 2482a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "addp v2.4s, v18.4s, v22.4s\n" 2483a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "addp v3.4s, v19.4s, v23.4s\n" 2484a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "addp v4.4s, v24.4s, v28.4s\n" 2485a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "addp v5.4s, 
v25.4s, v29.4s\n" 2486a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "addp v6.4s, v26.4s, v30.4s\n" 2487a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "addp v7.4s, v27.4s, v31.4s\n" 2488a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 2489a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "addp v12.4s, v0.4s, v4.4s\n" 2490a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "addp v13.4s, v1.4s, v5.4s\n" 2491a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "addp v14.4s, v2.4s, v6.4s\n" 2492a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "addp v15.4s, v3.4s, v7.4s\n" 2493a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 2494a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Add to the accumulators loaded from memory 2495a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "add v8.4s, v8.4s, v12.4s\n" 2496a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "add v9.4s, v9.4s, v13.4s\n" 2497a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "add v10.4s, v10.4s, v14.4s\n" 2498a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "add v11.4s, v11.4s, v15.4s\n" 2499a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 2500a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Store accumulators back to memory 2501a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v8.16b}, [x0], #16\n" 2502a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v9.16b}, [x0], #16\n" 2503a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v10.16b}, [x0], #16\n" 2504a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v11.16b}, [x0], #16\n" 2505a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // outputs 2506a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 2507a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [dst_ptr] "+r"(dst_ptr), [run_depth] "+r"(run_depth), 2508a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [dst_col_stride] "+r"(dst_col_stride) 2509a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // inputs 
2510a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [start_depth] "r"(start_depth) 2511a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // clobbers 2512a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 2513a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", 2514a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", 2515a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "v28", "v29", "v30", "v31"); 2516a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 2517a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}; 2518a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 25197d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#ifdef __ARM_FEATURE_DOTPROD 25207d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// Kernels utilizing the Armv8.2 Dot Product extension. 25217d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// 25227d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// The dot product instructions work by taking 4 consecutive 8-bit depth 25237d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// values from each operand, multiplying the 4 pairs together and 25247d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// accumulating all the results into the corresponding 32-bit accumulator 25257d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// lane. As such, the operation is identical to a 32-bit instruction (like 25267d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// FMLA used in SGEMM), except that 4 depth values are processed at a time 25277d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// instead of 1. 
25287d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 25297d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// Thus, this first kernel is a carbon copy of 25307d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// "NEON_64bit_GEMM_Float32_WithScalar_A57" (which should provide good 25317d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// performance for most processors) below with the opcode (fmla -> udot) and 25327d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// types (float32 -> uint8/uint32) changed. 25337d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// 25347d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// A signed version of this kernel could be produced by replacing "udot" 25357d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// with "sdot" - performance should be identical to this udot kernel. 25367d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wangstruct NEON_64bit_GEMM_Uint8Operands_Uint32Accumulators_dotproduct { 25377d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang typedef std::uint8_t OperandType; 25387d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang typedef std::uint32_t AccumulatorType; 25397d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang typedef KernelFormat< 25407d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang KernelSideFormat<CellFormat<4, 4, CellOrder::WidthMajor>, 3>, 25417d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang KernelSideFormat<CellFormat<4, 4, CellOrder::WidthMajor>, 2> > 25427d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang Format; 25437d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr, 25447d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang AccumulatorType* accum_ptr, int depth) { 25457d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang asm volatile( 25467d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Load accumulators 25477d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "mov x0, %[accum_ptr]\n" 25487d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v8.4s}, 
[x0], #16\n" 25497d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v16.4s}, [x0], #16\n" 25507d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v24.4s}, [x0], #16\n" 25517d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v9.4s}, [x0], #16\n" 25527d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v17.4s}, [x0], #16\n" 25537d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v25.4s}, [x0], #16\n" 25547d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v10.4s}, [x0], #16\n" 25557d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v18.4s}, [x0], #16\n" 25567d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v26.4s}, [x0], #16\n" 25577d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v11.4s}, [x0], #16\n" 25587d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v19.4s}, [x0], #16\n" 25597d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v27.4s}, [x0], #16\n" 25607d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v12.4s}, [x0], #16\n" 25617d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v20.4s}, [x0], #16\n" 25627d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v28.4s}, [x0], #16\n" 25637d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v13.4s}, [x0], #16\n" 25647d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v21.4s}, [x0], #16\n" 25657d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v29.4s}, [x0], #16\n" 25667d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v14.4s}, [x0], #16\n" 25677d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v22.4s}, [x0], #16\n" 25687d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v30.4s}, [x0], #16\n" 25697d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v15.4s}, [x0], #16\n" 25707d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v23.4s}, [x0], #16\n" 25717d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v31.4s}, [x0], #16\n" 25727d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 
25737d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // The start of the loop assumes first Rhs cell is already loaded, so 25747d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // do it here for first iteration. 25757d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v0.16b}, [%[rhs_ptr]], #16\n" 25767d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 25777d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // And the same for the first Lhs cell. 25787d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v2.16b}, [%[lhs_ptr]], #16\n" 25797d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 25807d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang GEMMLOWP_LABEL_LOOP 25817d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang ":\n" 25827d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 25837d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Start the MACs at the head of the loop - 1st cell from each side 25847d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // already loaded. 25857d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v8.4s, v2.16b, v0.b[0]\n" 25867d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v9.4s, v2.16b, v0.b[1]\n" 25877d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v1.16b}, [%[rhs_ptr]], #16\n" // Load second Rhs cell. 25887d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v10.4s, v2.16b, v0.b[2]\n" 25897d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v11.4s, v2.16b, v0.b[3]\n" 25907d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v3.16b}, [%[lhs_ptr]], #16\n" // Load second Lhs cell. 25917d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v12.4s, v2.16b, v1.b[0]\n" 25927d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v13.4s, v2.16b, v1.b[1]\n" 25937d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v4.16b}, [%[lhs_ptr]], #16\n" // Load third Lhs cell. 
25947d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v14.4s, v2.16b, v1.b[2]\n" 25957d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v15.4s, v2.16b, v1.b[3]\n" 25967d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v2.16b}, [%[lhs_ptr]], #16\n" // Done with first Lhs cell - load 25977d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // for the next iteration early. 25987d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v16.4s, v3.16b, v0.b[0]\n" 25997d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v17.4s, v3.16b, v0.b[1]\n" 26007d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v18.4s, v3.16b, v0.b[2]\n" 26017d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v19.4s, v3.16b, v0.b[3]\n" 26027d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v20.4s, v3.16b, v1.b[0]\n" 26037d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v21.4s, v3.16b, v1.b[1]\n" 26047d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v22.4s, v3.16b, v1.b[2]\n" 26057d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v23.4s, v3.16b, v1.b[3]\n" 26067d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v24.4s, v4.16b, v0.b[0]\n" 26077d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v25.4s, v4.16b, v0.b[1]\n" 26087d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v26.4s, v4.16b, v0.b[2]\n" 26097d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v27.4s, v4.16b, v0.b[3]\n" 26107d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v0.16b}, [%[rhs_ptr]], #16\n" // Done with the first Rhs cell - 26117d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // load for the next iteration early. 26127d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v28.4s, v4.16b, v1.b[0]\n" 26137d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v29.4s, v4.16b, v1.b[1]\n" 26147d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 26157d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Loop. 
Decrement loop index (depth) by 4 as udot processes 4 26167d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // depth values. 26177d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "subs %w[depth], %w[depth], #4\n" 26187d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v30.4s, v4.16b, v1.b[2]\n" 26197d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v31.4s, v4.16b, v1.b[3]\n" 26207d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 26217d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "bne " GEMMLOWP_LABEL_LOOP 26227d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "b\n" 26237d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 26247d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Store accumulators 26257d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "mov x0, %[accum_ptr]\n" 26267d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v8.16b}, [x0], #16\n" 26277d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v16.16b}, [x0], #16\n" 26287d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v24.16b}, [x0], #16\n" 26297d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v9.16b}, [x0], #16\n" 26307d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v17.16b}, [x0], #16\n" 26317d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v25.16b}, [x0], #16\n" 26327d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v10.16b}, [x0], #16\n" 26337d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v18.16b}, [x0], #16\n" 26347d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v26.16b}, [x0], #16\n" 26357d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v11.16b}, [x0], #16\n" 26367d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v19.16b}, [x0], #16\n" 26377d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v27.16b}, [x0], #16\n" 26387d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v12.16b}, [x0], #16\n" 26397d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v20.16b}, [x0], #16\n" 26407d0d5a611e629e7c8946e6720baa6846ade9f015Miao 
Wang "st1 {v28.16b}, [x0], #16\n" 26417d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v13.16b}, [x0], #16\n" 26427d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v21.16b}, [x0], #16\n" 26437d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v29.16b}, [x0], #16\n" 26447d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v14.16b}, [x0], #16\n" 26457d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v22.16b}, [x0], #16\n" 26467d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v30.16b}, [x0], #16\n" 26477d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v15.16b}, [x0], #16\n" 26487d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v23.16b}, [x0], #16\n" 26497d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v31.16b}, [x0], #16\n" 26507d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang : // outputs 26517d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 26527d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang [depth] "+r"(depth) 26537d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang : // inputs 26547d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang [accum_ptr] "r"(accum_ptr) 26557d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang : // clobbers 26567d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 26577d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", 26587d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", 26597d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "v28", "v29", "v30", "v31"); 26607d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang } 26617d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang}; 26627d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 26637d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// As above, except tuned for Cortex-A55r1. 
26647d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// 26657d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// Similarly, this is a clone of NEON_64bit_GEMM_Float32_WithScalar_A55r1 26667d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// with the names changed. 26677d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wangstruct NEON_64bit_GEMM_Uint8Operands_Uint32Accumulators_dotproduct_A55r1 { 26687d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang typedef std::uint8_t OperandType; 26697d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang typedef std::uint32_t AccumulatorType; 26707d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang typedef KernelFormat< 26717d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang KernelSideFormat<CellFormat<4, 4, CellOrder::WidthMajor>, 3>, 26727d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang KernelSideFormat<CellFormat<4, 4, CellOrder::WidthMajor>, 2> > 26737d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang Format; 26747d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr, 26757d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang AccumulatorType* accum_ptr, int depth) { 26767d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang asm volatile( 26777d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Load accumulators 26787d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "mov x0, %[accum_ptr]\n" 26797d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v8.4s}, [x0], #16\n" 26807d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v16.4s}, [x0], #16\n" 26817d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v24.4s}, [x0], #16\n" 26827d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v9.4s}, [x0], #16\n" 26837d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v17.4s}, [x0], #16\n" 26847d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v25.4s}, [x0], #16\n" 26857d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v10.4s}, [x0], #16\n" 
26867d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v18.4s}, [x0], #16\n" 26877d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v26.4s}, [x0], #16\n" 26887d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v11.4s}, [x0], #16\n" 26897d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v19.4s}, [x0], #16\n" 26907d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v27.4s}, [x0], #16\n" 26917d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v12.4s}, [x0], #16\n" 26927d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v20.4s}, [x0], #16\n" 26937d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v28.4s}, [x0], #16\n" 26947d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v13.4s}, [x0], #16\n" 26957d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v21.4s}, [x0], #16\n" 26967d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v29.4s}, [x0], #16\n" 26977d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v14.4s}, [x0], #16\n" 26987d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v22.4s}, [x0], #16\n" 26997d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v30.4s}, [x0], #16\n" 27007d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v15.4s}, [x0], #16\n" 27017d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v23.4s}, [x0], #16\n" 27027d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v31.4s}, [x0], #16\n" 27037d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 27047d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // For details on how this kernel works, see the Float32 kernel below. 
27057d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 27067d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ldr d0, [%[rhs_ptr]]\n" 27077d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ldr x18, [%[rhs_ptr], #8]\n" 27087d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 27097d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ldr q2, [%[lhs_ptr]]\n" 27107d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ldr q3, [%[lhs_ptr], #16]\n" 27117d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 27127d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang GEMMLOWP_LABEL_LOOP 27137d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang ":\n" 27147d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 27157d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v8.4s, v2.16b, v0.b[0]\n" 27167d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ldr d1, [%[rhs_ptr], #16]\n" // Bottom half of v1 27177d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v9.4s, v2.16b, v0.b[1]\n" 27187d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ins v0.d[1], x18\n" // Finish loading v0 27197d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v16.4s, v3.16b, v0.b[0]\n" // out of sequence - used to reduce load/use pressure. 27207d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ldr x18, [%[rhs_ptr], #24]\n" // Top half of v1 to X register 27217d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v17.4s, v3.16b, v0.b[1]\n" // out of sequence - used to reduce load/use pressure. 27227d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "add %[rhs_ptr], %[rhs_ptr], #32\n" // RHS loads complete - increment pointer. 
27237d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v10.4s, v2.16b, v0.b[2]\n" 27247d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ldr d4, [%[lhs_ptr], #32]\n" // Bottom half of v4 27257d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v11.4s, v2.16b, v0.b[3]\n" 27267d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ins v1.d[1], x18\n" // Finish loading v1 27277d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v12.4s, v2.16b, v1.b[0]\n" 27287d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ldr x18, [%[lhs_ptr], #40]\n" // Top half of v4 to X register 27297d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v13.4s, v2.16b, v1.b[1]\n" 27307d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "add %[lhs_ptr], %[lhs_ptr], #48\n" // LHS loads complete - increment pointer. 27317d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v14.4s, v2.16b, v1.b[2]\n" 27327d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 27337d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v15.4s, v2.16b, v1.b[3]\n" 27347d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ldr d2, [%[lhs_ptr]]\n" // Bottom half of v2 (for next time) 27357d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v18.4s, v3.16b, v0.b[2]\n" 27367d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ins v4.d[1], x18\n" // Finish loading v4 27377d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v19.4s, v3.16b, v0.b[3]\n" 27387d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ldr x18, [%[lhs_ptr], #8]\n" // Top half of next v2 to X register 27397d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v20.4s, v3.16b, v1.b[0]\n" 27407d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "subs %w[depth], %w[depth], #4\n" 27417d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v21.4s, v3.16b, v1.b[1]\n" 27427d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 27437d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v22.4s, v3.16b, v1.b[2]\n" 27447d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 
27457d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v23.4s, v3.16b, v1.b[3]\n" 27467d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ldr d3, [%[lhs_ptr], #16]\n" // Bottom half of v3 (for next time) 27477d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v24.4s, v4.16b, v0.b[0]\n" 27487d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ins v2.d[1], x18\n" // Finish loading next v2 27497d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v25.4s, v4.16b, v0.b[1]\n" 27507d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ldr x18, [%[lhs_ptr], #24]\n" // Top half of next v3 to X register 27517d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v26.4s, v4.16b, v0.b[2]\n" 27527d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 27537d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v27.4s, v4.16b, v0.b[3]\n" 27547d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ldr d0, [%[rhs_ptr]]\n" // Bottom half of v0 (for next time) 27557d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v28.4s, v4.16b, v1.b[0]\n" 27567d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ins v3.d[1], x18\n" // Finish loading next v3 27577d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v29.4s, v4.16b, v1.b[1]\n" 27587d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ldr x18, [%[rhs_ptr], #8]\n" // Top half of next v0 to X register 27597d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v30.4s, v4.16b, v1.b[2]\n" 27607d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 27617d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "udot v31.4s, v4.16b, v1.b[3]\n" 27627d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "bne " GEMMLOWP_LABEL_LOOP "b\n" 27637d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 27647d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Store accumulators 27657d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "mov x0, %[accum_ptr]\n" 27667d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v8.4s}, [x0], #16\n" 27677d0d5a611e629e7c8946e6720baa6846ade9f015Miao 
Wang "st1 {v16.4s}, [x0], #16\n" 27687d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v24.4s}, [x0], #16\n" 27697d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v9.4s}, [x0], #16\n" 27707d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v17.4s}, [x0], #16\n" 27717d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v25.4s}, [x0], #16\n" 27727d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v10.4s}, [x0], #16\n" 27737d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v18.4s}, [x0], #16\n" 27747d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v26.4s}, [x0], #16\n" 27757d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v11.4s}, [x0], #16\n" 27767d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v19.4s}, [x0], #16\n" 27777d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v27.4s}, [x0], #16\n" 27787d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v12.4s}, [x0], #16\n" 27797d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v20.4s}, [x0], #16\n" 27807d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v28.4s}, [x0], #16\n" 27817d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v13.4s}, [x0], #16\n" 27827d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v21.4s}, [x0], #16\n" 27837d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v29.4s}, [x0], #16\n" 27847d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v14.4s}, [x0], #16\n" 27857d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v22.4s}, [x0], #16\n" 27867d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v30.4s}, [x0], #16\n" 27877d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v15.4s}, [x0], #16\n" 27887d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v23.4s}, [x0], #16\n" 27897d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v31.4s}, [x0], #16\n" 27907d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang : // outputs 27917d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] 
"+r"(rhs_ptr), 27927d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang [depth] "+r"(depth) 27937d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang : // inputs 27947d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang [accum_ptr] "r"(accum_ptr) 27957d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang : // clobbers 27967d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "cc", "memory", "x0", "x18", "v0", "v1", "v2", "v3", "v4", "v5", "v6", 27977d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", 27987d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", 27997d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "v27", "v28", "v29", "v30", "v31"); 28007d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang } 28017d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang}; 28027d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#endif // __ARM_FEATURE_DOTPROD 28037d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 2804a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// We don't actually use int32*int32 in production. This is just an 2805a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// experiment to help dissociate the effect of integer-vs-float, from the 2806a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// effect of operands width. 
struct NEON_64bit_GEMM_Int32_WithScalar {
  typedef std::int32_t OperandType;
  typedef std::int32_t AccumulatorType;
  typedef KernelFormat<
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 2> >
      Format;
  // Accumulates `depth` levels of an int32 x int32 product into accum_ptr.
  //
  // As implemented by the asm below:
  //  - accum_ptr points at 24 vectors of 4 x int32 (384 bytes). They are
  //    loaded into v8..v31 up front and stored back to the same location on
  //    exit. For each of the 8 RHS values j the three vectors v(8+j),
  //    v(16+j), v(24+j) are stored consecutively.
  //  - Each iteration reads 8 int32 from rhs_ptr (v0, v1) and 12 int32 from
  //    lhs_ptr (v2, v3, v4), post-incrementing both pointers, and issues 24
  //    multiply-accumulate-by-scalar-lane instructions.
  //  - The loop body runs before depth is tested and exits only when depth
  //    reaches exactly zero, so depth must be at least 1.
  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
                  AccumulatorType* accum_ptr, int depth) {
    asm volatile(
        // Load accumulators
        "mov x0, %[accum_ptr]\n"
        "ld1 {v8.16b}, [x0], #16\n"
        "ld1 {v16.16b}, [x0], #16\n"
        "ld1 {v24.16b}, [x0], #16\n"
        "ld1 {v9.16b}, [x0], #16\n"
        "ld1 {v17.16b}, [x0], #16\n"
        "ld1 {v25.16b}, [x0], #16\n"
        "ld1 {v10.16b}, [x0], #16\n"
        "ld1 {v18.16b}, [x0], #16\n"
        "ld1 {v26.16b}, [x0], #16\n"
        "ld1 {v11.16b}, [x0], #16\n"
        "ld1 {v19.16b}, [x0], #16\n"
        "ld1 {v27.16b}, [x0], #16\n"
        "ld1 {v12.16b}, [x0], #16\n"
        "ld1 {v20.16b}, [x0], #16\n"
        "ld1 {v28.16b}, [x0], #16\n"
        "ld1 {v13.16b}, [x0], #16\n"
        "ld1 {v21.16b}, [x0], #16\n"
        "ld1 {v29.16b}, [x0], #16\n"
        "ld1 {v14.16b}, [x0], #16\n"
        "ld1 {v22.16b}, [x0], #16\n"
        "ld1 {v30.16b}, [x0], #16\n"
        "ld1 {v15.16b}, [x0], #16\n"
        "ld1 {v23.16b}, [x0], #16\n"
        "ld1 {v31.16b}, [x0], #16\n"

        GEMMLOWP_LABEL_LOOP
        ":\n"

        // Load 2 Rhs cell of size 1x4 each
        "ld1 {v0.4s}, [%[rhs_ptr]], #16\n"
        "ld1 {v1.4s}, [%[rhs_ptr]], #16\n"

        // Load 3 Lhs cells of size 4x1 each
        "ld1 {v2.4s}, [%[lhs_ptr]], #16\n"
        "ld1 {v3.4s}, [%[lhs_ptr]], #16\n"
        "ld1 {v4.4s}, [%[lhs_ptr]], #16\n"

        // Multiply-accumulate
        // Each LHS cell (v2, v3, v4) is multiplied by each of the 8 RHS
        // scalars (lanes of v0 and v1) into its own accumulator register.
        "mla v8.4s, v2.4s, v0.s[0]\n"
        "mla v9.4s, v2.4s, v0.s[1]\n"
        "mla v10.4s, v2.4s, v0.s[2]\n"
        "mla v11.4s, v2.4s, v0.s[3]\n"
        "mla v12.4s, v2.4s, v1.s[0]\n"
        "mla v13.4s, v2.4s, v1.s[1]\n"
        "mla v14.4s, v2.4s, v1.s[2]\n"
        "mla v15.4s, v2.4s, v1.s[3]\n"
        "mla v16.4s, v3.4s, v0.s[0]\n"
        "mla v17.4s, v3.4s, v0.s[1]\n"
        "mla v18.4s, v3.4s, v0.s[2]\n"
        "mla v19.4s, v3.4s, v0.s[3]\n"
        "mla v20.4s, v3.4s, v1.s[0]\n"
        "mla v21.4s, v3.4s, v1.s[1]\n"
        "mla v22.4s, v3.4s, v1.s[2]\n"
        "mla v23.4s, v3.4s, v1.s[3]\n"
        "mla v24.4s, v4.4s, v0.s[0]\n"
        "mla v25.4s, v4.4s, v0.s[1]\n"
        "mla v26.4s, v4.4s, v0.s[2]\n"
        "mla v27.4s, v4.4s, v0.s[3]\n"
        "mla v28.4s, v4.4s, v1.s[0]\n"
        "mla v29.4s, v4.4s, v1.s[1]\n"
        "mla v30.4s, v4.4s, v1.s[2]\n"
        "mla v31.4s, v4.4s, v1.s[3]\n"

        // Loop. Decrement loop index (depth) by 1, since we just handled 1
        // level of depth.
        "subs %w[depth], %w[depth], #1\n"
        "bne " GEMMLOWP_LABEL_LOOP
        "b\n"

        // Store accumulators
        // Same register order as the loads above, so the in-memory layout is
        // preserved.
        "mov x0, %[accum_ptr]\n"
        "st1 {v8.16b}, [x0], #16\n"
        "st1 {v16.16b}, [x0], #16\n"
        "st1 {v24.16b}, [x0], #16\n"
        "st1 {v9.16b}, [x0], #16\n"
        "st1 {v17.16b}, [x0], #16\n"
        "st1 {v25.16b}, [x0], #16\n"
        "st1 {v10.16b}, [x0], #16\n"
        "st1 {v18.16b}, [x0], #16\n"
        "st1 {v26.16b}, [x0], #16\n"
        "st1 {v11.16b}, [x0], #16\n"
        "st1 {v19.16b}, [x0], #16\n"
        "st1 {v27.16b}, [x0], #16\n"
        "st1 {v12.16b}, [x0], #16\n"
        "st1 {v20.16b}, [x0], #16\n"
        "st1 {v28.16b}, [x0], #16\n"
        "st1 {v13.16b}, [x0], #16\n"
        "st1 {v21.16b}, [x0], #16\n"
        "st1 {v29.16b}, [x0], #16\n"
        "st1 {v14.16b}, [x0], #16\n"
        "st1 {v22.16b}, [x0], #16\n"
        "st1 {v30.16b}, [x0], #16\n"
        "st1 {v15.16b}, [x0], #16\n"
        "st1 {v23.16b}, [x0], #16\n"
        "st1 {v31.16b}, [x0], #16\n"
        :  // outputs
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [depth] "+r"(depth)
        :  // inputs
        [accum_ptr] "r"(accum_ptr)
        :  // clobbers
        "cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
        "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
        "v28", "v29", "v30", "v31");
  }
};

// Not
// very efficient kernel, just an experiment to see what we can do
// without using NEON multiply-with-scalar instructions.
struct NEON_64bit_GEMM_Float32_WithVectorDuplicatingScalar {
  typedef float OperandType;
  typedef float AccumulatorType;
  typedef KernelFormat<
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 2> >
      Format;
  // Accumulates `depth` levels of a float x float product into accum_ptr.
  //
  // As implemented by the asm below:
  //  - accum_ptr points at 24 vectors of 4 floats (384 bytes). They are
  //    loaded into v8..v31 up front and stored back to the same location on
  //    exit. For each of the 8 RHS values j the three vectors v(8+j),
  //    v(16+j), v(24+j) are stored consecutively.
  //  - Each iteration reads 8 floats from rhs_ptr (v5, v6) and 12 floats
  //    from lhs_ptr (v2, v3, v4), post-incrementing both pointers.
  //  - Instead of the by-lane form of fmla, each RHS scalar is first
  //    broadcast with "dup" into v0 or v1 and then used in a plain
  //    vector-by-vector fmla. RHS values are processed two at a time.
  //  - The loop body runs before depth is tested and exits only when depth
  //    reaches exactly zero, so depth must be at least 1.
  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
                  AccumulatorType* accum_ptr, int depth) {
    asm volatile(
        // Load accumulators
        "mov x0, %[accum_ptr]\n"
        "ld1 {v8.16b}, [x0], #16\n"
        "ld1 {v16.16b}, [x0], #16\n"
        "ld1 {v24.16b}, [x0], #16\n"
        "ld1 {v9.16b}, [x0], #16\n"
        "ld1 {v17.16b}, [x0], #16\n"
        "ld1 {v25.16b}, [x0], #16\n"
        "ld1 {v10.16b}, [x0], #16\n"
        "ld1 {v18.16b}, [x0], #16\n"
        "ld1 {v26.16b}, [x0], #16\n"
        "ld1 {v11.16b}, [x0], #16\n"
        "ld1 {v19.16b}, [x0], #16\n"
        "ld1 {v27.16b}, [x0], #16\n"
        "ld1 {v12.16b}, [x0], #16\n"
        "ld1 {v20.16b}, [x0], #16\n"
        "ld1 {v28.16b}, [x0], #16\n"
        "ld1 {v13.16b}, [x0], #16\n"
        "ld1 {v21.16b}, [x0], #16\n"
        "ld1 {v29.16b}, [x0], #16\n"
        "ld1 {v14.16b}, [x0], #16\n"
        "ld1 {v22.16b}, [x0], #16\n"
        "ld1 {v30.16b}, [x0], #16\n"
        "ld1 {v15.16b}, [x0], #16\n"
        "ld1 {v23.16b}, [x0], #16\n"
        "ld1 {v31.16b}, [x0], #16\n"

        GEMMLOWP_LABEL_LOOP
        ":\n"

        // Load 2 Rhs cell of size 1x4 each
        "ld1 {v5.4s}, [%[rhs_ptr]], #16\n"
        "ld1 {v6.4s}, [%[rhs_ptr]], #16\n"

        // Load 3 Lhs cells of size 4x1 each
        "ld1 {v2.4s}, [%[lhs_ptr]], #16\n"
        "ld1 {v3.4s}, [%[lhs_ptr]], #16\n"
        "ld1 {v4.4s}, [%[lhs_ptr]], #16\n"

        // Multiply-accumulate
        // RHS scalars 0 and 1 (lanes 0 and 1 of v5), broadcast into v0/v1.
        "dup v0.4s, v5.s[0]\n"
        "dup v1.4s, v5.s[1]\n"
        "fmla v8.4s, v2.4s, v0.4s\n"
        "fmla v16.4s, v3.4s, v0.4s\n"
        "fmla v24.4s, v4.4s, v0.4s\n"
        "fmla v9.4s, v2.4s, v1.4s\n"
        "fmla v17.4s, v3.4s, v1.4s\n"
        "fmla v25.4s, v4.4s, v1.4s\n"
        // RHS scalars 2 and 3 (lanes 2 and 3 of v5).
        "dup v0.4s, v5.s[2]\n"
        "dup v1.4s, v5.s[3]\n"
        "fmla v10.4s, v2.4s, v0.4s\n"
        "fmla v18.4s, v3.4s, v0.4s\n"
        "fmla v26.4s, v4.4s, v0.4s\n"
        "fmla v11.4s, v2.4s, v1.4s\n"
        "fmla v19.4s, v3.4s, v1.4s\n"
        "fmla v27.4s, v4.4s, v1.4s\n"
        // RHS scalars 4 and 5 (lanes 0 and 1 of v6).
        "dup v0.4s, v6.s[0]\n"
        "dup v1.4s, v6.s[1]\n"
        "fmla v12.4s, v2.4s, v0.4s\n"
        "fmla v20.4s, v3.4s, v0.4s\n"
        "fmla v28.4s, v4.4s, v0.4s\n"
        "fmla v13.4s, v2.4s, v1.4s\n"
        "fmla v21.4s, v3.4s, v1.4s\n"
        "fmla v29.4s, v4.4s, v1.4s\n"
        // RHS scalars 6 and 7 (lanes 2 and 3 of v6).
        "dup v0.4s, v6.s[2]\n"
        "dup v1.4s, v6.s[3]\n"
        "fmla v14.4s, v2.4s, v0.4s\n"
        "fmla v22.4s, v3.4s, v0.4s\n"
        "fmla v30.4s, v4.4s, v0.4s\n"
        "fmla v15.4s, v2.4s, v1.4s\n"
        "fmla v23.4s, v3.4s, v1.4s\n"
        "fmla v31.4s, v4.4s, v1.4s\n"

        // Loop. Decrement loop index (depth) by 1, since we just handled 1
        // level of depth.
        "subs %w[depth], %w[depth], #1\n"
        "bne " GEMMLOWP_LABEL_LOOP
        "b\n"

        // Store accumulators
        // Same register order as the loads above, so the in-memory layout is
        // preserved.
        "mov x0, %[accum_ptr]\n"
        "st1 {v8.16b}, [x0], #16\n"
        "st1 {v16.16b}, [x0], #16\n"
        "st1 {v24.16b}, [x0], #16\n"
        "st1 {v9.16b}, [x0], #16\n"
        "st1 {v17.16b}, [x0], #16\n"
        "st1 {v25.16b}, [x0], #16\n"
        "st1 {v10.16b}, [x0], #16\n"
        "st1 {v18.16b}, [x0], #16\n"
        "st1 {v26.16b}, [x0], #16\n"
        "st1 {v11.16b}, [x0], #16\n"
        "st1 {v19.16b}, [x0], #16\n"
        "st1 {v27.16b}, [x0], #16\n"
        "st1 {v12.16b}, [x0], #16\n"
        "st1 {v20.16b}, [x0], #16\n"
        "st1 {v28.16b}, [x0], #16\n"
        "st1 {v13.16b}, [x0], #16\n"
        "st1 {v21.16b}, [x0], #16\n"
        "st1 {v29.16b}, [x0], #16\n"
        "st1 {v14.16b}, [x0], #16\n"
        "st1 {v22.16b}, [x0], #16\n"
        "st1 {v30.16b}, [x0], #16\n"
        "st1 {v15.16b}, [x0], #16\n"
        "st1 {v23.16b}, [x0], #16\n"
        "st1 {v31.16b}, [x0], #16\n"
        :  // outputs
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [depth] "+r"(depth)
        :  // inputs
        [accum_ptr] "r"(accum_ptr)
        :  // clobbers
        "cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
        "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
        "v28", "v29", "v30", "v31");
  }
};

// This is the "most natural" kernel, using NEON multiply-with-scalar
// instructions.
3059a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_64bit_GEMM_Float32_WithScalar { 3060a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef float OperandType; 3061a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef float AccumulatorType; 3062a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef KernelFormat< 3063a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>, 3064a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 2> > 3065a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Format; 3066a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr, 3067a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang AccumulatorType* accum_ptr, int depth) { 3068a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang asm volatile( 3069a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Load accumulators 3070a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "mov x0, %[accum_ptr]\n" 3071a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v8.16b}, [x0], #16\n" 3072a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v16.16b}, [x0], #16\n" 3073a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v24.16b}, [x0], #16\n" 3074a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v9.16b}, [x0], #16\n" 3075a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v17.16b}, [x0], #16\n" 3076a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v25.16b}, [x0], #16\n" 3077a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v10.16b}, [x0], #16\n" 3078a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v18.16b}, [x0], #16\n" 3079a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v26.16b}, [x0], #16\n" 3080a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v11.16b}, [x0], #16\n" 3081a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v19.16b}, [x0], #16\n" 
3082a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v27.16b}, [x0], #16\n" 3083a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v12.16b}, [x0], #16\n" 3084a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v20.16b}, [x0], #16\n" 3085a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v28.16b}, [x0], #16\n" 3086a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v13.16b}, [x0], #16\n" 3087a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v21.16b}, [x0], #16\n" 3088a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v29.16b}, [x0], #16\n" 3089a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v14.16b}, [x0], #16\n" 3090a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v22.16b}, [x0], #16\n" 3091a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v30.16b}, [x0], #16\n" 3092a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v15.16b}, [x0], #16\n" 3093a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v23.16b}, [x0], #16\n" 3094a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v31.16b}, [x0], #16\n" 3095a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3096a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang GEMMLOWP_LABEL_LOOP 3097a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang ":\n" 3098a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3099a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Load 2 Rhs cell of size 1x4 each 3100a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v0.4s}, [%[rhs_ptr]], #16\n" 3101a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v1.4s}, [%[rhs_ptr]], #16\n" 3102a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3103a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Load 3 Lhs cells of size 4x1 each 3104a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v2.4s}, [%[lhs_ptr]], #16\n" 3105a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v3.4s}, [%[lhs_ptr]], #16\n" 3106a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v4.4s}, [%[lhs_ptr]], 
#16\n" 3107a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3108a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Multiply-accumulate 3109a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v8.4s, v2.4s, v0.s[0]\n" 3110a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v9.4s, v2.4s, v0.s[1]\n" 3111a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v10.4s, v2.4s, v0.s[2]\n" 3112a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v11.4s, v2.4s, v0.s[3]\n" 3113a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v12.4s, v2.4s, v1.s[0]\n" 3114a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v13.4s, v2.4s, v1.s[1]\n" 3115a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v14.4s, v2.4s, v1.s[2]\n" 3116a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v15.4s, v2.4s, v1.s[3]\n" 3117a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v16.4s, v3.4s, v0.s[0]\n" 3118a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v17.4s, v3.4s, v0.s[1]\n" 3119a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v18.4s, v3.4s, v0.s[2]\n" 3120a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v19.4s, v3.4s, v0.s[3]\n" 3121a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v20.4s, v3.4s, v1.s[0]\n" 3122a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v21.4s, v3.4s, v1.s[1]\n" 3123a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v22.4s, v3.4s, v1.s[2]\n" 3124a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v23.4s, v3.4s, v1.s[3]\n" 3125a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v24.4s, v4.4s, v0.s[0]\n" 3126a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v25.4s, v4.4s, v0.s[1]\n" 3127a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v26.4s, v4.4s, v0.s[2]\n" 3128a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v27.4s, v4.4s, v0.s[3]\n" 3129a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v28.4s, v4.4s, v1.s[0]\n" 3130a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 
"fmla v29.4s, v4.4s, v1.s[1]\n" 3131a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v30.4s, v4.4s, v1.s[2]\n" 3132a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v31.4s, v4.4s, v1.s[3]\n" 3133a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3134a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Loop. Decrement loop index (depth) by 1, since we just handled 1 3135a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // level of depth. 3136a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "subs %w[depth], %w[depth], #1\n" 3137a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "bne " GEMMLOWP_LABEL_LOOP 3138a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "b\n" 3139a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3140a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Store accumulators 3141a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "mov x0, %[accum_ptr]\n" 3142a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v8.16b}, [x0], #16\n" 3143a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v16.16b}, [x0], #16\n" 3144a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v24.16b}, [x0], #16\n" 3145a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v9.16b}, [x0], #16\n" 3146a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v17.16b}, [x0], #16\n" 3147a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v25.16b}, [x0], #16\n" 3148a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v10.16b}, [x0], #16\n" 3149a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v18.16b}, [x0], #16\n" 3150a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v26.16b}, [x0], #16\n" 3151a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v11.16b}, [x0], #16\n" 3152a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v19.16b}, [x0], #16\n" 3153a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v27.16b}, [x0], #16\n" 3154a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v12.16b}, [x0], #16\n" 
3155a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v20.16b}, [x0], #16\n" 3156a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v28.16b}, [x0], #16\n" 3157a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v13.16b}, [x0], #16\n" 3158a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v21.16b}, [x0], #16\n" 3159a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v29.16b}, [x0], #16\n" 3160a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v14.16b}, [x0], #16\n" 3161a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v22.16b}, [x0], #16\n" 3162a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v30.16b}, [x0], #16\n" 3163a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v15.16b}, [x0], #16\n" 3164a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v23.16b}, [x0], #16\n" 3165a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v31.16b}, [x0], #16\n" 3166a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // outputs 3167a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 3168a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [depth] "+r"(depth) 3169a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // inputs 3170a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [accum_ptr] "r"(accum_ptr) 3171a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // clobbers 3172a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 3173a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", 3174a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", 3175a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "v28", "v29", "v30", "v31"); 3176a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 3177a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}; 3178a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 
3179a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Faster kernel contributed by ARM. Tuned for A57. 3180a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_64bit_GEMM_Float32_WithScalar_A57 { 3181a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef float OperandType; 3182a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef float AccumulatorType; 3183a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef KernelFormat< 3184a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>, 3185a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 2> > 3186a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Format; 3187a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr, 3188a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang AccumulatorType* accum_ptr, int depth) { 3189a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang asm volatile( 3190a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Load accumulators 3191a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "mov x0, %[accum_ptr]\n" 3192a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v8.16b}, [x0], #16\n" 3193a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v16.16b}, [x0], #16\n" 3194a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v24.16b}, [x0], #16\n" 3195a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v9.16b}, [x0], #16\n" 3196a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v17.16b}, [x0], #16\n" 3197a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v25.16b}, [x0], #16\n" 3198a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v10.16b}, [x0], #16\n" 3199a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v18.16b}, [x0], #16\n" 3200a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v26.16b}, [x0], #16\n" 3201a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 
{v11.16b}, [x0], #16\n" 3202a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v19.16b}, [x0], #16\n" 3203a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v27.16b}, [x0], #16\n" 3204a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v12.16b}, [x0], #16\n" 3205a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v20.16b}, [x0], #16\n" 3206a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v28.16b}, [x0], #16\n" 3207a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v13.16b}, [x0], #16\n" 3208a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v21.16b}, [x0], #16\n" 3209a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v29.16b}, [x0], #16\n" 3210a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v14.16b}, [x0], #16\n" 3211a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v22.16b}, [x0], #16\n" 3212a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v30.16b}, [x0], #16\n" 3213a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v15.16b}, [x0], #16\n" 3214a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v23.16b}, [x0], #16\n" 3215a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v31.16b}, [x0], #16\n" 3216a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3217a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // The start of the loop assumes first Rhs cell is already loaded, so 3218a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // do it here for first iteration. 3219a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v0.4s}, [%[rhs_ptr]], #16\n" 3220a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3221a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // And the same for the first Lhs cell. 
3222a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v2.4s}, [%[lhs_ptr]], #16\n" 3223a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3224a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang GEMMLOWP_LABEL_LOOP 3225a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang ":\n" 3226a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3227a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Start the MACs at the head of the loop - 1st cell from each side 3228a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // already loaded. 3229a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v8.4s, v2.4s, v0.s[0]\n" 3230a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v9.4s, v2.4s, v0.s[1]\n" 3231a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v1.4s}, [%[rhs_ptr]], #16\n" // Load second Rhs cell. 3232a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v10.4s, v2.4s, v0.s[2]\n" 3233a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v11.4s, v2.4s, v0.s[3]\n" 3234a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v3.4s}, [%[lhs_ptr]], #16\n" // Load second Lhs cell. 3235a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v12.4s, v2.4s, v1.s[0]\n" 3236a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v13.4s, v2.4s, v1.s[1]\n" 3237a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v4.4s}, [%[lhs_ptr]], #16\n" // Load third Lhs cell. 3238a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v14.4s, v2.4s, v1.s[2]\n" 3239a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v15.4s, v2.4s, v1.s[3]\n" 3240a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v2.4s}, [%[lhs_ptr]], #16\n" // Done with first Lhs cell - load 3241a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // for the next iteration early. 
3242a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v16.4s, v3.4s, v0.s[0]\n" 3243a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v17.4s, v3.4s, v0.s[1]\n" 3244a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v18.4s, v3.4s, v0.s[2]\n" 3245a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v19.4s, v3.4s, v0.s[3]\n" 3246a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v20.4s, v3.4s, v1.s[0]\n" 3247a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v21.4s, v3.4s, v1.s[1]\n" 3248a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v22.4s, v3.4s, v1.s[2]\n" 3249a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v23.4s, v3.4s, v1.s[3]\n" 3250a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v24.4s, v4.4s, v0.s[0]\n" 3251a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v25.4s, v4.4s, v0.s[1]\n" 3252a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v26.4s, v4.4s, v0.s[2]\n" 3253a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v27.4s, v4.4s, v0.s[3]\n" 3254a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v0.4s}, [%[rhs_ptr]], #16\n" // Done with the first Rhs cell - 3255a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // load for the next iteration 3256a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // early. 3257a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v28.4s, v4.4s, v1.s[0]\n" 3258a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v29.4s, v4.4s, v1.s[1]\n" 3259a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Loop. Decrement loop index (depth) by 1, since we just handled 3260a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 1 level of depth. Do this a bit before the end of the loop for 3261a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // better dispatch on A57. 
3262a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "subs %w[depth], %w[depth], #1\n" 3263a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v30.4s, v4.4s, v1.s[2]\n" 3264a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v31.4s, v4.4s, v1.s[3]\n" 3265a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3266a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "bne " GEMMLOWP_LABEL_LOOP 3267a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "b\n" 3268a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3269a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Store accumulators 3270a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "mov x0, %[accum_ptr]\n" 3271a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v8.16b}, [x0], #16\n" 3272a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v16.16b}, [x0], #16\n" 3273a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v24.16b}, [x0], #16\n" 3274a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v9.16b}, [x0], #16\n" 3275a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v17.16b}, [x0], #16\n" 3276a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v25.16b}, [x0], #16\n" 3277a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v10.16b}, [x0], #16\n" 3278a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v18.16b}, [x0], #16\n" 3279a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v26.16b}, [x0], #16\n" 3280a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v11.16b}, [x0], #16\n" 3281a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v19.16b}, [x0], #16\n" 3282a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v27.16b}, [x0], #16\n" 3283a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v12.16b}, [x0], #16\n" 3284a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v20.16b}, [x0], #16\n" 3285a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v28.16b}, [x0], #16\n" 3286a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v13.16b}, [x0], #16\n" 
3287a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v21.16b}, [x0], #16\n" 3288a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v29.16b}, [x0], #16\n" 3289a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v14.16b}, [x0], #16\n" 3290a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v22.16b}, [x0], #16\n" 3291a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v30.16b}, [x0], #16\n" 3292a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v15.16b}, [x0], #16\n" 3293a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v23.16b}, [x0], #16\n" 3294a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v31.16b}, [x0], #16\n" 3295a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // outputs 3296a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 3297a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [depth] "+r"(depth) 3298a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // inputs 3299a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [accum_ptr] "r"(accum_ptr) 3300a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // clobbers 3301a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 3302a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", 3303a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", 3304a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "v28", "v29", "v30", "v31"); 3305a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 3306a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}; 3307a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3308a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifndef __APPLE__ 3309a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Faster kernel contributed by ARM. Tuned for A53. 
3310a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_64bit_GEMM_Float32_WithScalar_A53 { 3311a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef float OperandType; 3312a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef float AccumulatorType; 3313a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef KernelFormat< 3314a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>, 3315a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 2> > 3316a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Format; 3317a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr, 3318a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang AccumulatorType* accum_ptr, int depth) { 3319a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang asm volatile( 3320a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Load accumulators 3321a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "mov x0, %[accum_ptr]\n" 3322a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v8.16b}, [x0], #16\n" 3323a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v16.16b}, [x0], #16\n" 3324a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v24.16b}, [x0], #16\n" 3325a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v9.16b}, [x0], #16\n" 3326a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v17.16b}, [x0], #16\n" 3327a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v25.16b}, [x0], #16\n" 3328a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v10.16b}, [x0], #16\n" 3329a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v18.16b}, [x0], #16\n" 3330a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v26.16b}, [x0], #16\n" 3331a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v11.16b}, [x0], #16\n" 3332a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v19.16b}, [x0], #16\n" 
3333a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v27.16b}, [x0], #16\n" 3334a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v12.16b}, [x0], #16\n" 3335a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v20.16b}, [x0], #16\n" 3336a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v28.16b}, [x0], #16\n" 3337a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v13.16b}, [x0], #16\n" 3338a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v21.16b}, [x0], #16\n" 3339a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v29.16b}, [x0], #16\n" 3340a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v14.16b}, [x0], #16\n" 3341a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v22.16b}, [x0], #16\n" 3342a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v30.16b}, [x0], #16\n" 3343a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v15.16b}, [x0], #16\n" 3344a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v23.16b}, [x0], #16\n" 3345a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v31.16b}, [x0], #16\n" 3346a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3347a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // For A53, a very different-looking loop is needed. 3348a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 3349a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // The main reason for this is that on A53 128-bit loads take two 3350a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // cycles during which no dual issue can occur. Doing two separate 3351a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 64-bit loads avoids this issue - they each take one cycle and are 3352a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // able to dual issue. Since vector register loads don't dual issue 3353a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // with FMLA, we load half the register as normal and the other half 3354a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // into an integer register. 
This second half can then be moved into 3355a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // place later with an INS instruction - which will dual issue with a 3356a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // later FP load. 3357a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 3358a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // For this kernel there are approximately 3 times as many multiplies 3359a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // as loads, so it makes sense to structure the loop into blocks of 4 3360a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // cycles, with 1 dedicated "load cycle" and 3 "multiply cycles" per 3361a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // block. Strictly preserving this structure with NOPs where no load 3362a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // is needed seems to result in higher performance. 3363a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 3364a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Choice of x18 to store the upper halves on their way into the 3365a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // vector registers is arbitrary. Added to the clobber list so that 3366a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // the compiler will make it available. 3367a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 3368a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 3369a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // At the start of the loop, it is assumed that v0 is "half loaded" - 3370a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // bottom half in place in d0 and the upper half in x18 ready to 3371a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // insert. 
So set that up here for the first iteration: 3372a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ldr d0, [%[rhs_ptr]]\n" // Bottom half of first Rhs cell 3373a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ldr x18, [%[rhs_ptr], #8]\n" // Upper half 3374a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "add %[rhs_ptr], %[rhs_ptr], #16\n" // Separate increment (needed as 3375a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // there is no operation to load at 3376a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // reg + 8 but then increment reg 3377a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // by 16). 3378a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3379a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // v2 should be fully loaded - as it's outside the loop proper it's fine 3380a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // to use a 128-bit load here. 3381a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ld1 {v2.4s}, [%[lhs_ptr]], #16\n" // first Lhs cell 3382a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3383a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang GEMMLOWP_LABEL_LOOP 3384a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang ":\n" 3385a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3386a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // First block of four cycles. Multplies all require v2 and v0; v2 is 3387a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // loaded earlier and v0 is half loaded and completed in the load 3388a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // cycle at the start. 3389a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ldr d1, [%[rhs_ptr]]\n" // "load" cycle - loading bottom half of v1 3390a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // (second Rhs cell). 3391a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ins v0.d[1], x18\n" // "load" cycle - moving the upper half of v0 into 3392a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // place. 
3393a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v8.4s, v2.4s, v0.s[0]\n" // "fmla" cycle 1 - first multiply. 3394a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ldr x18, [%[rhs_ptr], #8]\n" // "fmla" cycle 1 - load upper half of v1 3395a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // into x18. 3396a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v9.4s, v2.4s, v0.s[1]\n" // "fmla" cycle 2 - second multiply 3397a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "add %[rhs_ptr], %[rhs_ptr], #16\n" // "fmla" cycle 2 - increment Rhs 3398a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // pointer (if needed) 3399a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v10.4s, v2.4s, v0.s[2]\n" // "fmla" cycle 3 - third multiply. No 3400a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // more work to dual issue. 3401a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3402a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Second block. Start loading v3 (second Lhs cell), finish loading v1. 3403a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ldr d3, [%[lhs_ptr]]\n" 3404a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ins v1.d[1], x18\n" // v1 ready here. 3405a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v11.4s, v2.4s, v0.s[3]\n" 3406a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ldr x18, [%[lhs_ptr], #8]\n" 3407a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v12.4s, v2.4s, v1.s[0]\n" // First use of v1. 3408a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "add %[lhs_ptr], %[lhs_ptr], #16\n" 3409a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v13.4s, v2.4s, v1.s[1]\n" 3410a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3411a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Third block. Start loading v4 (third Lhs cell), finish loading v3. 3412a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ldr d4, [%[lhs_ptr]]\n" 3413a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ins v3.d[1], x18\n" // v3 ready here. 
3414a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v14.4s, v2.4s, v1.s[2]\n" 3415a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ldr x18, [%[lhs_ptr], #8]\n" 3416a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v15.4s, v2.4s, v1.s[3]\n" 3417a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "add %[lhs_ptr], %[lhs_ptr], #16\n" 3418a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v16.4s, v3.4s, v0.s[0]\n" // First use of v3. 3419a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3420a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Fourth block. v2 (first Lhs cell) is now finished with, so start 3421a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // loading value for next iteration. Finish loading v4. 3422a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ldr d2, [%[lhs_ptr]]\n" 3423a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ins v4.d[1], x18\n" // v4 ready here. 3424a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v17.4s, v3.4s, v0.s[1]\n" 3425a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ldr x18, [%[lhs_ptr], #8]\n" 3426a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v18.4s, v3.4s, v0.s[2]\n" 3427a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "add %[lhs_ptr], %[lhs_ptr], #16\n" 3428a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v19.4s, v3.4s, v0.s[3]\n" 3429a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3430a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Fifth block, finish loading v2. No new load to start as the other 3431a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // registers are all still live. 
3432a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ins v2.d[1], x18\n" 3433a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v20.4s, v3.4s, v1.s[0]\n" 3434a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v21.4s, v3.4s, v1.s[1]\n" 3435a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v22.4s, v3.4s, v1.s[2]\n" 3436a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3437a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Sixth block, nothing to load. 2 nops needed as a single nop would 3438a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // dual issue with the FMLA and break the timing. 3439a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "nop\n" 3440a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "nop\n" 3441a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v23.4s, v3.4s, v1.s[3]\n" 3442a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v24.4s, v4.4s, v0.s[0]\n" // First use of v4. 3443a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v25.4s, v4.4s, v0.s[1]\n" 3444a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3445a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Seventh block, nothing to load. Decrement the loop counter in this 3446a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // block as the last block is very full. 3447a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "nop\n" 3448a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "nop\n" 3449a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v26.4s, v4.4s, v0.s[2]\n" 3450a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "subs %w[depth], %w[depth], #1\n" 3451a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v27.4s, v4.4s, v0.s[3]\n" 3452a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v28.4s, v4.4s, v1.s[0]\n" 3453a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3454a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Eighth block - start loading v0 for next iteration. 
3455a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ldr d0, [%[rhs_ptr]]\n" 3456a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v29.4s, v4.4s, v1.s[1]\n" 3457a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "ldr x18, [%[rhs_ptr], #8]\n" 3458a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v30.4s, v4.4s, v1.s[2]\n" 3459a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "add %[rhs_ptr], %[rhs_ptr], #16\n" 3460a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "fmla v31.4s, v4.4s, v1.s[3]\n" 3461a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3462a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Loop branch. This will dual issue in fmla cycle 3 of the 8th block. 3463a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "bne " GEMMLOWP_LABEL_LOOP 3464a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "b\n" 3465a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3466a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Store accumulators 3467a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "mov x0, %[accum_ptr]\n" 3468a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v8.16b}, [x0], #16\n" 3469a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v16.16b}, [x0], #16\n" 3470a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v24.16b}, [x0], #16\n" 3471a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v9.16b}, [x0], #16\n" 3472a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v17.16b}, [x0], #16\n" 3473a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v25.16b}, [x0], #16\n" 3474a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v10.16b}, [x0], #16\n" 3475a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v18.16b}, [x0], #16\n" 3476a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v26.16b}, [x0], #16\n" 3477a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v11.16b}, [x0], #16\n" 3478a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v19.16b}, [x0], #16\n" 3479a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao 
Wang "st1 {v27.16b}, [x0], #16\n" 3480a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v12.16b}, [x0], #16\n" 3481a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v20.16b}, [x0], #16\n" 3482a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v28.16b}, [x0], #16\n" 3483a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v13.16b}, [x0], #16\n" 3484a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v21.16b}, [x0], #16\n" 3485a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v29.16b}, [x0], #16\n" 3486a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v14.16b}, [x0], #16\n" 3487a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v22.16b}, [x0], #16\n" 3488a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v30.16b}, [x0], #16\n" 3489a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v15.16b}, [x0], #16\n" 3490a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v23.16b}, [x0], #16\n" 3491a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "st1 {v31.16b}, [x0], #16\n" 3492a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // outputs 3493a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 3494a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [depth] "+r"(depth) 3495a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // inputs 3496a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang [accum_ptr] "r"(accum_ptr) 3497a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang : // clobbers 3498a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "cc", "memory", "x0", "x18", "v0", "v1", "v2", "v3", "v4", "v5", "v6", 3499a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", 3500a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", 3501a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang "v27", "v28", "v29", "v30", "v31"); 3502a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao 
Wang } 3503a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}; 3504a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif 3505a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 35067d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// Faster kernel contributed by ARM. Tuned for A55r1. 35077d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wangstruct NEON_64bit_GEMM_Float32_WithScalar_A55r1 { 35087d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang typedef float OperandType; 35097d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang typedef float AccumulatorType; 35107d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang typedef KernelFormat< 35117d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>, 35127d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 2> > 35137d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang Format; 35147d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr, 35157d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang AccumulatorType* accum_ptr, int depth) { 35167d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang asm volatile( 35177d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Load accumulators 35187d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "mov x0, %[accum_ptr]\n" 35197d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v8.4s}, [x0], #16\n" 35207d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v16.4s}, [x0], #16\n" 35217d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v24.4s}, [x0], #16\n" 35227d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v9.4s}, [x0], #16\n" 35237d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v17.4s}, [x0], #16\n" 35247d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v25.4s}, [x0], #16\n" 35257d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v10.4s}, [x0], #16\n" 35267d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 
{v18.4s}, [x0], #16\n" 35277d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v26.4s}, [x0], #16\n" 35287d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v11.4s}, [x0], #16\n" 35297d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v19.4s}, [x0], #16\n" 35307d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v27.4s}, [x0], #16\n" 35317d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v12.4s}, [x0], #16\n" 35327d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v20.4s}, [x0], #16\n" 35337d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v28.4s}, [x0], #16\n" 35347d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v13.4s}, [x0], #16\n" 35357d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v21.4s}, [x0], #16\n" 35367d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v29.4s}, [x0], #16\n" 35377d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v14.4s}, [x0], #16\n" 35387d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v22.4s}, [x0], #16\n" 35397d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v30.4s}, [x0], #16\n" 35407d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v15.4s}, [x0], #16\n" 35417d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v23.4s}, [x0], #16\n" 35427d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld1 {v31.4s}, [x0], #16\n" 35437d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 35447d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // A55r1 requires a hybrid of the A53 and standard approaches. 35457d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // 35467d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Like A53, this processor prefers 64-bit loads. 35477d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // 35487d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Unlike A53, it is capable of dual-issuing a 64-bit vector load 35497d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // (or INS) with a FMLA instruction. 
35507d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // 35517d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Therefore we aim to issue an FMLA instruction every cycle. 35527d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Alongside three FMLAs we can dual issue a (vector) 64-bit load, a 35537d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // scalar 64-bit load and finally an INS to replicate the effect of 35547d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // a single 128-bit load. 35557d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // 35567d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // The loop contains 24 FMLA instructions, and 5 vector registers 35577d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // need to be loaded, consuming 15 dual issue slots. This leaves 9 35587d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // dual issue slots. Four of these are used for loop housekeeping 35597d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // (2 pointer adds, 1 counter update and 1 branch), leaving 5 left 35607d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // over (marked by blank lines). 35617d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // 35627d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Choice of x18 to store the upper halves on their way into the 35637d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // vector registers is arbitrary. Added to the clobber list so that 35647d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // the compiler will make it available. 35657d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 35667d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 35677d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // At the start of the loop, it is assumed that v0 is "half loaded" - 35687d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // bottom half in place in d0 and the upper half in x18 ready to 35697d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // insert. 
So set that up here for the first iteration: 35707d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ldr d0, [%[rhs_ptr]]\n" // Bottom half of first Rhs cell 35717d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ldr x18, [%[rhs_ptr], #8]\n" // Upper half 35727d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 35737d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // v2-v3 should be fully loaded - as it's outside the loop proper it's fine 35747d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // to use a 128-bit load here. 35757d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ldr q2, [%[lhs_ptr]]\n" // first Lhs cell 35767d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ldr q3, [%[lhs_ptr], #16]\n" // second Lhs cell 35777d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 35787d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang GEMMLOWP_LABEL_LOOP 35797d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang ":\n" 35807d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 35817d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fmla v8.4s, v2.4s, v0.s[0]\n" 35827d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ldr d1, [%[rhs_ptr], #16]\n" // Bottom half of v1 35837d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fmla v9.4s, v2.4s, v0.s[1]\n" 35847d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ins v0.d[1], x18\n" // Finish loading v0 35857d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fmla v16.4s, v3.4s, v0.s[0]\n" // out of sequence - used to reduce load/use pressure. 35867d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ldr x18, [%[rhs_ptr], #24]\n" // Top half of v1 to X register 35877d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fmla v17.4s, v3.4s, v0.s[1]\n" // out of sequence - used to reduce load/use pressure. 35887d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "add %[rhs_ptr], %[rhs_ptr], #32\n" // RHS loads complete - increment pointer. 
35897d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fmla v10.4s, v2.4s, v0.s[2]\n" 35907d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ldr d4, [%[lhs_ptr], #32]\n" // Bottom half of v4 35917d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fmla v11.4s, v2.4s, v0.s[3]\n" 35927d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ins v1.d[1], x18\n" // Finish loading v1 35937d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fmla v12.4s, v2.4s, v1.s[0]\n" 35947d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ldr x18, [%[lhs_ptr], #40]\n" // Top half of v4 to X register 35957d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fmla v13.4s, v2.4s, v1.s[1]\n" 35967d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "add %[lhs_ptr], %[lhs_ptr], #48\n" // LHS loads complete - increment pointer. 35977d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fmla v14.4s, v2.4s, v1.s[2]\n" 35987d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 35997d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fmla v15.4s, v2.4s, v1.s[3]\n" 36007d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ldr d2, [%[lhs_ptr]]\n" // Bottom half of v2 (for next time) 36017d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fmla v18.4s, v3.4s, v0.s[2]\n" 36027d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ins v4.d[1], x18\n" // Finish loading v4 36037d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fmla v19.4s, v3.4s, v0.s[3]\n" 36047d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ldr x18, [%[lhs_ptr], #8]\n" // Top half of next v2 to X register 36057d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fmla v20.4s, v3.4s, v1.s[0]\n" 36067d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "subs %w[depth], %w[depth], #1\n" 36077d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fmla v21.4s, v3.4s, v1.s[1]\n" 36087d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 36097d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fmla v22.4s, v3.4s, v1.s[2]\n" 36107d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 
36117d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fmla v23.4s, v3.4s, v1.s[3]\n" 36127d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ldr d3, [%[lhs_ptr], #16]\n" // Bottom half of v3 (for next time) 36137d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fmla v24.4s, v4.4s, v0.s[0]\n" 36147d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ins v2.d[1], x18\n" // Finish loading next v2 36157d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fmla v25.4s, v4.4s, v0.s[1]\n" 36167d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ldr x18, [%[lhs_ptr], #24]\n" // Top half of next v3 to X register 36177d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fmla v26.4s, v4.4s, v0.s[2]\n" 36187d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 36197d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fmla v27.4s, v4.4s, v0.s[3]\n" 36207d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ldr d0, [%[rhs_ptr]]\n" // Bottom half of v0 (for next time) 36217d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fmla v28.4s, v4.4s, v1.s[0]\n" 36227d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ins v3.d[1], x18\n" // Finish loading next v3 36237d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fmla v29.4s, v4.4s, v1.s[1]\n" 36247d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ldr x18, [%[rhs_ptr], #8]\n" // Top half of next v0 to X register 36257d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fmla v30.4s, v4.4s, v1.s[2]\n" 36267d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 36277d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fmla v31.4s, v4.4s, v1.s[3]\n" 36287d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "bne " GEMMLOWP_LABEL_LOOP "b\n" 36297d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 36307d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Store accumulators 36317d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "mov x0, %[accum_ptr]\n" 36327d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v8.4s}, [x0], #16\n" 36337d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 
{v16.4s}, [x0], #16\n" 36347d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v24.4s}, [x0], #16\n" 36357d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v9.4s}, [x0], #16\n" 36367d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v17.4s}, [x0], #16\n" 36377d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v25.4s}, [x0], #16\n" 36387d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v10.4s}, [x0], #16\n" 36397d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v18.4s}, [x0], #16\n" 36407d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v26.4s}, [x0], #16\n" 36417d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v11.4s}, [x0], #16\n" 36427d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v19.4s}, [x0], #16\n" 36437d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v27.4s}, [x0], #16\n" 36447d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v12.4s}, [x0], #16\n" 36457d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v20.4s}, [x0], #16\n" 36467d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v28.4s}, [x0], #16\n" 36477d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v13.4s}, [x0], #16\n" 36487d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v21.4s}, [x0], #16\n" 36497d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v29.4s}, [x0], #16\n" 36507d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v14.4s}, [x0], #16\n" 36517d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v22.4s}, [x0], #16\n" 36527d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v30.4s}, [x0], #16\n" 36537d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v15.4s}, [x0], #16\n" 36547d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v23.4s}, [x0], #16\n" 36557d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st1 {v31.4s}, [x0], #16\n" 36567d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang : // outputs 36577d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 
36587d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang [depth] "+r"(depth) 36597d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang : // inputs 36607d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang [accum_ptr] "r"(accum_ptr) 36617d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang : // clobbers 36627d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "cc", "memory", "x0", "x18", "v0", "v1", "v2", "v3", "v4", "v5", "v6", 36637d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", 36647d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", 36657d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "v27", "v28", "v29", "v30", "v31"); 36667d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang } 36677d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang}; 36687d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 3669a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif // __aarch64__ 3670a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 36717d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#if defined(__arm__) || defined(__aarch64__) 3672a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifndef __aarch64__ 3673a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) { 3674a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const int32x2_t c = vpadd_s32(vget_low_s32(a), vget_high_s32(a)); 3675a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const int32x2_t d = vpadd_s32(vget_low_s32(b), vget_high_s32(b)); 3676a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang return vcombine_s32(c, d); 3677a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang} 3678a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif 3679a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3680a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// C++ intrinsics-based variant of the deep, int8, fast kernel 3681a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <int 
Cols> 3682a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_GEMM_Int8Operands_AccumTwoWithin16Bits_intrinsics { 3683a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef std::int8_t OperandType; 3684a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef std::int32_t AccumulatorType; 3685a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef KernelFormat< 3686a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<4, 16, CellOrder::WidthMajor>, 1>, 3687a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<Cols, 16, CellOrder::WidthMajor>, 1> > 3688a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Format; 3689a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr, 3690a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang AccumulatorType* accum_ptr, int depth) { 3691a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang int32x4_t acc[4][Cols]; 3692a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int i = 0; i < 4; i++) { 3693a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int j = 0; j < Cols; j++) { 3694a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang acc[i][j] = vdupq_n_s32(0); 3695a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 3696a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 3697a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int d = 0; d < depth; d += 16) { 3698a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang int8x16_t lhs[4]; 3699a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int i = 0; i < 4; i++) { 3700a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang lhs[i] = vld1q_s8(lhs_ptr + 16 * i); 3701a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 3702a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang int8x16_t rhs[Cols]; 3703a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int i = 0; i < Cols; i++) { 3704a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang rhs[i] = vld1q_s8(rhs_ptr + 16 * i); 
3705a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 3706a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int i = 0; i < 4; i++) { 3707a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int j = 0; j < Cols; j++) { 3708a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang int16x8_t local_acc = 3709a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang vmull_s8(vget_low_s8(lhs[i]), vget_low_s8(rhs[j])); 3710a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang local_acc = 3711a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang vmlal_s8(local_acc, vget_high_s8(lhs[i]), vget_high_s8(rhs[j])); 3712a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang acc[i][j] = vpadalq_s16(acc[i][j], local_acc); 3713a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 3714a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 3715a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang lhs_ptr += 64; 3716a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang rhs_ptr += 16 * Cols; 3717a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 3718a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int i = 0; i < Cols; i++) { 3719a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang int32x4_t acc_2x_0 = vpaddq_s32(acc[0][i], acc[1][i]); 3720a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang int32x4_t acc_2x_1 = vpaddq_s32(acc[2][i], acc[3][i]); 3721a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang int32x4_t acc_4x = vpaddq_s32(acc_2x_0, acc_2x_1); 3722a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang int32x4_t dst_val = vld1q_s32(accum_ptr + 4 * i); 3723a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang dst_val = vaddq_s32(dst_val, acc_4x); 3724a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang vst1q_s32(accum_ptr + 4 * i, dst_val); 3725a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 3726a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 3727a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}; 3728a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3729a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangusing 
NEON_64bit_GEMM_Int8Operands_AccumTwoWithin16Bits_intrinsics = 3730a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang NEON_GEMM_Int8Operands_AccumTwoWithin16Bits_intrinsics<4>; 3731a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3732a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangusing NEON_32bit_GEMM_Int8Operands_AccumTwoWithin16Bits_intrinsics = 3733a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang NEON_GEMM_Int8Operands_AccumTwoWithin16Bits_intrinsics<2>; 3734a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3735a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// C++ intrinsics-based variant of the wide, uint8, general kernel 3736a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <int RhsCells> 3737a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_GEMM_Uint8Operands_Uint32Accumulators_intrinsics { 3738a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef std::uint8_t OperandType; 3739a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef std::int32_t AccumulatorType; 3740a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef KernelFormat< 3741a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 3>, 3742a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, RhsCells> > 3743a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Format; 3744a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr, 3745a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang AccumulatorType* accum_ptr, int depth) { 3746a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang int32x4_t acc[3][4 * RhsCells]; 3747a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int i = 0; i < 3; i++) { 3748a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int j = 0; j < 4 * RhsCells; j++) { 3749a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang acc[i][j] = vld1q_s32(accum_ptr + 4 * (i + 3 * j)); 
3750a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 3751a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 3752a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int d = 0; d < depth; d += 2) { 3753a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang int16x8_t lhs[3]; 3754a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int i = 0; i < 3; i++) { 3755a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang lhs[i] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(lhs_ptr + 8 * i))); 3756a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 3757a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang int16x8_t rhs[RhsCells]; 3758a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int i = 0; i < RhsCells; i++) { 3759a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang rhs[i] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(rhs_ptr + 8 * i))); 3760a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 3761a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int i = 0; i < 3; i++) { 3762a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int j = 0; j < RhsCells; j++) { 3763a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang acc[i][4 * j + 0] = vmlal_lane_s16( 3764a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang acc[i][4 * j + 0], vget_low_s16(lhs[i]), vget_low_s16(rhs[j]), 0); 3765a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang acc[i][4 * j + 1] = vmlal_lane_s16( 3766a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang acc[i][4 * j + 1], vget_low_s16(lhs[i]), vget_low_s16(rhs[j]), 1); 3767a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang acc[i][4 * j + 2] = vmlal_lane_s16( 3768a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang acc[i][4 * j + 2], vget_low_s16(lhs[i]), vget_low_s16(rhs[j]), 2); 3769a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang acc[i][4 * j + 3] = vmlal_lane_s16( 3770a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang acc[i][4 * j + 3], vget_low_s16(lhs[i]), vget_low_s16(rhs[j]), 3); 3771a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang acc[i][4 * j + 0] = 
3772a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang vmlal_lane_s16(acc[i][4 * j + 0], vget_high_s16(lhs[i]), 3773a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang vget_high_s16(rhs[j]), 0); 3774a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang acc[i][4 * j + 1] = 3775a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang vmlal_lane_s16(acc[i][4 * j + 1], vget_high_s16(lhs[i]), 3776a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang vget_high_s16(rhs[j]), 1); 3777a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang acc[i][4 * j + 2] = 3778a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang vmlal_lane_s16(acc[i][4 * j + 2], vget_high_s16(lhs[i]), 3779a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang vget_high_s16(rhs[j]), 2); 3780a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang acc[i][4 * j + 3] = 3781a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang vmlal_lane_s16(acc[i][4 * j + 3], vget_high_s16(lhs[i]), 3782a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang vget_high_s16(rhs[j]), 3); 3783a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 3784a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 3785a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang lhs_ptr += 24; 3786a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang rhs_ptr += 8 * RhsCells; 3787a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 3788a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int i = 0; i < 3; i++) { 3789a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int j = 0; j < 4 * RhsCells; j++) { 3790a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang vst1q_s32(accum_ptr + 4 * (i + 3 * j), acc[i][j]); 3791a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 3792a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 3793a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 3794a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}; 3795a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3796a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangusing NEON_32bit_GEMM_Uint8Operands_Uint32Accumulators_intrinsics = 
3797a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang NEON_GEMM_Uint8Operands_Uint32Accumulators_intrinsics<1>; 3798a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3799a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangusing NEON_64bit_GEMM_Uint8Operands_Uint32Accumulators_intrinsics = 3800a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang NEON_GEMM_Uint8Operands_Uint32Accumulators_intrinsics<2>; 3801a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3802a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <int RhsCells> 3803a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_GEMM_Float32_WithScalar_intrinsics { 3804a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef float OperandType; 3805a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef float AccumulatorType; 3806a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef KernelFormat< 3807a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>, 3808a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, RhsCells> > 3809a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Format; 3810a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr, 3811a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang AccumulatorType* accum_ptr, int depth) { 3812a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang float32x4_t acc[3][4 * RhsCells]; 3813a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int i = 0; i < 3; i++) { 3814a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int j = 0; j < 4 * RhsCells; j++) { 3815a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang acc[i][j] = vld1q_f32(accum_ptr + 4 * (i + 3 * j)); 3816a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 3817a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 3818a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int d = 0; d < depth; d++) { 
3819a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang float32x4_t lhs[3]; 3820a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int i = 0; i < 3; i++) { 3821a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang lhs[i] = vld1q_f32(lhs_ptr + 4 * i); 3822a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 3823a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang float32x4_t rhs[RhsCells]; 3824a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int i = 0; i < RhsCells; i++) { 3825a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang rhs[i] = vld1q_f32(rhs_ptr + 4 * i); 3826a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 3827a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int i = 0; i < 3; i++) { 3828a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int j = 0; j < RhsCells; j++) { 3829a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang acc[i][4 * j + 0] = vmlaq_lane_f32(acc[i][4 * j + 0], lhs[i], 3830a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang vget_low_f32(rhs[j]), 0); 3831a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang acc[i][4 * j + 1] = vmlaq_lane_f32(acc[i][4 * j + 1], lhs[i], 3832a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang vget_low_f32(rhs[j]), 1); 3833a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang acc[i][4 * j + 2] = vmlaq_lane_f32(acc[i][4 * j + 2], lhs[i], 3834a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang vget_high_f32(rhs[j]), 0); 3835a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang acc[i][4 * j + 3] = vmlaq_lane_f32(acc[i][4 * j + 3], lhs[i], 3836a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang vget_high_f32(rhs[j]), 1); 3837a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 3838a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 3839a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang lhs_ptr += 12; 3840a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang rhs_ptr += 4 * RhsCells; 3841a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 3842a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int i = 0; i < 3; i++) { 
3843a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int j = 0; j < 4 * RhsCells; j++) { 3844a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang vst1q_f32(accum_ptr + 4 * (i + 3 * j), acc[i][j]); 3845a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 3846a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 3847a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 3848a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}; 3849a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3850a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangusing NEON_32bit_GEMM_Float32_WithScalar_intrinsics = 3851a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang NEON_GEMM_Float32_WithScalar_intrinsics<1>; 3852a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 3853a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangusing NEON_64bit_GEMM_Float32_WithScalar_intrinsics = 3854a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang NEON_GEMM_Float32_WithScalar_intrinsics<2>; 38557d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#endif // __arm__ || __aarch64__ 38567d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 38577d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#ifdef __mips 38587d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wangstatic inline v4i32 workaround_msa_maddv_w(v4i32 a, v4i32 b, v4i32 c) { 38597d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Workaround for incorrect encoding of maddv.df in gcc (a exchanged with c). 
38607d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#if 0 38617d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang return __builtin_msa_maddv_w(a, b, c); 38627d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#else 38637d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang asm volatile("maddv.w %w[a], %w[b], %w[c]\n" 38647d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Outputs 38657d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang : [a] "+f"(a) 38667d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Inputs 38677d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang : [b] "f"(b), [c] "f"(c)); 38687d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang return a; 38697d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#endif 38707d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang} 38717d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 38727d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// Using 32x32=32 multiplications. 38737d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// 20 MSA regs used: 38747d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 12 accumulators 38757d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 6 lhs 38767d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 1 rhs 38777d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 1 temps/zeroes 38787d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// ~55 instructions in the loop. 
38797d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wangstruct MSA_GEMM_12x4_Uint8Operands_Uint32Accumulators_intrinsics { 38807d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang typedef std::uint8_t OperandType; 38817d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang typedef std::int32_t AccumulatorType; 38827d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang typedef KernelFormat< 38837d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 3>, 38847d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 1> > 38857d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang Format; 38867d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr, 38877d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang AccumulatorType* accum_ptr, int depth) { 38887d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang const v16i8 zeroes = __builtin_msa_ldi_b(0); 38897d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang v4i32 acc[3][4]; 38907d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Load accumulators. 38917d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang for (int i = 0; i < 3; i++) { 38927d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang for (int j = 0; j < 4; j++) { 38937d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang acc[i][j] = __builtin_msa_ld_w(accum_ptr + 4 * (i + 3 * j), 0); 38947d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang } 38957d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang } 38967d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 38977d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang while (depth > 0) { 38987d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Load 3 x 8 bytes of lhs[] with 2 16-byte overlapped loads. 
38997d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang v8i16 lhs[6]; 39007d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang lhs[0] = reinterpret_cast<v8i16>(__builtin_msa_ld_b(const_cast<OperandType*>(lhs_ptr), 0)); 39017d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang lhs[1] = 39027d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang reinterpret_cast<v8i16>(__builtin_msa_ld_b(const_cast<OperandType*>(lhs_ptr + 8), 0)); 39037d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 39047d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Zero-extend 8-bit elements of lhs[] to 16 bits. 39057d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang lhs[0] = reinterpret_cast<v8i16>(__builtin_msa_ilvr_b(zeroes, 39067d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang reinterpret_cast<v16i8>(lhs[0]))); 39077d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang lhs[2] = reinterpret_cast<v8i16>(__builtin_msa_ilvl_b(zeroes, 39087d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang reinterpret_cast<v16i8>(lhs[1]))); 39097d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang lhs[1] = reinterpret_cast<v8i16>(__builtin_msa_ilvr_b(zeroes, 39107d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang reinterpret_cast<v16i8>(lhs[1]))); 39117d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 39127d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Zero-extend 16-bit elements of lhs[] to 32 bits. 
39137d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang lhs[3] = __builtin_msa_ilvl_h(reinterpret_cast<v8i16>(zeroes), lhs[0]); 39147d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang lhs[4] = __builtin_msa_ilvl_h(reinterpret_cast<v8i16>(zeroes), lhs[1]); 39157d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang lhs[5] = __builtin_msa_ilvl_h(reinterpret_cast<v8i16>(zeroes), lhs[2]); 39167d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang lhs[0] = __builtin_msa_ilvr_h(reinterpret_cast<v8i16>(zeroes), lhs[0]); 39177d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang lhs[1] = __builtin_msa_ilvr_h(reinterpret_cast<v8i16>(zeroes), lhs[1]); 39187d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang lhs[2] = __builtin_msa_ilvr_h(reinterpret_cast<v8i16>(zeroes), lhs[2]); 39197d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 39207d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Depth 0. 39217d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang for (int j = 0; j < 4; j++) { 39227d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Load 1 byte of rhs, making 4 32-bit replicas of it. 39237d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang v4i32 rhs = reinterpret_cast<v4i32>(__builtin_msa_fill_w(rhs_ptr[j])); 39247d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Multiply-add into accumulators. 39257d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang for (int i = 0; i < 3; i++) { 39267d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang acc[i][j] = workaround_msa_maddv_w(acc[i][j], reinterpret_cast<v4i32>(lhs[i]), rhs); 39277d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang } 39287d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang } 39297d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 39307d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Depth 1. 39317d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang for (int j = 0; j < 4; j++) { 39327d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Load 1 byte of rhs, making 4 32-bit replicas of it. 
39337d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang v4i32 rhs = reinterpret_cast<v4i32>(__builtin_msa_fill_w(rhs_ptr[j + 4])); 39347d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Multiply-add into accumulators. 39357d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang for (int i = 0; i < 3; i++) { 39367d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang acc[i][j] = workaround_msa_maddv_w(acc[i][j], reinterpret_cast<v4i32>(lhs[i + 3]), rhs); 39377d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang } 39387d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang } 39397d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 39407d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang lhs_ptr += 24; 39417d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang rhs_ptr += 8; 39427d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang depth -= 2; 39437d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang } 39447d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 39457d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Store accumulators. 39467d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang for (int i = 0; i < 3; i++) { 39477d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang for (int j = 0; j < 4; j++) { 39487d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang __builtin_msa_st_w(acc[i][j], accum_ptr + 4 * (i + 3 * j), 0); 39497d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang } 39507d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang } 39517d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang } 39527d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang}; 39537d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 39547d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// Assembly implementation of the above 39557d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// MSA_GEMM_12x4_Uint8Operands_Uint32Accumulators_intrinsics. 39567d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// Using 32x32=32 multiplications. 
39577d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// 20 MSA regs used: 39587d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 12 accumulators 39597d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 6 lhs 39607d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 1 rhs 39617d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 1 temps/zeroes 39627d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// ~55 instructions in the loop. 39637d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wangstruct MSA_GEMM_12x4_Uint8Operands_Uint32Accumulators_assembly { 39647d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang typedef std::uint8_t OperandType; 39657d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang typedef std::int32_t AccumulatorType; 39667d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang typedef KernelFormat< 39677d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 3>, 39687d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 1> > 39697d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang Format; 39707d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang static void Run(OperandType* lhs_ptr, OperandType* rhs_ptr, 39717d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang AccumulatorType* accum_ptr, int depth) { 39727d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang asm volatile( 39737d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Load accumulators 39747d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w0, (0*16)(%[accum_ptr])\n" 39757d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w4, (1*16)(%[accum_ptr])\n" 39767d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w8, (2*16)(%[accum_ptr])\n" 39777d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w1, (3*16)(%[accum_ptr])\n" 39787d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w5, (4*16)(%[accum_ptr])\n" 39797d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w9, (5*16)(%[accum_ptr])\n" 
39807d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w2, (6*16)(%[accum_ptr])\n" 39817d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w6, (7*16)(%[accum_ptr])\n" 39827d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w10, (8*16)(%[accum_ptr])\n" 39837d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w3, (9*16)(%[accum_ptr])\n" 39847d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w7, (10*16)(%[accum_ptr])\n" 39857d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w11, (11*16)(%[accum_ptr])\n" 39867d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Set a temp to all zeroes. 39877d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ldi.b $w19, 0\n" 39887d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 39897d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang GEMMLOWP_LABEL_LOOP ":\n" 39907d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Overview of register layout: 39917d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // 39927d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // A half of the 2x4 cell of Rhs is stored in 32bit in w18. 39937d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // A 12x2 block of 3 4x2 cells Lhs is stored in 32bit in w12-w17. 39947d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // A 12x4 block of accumulators is stored in 32bit in w0-w11. 
39957d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // 39967d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // +------+------+------+------+ 39977d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Rhs |w18[0]|w18[1]|w18[2]|w18[3]| 39987d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // +------+------+------+------+ 39997d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // 40007d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // | | | | | 40017d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // 40027d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Lhs | | | | | 40037d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // 40047d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // +---+---+ - - - - +------+------+------+------+ 40057d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w12|w15| | w0 | w1 | w2 | w3 | 40067d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w12|w15| | w0 | w1 | w2 | w3 | 40077d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w12|w15| | w0 | w1 | w2 | w3 | 40087d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w12|w15| | w0 | w1 | w2 | w3 | 40097d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // +---+---+ - - - - +------+------+------+------+ 40107d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w13|w16| | w4 | w5 | w6 | w7 | 40117d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w13|w16| | w4 | w5 | w6 | w7 | 40127d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w13|w16| | w4 | w5 | w6 | w7 | 40137d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w13|w16| | w4 | w5 | w6 | w7 | 40147d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // +---+---+ - - - - +------+------+------+------+ 40157d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w14|w17| | w8 | w9 | w10 | w11 | 40167d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w14|w17| | w8 | w9 | w10 | w11 | 40177d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w14|w17| | w8 | w9 | w10 | w11 | 40187d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w14|w17| | 
w8 | w9 | w10 | w11 | 40197d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // +---+---+ - - - - +------+------+------+------+ 40207d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // 40217d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Accumulator 40227d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 40237d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Load 3 x 8 bytes of lhs[] with 2 16-byte overlapped loads. 40247d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.b $w12, 0(%[lhs_ptr])\n" 40257d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.b $w13, 8(%[lhs_ptr])\n" 40267d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 40277d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Load 4 bytes of rhs[] for depth 0. 40287d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $a0, 0(%[rhs_ptr])\n" 40297d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $a1, 1(%[rhs_ptr])\n" 40307d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $a2, 2(%[rhs_ptr])\n" 40317d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $a3, 3(%[rhs_ptr])\n" 40327d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 40337d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Zero-extend 8-bit elements of lhs[] to 16 bits. 40347d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ilvr.b $w12, $w19, $w12\n" 40357d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ilvl.b $w14, $w19, $w13\n" 40367d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ilvr.b $w13, $w19, $w13\n" 40377d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Zero-extend 16-bit elements of lhs[] to 32 bits. 
40387d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ilvl.h $w15, $w19, $w12\n" 40397d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ilvl.h $w16, $w19, $w13\n" 40407d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ilvl.h $w17, $w19, $w14\n" 40417d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ilvr.h $w12, $w19, $w12\n" 40427d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ilvr.h $w13, $w19, $w13\n" 40437d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ilvr.h $w14, $w19, $w14\n" 40447d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 40457d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Depth 0. 40467d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fill.w $w18, $a0\n" 40477d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $a0, 4(%[rhs_ptr])\n" 40487d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "maddv.w $w0, $w12, $w18\n" 40497d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "maddv.w $w4, $w13, $w18\n" 40507d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "maddv.w $w8, $w14, $w18\n" 40517d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fill.w $w18, $a1\n" 40527d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $a1, 5(%[rhs_ptr])\n" 40537d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "maddv.w $w1, $w12, $w18\n" 40547d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "maddv.w $w5, $w13, $w18\n" 40557d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "maddv.w $w9, $w14, $w18\n" 40567d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fill.w $w18, $a2\n" 40577d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $a2, 6(%[rhs_ptr])\n" 40587d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "maddv.w $w2, $w12, $w18\n" 40597d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "maddv.w $w6, $w13, $w18\n" 40607d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "maddv.w $w10, $w14, $w18\n" 40617d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fill.w $w18, $a3\n" 40627d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $a3, 7(%[rhs_ptr])\n" 
40637d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "maddv.w $w3, $w12, $w18\n" 40647d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "maddv.w $w7, $w13, $w18\n" 40657d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "maddv.w $w11, $w14, $w18\n" 40667d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 40677d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Depth 1. 40687d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fill.w $w18, $a0\n" 40697d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "maddv.w $w0, $w15, $w18\n" 40707d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "maddv.w $w4, $w16, $w18\n" 40717d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "maddv.w $w8, $w17, $w18\n" 40727d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fill.w $w18, $a1\n" 40737d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "maddv.w $w1, $w15, $w18\n" 40747d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "maddv.w $w5, $w16, $w18\n" 40757d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "maddv.w $w9, $w17, $w18\n" 40767d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fill.w $w18, $a2\n" 40777d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "maddv.w $w2, $w15, $w18\n" 40787d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "maddv.w $w6, $w16, $w18\n" 40797d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "maddv.w $w10, $w17, $w18\n" 40807d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fill.w $w18, $a3\n" 40817d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "maddv.w $w3, $w15, $w18\n" 40827d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "maddv.w $w7, $w16, $w18\n" 40837d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "maddv.w $w11, $w17, $w18\n" 40847d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 40857d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "addiu %[depth], -2\n" 40867d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang GEMMLOWP_MIPS_XADDIU " %[lhs_ptr], 24\n" 40877d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang GEMMLOWP_MIPS_XADDIU " %[rhs_ptr], 8\n" 
40887d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "bnez %[depth]," GEMMLOWP_LABEL_LOOP "b\n" 40897d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 40907d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Store accumulators. 40917d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w0, (0*16)(%[accum_ptr])\n" 40927d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w4, (1*16)(%[accum_ptr])\n" 40937d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w8, (2*16)(%[accum_ptr])\n" 40947d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w1, (3*16)(%[accum_ptr])\n" 40957d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w5, (4*16)(%[accum_ptr])\n" 40967d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w9, (5*16)(%[accum_ptr])\n" 40977d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w2, (6*16)(%[accum_ptr])\n" 40987d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w6, (7*16)(%[accum_ptr])\n" 40997d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w10, (8*16)(%[accum_ptr])\n" 41007d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w3, (9*16)(%[accum_ptr])\n" 41017d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w7, (10*16)(%[accum_ptr])\n" 41027d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w11, (11*16)(%[accum_ptr])\n" 41037d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang : // outputs 41047d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 41057d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang [depth] "+r"(depth) 41067d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang : // inputs 41077d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang [accum_ptr] "r"(accum_ptr) 41087d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang : // clobbers 41097d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "memory", 41107d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "a0", "a1", "a2", "a3", 41117d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", 
"$f6", "$f7", 41127d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "$f8", "$f9", "$f10", "$f11", "$f12", "$f13", "$f14", "$f15", 41137d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "$f16", "$f17", "$f18", "$f19"); 41147d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang } 41157d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang}; 41167d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 41177d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// Assembly implementation of the above 41187d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// MSA_GEMM_12x4_Uint8Operands_Uint32Accumulators_intrinsics2 (TODO). 41197d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// Using 16x16=32 multiplications. 41207d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// 20 MSA regs used: 41217d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 12 accumulators 41227d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 3 lhs 41237d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 4 rhs 41247d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 1 temps/zeroes 41257d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// ~45 instructions in the loop. 
41267d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wangstruct MSA_GEMM_12x4_Uint8Operands_Uint32Accumulators_assembly2 { 41277d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang typedef std::uint8_t OperandType; 41287d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang typedef std::int32_t AccumulatorType; 41297d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang typedef KernelFormat< 41307d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 3>, 41317d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 1> > 41327d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang Format; 41337d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang static void Run(OperandType* lhs_ptr, OperandType* rhs_ptr, 41347d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang AccumulatorType* accum_ptr, int depth) { 41357d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang asm volatile( 41367d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Load accumulators 41377d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w0, (0*16)(%[accum_ptr])\n" 41387d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w4, (1*16)(%[accum_ptr])\n" 41397d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w8, (2*16)(%[accum_ptr])\n" 41407d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w1, (3*16)(%[accum_ptr])\n" 41417d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w5, (4*16)(%[accum_ptr])\n" 41427d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w9, (5*16)(%[accum_ptr])\n" 41437d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w2, (6*16)(%[accum_ptr])\n" 41447d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w6, (7*16)(%[accum_ptr])\n" 41457d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w10, (8*16)(%[accum_ptr])\n" 41467d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w3, (9*16)(%[accum_ptr])\n" 41477d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w7, 
(10*16)(%[accum_ptr])\n" 41487d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w11, (11*16)(%[accum_ptr])\n" 41497d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Set a temp to all zeroes. 41507d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ldi.b $w19, 0\n" 41517d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 41527d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang GEMMLOWP_LABEL_LOOP ":\n" 41537d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Overview of register layout: 41547d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // 41557d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // A 2x4 cell of Rhs is stored in 16bit in w15-w18 (each register 41567d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // contains 4 replicas of a pair of elements). 41577d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in w12-w14. 41587d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // A 12x4 block of accumulators is stored in 32bit in w0-w11. 
41597d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // 41607d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // +-----+-----+-----+-----+ 41617d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Rhs | w15 | w16 | w17 | w18 | 41627d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // +-----+-----+-----+-----+ 41637d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // 41647d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // | | | | | 41657d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // 41667d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Lhs | | | | | 41677d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // 41687d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // +---+ - - - - +-----+-----+-----+-----+ 41697d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w12| | w0 | w1 | w2 | w3 | 41707d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w12| | w0 | w1 | w2 | w3 | 41717d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w12| | w0 | w1 | w2 | w3 | 41727d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w12| | w0 | w1 | w2 | w3 | 41737d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // +---+ - - - - +-----+-----+-----+-----+ 41747d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w13| | w4 | w5 | w6 | w7 | 41757d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w13| | w4 | w5 | w6 | w7 | 41767d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w13| | w4 | w5 | w6 | w7 | 41777d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w13| | w4 | w5 | w6 | w7 | 41787d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // +---+ - - - - +-----+-----+-----+-----+ 41797d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w14| | w8 | w9 | w10 | w11 | 41807d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w14| | w8 | w9 | w10 | w11 | 41817d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w14| | w8 | w9 | w10 | w11 | 41827d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w14| | w8 | w9 | w10 | w11 | 41837d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // 
+---+ - - - - +-----+-----+-----+-----+ 41847d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // 41857d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Accumulators 41867d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 41877d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Load 3 x 8 bytes of lhs[] with 2 16-byte overlapped loads. 41887d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.b $w12, 0(%[lhs_ptr])\n" 41897d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.b $w13, 8(%[lhs_ptr])\n" 41907d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 41917d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Load 4 bytes of rhs[] for depth 0. 41927d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $a0, 0(%[rhs_ptr])\n" 41937d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $a1, 1(%[rhs_ptr])\n" 41947d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $a2, 2(%[rhs_ptr])\n" 41957d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $a3, 3(%[rhs_ptr])\n" 41967d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Load 4 bytes of rhs[] for depth 1. 41977d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $v0, 4(%[rhs_ptr])\n" 41987d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $v1, 5(%[rhs_ptr])\n" 41997d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $t8, 6(%[rhs_ptr])\n" 42007d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $t9, 7(%[rhs_ptr])\n" 42017d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 42027d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Zero-extend 8-bit elements of lhs[] to 16 bits. 42037d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ilvr.b $w12, $w19, $w12\n" 42047d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ilvl.b $w14, $w19, $w13\n" 42057d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ilvr.b $w13, $w19, $w13\n" 42067d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Interleave depth 0 and depth 1 elements of lhs[] for dpadd_u.w. 
42077d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ilvl.d $w15, $w19, $w12\n" 42087d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ilvl.d $w16, $w19, $w13\n" 42097d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ilvl.d $w17, $w19, $w14\n" 42107d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ilvr.h $w12, $w15, $w12\n" 42117d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ilvr.h $w13, $w16, $w13\n" 42127d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ilvr.h $w14, $w17, $w14\n" 42137d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 42147d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Combine and interleave depth 0 and depth 1 elements of rhs[] for dpadd_u.w. 42157d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ins $a0, $v0, 16, 8\n" 42167d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ins $a1, $v1, 16, 8\n" 42177d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ins $a2, $t8, 16, 8\n" 42187d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ins $a3, $t9, 16, 8\n" 42197d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Make 4 replicas of every pair of rhs[] elements. 42207d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fill.w $w15, $a0\n" 42217d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fill.w $w16, $a1\n" 42227d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fill.w $w17, $a2\n" 42237d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fill.w $w18, $a3\n" 42247d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 42257d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Depths 0 and 1. 42267d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Dot-product-(and)-add doubles multiplicand width. 
42277d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w0, $w12, $w15\n" 42287d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w4, $w13, $w15\n" 42297d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w8, $w14, $w15\n" 42307d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w1, $w12, $w16\n" 42317d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w5, $w13, $w16\n" 42327d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w9, $w14, $w16\n" 42337d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w2, $w12, $w17\n" 42347d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w6, $w13, $w17\n" 42357d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w10, $w14, $w17\n" 42367d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w3, $w12, $w18\n" 42377d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w7, $w13, $w18\n" 42387d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w11, $w14, $w18\n" 42397d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 42407d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "addiu %[depth], -2\n" 42417d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang GEMMLOWP_MIPS_XADDIU " %[lhs_ptr], 24\n" 42427d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang GEMMLOWP_MIPS_XADDIU " %[rhs_ptr], 8\n" 42437d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "bnez %[depth]," GEMMLOWP_LABEL_LOOP "b\n" 42447d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 42457d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Store accumulators. 
42467d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w0, (0*16)(%[accum_ptr])\n" 42477d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w4, (1*16)(%[accum_ptr])\n" 42487d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w8, (2*16)(%[accum_ptr])\n" 42497d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w1, (3*16)(%[accum_ptr])\n" 42507d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w5, (4*16)(%[accum_ptr])\n" 42517d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w9, (5*16)(%[accum_ptr])\n" 42527d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w2, (6*16)(%[accum_ptr])\n" 42537d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w6, (7*16)(%[accum_ptr])\n" 42547d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w10, (8*16)(%[accum_ptr])\n" 42557d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w3, (9*16)(%[accum_ptr])\n" 42567d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w7, (10*16)(%[accum_ptr])\n" 42577d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w11, (11*16)(%[accum_ptr])\n" 42587d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang : // outputs 42597d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 42607d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang [depth] "+r"(depth) 42617d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang : // inputs 42627d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang [accum_ptr] "r"(accum_ptr) 42637d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang : // clobbers 42647d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "memory", 42657d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "v0", "v1", 42667d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "a0", "a1", "a2", "a3", 42677d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "t8", "t9", 42687d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "$f6", "$f7", 42697d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "$f8", "$f9", "$f10", "$f11", 
"$f12", "$f13", "$f14", "$f15", 42707d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "$f16", "$f17", "$f18", "$f19"); 42717d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang } 42727d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang}; 42737d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 42747d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// Using 32x32=32 multiplications. 42757d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// 32 MSA regs used: 42767d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 24 accumulators 42777d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 6 lhs 42787d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 1 rhs 42797d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 1 temps/zeroes 42807d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// ~95 instructions in the loop. 42817d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wangstruct MSA_GEMM_12x8_Uint8Operands_Uint32Accumulators_intrinsics { 42827d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang typedef std::uint8_t OperandType; 42837d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang typedef std::uint32_t AccumulatorType; 42847d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang typedef KernelFormat< 42857d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 3>, 42867d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 2> > 42877d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang Format; 42887d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr, 42897d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang AccumulatorType* accum_ptr, int depth) { 42907d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang const v16i8 zeroes = __builtin_msa_ldi_b(0); 42917d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang v4i32 acc[3][8]; 42927d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Load accumulators. 
42937d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang for (int i = 0; i < 3; i++) { 42947d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang for (int j = 0; j < 8; j++) { 42957d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang acc[i][j] = __builtin_msa_ld_w(accum_ptr + 4 * (i + 3 * j), 0); 42967d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang } 42977d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang } 42987d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 42997d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang while (depth > 0) { 43007d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Load 3 x 8 bytes of lhs[] with 2 16-byte overlapped loads. 43017d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang v8i16 lhs[6]; 43027d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang lhs[0] = reinterpret_cast<v8i16>(__builtin_msa_ld_b(const_cast<OperandType*>(lhs_ptr), 0)); 43037d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang lhs[1] = 43047d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang reinterpret_cast<v8i16>(__builtin_msa_ld_b(const_cast<OperandType*>(lhs_ptr + 8), 0)); 43057d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 43067d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Zero-extend 8-bit elements of lhs[] to 16 bits. 43077d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang lhs[0] = reinterpret_cast<v8i16>(__builtin_msa_ilvr_b(zeroes, 43087d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang reinterpret_cast<v16i8>(lhs[0]))); 43097d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang lhs[2] = reinterpret_cast<v8i16>(__builtin_msa_ilvl_b(zeroes, 43107d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang reinterpret_cast<v16i8>(lhs[1]))); 43117d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang lhs[1] = reinterpret_cast<v8i16>(__builtin_msa_ilvr_b(zeroes, 43127d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang reinterpret_cast<v16i8>(lhs[1]))); 43137d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 43147d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Zero-extend 16-bit elements of lhs[] to 32 bits. 
43157d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang lhs[3] = __builtin_msa_ilvl_h(reinterpret_cast<v8i16>(zeroes), lhs[0]); 43167d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang lhs[4] = __builtin_msa_ilvl_h(reinterpret_cast<v8i16>(zeroes), lhs[1]); 43177d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang lhs[5] = __builtin_msa_ilvl_h(reinterpret_cast<v8i16>(zeroes), lhs[2]); 43187d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang lhs[0] = __builtin_msa_ilvr_h(reinterpret_cast<v8i16>(zeroes), lhs[0]); 43197d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang lhs[1] = __builtin_msa_ilvr_h(reinterpret_cast<v8i16>(zeroes), lhs[1]); 43207d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang lhs[2] = __builtin_msa_ilvr_h(reinterpret_cast<v8i16>(zeroes), lhs[2]); 43217d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 43227d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Depth 0. 43237d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang for (int j = 0; j < 4; j++) { 43247d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Load 1 byte of rhs, making 4 32-bit replicas of it. 43257d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang v4i32 rhs = reinterpret_cast<v4i32>(__builtin_msa_fill_w(rhs_ptr[j])); 43267d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Multiply-add into accumulators. 43277d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang for (int i = 0; i < 3; i++) { 43287d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang acc[i][j] = workaround_msa_maddv_w(acc[i][j], reinterpret_cast<v4i32>(lhs[i]), rhs); 43297d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang } 43307d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang } 43317d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang for (int j = 4; j < 8; j++) { 43327d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Load 1 byte of rhs, making 4 32-bit replicas of it. 
43337d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang v4i32 rhs = reinterpret_cast<v4i32>(__builtin_msa_fill_w(rhs_ptr[j + 4])); 43347d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Multiply-add into accumulators. 43357d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang for (int i = 0; i < 3; i++) { 43367d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang acc[i][j] = workaround_msa_maddv_w(acc[i][j], reinterpret_cast<v4i32>(lhs[i]), rhs); 43377d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang } 43387d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang } 43397d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 43407d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Depth 1. 43417d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang for (int j = 0; j < 4; j++) { 43427d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Load 1 byte of rhs, making 4 32-bit replicas of it. 43437d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang v4i32 rhs = reinterpret_cast<v4i32>(__builtin_msa_fill_w(rhs_ptr[j + 4])); 43447d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Multiply-add into accumulators. 43457d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang for (int i = 0; i < 3; i++) { 43467d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang acc[i][j] = workaround_msa_maddv_w(acc[i][j], reinterpret_cast<v4i32>(lhs[i + 3]), rhs); 43477d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang } 43487d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang } 43497d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang for (int j = 4; j < 8; j++) { 43507d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Load 1 byte of rhs, making 4 32-bit replicas of it. 43517d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang v4i32 rhs = reinterpret_cast<v4i32>(__builtin_msa_fill_w(rhs_ptr[j + 8])); 43527d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Multiply-add into accumulators. 
43537d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang for (int i = 0; i < 3; i++) { 43547d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang acc[i][j] = workaround_msa_maddv_w(acc[i][j], reinterpret_cast<v4i32>(lhs[i + 3]), rhs); 43557d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang } 43567d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang } 43577d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 43587d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang lhs_ptr += 24; 43597d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang rhs_ptr += 16; 43607d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang depth -= 2; 43617d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang } 43627d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 43637d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Store accumulators. 43647d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang for (int i = 0; i < 3; i++) { 43657d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang for (int j = 0; j < 8; j++) { 43667d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang __builtin_msa_st_w(acc[i][j], accum_ptr + 4 * (i + 3 * j), 0); 43677d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang } 43687d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang } 43697d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang } 43707d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang}; 43717d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 43727d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// Assembly implementation of the above 43737d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// MSA_GEMM_12x8_Uint8Operands_Uint32Accumulators_intrinsics. 43747d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// Using 32x32=32 multiplications. 
// 32 MSA regs used:
// - 24 accumulators
// - 6 lhs
// - 1 rhs
// - 1 temps/zeroes
// ~95 instructions in the loop.
//
// Run() loads 24 accumulator vectors (16 bytes each) from accum_ptr, adds
// the lhs*rhs products to them in the loop, and stores them back to the same
// locations. Each loop iteration handles two levels of depth: it advances
// lhs_ptr by 24 bytes and rhs_ptr by 16 bytes and subtracts 2 from depth.
// The loop body always executes at least once and repeats until depth is
// exactly zero, so callers presumably pass a positive, even depth
// (NOTE(review): confirm against the callers).
struct MSA_GEMM_12x8_Uint8Operands_Uint32Accumulators_assembly {
  typedef std::uint8_t OperandType;
  typedef std::uint32_t AccumulatorType;
  typedef KernelFormat<
      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 3>,
      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 2> >
      Format;
  static void Run(OperandType* lhs_ptr, OperandType* rhs_ptr,
                  AccumulatorType* accum_ptr, int depth) {
    asm volatile(
        // Load accumulators.
        // Consecutive vectors in memory go to $w0, $w4, $w8, $w1, $w5, ...
        // so that each group of four consecutive registers ends up holding
        // four columns of one lhs cell (see the layout diagram below).
        "ld.w $w0, (0*16)(%[accum_ptr])\n"
        "ld.w $w4, (1*16)(%[accum_ptr])\n"
        "ld.w $w8, (2*16)(%[accum_ptr])\n"
        "ld.w $w1, (3*16)(%[accum_ptr])\n"
        "ld.w $w5, (4*16)(%[accum_ptr])\n"
        "ld.w $w9, (5*16)(%[accum_ptr])\n"
        "ld.w $w2, (6*16)(%[accum_ptr])\n"
        "ld.w $w6, (7*16)(%[accum_ptr])\n"
        "ld.w $w10, (8*16)(%[accum_ptr])\n"
        "ld.w $w3, (9*16)(%[accum_ptr])\n"
        "ld.w $w7, (10*16)(%[accum_ptr])\n"
        "ld.w $w11, (11*16)(%[accum_ptr])\n"
        "ld.w $w12, (12*16)(%[accum_ptr])\n"
        "ld.w $w16, (13*16)(%[accum_ptr])\n"
        "ld.w $w20, (14*16)(%[accum_ptr])\n"
        "ld.w $w13, (15*16)(%[accum_ptr])\n"
        "ld.w $w17, (16*16)(%[accum_ptr])\n"
        "ld.w $w21, (17*16)(%[accum_ptr])\n"
        "ld.w $w14, (18*16)(%[accum_ptr])\n"
        "ld.w $w18, (19*16)(%[accum_ptr])\n"
        "ld.w $w22, (20*16)(%[accum_ptr])\n"
        "ld.w $w15, (21*16)(%[accum_ptr])\n"
        "ld.w $w19, (22*16)(%[accum_ptr])\n"
        "ld.w $w23, (23*16)(%[accum_ptr])\n"
        // Set a temp to all zeroes.
        // $w31 is interleaved with the lhs data below to zero-extend it.
        "ldi.b $w31, 0\n"

        GEMMLOWP_LABEL_LOOP ":\n"
        // Overview of register layout:
        //
        // A quarter of the 2 2x4 cells of Rhs is stored in 32bit in w30.
        // A 12x2 block of 3 4x2 cells Lhs is stored in 32bit in w24-w29.
        // A 12x8 block of accumulators is stored in 32bit in w0-w23.
        //
        //                        +------+------+------+------+
        //                   Rhs  |w30[0]|w30[1]|w30[2]|w30[3]|
        //                        +------+------+------+------+
        //
        //                        |      |      |      |      |
        //
        //       Lhs              |      |      |      |      |
        //
        //      +---+---+ - - - - +------+------+------+------+
        //      |w24|w27|         |w0/12 |w1/13 |w2/14 |w3/15 |
        //      |w24|w27|         |w0/12 |w1/13 |w2/14 |w3/15 |
        //      |w24|w27|         |w0/12 |w1/13 |w2/14 |w3/15 |
        //      |w24|w27|         |w0/12 |w1/13 |w2/14 |w3/15 |
        //      +---+---+ - - - - +------+------+------+------+
        //      |w25|w28|         |w4/16 |w5/17 |w6/18 |w7/19 |
        //      |w25|w28|         |w4/16 |w5/17 |w6/18 |w7/19 |
        //      |w25|w28|         |w4/16 |w5/17 |w6/18 |w7/19 |
        //      |w25|w28|         |w4/16 |w5/17 |w6/18 |w7/19 |
        //      +---+---+ - - - - +------+------+------+------+
        //      |w26|w29|         |w8/20 |w9/21 |w10/22|w11/23|
        //      |w26|w29|         |w8/20 |w9/21 |w10/22|w11/23|
        //      |w26|w29|         |w8/20 |w9/21 |w10/22|w11/23|
        //      |w26|w29|         |w8/20 |w9/21 |w10/22|w11/23|
        //      +---+---+ - - - - +------+------+------+------+
        //
        //                             Accumulator

        // Load 3 x 8 bytes of lhs[] with 2 16-byte overlapped loads.
        "ld.b $w24, 0(%[lhs_ptr])\n"
        "ld.b $w25, 8(%[lhs_ptr])\n"

        // Load 4 bytes of rhs[] for the first half of depth 0.
        "lbu $a0, 0(%[rhs_ptr])\n"
        "lbu $a1, 1(%[rhs_ptr])\n"
        "lbu $a2, 2(%[rhs_ptr])\n"
        "lbu $a3, 3(%[rhs_ptr])\n"

        // Zero-extend 8-bit elements of lhs[] to 16 bits.
        // $w26 must be produced from $w25 before $w25 is overwritten.
        "ilvr.b $w24, $w31, $w24\n"
        "ilvl.b $w26, $w31, $w25\n"
        "ilvr.b $w25, $w31, $w25\n"
        // Zero-extend 16-bit elements of lhs[] to 32 bits.
        // The left halves go to $w27-$w29 (used for depth 1) first, then
        // $w24-$w26 are replaced in place by their right halves (depth 0).
        "ilvl.h $w27, $w31, $w24\n"
        "ilvl.h $w28, $w31, $w25\n"
        "ilvl.h $w29, $w31, $w26\n"
        "ilvr.h $w24, $w31, $w24\n"
        "ilvr.h $w25, $w31, $w25\n"
        "ilvr.h $w26, $w31, $w26\n"

        // Depth 0.
        // Pattern used throughout: fill.w copies the rhs byte held in a
        // general-purpose register into $w30, then the following lbu reuses
        // that register to fetch the rhs byte needed by the next group of
        // four columns, ahead of its use.
        // First half: rhs bytes 0-3 (already loaded), accumulators $w0-$w11.
        // The lbu instructions fetch rhs bytes 8-11 for the second half.
        "fill.w $w30, $a0\n"
        "lbu $a0, 8(%[rhs_ptr])\n"
        "maddv.w $w0, $w24, $w30\n"
        "maddv.w $w4, $w25, $w30\n"
        "maddv.w $w8, $w26, $w30\n"
        "fill.w $w30, $a1\n"
        "lbu $a1, 9(%[rhs_ptr])\n"
        "maddv.w $w1, $w24, $w30\n"
        "maddv.w $w5, $w25, $w30\n"
        "maddv.w $w9, $w26, $w30\n"
        "fill.w $w30, $a2\n"
        "lbu $a2, 10(%[rhs_ptr])\n"
        "maddv.w $w2, $w24, $w30\n"
        "maddv.w $w6, $w25, $w30\n"
        "maddv.w $w10, $w26, $w30\n"
        "fill.w $w30, $a3\n"
        "lbu $a3, 11(%[rhs_ptr])\n"
        "maddv.w $w3, $w24, $w30\n"
        "maddv.w $w7, $w25, $w30\n"
        "maddv.w $w11, $w26, $w30\n"

        // Second half of depth 0: rhs bytes 8-11, accumulators $w12-$w23.
        // The lbu instructions fetch rhs bytes 4-7 for depth 1.
        "fill.w $w30, $a0\n"
        "lbu $a0, 4(%[rhs_ptr])\n"
        "maddv.w $w12, $w24, $w30\n"
        "maddv.w $w16, $w25, $w30\n"
        "maddv.w $w20, $w26, $w30\n"
        "fill.w $w30, $a1\n"
        "lbu $a1, 5(%[rhs_ptr])\n"
        "maddv.w $w13, $w24, $w30\n"
        "maddv.w $w17, $w25, $w30\n"
        "maddv.w $w21, $w26, $w30\n"
        "fill.w $w30, $a2\n"
        "lbu $a2, 6(%[rhs_ptr])\n"
        "maddv.w $w14, $w24, $w30\n"
        "maddv.w $w18, $w25, $w30\n"
        "maddv.w $w22, $w26, $w30\n"
        "fill.w $w30, $a3\n"
        "lbu $a3, 7(%[rhs_ptr])\n"
        "maddv.w $w15, $w24, $w30\n"
        "maddv.w $w19, $w25, $w30\n"
        "maddv.w $w23, $w26, $w30\n"

        // Depth 1.
        // First half: rhs bytes 4-7, lhs in $w27-$w29, accumulators
        // $w0-$w11. The lbu instructions fetch rhs bytes 12-15.
        "fill.w $w30, $a0\n"
        "lbu $a0, 12(%[rhs_ptr])\n"
        "maddv.w $w0, $w27, $w30\n"
        "maddv.w $w4, $w28, $w30\n"
        "maddv.w $w8, $w29, $w30\n"
        "fill.w $w30, $a1\n"
        "lbu $a1, 13(%[rhs_ptr])\n"
        "maddv.w $w1, $w27, $w30\n"
        "maddv.w $w5, $w28, $w30\n"
        "maddv.w $w9, $w29, $w30\n"
        "fill.w $w30, $a2\n"
        "lbu $a2, 14(%[rhs_ptr])\n"
        "maddv.w $w2, $w27, $w30\n"
        "maddv.w $w6, $w28, $w30\n"
        "maddv.w $w10, $w29, $w30\n"
        "fill.w $w30, $a3\n"
        "lbu $a3, 15(%[rhs_ptr])\n"
        "maddv.w $w3, $w27, $w30\n"
        "maddv.w $w7, $w28, $w30\n"
        "maddv.w $w11, $w29, $w30\n"

        // Second half of depth 1: rhs bytes 12-15, accumulators $w12-$w23.
        // No further rhs loads here; the next iteration reloads $a0-$a3
        // after the pointers have been advanced.
        "fill.w $w30, $a0\n"
        "maddv.w $w12, $w27, $w30\n"
        "maddv.w $w16, $w28, $w30\n"
        "maddv.w $w20, $w29, $w30\n"
        "fill.w $w30, $a1\n"
        "maddv.w $w13, $w27, $w30\n"
        "maddv.w $w17, $w28, $w30\n"
        "maddv.w $w21, $w29, $w30\n"
        "fill.w $w30, $a2\n"
        "maddv.w $w14, $w27, $w30\n"
        "maddv.w $w18, $w28, $w30\n"
        "maddv.w $w22, $w29, $w30\n"
        "fill.w $w30, $a3\n"
        "maddv.w $w15, $w27, $w30\n"
        "maddv.w $w19, $w28, $w30\n"
        "maddv.w $w23, $w29, $w30\n"

        // Two levels of depth consumed: step the counter and both operand
        // pointers, then loop while depth is non-zero.
        // GEMMLOWP_MIPS_XADDIU and GEMMLOWP_LABEL_LOOP are macros defined
        // elsewhere in this file (presumably a pointer-width add-immediate
        // mnemonic and a numeric local label; verify at their definitions).
        "addiu %[depth], -2\n"
        GEMMLOWP_MIPS_XADDIU " %[lhs_ptr], 24\n"
        GEMMLOWP_MIPS_XADDIU " %[rhs_ptr], 16\n"
        "bnez %[depth]," GEMMLOWP_LABEL_LOOP "b\n"

        // Store accumulators.
        // Same register-to-offset mapping as the loads at the top.
        "st.w $w0, (0*16)(%[accum_ptr])\n"
        "st.w $w4, (1*16)(%[accum_ptr])\n"
        "st.w $w8, (2*16)(%[accum_ptr])\n"
        "st.w $w1, (3*16)(%[accum_ptr])\n"
        "st.w $w5, (4*16)(%[accum_ptr])\n"
        "st.w $w9, (5*16)(%[accum_ptr])\n"
        "st.w $w2, (6*16)(%[accum_ptr])\n"
        "st.w $w6, (7*16)(%[accum_ptr])\n"
        "st.w $w10, (8*16)(%[accum_ptr])\n"
        "st.w $w3, (9*16)(%[accum_ptr])\n"
        "st.w $w7, (10*16)(%[accum_ptr])\n"
        "st.w $w11, (11*16)(%[accum_ptr])\n"
        "st.w $w12, (12*16)(%[accum_ptr])\n"
        "st.w $w16, (13*16)(%[accum_ptr])\n"
        "st.w $w20, (14*16)(%[accum_ptr])\n"
        "st.w $w13, (15*16)(%[accum_ptr])\n"
        "st.w $w17, (16*16)(%[accum_ptr])\n"
        "st.w $w21, (17*16)(%[accum_ptr])\n"
        "st.w $w14, (18*16)(%[accum_ptr])\n"
        "st.w $w18, (19*16)(%[accum_ptr])\n"
        "st.w $w22, (20*16)(%[accum_ptr])\n"
        "st.w $w15, (21*16)(%[accum_ptr])\n"
        "st.w $w19, (22*16)(%[accum_ptr])\n"
        "st.w $w23, (23*16)(%[accum_ptr])\n"
        :  // outputs
        // All three are read-write ("+r"): the asm advances the pointers and
        // counts depth down in place.
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [depth] "+r"(depth)
        :  // inputs
        [accum_ptr] "r"(accum_ptr)
        :  // clobbers
        // "memory" covers the accumulator stores; a0-a3 hold the rhs bytes.
        // The asm writes $w0-$w31 while the list names $f0-$f31;
        // NOTE(review): presumably because the MSA vector registers overlay
        // the FPU registers. Confirm against the toolchain documentation.
        "memory",
        "a0", "a1", "a2", "a3",
        "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "$f6", "$f7",
        "$f8", "$f9", "$f10", "$f11", "$f12", "$f13", "$f14", "$f15",
        "$f16", "$f17", "$f18", "$f19", "$f20", "$f21", "$f22", "$f23",
        "$f24", "$f25", "$f26", "$f27", "$f28", "$f29", "$f30", "$f31");
  }
};

// Assembly implementation of the above
// MSA_GEMM_12x8_Uint8Operands_Uint32Accumulators_intrinsics2 (TODO).
// Using 16x16=32 multiplications.
46057d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// 32 MSA regs used: 46067d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 24 accumulators 46077d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 3 lhs 46087d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 4 rhs 46097d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 1 temps/zeroes 46107d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// ~70 instructions in the loop. 46117d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wangstruct MSA_GEMM_12x8_Uint8Operands_Uint32Accumulators_assembly2 { 46127d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang typedef std::uint8_t OperandType; 46137d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang typedef std::uint32_t AccumulatorType; 46147d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang typedef KernelFormat< 46157d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 3>, 46167d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 2> > 46177d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang Format; 46187d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang static void Run(OperandType* lhs_ptr, OperandType* rhs_ptr, 46197d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang AccumulatorType* accum_ptr, int depth) { 46207d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang asm volatile( 46217d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Load accumulators 46227d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w0, (0*16)(%[accum_ptr])\n" 46237d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w4, (1*16)(%[accum_ptr])\n" 46247d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w8, (2*16)(%[accum_ptr])\n" 46257d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w1, (3*16)(%[accum_ptr])\n" 46267d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w5, (4*16)(%[accum_ptr])\n" 46277d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w9, (5*16)(%[accum_ptr])\n" 
46287d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w2, (6*16)(%[accum_ptr])\n" 46297d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w6, (7*16)(%[accum_ptr])\n" 46307d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w10, (8*16)(%[accum_ptr])\n" 46317d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w3, (9*16)(%[accum_ptr])\n" 46327d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w7, (10*16)(%[accum_ptr])\n" 46337d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w11, (11*16)(%[accum_ptr])\n" 46347d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w12, (12*16)(%[accum_ptr])\n" 46357d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w16, (13*16)(%[accum_ptr])\n" 46367d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w20, (14*16)(%[accum_ptr])\n" 46377d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w13, (15*16)(%[accum_ptr])\n" 46387d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w17, (16*16)(%[accum_ptr])\n" 46397d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w21, (17*16)(%[accum_ptr])\n" 46407d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w14, (18*16)(%[accum_ptr])\n" 46417d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w18, (19*16)(%[accum_ptr])\n" 46427d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w22, (20*16)(%[accum_ptr])\n" 46437d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w15, (21*16)(%[accum_ptr])\n" 46447d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w19, (22*16)(%[accum_ptr])\n" 46457d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.w $w23, (23*16)(%[accum_ptr])\n" 46467d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Set a temp to all zeroes. 
46477d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ldi.b $w31, 0\n" 46487d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 46497d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang GEMMLOWP_LABEL_LOOP ":\n" 46507d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Overview of register layout: 46517d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // 46527d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // A half of the 2 2x4 cells of Rhs is stored in 16bit in w27-w30 46537d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // (each register contains 4 replicas of a pair of elements). 46547d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in w24-w26. 46557d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // A 12x8 block of accumulators is stored in 32bit in w0-w23. 46567d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // 46577d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // +------+------+------+------+ 46587d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Rhs |w27 |w28 |w29 |w30 | 46597d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // +------+------+------+------+ 46607d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // 46617d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // | | | | | 46627d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // 46637d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Lhs | | | | | 46647d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // 46657d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // +---+ - - - - +------+------+------+------+ 46667d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w24| |w0/12 |w1/13 |w2/14 |w3/15 | 46677d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w24| |w0/12 |w1/13 |w2/14 |w3/15 | 46687d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w24| |w0/12 |w1/13 |w2/14 |w3/15 | 46697d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w24| |w0/12 |w1/13 |w2/14 |w3/15 | 46707d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // +---+ - - - - 
+------+------+------+------+ 46717d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w25| |w4/16 |w5/17 |w6/18 |w7/19 | 46727d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w25| |w4/16 |w5/17 |w6/18 |w7/19 | 46737d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w25| |w4/16 |w5/17 |w6/18 |w7/19 | 46747d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w25| |w4/16 |w5/17 |w6/18 |w7/19 | 46757d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // +---+ - - - - +------+------+------+------+ 46767d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w26| |w8/20 |w9/21 |w10/22|w11/23| 46777d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w26| |w8/20 |w9/21 |w10/22|w11/23| 46787d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w26| |w8/20 |w9/21 |w10/22|w11/23| 46797d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // |w26| |w8/20 |w9/21 |w10/22|w11/23| 46807d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // +---+ - - - - +------+------+------+------+ 46817d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // 46827d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Accumulators 46837d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 46847d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Load 3 x 8 bytes of lhs[] with 2 16-byte overlapped loads. 46857d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.b $w24, 0(%[lhs_ptr])\n" 46867d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ld.b $w25, 8(%[lhs_ptr])\n" 46877d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 46887d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Load 4 bytes of rhs[] for the first half of depth 0. 
46897d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $a0, 0(%[rhs_ptr])\n" 46907d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $a1, 1(%[rhs_ptr])\n" 46917d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $a2, 2(%[rhs_ptr])\n" 46927d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $a3, 3(%[rhs_ptr])\n" 46937d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Load 4 bytes of rhs[] for the first half of depth 1. 46947d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $v0, 4(%[rhs_ptr])\n" 46957d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $v1, 5(%[rhs_ptr])\n" 46967d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $t8, 6(%[rhs_ptr])\n" 46977d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $t9, 7(%[rhs_ptr])\n" 46987d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 46997d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Zero-extend 8-bit elements of lhs[] to 16 bits. 47007d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ilvr.b $w24, $w31, $w24\n" 47017d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ilvl.b $w26, $w31, $w25\n" 47027d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ilvr.b $w25, $w31, $w25\n" 47037d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Interleave depth 0 and depth 1 elements of lhs[] for dpadd_u.w. 
47047d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ilvl.d $w27, $w31, $w24\n" 47057d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ilvl.d $w28, $w31, $w25\n" 47067d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ilvl.d $w29, $w31, $w26\n" 47077d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ilvr.h $w24, $w27, $w24\n" 47087d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ilvr.h $w25, $w28, $w25\n" 47097d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ilvr.h $w26, $w29, $w26\n" 47107d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 47117d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Combine and interleave depth 0 and depth 1 elements of rhs[] for dpadd_u.w 47127d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // (for the first half). 47137d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ins $a0, $v0, 16, 8\n" 47147d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ins $a1, $v1, 16, 8\n" 47157d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ins $a2, $t8, 16, 8\n" 47167d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ins $a3, $t9, 16, 8\n" 47177d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Make 4 replicas of every pair of rhs[] elements. 47187d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fill.w $w27, $a0\n" 47197d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fill.w $w28, $a1\n" 47207d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fill.w $w29, $a2\n" 47217d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fill.w $w30, $a3\n" 47227d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 47237d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Load 4 bytes of rhs[] for the second half of depth 0. 
47247d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $a0, 8(%[rhs_ptr])\n" 47257d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $a1, 9(%[rhs_ptr])\n" 47267d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $a2, 10(%[rhs_ptr])\n" 47277d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $a3, 11(%[rhs_ptr])\n" 47287d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Load 4 bytes of rhs[] for the second half of depth 1. 47297d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $v0, 12(%[rhs_ptr])\n" 47307d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $v1, 13(%[rhs_ptr])\n" 47317d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $t8, 14(%[rhs_ptr])\n" 47327d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "lbu $t9, 15(%[rhs_ptr])\n" 47337d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 47347d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // First half of depths 0 and 1. 47357d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Dot-product-(and)-add doubles multiplicand width. 
47367d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w0, $w24, $w27\n" 47377d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w4, $w25, $w27\n" 47387d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w8, $w26, $w27\n" 47397d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w1, $w24, $w28\n" 47407d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w5, $w25, $w28\n" 47417d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w9, $w26, $w28\n" 47427d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w2, $w24, $w29\n" 47437d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w6, $w25, $w29\n" 47447d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w10, $w26, $w29\n" 47457d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w3, $w24, $w30\n" 47467d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w7, $w25, $w30\n" 47477d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w11, $w26, $w30\n" 47487d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 47497d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Combine and interleave depth 0 and depth 1 elements of rhs[] for dpadd_u.w 47507d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // (for the second half). 47517d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ins $a0, $v0, 16, 8\n" 47527d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ins $a1, $v1, 16, 8\n" 47537d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ins $a2, $t8, 16, 8\n" 47547d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "ins $a3, $t9, 16, 8\n" 47557d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Make 4 replicas of every pair of rhs[] elements. 
47567d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fill.w $w27, $a0\n" 47577d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fill.w $w28, $a1\n" 47587d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fill.w $w29, $a2\n" 47597d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "fill.w $w30, $a3\n" 47607d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 47617d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Second half of depths 0 and 1. 47627d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Dot-product-(and)-add doubles multiplicand width. 47637d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w12, $w24, $w27\n" 47647d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w16, $w25, $w27\n" 47657d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w20, $w26, $w27\n" 47667d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w13, $w24, $w28\n" 47677d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w17, $w25, $w28\n" 47687d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w21, $w26, $w28\n" 47697d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w14, $w24, $w29\n" 47707d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w18, $w25, $w29\n" 47717d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w22, $w26, $w29\n" 47727d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w15, $w24, $w30\n" 47737d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w19, $w25, $w30\n" 47747d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "dpadd_u.w $w23, $w26, $w30\n" 47757d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 47767d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "addiu %[depth], -2\n" 47777d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang GEMMLOWP_MIPS_XADDIU " %[lhs_ptr], 24\n" 47787d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang GEMMLOWP_MIPS_XADDIU " %[rhs_ptr], 16\n" 47797d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "bnez %[depth]," GEMMLOWP_LABEL_LOOP "b\n" 
47807d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 47817d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang // Store accumulators. 47827d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w0, (0*16)(%[accum_ptr])\n" 47837d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w4, (1*16)(%[accum_ptr])\n" 47847d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w8, (2*16)(%[accum_ptr])\n" 47857d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w1, (3*16)(%[accum_ptr])\n" 47867d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w5, (4*16)(%[accum_ptr])\n" 47877d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w9, (5*16)(%[accum_ptr])\n" 47887d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w2, (6*16)(%[accum_ptr])\n" 47897d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w6, (7*16)(%[accum_ptr])\n" 47907d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w10, (8*16)(%[accum_ptr])\n" 47917d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w3, (9*16)(%[accum_ptr])\n" 47927d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w7, (10*16)(%[accum_ptr])\n" 47937d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w11, (11*16)(%[accum_ptr])\n" 47947d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w12, (12*16)(%[accum_ptr])\n" 47957d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w16, (13*16)(%[accum_ptr])\n" 47967d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w20, (14*16)(%[accum_ptr])\n" 47977d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w13, (15*16)(%[accum_ptr])\n" 47987d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w17, (16*16)(%[accum_ptr])\n" 47997d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w21, (17*16)(%[accum_ptr])\n" 48007d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w14, (18*16)(%[accum_ptr])\n" 48017d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w18, (19*16)(%[accum_ptr])\n" 48027d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w22, 
(20*16)(%[accum_ptr])\n" 48037d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w15, (21*16)(%[accum_ptr])\n" 48047d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w19, (22*16)(%[accum_ptr])\n" 48057d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "st.w $w23, (23*16)(%[accum_ptr])\n" 48067d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang : // outputs 48077d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 48087d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang [depth] "+r"(depth) 48097d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang : // inputs 48107d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang [accum_ptr] "r"(accum_ptr) 48117d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang : // clobbers 48127d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "memory", 48137d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "v0", "v1", 48147d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "a0", "a1", "a2", "a3", 48157d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "t8", "t9", 48167d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "$f6", "$f7", 48177d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "$f8", "$f9", "$f10", "$f11", "$f12", "$f13", "$f14", "$f15", 48187d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "$f16", "$f17", "$f18", "$f19", "$f20", "$f21", "$f22", "$f23", 48197d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang "$f24", "$f25", "$f26", "$f27", "$f28", "$f29", "$f30", "$f31"); 48207d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang } 48217d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang}; 48227d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#endif // __mips 4823a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 4824a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// BEGIN code copied from gemmlowp/internal/kernel_reference.h 4825a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 4826a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// This kernel is templatized in an arbitrary 
Format template parameter, 4827a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// allowing it to have any arbitrary format. 4828a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <typename tOperandType, typename tAccumulatorType, typename tFormat> 4829a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct ReferenceKernel { 4830a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef tOperandType OperandType; 4831a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef tAccumulatorType AccumulatorType; 4832a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef tFormat Format; 4833a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 4834a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr, 4835a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang AccumulatorType* accum_ptr, int depth) { 4836a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const int depth_cells = static_cast<int>(depth / Format::kDepth); 4837a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 4838a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // The outer loop is over the depth dimension. 4839a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int dc = 0; dc < depth_cells; dc++) { 4840a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // The next two loops are over cells of the Lhs (stacked vertically), 4841a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // and over cells of the Rhs (stacked horizontally). 
4842a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int rc = 0; rc < Format::Lhs::kCells; rc++) { 4843a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const OperandType* lhs_cell_ptr = 4844a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang lhs_ptr + (dc * Format::Lhs::kCells + rc) * 4845a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Format::Lhs::Cell::kWidth * Format::kDepth; 4846a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int cc = 0; cc < Format::Rhs::kCells; cc++) { 4847a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const OperandType* rhs_cell_ptr = 4848a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang rhs_ptr + (dc * Format::Rhs::kCells + cc) * 4849a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Format::Rhs::Cell::kWidth * Format::kDepth; 4850a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 4851a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Now we are inside one cell of the Lhs and inside one cell 4852a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // of the Rhs, so the remaining inner loops are just 4853a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // traditional three loops of matrix multiplication. 
4854a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int di = 0; di < Format::kDepth; di++) { 4855a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int ri = 0; ri < Format::Lhs::Cell::kWidth; ri++) { 4856a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int ci = 0; ci < Format::Rhs::Cell::kWidth; ci++) { 4857a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const OperandType* lhs_coeff_ptr = 4858a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang lhs_cell_ptr + 4859a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang OffsetIntoCell<typename Format::Lhs::Cell>(ri, di); 4860a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const OperandType* rhs_coeff_ptr = 4861a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang rhs_cell_ptr + 4862a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang OffsetIntoCell<typename Format::Rhs::Cell>(ci, di); 4863a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang AccumulatorType* accumulator_coeff_ptr = 4864a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang accum_ptr + (ri + rc * Format::Lhs::Cell::kWidth) + 4865a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang (ci + cc * Format::Rhs::Cell::kWidth) * Format::kRows; 4866a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang *accumulator_coeff_ptr += AccumulatorType(*lhs_coeff_ptr) * 4867a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang AccumulatorType(*rhs_coeff_ptr); 4868a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 4869a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 4870a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 4871a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 4872a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 4873a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 4874a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 4875a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}; 4876a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 4877a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// END code copied from gemmlowp/internal/kernel_reference.h 
4878a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 4879a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <typename DataType> 4880a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangclass CacheLineAlignedBuffer { 4881a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang public: 4882a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang CacheLineAlignedBuffer(std::size_t size) : size_(size) { 4883a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang data_ = nullptr; 4884a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Adds a few bytes of padding here, because the 64-bit 'A57' kernel 4885a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // reads one iteration past the end the buffer, causing a crash on iOS. 48867d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang int res = posix_memalign(reinterpret_cast<void**>(&data_), kCacheLineSize, 48877d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang size_ * sizeof(DataType) + 16); 48887d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang (void)res; 4889a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 4890a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 4891a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang ~CacheLineAlignedBuffer() { free(data_); } 4892a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 4893a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const DataType* data() const { return data_; } 4894a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang DataType* data() { return data_; } 4895a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 48967d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang std::size_t size() const { return size_; } 4897a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 4898a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang private: 4899a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const std::size_t size_; 4900a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang DataType* data_; 4901a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}; 4902a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 
4903a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <typename DataType> 4904a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangvoid FillRandom(CacheLineAlignedBuffer<DataType>* buffer) { 4905a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static std::mt19937 generator(0); 4906a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 100 is smaller than any nonzero bound of the range of any data type. 4907a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const DataType kMaxVal = DataType(100); 4908a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const DataType kMinVal = 4909a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang std::is_signed<DataType>::value ? -kMaxVal : DataType(0); 4910a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang std::uniform_real_distribution<float> dist(kMinVal, kMaxVal); 4911a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (std::size_t i = 0; i < buffer->size(); i++) { 4912a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang buffer->data()[i] = DataType(dist(generator)); 4913a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 4914a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang} 4915a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 4916a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <typename DataType> 4917a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangvoid FillZero(CacheLineAlignedBuffer<DataType>* buffer) { 4918a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (std::size_t i = 0; i < buffer->size(); i++) { 4919a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang buffer->data()[i] = DataType(0); 4920a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 4921a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang} 4922a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 4923a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <typename DataType> 4924a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangvoid Copy(CacheLineAlignedBuffer<DataType>* dst, 4925a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const 
CacheLineAlignedBuffer<DataType>& src) { 4926a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang assert(dst->size() == src.size()); 4927a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang memcpy(dst->data(), src.data(), src.size() * sizeof(DataType)); 4928a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang} 4929a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 4930a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <typename DataType> 4931a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangvoid PrintMatrix(int rows, int cols, int rowstride, int colstride, 4932a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const DataType* data) { 4933a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int r = 0; r < rows; r++) { 4934a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int c = 0; c < cols; c++) { 4935a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang std::cerr << double(data[r * rowstride + c * colstride]) << " "; 4936a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 4937a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang std::cerr << std::endl; 4938a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 4939a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang std::cerr << std::endl; 4940a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang} 4941a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 4942a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <typename DataType> 4943a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangbool approx_equals(DataType a, DataType b) { 4944a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang return a == b; 4945a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang} 4946a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 4947a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <> 4948a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangbool approx_equals(float a, float b) { 4949a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang if (!a && !b) { 4950a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang return true; 
4951a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 4952a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 1e-1 is very coarse accuracy, we should switch to an overall L2 metric 4953a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // and tighten the tolerance on that metric. 4954a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang return std::abs(a - b) < 1e-1f * std::min(std::abs(a), std::abs(b)); 4955a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang} 4956a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 4957a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <typename Kernel> 4958a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangvoid test_kernel(int depth, const char* kernel_name) { 4959a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef typename Kernel::OperandType OperandType; 4960a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef typename Kernel::AccumulatorType AccumulatorType; 4961a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef typename Kernel::Format Format; 4962a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static const int kLhsWidth = Format::Lhs::kWidth; 4963a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static const int kRhsWidth = Format::Rhs::kWidth; 4964a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 4965a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef ReferenceKernel<OperandType, AccumulatorType, Format> ReferenceKernel; 4966a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 4967a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang CacheLineAlignedBuffer<OperandType> lhs(kLhsWidth * depth); 4968a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang CacheLineAlignedBuffer<OperandType> rhs(kRhsWidth * depth); 4969a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang CacheLineAlignedBuffer<AccumulatorType> accum_initial(kLhsWidth * kRhsWidth); 4970a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang CacheLineAlignedBuffer<AccumulatorType> accum(kLhsWidth * kRhsWidth); 4971a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 
CacheLineAlignedBuffer<AccumulatorType> accum_reference(kLhsWidth * 4972a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang kRhsWidth); 4973a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 4974a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang FillRandom(&lhs); 4975a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang FillRandom(&rhs); 4976a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang FillRandom(&accum_initial); 4977a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Copy(&accum, accum_initial); 4978a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Copy(&accum_reference, accum_initial); 4979a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 4980a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang ReferenceKernel::Run(lhs.data(), rhs.data(), accum_reference.data(), depth); 4981a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Kernel::Run(lhs.data(), rhs.data(), accum.data(), depth); 4982a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 4983a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int l = 0; l < kLhsWidth; l++) { 4984a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int r = 0; r < kRhsWidth; r++) { 4985a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const int index = l + kLhsWidth * r; 4986a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang if (!approx_equals(accum.data()[index], accum_reference.data()[index])) { 4987a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang std::cerr << "Arithmetic error in kernel:" << std::endl 4988a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang << " " << kernel_name << std::endl 4989a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang << "Wrong accumulator for depth=" << depth << ", " 4990a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang << "at l = " << l << ", r = " << r << std::endl; 4991a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang std::cerr << "reference value: " << accum_reference.data()[index] 4992a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang << std::endl; 4993a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 
std::cerr << "actual value: " << accum.data()[index] << std::endl; 4994a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang if (depth <= 16) { 4995a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang std::cerr << "LHS matrix:" << std::endl; 4996a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang PrintMatrix(kLhsWidth, depth, 1, kLhsWidth, lhs.data()); 4997a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang std::cerr << "RHS matrix:" << std::endl; 4998a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang PrintMatrix(depth, kRhsWidth, kRhsWidth, 1, rhs.data()); 4999a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang std::cerr << "Initial Accumulator matrix:" << std::endl; 5000a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang PrintMatrix(kLhsWidth, kRhsWidth, 1, kLhsWidth, accum_initial.data()); 5001a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang std::cerr << "Reference Accumulator matrix:" << std::endl; 5002a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang PrintMatrix(kLhsWidth, kRhsWidth, 1, kLhsWidth, 5003a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang accum_reference.data()); 5004a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang std::cerr << "Actual Accumulator matrix:" << std::endl; 5005a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang PrintMatrix(kLhsWidth, kRhsWidth, 1, kLhsWidth, accum.data()); 5006a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 5007a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang abort(); 5008a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 5009a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 5010a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 5011a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang} 5012a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 5013a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <typename Kernel> 5014a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangint ops(int depth) { 5015a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // 2x the number of multiply-accumulate scalar ops. 
5016a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang return 2 * Kernel::Format::Lhs::kWidth * Kernel::Format::Rhs::kWidth * depth; 5017a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang} 5018a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 5019a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <unsigned Modulus, typename Integer> 5020a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao WangInteger RoundDown(Integer i) { 5021a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang return i - (i % Modulus); 5022a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang} 5023a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 5024a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangint CacheSizeInKB() { 5025a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static const char* cache_size_k_env = getenv("CACHE_SIZE_KB"); 5026a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static const int cache_size_k = 5027a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang cache_size_k_env ? atoi(cache_size_k_env) : kDefaultCacheSizeK; 5028a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang return cache_size_k; 5029a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang} 5030a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 5031a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <typename Kernel> 5032a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangint BenchmarkDepthToFitInCache() { 5033a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const int cache_size_bytes = 1024 * CacheSizeInKB(); 5034a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 5035a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Subtract the typical size of a few cache lines, so 5036a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // we don't need to worry too hard about e.g. some stack data. 
5037a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const int conservative_cache_size_bytes = 5038a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang cache_size_bytes - 2 * kCacheLineSize; 5039a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 5040a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // We will subtract the memory occupied by accumulators. 5041a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef typename Kernel::AccumulatorType AccumulatorType; 5042a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const int kAccumulatorBytes = sizeof(AccumulatorType) * 5043a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Kernel::Format::Lhs::kWidth * 5044a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Kernel::Format::Rhs::kWidth; 5045a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 5046a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Compute the depth. 5047a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef typename Kernel::OperandType OperandType; 5048a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const int kBytesPerUnitOfDepth = 5049a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang sizeof(OperandType) * 5050a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang (Kernel::Format::Lhs::kWidth + Kernel::Format::Rhs::kWidth); 5051a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const int unrounded_depth = 5052a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang (conservative_cache_size_bytes - kAccumulatorBytes) / 5053a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang kBytesPerUnitOfDepth; 5054a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 5055a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Cap depth, to avoid unfairly favoring narrower kernels 5056a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const int kMaxDepth = 1024; 5057a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const int clamped_unrounded_depth = std::min(kMaxDepth, unrounded_depth); 5058a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 5059a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao 
Wang // Round depth down to a multiple of cache line size, which helps because 5060a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // our kernels may crash if depth is not a multiple of the number of 5061a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // depth level that they want to 5062a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // handle at each loop iteration, and we don't want to require kernels 5063a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // to be more complex. Currently all kernels process 1, 2 or 8 levels of 5064a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // depth at a time. The main reason why that might increase in the future 5065a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // is if registers get wider, but I don't suppose that register could 5066a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // ever get wider than cache lines. 5067a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang return RoundDown<kCacheLineSize>(clamped_unrounded_depth); 5068a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang} 5069a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 5070a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangdouble current_time_in_seconds() { 5071a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang timespec t; 5072a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang clock_gettime(CLOCK_REALTIME, &t); 5073a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang return t.tv_sec + 1e-9 * t.tv_nsec; 5074a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang} 5075a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 5076a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <typename Kernel> 5077a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangdouble benchmark(int depth) { 5078a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // Minimum duration for this benchmark to run. If the workload finishes 5079a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang // sooner, we retry with double the number of iterations. 
5080a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang static const double min_benchmark_time_in_seconds = 1.0; 5081a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 5082a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef typename Kernel::OperandType OperandType; 5083a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef typename Kernel::AccumulatorType AccumulatorType; 5084a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 5085a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang CacheLineAlignedBuffer<OperandType> lhs(Kernel::Format::Lhs::kWidth * depth); 5086a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang CacheLineAlignedBuffer<OperandType> rhs(Kernel::Format::Rhs::kWidth * depth); 5087a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang CacheLineAlignedBuffer<AccumulatorType> accum(Kernel::Format::Lhs::kWidth * 5088a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Kernel::Format::Rhs::kWidth); 5089a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 5090a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (std::uint64_t iters_at_a_time = 1;; iters_at_a_time *= 2) { 5091a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const double t_start = current_time_in_seconds(); 5092a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (std::uint64_t i = 0; i < iters_at_a_time; i++) { 5093a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Kernel::Run(lhs.data(), rhs.data(), accum.data(), depth); 5094a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 5095a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const double t_end = current_time_in_seconds(); 5096a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const double elapsed = t_end - t_start; 5097a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang if (elapsed > min_benchmark_time_in_seconds) { 5098a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang return iters_at_a_time * ops<Kernel>(depth) / elapsed; 5099a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 5100a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 
5101a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang} 5102a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 5103a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <typename Kernel> 5104a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangvoid benchmark_and_print_results(const char* kernel_name) { 5105a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang if (getenv("BENCHMARK_KERNEL")) { 5106a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang if (strcmp(getenv("BENCHMARK_KERNEL"), kernel_name)) { 5107a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang return; 5108a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 5109a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 5110a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const int kKernelDepth = Kernel::Format::kDepth; 5111a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int depth = kKernelDepth; depth <= 1024; depth += kKernelDepth) { 5112a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang test_kernel<Kernel>(depth, kernel_name); 5113a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 5114a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 5115a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang if (getenv("BENCHMARK_ALL_DEPTHS")) { 5116a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int depth = kKernelDepth; 5117a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang depth <= BenchmarkDepthToFitInCache<Kernel>(); depth *= 2) { 5118a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang std::cout << kernel_name << "," << depth << "," 5119a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang << benchmark<Kernel>(depth) * 1e-9f << std::endl; 5120a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 5121a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } else { 5122a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const int depth = BenchmarkDepthToFitInCache<Kernel>(); 5123a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang std::cout << kernel_name << "," << benchmark<Kernel>(depth) * 1e-9f 
5124a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang << std::endl; 5125a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 5126a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang} 5127a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 5128a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#define BENCHMARK(Kernel) \ 5129a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang do { \ 5130a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang benchmark_and_print_results<Kernel>(#Kernel); \ 5131a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } while (false) 5132a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 5133a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangint main() { 5134a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang if (getenv("BENCHMARK_ALL_DEPTHS")) { 5135a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang std::cout << "kernel,depth,Gop/s" << std::endl; 5136a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } else { 5137a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang std::cout << "kernel,Gop/s" << std::endl; 5138a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 5139a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 5140a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef __arm__ 5141a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang BENCHMARK(NEON_32bit_GEMM_Int8Operands_AccumTwoWithin16Bits); 5142a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang BENCHMARK(NEON_32bit_GEMM_Int8Operands_AccumTwoWithin16Bits_intrinsics); 5143a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang BENCHMARK(NEON_32bit_GEMM_Uint8Operands_Uint32Accumulators); 5144a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang BENCHMARK(NEON_32bit_GEMM_Uint8Operands_Uint32Accumulators_intrinsics); 5145a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang BENCHMARK(NEON_32bit_GEMM_Uint8Operands_Uint32Accumulators_noexpand); 5146a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang BENCHMARK(NEON_32bit_GEMM_Int32_WithScalar); 5147a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 
BENCHMARK(NEON_32bit_GEMM_Float32_MLA_WithVectorDuplicatingScalar); 5148a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef __ARM_FEATURE_FMA 5149a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang BENCHMARK(NEON_32bit_GEMM_Float32_FMA_WithVectorDuplicatingScalar); 5150a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif 5151a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang BENCHMARK(NEON_32bit_GEMM_Float32_MLA_WithScalar); 5152a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang BENCHMARK(NEON_32bit_GEMM_Float32_WithScalar_intrinsics); 5153a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang BENCHMARK(NEON_32bit_GEMM_Float32_WithScalar_A53); 5154a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang BENCHMARK(NEON_32bit_GEMM_Float32_WithScalar_A53_depth2); 5155a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang BENCHMARK(NEON_32bit_GEMM_Float32_MLA_Rotating); 5156a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef __ARM_FEATURE_FMA 5157a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang BENCHMARK(NEON_32bit_GEMM_Float32_FMA_Rotating); 5158a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif 5159a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif 5160a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 5161a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef __aarch64__ 5162a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang BENCHMARK(NEON_64bit_GEMM_Int8Operands_AccumTwoWithin16Bits); 5163a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang BENCHMARK(NEON_64bit_GEMM_Int8Operands_AccumTwoWithin16Bits_intrinsics); 5164a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang BENCHMARK(NEON_64bit_GEMM_Uint8Operands_Uint32Accumulators); 5165a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang BENCHMARK(NEON_64bit_GEMM_Uint8Operands_Uint32Accumulators_intrinsics); 5166a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang BENCHMARK(NEON_64bit_GEMM_Uint8Operands_Uint32Accumulators_noexpand_A57); 51677d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#ifdef __ARM_FEATURE_DOTPROD 
51687d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang BENCHMARK(NEON_64bit_GEMM_Uint8Operands_Uint32Accumulators_dotproduct); 51697d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang BENCHMARK(NEON_64bit_GEMM_Uint8Operands_Uint32Accumulators_dotproduct_A55r1); 51707d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#endif 5171a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang BENCHMARK(NEON_64bit_GEMM_Int32_WithScalar); 5172a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang BENCHMARK(NEON_64bit_GEMM_Float32_WithVectorDuplicatingScalar); 5173a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang BENCHMARK(NEON_64bit_GEMM_Float32_WithScalar); 5174a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang BENCHMARK(NEON_64bit_GEMM_Float32_WithScalar_intrinsics); 5175a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang BENCHMARK(NEON_64bit_GEMM_Float32_WithScalar_A57); 5176a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifndef __APPLE__ 5177a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang BENCHMARK(NEON_64bit_GEMM_Float32_WithScalar_A53); 5178a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif 51797d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang BENCHMARK(NEON_64bit_GEMM_Float32_WithScalar_A55r1); 51807d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#endif 51817d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang 51827d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#ifdef __mips 51837d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang BENCHMARK(MSA_GEMM_12x4_Uint8Operands_Uint32Accumulators_intrinsics); 51847d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang BENCHMARK(MSA_GEMM_12x4_Uint8Operands_Uint32Accumulators_assembly); 51857d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang BENCHMARK(MSA_GEMM_12x4_Uint8Operands_Uint32Accumulators_assembly2); 51867d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang BENCHMARK(MSA_GEMM_12x8_Uint8Operands_Uint32Accumulators_intrinsics); 51877d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang BENCHMARK(MSA_GEMM_12x8_Uint8Operands_Uint32Accumulators_assembly); 
51887d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang BENCHMARK(MSA_GEMM_12x8_Uint8Operands_Uint32Accumulators_assembly2); 5189a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif 5190a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 5191a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang return 0; 5192a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang} 5193