1a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Copyright 2016 The Gemmlowp Authors. All Rights Reserved.
2a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//
3a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Licensed under the Apache License, Version 2.0 (the "License");
4a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// you may not use this file except in compliance with the License.
5a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// You may obtain a copy of the License at
6a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//
7a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//     http://www.apache.org/licenses/LICENSE-2.0
8a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//
9a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Unless required by applicable law or agreed to in writing, software
10a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// distributed under the License is distributed on an "AS IS" BASIS,
11a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// See the License for the specific language governing permissions and
13a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// limitations under the License.
14a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
15a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// This is a standalone testbed and benchmark for gemmlowp-style GEMM kernels,
16a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// either doing integer or float arithmetic.
17a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// It verifies that a kernel produces correct results, then benchmarks it.
18a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//
19a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Some benchmark results are recorded in this spreadsheet:
20a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//
21a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// https://docs.google.com/spreadsheets/d/1UPbzbp9rdsD6RXxOr5q6AZ0n1omgEknLYO2ogiw6Kqk/edit?usp=sharing
22a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//
23a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// This program is entirely self-contained, and can be compiled manually
24a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// such as suggested in the command lines below.
25a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// It currently supports only Android/ARM but would trivially generalize to
26a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// other OSes (it's mostly standard POSIX) or architectures (each kernel
27a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// targets a specific architecture, one may simply add more).
28a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
29a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang/*
30a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Build and run this benchmark on Android/ARM/32bit:
31a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang ~/android/toolchains/arm-linux-androideabi/bin/arm-linux-androideabi-clang++ \
32a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang -fPIE -pie -O3 --std=c++11 standalone/neon-gemm-kernel-benchmark.cc -o \
33a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang /tmp/benchmark -mfloat-abi=softfp -mfpu=neon-vfpv4 && adb push /tmp/benchmark \
34a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang /data/local/tmp && adb shell /data/local/tmp/benchmark
35a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Build and run this benchmark on Android/ARM/64bit:
36a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang ~/android/toolchains/aarch64-linux-android/bin/aarch64-linux-android-clang++ \
37a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang -fPIE -static -O3 --std=c++11 standalone/neon-gemm-kernel-benchmark.cc -o \
38a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang /tmp/benchmark && adb push /tmp/benchmark /data/local/tmp && adb shell \
39a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang /data/local/tmp/benchmark
40a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang */
41a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
42a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// For big.LITTLE devices, use 'taskset' to select which cores to benchmark.
43a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//
44a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// The syntax is: taskset <mask> <commandline>
45a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// where mask is a binary mask where each bit corresponds to a core,
46a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// and low bits are little cores.
47a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//
48a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Examples:
49a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Nexus 5X big cores: taskset 30
50a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Nexus 5X little cores: taskset 0f
51a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Pixel XL big cores: taskset 0c
52a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Pixel XL little cores: taskset 03
53a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//
54a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Full example:
55a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// adb shell taskset 0c /data/local/tmp/benchmark
56a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
57a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#include <sched.h>
58a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#include <unistd.h>
59a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
60a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#include <algorithm>
61a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#include <cassert>
62a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#include <cstdint>
63a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#include <cstdlib>
647d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#include <cstring>
65a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#include <iostream>
66a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#include <random>
67a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#include <type_traits>
68a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
697d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#if !defined(__arm__) && !defined(__aarch64__) && \
707d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  !(defined(__mips) && (__mips_isa_rev >= 5) && defined(__mips_msa))
717d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#error This benchmark assumes ARM or MIPS (for intrinsics and inline assembly sections).
72a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
73a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
747d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#if defined(__arm__) || defined(__aarch64__)
75a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#include <arm_neon.h>
767d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#endif
777d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
#if defined(__mips)
#include <msa.h>

// Some convenience macros to hide differences between MIPS32 and MIPS64.
//
// GEMMLOWP_MIPS_XADDIU expands to the mnemonic string of the add-immediate
// instruction to splice into inline assembly: "daddiu" when __LP64__ is
// defined, "addiu" otherwise.
#ifdef __LP64__
#define GEMMLOWP_MIPS_XADDIU "daddiu"
#else
#define GEMMLOWP_MIPS_XADDIU "addiu"
#endif
#endif  // defined(__mips)
88a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
// Default cache size to target, in units of "k" as used in the sentence
// below (16 means 16k).
//
// Typically one wants to fit in L1 cache, and GEMM implementations
// are carefully optimized to tune their access patterns to that effect.
// Most devices have at least 16k of L1 cache. The Kraits have exactly 16k.
const int kDefaultCacheSizeK = 16;
93a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
// Assumed size of one cache line (presumably in bytes -- confirm against the
// code that consumes this constant).
const int kCacheLineSize = 64;
95a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
// These definitions are used for labels within assembly code. Required for
// iOS toolchain compatibility.
//
// Each macro expands to a string literal holding a numeric label, so it can
// be concatenated directly into an inline-assembly string, e.g.
//   "beq " GEMMLOWP_LABEL_AFTER_LOOP "f\n"    // reference to the label
//   GEMMLOWP_LABEL_LOOP ":\n"                 // definition of the label
#define GEMMLOWP_LABEL_AFTER_LOOP "1"
#define GEMMLOWP_LABEL_LOOP "2"
#define GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES "3"
#define GEMMLOWP_LABEL_STORE "4"
102a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
103a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// BEGIN code copied from gemmlowp/internal/kernel.h
104a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
105a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Explanation of general gemmlowp terminology
106a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// ===========================================
107a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//
108a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// We use the following abbreviations:
109a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// LHS = "left-hand side"
110a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// RHS = "right-hand side"
111a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Sometimes when referring to either LHS or RHS, we just say a "Side".
112a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//
113a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// In a matrix product of a MxK matrix times a KxN matrix,
114a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// we call K the 'depth'. Note that M is the number of rows
115a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// of the result (and of the LHS), and N is the number of columns
116a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// of the result (and of the RHS).
117a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//
118a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// In each of the LHS and RHS matrices, we call 'width' the
119a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// other dimension, besides the depth. So in the LHS, 'width'
120a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// is the number of rows, while in the RHS, 'width' is the number
121a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// of columns.
122a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//
// So in the LHS MxK matrix, the depth is K and the width is M.
// And in the RHS KxN matrix, the depth is K and the width is N.
125a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//
126a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// This is illustrated in this picture:
127a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//
128a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//                             RHS width
129a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//                        <----------------->
130a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//                        +-----------------+ ^
131a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//                        |       RHS       | | Depth
132a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//                        +-----------------+ v
133a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//                 ^ +--+ +-----------------+
134a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//                 | |L | |                 |
135a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//       LHS width | |H | |      Result     |
136a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//                 | |S | |                 |
137a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//                 v +--+ +-----------------+
138a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//                   <-->
139a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//                   Depth
140a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
141a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Explanation of gemmlowp kernel formats and "cells"
142a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// ==================================================
143a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//
144a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Kernels operate on small LHS and RHS blocks that fit in registers.
145a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// These blocks are stored contiguously in memory, but not always
146a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// in a traditional column-major or row-major order; instead,
147a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// they consist of a number of sub-blocks, which we call "cells",
148a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// that are stored in column-major or row-major order. However,
149a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// what really matters to us is not so much rows vs columns, but
150a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// rather width vs depth. So we refer to "width-major" and "depth-major"
151a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// storage orders. In the LHS, width-major means row-major,
152a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// while in the RHS, width-major means column-major.
153a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// There is also a third possibility, "diagonal order",
154a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// which is unused at the moment.
155a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//
// We aim to treat both sides, LHS and RHS, on an equal footing,
// so we call them both 'sides'. A KernelFormat thus is just a pair
// of KernelSideFormat's, one for LHS and one for RHS; each KernelSideFormat
// contains a CellFormat and a number of cells; cells are only ever
// stacked in the width dimension, which means stacked vertically in the
// LHS and stacked horizontally in the RHS.
162a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//
163a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Example
164a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// =======
165a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//
166a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Let's work out the data layout expected by a kernel having the
167a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// following format (the struct names here are defined below in this file):
168a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//
169a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// KernelFormat<
170a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//   KernelSideFormat<CellFormat<3, 4>, 3>,
171a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//   KernelSideFormat<CellFormat<5, 4>, 2>
172a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// >
173a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//
174a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// The LHS format, KernelSideFormat<CellFormat<3, 4>, 3>, means:
175a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 3 cells, each cell having dimensions (width=3, depth=4), laid out in
176a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// DepthMajor order (the default value, see CellFormat). In the LHS,
177a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// DepthMajor means column-major, so the LHS cells are of size 3x4 in
178a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// column-major order, so the LHS layout is:
179a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//
180a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 0  3  6  9
181a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 1  4  7  10
182a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 2  5  8  11
183a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 12 15 18 21
184a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 13 16 19 22
185a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 14 17 20 23
186a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 24 27 30 33
187a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 25 28 31 34
188a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 26 29 32 35
189a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//
190a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// The RHS format, KernelSideFormat<CellFormat<5, 4>, 2>, means:
191a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 2 cells each having dimensions (width=5, depth=4), laid out in
192a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// DepthMajor order (the default value, see CellFormat). In the RHS,
193a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// DepthMajor means row-major, so the RHS cells are of size 4x5 in
194a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// row-major order, so the RHS layout is:
195a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//
196a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 0  1  2  3  4  20 21 22 23 24
197a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 5  6  7  8  9  25 26 27 28 29
198a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 10 11 12 13 14 30 31 32 33 34
199a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 15 16 17 18 19 35 36 37 38 39
200a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
// CellOrder enumerates the possible storage orders (=layouts) for
// a cell (see explanation above).
enum class CellOrder { DepthMajor, WidthMajor, Diagonal };

// CellFormat describes how data is laid
// out in a cell. That is, a CellOrder together with actual dimensions.
//
// Template parameters:
//   tWidth - number of entries of the cell along the width dimension.
//   tDepth - number of entries of the cell along the depth dimension.
//   tOrder - storage order of the cell. Defaults to DepthMajor, which is
//            what the layout example above relies on when it writes
//            CellFormat<3, 4> with only two arguments.
template <int tWidth, int tDepth, CellOrder tOrder = CellOrder::DepthMajor>
struct CellFormat {
  static const int kWidth = tWidth;
  static const int kDepth = tDepth;
  static const CellOrder kOrder = tOrder;

  // Total number of entries stored in one cell.
  static const int kSize = kWidth * kDepth;
};
215a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
// KernelSideFormat describes how data is laid out in a kernel side
// (i.e. LHS or RHS). That is, a CellFormat together with a number of
// cells. These cells are always stacked in the Width dimension.
// For example, in the LHS case, the Width dimension is the rows dimension,
// so we're saying that in the LHS, cells are stacked vertically.
// We never stack cells in the Depth dimension.
//
// Template parameters:
//   tCellFormat - type exposing kWidth and kDepth (e.g. a CellFormat).
//   tCells      - number of cells stacked along the width dimension.
template <typename tCellFormat, int tCells>
struct KernelSideFormat {
  typedef tCellFormat Cell;
  static const int kCells = tCells;
  // Stacking happens along the width only, so the side is kCells cells wide
  // while its depth is that of a single cell.
  static const int kWidth = kCells * Cell::kWidth;
  static const int kDepth = Cell::kDepth;
};
229a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
// KernelFormat describes fully the input data layout that a kernel expects.
// It consists of two KernelSideFormat's, one for LHS and one for RHS.
//
// Template parameters:
//   tLhs - side format of the LHS; must expose Cell and kCells.
//   tRhs - side format of the RHS; must expose Cell and kCells.
template <typename tLhs, typename tRhs>
struct KernelFormat {
  typedef tLhs Lhs;
  typedef tRhs Rhs;

  // Both sides are multiplied along the depth dimension, so their cells
  // have to agree on it.
  static_assert(Lhs::Cell::kDepth == Rhs::Cell::kDepth,
                "LHS and RHS cells must have the same depth");
  static const int kDepth = Lhs::Cell::kDepth;
  // Rows of the result block come from the LHS width, columns from the RHS
  // width.
  static const int kRows = Lhs::Cell::kWidth * Lhs::kCells;
  static const int kCols = Rhs::Cell::kWidth * Rhs::kCells;
};
242a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
243a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline const char* CellOrderName(CellOrder o) {
244a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  switch (o) {
245a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    case CellOrder::DepthMajor:
246a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      return "DepthMajor";
247a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    case CellOrder::WidthMajor:
248a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      return "WidthMajor";
249a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    case CellOrder::Diagonal:
250a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      return "Diagonal";
251a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    default:
252a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      assert(false);
253a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      return nullptr;
254a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
255a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
256a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
257a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Returns the offset into a cell, at which a given coefficient is stored.
258a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <typename CellFormat>
259a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline int OffsetIntoCell(int w, int d) {
260a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  switch (CellFormat::kOrder) {
261a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    case CellOrder::DepthMajor:
262a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      return w + d * CellFormat::kWidth;
263a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    case CellOrder::WidthMajor:
264a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      return d + w * CellFormat::kDepth;
265a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    case CellOrder::Diagonal:
266a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      assert(CellFormat::kWidth == CellFormat::kDepth);
267a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      static const int size = CellFormat::kWidth;
268a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      return ((size + w - d) * size + d) % (size * size);
269a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    default:
270a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      assert(false);
271a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      return 0;
272a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
273a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
274a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
275a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// END code copied from gemmlowp/internal/kernel.h
276a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
277a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef __arm__
278a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
279a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// This is the current standard kernel in gemmlowp, see:
280a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// https://github.com/google/gemmlowp/blob/b1e2a29ff866680028f3080efc244e10e8dd7f46/internal/kernel_neon.h#L33
281a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_32bit_GEMM_Uint8Operands_Uint32Accumulators {
282a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef std::uint8_t OperandType;
283a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef std::uint32_t AccumulatorType;
284a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef KernelFormat<
285a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 3>,
286a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 1> >
287a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      Format;
288a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
289a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                  AccumulatorType* accum_ptr, int depth) {
290a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    asm volatile(
291a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load 1 Rhs cell of size 2x4
292a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.8 {d0}, [%[rhs_ptr]]!\n"
293a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load 3 Lhs cells of size 4x2 each
294a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.8 {d2}, [%[lhs_ptr]]!\n"
295a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.8 {d4}, [%[lhs_ptr]]!\n"
296a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.8 {d6}, [%[lhs_ptr]]!\n"
297a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load accumulators
298a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov r0, %[accum_ptr]\n"
299a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d8, d9},   [r0]!\n"
300a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d16, d17}, [r0]!\n"
301a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d24, d25}, [r0]!\n"
302a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d10, d11}, [r0]!\n"
303a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d18, d19}, [r0]!\n"
304a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d26, d27}, [r0]!\n"
305a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d12, d13}, [r0]!\n"
306a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d20, d21}, [r0]!\n"
307a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d28, d29}, [r0]!\n"
308a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d14, d15}, [r0]!\n"
309a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d22, d23}, [r0]!\n"
310a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d30, d31}, [r0]!\n"
311a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
312a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "subs %[depth], #2\n"
313a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
314a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "beq " GEMMLOWP_LABEL_AFTER_LOOP "f\n"
315a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
316a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        GEMMLOWP_LABEL_LOOP
317a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        ":\n"
318a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Overview of register layout:
319a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
320a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // A 2x4 cell of Rhs is stored in 16bit in d0--d1 (q0).
321a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in d2--d7
322a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // (q1--q3).
323a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // A 12x4 block of accumulators is stored in 32bit in q4--q15.
324a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
325a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                   +-----+-----+-----+-----+
326a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                   |d0[0]|d0[1]|d0[2]|d0[3]|
327a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //              Rhs  +-----+-----+-----+-----+
328a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                   |d1[0]|d1[1]|d1[2]|d1[3]|
329a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                   +-----+-----+-----+-----+
330a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
331a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                   |     |     |     |     |
332a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
333a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //    Lhs            |     |     |     |     |
334a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
335a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  +--+--+ - - - -  +-----+-----+-----+-----+
336a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d2|d3|          | q4  | q5  | q6  | q7  |
337a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d2|d3|          | q4  | q5  | q6  | q7  |
338a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d2|d3|          | q4  | q5  | q6  | q7  |
339a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d2|d3|          | q4  | q5  | q6  | q7  |
340a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  +--+--+ - - - -  +-----+-----+-----+-----+
341a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d4|d5|          | q8  | q9  | q10 | q11 |
342a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d4|d5|          | q8  | q9  | q10 | q11 |
343a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d4|d5|          | q8  | q9  | q10 | q11 |
344a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d4|d5|          | q8  | q9  | q10 | q11 |
345a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  +--+--+ - - - -  +-----+-----+-----+-----+
346a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d6|d7|          | q12 | q13 | q14 | q15 |
347a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d6|d7|          | q12 | q13 | q14 | q15 |
348a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d6|d7|          | q12 | q13 | q14 | q15 |
349a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d6|d7|          | q12 | q13 | q14 | q15 |
350a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  +--+--+ - - - -  +-----+-----+-----+-----+
351a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
352a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                            Accumulator
353a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
        // Expand Lhs/Rhs cells to 16 bit.
        // Note: moving these vmovls further down to allow for
        // longer data pipelining helps a little on A57 but is
        // harmful on A53 --- It looks as if A53 doesn't like
        // interleaving vmovl's into the vmlal's.
359a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmovl.u8 q0, d0\n"
360a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmovl.u8 q1, d2\n"
361a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmovl.u8 q2, d4\n"
362a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmovl.u8 q3, d6\n"
363a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
364a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Multiply-accumulate, level of depth 0
365a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q4, d2, d0[0]\n"
366a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q5, d2, d0[1]\n"
367a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q6, d2, d0[2]\n"
368a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q7, d2, d0[3]\n"
369a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d2, [%[lhs_ptr]]\n"
370a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q8, d4, d0[0]\n"
371a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q9, d4, d0[1]\n"
372a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q10, d4, d0[2]\n"
373a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q11, d4, d0[3]\n"
374a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d4, [%[lhs_ptr], #8]\n"
375a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q12, d6, d0[0]\n"
376a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q13, d6, d0[1]\n"
377a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q14, d6, d0[2]\n"
378a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q15, d6, d0[3]\n"
379a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d6, [%[lhs_ptr], #16]\n"
380a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d0, [%[rhs_ptr]]\n"
381a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
382a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Multiply-accumulate, level of depth 1
383a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q4, d3, d1[0]\n"
384a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q5, d3, d1[1]\n"
385a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "add %[lhs_ptr], #24\n"
386a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q6, d3, d1[2]\n"
387a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q7, d3, d1[3]\n"
388a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "add %[rhs_ptr], #8\n"
389a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q8, d5, d1[0]\n"
390a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q9, d5, d1[1]\n"
391a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "subs %[depth], #2\n"
392a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q10, d5, d1[2]\n"
393a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q11, d5, d1[3]\n"
394a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q12, d7, d1[0]\n"
395a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q13, d7, d1[1]\n"
396a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q14, d7, d1[2]\n"
397a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q15, d7, d1[3]\n"
398a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
399a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "bne " GEMMLOWP_LABEL_LOOP "b\n"
400a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
401a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        GEMMLOWP_LABEL_AFTER_LOOP
402a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        ":\n"
403a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
404a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Expand Lhs/Rhs cells to 16 bit.
405a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmovl.u8 q0, d0\n"
406a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmovl.u8 q1, d2\n"
407a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmovl.u8 q2, d4\n"
408a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmovl.u8 q3, d6\n"
409a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
410a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Multiply-accumulate, level of depth 0
411a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q4, d2, d0[0]\n"
412a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q5, d2, d0[1]\n"
413a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q6, d2, d0[2]\n"
414a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q7, d2, d0[3]\n"
415a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q8, d4, d0[0]\n"
416a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q9, d4, d0[1]\n"
417a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q10, d4, d0[2]\n"
418a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q11, d4, d0[3]\n"
419a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q12, d6, d0[0]\n"
420a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q13, d6, d0[1]\n"
421a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q14, d6, d0[2]\n"
422a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q15, d6, d0[3]\n"
423a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
424a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Multiply-accumulate, level of depth 1
425a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q4, d3, d1[0]\n"
426a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q5, d3, d1[1]\n"
427a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q6, d3, d1[2]\n"
428a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q7, d3, d1[3]\n"
429a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q8, d5, d1[0]\n"
430a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q9, d5, d1[1]\n"
431a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q10, d5, d1[2]\n"
432a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q11, d5, d1[3]\n"
433a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q12, d7, d1[0]\n"
434a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q13, d7, d1[1]\n"
435a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q14, d7, d1[2]\n"
436a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.u16 q15, d7, d1[3]\n"
437a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
438a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Store accumulators
439a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov r0, %[accum_ptr]\n"
440a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d8, d9},   [r0]!\n"
441a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d16, d17}, [r0]!\n"
442a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d24, d25}, [r0]!\n"
443a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d10, d11}, [r0]!\n"
444a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d18, d19}, [r0]!\n"
445a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d26, d27}, [r0]!\n"
446a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d12, d13}, [r0]!\n"
447a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d20, d21}, [r0]!\n"
448a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d28, d29}, [r0]!\n"
449a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d14, d15}, [r0]!\n"
450a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d22, d23}, [r0]!\n"
451a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d30, d31}, [r0]!\n"
452a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // outputs
453a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
454a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [depth] "+r"(depth)
455a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // inputs
456a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [accum_ptr] "r"(accum_ptr)
457a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // clobbers
458a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
459a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17",
460a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
461a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d28", "d29", "d30", "d31");
462a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
463a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang};
464a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
// This is Maciek Chociej's fast kernel, which multiplies the 8-bit operands
// directly instead of first expanding them to 16 bit. It comes from
// gemmlowp/meta/. Search for
//      mul_3x8_3x8_int32_lhsadd_rhsadd
// in this file:
// https://raw.githubusercontent.com/google/gemmlowp/e4b9d858b6637d5d0058bfa3d869d2b95864251b/meta/single_thread_gemm.h
470a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_32bit_GEMM_Uint8Operands_Uint32Accumulators_noexpand {
471a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef std::uint8_t OperandType;
472a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef std::uint32_t AccumulatorType;
473a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef KernelFormat<
474a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<3, 8, CellOrder::WidthMajor>, 1>,
475a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<3, 8, CellOrder::WidthMajor>, 1> >
476a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      Format;
477a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
478a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                  AccumulatorType* accum_ptr, int depth) {
479a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    asm volatile(
480a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Clear aggregators.
481a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmov.i32 q0, #0\n"
482a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmov.i32 q1, #0\n"
483a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmov.i32 q2, #0\n"
484a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmov.i32 q3, q0\n"
485a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmov.i32 q4, q1\n"
486a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmov.i32 q5, q2\n"
487a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmov.i32 q6, q3\n"
488a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmov.i32 q7, q4\n"
489a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmov.i32 q8, q5\n"
490a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
491a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Loop head
492a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        GEMMLOWP_LABEL_LOOP
493a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        ":\n"
494a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
495a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Subtract counter.
496a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "subs %[depth], %[depth], #8\n"
497a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
498a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.8 {d18, d19, d20}, [%[rhs_ptr]]!\n"
499a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.8 {d21, d22, d23}, [%[lhs_ptr]]!\n"
500a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmull.u8 q12, d18, d21\n"
501a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmull.u8 q13, d18, d22\n"
502a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmull.u8 q14, d18, d23\n"
503a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmull.u8 q15, d19, d21\n"
504a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadal.u16 q0, q12\n"
505a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadal.u16 q1, q13\n"
506a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadal.u16 q2, q14\n"
507a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadal.u16 q3, q15\n"
508a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmull.u8 q12, d19, d22\n"
509a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmull.u8 q13, d19, d23\n"
510a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmull.u8 q14, d20, d21\n"
511a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmull.u8 q15, d20, d22\n"
512a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmull.u8 q9, d20, d23\n"
513a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadal.u16 q4, q12\n"
514a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadal.u16 q5, q13\n"
515a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadal.u16 q6, q14\n"
516a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadal.u16 q7, q15\n"
517a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadal.u16 q8, q9\n"
518a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
519a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Loop branch
520a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "bne " GEMMLOWP_LABEL_LOOP
521a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "b\n"
522a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
523a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Horizontal reduce aggregators, step 1
524a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadd.u32 d0, d0, d1\n"
525a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadd.u32 d2, d2, d3\n"
526a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadd.u32 d4, d4, d5\n"
527a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadd.u32 d6, d6, d7\n"
528a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadd.u32 d8, d8, d9\n"
529a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadd.u32 d10, d10, d11\n"
530a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadd.u32 d12, d12, d13\n"
531a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadd.u32 d14, d14, d15\n"
532a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadd.u32 d16, d16, d17\n"
533a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
534a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Horizontal reduce aggregators, step 2
535a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadd.u32 d0, d0, d2\n"
536a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadd.u32 d1, d4, d4\n"
537a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadd.u32 d6, d6, d8\n"
538a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadd.u32 d7, d10, d10\n"
539a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadd.u32 d12, d12, d14\n"
540a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadd.u32 d13, d16, d16\n"
541a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
542a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load accumulators
543a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov r0, %[accum_ptr]\n"
544a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d2}, [r0]!\n"
545a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d3[0]}, [r0]!\n"
546a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
547a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d8}, [r0]!\n"
548a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d9[0]}, [r0]!\n"
549a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
550a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d14}, [r0]!\n"
551a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d15[0]}, [r0]!\n"
552a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
553a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Accumulate
554a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vadd.s32 q0, q0, q1\n"
555a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vadd.s32 q3, q3, q4\n"
556a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vadd.s32 q6, q6, q7\n"
557a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
558a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Store accumulators
559a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov r0, %[accum_ptr]\n"
560a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d0}, [r0]!\n"
561a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d1[0]}, [r0]!\n"
562a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
563a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d6}, [r0]!\n"
564a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d7[0]}, [r0]!\n"
565a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
566a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d12}, [r0]!\n"
567a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d13[0]}, [r0]!\n"
568a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // outputs
569a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
570a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [depth] "+r"(depth)
571a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // inputs
572a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [accum_ptr] "r"(accum_ptr)
573a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // clobbers
574a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
575a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17",
576a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
577a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d28", "d29", "d30", "d31");
578a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
579a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang};
580a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
// Fast kernel operating on int8 operands.
// It is assumed that one of the two int8 operands only takes values
// in [-127, 127], while the other may freely range in [-128, 127].
// The issue with both operands taking the value -128 is that
// (-128)*(-128) + (-128)*(-128) == 32768, which overflows int16
// (maximum 32767) and would wrap to -32768 in 16-bit arithmetic.
// Every other expression a*b + c*d, for any int8 a,b,c,d, fits in int16
// range. That is the basic idea of this kernel.
588a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_32bit_GEMM_Int8Operands_AccumTwoWithin16Bits {
589a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef std::int8_t OperandType;
590a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef std::int32_t AccumulatorType;
591a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef KernelFormat<
592a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 16, CellOrder::WidthMajor>, 1>,
593a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<2, 16, CellOrder::WidthMajor>, 1> >
594a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      Format;
595a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
596a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                  AccumulatorType* accum_ptr, int depth) {
597a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    std::size_t start_depth = 123;
598a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    std::size_t run_depth = depth;
599a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    std::size_t dst_col_stride = 4;
600a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    AccumulatorType* dst_ptr = accum_ptr;
601a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    asm volatile(
602a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
603a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Overview of register layout:
604a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
605a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // A 2x16 block of Rhs is stored in 8 bit in d0--d3.
606a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // A 4x16 block of Lhs is stored in 8 bit in d4--d7. That is only
607a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // half of the register space required, so we loop over these registers
608a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // twice. Only half of it, a 2x16 block, is stored in d4--d7 at
609a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // any given time.
610a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
611a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // A 4x2 block of accumulators is stored in q8--q15 (as 4x32 bit
612a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // components which need to be horizontally-added at the end)
613a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
614a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // The Lhs vectors are multiplied by the Rhs vectors with a widening
615a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // multiply over the 8 first levels of depth, producing int16x8
616a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // vectors of products for each position in the accumulator matrix.
617a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Here comes the special trick: since the operands are signed int8,
618a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // their range being [ -2^7 , 2^7 ), their products are in range
619a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // [ -2^14 , 2^14 - 1 ), meaning that we can add two such values
620a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // without any risk of overflowing int16.
621a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // We thus proceed with the 8 next levels of depth, multiplying
622a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // again Lhs by Rhs, accumulating into this existing int16x8 vector.
623a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
624a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Only then, having processed 16 levels of depth, do we need to
625a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // horizontally add these int16x8 accumulators into the final
626a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // int32x4 accumulators.
627a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
628a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // As we do not have enough registers to store all 16 int16x8
629a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // temporary-16bit-accumulators, we have them cycle through q4--q7.
630a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
631a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
632a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Register layout (ignoring the q4--q7 temporary 16bit accumulators):
633a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
634a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                               +----+----+
635a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                               | d0 | d2 |
636a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                               | .  | .  |
637a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                               | .  | .  |
638a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                               | .  | .  |
639a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                       Rhs     +----+----+
640a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                               | d1 | d3 |
641a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                               | .  | .  |
642a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                               | .  | .  |
643a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                               | .  | .  |
644a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                               +----+----+
645a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
646a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                               |    |    |
647a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
648a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //    Lhs                        |    |    |
649a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
650a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  +--------+--------+ - - - -  +----+----+
651a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  | d4 ... | d5 ... |          | q8 | q9 |
652a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  | d6 ... | d7 ... |          | q10| q11|
653a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  | d4 ... | d5 ... |          | q12| q13|
654a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  | d6 ... | d7 ... |          | q14| q15|
655a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  +--------+--------+ - - - -  +----+----+
656a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
657a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                               Accumulator
658a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
659a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
660a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Clear accumulators, and, interleaved with it,
661a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // initial loads of the first loop iteration,
662a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // taken out of the loop so that in the loop itself we have
663a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // optimal streaming of data from memory.
664a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d0, [%[rhs_ptr], #0]\n"
665a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmov.i32 q8, #0\n"
666a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d4, [%[lhs_ptr], #0]\n"
667a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmov.i32 q9, #0\n"
668a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d2, [%[rhs_ptr], #16]\n"
669a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmov.i32 q10, q8\n"
670a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d6, [%[lhs_ptr], #16]\n"
671a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmov.i32 q11, q8\n"
672a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d1, [%[rhs_ptr], #8]\n"
673a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmov.i32 q12, q8\n"
674a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d5, [%[lhs_ptr], #8]\n"
675a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmov.i32 q13, q8\n"
676a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d3, [%[rhs_ptr], #24]\n"
677a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmov.i32 q14, q8\n"
678a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d7, [%[lhs_ptr], #24]\n"
679a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmov.i32 q15, q8\n"
680a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
681a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // General loop.
682a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        GEMMLOWP_LABEL_LOOP
683a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        ":\n"
684a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
685a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Multiply 8 first levels of depth.
686a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmull.s8    q4,  d0,  d4\n"
687a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "add %[rhs_ptr], %[rhs_ptr], #32\n"
688a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmull.s8    q5,  d2,  d4\n"
689a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d4, [%[lhs_ptr], #32]\n"
690a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmull.s8    q6,  d0,  d6\n"
691a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmull.s8    q7,  d2,  d6\n"
692a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d6, [%[lhs_ptr], #48]\n"
693a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
694a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Multiply-accumulate second-half, again into the same
695a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // 16bit local accumulator registers. This is where we
696a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // take advantage of having int8 instead of uint8 and therefore
697a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // being able to accumulate two products into int16.
698a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.s8    q4,  d1,  d5\n"
699a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.s8    q5,  d3,  d5\n"
700a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d5, [%[lhs_ptr], #40]\n"
701a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.s8    q6,  d1,  d7\n"
702a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.s8    q7,  d3,  d7\n"
703a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d7, [%[lhs_ptr], #56]\n"
704a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
705a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Add pairwise, accumulate into 32-bit accumulators.
706a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadal.s16   q8,  q4\n"
707a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "add %[lhs_ptr], %[lhs_ptr], #64\n"
708a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadal.s16   q9,  q5\n"
709a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "subs %[run_depth], %[run_depth], #16\n"
710a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadal.s16   q10, q6\n"
711a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadal.s16   q11, q7\n"
712a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
713a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "beq " GEMMLOWP_LABEL_AFTER_LOOP
714a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "f\n"
715a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
716a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Multiply first half.
717a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmull.s8    q4,  d0,  d4\n"
718a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmull.s8    q5,  d2,  d4\n"
719a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d4, [%[lhs_ptr], #0]\n"
720a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmull.s8    q6,  d0,  d6\n"
721a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d0, [%[rhs_ptr], #0]\n"
722a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmull.s8    q7,  d2,  d6\n"
723a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d2, [%[rhs_ptr], #16]\n"
724a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
725a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Multiply-accumulate second-half, again into the same
726a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // 16bit local accumulator registers. This is where we
727a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // take advantage of having int8 instead of uint8 and therefore
728a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // being able to accumulate two products into int16.
729a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.s8    q4,  d1,  d5\n"
730a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d6, [%[lhs_ptr], #16]\n"
731a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.s8    q5,  d3,  d5\n"
732a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d5, [%[lhs_ptr], #8]\n"
733a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.s8    q6,  d1,  d7\n"
734a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d1, [%[rhs_ptr], #8]\n"
735a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.s8    q7,  d3,  d7\n"
736a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d3, [%[rhs_ptr], #24]\n"
737a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
738a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Add pairwise, accumulate into 32-bit accumulators.
739a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadal.s16   q12, q4\n"
740a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d7, [%[lhs_ptr], #24]\n"
741a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadal.s16   q13, q5\n"
742a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadal.s16   q14, q6\n"
743a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadal.s16   q15, q7\n"
744a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
745a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "b " GEMMLOWP_LABEL_LOOP "b\n"
746a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
747a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        GEMMLOWP_LABEL_AFTER_LOOP
748a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        ":\n"
749a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
750a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Multiply first half.
751a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmull.s8    q4,  d0,  d4\n"
752a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmull.s8    q5,  d2,  d4\n"
753a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmull.s8    q6,  d0,  d6\n"
754a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmull.s8    q7,  d2,  d6\n"
755a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
756a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Multiply-accumulate second-half, again into the same
757a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // 16bit local accumulator registers. This is where we
758a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // take advantage of having int8 instead of uint8 and therefore
759a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // being able to accumulate two products into int16.
760a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.s8    q4,  d1,  d5\n"
761a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.s8    q5,  d3,  d5\n"
762a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.s8    q6,  d1,  d7\n"
763a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmlal.s8    q7,  d3,  d7\n"
764a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
765a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Add pairwise, accumulate into 32-bit accumulators.
766a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadal.s16   q12, q4\n"
767a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadal.s16   q13, q5\n"
768a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadal.s16   q14, q6\n"
769a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadal.s16   q15, q7\n"
770a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "cmp %[start_depth], #0\n"
771a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
772a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Reduce 32bit accumulators horizontally.
773a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadd.s32 d0, d16, d17\n"
774a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadd.s32 d1, d18, d19\n"
775a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadd.s32 d2, d20, d21\n"
776a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadd.s32 d3, d22, d23\n"
777a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadd.s32 d4, d24, d25\n"
778a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadd.s32 d5, d26, d27\n"
779a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadd.s32 d6, d28, d29\n"
780a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadd.s32 d7, d30, d31\n"
781a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
782a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "bne " GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES
783a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "f\n"
784a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
785a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Reduce 32bit accumulators horizontally, second pass
786a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // (each pass adds pairwise. we need to add 4-wise).
787a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadd.s32 d8, d0, d2\n"
788a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadd.s32 d9, d4, d6\n"
789a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadd.s32 d10, d1, d3\n"
790a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadd.s32 d11, d5, d7\n"
791a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
792a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "b " GEMMLOWP_LABEL_STORE "f\n"
793a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
794a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES
795a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        ":\n"
796a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
797a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Reduce 32bit accumulators horizontally, second pass
798a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // (each pass adds pairwise. we need to add 4-wise),
799a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // and load destination values from memory.
800a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov r0, %[dst_ptr]\n"
801a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d16, d17}, [r0]!\n"
802a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadd.s32 d8, d0, d2\n"
803a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadd.s32 d9, d4, d6\n"
804a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d18, d19}, [r0]\n"
805a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadd.s32 d10, d1, d3\n"
806a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vpadd.s32 d11, d5, d7\n"
807a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
808a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Add horizontally-reduced accumulators into
809a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // the values loaded from memory
810a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vadd.s32 q4, q8, q4\n"
811a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vadd.s32 q5, q9, q5\n"
812a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
813a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        GEMMLOWP_LABEL_STORE
814a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        ":\n"
815a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Store back into memory
816a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov r0, %[dst_ptr]\n"
817a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d8, d9}, [r0]!\n"
818a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d10, d11}, [r0]\n"
819a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // outputs
820a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
821a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [dst_ptr] "+r"(dst_ptr), [run_depth] "+r"(run_depth)
822a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // inputs
823a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [start_depth] "r"(start_depth)
824a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // clobbers
825a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
826a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17",
827a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
828a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d28", "d29", "d30", "d31");
829a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
830a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang};
831a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
832a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// We don't actually use int32*int32 in production. This is just an
833a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// experiment to help dissociate the effect of integer-vs-float, from the
834a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// effect of operands width.
//
// Run() updates a block of 48 int32 accumulators in place: it loads them
// from accum_ptr into twelve NEON q registers, adds LHS*RHS products for
// `depth` iterations, and stores the registers back to accum_ptr.
// Each iteration consumes 12 LHS values and 4 RHS values.
835a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_32bit_GEMM_Int32_WithScalar {
  // Element type read through lhs_ptr and rhs_ptr.
836a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef std::int32_t OperandType;
  // Element type read and written through accum_ptr.
837a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef std::int32_t AccumulatorType;
  // NOTE(review): KernelFormat/KernelSideFormat/CellFormat are declared
  // elsewhere; this reads as 3 LHS cells and 1 RHS cell, each 4 wide and
  // 1 deep, which matches the loads performed in Run() -- confirm against
  // the template definitions.
838a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef KernelFormat<
839a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
840a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 1> >
841a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      Format;
  // lhs_ptr:   LHS operands, 12 int32 values read per iteration.
  // rhs_ptr:   RHS operands, 4 int32 values read per iteration.
  // accum_ptr: 48 int32 accumulators, read at entry and overwritten at exit.
  // depth:     number of loop iterations. The loop tests its counter only
  //            after the body has run, so depth must be at least 1.
842a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
843a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                  AccumulatorType* accum_ptr, int depth) {
844a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    asm volatile(
845a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load accumulators
        // r0 is used as a scratch cursor so that accum_ptr, which is an
        // input-only operand, is never modified by the post-increments.
        // Consecutive 16-byte groups go to q4, q8, q12 (the registers that
        // accumulate against RHS value 0), then q5, q9, q13 (RHS value 1),
        // then q6, q10, q14 (RHS value 2), then q7, q11, q15 (RHS value 3).
846a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov r0, %[accum_ptr]\n"
847a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d8, d9},   [r0]!\n"
848a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d16, d17}, [r0]!\n"
849a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d24, d25}, [r0]!\n"
850a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d10, d11}, [r0]!\n"
851a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d18, d19}, [r0]!\n"
852a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d26, d27}, [r0]!\n"
853a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d12, d13}, [r0]!\n"
854a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d20, d21}, [r0]!\n"
855a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d28, d29}, [r0]!\n"
856a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d14, d15}, [r0]!\n"
857a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d22, d23}, [r0]!\n"
858a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d30, d31}, [r0]!\n"
859a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
        // Top of the depth loop; the "bne" below branches back here.
        // NOTE(review): GEMMLOWP_LABEL_LOOP is defined elsewhere; the "b"
        // suffix at the branch suggests an assembler local label -- confirm.
860a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        GEMMLOWP_LABEL_LOOP
861a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        ":\n"
862a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
863a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load 1 Rhs cell of size 1x4
        // (4 int32 values into d0/d1; rhs_ptr advances by 16 bytes).
864a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d0, d1}, [%[rhs_ptr]]!\n"
865a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
866a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load 3 Lhs cells of size 4x1 each
        // (into q1, q2 and q3; lhs_ptr advances by 48 bytes in total).
867a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d2, d3}, [%[lhs_ptr]]!\n"
868a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d4, d5}, [%[lhs_ptr]]!\n"
869a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d6, d7}, [%[lhs_ptr]]!\n"
870a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
871a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Multiply-accumulate
        // Each instruction adds (LHS vector * one RHS scalar lane) into one
        // accumulator: q1/q2/q3 are the three LHS cells and d0[0], d0[1],
        // d1[0], d1[1] are the four RHS values, giving 3 x 4 = 12 updates.
872a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.s32 q4, q1, d0[0]\n"
873a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.s32 q5, q1, d0[1]\n"
874a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.s32 q6, q1, d1[0]\n"
875a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.s32 q7, q1, d1[1]\n"
876a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.s32 q8, q2, d0[0]\n"
877a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.s32 q9, q2, d0[1]\n"
878a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.s32 q10, q2, d1[0]\n"
879a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.s32 q11, q2, d1[1]\n"
880a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.s32 q12, q3, d0[0]\n"
881a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.s32 q13, q3, d0[1]\n"
882a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.s32 q14, q3, d1[0]\n"
883a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.s32 q15, q3, d1[1]\n"
884a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
885a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Loop. Decrement loop index (depth) by 1, since we just handled 1
886a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // level of depth.
        // "subs" updates the condition flags (hence the "cc" clobber) and
        // "bne" repeats the body until the counter reaches zero.
887a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "subs %[depth], #1\n"
888a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "bne " GEMMLOWP_LABEL_LOOP
889a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "b\n"
890a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
891a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Store accumulators
        // Same register order as the load sequence above, so every value
        // returns to the address it was read from.
892a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov r0, %[accum_ptr]\n"
893a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d8, d9},   [r0]!\n"
894a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d16, d17}, [r0]!\n"
895a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d24, d25}, [r0]!\n"
896a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d10, d11}, [r0]!\n"
897a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d18, d19}, [r0]!\n"
898a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d26, d27}, [r0]!\n"
899a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d12, d13}, [r0]!\n"
900a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d20, d21}, [r0]!\n"
901a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d28, d29}, [r0]!\n"
902a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d14, d15}, [r0]!\n"
903a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d22, d23}, [r0]!\n"
904a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d30, d31}, [r0]!\n"
        // The operand pointers and the counter are read-write ("+r") because
        // the assembly post-increments / decrements them.
905a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // outputs
906a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
907a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [depth] "+r"(depth)
908a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // inputs
909a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [accum_ptr] "r"(accum_ptr)
        // Flags, memory (the accumulators are written through r0), the r0
        // scratch register, and the whole d0-d31 NEON register file.
910a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // clobbers
911a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
912a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17",
913a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
914a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d28", "d29", "d30", "d31");
915a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
916a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang};
917a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
918a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Not very efficient kernel, just an experiment to see what we can do
919a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// without using NEON multiply-with-scalar instructions.
//
// Run() updates a block of 48 float accumulators in place: it loads them
// from accum_ptr into twelve NEON q registers, adds LHS*RHS products for
// `depth` iterations, and stores the registers back to accum_ptr.
// Instead of multiplying by a scalar lane, each RHS value is loaded
// replicated across all lanes of q0 and used in a vector-by-vector vmla.
920a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_32bit_GEMM_Float32_MLA_WithVectorDuplicatingScalar {
  // Element type read through lhs_ptr and rhs_ptr.
921a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef float OperandType;
  // Element type read and written through accum_ptr.
922a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef float AccumulatorType;
  // NOTE(review): KernelFormat/KernelSideFormat/CellFormat are declared
  // elsewhere; this reads as 3 LHS cells and 1 RHS cell, each 4 wide and
  // 1 deep, which matches the loads performed in Run() -- confirm against
  // the template definitions.
923a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef KernelFormat<
924a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
925a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 1> >
926a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      Format;
  // lhs_ptr:   LHS operands, 12 floats read per iteration.
  // rhs_ptr:   RHS operands, 4 floats read per iteration (one at a time).
  // accum_ptr: 48 float accumulators, read at entry and overwritten at exit.
  // depth:     number of loop iterations. The loop tests its counter only
  //            after the body has run, so depth must be at least 1.
927a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
928a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                  AccumulatorType* accum_ptr, int depth) {
929a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    asm volatile(
930a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load accumulators
        // r0 is used as a scratch cursor so that accum_ptr, which is an
        // input-only operand, is never modified by the post-increments.
        // Consecutive 16-byte groups go to q4, q8, q12 (the registers that
        // accumulate against the 1st RHS value), then q5, q9, q13 (2nd),
        // then q6, q10, q14 (3rd), then q7, q11, q15 (4th).
931a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov r0, %[accum_ptr]\n"
932a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d8, d9},   [r0]!\n"
933a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d16, d17}, [r0]!\n"
934a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d24, d25}, [r0]!\n"
935a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d10, d11}, [r0]!\n"
936a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d18, d19}, [r0]!\n"
937a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d26, d27}, [r0]!\n"
938a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d12, d13}, [r0]!\n"
939a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d20, d21}, [r0]!\n"
940a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d28, d29}, [r0]!\n"
941a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d14, d15}, [r0]!\n"
942a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d22, d23}, [r0]!\n"
943a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d30, d31}, [r0]!\n"
944a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
        // Top of the depth loop; the "bne" below branches back here.
        // NOTE(review): GEMMLOWP_LABEL_LOOP is defined elsewhere; the "b"
        // suffix at the branch suggests an assembler local label -- confirm.
945a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        GEMMLOWP_LABEL_LOOP
946a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        ":\n"
947a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
948a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load 3 Lhs cells of size 4x1 each
        // (into q1, q2 and q3; lhs_ptr advances by 48 bytes in total).
949a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d2, d3}, [%[lhs_ptr]]!\n"
950a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d4, d5}, [%[lhs_ptr]]!\n"
951a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d6, d7}, [%[lhs_ptr]]!\n"
952a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
953a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Multiply-accumulate
        // Four groups, one per RHS value. Each group loads a single float
        // replicated into every lane of q0 (the "{d0[], d1[]}" form, which
        // advances rhs_ptr by one element) and then accumulates q0 times
        // each of the three LHS cells into that value's three accumulators.
954a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n"
955a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q4, q1, q0\n"
956a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q8, q2, q0\n"
957a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q12, q3, q0\n"
958a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n"
959a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q5, q1, q0\n"
960a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q9, q2, q0\n"
961a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q13, q3, q0\n"
962a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n"
963a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q6, q1, q0\n"
964a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q10, q2, q0\n"
965a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q14, q3, q0\n"
966a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n"
967a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q7, q1, q0\n"
968a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q11, q2, q0\n"
969a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q15, q3, q0\n"
970a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
971a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Loop. Decrement loop index (depth) by 1, since we just handled 1
972a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // level of depth.
        // "subs" updates the condition flags (hence the "cc" clobber) and
        // "bne" repeats the body until the counter reaches zero.
973a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "subs %[depth], #1\n"
974a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "bne " GEMMLOWP_LABEL_LOOP
975a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "b\n"
976a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
977a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Store accumulators
        // Same register order as the load sequence above, so every value
        // returns to the address it was read from.
978a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov r0, %[accum_ptr]\n"
979a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d8, d9},   [r0]!\n"
980a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d16, d17}, [r0]!\n"
981a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d24, d25}, [r0]!\n"
982a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d10, d11}, [r0]!\n"
983a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d18, d19}, [r0]!\n"
984a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d26, d27}, [r0]!\n"
985a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d12, d13}, [r0]!\n"
986a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d20, d21}, [r0]!\n"
987a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d28, d29}, [r0]!\n"
988a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d14, d15}, [r0]!\n"
989a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d22, d23}, [r0]!\n"
990a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d30, d31}, [r0]!\n"
        // The operand pointers and the counter are read-write ("+r") because
        // the assembly post-increments / decrements them.
991a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // outputs
992a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
993a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [depth] "+r"(depth)
994a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // inputs
995a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [accum_ptr] "r"(accum_ptr)
        // Flags, memory (the accumulators are written through r0), the r0
        // scratch register, and the whole d0-d31 NEON register file.
996a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // clobbers
997a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
998a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17",
999a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
1000a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d28", "d29", "d30", "d31");
1001a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
1002a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang};
1003a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1004a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Not very efficient kernel, just an experiment to see what we can do
1005a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// without using NEON multiply-with-scalar instructions.
1006a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// This variant is relevant as on ARMv7 FMA does not have a with-scalar variant.
//
// Run() updates a block of 48 float accumulators in place: it loads them
// from accum_ptr into twelve NEON q registers, adds LHS*RHS products for
// `depth` iterations using vfma.f32, and stores the registers back to
// accum_ptr. Each RHS value is loaded replicated across all lanes of q0 so
// that the vector-by-vector vfma form can be used.
1007a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_32bit_GEMM_Float32_FMA_WithVectorDuplicatingScalar {
  // Element type read through lhs_ptr and rhs_ptr.
1008a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef float OperandType;
  // Element type read and written through accum_ptr.
1009a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef float AccumulatorType;
  // NOTE(review): KernelFormat/KernelSideFormat/CellFormat are declared
  // elsewhere; this reads as 3 LHS cells and 1 RHS cell, each 4 wide and
  // 1 deep, which matches the loads performed in Run() -- confirm against
  // the template definitions.
1010a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef KernelFormat<
1011a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
1012a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 1> >
1013a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      Format;
  // lhs_ptr:   LHS operands, 12 floats read per iteration.
  // rhs_ptr:   RHS operands, 4 floats read per iteration (one at a time).
  // accum_ptr: 48 float accumulators, read at entry and overwritten at exit.
  // depth:     number of loop iterations. The loop tests its counter only
  //            after the body has run, so depth must be at least 1.
1014a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
1015a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                  AccumulatorType* accum_ptr, int depth) {
1016a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    asm volatile(
1017a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load accumulators
        // r0 is used as a scratch cursor so that accum_ptr, which is an
        // input-only operand, is never modified by the post-increments.
        // Consecutive 16-byte groups go to q4, q8, q12 (the registers that
        // accumulate against the 1st RHS value), then q5, q9, q13 (2nd),
        // then q6, q10, q14 (3rd), then q7, q11, q15 (4th).
1018a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov r0, %[accum_ptr]\n"
1019a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d8, d9},   [r0]!\n"
1020a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d16, d17}, [r0]!\n"
1021a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d24, d25}, [r0]!\n"
1022a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d10, d11}, [r0]!\n"
1023a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d18, d19}, [r0]!\n"
1024a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d26, d27}, [r0]!\n"
1025a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d12, d13}, [r0]!\n"
1026a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d20, d21}, [r0]!\n"
1027a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d28, d29}, [r0]!\n"
1028a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d14, d15}, [r0]!\n"
1029a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d22, d23}, [r0]!\n"
1030a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d30, d31}, [r0]!\n"
1031a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
        // Top of the depth loop; the "bne" below branches back here.
        // NOTE(review): GEMMLOWP_LABEL_LOOP is defined elsewhere; the "b"
        // suffix at the branch suggests an assembler local label -- confirm.
1032a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        GEMMLOWP_LABEL_LOOP
1033a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        ":\n"
1034a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1035a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load 3 Lhs cells of size 4x1 each
        // (into q1, q2 and q3; lhs_ptr advances by 48 bytes in total).
1036a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d2, d3}, [%[lhs_ptr]]!\n"
1037a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d4, d5}, [%[lhs_ptr]]!\n"
1038a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d6, d7}, [%[lhs_ptr]]!\n"
1039a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1040a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Multiply-accumulate
        // Four groups, one per RHS value. Each group loads a single float
        // replicated into every lane of q0 (the "{d0[], d1[]}" form, which
        // advances rhs_ptr by one element) and then fused-multiply-adds q0
        // times each of the three LHS cells into that value's accumulators.
1041a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n"
1042a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vfma.f32 q4, q1, q0\n"
1043a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vfma.f32 q8, q2, q0\n"
1044a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vfma.f32 q12, q3, q0\n"
1045a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n"
1046a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vfma.f32 q5, q1, q0\n"
1047a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vfma.f32 q9, q2, q0\n"
1048a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vfma.f32 q13, q3, q0\n"
1049a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n"
1050a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vfma.f32 q6, q1, q0\n"
1051a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vfma.f32 q10, q2, q0\n"
1052a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vfma.f32 q14, q3, q0\n"
1053a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n"
1054a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vfma.f32 q7, q1, q0\n"
1055a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vfma.f32 q11, q2, q0\n"
1056a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vfma.f32 q15, q3, q0\n"
1057a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1058a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Loop. Decrement loop index (depth) by 1, since we just handled 1
1059a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // level of depth.
        // "subs" updates the condition flags (hence the "cc" clobber) and
        // "bne" repeats the body until the counter reaches zero.
1060a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "subs %[depth], #1\n"
1061a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "bne " GEMMLOWP_LABEL_LOOP
1062a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "b\n"
1063a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1064a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Store accumulators
        // Same register order as the load sequence above, so every value
        // returns to the address it was read from.
1065a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov r0, %[accum_ptr]\n"
1066a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d8, d9},   [r0]!\n"
1067a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d16, d17}, [r0]!\n"
1068a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d24, d25}, [r0]!\n"
1069a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d10, d11}, [r0]!\n"
1070a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d18, d19}, [r0]!\n"
1071a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d26, d27}, [r0]!\n"
1072a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d12, d13}, [r0]!\n"
1073a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d20, d21}, [r0]!\n"
1074a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d28, d29}, [r0]!\n"
1075a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d14, d15}, [r0]!\n"
1076a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d22, d23}, [r0]!\n"
1077a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d30, d31}, [r0]!\n"
        // The operand pointers and the counter are read-write ("+r") because
        // the assembly post-increments / decrements them.
1078a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // outputs
1079a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
1080a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [depth] "+r"(depth)
1081a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // inputs
1082a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [accum_ptr] "r"(accum_ptr)
        // Flags, memory (the accumulators are written through r0), the r0
        // scratch register, and the whole d0-d31 NEON register file.
1083a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // clobbers
1084a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
1085a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17",
1086a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
1087a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d28", "d29", "d30", "d31");
1088a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
1089a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang};
1090a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1091a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// This is the "most natural" kernel, using NEON multiply-with-scalar
1092a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// instructions.
//
// Format: one side is made of 3 cells and the other of 1 cell, every cell
// being CellFormat<4, 1, CellOrder::DepthMajor>. The 3-cell side matches the
// three Lhs loads in the loop and the 1-cell side matches the single Rhs load.
//
// Run() loads 48 float accumulators (12 q registers) from accum_ptr, then for
// each loop iteration reads 4 Rhs floats and 12 Lhs floats and adds every
// Lhs-element * Rhs-element product to the matching accumulator, and finally
// stores the 48 accumulators back to accum_ptr.
//
// Accumulator memory order, as established by the load/store sequences:
//   q4, q8, q12,  q5, q9, q13,  q6, q10, q14,  q7, q11, q15
// i.e. four groups of 12 floats, one group per Rhs element (d0[0], d0[1],
// d1[0], d1[1]), each group holding the products for Lhs cells 1, 2 and 3.
//
// NOTE(review): the loop body executes before the counter is tested, so depth
// is presumably required to be >= 1 -- confirm against callers.
1093a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_32bit_GEMM_Float32_MLA_WithScalar {
1094a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef float OperandType;
1095a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef float AccumulatorType;
1096a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef KernelFormat<
1097a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
1098a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 1> >
1099a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      Format;
  // lhs_ptr / rhs_ptr: operand data, read sequentially (48 and 16 bytes per
  //   loop iteration respectively).
  // accum_ptr: 48 floats that are read, updated and written back in place.
  // depth: number of loop iterations to run.
1100a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
1101a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                  AccumulatorType* accum_ptr, int depth) {
1102a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    asm volatile(
1103a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load accumulators
        // r0 is a scratch copy of accum_ptr: the post-incrementing loads
        // advance r0, leaving the input-only %[accum_ptr] operand unchanged.
1104a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov r0, %[accum_ptr]\n"
1105a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d8, d9},   [r0]!\n"
1106a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d16, d17}, [r0]!\n"
1107a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d24, d25}, [r0]!\n"
1108a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d10, d11}, [r0]!\n"
1109a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d18, d19}, [r0]!\n"
1110a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d26, d27}, [r0]!\n"
1111a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d12, d13}, [r0]!\n"
1112a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d20, d21}, [r0]!\n"
1113a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d28, d29}, [r0]!\n"
1114a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d14, d15}, [r0]!\n"
1115a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d22, d23}, [r0]!\n"
1116a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d30, d31}, [r0]!\n"
1117a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
        // Loop head. GEMMLOWP_LABEL_LOOP is defined outside this excerpt;
        // presumably it expands to a numeric local label.
1118a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        GEMMLOWP_LABEL_LOOP
1119a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        ":\n"
1120a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1121a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load 1 Rhs cell of size 1x4
1122a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d0, d1}, [%[rhs_ptr]]!\n"
1123a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1124a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load 3 Lhs cells of size 4x1 each
1125a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d2, d3}, [%[lhs_ptr]]!\n"
1126a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d4, d5}, [%[lhs_ptr]]!\n"
1127a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d6, d7}, [%[lhs_ptr]]!\n"
1128a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1129a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Multiply-accumulate
        // Each instruction adds one Lhs cell (q1, q2 or q3) scaled by one
        // Rhs element (d0[0], d0[1], d1[0] or d1[1]) to its own accumulator.
1130a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q4, q1, d0[0]\n"
1131a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q5, q1, d0[1]\n"
1132a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q6, q1, d1[0]\n"
1133a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q7, q1, d1[1]\n"
1134a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q8, q2, d0[0]\n"
1135a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q9, q2, d0[1]\n"
1136a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q10, q2, d1[0]\n"
1137a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q11, q2, d1[1]\n"
1138a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q12, q3, d0[0]\n"
1139a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q13, q3, d0[1]\n"
1140a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q14, q3, d1[0]\n"
1141a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q15, q3, d1[1]\n"
1142a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1143a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Loop. Decrement loop index (depth) by 1, since we just handled 1
1144a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // level of depth.
        // The string literals below concatenate to "bne <label>b": a branch
        // back to the loop head, taken while the decremented depth is non-zero.
1145a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "subs %[depth], #1\n"
1146a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "bne " GEMMLOWP_LABEL_LOOP
1147a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "b\n"
1148a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1149a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Store accumulators
        // Same register order as the loads above, so every accumulator goes
        // back to the address it was read from.
1150a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov r0, %[accum_ptr]\n"
1151a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d8, d9},   [r0]!\n"
1152a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d16, d17}, [r0]!\n"
1153a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d24, d25}, [r0]!\n"
1154a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d10, d11}, [r0]!\n"
1155a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d18, d19}, [r0]!\n"
1156a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d26, d27}, [r0]!\n"
1157a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d12, d13}, [r0]!\n"
1158a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d20, d21}, [r0]!\n"
1159a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d28, d29}, [r0]!\n"
1160a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d14, d15}, [r0]!\n"
1161a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d22, d23}, [r0]!\n"
1162a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d30, d31}, [r0]!\n"
1163a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // outputs
        // Read-write ("+r") because the asm post-increments both operand
        // pointers and decrements depth.
1164a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
1165a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [depth] "+r"(depth)
1166a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // inputs
1167a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [accum_ptr] "r"(accum_ptr)
1168a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // clobbers
        // "cc": subs updates the condition flags. "memory": the asm reads and
        // writes through the pointers. r0 and d0-d31 are used as scratch,
        // operand and accumulator registers.
1169a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
1170a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17",
1171a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
1172a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d28", "d29", "d30", "d31");
1173a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
1174a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang};
1175a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1176a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Faster kernel contributed by ARM in 64bit form
1177a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// (see NEON_64bit_GEMM_Float32_WithScalar_A53) then ported to 32bit code.
1178a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Tuned for A53.
//
// Run() loads 48 float accumulators (q4--q15) from accum_ptr, then for each
// loop iteration consumes one 4-float Rhs cell and three 4-float Lhs cells
// with 12 multiply-by-scalar accumulations, and finally stores the
// accumulators back to accum_ptr in the same order they were loaded.
//
// The loop is software-pipelined: loads of operands needed later are
// interleaved with the vmla instructions consuming operands loaded earlier.
// Only the low 64-bit half of an operand q register is loaded with vldr; the
// high half is loaded into r2/r3 with two ldr and then moved into the d
// register with vmov.
//
// NOTE(review): every iteration, including the last one, loads the first Lhs
// cell and the Rhs cell of the following iteration, so the kernel reads 16
// bytes past the last Lhs cell and 16 bytes past the last Rhs cell that it
// actually multiplies. Those values are never used, but the memory has to be
// readable -- confirm that callers guarantee this.
// NOTE(review): the loop body runs before the counter is tested, so depth is
// presumably required to be >= 1 -- confirm against callers.
1179a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_32bit_GEMM_Float32_WithScalar_A53 {
1180a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef float OperandType;
1181a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef float AccumulatorType;
1182a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef KernelFormat<
1183a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
1184a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 1> >
1185a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      Format;
  // lhs_ptr / rhs_ptr: operand data; advanced by 48 and 16 bytes per loop
  //   iteration respectively.
  // accum_ptr: 48 floats that are read, updated and written back in place.
  // depth: number of loop iterations to run.
1186a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
1187a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                  AccumulatorType* accum_ptr, int depth) {
1188a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    asm volatile(
1189a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load accumulators
        // r0 is a scratch copy of accum_ptr: the post-incrementing loads
        // advance r0, leaving the input-only %[accum_ptr] operand unchanged.
1190a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov r0, %[accum_ptr]\n"
1191a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d8, d9},   [r0]!\n"
1192a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d16, d17}, [r0]!\n"
1193a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d24, d25}, [r0]!\n"
1194a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d10, d11}, [r0]!\n"
1195a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d18, d19}, [r0]!\n"
1196a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d26, d27}, [r0]!\n"
1197a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d12, d13}, [r0]!\n"
1198a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d20, d21}, [r0]!\n"
1199a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d28, d29}, [r0]!\n"
1200a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d14, d15}, [r0]!\n"
1201a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d22, d23}, [r0]!\n"
1202a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d30, d31}, [r0]!\n"
1203a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1204a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Overview of register layout:
1205a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
1206a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // A 1x4 cell of Rhs is stored in d0--d1 (q0).
1207a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // A 12x1 block of 3 4x1 cells Lhs is stored in d2--d7
1208a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // (q1--q3).
1209a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // A 12x4 block of accumulators is stored in q4--q15.
1210a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
1211a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                   +-----+-----+-----+-----+
1212a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //             Rhs   |d0[0]|d0[1]|d1[0]|d1[1]|
1213a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                   +-----+-----+-----+-----+
1214a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
1215a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                   |     |     |     |     |
1216a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
1217a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  Lhs              |     |     |     |     |
1218a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
1219a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  +--+- - - - - -  +-----+-----+-----+-----+
1220a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d2|             | q4  | q5  | q6  | q7  |
1221a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d2|             | q4  | q5  | q6  | q7  |
1222a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d3|             | q4  | q5  | q6  | q7  |
1223a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d3|             | q4  | q5  | q6  | q7  |
1224a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  +--+- - - - - -  +-----+-----+-----+-----+
1225a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d4|             | q8  | q9  | q10 | q11 |
1226a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d4|             | q8  | q9  | q10 | q11 |
1227a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d5|             | q8  | q9  | q10 | q11 |
1228a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d5|             | q8  | q9  | q10 | q11 |
1229a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  +--+ - - - - - - +-----+-----+-----+-----+
1230a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d6|             | q12 | q13 | q14 | q15 |
1231a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d6|             | q12 | q13 | q14 | q15 |
1232a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d7|             | q12 | q13 | q14 | q15 |
1233a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d7|             | q12 | q13 | q14 | q15 |
1234a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  +--+- - - - - -  +-----+-----+-----+-----+
1235a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
1236a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                            Accumulator
1237a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
        // Prologue: preload the operands the first loop iteration starts
        // with. Neither pointer is advanced here.
1238a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load Rhs cell
1239a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d0, [%[rhs_ptr]]\n"  // 1st half of Rhs cell
1240a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr r2, [%[rhs_ptr], #8]\n"  // 2nd half of Rhs cell, part 1
1241a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr r3, [%[rhs_ptr], #12]\n"  // 2nd half of Rhs cell, part 2
1242a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1243a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load 1st Lhs Cell
1244a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d2, d3}, [%[lhs_ptr]]\n"
1245a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
        // Loop head. GEMMLOWP_LABEL_LOOP is defined outside this excerpt;
        // presumably it expands to a numeric local label.
        // On entry to every iteration: d0 and q1 (d2, d3) hold the 1st half of
        // the Rhs cell and the 1st Lhs cell, and r2/r3 hold the 2nd half of
        // the Rhs cell, still to be moved into d1.
1246a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        GEMMLOWP_LABEL_LOOP
1247a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        ":\n"
1248a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1249a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d4, [%[lhs_ptr], #16]\n"  // Load 1st half of 2nd Lhs cell
1250a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmov d1, r2, r3\n"             // Prepare 2nd half of Rhs cell
1251a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q4, q1, d0[0]\n"      // Multiply 1st Lhs cell with column 0
1252a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr r2, [%[lhs_ptr], #24]\n"   // Load 2nd half of 2nd Lhs cell, part 1
1253a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q5, q1, d0[1]\n"      // Multiply 1st Lhs cell with column 1
1254a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr r3, [%[lhs_ptr], #28]\n"   // Load 2nd half of 2nd Lhs cell, part 2
1255a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q6, q1, d1[0]\n"      // Multiply 1st Lhs cell with column 2
1256a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "subs %[depth], #1\n"  // Decrement depth; flags are tested by the bne below
1257a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1258a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d6, [%[lhs_ptr], #32]\n"  // Load 1st half of 3rd Lhs cell
1259a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmov d5, r2, r3\n"             // Prepare 2nd half of 2nd Lhs cell
1260a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q7, q1, d1[1]\n"      // Multiply 1st Lhs cell with column 3
1261a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr r2, [%[lhs_ptr], #40]\n"   // Load 2nd half of 3rd Lhs cell, part 1
1262a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q8, q2, d0[0]\n"      // Multiply 2nd Lhs cell with column 0
1263a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr r3, [%[lhs_ptr], #44]\n"   // Load 2nd half of 3rd Lhs cell, part 2
1264a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q9, q2, d0[1]\n"      // Multiply 2nd Lhs cell with column 1
1265a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "add %[rhs_ptr], %[rhs_ptr], #16\n"  // Move forward by 1 Rhs cell
1266a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
        // q1 was last read by the q7 multiply above, so d2/d3 can now be
        // overwritten with the 1st Lhs cell of the next iteration.
1267a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d2, [%[lhs_ptr], #48]\n"  // Load 1st half of 1st Lhs cell of next
1268a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // iteration
1269a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmov d7, r2, r3\n"            // Prepare 2nd half of 3rd Lhs cell
1270a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q10, q2, d1[0]\n"    // Multiply 2nd Lhs cell with column 2
1271a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr r2, [%[lhs_ptr], #56]\n"  // Load 2nd half of 1st Lhs cell of next
1272a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // iter, part 1
1273a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q12, q3, d0[0]\n"    // Multiply 3rd Lhs cell with column 0
1274a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr r3, [%[lhs_ptr], #60]\n"  // Load 2nd half of 1st Lhs cell of next
1275a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // iter, part 2
1276a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q13, q3, d0[1]\n"  // Multiply 3rd Lhs cell with column 1
1277a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "add %[lhs_ptr], %[lhs_ptr], #48\n"  // Move forward by 3 Lhs cells
1278a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
        // d0 was last read by the q13 multiply above; rhs_ptr already points
        // at the next Rhs cell. d1 is still needed by the multiplies below,
        // so its replacement is only staged in r2/r3.
1279a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d0, [%[rhs_ptr]]\n"  // Load 1st half of Rhs cell of next
1280a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // iteration
1281a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmov d3, r2, r3\n"  // Prepare 2nd half of 1st Lhs cell of next
1282a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // iteration
1283a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q11, q2, d1[1]\n"   // Multiply 2nd Lhs cell with column 3
1284a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr r2, [%[rhs_ptr], #8]\n"  // Load 2nd half of Rhs cell of next
1285a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // iteration, part 1
1286a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q14, q3, d1[0]\n"    // Multiply 3rd Lhs cell with column 2
1287a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr r3, [%[rhs_ptr], #12]\n"  // Load 2nd half of Rhs cell of next
1288a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // iteration, part 2
1289a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q15, q3, d1[1]\n"  // Multiply 3rd Lhs cell with column 3
1290a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1291a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Loop branch.  This will dual issue in fmla cycle 3 of the 4th block.
        // The string literals concatenate to "bne <label>b". The flags tested
        // are those set by the subs earlier in the loop body; the instructions
        // in between are all written without a flag-setting suffix.
1292a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "bne " GEMMLOWP_LABEL_LOOP
1293a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "b\n"
1294a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1295a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Store accumulators
        // Same register order as the loads at the top, so every accumulator
        // goes back to the address it was read from.
1296a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov r0, %[accum_ptr]\n"
1297a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d8, d9},   [r0]!\n"
1298a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d16, d17}, [r0]!\n"
1299a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d24, d25}, [r0]!\n"
1300a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d10, d11}, [r0]!\n"
1301a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d18, d19}, [r0]!\n"
1302a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d26, d27}, [r0]!\n"
1303a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d12, d13}, [r0]!\n"
1304a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d20, d21}, [r0]!\n"
1305a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d28, d29}, [r0]!\n"
1306a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d14, d15}, [r0]!\n"
1307a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d22, d23}, [r0]!\n"
1308a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d30, d31}, [r0]!\n"
1309a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // outputs
        // Read-write ("+r") because the asm advances both operand pointers
        // and decrements depth.
1310a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
1311a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [depth] "+r"(depth)
1312a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // inputs
1313a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [accum_ptr] "r"(accum_ptr)
1314a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // clobbers
        // "cc": subs updates the condition flags. "memory": the asm reads and
        // writes through the pointers. r0 is the accumulator cursor, r2/r3
        // stage the 64-bit halves moved by vmov, and d0-d31 hold operands and
        // accumulators.
1315a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "cc", "memory", "r0", "r2", "r3", "d0", "d1", "d2", "d3", "d4", "d5",
1316a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16",
1317a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26",
1318a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d27", "d28", "d29", "d30", "d31");
1319a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
1320a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang};
1321a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1322a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_32bit_GEMM_Float32_WithScalar_A53_depth2 {
1323a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef float OperandType;
1324a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef float AccumulatorType;
1325a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef KernelFormat<
1326a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 3>,
1327a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 1> >
1328a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      Format;
1329a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
1330a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                  AccumulatorType* accum_ptr, int depth) {
1331a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    asm volatile(
1332a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load accumulators
1333a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov r0, %[accum_ptr]\n"
1334a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d8, d9},   [r0]!\n"
1335a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d16, d17}, [r0]!\n"
1336a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d24, d25}, [r0]!\n"
1337a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d10, d11}, [r0]!\n"
1338a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d18, d19}, [r0]!\n"
1339a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d26, d27}, [r0]!\n"
1340a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d12, d13}, [r0]!\n"
1341a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d20, d21}, [r0]!\n"
1342a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d28, d29}, [r0]!\n"
1343a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d14, d15}, [r0]!\n"
1344a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d22, d23}, [r0]!\n"
1345a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d30, d31}, [r0]!\n"
1346a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1347a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Overview of register layout:
1348a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
1349a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // A 1x4 cell of Rhs is stored in d0--d1 (q0).
1350a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // A 12x1 block of 3 4x1 cells Lhs is stored in d2--d7
1351a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // (q1--q3).
1352a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // A 12x4 block of accumulators is stored in q4--q15.
1353a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
1354a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                   +-----+-----+-----+-----+
1355a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //             Rhs   |d0[0]|d0[1]|d1[0]|d1[1]|
1356a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                   +-----+-----+-----+-----+
1357a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
1358a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                   |     |     |     |     |
1359a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
1360a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  Lhs              |     |     |     |     |
1361a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
1362a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  +--+- - - - - -  +-----+-----+-----+-----+
1363a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d2|             | q4  | q5  | q6  | q7  |
1364a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d2|             | q4  | q5  | q6  | q7  |
1365a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d3|             | q4  | q5  | q6  | q7  |
1366a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d3|             | q4  | q5  | q6  | q7  |
1367a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  +--+- - - - - -  +-----+-----+-----+-----+
1368a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d4|             | q8  | q9  | q10 | q11 |
1369a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d4|             | q8  | q9  | q10 | q11 |
1370a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d5|             | q8  | q9  | q10 | q11 |
1371a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d5|             | q8  | q9  | q10 | q11 |
1372a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  +--+ - - - - - - +-----+-----+-----+-----+
1373a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d6|             | q12 | q13 | q14 | q15 |
1374a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d6|             | q12 | q13 | q14 | q15 |
1375a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d7|             | q12 | q13 | q14 | q15 |
1376a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |d7|             | q12 | q13 | q14 | q15 |
1377a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  +--+- - - - - -  +-----+-----+-----+-----+
1378a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
1379a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                            Accumulator
1380a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1381a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load Rhs cell
1382a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d0, [%[rhs_ptr]]\n"
1383a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr r2, [%[rhs_ptr], #8]\n"
1384a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr r3, [%[rhs_ptr], #12]\n"
1385a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1386a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load 1st Lhs Cell
1387a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d2, d3}, [%[lhs_ptr]]\n"
1388a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1389a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Loop head - handling 2 levels of depth at once
1390a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        GEMMLOWP_LABEL_LOOP
1391a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        ":\n"
1392a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1393a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Level of depth 1
1394a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1395a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d4, [%[lhs_ptr], #32]\n"  // Load 1st half of 2nd Lhs cell
1396a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmov d1, r2, r3\n"             // Prepare 2nd half of Rhs cell
1397a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q4, q1, d0[0]\n"      // Multiply 1st Lhs cell with column 0
1398a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr r2, [%[lhs_ptr], #40]\n"   // Load 2nd half of 2nd Lhs cell, part 1
1399a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q5, q1, d0[1]\n"      // Multiply 1st Lhs cell with column 1
1400a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr r3, [%[lhs_ptr], #44]\n"   // Load 2nd half of 2nd Lhs cell, part 2
1401a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q6, q1, d1[0]\n"      // Multiply 1st Lhs cell with column 2
1402a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1403a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d6, [%[lhs_ptr], #64]\n"  // Load 1st half of 3rd Lhs cell
1404a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmov d5, r2, r3\n"             // Prepare 2nd half of 2nd Lhs cell
1405a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q7, q1, d1[1]\n"      // Multiply 1st Lhs cell with column 3
1406a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr r2, [%[lhs_ptr], #72]\n"   // Load 2nd half of 3rd Lhs cell, part 1
1407a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q8, q2, d0[0]\n"      // Multiply 2nd Lhs cell with column 0
1408a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr r3, [%[lhs_ptr], #76]\n"   // Load 2nd half of 3rd Lhs cell, part 2
1409a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q9, q2, d0[1]\n"      // Multiply 2nd Lhs cell with column 1
1410a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1411a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d2, [%[lhs_ptr], #16]\n"  // Load 1st half of 1st Lhs cell of next
1412a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // iteration
1413a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmov d7, r2, r3\n"            // Prepare 2nd half of 3rd Lhs cell
1414a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q10, q2, d1[0]\n"    // Multiply 2nd Lhs cell with column 2
1415a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr r2, [%[lhs_ptr], #24]\n"  // Load 2nd half of 1st Lhs cell of next
1416a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // iter, part 1
1417a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q12, q3, d0[0]\n"    // Multiply 3rd Lhs cell with column 0
1418a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr r3, [%[lhs_ptr], #28]\n"  // Load 2nd half of 1st Lhs cell of next
1419a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // iter, part 2
1420a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q13, q3, d0[1]\n"  // Multiply 3rd Lhs cell with column 1
1421a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1422a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d0, [%[rhs_ptr], #16]\n"  // Load 1st half of Rhs cell of next
1423a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // iteration
1424a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmov d3, r2, r3\n"  // Prepare 2nd half of 1st Lhs cell of next
1425a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // iteration
1426a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q11, q2, d1[1]\n"    // Multiply 2nd Lhs cell with column 3
1427a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr r2, [%[rhs_ptr], #24]\n"  // Load 2nd half of Rhs cell of next
1428a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // iteration, part 1
1429a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q14, q3, d1[0]\n"    // Multiply 3rd Lhs cell with column 2
1430a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr r3, [%[rhs_ptr], #28]\n"  // Load 2nd half of Rhs cell of next
1431a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // iteration, part 2
1432a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q15, q3, d1[1]\n"  // Multiply 3rd Lhs cell with column 3
1433a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1434a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Level of depth 2
1435a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d4, [%[lhs_ptr], #48]\n"  // Load 1st half of 2nd Lhs cell
1436a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmov d1, r2, r3\n"             // Prepare 2nd half of Rhs cell
1437a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q4, q1, d0[0]\n"      // Multiply 1st Lhs cell with column 0
1438a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr r2, [%[lhs_ptr], #56]\n"   // Load 2nd half of 2nd Lhs cell, part 1
1439a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q5, q1, d0[1]\n"      // Multiply 1st Lhs cell with column 1
1440a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr r3, [%[lhs_ptr], #60]\n"   // Load 2nd half of 2nd Lhs cell, part 2
1441a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q6, q1, d1[0]\n"      // Multiply 1st Lhs cell with column 2
1442a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "subs %[depth], #2\n"           // Decrement depth counter
1443a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1444a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d6, [%[lhs_ptr], #80]\n"  // Load 1st half of 3rd Lhs cell
1445a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmov d5, r2, r3\n"             // Prepare 2nd half of 2nd Lhs cell
1446a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q7, q1, d1[1]\n"      // Multiply 1st Lhs cell with column 3
1447a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr r2, [%[lhs_ptr], #88]\n"   // Load 2nd half of 3rd Lhs cell, part 1
1448a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q8, q2, d0[0]\n"      // Multiply 2nd Lhs cell with column 0
1449a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr r3, [%[lhs_ptr], #92]\n"   // Load 2nd half of 3rd Lhs cell, part 2
1450a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q9, q2, d0[1]\n"      // Multiply 2nd Lhs cell with column 1
1451a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "add %[rhs_ptr], %[rhs_ptr], #32\n"  // Move forward by 1 Rhs cell
1452a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1453a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d2, [%[lhs_ptr], #96]\n"  // Load 1st half of 1st Lhs cell of next
1454a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // iteration
1455a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmov d7, r2, r3\n"             // Prepare 2nd half of 3rd Lhs cell
1456a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q10, q2, d1[0]\n"     // Multiply 2nd Lhs cell with column 2
1457a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr r2, [%[lhs_ptr], #104]\n"  // Load 2nd half of 1st Lhs cell of next
1458a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // iter, part 1
1459a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q12, q3, d0[0]\n"     // Multiply 3rd Lhs cell with column 0
1460a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr r3, [%[lhs_ptr], #108]\n"  // Load 2nd half of 1st Lhs cell of next
1461a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // iter, part 2
1462a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q13, q3, d0[1]\n"  // Multiply 3rd Lhs cell with column 1
1463a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "add %[lhs_ptr], %[lhs_ptr], #96\n"  // Move forward by 3 Lhs cells
1464a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1465a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vldr d0, [%[rhs_ptr]]\n"  // Load 1st half of Rhs cell of next
1466a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // iteration
1467a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmov d3, r2, r3\n"  // Prepare 2nd half of 1st Lhs cell of next
1468a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // iteration
1469a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q11, q2, d1[1]\n"   // Multiply 2nd Lhs cell with column 3
1470a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr r2, [%[rhs_ptr], #8]\n"  // Load 2nd half of Rhs cell of next
1471a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // iteration, part 1
1472a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q14, q3, d1[0]\n"    // Multiply 3rd Lhs cell with column 2
1473a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr r3, [%[rhs_ptr], #12]\n"  // Load 2nd half of Rhs cell of next
1474a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // iteration, part 2
1475a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q15, q3, d1[1]\n"  // Multiply 3rd Lhs cell with column 3
1476a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1477a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Loop branch.  This will dual issue in fmla cycle 3 of the 4th block.
1478a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //"bne loop_%=\n"
1479a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "bne " GEMMLOWP_LABEL_LOOP
1480a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "b\n"
1481a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1482a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Store accumulators
1483a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov r0, %[accum_ptr]\n"
1484a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d8, d9},   [r0]!\n"
1485a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d16, d17}, [r0]!\n"
1486a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d24, d25}, [r0]!\n"
1487a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d10, d11}, [r0]!\n"
1488a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d18, d19}, [r0]!\n"
1489a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d26, d27}, [r0]!\n"
1490a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d12, d13}, [r0]!\n"
1491a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d20, d21}, [r0]!\n"
1492a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d28, d29}, [r0]!\n"
1493a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d14, d15}, [r0]!\n"
1494a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d22, d23}, [r0]!\n"
1495a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d30, d31}, [r0]!\n"
1496a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // outputs
1497a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
1498a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [depth] "+r"(depth)
1499a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // inputs
1500a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [accum_ptr] "r"(accum_ptr)
1501a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // clobbers
1502a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "cc", "memory", "r0", "r2", "r3", "d0", "d1", "d2", "d3", "d4", "d5",
1503a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16",
1504a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26",
1505a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d27", "d28", "d29", "d30", "d31");
1506a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
1507a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang};
1508a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1509a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// This rotating variant performs well when permutations (vext) can be
1510a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// dual-issued with arithmetic instructions.
//
// Operands and accumulators are both float. Run() loads 12 q-registers of
// accumulators (12 x 4 floats) from accum_ptr, adds one set of products per
// level of depth (3 Lhs cells times 1 Rhs cell), and stores the 12 registers
// back to accum_ptr.
//
// Every vmla in the loop is a plain vector-by-vector multiply-accumulate (no
// by-scalar form). Instead, the Rhs vector q0 is rotated by one lane between
// groups of multiplies, so the accumulators are kept in a rotated layout
// while the loop runs. The layout is entered before the loop and undone after
// it by NEON_32BIT_ROTATING_FLOAT_KERNEL_ROTATE_ACCUMULATOR_CELLS.
1511a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_32bit_GEMM_Float32_MLA_Rotating {
1512a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef float OperandType;
1513a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef float AccumulatorType;
  // Lhs side: 3 cells; Rhs side: 1 cell (see the "Load ..." comments in Run
  // for the cell sizes the assembly actually consumes).
1514a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef KernelFormat<
1515a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
1516a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 1> >
1517a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      Format;
  // lhs_ptr:   read 48 bytes per level of depth (three post-incremented
  //            16-byte loads); only the local pointer copy is advanced.
  // rhs_ptr:   read 16 bytes per level of depth (one post-incremented load).
  // accum_ptr: 48 floats, read at entry and overwritten at exit.
  // depth:     loop counter, decremented by 1 per iteration. It is tested
  //            only after the loop body, so the body always runs at least
  //            once -- callers presumably guarantee depth >= 1 (TODO confirm).
1518a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
1519a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                  AccumulatorType* accum_ptr, int depth) {
1520a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    asm volatile(
1521a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load accumulators
        // r0 is a scratch copy of accum_ptr because the loads post-increment
        // their address register. Registers are filled in the order
        // q4, q8, q12, q5, q9, q13, q6, q10, q14, q7, q11, q15.
1522a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov r0, %[accum_ptr]\n"
1523a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d8, d9},   [r0]!\n"
1524a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d16, d17}, [r0]!\n"
1525a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d24, d25}, [r0]!\n"
1526a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d10, d11}, [r0]!\n"
1527a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d18, d19}, [r0]!\n"
1528a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d26, d27}, [r0]!\n"
1529a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d12, d13}, [r0]!\n"
1530a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d20, d21}, [r0]!\n"
1531a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d28, d29}, [r0]!\n"
1532a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d14, d15}, [r0]!\n"
1533a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d22, d23}, [r0]!\n"
1534a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d30, d31}, [r0]!\n"
1535a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
        // Helper macro: in-place transpose of each of the three 4x4
        // accumulator cells (q4-q7, q8-q11, q12-q15), built from vtrn.32 on
        // register pairs followed by vswp on d-register halves.
1536a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#define NEON_32BIT_ROTATING_FLOAT_KERNEL_TRANSPOSE_ACCUMULATOR_CELLS \
1537a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  "vtrn.32 q4, q5\n"                                                 \
1538a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  "vtrn.32 q6, q7\n"                                                 \
1539a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  "vswp d9, d12\n"                                                   \
1540a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  "vswp d11, d14\n"                                                  \
1541a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  "vtrn.32 q8, q9\n"                                                 \
1542a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  "vtrn.32 q10, q11\n"                                               \
1543a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  "vswp d17, d20\n"                                                  \
1544a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  "vswp d19, d22\n"                                                  \
1545a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  "vtrn.32 q12, q13\n"                                               \
1546a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  "vtrn.32 q14, q15\n"                                               \
1547a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  "vswp d25, d28\n"                                                  \
1548a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  "vswp d27, d30\n"
1549a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
        // Helper macro: transpose every cell, rotate the 2nd, 3rd and 4th
        // register of each cell by a, b and c lanes respectively (vext.32 of
        // a register with itself), then transpose back. It is invoked with
        // (1, 2, 3) before the loop and (3, 2, 1) after it; the lane counts of
        // the two invocations add up to 4 for every register, so the second
        // invocation undoes the first.
        // NOTE: both macros are #defined inside this asm statement and no
        // #undef is visible here; NEON_32bit_GEMM_Float32_FMA_Rotating below
        // reuses them.
1550a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#define NEON_32BIT_ROTATING_FLOAT_KERNEL_ROTATE_ACCUMULATOR_CELLS(a, b, c) \
1551a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  NEON_32BIT_ROTATING_FLOAT_KERNEL_TRANSPOSE_ACCUMULATOR_CELLS             \
1552a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  "vext.32 q5, q5, q5, #" #a                                               \
1553a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  "\n"                                                                     \
1554a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  "vext.32 q6, q6, q6, #" #b                                               \
1555a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  "\n"                                                                     \
1556a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  "vext.32 q7, q7, q7, #" #c                                               \
1557a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  "\n"                                                                     \
1558a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  "vext.32 q9, q9, q9, #" #a                                               \
1559a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  "\n"                                                                     \
1560a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  "vext.32 q10, q10, q10, #" #b                                            \
1561a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  "\n"                                                                     \
1562a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  "vext.32 q11, q11, q11, #" #c                                            \
1563a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  "\n"                                                                     \
1564a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  "vext.32 q13, q13, q13, #" #a                                            \
1565a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  "\n"                                                                     \
1566a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  "vext.32 q14, q14, q14, #" #b                                            \
1567a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  "\n"                                                                     \
1568a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  "vext.32 q15, q15, q15, #" #c                                            \
1569a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  "\n" NEON_32BIT_ROTATING_FLOAT_KERNEL_TRANSPOSE_ACCUMULATOR_CELLS
1570a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
        // Bring the freshly loaded accumulators into the rotated layout that
        // the loop body accumulates into.
1571a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        NEON_32BIT_ROTATING_FLOAT_KERNEL_ROTATE_ACCUMULATOR_CELLS(1, 2, 3)
1572a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
        // Loop head. GEMMLOWP_LABEL_LOOP is defined outside this chunk;
        // presumably it is a numeric local label, given the "b" (backward)
        // suffix appended at the branch below -- confirm at its definition.
1573a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //"loop_%=:\n"
1574a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        GEMMLOWP_LABEL_LOOP
1575a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        ":\n"
1576a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1577a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load 1 Rhs cell of size 1x4
1578a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d0, d1}, [%[rhs_ptr]]!\n"
1579a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1580a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load 3 Lhs cells of size 4x1 each
1581a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d2, d3}, [%[lhs_ptr]]!\n"
1582a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d4, d5}, [%[lhs_ptr]]!\n"
1583a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d6, d7}, [%[lhs_ptr]]!\n"
1584a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1585a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Multiply-accumulate
        // Each group of three vmla's multiplies the three Lhs vectors (q1,
        // q2, q3) lane-by-lane with the current Rhs vector q0. q0 is rotated
        // by one lane before the next group, so after the four groups every
        // Lhs lane has been multiplied with every Rhs lane exactly once.
1586a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q4, q1, q0\n"
1587a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q8, q2, q0\n"
1588a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q12, q3, q0\n"
1589a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vext.f32 q0, q0, q0, #1\n"
1590a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q5, q1, q0\n"
1591a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q9, q2, q0\n"
1592a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q13, q3, q0\n"
1593a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vext.f32 q0, q0, q0, #1\n"
1594a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q6, q1, q0\n"
1595a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q10, q2, q0\n"
1596a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q14, q3, q0\n"
1597a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vext.f32 q0, q0, q0, #1\n"
1598a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q7, q1, q0\n"
1599a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q11, q2, q0\n"
1600a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vmla.f32 q15, q3, q0\n"
1601a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1602a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Loop. Decrement loop index (depth) by 1, since we just handled 1
1603a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // level of depth.
        // subs sets the flags that bne tests; the loop exits when depth
        // reaches exactly zero.
1604a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "subs %[depth], #1\n"
1605a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //"bne loop_%=\n"
1606a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "bne " GEMMLOWP_LABEL_LOOP
1607a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "b\n"
1608a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1609a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Store accumulators
        // r0 is reloaded from accum_ptr because the initial loads advanced
        // it past the accumulator block.
1610a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov r0, %[accum_ptr]\n"
1611a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
        // Undo the rotation applied before the loop so the registers are back
        // in the layout they were loaded in.
1612a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        NEON_32BIT_ROTATING_FLOAT_KERNEL_ROTATE_ACCUMULATOR_CELLS(3, 2, 1)
1613a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
            // Stores use the same register order as the loads at the top.
1614a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            "vst1.32 {d8, d9},   [r0]!\n"
1615a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            "vst1.32 {d16, d17}, [r0]!\n"
1616a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            "vst1.32 {d24, d25}, [r0]!\n"
1617a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            "vst1.32 {d10, d11}, [r0]!\n"
1618a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            "vst1.32 {d18, d19}, [r0]!\n"
1619a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            "vst1.32 {d26, d27}, [r0]!\n"
1620a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            "vst1.32 {d12, d13}, [r0]!\n"
1621a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            "vst1.32 {d20, d21}, [r0]!\n"
1622a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            "vst1.32 {d28, d29}, [r0]!\n"
1623a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            "vst1.32 {d14, d15}, [r0]!\n"
1624a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            "vst1.32 {d22, d23}, [r0]!\n"
1625a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            "vst1.32 {d30, d31}, [r0]!\n"
        // lhs_ptr, rhs_ptr and depth are read-write operands because the asm
        // modifies them in place.
1626a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // outputs
1627a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
1628a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [depth] "+r"(depth)
1629a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // inputs
1630a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [accum_ptr] "r"(accum_ptr)
        // "memory" covers the accumulator stores; r0 is the scratch address
        // register; d0-d31 covers every q register touched above.
1631a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // clobbers
1632a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
1633a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17",
1634a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
1635a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d28", "d29", "d30", "d31");
1636a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
1637a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang};
1638a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1639a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// This rotating variant performs well when permutations (vext) can be
1640a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// dual-issued with arithmetic instructions. It is relevant as the rotating
1641a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// approach removes the need for multiply-with-scalar instructions, and ARMv7
1642a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// FMA does not have a with-scalar variant.
//
// Same structure as NEON_32bit_GEMM_Float32_MLA_Rotating above, with vfma.f32
// in place of vmla.f32: load 12 q-registers of float accumulators from
// accum_ptr, rotate them into the loop's layout, accumulate one level of
// depth per iteration, rotate back, and store to accum_ptr.
//
// NOTE: this struct does not define the
// NEON_32BIT_ROTATING_FLOAT_KERNEL_ROTATE_ACCUMULATOR_CELLS macro it uses; it
// relies on the #define that appears inside the asm statement of
// NEON_32bit_GEMM_Float32_MLA_Rotating::Run earlier in the file.
1643a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_32bit_GEMM_Float32_FMA_Rotating {
1644a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef float OperandType;
1645a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef float AccumulatorType;
  // Lhs side: 3 cells; Rhs side: 1 cell (see the "Load ..." comments in Run
  // for the cell sizes the assembly actually consumes).
1646a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef KernelFormat<
1647a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
1648a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 1> >
1649a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      Format;
  // lhs_ptr:   read 48 bytes per level of depth (three post-incremented
  //            16-byte loads); only the local pointer copy is advanced.
  // rhs_ptr:   read 16 bytes per level of depth (one post-incremented load).
  // accum_ptr: 48 floats, read at entry and overwritten at exit.
  // depth:     loop counter, decremented by 1 per iteration. It is tested
  //            only after the loop body, so the body always runs at least
  //            once -- callers presumably guarantee depth >= 1 (TODO confirm).
1650a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
1651a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                  AccumulatorType* accum_ptr, int depth) {
1652a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    asm volatile(
1653a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load accumulators
        // r0 is a scratch copy of accum_ptr because the loads post-increment
        // their address register. Registers are filled in the order
        // q4, q8, q12, q5, q9, q13, q6, q10, q14, q7, q11, q15.
1654a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov r0, %[accum_ptr]\n"
1655a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d8, d9},   [r0]!\n"
1656a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d16, d17}, [r0]!\n"
1657a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d24, d25}, [r0]!\n"
1658a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d10, d11}, [r0]!\n"
1659a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d18, d19}, [r0]!\n"
1660a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d26, d27}, [r0]!\n"
1661a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d12, d13}, [r0]!\n"
1662a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d20, d21}, [r0]!\n"
1663a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d28, d29}, [r0]!\n"
1664a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d14, d15}, [r0]!\n"
1665a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d22, d23}, [r0]!\n"
1666a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d30, d31}, [r0]!\n"
1667a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
        // Bring the freshly loaded accumulators into the rotated layout that
        // the loop body accumulates into.
1668a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        NEON_32BIT_ROTATING_FLOAT_KERNEL_ROTATE_ACCUMULATOR_CELLS(1, 2, 3)
1669a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
        // Loop head. GEMMLOWP_LABEL_LOOP is defined outside this chunk;
        // presumably it is a numeric local label, given the "b" (backward)
        // suffix appended at the branch below -- confirm at its definition.
1670a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //"loop_%=:\n"
1671a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        GEMMLOWP_LABEL_LOOP
1672a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        ":\n"
1673a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1674a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load 1 Rhs cell of size 1x4
1675a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d0, d1}, [%[rhs_ptr]]!\n"
1676a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1677a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load 3 Lhs cells of size 4x1 each
1678a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d2, d3}, [%[lhs_ptr]]!\n"
1679a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d4, d5}, [%[lhs_ptr]]!\n"
1680a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vld1.32 {d6, d7}, [%[lhs_ptr]]!\n"
1681a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1682a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Multiply-accumulate
        // Each group of three vfma's multiplies the three Lhs vectors (q1,
        // q2, q3) lane-by-lane with the current Rhs vector q0. q0 is rotated
        // by one lane before the next group, so after the four groups every
        // Lhs lane has been multiplied with every Rhs lane exactly once.
1683a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vfma.f32 q4, q1, q0\n"
1684a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vfma.f32 q8, q2, q0\n"
1685a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vfma.f32 q12, q3, q0\n"
1686a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vext.f32 q0, q0, q0, #1\n"
1687a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vfma.f32 q5, q1, q0\n"
1688a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vfma.f32 q9, q2, q0\n"
1689a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vfma.f32 q13, q3, q0\n"
1690a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vext.f32 q0, q0, q0, #1\n"
1691a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vfma.f32 q6, q1, q0\n"
1692a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vfma.f32 q10, q2, q0\n"
1693a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vfma.f32 q14, q3, q0\n"
1694a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vext.f32 q0, q0, q0, #1\n"
1695a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vfma.f32 q7, q1, q0\n"
1696a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vfma.f32 q11, q2, q0\n"
1697a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vfma.f32 q15, q3, q0\n"
1698a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1699a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Loop. Decrement loop index (depth) by 1, since we just handled 1
1700a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // level of depth.
        // subs sets the flags that bne tests; the loop exits when depth
        // reaches exactly zero.
1701a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "subs %[depth], #1\n"
1702a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //"bne loop_%=\n"
1703a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "bne " GEMMLOWP_LABEL_LOOP "b\n"
1704a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
        // Undo the rotation applied before the loop so the registers are back
        // in the layout they were loaded in.
1705a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        NEON_32BIT_ROTATING_FLOAT_KERNEL_ROTATE_ACCUMULATOR_CELLS(3, 2, 1)
1706a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1707a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Store accumulators
        // r0 is reloaded from accum_ptr because the initial loads advanced
        // it past the accumulator block. Stores use the same register order
        // as the loads at the top.
1708a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov r0, %[accum_ptr]\n"
1709a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d8, d9},   [r0]!\n"
1710a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d16, d17}, [r0]!\n"
1711a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d24, d25}, [r0]!\n"
1712a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d10, d11}, [r0]!\n"
1713a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d18, d19}, [r0]!\n"
1714a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d26, d27}, [r0]!\n"
1715a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d12, d13}, [r0]!\n"
1716a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d20, d21}, [r0]!\n"
1717a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d28, d29}, [r0]!\n"
1718a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d14, d15}, [r0]!\n"
1719a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d22, d23}, [r0]!\n"
1720a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "vst1.32 {d30, d31}, [r0]!\n"
        // lhs_ptr, rhs_ptr and depth are read-write operands because the asm
        // modifies them in place.
1721a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // outputs
1722a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
1723a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [depth] "+r"(depth)
1724a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // inputs
1725a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [accum_ptr] "r"(accum_ptr)
        // "memory" covers the accumulator stores; r0 is the scratch address
        // register; d0-d31 covers every q register touched above.
1726a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // clobbers
1727a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
1728a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17",
1729a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
1730a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d28", "d29", "d30", "d31");
1731a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
1732a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang};
1733a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1734a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif  // __arm__
1735a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1736a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef __aarch64__
1737a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1738a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// This is the current standard kernel in gemmlowp, see:
1739a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// https://github.com/google/gemmlowp/blob/b1e2a29ff866680028f3080efc244e10e8dd7f46/internal/kernel_neon.h#L646
//
// AArch64 NEON kernel taking uint8 operands and accumulating into uint32.
// Per the Format typedef below, the Lhs side is 3 DepthMajor cells of 4x2
// (8 bytes each) and the Rhs side is 2 such cells, so one call updates a
// 12x8 block of accumulators and walks the depth dimension 2 levels at a
// time.
1740a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_64bit_GEMM_Uint8Operands_Uint32Accumulators {
1741a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef std::uint8_t OperandType;
1742a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef std::uint32_t AccumulatorType;
1743a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef KernelFormat<
1744a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 3>,
1745a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 2> >
1746a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      Format;
  // Multiplies the packed Lhs block by the packed Rhs block and adds the
  // result into the accumulator block at accum_ptr (read-modify-write).
  //
  //   lhs_ptr   - packed Lhs data; 24 bytes (3 x 8) are read for every 2
  //               levels of depth.
  //   rhs_ptr   - packed Rhs data; 16 bytes (2 x 8) are read for every 2
  //               levels of depth.
  //   accum_ptr - 96 uint32 accumulators (24 vectors of 4 lanes). They are
  //               loaded at entry and stored back to the same addresses at
  //               exit, in the register order v8, v16, v24, v9, v17, v25, ...
  //               i.e. (per the register layout diagram below) the 12 rows of
  //               one Rhs column are contiguous in memory.
  //   depth     - number of depth levels to process. The counter is
  //               decremented by 2 and the code only leaves the loop when it
  //               reaches exactly zero, so callers are expected to pass a
  //               positive, even value.
  //
  // The pointer and depth arguments are by-value copies; the asm advances /
  // decrements those local copies only.
1747a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
1748a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                  AccumulatorType* accum_ptr, int depth) {
1749a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    asm volatile(
1750a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load the first 2x8 Rhs block (2 cells of 8 bytes) into v5, v6
1751a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v5.8b}, [%[rhs_ptr]], #8\n"
1752a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v6.8b}, [%[rhs_ptr]], #8\n"
1753a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1754a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load 3 Lhs cells of size 4x2 each
1755a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v2.8b}, [%[lhs_ptr]], #8\n"
1756a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v3.8b}, [%[lhs_ptr]], #8\n"
1757a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v4.8b}, [%[lhs_ptr]], #8\n"
1758a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
        // Account for the slice just loaded. The flags set here are consumed
        // by the beq after the accumulator loads; the mov/ld1 instructions in
        // between do not modify the condition flags.
1759a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "subs %w[depth], %w[depth], #2\n"
1760a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1761a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load accumulators
        // x0 is a scratch cursor so that %[accum_ptr], which is declared as
        // an input-only operand, is never modified.
1762a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov x0, %[accum_ptr]\n"
1763a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v8.16b}, [x0], #16\n"
1764a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v16.16b}, [x0], #16\n"
1765a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v24.16b}, [x0], #16\n"
1766a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v9.16b}, [x0], #16\n"
1767a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v17.16b}, [x0], #16\n"
1768a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v25.16b}, [x0], #16\n"
1769a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v10.16b}, [x0], #16\n"
1770a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v18.16b}, [x0], #16\n"
1771a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v26.16b}, [x0], #16\n"
1772a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v11.16b}, [x0], #16\n"
1773a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v19.16b}, [x0], #16\n"
1774a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v27.16b}, [x0], #16\n"
1775a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v12.16b}, [x0], #16\n"
1776a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v20.16b}, [x0], #16\n"
1777a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v28.16b}, [x0], #16\n"
1778a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v13.16b}, [x0], #16\n"
1779a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v21.16b}, [x0], #16\n"
1780a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v29.16b}, [x0], #16\n"
1781a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v14.16b}, [x0], #16\n"
1782a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v22.16b}, [x0], #16\n"
1783a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v30.16b}, [x0], #16\n"
1784a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v15.16b}, [x0], #16\n"
1785a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v23.16b}, [x0], #16\n"
1786a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v31.16b}, [x0], #16\n"
1787a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
        // If the slice loaded above was the only one (depth == 2 on entry),
        // skip the loop; the code after the loop processes it.
        // NOTE(review): GEMMLOWP_LABEL_* are macros defined outside this
        // block; they are referenced with "f"/"b" suffixes, so presumably
        // they expand to numeric local labels - confirm at their definition.
1788a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "beq " GEMMLOWP_LABEL_AFTER_LOOP "f\n"
1789a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1790a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //"loop_%=:\n"
1791a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        GEMMLOWP_LABEL_LOOP
1792a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        ":\n"
1793a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1794a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Overview of register layout:
1795a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
1796a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // A 2x8 block of 2 2x4 cells of Rhs is stored in 16bit in v0--v1.
1797a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in v2--v4.
1798a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // A 12x8 block of accumulators is stored in 32bit in v8--v31.
1799a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
1800a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                         +--------+--------+-----+--------+--------+
1801a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                         |v0.h[0] |v0.h[1] | ... |v1.h[2] |v1.h[3] |
1802a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                    Rhs  +--------+--------+-----+--------+--------+
1803a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                         |v0.h[4] |v0.h[5] | ... |v1.h[6] |v1.h[7] |
1804a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                         +--------+--------+-----+--------+--------+
1805a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
1806a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                         |        |        |     |        |        |
1807a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
1808a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //    Lhs                  |        |        |     |        |        |
1809a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
1810a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  +-------+-------+ - -  +--------+--------+-----+--------+--------+
1811a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |v2.h[0]|v2.h[4]|      |v8.s[0] |v9.s[0] | ... |v14.s[0]|v15.s[0]|
1812a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |v2.h[1]|v2.h[5]|      |v8.s[1] |v9.s[1] | ... |v14.s[1]|v15.s[1]|
1813a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |v2.h[2]|v2.h[6]|      |v8.s[2] |v9.s[2] | ... |v14.s[2]|v15.s[2]|
1814a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |v2.h[3]|v2.h[7]|      |v8.s[3] |v9.s[3] | ... |v14.s[3]|v15.s[3]|
1815a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  +-------+-------+ - -  +--------+--------+-----+--------+--------+
1816a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |v3.h[0]|v3.h[4]|      |v16.s[0]|v17.s[0]| ... |v22.s[0]|v23.s[0]|
1817a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |v3.h[1]|v3.h[5]|      |v16.s[1]|v17.s[1]| ... |v22.s[1]|v23.s[1]|
1818a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |v3.h[2]|v3.h[6]|      |v16.s[2]|v17.s[2]| ... |v22.s[2]|v23.s[2]|
1819a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |v3.h[3]|v3.h[7]|      |v16.s[3]|v17.s[3]| ... |v22.s[3]|v23.s[3]|
1820a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  +-------+-------+ - -  +--------+--------+-----+--------+--------+
1821a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |v4.h[0]|v4.h[4]|      |v24.s[0]|v25.s[0]| ... |v30.s[0]|v31.s[0]|
1822a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |v4.h[1]|v4.h[5]|      |v24.s[1]|v25.s[1]| ... |v30.s[1]|v31.s[1]|
1823a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |v4.h[2]|v4.h[6]|      |v24.s[2]|v25.s[2]| ... |v30.s[2]|v31.s[2]|
1824a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |v4.h[3]|v4.h[7]|      |v24.s[3]|v25.s[3]| ... |v30.s[3]|v31.s[3]|
1825a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  +-------+-------+ - -  +--------+--------+-----+--------+--------+
1826a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
1827a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                            Accumulator
1828a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1829a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Expand Lhs/Rhs cells to 16 bit.
        // The Rhs bytes in v5/v6 are widened into v0/v1, and v5/v6 are
        // immediately reloaded with the next slice. The Lhs registers v2--v4
        // are widened in place; each one is reloaded right after its last use
        // below.
1830a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uxtl v0.8h, v5.8b\n"
1831a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v5.8b}, [%[rhs_ptr]], #8\n"
1832a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uxtl v1.8h, v6.8b\n"
1833a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v6.8b}, [%[rhs_ptr]], #8\n"
1834a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uxtl v2.8h, v2.8b\n"
1835a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uxtl v3.8h, v3.8b\n"
1836a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uxtl v4.8h, v4.8b\n"
1837a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1838a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Multiply-accumulate, top third
        // umlal uses the low 4 halfwords of the Lhs register with Rhs lanes
        // h[0..3]; umlal2 uses the high 4 halfwords with Rhs lanes h[4..7].
1839a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v8.4s, v2.4h, v0.h[0]\n"
1840a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v9.4s, v2.4h, v0.h[1]\n"
1841a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v10.4s, v2.4h, v0.h[2]\n"
1842a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v11.4s, v2.4h, v0.h[3]\n"
1843a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v12.4s, v2.4h, v1.h[0]\n"
1844a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v13.4s, v2.4h, v1.h[1]\n"
1845a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v14.4s, v2.4h, v1.h[2]\n"
1846a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v15.4s, v2.4h, v1.h[3]\n"
1847a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v8.4s, v2.8h, v0.h[4]\n"
1848a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v9.4s, v2.8h, v0.h[5]\n"
1849a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v10.4s, v2.8h, v0.h[6]\n"
1850a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v11.4s, v2.8h, v0.h[7]\n"
1851a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v12.4s, v2.8h, v1.h[4]\n"
1852a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v13.4s, v2.8h, v1.h[5]\n"
1853a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v14.4s, v2.8h, v1.h[6]\n"
1854a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v15.4s, v2.8h, v1.h[7]\n"
        // v2 is no longer needed for this slice: preload the next Lhs cell.
1855a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v2.8b}, [%[lhs_ptr]], #8\n"
1856a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1857a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Multiply-accumulate, middle third
1858a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v16.4s, v3.4h, v0.h[0]\n"
1859a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v17.4s, v3.4h, v0.h[1]\n"
1860a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v18.4s, v3.4h, v0.h[2]\n"
1861a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v19.4s, v3.4h, v0.h[3]\n"
1862a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v20.4s, v3.4h, v1.h[0]\n"
1863a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v21.4s, v3.4h, v1.h[1]\n"
1864a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v22.4s, v3.4h, v1.h[2]\n"
1865a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v23.4s, v3.4h, v1.h[3]\n"
1866a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v16.4s, v3.8h, v0.h[4]\n"
1867a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v17.4s, v3.8h, v0.h[5]\n"
1868a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v18.4s, v3.8h, v0.h[6]\n"
1869a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v19.4s, v3.8h, v0.h[7]\n"
1870a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v20.4s, v3.8h, v1.h[4]\n"
1871a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v21.4s, v3.8h, v1.h[5]\n"
1872a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v22.4s, v3.8h, v1.h[6]\n"
1873a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v23.4s, v3.8h, v1.h[7]\n"
1874a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v3.8b}, [%[lhs_ptr]], #8\n"
1875a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
        // Account for the slice being preloaded. The flags set here are
        // consumed by the bne at the bottom of the loop; the umlal/umlal2/ld1
        // instructions in between do not modify the condition flags.
1876a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "subs %w[depth], %w[depth], #2\n"
1877a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1878a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Multiply-accumulate, bottom third
1879a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v24.4s, v4.4h, v0.h[0]\n"
1880a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v25.4s, v4.4h, v0.h[1]\n"
1881a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v26.4s, v4.4h, v0.h[2]\n"
1882a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v27.4s, v4.4h, v0.h[3]\n"
1883a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v28.4s, v4.4h, v1.h[0]\n"
1884a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v29.4s, v4.4h, v1.h[1]\n"
1885a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v30.4s, v4.4h, v1.h[2]\n"
1886a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v31.4s, v4.4h, v1.h[3]\n"
1887a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v24.4s, v4.8h, v0.h[4]\n"
1888a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v25.4s, v4.8h, v0.h[5]\n"
1889a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v26.4s, v4.8h, v0.h[6]\n"
1890a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v27.4s, v4.8h, v0.h[7]\n"
1891a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v28.4s, v4.8h, v1.h[4]\n"
1892a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v29.4s, v4.8h, v1.h[5]\n"
1893a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v30.4s, v4.8h, v1.h[6]\n"
1894a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v31.4s, v4.8h, v1.h[7]\n"
1895a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v4.8b}, [%[lhs_ptr]], #8\n"
1896a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1897a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "bne " GEMMLOWP_LABEL_LOOP "b\n"
1898a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1899a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        GEMMLOWP_LABEL_AFTER_LOOP
1900a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        ":\n"
1901a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
        // Tail: process the last slice, which is already sitting in v2--v6.
        // No further Lhs/Rhs loads are issued from here on.
1902a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Expand Lhs/Rhs cells to 16 bit.
1903a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uxtl v0.8h, v5.8b\n"
1904a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uxtl v1.8h, v6.8b\n"
1905a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uxtl v2.8h, v2.8b\n"
1906a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uxtl v3.8h, v3.8b\n"
1907a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uxtl v4.8h, v4.8b\n"
1908a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1909a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Multiply-accumulate, level of depth 0
1910a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v8.4s, v2.4h, v0.h[0]\n"
1911a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v9.4s, v2.4h, v0.h[1]\n"
1912a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v10.4s, v2.4h, v0.h[2]\n"
1913a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v11.4s, v2.4h, v0.h[3]\n"
1914a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v12.4s, v2.4h, v1.h[0]\n"
1915a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v13.4s, v2.4h, v1.h[1]\n"
1916a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v14.4s, v2.4h, v1.h[2]\n"
1917a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v15.4s, v2.4h, v1.h[3]\n"
1918a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v16.4s, v3.4h, v0.h[0]\n"
1919a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v17.4s, v3.4h, v0.h[1]\n"
1920a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v18.4s, v3.4h, v0.h[2]\n"
1921a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v19.4s, v3.4h, v0.h[3]\n"
1922a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v20.4s, v3.4h, v1.h[0]\n"
1923a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v21.4s, v3.4h, v1.h[1]\n"
1924a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v22.4s, v3.4h, v1.h[2]\n"
1925a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v23.4s, v3.4h, v1.h[3]\n"
1926a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v24.4s, v4.4h, v0.h[0]\n"
1927a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v25.4s, v4.4h, v0.h[1]\n"
1928a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v26.4s, v4.4h, v0.h[2]\n"
1929a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v27.4s, v4.4h, v0.h[3]\n"
1930a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v28.4s, v4.4h, v1.h[0]\n"
1931a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v29.4s, v4.4h, v1.h[1]\n"
1932a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v30.4s, v4.4h, v1.h[2]\n"
1933a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal v31.4s, v4.4h, v1.h[3]\n"
1934a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1935a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Multiply-accumulate, level of depth 1
1936a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v8.4s, v2.8h, v0.h[4]\n"
1937a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v9.4s, v2.8h, v0.h[5]\n"
1938a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v10.4s, v2.8h, v0.h[6]\n"
1939a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v11.4s, v2.8h, v0.h[7]\n"
1940a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v12.4s, v2.8h, v1.h[4]\n"
1941a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v13.4s, v2.8h, v1.h[5]\n"
1942a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v14.4s, v2.8h, v1.h[6]\n"
1943a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v15.4s, v2.8h, v1.h[7]\n"
1944a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v16.4s, v3.8h, v0.h[4]\n"
1945a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v17.4s, v3.8h, v0.h[5]\n"
1946a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v18.4s, v3.8h, v0.h[6]\n"
1947a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v19.4s, v3.8h, v0.h[7]\n"
1948a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v20.4s, v3.8h, v1.h[4]\n"
1949a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v21.4s, v3.8h, v1.h[5]\n"
1950a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v22.4s, v3.8h, v1.h[6]\n"
1951a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v23.4s, v3.8h, v1.h[7]\n"
1952a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v24.4s, v4.8h, v0.h[4]\n"
1953a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v25.4s, v4.8h, v0.h[5]\n"
1954a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v26.4s, v4.8h, v0.h[6]\n"
1955a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v27.4s, v4.8h, v0.h[7]\n"
1956a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v28.4s, v4.8h, v1.h[4]\n"
1957a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v29.4s, v4.8h, v1.h[5]\n"
1958a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v30.4s, v4.8h, v1.h[6]\n"
1959a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umlal2 v31.4s, v4.8h, v1.h[7]\n"
1960a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1961a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Store accumulators
        // Same register order as the loads at the top, so every value goes
        // back to the address it was read from.
1962a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov x0, %[accum_ptr]\n"
1963a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v8.16b}, [x0], #16\n"
1964a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v16.16b}, [x0], #16\n"
1965a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v24.16b}, [x0], #16\n"
1966a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v9.16b}, [x0], #16\n"
1967a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v17.16b}, [x0], #16\n"
1968a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v25.16b}, [x0], #16\n"
1969a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v10.16b}, [x0], #16\n"
1970a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v18.16b}, [x0], #16\n"
1971a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v26.16b}, [x0], #16\n"
1972a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v11.16b}, [x0], #16\n"
1973a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v19.16b}, [x0], #16\n"
1974a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v27.16b}, [x0], #16\n"
1975a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v12.16b}, [x0], #16\n"
1976a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v20.16b}, [x0], #16\n"
1977a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v28.16b}, [x0], #16\n"
1978a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v13.16b}, [x0], #16\n"
1979a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v21.16b}, [x0], #16\n"
1980a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v29.16b}, [x0], #16\n"
1981a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v14.16b}, [x0], #16\n"
1982a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v22.16b}, [x0], #16\n"
1983a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v30.16b}, [x0], #16\n"
1984a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v15.16b}, [x0], #16\n"
1985a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v23.16b}, [x0], #16\n"
1986a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v31.16b}, [x0], #16\n"
1987a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // outputs
        // Read-write operands: the asm post-increments both data pointers
        // and decrements depth.
1988a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
1989a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [depth] "+r"(depth)
1990a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // inputs
1991a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [accum_ptr] "r"(accum_ptr)
1992a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // clobbers
        // "memory" covers the loads/stores through accum_ptr, "cc" the subs.
        // NOTE(review): v7 is listed here but is not referenced by any
        // instruction in the asm above.
1993a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1994a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
1995a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
1996a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "v28", "v29", "v30", "v31");
1997a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
1998a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang};
1999a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2000a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Faster kernel by ARM. Not expanding operands before multiplication.
2001a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Tuned for A57. Compare to
2002a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// NEON_32bit_GEMM_Uint8Operands_Uint32Accumulators_noexpand
2003a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_64bit_GEMM_Uint8Operands_Uint32Accumulators_noexpand_A57 {
2004a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef std::uint8_t OperandType;
2005a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef std::uint32_t AccumulatorType;
2006a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef KernelFormat<
2007a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<5, 16, CellOrder::WidthMajor>, 1>,
2008a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 16, CellOrder::WidthMajor>, 1> >
2009a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      Format;
2010a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
2011a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                  AccumulatorType* accum_ptr, int depth) {
2012a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    static const int kLhsWidth = Format::Lhs::kWidth;
2013a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    static const int kRhsWidth = Format::Rhs::kWidth;
2014a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    AccumulatorType rowmajor_accumulator_buffer[kLhsWidth * kRhsWidth];
2015a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    asm volatile(
2016a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Clear aggregators
2017a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v12.4s, wzr\n"
2018a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v13.4s, wzr\n"
2019a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v14.4s, wzr\n"
2020a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v15.4s, wzr\n"
2021a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v16.4s, wzr\n"
2022a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v17.4s, wzr\n"
2023a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v18.4s, wzr\n"
2024a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v19.4s, wzr\n"
2025a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v20.4s, wzr\n"
2026a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v21.4s, wzr\n"
2027a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v22.4s, wzr\n"
2028a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v23.4s, wzr\n"
2029a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v24.4s, wzr\n"
2030a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v25.4s, wzr\n"
2031a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v26.4s, wzr\n"
2032a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v27.4s, wzr\n"
2033a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v28.4s, wzr\n"
2034a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v29.4s, wzr\n"
2035a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v30.4s, wzr\n"
2036a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v31.4s, wzr\n"
2037a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2038a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        GEMMLOWP_LABEL_LOOP
2039a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        ":\n"
2040a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2041a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Overview of register layout:
2042a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
2043a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // A 4x16 block of Rhs is stored in 8 bit in v0--v3.
2044a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // A 5x16 block of Lhs is cycled through v4 and v5 in 8 bit.
2045a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
2046a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // A 4x5 block of aggregators is stored in v12-v31 (as 4x32 bit
2047a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // components which would need to be added at the end)
2048a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
2049a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // The Lhs vectors are multiplied by the Rhs vectors with a widening
2050a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // multiply to produce an intermediate result which is stored in
2051a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // v6-v11.  Each intermediate result is 8x16 bits so this happens
2052a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // twice for each Lhs/Rhs combination (once with UMULL for elements
2053a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // 0-7 and once with UMULL2 for elements 8-15).
2054a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
2055a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // UADALP is used to accumulate these intermediate results into the
2056a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // result aggregators.
2057a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
2058a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
2059a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
2060a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                               +--------+--------+--------+--------+
2061a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                               |v0.b[0] |v1.b[0] |v2.b[0] |v3.b[0] |
2062a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                          Rhs  +--------+--------+--------+--------+
2063a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                               |  ...   |  ...   |  ...   |  ...   |
2064a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                               +--------+--------+--------+--------|
2065a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                               |v0.b[15]|v1.b[15]|v2.b[15]|v3.b[15]|
2066a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                               +--------+--------+--------+--------+
2067a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
2068a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                               |        |        |        |        |
2069a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
2070a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //    Lhs                        |        |        |        |        |
2071a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
2072a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  +-------+-----+--------+ - - +--------+--------+--------+--------+
2073a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |v4.b[0]| ... |v4.b[15]|     | v12.4s | v13.4s | v14.4s | v15.4s |
2074a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |v5.b[0]| ... |v5.b[15]|     | v16.4s | v17.4s | v18.4s | v19.4s |
2075a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |v4.b[0]| ... |v4.b[15]|     | v20.4s | v21.4s | v22.4s | v23.4s |
2076a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |v5.b[0]| ... |v5.b[15]|     | v24.4s | v25.4s | v26.4s | v27.4s |
2077a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |v4.b[0]| ... |v4.b[15]|     | v28.4s | v29.4s | v30.4s | v31.4s |
2078a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  +-------+--------------+ - - +--------+--------+--------+--------+
2079a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
2080a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                                                Accumulator
2081a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
2082a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
2083a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Further possible optimisations (not tried):
2084a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //   - Move early loads into previous iteration (see Float32_WithScalar
2085a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //   for example). - Unroll loop 2x to alternate more smoothly between
2086a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //   v4 and v5. - A different number of temporary registers might work
2087a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //   better. - Pairing umull with corresponding umull2 might allow
2088a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //   better
2089a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //     register loading (e.g. at the start of the loop)
2090a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //   - Interleaving umull{2} and uadalp even more aggressively might
2091a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //     help, (not sure about latency vs. dispatch rate).
2092a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
2093a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
2094a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Start loading Rhs - further loads are interleaved amongst the
2095a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // multiplies for better dispatch on A57.
2096a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v0.16b}, [%[rhs_ptr]], #16\n"
2097a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2098a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load first Lhs vector - further loads are interleaved amongst the
2099a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // multiplies
2100a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v4.16b}, [%[lhs_ptr]], #16\n"
2101a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2102a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull    v6.8h,  v0.8b,  v4.8b\n"
2103a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v1.16b}, [%[rhs_ptr]], #16\n"  // 2nd RHS element
2104a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull    v7.8h,  v1.8b,  v4.8b\n"
2105a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v2.16b}, [%[rhs_ptr]], #16\n"  // 3rd RHS element
2106a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull    v8.8h,  v2.8b,  v4.8b\n"
2107a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v3.16b}, [%[rhs_ptr]], #16\n"  // 4th RHS element
2108a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull    v9.8h,  v3.8b,  v4.8b\n"
2109a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull2  v10.8h, v0.16b, v4.16b\n"
2110a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull2  v11.8h, v1.16b, v4.16b\n"
2111a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v5.16b}, [%[lhs_ptr]], #16\n"  // 2nd LHS element
2112a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2113a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp  v12.4s, v6.8h\n"
2114a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull2   v6.8h, v2.16b, v4.16b\n"
2115a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp  v13.4s, v7.8h\n"
2116a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull2   v7.8h, v3.16b, v4.16b\n"
2117a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v4.16b}, [%[lhs_ptr]], #16\n"  // 1st LHS element done - Reuse v4
2118a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // for 3rd LHS element
2119a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp  v14.4s, v8.8h\n"
2120a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull    v8.8h,  v0.8b,  v5.8b\n"
2121a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp  v15.4s, v9.8h\n"
2122a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull    v9.8h,  v1.8b,  v5.8b\n"
2123a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp  v12.4s, v10.8h\n"
2124a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull   v10.8h,  v2.8b,  v5.8b\n"
2125a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp  v13.4s, v11.8h\n"
2126a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull   v11.8h,  v3.8b,  v5.8b\n"
2127a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2128a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp  v14.4s, v6.8h\n"
2129a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull2   v6.8h, v0.16b, v5.16b\n"
2130a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp  v15.4s, v7.8h\n"
2131a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull2   v7.8h, v1.16b, v5.16b\n"
2132a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp  v16.4s, v8.8h\n"
2133a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull2   v8.8h, v2.16b, v5.16b\n"
2134a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp  v17.4s, v9.8h\n"
2135a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull2   v9.8h, v3.16b, v5.16b\n"
2136a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v5.16b}, [%[lhs_ptr]], #16\n"  // 2nd LHS element done - Reuse v5
2137a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // for 4th LHS element
2138a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp  v18.4s, v10.8h\n"
2139a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull   v10.8h,  v0.8b,  v4.8b\n"
2140a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp  v19.4s, v11.8h\n"
2141a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull   v11.8h,  v1.8b,  v4.8b\n"
2142a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2143a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp  v16.4s, v6.8h\n"
2144a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull    v6.8h,  v2.8b,  v4.8b\n"
2145a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp  v17.4s, v7.8h\n"
2146a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull    v7.8h,  v3.8b,  v4.8b\n"
2147a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp  v18.4s, v8.8h\n"
2148a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull2   v8.8h, v0.16b, v4.16b\n"
2149a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp  v19.4s, v9.8h\n"
2150a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull2   v9.8h, v1.16b, v4.16b\n"
2151a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp  v20.4s, v10.8h\n"
2152a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull2  v10.8h, v2.16b, v4.16b\n"
2153a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp  v21.4s, v11.8h\n"
2154a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull2  v11.8h, v3.16b, v4.16b\n"
2155a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v4.16b}, [%[lhs_ptr]], #16\n"  // 3rd LHS element done - Reuse v4
2156a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // for 5th LHS element
2157a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2158a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp v22.4s, v6.8h\n"
2159a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull    v6.8h,  v0.8b,  v5.8b\n"
2160a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp v23.4s, v7.8h\n"
2161a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull    v7.8h,  v1.8b,  v5.8b\n"
2162a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp v20.4s, v8.8h\n"
2163a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull    v8.8h,  v2.8b,  v5.8b\n"
2164a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp v21.4s, v9.8h\n"
2165a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull    v9.8h,  v3.8b,  v5.8b\n"
2166a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp v22.4s, v10.8h\n"
2167a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull2  v10.8h, v0.16b, v5.16b\n"
2168a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp v23.4s, v11.8h\n"
2169a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull2  v11.8h, v1.16b, v5.16b\n"
2170a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2171a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp v24.4s, v6.8h\n"
2172a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull2   v6.8h,  v2.16b, v5.16b\n"
2173a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp v25.4s, v7.8h\n"
2174a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull2   v7.8h,  v3.16b, v5.16b\n"
2175a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp v26.4s, v8.8h\n"
2176a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull    v8.8h,  v0.8b,  v4.8b\n"
2177a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp v27.4s, v9.8h\n"
2178a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull    v9.8h,  v1.8b,  v4.8b\n"
2179a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp v24.4s, v10.8h\n"
2180a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull   v10.8h,  v2.8b,  v4.8b\n"
2181a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp v25.4s, v11.8h\n"
2182a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull   v11.8h,  v3.8b,  v4.8b\n"
2183a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2184a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp v26.4s, v6.8h\n"
2185a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull2   v6.8h, v0.16b, v4.16b\n"
2186a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp v27.4s, v7.8h\n"
2187a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull2   v7.8h, v1.16b, v4.16b\n"
2188a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp v28.4s, v8.8h\n"
2189a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull2   v8.8h, v2.16b, v4.16b\n"
2190a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp v29.4s, v9.8h\n"
2191a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "umull2   v9.8h, v3.16b, v4.16b\n"
2192a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp v30.4s, v10.8h\n"
2193a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp v31.4s, v11.8h\n"
2194a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2195a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp v28.4s, v6.8h\n"
2196a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp v29.4s, v7.8h\n"
2197a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Loop. Decrement loop index (depth) by 16, since we just handled
2198a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // 16 levels of depth.  Do this subs a bit before the end of the loop
2199a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // for better dispatch on A57.
2200a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "subs %w[depth], %w[depth], #16\n"
2201a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp v30.4s, v8.8h\n"
2202a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "uadalp v31.4s, v9.8h\n"
2203a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2204a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "bne " GEMMLOWP_LABEL_LOOP
2205a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "b\n"
2206a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2207a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Reduce aggregators horizontally
2208a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "addp v0.4s, v12.4s, v13.4s\n"
2209a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "addp v1.4s, v14.4s, v15.4s\n"
2210a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "addp v2.4s, v16.4s, v17.4s\n"
2211a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "addp v3.4s, v18.4s, v19.4s\n"
2212a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "addp v4.4s, v20.4s, v21.4s\n"
2213a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "addp v5.4s, v22.4s, v23.4s\n"
2214a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "addp v6.4s, v24.4s, v25.4s\n"
2215a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "addp v7.4s, v26.4s, v27.4s\n"
2216a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "addp v8.4s, v28.4s, v29.4s\n"
2217a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "addp v9.4s, v30.4s, v31.4s\n"
2218a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2219a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "addp v10.4s, v0.4s, v1.4s\n"
2220a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "addp v11.4s, v2.4s, v3.4s\n"
2221a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "addp v12.4s, v4.4s, v5.4s\n"
2222a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "addp v13.4s, v6.4s, v7.4s\n"
2223a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "addp v14.4s, v8.4s, v9.4s\n"
2224a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2225a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov x0, %[rowmajor_accumulator_buffer]\n"
2226a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v10.16b}, [x0], #16\n"
2227a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v11.16b}, [x0], #16\n"
2228a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v12.16b}, [x0], #16\n"
2229a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v13.16b}, [x0], #16\n"
2230a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v14.16b}, [x0], #16\n"
2231a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // outputs
2232a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
2233a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [depth] "+r"(depth)
2234a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // inputs
2235a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [rowmajor_accumulator_buffer] "r"(rowmajor_accumulator_buffer)
2236a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // clobbers
2237a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2238a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
2239a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
2240a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "v28", "v29", "v30", "v31");
2241a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2242a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    // accumulate row-major accumulators into global (column-major) accumulators
2243a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    for (int l = 0; l < kLhsWidth; l++) {
2244a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      for (int r = 0; r < kRhsWidth; r++) {
2245a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        accum_ptr[l + kLhsWidth * r] +=
2246a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            rowmajor_accumulator_buffer[r + l * kRhsWidth];
2247a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      }
2248a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    }
2249a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
2250a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang};
2251a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2252a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Fast kernel operating on int8 operands.
2253a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// It is assumed that one of the two int8 operands only takes values
2254a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// in [-127, 127], while the other may freely range in [-128, 127].
2255a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// The issue with both operands taking the value -128 is that:
2256a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// -128*-128 + -128*-128 == 32768 overflows int16 (it wraps to -32768).
2257a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Every other expression a*b + c*d, for any int8 a,b,c,d, fits in int16
2258a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// range. That is the basic idea of this kernel.
2259a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_64bit_GEMM_Int8Operands_AccumTwoWithin16Bits {
2260a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef std::int8_t OperandType;
2261a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef std::int32_t AccumulatorType;
2262a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef KernelFormat<
2263a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 16, CellOrder::WidthMajor>, 1>,
2264a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 16, CellOrder::WidthMajor>, 1> >
2265a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      Format;
2266a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
2267a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                  AccumulatorType* accum_ptr, int depth) {
2268a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    std::size_t start_depth = 123;
2269a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    std::size_t run_depth = depth;
2270a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    std::size_t dst_col_stride = 4;
2271a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    AccumulatorType* dst_ptr = accum_ptr;
2272a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    asm volatile(
2273a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Overview of register layout:
2274a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
2275a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // A 4x16 block of Rhs is stored in 8 bit in v0--v3.
2276a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // A 4x16 block of Lhs is stored in 8 bit in v4--v7.
2277a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
2278a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // A 4x4 block of accumulators is stored in v16-v31 (as 4x32 bit
2279a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // components which need to be horizontally-added at the end)
2280a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
2281a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // The Lhs vectors are multiplied by the Rhs vectors with a widening
2282a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // multiply over the 8 first levels of depth, producing int16x8
2283a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // vectors of products for each position in the accumulator matrix.
2284a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Here comes the special trick: since the operands are signed int8
2285a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // and at most one of them may take the value -2^7, their products
2286a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // are in range ( -2^14 , 2^14 ), meaning that we can add two such
2287a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // values without any risk of overflowing int16.
2288a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // We thus proceed with the 8 next levels of depth, multiplying
2289a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // again Lhs by Rhs, accumulating into this existing int16x8 vector.
2290a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
2291a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Only then, having processed 16 levels of depth, do we need to
2292a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // horizontally add these int16x8 accumulators into the final
2293a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // int32x4 accumulators.
2294a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
2295a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // As we do not have enough registers to store all 16 int16x8
2296a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // temporary-16bit-accumulators, we have them cycle through v8--v15.
2297a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
2298a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
2299a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Register layout (ignoring the v8--v15 temporary 16bit accumulators):
2300a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
2301a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                               +--------+--------+--------+--------+
2302a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                               |v0.b[0] |v1.b[0] |v2.b[0] |v3.b[0] |
2303a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                          Rhs  +--------+--------+--------+--------+
2304a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                               |  ...   |  ...   |  ...   |  ...   |
2305a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                               +--------+--------+--------+--------+
2306a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                               |v0.b[15]|v1.b[15]|v2.b[15]|v3.b[15]|
2307a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                               +--------+--------+--------+--------+
2308a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
2309a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                               |        |        |        |        |
2310a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
2311a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //    Lhs                        |        |        |        |        |
2312a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
2313a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  +-------+-----+--------+ - - +--------+--------+--------+--------+
2314a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |v4.b[0]| ... |v4.b[15]|     | v16.4s | v17.4s | v18.4s | v19.4s |
2315a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |v5.b[0]| ... |v5.b[15]|     | v20.4s | v21.4s | v22.4s | v23.4s |
2316a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |v6.b[0]| ... |v6.b[15]|     | v24.4s | v25.4s | v26.4s | v27.4s |
2317a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  |v7.b[0]| ... |v7.b[15]|     | v28.4s | v29.4s | v30.4s | v31.4s |
2318a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //  +-------+--------------+ - - +--------+--------+--------+--------+
2319a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
2320a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //                                                Accumulator
2321a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
2322a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2323a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Clear accumulators, interleaved with the initial Lhs/Rhs loads.
2324a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v0.16b}, [%[rhs_ptr]], #16\n"
2325a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v16.4s, wzr\n"
2326a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v1.16b}, [%[rhs_ptr]], #16\n"
2327a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v17.4s, wzr\n"
2328a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v4.16b}, [%[lhs_ptr]], #16\n"
2329a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v18.4s, wzr\n"
2330a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v5.16b}, [%[lhs_ptr]], #16\n"
2331a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v19.4s, wzr\n"
2332a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v6.16b}, [%[lhs_ptr]], #16\n"
2333a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v20.4s, wzr\n"
2334a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v7.16b}, [%[lhs_ptr]], #16\n"
2335a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v21.4s, wzr\n"
2336a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v2.16b}, [%[rhs_ptr]], #16\n"
2337a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v22.4s, wzr\n"
2338a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v3.16b}, [%[rhs_ptr]], #16\n"
2339a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v23.4s, wzr\n"
2340a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "subs %[run_depth], %[run_depth], #16\n"
2341a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v24.4s, wzr\n"
2342a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov x0, %[dst_ptr]\n"
2343a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v25.4s, wzr\n"
2344a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v26.4s, wzr\n"
2345a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v27.4s, wzr\n"
2346a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v28.4s, wzr\n"
2347a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v29.4s, wzr\n"
2348a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v30.4s, wzr\n"
2349a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v31.4s, wzr\n"
2350a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2351a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v12.8h,  v0.8b,  v4.8b\n"
2352a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v13.8h,  v1.8b,  v4.8b\n"
2353a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v14.8h,  v0.8b,  v5.8b\n"
2354a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v15.8h,  v1.8b,  v5.8b\n"
2355a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v12.8h,  v0.16b,  v4.16b\n"
2356a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v13.8h,  v1.16b,  v4.16b\n"
2357a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v14.8h,  v0.16b,  v5.16b\n"
2358a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v15.8h,  v1.16b,  v5.16b\n"
2359a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2360a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "beq " GEMMLOWP_LABEL_AFTER_LOOP "f\n"
2361a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2362a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        GEMMLOWP_LABEL_LOOP
2363a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        ":\n"
2364a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2365a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "subs %[run_depth], %[run_depth], #16\n"
2366a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2367a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v16.4s, v12.8h\n"
2368a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v12.8h,  v0.8b,  v6.8b\n"
2369a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v17.4s, v13.8h\n"
2370a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v13.8h,  v0.8b,  v7.8b\n"
2371a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v20.4s, v14.8h\n"
2372a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v14.8h,  v1.8b,  v6.8b\n"
2373a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v21.4s, v15.8h\n"
2374a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v15.8h,  v1.8b,  v7.8b\n"
2375a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v12.8h,  v0.16b,  v6.16b\n"
2376a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v13.8h,  v0.16b,  v7.16b\n"
2377a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v0.16b}, [%[rhs_ptr]], #16\n"
2378a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v14.8h,  v1.16b,  v6.16b\n"
2379a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v15.8h,  v1.16b,  v7.16b\n"
2380a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v1.16b}, [%[rhs_ptr]], #16\n"
2381a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v24.4s, v12.8h\n"
2382a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v12.8h,  v2.8b,  v4.8b\n"
2383a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v28.4s, v13.8h\n"
2384a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v13.8h,  v3.8b,  v4.8b\n"
2385a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v25.4s, v14.8h\n"
2386a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v14.8h,  v2.8b,  v5.8b\n"
2387a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v29.4s, v15.8h\n"
2388a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v15.8h,  v3.8b,  v5.8b\n"
2389a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v12.8h,  v2.16b,  v4.16b\n"
2390a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v13.8h,  v3.16b,  v4.16b\n"
2391a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v4.16b}, [%[lhs_ptr]], #16\n"
2392a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v14.8h,  v2.16b,  v5.16b\n"
2393a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v15.8h,  v3.16b,  v5.16b\n"
2394a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v5.16b}, [%[lhs_ptr]], #16\n"
2395a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v18.4s, v12.8h\n"
2396a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v12.8h,  v2.8b,  v6.8b\n"
2397a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v19.4s, v13.8h\n"
2398a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v13.8h,  v2.8b,  v7.8b\n"
2399a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v22.4s, v14.8h\n"
2400a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v14.8h,  v3.8b,  v6.8b\n"
2401a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v23.4s, v15.8h\n"
2402a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v15.8h,  v3.8b,  v7.8b\n"
2403a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v12.8h,  v2.16b,  v6.16b\n"
2404a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v13.8h,  v2.16b,  v7.16b\n"
2405a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v2.16b}, [%[rhs_ptr]], #16\n"
2406a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v14.8h,  v3.16b,  v6.16b\n"
2407a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v6.16b}, [%[lhs_ptr]], #16\n"
2408a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v15.8h,  v3.16b,  v7.16b\n"
2409a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v7.16b}, [%[lhs_ptr]], #16\n"
2410a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v26.4s, v12.8h\n"
2411a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v3.16b}, [%[rhs_ptr]], #16\n"
2412a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v12.8h,  v0.8b,  v4.8b\n"
2413a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v30.4s, v13.8h\n"
2414a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v13.8h,  v1.8b,  v4.8b\n"
2415a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v27.4s, v14.8h\n"
2416a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v14.8h,  v0.8b,  v5.8b\n"
2417a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v31.4s, v15.8h\n"
2418a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v15.8h,  v1.8b,  v5.8b\n"
2419a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v12.8h,  v0.16b,  v4.16b\n"
2420a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v13.8h,  v1.16b,  v4.16b\n"
2421a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v14.8h,  v0.16b,  v5.16b\n"
2422a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v15.8h,  v1.16b,  v5.16b\n"
2423a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2424a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "bne " GEMMLOWP_LABEL_LOOP "b\n"
2425a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2426a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        GEMMLOWP_LABEL_AFTER_LOOP
2427a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        ":\n"
2428a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2429a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load accumulators from memory
2430a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v8.16b}, [x0], #16\n"
2431a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v9.16b}, [x0], #16\n"
2432a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v10.16b}, [x0], #16\n"
2433a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v11.16b}, [x0], #16\n"
2434a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov x0, %[dst_ptr]\n"
2435a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2436a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Do the remaining arithmetic for the last 16 levels of depth.
2437a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // All the operands are already loaded.
2438a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v16.4s, v12.8h\n"
2439a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v12.8h,  v0.8b,  v6.8b\n"
2440a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v17.4s, v13.8h\n"
2441a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v13.8h,  v0.8b,  v7.8b\n"
2442a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v20.4s, v14.8h\n"
2443a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v14.8h,  v1.8b,  v6.8b\n"
2444a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v21.4s, v15.8h\n"
2445a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v15.8h,  v1.8b,  v7.8b\n"
2446a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v12.8h,  v0.16b,  v6.16b\n"
2447a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v13.8h,  v0.16b,  v7.16b\n"
2448a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v14.8h,  v1.16b,  v6.16b\n"
2449a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v15.8h,  v1.16b,  v7.16b\n"
2450a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v24.4s, v12.8h\n"
2451a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v12.8h,  v2.8b,  v4.8b\n"
2452a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v28.4s, v13.8h\n"
2453a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v13.8h,  v3.8b,  v4.8b\n"
2454a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v25.4s, v14.8h\n"
2455a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v14.8h,  v2.8b,  v5.8b\n"
2456a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v29.4s, v15.8h\n"
2457a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v15.8h,  v3.8b,  v5.8b\n"
2458a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v12.8h,  v2.16b,  v4.16b\n"
2459a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v13.8h,  v3.16b,  v4.16b\n"
2460a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v14.8h,  v2.16b,  v5.16b\n"
2461a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v15.8h,  v3.16b,  v5.16b\n"
2462a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v18.4s, v12.8h\n"
2463a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v12.8h,  v2.8b,  v6.8b\n"
2464a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v19.4s, v13.8h\n"
2465a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v13.8h,  v2.8b,  v7.8b\n"
2466a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v22.4s, v14.8h\n"
2467a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v14.8h,  v3.8b,  v6.8b\n"
2468a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v23.4s, v15.8h\n"
2469a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smull    v15.8h,  v3.8b,  v7.8b\n"
2470a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v12.8h,  v2.16b,  v6.16b\n"
2471a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v13.8h,  v2.16b,  v7.16b\n"
2472a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v14.8h,  v3.16b,  v6.16b\n"
2473a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "smlal2   v15.8h,  v3.16b,  v7.16b\n"
2474a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v26.4s, v12.8h\n"
2475a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v30.4s, v13.8h\n"
2476a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v27.4s, v14.8h\n"
2477a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "sadalp  v31.4s, v15.8h\n"
2478a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2479a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Reduce aggregators horizontally
2480a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "addp v0.4s, v16.4s, v20.4s\n"
2481a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "addp v1.4s, v17.4s, v21.4s\n"
2482a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "addp v2.4s, v18.4s, v22.4s\n"
2483a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "addp v3.4s, v19.4s, v23.4s\n"
2484a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "addp v4.4s, v24.4s, v28.4s\n"
2485a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "addp v5.4s, v25.4s, v29.4s\n"
2486a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "addp v6.4s, v26.4s, v30.4s\n"
2487a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "addp v7.4s, v27.4s, v31.4s\n"
2488a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2489a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "addp v12.4s, v0.4s, v4.4s\n"
2490a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "addp v13.4s, v1.4s, v5.4s\n"
2491a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "addp v14.4s, v2.4s, v6.4s\n"
2492a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "addp v15.4s, v3.4s, v7.4s\n"
2493a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2494a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Add to the accumulators loaded from memory
2495a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "add v8.4s, v8.4s, v12.4s\n"
2496a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "add v9.4s, v9.4s, v13.4s\n"
2497a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "add v10.4s, v10.4s, v14.4s\n"
2498a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "add v11.4s, v11.4s, v15.4s\n"
2499a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2500a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Store accumulators back to memory
2501a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v8.16b}, [x0], #16\n"
2502a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v9.16b}, [x0], #16\n"
2503a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v10.16b}, [x0], #16\n"
2504a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v11.16b}, [x0], #16\n"
2505a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // outputs
2506a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
2507a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [dst_ptr] "+r"(dst_ptr), [run_depth] "+r"(run_depth),
2508a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [dst_col_stride] "+r"(dst_col_stride)
2509a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // inputs
2510a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [start_depth] "r"(start_depth)
2511a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // clobbers
2512a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2513a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
2514a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
2515a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "v28", "v29", "v30", "v31");
2516a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
2517a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang};
2518a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
25197d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#ifdef __ARM_FEATURE_DOTPROD
25207d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// Kernels utilizing the Armv8.2 Dot Product extension.
25217d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang//
25227d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// The dot product instructions work by taking 4 consecutive 8-bit depth
25237d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// values from each operand, multiplying the 4 pairs together and
25247d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// accumulating all the results into the corresponding 32-bit accumulator
25257d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// lane.  As such, the operation is identical to a 32-bit instruction (like
25267d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// FMLA used in SGEMM), except that 4 depth values are processed at a time
25277d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// instead of 1.
25287d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
25297d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// Thus, this first kernel is a carbon copy of
25307d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// "NEON_64bit_GEMM_Float32_WithScalar_A57" (which should provide good
25317d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// performance for most processors) below with the opcode (fmla -> udot) and
25327d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// types (float32 -> uint8/uint32) changed.
25337d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang//
25347d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// A signed version of this kernel could be produced by replacing "udot"
25357d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// with "sdot" - performance should be identical to this udot kernel.
25367d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wangstruct NEON_64bit_GEMM_Uint8Operands_Uint32Accumulators_dotproduct {
25377d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  typedef std::uint8_t OperandType;
25387d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  typedef std::uint32_t AccumulatorType;
25397d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  typedef KernelFormat<
25407d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      KernelSideFormat<CellFormat<4, 4, CellOrder::WidthMajor>, 3>,
25417d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      KernelSideFormat<CellFormat<4, 4, CellOrder::WidthMajor>, 2> >
25427d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      Format;
25437d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
25447d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang                  AccumulatorType* accum_ptr, int depth) {
25457d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang    asm volatile(
25467d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Load accumulators
25477d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "mov x0, %[accum_ptr]\n"
25487d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v8.4s}, [x0], #16\n"
25497d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v16.4s}, [x0], #16\n"
25507d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v24.4s}, [x0], #16\n"
25517d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v9.4s}, [x0], #16\n"
25527d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v17.4s}, [x0], #16\n"
25537d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v25.4s}, [x0], #16\n"
25547d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v10.4s}, [x0], #16\n"
25557d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v18.4s}, [x0], #16\n"
25567d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v26.4s}, [x0], #16\n"
25577d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v11.4s}, [x0], #16\n"
25587d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v19.4s}, [x0], #16\n"
25597d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v27.4s}, [x0], #16\n"
25607d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v12.4s}, [x0], #16\n"
25617d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v20.4s}, [x0], #16\n"
25627d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v28.4s}, [x0], #16\n"
25637d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v13.4s}, [x0], #16\n"
25647d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v21.4s}, [x0], #16\n"
25657d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v29.4s}, [x0], #16\n"
25667d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v14.4s}, [x0], #16\n"
25677d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v22.4s}, [x0], #16\n"
25687d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v30.4s}, [x0], #16\n"
25697d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v15.4s}, [x0], #16\n"
25707d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v23.4s}, [x0], #16\n"
25717d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v31.4s}, [x0], #16\n"
25727d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
25737d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // The start of the loop assumes first Rhs cell is already loaded, so
25747d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // do it here for first iteration.
25757d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v0.16b}, [%[rhs_ptr]], #16\n"
25767d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
25777d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // And the same for the first Lhs cell.
25787d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v2.16b}, [%[lhs_ptr]], #16\n"
25797d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
25807d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        GEMMLOWP_LABEL_LOOP
25817d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        ":\n"
25827d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
25837d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Start the MACs at the head of the loop - 1st cell from each side
25847d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // already loaded.
25857d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v8.4s, v2.16b, v0.b[0]\n"
25867d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v9.4s, v2.16b, v0.b[1]\n"
25877d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v1.16b}, [%[rhs_ptr]], #16\n"  // Load second Rhs cell.
25887d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v10.4s, v2.16b, v0.b[2]\n"
25897d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v11.4s, v2.16b, v0.b[3]\n"
25907d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v3.16b}, [%[lhs_ptr]], #16\n"  // Load second Lhs cell.
25917d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v12.4s, v2.16b, v1.b[0]\n"
25927d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v13.4s, v2.16b, v1.b[1]\n"
25937d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v4.16b}, [%[lhs_ptr]], #16\n"  // Load third Lhs cell.
25947d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v14.4s, v2.16b, v1.b[2]\n"
25957d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v15.4s, v2.16b, v1.b[3]\n"
25967d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v2.16b}, [%[lhs_ptr]], #16\n"  // Done with first Lhs cell - load
25977d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // for the next iteration early.
25987d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v16.4s, v3.16b, v0.b[0]\n"
25997d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v17.4s, v3.16b, v0.b[1]\n"
26007d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v18.4s, v3.16b, v0.b[2]\n"
26017d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v19.4s, v3.16b, v0.b[3]\n"
26027d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v20.4s, v3.16b, v1.b[0]\n"
26037d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v21.4s, v3.16b, v1.b[1]\n"
26047d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v22.4s, v3.16b, v1.b[2]\n"
26057d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v23.4s, v3.16b, v1.b[3]\n"
26067d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v24.4s, v4.16b, v0.b[0]\n"
26077d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v25.4s, v4.16b, v0.b[1]\n"
26087d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v26.4s, v4.16b, v0.b[2]\n"
26097d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v27.4s, v4.16b, v0.b[3]\n"
26107d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v0.16b}, [%[rhs_ptr]], #16\n"  // Done with the first Rhs cell -
26117d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // load for the next iteration early.
26127d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v28.4s, v4.16b, v1.b[0]\n"
26137d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v29.4s, v4.16b, v1.b[1]\n"
26147d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
26157d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Loop.  Decrement loop index (depth) by 4 as udot processes 4
26167d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // depth values.
26177d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "subs %w[depth], %w[depth], #4\n"
26187d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v30.4s, v4.16b, v1.b[2]\n"
26197d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v31.4s, v4.16b, v1.b[3]\n"
26207d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
26217d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "bne " GEMMLOWP_LABEL_LOOP
26227d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "b\n"
26237d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
26247d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Store accumulators
26257d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "mov x0, %[accum_ptr]\n"
26267d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v8.16b}, [x0], #16\n"
26277d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v16.16b}, [x0], #16\n"
26287d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v24.16b}, [x0], #16\n"
26297d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v9.16b}, [x0], #16\n"
26307d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v17.16b}, [x0], #16\n"
26317d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v25.16b}, [x0], #16\n"
26327d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v10.16b}, [x0], #16\n"
26337d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v18.16b}, [x0], #16\n"
26347d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v26.16b}, [x0], #16\n"
26357d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v11.16b}, [x0], #16\n"
26367d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v19.16b}, [x0], #16\n"
26377d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v27.16b}, [x0], #16\n"
26387d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v12.16b}, [x0], #16\n"
26397d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v20.16b}, [x0], #16\n"
26407d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v28.16b}, [x0], #16\n"
26417d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v13.16b}, [x0], #16\n"
26427d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v21.16b}, [x0], #16\n"
26437d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v29.16b}, [x0], #16\n"
26447d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v14.16b}, [x0], #16\n"
26457d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v22.16b}, [x0], #16\n"
26467d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v30.16b}, [x0], #16\n"
26477d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v15.16b}, [x0], #16\n"
26487d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v23.16b}, [x0], #16\n"
26497d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v31.16b}, [x0], #16\n"
26507d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        :  // outputs
26517d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
26527d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        [depth] "+r"(depth)
26537d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        :  // inputs
26547d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        [accum_ptr] "r"(accum_ptr)
26557d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        :  // clobbers
26567d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
26577d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
26587d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
26597d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "v28", "v29", "v30", "v31");
26607d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  }
26617d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang};
26627d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
26637d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// As above, except tuned for Cortex-A55r1.
26647d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang//
26657d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// Similarly, this is a clone of NEON_64bit_GEMM_Float32_WithScalar_A55r1
26667d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// with the names changed.
26677d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wangstruct NEON_64bit_GEMM_Uint8Operands_Uint32Accumulators_dotproduct_A55r1 {
26687d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  typedef std::uint8_t OperandType;
26697d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  typedef std::uint32_t AccumulatorType;
26707d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  typedef KernelFormat<
26717d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      KernelSideFormat<CellFormat<4, 4, CellOrder::WidthMajor>, 3>,
26727d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      KernelSideFormat<CellFormat<4, 4, CellOrder::WidthMajor>, 2> >
26737d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      Format;
26747d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
26757d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang                  AccumulatorType* accum_ptr, int depth) {
26767d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang    asm volatile(
26777d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Load accumulators
26787d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "mov x0, %[accum_ptr]\n"
26797d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v8.4s}, [x0], #16\n"
26807d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v16.4s}, [x0], #16\n"
26817d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v24.4s}, [x0], #16\n"
26827d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v9.4s}, [x0], #16\n"
26837d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v17.4s}, [x0], #16\n"
26847d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v25.4s}, [x0], #16\n"
26857d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v10.4s}, [x0], #16\n"
26867d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v18.4s}, [x0], #16\n"
26877d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v26.4s}, [x0], #16\n"
26887d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v11.4s}, [x0], #16\n"
26897d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v19.4s}, [x0], #16\n"
26907d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v27.4s}, [x0], #16\n"
26917d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v12.4s}, [x0], #16\n"
26927d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v20.4s}, [x0], #16\n"
26937d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v28.4s}, [x0], #16\n"
26947d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v13.4s}, [x0], #16\n"
26957d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v21.4s}, [x0], #16\n"
26967d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v29.4s}, [x0], #16\n"
26977d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v14.4s}, [x0], #16\n"
26987d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v22.4s}, [x0], #16\n"
26997d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v30.4s}, [x0], #16\n"
27007d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v15.4s}, [x0], #16\n"
27017d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v23.4s}, [x0], #16\n"
27027d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v31.4s}, [x0], #16\n"
27037d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
27047d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // For details on how this kernel works, see the Float32 kernel below.
27057d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
27067d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldr d0, [%[rhs_ptr]]\n"
27077d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldr x18, [%[rhs_ptr], #8]\n"
27087d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
27097d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldr q2, [%[lhs_ptr]]\n"
27107d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldr q3, [%[lhs_ptr], #16]\n"
27117d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
27127d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        GEMMLOWP_LABEL_LOOP
27137d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        ":\n"
27147d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
27157d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v8.4s, v2.16b, v0.b[0]\n"
27167d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldr d1, [%[rhs_ptr], #16]\n"         // Bottom half of v1
27177d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v9.4s, v2.16b, v0.b[1]\n"
27187d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ins v0.d[1], x18\n"                  // Finish loading v0
27197d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v16.4s, v3.16b, v0.b[0]\n"      // out of sequence - used to reduce load/use pressure.
27207d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldr x18, [%[rhs_ptr], #24]\n"        // Top half of v1 to X register
27217d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v17.4s, v3.16b, v0.b[1]\n"      // out of sequence - used to reduce load/use pressure.
27227d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "add %[rhs_ptr], %[rhs_ptr], #32\n"   // RHS loads complete - increment pointer.
27237d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v10.4s, v2.16b, v0.b[2]\n"
27247d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldr d4, [%[lhs_ptr], #32]\n"         // Bottom half of v4
27257d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v11.4s, v2.16b, v0.b[3]\n"
27267d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ins v1.d[1], x18\n"                  // Finish loading v1
27277d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v12.4s, v2.16b, v1.b[0]\n"
27287d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldr x18, [%[lhs_ptr], #40]\n"        // Top half of v4 to X register
27297d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v13.4s, v2.16b, v1.b[1]\n"
27307d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "add %[lhs_ptr], %[lhs_ptr], #48\n"   // LHS loads complete - increment pointer.
27317d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v14.4s, v2.16b, v1.b[2]\n"
27327d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
27337d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v15.4s, v2.16b, v1.b[3]\n"
27347d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldr d2, [%[lhs_ptr]]\n"              // Bottom half of v2 (for next time)
27357d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v18.4s, v3.16b, v0.b[2]\n"
27367d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ins v4.d[1], x18\n"                  // Finish loading v4
27377d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v19.4s, v3.16b, v0.b[3]\n"
27387d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldr x18, [%[lhs_ptr], #8]\n"         // Top half of next v2 to X register
27397d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v20.4s, v3.16b, v1.b[0]\n"
27407d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "subs %w[depth], %w[depth], #4\n"
27417d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v21.4s, v3.16b, v1.b[1]\n"
27427d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
27437d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v22.4s, v3.16b, v1.b[2]\n"
27447d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
27457d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v23.4s, v3.16b, v1.b[3]\n"
27467d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldr d3, [%[lhs_ptr], #16]\n"         // Bottom half of v3 (for next time)
27477d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v24.4s, v4.16b, v0.b[0]\n"
27487d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ins v2.d[1], x18\n"                  // Finish loading next v2
27497d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v25.4s, v4.16b, v0.b[1]\n"
27507d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldr x18, [%[lhs_ptr], #24]\n"        // Top half of next v3 to X register
27517d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v26.4s, v4.16b, v0.b[2]\n"
27527d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
27537d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v27.4s, v4.16b, v0.b[3]\n"
27547d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldr d0, [%[rhs_ptr]]\n"              // Bottom half of v0 (for next time)
27557d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v28.4s, v4.16b, v1.b[0]\n"
27567d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ins v3.d[1], x18\n"                  // Finish loading next v3
27577d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v29.4s, v4.16b, v1.b[1]\n"
27587d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldr x18, [%[rhs_ptr], #8]\n"         // Top half of next v0 to X register
27597d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v30.4s, v4.16b, v1.b[2]\n"
27607d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
27617d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "udot v31.4s, v4.16b, v1.b[3]\n"
27627d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "bne " GEMMLOWP_LABEL_LOOP "b\n"
27637d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
27647d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Store accumulators
27657d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "mov x0, %[accum_ptr]\n"
27667d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v8.4s}, [x0], #16\n"
27677d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v16.4s}, [x0], #16\n"
27687d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v24.4s}, [x0], #16\n"
27697d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v9.4s}, [x0], #16\n"
27707d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v17.4s}, [x0], #16\n"
27717d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v25.4s}, [x0], #16\n"
27727d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v10.4s}, [x0], #16\n"
27737d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v18.4s}, [x0], #16\n"
27747d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v26.4s}, [x0], #16\n"
27757d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v11.4s}, [x0], #16\n"
27767d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v19.4s}, [x0], #16\n"
27777d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v27.4s}, [x0], #16\n"
27787d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v12.4s}, [x0], #16\n"
27797d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v20.4s}, [x0], #16\n"
27807d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v28.4s}, [x0], #16\n"
27817d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v13.4s}, [x0], #16\n"
27827d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v21.4s}, [x0], #16\n"
27837d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v29.4s}, [x0], #16\n"
27847d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v14.4s}, [x0], #16\n"
27857d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v22.4s}, [x0], #16\n"
27867d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v30.4s}, [x0], #16\n"
27877d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v15.4s}, [x0], #16\n"
27887d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v23.4s}, [x0], #16\n"
27897d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v31.4s}, [x0], #16\n"
27907d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        :  // outputs
27917d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
27927d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        [depth] "+r"(depth)
27937d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        :  // inputs
27947d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        [accum_ptr] "r"(accum_ptr)
27957d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        :  // clobbers
27967d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "cc", "memory", "x0", "x18", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
27977d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16",
27987d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
27997d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "v27", "v28", "v29", "v30", "v31");
28007d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  }
28017d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang};
28027d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#endif  // __ARM_FEATURE_DOTPROD
28037d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
2804a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// We don't actually use int32*int32 in production. This is just an
2805a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// experiment to help dissociate the effect of integer-vs-float, from the
2806a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// effect of operands width.
2807a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_64bit_GEMM_Int32_WithScalar {
2808a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef std::int32_t OperandType;
2809a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef std::int32_t AccumulatorType;
2810a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef KernelFormat<
2811a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
2812a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 2> >
2813a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      Format;
2814a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
2815a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                  AccumulatorType* accum_ptr, int depth) {
2816a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    asm volatile(
2817a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load accumulators
2818a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov x0, %[accum_ptr]\n"
2819a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v8.16b}, [x0], #16\n"
2820a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v16.16b}, [x0], #16\n"
2821a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v24.16b}, [x0], #16\n"
2822a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v9.16b}, [x0], #16\n"
2823a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v17.16b}, [x0], #16\n"
2824a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v25.16b}, [x0], #16\n"
2825a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v10.16b}, [x0], #16\n"
2826a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v18.16b}, [x0], #16\n"
2827a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v26.16b}, [x0], #16\n"
2828a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v11.16b}, [x0], #16\n"
2829a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v19.16b}, [x0], #16\n"
2830a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v27.16b}, [x0], #16\n"
2831a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v12.16b}, [x0], #16\n"
2832a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v20.16b}, [x0], #16\n"
2833a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v28.16b}, [x0], #16\n"
2834a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v13.16b}, [x0], #16\n"
2835a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v21.16b}, [x0], #16\n"
2836a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v29.16b}, [x0], #16\n"
2837a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v14.16b}, [x0], #16\n"
2838a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v22.16b}, [x0], #16\n"
2839a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v30.16b}, [x0], #16\n"
2840a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v15.16b}, [x0], #16\n"
2841a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v23.16b}, [x0], #16\n"
2842a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v31.16b}, [x0], #16\n"
2843a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2844a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        GEMMLOWP_LABEL_LOOP
2845a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        ":\n"
2846a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2847a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load 2 Rhs cell of size 1x4 each
2848a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v0.4s}, [%[rhs_ptr]], #16\n"
2849a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v1.4s}, [%[rhs_ptr]], #16\n"
2850a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2851a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load 3 Lhs cells of size 4x1 each
2852a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v2.4s}, [%[lhs_ptr]], #16\n"
2853a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v3.4s}, [%[lhs_ptr]], #16\n"
2854a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v4.4s}, [%[lhs_ptr]], #16\n"
2855a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2856a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Multiply-accumulate
2857a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mla v8.4s, v2.4s, v0.s[0]\n"
2858a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mla v9.4s, v2.4s, v0.s[1]\n"
2859a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mla v10.4s, v2.4s, v0.s[2]\n"
2860a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mla v11.4s, v2.4s, v0.s[3]\n"
2861a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mla v12.4s, v2.4s, v1.s[0]\n"
2862a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mla v13.4s, v2.4s, v1.s[1]\n"
2863a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mla v14.4s, v2.4s, v1.s[2]\n"
2864a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mla v15.4s, v2.4s, v1.s[3]\n"
2865a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mla v16.4s, v3.4s, v0.s[0]\n"
2866a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mla v17.4s, v3.4s, v0.s[1]\n"
2867a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mla v18.4s, v3.4s, v0.s[2]\n"
2868a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mla v19.4s, v3.4s, v0.s[3]\n"
2869a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mla v20.4s, v3.4s, v1.s[0]\n"
2870a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mla v21.4s, v3.4s, v1.s[1]\n"
2871a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mla v22.4s, v3.4s, v1.s[2]\n"
2872a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mla v23.4s, v3.4s, v1.s[3]\n"
2873a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mla v24.4s, v4.4s, v0.s[0]\n"
2874a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mla v25.4s, v4.4s, v0.s[1]\n"
2875a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mla v26.4s, v4.4s, v0.s[2]\n"
2876a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mla v27.4s, v4.4s, v0.s[3]\n"
2877a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mla v28.4s, v4.4s, v1.s[0]\n"
2878a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mla v29.4s, v4.4s, v1.s[1]\n"
2879a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mla v30.4s, v4.4s, v1.s[2]\n"
2880a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mla v31.4s, v4.4s, v1.s[3]\n"
2881a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2882a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Loop. Decrement loop index (depth) by 1, since we just handled 1
2883a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // level of depth.
2884a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "subs %w[depth], %w[depth], #1\n"
2885a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "bne " GEMMLOWP_LABEL_LOOP
2886a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "b\n"
2887a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2888a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Store accumulators
2889a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov x0, %[accum_ptr]\n"
2890a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v8.16b}, [x0], #16\n"
2891a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v16.16b}, [x0], #16\n"
2892a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v24.16b}, [x0], #16\n"
2893a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v9.16b}, [x0], #16\n"
2894a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v17.16b}, [x0], #16\n"
2895a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v25.16b}, [x0], #16\n"
2896a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v10.16b}, [x0], #16\n"
2897a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v18.16b}, [x0], #16\n"
2898a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v26.16b}, [x0], #16\n"
2899a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v11.16b}, [x0], #16\n"
2900a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v19.16b}, [x0], #16\n"
2901a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v27.16b}, [x0], #16\n"
2902a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v12.16b}, [x0], #16\n"
2903a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v20.16b}, [x0], #16\n"
2904a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v28.16b}, [x0], #16\n"
2905a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v13.16b}, [x0], #16\n"
2906a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v21.16b}, [x0], #16\n"
2907a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v29.16b}, [x0], #16\n"
2908a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v14.16b}, [x0], #16\n"
2909a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v22.16b}, [x0], #16\n"
2910a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v30.16b}, [x0], #16\n"
2911a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v15.16b}, [x0], #16\n"
2912a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v23.16b}, [x0], #16\n"
2913a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v31.16b}, [x0], #16\n"
2914a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // outputs
2915a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
2916a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [depth] "+r"(depth)
2917a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // inputs
2918a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [accum_ptr] "r"(accum_ptr)
2919a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // clobbers
2920a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2921a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
2922a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
2923a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "v28", "v29", "v30", "v31");
2924a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
2925a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang};
2926a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2927a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Not very efficient kernel, just an experiment to see what we can do
2928a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// without using NEON multiply-with-scalar instructions.
2929a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_64bit_GEMM_Float32_WithVectorDuplicatingScalar {
2930a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef float OperandType;
2931a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef float AccumulatorType;
2932a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef KernelFormat<
2933a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
2934a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 2> >
2935a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      Format;
2936a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
2937a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                  AccumulatorType* accum_ptr, int depth) {
2938a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    asm volatile(
2939a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load accumulators
2940a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov x0, %[accum_ptr]\n"
2941a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v8.16b}, [x0], #16\n"
2942a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v16.16b}, [x0], #16\n"
2943a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v24.16b}, [x0], #16\n"
2944a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v9.16b}, [x0], #16\n"
2945a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v17.16b}, [x0], #16\n"
2946a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v25.16b}, [x0], #16\n"
2947a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v10.16b}, [x0], #16\n"
2948a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v18.16b}, [x0], #16\n"
2949a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v26.16b}, [x0], #16\n"
2950a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v11.16b}, [x0], #16\n"
2951a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v19.16b}, [x0], #16\n"
2952a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v27.16b}, [x0], #16\n"
2953a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v12.16b}, [x0], #16\n"
2954a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v20.16b}, [x0], #16\n"
2955a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v28.16b}, [x0], #16\n"
2956a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v13.16b}, [x0], #16\n"
2957a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v21.16b}, [x0], #16\n"
2958a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v29.16b}, [x0], #16\n"
2959a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v14.16b}, [x0], #16\n"
2960a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v22.16b}, [x0], #16\n"
2961a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v30.16b}, [x0], #16\n"
2962a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v15.16b}, [x0], #16\n"
2963a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v23.16b}, [x0], #16\n"
2964a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v31.16b}, [x0], #16\n"
2965a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2966a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        GEMMLOWP_LABEL_LOOP
2967a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        ":\n"
2968a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2969a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load 2 Rhs cell of size 1x4 each
2970a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v5.4s}, [%[rhs_ptr]], #16\n"
2971a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v6.4s}, [%[rhs_ptr]], #16\n"
2972a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2973a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load 3 Lhs cells of size 4x1 each
2974a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v2.4s}, [%[lhs_ptr]], #16\n"
2975a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v3.4s}, [%[lhs_ptr]], #16\n"
2976a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v4.4s}, [%[lhs_ptr]], #16\n"
2977a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2978a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Multiply-accumulate
2979a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v0.4s, v5.s[0]\n"
2980a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v1.4s, v5.s[1]\n"
2981a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v8.4s, v2.4s, v0.4s\n"
2982a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v16.4s, v3.4s, v0.4s\n"
2983a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v24.4s, v4.4s, v0.4s\n"
2984a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v9.4s, v2.4s, v1.4s\n"
2985a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v17.4s, v3.4s, v1.4s\n"
2986a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v25.4s, v4.4s, v1.4s\n"
2987a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v0.4s, v5.s[2]\n"
2988a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v1.4s, v5.s[3]\n"
2989a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v10.4s, v2.4s, v0.4s\n"
2990a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v18.4s, v3.4s, v0.4s\n"
2991a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v26.4s, v4.4s, v0.4s\n"
2992a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v11.4s, v2.4s, v1.4s\n"
2993a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v19.4s, v3.4s, v1.4s\n"
2994a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v27.4s, v4.4s, v1.4s\n"
2995a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v0.4s, v6.s[0]\n"
2996a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v1.4s, v6.s[1]\n"
2997a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v12.4s, v2.4s, v0.4s\n"
2998a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v20.4s, v3.4s, v0.4s\n"
2999a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v28.4s, v4.4s, v0.4s\n"
3000a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v13.4s, v2.4s, v1.4s\n"
3001a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v21.4s, v3.4s, v1.4s\n"
3002a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v29.4s, v4.4s, v1.4s\n"
3003a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v0.4s, v6.s[2]\n"
3004a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "dup v1.4s, v6.s[3]\n"
3005a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v14.4s, v2.4s, v0.4s\n"
3006a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v22.4s, v3.4s, v0.4s\n"
3007a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v30.4s, v4.4s, v0.4s\n"
3008a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v15.4s, v2.4s, v1.4s\n"
3009a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v23.4s, v3.4s, v1.4s\n"
3010a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v31.4s, v4.4s, v1.4s\n"
3011a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3012a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Loop. Decrement loop index (depth) by 1, since we just handled 1
3013a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // level of depth.
3014a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "subs %w[depth], %w[depth], #1\n"
3015a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "bne " GEMMLOWP_LABEL_LOOP
3016a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "b\n"
3017a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3018a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Store accumulators
3019a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov x0, %[accum_ptr]\n"
3020a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v8.16b}, [x0], #16\n"
3021a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v16.16b}, [x0], #16\n"
3022a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v24.16b}, [x0], #16\n"
3023a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v9.16b}, [x0], #16\n"
3024a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v17.16b}, [x0], #16\n"
3025a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v25.16b}, [x0], #16\n"
3026a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v10.16b}, [x0], #16\n"
3027a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v18.16b}, [x0], #16\n"
3028a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v26.16b}, [x0], #16\n"
3029a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v11.16b}, [x0], #16\n"
3030a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v19.16b}, [x0], #16\n"
3031a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v27.16b}, [x0], #16\n"
3032a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v12.16b}, [x0], #16\n"
3033a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v20.16b}, [x0], #16\n"
3034a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v28.16b}, [x0], #16\n"
3035a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v13.16b}, [x0], #16\n"
3036a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v21.16b}, [x0], #16\n"
3037a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v29.16b}, [x0], #16\n"
3038a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v14.16b}, [x0], #16\n"
3039a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v22.16b}, [x0], #16\n"
3040a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v30.16b}, [x0], #16\n"
3041a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v15.16b}, [x0], #16\n"
3042a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v23.16b}, [x0], #16\n"
3043a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v31.16b}, [x0], #16\n"
3044a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // outputs
3045a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
3046a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [depth] "+r"(depth)
3047a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // inputs
3048a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [accum_ptr] "r"(accum_ptr)
3049a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // clobbers
3050a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
3051a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
3052a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
3053a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "v28", "v29", "v30", "v31");
3054a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
3055a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang};
3056a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3057a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// This is the "most natural" kernel, using NEON multiply-with-scalar
3058a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// instructions.
3059a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_64bit_GEMM_Float32_WithScalar {
3060a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef float OperandType;
3061a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef float AccumulatorType;
3062a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef KernelFormat<
3063a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
3064a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 2> >
3065a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      Format;
3066a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
3067a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                  AccumulatorType* accum_ptr, int depth) {
3068a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    asm volatile(
3069a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load accumulators
3070a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov x0, %[accum_ptr]\n"
3071a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v8.16b}, [x0], #16\n"
3072a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v16.16b}, [x0], #16\n"
3073a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v24.16b}, [x0], #16\n"
3074a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v9.16b}, [x0], #16\n"
3075a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v17.16b}, [x0], #16\n"
3076a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v25.16b}, [x0], #16\n"
3077a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v10.16b}, [x0], #16\n"
3078a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v18.16b}, [x0], #16\n"
3079a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v26.16b}, [x0], #16\n"
3080a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v11.16b}, [x0], #16\n"
3081a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v19.16b}, [x0], #16\n"
3082a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v27.16b}, [x0], #16\n"
3083a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v12.16b}, [x0], #16\n"
3084a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v20.16b}, [x0], #16\n"
3085a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v28.16b}, [x0], #16\n"
3086a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v13.16b}, [x0], #16\n"
3087a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v21.16b}, [x0], #16\n"
3088a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v29.16b}, [x0], #16\n"
3089a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v14.16b}, [x0], #16\n"
3090a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v22.16b}, [x0], #16\n"
3091a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v30.16b}, [x0], #16\n"
3092a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v15.16b}, [x0], #16\n"
3093a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v23.16b}, [x0], #16\n"
3094a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v31.16b}, [x0], #16\n"
3095a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3096a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        GEMMLOWP_LABEL_LOOP
3097a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        ":\n"
3098a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3099a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load 2 Rhs cell of size 1x4 each
3100a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v0.4s}, [%[rhs_ptr]], #16\n"
3101a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v1.4s}, [%[rhs_ptr]], #16\n"
3102a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3103a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load 3 Lhs cells of size 4x1 each
3104a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v2.4s}, [%[lhs_ptr]], #16\n"
3105a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v3.4s}, [%[lhs_ptr]], #16\n"
3106a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v4.4s}, [%[lhs_ptr]], #16\n"
3107a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3108a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Multiply-accumulate
3109a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v8.4s, v2.4s, v0.s[0]\n"
3110a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v9.4s, v2.4s, v0.s[1]\n"
3111a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v10.4s, v2.4s, v0.s[2]\n"
3112a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v11.4s, v2.4s, v0.s[3]\n"
3113a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v12.4s, v2.4s, v1.s[0]\n"
3114a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v13.4s, v2.4s, v1.s[1]\n"
3115a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v14.4s, v2.4s, v1.s[2]\n"
3116a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v15.4s, v2.4s, v1.s[3]\n"
3117a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v16.4s, v3.4s, v0.s[0]\n"
3118a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v17.4s, v3.4s, v0.s[1]\n"
3119a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v18.4s, v3.4s, v0.s[2]\n"
3120a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v19.4s, v3.4s, v0.s[3]\n"
3121a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v20.4s, v3.4s, v1.s[0]\n"
3122a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v21.4s, v3.4s, v1.s[1]\n"
3123a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v22.4s, v3.4s, v1.s[2]\n"
3124a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v23.4s, v3.4s, v1.s[3]\n"
3125a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v24.4s, v4.4s, v0.s[0]\n"
3126a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v25.4s, v4.4s, v0.s[1]\n"
3127a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v26.4s, v4.4s, v0.s[2]\n"
3128a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v27.4s, v4.4s, v0.s[3]\n"
3129a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v28.4s, v4.4s, v1.s[0]\n"
3130a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v29.4s, v4.4s, v1.s[1]\n"
3131a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v30.4s, v4.4s, v1.s[2]\n"
3132a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v31.4s, v4.4s, v1.s[3]\n"
3133a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3134a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Loop. Decrement loop index (depth) by 1, since we just handled 1
3135a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // level of depth.
3136a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "subs %w[depth], %w[depth], #1\n"
3137a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "bne " GEMMLOWP_LABEL_LOOP
3138a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "b\n"
3139a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3140a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Store accumulators
3141a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov x0, %[accum_ptr]\n"
3142a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v8.16b}, [x0], #16\n"
3143a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v16.16b}, [x0], #16\n"
3144a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v24.16b}, [x0], #16\n"
3145a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v9.16b}, [x0], #16\n"
3146a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v17.16b}, [x0], #16\n"
3147a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v25.16b}, [x0], #16\n"
3148a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v10.16b}, [x0], #16\n"
3149a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v18.16b}, [x0], #16\n"
3150a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v26.16b}, [x0], #16\n"
3151a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v11.16b}, [x0], #16\n"
3152a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v19.16b}, [x0], #16\n"
3153a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v27.16b}, [x0], #16\n"
3154a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v12.16b}, [x0], #16\n"
3155a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v20.16b}, [x0], #16\n"
3156a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v28.16b}, [x0], #16\n"
3157a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v13.16b}, [x0], #16\n"
3158a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v21.16b}, [x0], #16\n"
3159a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v29.16b}, [x0], #16\n"
3160a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v14.16b}, [x0], #16\n"
3161a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v22.16b}, [x0], #16\n"
3162a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v30.16b}, [x0], #16\n"
3163a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v15.16b}, [x0], #16\n"
3164a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v23.16b}, [x0], #16\n"
3165a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v31.16b}, [x0], #16\n"
3166a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // outputs
3167a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
3168a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [depth] "+r"(depth)
3169a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // inputs
3170a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [accum_ptr] "r"(accum_ptr)
3171a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // clobbers
3172a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
3173a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
3174a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
3175a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "v28", "v29", "v30", "v31");
3176a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
3177a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang};
3178a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3179a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Faster kernel contributed by ARM. Tuned for A57.
//
// Float32 kernel written as AArch64 inline assembly.  Each level of depth
// consumes three 4-float Lhs cells (v2, v3, v4) and two 4-float Rhs cells
// (v0, v1) and updates 24 four-float accumulators (v8-v31): every Lhs vector
// is multiplied by each of the eight Rhs scalars and added to its own
// accumulator register.
//
// The accumulator block at accum_ptr is read on entry and written back on
// exit, so the products are added to whatever it already contains.  In memory
// the block is eight groups of three vectors, one group per Rhs scalar, each
// group holding the results for v2, v3 and v4 in that order.
//
// NOTE(review): the loop body runs before the counter is tested, so depth is
// expected to be at least 1 - confirm against callers.
// NOTE(review): every iteration preloads the first Lhs and Rhs cell of the
// following iteration, so one 16-byte cell past the last consumed one is read
// from both lhs_ptr and rhs_ptr; callers presumably guarantee that this memory
// is readable - confirm.
3180a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_64bit_GEMM_Float32_WithScalar_A57 {
3181a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef float OperandType;
3182a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef float AccumulatorType;
  // Lhs side: 3 cells, Rhs side: 2 cells; every cell is
  // CellFormat<4, 1, CellOrder::DepthMajor>.
3183a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef KernelFormat<
3184a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
3185a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 2> >
3186a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      Format;
  // Adds `depth` levels of Lhs x Rhs products to the accumulator block at
  // accum_ptr.  lhs_ptr and rhs_ptr are read sequentially: 48 bytes of Lhs and
  // 32 bytes of Rhs per level of depth, plus the read-ahead described above.
3187a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
3188a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                  AccumulatorType* accum_ptr, int depth) {
3189a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    asm volatile(
3190a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load accumulators
        // x0 is a scratch copy of the pointer: it is post-incremented by the
        // loads while %[accum_ptr] (an input-only operand) stays unchanged for
        // the store sequence at the end.
3191a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov x0, %[accum_ptr]\n"
3192a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v8.16b}, [x0], #16\n"
3193a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v16.16b}, [x0], #16\n"
3194a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v24.16b}, [x0], #16\n"
3195a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v9.16b}, [x0], #16\n"
3196a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v17.16b}, [x0], #16\n"
3197a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v25.16b}, [x0], #16\n"
3198a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v10.16b}, [x0], #16\n"
3199a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v18.16b}, [x0], #16\n"
3200a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v26.16b}, [x0], #16\n"
3201a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v11.16b}, [x0], #16\n"
3202a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v19.16b}, [x0], #16\n"
3203a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v27.16b}, [x0], #16\n"
3204a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v12.16b}, [x0], #16\n"
3205a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v20.16b}, [x0], #16\n"
3206a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v28.16b}, [x0], #16\n"
3207a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v13.16b}, [x0], #16\n"
3208a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v21.16b}, [x0], #16\n"
3209a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v29.16b}, [x0], #16\n"
3210a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v14.16b}, [x0], #16\n"
3211a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v22.16b}, [x0], #16\n"
3212a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v30.16b}, [x0], #16\n"
3213a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v15.16b}, [x0], #16\n"
3214a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v23.16b}, [x0], #16\n"
3215a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v31.16b}, [x0], #16\n"
3216a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3217a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // The start of the loop assumes first Rhs cell is already loaded, so
3218a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // do it here for first iteration.
3219a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v0.4s}, [%[rhs_ptr]], #16\n"
3220a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3221a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // And the same for the first Lhs cell.
3222a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v2.4s}, [%[lhs_ptr]], #16\n"
3223a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3224a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        GEMMLOWP_LABEL_LOOP
3225a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        ":\n"
3226a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3227a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Start the MACs at the head of the loop - 1st cell from each side
3228a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // already loaded.
3229a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v8.4s, v2.4s, v0.s[0]\n"
3230a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v9.4s, v2.4s, v0.s[1]\n"
3231a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v1.4s}, [%[rhs_ptr]], #16\n"  // Load second Rhs cell.
3232a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v10.4s, v2.4s, v0.s[2]\n"
3233a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v11.4s, v2.4s, v0.s[3]\n"
3234a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v3.4s}, [%[lhs_ptr]], #16\n"  // Load second Lhs cell.
3235a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v12.4s, v2.4s, v1.s[0]\n"
3236a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v13.4s, v2.4s, v1.s[1]\n"
3237a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v4.4s}, [%[lhs_ptr]], #16\n"  // Load third Lhs cell.
3238a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v14.4s, v2.4s, v1.s[2]\n"
3239a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v15.4s, v2.4s, v1.s[3]\n"
3240a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v2.4s}, [%[lhs_ptr]], #16\n"  // Done with first Lhs cell - load
3241a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // for the next iteration early.
3242a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v16.4s, v3.4s, v0.s[0]\n"
3243a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v17.4s, v3.4s, v0.s[1]\n"
3244a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v18.4s, v3.4s, v0.s[2]\n"
3245a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v19.4s, v3.4s, v0.s[3]\n"
3246a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v20.4s, v3.4s, v1.s[0]\n"
3247a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v21.4s, v3.4s, v1.s[1]\n"
3248a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v22.4s, v3.4s, v1.s[2]\n"
3249a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v23.4s, v3.4s, v1.s[3]\n"
3250a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v24.4s, v4.4s, v0.s[0]\n"
3251a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v25.4s, v4.4s, v0.s[1]\n"
3252a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v26.4s, v4.4s, v0.s[2]\n"
3253a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v27.4s, v4.4s, v0.s[3]\n"
3254a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v0.4s}, [%[rhs_ptr]], #16\n"  // Done with the first Rhs cell -
3255a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // load for the next iteration
3256a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // early.
3257a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v28.4s, v4.4s, v1.s[0]\n"
3258a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v29.4s, v4.4s, v1.s[1]\n"
3259a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Loop. Decrement loop index (depth) by 1, since we just handled
3260a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // 1 level of depth.  Do this a bit before the end of the loop for
3261a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // better dispatch on A57.
        // The flags set here are consumed by the bne below; the two fmla
        // instructions in between are not flag-setting forms.
3262a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "subs %w[depth], %w[depth], #1\n"
3263a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v30.4s, v4.4s, v1.s[2]\n"
3264a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v31.4s, v4.4s, v1.s[3]\n"
3265a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
        // The adjacent string literals concatenate into one branch
        // instruction, "bne <label>b".  GEMMLOWP_LABEL_LOOP is defined outside
        // this block - presumably a numeric local label, in which case the
        // "b" suffix selects its nearest definition backwards.
3266a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "bne " GEMMLOWP_LABEL_LOOP
3267a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "b\n"
3268a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3269a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Store accumulators
        // Same register order as the load sequence at the top, so the block
        // at accum_ptr keeps its layout.
3270a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov x0, %[accum_ptr]\n"
3271a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v8.16b}, [x0], #16\n"
3272a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v16.16b}, [x0], #16\n"
3273a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v24.16b}, [x0], #16\n"
3274a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v9.16b}, [x0], #16\n"
3275a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v17.16b}, [x0], #16\n"
3276a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v25.16b}, [x0], #16\n"
3277a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v10.16b}, [x0], #16\n"
3278a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v18.16b}, [x0], #16\n"
3279a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v26.16b}, [x0], #16\n"
3280a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v11.16b}, [x0], #16\n"
3281a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v19.16b}, [x0], #16\n"
3282a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v27.16b}, [x0], #16\n"
3283a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v12.16b}, [x0], #16\n"
3284a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v20.16b}, [x0], #16\n"
3285a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v28.16b}, [x0], #16\n"
3286a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v13.16b}, [x0], #16\n"
3287a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v21.16b}, [x0], #16\n"
3288a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v29.16b}, [x0], #16\n"
3289a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v14.16b}, [x0], #16\n"
3290a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v22.16b}, [x0], #16\n"
3291a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v30.16b}, [x0], #16\n"
3292a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v15.16b}, [x0], #16\n"
3293a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v23.16b}, [x0], #16\n"
3294a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v31.16b}, [x0], #16\n"
3295a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // outputs
        // Read-write operands: the assembly advances both pointers and counts
        // depth down.
3296a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
3297a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [depth] "+r"(depth)
3298a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // inputs
3299a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [accum_ptr] "r"(accum_ptr)
3300a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // clobbers
        // v5-v7 are declared clobbered although the assembly above does not
        // touch them.
3301a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
3302a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
3303a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
3304a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "v28", "v29", "v30", "v31");
3305a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
3306a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang};
3307a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3308a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifndef __APPLE__
// NOTE(review): presumably compiled out on Apple targets because the assembly
// below uses x18 as a scratch register, which the Apple ARM64 ABI reserves -
// confirm.
3309a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Faster kernel contributed by ARM. Tuned for A53.
//
// Float32 kernel written as AArch64 inline assembly.  Each level of depth
// consumes three 4-float Lhs cells (v2, v3, v4) and two 4-float Rhs cells
// (v0, v1) and updates 24 four-float accumulators (v8-v31): every Lhs vector
// is multiplied by each of the eight Rhs scalars and added to its own
// accumulator register.
//
// The accumulator block at accum_ptr is read on entry and written back on
// exit, so the products are added to whatever it already contains.  In memory
// the block is eight groups of three vectors, one group per Rhs scalar, each
// group holding the results for v2, v3 and v4 in that order.
//
// NOTE(review): the loop body runs before the counter is tested, so depth is
// expected to be at least 1 - confirm against callers.
// NOTE(review): every iteration preloads the first Lhs and Rhs cell of the
// following iteration, so one 16-byte cell past the last consumed one is read
// from both lhs_ptr and rhs_ptr; callers presumably guarantee that this memory
// is readable - confirm.
3310a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_64bit_GEMM_Float32_WithScalar_A53 {
3311a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef float OperandType;
3312a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef float AccumulatorType;
  // Lhs side: 3 cells, Rhs side: 2 cells; every cell is
  // CellFormat<4, 1, CellOrder::DepthMajor>.
3313a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef KernelFormat<
3314a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
3315a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 2> >
3316a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      Format;
  // Adds `depth` levels of Lhs x Rhs products to the accumulator block at
  // accum_ptr.  lhs_ptr and rhs_ptr are read sequentially: 48 bytes of Lhs and
  // 32 bytes of Rhs per level of depth, plus the read-ahead described above.
3317a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
3318a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                  AccumulatorType* accum_ptr, int depth) {
3319a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    asm volatile(
3320a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Load accumulators
        // x0 is a scratch copy of the pointer: it is post-incremented by the
        // loads while %[accum_ptr] (an input-only operand) stays unchanged for
        // the store sequence at the end.
3321a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov x0, %[accum_ptr]\n"
3322a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v8.16b}, [x0], #16\n"
3323a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v16.16b}, [x0], #16\n"
3324a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v24.16b}, [x0], #16\n"
3325a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v9.16b}, [x0], #16\n"
3326a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v17.16b}, [x0], #16\n"
3327a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v25.16b}, [x0], #16\n"
3328a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v10.16b}, [x0], #16\n"
3329a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v18.16b}, [x0], #16\n"
3330a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v26.16b}, [x0], #16\n"
3331a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v11.16b}, [x0], #16\n"
3332a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v19.16b}, [x0], #16\n"
3333a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v27.16b}, [x0], #16\n"
3334a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v12.16b}, [x0], #16\n"
3335a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v20.16b}, [x0], #16\n"
3336a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v28.16b}, [x0], #16\n"
3337a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v13.16b}, [x0], #16\n"
3338a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v21.16b}, [x0], #16\n"
3339a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v29.16b}, [x0], #16\n"
3340a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v14.16b}, [x0], #16\n"
3341a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v22.16b}, [x0], #16\n"
3342a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v30.16b}, [x0], #16\n"
3343a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v15.16b}, [x0], #16\n"
3344a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v23.16b}, [x0], #16\n"
3345a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v31.16b}, [x0], #16\n"
3346a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3347a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // For A53, a very different-looking loop is needed.
3348a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
3349a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // The main reason for this is that on A53 128-bit loads take two
3350a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // cycles during which no dual issue can occur.  Doing two separate
3351a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // 64-bit loads avoids this issue - they each take one cycle and are
3352a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // able to dual issue.  Since vector register loads don't dual issue
3353a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // with FMLA, we load half the register as normal and the other half
3354a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // into an integer register.  This second half can then be moved into
3355a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // place later with an INS instruction - which will dual issue with a
3356a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // later FP load.
3357a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
3358a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // For this kernel there are approximately 3 times as many multiplies
3359a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // as loads, so it makes sense to structure the loop into blocks of 4
3360a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // cycles, with 1 dedicated "load cycle" and 3 "multiply cycles" per
3361a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // block.  Strictly preserving this structure with NOPs where no load
3362a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // is needed seems to result in higher performance.
3363a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
3364a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Choice of x18 to store the upper halves on their way into the
3365a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // vector registers is arbitrary.  Added to the clobber list so that
3366a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // the compiler will make it available.
3367a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
        // x18 is reused for every cell in turn: within one iteration it
        // carries the upper half of v0, then v1, v3, v4, v2 and finally the
        // next iteration's v0.  Each "ldr x18" therefore has to come after the
        // "ins" that consumes the previous value.
3368a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        //
3369a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // At the start of the loop, it is assumed that v0 is "half loaded" -
3370a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // bottom half in place in d0 and the upper half in x18 ready to
3371a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // insert.  So set that up here for the first iteration:
3372a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr d0, [%[rhs_ptr]]\n"             // Bottom half of first Rhs cell
3373a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr x18, [%[rhs_ptr], #8]\n"        // Upper half
3374a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "add %[rhs_ptr], %[rhs_ptr], #16\n"  // Separate increment (needed as
3375a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // there is no operation to load at
3376a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // reg + 8 but then increment reg
3377a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // by 16).
3378a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3379a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // v2 should be fully loaded - as it's outside the loop proper it's fine
3380a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // to use a 128-bit load here.
3381a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ld1 {v2.4s}, [%[lhs_ptr]], #16\n"  // first Lhs cell
3382a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3383a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        GEMMLOWP_LABEL_LOOP
3384a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        ":\n"
3385a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3386a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // First block of four cycles.  Multiplies all require v2 and v0; v2 is
3387a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // loaded earlier and v0 is half loaded and completed in the load
3388a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // cycle at the start.
3389a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr d1, [%[rhs_ptr]]\n"  // "load" cycle - loading bottom half of v1
3390a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // (second Rhs cell).
3391a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ins v0.d[1], x18\n"  // "load" cycle - moving the upper half of v0 into
3392a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // place.
3393a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v8.4s, v2.4s, v0.s[0]\n"  // "fmla" cycle 1 - first multiply.
3394a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr x18, [%[rhs_ptr], #8]\n"  // "fmla" cycle 1 - load upper half of v1
3395a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // into x18.
3396a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v9.4s, v2.4s, v0.s[1]\n"       // "fmla" cycle 2 - second multiply
3397a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "add %[rhs_ptr], %[rhs_ptr], #16\n"  // "fmla" cycle 2 - increment Rhs
3398a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // pointer (if needed)
3399a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v10.4s, v2.4s, v0.s[2]\n"  // "fmla" cycle 3 - third multiply.  No
3400a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // more work to dual issue.
3401a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3402a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Second block.  Start loading v3 (second Lhs cell), finish loading v1.
3403a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr d3, [%[lhs_ptr]]\n"
3404a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ins v1.d[1], x18\n"  // v1 ready here.
3405a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v11.4s, v2.4s, v0.s[3]\n"
3406a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr x18, [%[lhs_ptr], #8]\n"
3407a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v12.4s, v2.4s, v1.s[0]\n"  // First use of v1.
3408a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "add %[lhs_ptr], %[lhs_ptr], #16\n"
3409a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v13.4s, v2.4s, v1.s[1]\n"
3410a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3411a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Third block.  Start loading v4 (third Lhs cell), finish loading v3.
3412a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr d4, [%[lhs_ptr]]\n"
3413a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ins v3.d[1], x18\n"  // v3 ready here.
3414a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v14.4s, v2.4s, v1.s[2]\n"
3415a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr x18, [%[lhs_ptr], #8]\n"
3416a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v15.4s, v2.4s, v1.s[3]\n"
3417a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "add %[lhs_ptr], %[lhs_ptr], #16\n"
3418a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v16.4s, v3.4s, v0.s[0]\n"  // First use of v3.
3419a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3420a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Fourth block.  v2 (first Lhs cell) is now finished with, so start
3421a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // loading value for next iteration.  Finish loading v4.
3422a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr d2, [%[lhs_ptr]]\n"
3423a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ins v4.d[1], x18\n"  // v4 ready here.
3424a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v17.4s, v3.4s, v0.s[1]\n"
3425a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr x18, [%[lhs_ptr], #8]\n"
3426a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v18.4s, v3.4s, v0.s[2]\n"
3427a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "add %[lhs_ptr], %[lhs_ptr], #16\n"
3428a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v19.4s, v3.4s, v0.s[3]\n"
3429a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3430a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Fifth block, finish loading v2.  No new load to start as the other
3431a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // registers are all still live.
3432a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ins v2.d[1], x18\n"
3433a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v20.4s, v3.4s, v1.s[0]\n"
3434a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v21.4s, v3.4s, v1.s[1]\n"
3435a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v22.4s, v3.4s, v1.s[2]\n"
3436a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3437a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Sixth block, nothing to load.  2 nops needed as a single nop would
3438a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // dual issue with the FMLA and break the timing.
3439a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "nop\n"
3440a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "nop\n"
3441a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v23.4s, v3.4s, v1.s[3]\n"
3442a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v24.4s, v4.4s, v0.s[0]\n"  // First use of v4.
3443a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v25.4s, v4.4s, v0.s[1]\n"
3444a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3445a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Seventh block, nothing to load.  Decrement the loop counter in this
3446a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // block as the last block is very full.
        // The flags set by the subs are consumed by the bne at the end of the
        // loop; none of the instructions in between are flag-setting forms.
3447a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "nop\n"
3448a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "nop\n"
3449a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v26.4s, v4.4s, v0.s[2]\n"
3450a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "subs %w[depth], %w[depth], #1\n"
3451a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v27.4s, v4.4s, v0.s[3]\n"
3452a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v28.4s, v4.4s, v1.s[0]\n"
3453a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3454a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Eighth block - start loading v0 for next iteration.
        // v0 is not read again in this iteration (its last use is the fmla
        // into v27), so d0 can be overwritten here.  This load is issued on
        // the final iteration as well; the value is then simply never used.
3455a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr d0, [%[rhs_ptr]]\n"
3456a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v29.4s, v4.4s, v1.s[1]\n"
3457a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "ldr x18, [%[rhs_ptr], #8]\n"
3458a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v30.4s, v4.4s, v1.s[2]\n"
3459a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "add %[rhs_ptr], %[rhs_ptr], #16\n"
3460a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "fmla v31.4s, v4.4s, v1.s[3]\n"
3461a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3462a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Loop branch.  This will dual issue in fmla cycle 3 of the 8th block.
        // The adjacent string literals concatenate into one branch
        // instruction, "bne <label>b".  GEMMLOWP_LABEL_LOOP is defined outside
        // this block - presumably a numeric local label, in which case the
        // "b" suffix selects its nearest definition backwards.
3463a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "bne " GEMMLOWP_LABEL_LOOP
3464a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "b\n"
3465a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3466a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        // Store accumulators
        // Same register order as the load sequence at the top, so the block
        // at accum_ptr keeps its layout.
3467a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "mov x0, %[accum_ptr]\n"
3468a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v8.16b}, [x0], #16\n"
3469a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v16.16b}, [x0], #16\n"
3470a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v24.16b}, [x0], #16\n"
3471a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v9.16b}, [x0], #16\n"
3472a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v17.16b}, [x0], #16\n"
3473a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v25.16b}, [x0], #16\n"
3474a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v10.16b}, [x0], #16\n"
3475a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v18.16b}, [x0], #16\n"
3476a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v26.16b}, [x0], #16\n"
3477a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v11.16b}, [x0], #16\n"
3478a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v19.16b}, [x0], #16\n"
3479a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v27.16b}, [x0], #16\n"
3480a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v12.16b}, [x0], #16\n"
3481a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v20.16b}, [x0], #16\n"
3482a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v28.16b}, [x0], #16\n"
3483a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v13.16b}, [x0], #16\n"
3484a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v21.16b}, [x0], #16\n"
3485a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v29.16b}, [x0], #16\n"
3486a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v14.16b}, [x0], #16\n"
3487a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v22.16b}, [x0], #16\n"
3488a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v30.16b}, [x0], #16\n"
3489a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v15.16b}, [x0], #16\n"
3490a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v23.16b}, [x0], #16\n"
3491a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "st1 {v31.16b}, [x0], #16\n"
3492a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // outputs
        // Read-write operands: the assembly advances both pointers and counts
        // depth down.
3493a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
3494a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [depth] "+r"(depth)
3495a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // inputs
3496a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [accum_ptr] "r"(accum_ptr)
3497a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        :  // clobbers
        // x18 is listed because the loop uses it as scratch.  v5-v7 are
        // declared clobbered although the assembly above does not touch them.
3498a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "cc", "memory", "x0", "x18", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
3499a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16",
3500a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
3501a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "v27", "v28", "v29", "v30", "v31");
3502a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
3503a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang};
3504a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
3505a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
35067d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// Faster kernel contributed by ARM. Tuned for A55r1.
35077d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wangstruct NEON_64bit_GEMM_Float32_WithScalar_A55r1 {
35087d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  typedef float OperandType;
35097d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  typedef float AccumulatorType;
35107d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  typedef KernelFormat<
35117d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
35127d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 2> >
35137d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      Format;
35147d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
35157d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang                  AccumulatorType* accum_ptr, int depth) {
35167d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang    asm volatile(
35177d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Load accumulators
35187d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "mov x0, %[accum_ptr]\n"
35197d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v8.4s}, [x0], #16\n"
35207d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v16.4s}, [x0], #16\n"
35217d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v24.4s}, [x0], #16\n"
35227d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v9.4s}, [x0], #16\n"
35237d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v17.4s}, [x0], #16\n"
35247d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v25.4s}, [x0], #16\n"
35257d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v10.4s}, [x0], #16\n"
35267d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v18.4s}, [x0], #16\n"
35277d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v26.4s}, [x0], #16\n"
35287d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v11.4s}, [x0], #16\n"
35297d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v19.4s}, [x0], #16\n"
35307d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v27.4s}, [x0], #16\n"
35317d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v12.4s}, [x0], #16\n"
35327d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v20.4s}, [x0], #16\n"
35337d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v28.4s}, [x0], #16\n"
35347d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v13.4s}, [x0], #16\n"
35357d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v21.4s}, [x0], #16\n"
35367d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v29.4s}, [x0], #16\n"
35377d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v14.4s}, [x0], #16\n"
35387d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v22.4s}, [x0], #16\n"
35397d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v30.4s}, [x0], #16\n"
35407d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v15.4s}, [x0], #16\n"
35417d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v23.4s}, [x0], #16\n"
35427d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld1 {v31.4s}, [x0], #16\n"
35437d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
35447d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // A55r1 requires a hybrid of the A53 and standard approaches.
35457d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //
35467d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Like A53, this processor prefers 64-bit loads.
35477d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //
35487d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Unlike A53, it is capable of dual-issuing a 64-bit vector load
35497d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // (or INS) with a FMLA instruction.
35507d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //
35517d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Therefore we aim to issue an FMLA instruction every cycle.
35527d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Alongside three FMLAs we can dual issue a (vector) 64-bit load, a
35537d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // scalar 64-bit load and finally an INS to replicate the effect of
35547d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // a single 128-bit load.
35557d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //
35567d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // The loop contains 24 FMLA instructions, and 5 vector registers
35577d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // need to be loaded, consuming 15 dual issue slots.  This leaves 9
35587d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // dual issue slots.  Four of these are used for loop housekeeping
35597d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // (2 pointer adds, 1 counter update and 1 branch), leaving 5 left
35607d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // over (marked by blank lines).
35617d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //
35627d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Choice of x18 to store the upper halves on their way into the
35637d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // vector registers is arbitrary.  Added to the clobber list so that
35647d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // the compiler will make it available.
35657d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
35667d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
35677d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // At the start of the loop, it is assumed that v0 is "half loaded" -
35687d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // bottom half in place in d0 and the upper half in x18 ready to
35697d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // insert.  So set that up here for the first iteration:
35707d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldr d0, [%[rhs_ptr]]\n"             // Bottom half of first Rhs cell
35717d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldr x18, [%[rhs_ptr], #8]\n"        // Upper half
35727d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
35737d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // v2-v3 should be fully loaded - as it's outside the loop proper it's fine
35747d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // to use a 128-bit load here.
35757d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldr q2, [%[lhs_ptr]]\n"      // first Lhs cell
35767d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldr q3, [%[lhs_ptr], #16]\n" // second Lhs cell
35777d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
35787d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        GEMMLOWP_LABEL_LOOP
35797d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        ":\n"
35807d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
35817d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fmla v8.4s, v2.4s, v0.s[0]\n"
35827d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldr d1, [%[rhs_ptr], #16]\n"         // Bottom half of v1
35837d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fmla v9.4s, v2.4s, v0.s[1]\n"
35847d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ins v0.d[1], x18\n"                  // Finish loading v0
35857d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fmla v16.4s, v3.4s, v0.s[0]\n"       // out of sequence - used to reduce load/use pressure.
35867d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldr x18, [%[rhs_ptr], #24]\n"        // Top half of v1 to X register
35877d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fmla v17.4s, v3.4s, v0.s[1]\n"       // out of sequence - used to reduce load/use pressure.
35887d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "add %[rhs_ptr], %[rhs_ptr], #32\n"   // RHS loads complete - increment pointer.
35897d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fmla v10.4s, v2.4s, v0.s[2]\n"
35907d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldr d4, [%[lhs_ptr], #32]\n"         // Bottom half of v4
35917d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fmla v11.4s, v2.4s, v0.s[3]\n"
35927d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ins v1.d[1], x18\n"                  // Finish loading v1
35937d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fmla v12.4s, v2.4s, v1.s[0]\n"
35947d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldr x18, [%[lhs_ptr], #40]\n"        // Top half of v4 to X register
35957d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fmla v13.4s, v2.4s, v1.s[1]\n"
35967d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "add %[lhs_ptr], %[lhs_ptr], #48\n"   // LHS loads complete - increment pointer.
35977d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fmla v14.4s, v2.4s, v1.s[2]\n"
35987d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
35997d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fmla v15.4s, v2.4s, v1.s[3]\n"
36007d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldr d2, [%[lhs_ptr]]\n"              // Bottom half of v2 (for next time)
36017d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fmla v18.4s, v3.4s, v0.s[2]\n"
36027d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ins v4.d[1], x18\n"                  // Finish loading v4
36037d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fmla v19.4s, v3.4s, v0.s[3]\n"
36047d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldr x18, [%[lhs_ptr], #8]\n"         // Top half of next v2 to X register
36057d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fmla v20.4s, v3.4s, v1.s[0]\n"
36067d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "subs %w[depth], %w[depth], #1\n"
36077d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fmla v21.4s, v3.4s, v1.s[1]\n"
36087d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
36097d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fmla v22.4s, v3.4s, v1.s[2]\n"
36107d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
36117d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fmla v23.4s, v3.4s, v1.s[3]\n"
36127d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldr d3, [%[lhs_ptr], #16]\n"         // Bottom half of v3 (for next time)
36137d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fmla v24.4s, v4.4s, v0.s[0]\n"
36147d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ins v2.d[1], x18\n"                  // Finish loading next v2
36157d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fmla v25.4s, v4.4s, v0.s[1]\n"
36167d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldr x18, [%[lhs_ptr], #24]\n"        // Top half of next v3 to X register
36177d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fmla v26.4s, v4.4s, v0.s[2]\n"
36187d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
36197d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fmla v27.4s, v4.4s, v0.s[3]\n"
36207d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldr d0, [%[rhs_ptr]]\n"              // Bottom half of v0 (for next time)
36217d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fmla v28.4s, v4.4s, v1.s[0]\n"
36227d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ins v3.d[1], x18\n"                  // Finish loading next v3
36237d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fmla v29.4s, v4.4s, v1.s[1]\n"
36247d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldr x18, [%[rhs_ptr], #8]\n"         // Top half of next v0 to X register
36257d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fmla v30.4s, v4.4s, v1.s[2]\n"
36267d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
36277d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fmla v31.4s, v4.4s, v1.s[3]\n"
36287d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "bne " GEMMLOWP_LABEL_LOOP "b\n"
36297d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
36307d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Store accumulators
36317d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "mov x0, %[accum_ptr]\n"
36327d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v8.4s}, [x0], #16\n"
36337d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v16.4s}, [x0], #16\n"
36347d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v24.4s}, [x0], #16\n"
36357d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v9.4s}, [x0], #16\n"
36367d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v17.4s}, [x0], #16\n"
36377d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v25.4s}, [x0], #16\n"
36387d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v10.4s}, [x0], #16\n"
36397d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v18.4s}, [x0], #16\n"
36407d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v26.4s}, [x0], #16\n"
36417d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v11.4s}, [x0], #16\n"
36427d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v19.4s}, [x0], #16\n"
36437d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v27.4s}, [x0], #16\n"
36447d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v12.4s}, [x0], #16\n"
36457d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v20.4s}, [x0], #16\n"
36467d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v28.4s}, [x0], #16\n"
36477d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v13.4s}, [x0], #16\n"
36487d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v21.4s}, [x0], #16\n"
36497d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v29.4s}, [x0], #16\n"
36507d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v14.4s}, [x0], #16\n"
36517d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v22.4s}, [x0], #16\n"
36527d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v30.4s}, [x0], #16\n"
36537d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v15.4s}, [x0], #16\n"
36547d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v23.4s}, [x0], #16\n"
36557d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st1 {v31.4s}, [x0], #16\n"
36567d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        :  // outputs
36577d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
36587d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        [depth] "+r"(depth)
36597d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        :  // inputs
36607d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        [accum_ptr] "r"(accum_ptr)
36617d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        :  // clobbers
36627d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "cc", "memory", "x0", "x18", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
36637d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16",
36647d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
36657d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "v27", "v28", "v29", "v30", "v31");
36667d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  }
36677d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang};
36687d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
3669a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif  // __aarch64__
3670a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
36717d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#if defined(__arm__) || defined(__aarch64__)
3672a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifndef __aarch64__
3673a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
3674a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  const int32x2_t c = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
3675a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  const int32x2_t d = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
3676a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  return vcombine_s32(c, d);
3677a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
3678a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
3679a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3680a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// C++ intrinsics-based variant of the deep, int8, fast kernel
3681a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <int Cols>
3682a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_GEMM_Int8Operands_AccumTwoWithin16Bits_intrinsics {
3683a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef std::int8_t OperandType;
3684a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef std::int32_t AccumulatorType;
3685a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef KernelFormat<
3686a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 16, CellOrder::WidthMajor>, 1>,
3687a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<Cols, 16, CellOrder::WidthMajor>, 1> >
3688a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      Format;
3689a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
3690a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                  AccumulatorType* accum_ptr, int depth) {
3691a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    int32x4_t acc[4][Cols];
3692a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    for (int i = 0; i < 4; i++) {
3693a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      for (int j = 0; j < Cols; j++) {
3694a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        acc[i][j] = vdupq_n_s32(0);
3695a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      }
3696a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    }
3697a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    for (int d = 0; d < depth; d += 16) {
3698a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      int8x16_t lhs[4];
3699a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      for (int i = 0; i < 4; i++) {
3700a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        lhs[i] = vld1q_s8(lhs_ptr + 16 * i);
3701a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      }
3702a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      int8x16_t rhs[Cols];
3703a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      for (int i = 0; i < Cols; i++) {
3704a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        rhs[i] = vld1q_s8(rhs_ptr + 16 * i);
3705a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      }
3706a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      for (int i = 0; i < 4; i++) {
3707a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        for (int j = 0; j < Cols; j++) {
3708a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          int16x8_t local_acc =
3709a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang              vmull_s8(vget_low_s8(lhs[i]), vget_low_s8(rhs[j]));
3710a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          local_acc =
3711a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang              vmlal_s8(local_acc, vget_high_s8(lhs[i]), vget_high_s8(rhs[j]));
3712a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          acc[i][j] = vpadalq_s16(acc[i][j], local_acc);
3713a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        }
3714a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      }
3715a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      lhs_ptr += 64;
3716a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      rhs_ptr += 16 * Cols;
3717a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    }
3718a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    for (int i = 0; i < Cols; i++) {
3719a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      int32x4_t acc_2x_0 = vpaddq_s32(acc[0][i], acc[1][i]);
3720a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      int32x4_t acc_2x_1 = vpaddq_s32(acc[2][i], acc[3][i]);
3721a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      int32x4_t acc_4x = vpaddq_s32(acc_2x_0, acc_2x_1);
3722a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      int32x4_t dst_val = vld1q_s32(accum_ptr + 4 * i);
3723a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      dst_val = vaddq_s32(dst_val, acc_4x);
3724a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      vst1q_s32(accum_ptr + 4 * i, dst_val);
3725a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    }
3726a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
3727a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang};
3728a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3729a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangusing NEON_64bit_GEMM_Int8Operands_AccumTwoWithin16Bits_intrinsics =
3730a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    NEON_GEMM_Int8Operands_AccumTwoWithin16Bits_intrinsics<4>;
3731a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3732a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangusing NEON_32bit_GEMM_Int8Operands_AccumTwoWithin16Bits_intrinsics =
3733a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    NEON_GEMM_Int8Operands_AccumTwoWithin16Bits_intrinsics<2>;
3734a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3735a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// C++ intrinsics-based variant of the wide, uint8, general kernel
3736a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <int RhsCells>
3737a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_GEMM_Uint8Operands_Uint32Accumulators_intrinsics {
3738a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef std::uint8_t OperandType;
3739a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef std::int32_t AccumulatorType;
3740a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef KernelFormat<
3741a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 3>,
3742a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, RhsCells> >
3743a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      Format;
3744a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
3745a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                  AccumulatorType* accum_ptr, int depth) {
3746a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    int32x4_t acc[3][4 * RhsCells];
3747a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    for (int i = 0; i < 3; i++) {
3748a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      for (int j = 0; j < 4 * RhsCells; j++) {
3749a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        acc[i][j] = vld1q_s32(accum_ptr + 4 * (i + 3 * j));
3750a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      }
3751a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    }
3752a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    for (int d = 0; d < depth; d += 2) {
3753a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      int16x8_t lhs[3];
3754a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      for (int i = 0; i < 3; i++) {
3755a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        lhs[i] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(lhs_ptr + 8 * i)));
3756a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      }
3757a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      int16x8_t rhs[RhsCells];
3758a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      for (int i = 0; i < RhsCells; i++) {
3759a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        rhs[i] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(rhs_ptr + 8 * i)));
3760a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      }
3761a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      for (int i = 0; i < 3; i++) {
3762a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        for (int j = 0; j < RhsCells; j++) {
3763a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          acc[i][4 * j + 0] = vmlal_lane_s16(
3764a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang              acc[i][4 * j + 0], vget_low_s16(lhs[i]), vget_low_s16(rhs[j]), 0);
3765a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          acc[i][4 * j + 1] = vmlal_lane_s16(
3766a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang              acc[i][4 * j + 1], vget_low_s16(lhs[i]), vget_low_s16(rhs[j]), 1);
3767a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          acc[i][4 * j + 2] = vmlal_lane_s16(
3768a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang              acc[i][4 * j + 2], vget_low_s16(lhs[i]), vget_low_s16(rhs[j]), 2);
3769a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          acc[i][4 * j + 3] = vmlal_lane_s16(
3770a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang              acc[i][4 * j + 3], vget_low_s16(lhs[i]), vget_low_s16(rhs[j]), 3);
3771a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          acc[i][4 * j + 0] =
3772a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang              vmlal_lane_s16(acc[i][4 * j + 0], vget_high_s16(lhs[i]),
3773a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                             vget_high_s16(rhs[j]), 0);
3774a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          acc[i][4 * j + 1] =
3775a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang              vmlal_lane_s16(acc[i][4 * j + 1], vget_high_s16(lhs[i]),
3776a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                             vget_high_s16(rhs[j]), 1);
3777a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          acc[i][4 * j + 2] =
3778a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang              vmlal_lane_s16(acc[i][4 * j + 2], vget_high_s16(lhs[i]),
3779a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                             vget_high_s16(rhs[j]), 2);
3780a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          acc[i][4 * j + 3] =
3781a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang              vmlal_lane_s16(acc[i][4 * j + 3], vget_high_s16(lhs[i]),
3782a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                             vget_high_s16(rhs[j]), 3);
3783a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        }
3784a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      }
3785a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      lhs_ptr += 24;
3786a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      rhs_ptr += 8 * RhsCells;
3787a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    }
3788a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    for (int i = 0; i < 3; i++) {
3789a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      for (int j = 0; j < 4 * RhsCells; j++) {
3790a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        vst1q_s32(accum_ptr + 4 * (i + 3 * j), acc[i][j]);
3791a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      }
3792a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    }
3793a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
3794a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang};
3795a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3796a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangusing NEON_32bit_GEMM_Uint8Operands_Uint32Accumulators_intrinsics =
3797a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    NEON_GEMM_Uint8Operands_Uint32Accumulators_intrinsics<1>;
3798a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3799a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangusing NEON_64bit_GEMM_Uint8Operands_Uint32Accumulators_intrinsics =
3800a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    NEON_GEMM_Uint8Operands_Uint32Accumulators_intrinsics<2>;
3801a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3802a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <int RhsCells>
3803a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct NEON_GEMM_Float32_WithScalar_intrinsics {
3804a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef float OperandType;
3805a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef float AccumulatorType;
3806a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef KernelFormat<
3807a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
3808a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, RhsCells> >
3809a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      Format;
3810a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
3811a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                  AccumulatorType* accum_ptr, int depth) {
3812a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    float32x4_t acc[3][4 * RhsCells];
3813a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    for (int i = 0; i < 3; i++) {
3814a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      for (int j = 0; j < 4 * RhsCells; j++) {
3815a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        acc[i][j] = vld1q_f32(accum_ptr + 4 * (i + 3 * j));
3816a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      }
3817a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    }
3818a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    for (int d = 0; d < depth; d++) {
3819a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      float32x4_t lhs[3];
3820a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      for (int i = 0; i < 3; i++) {
3821a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        lhs[i] = vld1q_f32(lhs_ptr + 4 * i);
3822a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      }
3823a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      float32x4_t rhs[RhsCells];
3824a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      for (int i = 0; i < RhsCells; i++) {
3825a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        rhs[i] = vld1q_f32(rhs_ptr + 4 * i);
3826a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      }
3827a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      for (int i = 0; i < 3; i++) {
3828a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        for (int j = 0; j < RhsCells; j++) {
3829a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          acc[i][4 * j + 0] = vmlaq_lane_f32(acc[i][4 * j + 0], lhs[i],
3830a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                             vget_low_f32(rhs[j]), 0);
3831a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          acc[i][4 * j + 1] = vmlaq_lane_f32(acc[i][4 * j + 1], lhs[i],
3832a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                             vget_low_f32(rhs[j]), 1);
3833a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          acc[i][4 * j + 2] = vmlaq_lane_f32(acc[i][4 * j + 2], lhs[i],
3834a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                             vget_high_f32(rhs[j]), 0);
3835a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          acc[i][4 * j + 3] = vmlaq_lane_f32(acc[i][4 * j + 3], lhs[i],
3836a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                             vget_high_f32(rhs[j]), 1);
3837a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        }
3838a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      }
3839a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      lhs_ptr += 12;
3840a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      rhs_ptr += 4 * RhsCells;
3841a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    }
3842a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    for (int i = 0; i < 3; i++) {
3843a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      for (int j = 0; j < 4 * RhsCells; j++) {
3844a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        vst1q_f32(accum_ptr + 4 * (i + 3 * j), acc[i][j]);
3845a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      }
3846a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    }
3847a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
3848a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang};
3849a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
// Concrete instantiations of the NEON_GEMM_Float32_WithScalar_intrinsics
// template: the 32-bit NEON alias passes 1 as the template argument and the
// 64-bit NEON alias passes 2.
// NOTE(review): the argument appears to be RhsCells (the number of 4-wide RHS
// cells), which would make the accumulator block 12x4 and 12x8 respectively;
// confirm against the template's parameter list.
3850a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangusing NEON_32bit_GEMM_Float32_WithScalar_intrinsics =
3851a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    NEON_GEMM_Float32_WithScalar_intrinsics<1>;
3852a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3853a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangusing NEON_64bit_GEMM_Float32_WithScalar_intrinsics =
3854a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    NEON_GEMM_Float32_WithScalar_intrinsics<2>;
38557d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#endif  // __arm__ || __aarch64__
38567d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
38577d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#ifdef __mips
// Multiply-accumulates b * c into a on vectors of four 32-bit integers and
// returns the updated a.
//
// The compiler builtin is kept under `#if 0` for reference only. The live
// path emits maddv.w through inline assembly so that the operand order is
// spelled out explicitly (see the workaround note inside the function).
// In the asm statement, `a` is a read-write ("+f") operand; `b` and `c` are
// read-only inputs.
38587d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wangstatic inline v4i32 workaround_msa_maddv_w(v4i32 a, v4i32 b, v4i32 c) {
38597d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  // Workaround for incorrect encoding of maddv.df in gcc (a exchanged with c).
38607d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#if 0
38617d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  return __builtin_msa_maddv_w(a, b, c);
38627d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#else
38637d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  asm volatile("maddv.w %w[a], %w[b], %w[c]\n"
38647d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang               // Outputs
38657d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang               : [a] "+f"(a)
38667d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang               // Inputs
38677d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang               : [b] "f"(b), [c] "f"(c));
38687d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  return a;
38697d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#endif
38707d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang}
38717d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
38727d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// Using 32x32=32 multiplications.
38737d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// 20 MSA regs used:
38747d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 12 accumulators
38757d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 6 lhs
38767d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 1 rhs
38777d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 1 temps/zeroes
38787d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// ~55 instructions in the loop.
//
// Kernel written with MSA compiler builtins. Each call to Run() reads a 12x4
// block of 32-bit accumulators, adds the products of 8-bit LHS and RHS
// operands (zero-extended to 32 bits) for `depth` depth levels, and writes the
// block back. The LHS side is 3 cells and the RHS side is 1 cell; every cell
// is 4 wide by 2 deep, so one loop iteration handles 2 depth levels.
38797d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wangstruct MSA_GEMM_12x4_Uint8Operands_Uint32Accumulators_intrinsics {
38807d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  typedef std::uint8_t OperandType;
38817d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  typedef std::int32_t AccumulatorType;
38827d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  typedef KernelFormat<
38837d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 3>,
38847d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 1> >
38857d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      Format;
  // lhs_ptr:   packed LHS operands; 24 bytes are consumed per loop iteration
  //            (3 cells of 8 bytes).
  // rhs_ptr:   packed RHS operands; 8 bytes are consumed per loop iteration
  //            (bytes 0..3 are used for depth 0, bytes 4..7 for depth 1).
  // accum_ptr: 48 32-bit accumulators, loaded on entry and stored on exit.
  //            The 4-lane vector acc[i][j] lives at element offset
  //            4 * (i + 3 * j).
  // depth:     number of depth levels to process; decremented by 2 per
  //            iteration.
  //            NOTE(review): presumably callers pass a positive multiple of 2
  //            (an odd value would still run one full 2-level step); confirm
  //            against the caller.
38867d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
38877d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang                  AccumulatorType* accum_ptr, int depth) {
38887d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang    const v16i8 zeroes = __builtin_msa_ldi_b(0);
38897d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang    v4i32 acc[3][4];
38907d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang    // Load accumulators.
38917d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang    for (int i = 0; i < 3; i++) {
38927d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      for (int j = 0; j < 4; j++) {
38937d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        acc[i][j] = __builtin_msa_ld_w(accum_ptr + 4 * (i + 3 * j), 0);
38947d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      }
38957d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang    }
38967d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
38977d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang    while (depth > 0) {
38987d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      // Load 3 x 8 bytes of lhs[] with 2 16-byte overlapped loads.
      // The second load starts at byte 8, so it spans bytes 8..23 (cells 1
      // and 2); only the first 8 bytes of the first load are used below.
      // The const_cast only adapts the pointer type for the load builtin
      // (presumably its pointer parameter is not const-qualified).
38997d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      v8i16 lhs[6];
39007d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      lhs[0] = reinterpret_cast<v8i16>(__builtin_msa_ld_b(const_cast<OperandType*>(lhs_ptr), 0));
39017d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      lhs[1] =
39027d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang          reinterpret_cast<v8i16>(__builtin_msa_ld_b(const_cast<OperandType*>(lhs_ptr + 8), 0));
39037d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
39047d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      // Zero-extend 8-bit elements of lhs[] to 16 bits.
      // Interleaving with the all-zero vector supplies the zero upper bytes.
      // lhs[2] must be derived from lhs[1] before lhs[1] is overwritten.
39057d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      lhs[0] = reinterpret_cast<v8i16>(__builtin_msa_ilvr_b(zeroes,
39067d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang                                                            reinterpret_cast<v16i8>(lhs[0])));
39077d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      lhs[2] = reinterpret_cast<v8i16>(__builtin_msa_ilvl_b(zeroes,
39087d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang                                                            reinterpret_cast<v16i8>(lhs[1])));
39097d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      lhs[1] = reinterpret_cast<v8i16>(__builtin_msa_ilvr_b(zeroes,
39107d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang                                                            reinterpret_cast<v16i8>(lhs[1])));
39117d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
39127d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      // Zero-extend 16-bit elements of lhs[] to 32 bits.
      // lhs[3..5] are written first because they read lhs[0..2], which are
      // then overwritten in place. Afterwards lhs[0..2] feed the depth-0
      // products and lhs[3..5] feed the depth-1 products for the three cells.
39137d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      lhs[3] = __builtin_msa_ilvl_h(reinterpret_cast<v8i16>(zeroes), lhs[0]);
39147d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      lhs[4] = __builtin_msa_ilvl_h(reinterpret_cast<v8i16>(zeroes), lhs[1]);
39157d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      lhs[5] = __builtin_msa_ilvl_h(reinterpret_cast<v8i16>(zeroes), lhs[2]);
39167d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      lhs[0] = __builtin_msa_ilvr_h(reinterpret_cast<v8i16>(zeroes), lhs[0]);
39177d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      lhs[1] = __builtin_msa_ilvr_h(reinterpret_cast<v8i16>(zeroes), lhs[1]);
39187d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      lhs[2] = __builtin_msa_ilvr_h(reinterpret_cast<v8i16>(zeroes), lhs[2]);
39197d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
39207d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      // Depth 0.
39217d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      for (int j = 0; j < 4; j++) {
39227d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Load 1 byte of rhs, making 4 32-bit replicas of it.
39237d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        v4i32 rhs = reinterpret_cast<v4i32>(__builtin_msa_fill_w(rhs_ptr[j]));
39247d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Multiply-add into accumulators.
39257d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        for (int i = 0; i < 3; i++) {
39267d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang          acc[i][j] = workaround_msa_maddv_w(acc[i][j], reinterpret_cast<v4i32>(lhs[i]), rhs);
39277d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        }
39287d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      }
39297d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
39307d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      // Depth 1.
39317d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      for (int j = 0; j < 4; j++) {
39327d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Load 1 byte of rhs, making 4 32-bit replicas of it.
39337d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        v4i32 rhs = reinterpret_cast<v4i32>(__builtin_msa_fill_w(rhs_ptr[j + 4]));
39347d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Multiply-add into accumulators.
39357d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        for (int i = 0; i < 3; i++) {
39367d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang          acc[i][j] = workaround_msa_maddv_w(acc[i][j], reinterpret_cast<v4i32>(lhs[i + 3]), rhs);
39377d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        }
39387d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      }
39397d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
      // Advance past the 3 LHS cells and the 1 RHS cell just consumed.
39407d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      lhs_ptr += 24;
39417d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      rhs_ptr += 8;
39427d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      depth -= 2;
39437d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang    }
39447d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
39457d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang    // Store accumulators.
39467d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang    for (int i = 0; i < 3; i++) {
39477d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      for (int j = 0; j < 4; j++) {
39487d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        __builtin_msa_st_w(acc[i][j], accum_ptr + 4 * (i + 3 * j), 0);
39497d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      }
39507d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang    }
39517d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  }
39527d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang};
39537d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
39547d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// Assembly implementation of the above
39557d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// MSA_GEMM_12x4_Uint8Operands_Uint32Accumulators_intrinsics.
39567d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// Using 32x32=32 multiplications.
39577d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// 20 MSA regs used:
39587d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 12 accumulators
39597d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 6 lhs
39607d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 1 rhs
39617d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 1 temps/zeroes
39627d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// ~55 instructions in the loop.
//
// Register assignment used by the asm below:
// - w0-w11:  accumulators; the vector stored at byte offset 16 * (i + 3 * j)
//            from accum_ptr is held in register w(4 * i + j).
// - w12-w14: LHS values for depth 0, w15-w17: LHS values for depth 1.
// - w18:     the current RHS byte replicated into four 32-bit lanes.
// - w19:     all zeroes, used to zero-extend the LHS by interleaving.
// - a0-a3:   scalar RHS bytes waiting to be replicated into w18.
39637d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wangstruct MSA_GEMM_12x4_Uint8Operands_Uint32Accumulators_assembly {
39647d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  typedef std::uint8_t OperandType;
39657d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  typedef std::int32_t AccumulatorType;
39667d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  typedef KernelFormat<
39677d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 3>,
39687d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 1> >
39697d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      Format;
  // lhs_ptr:   packed LHS operands; advanced by 24 bytes per loop iteration.
  // rhs_ptr:   packed RHS operands; advanced by 8 bytes per loop iteration.
  //            Both pointers are non-const here (unlike the intrinsics
  //            version) and are read-write ("+r") asm operands; only the
  //            local copies are modified.
  // accum_ptr: 48 32-bit accumulators, loaded before the loop and stored
  //            after it.
  // depth:     number of depth levels; decremented by 2 per iteration.
  //            NOTE(review): the loop only exits when depth reaches exactly 0
  //            (bnez), whereas the intrinsics version tests `depth > 0`. A
  //            zero or odd depth on entry would therefore not stop at zero;
  //            presumably callers guarantee a positive multiple of 2. Confirm
  //            against the caller.
39707d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  static void Run(OperandType* lhs_ptr, OperandType* rhs_ptr,
39717d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang                  AccumulatorType* accum_ptr, int depth) {
39727d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang    asm volatile(
39737d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Load accumulators
39747d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w0,   (0*16)(%[accum_ptr])\n"
39757d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w4,   (1*16)(%[accum_ptr])\n"
39767d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w8,   (2*16)(%[accum_ptr])\n"
39777d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w1,   (3*16)(%[accum_ptr])\n"
39787d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w5,   (4*16)(%[accum_ptr])\n"
39797d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w9,   (5*16)(%[accum_ptr])\n"
39807d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w2,   (6*16)(%[accum_ptr])\n"
39817d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w6,   (7*16)(%[accum_ptr])\n"
39827d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w10,  (8*16)(%[accum_ptr])\n"
39837d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w3,   (9*16)(%[accum_ptr])\n"
39847d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w7,  (10*16)(%[accum_ptr])\n"
39857d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w11, (11*16)(%[accum_ptr])\n"
39867d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Set a temp to all zeroes.
39877d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldi.b  $w19, 0\n"
39887d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
        // GEMMLOWP_LABEL_LOOP is a macro defined elsewhere in this file; it
        // supplies the label that the bnez at the end of the loop jumps to.
39897d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        GEMMLOWP_LABEL_LOOP ":\n"
39907d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Overview of register layout:
39917d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //
39927d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // A half of the 2x4 cell of Rhs is stored in 32bit in w18.
39937d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // A 12x2 block of 3 4x2 cells Lhs is stored in 32bit in w12-w17.
39947d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // A 12x4 block of accumulators is stored in 32bit in w0-w11.
39957d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //
39967d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //                    +------+------+------+------+
39977d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //               Rhs  |w18[0]|w18[1]|w18[2]|w18[3]|
39987d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //                    +------+------+------+------+
39997d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //
40007d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //                    |      |      |      |      |
40017d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //
40027d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //    Lhs             |      |      |      |      |
40037d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //
40047d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  +---+---+ - - - - +------+------+------+------+
40057d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  |w12|w15|         | w0   | w1   | w2   | w3   |
40067d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  |w12|w15|         | w0   | w1   | w2   | w3   |
40077d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  |w12|w15|         | w0   | w1   | w2   | w3   |
40087d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  |w12|w15|         | w0   | w1   | w2   | w3   |
40097d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  +---+---+ - - - - +------+------+------+------+
40107d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  |w13|w16|         | w4   | w5   | w6   | w7   |
40117d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  |w13|w16|         | w4   | w5   | w6   | w7   |
40127d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  |w13|w16|         | w4   | w5   | w6   | w7   |
40137d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  |w13|w16|         | w4   | w5   | w6   | w7   |
40147d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  +---+---+ - - - - +------+------+------+------+
40157d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  |w14|w17|         | w8   | w9   | w10  | w11  |
40167d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  |w14|w17|         | w8   | w9   | w10  | w11  |
40177d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  |w14|w17|         | w8   | w9   | w10  | w11  |
40187d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  |w14|w17|         | w8   | w9   | w10  | w11  |
40197d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  +---+---+ - - - - +------+------+------+------+
40207d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //
40217d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //                            Accumulator
40227d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
40237d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Load 3 x 8 bytes of lhs[] with 2 16-byte overlapped loads.
40247d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.b   $w12, 0(%[lhs_ptr])\n"
40257d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.b   $w13, 8(%[lhs_ptr])\n"
40267d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
40277d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Load 4 bytes of rhs[] for depth 0.
40287d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $a0, 0(%[rhs_ptr])\n"
40297d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $a1, 1(%[rhs_ptr])\n"
40307d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $a2, 2(%[rhs_ptr])\n"
40317d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $a3, 3(%[rhs_ptr])\n"
40327d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
40337d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Zero-extend 8-bit elements of lhs[] to 16 bits.
        // w14 is derived from w13 before w13 is overwritten in place.
40347d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvr.b $w12, $w19, $w12\n"
40357d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvl.b $w14, $w19, $w13\n"
40367d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvr.b $w13, $w19, $w13\n"
40377d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Zero-extend 16-bit elements of lhs[] to 32 bits.
        // w15-w17 are produced first because they read w12-w14, which are
        // then overwritten in place.
40387d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvl.h $w15, $w19, $w12\n"
40397d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvl.h $w16, $w19, $w13\n"
40407d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvl.h $w17, $w19, $w14\n"
40417d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvr.h $w12, $w19, $w12\n"
40427d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvr.h $w13, $w19, $w13\n"
40437d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvr.h $w14, $w19, $w14\n"
40447d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
40457d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Depth 0.
        // Once fill.w has copied a scalar register into w18, that scalar
        // register is reloaded with the matching depth-1 RHS byte (offsets
        // 4..7), so the depth-1 section below needs no further loads.
40467d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w  $w18, $a0\n"
40477d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu     $a0, 4(%[rhs_ptr])\n"
40487d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w0, $w12, $w18\n"
40497d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w4, $w13, $w18\n"
40507d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w8, $w14, $w18\n"
40517d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w  $w18, $a1\n"
40527d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu     $a1, 5(%[rhs_ptr])\n"
40537d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w1, $w12, $w18\n"
40547d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w5, $w13, $w18\n"
40557d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w9, $w14, $w18\n"
40567d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w  $w18, $a2\n"
40577d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu     $a2, 6(%[rhs_ptr])\n"
40587d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w2, $w12, $w18\n"
40597d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w6, $w13, $w18\n"
40607d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w10, $w14, $w18\n"
40617d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w  $w18, $a3\n"
40627d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu     $a3, 7(%[rhs_ptr])\n"
40637d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w3, $w12, $w18\n"
40647d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w7, $w13, $w18\n"
40657d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w11, $w14, $w18\n"
40667d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
40677d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Depth 1.
40687d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w  $w18, $a0\n"
40697d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w0, $w15, $w18\n"
40707d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w4, $w16, $w18\n"
40717d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w8, $w17, $w18\n"
40727d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w  $w18, $a1\n"
40737d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w1, $w15, $w18\n"
40747d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w5, $w16, $w18\n"
40757d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w9, $w17, $w18\n"
40767d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w  $w18, $a2\n"
40777d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w2, $w15, $w18\n"
40787d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w6, $w16, $w18\n"
40797d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w10, $w17, $w18\n"
40807d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w  $w18, $a3\n"
40817d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w3, $w15, $w18\n"
40827d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w7, $w16, $w18\n"
40837d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w11, $w17, $w18\n"
40847d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
        // Consume 2 depth levels, step both operand pointers, and loop until
        // depth is exactly zero. GEMMLOWP_MIPS_XADDIU is a macro defined
        // elsewhere in this file (presumably the pointer-width add-immediate
        // mnemonic; confirm at its definition).
40857d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "addiu  %[depth], -2\n"
40867d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        GEMMLOWP_MIPS_XADDIU " %[lhs_ptr], 24\n"
40877d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        GEMMLOWP_MIPS_XADDIU " %[rhs_ptr], 8\n"
40887d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "bnez   %[depth]," GEMMLOWP_LABEL_LOOP "b\n"
40897d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
40907d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Store accumulators.
40917d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w0,   (0*16)(%[accum_ptr])\n"
40927d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w4,   (1*16)(%[accum_ptr])\n"
40937d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w8,   (2*16)(%[accum_ptr])\n"
40947d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w1,   (3*16)(%[accum_ptr])\n"
40957d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w5,   (4*16)(%[accum_ptr])\n"
40967d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w9,   (5*16)(%[accum_ptr])\n"
40977d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w2,   (6*16)(%[accum_ptr])\n"
40987d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w6,   (7*16)(%[accum_ptr])\n"
40997d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w10,  (8*16)(%[accum_ptr])\n"
41007d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w3,   (9*16)(%[accum_ptr])\n"
41017d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w7,  (10*16)(%[accum_ptr])\n"
41027d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w11, (11*16)(%[accum_ptr])\n"
41037d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        :  // outputs
41047d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
41057d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        [depth] "+r"(depth)
41067d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        :  // inputs
41077d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        [accum_ptr] "r"(accum_ptr)
41087d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        :  // clobbers
        // "memory" covers the accumulator stores; a0-a3 are the scalar RHS
        // temporaries. $f0-$f19 are listed for the w0-w19 vector registers
        // used above (presumably because the MSA registers overlay the FPU
        // registers; confirm against the MSA documentation).
41097d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "memory",
41107d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "a0", "a1", "a2", "a3",
41117d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "$f6", "$f7",
41127d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "$f8", "$f9", "$f10", "$f11", "$f12", "$f13", "$f14", "$f15",
41137d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "$f16", "$f17", "$f18", "$f19");
41147d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  }
41157d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang};
41167d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
41177d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// Assembly implementation of the above
41187d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// MSA_GEMM_12x4_Uint8Operands_Uint32Accumulators_intrinsics2 (TODO).
41197d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// Using 16x16=32 multiplications.
41207d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// 20 MSA regs used:
41217d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 12 accumulators
41227d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 3 lhs
41237d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 4 rhs
41247d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 1 temps/zeroes
41257d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// ~45 instructions in the loop.
41267d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wangstruct MSA_GEMM_12x4_Uint8Operands_Uint32Accumulators_assembly2 {
41277d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  typedef std::uint8_t OperandType;
41287d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  typedef std::int32_t AccumulatorType;
41297d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  typedef KernelFormat<
41307d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 3>,
41317d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 1> >
41327d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      Format;
41337d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  static void Run(OperandType* lhs_ptr, OperandType* rhs_ptr,
41347d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang                  AccumulatorType* accum_ptr, int depth) {
41357d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang    asm volatile(
41367d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Load accumulators
41377d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w0,   (0*16)(%[accum_ptr])\n"
41387d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w4,   (1*16)(%[accum_ptr])\n"
41397d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w8,   (2*16)(%[accum_ptr])\n"
41407d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w1,   (3*16)(%[accum_ptr])\n"
41417d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w5,   (4*16)(%[accum_ptr])\n"
41427d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w9,   (5*16)(%[accum_ptr])\n"
41437d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w2,   (6*16)(%[accum_ptr])\n"
41447d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w6,   (7*16)(%[accum_ptr])\n"
41457d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w10,  (8*16)(%[accum_ptr])\n"
41467d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w3,   (9*16)(%[accum_ptr])\n"
41477d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w7,  (10*16)(%[accum_ptr])\n"
41487d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w11, (11*16)(%[accum_ptr])\n"
41497d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Set a temp to all zeroes.
41507d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldi.b  $w19, 0\n"
41517d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
41527d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        GEMMLOWP_LABEL_LOOP ":\n"
41537d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Overview of register layout:
41547d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //
41557d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // A 2x4 cell of Rhs is stored in 16bit in w15-w18 (each register
41567d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // contains 4 replicas of a pair of elements).
41577d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in w12-w14.
41587d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // A 12x4 block of accumulators is stored in 32bit in w0-w11.
41597d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //
41607d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //                    +-----+-----+-----+-----+
41617d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //               Rhs  | w15 | w16 | w17 | w18 |
41627d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //                    +-----+-----+-----+-----+
41637d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //
41647d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //                    |     |     |     |     |
41657d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //
41667d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //       Lhs          |     |     |     |     |
41677d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //
41687d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      +---+ - - - - +-----+-----+-----+-----+
41697d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      |w12|         | w0  | w1  | w2  | w3  |
41707d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      |w12|         | w0  | w1  | w2  | w3  |
41717d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      |w12|         | w0  | w1  | w2  | w3  |
41727d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      |w12|         | w0  | w1  | w2  | w3  |
41737d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      +---+ - - - - +-----+-----+-----+-----+
41747d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      |w13|         | w4  | w5  | w6  | w7  |
41757d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      |w13|         | w4  | w5  | w6  | w7  |
41767d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      |w13|         | w4  | w5  | w6  | w7  |
41777d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      |w13|         | w4  | w5  | w6  | w7  |
41787d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      +---+ - - - - +-----+-----+-----+-----+
41797d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      |w14|         | w8  | w9  | w10 | w11 |
41807d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      |w14|         | w8  | w9  | w10 | w11 |
41817d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      |w14|         | w8  | w9  | w10 | w11 |
41827d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      |w14|         | w8  | w9  | w10 | w11 |
41837d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      +---+ - - - - +-----+-----+-----+-----+
41847d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //
41857d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //                           Accumulators
41867d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
41877d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Load 3 x 8 bytes of lhs[] with 2 16-byte overlapped loads.
41887d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.b   $w12, 0(%[lhs_ptr])\n"
41897d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.b   $w13, 8(%[lhs_ptr])\n"
41907d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
41917d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Load 4 bytes of rhs[] for depth 0.
41927d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $a0, 0(%[rhs_ptr])\n"
41937d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $a1, 1(%[rhs_ptr])\n"
41947d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $a2, 2(%[rhs_ptr])\n"
41957d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $a3, 3(%[rhs_ptr])\n"
41967d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Load 4 bytes of rhs[] for depth 1.
41977d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $v0, 4(%[rhs_ptr])\n"
41987d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $v1, 5(%[rhs_ptr])\n"
41997d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $t8, 6(%[rhs_ptr])\n"
42007d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $t9, 7(%[rhs_ptr])\n"
42017d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
42027d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Zero-extend 8-bit elements of lhs[] to 16 bits.
42037d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvr.b $w12, $w19, $w12\n"
42047d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvl.b $w14, $w19, $w13\n"
42057d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvr.b $w13, $w19, $w13\n"
42067d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Interleave depth 0 and depth 1 elements of lhs[] for dpadd_u.w.
42077d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvl.d $w15, $w19, $w12\n"
42087d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvl.d $w16, $w19, $w13\n"
42097d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvl.d $w17, $w19, $w14\n"
42107d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvr.h $w12, $w15, $w12\n"
42117d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvr.h $w13, $w16, $w13\n"
42127d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvr.h $w14, $w17, $w14\n"
42137d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
42147d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Combine and interleave depth 0 and depth 1 elements of rhs[] for dpadd_u.w.
42157d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ins    $a0, $v0, 16, 8\n"
42167d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ins    $a1, $v1, 16, 8\n"
42177d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ins    $a2, $t8, 16, 8\n"
42187d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ins    $a3, $t9, 16, 8\n"
42197d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Make 4 replicas of every pair of rhs[] elements.
42207d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w $w15, $a0\n"
42217d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w $w16, $a1\n"
42227d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w $w17, $a2\n"
42237d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w $w18, $a3\n"
42247d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
42257d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Depths 0 and 1.
42267d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Dot-product-(and)-add doubles multiplicand width.
42277d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w0, $w12, $w15\n"
42287d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w4, $w13, $w15\n"
42297d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w8, $w14, $w15\n"
42307d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w1, $w12, $w16\n"
42317d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w5, $w13, $w16\n"
42327d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w9, $w14, $w16\n"
42337d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w2, $w12, $w17\n"
42347d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w6, $w13, $w17\n"
42357d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w10, $w14, $w17\n"
42367d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w3, $w12, $w18\n"
42377d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w7, $w13, $w18\n"
42387d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w11, $w14, $w18\n"
42397d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
42407d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "addiu  %[depth], -2\n"
42417d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        GEMMLOWP_MIPS_XADDIU " %[lhs_ptr], 24\n"
42427d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        GEMMLOWP_MIPS_XADDIU " %[rhs_ptr], 8\n"
42437d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "bnez   %[depth]," GEMMLOWP_LABEL_LOOP "b\n"
42447d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
42457d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Store accumulators.
42467d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w0,   (0*16)(%[accum_ptr])\n"
42477d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w4,   (1*16)(%[accum_ptr])\n"
42487d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w8,   (2*16)(%[accum_ptr])\n"
42497d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w1,   (3*16)(%[accum_ptr])\n"
42507d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w5,   (4*16)(%[accum_ptr])\n"
42517d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w9,   (5*16)(%[accum_ptr])\n"
42527d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w2,   (6*16)(%[accum_ptr])\n"
42537d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w6,   (7*16)(%[accum_ptr])\n"
42547d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w10,  (8*16)(%[accum_ptr])\n"
42557d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w3,   (9*16)(%[accum_ptr])\n"
42567d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w7,  (10*16)(%[accum_ptr])\n"
42577d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w11, (11*16)(%[accum_ptr])\n"
42587d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        :  // outputs
42597d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
42607d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        [depth] "+r"(depth)
42617d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        :  // inputs
42627d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        [accum_ptr] "r"(accum_ptr)
42637d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        :  // clobbers
42647d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "memory",
42657d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "v0", "v1",
42667d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "a0", "a1", "a2", "a3",
42677d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "t8", "t9",
42687d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "$f6", "$f7",
42697d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "$f8", "$f9", "$f10", "$f11", "$f12", "$f13", "$f14", "$f15",
42707d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "$f16", "$f17", "$f18", "$f19");
42717d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  }
42727d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang};
42737d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
42747d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// Using 32x32=32 multiplications.
42757d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// 32 MSA regs used:
42767d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 24 accumulators
42777d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 6 lhs
42787d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 1 rhs
42797d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 1 temps/zeroes
42807d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// ~95 instructions in the loop.
//
// Kernel written with MSA compiler builtins (__builtin_msa_*). It multiplies
// uint8 Lhs/Rhs operands and adds the products into a 12x8 block of uint32
// accumulators, kept as a local 3x8 array of 4-lane vectors. Operands are
// zero-extended all the way to 32 bits before multiplying, hence the
// "32x32=32" above.
42817d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wangstruct MSA_GEMM_12x8_Uint8Operands_Uint32Accumulators_intrinsics {
42827d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  typedef std::uint8_t OperandType;
42837d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  typedef std::uint32_t AccumulatorType;
  // Operand layout consumed by Run(): 3 Lhs cells and 2 Rhs cells, each cell
  // being 8 bytes that Run() reads as 4 depth-0 values followed by 4 depth-1
  // values.
42847d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  typedef KernelFormat<
42857d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 3>,
42867d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 2> >
42877d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      Format;
  // Accumulates the product of a 12-row Lhs block and an 8-column Rhs block
  // into the accumulators at accum_ptr (read on entry, written back on exit).
  //
  // lhs_ptr:   packed Lhs bytes; every loop iteration consumes 24 of them
  //            (3 cells x 4 rows x 2 depth levels).
  // rhs_ptr:   packed Rhs bytes; every loop iteration consumes 16 of them
  //            (2 cells x 4 columns x 2 depth levels).
  // accum_ptr: 96 uint32 values. The 4-lane vector for row group i (0..2) and
  //            column j (0..7) lives at accum_ptr + 4 * (i + 3 * j), i.e. each
  //            column occupies 12 consecutive values.
  // depth:     number of depth levels to process. The loop handles 2 levels
  //            per iteration and stops once depth <= 0, so an odd depth would
  //            process one level more than requested.
  //            NOTE(review): presumably callers always pass a multiple of 2
  //            - confirm.
42887d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
42897d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang                  AccumulatorType* accum_ptr, int depth) {
    // All-zero vector; interleaving data with it below zero-extends the data.
42907d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang    const v16i8 zeroes = __builtin_msa_ldi_b(0);
    // acc[i][j]: 4 accumulators for row group i and column j.
42917d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang    v4i32 acc[3][8];
42927d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang    // Load accumulators.
42937d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang    for (int i = 0; i < 3; i++) {
42947d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      for (int j = 0; j < 8; j++) {
42957d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        acc[i][j] = __builtin_msa_ld_w(accum_ptr + 4 * (i + 3 * j), 0);
42967d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      }
42977d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang    }
42987d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
    // Each iteration processes two depth levels.
42997d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang    while (depth > 0) {
43007d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      // Load 3 x 8 bytes of lhs[] with 2 16-byte overlapped loads.
      // The first load starts at byte 0 and the second at byte 8, so bytes
      // 8..15 are fetched twice and nothing beyond byte 23 is read.
      // lhs[] is typed v8i16 throughout; the casts only reinterpret the bits.
43017d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      v8i16 lhs[6];
43027d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      lhs[0] = reinterpret_cast<v8i16>(__builtin_msa_ld_b(const_cast<OperandType*>(lhs_ptr), 0));
43037d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      lhs[1] =
43047d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang          reinterpret_cast<v8i16>(__builtin_msa_ld_b(const_cast<OperandType*>(lhs_ptr + 8), 0));
43057d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
43067d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      // Zero-extend 8-bit elements of lhs[] to 16 bits.
      // ilvr_b takes the right (low) 8 bytes of its source and ilvl_b the left
      // (high) 8 bytes, so lhs[0] <- bytes 0..7, lhs[2] <- bytes 16..23 and
      // lhs[1] <- bytes 8..15. lhs[2] is derived before lhs[1] is overwritten
      // because both read the second load.
      // NOTE(review): the byte ranges assume little-endian element order
      // - confirm.
43077d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      lhs[0] = reinterpret_cast<v8i16>(__builtin_msa_ilvr_b(zeroes,
43087d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang                                                            reinterpret_cast<v16i8>(lhs[0])));
43097d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      lhs[2] = reinterpret_cast<v8i16>(__builtin_msa_ilvl_b(zeroes,
43107d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang                                                            reinterpret_cast<v16i8>(lhs[1])));
43117d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      lhs[1] = reinterpret_cast<v8i16>(__builtin_msa_ilvr_b(zeroes,
43127d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang                                                            reinterpret_cast<v16i8>(lhs[1])));
43137d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
43147d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      // Zero-extend 16-bit elements of lhs[] to 32 bits.
      // The upper four elements of each vector (ilvl_h) go to lhs[3..5] and
      // are used below for depth 1; the lower four (ilvr_h) stay in lhs[0..2]
      // and are used for depth 0. The ilvl_h results are taken first because
      // the ilvr_h lines overwrite their own inputs.
43157d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      lhs[3] = __builtin_msa_ilvl_h(reinterpret_cast<v8i16>(zeroes), lhs[0]);
43167d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      lhs[4] = __builtin_msa_ilvl_h(reinterpret_cast<v8i16>(zeroes), lhs[1]);
43177d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      lhs[5] = __builtin_msa_ilvl_h(reinterpret_cast<v8i16>(zeroes), lhs[2]);
43187d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      lhs[0] = __builtin_msa_ilvr_h(reinterpret_cast<v8i16>(zeroes), lhs[0]);
43197d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      lhs[1] = __builtin_msa_ilvr_h(reinterpret_cast<v8i16>(zeroes), lhs[1]);
43207d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      lhs[2] = __builtin_msa_ilvr_h(reinterpret_cast<v8i16>(zeroes), lhs[2]);
43217d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
      // workaround_msa_maddv_w() is defined elsewhere in this file; presumably
      // it returns acc + lhs * rhs per 32-bit lane, like the maddv.w
      // instruction - TODO confirm.
43227d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      // Depth 0.
      // Columns 0..3 use rhs bytes 0..3.
43237d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      for (int j = 0; j < 4; j++) {
43247d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Load 1 byte of rhs, making 4 32-bit replicas of it.
43257d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        v4i32 rhs = reinterpret_cast<v4i32>(__builtin_msa_fill_w(rhs_ptr[j]));
43267d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Multiply-add into accumulators.
43277d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        for (int i = 0; i < 3; i++) {
43287d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang          acc[i][j] = workaround_msa_maddv_w(acc[i][j], reinterpret_cast<v4i32>(lhs[i]), rhs);
43297d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        }
43307d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      }
      // Columns 4..7 use rhs bytes 8..11 (index j + 4): bytes 4..7 hold the
      // depth-1 values of columns 0..3 and are consumed further down.
43317d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      for (int j = 4; j < 8; j++) {
43327d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Load 1 byte of rhs, making 4 32-bit replicas of it.
43337d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        v4i32 rhs = reinterpret_cast<v4i32>(__builtin_msa_fill_w(rhs_ptr[j + 4]));
43347d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Multiply-add into accumulators.
43357d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        for (int i = 0; i < 3; i++) {
43367d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang          acc[i][j] = workaround_msa_maddv_w(acc[i][j], reinterpret_cast<v4i32>(lhs[i]), rhs);
43377d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        }
43387d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      }
43397d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
43407d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      // Depth 1.
      // Same accumulators as above, now fed with lhs[3..5].
      // Columns 0..3 use rhs bytes 4..7 (index j + 4).
43417d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      for (int j = 0; j < 4; j++) {
43427d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Load 1 byte of rhs, making 4 32-bit replicas of it.
43437d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        v4i32 rhs = reinterpret_cast<v4i32>(__builtin_msa_fill_w(rhs_ptr[j + 4]));
43447d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Multiply-add into accumulators.
43457d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        for (int i = 0; i < 3; i++) {
43467d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang          acc[i][j] = workaround_msa_maddv_w(acc[i][j], reinterpret_cast<v4i32>(lhs[i + 3]), rhs);
43477d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        }
43487d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      }
      // Columns 4..7 use rhs bytes 12..15 (index j + 8).
43497d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      for (int j = 4; j < 8; j++) {
43507d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Load 1 byte of rhs, making 4 32-bit replicas of it.
43517d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        v4i32 rhs = reinterpret_cast<v4i32>(__builtin_msa_fill_w(rhs_ptr[j + 8]));
43527d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Multiply-add into accumulators.
43537d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        for (int i = 0; i < 3; i++) {
43547d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang          acc[i][j] = workaround_msa_maddv_w(acc[i][j], reinterpret_cast<v4i32>(lhs[i + 3]), rhs);
43557d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        }
43567d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      }
43577d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
      // Step past the 24 Lhs and 16 Rhs bytes consumed for these two depth
      // levels.
43587d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      lhs_ptr += 24;
43597d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      rhs_ptr += 16;
43607d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      depth -= 2;
43617d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang    }
43627d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
43637d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang    // Store accumulators.
    // Same addressing as the load at the top of Run().
43647d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang    for (int i = 0; i < 3; i++) {
43657d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      for (int j = 0; j < 8; j++) {
43667d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        __builtin_msa_st_w(acc[i][j], accum_ptr + 4 * (i + 3 * j), 0);
43677d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      }
43687d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang    }
43697d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  }
43707d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang};
43717d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
43727d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// Assembly implementation of the above
43737d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// MSA_GEMM_12x8_Uint8Operands_Uint32Accumulators_intrinsics.
43747d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// Using 32x32=32 multiplications.
43757d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// 32 MSA regs used:
43767d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 24 accumulators
43777d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 6 lhs
43787d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 1 rhs
43797d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 1 temps/zeroes
43807d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// ~95 instructions in the loop.
43817d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wangstruct MSA_GEMM_12x8_Uint8Operands_Uint32Accumulators_assembly {
43827d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  typedef std::uint8_t OperandType;
43837d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  typedef std::uint32_t AccumulatorType;
43847d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  typedef KernelFormat<
43857d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 3>,
43867d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 2> >
43877d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      Format;
43887d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  static void Run(OperandType* lhs_ptr, OperandType* rhs_ptr,
43897d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang                  AccumulatorType* accum_ptr, int depth) {
43907d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang    asm volatile(
43917d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Load accumulators
43927d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w0,   (0*16)(%[accum_ptr])\n"
43937d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w4,   (1*16)(%[accum_ptr])\n"
43947d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w8,   (2*16)(%[accum_ptr])\n"
43957d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w1,   (3*16)(%[accum_ptr])\n"
43967d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w5,   (4*16)(%[accum_ptr])\n"
43977d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w9,   (5*16)(%[accum_ptr])\n"
43987d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w2,   (6*16)(%[accum_ptr])\n"
43997d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w6,   (7*16)(%[accum_ptr])\n"
44007d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w10,  (8*16)(%[accum_ptr])\n"
44017d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w3,   (9*16)(%[accum_ptr])\n"
44027d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w7,  (10*16)(%[accum_ptr])\n"
44037d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w11, (11*16)(%[accum_ptr])\n"
44047d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w12, (12*16)(%[accum_ptr])\n"
44057d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w16, (13*16)(%[accum_ptr])\n"
44067d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w20, (14*16)(%[accum_ptr])\n"
44077d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w13, (15*16)(%[accum_ptr])\n"
44087d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w17, (16*16)(%[accum_ptr])\n"
44097d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w21, (17*16)(%[accum_ptr])\n"
44107d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w14, (18*16)(%[accum_ptr])\n"
44117d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w18, (19*16)(%[accum_ptr])\n"
44127d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w22, (20*16)(%[accum_ptr])\n"
44137d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w15, (21*16)(%[accum_ptr])\n"
44147d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w19, (22*16)(%[accum_ptr])\n"
44157d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w23, (23*16)(%[accum_ptr])\n"
44167d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Set a temp to all zeroes.
44177d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldi.b  $w31, 0\n"
44187d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
44197d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        GEMMLOWP_LABEL_LOOP ":\n"
44207d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Overview of register layout:
44217d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //
44227d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // A quarter of the 2 2x4 cells of Rhs is stored in 32bit in w30.
44237d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // A 12x2 block of 3 4x2 cells Lhs is stored in 32bit in w24-w29.
44247d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // A 12x8 block of accumulators is stored in 32bit in w0-w23.
44257d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //
44267d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //                    +------+------+------+------+
44277d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //               Rhs  |w30[0]|w30[1]|w30[2]|w30[3]|
44287d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //                    +------+------+------+------+
44297d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //
44307d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //                    |      |      |      |      |
44317d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //
44327d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //    Lhs             |      |      |      |      |
44337d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //
44347d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  +---+---+ - - - - +------+------+------+------+
44357d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  |w24|w27|         |w0/12 |w1/13 |w2/14 |w3/15 |
44367d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  |w24|w27|         |w0/12 |w1/13 |w2/14 |w3/15 |
44377d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  |w24|w27|         |w0/12 |w1/13 |w2/14 |w3/15 |
44387d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  |w24|w27|         |w0/12 |w1/13 |w2/14 |w3/15 |
44397d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  +---+---+ - - - - +------+------+------+------+
44407d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  |w25|w28|         |w4/16 |w5/17 |w6/18 |w7/19 |
44417d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  |w25|w28|         |w4/16 |w5/17 |w6/18 |w7/19 |
44427d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  |w25|w28|         |w4/16 |w5/17 |w6/18 |w7/19 |
44437d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  |w25|w28|         |w4/16 |w5/17 |w6/18 |w7/19 |
44447d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  +---+---+ - - - - +------+------+------+------+
44457d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  |w26|w29|         |w8/20 |w9/21 |w10/22|w11/23|
44467d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  |w26|w29|         |w8/20 |w9/21 |w10/22|w11/23|
44477d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  |w26|w29|         |w8/20 |w9/21 |w10/22|w11/23|
44487d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  |w26|w29|         |w8/20 |w9/21 |w10/22|w11/23|
44497d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //  +---+---+ - - - - +------+------+------+------+
44507d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //
44517d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //                            Accumulator
44527d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
44537d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Load 3 x 8 bytes of lhs[] with 2 16-byte overlapped loads.
44547d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.b   $w24, 0(%[lhs_ptr])\n"
44557d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.b   $w25, 8(%[lhs_ptr])\n"
44567d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
44577d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Load 4 bytes of rhs[] for the first half of depth 0.
44587d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $a0, 0(%[rhs_ptr])\n"
44597d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $a1, 1(%[rhs_ptr])\n"
44607d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $a2, 2(%[rhs_ptr])\n"
44617d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $a3, 3(%[rhs_ptr])\n"
44627d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
44637d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Zero-extend 8-bit elements of lhs[] to 16 bits.
44647d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvr.b $w24, $w31, $w24\n"
44657d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvl.b $w26, $w31, $w25\n"
44667d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvr.b $w25, $w31, $w25\n"
44677d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Zero-extend 16-bit elements of lhs[] to 32 bits.
44687d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvl.h $w27, $w31, $w24\n"
44697d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvl.h $w28, $w31, $w25\n"
44707d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvl.h $w29, $w31, $w26\n"
44717d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvr.h $w24, $w31, $w24\n"
44727d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvr.h $w25, $w31, $w25\n"
44737d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvr.h $w26, $w31, $w26\n"
44747d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
44757d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Depth 0.
44767d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w  $w30, $a0\n"
44777d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu     $a0, 8(%[rhs_ptr])\n"
44787d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w0, $w24, $w30\n"
44797d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w4, $w25, $w30\n"
44807d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w8, $w26, $w30\n"
44817d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w  $w30, $a1\n"
44827d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu     $a1, 9(%[rhs_ptr])\n"
44837d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w1, $w24, $w30\n"
44847d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w5, $w25, $w30\n"
44857d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w9, $w26, $w30\n"
44867d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w  $w30, $a2\n"
44877d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu     $a2, 10(%[rhs_ptr])\n"
44887d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w2, $w24, $w30\n"
44897d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w6, $w25, $w30\n"
44907d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w10, $w26, $w30\n"
44917d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w  $w30, $a3\n"
44927d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu     $a3, 11(%[rhs_ptr])\n"
44937d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w3, $w24, $w30\n"
44947d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w7, $w25, $w30\n"
44957d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w11, $w26, $w30\n"
44967d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
44977d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w  $w30, $a0\n"
44987d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu     $a0, 4(%[rhs_ptr])\n"
44997d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w12, $w24, $w30\n"
45007d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w16, $w25, $w30\n"
45017d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w20, $w26, $w30\n"
45027d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w  $w30, $a1\n"
45037d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu     $a1, 5(%[rhs_ptr])\n"
45047d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w13, $w24, $w30\n"
45057d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w17, $w25, $w30\n"
45067d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w21, $w26, $w30\n"
45077d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w  $w30, $a2\n"
45087d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu     $a2, 6(%[rhs_ptr])\n"
45097d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w14, $w24, $w30\n"
45107d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w18, $w25, $w30\n"
45117d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w22, $w26, $w30\n"
45127d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w  $w30, $a3\n"
45137d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu     $a3, 7(%[rhs_ptr])\n"
45147d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w15, $w24, $w30\n"
45157d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w19, $w25, $w30\n"
45167d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w23, $w26, $w30\n"
45177d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
45187d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Depth 1.
45197d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w  $w30, $a0\n"
45207d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu     $a0, 12(%[rhs_ptr])\n"
45217d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w0, $w27, $w30\n"
45227d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w4, $w28, $w30\n"
45237d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w8, $w29, $w30\n"
45247d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w  $w30, $a1\n"
45257d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu     $a1, 13(%[rhs_ptr])\n"
45267d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w1, $w27, $w30\n"
45277d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w5, $w28, $w30\n"
45287d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w9, $w29, $w30\n"
45297d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w  $w30, $a2\n"
45307d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu     $a2, 14(%[rhs_ptr])\n"
45317d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w2, $w27, $w30\n"
45327d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w6, $w28, $w30\n"
45337d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w10, $w29, $w30\n"
45347d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w  $w30, $a3\n"
45357d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu     $a3, 15(%[rhs_ptr])\n"
45367d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w3, $w27, $w30\n"
45377d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w7, $w28, $w30\n"
45387d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w11, $w29, $w30\n"
45397d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
45407d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w  $w30, $a0\n"
45417d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w12, $w27, $w30\n"
45427d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w16, $w28, $w30\n"
45437d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w20, $w29, $w30\n"
45447d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w  $w30, $a1\n"
45457d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w13, $w27, $w30\n"
45467d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w17, $w28, $w30\n"
45477d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w21, $w29, $w30\n"
45487d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w  $w30, $a2\n"
45497d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w14, $w27, $w30\n"
45507d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w18, $w28, $w30\n"
45517d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w22, $w29, $w30\n"
45527d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w  $w30, $a3\n"
45537d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w15, $w27, $w30\n"
45547d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w19, $w28, $w30\n"
45557d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "maddv.w $w23, $w29, $w30\n"
45567d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
45577d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "addiu  %[depth], -2\n"
45587d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        GEMMLOWP_MIPS_XADDIU " %[lhs_ptr], 24\n"
45597d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        GEMMLOWP_MIPS_XADDIU " %[rhs_ptr], 16\n"
45607d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "bnez   %[depth]," GEMMLOWP_LABEL_LOOP "b\n"
45617d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
45627d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Store accumulators.
45637d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w0,   (0*16)(%[accum_ptr])\n"
45647d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w4,   (1*16)(%[accum_ptr])\n"
45657d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w8,   (2*16)(%[accum_ptr])\n"
45667d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w1,   (3*16)(%[accum_ptr])\n"
45677d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w5,   (4*16)(%[accum_ptr])\n"
45687d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w9,   (5*16)(%[accum_ptr])\n"
45697d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w2,   (6*16)(%[accum_ptr])\n"
45707d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w6,   (7*16)(%[accum_ptr])\n"
45717d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w10,  (8*16)(%[accum_ptr])\n"
45727d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w3,   (9*16)(%[accum_ptr])\n"
45737d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w7,  (10*16)(%[accum_ptr])\n"
45747d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w11, (11*16)(%[accum_ptr])\n"
45757d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w12, (12*16)(%[accum_ptr])\n"
45767d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w16, (13*16)(%[accum_ptr])\n"
45777d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w20, (14*16)(%[accum_ptr])\n"
45787d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w13, (15*16)(%[accum_ptr])\n"
45797d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w17, (16*16)(%[accum_ptr])\n"
45807d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w21, (17*16)(%[accum_ptr])\n"
45817d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w14, (18*16)(%[accum_ptr])\n"
45827d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w18, (19*16)(%[accum_ptr])\n"
45837d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w22, (20*16)(%[accum_ptr])\n"
45847d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w15, (21*16)(%[accum_ptr])\n"
45857d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w19, (22*16)(%[accum_ptr])\n"
45867d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w23, (23*16)(%[accum_ptr])\n"
45877d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        :  // outputs
45887d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
45897d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        [depth] "+r"(depth)
45907d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        :  // inputs
45917d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        [accum_ptr] "r"(accum_ptr)
45927d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        :  // clobbers
45937d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "memory",
45947d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "a0", "a1", "a2", "a3",
45957d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "$f6", "$f7",
45967d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "$f8", "$f9", "$f10", "$f11", "$f12", "$f13", "$f14", "$f15",
45977d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "$f16", "$f17", "$f18", "$f19", "$f20", "$f21", "$f22", "$f23",
45987d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "$f24", "$f25", "$f26", "$f27", "$f28", "$f29", "$f30", "$f31");
45997d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  }
46007d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang};
46017d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
46027d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// Assembly implementation of the above
46037d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// MSA_GEMM_12x8_Uint8Operands_Uint32Accumulators_intrinsics2 (TODO).
46047d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// Using 16x16=32 multiplications.
46057d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// 32 MSA regs used:
46067d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 24 accumulators
46077d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 3 lhs
46087d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 4 rhs
46097d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// - 1 temps/zeroes
46107d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang// ~70 instructions in the loop.
46117d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wangstruct MSA_GEMM_12x8_Uint8Operands_Uint32Accumulators_assembly2 {
46127d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  typedef std::uint8_t OperandType;
46137d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  typedef std::uint32_t AccumulatorType;
46147d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  typedef KernelFormat<
46157d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 3>,
46167d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 2> >
46177d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang      Format;
46187d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  static void Run(OperandType* lhs_ptr, OperandType* rhs_ptr,
46197d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang                  AccumulatorType* accum_ptr, int depth) {
46207d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang    asm volatile(
46217d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Load accumulators
46227d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w0,   (0*16)(%[accum_ptr])\n"
46237d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w4,   (1*16)(%[accum_ptr])\n"
46247d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w8,   (2*16)(%[accum_ptr])\n"
46257d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w1,   (3*16)(%[accum_ptr])\n"
46267d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w5,   (4*16)(%[accum_ptr])\n"
46277d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w9,   (5*16)(%[accum_ptr])\n"
46287d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w2,   (6*16)(%[accum_ptr])\n"
46297d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w6,   (7*16)(%[accum_ptr])\n"
46307d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w10,  (8*16)(%[accum_ptr])\n"
46317d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w3,   (9*16)(%[accum_ptr])\n"
46327d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w7,  (10*16)(%[accum_ptr])\n"
46337d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w11, (11*16)(%[accum_ptr])\n"
46347d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w12, (12*16)(%[accum_ptr])\n"
46357d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w16, (13*16)(%[accum_ptr])\n"
46367d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w20, (14*16)(%[accum_ptr])\n"
46377d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w13, (15*16)(%[accum_ptr])\n"
46387d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w17, (16*16)(%[accum_ptr])\n"
46397d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w21, (17*16)(%[accum_ptr])\n"
46407d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w14, (18*16)(%[accum_ptr])\n"
46417d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w18, (19*16)(%[accum_ptr])\n"
46427d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w22, (20*16)(%[accum_ptr])\n"
46437d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w15, (21*16)(%[accum_ptr])\n"
46447d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w19, (22*16)(%[accum_ptr])\n"
46457d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.w   $w23, (23*16)(%[accum_ptr])\n"
46467d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Set a temp to all zeroes.
46477d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ldi.b  $w31, 0\n"
46487d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
46497d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        GEMMLOWP_LABEL_LOOP ":\n"
46507d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Overview of register layout:
46517d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //
46527d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // A half of the 2 2x4 cells of Rhs is stored in 16bit in w27-w30
46537d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // (each register contains 4 replicas of a pair of elements).
46547d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in w24-w26.
46557d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // A 12x8 block of accumulators is stored in 32bit in w0-w23.
46567d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //
46577d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //                    +------+------+------+------+
46587d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //               Rhs  |w27   |w28   |w29   |w30   |
46597d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //                    +------+------+------+------+
46607d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //
46617d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //                    |      |      |      |      |
46627d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //
46637d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //       Lhs          |      |      |      |      |
46647d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //
46657d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      +---+ - - - - +------+------+------+------+
46667d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      |w24|         |w0/12 |w1/13 |w2/14 |w3/15 |
46677d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      |w24|         |w0/12 |w1/13 |w2/14 |w3/15 |
46687d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      |w24|         |w0/12 |w1/13 |w2/14 |w3/15 |
46697d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      |w24|         |w0/12 |w1/13 |w2/14 |w3/15 |
46707d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      +---+ - - - - +------+------+------+------+
46717d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      |w25|         |w4/16 |w5/17 |w6/18 |w7/19 |
46727d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      |w25|         |w4/16 |w5/17 |w6/18 |w7/19 |
46737d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      |w25|         |w4/16 |w5/17 |w6/18 |w7/19 |
46747d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      |w25|         |w4/16 |w5/17 |w6/18 |w7/19 |
46757d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      +---+ - - - - +------+------+------+------+
46767d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      |w26|         |w8/20 |w9/21 |w10/22|w11/23|
46777d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      |w26|         |w8/20 |w9/21 |w10/22|w11/23|
46787d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      |w26|         |w8/20 |w9/21 |w10/22|w11/23|
46797d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      |w26|         |w8/20 |w9/21 |w10/22|w11/23|
46807d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //      +---+ - - - - +------+------+------+------+
46817d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //
46827d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        //                             Accumulators
46837d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
46847d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Load 3 x 8 bytes of lhs[] with 2 16-byte overlapped loads.
46857d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.b   $w24, 0(%[lhs_ptr])\n"
46867d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ld.b   $w25, 8(%[lhs_ptr])\n"
46877d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
46887d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Load 4 bytes of rhs[] for the first half of depth 0.
46897d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $a0, 0(%[rhs_ptr])\n"
46907d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $a1, 1(%[rhs_ptr])\n"
46917d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $a2, 2(%[rhs_ptr])\n"
46927d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $a3, 3(%[rhs_ptr])\n"
46937d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Load 4 bytes of rhs[] for the first half of depth 1.
46947d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $v0, 4(%[rhs_ptr])\n"
46957d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $v1, 5(%[rhs_ptr])\n"
46967d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $t8, 6(%[rhs_ptr])\n"
46977d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $t9, 7(%[rhs_ptr])\n"
46987d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
46997d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Zero-extend 8-bit elements of lhs[] to 16 bits.
47007d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvr.b $w24, $w31, $w24\n"
47017d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvl.b $w26, $w31, $w25\n"
47027d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvr.b $w25, $w31, $w25\n"
47037d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Interleave depth 0 and depth 1 elements of lhs[] for dpadd_u.w.
47047d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvl.d $w27, $w31, $w24\n"
47057d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvl.d $w28, $w31, $w25\n"
47067d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvl.d $w29, $w31, $w26\n"
47077d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvr.h $w24, $w27, $w24\n"
47087d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvr.h $w25, $w28, $w25\n"
47097d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ilvr.h $w26, $w29, $w26\n"
47107d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
47117d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Combine and interleave depth 0 and depth 1 elements of rhs[] for dpadd_u.w
47127d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // (for the first half).
47137d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ins    $a0, $v0, 16, 8\n"
47147d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ins    $a1, $v1, 16, 8\n"
47157d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ins    $a2, $t8, 16, 8\n"
47167d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ins    $a3, $t9, 16, 8\n"
47177d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Make 4 replicas of every pair of rhs[] elements.
47187d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w $w27, $a0\n"
47197d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w $w28, $a1\n"
47207d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w $w29, $a2\n"
47217d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w $w30, $a3\n"
47227d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
47237d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Load 4 bytes of rhs[] for the second half of depth 0.
47247d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $a0, 8(%[rhs_ptr])\n"
47257d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $a1, 9(%[rhs_ptr])\n"
47267d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $a2, 10(%[rhs_ptr])\n"
47277d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $a3, 11(%[rhs_ptr])\n"
47287d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Load 4 bytes of rhs[] for the second half of depth 1.
47297d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $v0, 12(%[rhs_ptr])\n"
47307d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $v1, 13(%[rhs_ptr])\n"
47317d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $t8, 14(%[rhs_ptr])\n"
47327d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "lbu    $t9, 15(%[rhs_ptr])\n"
47337d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
47347d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // First half of depths 0 and 1.
47357d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Dot-product-(and)-add doubles multiplicand width.
47367d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w0, $w24, $w27\n"
47377d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w4, $w25, $w27\n"
47387d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w8, $w26, $w27\n"
47397d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w1, $w24, $w28\n"
47407d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w5, $w25, $w28\n"
47417d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w9, $w26, $w28\n"
47427d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w2, $w24, $w29\n"
47437d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w6, $w25, $w29\n"
47447d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w10, $w26, $w29\n"
47457d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w3, $w24, $w30\n"
47467d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w7, $w25, $w30\n"
47477d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w11, $w26, $w30\n"
47487d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
47497d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Combine and interleave depth 0 and depth 1 elements of rhs[] for dpadd_u.w
47507d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // (for the second half).
47517d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ins    $a0, $v0, 16, 8\n"
47527d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ins    $a1, $v1, 16, 8\n"
47537d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ins    $a2, $t8, 16, 8\n"
47547d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "ins    $a3, $t9, 16, 8\n"
47557d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Make 4 replicas of every pair of rhs[] elements.
47567d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w $w27, $a0\n"
47577d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w $w28, $a1\n"
47587d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w $w29, $a2\n"
47597d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "fill.w $w30, $a3\n"
47607d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
47617d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Second half of depths 0 and 1.
47627d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Dot-product-(and)-add doubles multiplicand width.
47637d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w12, $w24, $w27\n"
47647d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w16, $w25, $w27\n"
47657d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w20, $w26, $w27\n"
47667d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w13, $w24, $w28\n"
47677d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w17, $w25, $w28\n"
47687d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w21, $w26, $w28\n"
47697d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w14, $w24, $w29\n"
47707d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w18, $w25, $w29\n"
47717d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w22, $w26, $w29\n"
47727d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w15, $w24, $w30\n"
47737d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w19, $w25, $w30\n"
47747d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "dpadd_u.w $w23, $w26, $w30\n"
47757d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
47767d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "addiu  %[depth], -2\n"
47777d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        GEMMLOWP_MIPS_XADDIU " %[lhs_ptr], 24\n"
47787d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        GEMMLOWP_MIPS_XADDIU " %[rhs_ptr], 16\n"
47797d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "bnez   %[depth]," GEMMLOWP_LABEL_LOOP "b\n"
47807d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang
47817d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        // Store accumulators.
47827d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w0,   (0*16)(%[accum_ptr])\n"
47837d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w4,   (1*16)(%[accum_ptr])\n"
47847d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w8,   (2*16)(%[accum_ptr])\n"
47857d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w1,   (3*16)(%[accum_ptr])\n"
47867d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w5,   (4*16)(%[accum_ptr])\n"
47877d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w9,   (5*16)(%[accum_ptr])\n"
47887d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w2,   (6*16)(%[accum_ptr])\n"
47897d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w6,   (7*16)(%[accum_ptr])\n"
47907d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w10,  (8*16)(%[accum_ptr])\n"
47917d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w3,   (9*16)(%[accum_ptr])\n"
47927d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w7,  (10*16)(%[accum_ptr])\n"
47937d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w11, (11*16)(%[accum_ptr])\n"
47947d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w12, (12*16)(%[accum_ptr])\n"
47957d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w16, (13*16)(%[accum_ptr])\n"
47967d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w20, (14*16)(%[accum_ptr])\n"
47977d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w13, (15*16)(%[accum_ptr])\n"
47987d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w17, (16*16)(%[accum_ptr])\n"
47997d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w21, (17*16)(%[accum_ptr])\n"
48007d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w14, (18*16)(%[accum_ptr])\n"
48017d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w18, (19*16)(%[accum_ptr])\n"
48027d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w22, (20*16)(%[accum_ptr])\n"
48037d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w15, (21*16)(%[accum_ptr])\n"
48047d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w19, (22*16)(%[accum_ptr])\n"
48057d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "st.w   $w23, (23*16)(%[accum_ptr])\n"
48067d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        :  // outputs
48077d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
48087d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        [depth] "+r"(depth)
48097d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        :  // inputs
48107d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        [accum_ptr] "r"(accum_ptr)
48117d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        :  // clobbers
48127d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "memory",
48137d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "v0", "v1",
48147d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "a0", "a1", "a2", "a3",
48157d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "t8", "t9",
48167d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "$f6", "$f7",
48177d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "$f8", "$f9", "$f10", "$f11", "$f12", "$f13", "$f14", "$f15",
48187d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "$f16", "$f17", "$f18", "$f19", "$f20", "$f21", "$f22", "$f23",
48197d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang        "$f24", "$f25", "$f26", "$f27", "$f28", "$f29", "$f30", "$f31");
48207d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  }
48217d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang};
48227d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang#endif  // __mips
4823a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
// BEGIN code copied from gemmlowp/internal/kernel_reference.h
4825a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4826a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// This kernel is templatized in an arbitrary Format template parameter,
4827a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// allowing it to have any arbitrary format.
4828a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <typename tOperandType, typename tAccumulatorType, typename tFormat>
4829a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct ReferenceKernel {
4830a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef tOperandType OperandType;
4831a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef tAccumulatorType AccumulatorType;
4832a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef tFormat Format;
4833a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4834a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
4835a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                  AccumulatorType* accum_ptr, int depth) {
4836a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    const int depth_cells = static_cast<int>(depth / Format::kDepth);
4837a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4838a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    // The outer loop is over the depth dimension.
4839a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    for (int dc = 0; dc < depth_cells; dc++) {
4840a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // The next two loops are over cells of the Lhs (stacked vertically),
4841a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // and over cells of the Rhs (stacked horizontally).
4842a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      for (int rc = 0; rc < Format::Lhs::kCells; rc++) {
4843a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        const OperandType* lhs_cell_ptr =
4844a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            lhs_ptr + (dc * Format::Lhs::kCells + rc) *
4845a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                          Format::Lhs::Cell::kWidth * Format::kDepth;
4846a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        for (int cc = 0; cc < Format::Rhs::kCells; cc++) {
4847a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          const OperandType* rhs_cell_ptr =
4848a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang              rhs_ptr + (dc * Format::Rhs::kCells + cc) *
4849a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                            Format::Rhs::Cell::kWidth * Format::kDepth;
4850a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4851a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          // Now we are inside one cell of the Lhs and inside one cell
4852a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          // of the Rhs, so the remaining inner loops are just
4853a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          // traditional three loops of matrix multiplication.
4854a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          for (int di = 0; di < Format::kDepth; di++) {
4855a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            for (int ri = 0; ri < Format::Lhs::Cell::kWidth; ri++) {
4856a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang              for (int ci = 0; ci < Format::Rhs::Cell::kWidth; ci++) {
4857a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                const OperandType* lhs_coeff_ptr =
4858a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                    lhs_cell_ptr +
4859a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                    OffsetIntoCell<typename Format::Lhs::Cell>(ri, di);
4860a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                const OperandType* rhs_coeff_ptr =
4861a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                    rhs_cell_ptr +
4862a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                    OffsetIntoCell<typename Format::Rhs::Cell>(ci, di);
4863a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                AccumulatorType* accumulator_coeff_ptr =
4864a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                    accum_ptr + (ri + rc * Format::Lhs::Cell::kWidth) +
4865a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                    (ci + cc * Format::Rhs::Cell::kWidth) * Format::kRows;
4866a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                *accumulator_coeff_ptr += AccumulatorType(*lhs_coeff_ptr) *
4867a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                          AccumulatorType(*rhs_coeff_ptr);
4868a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang              }
4869a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            }
4870a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          }
4871a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        }
4872a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      }
4873a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    }
4874a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
4875a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang};
4876a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4877a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// END code copied from gemmlowp/internal/kernel_reference.h
4878a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4879a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <typename DataType>
4880a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangclass CacheLineAlignedBuffer {
4881a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang public:
4882a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  CacheLineAlignedBuffer(std::size_t size) : size_(size) {
4883a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    data_ = nullptr;
4884a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    // Adds a few bytes of padding here, because the 64-bit 'A57' kernel
4885a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    // reads one iteration past the end the buffer, causing a crash on iOS.
48867d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang    int res = posix_memalign(reinterpret_cast<void**>(&data_), kCacheLineSize,
48877d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang                             size_ * sizeof(DataType) + 16);
48887d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang    (void)res;
4889a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
4890a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4891a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  ~CacheLineAlignedBuffer() { free(data_); }
4892a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4893a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  const DataType* data() const { return data_; }
4894a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  DataType* data() { return data_; }
4895a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
48967d0d5a611e629e7c8946e6720baa6846ade9f015Miao Wang  std::size_t size() const { return size_; }
4897a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4898a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang private:
4899a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  const std::size_t size_;
4900a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  DataType* data_;
4901a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang};
4902a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4903a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <typename DataType>
4904a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangvoid FillRandom(CacheLineAlignedBuffer<DataType>* buffer) {
4905a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  static std::mt19937 generator(0);
4906a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  // 100 is smaller than any nonzero bound of the range of any data type.
4907a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  const DataType kMaxVal = DataType(100);
4908a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  const DataType kMinVal =
4909a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      std::is_signed<DataType>::value ? -kMaxVal : DataType(0);
4910a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::uniform_real_distribution<float> dist(kMinVal, kMaxVal);
4911a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  for (std::size_t i = 0; i < buffer->size(); i++) {
4912a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    buffer->data()[i] = DataType(dist(generator));
4913a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
4914a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
4915a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4916a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <typename DataType>
4917a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangvoid FillZero(CacheLineAlignedBuffer<DataType>* buffer) {
4918a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  for (std::size_t i = 0; i < buffer->size(); i++) {
4919a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    buffer->data()[i] = DataType(0);
4920a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
4921a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
4922a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4923a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <typename DataType>
4924a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangvoid Copy(CacheLineAlignedBuffer<DataType>* dst,
4925a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          const CacheLineAlignedBuffer<DataType>& src) {
4926a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  assert(dst->size() == src.size());
4927a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  memcpy(dst->data(), src.data(), src.size() * sizeof(DataType));
4928a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
4929a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
// Writes a rows x cols matrix to std::cerr, one matrix row per line with a
// space after every value, followed by one empty line. The element at
// (r, c) is read from data[r * rowstride + c * colstride] and printed as a
// double.
template <typename DataType>
void PrintMatrix(int rows, int cols, int rowstride, int colstride,
                 const DataType* data) {
  for (int row = 0; row < rows; ++row) {
    const int row_offset = row * rowstride;
    for (int col = 0; col < cols; ++col) {
      const double value = double(data[row_offset + col * colstride]);
      std::cerr << value << " ";
    }
    std::cerr << std::endl;
  }
  std::cerr << std::endl;
}
4941a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
// Generic comparison: two values match only when they are exactly equal.
template <typename DataType>
bool approx_equals(DataType a, DataType b) {
  return a == b;
}

// float comparison: values match when they are exactly equal, or when their
// difference is below 10% of the smaller of the two magnitudes. NaN never
// matches anything.
template <>
bool approx_equals(float a, float b) {
  // Exact equality covers the both-zero case and also equal infinities, for
  // which the relative test below would evaluate inf - inf = NaN and fail.
  if (a == b) {
    return true;
  }
  // 1e-1 is very coarse accuracy, we should switch to an overall L2 metric
  // and tighten the tolerance on that metric.
  const float smaller_magnitude = std::min(std::abs(a), std::abs(b));
  return std::abs(a - b) < 1e-1f * smaller_magnitude;
}
4956a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4957a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <typename Kernel>
4958a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangvoid test_kernel(int depth, const char* kernel_name) {
4959a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef typename Kernel::OperandType OperandType;
4960a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef typename Kernel::AccumulatorType AccumulatorType;
4961a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef typename Kernel::Format Format;
4962a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  static const int kLhsWidth = Format::Lhs::kWidth;
4963a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  static const int kRhsWidth = Format::Rhs::kWidth;
4964a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4965a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef ReferenceKernel<OperandType, AccumulatorType, Format> ReferenceKernel;
4966a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4967a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  CacheLineAlignedBuffer<OperandType> lhs(kLhsWidth * depth);
4968a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  CacheLineAlignedBuffer<OperandType> rhs(kRhsWidth * depth);
4969a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  CacheLineAlignedBuffer<AccumulatorType> accum_initial(kLhsWidth * kRhsWidth);
4970a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  CacheLineAlignedBuffer<AccumulatorType> accum(kLhsWidth * kRhsWidth);
4971a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  CacheLineAlignedBuffer<AccumulatorType> accum_reference(kLhsWidth *
4972a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                                          kRhsWidth);
4973a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4974a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  FillRandom(&lhs);
4975a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  FillRandom(&rhs);
4976a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  FillRandom(&accum_initial);
4977a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  Copy(&accum, accum_initial);
4978a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  Copy(&accum_reference, accum_initial);
4979a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4980a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  ReferenceKernel::Run(lhs.data(), rhs.data(), accum_reference.data(), depth);
4981a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  Kernel::Run(lhs.data(), rhs.data(), accum.data(), depth);
4982a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4983a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  for (int l = 0; l < kLhsWidth; l++) {
4984a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    for (int r = 0; r < kRhsWidth; r++) {
4985a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      const int index = l + kLhsWidth * r;
4986a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      if (!approx_equals(accum.data()[index], accum_reference.data()[index])) {
4987a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        std::cerr << "Arithmetic error in kernel:" << std::endl
4988a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                  << "    " << kernel_name << std::endl
4989a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                  << "Wrong accumulator for depth=" << depth << ", "
4990a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                  << "at l = " << l << ", r = " << r << std::endl;
4991a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        std::cerr << "reference value: " << accum_reference.data()[index]
4992a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                  << std::endl;
4993a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        std::cerr << "actual value:    " << accum.data()[index] << std::endl;
4994a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        if (depth <= 16) {
4995a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          std::cerr << "LHS matrix:" << std::endl;
4996a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          PrintMatrix(kLhsWidth, depth, 1, kLhsWidth, lhs.data());
4997a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          std::cerr << "RHS matrix:" << std::endl;
4998a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          PrintMatrix(depth, kRhsWidth, kRhsWidth, 1, rhs.data());
4999a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          std::cerr << "Initial Accumulator matrix:" << std::endl;
5000a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          PrintMatrix(kLhsWidth, kRhsWidth, 1, kLhsWidth, accum_initial.data());
5001a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          std::cerr << "Reference Accumulator matrix:" << std::endl;
5002a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          PrintMatrix(kLhsWidth, kRhsWidth, 1, kLhsWidth,
5003a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                      accum_reference.data());
5004a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          std::cerr << "Actual Accumulator matrix:" << std::endl;
5005a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          PrintMatrix(kLhsWidth, kRhsWidth, 1, kLhsWidth, accum.data());
5006a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        }
5007a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        abort();
5008a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      }
5009a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    }
5010a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
5011a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
5012a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
// Number of scalar arithmetic ops performed by one run of `Kernel` at the
// given depth, counting each multiply-accumulate as two ops.
template <typename Kernel>
int ops(int depth) {
  typedef typename Kernel::Format Format;
  const int multiply_accumulates =
      Format::Lhs::kWidth * Format::Rhs::kWidth * depth;
  return 2 * multiply_accumulates;
}
5018a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
// Returns the largest multiple of Modulus that is less than or equal to i.
// The arithmetic is done entirely in Integer, so negative inputs round
// toward negative infinity for every modulus (mixing a signed i with the
// unsigned Modulus would first convert i to unsigned).
template <unsigned Modulus, typename Integer>
Integer RoundDown(Integer i) {
  static_assert(Modulus != 0, "RoundDown requires a nonzero modulus");
  const Integer modulus = static_cast<Integer>(Modulus);
  Integer remainder = i % modulus;
  // For negative i, % yields a remainder in (-modulus, 0]; shift it into
  // [0, modulus) so that subtracting it always rounds down.
  if (remainder < 0) {
    remainder += modulus;
  }
  return i - remainder;
}
5023a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
5024a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangint CacheSizeInKB() {
5025a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  static const char* cache_size_k_env = getenv("CACHE_SIZE_KB");
5026a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  static const int cache_size_k =
5027a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      cache_size_k_env ? atoi(cache_size_k_env) : kDefaultCacheSizeK;
5028a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  return cache_size_k;
5029a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
5030a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
5031a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <typename Kernel>
5032a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangint BenchmarkDepthToFitInCache() {
5033a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  const int cache_size_bytes = 1024 * CacheSizeInKB();
5034a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
5035a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  // Subtract the typical size of a few cache lines, so
5036a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  // we don't need to worry too hard about e.g. some stack data.
5037a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  const int conservative_cache_size_bytes =
5038a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      cache_size_bytes - 2 * kCacheLineSize;
5039a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
5040a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  // We will subtract the memory occupied by accumulators.
5041a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef typename Kernel::AccumulatorType AccumulatorType;
5042a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  const int kAccumulatorBytes = sizeof(AccumulatorType) *
5043a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                Kernel::Format::Lhs::kWidth *
5044a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                Kernel::Format::Rhs::kWidth;
5045a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
5046a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  // Compute the depth.
5047a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef typename Kernel::OperandType OperandType;
5048a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  const int kBytesPerUnitOfDepth =
5049a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      sizeof(OperandType) *
5050a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      (Kernel::Format::Lhs::kWidth + Kernel::Format::Rhs::kWidth);
5051a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  const int unrounded_depth =
5052a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      (conservative_cache_size_bytes - kAccumulatorBytes) /
5053a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      kBytesPerUnitOfDepth;
5054a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
5055a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  // Cap depth, to avoid unfairly favoring narrower kernels
5056a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  const int kMaxDepth = 1024;
5057a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  const int clamped_unrounded_depth = std::min(kMaxDepth, unrounded_depth);
5058a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
5059a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  // Round depth down to a multiple of cache line size, which helps because
5060a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  // our kernels may crash if depth is not a multiple of the number of
5061a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  // depth level that they want to
5062a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  // handle at each loop iteration, and we don't want to require kernels
5063a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  // to be more complex. Currently all kernels process 1, 2 or 8 levels of
5064a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  // depth at a time. The main reason why that might increase in the future
5065a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  // is if registers get wider, but I don't suppose that register could
5066a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  // ever get wider than cache lines.
5067a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  return RoundDown<kCacheLineSize>(clamped_unrounded_depth);
5068a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
5069a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
// Returns a timestamp in seconds, intended for measuring elapsed intervals
// by subtracting two readings. It reads CLOCK_MONOTONIC rather than the
// wall clock, so adjustments to the system time cannot distort an interval.
// Aborts if the clock cannot be read.
double current_time_in_seconds() {
  timespec t;
  if (clock_gettime(CLOCK_MONOTONIC, &t) != 0) {
    std::cerr << "clock_gettime(CLOCK_MONOTONIC) failed" << std::endl;
    abort();
  }
  return t.tv_sec + 1e-9 * t.tv_nsec;
}
5075a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
5076a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <typename Kernel>
5077a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangdouble benchmark(int depth) {
5078a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  // Minimum duration for this benchmark to run. If the workload finishes
5079a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  // sooner, we retry with double the number of iterations.
5080a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  static const double min_benchmark_time_in_seconds = 1.0;
5081a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
5082a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef typename Kernel::OperandType OperandType;
5083a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef typename Kernel::AccumulatorType AccumulatorType;
5084a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
5085a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  CacheLineAlignedBuffer<OperandType> lhs(Kernel::Format::Lhs::kWidth * depth);
5086a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  CacheLineAlignedBuffer<OperandType> rhs(Kernel::Format::Rhs::kWidth * depth);
5087a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  CacheLineAlignedBuffer<AccumulatorType> accum(Kernel::Format::Lhs::kWidth *
5088a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                                Kernel::Format::Rhs::kWidth);
5089a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
5090a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  for (std::uint64_t iters_at_a_time = 1;; iters_at_a_time *= 2) {
5091a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    const double t_start = current_time_in_seconds();
5092a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    for (std::uint64_t i = 0; i < iters_at_a_time; i++) {
5093a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      Kernel::Run(lhs.data(), rhs.data(), accum.data(), depth);
5094a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    }
5095a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    const double t_end = current_time_in_seconds();
5096a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    const double elapsed = t_end - t_start;
5097a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    if (elapsed > min_benchmark_time_in_seconds) {
5098a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      return iters_at_a_time * ops<Kernel>(depth) / elapsed;
5099a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    }
5100a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
5101a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
5102a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
5103a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <typename Kernel>
5104a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangvoid benchmark_and_print_results(const char* kernel_name) {
5105a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  if (getenv("BENCHMARK_KERNEL")) {
5106a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    if (strcmp(getenv("BENCHMARK_KERNEL"), kernel_name)) {
5107a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      return;
5108a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    }
5109a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
5110a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  const int kKernelDepth = Kernel::Format::kDepth;
5111a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  for (int depth = kKernelDepth; depth <= 1024; depth += kKernelDepth) {
5112a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    test_kernel<Kernel>(depth, kernel_name);
5113a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
5114a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
5115a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  if (getenv("BENCHMARK_ALL_DEPTHS")) {
5116a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    for (int depth = kKernelDepth;
5117a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang         depth <= BenchmarkDepthToFitInCache<Kernel>(); depth *= 2) {
5118a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      std::cout << kernel_name << "," << depth << ","
5119a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                << benchmark<Kernel>(depth) * 1e-9f << std::endl;
5120a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    }
5121a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  } else {
5122a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    const int depth = BenchmarkDepthToFitInCache<Kernel>();
5123a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    std::cout << kernel_name << "," << benchmark<Kernel>(depth) * 1e-9f
5124a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang              << std::endl;
5125a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
5126a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
5127a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
// Tests and benchmarks the kernel type `Kernel`, using the spelled-out type
// name as the label in the printed results. Expands to a single expression;
// terminate the invocation with a semicolon.
#define BENCHMARK(Kernel) benchmark_and_print_results<Kernel>(#Kernel)
5132a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
// Prints the CSV header matching the selected output mode, then tests and
// benchmarks every kernel available for the target architecture.
int main() {
  // The per-depth mode emits an extra "depth" column.
  const bool all_depths = getenv("BENCHMARK_ALL_DEPTHS") != nullptr;
  std::cout << (all_depths ? "kernel,depth,Gop/s" : "kernel,Gop/s")
            << std::endl;

  // 32-bit ARM NEON kernels.
#ifdef __arm__
  BENCHMARK(NEON_32bit_GEMM_Int8Operands_AccumTwoWithin16Bits);
  BENCHMARK(NEON_32bit_GEMM_Int8Operands_AccumTwoWithin16Bits_intrinsics);
  BENCHMARK(NEON_32bit_GEMM_Uint8Operands_Uint32Accumulators);
  BENCHMARK(NEON_32bit_GEMM_Uint8Operands_Uint32Accumulators_intrinsics);
  BENCHMARK(NEON_32bit_GEMM_Uint8Operands_Uint32Accumulators_noexpand);
  BENCHMARK(NEON_32bit_GEMM_Int32_WithScalar);
  BENCHMARK(NEON_32bit_GEMM_Float32_MLA_WithVectorDuplicatingScalar);
#ifdef __ARM_FEATURE_FMA
  BENCHMARK(NEON_32bit_GEMM_Float32_FMA_WithVectorDuplicatingScalar);
#endif
  BENCHMARK(NEON_32bit_GEMM_Float32_MLA_WithScalar);
  BENCHMARK(NEON_32bit_GEMM_Float32_WithScalar_intrinsics);
  BENCHMARK(NEON_32bit_GEMM_Float32_WithScalar_A53);
  BENCHMARK(NEON_32bit_GEMM_Float32_WithScalar_A53_depth2);
  BENCHMARK(NEON_32bit_GEMM_Float32_MLA_Rotating);
#ifdef __ARM_FEATURE_FMA
  BENCHMARK(NEON_32bit_GEMM_Float32_FMA_Rotating);
#endif
#endif

  // 64-bit ARM NEON kernels.
#ifdef __aarch64__
  BENCHMARK(NEON_64bit_GEMM_Int8Operands_AccumTwoWithin16Bits);
  BENCHMARK(NEON_64bit_GEMM_Int8Operands_AccumTwoWithin16Bits_intrinsics);
  BENCHMARK(NEON_64bit_GEMM_Uint8Operands_Uint32Accumulators);
  BENCHMARK(NEON_64bit_GEMM_Uint8Operands_Uint32Accumulators_intrinsics);
  BENCHMARK(NEON_64bit_GEMM_Uint8Operands_Uint32Accumulators_noexpand_A57);
#ifdef __ARM_FEATURE_DOTPROD
  BENCHMARK(NEON_64bit_GEMM_Uint8Operands_Uint32Accumulators_dotproduct);
  BENCHMARK(NEON_64bit_GEMM_Uint8Operands_Uint32Accumulators_dotproduct_A55r1);
#endif
  BENCHMARK(NEON_64bit_GEMM_Int32_WithScalar);
  BENCHMARK(NEON_64bit_GEMM_Float32_WithVectorDuplicatingScalar);
  BENCHMARK(NEON_64bit_GEMM_Float32_WithScalar);
  BENCHMARK(NEON_64bit_GEMM_Float32_WithScalar_intrinsics);
  BENCHMARK(NEON_64bit_GEMM_Float32_WithScalar_A57);
#ifndef __APPLE__
  BENCHMARK(NEON_64bit_GEMM_Float32_WithScalar_A53);
#endif
  BENCHMARK(NEON_64bit_GEMM_Float32_WithScalar_A55r1);
#endif

  // MIPS MSA kernels.
#ifdef __mips
  BENCHMARK(MSA_GEMM_12x4_Uint8Operands_Uint32Accumulators_intrinsics);
  BENCHMARK(MSA_GEMM_12x4_Uint8Operands_Uint32Accumulators_assembly);
  BENCHMARK(MSA_GEMM_12x4_Uint8Operands_Uint32Accumulators_assembly2);
  BENCHMARK(MSA_GEMM_12x8_Uint8Operands_Uint32Accumulators_intrinsics);
  BENCHMARK(MSA_GEMM_12x8_Uint8Operands_Uint32Accumulators_assembly);
  BENCHMARK(MSA_GEMM_12x8_Uint8Operands_Uint32Accumulators_assembly2);
#endif

  return 0;
}
5193