// Copyright 2016 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
15a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifndef GEMMLOWP_META_QUANTIZED_MUL_KERNELS_ARM_32_H_
16a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#define GEMMLOWP_META_QUANTIZED_MUL_KERNELS_ARM_32_H_
17a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
18a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef GEMMLOWP_NEON_32
19a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
20a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#include <cassert>
21a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#include <cstdint>
22a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
23a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangnamespace gemmlowp {
24a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangnamespace meta {
25a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
26a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
27a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void
28a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao WangMulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 1,
29a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
30a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                       const FusedKernelParams<QuantizedStaticPreprocessed,
31a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                               RowMajor>& params,
32a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                       uint8_t* result) {
33a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
34a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
35a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
36a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
37a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessed, RowMajor, 1, 1, 8>::Multiply()"
38a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
39a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
40a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
41a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
42a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
43a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
44a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
45a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
46a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
47a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
48a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
49a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General NxM lanes loop.
50a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
51a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
52a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
53a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
54a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
55a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d2}, [%[lhs]:64]!\n"
56a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d3}, [%[rhs]:64]!\n"
57a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
58a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
59a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q2, d3, d2\n"
60a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q2\n"
61a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
62a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
63a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
64a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
65a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantization::Prepare
66a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
67a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
68a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q6, %[multiplicative_offset]\n"
69a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q7, %[rounding_offset]\n"
70a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q8, %[shift]\n"
71a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q4, d8[0]\n"
72a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
73a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
74a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
75a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
76a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
77a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d0\n"
78a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
79a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantization::Transform
80a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q4\n"
81a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q5\n"
82a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.i32 q0, q0, q6\n"
83a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.i32 q0, q0, q7\n"
84a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vshl.s32 q0, q0, q8\n"
85a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovn.s32 d0, q0\n"
86a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovun.s16 d0, q0\n"
87a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
88a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
89a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.8 {d0[0]}, [%[result]]!\n"
90a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
91a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
92a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
93a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [shift] "r"(params.kernel.shift),
94a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride),
95a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [rounding_offset] "r"(params.kernel.rounding_offset)
96a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "d0", "d1", "d2", "d3", "d4", "d5", "d8", "d9", "d10", "d11", "d12",
97a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d13", "d14", "d15", "d16", "d17", "cc", "memory");
98a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
99a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
100a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
101a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void
102a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao WangMulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 2,
103a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
104a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                       const FusedKernelParams<QuantizedStaticPreprocessed,
105a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                               RowMajor>& params,
106a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                       uint8_t* result) {
107a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
108a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
109a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
110a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
111a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessed, RowMajor, 1, 2, 8>::Multiply()"
112a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
113a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
114a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
115a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
116a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
117a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
118a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
119a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
120a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
121a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
122a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
123a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
124a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General NxM lanes loop.
125a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
126a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
127a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
128a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
129a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
130a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d4}, [%[lhs]:64]!\n"
131a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d5, d6}, [%[rhs]:64]!\n"
132a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
133a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
134a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q4, d5, d4\n"
135a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q5, d6, d4\n"
136a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q4\n"
137a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q5\n"
138a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
139a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
140a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
141a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
142a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantization::Prepare
143a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
144a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
145a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q6, %[multiplicative_offset]\n"
146a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q7, %[rounding_offset]\n"
147a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q8, %[shift]\n"
148a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q4, d8[0]\n"
149a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
150a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
151a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
152a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
153a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
154a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
155a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
156a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
157a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantization::Transform
158a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q4\n"
159a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q5\n"
160a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.i32 q0, q0, q6\n"
161a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.i32 q0, q0, q7\n"
162a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vshl.s32 q0, q0, q8\n"
163a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovn.s32 d0, q0\n"
164a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovun.s16 d0, q0\n"
165a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
166a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
167a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.16 {d0[0]}, [%[result]]!\n"
168a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
169a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
170a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
171a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [shift] "r"(params.kernel.shift),
172a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride),
173a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [rounding_offset] "r"(params.kernel.rounding_offset)
174a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d8", "d9", "d10", "d11",
175a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d12", "d13", "d14", "d15", "d16", "d17", "cc", "memory");
176a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
177a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
178a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
179a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void
180a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao WangMulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 3,
181a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
182a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                       const FusedKernelParams<QuantizedStaticPreprocessed,
183a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                               RowMajor>& params,
184a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                       uint8_t* result) {
185a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
186a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
187a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
188a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
189a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessed, RowMajor, 1, 3, 8>::Multiply()"
190a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
191a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
192a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
193a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
194a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
195a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
196a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
197a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
198a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
199a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
200a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
201a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
202a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
203a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General NxM lanes loop.
204a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
205a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
206a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
207a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
208a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
209a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d6}, [%[lhs]:64]!\n"
210a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d7, d8, d9}, [%[rhs]:64]!\n"
211a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
212a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
213a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q5, d7, d6\n"
214a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q6, d8, d6\n"
215a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q7, d9, d6\n"
216a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q5\n"
217a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q6\n"
218a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q7\n"
219a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
220a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
221a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
222a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
223a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantization::Prepare
224a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
225a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
226a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q6, %[multiplicative_offset]\n"
227a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q7, %[rounding_offset]\n"
228a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q8, %[shift]\n"
229a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q4, d8[0]\n"
230a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
231a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
232a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
233a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
234a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
235a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
236a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
237a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
238a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d1, d4, d4\n"
239a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
240a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantization::Transform
241a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q4\n"
242a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q5\n"
243a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.i32 q0, q0, q6\n"
244a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.i32 q0, q0, q7\n"
245a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vshl.s32 q0, q0, q8\n"
246a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovn.s32 d0, q0\n"
247a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovun.s16 d0, q0\n"
248a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
249a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
250a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.16 {d0[0]}, [%[result]]!\n"
251a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.8 {d0[2]}, [%[result]]!\n"
252a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
253a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
254a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
255a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [shift] "r"(params.kernel.shift),
256a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride),
257a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [rounding_offset] "r"(params.kernel.rounding_offset)
258a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
259a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "cc", "memory");
260a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
261a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
262a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
263a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void
264a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao WangMulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 4,
265a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
266a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                       const FusedKernelParams<QuantizedStaticPreprocessed,
267a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                               RowMajor>& params,
268a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                       uint8_t* result) {
269a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
270a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
271a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
272a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
273a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessed, RowMajor, 1, 4, 8>::Multiply()"
274a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
275a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
276a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
277a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
278a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
279a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
280a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
281a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
282a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
283a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
284a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
285a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
286a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q3, q0\n"
287a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
288a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General NxM lanes loop.
289a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
290a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
291a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
292a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
293a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
294a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d8}, [%[lhs]:64]!\n"
295a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d9, d10, d11, d12}, [%[rhs]:64]!\n"
296a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
297a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
298a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q7, d9, d8\n"
299a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q8, d10, d8\n"
300a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q9, d11, d8\n"
301a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q10, d12, d8\n"
302a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q7\n"
303a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q8\n"
304a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q9\n"
305a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q3, q10\n"
306a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
307a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
308a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
309a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
310a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantization::Prepare
311a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
312a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
313a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q6, %[multiplicative_offset]\n"
314a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q7, %[rounding_offset]\n"
315a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q8, %[shift]\n"
316a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q4, d8[0]\n"
317a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
318a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
319a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
320a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
321a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
322a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
323a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
324a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d7\n"
325a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
326a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d1, d4, d6\n"
327a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
328a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantization::Transform
329a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q4\n"
330a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q5\n"
331a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.i32 q0, q0, q6\n"
332a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.i32 q0, q0, q7\n"
333a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vshl.s32 q0, q0, q8\n"
334a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovn.s32 d0, q0\n"
335a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovun.s16 d0, q0\n"
336a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
337a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
338a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0[0]}, [%[result]]!\n"
339a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
340a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
341a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
342a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [shift] "r"(params.kernel.shift),
343a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride),
344a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [rounding_offset] "r"(params.kernel.rounding_offset)
345a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
346a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
347a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d21", "cc", "memory");
348a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
349a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
350a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
351a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void
352a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao WangMulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 5,
353a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
354a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                       const FusedKernelParams<QuantizedStaticPreprocessed,
355a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                               RowMajor>& params,
356a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                       uint8_t* result) {
357a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
358a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
359a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
360a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
361a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessed, RowMajor, 1, 5, 8>::Multiply()"
362a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
363a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
364a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
365a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
366a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
367a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
368a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
369a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
370a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
371a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
372a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
373a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
374a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q3, q0\n"
375a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q4, q1\n"
376a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
377a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General 1xM lanes loop.
378a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
379a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
380a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
381a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
382a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
383a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10, d11, d12, d13}, [%[rhs]:64]!\n"
384a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d14}, [%[lhs]:64]!\n"
385a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
386a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q8, d10, d14\n"
387a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q9, d11, d14\n"
388a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q10, d12, d14\n"
389a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q11, d13, d14\n"
390a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10}, [%[rhs]:64]!\n"
391a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #128]\n"
392a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q8\n"
393a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q9\n"
394a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q10\n"
395a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q3, q11\n"
396a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q8, d10, d14\n"
397a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q4, q8\n"
398a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
399a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
400a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
401a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
402a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantization::Prepare
403a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10, d11}, [%[lhs]:64]!\n"
404a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d12, d13, d14, d15}, [%[rhs]:64]!\n"
405a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q8, %[multiplicative_offset]\n"
406a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q9, %[rounding_offset]\n"
407a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q10, %[shift]\n"
408a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q5, d10[0]\n"
409a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
410a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
411a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
412a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
413a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
414a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
415a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
416a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d7\n"
417a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d8, d8, d9\n"
418a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
419a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d1, d4, d6\n"
420a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d8, d8\n"
421a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
422a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantization::Transform
423a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q5\n"
424a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q5\n"
425a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q6\n"
426a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q7\n"
427a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.i32 q0, q0, q8\n"
428a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.i32 q1, q1, q8\n"
429a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.i32 q0, q0, q9\n"
430a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.i32 q1, q1, q9\n"
431a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vshl.s32 q0, q0, q10\n"
432a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vshl.s32 q1, q1, q10\n"
433a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovn.s32 d0, q0\n"
434a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovn.s32 d1, q1\n"
435a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovun.s16 d0, q0\n"
436a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
437a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
438a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0[0]}, [%[result]]!\n"
439a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.8 {d0[4]}, [%[result]]!\n"
440a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
441a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
442a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
443a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [shift] "r"(params.kernel.shift),
444a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride),
445a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [rounding_offset] "r"(params.kernel.rounding_offset)
446a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
447a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
448a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d21", "d22", "d23", "cc", "memory");
449a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
450a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
451a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
452a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void
453a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao WangMulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 6,
454a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
455a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                       const FusedKernelParams<QuantizedStaticPreprocessed,
456a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                               RowMajor>& params,
457a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                       uint8_t* result) {
458a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
459a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
460a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
461a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
462a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessed, RowMajor, 1, 6, 8>::Multiply()"
463a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
464a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
465a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
466a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
467a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
468a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
469a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
470a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
471a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
472a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
473a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
474a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
475a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q3, q0\n"
476a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q4, q1\n"
477a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q5, q2\n"
478a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
479a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General 1xM lanes loop.
480a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
481a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
482a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
483a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
484a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
485a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d12, d13, d14, d15}, [%[rhs]:64]!\n"
486a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d16}, [%[lhs]:64]!\n"
487a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
488a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q9, d12, d16\n"
489a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q10, d13, d16\n"
490a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q11, d14, d16\n"
491a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d15, d16\n"
492a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d12, d13}, [%[rhs]:64]!\n"
493a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #128]\n"
494a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q9\n"
495a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q10\n"
496a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q11\n"
497a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q3, q12\n"
498a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q9, d12, d16\n"
499a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q10, d13, d16\n"
500a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q4, q9\n"
501a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q5, q10\n"
502a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
503a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
504a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
505a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
506a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantization::Prepare
507a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
508a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d14, d15, d16, d17}, [%[rhs]:64]!\n"
509a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q9, %[multiplicative_offset]\n"
510a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q10, %[rounding_offset]\n"
511a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q11, %[shift]\n"
512a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q6, d12[0]\n"
513a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
514a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
515a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
516a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
517a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
518a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
519a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
520a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d7\n"
521a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d8, d8, d9\n"
522a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d10, d10, d11\n"
523a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
524a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d1, d4, d6\n"
525a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d8, d10\n"
526a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
527a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantization::Transform
528a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q6\n"
529a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q6\n"
530a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q7\n"
531a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q8\n"
532a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.i32 q0, q0, q9\n"
533a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.i32 q1, q1, q9\n"
534a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.i32 q0, q0, q10\n"
535a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.i32 q1, q1, q10\n"
536a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vshl.s32 q0, q0, q11\n"
537a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vshl.s32 q1, q1, q11\n"
538a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovn.s32 d0, q0\n"
539a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovn.s32 d1, q1\n"
540a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovun.s16 d0, q0\n"
541a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
542a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
543a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0[0]}, [%[result]]!\n"
544a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.16 {d0[2]}, [%[result]]!\n"
545a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
546a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
547a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
548a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [shift] "r"(params.kernel.shift),
549a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride),
550a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [rounding_offset] "r"(params.kernel.rounding_offset)
551a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
552a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
553a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d21", "d22", "d23", "d24", "d25", "cc", "memory");
554a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
555a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
556a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
557a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void
558a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao WangMulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 7,
559a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
560a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                       const FusedKernelParams<QuantizedStaticPreprocessed,
561a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                               RowMajor>& params,
562a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                       uint8_t* result) {
563a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
564a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
565a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
566a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
567a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessed, RowMajor, 1, 7, 8>::Multiply()"
568a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
569a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
570a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
571a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
572a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
573a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
574a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
575a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
576a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
577a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
578a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
579a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
580a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q3, q0\n"
581a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q4, q1\n"
582a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q5, q2\n"
583a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q6, q3\n"
584a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
585a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General 1xM lanes loop.
586a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
587a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
588a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
589a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
590a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
591a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d14, d15, d16, d17}, [%[rhs]:64]!\n"
592a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d18}, [%[lhs]:64]!\n"
593a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
594a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q10, d14, d18\n"
595a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q11, d15, d18\n"
596a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d16, d18\n"
597a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q13, d17, d18\n"
598a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d14, d15, d16}, [%[rhs]:64]!\n"
599a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #128]\n"
600a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q10\n"
601a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q11\n"
602a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q12\n"
603a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q3, q13\n"
604a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q10, d14, d18\n"
605a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q11, d15, d18\n"
606a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d16, d18\n"
607a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q4, q10\n"
608a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q5, q11\n"
609a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q6, q12\n"
610a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
611a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
612a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
613a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
614a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantization::Prepare
615a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d14, d15}, [%[lhs]:64]!\n"
616a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d16, d17, d18, d19}, [%[rhs]:64]!\n"
617a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q10, %[multiplicative_offset]\n"
618a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q11, %[rounding_offset]\n"
619a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q12, %[shift]\n"
620a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q7, d14[0]\n"
621a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
622a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
623a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
624a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
625a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
626a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
627a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
628a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d7\n"
629a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d8, d8, d9\n"
630a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d10, d10, d11\n"
631a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d12, d12, d13\n"
632a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
633a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d1, d4, d6\n"
634a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d8, d10\n"
635a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d3, d12, d12\n"
636a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
637a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantization::Transform
638a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q7\n"
639a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q7\n"
640a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q8\n"
641a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q9\n"
642a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.i32 q0, q0, q10\n"
643a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.i32 q1, q1, q10\n"
644a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.i32 q0, q0, q11\n"
645a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.i32 q1, q1, q11\n"
646a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vshl.s32 q0, q0, q12\n"
647a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vshl.s32 q1, q1, q12\n"
648a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovn.s32 d0, q0\n"
649a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovn.s32 d1, q1\n"
650a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovun.s16 d0, q0\n"
651a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
652a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
653a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0[0]}, [%[result]]!\n"
654a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.16 {d0[2]}, [%[result]]!\n"
655a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.8 {d0[6]}, [%[result]]!\n"
656a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
657a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
658a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
659a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [shift] "r"(params.kernel.shift),
660a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride),
661a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [rounding_offset] "r"(params.kernel.rounding_offset)
662a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
663a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
664a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d21", "d22", "d23", "d24", "d25", "d26", "d27", "cc", "memory");
665a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
666a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
667a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
668a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void
669a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao WangMulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 8,
670a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
671a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                       const FusedKernelParams<QuantizedStaticPreprocessed,
672a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                               RowMajor>& params,
673a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                       uint8_t* result) {
674a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
675a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
676a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
677a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
678a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessed, RowMajor, 1, 8, 8>::Multiply()"
679a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
680a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
681a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
682a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
683a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
684a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
685a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
686a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
687a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
688a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
689a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
690a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
691a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q3, q0\n"
692a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q4, q1\n"
693a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q5, q2\n"
694a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q6, q3\n"
695a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q7, q4\n"
696a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
697a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // 1x8 lanes loop.
698a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
699a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
700a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d17, d18, d19, d20}, [%[rhs]:256]!\n"
701a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d16}, [%[lhs]:64]!\n"
702a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q11, d16, d17\n"
703a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d16, d18\n"
704a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q13, d16, d19\n"
705a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q14, d16, d20\n"
706a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d17, d18, d19, d20}, [%[rhs]:256]!\n"
707a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q11\n"
708a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q12\n"
709a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q13\n"
710a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q3, q14\n"
711a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #256]\n"
712a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q15, d16, d17\n"
713a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q11, d16, d18\n"
714a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d16, d19\n"
715a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q13, d16, d20\n"
716a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #32]\n"
717a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
718a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
719a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
720a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
721a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q4, q15\n"
722a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q5, q11\n"
723a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q6, q12\n"
724a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q7, q13\n"
725a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
726a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
727a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
728a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
729a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantization::Prepare
730a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d16, d17}, [%[lhs]:64]!\n"
731a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d18, d19, d20, d21}, [%[rhs]:64]!\n"
732a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q11, %[multiplicative_offset]\n"
733a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q12, %[rounding_offset]\n"
734a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q13, %[shift]\n"
735a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q8, d16[0]\n"
736a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
737a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
738a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
739a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
740a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
741a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
742a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
743a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d7\n"
744a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d8, d8, d9\n"
745a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d10, d10, d11\n"
746a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d12, d12, d13\n"
747a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d14, d14, d15\n"
748a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
749a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d1, d4, d6\n"
750a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d8, d10\n"
751a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d3, d12, d14\n"
752a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
753a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantization::Transform
754a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q8\n"
755a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q8\n"
756a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q9\n"
757a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q10\n"
758a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.i32 q0, q0, q11\n"
759a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.i32 q1, q1, q11\n"
760a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.i32 q0, q0, q12\n"
761a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.i32 q1, q1, q12\n"
762a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vshl.s32 q0, q0, q13\n"
763a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vshl.s32 q1, q1, q13\n"
764a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovn.s32 d0, q0\n"
765a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovn.s32 d1, q1\n"
766a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovun.s16 d0, q0\n"
767a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
768a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
769a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0}, [%[result]]!\n"
770a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
771a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
772a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
773a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [shift] "r"(params.kernel.shift),
774a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride),
775a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [rounding_offset] "r"(params.kernel.rounding_offset)
776a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
777a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
778a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
779a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d31", "cc", "memory");
780a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
781a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
782a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
783a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void
784a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao WangMulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 1,
785a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
786a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                       const FusedKernelParams<QuantizedStaticPreprocessed,
787a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                               RowMajor>& params,
788a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                       uint8_t* result) {
789a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
790a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
791a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
792a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
793a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessed, RowMajor, 2, 1, 8>::Multiply()"
794a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
795a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
796a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
797a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
798a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
799a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
800a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
801a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
802a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
803a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
804a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
805a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
806a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General NxM lanes loop.
807a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
808a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
809a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
810a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
811a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
812a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d4, d5}, [%[lhs]:64]!\n"
813a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d6}, [%[rhs]:64]!\n"
814a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
815a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
816a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q4, d6, d4\n"
817a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q5, d6, d5\n"
818a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q4\n"
819a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q5\n"
820a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
821a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
822a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
823a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
824a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantization::Prepare
825a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
826a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
827a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q6, %[multiplicative_offset]\n"
828a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q7, %[rounding_offset]\n"
829a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q8, %[shift]\n"
830a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q2, d8[0]\n"
831a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q4, d8[1]\n"
832a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
833a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
834a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "add r0, %[result], %[stride]\n"
835a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
836a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
837a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
838a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d0\n"
839a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
840a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d2\n"
841a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
842a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantization::Transform
843a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q2\n"
844a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q4\n"
845a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q5\n"
846a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q5\n"
847a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.i32 q0, q0, q6\n"
848a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.i32 q1, q1, q6\n"
849a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.i32 q0, q0, q7\n"
850a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.i32 q1, q1, q7\n"
851a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vshl.s32 q0, q0, q8\n"
852a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vshl.s32 q1, q1, q8\n"
853a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovn.s32 d0, q0\n"
854a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovn.s32 d2, q1\n"
855a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovun.s16 d0, q0\n"
856a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovun.s16 d2, q1\n"
857a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
858a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
859a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.8 {d0[0]}, [%[result]]!\n"
860a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.8 {d2[0]}, [r0]!\n"
861a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
862a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
863a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
864a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [shift] "r"(params.kernel.shift),
865a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride),
866a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [rounding_offset] "r"(params.kernel.rounding_offset)
867a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d8", "d9", "d10",
868a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "cc", "memory");
869a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
870a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
871a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
872a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void
873a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao WangMulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 2,
874a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
875a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                       const FusedKernelParams<QuantizedStaticPreprocessed,
876a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                               RowMajor>& params,
877a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                       uint8_t* result) {
878a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
879a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
880a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
881a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
882a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessed, RowMajor, 2, 2, 8>::Multiply()"
883a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
884a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
885a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
886a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
887a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
888a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
889a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
890a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
891a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
892a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
893a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
894a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
895a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q3, q0\n"
896a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
897a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General NxM lanes loop.
898a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
899a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
900a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
901a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
902a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
903a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
904a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
905a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
906a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
907a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q6, d10, d8\n"
908a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q7, d11, d8\n"
909a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q8, d10, d9\n"
910a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q9, d11, d9\n"
911a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q6\n"
912a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q7\n"
913a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q8\n"
914a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q3, q9\n"
915a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
916a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
917a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
918a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
919a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantization::Prepare
920a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
921a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
922a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q6, %[multiplicative_offset]\n"
923a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q7, %[rounding_offset]\n"
924a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q8, %[shift]\n"
925a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q9, d8[0]\n"
926a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q4, d8[1]\n"
927a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
928a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
929a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "add r0, %[result], %[stride]\n"
930a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
931a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
932a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
933a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
934a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
935a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
936a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d7\n"
937a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d6\n"
938a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
939a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantization::Transform
940a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q9\n"
941a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q2, q2, q4\n"
942a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q5\n"
943a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q2, q2, q5\n"
944a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.i32 q0, q0, q6\n"
945a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.i32 q2, q2, q6\n"
946a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.i32 q0, q0, q7\n"
947a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.i32 q2, q2, q7\n"
948a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vshl.s32 q0, q0, q8\n"
949a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vshl.s32 q2, q2, q8\n"
950a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovn.s32 d0, q0\n"
951a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovn.s32 d4, q2\n"
952a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovun.s16 d0, q0\n"
953a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovun.s16 d4, q2\n"
954a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
955a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
956a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.16 {d0[0]}, [%[result]]!\n"
957a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.16 {d4[0]}, [r0]!\n"
958a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
959a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
960a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
961a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [shift] "r"(params.kernel.shift),
962a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride),
963a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [rounding_offset] "r"(params.kernel.rounding_offset)
964a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
965a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "cc",
966a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "memory");
967a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
968a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
969a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
970a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void
971a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao WangMulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 3,
972a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
973a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                       const FusedKernelParams<QuantizedStaticPreprocessed,
974a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                               RowMajor>& params,
975a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                       uint8_t* result) {
976a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
977a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
978a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
979a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
980a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessed, RowMajor, 2, 3, 8>::Multiply()"
981a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
982a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
983a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
984a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
985a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
986a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
987a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
988a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
989a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
990a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
991a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
992a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
993a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q3, q0\n"
994a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q4, q1\n"
995a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q5, q2\n"
996a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
997a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General NxM lanes loop.
998a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
999a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1000a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
1001a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
1002a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1003a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
1004a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d14, d15, d16}, [%[rhs]:64]!\n"
1005a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
1006a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
1007a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q9, d14, d12\n"
1008a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q10, d15, d12\n"
1009a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q11, d16, d12\n"
1010a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d14, d13\n"
1011a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q13, d15, d13\n"
1012a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q14, d16, d13\n"
1013a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q9\n"
1014a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q10\n"
1015a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q11\n"
1016a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q3, q12\n"
1017a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q4, q13\n"
1018a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q5, q14\n"
1019a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1020a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
1021a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
1022a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1023a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantization::Prepare
1024a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
1025a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d14, d15}, [%[rhs]:64]!\n"
1026a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q8, %[multiplicative_offset]\n"
1027a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q9, %[rounding_offset]\n"
1028a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q10, %[shift]\n"
1029a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q11, d12[0]\n"
1030a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q6, d12[1]\n"
1031a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1032a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
1033a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "add r0, %[result], %[stride]\n"
1034a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1035a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
1036a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
1037a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
1038a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
1039a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
1040a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d1, d4, d4\n"
1041a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d7\n"
1042a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d8, d8, d9\n"
1043a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d10, d10, d11\n"
1044a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d8\n"
1045a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d7, d10, d10\n"
1046a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1047a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantization::Transform
1048a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q11\n"
1049a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q3, q3, q6\n"
1050a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q7\n"
1051a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q3, q3, q7\n"
1052a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.i32 q0, q0, q8\n"
1053a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.i32 q3, q3, q8\n"
1054a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.i32 q0, q0, q9\n"
1055a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.i32 q3, q3, q9\n"
1056a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vshl.s32 q0, q0, q10\n"
1057a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vshl.s32 q3, q3, q10\n"
1058a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovn.s32 d0, q0\n"
1059a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovn.s32 d6, q3\n"
1060a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovun.s16 d0, q0\n"
1061a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovun.s16 d6, q3\n"
1062a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1063a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
1064a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.16 {d0[0]}, [%[result]]!\n"
1065a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.8 {d0[2]}, [%[result]]!\n"
1066a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.16 {d6[0]}, [r0]!\n"
1067a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.8 {d6[2]}, [r0]!\n"
1068a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
1069a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
1070a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
1071a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [shift] "r"(params.kernel.shift),
1072a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride),
1073a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [rounding_offset] "r"(params.kernel.rounding_offset)
1074a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
1075a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
1076a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "cc",
1077a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "memory");
1078a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
1079a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1080a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
1081a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void
1082a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao WangMulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 4,
1083a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
1084a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                       const FusedKernelParams<QuantizedStaticPreprocessed,
1085a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                               RowMajor>& params,
1086a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                       uint8_t* result) {
1087a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
1088a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
1089a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
1090a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
1091a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessed, RowMajor, 2, 4, 8>::Multiply()"
1092a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
1093a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
1094a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
1095a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
1096a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
1097a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
1098a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
1099a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1100a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
1101a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
1102a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
1103a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
1104a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q3, q0\n"
1105a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q4, q1\n"
1106a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q5, q2\n"
1107a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q6, q3\n"
1108a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q7, q4\n"
1109a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1110a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // 2x4 lanes loop.
1111a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
1112a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1113a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.8 {d18, d19, d20, d21}, [%[rhs]:256]!\n"
1114a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.8 {d16}, [%[lhs]:64]!\n"
1115a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q11, d16, d18\n"
1116a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.8 {d17}, [%[lhs]:64]!\n"
1117a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d16, d19\n"
1118a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
1119a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q13, d16, d20\n"
1120a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
1121a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q14, d16, d21\n"
1122a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q15, d17, d18\n"
1123a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q11\n"
1124a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q12\n"
1125a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q13\n"
1126a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q11, d17, d19\n"
1127a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d17, d20\n"
1128a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q13, d17, d21\n"
1129a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1130a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
1131a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
1132a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1133a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q3, q14\n"
1134a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q4, q15\n"
1135a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q5, q11\n"
1136a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q6, q12\n"
1137a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q7, q13\n"
1138a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1139a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
1140a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
1141a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1142a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantization::Prepare
1143a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d16, d17}, [%[lhs]:64]!\n"
1144a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d18, d19}, [%[rhs]:64]!\n"
1145a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q10, %[multiplicative_offset]\n"
1146a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q11, %[rounding_offset]\n"
1147a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q12, %[shift]\n"
1148a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q13, d16[0]\n"
1149a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q8, d16[1]\n"
1150a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1151a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
1152a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "add r0, %[result], %[stride]\n"
1153a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1154a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
1155a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
1156a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
1157a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
1158a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d7\n"
1159a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
1160a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d1, d4, d6\n"
1161a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d8, d8, d9\n"
1162a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d10, d10, d11\n"
1163a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d12, d12, d13\n"
1164a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d14, d14, d15\n"
1165a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d8, d8, d10\n"
1166a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d9, d12, d14\n"
1167a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1168a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantization::Transform
1169a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q13\n"
1170a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q4, q4, q8\n"
1171a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q9\n"
1172a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q4, q4, q9\n"
1173a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.i32 q0, q0, q10\n"
1174a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.i32 q4, q4, q10\n"
1175a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.i32 q0, q0, q11\n"
1176a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.i32 q4, q4, q11\n"
1177a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vshl.s32 q0, q0, q12\n"
1178a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vshl.s32 q4, q4, q12\n"
1179a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovn.s32 d0, q0\n"
1180a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovn.s32 d8, q4\n"
1181a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovun.s16 d0, q0\n"
1182a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovun.s16 d8, q4\n"
1183a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1184a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
1185a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0[0]}, [%[result]]!\n"
1186a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d8[0]}, [r0]!\n"
1187a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
1188a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
1189a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
1190a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [shift] "r"(params.kernel.shift),
1191a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride),
1192a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [rounding_offset] "r"(params.kernel.rounding_offset)
1193a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
1194a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
1195a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
1196a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d31", "cc", "memory");
1197a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
1198a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1199a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
1200a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void
1201a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao WangMulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 3, 1,
1202a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
1203a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                       const FusedKernelParams<QuantizedStaticPreprocessed,
1204a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                               RowMajor>& params,
1205a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                       uint8_t* result) {
1206a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
1207a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
1208a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
1209a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
1210a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessed, RowMajor, 3, 1, 8>::Multiply()"
1211a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
1212a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
1213a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
1214a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
1215a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
1216a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
1217a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
1218a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1219a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
1220a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
1221a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
1222a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
1223a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1224a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General NxM lanes loop.
1225a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
1226a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1227a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
1228a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
1229a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1230a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d6, d7, d8}, [%[lhs]:64]!\n"
1231a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d9}, [%[rhs]:64]!\n"
1232a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
1233a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
1234a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q5, d9, d6\n"
1235a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q6, d9, d7\n"
1236a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q7, d9, d8\n"
1237a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q5\n"
1238a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q6\n"
1239a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q7\n"
1240a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1241a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
1242a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
1243a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1244a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantization::Prepare
1245a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
1246a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
1247a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q6, %[multiplicative_offset]\n"
1248a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q7, %[rounding_offset]\n"
1249a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q8, %[shift]\n"
1250a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q3, d8[0]\n"
1251a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q9, d8[1]\n"
1252a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q4, d9[0]\n"
1253a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1254a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
1255a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "add r0, %[result], %[stride]\n"
1256a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "add r1, r0, %[stride]\n"
1257a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1258a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
1259a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
1260a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d0\n"
1261a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
1262a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d2\n"
1263a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
1264a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d4\n"
1265a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1266a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantization::Transform
1267a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q3\n"
1268a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q9\n"
1269a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q2, q2, q4\n"
1270a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q5\n"
1271a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q5\n"
1272a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q2, q2, q5\n"
1273a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.i32 q0, q0, q6\n"
1274a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.i32 q1, q1, q6\n"
1275a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.i32 q2, q2, q6\n"
1276a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.i32 q0, q0, q7\n"
1277a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.i32 q1, q1, q7\n"
1278a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.i32 q2, q2, q7\n"
1279a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vshl.s32 q0, q0, q8\n"
1280a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vshl.s32 q1, q1, q8\n"
1281a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vshl.s32 q2, q2, q8\n"
1282a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovn.s32 d0, q0\n"
1283a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovn.s32 d2, q1\n"
1284a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovn.s32 d4, q2\n"
1285a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovun.s16 d0, q0\n"
1286a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovun.s16 d2, q1\n"
1287a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovun.s16 d4, q2\n"
1288a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1289a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
1290a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.8 {d0[0]}, [%[result]]!\n"
1291a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.8 {d2[0]}, [r0]!\n"
1292a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.8 {d4[0]}, [r1]!\n"
1293a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
1294a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
1295a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
1296a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [shift] "r"(params.kernel.shift),
1297a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride),
1298a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [rounding_offset] "r"(params.kernel.rounding_offset)
1299a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
1300a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
1301a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "cc", "memory");
1302a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
1303a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1304a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
1305a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void
1306a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao WangMulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 3, 2,
1307a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
1308a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                       const FusedKernelParams<QuantizedStaticPreprocessed,
1309a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                               RowMajor>& params,
1310a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                       uint8_t* result) {
1311a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
1312a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
1313a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
1314a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
1315a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessed, RowMajor, 3, 2, 8>::Multiply()"
1316a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
1317a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
1318a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
1319a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
1320a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
1321a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
1322a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
1323a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1324a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
1325a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
1326a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
1327a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
1328a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q3, q0\n"
1329a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q4, q1\n"
1330a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q5, q2\n"
1331a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1332a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General NxM lanes loop.
1333a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
1334a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1335a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
1336a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
1337a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1338a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d12, d13, d14}, [%[lhs]:64]!\n"
1339a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d15, d16}, [%[rhs]:64]!\n"
1340a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
1341a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
1342a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q9, d15, d12\n"
1343a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q10, d16, d12\n"
1344a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q11, d15, d13\n"
1345a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d16, d13\n"
1346a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q13, d15, d14\n"
1347a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q14, d16, d14\n"
1348a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q9\n"
1349a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q10\n"
1350a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q11\n"
1351a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q3, q12\n"
1352a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q4, q13\n"
1353a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q5, q14\n"
1354a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1355a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
1356a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
1357a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1358a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantization::Prepare
1359a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
1360a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d14, d15}, [%[rhs]:64]!\n"
1361a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q8, %[multiplicative_offset]\n"
1362a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q9, %[rounding_offset]\n"
1363a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q10, %[shift]\n"
1364a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q11, d12[0]\n"
1365a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q12, d12[1]\n"
1366a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q6, d13[0]\n"
1367a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1368a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
1369a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "add r0, %[result], %[stride]\n"
1370a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "add r1, r0, %[stride]\n"
1371a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1372a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
1373a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
1374a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
1375a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
1376a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
1377a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d7\n"
1378a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d6\n"
1379a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d8, d8, d9\n"
1380a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d10, d10, d11\n"
1381a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d8, d8, d10\n"
1382a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1383a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantization::Transform
1384a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q11\n"
1385a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q2, q2, q12\n"
1386a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q4, q4, q6\n"
1387a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q7\n"
1388a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q2, q2, q7\n"
1389a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q4, q4, q7\n"
1390a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.i32 q0, q0, q8\n"
1391a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.i32 q2, q2, q8\n"
1392a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.i32 q4, q4, q8\n"
1393a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.i32 q0, q0, q9\n"
1394a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.i32 q2, q2, q9\n"
1395a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.i32 q4, q4, q9\n"
1396a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vshl.s32 q0, q0, q10\n"
1397a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vshl.s32 q2, q2, q10\n"
1398a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vshl.s32 q4, q4, q10\n"
1399a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovn.s32 d0, q0\n"
1400a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovn.s32 d4, q2\n"
1401a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovn.s32 d8, q4\n"
1402a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovun.s16 d0, q0\n"
1403a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovun.s16 d4, q2\n"
1404a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovun.s16 d8, q4\n"
1405a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1406a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
1407a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.16 {d0[0]}, [%[result]]!\n"
1408a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.16 {d4[0]}, [r0]!\n"
1409a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.16 {d8[0]}, [r1]!\n"
1410a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
1411a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
1412a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
1413a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [shift] "r"(params.kernel.shift),
1414a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride),
1415a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [rounding_offset] "r"(params.kernel.rounding_offset)
1416a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
1417a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
1418a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
1419a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "cc", "memory");
1420a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
1421a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1422a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
1423a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void
1424a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao WangMulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 3, 3,
1425a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
1426a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                       const FusedKernelParams<QuantizedStaticPreprocessed,
1427a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                               RowMajor>& params,
1428a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                       uint8_t* result) {
1429a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
1430a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
1431a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
1432a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
1433a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessed, RowMajor, 3, 3, 8>::Multiply()"
1434a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
1435a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
1436a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
1437a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
1438a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
1439a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
1440a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
1441a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1442a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
1443a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
1444a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
1445a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
1446a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q3, q0\n"
1447a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q4, q1\n"
1448a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q5, q2\n"
1449a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q6, q3\n"
1450a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q7, q4\n"
1451a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q8, q5\n"
1452a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1453a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // 3x3 lanes loop.
1454a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
1455a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1456a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.8 {d21, d22, d23}, [%[rhs]:64]!\n"
1457a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.8 {d18}, [%[lhs]:64]!\n"
1458a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d18, d21\n"
1459a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.8 {d19}, [%[lhs]:64]!\n"
1460a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q13, d18, d22\n"
1461a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.8 {d20}, [%[lhs]:64]!\n"
1462a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q14, d18, d23\n"
1463a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
1464a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q15, d19, d21\n"
1465a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
1466a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q12\n"
1467a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q13\n"
1468a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q14\n"
1469a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q3, q15\n"
1470a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d19, d22\n"
1471a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q13, d19, d23\n"
1472a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q14, d20, d21\n"
1473a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q15, d20, d22\n"
1474a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1475a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
1476a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
1477a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1478a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q9, d20, d23\n"
1479a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q4, q12\n"
1480a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q5, q13\n"
1481a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q6, q14\n"
1482a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q7, q15\n"
1483a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q8, q9\n"
1484a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1485a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
1486a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
1487a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1488a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantization::Prepare
1489a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d18, d19}, [%[lhs]:64]!\n"
1490a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d20, d21}, [%[rhs]:64]!\n"
1491a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q11, %[multiplicative_offset]\n"
1492a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q12, %[rounding_offset]\n"
1493a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q13, %[shift]\n"
1494a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q14, d18[0]\n"
1495a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q15, d18[1]\n"
1496a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q9, d19[0]\n"
1497a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1498a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
1499a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "add r0, %[result], %[stride]\n"
1500a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "add r1, r0, %[stride]\n"
1501a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1502a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
1503a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
1504a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
1505a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
1506a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
1507a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d1, d4, d4\n"
1508a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d7\n"
1509a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d8, d8, d9\n"
1510a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d10, d10, d11\n"
1511a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d8\n"
1512a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d7, d10, d10\n"
1513a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d12, d12, d13\n"
1514a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d14, d14, d15\n"
1515a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d16, d16, d17\n"
1516a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d12, d12, d14\n"
1517a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d13, d16, d16\n"
1518a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1519a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantization::Transform
1520a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q14\n"
1521a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q3, q3, q15\n"
1522a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q6, q6, q9\n"
1523a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q10\n"
1524a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q3, q3, q10\n"
1525a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q6, q6, q10\n"
1526a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.i32 q0, q0, q11\n"
1527a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.i32 q3, q3, q11\n"
1528a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.i32 q6, q6, q11\n"
1529a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.i32 q0, q0, q12\n"
1530a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.i32 q3, q3, q12\n"
1531a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.i32 q6, q6, q12\n"
1532a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vshl.s32 q0, q0, q13\n"
1533a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vshl.s32 q3, q3, q13\n"
1534a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vshl.s32 q6, q6, q13\n"
1535a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovn.s32 d0, q0\n"
1536a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovn.s32 d6, q3\n"
1537a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovn.s32 d12, q6\n"
1538a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovun.s16 d0, q0\n"
1539a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovun.s16 d6, q3\n"
1540a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vqmovun.s16 d12, q6\n"
1541a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1542a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
1543a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.16 {d0[0]}, [%[result]]!\n"
1544a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.8 {d0[2]}, [%[result]]!\n"
1545a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.16 {d6[0]}, [r0]!\n"
1546a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.8 {d6[2]}, [r0]!\n"
1547a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.16 {d12[0]}, [r1]!\n"
1548a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.8 {d12[2]}, [r1]!\n"
1549a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
1550a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
1551a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
1552a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [shift] "r"(params.kernel.shift),
1553a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride),
1554a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [rounding_offset] "r"(params.kernel.rounding_offset)
1555a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
1556a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
1557a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
1558a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d30", "d31", "cc", "memory");
1559a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
1560a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1561a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
1562a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MulKernel<
1563a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 1,
1564a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
1565a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
1566a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                         RowMajor>& params,
1567a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 int32_t* result) {
1568a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
1569a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
1570a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
1571a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
1572a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 1, "
1573a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "8>::Multiply()"
1574a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
1575a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
1576a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
1577a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
1578a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
1579a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
1580a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
1581a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1582a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
1583a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
1584a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1585a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General NxM lanes loop.
1586a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
1587a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1588a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
1589a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
1590a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1591a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d2}, [%[lhs]:64]!\n"
1592a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d3}, [%[rhs]:64]!\n"
1593a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
1594a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
1595a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q2, d3, d2\n"
1596a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q2\n"
1597a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1598a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
1599a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
1600a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1601a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationInt32::Prepare
1602a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
1603a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
1604a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q4, d8[0]\n"
1605a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1606a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
1607a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1608a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
1609a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
1610a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d0\n"
1611a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1612a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationInt32::Transform
1613a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q4\n"
1614a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q5\n"
1615a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1616a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
1617a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0[0]}, [%[result]]!\n"
1618a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
1619a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
1620a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride)
1621a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "d0", "d1", "d2", "d3", "d4", "d5", "d8", "d9", "d10", "d11", "cc",
1622a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "memory");
1623a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
1624a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1625a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
1626a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MulKernel<
1627a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 2,
1628a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
1629a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
1630a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                         RowMajor>& params,
1631a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 int32_t* result) {
1632a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
1633a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
1634a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
1635a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
1636a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 2, "
1637a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "8>::Multiply()"
1638a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
1639a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
1640a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
1641a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
1642a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
1643a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
1644a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
1645a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1646a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
1647a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
1648a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
1649a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1650a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General NxM lanes loop.
1651a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
1652a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1653a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
1654a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
1655a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1656a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d4}, [%[lhs]:64]!\n"
1657a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d5, d6}, [%[rhs]:64]!\n"
1658a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
1659a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
1660a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q4, d5, d4\n"
1661a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q5, d6, d4\n"
1662a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q4\n"
1663a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q5\n"
1664a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1665a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
1666a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
1667a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1668a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationInt32::Prepare
1669a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
1670a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
1671a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q4, d8[0]\n"
1672a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1673a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
1674a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1675a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
1676a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
1677a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
1678a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
1679a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1680a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationInt32::Transform
1681a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q4\n"
1682a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q5\n"
1683a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1684a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
1685a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0}, [%[result]]!\n"
1686a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
1687a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
1688a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride)
1689a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d8", "d9", "d10", "d11",
1690a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "cc", "memory");
1691a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
1692a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1693a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
1694a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MulKernel<
1695a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 3,
1696a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
1697a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
1698a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                         RowMajor>& params,
1699a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 int32_t* result) {
1700a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
1701a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
1702a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
1703a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
1704a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 3, "
1705a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "8>::Multiply()"
1706a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
1707a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
1708a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
1709a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
1710a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
1711a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
1712a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
1713a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1714a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
1715a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
1716a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
1717a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
1718a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1719a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General NxM lanes loop.
1720a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
1721a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1722a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
1723a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
1724a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1725a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d6}, [%[lhs]:64]!\n"
1726a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d7, d8, d9}, [%[rhs]:64]!\n"
1727a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
1728a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
1729a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q5, d7, d6\n"
1730a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q6, d8, d6\n"
1731a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q7, d9, d6\n"
1732a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q5\n"
1733a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q6\n"
1734a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q7\n"
1735a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1736a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
1737a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
1738a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1739a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationInt32::Prepare
1740a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
1741a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
1742a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q4, d8[0]\n"
1743a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1744a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
1745a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1746a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
1747a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
1748a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
1749a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
1750a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
1751a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d1, d4, d4\n"
1752a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1753a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationInt32::Transform
1754a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q4\n"
1755a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q5\n"
1756a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1757a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
1758a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0}, [%[result]]!\n"
1759a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d1[0]}, [%[result]]!\n"
1760a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
1761a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
1762a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride)
1763a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
1764a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d11", "d12", "d13", "d14", "d15", "cc", "memory");
1765a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
1766a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1767a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
1768a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MulKernel<
1769a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 4,
1770a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
1771a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
1772a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                         RowMajor>& params,
1773a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 int32_t* result) {
1774a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
1775a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
1776a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
1777a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
1778a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 4, "
1779a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "8>::Multiply()"
1780a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
1781a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
1782a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
1783a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
1784a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
1785a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
1786a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
1787a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1788a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
1789a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
1790a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
1791a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
1792a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q3, q0\n"
1793a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1794a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General NxM lanes loop.
1795a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
1796a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1797a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
1798a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
1799a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1800a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d8}, [%[lhs]:64]!\n"
1801a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d9, d10, d11, d12}, [%[rhs]:64]!\n"
1802a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
1803a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
1804a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q7, d9, d8\n"
1805a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q8, d10, d8\n"
1806a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q9, d11, d8\n"
1807a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q10, d12, d8\n"
1808a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q7\n"
1809a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q8\n"
1810a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q9\n"
1811a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q3, q10\n"
1812a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1813a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
1814a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
1815a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1816a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationInt32::Prepare
1817a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
1818a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
1819a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q4, d8[0]\n"
1820a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1821a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
1822a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1823a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
1824a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
1825a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
1826a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
1827a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d7\n"
1828a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
1829a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d1, d4, d6\n"
1830a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1831a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationInt32::Transform
1832a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q4\n"
1833a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q5\n"
1834a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1835a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
1836a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0, d1}, [%[result]]!\n"
1837a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
1838a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
1839a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride)
1840a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
1841a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d11", "d12", "d14", "d15", "d16", "d17", "d18", "d19", "d20", "d21",
1842a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "cc", "memory");
1843a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
1844a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1845a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
1846a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MulKernel<
1847a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 5,
1848a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
1849a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
1850a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                         RowMajor>& params,
1851a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 int32_t* result) {
1852a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
1853a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
1854a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
1855a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
1856a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 5, "
1857a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "8>::Multiply()"
1858a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
1859a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
1860a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
1861a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
1862a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
1863a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
1864a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
1865a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1866a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
1867a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
1868a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
1869a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
1870a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q3, q0\n"
1871a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q4, q1\n"
1872a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1873a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General 1xM lanes loop.
1874a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
1875a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1876a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
1877a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
1878a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1879a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10, d11, d12, d13}, [%[rhs]:64]!\n"
1880a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d14}, [%[lhs]:64]!\n"
1881a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
1882a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q8, d10, d14\n"
1883a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q9, d11, d14\n"
1884a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q10, d12, d14\n"
1885a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q11, d13, d14\n"
1886a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10}, [%[rhs]:64]!\n"
1887a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #128]\n"
1888a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q8\n"
1889a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q9\n"
1890a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q10\n"
1891a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q3, q11\n"
1892a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q8, d10, d14\n"
1893a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q4, q8\n"
1894a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1895a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
1896a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
1897a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1898a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationInt32::Prepare
1899a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10, d11}, [%[lhs]:64]!\n"
1900a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d12, d13, d14, d15}, [%[rhs]:64]!\n"
1901a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q5, d10[0]\n"
1902a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1903a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
1904a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1905a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
1906a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
1907a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
1908a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
1909a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d7\n"
1910a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d8, d8, d9\n"
1911a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
1912a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d1, d4, d6\n"
1913a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d8, d8\n"
1914a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1915a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationInt32::Transform
1916a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q5\n"
1917a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q5\n"
1918a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q6\n"
1919a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q7\n"
1920a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1921a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
1922a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0, d1}, [%[result]]!\n"
1923a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d2[0]}, [%[result]]!\n"
1924a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
1925a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
1926a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride)
1927a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
1928a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
1929a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d21", "d22", "d23", "cc", "memory");
1930a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
1931a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1932a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
1933a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MulKernel<
1934a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 6,
1935a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
1936a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
1937a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                         RowMajor>& params,
1938a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 int32_t* result) {
1939a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
1940a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
1941a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
1942a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
1943a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 6, "
1944a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "8>::Multiply()"
1945a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
1946a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
1947a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
1948a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
1949a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
1950a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
1951a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
1952a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1953a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
1954a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
1955a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
1956a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
1957a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q3, q0\n"
1958a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q4, q1\n"
1959a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q5, q2\n"
1960a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1961a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General 1xM lanes loop.
1962a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
1963a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1964a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
1965a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
1966a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1967a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d12, d13, d14, d15}, [%[rhs]:64]!\n"
1968a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d16}, [%[lhs]:64]!\n"
1969a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
1970a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q9, d12, d16\n"
1971a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q10, d13, d16\n"
1972a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q11, d14, d16\n"
1973a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d15, d16\n"
1974a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d12, d13}, [%[rhs]:64]!\n"
1975a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #128]\n"
1976a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q9\n"
1977a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q10\n"
1978a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q11\n"
1979a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q3, q12\n"
1980a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q9, d12, d16\n"
1981a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q10, d13, d16\n"
1982a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q4, q9\n"
1983a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q5, q10\n"
1984a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1985a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
1986a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
1987a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1988a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationInt32::Prepare
1989a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
1990a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d14, d15, d16, d17}, [%[rhs]:64]!\n"
1991a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q6, d12[0]\n"
1992a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1993a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
1994a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
1995a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
1996a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
1997a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
1998a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
1999a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d7\n"
2000a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d8, d8, d9\n"
2001a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d10, d10, d11\n"
2002a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
2003a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d1, d4, d6\n"
2004a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d8, d10\n"
2005a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2006a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationInt32::Transform
2007a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q6\n"
2008a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q6\n"
2009a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q7\n"
2010a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q8\n"
2011a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2012a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
2013a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0, d1, d2}, [%[result]]!\n"
2014a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2015a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
2016a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride)
2017a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
2018a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
2019a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d21", "d22", "d23", "d24", "d25", "cc", "memory");
2020a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
2021a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2022a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
2023a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MulKernel<
2024a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 7,
2025a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2026a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
2027a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                         RowMajor>& params,
2028a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 int32_t* result) {
2029a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
2030a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
2031a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
2032a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
2033a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 7, "
2034a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "8>::Multiply()"
2035a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
2036a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
2037a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
2038a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
2039a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
2040a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
2041a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
2042a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2043a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
2044a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
2045a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
2046a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
2047a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q3, q0\n"
2048a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q4, q1\n"
2049a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q5, q2\n"
2050a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q6, q3\n"
2051a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2052a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General 1xM lanes loop.
2053a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
2054a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2055a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
2056a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
2057a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2058a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d14, d15, d16, d17}, [%[rhs]:64]!\n"
2059a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d18}, [%[lhs]:64]!\n"
2060a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
2061a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q10, d14, d18\n"
2062a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q11, d15, d18\n"
2063a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d16, d18\n"
2064a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q13, d17, d18\n"
2065a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d14, d15, d16}, [%[rhs]:64]!\n"
2066a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #128]\n"
2067a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q10\n"
2068a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q11\n"
2069a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q12\n"
2070a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q3, q13\n"
2071a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q10, d14, d18\n"
2072a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q11, d15, d18\n"
2073a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d16, d18\n"
2074a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q4, q10\n"
2075a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q5, q11\n"
2076a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q6, q12\n"
2077a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2078a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
2079a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
2080a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2081a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationInt32::Prepare
2082a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d14, d15}, [%[lhs]:64]!\n"
2083a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d16, d17, d18, d19}, [%[rhs]:64]!\n"
2084a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q7, d14[0]\n"
2085a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2086a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
2087a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2088a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
2089a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
2090a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
2091a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
2092a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d7\n"
2093a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d8, d8, d9\n"
2094a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d10, d10, d11\n"
2095a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d12, d12, d13\n"
2096a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
2097a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d1, d4, d6\n"
2098a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d8, d10\n"
2099a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d3, d12, d12\n"
2100a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2101a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationInt32::Transform
2102a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q7\n"
2103a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q7\n"
2104a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q8\n"
2105a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q9\n"
2106a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2107a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
2108a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0, d1, d2}, [%[result]]!\n"
2109a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d3[0]}, [%[result]]!\n"
2110a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2111a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
2112a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride)
2113a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
2114a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
2115a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d21", "d22", "d23", "d24", "d25", "d26", "d27", "cc", "memory");
2116a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
2117a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2118a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
2119a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MulKernel<
2120a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 8,
2121a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2122a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
2123a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                         RowMajor>& params,
2124a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 int32_t* result) {
2125a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
2126a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
2127a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
2128a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
2129a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 8, "
2130a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "8>::Multiply()"
2131a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
2132a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
2133a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
2134a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
2135a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
2136a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
2137a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
2138a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2139a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
2140a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
2141a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
2142a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
2143a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q3, q0\n"
2144a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q4, q1\n"
2145a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q5, q2\n"
2146a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q6, q3\n"
2147a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q7, q4\n"
2148a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2149a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // 1x8 lanes loop.
2150a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
2151a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2152a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d17, d18, d19, d20}, [%[rhs]:256]!\n"
2153a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d16}, [%[lhs]:64]!\n"
2154a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q11, d16, d17\n"
2155a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d16, d18\n"
2156a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q13, d16, d19\n"
2157a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q14, d16, d20\n"
2158a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d17, d18, d19, d20}, [%[rhs]:256]!\n"
2159a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q11\n"
2160a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q12\n"
2161a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q13\n"
2162a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q3, q14\n"
2163a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #256]\n"
2164a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q15, d16, d17\n"
2165a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q11, d16, d18\n"
2166a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d16, d19\n"
2167a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q13, d16, d20\n"
2168a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #32]\n"
2169a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2170a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
2171a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
2172a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2173a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q4, q15\n"
2174a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q5, q11\n"
2175a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q6, q12\n"
2176a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q7, q13\n"
2177a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2178a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
2179a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
2180a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2181a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationInt32::Prepare
2182a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d16, d17}, [%[lhs]:64]!\n"
2183a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d18, d19, d20, d21}, [%[rhs]:64]!\n"
2184a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q8, d16[0]\n"
2185a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2186a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
2187a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2188a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
2189a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
2190a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
2191a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
2192a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d7\n"
2193a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d8, d8, d9\n"
2194a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d10, d10, d11\n"
2195a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d12, d12, d13\n"
2196a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d14, d14, d15\n"
2197a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
2198a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d1, d4, d6\n"
2199a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d8, d10\n"
2200a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d3, d12, d14\n"
2201a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2202a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationInt32::Transform
2203a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q8\n"
2204a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q8\n"
2205a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q9\n"
2206a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q10\n"
2207a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2208a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
2209a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0, d1, d2, d3}, [%[result]]!\n"
2210a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2211a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
2212a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride)
2213a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
2214a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
2215a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
2216a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d31", "cc", "memory");
2217a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
2218a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2219a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
2220a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MulKernel<
2221a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 1,
2222a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2223a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
2224a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                         RowMajor>& params,
2225a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 int32_t* result) {
2226a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
2227a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
2228a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
2229a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
2230a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 1, "
2231a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "8>::Multiply()"
2232a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
2233a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
2234a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
2235a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
2236a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
2237a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
2238a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
2239a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2240a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
2241a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
2242a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
2243a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2244a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General NxM lanes loop.
2245a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
2246a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2247a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
2248a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
2249a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2250a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d4, d5}, [%[lhs]:64]!\n"
2251a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d6}, [%[rhs]:64]!\n"
2252a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
2253a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
2254a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q4, d6, d4\n"
2255a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q5, d6, d5\n"
2256a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q4\n"
2257a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q5\n"
2258a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2259a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
2260a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
2261a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2262a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationInt32::Prepare
2263a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
2264a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
2265a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q2, d8[0]\n"
2266a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q4, d8[1]\n"
2267a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2268a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
2269a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "add r0, %[result], %[stride]\n"
2270a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2271a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
2272a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
2273a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d0\n"
2274a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
2275a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d2\n"
2276a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2277a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationInt32::Transform
2278a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q2\n"
2279a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q4\n"
2280a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q5\n"
2281a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q5\n"
2282a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2283a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
2284a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0[0]}, [%[result]]!\n"
2285a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d2[0]}, [r0]!\n"
2286a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2287a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
2288a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride)
2289a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d8", "d9", "d10",
2290a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d11", "cc", "memory");
2291a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
2292a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2293a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
2294a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MulKernel<
2295a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 2,
2296a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2297a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
2298a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                         RowMajor>& params,
2299a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 int32_t* result) {
2300a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
2301a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
2302a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
2303a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
2304a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 2, "
2305a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "8>::Multiply()"
2306a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
2307a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
2308a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
2309a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
2310a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
2311a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
2312a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
2313a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2314a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
2315a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
2316a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
2317a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
2318a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q3, q0\n"
2319a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2320a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General NxM lanes loop.
2321a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
2322a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2323a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
2324a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
2325a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2326a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
2327a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
2328a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
2329a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
2330a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q6, d10, d8\n"
2331a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q7, d11, d8\n"
2332a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q8, d10, d9\n"
2333a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q9, d11, d9\n"
2334a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q6\n"
2335a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q7\n"
2336a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q8\n"
2337a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q3, q9\n"
2338a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2339a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
2340a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
2341a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2342a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationInt32::Prepare
2343a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
2344a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
2345a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q6, d8[0]\n"
2346a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q4, d8[1]\n"
2347a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2348a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
2349a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "add r0, %[result], %[stride]\n"
2350a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2351a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
2352a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
2353a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
2354a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
2355a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
2356a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d7\n"
2357a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d6\n"
2358a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2359a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationInt32::Transform
2360a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q6\n"
2361a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q2, q2, q4\n"
2362a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q5\n"
2363a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q2, q2, q5\n"
2364a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2365a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
2366a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0}, [%[result]]!\n"
2367a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d4}, [r0]!\n"
2368a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2369a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
2370a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride)
2371a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
2372a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "cc",
2373a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "memory");
2374a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
2375a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2376a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
2377a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MulKernel<
2378a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 3,
2379a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2380a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
2381a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                         RowMajor>& params,
2382a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 int32_t* result) {
2383a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
2384a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
2385a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
2386a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
2387a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 3, "
2388a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "8>::Multiply()"
2389a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
2390a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
2391a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
2392a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
2393a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
2394a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
2395a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
2396a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2397a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
2398a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
2399a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
2400a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
2401a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q3, q0\n"
2402a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q4, q1\n"
2403a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q5, q2\n"
2404a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2405a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General NxM lanes loop.
2406a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
2407a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2408a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
2409a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
2410a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2411a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
2412a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d14, d15, d16}, [%[rhs]:64]!\n"
2413a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
2414a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
2415a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q9, d14, d12\n"
2416a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q10, d15, d12\n"
2417a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q11, d16, d12\n"
2418a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d14, d13\n"
2419a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q13, d15, d13\n"
2420a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q14, d16, d13\n"
2421a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q9\n"
2422a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q10\n"
2423a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q11\n"
2424a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q3, q12\n"
2425a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q4, q13\n"
2426a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q5, q14\n"
2427a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2428a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
2429a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
2430a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2431a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationInt32::Prepare
2432a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
2433a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d14, d15}, [%[rhs]:64]!\n"
2434a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q8, d12[0]\n"
2435a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q6, d12[1]\n"
2436a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2437a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
2438a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "add r0, %[result], %[stride]\n"
2439a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2440a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
2441a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
2442a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
2443a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
2444a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
2445a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d1, d4, d4\n"
2446a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d7\n"
2447a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d8, d8, d9\n"
2448a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d10, d10, d11\n"
2449a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d8\n"
2450a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d7, d10, d10\n"
2451a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2452a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationInt32::Transform
2453a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q8\n"
2454a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q3, q3, q6\n"
2455a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q7\n"
2456a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q3, q3, q7\n"
2457a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2458a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
2459a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0}, [%[result]]!\n"
2460a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d1[0]}, [%[result]]!\n"
2461a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d6}, [r0]!\n"
2462a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d7[0]}, [r0]!\n"
2463a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2464a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
2465a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride)
2466a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
2467a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
2468a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "cc",
2469a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "memory");
2470a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
2471a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2472a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
2473a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MulKernel<
2474a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 4,
2475a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2476a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
2477a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                         RowMajor>& params,
2478a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 int32_t* result) {
2479a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
2480a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
2481a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
2482a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
2483a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 4, "
2484a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "8>::Multiply()"
2485a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
2486a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
2487a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
2488a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
2489a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
2490a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
2491a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
2492a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2493a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
2494a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
2495a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
2496a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
2497a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q3, q0\n"
2498a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q4, q1\n"
2499a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q5, q2\n"
2500a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q6, q3\n"
2501a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q7, q4\n"
2502a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2503a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // 2x4 lanes loop.
2504a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
2505a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2506a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.8 {d18, d19, d20, d21}, [%[rhs]:256]!\n"
2507a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.8 {d16}, [%[lhs]:64]!\n"
2508a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q11, d16, d18\n"
2509a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.8 {d17}, [%[lhs]:64]!\n"
2510a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d16, d19\n"
2511a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
2512a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q13, d16, d20\n"
2513a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
2514a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q14, d16, d21\n"
2515a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q15, d17, d18\n"
2516a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q11\n"
2517a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q12\n"
2518a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q13\n"
2519a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q11, d17, d19\n"
2520a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d17, d20\n"
2521a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q13, d17, d21\n"
2522a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2523a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
2524a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
2525a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2526a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q3, q14\n"
2527a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q4, q15\n"
2528a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q5, q11\n"
2529a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q6, q12\n"
2530a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q7, q13\n"
2531a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2532a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
2533a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
2534a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2535a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationInt32::Prepare
2536a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d16, d17}, [%[lhs]:64]!\n"
2537a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d18, d19}, [%[rhs]:64]!\n"
2538a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q10, d16[0]\n"
2539a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q8, d16[1]\n"
2540a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2541a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
2542a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "add r0, %[result], %[stride]\n"
2543a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2544a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
2545a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
2546a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
2547a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
2548a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d7\n"
2549a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
2550a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d1, d4, d6\n"
2551a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d8, d8, d9\n"
2552a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d10, d10, d11\n"
2553a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d12, d12, d13\n"
2554a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d14, d14, d15\n"
2555a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d8, d8, d10\n"
2556a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d9, d12, d14\n"
2557a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2558a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationInt32::Transform
2559a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q10\n"
2560a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q4, q4, q8\n"
2561a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q9\n"
2562a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q4, q4, q9\n"
2563a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2564a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
2565a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0, d1}, [%[result]]!\n"
2566a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d8, d9}, [r0]!\n"
2567a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2568a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
2569a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride)
2570a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
2571a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
2572a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
2573a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d31", "cc", "memory");
2574a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
2575a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2576a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
2577a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MulKernel<
2578a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 1,
2579a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2580a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
2581a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                         RowMajor>& params,
2582a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 int32_t* result) {
2583a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
2584a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
2585a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
2586a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
2587a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 1, "
2588a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "8>::Multiply()"
2589a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
2590a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
2591a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
2592a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
2593a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
2594a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
2595a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
2596a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2597a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
2598a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
2599a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
2600a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
2601a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2602a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General NxM lanes loop.
2603a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
2604a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2605a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
2606a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
2607a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2608a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d6, d7, d8}, [%[lhs]:64]!\n"
2609a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d9}, [%[rhs]:64]!\n"
2610a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
2611a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
2612a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q5, d9, d6\n"
2613a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q6, d9, d7\n"
2614a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q7, d9, d8\n"
2615a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q5\n"
2616a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q6\n"
2617a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q7\n"
2618a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2619a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
2620a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
2621a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2622a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationInt32::Prepare
2623a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
2624a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
2625a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q3, d8[0]\n"
2626a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q6, d8[1]\n"
2627a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q4, d9[0]\n"
2628a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2629a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
2630a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "add r0, %[result], %[stride]\n"
2631a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "add r1, r0, %[stride]\n"
2632a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2633a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
2634a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
2635a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d0\n"
2636a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
2637a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d2\n"
2638a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
2639a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d4\n"
2640a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2641a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationInt32::Transform
2642a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q3\n"
2643a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q6\n"
2644a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q2, q2, q4\n"
2645a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q5\n"
2646a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q5\n"
2647a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q2, q2, q5\n"
2648a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2649a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
2650a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0[0]}, [%[result]]!\n"
2651a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d2[0]}, [r0]!\n"
2652a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d4[0]}, [r1]!\n"
2653a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2654a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
2655a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride)
2656a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
2657a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d10", "d11", "d12", "d13", "d14", "d15", "cc", "memory");
2658a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
2659a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2660a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
2661a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MulKernel<
2662a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 2,
2663a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2664a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
2665a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                         RowMajor>& params,
2666a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 int32_t* result) {
2667a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
2668a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
2669a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
2670a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
2671a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 2, "
2672a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "8>::Multiply()"
2673a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
2674a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
2675a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
2676a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
2677a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
2678a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
2679a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
2680a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2681a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
2682a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
2683a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
2684a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
2685a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q3, q0\n"
2686a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q4, q1\n"
2687a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q5, q2\n"
2688a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2689a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General NxM lanes loop.
2690a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
2691a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2692a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
2693a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
2694a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2695a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d12, d13, d14}, [%[lhs]:64]!\n"
2696a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d15, d16}, [%[rhs]:64]!\n"
2697a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
2698a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
2699a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q9, d15, d12\n"
2700a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q10, d16, d12\n"
2701a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q11, d15, d13\n"
2702a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d16, d13\n"
2703a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q13, d15, d14\n"
2704a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q14, d16, d14\n"
2705a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q9\n"
2706a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q10\n"
2707a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q11\n"
2708a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q3, q12\n"
2709a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q4, q13\n"
2710a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q5, q14\n"
2711a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2712a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
2713a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
2714a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2715a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationInt32::Prepare
2716a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
2717a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d14, d15}, [%[rhs]:64]!\n"
2718a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q8, d12[0]\n"
2719a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q9, d12[1]\n"
2720a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q6, d13[0]\n"
2721a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2722a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
2723a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "add r0, %[result], %[stride]\n"
2724a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "add r1, r0, %[stride]\n"
2725a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2726a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
2727a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
2728a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
2729a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
2730a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
2731a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d7\n"
2732a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d6\n"
2733a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d8, d8, d9\n"
2734a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d10, d10, d11\n"
2735a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d8, d8, d10\n"
2736a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2737a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationInt32::Transform
2738a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q8\n"
2739a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q2, q2, q9\n"
2740a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q4, q4, q6\n"
2741a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q7\n"
2742a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q2, q2, q7\n"
2743a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q4, q4, q7\n"
2744a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2745a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
2746a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0}, [%[result]]!\n"
2747a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d4}, [r0]!\n"
2748a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d8}, [r1]!\n"
2749a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2750a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
2751a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride)
2752a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
2753a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
2754a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
2755a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "cc", "memory");
2756a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
2757a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2758a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
2759a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MulKernel<
2760a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 3,
2761a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2762a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
2763a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                         RowMajor>& params,
2764a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 int32_t* result) {
2765a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
2766a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
2767a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
2768a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
2769a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 3, "
2770a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "8>::Multiply()"
2771a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
2772a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
2773a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
2774a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
2775a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
2776a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
2777a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
2778a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2779a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
2780a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
2781a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
2782a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
2783a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q3, q0\n"
2784a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q4, q1\n"
2785a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q5, q2\n"
2786a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q6, q3\n"
2787a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q7, q4\n"
2788a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q8, q5\n"
2789a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2790a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // 3x3 lanes loop.
2791a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
2792a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2793a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.8 {d21, d22, d23}, [%[rhs]:64]!\n"
2794a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.8 {d18}, [%[lhs]:64]!\n"
2795a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d18, d21\n"
2796a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.8 {d19}, [%[lhs]:64]!\n"
2797a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q13, d18, d22\n"
2798a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.8 {d20}, [%[lhs]:64]!\n"
2799a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q14, d18, d23\n"
2800a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
2801a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q15, d19, d21\n"
2802a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
2803a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q12\n"
2804a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q13\n"
2805a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q14\n"
2806a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q3, q15\n"
2807a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d19, d22\n"
2808a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q13, d19, d23\n"
2809a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q14, d20, d21\n"
2810a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q15, d20, d22\n"
2811a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2812a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
2813a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
2814a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2815a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q9, d20, d23\n"
2816a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q4, q12\n"
2817a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q5, q13\n"
2818a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q6, q14\n"
2819a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q7, q15\n"
2820a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q8, q9\n"
2821a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2822a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
2823a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
2824a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2825a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationInt32::Prepare
2826a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d18, d19}, [%[lhs]:64]!\n"
2827a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d20, d21}, [%[rhs]:64]!\n"
2828a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q11, d18[0]\n"
2829a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q12, d18[1]\n"
2830a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q9, d19[0]\n"
2831a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2832a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
2833a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "add r0, %[result], %[stride]\n"
2834a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "add r1, r0, %[stride]\n"
2835a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2836a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
2837a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
2838a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
2839a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
2840a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
2841a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d1, d4, d4\n"
2842a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d7\n"
2843a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d8, d8, d9\n"
2844a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d10, d10, d11\n"
2845a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d8\n"
2846a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d7, d10, d10\n"
2847a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d12, d12, d13\n"
2848a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d14, d14, d15\n"
2849a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d16, d16, d17\n"
2850a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d12, d12, d14\n"
2851a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d13, d16, d16\n"
2852a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2853a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationInt32::Transform
2854a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q11\n"
2855a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q3, q3, q12\n"
2856a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q6, q6, q9\n"
2857a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q10\n"
2858a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q3, q3, q10\n"
2859a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q6, q6, q10\n"
2860a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2861a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
2862a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0}, [%[result]]!\n"
2863a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d1[0]}, [%[result]]!\n"
2864a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d6}, [r0]!\n"
2865a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d7[0]}, [r0]!\n"
2866a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d12}, [r1]!\n"
2867a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d13[0]}, [r1]!\n"
2868a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2869a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
2870a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride)
2871a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
2872a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
2873a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
2874a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d30", "d31", "cc", "memory");
2875a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
2876a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2877a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
2878a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MulKernel<
2879a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 1,
2880a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2881a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
2882a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                         RowMajor>& params,
2883a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 float* result) {
2884a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
2885a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
2886a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
2887a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
2888a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 1, "
2889a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "8>::Multiply()"
2890a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
2891a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
2892a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
2893a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
2894a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
2895a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
2896a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
2897a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2898a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
2899a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
2900a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2901a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General NxM lanes loop.
2902a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
2903a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2904a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
2905a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
2906a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2907a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d2}, [%[lhs]:64]!\n"
2908a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d3}, [%[rhs]:64]!\n"
2909a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
2910a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
2911a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q2, d3, d2\n"
2912a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q2\n"
2913a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2914a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
2915a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
2916a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2917a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationFloat::Prepare
2918a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
2919a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
2920a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q6, %[scale]\n"
2921a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q4, d8[0]\n"
2922a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2923a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
2924a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2925a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
2926a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
2927a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d0\n"
2928a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2929a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationFloat::Transform
2930a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q4\n"
2931a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q5\n"
2932a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vcvt.f32.s32 q0, q0\n"
2933a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.f32 q0, q0, q6\n"
2934a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2935a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
2936a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0[0]}, [%[result]]!\n"
2937a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2938a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
2939a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride),
2940a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [scale] "r"(params.kernel.scale)
2941a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "d0", "d1", "d2", "d3", "d4", "d5", "d8", "d9", "d10", "d11", "d12",
2942a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d13", "cc", "memory");
2943a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
2944a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2945a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
2946a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MulKernel<
2947a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 2,
2948a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2949a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
2950a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                         RowMajor>& params,
2951a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 float* result) {
2952a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
2953a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
2954a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
2955a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
2956a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 2, "
2957a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "8>::Multiply()"
2958a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
2959a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
2960a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
2961a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
2962a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
2963a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
2964a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
2965a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2966a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
2967a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
2968a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
2969a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2970a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General NxM lanes loop.
2971a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
2972a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2973a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
2974a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
2975a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2976a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d4}, [%[lhs]:64]!\n"
2977a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d5, d6}, [%[rhs]:64]!\n"
2978a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
2979a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
2980a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q4, d5, d4\n"
2981a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q5, d6, d4\n"
2982a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q4\n"
2983a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q5\n"
2984a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2985a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
2986a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
2987a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2988a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationFloat::Prepare
2989a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
2990a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
2991a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q6, %[scale]\n"
2992a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q4, d8[0]\n"
2993a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2994a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
2995a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
2996a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
2997a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
2998a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
2999a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
3000a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3001a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationFloat::Transform
3002a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q4\n"
3003a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q5\n"
3004a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vcvt.f32.s32 q0, q0\n"
3005a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.f32 q0, q0, q6\n"
3006a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3007a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
3008a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0}, [%[result]]!\n"
3009a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3010a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
3011a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride),
3012a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [scale] "r"(params.kernel.scale)
3013a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d8", "d9", "d10", "d11",
3014a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d12", "d13", "cc", "memory");
3015a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
3016a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3017a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
3018a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MulKernel<
3019a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 3,
3020a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3021a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3022a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                         RowMajor>& params,
3023a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 float* result) {
3024a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
3025a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
3026a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
3027a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3028a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 3, "
3029a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "8>::Multiply()"
3030a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
3031a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
3032a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
3033a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
3034a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
3035a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
3036a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
3037a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3038a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
3039a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
3040a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
3041a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
3042a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3043a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General NxM lanes loop.
3044a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
3045a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3046a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
3047a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
3048a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3049a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d6}, [%[lhs]:64]!\n"
3050a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d7, d8, d9}, [%[rhs]:64]!\n"
3051a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
3052a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
3053a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q5, d7, d6\n"
3054a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q6, d8, d6\n"
3055a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q7, d9, d6\n"
3056a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q5\n"
3057a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q6\n"
3058a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q7\n"
3059a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3060a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
3061a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
3062a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3063a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationFloat::Prepare
3064a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
3065a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
3066a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q6, %[scale]\n"
3067a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q4, d8[0]\n"
3068a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3069a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
3070a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3071a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
3072a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
3073a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
3074a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
3075a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
3076a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d1, d4, d4\n"
3077a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3078a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationFloat::Transform
3079a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q4\n"
3080a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q5\n"
3081a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vcvt.f32.s32 q0, q0\n"
3082a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.f32 q0, q0, q6\n"
3083a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3084a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
3085a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0}, [%[result]]!\n"
3086a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d1[0]}, [%[result]]!\n"
3087a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3088a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
3089a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride),
3090a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [scale] "r"(params.kernel.scale)
3091a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
3092a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d11", "d12", "d13", "d14", "d15", "cc", "memory");
3093a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
3094a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3095a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
3096a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MulKernel<
3097a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 4,
3098a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3099a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3100a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                         RowMajor>& params,
3101a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 float* result) {
3102a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
3103a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
3104a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
3105a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3106a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 4, "
3107a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "8>::Multiply()"
3108a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
3109a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
3110a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
3111a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
3112a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
3113a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
3114a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
3115a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3116a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
3117a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
3118a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
3119a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
3120a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q3, q0\n"
3121a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3122a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General NxM lanes loop.
3123a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
3124a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3125a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
3126a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
3127a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3128a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d8}, [%[lhs]:64]!\n"
3129a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d9, d10, d11, d12}, [%[rhs]:64]!\n"
3130a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
3131a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
3132a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q7, d9, d8\n"
3133a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q8, d10, d8\n"
3134a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q9, d11, d8\n"
3135a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q10, d12, d8\n"
3136a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q7\n"
3137a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q8\n"
3138a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q9\n"
3139a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q3, q10\n"
3140a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3141a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
3142a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
3143a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3144a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationFloat::Prepare
3145a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
3146a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
3147a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q6, %[scale]\n"
3148a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q4, d8[0]\n"
3149a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3150a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
3151a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3152a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
3153a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
3154a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
3155a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
3156a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d7\n"
3157a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
3158a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d1, d4, d6\n"
3159a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3160a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationFloat::Transform
3161a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q4\n"
3162a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q5\n"
3163a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vcvt.f32.s32 q0, q0\n"
3164a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.f32 q0, q0, q6\n"
3165a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3166a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
3167a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0, d1}, [%[result]]!\n"
3168a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3169a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
3170a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride),
3171a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [scale] "r"(params.kernel.scale)
3172a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
3173a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
3174a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d21", "cc", "memory");
3175a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
3176a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3177a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
3178a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MulKernel<
3179a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 5,
3180a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3181a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3182a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                         RowMajor>& params,
3183a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 float* result) {
3184a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
3185a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
3186a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
3187a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3188a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 5, "
3189a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "8>::Multiply()"
3190a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
3191a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
3192a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
3193a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
3194a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
3195a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
3196a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
3197a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3198a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
3199a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
3200a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
3201a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
3202a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q3, q0\n"
3203a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q4, q1\n"
3204a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3205a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General 1xM lanes loop.
3206a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
3207a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3208a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
3209a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
3210a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3211a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10, d11, d12, d13}, [%[rhs]:64]!\n"
3212a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d14}, [%[lhs]:64]!\n"
3213a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
3214a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q8, d10, d14\n"
3215a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q9, d11, d14\n"
3216a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q10, d12, d14\n"
3217a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q11, d13, d14\n"
3218a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10}, [%[rhs]:64]!\n"
3219a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #128]\n"
3220a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q8\n"
3221a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q9\n"
3222a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q10\n"
3223a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q3, q11\n"
3224a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q8, d10, d14\n"
3225a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q4, q8\n"
3226a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3227a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
3228a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
3229a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3230a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationFloat::Prepare
3231a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10, d11}, [%[lhs]:64]!\n"
3232a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d12, d13, d14, d15}, [%[rhs]:64]!\n"
3233a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q8, %[scale]\n"
3234a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q5, d10[0]\n"
3235a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3236a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
3237a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3238a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
3239a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
3240a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
3241a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
3242a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d7\n"
3243a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d8, d8, d9\n"
3244a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
3245a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d1, d4, d6\n"
3246a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d8, d8\n"
3247a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3248a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationFloat::Transform
3249a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q5\n"
3250a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q5\n"
3251a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q6\n"
3252a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q7\n"
3253a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vcvt.f32.s32 q0, q0\n"
3254a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vcvt.f32.s32 q1, q1\n"
3255a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.f32 q0, q0, q8\n"
3256a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.f32 q1, q1, q8\n"
3257a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3258a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
3259a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0, d1}, [%[result]]!\n"
3260a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d2[0]}, [%[result]]!\n"
3261a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3262a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
3263a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride),
3264a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [scale] "r"(params.kernel.scale)
3265a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
3266a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
3267a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d21", "d22", "d23", "cc", "memory");
3268a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
3269a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3270a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
3271a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MulKernel<
3272a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 6,
3273a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3274a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3275a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                         RowMajor>& params,
3276a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 float* result) {
3277a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
3278a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
3279a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
3280a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3281a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 6, "
3282a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "8>::Multiply()"
3283a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
3284a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
3285a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
3286a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
3287a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
3288a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
3289a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
3290a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3291a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
3292a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
3293a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
3294a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
3295a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q3, q0\n"
3296a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q4, q1\n"
3297a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q5, q2\n"
3298a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3299a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General 1xM lanes loop.
3300a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
3301a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3302a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
3303a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
3304a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3305a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d12, d13, d14, d15}, [%[rhs]:64]!\n"
3306a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d16}, [%[lhs]:64]!\n"
3307a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
3308a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q9, d12, d16\n"
3309a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q10, d13, d16\n"
3310a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q11, d14, d16\n"
3311a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d15, d16\n"
3312a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d12, d13}, [%[rhs]:64]!\n"
3313a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #128]\n"
3314a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q9\n"
3315a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q10\n"
3316a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q11\n"
3317a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q3, q12\n"
3318a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q9, d12, d16\n"
3319a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q10, d13, d16\n"
3320a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q4, q9\n"
3321a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q5, q10\n"
3322a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3323a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
3324a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
3325a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3326a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationFloat::Prepare
3327a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
3328a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d14, d15, d16, d17}, [%[rhs]:64]!\n"
3329a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q9, %[scale]\n"
3330a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q6, d12[0]\n"
3331a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3332a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
3333a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3334a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
3335a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
3336a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
3337a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
3338a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d7\n"
3339a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d8, d8, d9\n"
3340a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d10, d10, d11\n"
3341a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
3342a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d1, d4, d6\n"
3343a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d8, d10\n"
3344a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3345a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationFloat::Transform
3346a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q6\n"
3347a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q6\n"
3348a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q7\n"
3349a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q8\n"
3350a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vcvt.f32.s32 q0, q0\n"
3351a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vcvt.f32.s32 q1, q1\n"
3352a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.f32 q0, q0, q9\n"
3353a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.f32 q1, q1, q9\n"
3354a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3355a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
3356a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0, d1, d2}, [%[result]]!\n"
3357a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3358a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
3359a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride),
3360a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [scale] "r"(params.kernel.scale)
3361a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
3362a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
3363a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d21", "d22", "d23", "d24", "d25", "cc", "memory");
3364a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
3365a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3366a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
3367a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MulKernel<
3368a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 7,
3369a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3370a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3371a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                         RowMajor>& params,
3372a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 float* result) {
3373a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
3374a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
3375a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
3376a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3377a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 7, "
3378a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "8>::Multiply()"
3379a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
3380a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
3381a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
3382a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
3383a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
3384a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
3385a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
3386a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3387a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
3388a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
3389a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
3390a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
3391a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q3, q0\n"
3392a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q4, q1\n"
3393a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q5, q2\n"
3394a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q6, q3\n"
3395a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3396a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General 1xM lanes loop.
3397a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
3398a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3399a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
3400a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
3401a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3402a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d14, d15, d16, d17}, [%[rhs]:64]!\n"
3403a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d18}, [%[lhs]:64]!\n"
3404a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
3405a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q10, d14, d18\n"
3406a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q11, d15, d18\n"
3407a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d16, d18\n"
3408a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q13, d17, d18\n"
3409a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d14, d15, d16}, [%[rhs]:64]!\n"
3410a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #128]\n"
3411a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q10\n"
3412a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q11\n"
3413a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q12\n"
3414a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q3, q13\n"
3415a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q10, d14, d18\n"
3416a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q11, d15, d18\n"
3417a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d16, d18\n"
3418a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q4, q10\n"
3419a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q5, q11\n"
3420a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q6, q12\n"
3421a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3422a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
3423a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
3424a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3425a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationFloat::Prepare
3426a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d14, d15}, [%[lhs]:64]!\n"
3427a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d16, d17, d18, d19}, [%[rhs]:64]!\n"
3428a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q10, %[scale]\n"
3429a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q7, d14[0]\n"
3430a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3431a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
3432a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3433a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
3434a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
3435a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
3436a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
3437a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d7\n"
3438a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d8, d8, d9\n"
3439a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d10, d10, d11\n"
3440a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d12, d12, d13\n"
3441a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
3442a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d1, d4, d6\n"
3443a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d8, d10\n"
3444a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d3, d12, d12\n"
3445a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3446a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationFloat::Transform
3447a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q7\n"
3448a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q7\n"
3449a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q8\n"
3450a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q9\n"
3451a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vcvt.f32.s32 q0, q0\n"
3452a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vcvt.f32.s32 q1, q1\n"
3453a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.f32 q0, q0, q10\n"
3454a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.f32 q1, q1, q10\n"
3455a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3456a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
3457a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0, d1, d2}, [%[result]]!\n"
3458a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d3[0]}, [%[result]]!\n"
3459a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3460a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
3461a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride),
3462a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [scale] "r"(params.kernel.scale)
3463a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
3464a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
3465a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d21", "d22", "d23", "d24", "d25", "d26", "d27", "cc", "memory");
3466a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
3467a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3468a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
3469a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MulKernel<
3470a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 8,
3471a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3472a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3473a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                         RowMajor>& params,
3474a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 float* result) {
3475a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
3476a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
3477a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
3478a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3479a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 8, "
3480a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "8>::Multiply()"
3481a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
3482a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
3483a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
3484a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
3485a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
3486a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
3487a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
3488a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3489a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
3490a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
3491a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
3492a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
3493a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q3, q0\n"
3494a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q4, q1\n"
3495a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q5, q2\n"
3496a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q6, q3\n"
3497a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q7, q4\n"
3498a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3499a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // 1x8 lanes loop.
3500a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
3501a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3502a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d17, d18, d19, d20}, [%[rhs]:256]!\n"
3503a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d16}, [%[lhs]:64]!\n"
3504a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q11, d16, d17\n"
3505a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d16, d18\n"
3506a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q13, d16, d19\n"
3507a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q14, d16, d20\n"
3508a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d17, d18, d19, d20}, [%[rhs]:256]!\n"
3509a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q11\n"
3510a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q12\n"
3511a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q13\n"
3512a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q3, q14\n"
3513a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #256]\n"
3514a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q15, d16, d17\n"
3515a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q11, d16, d18\n"
3516a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d16, d19\n"
3517a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q13, d16, d20\n"
3518a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #32]\n"
3519a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3520a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
3521a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
3522a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3523a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q4, q15\n"
3524a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q5, q11\n"
3525a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q6, q12\n"
3526a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q7, q13\n"
3527a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3528a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
3529a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
3530a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3531a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationFloat::Prepare
3532a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d16, d17}, [%[lhs]:64]!\n"
3533a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d18, d19, d20, d21}, [%[rhs]:64]!\n"
3534a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q11, %[scale]\n"
3535a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q8, d16[0]\n"
3536a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3537a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
3538a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3539a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
3540a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
3541a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
3542a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
3543a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d7\n"
3544a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d8, d8, d9\n"
3545a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d10, d10, d11\n"
3546a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d12, d12, d13\n"
3547a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d14, d14, d15\n"
3548a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
3549a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d1, d4, d6\n"
3550a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d8, d10\n"
3551a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d3, d12, d14\n"
3552a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3553a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationFloat::Transform
3554a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q8\n"
3555a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q8\n"
3556a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q9\n"
3557a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q10\n"
3558a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vcvt.f32.s32 q0, q0\n"
3559a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vcvt.f32.s32 q1, q1\n"
3560a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.f32 q0, q0, q11\n"
3561a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.f32 q1, q1, q11\n"
3562a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3563a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
3564a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0, d1, d2, d3}, [%[result]]!\n"
3565a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3566a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
3567a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride),
3568a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [scale] "r"(params.kernel.scale)
3569a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
3570a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
3571a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
3572a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d31", "cc", "memory");
3573a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
3574a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3575a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
3576a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MulKernel<
3577a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 1,
3578a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3579a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3580a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                         RowMajor>& params,
3581a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 float* result) {
3582a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
3583a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
3584a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
3585a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3586a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 1, "
3587a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "8>::Multiply()"
3588a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
3589a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
3590a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
3591a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
3592a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
3593a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
3594a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
3595a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3596a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
3597a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
3598a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
3599a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3600a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General NxM lanes loop.
3601a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
3602a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3603a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
3604a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
3605a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3606a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d4, d5}, [%[lhs]:64]!\n"
3607a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d6}, [%[rhs]:64]!\n"
3608a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
3609a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
3610a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q4, d6, d4\n"
3611a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q5, d6, d5\n"
3612a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q4\n"
3613a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q5\n"
3614a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3615a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
3616a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
3617a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3618a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationFloat::Prepare
3619a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
3620a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
3621a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q6, %[scale]\n"
3622a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q2, d8[0]\n"
3623a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q4, d8[1]\n"
3624a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3625a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
3626a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "add r0, %[result], %[stride]\n"
3627a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3628a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
3629a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
3630a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d0\n"
3631a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
3632a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d2\n"
3633a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3634a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationFloat::Transform
3635a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q2\n"
3636a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q4\n"
3637a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q5\n"
3638a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q5\n"
3639a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vcvt.f32.s32 q0, q0\n"
3640a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vcvt.f32.s32 q1, q1\n"
3641a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.f32 q0, q0, q6\n"
3642a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.f32 q1, q1, q6\n"
3643a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3644a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
3645a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0[0]}, [%[result]]!\n"
3646a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d2[0]}, [r0]!\n"
3647a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3648a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
3649a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride),
3650a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [scale] "r"(params.kernel.scale)
3651a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d8", "d9", "d10",
3652a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d11", "d12", "d13", "cc", "memory");
3653a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
3654a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3655a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
3656a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MulKernel<
3657a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 2,
3658a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3659a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3660a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                         RowMajor>& params,
3661a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 float* result) {
3662a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
3663a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
3664a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
3665a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3666a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 2, "
3667a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "8>::Multiply()"
3668a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
3669a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
3670a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
3671a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
3672a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
3673a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
3674a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
3675a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3676a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
3677a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
3678a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
3679a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
3680a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q3, q0\n"
3681a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3682a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General NxM lanes loop.
3683a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
3684a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3685a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
3686a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
3687a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3688a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
3689a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
3690a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
3691a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
3692a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q6, d10, d8\n"
3693a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q7, d11, d8\n"
3694a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q8, d10, d9\n"
3695a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q9, d11, d9\n"
3696a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q6\n"
3697a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q7\n"
3698a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q8\n"
3699a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q3, q9\n"
3700a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3701a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
3702a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
3703a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3704a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationFloat::Prepare
3705a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
3706a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
3707a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q6, %[scale]\n"
3708a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q7, d8[0]\n"
3709a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q4, d8[1]\n"
3710a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3711a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
3712a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "add r0, %[result], %[stride]\n"
3713a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3714a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
3715a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
3716a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
3717a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
3718a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
3719a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d7\n"
3720a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d6\n"
3721a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3722a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationFloat::Transform
3723a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q7\n"
3724a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q2, q2, q4\n"
3725a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q5\n"
3726a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q2, q2, q5\n"
3727a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vcvt.f32.s32 q0, q0\n"
3728a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vcvt.f32.s32 q2, q2\n"
3729a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.f32 q0, q0, q6\n"
3730a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.f32 q2, q2, q6\n"
3731a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3732a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
3733a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0}, [%[result]]!\n"
3734a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d4}, [r0]!\n"
3735a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3736a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
3737a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride),
3738a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [scale] "r"(params.kernel.scale)
3739a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
3740a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "cc",
3741a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "memory");
3742a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
3743a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3744a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
3745a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MulKernel<
3746a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 3,
3747a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3748a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3749a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                         RowMajor>& params,
3750a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 float* result) {
3751a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
3752a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
3753a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
3754a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3755a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 3, "
3756a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "8>::Multiply()"
3757a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
3758a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
3759a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
3760a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
3761a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
3762a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
3763a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
3764a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3765a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
3766a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
3767a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
3768a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
3769a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q3, q0\n"
3770a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q4, q1\n"
3771a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q5, q2\n"
3772a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3773a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General NxM lanes loop.
3774a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
3775a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3776a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
3777a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
3778a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3779a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
3780a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d14, d15, d16}, [%[rhs]:64]!\n"
3781a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
3782a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
3783a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q9, d14, d12\n"
3784a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q10, d15, d12\n"
3785a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q11, d16, d12\n"
3786a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d14, d13\n"
3787a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q13, d15, d13\n"
3788a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q14, d16, d13\n"
3789a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q9\n"
3790a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q10\n"
3791a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q11\n"
3792a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q3, q12\n"
3793a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q4, q13\n"
3794a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q5, q14\n"
3795a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3796a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
3797a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
3798a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3799a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationFloat::Prepare
3800a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
3801a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d14, d15}, [%[rhs]:64]!\n"
3802a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q8, %[scale]\n"
3803a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q9, d12[0]\n"
3804a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q6, d12[1]\n"
3805a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3806a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
3807a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "add r0, %[result], %[stride]\n"
3808a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3809a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
3810a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
3811a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
3812a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
3813a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
3814a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d1, d4, d4\n"
3815a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d7\n"
3816a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d8, d8, d9\n"
3817a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d10, d10, d11\n"
3818a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d8\n"
3819a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d7, d10, d10\n"
3820a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3821a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationFloat::Transform
3822a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q9\n"
3823a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q3, q3, q6\n"
3824a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q7\n"
3825a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q3, q3, q7\n"
3826a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vcvt.f32.s32 q0, q0\n"
3827a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vcvt.f32.s32 q3, q3\n"
3828a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.f32 q0, q0, q8\n"
3829a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.f32 q3, q3, q8\n"
3830a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3831a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
3832a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0}, [%[result]]!\n"
3833a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d1[0]}, [%[result]]!\n"
3834a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d6}, [r0]!\n"
3835a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d7[0]}, [r0]!\n"
3836a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3837a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
3838a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride),
3839a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [scale] "r"(params.kernel.scale)
3840a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
3841a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
3842a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "cc",
3843a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "memory");
3844a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
3845a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3846a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
3847a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MulKernel<
3848a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 4,
3849a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3850a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3851a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                         RowMajor>& params,
3852a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 float* result) {
3853a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
3854a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
3855a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
3856a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3857a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 4, "
3858a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "8>::Multiply()"
3859a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
3860a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
3861a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
3862a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
3863a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
3864a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
3865a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
3866a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3867a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
3868a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
3869a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
3870a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
3871a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q3, q0\n"
3872a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q4, q1\n"
3873a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q5, q2\n"
3874a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q6, q3\n"
3875a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q7, q4\n"
3876a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3877a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // 2x4 lanes loop.
3878a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
3879a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3880a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.8 {d18, d19, d20, d21}, [%[rhs]:256]!\n"
3881a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.8 {d16}, [%[lhs]:64]!\n"
3882a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q11, d16, d18\n"
3883a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.8 {d17}, [%[lhs]:64]!\n"
3884a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d16, d19\n"
3885a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
3886a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q13, d16, d20\n"
3887a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
3888a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q14, d16, d21\n"
3889a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q15, d17, d18\n"
3890a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q11\n"
3891a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q12\n"
3892a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q13\n"
3893a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q11, d17, d19\n"
3894a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d17, d20\n"
3895a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q13, d17, d21\n"
3896a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3897a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
3898a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
3899a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3900a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q3, q14\n"
3901a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q4, q15\n"
3902a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q5, q11\n"
3903a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q6, q12\n"
3904a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q7, q13\n"
3905a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3906a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
3907a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
3908a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3909a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationFloat::Prepare
3910a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d16, d17}, [%[lhs]:64]!\n"
3911a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d18, d19}, [%[rhs]:64]!\n"
3912a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q10, %[scale]\n"
3913a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q11, d16[0]\n"
3914a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q8, d16[1]\n"
3915a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3916a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
3917a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "add r0, %[result], %[stride]\n"
3918a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3919a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
3920a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
3921a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
3922a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
3923a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d7\n"
3924a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
3925a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d1, d4, d6\n"
3926a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d8, d8, d9\n"
3927a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d10, d10, d11\n"
3928a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d12, d12, d13\n"
3929a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d14, d14, d15\n"
3930a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d8, d8, d10\n"
3931a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d9, d12, d14\n"
3932a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3933a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationFloat::Transform
3934a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q11\n"
3935a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q4, q4, q8\n"
3936a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q9\n"
3937a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q4, q4, q9\n"
3938a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vcvt.f32.s32 q0, q0\n"
3939a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vcvt.f32.s32 q4, q4\n"
3940a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.f32 q0, q0, q10\n"
3941a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.f32 q4, q4, q10\n"
3942a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3943a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
3944a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0, d1}, [%[result]]!\n"
3945a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d8, d9}, [r0]!\n"
3946a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3947a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
3948a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride),
3949a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [scale] "r"(params.kernel.scale)
3950a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
3951a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
3952a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
3953a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d31", "cc", "memory");
3954a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
3955a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3956a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
3957a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MulKernel<
3958a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 1,
3959a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3960a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3961a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                         RowMajor>& params,
3962a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 float* result) {
3963a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
3964a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
3965a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
3966a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3967a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 1, "
3968a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "8>::Multiply()"
3969a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
3970a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
3971a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
3972a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
3973a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
3974a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
3975a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
3976a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3977a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
3978a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
3979a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
3980a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
3981a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3982a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General NxM lanes loop.
3983a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
3984a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3985a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
3986a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
3987a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3988a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d6, d7, d8}, [%[lhs]:64]!\n"
3989a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d9}, [%[rhs]:64]!\n"
3990a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
3991a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
3992a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q5, d9, d6\n"
3993a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q6, d9, d7\n"
3994a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q7, d9, d8\n"
3995a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q5\n"
3996a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q6\n"
3997a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q7\n"
3998a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
3999a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
4000a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
4001a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4002a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationFloat::Prepare
4003a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
4004a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
4005a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q6, %[scale]\n"
4006a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q3, d8[0]\n"
4007a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q7, d8[1]\n"
4008a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q4, d9[0]\n"
4009a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4010a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
4011a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "add r0, %[result], %[stride]\n"
4012a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "add r1, r0, %[stride]\n"
4013a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4014a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
4015a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
4016a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d0\n"
4017a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
4018a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d2\n"
4019a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
4020a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d4\n"
4021a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4022a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationFloat::Transform
4023a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q3\n"
4024a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q7\n"
4025a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q2, q2, q4\n"
4026a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q5\n"
4027a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q1, q1, q5\n"
4028a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q2, q2, q5\n"
4029a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vcvt.f32.s32 q0, q0\n"
4030a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vcvt.f32.s32 q1, q1\n"
4031a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vcvt.f32.s32 q2, q2\n"
4032a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.f32 q0, q0, q6\n"
4033a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.f32 q1, q1, q6\n"
4034a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.f32 q2, q2, q6\n"
4035a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4036a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
4037a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0[0]}, [%[result]]!\n"
4038a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d2[0]}, [r0]!\n"
4039a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d4[0]}, [r1]!\n"
4040a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
4041a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
4042a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride),
4043a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [scale] "r"(params.kernel.scale)
4044a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
4045a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d10", "d11", "d12", "d13", "d14", "d15", "cc", "memory");
4046a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
4047a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4048a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
4049a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MulKernel<
4050a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 2,
4051a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
4052a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
4053a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                         RowMajor>& params,
4054a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 float* result) {
4055a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
4056a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
4057a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
4058a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
4059a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 2, "
4060a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "8>::Multiply()"
4061a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
4062a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
4063a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
4064a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
4065a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
4066a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
4067a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
4068a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4069a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
4070a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
4071a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
4072a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
4073a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q3, q0\n"
4074a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q4, q1\n"
4075a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q5, q2\n"
4076a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4077a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // General NxM lanes loop.
4078a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
4079a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4080a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
4081a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
4082a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4083a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d12, d13, d14}, [%[lhs]:64]!\n"
4084a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d15, d16}, [%[rhs]:64]!\n"
4085a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
4086a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
4087a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q9, d15, d12\n"
4088a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q10, d16, d12\n"
4089a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q11, d15, d13\n"
4090a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d16, d13\n"
4091a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q13, d15, d14\n"
4092a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q14, d16, d14\n"
4093a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q9\n"
4094a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q10\n"
4095a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q11\n"
4096a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q3, q12\n"
4097a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q4, q13\n"
4098a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q5, q14\n"
4099a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4100a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
4101a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
4102a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4103a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationFloat::Prepare
4104a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
4105a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d14, d15}, [%[rhs]:64]!\n"
4106a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q8, %[scale]\n"
4107a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q9, d12[0]\n"
4108a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q10, d12[1]\n"
4109a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q6, d13[0]\n"
4110a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4111a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
4112a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "add r0, %[result], %[stride]\n"
4113a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "add r1, r0, %[stride]\n"
4114a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4115a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
4116a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
4117a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
4118a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
4119a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
4120a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d7\n"
4121a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d6\n"
4122a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d8, d8, d9\n"
4123a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d10, d10, d11\n"
4124a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d8, d8, d10\n"
4125a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4126a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationFloat::Transform
4127a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q9\n"
4128a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q2, q2, q10\n"
4129a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q4, q4, q6\n"
4130a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q7\n"
4131a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q2, q2, q7\n"
4132a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q4, q4, q7\n"
4133a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vcvt.f32.s32 q0, q0\n"
4134a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vcvt.f32.s32 q2, q2\n"
4135a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vcvt.f32.s32 q4, q4\n"
4136a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.f32 q0, q0, q8\n"
4137a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.f32 q2, q2, q8\n"
4138a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.f32 q4, q4, q8\n"
4139a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4140a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
4141a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0}, [%[result]]!\n"
4142a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d4}, [r0]!\n"
4143a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d8}, [r1]!\n"
4144a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
4145a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
4146a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride),
4147a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [scale] "r"(params.kernel.scale)
4148a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
4149a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
4150a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
4151a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "cc", "memory");
4152a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
4153a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4154a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <>
4155a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MulKernel<
4156a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 3,
4157a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
4158a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
4159a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                         RowMajor>& params,
4160a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                 float* result) {
4161a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG
4162a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifdef DEBUG_METAGEMM_VERBOSE
4163a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::cout << __FILE__ << "(" << __LINE__
4164a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
4165a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 3, "
4166a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang               "8>::Multiply()"
4167a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::endl
4168a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang            << std::flush;
4169a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
4170a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
4171a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  asm volatile(
4172a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs]]\n"
4173a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs]]\n"
4174a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4175a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Clear aggregators.
4176a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q0, #0\n"
4177a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q1, #0\n"
4178a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q2, #0\n"
4179a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q3, q0\n"
4180a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q4, q1\n"
4181a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q5, q2\n"
4182a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q6, q3\n"
4183a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q7, q4\n"
4184a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmov.i32 q8, q5\n"
4185a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4186a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // 3x3 lanes loop.
4187a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "1:"
4188a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4189a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.8 {d21, d22, d23}, [%[rhs]:64]!\n"
4190a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.8 {d18}, [%[lhs]:64]!\n"
4191a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d18, d21\n"
4192a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.8 {d19}, [%[lhs]:64]!\n"
4193a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q13, d18, d22\n"
4194a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.8 {d20}, [%[lhs]:64]!\n"
4195a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q14, d18, d23\n"
4196a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[lhs], #64]\n"
4197a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q15, d19, d21\n"
4198a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "pld [%[rhs], #64]\n"
4199a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q0, q12\n"
4200a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q1, q13\n"
4201a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q2, q14\n"
4202a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q3, q15\n"
4203a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q12, d19, d22\n"
4204a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q13, d19, d23\n"
4205a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q14, d20, d21\n"
4206a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q15, d20, d22\n"
4207a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4208a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Subtract counter.
4209a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "subs %[count], %[count], #8\n"
4210a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4211a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmull.u8 q9, d20, d23\n"
4212a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q4, q12\n"
4213a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q5, q13\n"
4214a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q6, q14\n"
4215a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q7, q15\n"
4216a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadal.u16 q8, q9\n"
4217a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4218a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Loop break.
4219a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "bgt 1b\n"
4220a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4221a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationFloat::Prepare
4222a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d18, d19}, [%[lhs]:64]!\n"
4223a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vld1.32 {d20, d21}, [%[rhs]:64]!\n"
4224a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q11, %[scale]\n"
4225a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q12, d18[0]\n"
4226a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q13, d18[1]\n"
4227a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vdup.32 q9, d19[0]\n"
4228a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4229a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Prepare
4230a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "add r0, %[result], %[stride]\n"
4231a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "add r1, r0, %[stride]\n"
4232a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4233a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // Reduce aggregators.
4234a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d1\n"
4235a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d2, d2, d3\n"
4236a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d4, d4, d5\n"
4237a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d0, d0, d2\n"
4238a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d1, d4, d4\n"
4239a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d7\n"
4240a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d8, d8, d9\n"
4241a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d10, d10, d11\n"
4242a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d6, d6, d8\n"
4243a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d7, d10, d10\n"
4244a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d12, d12, d13\n"
4245a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d14, d14, d15\n"
4246a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d16, d16, d17\n"
4247a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d12, d12, d14\n"
4248a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vpadd.u32 d13, d16, d16\n"
4249a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4250a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // StaticQuantizationFloat::Transform
4251a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q12\n"
4252a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q3, q3, q13\n"
4253a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q6, q6, q9\n"
4254a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q0, q0, q10\n"
4255a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q3, q3, q10\n"
4256a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vadd.s32 q6, q6, q10\n"
4257a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vcvt.f32.s32 q0, q0\n"
4258a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vcvt.f32.s32 q3, q3\n"
4259a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vcvt.f32.s32 q6, q6\n"
4260a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.f32 q0, q0, q11\n"
4261a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.f32 q3, q3, q11\n"
4262a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vmul.f32 q6, q6, q11\n"
4263a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4264a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      // RowMajorOutput::Output
4265a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d0}, [%[result]]!\n"
4266a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d1[0]}, [%[result]]!\n"
4267a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d6}, [r0]!\n"
4268a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d7[0]}, [r0]!\n"
4269a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d12}, [r1]!\n"
4270a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      "vst1.32 {d13[0]}, [r1]!\n"
4271a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
4272a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : [count] "r"(params.kernel.count),
4273a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [stride] "r"(params.output_stream.stride),
4274a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        [scale] "r"(params.kernel.scale)
4275a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
4276a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
4277a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
4278a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang        "d30", "d31", "cc", "memory");
4279a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
4280a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4281a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}  // namespace meta
4282a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}  // namespace gemmlowp
4283a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4284a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#else
4285a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#warning "Meta gemm for arm32 requires: GEMMLOWP_NEON_32!"
4286a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif
4287a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
4288a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif  // GEMMLOWP_META_QUANTIZED_MUL_KERNELS_ARM_32_H_
4289