1// Copyright 2016 The Gemmlowp Authors. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#ifndef GEMMLOWP_META_QUANTIZED_MUL_KERNELS_ARM_32_H_
16#define GEMMLOWP_META_QUANTIZED_MUL_KERNELS_ARM_32_H_
17
18#ifdef GEMMLOWP_NEON_32
19
20#include <cassert>
21#include <cstdint>
22
23namespace gemmlowp {
24namespace meta {
25
26template <>
27inline void
28MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 1,
29          8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
30                       const FusedKernelParams<QuantizedStaticPreprocessed,
31                                               RowMajor>& params,
32                       uint8_t* result) {
33#ifdef DEBUG
34#ifdef DEBUG_METAGEMM_VERBOSE
35  std::cout << __FILE__ << "(" << __LINE__
36            << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
37               "QuantizedStaticPreprocessed, RowMajor, 1, 1, 8>::Multiply()"
38            << std::endl
39            << std::flush;
40#endif
41#endif
42  asm volatile(
43      "pld [%[lhs]]\n"
44      "pld [%[rhs]]\n"
45
46      // Clear aggregators.
47      "vmov.i32 q0, #0\n"
48
49      // General NxM lanes loop.
50      "1:"
51
52      // Subtract counter.
53      "subs %[count], %[count], #8\n"
54
55      "vld1.32 {d2}, [%[lhs]:64]!\n"
56      "vld1.32 {d3}, [%[rhs]:64]!\n"
57      "pld [%[lhs], #64]\n"
58      "pld [%[rhs], #64]\n"
59      "vmull.u8 q2, d3, d2\n"
60      "vpadal.u16 q0, q2\n"
61
62      // Loop break.
63      "bgt 1b\n"
64
65      // StaticQuantization::Prepare
66      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
67      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
68      "vdup.32 q6, %[multiplicative_offset]\n"
69      "vdup.32 q7, %[rounding_offset]\n"
70      "vdup.32 q8, %[shift]\n"
71      "vdup.32 q4, d8[0]\n"
72
73      // RowMajorOutput::Prepare
74
75      // Reduce aggregators.
76      "vpadd.u32 d0, d0, d1\n"
77      "vpadd.u32 d0, d0, d0\n"
78
79      // StaticQuantization::Transform
80      "vadd.s32 q0, q0, q4\n"
81      "vadd.s32 q0, q0, q5\n"
82      "vmul.i32 q0, q0, q6\n"
83      "vadd.i32 q0, q0, q7\n"
84      "vshl.s32 q0, q0, q8\n"
85      "vqmovn.s32 d0, q0\n"
86      "vqmovun.s16 d0, q0\n"
87
88      // RowMajorOutput::Output
89      "vst1.8 {d0[0]}, [%[result]]!\n"
90      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
91      : [count] "r"(params.kernel.count),
92        [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
93        [shift] "r"(params.kernel.shift),
94        [stride] "r"(params.output_stream.stride),
95        [rounding_offset] "r"(params.kernel.rounding_offset)
96      : "d0", "d1", "d2", "d3", "d4", "d5", "d8", "d9", "d10", "d11", "d12",
97        "d13", "d14", "d15", "d16", "d17", "cc", "memory");
98}
99
100template <>
101inline void
102MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 2,
103          8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
104                       const FusedKernelParams<QuantizedStaticPreprocessed,
105                                               RowMajor>& params,
106                       uint8_t* result) {
107#ifdef DEBUG
108#ifdef DEBUG_METAGEMM_VERBOSE
109  std::cout << __FILE__ << "(" << __LINE__
110            << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
111               "QuantizedStaticPreprocessed, RowMajor, 1, 2, 8>::Multiply()"
112            << std::endl
113            << std::flush;
114#endif
115#endif
116  asm volatile(
117      "pld [%[lhs]]\n"
118      "pld [%[rhs]]\n"
119
120      // Clear aggregators.
121      "vmov.i32 q0, #0\n"
122      "vmov.i32 q1, #0\n"
123
124      // General NxM lanes loop.
125      "1:"
126
127      // Subtract counter.
128      "subs %[count], %[count], #8\n"
129
130      "vld1.32 {d4}, [%[lhs]:64]!\n"
131      "vld1.32 {d5, d6}, [%[rhs]:64]!\n"
132      "pld [%[lhs], #64]\n"
133      "pld [%[rhs], #64]\n"
134      "vmull.u8 q4, d5, d4\n"
135      "vmull.u8 q5, d6, d4\n"
136      "vpadal.u16 q0, q4\n"
137      "vpadal.u16 q1, q5\n"
138
139      // Loop break.
140      "bgt 1b\n"
141
142      // StaticQuantization::Prepare
143      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
144      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
145      "vdup.32 q6, %[multiplicative_offset]\n"
146      "vdup.32 q7, %[rounding_offset]\n"
147      "vdup.32 q8, %[shift]\n"
148      "vdup.32 q4, d8[0]\n"
149
150      // RowMajorOutput::Prepare
151
152      // Reduce aggregators.
153      "vpadd.u32 d0, d0, d1\n"
154      "vpadd.u32 d2, d2, d3\n"
155      "vpadd.u32 d0, d0, d2\n"
156
157      // StaticQuantization::Transform
158      "vadd.s32 q0, q0, q4\n"
159      "vadd.s32 q0, q0, q5\n"
160      "vmul.i32 q0, q0, q6\n"
161      "vadd.i32 q0, q0, q7\n"
162      "vshl.s32 q0, q0, q8\n"
163      "vqmovn.s32 d0, q0\n"
164      "vqmovun.s16 d0, q0\n"
165
166      // RowMajorOutput::Output
167      "vst1.16 {d0[0]}, [%[result]]!\n"
168      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
169      : [count] "r"(params.kernel.count),
170        [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
171        [shift] "r"(params.kernel.shift),
172        [stride] "r"(params.output_stream.stride),
173        [rounding_offset] "r"(params.kernel.rounding_offset)
174      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d8", "d9", "d10", "d11",
175        "d12", "d13", "d14", "d15", "d16", "d17", "cc", "memory");
176}
177
178template <>
179inline void
180MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 3,
181          8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
182                       const FusedKernelParams<QuantizedStaticPreprocessed,
183                                               RowMajor>& params,
184                       uint8_t* result) {
185#ifdef DEBUG
186#ifdef DEBUG_METAGEMM_VERBOSE
187  std::cout << __FILE__ << "(" << __LINE__
188            << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
189               "QuantizedStaticPreprocessed, RowMajor, 1, 3, 8>::Multiply()"
190            << std::endl
191            << std::flush;
192#endif
193#endif
194  asm volatile(
195      "pld [%[lhs]]\n"
196      "pld [%[rhs]]\n"
197
198      // Clear aggregators.
199      "vmov.i32 q0, #0\n"
200      "vmov.i32 q1, #0\n"
201      "vmov.i32 q2, #0\n"
202
203      // General NxM lanes loop.
204      "1:"
205
206      // Subtract counter.
207      "subs %[count], %[count], #8\n"
208
209      "vld1.32 {d6}, [%[lhs]:64]!\n"
210      "vld1.32 {d7, d8, d9}, [%[rhs]:64]!\n"
211      "pld [%[lhs], #64]\n"
212      "pld [%[rhs], #64]\n"
213      "vmull.u8 q5, d7, d6\n"
214      "vmull.u8 q6, d8, d6\n"
215      "vmull.u8 q7, d9, d6\n"
216      "vpadal.u16 q0, q5\n"
217      "vpadal.u16 q1, q6\n"
218      "vpadal.u16 q2, q7\n"
219
220      // Loop break.
221      "bgt 1b\n"
222
223      // StaticQuantization::Prepare
224      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
225      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
226      "vdup.32 q6, %[multiplicative_offset]\n"
227      "vdup.32 q7, %[rounding_offset]\n"
228      "vdup.32 q8, %[shift]\n"
229      "vdup.32 q4, d8[0]\n"
230
231      // RowMajorOutput::Prepare
232
233      // Reduce aggregators.
234      "vpadd.u32 d0, d0, d1\n"
235      "vpadd.u32 d2, d2, d3\n"
236      "vpadd.u32 d4, d4, d5\n"
237      "vpadd.u32 d0, d0, d2\n"
238      "vpadd.u32 d1, d4, d4\n"
239
240      // StaticQuantization::Transform
241      "vadd.s32 q0, q0, q4\n"
242      "vadd.s32 q0, q0, q5\n"
243      "vmul.i32 q0, q0, q6\n"
244      "vadd.i32 q0, q0, q7\n"
245      "vshl.s32 q0, q0, q8\n"
246      "vqmovn.s32 d0, q0\n"
247      "vqmovun.s16 d0, q0\n"
248
249      // RowMajorOutput::Output
250      "vst1.16 {d0[0]}, [%[result]]!\n"
251      "vst1.8 {d0[2]}, [%[result]]!\n"
252      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
253      : [count] "r"(params.kernel.count),
254        [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
255        [shift] "r"(params.kernel.shift),
256        [stride] "r"(params.output_stream.stride),
257        [rounding_offset] "r"(params.kernel.rounding_offset)
258      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
259        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "cc", "memory");
260}
261
262template <>
263inline void
264MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 4,
265          8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
266                       const FusedKernelParams<QuantizedStaticPreprocessed,
267                                               RowMajor>& params,
268                       uint8_t* result) {
269#ifdef DEBUG
270#ifdef DEBUG_METAGEMM_VERBOSE
271  std::cout << __FILE__ << "(" << __LINE__
272            << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
273               "QuantizedStaticPreprocessed, RowMajor, 1, 4, 8>::Multiply()"
274            << std::endl
275            << std::flush;
276#endif
277#endif
278  asm volatile(
279      "pld [%[lhs]]\n"
280      "pld [%[rhs]]\n"
281
282      // Clear aggregators.
283      "vmov.i32 q0, #0\n"
284      "vmov.i32 q1, #0\n"
285      "vmov.i32 q2, #0\n"
286      "vmov.i32 q3, q0\n"
287
288      // General NxM lanes loop.
289      "1:"
290
291      // Subtract counter.
292      "subs %[count], %[count], #8\n"
293
294      "vld1.32 {d8}, [%[lhs]:64]!\n"
295      "vld1.32 {d9, d10, d11, d12}, [%[rhs]:64]!\n"
296      "pld [%[lhs], #64]\n"
297      "pld [%[rhs], #64]\n"
298      "vmull.u8 q7, d9, d8\n"
299      "vmull.u8 q8, d10, d8\n"
300      "vmull.u8 q9, d11, d8\n"
301      "vmull.u8 q10, d12, d8\n"
302      "vpadal.u16 q0, q7\n"
303      "vpadal.u16 q1, q8\n"
304      "vpadal.u16 q2, q9\n"
305      "vpadal.u16 q3, q10\n"
306
307      // Loop break.
308      "bgt 1b\n"
309
310      // StaticQuantization::Prepare
311      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
312      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
313      "vdup.32 q6, %[multiplicative_offset]\n"
314      "vdup.32 q7, %[rounding_offset]\n"
315      "vdup.32 q8, %[shift]\n"
316      "vdup.32 q4, d8[0]\n"
317
318      // RowMajorOutput::Prepare
319
320      // Reduce aggregators.
321      "vpadd.u32 d0, d0, d1\n"
322      "vpadd.u32 d2, d2, d3\n"
323      "vpadd.u32 d4, d4, d5\n"
324      "vpadd.u32 d6, d6, d7\n"
325      "vpadd.u32 d0, d0, d2\n"
326      "vpadd.u32 d1, d4, d6\n"
327
328      // StaticQuantization::Transform
329      "vadd.s32 q0, q0, q4\n"
330      "vadd.s32 q0, q0, q5\n"
331      "vmul.i32 q0, q0, q6\n"
332      "vadd.i32 q0, q0, q7\n"
333      "vshl.s32 q0, q0, q8\n"
334      "vqmovn.s32 d0, q0\n"
335      "vqmovun.s16 d0, q0\n"
336
337      // RowMajorOutput::Output
338      "vst1.32 {d0[0]}, [%[result]]!\n"
339      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
340      : [count] "r"(params.kernel.count),
341        [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
342        [shift] "r"(params.kernel.shift),
343        [stride] "r"(params.output_stream.stride),
344        [rounding_offset] "r"(params.kernel.rounding_offset)
345      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
346        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
347        "d21", "cc", "memory");
348}
349
350template <>
351inline void
352MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 5,
353          8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
354                       const FusedKernelParams<QuantizedStaticPreprocessed,
355                                               RowMajor>& params,
356                       uint8_t* result) {
357#ifdef DEBUG
358#ifdef DEBUG_METAGEMM_VERBOSE
359  std::cout << __FILE__ << "(" << __LINE__
360            << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
361               "QuantizedStaticPreprocessed, RowMajor, 1, 5, 8>::Multiply()"
362            << std::endl
363            << std::flush;
364#endif
365#endif
366  asm volatile(
367      "pld [%[lhs]]\n"
368      "pld [%[rhs]]\n"
369
370      // Clear aggregators.
371      "vmov.i32 q0, #0\n"
372      "vmov.i32 q1, #0\n"
373      "vmov.i32 q2, #0\n"
374      "vmov.i32 q3, q0\n"
375      "vmov.i32 q4, q1\n"
376
377      // General 1xM lanes loop.
378      "1:"
379
380      // Subtract counter.
381      "subs %[count], %[count], #8\n"
382
383      "vld1.32 {d10, d11, d12, d13}, [%[rhs]:64]!\n"
384      "vld1.32 {d14}, [%[lhs]:64]!\n"
385      "pld [%[lhs], #64]\n"
386      "vmull.u8 q8, d10, d14\n"
387      "vmull.u8 q9, d11, d14\n"
388      "vmull.u8 q10, d12, d14\n"
389      "vmull.u8 q11, d13, d14\n"
390      "vld1.32 {d10}, [%[rhs]:64]!\n"
391      "pld [%[rhs], #128]\n"
392      "vpadal.u16 q0, q8\n"
393      "vpadal.u16 q1, q9\n"
394      "vpadal.u16 q2, q10\n"
395      "vpadal.u16 q3, q11\n"
396      "vmull.u8 q8, d10, d14\n"
397      "vpadal.u16 q4, q8\n"
398
399      // Loop break.
400      "bgt 1b\n"
401
402      // StaticQuantization::Prepare
403      "vld1.32 {d10, d11}, [%[lhs]:64]!\n"
404      "vld1.32 {d12, d13, d14, d15}, [%[rhs]:64]!\n"
405      "vdup.32 q8, %[multiplicative_offset]\n"
406      "vdup.32 q9, %[rounding_offset]\n"
407      "vdup.32 q10, %[shift]\n"
408      "vdup.32 q5, d10[0]\n"
409
410      // RowMajorOutput::Prepare
411
412      // Reduce aggregators.
413      "vpadd.u32 d0, d0, d1\n"
414      "vpadd.u32 d2, d2, d3\n"
415      "vpadd.u32 d4, d4, d5\n"
416      "vpadd.u32 d6, d6, d7\n"
417      "vpadd.u32 d8, d8, d9\n"
418      "vpadd.u32 d0, d0, d2\n"
419      "vpadd.u32 d1, d4, d6\n"
420      "vpadd.u32 d2, d8, d8\n"
421
422      // StaticQuantization::Transform
423      "vadd.s32 q0, q0, q5\n"
424      "vadd.s32 q1, q1, q5\n"
425      "vadd.s32 q0, q0, q6\n"
426      "vadd.s32 q1, q1, q7\n"
427      "vmul.i32 q0, q0, q8\n"
428      "vmul.i32 q1, q1, q8\n"
429      "vadd.i32 q0, q0, q9\n"
430      "vadd.i32 q1, q1, q9\n"
431      "vshl.s32 q0, q0, q10\n"
432      "vshl.s32 q1, q1, q10\n"
433      "vqmovn.s32 d0, q0\n"
434      "vqmovn.s32 d1, q1\n"
435      "vqmovun.s16 d0, q0\n"
436
437      // RowMajorOutput::Output
438      "vst1.32 {d0[0]}, [%[result]]!\n"
439      "vst1.8 {d0[4]}, [%[result]]!\n"
440      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
441      : [count] "r"(params.kernel.count),
442        [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
443        [shift] "r"(params.kernel.shift),
444        [stride] "r"(params.output_stream.stride),
445        [rounding_offset] "r"(params.kernel.rounding_offset)
446      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
447        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
448        "d21", "d22", "d23", "cc", "memory");
449}
450
451template <>
452inline void
453MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 6,
454          8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
455                       const FusedKernelParams<QuantizedStaticPreprocessed,
456                                               RowMajor>& params,
457                       uint8_t* result) {
458#ifdef DEBUG
459#ifdef DEBUG_METAGEMM_VERBOSE
460  std::cout << __FILE__ << "(" << __LINE__
461            << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
462               "QuantizedStaticPreprocessed, RowMajor, 1, 6, 8>::Multiply()"
463            << std::endl
464            << std::flush;
465#endif
466#endif
467  asm volatile(
468      "pld [%[lhs]]\n"
469      "pld [%[rhs]]\n"
470
471      // Clear aggregators.
472      "vmov.i32 q0, #0\n"
473      "vmov.i32 q1, #0\n"
474      "vmov.i32 q2, #0\n"
475      "vmov.i32 q3, q0\n"
476      "vmov.i32 q4, q1\n"
477      "vmov.i32 q5, q2\n"
478
479      // General 1xM lanes loop.
480      "1:"
481
482      // Subtract counter.
483      "subs %[count], %[count], #8\n"
484
485      "vld1.32 {d12, d13, d14, d15}, [%[rhs]:64]!\n"
486      "vld1.32 {d16}, [%[lhs]:64]!\n"
487      "pld [%[lhs], #64]\n"
488      "vmull.u8 q9, d12, d16\n"
489      "vmull.u8 q10, d13, d16\n"
490      "vmull.u8 q11, d14, d16\n"
491      "vmull.u8 q12, d15, d16\n"
492      "vld1.32 {d12, d13}, [%[rhs]:64]!\n"
493      "pld [%[rhs], #128]\n"
494      "vpadal.u16 q0, q9\n"
495      "vpadal.u16 q1, q10\n"
496      "vpadal.u16 q2, q11\n"
497      "vpadal.u16 q3, q12\n"
498      "vmull.u8 q9, d12, d16\n"
499      "vmull.u8 q10, d13, d16\n"
500      "vpadal.u16 q4, q9\n"
501      "vpadal.u16 q5, q10\n"
502
503      // Loop break.
504      "bgt 1b\n"
505
506      // StaticQuantization::Prepare
507      "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
508      "vld1.32 {d14, d15, d16, d17}, [%[rhs]:64]!\n"
509      "vdup.32 q9, %[multiplicative_offset]\n"
510      "vdup.32 q10, %[rounding_offset]\n"
511      "vdup.32 q11, %[shift]\n"
512      "vdup.32 q6, d12[0]\n"
513
514      // RowMajorOutput::Prepare
515
516      // Reduce aggregators.
517      "vpadd.u32 d0, d0, d1\n"
518      "vpadd.u32 d2, d2, d3\n"
519      "vpadd.u32 d4, d4, d5\n"
520      "vpadd.u32 d6, d6, d7\n"
521      "vpadd.u32 d8, d8, d9\n"
522      "vpadd.u32 d10, d10, d11\n"
523      "vpadd.u32 d0, d0, d2\n"
524      "vpadd.u32 d1, d4, d6\n"
525      "vpadd.u32 d2, d8, d10\n"
526
527      // StaticQuantization::Transform
528      "vadd.s32 q0, q0, q6\n"
529      "vadd.s32 q1, q1, q6\n"
530      "vadd.s32 q0, q0, q7\n"
531      "vadd.s32 q1, q1, q8\n"
532      "vmul.i32 q0, q0, q9\n"
533      "vmul.i32 q1, q1, q9\n"
534      "vadd.i32 q0, q0, q10\n"
535      "vadd.i32 q1, q1, q10\n"
536      "vshl.s32 q0, q0, q11\n"
537      "vshl.s32 q1, q1, q11\n"
538      "vqmovn.s32 d0, q0\n"
539      "vqmovn.s32 d1, q1\n"
540      "vqmovun.s16 d0, q0\n"
541
542      // RowMajorOutput::Output
543      "vst1.32 {d0[0]}, [%[result]]!\n"
544      "vst1.16 {d0[2]}, [%[result]]!\n"
545      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
546      : [count] "r"(params.kernel.count),
547        [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
548        [shift] "r"(params.kernel.shift),
549        [stride] "r"(params.output_stream.stride),
550        [rounding_offset] "r"(params.kernel.rounding_offset)
551      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
552        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
553        "d21", "d22", "d23", "d24", "d25", "cc", "memory");
554}
555
556template <>
557inline void
558MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 7,
559          8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
560                       const FusedKernelParams<QuantizedStaticPreprocessed,
561                                               RowMajor>& params,
562                       uint8_t* result) {
563#ifdef DEBUG
564#ifdef DEBUG_METAGEMM_VERBOSE
565  std::cout << __FILE__ << "(" << __LINE__
566            << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
567               "QuantizedStaticPreprocessed, RowMajor, 1, 7, 8>::Multiply()"
568            << std::endl
569            << std::flush;
570#endif
571#endif
572  asm volatile(
573      "pld [%[lhs]]\n"
574      "pld [%[rhs]]\n"
575
576      // Clear aggregators.
577      "vmov.i32 q0, #0\n"
578      "vmov.i32 q1, #0\n"
579      "vmov.i32 q2, #0\n"
580      "vmov.i32 q3, q0\n"
581      "vmov.i32 q4, q1\n"
582      "vmov.i32 q5, q2\n"
583      "vmov.i32 q6, q3\n"
584
585      // General 1xM lanes loop.
586      "1:"
587
588      // Subtract counter.
589      "subs %[count], %[count], #8\n"
590
591      "vld1.32 {d14, d15, d16, d17}, [%[rhs]:64]!\n"
592      "vld1.32 {d18}, [%[lhs]:64]!\n"
593      "pld [%[lhs], #64]\n"
594      "vmull.u8 q10, d14, d18\n"
595      "vmull.u8 q11, d15, d18\n"
596      "vmull.u8 q12, d16, d18\n"
597      "vmull.u8 q13, d17, d18\n"
598      "vld1.32 {d14, d15, d16}, [%[rhs]:64]!\n"
599      "pld [%[rhs], #128]\n"
600      "vpadal.u16 q0, q10\n"
601      "vpadal.u16 q1, q11\n"
602      "vpadal.u16 q2, q12\n"
603      "vpadal.u16 q3, q13\n"
604      "vmull.u8 q10, d14, d18\n"
605      "vmull.u8 q11, d15, d18\n"
606      "vmull.u8 q12, d16, d18\n"
607      "vpadal.u16 q4, q10\n"
608      "vpadal.u16 q5, q11\n"
609      "vpadal.u16 q6, q12\n"
610
611      // Loop break.
612      "bgt 1b\n"
613
614      // StaticQuantization::Prepare
615      "vld1.32 {d14, d15}, [%[lhs]:64]!\n"
616      "vld1.32 {d16, d17, d18, d19}, [%[rhs]:64]!\n"
617      "vdup.32 q10, %[multiplicative_offset]\n"
618      "vdup.32 q11, %[rounding_offset]\n"
619      "vdup.32 q12, %[shift]\n"
620      "vdup.32 q7, d14[0]\n"
621
622      // RowMajorOutput::Prepare
623
624      // Reduce aggregators.
625      "vpadd.u32 d0, d0, d1\n"
626      "vpadd.u32 d2, d2, d3\n"
627      "vpadd.u32 d4, d4, d5\n"
628      "vpadd.u32 d6, d6, d7\n"
629      "vpadd.u32 d8, d8, d9\n"
630      "vpadd.u32 d10, d10, d11\n"
631      "vpadd.u32 d12, d12, d13\n"
632      "vpadd.u32 d0, d0, d2\n"
633      "vpadd.u32 d1, d4, d6\n"
634      "vpadd.u32 d2, d8, d10\n"
635      "vpadd.u32 d3, d12, d12\n"
636
637      // StaticQuantization::Transform
638      "vadd.s32 q0, q0, q7\n"
639      "vadd.s32 q1, q1, q7\n"
640      "vadd.s32 q0, q0, q8\n"
641      "vadd.s32 q1, q1, q9\n"
642      "vmul.i32 q0, q0, q10\n"
643      "vmul.i32 q1, q1, q10\n"
644      "vadd.i32 q0, q0, q11\n"
645      "vadd.i32 q1, q1, q11\n"
646      "vshl.s32 q0, q0, q12\n"
647      "vshl.s32 q1, q1, q12\n"
648      "vqmovn.s32 d0, q0\n"
649      "vqmovn.s32 d1, q1\n"
650      "vqmovun.s16 d0, q0\n"
651
652      // RowMajorOutput::Output
653      "vst1.32 {d0[0]}, [%[result]]!\n"
654      "vst1.16 {d0[2]}, [%[result]]!\n"
655      "vst1.8 {d0[6]}, [%[result]]!\n"
656      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
657      : [count] "r"(params.kernel.count),
658        [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
659        [shift] "r"(params.kernel.shift),
660        [stride] "r"(params.output_stream.stride),
661        [rounding_offset] "r"(params.kernel.rounding_offset)
662      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
663        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
664        "d21", "d22", "d23", "d24", "d25", "d26", "d27", "cc", "memory");
665}
666
667template <>
668inline void
669MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 8,
670          8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
671                       const FusedKernelParams<QuantizedStaticPreprocessed,
672                                               RowMajor>& params,
673                       uint8_t* result) {
674#ifdef DEBUG
675#ifdef DEBUG_METAGEMM_VERBOSE
676  std::cout << __FILE__ << "(" << __LINE__
677            << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
678               "QuantizedStaticPreprocessed, RowMajor, 1, 8, 8>::Multiply()"
679            << std::endl
680            << std::flush;
681#endif
682#endif
683  asm volatile(
684      "pld [%[lhs]]\n"
685      "pld [%[rhs]]\n"
686
687      // Clear aggregators.
688      "vmov.i32 q0, #0\n"
689      "vmov.i32 q1, #0\n"
690      "vmov.i32 q2, #0\n"
691      "vmov.i32 q3, q0\n"
692      "vmov.i32 q4, q1\n"
693      "vmov.i32 q5, q2\n"
694      "vmov.i32 q6, q3\n"
695      "vmov.i32 q7, q4\n"
696
697      // 1x8 lanes loop.
698      "1:"
699
700      "vld1.32 {d17, d18, d19, d20}, [%[rhs]:256]!\n"
701      "vld1.32 {d16}, [%[lhs]:64]!\n"
702      "vmull.u8 q11, d16, d17\n"
703      "vmull.u8 q12, d16, d18\n"
704      "vmull.u8 q13, d16, d19\n"
705      "vmull.u8 q14, d16, d20\n"
706      "vld1.32 {d17, d18, d19, d20}, [%[rhs]:256]!\n"
707      "vpadal.u16 q0, q11\n"
708      "vpadal.u16 q1, q12\n"
709      "vpadal.u16 q2, q13\n"
710      "vpadal.u16 q3, q14\n"
711      "pld [%[rhs], #256]\n"
712      "vmull.u8 q15, d16, d17\n"
713      "vmull.u8 q11, d16, d18\n"
714      "vmull.u8 q12, d16, d19\n"
715      "vmull.u8 q13, d16, d20\n"
716      "pld [%[lhs], #32]\n"
717
718      // Subtract counter.
719      "subs %[count], %[count], #8\n"
720
721      "vpadal.u16 q4, q15\n"
722      "vpadal.u16 q5, q11\n"
723      "vpadal.u16 q6, q12\n"
724      "vpadal.u16 q7, q13\n"
725
726      // Loop break.
727      "bgt 1b\n"
728
729      // StaticQuantization::Prepare
730      "vld1.32 {d16, d17}, [%[lhs]:64]!\n"
731      "vld1.32 {d18, d19, d20, d21}, [%[rhs]:64]!\n"
732      "vdup.32 q11, %[multiplicative_offset]\n"
733      "vdup.32 q12, %[rounding_offset]\n"
734      "vdup.32 q13, %[shift]\n"
735      "vdup.32 q8, d16[0]\n"
736
737      // RowMajorOutput::Prepare
738
739      // Reduce aggregators.
740      "vpadd.u32 d0, d0, d1\n"
741      "vpadd.u32 d2, d2, d3\n"
742      "vpadd.u32 d4, d4, d5\n"
743      "vpadd.u32 d6, d6, d7\n"
744      "vpadd.u32 d8, d8, d9\n"
745      "vpadd.u32 d10, d10, d11\n"
746      "vpadd.u32 d12, d12, d13\n"
747      "vpadd.u32 d14, d14, d15\n"
748      "vpadd.u32 d0, d0, d2\n"
749      "vpadd.u32 d1, d4, d6\n"
750      "vpadd.u32 d2, d8, d10\n"
751      "vpadd.u32 d3, d12, d14\n"
752
753      // StaticQuantization::Transform
754      "vadd.s32 q0, q0, q8\n"
755      "vadd.s32 q1, q1, q8\n"
756      "vadd.s32 q0, q0, q9\n"
757      "vadd.s32 q1, q1, q10\n"
758      "vmul.i32 q0, q0, q11\n"
759      "vmul.i32 q1, q1, q11\n"
760      "vadd.i32 q0, q0, q12\n"
761      "vadd.i32 q1, q1, q12\n"
762      "vshl.s32 q0, q0, q13\n"
763      "vshl.s32 q1, q1, q13\n"
764      "vqmovn.s32 d0, q0\n"
765      "vqmovn.s32 d1, q1\n"
766      "vqmovun.s16 d0, q0\n"
767
768      // RowMajorOutput::Output
769      "vst1.32 {d0}, [%[result]]!\n"
770      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
771      : [count] "r"(params.kernel.count),
772        [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
773        [shift] "r"(params.kernel.shift),
774        [stride] "r"(params.output_stream.stride),
775        [rounding_offset] "r"(params.kernel.rounding_offset)
776      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
777        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
778        "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
779        "d31", "cc", "memory");
780}
781
782template <>
783inline void
784MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 1,
785          8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
786                       const FusedKernelParams<QuantizedStaticPreprocessed,
787                                               RowMajor>& params,
788                       uint8_t* result) {
789#ifdef DEBUG
790#ifdef DEBUG_METAGEMM_VERBOSE
791  std::cout << __FILE__ << "(" << __LINE__
792            << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
793               "QuantizedStaticPreprocessed, RowMajor, 2, 1, 8>::Multiply()"
794            << std::endl
795            << std::flush;
796#endif
797#endif
798  asm volatile(
799      "pld [%[lhs]]\n"
800      "pld [%[rhs]]\n"
801
802      // Clear aggregators.
803      "vmov.i32 q0, #0\n"
804      "vmov.i32 q1, #0\n"
805
806      // General NxM lanes loop.
807      "1:"
808
809      // Subtract counter.
810      "subs %[count], %[count], #8\n"
811
812      "vld1.32 {d4, d5}, [%[lhs]:64]!\n"
813      "vld1.32 {d6}, [%[rhs]:64]!\n"
814      "pld [%[lhs], #64]\n"
815      "pld [%[rhs], #64]\n"
816      "vmull.u8 q4, d6, d4\n"
817      "vmull.u8 q5, d6, d5\n"
818      "vpadal.u16 q0, q4\n"
819      "vpadal.u16 q1, q5\n"
820
821      // Loop break.
822      "bgt 1b\n"
823
824      // StaticQuantization::Prepare
825      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
826      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
827      "vdup.32 q6, %[multiplicative_offset]\n"
828      "vdup.32 q7, %[rounding_offset]\n"
829      "vdup.32 q8, %[shift]\n"
830      "vdup.32 q2, d8[0]\n"
831      "vdup.32 q4, d8[1]\n"
832
833      // RowMajorOutput::Prepare
834      "add r0, %[result], %[stride]\n"
835
836      // Reduce aggregators.
837      "vpadd.u32 d0, d0, d1\n"
838      "vpadd.u32 d0, d0, d0\n"
839      "vpadd.u32 d2, d2, d3\n"
840      "vpadd.u32 d2, d2, d2\n"
841
842      // StaticQuantization::Transform
843      "vadd.s32 q0, q0, q2\n"
844      "vadd.s32 q1, q1, q4\n"
845      "vadd.s32 q0, q0, q5\n"
846      "vadd.s32 q1, q1, q5\n"
847      "vmul.i32 q0, q0, q6\n"
848      "vmul.i32 q1, q1, q6\n"
849      "vadd.i32 q0, q0, q7\n"
850      "vadd.i32 q1, q1, q7\n"
851      "vshl.s32 q0, q0, q8\n"
852      "vshl.s32 q1, q1, q8\n"
853      "vqmovn.s32 d0, q0\n"
854      "vqmovn.s32 d2, q1\n"
855      "vqmovun.s16 d0, q0\n"
856      "vqmovun.s16 d2, q1\n"
857
858      // RowMajorOutput::Output
859      "vst1.8 {d0[0]}, [%[result]]!\n"
860      "vst1.8 {d2[0]}, [r0]!\n"
861      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
862      : [count] "r"(params.kernel.count),
863        [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
864        [shift] "r"(params.kernel.shift),
865        [stride] "r"(params.output_stream.stride),
866        [rounding_offset] "r"(params.kernel.rounding_offset)
867      : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d8", "d9", "d10",
868        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "cc", "memory");
869}
870
871template <>
872inline void
873MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 2,
874          8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
875                       const FusedKernelParams<QuantizedStaticPreprocessed,
876                                               RowMajor>& params,
877                       uint8_t* result) {
878#ifdef DEBUG
879#ifdef DEBUG_METAGEMM_VERBOSE
880  std::cout << __FILE__ << "(" << __LINE__
881            << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
882               "QuantizedStaticPreprocessed, RowMajor, 2, 2, 8>::Multiply()"
883            << std::endl
884            << std::flush;
885#endif
886#endif
887  asm volatile(
888      "pld [%[lhs]]\n"
889      "pld [%[rhs]]\n"
890
891      // Clear aggregators.
892      "vmov.i32 q0, #0\n"
893      "vmov.i32 q1, #0\n"
894      "vmov.i32 q2, #0\n"
895      "vmov.i32 q3, q0\n"
896
897      // General NxM lanes loop.
898      "1:"
899
900      // Subtract counter.
901      "subs %[count], %[count], #8\n"
902
903      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
904      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
905      "pld [%[lhs], #64]\n"
906      "pld [%[rhs], #64]\n"
907      "vmull.u8 q6, d10, d8\n"
908      "vmull.u8 q7, d11, d8\n"
909      "vmull.u8 q8, d10, d9\n"
910      "vmull.u8 q9, d11, d9\n"
911      "vpadal.u16 q0, q6\n"
912      "vpadal.u16 q1, q7\n"
913      "vpadal.u16 q2, q8\n"
914      "vpadal.u16 q3, q9\n"
915
916      // Loop break.
917      "bgt 1b\n"
918
919      // StaticQuantization::Prepare
920      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
921      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
922      "vdup.32 q6, %[multiplicative_offset]\n"
923      "vdup.32 q7, %[rounding_offset]\n"
924      "vdup.32 q8, %[shift]\n"
925      "vdup.32 q9, d8[0]\n"
926      "vdup.32 q4, d8[1]\n"
927
928      // RowMajorOutput::Prepare
929      "add r0, %[result], %[stride]\n"
930
931      // Reduce aggregators.
932      "vpadd.u32 d0, d0, d1\n"
933      "vpadd.u32 d2, d2, d3\n"
934      "vpadd.u32 d0, d0, d2\n"
935      "vpadd.u32 d4, d4, d5\n"
936      "vpadd.u32 d6, d6, d7\n"
937      "vpadd.u32 d4, d4, d6\n"
938
939      // StaticQuantization::Transform
940      "vadd.s32 q0, q0, q9\n"
941      "vadd.s32 q2, q2, q4\n"
942      "vadd.s32 q0, q0, q5\n"
943      "vadd.s32 q2, q2, q5\n"
944      "vmul.i32 q0, q0, q6\n"
945      "vmul.i32 q2, q2, q6\n"
946      "vadd.i32 q0, q0, q7\n"
947      "vadd.i32 q2, q2, q7\n"
948      "vshl.s32 q0, q0, q8\n"
949      "vshl.s32 q2, q2, q8\n"
950      "vqmovn.s32 d0, q0\n"
951      "vqmovn.s32 d4, q2\n"
952      "vqmovun.s16 d0, q0\n"
953      "vqmovun.s16 d4, q2\n"
954
955      // RowMajorOutput::Output
956      "vst1.16 {d0[0]}, [%[result]]!\n"
957      "vst1.16 {d4[0]}, [r0]!\n"
958      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
959      : [count] "r"(params.kernel.count),
960        [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
961        [shift] "r"(params.kernel.shift),
962        [stride] "r"(params.output_stream.stride),
963        [rounding_offset] "r"(params.kernel.rounding_offset)
964      : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
965        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "cc",
966        "memory");
967}
968
969template <>
970inline void
971MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 3,
972          8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
973                       const FusedKernelParams<QuantizedStaticPreprocessed,
974                                               RowMajor>& params,
975                       uint8_t* result) {
976#ifdef DEBUG
977#ifdef DEBUG_METAGEMM_VERBOSE
978  std::cout << __FILE__ << "(" << __LINE__
979            << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
980               "QuantizedStaticPreprocessed, RowMajor, 2, 3, 8>::Multiply()"
981            << std::endl
982            << std::flush;
983#endif
984#endif
985  asm volatile(
986      "pld [%[lhs]]\n"
987      "pld [%[rhs]]\n"
988
989      // Clear aggregators.
990      "vmov.i32 q0, #0\n"
991      "vmov.i32 q1, #0\n"
992      "vmov.i32 q2, #0\n"
993      "vmov.i32 q3, q0\n"
994      "vmov.i32 q4, q1\n"
995      "vmov.i32 q5, q2\n"
996
997      // General NxM lanes loop.
998      "1:"
999
1000      // Subtract counter.
1001      "subs %[count], %[count], #8\n"
1002
1003      "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
1004      "vld1.32 {d14, d15, d16}, [%[rhs]:64]!\n"
1005      "pld [%[lhs], #64]\n"
1006      "pld [%[rhs], #64]\n"
1007      "vmull.u8 q9, d14, d12\n"
1008      "vmull.u8 q10, d15, d12\n"
1009      "vmull.u8 q11, d16, d12\n"
1010      "vmull.u8 q12, d14, d13\n"
1011      "vmull.u8 q13, d15, d13\n"
1012      "vmull.u8 q14, d16, d13\n"
1013      "vpadal.u16 q0, q9\n"
1014      "vpadal.u16 q1, q10\n"
1015      "vpadal.u16 q2, q11\n"
1016      "vpadal.u16 q3, q12\n"
1017      "vpadal.u16 q4, q13\n"
1018      "vpadal.u16 q5, q14\n"
1019
1020      // Loop break.
1021      "bgt 1b\n"
1022
1023      // StaticQuantization::Prepare
1024      "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
1025      "vld1.32 {d14, d15}, [%[rhs]:64]!\n"
1026      "vdup.32 q8, %[multiplicative_offset]\n"
1027      "vdup.32 q9, %[rounding_offset]\n"
1028      "vdup.32 q10, %[shift]\n"
1029      "vdup.32 q11, d12[0]\n"
1030      "vdup.32 q6, d12[1]\n"
1031
1032      // RowMajorOutput::Prepare
1033      "add r0, %[result], %[stride]\n"
1034
1035      // Reduce aggregators.
1036      "vpadd.u32 d0, d0, d1\n"
1037      "vpadd.u32 d2, d2, d3\n"
1038      "vpadd.u32 d4, d4, d5\n"
1039      "vpadd.u32 d0, d0, d2\n"
1040      "vpadd.u32 d1, d4, d4\n"
1041      "vpadd.u32 d6, d6, d7\n"
1042      "vpadd.u32 d8, d8, d9\n"
1043      "vpadd.u32 d10, d10, d11\n"
1044      "vpadd.u32 d6, d6, d8\n"
1045      "vpadd.u32 d7, d10, d10\n"
1046
1047      // StaticQuantization::Transform
1048      "vadd.s32 q0, q0, q11\n"
1049      "vadd.s32 q3, q3, q6\n"
1050      "vadd.s32 q0, q0, q7\n"
1051      "vadd.s32 q3, q3, q7\n"
1052      "vmul.i32 q0, q0, q8\n"
1053      "vmul.i32 q3, q3, q8\n"
1054      "vadd.i32 q0, q0, q9\n"
1055      "vadd.i32 q3, q3, q9\n"
1056      "vshl.s32 q0, q0, q10\n"
1057      "vshl.s32 q3, q3, q10\n"
1058      "vqmovn.s32 d0, q0\n"
1059      "vqmovn.s32 d6, q3\n"
1060      "vqmovun.s16 d0, q0\n"
1061      "vqmovun.s16 d6, q3\n"
1062
1063      // RowMajorOutput::Output
1064      "vst1.16 {d0[0]}, [%[result]]!\n"
1065      "vst1.8 {d0[2]}, [%[result]]!\n"
1066      "vst1.16 {d6[0]}, [r0]!\n"
1067      "vst1.8 {d6[2]}, [r0]!\n"
1068      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
1069      : [count] "r"(params.kernel.count),
1070        [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
1071        [shift] "r"(params.kernel.shift),
1072        [stride] "r"(params.output_stream.stride),
1073        [rounding_offset] "r"(params.kernel.rounding_offset)
1074      : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
1075        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
1076        "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "cc",
1077        "memory");
1078}
1079
1080template <>
1081inline void
1082MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 4,
1083          8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
1084                       const FusedKernelParams<QuantizedStaticPreprocessed,
1085                                               RowMajor>& params,
1086                       uint8_t* result) {
1087#ifdef DEBUG
1088#ifdef DEBUG_METAGEMM_VERBOSE
1089  std::cout << __FILE__ << "(" << __LINE__
1090            << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
1091               "QuantizedStaticPreprocessed, RowMajor, 2, 4, 8>::Multiply()"
1092            << std::endl
1093            << std::flush;
1094#endif
1095#endif
1096  asm volatile(
1097      "pld [%[lhs]]\n"
1098      "pld [%[rhs]]\n"
1099
1100      // Clear aggregators.
1101      "vmov.i32 q0, #0\n"
1102      "vmov.i32 q1, #0\n"
1103      "vmov.i32 q2, #0\n"
1104      "vmov.i32 q3, q0\n"
1105      "vmov.i32 q4, q1\n"
1106      "vmov.i32 q5, q2\n"
1107      "vmov.i32 q6, q3\n"
1108      "vmov.i32 q7, q4\n"
1109
1110      // 2x4 lanes loop.
1111      "1:"
1112
1113      "vld1.8 {d18, d19, d20, d21}, [%[rhs]:256]!\n"
1114      "vld1.8 {d16}, [%[lhs]:64]!\n"
1115      "vmull.u8 q11, d16, d18\n"
1116      "vld1.8 {d17}, [%[lhs]:64]!\n"
1117      "vmull.u8 q12, d16, d19\n"
1118      "pld [%[rhs], #64]\n"
1119      "vmull.u8 q13, d16, d20\n"
1120      "pld [%[lhs], #64]\n"
1121      "vmull.u8 q14, d16, d21\n"
1122      "vmull.u8 q15, d17, d18\n"
1123      "vpadal.u16 q0, q11\n"
1124      "vpadal.u16 q1, q12\n"
1125      "vpadal.u16 q2, q13\n"
1126      "vmull.u8 q11, d17, d19\n"
1127      "vmull.u8 q12, d17, d20\n"
1128      "vmull.u8 q13, d17, d21\n"
1129
1130      // Subtract counter.
1131      "subs %[count], %[count], #8\n"
1132
1133      "vpadal.u16 q3, q14\n"
1134      "vpadal.u16 q4, q15\n"
1135      "vpadal.u16 q5, q11\n"
1136      "vpadal.u16 q6, q12\n"
1137      "vpadal.u16 q7, q13\n"
1138
1139      // Loop break.
1140      "bgt 1b\n"
1141
1142      // StaticQuantization::Prepare
1143      "vld1.32 {d16, d17}, [%[lhs]:64]!\n"
1144      "vld1.32 {d18, d19}, [%[rhs]:64]!\n"
1145      "vdup.32 q10, %[multiplicative_offset]\n"
1146      "vdup.32 q11, %[rounding_offset]\n"
1147      "vdup.32 q12, %[shift]\n"
1148      "vdup.32 q13, d16[0]\n"
1149      "vdup.32 q8, d16[1]\n"
1150
1151      // RowMajorOutput::Prepare
1152      "add r0, %[result], %[stride]\n"
1153
1154      // Reduce aggregators.
1155      "vpadd.u32 d0, d0, d1\n"
1156      "vpadd.u32 d2, d2, d3\n"
1157      "vpadd.u32 d4, d4, d5\n"
1158      "vpadd.u32 d6, d6, d7\n"
1159      "vpadd.u32 d0, d0, d2\n"
1160      "vpadd.u32 d1, d4, d6\n"
1161      "vpadd.u32 d8, d8, d9\n"
1162      "vpadd.u32 d10, d10, d11\n"
1163      "vpadd.u32 d12, d12, d13\n"
1164      "vpadd.u32 d14, d14, d15\n"
1165      "vpadd.u32 d8, d8, d10\n"
1166      "vpadd.u32 d9, d12, d14\n"
1167
1168      // StaticQuantization::Transform
1169      "vadd.s32 q0, q0, q13\n"
1170      "vadd.s32 q4, q4, q8\n"
1171      "vadd.s32 q0, q0, q9\n"
1172      "vadd.s32 q4, q4, q9\n"
1173      "vmul.i32 q0, q0, q10\n"
1174      "vmul.i32 q4, q4, q10\n"
1175      "vadd.i32 q0, q0, q11\n"
1176      "vadd.i32 q4, q4, q11\n"
1177      "vshl.s32 q0, q0, q12\n"
1178      "vshl.s32 q4, q4, q12\n"
1179      "vqmovn.s32 d0, q0\n"
1180      "vqmovn.s32 d8, q4\n"
1181      "vqmovun.s16 d0, q0\n"
1182      "vqmovun.s16 d8, q4\n"
1183
1184      // RowMajorOutput::Output
1185      "vst1.32 {d0[0]}, [%[result]]!\n"
1186      "vst1.32 {d8[0]}, [r0]!\n"
1187      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
1188      : [count] "r"(params.kernel.count),
1189        [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
1190        [shift] "r"(params.kernel.shift),
1191        [stride] "r"(params.output_stream.stride),
1192        [rounding_offset] "r"(params.kernel.rounding_offset)
1193      : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
1194        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
1195        "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
1196        "d31", "cc", "memory");
1197}
1198
1199template <>
1200inline void
1201MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 3, 1,
1202          8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
1203                       const FusedKernelParams<QuantizedStaticPreprocessed,
1204                                               RowMajor>& params,
1205                       uint8_t* result) {
1206#ifdef DEBUG
1207#ifdef DEBUG_METAGEMM_VERBOSE
1208  std::cout << __FILE__ << "(" << __LINE__
1209            << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
1210               "QuantizedStaticPreprocessed, RowMajor, 3, 1, 8>::Multiply()"
1211            << std::endl
1212            << std::flush;
1213#endif
1214#endif
1215  asm volatile(
1216      "pld [%[lhs]]\n"
1217      "pld [%[rhs]]\n"
1218
1219      // Clear aggregators.
1220      "vmov.i32 q0, #0\n"
1221      "vmov.i32 q1, #0\n"
1222      "vmov.i32 q2, #0\n"
1223
1224      // General NxM lanes loop.
1225      "1:"
1226
1227      // Subtract counter.
1228      "subs %[count], %[count], #8\n"
1229
1230      "vld1.32 {d6, d7, d8}, [%[lhs]:64]!\n"
1231      "vld1.32 {d9}, [%[rhs]:64]!\n"
1232      "pld [%[lhs], #64]\n"
1233      "pld [%[rhs], #64]\n"
1234      "vmull.u8 q5, d9, d6\n"
1235      "vmull.u8 q6, d9, d7\n"
1236      "vmull.u8 q7, d9, d8\n"
1237      "vpadal.u16 q0, q5\n"
1238      "vpadal.u16 q1, q6\n"
1239      "vpadal.u16 q2, q7\n"
1240
1241      // Loop break.
1242      "bgt 1b\n"
1243
1244      // StaticQuantization::Prepare
1245      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
1246      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
1247      "vdup.32 q6, %[multiplicative_offset]\n"
1248      "vdup.32 q7, %[rounding_offset]\n"
1249      "vdup.32 q8, %[shift]\n"
1250      "vdup.32 q3, d8[0]\n"
1251      "vdup.32 q9, d8[1]\n"
1252      "vdup.32 q4, d9[0]\n"
1253
1254      // RowMajorOutput::Prepare
1255      "add r0, %[result], %[stride]\n"
1256      "add r1, r0, %[stride]\n"
1257
1258      // Reduce aggregators.
1259      "vpadd.u32 d0, d0, d1\n"
1260      "vpadd.u32 d0, d0, d0\n"
1261      "vpadd.u32 d2, d2, d3\n"
1262      "vpadd.u32 d2, d2, d2\n"
1263      "vpadd.u32 d4, d4, d5\n"
1264      "vpadd.u32 d4, d4, d4\n"
1265
1266      // StaticQuantization::Transform
1267      "vadd.s32 q0, q0, q3\n"
1268      "vadd.s32 q1, q1, q9\n"
1269      "vadd.s32 q2, q2, q4\n"
1270      "vadd.s32 q0, q0, q5\n"
1271      "vadd.s32 q1, q1, q5\n"
1272      "vadd.s32 q2, q2, q5\n"
1273      "vmul.i32 q0, q0, q6\n"
1274      "vmul.i32 q1, q1, q6\n"
1275      "vmul.i32 q2, q2, q6\n"
1276      "vadd.i32 q0, q0, q7\n"
1277      "vadd.i32 q1, q1, q7\n"
1278      "vadd.i32 q2, q2, q7\n"
1279      "vshl.s32 q0, q0, q8\n"
1280      "vshl.s32 q1, q1, q8\n"
1281      "vshl.s32 q2, q2, q8\n"
1282      "vqmovn.s32 d0, q0\n"
1283      "vqmovn.s32 d2, q1\n"
1284      "vqmovn.s32 d4, q2\n"
1285      "vqmovun.s16 d0, q0\n"
1286      "vqmovun.s16 d2, q1\n"
1287      "vqmovun.s16 d4, q2\n"
1288
1289      // RowMajorOutput::Output
1290      "vst1.8 {d0[0]}, [%[result]]!\n"
1291      "vst1.8 {d2[0]}, [r0]!\n"
1292      "vst1.8 {d4[0]}, [r1]!\n"
1293      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
1294      : [count] "r"(params.kernel.count),
1295        [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
1296        [shift] "r"(params.kernel.shift),
1297        [stride] "r"(params.output_stream.stride),
1298        [rounding_offset] "r"(params.kernel.rounding_offset)
1299      : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
1300        "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
1301        "cc", "memory");
1302}
1303
1304template <>
1305inline void
1306MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 3, 2,
1307          8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
1308                       const FusedKernelParams<QuantizedStaticPreprocessed,
1309                                               RowMajor>& params,
1310                       uint8_t* result) {
1311#ifdef DEBUG
1312#ifdef DEBUG_METAGEMM_VERBOSE
1313  std::cout << __FILE__ << "(" << __LINE__
1314            << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
1315               "QuantizedStaticPreprocessed, RowMajor, 3, 2, 8>::Multiply()"
1316            << std::endl
1317            << std::flush;
1318#endif
1319#endif
1320  asm volatile(
1321      "pld [%[lhs]]\n"
1322      "pld [%[rhs]]\n"
1323
1324      // Clear aggregators.
1325      "vmov.i32 q0, #0\n"
1326      "vmov.i32 q1, #0\n"
1327      "vmov.i32 q2, #0\n"
1328      "vmov.i32 q3, q0\n"
1329      "vmov.i32 q4, q1\n"
1330      "vmov.i32 q5, q2\n"
1331
1332      // General NxM lanes loop.
1333      "1:"
1334
1335      // Subtract counter.
1336      "subs %[count], %[count], #8\n"
1337
1338      "vld1.32 {d12, d13, d14}, [%[lhs]:64]!\n"
1339      "vld1.32 {d15, d16}, [%[rhs]:64]!\n"
1340      "pld [%[lhs], #64]\n"
1341      "pld [%[rhs], #64]\n"
1342      "vmull.u8 q9, d15, d12\n"
1343      "vmull.u8 q10, d16, d12\n"
1344      "vmull.u8 q11, d15, d13\n"
1345      "vmull.u8 q12, d16, d13\n"
1346      "vmull.u8 q13, d15, d14\n"
1347      "vmull.u8 q14, d16, d14\n"
1348      "vpadal.u16 q0, q9\n"
1349      "vpadal.u16 q1, q10\n"
1350      "vpadal.u16 q2, q11\n"
1351      "vpadal.u16 q3, q12\n"
1352      "vpadal.u16 q4, q13\n"
1353      "vpadal.u16 q5, q14\n"
1354
1355      // Loop break.
1356      "bgt 1b\n"
1357
1358      // StaticQuantization::Prepare
1359      "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
1360      "vld1.32 {d14, d15}, [%[rhs]:64]!\n"
1361      "vdup.32 q8, %[multiplicative_offset]\n"
1362      "vdup.32 q9, %[rounding_offset]\n"
1363      "vdup.32 q10, %[shift]\n"
1364      "vdup.32 q11, d12[0]\n"
1365      "vdup.32 q12, d12[1]\n"
1366      "vdup.32 q6, d13[0]\n"
1367
1368      // RowMajorOutput::Prepare
1369      "add r0, %[result], %[stride]\n"
1370      "add r1, r0, %[stride]\n"
1371
1372      // Reduce aggregators.
1373      "vpadd.u32 d0, d0, d1\n"
1374      "vpadd.u32 d2, d2, d3\n"
1375      "vpadd.u32 d0, d0, d2\n"
1376      "vpadd.u32 d4, d4, d5\n"
1377      "vpadd.u32 d6, d6, d7\n"
1378      "vpadd.u32 d4, d4, d6\n"
1379      "vpadd.u32 d8, d8, d9\n"
1380      "vpadd.u32 d10, d10, d11\n"
1381      "vpadd.u32 d8, d8, d10\n"
1382
1383      // StaticQuantization::Transform
1384      "vadd.s32 q0, q0, q11\n"
1385      "vadd.s32 q2, q2, q12\n"
1386      "vadd.s32 q4, q4, q6\n"
1387      "vadd.s32 q0, q0, q7\n"
1388      "vadd.s32 q2, q2, q7\n"
1389      "vadd.s32 q4, q4, q7\n"
1390      "vmul.i32 q0, q0, q8\n"
1391      "vmul.i32 q2, q2, q8\n"
1392      "vmul.i32 q4, q4, q8\n"
1393      "vadd.i32 q0, q0, q9\n"
1394      "vadd.i32 q2, q2, q9\n"
1395      "vadd.i32 q4, q4, q9\n"
1396      "vshl.s32 q0, q0, q10\n"
1397      "vshl.s32 q2, q2, q10\n"
1398      "vshl.s32 q4, q4, q10\n"
1399      "vqmovn.s32 d0, q0\n"
1400      "vqmovn.s32 d4, q2\n"
1401      "vqmovn.s32 d8, q4\n"
1402      "vqmovun.s16 d0, q0\n"
1403      "vqmovun.s16 d4, q2\n"
1404      "vqmovun.s16 d8, q4\n"
1405
1406      // RowMajorOutput::Output
1407      "vst1.16 {d0[0]}, [%[result]]!\n"
1408      "vst1.16 {d4[0]}, [r0]!\n"
1409      "vst1.16 {d8[0]}, [r1]!\n"
1410      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
1411      : [count] "r"(params.kernel.count),
1412        [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
1413        [shift] "r"(params.kernel.shift),
1414        [stride] "r"(params.output_stream.stride),
1415        [rounding_offset] "r"(params.kernel.rounding_offset)
1416      : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
1417        "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
1418        "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
1419        "cc", "memory");
1420}
1421
1422template <>
1423inline void
1424MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 3, 3,
1425          8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
1426                       const FusedKernelParams<QuantizedStaticPreprocessed,
1427                                               RowMajor>& params,
1428                       uint8_t* result) {
1429#ifdef DEBUG
1430#ifdef DEBUG_METAGEMM_VERBOSE
1431  std::cout << __FILE__ << "(" << __LINE__
1432            << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
1433               "QuantizedStaticPreprocessed, RowMajor, 3, 3, 8>::Multiply()"
1434            << std::endl
1435            << std::flush;
1436#endif
1437#endif
1438  asm volatile(
1439      "pld [%[lhs]]\n"
1440      "pld [%[rhs]]\n"
1441
1442      // Clear aggregators.
1443      "vmov.i32 q0, #0\n"
1444      "vmov.i32 q1, #0\n"
1445      "vmov.i32 q2, #0\n"
1446      "vmov.i32 q3, q0\n"
1447      "vmov.i32 q4, q1\n"
1448      "vmov.i32 q5, q2\n"
1449      "vmov.i32 q6, q3\n"
1450      "vmov.i32 q7, q4\n"
1451      "vmov.i32 q8, q5\n"
1452
1453      // 3x3 lanes loop.
1454      "1:"
1455
1456      "vld1.8 {d21, d22, d23}, [%[rhs]:64]!\n"
1457      "vld1.8 {d18}, [%[lhs]:64]!\n"
1458      "vmull.u8 q12, d18, d21\n"
1459      "vld1.8 {d19}, [%[lhs]:64]!\n"
1460      "vmull.u8 q13, d18, d22\n"
1461      "vld1.8 {d20}, [%[lhs]:64]!\n"
1462      "vmull.u8 q14, d18, d23\n"
1463      "pld [%[lhs], #64]\n"
1464      "vmull.u8 q15, d19, d21\n"
1465      "pld [%[rhs], #64]\n"
1466      "vpadal.u16 q0, q12\n"
1467      "vpadal.u16 q1, q13\n"
1468      "vpadal.u16 q2, q14\n"
1469      "vpadal.u16 q3, q15\n"
1470      "vmull.u8 q12, d19, d22\n"
1471      "vmull.u8 q13, d19, d23\n"
1472      "vmull.u8 q14, d20, d21\n"
1473      "vmull.u8 q15, d20, d22\n"
1474
1475      // Subtract counter.
1476      "subs %[count], %[count], #8\n"
1477
1478      "vmull.u8 q9, d20, d23\n"
1479      "vpadal.u16 q4, q12\n"
1480      "vpadal.u16 q5, q13\n"
1481      "vpadal.u16 q6, q14\n"
1482      "vpadal.u16 q7, q15\n"
1483      "vpadal.u16 q8, q9\n"
1484
1485      // Loop break.
1486      "bgt 1b\n"
1487
1488      // StaticQuantization::Prepare
1489      "vld1.32 {d18, d19}, [%[lhs]:64]!\n"
1490      "vld1.32 {d20, d21}, [%[rhs]:64]!\n"
1491      "vdup.32 q11, %[multiplicative_offset]\n"
1492      "vdup.32 q12, %[rounding_offset]\n"
1493      "vdup.32 q13, %[shift]\n"
1494      "vdup.32 q14, d18[0]\n"
1495      "vdup.32 q15, d18[1]\n"
1496      "vdup.32 q9, d19[0]\n"
1497
1498      // RowMajorOutput::Prepare
1499      "add r0, %[result], %[stride]\n"
1500      "add r1, r0, %[stride]\n"
1501
1502      // Reduce aggregators.
1503      "vpadd.u32 d0, d0, d1\n"
1504      "vpadd.u32 d2, d2, d3\n"
1505      "vpadd.u32 d4, d4, d5\n"
1506      "vpadd.u32 d0, d0, d2\n"
1507      "vpadd.u32 d1, d4, d4\n"
1508      "vpadd.u32 d6, d6, d7\n"
1509      "vpadd.u32 d8, d8, d9\n"
1510      "vpadd.u32 d10, d10, d11\n"
1511      "vpadd.u32 d6, d6, d8\n"
1512      "vpadd.u32 d7, d10, d10\n"
1513      "vpadd.u32 d12, d12, d13\n"
1514      "vpadd.u32 d14, d14, d15\n"
1515      "vpadd.u32 d16, d16, d17\n"
1516      "vpadd.u32 d12, d12, d14\n"
1517      "vpadd.u32 d13, d16, d16\n"
1518
1519      // StaticQuantization::Transform
1520      "vadd.s32 q0, q0, q14\n"
1521      "vadd.s32 q3, q3, q15\n"
1522      "vadd.s32 q6, q6, q9\n"
1523      "vadd.s32 q0, q0, q10\n"
1524      "vadd.s32 q3, q3, q10\n"
1525      "vadd.s32 q6, q6, q10\n"
1526      "vmul.i32 q0, q0, q11\n"
1527      "vmul.i32 q3, q3, q11\n"
1528      "vmul.i32 q6, q6, q11\n"
1529      "vadd.i32 q0, q0, q12\n"
1530      "vadd.i32 q3, q3, q12\n"
1531      "vadd.i32 q6, q6, q12\n"
1532      "vshl.s32 q0, q0, q13\n"
1533      "vshl.s32 q3, q3, q13\n"
1534      "vshl.s32 q6, q6, q13\n"
1535      "vqmovn.s32 d0, q0\n"
1536      "vqmovn.s32 d6, q3\n"
1537      "vqmovn.s32 d12, q6\n"
1538      "vqmovun.s16 d0, q0\n"
1539      "vqmovun.s16 d6, q3\n"
1540      "vqmovun.s16 d12, q6\n"
1541
1542      // RowMajorOutput::Output
1543      "vst1.16 {d0[0]}, [%[result]]!\n"
1544      "vst1.8 {d0[2]}, [%[result]]!\n"
1545      "vst1.16 {d6[0]}, [r0]!\n"
1546      "vst1.8 {d6[2]}, [r0]!\n"
1547      "vst1.16 {d12[0]}, [r1]!\n"
1548      "vst1.8 {d12[2]}, [r1]!\n"
1549      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
1550      : [count] "r"(params.kernel.count),
1551        [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
1552        [shift] "r"(params.kernel.shift),
1553        [stride] "r"(params.output_stream.stride),
1554        [rounding_offset] "r"(params.kernel.rounding_offset)
1555      : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
1556        "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
1557        "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
1558        "d30", "d31", "cc", "memory");
1559}
1560
1561template <>
1562inline void MulKernel<
1563    uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 1,
1564    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
1565                 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
1566                                         RowMajor>& params,
1567                 int32_t* result) {
1568#ifdef DEBUG
1569#ifdef DEBUG_METAGEMM_VERBOSE
1570  std::cout << __FILE__ << "(" << __LINE__
1571            << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
1572               "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 1, "
1573               "8>::Multiply()"
1574            << std::endl
1575            << std::flush;
1576#endif
1577#endif
1578  asm volatile(
1579      "pld [%[lhs]]\n"
1580      "pld [%[rhs]]\n"
1581
1582      // Clear aggregators.
1583      "vmov.i32 q0, #0\n"
1584
1585      // General NxM lanes loop.
1586      "1:"
1587
1588      // Subtract counter.
1589      "subs %[count], %[count], #8\n"
1590
1591      "vld1.32 {d2}, [%[lhs]:64]!\n"
1592      "vld1.32 {d3}, [%[rhs]:64]!\n"
1593      "pld [%[lhs], #64]\n"
1594      "pld [%[rhs], #64]\n"
1595      "vmull.u8 q2, d3, d2\n"
1596      "vpadal.u16 q0, q2\n"
1597
1598      // Loop break.
1599      "bgt 1b\n"
1600
1601      // StaticQuantizationInt32::Prepare
1602      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
1603      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
1604      "vdup.32 q4, d8[0]\n"
1605
1606      // RowMajorOutput::Prepare
1607
1608      // Reduce aggregators.
1609      "vpadd.u32 d0, d0, d1\n"
1610      "vpadd.u32 d0, d0, d0\n"
1611
1612      // StaticQuantizationInt32::Transform
1613      "vadd.s32 q0, q0, q4\n"
1614      "vadd.s32 q0, q0, q5\n"
1615
1616      // RowMajorOutput::Output
1617      "vst1.32 {d0[0]}, [%[result]]!\n"
1618      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
1619      : [count] "r"(params.kernel.count),
1620        [stride] "r"(params.output_stream.stride)
1621      : "d0", "d1", "d2", "d3", "d4", "d5", "d8", "d9", "d10", "d11", "cc",
1622        "memory");
1623}
1624
1625template <>
1626inline void MulKernel<
1627    uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 2,
1628    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
1629                 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
1630                                         RowMajor>& params,
1631                 int32_t* result) {
1632#ifdef DEBUG
1633#ifdef DEBUG_METAGEMM_VERBOSE
1634  std::cout << __FILE__ << "(" << __LINE__
1635            << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
1636               "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 2, "
1637               "8>::Multiply()"
1638            << std::endl
1639            << std::flush;
1640#endif
1641#endif
1642  asm volatile(
1643      "pld [%[lhs]]\n"
1644      "pld [%[rhs]]\n"
1645
1646      // Clear aggregators.
1647      "vmov.i32 q0, #0\n"
1648      "vmov.i32 q1, #0\n"
1649
1650      // General NxM lanes loop.
1651      "1:"
1652
1653      // Subtract counter.
1654      "subs %[count], %[count], #8\n"
1655
1656      "vld1.32 {d4}, [%[lhs]:64]!\n"
1657      "vld1.32 {d5, d6}, [%[rhs]:64]!\n"
1658      "pld [%[lhs], #64]\n"
1659      "pld [%[rhs], #64]\n"
1660      "vmull.u8 q4, d5, d4\n"
1661      "vmull.u8 q5, d6, d4\n"
1662      "vpadal.u16 q0, q4\n"
1663      "vpadal.u16 q1, q5\n"
1664
1665      // Loop break.
1666      "bgt 1b\n"
1667
1668      // StaticQuantizationInt32::Prepare
1669      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
1670      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
1671      "vdup.32 q4, d8[0]\n"
1672
1673      // RowMajorOutput::Prepare
1674
1675      // Reduce aggregators.
1676      "vpadd.u32 d0, d0, d1\n"
1677      "vpadd.u32 d2, d2, d3\n"
1678      "vpadd.u32 d0, d0, d2\n"
1679
1680      // StaticQuantizationInt32::Transform
1681      "vadd.s32 q0, q0, q4\n"
1682      "vadd.s32 q0, q0, q5\n"
1683
1684      // RowMajorOutput::Output
1685      "vst1.32 {d0}, [%[result]]!\n"
1686      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
1687      : [count] "r"(params.kernel.count),
1688        [stride] "r"(params.output_stream.stride)
1689      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d8", "d9", "d10", "d11",
1690        "cc", "memory");
1691}
1692
1693template <>
1694inline void MulKernel<
1695    uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 3,
1696    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
1697                 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
1698                                         RowMajor>& params,
1699                 int32_t* result) {
1700#ifdef DEBUG
1701#ifdef DEBUG_METAGEMM_VERBOSE
1702  std::cout << __FILE__ << "(" << __LINE__
1703            << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
1704               "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 3, "
1705               "8>::Multiply()"
1706            << std::endl
1707            << std::flush;
1708#endif
1709#endif
1710  asm volatile(
1711      "pld [%[lhs]]\n"
1712      "pld [%[rhs]]\n"
1713
1714      // Clear aggregators.
1715      "vmov.i32 q0, #0\n"
1716      "vmov.i32 q1, #0\n"
1717      "vmov.i32 q2, #0\n"
1718
1719      // General NxM lanes loop.
1720      "1:"
1721
1722      // Subtract counter.
1723      "subs %[count], %[count], #8\n"
1724
1725      "vld1.32 {d6}, [%[lhs]:64]!\n"
1726      "vld1.32 {d7, d8, d9}, [%[rhs]:64]!\n"
1727      "pld [%[lhs], #64]\n"
1728      "pld [%[rhs], #64]\n"
1729      "vmull.u8 q5, d7, d6\n"
1730      "vmull.u8 q6, d8, d6\n"
1731      "vmull.u8 q7, d9, d6\n"
1732      "vpadal.u16 q0, q5\n"
1733      "vpadal.u16 q1, q6\n"
1734      "vpadal.u16 q2, q7\n"
1735
1736      // Loop break.
1737      "bgt 1b\n"
1738
1739      // StaticQuantizationInt32::Prepare
1740      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
1741      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
1742      "vdup.32 q4, d8[0]\n"
1743
1744      // RowMajorOutput::Prepare
1745
1746      // Reduce aggregators.
1747      "vpadd.u32 d0, d0, d1\n"
1748      "vpadd.u32 d2, d2, d3\n"
1749      "vpadd.u32 d4, d4, d5\n"
1750      "vpadd.u32 d0, d0, d2\n"
1751      "vpadd.u32 d1, d4, d4\n"
1752
1753      // StaticQuantizationInt32::Transform
1754      "vadd.s32 q0, q0, q4\n"
1755      "vadd.s32 q0, q0, q5\n"
1756
1757      // RowMajorOutput::Output
1758      "vst1.32 {d0}, [%[result]]!\n"
1759      "vst1.32 {d1[0]}, [%[result]]!\n"
1760      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
1761      : [count] "r"(params.kernel.count),
1762        [stride] "r"(params.output_stream.stride)
1763      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
1764        "d11", "d12", "d13", "d14", "d15", "cc", "memory");
1765}
1766
1767template <>
1768inline void MulKernel<
1769    uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 4,
1770    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
1771                 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
1772                                         RowMajor>& params,
1773                 int32_t* result) {
1774#ifdef DEBUG
1775#ifdef DEBUG_METAGEMM_VERBOSE
1776  std::cout << __FILE__ << "(" << __LINE__
1777            << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
1778               "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 4, "
1779               "8>::Multiply()"
1780            << std::endl
1781            << std::flush;
1782#endif
1783#endif
1784  asm volatile(
1785      "pld [%[lhs]]\n"
1786      "pld [%[rhs]]\n"
1787
1788      // Clear aggregators.
1789      "vmov.i32 q0, #0\n"
1790      "vmov.i32 q1, #0\n"
1791      "vmov.i32 q2, #0\n"
1792      "vmov.i32 q3, q0\n"
1793
1794      // General NxM lanes loop.
1795      "1:"
1796
1797      // Subtract counter.
1798      "subs %[count], %[count], #8\n"
1799
1800      "vld1.32 {d8}, [%[lhs]:64]!\n"
1801      "vld1.32 {d9, d10, d11, d12}, [%[rhs]:64]!\n"
1802      "pld [%[lhs], #64]\n"
1803      "pld [%[rhs], #64]\n"
1804      "vmull.u8 q7, d9, d8\n"
1805      "vmull.u8 q8, d10, d8\n"
1806      "vmull.u8 q9, d11, d8\n"
1807      "vmull.u8 q10, d12, d8\n"
1808      "vpadal.u16 q0, q7\n"
1809      "vpadal.u16 q1, q8\n"
1810      "vpadal.u16 q2, q9\n"
1811      "vpadal.u16 q3, q10\n"
1812
1813      // Loop break.
1814      "bgt 1b\n"
1815
1816      // StaticQuantizationInt32::Prepare
1817      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
1818      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
1819      "vdup.32 q4, d8[0]\n"
1820
1821      // RowMajorOutput::Prepare
1822
1823      // Reduce aggregators.
1824      "vpadd.u32 d0, d0, d1\n"
1825      "vpadd.u32 d2, d2, d3\n"
1826      "vpadd.u32 d4, d4, d5\n"
1827      "vpadd.u32 d6, d6, d7\n"
1828      "vpadd.u32 d0, d0, d2\n"
1829      "vpadd.u32 d1, d4, d6\n"
1830
1831      // StaticQuantizationInt32::Transform
1832      "vadd.s32 q0, q0, q4\n"
1833      "vadd.s32 q0, q0, q5\n"
1834
1835      // RowMajorOutput::Output
1836      "vst1.32 {d0, d1}, [%[result]]!\n"
1837      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
1838      : [count] "r"(params.kernel.count),
1839        [stride] "r"(params.output_stream.stride)
1840      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
1841        "d11", "d12", "d14", "d15", "d16", "d17", "d18", "d19", "d20", "d21",
1842        "cc", "memory");
1843}
1844
1845template <>
1846inline void MulKernel<
1847    uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 5,
1848    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
1849                 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
1850                                         RowMajor>& params,
1851                 int32_t* result) {
1852#ifdef DEBUG
1853#ifdef DEBUG_METAGEMM_VERBOSE
1854  std::cout << __FILE__ << "(" << __LINE__
1855            << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
1856               "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 5, "
1857               "8>::Multiply()"
1858            << std::endl
1859            << std::flush;
1860#endif
1861#endif
1862  asm volatile(
1863      "pld [%[lhs]]\n"
1864      "pld [%[rhs]]\n"
1865
1866      // Clear aggregators.
1867      "vmov.i32 q0, #0\n"
1868      "vmov.i32 q1, #0\n"
1869      "vmov.i32 q2, #0\n"
1870      "vmov.i32 q3, q0\n"
1871      "vmov.i32 q4, q1\n"
1872
1873      // General 1xM lanes loop.
1874      "1:"
1875
1876      // Subtract counter.
1877      "subs %[count], %[count], #8\n"
1878
1879      "vld1.32 {d10, d11, d12, d13}, [%[rhs]:64]!\n"
1880      "vld1.32 {d14}, [%[lhs]:64]!\n"
1881      "pld [%[lhs], #64]\n"
1882      "vmull.u8 q8, d10, d14\n"
1883      "vmull.u8 q9, d11, d14\n"
1884      "vmull.u8 q10, d12, d14\n"
1885      "vmull.u8 q11, d13, d14\n"
1886      "vld1.32 {d10}, [%[rhs]:64]!\n"
1887      "pld [%[rhs], #128]\n"
1888      "vpadal.u16 q0, q8\n"
1889      "vpadal.u16 q1, q9\n"
1890      "vpadal.u16 q2, q10\n"
1891      "vpadal.u16 q3, q11\n"
1892      "vmull.u8 q8, d10, d14\n"
1893      "vpadal.u16 q4, q8\n"
1894
1895      // Loop break.
1896      "bgt 1b\n"
1897
1898      // StaticQuantizationInt32::Prepare
1899      "vld1.32 {d10, d11}, [%[lhs]:64]!\n"
1900      "vld1.32 {d12, d13, d14, d15}, [%[rhs]:64]!\n"
1901      "vdup.32 q5, d10[0]\n"
1902
1903      // RowMajorOutput::Prepare
1904
1905      // Reduce aggregators.
1906      "vpadd.u32 d0, d0, d1\n"
1907      "vpadd.u32 d2, d2, d3\n"
1908      "vpadd.u32 d4, d4, d5\n"
1909      "vpadd.u32 d6, d6, d7\n"
1910      "vpadd.u32 d8, d8, d9\n"
1911      "vpadd.u32 d0, d0, d2\n"
1912      "vpadd.u32 d1, d4, d6\n"
1913      "vpadd.u32 d2, d8, d8\n"
1914
1915      // StaticQuantizationInt32::Transform
1916      "vadd.s32 q0, q0, q5\n"
1917      "vadd.s32 q1, q1, q5\n"
1918      "vadd.s32 q0, q0, q6\n"
1919      "vadd.s32 q1, q1, q7\n"
1920
1921      // RowMajorOutput::Output
1922      "vst1.32 {d0, d1}, [%[result]]!\n"
1923      "vst1.32 {d2[0]}, [%[result]]!\n"
1924      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
1925      : [count] "r"(params.kernel.count),
1926        [stride] "r"(params.output_stream.stride)
1927      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
1928        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
1929        "d21", "d22", "d23", "cc", "memory");
1930}
1931
1932template <>
1933inline void MulKernel<
1934    uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 6,
1935    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
1936                 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
1937                                         RowMajor>& params,
1938                 int32_t* result) {
1939#ifdef DEBUG
1940#ifdef DEBUG_METAGEMM_VERBOSE
1941  std::cout << __FILE__ << "(" << __LINE__
1942            << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
1943               "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 6, "
1944               "8>::Multiply()"
1945            << std::endl
1946            << std::flush;
1947#endif
1948#endif
1949  asm volatile(
1950      "pld [%[lhs]]\n"
1951      "pld [%[rhs]]\n"
1952
1953      // Clear aggregators.
1954      "vmov.i32 q0, #0\n"
1955      "vmov.i32 q1, #0\n"
1956      "vmov.i32 q2, #0\n"
1957      "vmov.i32 q3, q0\n"
1958      "vmov.i32 q4, q1\n"
1959      "vmov.i32 q5, q2\n"
1960
1961      // General 1xM lanes loop.
1962      "1:"
1963
1964      // Subtract counter.
1965      "subs %[count], %[count], #8\n"
1966
1967      "vld1.32 {d12, d13, d14, d15}, [%[rhs]:64]!\n"
1968      "vld1.32 {d16}, [%[lhs]:64]!\n"
1969      "pld [%[lhs], #64]\n"
1970      "vmull.u8 q9, d12, d16\n"
1971      "vmull.u8 q10, d13, d16\n"
1972      "vmull.u8 q11, d14, d16\n"
1973      "vmull.u8 q12, d15, d16\n"
1974      "vld1.32 {d12, d13}, [%[rhs]:64]!\n"
1975      "pld [%[rhs], #128]\n"
1976      "vpadal.u16 q0, q9\n"
1977      "vpadal.u16 q1, q10\n"
1978      "vpadal.u16 q2, q11\n"
1979      "vpadal.u16 q3, q12\n"
1980      "vmull.u8 q9, d12, d16\n"
1981      "vmull.u8 q10, d13, d16\n"
1982      "vpadal.u16 q4, q9\n"
1983      "vpadal.u16 q5, q10\n"
1984
1985      // Loop break.
1986      "bgt 1b\n"
1987
1988      // StaticQuantizationInt32::Prepare
1989      "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
1990      "vld1.32 {d14, d15, d16, d17}, [%[rhs]:64]!\n"
1991      "vdup.32 q6, d12[0]\n"
1992
1993      // RowMajorOutput::Prepare
1994
1995      // Reduce aggregators.
1996      "vpadd.u32 d0, d0, d1\n"
1997      "vpadd.u32 d2, d2, d3\n"
1998      "vpadd.u32 d4, d4, d5\n"
1999      "vpadd.u32 d6, d6, d7\n"
2000      "vpadd.u32 d8, d8, d9\n"
2001      "vpadd.u32 d10, d10, d11\n"
2002      "vpadd.u32 d0, d0, d2\n"
2003      "vpadd.u32 d1, d4, d6\n"
2004      "vpadd.u32 d2, d8, d10\n"
2005
2006      // StaticQuantizationInt32::Transform
2007      "vadd.s32 q0, q0, q6\n"
2008      "vadd.s32 q1, q1, q6\n"
2009      "vadd.s32 q0, q0, q7\n"
2010      "vadd.s32 q1, q1, q8\n"
2011
2012      // RowMajorOutput::Output
2013      "vst1.32 {d0, d1, d2}, [%[result]]!\n"
2014      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2015      : [count] "r"(params.kernel.count),
2016        [stride] "r"(params.output_stream.stride)
2017      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
2018        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
2019        "d21", "d22", "d23", "d24", "d25", "cc", "memory");
2020}
2021
2022template <>
2023inline void MulKernel<
2024    uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 7,
2025    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2026                 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
2027                                         RowMajor>& params,
2028                 int32_t* result) {
2029#ifdef DEBUG
2030#ifdef DEBUG_METAGEMM_VERBOSE
2031  std::cout << __FILE__ << "(" << __LINE__
2032            << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
2033               "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 7, "
2034               "8>::Multiply()"
2035            << std::endl
2036            << std::flush;
2037#endif
2038#endif
2039  asm volatile(
2040      "pld [%[lhs]]\n"
2041      "pld [%[rhs]]\n"
2042
2043      // Clear aggregators.
2044      "vmov.i32 q0, #0\n"
2045      "vmov.i32 q1, #0\n"
2046      "vmov.i32 q2, #0\n"
2047      "vmov.i32 q3, q0\n"
2048      "vmov.i32 q4, q1\n"
2049      "vmov.i32 q5, q2\n"
2050      "vmov.i32 q6, q3\n"
2051
2052      // General 1xM lanes loop.
2053      "1:"
2054
2055      // Subtract counter.
2056      "subs %[count], %[count], #8\n"
2057
2058      "vld1.32 {d14, d15, d16, d17}, [%[rhs]:64]!\n"
2059      "vld1.32 {d18}, [%[lhs]:64]!\n"
2060      "pld [%[lhs], #64]\n"
2061      "vmull.u8 q10, d14, d18\n"
2062      "vmull.u8 q11, d15, d18\n"
2063      "vmull.u8 q12, d16, d18\n"
2064      "vmull.u8 q13, d17, d18\n"
2065      "vld1.32 {d14, d15, d16}, [%[rhs]:64]!\n"
2066      "pld [%[rhs], #128]\n"
2067      "vpadal.u16 q0, q10\n"
2068      "vpadal.u16 q1, q11\n"
2069      "vpadal.u16 q2, q12\n"
2070      "vpadal.u16 q3, q13\n"
2071      "vmull.u8 q10, d14, d18\n"
2072      "vmull.u8 q11, d15, d18\n"
2073      "vmull.u8 q12, d16, d18\n"
2074      "vpadal.u16 q4, q10\n"
2075      "vpadal.u16 q5, q11\n"
2076      "vpadal.u16 q6, q12\n"
2077
2078      // Loop break.
2079      "bgt 1b\n"
2080
2081      // StaticQuantizationInt32::Prepare
2082      "vld1.32 {d14, d15}, [%[lhs]:64]!\n"
2083      "vld1.32 {d16, d17, d18, d19}, [%[rhs]:64]!\n"
2084      "vdup.32 q7, d14[0]\n"
2085
2086      // RowMajorOutput::Prepare
2087
2088      // Reduce aggregators.
2089      "vpadd.u32 d0, d0, d1\n"
2090      "vpadd.u32 d2, d2, d3\n"
2091      "vpadd.u32 d4, d4, d5\n"
2092      "vpadd.u32 d6, d6, d7\n"
2093      "vpadd.u32 d8, d8, d9\n"
2094      "vpadd.u32 d10, d10, d11\n"
2095      "vpadd.u32 d12, d12, d13\n"
2096      "vpadd.u32 d0, d0, d2\n"
2097      "vpadd.u32 d1, d4, d6\n"
2098      "vpadd.u32 d2, d8, d10\n"
2099      "vpadd.u32 d3, d12, d12\n"
2100
2101      // StaticQuantizationInt32::Transform
2102      "vadd.s32 q0, q0, q7\n"
2103      "vadd.s32 q1, q1, q7\n"
2104      "vadd.s32 q0, q0, q8\n"
2105      "vadd.s32 q1, q1, q9\n"
2106
2107      // RowMajorOutput::Output
2108      "vst1.32 {d0, d1, d2}, [%[result]]!\n"
2109      "vst1.32 {d3[0]}, [%[result]]!\n"
2110      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2111      : [count] "r"(params.kernel.count),
2112        [stride] "r"(params.output_stream.stride)
2113      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
2114        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
2115        "d21", "d22", "d23", "d24", "d25", "d26", "d27", "cc", "memory");
2116}
2117
2118template <>
2119inline void MulKernel<
2120    uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 8,
2121    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2122                 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
2123                                         RowMajor>& params,
2124                 int32_t* result) {
2125#ifdef DEBUG
2126#ifdef DEBUG_METAGEMM_VERBOSE
2127  std::cout << __FILE__ << "(" << __LINE__
2128            << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
2129               "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 8, "
2130               "8>::Multiply()"
2131            << std::endl
2132            << std::flush;
2133#endif
2134#endif
2135  asm volatile(
2136      "pld [%[lhs]]\n"
2137      "pld [%[rhs]]\n"
2138
2139      // Clear aggregators.
2140      "vmov.i32 q0, #0\n"
2141      "vmov.i32 q1, #0\n"
2142      "vmov.i32 q2, #0\n"
2143      "vmov.i32 q3, q0\n"
2144      "vmov.i32 q4, q1\n"
2145      "vmov.i32 q5, q2\n"
2146      "vmov.i32 q6, q3\n"
2147      "vmov.i32 q7, q4\n"
2148
2149      // 1x8 lanes loop.
2150      "1:"
2151
2152      "vld1.32 {d17, d18, d19, d20}, [%[rhs]:256]!\n"
2153      "vld1.32 {d16}, [%[lhs]:64]!\n"
2154      "vmull.u8 q11, d16, d17\n"
2155      "vmull.u8 q12, d16, d18\n"
2156      "vmull.u8 q13, d16, d19\n"
2157      "vmull.u8 q14, d16, d20\n"
2158      "vld1.32 {d17, d18, d19, d20}, [%[rhs]:256]!\n"
2159      "vpadal.u16 q0, q11\n"
2160      "vpadal.u16 q1, q12\n"
2161      "vpadal.u16 q2, q13\n"
2162      "vpadal.u16 q3, q14\n"
2163      "pld [%[rhs], #256]\n"
2164      "vmull.u8 q15, d16, d17\n"
2165      "vmull.u8 q11, d16, d18\n"
2166      "vmull.u8 q12, d16, d19\n"
2167      "vmull.u8 q13, d16, d20\n"
2168      "pld [%[lhs], #32]\n"
2169
2170      // Subtract counter.
2171      "subs %[count], %[count], #8\n"
2172
2173      "vpadal.u16 q4, q15\n"
2174      "vpadal.u16 q5, q11\n"
2175      "vpadal.u16 q6, q12\n"
2176      "vpadal.u16 q7, q13\n"
2177
2178      // Loop break.
2179      "bgt 1b\n"
2180
2181      // StaticQuantizationInt32::Prepare
2182      "vld1.32 {d16, d17}, [%[lhs]:64]!\n"
2183      "vld1.32 {d18, d19, d20, d21}, [%[rhs]:64]!\n"
2184      "vdup.32 q8, d16[0]\n"
2185
2186      // RowMajorOutput::Prepare
2187
2188      // Reduce aggregators.
2189      "vpadd.u32 d0, d0, d1\n"
2190      "vpadd.u32 d2, d2, d3\n"
2191      "vpadd.u32 d4, d4, d5\n"
2192      "vpadd.u32 d6, d6, d7\n"
2193      "vpadd.u32 d8, d8, d9\n"
2194      "vpadd.u32 d10, d10, d11\n"
2195      "vpadd.u32 d12, d12, d13\n"
2196      "vpadd.u32 d14, d14, d15\n"
2197      "vpadd.u32 d0, d0, d2\n"
2198      "vpadd.u32 d1, d4, d6\n"
2199      "vpadd.u32 d2, d8, d10\n"
2200      "vpadd.u32 d3, d12, d14\n"
2201
2202      // StaticQuantizationInt32::Transform
2203      "vadd.s32 q0, q0, q8\n"
2204      "vadd.s32 q1, q1, q8\n"
2205      "vadd.s32 q0, q0, q9\n"
2206      "vadd.s32 q1, q1, q10\n"
2207
2208      // RowMajorOutput::Output
2209      "vst1.32 {d0, d1, d2, d3}, [%[result]]!\n"
2210      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2211      : [count] "r"(params.kernel.count),
2212        [stride] "r"(params.output_stream.stride)
2213      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
2214        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
2215        "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
2216        "d31", "cc", "memory");
2217}
2218
2219template <>
2220inline void MulKernel<
2221    uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 1,
2222    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2223                 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
2224                                         RowMajor>& params,
2225                 int32_t* result) {
2226#ifdef DEBUG
2227#ifdef DEBUG_METAGEMM_VERBOSE
2228  std::cout << __FILE__ << "(" << __LINE__
2229            << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
2230               "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 1, "
2231               "8>::Multiply()"
2232            << std::endl
2233            << std::flush;
2234#endif
2235#endif
2236  asm volatile(
2237      "pld [%[lhs]]\n"
2238      "pld [%[rhs]]\n"
2239
2240      // Clear aggregators.
2241      "vmov.i32 q0, #0\n"
2242      "vmov.i32 q1, #0\n"
2243
2244      // General NxM lanes loop.
2245      "1:"
2246
2247      // Subtract counter.
2248      "subs %[count], %[count], #8\n"
2249
2250      "vld1.32 {d4, d5}, [%[lhs]:64]!\n"
2251      "vld1.32 {d6}, [%[rhs]:64]!\n"
2252      "pld [%[lhs], #64]\n"
2253      "pld [%[rhs], #64]\n"
2254      "vmull.u8 q4, d6, d4\n"
2255      "vmull.u8 q5, d6, d5\n"
2256      "vpadal.u16 q0, q4\n"
2257      "vpadal.u16 q1, q5\n"
2258
2259      // Loop break.
2260      "bgt 1b\n"
2261
2262      // StaticQuantizationInt32::Prepare
2263      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
2264      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
2265      "vdup.32 q2, d8[0]\n"
2266      "vdup.32 q4, d8[1]\n"
2267
2268      // RowMajorOutput::Prepare
2269      "add r0, %[result], %[stride]\n"
2270
2271      // Reduce aggregators.
2272      "vpadd.u32 d0, d0, d1\n"
2273      "vpadd.u32 d0, d0, d0\n"
2274      "vpadd.u32 d2, d2, d3\n"
2275      "vpadd.u32 d2, d2, d2\n"
2276
2277      // StaticQuantizationInt32::Transform
2278      "vadd.s32 q0, q0, q2\n"
2279      "vadd.s32 q1, q1, q4\n"
2280      "vadd.s32 q0, q0, q5\n"
2281      "vadd.s32 q1, q1, q5\n"
2282
2283      // RowMajorOutput::Output
2284      "vst1.32 {d0[0]}, [%[result]]!\n"
2285      "vst1.32 {d2[0]}, [r0]!\n"
2286      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2287      : [count] "r"(params.kernel.count),
2288        [stride] "r"(params.output_stream.stride)
2289      : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d8", "d9", "d10",
2290        "d11", "cc", "memory");
2291}
2292
2293template <>
2294inline void MulKernel<
2295    uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 2,
2296    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2297                 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
2298                                         RowMajor>& params,
2299                 int32_t* result) {
2300#ifdef DEBUG
2301#ifdef DEBUG_METAGEMM_VERBOSE
2302  std::cout << __FILE__ << "(" << __LINE__
2303            << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
2304               "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 2, "
2305               "8>::Multiply()"
2306            << std::endl
2307            << std::flush;
2308#endif
2309#endif
2310  asm volatile(
2311      "pld [%[lhs]]\n"
2312      "pld [%[rhs]]\n"
2313
2314      // Clear aggregators.
2315      "vmov.i32 q0, #0\n"
2316      "vmov.i32 q1, #0\n"
2317      "vmov.i32 q2, #0\n"
2318      "vmov.i32 q3, q0\n"
2319
2320      // General NxM lanes loop.
2321      "1:"
2322
2323      // Subtract counter.
2324      "subs %[count], %[count], #8\n"
2325
2326      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
2327      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
2328      "pld [%[lhs], #64]\n"
2329      "pld [%[rhs], #64]\n"
2330      "vmull.u8 q6, d10, d8\n"
2331      "vmull.u8 q7, d11, d8\n"
2332      "vmull.u8 q8, d10, d9\n"
2333      "vmull.u8 q9, d11, d9\n"
2334      "vpadal.u16 q0, q6\n"
2335      "vpadal.u16 q1, q7\n"
2336      "vpadal.u16 q2, q8\n"
2337      "vpadal.u16 q3, q9\n"
2338
2339      // Loop break.
2340      "bgt 1b\n"
2341
2342      // StaticQuantizationInt32::Prepare
2343      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
2344      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
2345      "vdup.32 q6, d8[0]\n"
2346      "vdup.32 q4, d8[1]\n"
2347
2348      // RowMajorOutput::Prepare
2349      "add r0, %[result], %[stride]\n"
2350
2351      // Reduce aggregators.
2352      "vpadd.u32 d0, d0, d1\n"
2353      "vpadd.u32 d2, d2, d3\n"
2354      "vpadd.u32 d0, d0, d2\n"
2355      "vpadd.u32 d4, d4, d5\n"
2356      "vpadd.u32 d6, d6, d7\n"
2357      "vpadd.u32 d4, d4, d6\n"
2358
2359      // StaticQuantizationInt32::Transform
2360      "vadd.s32 q0, q0, q6\n"
2361      "vadd.s32 q2, q2, q4\n"
2362      "vadd.s32 q0, q0, q5\n"
2363      "vadd.s32 q2, q2, q5\n"
2364
2365      // RowMajorOutput::Output
2366      "vst1.32 {d0}, [%[result]]!\n"
2367      "vst1.32 {d4}, [r0]!\n"
2368      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2369      : [count] "r"(params.kernel.count),
2370        [stride] "r"(params.output_stream.stride)
2371      : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
2372        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "cc",
2373        "memory");
2374}
2375
2376template <>
2377inline void MulKernel<
2378    uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 3,
2379    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2380                 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
2381                                         RowMajor>& params,
2382                 int32_t* result) {
2383#ifdef DEBUG
2384#ifdef DEBUG_METAGEMM_VERBOSE
2385  std::cout << __FILE__ << "(" << __LINE__
2386            << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
2387               "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 3, "
2388               "8>::Multiply()"
2389            << std::endl
2390            << std::flush;
2391#endif
2392#endif
2393  asm volatile(
2394      "pld [%[lhs]]\n"
2395      "pld [%[rhs]]\n"
2396
2397      // Clear aggregators.
2398      "vmov.i32 q0, #0\n"
2399      "vmov.i32 q1, #0\n"
2400      "vmov.i32 q2, #0\n"
2401      "vmov.i32 q3, q0\n"
2402      "vmov.i32 q4, q1\n"
2403      "vmov.i32 q5, q2\n"
2404
2405      // General NxM lanes loop.
2406      "1:"
2407
2408      // Subtract counter.
2409      "subs %[count], %[count], #8\n"
2410
2411      "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
2412      "vld1.32 {d14, d15, d16}, [%[rhs]:64]!\n"
2413      "pld [%[lhs], #64]\n"
2414      "pld [%[rhs], #64]\n"
2415      "vmull.u8 q9, d14, d12\n"
2416      "vmull.u8 q10, d15, d12\n"
2417      "vmull.u8 q11, d16, d12\n"
2418      "vmull.u8 q12, d14, d13\n"
2419      "vmull.u8 q13, d15, d13\n"
2420      "vmull.u8 q14, d16, d13\n"
2421      "vpadal.u16 q0, q9\n"
2422      "vpadal.u16 q1, q10\n"
2423      "vpadal.u16 q2, q11\n"
2424      "vpadal.u16 q3, q12\n"
2425      "vpadal.u16 q4, q13\n"
2426      "vpadal.u16 q5, q14\n"
2427
2428      // Loop break.
2429      "bgt 1b\n"
2430
2431      // StaticQuantizationInt32::Prepare
2432      "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
2433      "vld1.32 {d14, d15}, [%[rhs]:64]!\n"
2434      "vdup.32 q8, d12[0]\n"
2435      "vdup.32 q6, d12[1]\n"
2436
2437      // RowMajorOutput::Prepare
2438      "add r0, %[result], %[stride]\n"
2439
2440      // Reduce aggregators.
2441      "vpadd.u32 d0, d0, d1\n"
2442      "vpadd.u32 d2, d2, d3\n"
2443      "vpadd.u32 d4, d4, d5\n"
2444      "vpadd.u32 d0, d0, d2\n"
2445      "vpadd.u32 d1, d4, d4\n"
2446      "vpadd.u32 d6, d6, d7\n"
2447      "vpadd.u32 d8, d8, d9\n"
2448      "vpadd.u32 d10, d10, d11\n"
2449      "vpadd.u32 d6, d6, d8\n"
2450      "vpadd.u32 d7, d10, d10\n"
2451
2452      // StaticQuantizationInt32::Transform
2453      "vadd.s32 q0, q0, q8\n"
2454      "vadd.s32 q3, q3, q6\n"
2455      "vadd.s32 q0, q0, q7\n"
2456      "vadd.s32 q3, q3, q7\n"
2457
2458      // RowMajorOutput::Output
2459      "vst1.32 {d0}, [%[result]]!\n"
2460      "vst1.32 {d1[0]}, [%[result]]!\n"
2461      "vst1.32 {d6}, [r0]!\n"
2462      "vst1.32 {d7[0]}, [r0]!\n"
2463      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2464      : [count] "r"(params.kernel.count),
2465        [stride] "r"(params.output_stream.stride)
2466      : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
2467        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
2468        "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "cc",
2469        "memory");
2470}
2471
2472template <>
2473inline void MulKernel<
2474    uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 4,
2475    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2476                 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
2477                                         RowMajor>& params,
2478                 int32_t* result) {
2479#ifdef DEBUG
2480#ifdef DEBUG_METAGEMM_VERBOSE
2481  std::cout << __FILE__ << "(" << __LINE__
2482            << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
2483               "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 4, "
2484               "8>::Multiply()"
2485            << std::endl
2486            << std::flush;
2487#endif
2488#endif
2489  asm volatile(
2490      "pld [%[lhs]]\n"
2491      "pld [%[rhs]]\n"
2492
2493      // Clear aggregators.
2494      "vmov.i32 q0, #0\n"
2495      "vmov.i32 q1, #0\n"
2496      "vmov.i32 q2, #0\n"
2497      "vmov.i32 q3, q0\n"
2498      "vmov.i32 q4, q1\n"
2499      "vmov.i32 q5, q2\n"
2500      "vmov.i32 q6, q3\n"
2501      "vmov.i32 q7, q4\n"
2502
2503      // 2x4 lanes loop.
2504      "1:"
2505
2506      "vld1.8 {d18, d19, d20, d21}, [%[rhs]:256]!\n"
2507      "vld1.8 {d16}, [%[lhs]:64]!\n"
2508      "vmull.u8 q11, d16, d18\n"
2509      "vld1.8 {d17}, [%[lhs]:64]!\n"
2510      "vmull.u8 q12, d16, d19\n"
2511      "pld [%[rhs], #64]\n"
2512      "vmull.u8 q13, d16, d20\n"
2513      "pld [%[lhs], #64]\n"
2514      "vmull.u8 q14, d16, d21\n"
2515      "vmull.u8 q15, d17, d18\n"
2516      "vpadal.u16 q0, q11\n"
2517      "vpadal.u16 q1, q12\n"
2518      "vpadal.u16 q2, q13\n"
2519      "vmull.u8 q11, d17, d19\n"
2520      "vmull.u8 q12, d17, d20\n"
2521      "vmull.u8 q13, d17, d21\n"
2522
2523      // Subtract counter.
2524      "subs %[count], %[count], #8\n"
2525
2526      "vpadal.u16 q3, q14\n"
2527      "vpadal.u16 q4, q15\n"
2528      "vpadal.u16 q5, q11\n"
2529      "vpadal.u16 q6, q12\n"
2530      "vpadal.u16 q7, q13\n"
2531
2532      // Loop break.
2533      "bgt 1b\n"
2534
2535      // StaticQuantizationInt32::Prepare
2536      "vld1.32 {d16, d17}, [%[lhs]:64]!\n"
2537      "vld1.32 {d18, d19}, [%[rhs]:64]!\n"
2538      "vdup.32 q10, d16[0]\n"
2539      "vdup.32 q8, d16[1]\n"
2540
2541      // RowMajorOutput::Prepare
2542      "add r0, %[result], %[stride]\n"
2543
2544      // Reduce aggregators.
2545      "vpadd.u32 d0, d0, d1\n"
2546      "vpadd.u32 d2, d2, d3\n"
2547      "vpadd.u32 d4, d4, d5\n"
2548      "vpadd.u32 d6, d6, d7\n"
2549      "vpadd.u32 d0, d0, d2\n"
2550      "vpadd.u32 d1, d4, d6\n"
2551      "vpadd.u32 d8, d8, d9\n"
2552      "vpadd.u32 d10, d10, d11\n"
2553      "vpadd.u32 d12, d12, d13\n"
2554      "vpadd.u32 d14, d14, d15\n"
2555      "vpadd.u32 d8, d8, d10\n"
2556      "vpadd.u32 d9, d12, d14\n"
2557
2558      // StaticQuantizationInt32::Transform
2559      "vadd.s32 q0, q0, q10\n"
2560      "vadd.s32 q4, q4, q8\n"
2561      "vadd.s32 q0, q0, q9\n"
2562      "vadd.s32 q4, q4, q9\n"
2563
2564      // RowMajorOutput::Output
2565      "vst1.32 {d0, d1}, [%[result]]!\n"
2566      "vst1.32 {d8, d9}, [r0]!\n"
2567      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2568      : [count] "r"(params.kernel.count),
2569        [stride] "r"(params.output_stream.stride)
2570      : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
2571        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
2572        "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
2573        "d31", "cc", "memory");
2574}
2575
2576template <>
2577inline void MulKernel<
2578    uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 1,
2579    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2580                 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
2581                                         RowMajor>& params,
2582                 int32_t* result) {
2583#ifdef DEBUG
2584#ifdef DEBUG_METAGEMM_VERBOSE
2585  std::cout << __FILE__ << "(" << __LINE__
2586            << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
2587               "QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 1, "
2588               "8>::Multiply()"
2589            << std::endl
2590            << std::flush;
2591#endif
2592#endif
2593  asm volatile(
2594      "pld [%[lhs]]\n"
2595      "pld [%[rhs]]\n"
2596
2597      // Clear aggregators.
2598      "vmov.i32 q0, #0\n"
2599      "vmov.i32 q1, #0\n"
2600      "vmov.i32 q2, #0\n"
2601
2602      // General NxM lanes loop.
2603      "1:"
2604
2605      // Subtract counter.
2606      "subs %[count], %[count], #8\n"
2607
2608      "vld1.32 {d6, d7, d8}, [%[lhs]:64]!\n"
2609      "vld1.32 {d9}, [%[rhs]:64]!\n"
2610      "pld [%[lhs], #64]\n"
2611      "pld [%[rhs], #64]\n"
2612      "vmull.u8 q5, d9, d6\n"
2613      "vmull.u8 q6, d9, d7\n"
2614      "vmull.u8 q7, d9, d8\n"
2615      "vpadal.u16 q0, q5\n"
2616      "vpadal.u16 q1, q6\n"
2617      "vpadal.u16 q2, q7\n"
2618
2619      // Loop break.
2620      "bgt 1b\n"
2621
2622      // StaticQuantizationInt32::Prepare
2623      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
2624      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
2625      "vdup.32 q3, d8[0]\n"
2626      "vdup.32 q6, d8[1]\n"
2627      "vdup.32 q4, d9[0]\n"
2628
2629      // RowMajorOutput::Prepare
2630      "add r0, %[result], %[stride]\n"
2631      "add r1, r0, %[stride]\n"
2632
2633      // Reduce aggregators.
2634      "vpadd.u32 d0, d0, d1\n"
2635      "vpadd.u32 d0, d0, d0\n"
2636      "vpadd.u32 d2, d2, d3\n"
2637      "vpadd.u32 d2, d2, d2\n"
2638      "vpadd.u32 d4, d4, d5\n"
2639      "vpadd.u32 d4, d4, d4\n"
2640
2641      // StaticQuantizationInt32::Transform
2642      "vadd.s32 q0, q0, q3\n"
2643      "vadd.s32 q1, q1, q6\n"
2644      "vadd.s32 q2, q2, q4\n"
2645      "vadd.s32 q0, q0, q5\n"
2646      "vadd.s32 q1, q1, q5\n"
2647      "vadd.s32 q2, q2, q5\n"
2648
2649      // RowMajorOutput::Output
2650      "vst1.32 {d0[0]}, [%[result]]!\n"
2651      "vst1.32 {d2[0]}, [r0]!\n"
2652      "vst1.32 {d4[0]}, [r1]!\n"
2653      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2654      : [count] "r"(params.kernel.count),
2655        [stride] "r"(params.output_stream.stride)
2656      : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
2657        "d10", "d11", "d12", "d13", "d14", "d15", "cc", "memory");
2658}
2659
2660template <>
2661inline void MulKernel<
2662    uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 2,
2663    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2664                 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
2665                                         RowMajor>& params,
2666                 int32_t* result) {
2667#ifdef DEBUG
2668#ifdef DEBUG_METAGEMM_VERBOSE
2669  std::cout << __FILE__ << "(" << __LINE__
2670            << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
2671               "QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 2, "
2672               "8>::Multiply()"
2673            << std::endl
2674            << std::flush;
2675#endif
2676#endif
2677  asm volatile(
2678      "pld [%[lhs]]\n"
2679      "pld [%[rhs]]\n"
2680
2681      // Clear aggregators.
2682      "vmov.i32 q0, #0\n"
2683      "vmov.i32 q1, #0\n"
2684      "vmov.i32 q2, #0\n"
2685      "vmov.i32 q3, q0\n"
2686      "vmov.i32 q4, q1\n"
2687      "vmov.i32 q5, q2\n"
2688
2689      // General NxM lanes loop.
2690      "1:"
2691
2692      // Subtract counter.
2693      "subs %[count], %[count], #8\n"
2694
2695      "vld1.32 {d12, d13, d14}, [%[lhs]:64]!\n"
2696      "vld1.32 {d15, d16}, [%[rhs]:64]!\n"
2697      "pld [%[lhs], #64]\n"
2698      "pld [%[rhs], #64]\n"
2699      "vmull.u8 q9, d15, d12\n"
2700      "vmull.u8 q10, d16, d12\n"
2701      "vmull.u8 q11, d15, d13\n"
2702      "vmull.u8 q12, d16, d13\n"
2703      "vmull.u8 q13, d15, d14\n"
2704      "vmull.u8 q14, d16, d14\n"
2705      "vpadal.u16 q0, q9\n"
2706      "vpadal.u16 q1, q10\n"
2707      "vpadal.u16 q2, q11\n"
2708      "vpadal.u16 q3, q12\n"
2709      "vpadal.u16 q4, q13\n"
2710      "vpadal.u16 q5, q14\n"
2711
2712      // Loop break.
2713      "bgt 1b\n"
2714
2715      // StaticQuantizationInt32::Prepare
2716      "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
2717      "vld1.32 {d14, d15}, [%[rhs]:64]!\n"
2718      "vdup.32 q8, d12[0]\n"
2719      "vdup.32 q9, d12[1]\n"
2720      "vdup.32 q6, d13[0]\n"
2721
2722      // RowMajorOutput::Prepare
2723      "add r0, %[result], %[stride]\n"
2724      "add r1, r0, %[stride]\n"
2725
2726      // Reduce aggregators.
2727      "vpadd.u32 d0, d0, d1\n"
2728      "vpadd.u32 d2, d2, d3\n"
2729      "vpadd.u32 d0, d0, d2\n"
2730      "vpadd.u32 d4, d4, d5\n"
2731      "vpadd.u32 d6, d6, d7\n"
2732      "vpadd.u32 d4, d4, d6\n"
2733      "vpadd.u32 d8, d8, d9\n"
2734      "vpadd.u32 d10, d10, d11\n"
2735      "vpadd.u32 d8, d8, d10\n"
2736
2737      // StaticQuantizationInt32::Transform
2738      "vadd.s32 q0, q0, q8\n"
2739      "vadd.s32 q2, q2, q9\n"
2740      "vadd.s32 q4, q4, q6\n"
2741      "vadd.s32 q0, q0, q7\n"
2742      "vadd.s32 q2, q2, q7\n"
2743      "vadd.s32 q4, q4, q7\n"
2744
2745      // RowMajorOutput::Output
2746      "vst1.32 {d0}, [%[result]]!\n"
2747      "vst1.32 {d4}, [r0]!\n"
2748      "vst1.32 {d8}, [r1]!\n"
2749      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2750      : [count] "r"(params.kernel.count),
2751        [stride] "r"(params.output_stream.stride)
2752      : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
2753        "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
2754        "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
2755        "cc", "memory");
2756}
2757
2758template <>
2759inline void MulKernel<
2760    uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 3,
2761    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2762                 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
2763                                         RowMajor>& params,
2764                 int32_t* result) {
2765#ifdef DEBUG
2766#ifdef DEBUG_METAGEMM_VERBOSE
2767  std::cout << __FILE__ << "(" << __LINE__
2768            << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
2769               "QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 3, "
2770               "8>::Multiply()"
2771            << std::endl
2772            << std::flush;
2773#endif
2774#endif
2775  asm volatile(
2776      "pld [%[lhs]]\n"
2777      "pld [%[rhs]]\n"
2778
2779      // Clear aggregators.
2780      "vmov.i32 q0, #0\n"
2781      "vmov.i32 q1, #0\n"
2782      "vmov.i32 q2, #0\n"
2783      "vmov.i32 q3, q0\n"
2784      "vmov.i32 q4, q1\n"
2785      "vmov.i32 q5, q2\n"
2786      "vmov.i32 q6, q3\n"
2787      "vmov.i32 q7, q4\n"
2788      "vmov.i32 q8, q5\n"
2789
2790      // 3x3 lanes loop.
2791      "1:"
2792
2793      "vld1.8 {d21, d22, d23}, [%[rhs]:64]!\n"
2794      "vld1.8 {d18}, [%[lhs]:64]!\n"
2795      "vmull.u8 q12, d18, d21\n"
2796      "vld1.8 {d19}, [%[lhs]:64]!\n"
2797      "vmull.u8 q13, d18, d22\n"
2798      "vld1.8 {d20}, [%[lhs]:64]!\n"
2799      "vmull.u8 q14, d18, d23\n"
2800      "pld [%[lhs], #64]\n"
2801      "vmull.u8 q15, d19, d21\n"
2802      "pld [%[rhs], #64]\n"
2803      "vpadal.u16 q0, q12\n"
2804      "vpadal.u16 q1, q13\n"
2805      "vpadal.u16 q2, q14\n"
2806      "vpadal.u16 q3, q15\n"
2807      "vmull.u8 q12, d19, d22\n"
2808      "vmull.u8 q13, d19, d23\n"
2809      "vmull.u8 q14, d20, d21\n"
2810      "vmull.u8 q15, d20, d22\n"
2811
2812      // Subtract counter.
2813      "subs %[count], %[count], #8\n"
2814
2815      "vmull.u8 q9, d20, d23\n"
2816      "vpadal.u16 q4, q12\n"
2817      "vpadal.u16 q5, q13\n"
2818      "vpadal.u16 q6, q14\n"
2819      "vpadal.u16 q7, q15\n"
2820      "vpadal.u16 q8, q9\n"
2821
2822      // Loop break.
2823      "bgt 1b\n"
2824
2825      // StaticQuantizationInt32::Prepare
2826      "vld1.32 {d18, d19}, [%[lhs]:64]!\n"
2827      "vld1.32 {d20, d21}, [%[rhs]:64]!\n"
2828      "vdup.32 q11, d18[0]\n"
2829      "vdup.32 q12, d18[1]\n"
2830      "vdup.32 q9, d19[0]\n"
2831
2832      // RowMajorOutput::Prepare
2833      "add r0, %[result], %[stride]\n"
2834      "add r1, r0, %[stride]\n"
2835
2836      // Reduce aggregators.
2837      "vpadd.u32 d0, d0, d1\n"
2838      "vpadd.u32 d2, d2, d3\n"
2839      "vpadd.u32 d4, d4, d5\n"
2840      "vpadd.u32 d0, d0, d2\n"
2841      "vpadd.u32 d1, d4, d4\n"
2842      "vpadd.u32 d6, d6, d7\n"
2843      "vpadd.u32 d8, d8, d9\n"
2844      "vpadd.u32 d10, d10, d11\n"
2845      "vpadd.u32 d6, d6, d8\n"
2846      "vpadd.u32 d7, d10, d10\n"
2847      "vpadd.u32 d12, d12, d13\n"
2848      "vpadd.u32 d14, d14, d15\n"
2849      "vpadd.u32 d16, d16, d17\n"
2850      "vpadd.u32 d12, d12, d14\n"
2851      "vpadd.u32 d13, d16, d16\n"
2852
2853      // StaticQuantizationInt32::Transform
2854      "vadd.s32 q0, q0, q11\n"
2855      "vadd.s32 q3, q3, q12\n"
2856      "vadd.s32 q6, q6, q9\n"
2857      "vadd.s32 q0, q0, q10\n"
2858      "vadd.s32 q3, q3, q10\n"
2859      "vadd.s32 q6, q6, q10\n"
2860
2861      // RowMajorOutput::Output
2862      "vst1.32 {d0}, [%[result]]!\n"
2863      "vst1.32 {d1[0]}, [%[result]]!\n"
2864      "vst1.32 {d6}, [r0]!\n"
2865      "vst1.32 {d7[0]}, [r0]!\n"
2866      "vst1.32 {d12}, [r1]!\n"
2867      "vst1.32 {d13[0]}, [r1]!\n"
2868      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2869      : [count] "r"(params.kernel.count),
2870        [stride] "r"(params.output_stream.stride)
2871      : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
2872        "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
2873        "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
2874        "d30", "d31", "cc", "memory");
2875}
2876
2877template <>
2878inline void MulKernel<
2879    uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 1,
2880    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2881                 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
2882                                         RowMajor>& params,
2883                 float* result) {
2884#ifdef DEBUG
2885#ifdef DEBUG_METAGEMM_VERBOSE
2886  std::cout << __FILE__ << "(" << __LINE__
2887            << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
2888               "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 1, "
2889               "8>::Multiply()"
2890            << std::endl
2891            << std::flush;
2892#endif
2893#endif
2894  asm volatile(
2895      "pld [%[lhs]]\n"
2896      "pld [%[rhs]]\n"
2897
2898      // Clear aggregators.
2899      "vmov.i32 q0, #0\n"
2900
2901      // General NxM lanes loop.
2902      "1:"
2903
2904      // Subtract counter.
2905      "subs %[count], %[count], #8\n"
2906
2907      "vld1.32 {d2}, [%[lhs]:64]!\n"
2908      "vld1.32 {d3}, [%[rhs]:64]!\n"
2909      "pld [%[lhs], #64]\n"
2910      "pld [%[rhs], #64]\n"
2911      "vmull.u8 q2, d3, d2\n"
2912      "vpadal.u16 q0, q2\n"
2913
2914      // Loop break.
2915      "bgt 1b\n"
2916
2917      // StaticQuantizationFloat::Prepare
2918      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
2919      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
2920      "vdup.32 q6, %[scale]\n"
2921      "vdup.32 q4, d8[0]\n"
2922
2923      // RowMajorOutput::Prepare
2924
2925      // Reduce aggregators.
2926      "vpadd.u32 d0, d0, d1\n"
2927      "vpadd.u32 d0, d0, d0\n"
2928
2929      // StaticQuantizationFloat::Transform
2930      "vadd.s32 q0, q0, q4\n"
2931      "vadd.s32 q0, q0, q5\n"
2932      "vcvt.f32.s32 q0, q0\n"
2933      "vmul.f32 q0, q0, q6\n"
2934
2935      // RowMajorOutput::Output
2936      "vst1.32 {d0[0]}, [%[result]]!\n"
2937      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2938      : [count] "r"(params.kernel.count),
2939        [stride] "r"(params.output_stream.stride),
2940        [scale] "r"(params.kernel.scale)
2941      : "d0", "d1", "d2", "d3", "d4", "d5", "d8", "d9", "d10", "d11", "d12",
2942        "d13", "cc", "memory");
2943}
2944
2945template <>
2946inline void MulKernel<
2947    uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 2,
2948    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2949                 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
2950                                         RowMajor>& params,
2951                 float* result) {
2952#ifdef DEBUG
2953#ifdef DEBUG_METAGEMM_VERBOSE
2954  std::cout << __FILE__ << "(" << __LINE__
2955            << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
2956               "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 2, "
2957               "8>::Multiply()"
2958            << std::endl
2959            << std::flush;
2960#endif
2961#endif
2962  asm volatile(
2963      "pld [%[lhs]]\n"
2964      "pld [%[rhs]]\n"
2965
2966      // Clear aggregators.
2967      "vmov.i32 q0, #0\n"
2968      "vmov.i32 q1, #0\n"
2969
2970      // General NxM lanes loop.
2971      "1:"
2972
2973      // Subtract counter.
2974      "subs %[count], %[count], #8\n"
2975
2976      "vld1.32 {d4}, [%[lhs]:64]!\n"
2977      "vld1.32 {d5, d6}, [%[rhs]:64]!\n"
2978      "pld [%[lhs], #64]\n"
2979      "pld [%[rhs], #64]\n"
2980      "vmull.u8 q4, d5, d4\n"
2981      "vmull.u8 q5, d6, d4\n"
2982      "vpadal.u16 q0, q4\n"
2983      "vpadal.u16 q1, q5\n"
2984
2985      // Loop break.
2986      "bgt 1b\n"
2987
2988      // StaticQuantizationFloat::Prepare
2989      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
2990      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
2991      "vdup.32 q6, %[scale]\n"
2992      "vdup.32 q4, d8[0]\n"
2993
2994      // RowMajorOutput::Prepare
2995
2996      // Reduce aggregators.
2997      "vpadd.u32 d0, d0, d1\n"
2998      "vpadd.u32 d2, d2, d3\n"
2999      "vpadd.u32 d0, d0, d2\n"
3000
3001      // StaticQuantizationFloat::Transform
3002      "vadd.s32 q0, q0, q4\n"
3003      "vadd.s32 q0, q0, q5\n"
3004      "vcvt.f32.s32 q0, q0\n"
3005      "vmul.f32 q0, q0, q6\n"
3006
3007      // RowMajorOutput::Output
3008      "vst1.32 {d0}, [%[result]]!\n"
3009      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3010      : [count] "r"(params.kernel.count),
3011        [stride] "r"(params.output_stream.stride),
3012        [scale] "r"(params.kernel.scale)
3013      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d8", "d9", "d10", "d11",
3014        "d12", "d13", "cc", "memory");
3015}
3016
3017template <>
3018inline void MulKernel<
3019    uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 3,
3020    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3021                 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3022                                         RowMajor>& params,
3023                 float* result) {
3024#ifdef DEBUG
3025#ifdef DEBUG_METAGEMM_VERBOSE
3026  std::cout << __FILE__ << "(" << __LINE__
3027            << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3028               "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 3, "
3029               "8>::Multiply()"
3030            << std::endl
3031            << std::flush;
3032#endif
3033#endif
3034  asm volatile(
3035      "pld [%[lhs]]\n"
3036      "pld [%[rhs]]\n"
3037
3038      // Clear aggregators.
3039      "vmov.i32 q0, #0\n"
3040      "vmov.i32 q1, #0\n"
3041      "vmov.i32 q2, #0\n"
3042
3043      // General NxM lanes loop.
3044      "1:"
3045
3046      // Subtract counter.
3047      "subs %[count], %[count], #8\n"
3048
3049      "vld1.32 {d6}, [%[lhs]:64]!\n"
3050      "vld1.32 {d7, d8, d9}, [%[rhs]:64]!\n"
3051      "pld [%[lhs], #64]\n"
3052      "pld [%[rhs], #64]\n"
3053      "vmull.u8 q5, d7, d6\n"
3054      "vmull.u8 q6, d8, d6\n"
3055      "vmull.u8 q7, d9, d6\n"
3056      "vpadal.u16 q0, q5\n"
3057      "vpadal.u16 q1, q6\n"
3058      "vpadal.u16 q2, q7\n"
3059
3060      // Loop break.
3061      "bgt 1b\n"
3062
3063      // StaticQuantizationFloat::Prepare
3064      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
3065      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
3066      "vdup.32 q6, %[scale]\n"
3067      "vdup.32 q4, d8[0]\n"
3068
3069      // RowMajorOutput::Prepare
3070
3071      // Reduce aggregators.
3072      "vpadd.u32 d0, d0, d1\n"
3073      "vpadd.u32 d2, d2, d3\n"
3074      "vpadd.u32 d4, d4, d5\n"
3075      "vpadd.u32 d0, d0, d2\n"
3076      "vpadd.u32 d1, d4, d4\n"
3077
3078      // StaticQuantizationFloat::Transform
3079      "vadd.s32 q0, q0, q4\n"
3080      "vadd.s32 q0, q0, q5\n"
3081      "vcvt.f32.s32 q0, q0\n"
3082      "vmul.f32 q0, q0, q6\n"
3083
3084      // RowMajorOutput::Output
3085      "vst1.32 {d0}, [%[result]]!\n"
3086      "vst1.32 {d1[0]}, [%[result]]!\n"
3087      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3088      : [count] "r"(params.kernel.count),
3089        [stride] "r"(params.output_stream.stride),
3090        [scale] "r"(params.kernel.scale)
3091      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
3092        "d11", "d12", "d13", "d14", "d15", "cc", "memory");
3093}
3094
3095template <>
3096inline void MulKernel<
3097    uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 4,
3098    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3099                 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3100                                         RowMajor>& params,
3101                 float* result) {
3102#ifdef DEBUG
3103#ifdef DEBUG_METAGEMM_VERBOSE
3104  std::cout << __FILE__ << "(" << __LINE__
3105            << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3106               "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 4, "
3107               "8>::Multiply()"
3108            << std::endl
3109            << std::flush;
3110#endif
3111#endif
3112  asm volatile(
3113      "pld [%[lhs]]\n"
3114      "pld [%[rhs]]\n"
3115
3116      // Clear aggregators.
3117      "vmov.i32 q0, #0\n"
3118      "vmov.i32 q1, #0\n"
3119      "vmov.i32 q2, #0\n"
3120      "vmov.i32 q3, q0\n"
3121
3122      // General NxM lanes loop.
3123      "1:"
3124
3125      // Subtract counter.
3126      "subs %[count], %[count], #8\n"
3127
3128      "vld1.32 {d8}, [%[lhs]:64]!\n"
3129      "vld1.32 {d9, d10, d11, d12}, [%[rhs]:64]!\n"
3130      "pld [%[lhs], #64]\n"
3131      "pld [%[rhs], #64]\n"
3132      "vmull.u8 q7, d9, d8\n"
3133      "vmull.u8 q8, d10, d8\n"
3134      "vmull.u8 q9, d11, d8\n"
3135      "vmull.u8 q10, d12, d8\n"
3136      "vpadal.u16 q0, q7\n"
3137      "vpadal.u16 q1, q8\n"
3138      "vpadal.u16 q2, q9\n"
3139      "vpadal.u16 q3, q10\n"
3140
3141      // Loop break.
3142      "bgt 1b\n"
3143
3144      // StaticQuantizationFloat::Prepare
3145      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
3146      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
3147      "vdup.32 q6, %[scale]\n"
3148      "vdup.32 q4, d8[0]\n"
3149
3150      // RowMajorOutput::Prepare
3151
3152      // Reduce aggregators.
3153      "vpadd.u32 d0, d0, d1\n"
3154      "vpadd.u32 d2, d2, d3\n"
3155      "vpadd.u32 d4, d4, d5\n"
3156      "vpadd.u32 d6, d6, d7\n"
3157      "vpadd.u32 d0, d0, d2\n"
3158      "vpadd.u32 d1, d4, d6\n"
3159
3160      // StaticQuantizationFloat::Transform
3161      "vadd.s32 q0, q0, q4\n"
3162      "vadd.s32 q0, q0, q5\n"
3163      "vcvt.f32.s32 q0, q0\n"
3164      "vmul.f32 q0, q0, q6\n"
3165
3166      // RowMajorOutput::Output
3167      "vst1.32 {d0, d1}, [%[result]]!\n"
3168      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3169      : [count] "r"(params.kernel.count),
3170        [stride] "r"(params.output_stream.stride),
3171        [scale] "r"(params.kernel.scale)
3172      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
3173        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
3174        "d21", "cc", "memory");
3175}
3176
3177template <>
3178inline void MulKernel<
3179    uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 5,
3180    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3181                 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3182                                         RowMajor>& params,
3183                 float* result) {
3184#ifdef DEBUG
3185#ifdef DEBUG_METAGEMM_VERBOSE
3186  std::cout << __FILE__ << "(" << __LINE__
3187            << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3188               "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 5, "
3189               "8>::Multiply()"
3190            << std::endl
3191            << std::flush;
3192#endif
3193#endif
3194  asm volatile(
3195      "pld [%[lhs]]\n"
3196      "pld [%[rhs]]\n"
3197
3198      // Clear aggregators.
3199      "vmov.i32 q0, #0\n"
3200      "vmov.i32 q1, #0\n"
3201      "vmov.i32 q2, #0\n"
3202      "vmov.i32 q3, q0\n"
3203      "vmov.i32 q4, q1\n"
3204
3205      // General 1xM lanes loop.
3206      "1:"
3207
3208      // Subtract counter.
3209      "subs %[count], %[count], #8\n"
3210
3211      "vld1.32 {d10, d11, d12, d13}, [%[rhs]:64]!\n"
3212      "vld1.32 {d14}, [%[lhs]:64]!\n"
3213      "pld [%[lhs], #64]\n"
3214      "vmull.u8 q8, d10, d14\n"
3215      "vmull.u8 q9, d11, d14\n"
3216      "vmull.u8 q10, d12, d14\n"
3217      "vmull.u8 q11, d13, d14\n"
3218      "vld1.32 {d10}, [%[rhs]:64]!\n"
3219      "pld [%[rhs], #128]\n"
3220      "vpadal.u16 q0, q8\n"
3221      "vpadal.u16 q1, q9\n"
3222      "vpadal.u16 q2, q10\n"
3223      "vpadal.u16 q3, q11\n"
3224      "vmull.u8 q8, d10, d14\n"
3225      "vpadal.u16 q4, q8\n"
3226
3227      // Loop break.
3228      "bgt 1b\n"
3229
3230      // StaticQuantizationFloat::Prepare
3231      "vld1.32 {d10, d11}, [%[lhs]:64]!\n"
3232      "vld1.32 {d12, d13, d14, d15}, [%[rhs]:64]!\n"
3233      "vdup.32 q8, %[scale]\n"
3234      "vdup.32 q5, d10[0]\n"
3235
3236      // RowMajorOutput::Prepare
3237
3238      // Reduce aggregators.
3239      "vpadd.u32 d0, d0, d1\n"
3240      "vpadd.u32 d2, d2, d3\n"
3241      "vpadd.u32 d4, d4, d5\n"
3242      "vpadd.u32 d6, d6, d7\n"
3243      "vpadd.u32 d8, d8, d9\n"
3244      "vpadd.u32 d0, d0, d2\n"
3245      "vpadd.u32 d1, d4, d6\n"
3246      "vpadd.u32 d2, d8, d8\n"
3247
3248      // StaticQuantizationFloat::Transform
3249      "vadd.s32 q0, q0, q5\n"
3250      "vadd.s32 q1, q1, q5\n"
3251      "vadd.s32 q0, q0, q6\n"
3252      "vadd.s32 q1, q1, q7\n"
3253      "vcvt.f32.s32 q0, q0\n"
3254      "vcvt.f32.s32 q1, q1\n"
3255      "vmul.f32 q0, q0, q8\n"
3256      "vmul.f32 q1, q1, q8\n"
3257
3258      // RowMajorOutput::Output
3259      "vst1.32 {d0, d1}, [%[result]]!\n"
3260      "vst1.32 {d2[0]}, [%[result]]!\n"
3261      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3262      : [count] "r"(params.kernel.count),
3263        [stride] "r"(params.output_stream.stride),
3264        [scale] "r"(params.kernel.scale)
3265      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
3266        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
3267        "d21", "d22", "d23", "cc", "memory");
3268}
3269
3270template <>
3271inline void MulKernel<
3272    uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 6,
3273    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3274                 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3275                                         RowMajor>& params,
3276                 float* result) {
3277#ifdef DEBUG
3278#ifdef DEBUG_METAGEMM_VERBOSE
3279  std::cout << __FILE__ << "(" << __LINE__
3280            << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3281               "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 6, "
3282               "8>::Multiply()"
3283            << std::endl
3284            << std::flush;
3285#endif
3286#endif
3287  asm volatile(
3288      "pld [%[lhs]]\n"
3289      "pld [%[rhs]]\n"
3290
3291      // Clear aggregators.
3292      "vmov.i32 q0, #0\n"
3293      "vmov.i32 q1, #0\n"
3294      "vmov.i32 q2, #0\n"
3295      "vmov.i32 q3, q0\n"
3296      "vmov.i32 q4, q1\n"
3297      "vmov.i32 q5, q2\n"
3298
3299      // General 1xM lanes loop.
3300      "1:"
3301
3302      // Subtract counter.
3303      "subs %[count], %[count], #8\n"
3304
3305      "vld1.32 {d12, d13, d14, d15}, [%[rhs]:64]!\n"
3306      "vld1.32 {d16}, [%[lhs]:64]!\n"
3307      "pld [%[lhs], #64]\n"
3308      "vmull.u8 q9, d12, d16\n"
3309      "vmull.u8 q10, d13, d16\n"
3310      "vmull.u8 q11, d14, d16\n"
3311      "vmull.u8 q12, d15, d16\n"
3312      "vld1.32 {d12, d13}, [%[rhs]:64]!\n"
3313      "pld [%[rhs], #128]\n"
3314      "vpadal.u16 q0, q9\n"
3315      "vpadal.u16 q1, q10\n"
3316      "vpadal.u16 q2, q11\n"
3317      "vpadal.u16 q3, q12\n"
3318      "vmull.u8 q9, d12, d16\n"
3319      "vmull.u8 q10, d13, d16\n"
3320      "vpadal.u16 q4, q9\n"
3321      "vpadal.u16 q5, q10\n"
3322
3323      // Loop break.
3324      "bgt 1b\n"
3325
3326      // StaticQuantizationFloat::Prepare
3327      "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
3328      "vld1.32 {d14, d15, d16, d17}, [%[rhs]:64]!\n"
3329      "vdup.32 q9, %[scale]\n"
3330      "vdup.32 q6, d12[0]\n"
3331
3332      // RowMajorOutput::Prepare
3333
3334      // Reduce aggregators.
3335      "vpadd.u32 d0, d0, d1\n"
3336      "vpadd.u32 d2, d2, d3\n"
3337      "vpadd.u32 d4, d4, d5\n"
3338      "vpadd.u32 d6, d6, d7\n"
3339      "vpadd.u32 d8, d8, d9\n"
3340      "vpadd.u32 d10, d10, d11\n"
3341      "vpadd.u32 d0, d0, d2\n"
3342      "vpadd.u32 d1, d4, d6\n"
3343      "vpadd.u32 d2, d8, d10\n"
3344
3345      // StaticQuantizationFloat::Transform
3346      "vadd.s32 q0, q0, q6\n"
3347      "vadd.s32 q1, q1, q6\n"
3348      "vadd.s32 q0, q0, q7\n"
3349      "vadd.s32 q1, q1, q8\n"
3350      "vcvt.f32.s32 q0, q0\n"
3351      "vcvt.f32.s32 q1, q1\n"
3352      "vmul.f32 q0, q0, q9\n"
3353      "vmul.f32 q1, q1, q9\n"
3354
3355      // RowMajorOutput::Output
3356      "vst1.32 {d0, d1, d2}, [%[result]]!\n"
3357      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3358      : [count] "r"(params.kernel.count),
3359        [stride] "r"(params.output_stream.stride),
3360        [scale] "r"(params.kernel.scale)
3361      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
3362        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
3363        "d21", "d22", "d23", "d24", "d25", "cc", "memory");
3364}
3365
3366template <>
3367inline void MulKernel<
3368    uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 7,
3369    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3370                 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3371                                         RowMajor>& params,
3372                 float* result) {
3373#ifdef DEBUG
3374#ifdef DEBUG_METAGEMM_VERBOSE
3375  std::cout << __FILE__ << "(" << __LINE__
3376            << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3377               "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 7, "
3378               "8>::Multiply()"
3379            << std::endl
3380            << std::flush;
3381#endif
3382#endif
3383  asm volatile(
3384      "pld [%[lhs]]\n"
3385      "pld [%[rhs]]\n"
3386
3387      // Clear aggregators.
3388      "vmov.i32 q0, #0\n"
3389      "vmov.i32 q1, #0\n"
3390      "vmov.i32 q2, #0\n"
3391      "vmov.i32 q3, q0\n"
3392      "vmov.i32 q4, q1\n"
3393      "vmov.i32 q5, q2\n"
3394      "vmov.i32 q6, q3\n"
3395
3396      // General 1xM lanes loop.
3397      "1:"
3398
3399      // Subtract counter.
3400      "subs %[count], %[count], #8\n"
3401
3402      "vld1.32 {d14, d15, d16, d17}, [%[rhs]:64]!\n"
3403      "vld1.32 {d18}, [%[lhs]:64]!\n"
3404      "pld [%[lhs], #64]\n"
3405      "vmull.u8 q10, d14, d18\n"
3406      "vmull.u8 q11, d15, d18\n"
3407      "vmull.u8 q12, d16, d18\n"
3408      "vmull.u8 q13, d17, d18\n"
3409      "vld1.32 {d14, d15, d16}, [%[rhs]:64]!\n"
3410      "pld [%[rhs], #128]\n"
3411      "vpadal.u16 q0, q10\n"
3412      "vpadal.u16 q1, q11\n"
3413      "vpadal.u16 q2, q12\n"
3414      "vpadal.u16 q3, q13\n"
3415      "vmull.u8 q10, d14, d18\n"
3416      "vmull.u8 q11, d15, d18\n"
3417      "vmull.u8 q12, d16, d18\n"
3418      "vpadal.u16 q4, q10\n"
3419      "vpadal.u16 q5, q11\n"
3420      "vpadal.u16 q6, q12\n"
3421
3422      // Loop break.
3423      "bgt 1b\n"
3424
3425      // StaticQuantizationFloat::Prepare
3426      "vld1.32 {d14, d15}, [%[lhs]:64]!\n"
3427      "vld1.32 {d16, d17, d18, d19}, [%[rhs]:64]!\n"
3428      "vdup.32 q10, %[scale]\n"
3429      "vdup.32 q7, d14[0]\n"
3430
3431      // RowMajorOutput::Prepare
3432
3433      // Reduce aggregators.
3434      "vpadd.u32 d0, d0, d1\n"
3435      "vpadd.u32 d2, d2, d3\n"
3436      "vpadd.u32 d4, d4, d5\n"
3437      "vpadd.u32 d6, d6, d7\n"
3438      "vpadd.u32 d8, d8, d9\n"
3439      "vpadd.u32 d10, d10, d11\n"
3440      "vpadd.u32 d12, d12, d13\n"
3441      "vpadd.u32 d0, d0, d2\n"
3442      "vpadd.u32 d1, d4, d6\n"
3443      "vpadd.u32 d2, d8, d10\n"
3444      "vpadd.u32 d3, d12, d12\n"
3445
3446      // StaticQuantizationFloat::Transform
3447      "vadd.s32 q0, q0, q7\n"
3448      "vadd.s32 q1, q1, q7\n"
3449      "vadd.s32 q0, q0, q8\n"
3450      "vadd.s32 q1, q1, q9\n"
3451      "vcvt.f32.s32 q0, q0\n"
3452      "vcvt.f32.s32 q1, q1\n"
3453      "vmul.f32 q0, q0, q10\n"
3454      "vmul.f32 q1, q1, q10\n"
3455
3456      // RowMajorOutput::Output
3457      "vst1.32 {d0, d1, d2}, [%[result]]!\n"
3458      "vst1.32 {d3[0]}, [%[result]]!\n"
3459      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3460      : [count] "r"(params.kernel.count),
3461        [stride] "r"(params.output_stream.stride),
3462        [scale] "r"(params.kernel.scale)
3463      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
3464        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
3465        "d21", "d22", "d23", "d24", "d25", "d26", "d27", "cc", "memory");
3466}
3467
3468template <>
3469inline void MulKernel<
3470    uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 8,
3471    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3472                 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3473                                         RowMajor>& params,
3474                 float* result) {
3475#ifdef DEBUG
3476#ifdef DEBUG_METAGEMM_VERBOSE
3477  std::cout << __FILE__ << "(" << __LINE__
3478            << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3479               "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 8, "
3480               "8>::Multiply()"
3481            << std::endl
3482            << std::flush;
3483#endif
3484#endif
3485  asm volatile(
3486      "pld [%[lhs]]\n"
3487      "pld [%[rhs]]\n"
3488
3489      // Clear aggregators.
3490      "vmov.i32 q0, #0\n"
3491      "vmov.i32 q1, #0\n"
3492      "vmov.i32 q2, #0\n"
3493      "vmov.i32 q3, q0\n"
3494      "vmov.i32 q4, q1\n"
3495      "vmov.i32 q5, q2\n"
3496      "vmov.i32 q6, q3\n"
3497      "vmov.i32 q7, q4\n"
3498
3499      // 1x8 lanes loop.
3500      "1:"
3501
3502      "vld1.32 {d17, d18, d19, d20}, [%[rhs]:256]!\n"
3503      "vld1.32 {d16}, [%[lhs]:64]!\n"
3504      "vmull.u8 q11, d16, d17\n"
3505      "vmull.u8 q12, d16, d18\n"
3506      "vmull.u8 q13, d16, d19\n"
3507      "vmull.u8 q14, d16, d20\n"
3508      "vld1.32 {d17, d18, d19, d20}, [%[rhs]:256]!\n"
3509      "vpadal.u16 q0, q11\n"
3510      "vpadal.u16 q1, q12\n"
3511      "vpadal.u16 q2, q13\n"
3512      "vpadal.u16 q3, q14\n"
3513      "pld [%[rhs], #256]\n"
3514      "vmull.u8 q15, d16, d17\n"
3515      "vmull.u8 q11, d16, d18\n"
3516      "vmull.u8 q12, d16, d19\n"
3517      "vmull.u8 q13, d16, d20\n"
3518      "pld [%[lhs], #32]\n"
3519
3520      // Subtract counter.
3521      "subs %[count], %[count], #8\n"
3522
3523      "vpadal.u16 q4, q15\n"
3524      "vpadal.u16 q5, q11\n"
3525      "vpadal.u16 q6, q12\n"
3526      "vpadal.u16 q7, q13\n"
3527
3528      // Loop break.
3529      "bgt 1b\n"
3530
3531      // StaticQuantizationFloat::Prepare
3532      "vld1.32 {d16, d17}, [%[lhs]:64]!\n"
3533      "vld1.32 {d18, d19, d20, d21}, [%[rhs]:64]!\n"
3534      "vdup.32 q11, %[scale]\n"
3535      "vdup.32 q8, d16[0]\n"
3536
3537      // RowMajorOutput::Prepare
3538
3539      // Reduce aggregators.
3540      "vpadd.u32 d0, d0, d1\n"
3541      "vpadd.u32 d2, d2, d3\n"
3542      "vpadd.u32 d4, d4, d5\n"
3543      "vpadd.u32 d6, d6, d7\n"
3544      "vpadd.u32 d8, d8, d9\n"
3545      "vpadd.u32 d10, d10, d11\n"
3546      "vpadd.u32 d12, d12, d13\n"
3547      "vpadd.u32 d14, d14, d15\n"
3548      "vpadd.u32 d0, d0, d2\n"
3549      "vpadd.u32 d1, d4, d6\n"
3550      "vpadd.u32 d2, d8, d10\n"
3551      "vpadd.u32 d3, d12, d14\n"
3552
3553      // StaticQuantizationFloat::Transform
3554      "vadd.s32 q0, q0, q8\n"
3555      "vadd.s32 q1, q1, q8\n"
3556      "vadd.s32 q0, q0, q9\n"
3557      "vadd.s32 q1, q1, q10\n"
3558      "vcvt.f32.s32 q0, q0\n"
3559      "vcvt.f32.s32 q1, q1\n"
3560      "vmul.f32 q0, q0, q11\n"
3561      "vmul.f32 q1, q1, q11\n"
3562
3563      // RowMajorOutput::Output
3564      "vst1.32 {d0, d1, d2, d3}, [%[result]]!\n"
3565      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3566      : [count] "r"(params.kernel.count),
3567        [stride] "r"(params.output_stream.stride),
3568        [scale] "r"(params.kernel.scale)
3569      : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
3570        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
3571        "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
3572        "d31", "cc", "memory");
3573}
3574
3575template <>
3576inline void MulKernel<
3577    uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 1,
3578    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3579                 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3580                                         RowMajor>& params,
3581                 float* result) {
3582#ifdef DEBUG
3583#ifdef DEBUG_METAGEMM_VERBOSE
3584  std::cout << __FILE__ << "(" << __LINE__
3585            << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3586               "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 1, "
3587               "8>::Multiply()"
3588            << std::endl
3589            << std::flush;
3590#endif
3591#endif
3592  asm volatile(
3593      "pld [%[lhs]]\n"
3594      "pld [%[rhs]]\n"
3595
3596      // Clear aggregators.
3597      "vmov.i32 q0, #0\n"
3598      "vmov.i32 q1, #0\n"
3599
3600      // General NxM lanes loop.
3601      "1:"
3602
3603      // Subtract counter.
3604      "subs %[count], %[count], #8\n"
3605
3606      "vld1.32 {d4, d5}, [%[lhs]:64]!\n"
3607      "vld1.32 {d6}, [%[rhs]:64]!\n"
3608      "pld [%[lhs], #64]\n"
3609      "pld [%[rhs], #64]\n"
3610      "vmull.u8 q4, d6, d4\n"
3611      "vmull.u8 q5, d6, d5\n"
3612      "vpadal.u16 q0, q4\n"
3613      "vpadal.u16 q1, q5\n"
3614
3615      // Loop break.
3616      "bgt 1b\n"
3617
3618      // StaticQuantizationFloat::Prepare
3619      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
3620      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
3621      "vdup.32 q6, %[scale]\n"
3622      "vdup.32 q2, d8[0]\n"
3623      "vdup.32 q4, d8[1]\n"
3624
3625      // RowMajorOutput::Prepare
3626      "add r0, %[result], %[stride]\n"
3627
3628      // Reduce aggregators.
3629      "vpadd.u32 d0, d0, d1\n"
3630      "vpadd.u32 d0, d0, d0\n"
3631      "vpadd.u32 d2, d2, d3\n"
3632      "vpadd.u32 d2, d2, d2\n"
3633
3634      // StaticQuantizationFloat::Transform
3635      "vadd.s32 q0, q0, q2\n"
3636      "vadd.s32 q1, q1, q4\n"
3637      "vadd.s32 q0, q0, q5\n"
3638      "vadd.s32 q1, q1, q5\n"
3639      "vcvt.f32.s32 q0, q0\n"
3640      "vcvt.f32.s32 q1, q1\n"
3641      "vmul.f32 q0, q0, q6\n"
3642      "vmul.f32 q1, q1, q6\n"
3643
3644      // RowMajorOutput::Output
3645      "vst1.32 {d0[0]}, [%[result]]!\n"
3646      "vst1.32 {d2[0]}, [r0]!\n"
3647      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3648      : [count] "r"(params.kernel.count),
3649        [stride] "r"(params.output_stream.stride),
3650        [scale] "r"(params.kernel.scale)
3651      : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d8", "d9", "d10",
3652        "d11", "d12", "d13", "cc", "memory");
3653}
3654
3655template <>
3656inline void MulKernel<
3657    uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 2,
3658    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3659                 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3660                                         RowMajor>& params,
3661                 float* result) {
3662#ifdef DEBUG
3663#ifdef DEBUG_METAGEMM_VERBOSE
3664  std::cout << __FILE__ << "(" << __LINE__
3665            << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3666               "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 2, "
3667               "8>::Multiply()"
3668            << std::endl
3669            << std::flush;
3670#endif
3671#endif
3672  asm volatile(
3673      "pld [%[lhs]]\n"
3674      "pld [%[rhs]]\n"
3675
3676      // Clear aggregators.
3677      "vmov.i32 q0, #0\n"
3678      "vmov.i32 q1, #0\n"
3679      "vmov.i32 q2, #0\n"
3680      "vmov.i32 q3, q0\n"
3681
3682      // General NxM lanes loop.
3683      "1:"
3684
3685      // Subtract counter.
3686      "subs %[count], %[count], #8\n"
3687
3688      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
3689      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
3690      "pld [%[lhs], #64]\n"
3691      "pld [%[rhs], #64]\n"
3692      "vmull.u8 q6, d10, d8\n"
3693      "vmull.u8 q7, d11, d8\n"
3694      "vmull.u8 q8, d10, d9\n"
3695      "vmull.u8 q9, d11, d9\n"
3696      "vpadal.u16 q0, q6\n"
3697      "vpadal.u16 q1, q7\n"
3698      "vpadal.u16 q2, q8\n"
3699      "vpadal.u16 q3, q9\n"
3700
3701      // Loop break.
3702      "bgt 1b\n"
3703
3704      // StaticQuantizationFloat::Prepare
3705      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
3706      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
3707      "vdup.32 q6, %[scale]\n"
3708      "vdup.32 q7, d8[0]\n"
3709      "vdup.32 q4, d8[1]\n"
3710
3711      // RowMajorOutput::Prepare
3712      "add r0, %[result], %[stride]\n"
3713
3714      // Reduce aggregators.
3715      "vpadd.u32 d0, d0, d1\n"
3716      "vpadd.u32 d2, d2, d3\n"
3717      "vpadd.u32 d0, d0, d2\n"
3718      "vpadd.u32 d4, d4, d5\n"
3719      "vpadd.u32 d6, d6, d7\n"
3720      "vpadd.u32 d4, d4, d6\n"
3721
3722      // StaticQuantizationFloat::Transform
3723      "vadd.s32 q0, q0, q7\n"
3724      "vadd.s32 q2, q2, q4\n"
3725      "vadd.s32 q0, q0, q5\n"
3726      "vadd.s32 q2, q2, q5\n"
3727      "vcvt.f32.s32 q0, q0\n"
3728      "vcvt.f32.s32 q2, q2\n"
3729      "vmul.f32 q0, q0, q6\n"
3730      "vmul.f32 q2, q2, q6\n"
3731
3732      // RowMajorOutput::Output
3733      "vst1.32 {d0}, [%[result]]!\n"
3734      "vst1.32 {d4}, [r0]!\n"
3735      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3736      : [count] "r"(params.kernel.count),
3737        [stride] "r"(params.output_stream.stride),
3738        [scale] "r"(params.kernel.scale)
3739      : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
3740        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "cc",
3741        "memory");
3742}
3743
3744template <>
3745inline void MulKernel<
3746    uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 3,
3747    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3748                 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3749                                         RowMajor>& params,
3750                 float* result) {
3751#ifdef DEBUG
3752#ifdef DEBUG_METAGEMM_VERBOSE
3753  std::cout << __FILE__ << "(" << __LINE__
3754            << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3755               "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 3, "
3756               "8>::Multiply()"
3757            << std::endl
3758            << std::flush;
3759#endif
3760#endif
3761  asm volatile(
3762      "pld [%[lhs]]\n"
3763      "pld [%[rhs]]\n"
3764
3765      // Clear aggregators.
3766      "vmov.i32 q0, #0\n"
3767      "vmov.i32 q1, #0\n"
3768      "vmov.i32 q2, #0\n"
3769      "vmov.i32 q3, q0\n"
3770      "vmov.i32 q4, q1\n"
3771      "vmov.i32 q5, q2\n"
3772
3773      // General NxM lanes loop.
3774      "1:"
3775
3776      // Subtract counter.
3777      "subs %[count], %[count], #8\n"
3778
3779      "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
3780      "vld1.32 {d14, d15, d16}, [%[rhs]:64]!\n"
3781      "pld [%[lhs], #64]\n"
3782      "pld [%[rhs], #64]\n"
3783      "vmull.u8 q9, d14, d12\n"
3784      "vmull.u8 q10, d15, d12\n"
3785      "vmull.u8 q11, d16, d12\n"
3786      "vmull.u8 q12, d14, d13\n"
3787      "vmull.u8 q13, d15, d13\n"
3788      "vmull.u8 q14, d16, d13\n"
3789      "vpadal.u16 q0, q9\n"
3790      "vpadal.u16 q1, q10\n"
3791      "vpadal.u16 q2, q11\n"
3792      "vpadal.u16 q3, q12\n"
3793      "vpadal.u16 q4, q13\n"
3794      "vpadal.u16 q5, q14\n"
3795
3796      // Loop break.
3797      "bgt 1b\n"
3798
3799      // StaticQuantizationFloat::Prepare
3800      "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
3801      "vld1.32 {d14, d15}, [%[rhs]:64]!\n"
3802      "vdup.32 q8, %[scale]\n"
3803      "vdup.32 q9, d12[0]\n"
3804      "vdup.32 q6, d12[1]\n"
3805
3806      // RowMajorOutput::Prepare
3807      "add r0, %[result], %[stride]\n"
3808
3809      // Reduce aggregators.
3810      "vpadd.u32 d0, d0, d1\n"
3811      "vpadd.u32 d2, d2, d3\n"
3812      "vpadd.u32 d4, d4, d5\n"
3813      "vpadd.u32 d0, d0, d2\n"
3814      "vpadd.u32 d1, d4, d4\n"
3815      "vpadd.u32 d6, d6, d7\n"
3816      "vpadd.u32 d8, d8, d9\n"
3817      "vpadd.u32 d10, d10, d11\n"
3818      "vpadd.u32 d6, d6, d8\n"
3819      "vpadd.u32 d7, d10, d10\n"
3820
3821      // StaticQuantizationFloat::Transform
3822      "vadd.s32 q0, q0, q9\n"
3823      "vadd.s32 q3, q3, q6\n"
3824      "vadd.s32 q0, q0, q7\n"
3825      "vadd.s32 q3, q3, q7\n"
3826      "vcvt.f32.s32 q0, q0\n"
3827      "vcvt.f32.s32 q3, q3\n"
3828      "vmul.f32 q0, q0, q8\n"
3829      "vmul.f32 q3, q3, q8\n"
3830
3831      // RowMajorOutput::Output
3832      "vst1.32 {d0}, [%[result]]!\n"
3833      "vst1.32 {d1[0]}, [%[result]]!\n"
3834      "vst1.32 {d6}, [r0]!\n"
3835      "vst1.32 {d7[0]}, [r0]!\n"
3836      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3837      : [count] "r"(params.kernel.count),
3838        [stride] "r"(params.output_stream.stride),
3839        [scale] "r"(params.kernel.scale)
3840      : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
3841        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
3842        "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "cc",
3843        "memory");
3844}
3845
3846template <>
3847inline void MulKernel<
3848    uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 4,
3849    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3850                 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3851                                         RowMajor>& params,
3852                 float* result) {
3853#ifdef DEBUG
3854#ifdef DEBUG_METAGEMM_VERBOSE
3855  std::cout << __FILE__ << "(" << __LINE__
3856            << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3857               "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 4, "
3858               "8>::Multiply()"
3859            << std::endl
3860            << std::flush;
3861#endif
3862#endif
3863  asm volatile(
3864      "pld [%[lhs]]\n"
3865      "pld [%[rhs]]\n"
3866
3867      // Clear aggregators.
3868      "vmov.i32 q0, #0\n"
3869      "vmov.i32 q1, #0\n"
3870      "vmov.i32 q2, #0\n"
3871      "vmov.i32 q3, q0\n"
3872      "vmov.i32 q4, q1\n"
3873      "vmov.i32 q5, q2\n"
3874      "vmov.i32 q6, q3\n"
3875      "vmov.i32 q7, q4\n"
3876
3877      // 2x4 lanes loop.
3878      "1:"
3879
3880      "vld1.8 {d18, d19, d20, d21}, [%[rhs]:256]!\n"
3881      "vld1.8 {d16}, [%[lhs]:64]!\n"
3882      "vmull.u8 q11, d16, d18\n"
3883      "vld1.8 {d17}, [%[lhs]:64]!\n"
3884      "vmull.u8 q12, d16, d19\n"
3885      "pld [%[rhs], #64]\n"
3886      "vmull.u8 q13, d16, d20\n"
3887      "pld [%[lhs], #64]\n"
3888      "vmull.u8 q14, d16, d21\n"
3889      "vmull.u8 q15, d17, d18\n"
3890      "vpadal.u16 q0, q11\n"
3891      "vpadal.u16 q1, q12\n"
3892      "vpadal.u16 q2, q13\n"
3893      "vmull.u8 q11, d17, d19\n"
3894      "vmull.u8 q12, d17, d20\n"
3895      "vmull.u8 q13, d17, d21\n"
3896
3897      // Subtract counter.
3898      "subs %[count], %[count], #8\n"
3899
3900      "vpadal.u16 q3, q14\n"
3901      "vpadal.u16 q4, q15\n"
3902      "vpadal.u16 q5, q11\n"
3903      "vpadal.u16 q6, q12\n"
3904      "vpadal.u16 q7, q13\n"
3905
3906      // Loop break.
3907      "bgt 1b\n"
3908
3909      // StaticQuantizationFloat::Prepare
3910      "vld1.32 {d16, d17}, [%[lhs]:64]!\n"
3911      "vld1.32 {d18, d19}, [%[rhs]:64]!\n"
3912      "vdup.32 q10, %[scale]\n"
3913      "vdup.32 q11, d16[0]\n"
3914      "vdup.32 q8, d16[1]\n"
3915
3916      // RowMajorOutput::Prepare
3917      "add r0, %[result], %[stride]\n"
3918
3919      // Reduce aggregators.
3920      "vpadd.u32 d0, d0, d1\n"
3921      "vpadd.u32 d2, d2, d3\n"
3922      "vpadd.u32 d4, d4, d5\n"
3923      "vpadd.u32 d6, d6, d7\n"
3924      "vpadd.u32 d0, d0, d2\n"
3925      "vpadd.u32 d1, d4, d6\n"
3926      "vpadd.u32 d8, d8, d9\n"
3927      "vpadd.u32 d10, d10, d11\n"
3928      "vpadd.u32 d12, d12, d13\n"
3929      "vpadd.u32 d14, d14, d15\n"
3930      "vpadd.u32 d8, d8, d10\n"
3931      "vpadd.u32 d9, d12, d14\n"
3932
3933      // StaticQuantizationFloat::Transform
3934      "vadd.s32 q0, q0, q11\n"
3935      "vadd.s32 q4, q4, q8\n"
3936      "vadd.s32 q0, q0, q9\n"
3937      "vadd.s32 q4, q4, q9\n"
3938      "vcvt.f32.s32 q0, q0\n"
3939      "vcvt.f32.s32 q4, q4\n"
3940      "vmul.f32 q0, q0, q10\n"
3941      "vmul.f32 q4, q4, q10\n"
3942
3943      // RowMajorOutput::Output
3944      "vst1.32 {d0, d1}, [%[result]]!\n"
3945      "vst1.32 {d8, d9}, [r0]!\n"
3946      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3947      : [count] "r"(params.kernel.count),
3948        [stride] "r"(params.output_stream.stride),
3949        [scale] "r"(params.kernel.scale)
3950      : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
3951        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
3952        "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
3953        "d31", "cc", "memory");
3954}
3955
3956template <>
3957inline void MulKernel<
3958    uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 1,
3959    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3960                 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3961                                         RowMajor>& params,
3962                 float* result) {
3963#ifdef DEBUG
3964#ifdef DEBUG_METAGEMM_VERBOSE
3965  std::cout << __FILE__ << "(" << __LINE__
3966            << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3967               "QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 1, "
3968               "8>::Multiply()"
3969            << std::endl
3970            << std::flush;
3971#endif
3972#endif
3973  asm volatile(
3974      "pld [%[lhs]]\n"
3975      "pld [%[rhs]]\n"
3976
3977      // Clear aggregators.
3978      "vmov.i32 q0, #0\n"
3979      "vmov.i32 q1, #0\n"
3980      "vmov.i32 q2, #0\n"
3981
3982      // General NxM lanes loop.
3983      "1:"
3984
3985      // Subtract counter.
3986      "subs %[count], %[count], #8\n"
3987
3988      "vld1.32 {d6, d7, d8}, [%[lhs]:64]!\n"
3989      "vld1.32 {d9}, [%[rhs]:64]!\n"
3990      "pld [%[lhs], #64]\n"
3991      "pld [%[rhs], #64]\n"
3992      "vmull.u8 q5, d9, d6\n"
3993      "vmull.u8 q6, d9, d7\n"
3994      "vmull.u8 q7, d9, d8\n"
3995      "vpadal.u16 q0, q5\n"
3996      "vpadal.u16 q1, q6\n"
3997      "vpadal.u16 q2, q7\n"
3998
3999      // Loop break.
4000      "bgt 1b\n"
4001
4002      // StaticQuantizationFloat::Prepare
4003      "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
4004      "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
4005      "vdup.32 q6, %[scale]\n"
4006      "vdup.32 q3, d8[0]\n"
4007      "vdup.32 q7, d8[1]\n"
4008      "vdup.32 q4, d9[0]\n"
4009
4010      // RowMajorOutput::Prepare
4011      "add r0, %[result], %[stride]\n"
4012      "add r1, r0, %[stride]\n"
4013
4014      // Reduce aggregators.
4015      "vpadd.u32 d0, d0, d1\n"
4016      "vpadd.u32 d0, d0, d0\n"
4017      "vpadd.u32 d2, d2, d3\n"
4018      "vpadd.u32 d2, d2, d2\n"
4019      "vpadd.u32 d4, d4, d5\n"
4020      "vpadd.u32 d4, d4, d4\n"
4021
4022      // StaticQuantizationFloat::Transform
4023      "vadd.s32 q0, q0, q3\n"
4024      "vadd.s32 q1, q1, q7\n"
4025      "vadd.s32 q2, q2, q4\n"
4026      "vadd.s32 q0, q0, q5\n"
4027      "vadd.s32 q1, q1, q5\n"
4028      "vadd.s32 q2, q2, q5\n"
4029      "vcvt.f32.s32 q0, q0\n"
4030      "vcvt.f32.s32 q1, q1\n"
4031      "vcvt.f32.s32 q2, q2\n"
4032      "vmul.f32 q0, q0, q6\n"
4033      "vmul.f32 q1, q1, q6\n"
4034      "vmul.f32 q2, q2, q6\n"
4035
4036      // RowMajorOutput::Output
4037      "vst1.32 {d0[0]}, [%[result]]!\n"
4038      "vst1.32 {d2[0]}, [r0]!\n"
4039      "vst1.32 {d4[0]}, [r1]!\n"
4040      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
4041      : [count] "r"(params.kernel.count),
4042        [stride] "r"(params.output_stream.stride),
4043        [scale] "r"(params.kernel.scale)
4044      : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
4045        "d10", "d11", "d12", "d13", "d14", "d15", "cc", "memory");
4046}
4047
4048template <>
4049inline void MulKernel<
4050    uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 2,
4051    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
4052                 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
4053                                         RowMajor>& params,
4054                 float* result) {
4055#ifdef DEBUG
4056#ifdef DEBUG_METAGEMM_VERBOSE
4057  std::cout << __FILE__ << "(" << __LINE__
4058            << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
4059               "QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 2, "
4060               "8>::Multiply()"
4061            << std::endl
4062            << std::flush;
4063#endif
4064#endif
4065  asm volatile(
4066      "pld [%[lhs]]\n"
4067      "pld [%[rhs]]\n"
4068
4069      // Clear aggregators.
4070      "vmov.i32 q0, #0\n"
4071      "vmov.i32 q1, #0\n"
4072      "vmov.i32 q2, #0\n"
4073      "vmov.i32 q3, q0\n"
4074      "vmov.i32 q4, q1\n"
4075      "vmov.i32 q5, q2\n"
4076
4077      // General NxM lanes loop.
4078      "1:"
4079
4080      // Subtract counter.
4081      "subs %[count], %[count], #8\n"
4082
4083      "vld1.32 {d12, d13, d14}, [%[lhs]:64]!\n"
4084      "vld1.32 {d15, d16}, [%[rhs]:64]!\n"
4085      "pld [%[lhs], #64]\n"
4086      "pld [%[rhs], #64]\n"
4087      "vmull.u8 q9, d15, d12\n"
4088      "vmull.u8 q10, d16, d12\n"
4089      "vmull.u8 q11, d15, d13\n"
4090      "vmull.u8 q12, d16, d13\n"
4091      "vmull.u8 q13, d15, d14\n"
4092      "vmull.u8 q14, d16, d14\n"
4093      "vpadal.u16 q0, q9\n"
4094      "vpadal.u16 q1, q10\n"
4095      "vpadal.u16 q2, q11\n"
4096      "vpadal.u16 q3, q12\n"
4097      "vpadal.u16 q4, q13\n"
4098      "vpadal.u16 q5, q14\n"
4099
4100      // Loop break.
4101      "bgt 1b\n"
4102
4103      // StaticQuantizationFloat::Prepare
4104      "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
4105      "vld1.32 {d14, d15}, [%[rhs]:64]!\n"
4106      "vdup.32 q8, %[scale]\n"
4107      "vdup.32 q9, d12[0]\n"
4108      "vdup.32 q10, d12[1]\n"
4109      "vdup.32 q6, d13[0]\n"
4110
4111      // RowMajorOutput::Prepare
4112      "add r0, %[result], %[stride]\n"
4113      "add r1, r0, %[stride]\n"
4114
4115      // Reduce aggregators.
4116      "vpadd.u32 d0, d0, d1\n"
4117      "vpadd.u32 d2, d2, d3\n"
4118      "vpadd.u32 d0, d0, d2\n"
4119      "vpadd.u32 d4, d4, d5\n"
4120      "vpadd.u32 d6, d6, d7\n"
4121      "vpadd.u32 d4, d4, d6\n"
4122      "vpadd.u32 d8, d8, d9\n"
4123      "vpadd.u32 d10, d10, d11\n"
4124      "vpadd.u32 d8, d8, d10\n"
4125
4126      // StaticQuantizationFloat::Transform
4127      "vadd.s32 q0, q0, q9\n"
4128      "vadd.s32 q2, q2, q10\n"
4129      "vadd.s32 q4, q4, q6\n"
4130      "vadd.s32 q0, q0, q7\n"
4131      "vadd.s32 q2, q2, q7\n"
4132      "vadd.s32 q4, q4, q7\n"
4133      "vcvt.f32.s32 q0, q0\n"
4134      "vcvt.f32.s32 q2, q2\n"
4135      "vcvt.f32.s32 q4, q4\n"
4136      "vmul.f32 q0, q0, q8\n"
4137      "vmul.f32 q2, q2, q8\n"
4138      "vmul.f32 q4, q4, q8\n"
4139
4140      // RowMajorOutput::Output
4141      "vst1.32 {d0}, [%[result]]!\n"
4142      "vst1.32 {d4}, [r0]!\n"
4143      "vst1.32 {d8}, [r1]!\n"
4144      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
4145      : [count] "r"(params.kernel.count),
4146        [stride] "r"(params.output_stream.stride),
4147        [scale] "r"(params.kernel.scale)
4148      : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
4149        "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
4150        "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
4151        "cc", "memory");
4152}
4153
4154template <>
4155inline void MulKernel<
4156    uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 3,
4157    8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
4158                 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
4159                                         RowMajor>& params,
4160                 float* result) {
4161#ifdef DEBUG
4162#ifdef DEBUG_METAGEMM_VERBOSE
4163  std::cout << __FILE__ << "(" << __LINE__
4164            << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
4165               "QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 3, "
4166               "8>::Multiply()"
4167            << std::endl
4168            << std::flush;
4169#endif
4170#endif
4171  asm volatile(
4172      "pld [%[lhs]]\n"
4173      "pld [%[rhs]]\n"
4174
4175      // Clear aggregators.
4176      "vmov.i32 q0, #0\n"
4177      "vmov.i32 q1, #0\n"
4178      "vmov.i32 q2, #0\n"
4179      "vmov.i32 q3, q0\n"
4180      "vmov.i32 q4, q1\n"
4181      "vmov.i32 q5, q2\n"
4182      "vmov.i32 q6, q3\n"
4183      "vmov.i32 q7, q4\n"
4184      "vmov.i32 q8, q5\n"
4185
4186      // 3x3 lanes loop.
4187      "1:"
4188
4189      "vld1.8 {d21, d22, d23}, [%[rhs]:64]!\n"
4190      "vld1.8 {d18}, [%[lhs]:64]!\n"
4191      "vmull.u8 q12, d18, d21\n"
4192      "vld1.8 {d19}, [%[lhs]:64]!\n"
4193      "vmull.u8 q13, d18, d22\n"
4194      "vld1.8 {d20}, [%[lhs]:64]!\n"
4195      "vmull.u8 q14, d18, d23\n"
4196      "pld [%[lhs], #64]\n"
4197      "vmull.u8 q15, d19, d21\n"
4198      "pld [%[rhs], #64]\n"
4199      "vpadal.u16 q0, q12\n"
4200      "vpadal.u16 q1, q13\n"
4201      "vpadal.u16 q2, q14\n"
4202      "vpadal.u16 q3, q15\n"
4203      "vmull.u8 q12, d19, d22\n"
4204      "vmull.u8 q13, d19, d23\n"
4205      "vmull.u8 q14, d20, d21\n"
4206      "vmull.u8 q15, d20, d22\n"
4207
4208      // Subtract counter.
4209      "subs %[count], %[count], #8\n"
4210
4211      "vmull.u8 q9, d20, d23\n"
4212      "vpadal.u16 q4, q12\n"
4213      "vpadal.u16 q5, q13\n"
4214      "vpadal.u16 q6, q14\n"
4215      "vpadal.u16 q7, q15\n"
4216      "vpadal.u16 q8, q9\n"
4217
4218      // Loop break.
4219      "bgt 1b\n"
4220
4221      // StaticQuantizationFloat::Prepare
4222      "vld1.32 {d18, d19}, [%[lhs]:64]!\n"
4223      "vld1.32 {d20, d21}, [%[rhs]:64]!\n"
4224      "vdup.32 q11, %[scale]\n"
4225      "vdup.32 q12, d18[0]\n"
4226      "vdup.32 q13, d18[1]\n"
4227      "vdup.32 q9, d19[0]\n"
4228
4229      // RowMajorOutput::Prepare
4230      "add r0, %[result], %[stride]\n"
4231      "add r1, r0, %[stride]\n"
4232
4233      // Reduce aggregators.
4234      "vpadd.u32 d0, d0, d1\n"
4235      "vpadd.u32 d2, d2, d3\n"
4236      "vpadd.u32 d4, d4, d5\n"
4237      "vpadd.u32 d0, d0, d2\n"
4238      "vpadd.u32 d1, d4, d4\n"
4239      "vpadd.u32 d6, d6, d7\n"
4240      "vpadd.u32 d8, d8, d9\n"
4241      "vpadd.u32 d10, d10, d11\n"
4242      "vpadd.u32 d6, d6, d8\n"
4243      "vpadd.u32 d7, d10, d10\n"
4244      "vpadd.u32 d12, d12, d13\n"
4245      "vpadd.u32 d14, d14, d15\n"
4246      "vpadd.u32 d16, d16, d17\n"
4247      "vpadd.u32 d12, d12, d14\n"
4248      "vpadd.u32 d13, d16, d16\n"
4249
4250      // StaticQuantizationFloat::Transform
4251      "vadd.s32 q0, q0, q12\n"
4252      "vadd.s32 q3, q3, q13\n"
4253      "vadd.s32 q6, q6, q9\n"
4254      "vadd.s32 q0, q0, q10\n"
4255      "vadd.s32 q3, q3, q10\n"
4256      "vadd.s32 q6, q6, q10\n"
4257      "vcvt.f32.s32 q0, q0\n"
4258      "vcvt.f32.s32 q3, q3\n"
4259      "vcvt.f32.s32 q6, q6\n"
4260      "vmul.f32 q0, q0, q11\n"
4261      "vmul.f32 q3, q3, q11\n"
4262      "vmul.f32 q6, q6, q11\n"
4263
4264      // RowMajorOutput::Output
4265      "vst1.32 {d0}, [%[result]]!\n"
4266      "vst1.32 {d1[0]}, [%[result]]!\n"
4267      "vst1.32 {d6}, [r0]!\n"
4268      "vst1.32 {d7[0]}, [r0]!\n"
4269      "vst1.32 {d12}, [r1]!\n"
4270      "vst1.32 {d13[0]}, [r1]!\n"
4271      : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
4272      : [count] "r"(params.kernel.count),
4273        [stride] "r"(params.output_stream.stride),
4274        [scale] "r"(params.kernel.scale)
4275      : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
4276        "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
4277        "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
4278        "d30", "d31", "cc", "memory");
4279}
4280
4281}  // namespace meta
4282}  // namespace gemmlowp
4283
4284#else
4285#warning "Meta gemm for arm32 requires: GEMMLOWP_NEON_32!"
4286#endif
4287
4288#endif  // GEMMLOWP_META_QUANTIZED_MUL_KERNELS_ARM_32_H_
4289