// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// kernel_neon.h: a collection of NEON optimized kernels.
// Check in kernel_default.h which one(s) are actually used by default.
// Others are mere experiments; they are still covered by tests
// in case they might be useful some day.
1975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
2075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#ifndef GEMMLOWP_INTERNAL_KERNEL_NEON_H_
2175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#define GEMMLOWP_INTERNAL_KERNEL_NEON_H_
2275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
23544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang#include "kernel.h"
2475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
250a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang#include <arm_neon.h>
267b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#include <cassert>
2775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
2875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobnamespace gemmlowp {
2975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
30544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang// The kernels here are specifically arm 32bit assembly, not arm 64bit.
317b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#ifdef GEMMLOWP_NEON_32
32544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang
3375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Our main GEMM kernel.
347b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangstruct NEON_32_Kernel12x4Depth2 : KernelBase {
3575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  typedef KernelFormat<KernelSideFormat<CellFormat<4, 2>, 3>,
367b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang                       KernelSideFormat<CellFormat<4, 2>, 1> >
377b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang      Format;
3875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
39544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang  const char* Name() const override { return "NEON, 12x4, depth 2"; }
4075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
4175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  // TODO(benoitjacob): reorder function arguments so dst comes last
427b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride,
437b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang           std::size_t dst_col_stride, const std::uint8_t* lhs_ptr,
447b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang           const std::uint8_t* rhs_ptr, std::size_t start_depth,
457b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang           std::size_t run_depth) const override {
460a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    ScopedProfilingLabel label("optimized kernel (NEON 12x4)");
4775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
487b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// For iOS assembler, the %= style of local labels cause compilation errors,
497b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang//  so use numerical ones instead. See
507b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// http://stackoverflow.com/questions/3898435/labels-in-gcc-inline-assembly
517b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// If you add any labels, remember to undef them at the end.
527b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#define GEMMLOWP_LOOP_NEON_KERNEL_12X4_DEPTH2 "1"
537b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#define GEMMLOWP_STORE_RESULT_NEON_KERNEL_12X4_DEPTH2 "2"
5475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
557b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    assert(dst_row_stride == 1);
5675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    asm volatile(
5775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // Clear accumulator registers (see layout below)
5875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmov.s32 q4, #0\n"
5975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmov.s32 q8, q4\n"
6075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmov.s32 q12, q4\n"
6175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmov.s32 q5, q4\n"
6275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmov.s32 q9, q4\n"
6375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmov.s32 q13, q4\n"
6475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmov.s32 q6, q4\n"
6575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmov.s32 q10, q4\n"
6675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmov.s32 q14, q4\n"
6775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmov.s32 q7, q4\n"
6875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmov.s32 q11, q4\n"
6975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmov.s32 q15, q4\n"
7075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
7175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        /* Main loop */
7275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
737b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        GEMMLOWP_LOOP_NEON_KERNEL_12X4_DEPTH2
747b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        ":\n"
7575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
7675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // Overview of register layout:
7775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        //
7875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // A 2x4 cell of Rhs is stored in 16bit in d0--d1 (q0).
7975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in d2--d7
8075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // (q1--q3).
8175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // A 12x4 block of accumulators is stored in 32bit in q4--q15.
8275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        //
8375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        //                   +-----+-----+-----+-----+
8475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        //                   |d0[0]|d0[1]|d0[2]|d0[3]|
8575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        //              Rhs  +-----+-----+-----+-----+
8675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        //                   |d1[0]|d1[1]|d1[2]|d1[3]|
8775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        //                   +-----+-----+-----+-----+
8875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        //
8975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        //                   |     |     |     |     |
9075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        //
9175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        //    Lhs            |     |     |     |     |
9275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        //
9375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        //  +--+--+ - - - -  +-----+-----+-----+-----+
9475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        //  |d2|d3|          | q4  | q5  | q6  | q7  |
9575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        //  |d2|d3|          | q4  | q5  | q6  | q7  |
9675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        //  |d2|d3|          | q4  | q5  | q6  | q7  |
9775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        //  |d2|d3|          | q4  | q5  | q6  | q7  |
9875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        //  +--+--+ - - - -  +-----+-----+-----+-----+
9975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        //  |d4|d5|          | q8  | q9  | q10 | q11 |
10075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        //  |d4|d5|          | q8  | q9  | q10 | q11 |
10175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        //  |d4|d5|          | q8  | q9  | q10 | q11 |
10275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        //  |d4|d5|          | q8  | q9  | q10 | q11 |
10375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        //  +--+--+ - - - -  +-----+-----+-----+-----+
10475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        //  |d6|d7|          | q12 | q13 | q14 | q15 |
10575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        //  |d6|d7|          | q12 | q13 | q14 | q15 |
10675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        //  |d6|d7|          | q12 | q13 | q14 | q15 |
10775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        //  |d6|d7|          | q12 | q13 | q14 | q15 |
10875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        //  +--+--+ - - - -  +-----+-----+-----+-----+
10975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        //
11075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        //                            Accumulator
11175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
11275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // Load 1 Rhs cell of size 2x4
11375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vld1.8 {d0}, [%[rhs_ptr]:64]!\n"
11475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
11575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // Load 3 Lhs cells of size 4x2 each
11675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vld1.8 {d2}, [%[lhs_ptr]:64]!\n"
11775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vld1.8 {d4}, [%[lhs_ptr]:64]!\n"
11875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vld1.8 {d6}, [%[lhs_ptr]:64]!\n"
11975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
12075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // Expand Lhs/Rhs cells to 16 bit.
12175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmovl.u8 q0, d0\n"
12275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmovl.u8 q1, d2\n"
12375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmovl.u8 q2, d4\n"
12475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmovl.u8 q3, d6\n"
12575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
12675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // Multiply-accumulate, level of depth 0
12775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmlal.u16 q4, d2, d0[0]\n"
12875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmlal.u16 q5, d2, d0[1]\n"
12975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmlal.u16 q6, d2, d0[2]\n"
13075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmlal.u16 q7, d2, d0[3]\n"
13175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmlal.u16 q8, d4, d0[0]\n"
13275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmlal.u16 q9, d4, d0[1]\n"
13375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmlal.u16 q10, d4, d0[2]\n"
13475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmlal.u16 q11, d4, d0[3]\n"
13575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmlal.u16 q12, d6, d0[0]\n"
13675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmlal.u16 q13, d6, d0[1]\n"
13775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmlal.u16 q14, d6, d0[2]\n"
13875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmlal.u16 q15, d6, d0[3]\n"
13975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
14075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // Multiply-accumulate, level of depth 1
14175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmlal.u16 q4, d3, d1[0]\n"
14275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmlal.u16 q5, d3, d1[1]\n"
14375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmlal.u16 q6, d3, d1[2]\n"
14475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmlal.u16 q7, d3, d1[3]\n"
14575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmlal.u16 q8, d5, d1[0]\n"
14675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmlal.u16 q9, d5, d1[1]\n"
14775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmlal.u16 q10, d5, d1[2]\n"
14875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmlal.u16 q11, d5, d1[3]\n"
14975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmlal.u16 q12, d7, d1[0]\n"
15075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmlal.u16 q13, d7, d1[1]\n"
15175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmlal.u16 q14, d7, d1[2]\n"
15275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vmlal.u16 q15, d7, d1[3]\n"
15375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
15475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // Loop. Decrement loop index (depth) by 2, since we just handled 2
15575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // levels of depth (Kernel::kDepth=2).
15675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "subs %[run_depth], #2\n"
1577b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "bne " GEMMLOWP_LOOP_NEON_KERNEL_12X4_DEPTH2
1587b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "b\n"
15975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
16075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        /* end of main loop */
16175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
16275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        /* Accumulate our local accumulator registers into the destination block
16375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob           */
16475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
16575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // Compute stride between consecutive columns, in bytes
16675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "mov r0, #4\n"  // multiply by 4 = sizeof(int32)
16775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "mul %[dst_col_stride], r0\n"
16875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
16975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // If start_depth == 0, then there is no preexisting accumulator
17075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // to accumulate, so we can simply store our result.
17175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "cmp %[start_depth], #0\n"
1727b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "beq " GEMMLOWP_STORE_RESULT_NEON_KERNEL_12X4_DEPTH2
1737b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "f\n"
17475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
17575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "mov r0, %[dst_ptr]\n"
17675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
17775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // Load a column
17875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "mov r1, r0\n"
17975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vld1.32 {d0, d1}, [r1]!\n"
18075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vld1.32 {d2, d3}, [r1]!\n"
18175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vld1.32 {d4, d5}, [r1]!\n"
18275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // Accumulate a column
18375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vadd.s32 q4, q4, q0\n"
18475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vadd.s32 q8, q8, q1\n"
18575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vadd.s32 q12, q12, q2\n"
18675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
18775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "add r0, %[dst_col_stride]\n"
18875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // Load a column
18975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "mov r1, r0\n"
19075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vld1.32 {d0, d1}, [r1]!\n"
19175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vld1.32 {d2, d3}, [r1]!\n"
19275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vld1.32 {d4, d5}, [r1]!\n"
19375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // Accumulate a column
19475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vadd.s32 q5, q5, q0\n"
19575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vadd.s32 q9, q9, q1\n"
19675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vadd.s32 q13, q13, q2\n"
19775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
19875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "add r0, %[dst_col_stride]\n"
19975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // Load a column
20075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "mov r1, r0\n"
20175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vld1.32 {d0, d1}, [r1]!\n"
20275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vld1.32 {d2, d3}, [r1]!\n"
20375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vld1.32 {d4, d5}, [r1]!\n"
20475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // Accumulate a column
20575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vadd.s32 q6, q6, q0\n"
20675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vadd.s32 q10, q10, q1\n"
20775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vadd.s32 q14, q14, q2\n"
20875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
20975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "add r0, %[dst_col_stride]\n"
21075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // Load a column
21175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "mov r1, r0\n"
21275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vld1.32 {d0, d1}, [r1]!\n"
21375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vld1.32 {d2, d3}, [r1]!\n"
21475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vld1.32 {d4, d5}, [r1]!\n"
21575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // Accumulate a column
21675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vadd.s32 q7, q7, q0\n"
21775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vadd.s32 q11, q11, q1\n"
21875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vadd.s32 q15, q15, q2\n"
21975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
2207b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        GEMMLOWP_STORE_RESULT_NEON_KERNEL_12X4_DEPTH2
2217b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        ":\n"
22275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
22375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "mov r0, %[dst_ptr]\n"
22475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // Store a column
22575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "mov r1, r0\n"
22675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vst1.32 {d8, d9}, [r1]!\n"
22775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vst1.32 {d16, d17}, [r1]!\n"
22875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vst1.32 {d24, d25}, [r1]!\n"
22975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // Store a column
23075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "add r0, %[dst_col_stride]\n"
23175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "mov r1, r0\n"
23275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vst1.32 {d10, d11}, [r1]!\n"
23375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vst1.32 {d18, d19}, [r1]!\n"
23475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vst1.32 {d26, d27}, [r1]!\n"
23575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // Store a column
23675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "add r0, %[dst_col_stride]\n"
23775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "mov r1, r0\n"
23875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vst1.32 {d12, d13}, [r1]!\n"
23975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vst1.32 {d20, d21}, [r1]!\n"
24075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vst1.32 {d28, d29}, [r1]!\n"
24175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // Store a column
24275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "add r0, %[dst_col_stride]\n"
24375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "mov r1, r0\n"
24475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vst1.32 {d14, d15}, [r1]!\n"
24575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vst1.32 {d22, d23}, [r1]!\n"
24675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "vst1.32 {d30, d31}, [r1]!\n"
24775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        :  // outputs
24875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
24975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        [dst_ptr] "+r"(dst_ptr),
25075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        [run_depth] "+r"(run_depth)
25175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        :  // inputs
25275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        [start_depth] "r"(start_depth),
25375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        [dst_col_stride] "r"(dst_col_stride)
25475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        :  // clobbers
25575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "cc", "memory", "r0", "r1",
25675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // note: someone on internet says that quad registers are
25775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // unsupported in the clobber list!
25875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
25975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
26075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
26175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "d31");
2627b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#undef GEMMLOWP_LOOP_NEON_KERNEL_12X4_DEPTH2
2637b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#undef GEMMLOWP_STORE_RESULT_NEON_KERNEL_12X4_DEPTH2
26475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  }
26575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob};
26675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
2677b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangstruct NEON_32_Kernel12x4Depth2Assuming12BitProducts : KernelBase {
2680a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  typedef KernelFormat<
2690a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 3>,
2707b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang      KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1> >
2717b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang      Format;
27275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
2730a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  const char* Name() const override {
2740a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    return "NEON, 12x4, depth 2, assuming 12-bit products";
2750a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  }
27675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
2770a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  // TODO(benoitjacob): reorder function arguments so dst comes last
2787b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride,
2797b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang           std::size_t dst_col_stride, const std::uint8_t* lhs_ptr,
2807b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang           const std::uint8_t* rhs_ptr, std::size_t start_depth,
2817b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang           std::size_t run_depth) const override {
2820a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    ScopedProfilingLabel label(
2830a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "optimized kernel (NEON 12x4, assuming 12-bit products)");
28475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    assert(dst_row_stride == 1);
28575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
2867b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// See comments above for why we need local numerical labels in our asm.
2877b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#define GEMMLOWP_LOOP_NEON_32_KERNEL_12X4_DEPTH2_ASSUMING_12BIT_PRODUCTS "1"
2887b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#define GEMMLOWP_LOAD_GLOBAL_ACCUMULATORS_NEON_32_KERNEL_12X4_DEPTH2_12BIT "2"
2897b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#define GEMMLOWP_LABEL_32 "3"
2907b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#define GEMMLOWP_LABEL_24 "4"
2917b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#define GEMMLOWP_LABEL_16 "5"
2927b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#define GEMMLOWP_LABEL_8 "6"
2937b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#define GEMMLOWP_LABEL_2 "7"
2947b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
2950a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    // This kernel is special in that it uses local 16-bit accumulators.
2960a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    // Because it assumes that each product fits in 12 bits, it can accumulate
2970a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    // 16 products into a local 16-bit accumulator without risking overflow.
2980a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    // At that point, it must accumulate these local 16-bit accumulators back
2990a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    // into global 32-bit accumulators, which have to be stored in memory for
3000a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    // lack of register space.
3010a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    // This 12x4 block of global accumulators is laid out as 3 cells of size 4x4
3020a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    // stored in diagonal-major order like this for the first 4x4 cell:
3030a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //
3040a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //   0   4   8  12
3050a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //  13   1   5   9
3060a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //  10  14   2   6
3070a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //   7  11  15   3
3080a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //
3090a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    // and likewise for the 2nd  cell (16--31) and 3rd cell (32--47)
3100a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    std::int32_t global_accumulators[3 * 4 * 4];
3110a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    asm volatile(
3120a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // Compute stride between consecutive columns, in bytes
3130a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "mov r0, #4\n"  // multiply by 4 = sizeof(int32)
3140a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "mul %[dst_col_stride], r0\n"
31575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
31675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "cmp %[start_depth], #0\n"
3177b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "bne"
3187b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        " " GEMMLOWP_LOAD_GLOBAL_ACCUMULATORS_NEON_32_KERNEL_12X4_DEPTH2_12BIT
3197b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "f\n"
3200a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
3210a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // If start_depth==0, we need to clear our global accumulators
3220a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "mov r0, %[global_accumulators]\n"
3230a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vmov.s32 q8, #0\n"
3240a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vmov.s32 q9, q8\n"
3250a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vst1.32 {d16,d17,d18,d19}, [r0]!\n"
3260a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vst1.32 {d16,d17,d18,d19}, [r0]!\n"
3270a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vst1.32 {d16,d17,d18,d19}, [r0]!\n"
3280a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vst1.32 {d16,d17,d18,d19}, [r0]!\n"
3290a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vst1.32 {d16,d17,d18,d19}, [r0]!\n"
3300a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vst1.32 {d16,d17,d18,d19}, [r0]!\n"
3317b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "b " GEMMLOWP_LOOP_NEON_32_KERNEL_12X4_DEPTH2_ASSUMING_12BIT_PRODUCTS
3327b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "f\n"
3330a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
3340a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // If start_depth!=0, we need to load our existing global accumulators
3357b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        GEMMLOWP_LOAD_GLOBAL_ACCUMULATORS_NEON_32_KERNEL_12X4_DEPTH2_12BIT
3367b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        ":\n"
3370a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // Load global accumulators from destination matrix, column-major
3380a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "mov r1, %[dst_ptr]\n"
3390a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "mov r0, %[dst_col_stride]\n"
3400a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "sub r0, #32\n"
3410a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vld1.32 {d0,d1}, [r1]!\n"
3420a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vld1.32 {d8,d9}, [r1]!\n"
3430a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vld1.32 {d16,d17}, [r1], r0\n"
3440a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vld1.32 {d2,d3}, [r1]!\n"
3450a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vld1.32 {d10,d11}, [r1]!\n"
3460a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vld1.32 {d18,d19}, [r1], r0\n"
3470a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vld1.32 {d4,d5}, [r1]!\n"
3480a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vld1.32 {d12,d13}, [r1]!\n"
3490a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vld1.32 {d20,d21}, [r1], r0\n"
3500a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vld1.32 {d6,d7}, [r1]!\n"
3510a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vld1.32 {d14,d15}, [r1]!\n"
3520a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vld1.32 {d22,d23}, [r1], r0\n"
3530a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // Now we need to convert the global accumulator registers to
3540a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // 4x4-block-wise diagonal-major order. What we effectively want to do
3550a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // is to rotate the rows, however the accumulators are stored in
3560a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // column-major order in registers. So we achieve this by
3570a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // transposing, rotating the registers, and transposing again each
3580a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // 4x4 block.
3590a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        //
3600a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // Transpose 3 4x4 blocks separately
3610a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vtrn.32 q0, q1\n"
3620a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vtrn.32 q2, q3\n"
3630a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vswp d1, d4\n"
3640a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vswp d3, d6\n"
3650a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vtrn.32 q4, q5\n"
3660a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vtrn.32 q6, q7\n"
3670a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vswp d9, d12\n"
3680a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vswp d11, d14\n"
3690a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vtrn.32 q8, q9\n"
3700a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vtrn.32 q10, q11\n"
3710a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vswp d17, d20\n"
3720a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vswp d19, d22\n"
3730a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // Rotate the registers
3740a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vext.32 q1, q1, q1, #1\n"
3750a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vext.32 q2, q2, q2, #2\n"
3760a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vext.32 q3, q3, q3, #3\n"
3770a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vext.32 q5, q5, q5, #1\n"
3780a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vext.32 q6, q6, q6, #2\n"
3790a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vext.32 q7, q7, q7, #3\n"
3800a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vext.32 q9, q9, q9, #1\n"
3810a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vext.32 q10, q10, q10, #2\n"
3820a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vext.32 q11, q11, q11, #3\n"
3830a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // Transpose again and store into our global accumulators
3840a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // buffer. These two operations are done at once using vst4.
3850a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "mov r0, %[global_accumulators]\n"
3860a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vst4.32 {d0,d2,d4,d6}, [r0]!\n"
3870a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vst4.32 {d1,d3,d5,d7}, [r0]!\n"
3880a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vst4.32 {d8,d10,d12,d14}, [r0]!\n"
3890a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vst4.32 {d9,d11,d13,d15}, [r0]!\n"
3900a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vst4.32 {d16,d18,d20,d22}, [r0]!\n"
3910a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vst4.32 {d17,d19,d21,d23}, [r0]!\n"
39275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
39375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        /* Main loop */
39475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
3957b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        GEMMLOWP_LOOP_NEON_32_KERNEL_12X4_DEPTH2_ASSUMING_12BIT_PRODUCTS
3967b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        ":\n"
39775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
3980a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// Overview of register layout:
3990a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//
4000a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// Registers q4--q15 are the local 16-bit accumulators.
4010a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// However, each entry in the result matrix is represented
4020a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// by *two* local 16-bit accumulators: one for even levels
4030a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// of depth and one for odd levels of depth. These correspond
4040a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// to the scalars at even and odd indices within each q-register.
4050a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// Thus we effectively use 32 bits of register space for each
4060a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// entry in the result matrix. The accumulators register layout
4070a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// is the same as was described above for the global 32-bit
4080a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// accumulators (3 cells of size 4x4 in diagonal-major order)
4090a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// with the only difference that instead of 32bit values we have
4100a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// pairs of 16bit values.
4110a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//
4120a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// A 2x4 cell of Rhs is stored in 8bit in d0.
4130a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// A 12x2 block of 3 4x2 cells Lhs is stored in 8bit in d1--d3.
4140a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//
4150a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//                      +--------+--------+--------+--------+
4160a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//                      |d0[0]   |d0[2]   |d0[4]   |d0[6]   |
4170a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//                 Rhs  +--------+--------+--------+--------+
4180a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//                      |d0[1]   |d0[3]   |d0[5]   |d0[7]   |
4190a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//                      +--------+--------+--------+--------+
4200a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//
4210a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//                      |        |        |        |        |
4220a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//
4230a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//    Lhs               |        |        |        |        |
4240a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//
4250a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//  +-----+-----+ - - - +--------+--------+--------+--------+
4260a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//  |d1[0]|d1[1]|       |q4[0,1] |q5[0,1] |q6[0,1] |q7[0,1] |
4270a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//  |d1[2]|d1[3]|       |q7[2,3] |q4[2,3] |q5[2,3] |q6[2,3] |
4280a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//  |d1[4]|d1[5]|       |q6[4,5] |q7[4,5] |q4[4,5] |q5[4,5] |
4290a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//  |d1[6]|d1[7]|       |q5[6,7] |q6[6,7] |q7[6,7] |q4[6,7] |
4300a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//  +-----+-----+ - - - +--------+--------+--------+--------+
4310a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//  |d2[0]|d2[1]|       |q8[0,1] |q9[0,1] |q10[0,1]|q11[0,1]|
4320a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//  |d2[2]|d2[3]|       |q11[2,3]|q8[2,3] |q9[2,3] |q10[2,3]|
4330a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//  |d2[4]|d2[5]|       |q10[4,5]|q11[4,5]|q8[4,5] |q9[4,5] |
4340a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//  |d2[6]|d2[7]|       |q9[6,7] |q10[6,7]|q11[6,7]|q8[6,7] |
4350a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//  +-----+-----+ - - - +--------+--------+--------+--------+
4360a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//  |d3[0]|d3[1]|       |q12[0,1]|q13[0,1]|q14[0,1]|q15[0,1]|
4370a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//  |d3[2]|d3[3]|       |q15[2,3]|q12[2,3]|q13[2,3]|q14[2,3]|
4380a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//  |d3[4]|d3[5]|       |q14[4,5]|q15[4,5]|q12[4,5]|q13[4,5]|
4390a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//  |d3[6]|d3[7]|       |q13[6,7]|q14[6,7]|q15[6,7]|q12[6,7]|
4400a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//  +-----+-----+ - - - +--------+--------+--------+--------+
4410a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//
4420a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//                            Local 16-bit accumulators
4430a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//                         Note: 2 scalars per matrix entry
4440a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
4450a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang#define GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH \
4460a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  /* Load 3 Lhs cells of size 4x2 */          \
4470a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  "vld1.8 {d1,d2,d3}, [%[lhs_ptr]:64]!\n"     \
4480a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang                                              \
4490a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  /* Load 1 Rhs cell of size 2x4 */           \
4500a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  "vld1.8 {d0}, [%[rhs_ptr]:64]!\n"           \
4510a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang                                              \
4520a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  /* Multiply-accumulate */                   \
4530a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  "vmlal.u8 q4, d1, d0\n"                     \
4540a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  "vmlal.u8 q8, d2, d0\n"                     \
4550a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  "vmlal.u8 q12, d3, d0\n"                    \
4560a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  "vext.8 d0, d0, d0, #2\n"                   \
4570a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  "vmlal.u8 q5, d1, d0\n"                     \
4580a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  "vmlal.u8 q9, d2, d0\n"                     \
4590a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  "vmlal.u8 q13, d3, d0\n"                    \
4600a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  "vext.8 d0, d0, d0, #2\n"                   \
4610a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  "vmlal.u8 q6, d1, d0\n"                     \
4620a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  "vmlal.u8 q10, d2, d0\n"                    \
4630a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  "vmlal.u8 q14, d3, d0\n"                    \
4640a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  "vext.8 d0, d0, d0, #2\n"                   \
4650a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  "vmlal.u8 q7, d1, d0\n"                     \
4660a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  "vmlal.u8 q11, d2, d0\n"                    \
4670a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  "vmlal.u8 q15, d3, d0\n"                    \
4680a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang                                              \
4690a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  "sub %[run_depth], #2\n"
4700a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
4710a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang#define GEMMLOWP_ACCUMULATE_8_LEVELS_OF_DEPTH \
4720a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH       \
4730a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH       \
4740a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH       \
4750a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH
4760a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
4770a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // Clear local 16-bit accumulators
4780a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vmov.s32 q4, #0\n"
4790a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vmov.s32 q5, q4\n"
4800a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vmov.s32 q6, q4\n"
4810a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vmov.s32 q7, q4\n"
4820a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vmov.s32 q8, q4\n"
4830a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vmov.s32 q9, q4\n"
4840a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vmov.s32 q10, q4\n"
4850a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vmov.s32 q11, q4\n"
4860a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vmov.s32 q12, q4\n"
4870a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vmov.s32 q13, q4\n"
4880a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vmov.s32 q14, q4\n"
4890a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vmov.s32 q15, q4\n"
49075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
4910a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // Select a suitable number of depth levels
4920a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // to process at this iteration. TODO (benoitjacob) I guess that
4930a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // someone who really knows asm should make this a jump table.
4940a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "cmp %[run_depth], #32\n"
4957b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "bge " GEMMLOWP_LABEL_32
4967b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "f\n"
4970a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "cmp %[run_depth], #24\n"
4987b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "bge " GEMMLOWP_LABEL_24
4997b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "f\n"
5000a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "cmp %[run_depth], #16\n"
5017b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "bge " GEMMLOWP_LABEL_16
5027b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "f\n"
5030a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "cmp %[run_depth], #8\n"
5047b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "bge " GEMMLOWP_LABEL_8
5057b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "f\n"
5067b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "b " GEMMLOWP_LABEL_2 "f\n"
5077b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
5087b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        GEMMLOWP_LABEL_32
5097b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        ":\n" GEMMLOWP_ACCUMULATE_8_LEVELS_OF_DEPTH GEMMLOWP_LABEL_24
5107b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        ":\n" GEMMLOWP_ACCUMULATE_8_LEVELS_OF_DEPTH GEMMLOWP_LABEL_16
5117b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        ":\n" GEMMLOWP_ACCUMULATE_8_LEVELS_OF_DEPTH GEMMLOWP_LABEL_8
5127b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        ":\n" GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH
5130a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang            GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH
5147b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang                GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH GEMMLOWP_LABEL_2
5157b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        ":\n" GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH
5160a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
5170a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // Accumulate the local accumulators into the global accumulators.
5180a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // This is about summing adjacent pairs of 16-bit scalars into
5190a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // single 32-bit scalars, so we use pairwise long addition (vpadal).
5200a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "mov r0, %[global_accumulators]\n"
5210a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "mov r1, %[global_accumulators]\n"
5220a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vld1.32 {d0,d1,d2,d3}, [r0]!\n"
5230a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vld1.32 {d4,d5,d6,d7}, [r0]!\n"
5240a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vpadal.u16 q0, q4\n"
5250a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vpadal.u16 q1, q5\n"
5260a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vpadal.u16 q2, q6\n"
5270a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vpadal.u16 q3, q7\n"
5280a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vst1.32 {d0,d1,d2,d3}, [r1]!\n"
5290a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vst1.32 {d4,d5,d6,d7}, [r1]!\n"
5300a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vld1.32 {d0,d1,d2,d3}, [r0]!\n"
5310a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vld1.32 {d4,d5,d6,d7}, [r0]!\n"
5320a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vpadal.u16 q0, q8\n"
5330a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vpadal.u16 q1, q9\n"
5340a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vpadal.u16 q2, q10\n"
5350a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vpadal.u16 q3, q11\n"
5360a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vst1.32 {d0,d1,d2,d3}, [r1]!\n"
5370a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vst1.32 {d4,d5,d6,d7}, [r1]!\n"
5380a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vld1.32 {d0,d1,d2,d3}, [r0]!\n"
5390a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vld1.32 {d4,d5,d6,d7}, [r0]!\n"
5400a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vpadal.u16 q0, q12\n"
5410a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vpadal.u16 q1, q13\n"
5420a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vpadal.u16 q2, q14\n"
5430a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vpadal.u16 q3, q15\n"
5440a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vst1.32 {d0,d1,d2,d3}, [r1]!\n"
5450a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vst1.32 {d4,d5,d6,d7}, [r1]!\n"
5460a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
5477b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        // Loop.
5480a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "cmp %[run_depth], #0\n"
5497b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "bne " GEMMLOWP_LOOP_NEON_32_KERNEL_12X4_DEPTH2_ASSUMING_12BIT_PRODUCTS
5507b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "b\n"
5510a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
5520a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang#undef GEMMLOWP_CLEAR_LOCAL_ACCUMULATORS
5530a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang#undef GEMMLOWP_ACCUMULATE_8_LEVELS_OF_DEPTH
5540a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang#undef GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH
5550a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang#undef GEMMLOWP_ADD_TO_GLOBAL_ACCUMULATORS
55675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
55775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        /* end of main loop */
55875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
5590a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // Store the global accumulators to the destination matrix
5600a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // (column-major)
5610a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // This is the reverse of the steps that we followed at the beginning
5620a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // when we load the global accumulators from the destination matrix.
5630a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // The problem is the same: how to convert 4x4 blocks
5640a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // between column-major and diagonal-major orders.
5650a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // Like above, we do this by rotating rows, and we achieve that by
5660a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // transposing, rotating columns, and transposing again.
5670a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        //
5680a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // Load and transpose 4x4 blocks of global accumulators
5690a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // These two steps are done at once by the vld4 instruction.
5700a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "mov r0, %[global_accumulators]\n"
5710a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vld4.32 {d0,d2,d4,d6}, [r0]!\n"
5720a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vld4.32 {d1,d3,d5,d7}, [r0]!\n"
5730a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vld4.32 {d8,d10,d12,d14}, [r0]!\n"
5740a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vld4.32 {d9,d11,d13,d15}, [r0]!\n"
5750a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vld4.32 {d16,d18,d20,d22}, [r0]!\n"
5760a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vld4.32 {d17,d19,d21,d23}, [r0]!\n"
5770a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // Rotate the rows of each 4x4 block
5780a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vext.32 q1, q1, q1, #3\n"
5790a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vext.32 q2, q2, q2, #2\n"
5800a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vext.32 q3, q3, q3, #1\n"
5810a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vext.32 q5, q5, q5, #3\n"
5820a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vext.32 q6, q6, q6, #2\n"
5830a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vext.32 q7, q7, q7, #1\n"
5840a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vext.32 q9, q9, q9, #3\n"
5850a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vext.32 q10, q10, q10, #2\n"
5860a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vext.32 q11, q11, q11, #1\n"
5870a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // Transpose again each 4x4 block
5880a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vtrn.32 q0, q1\n"
5890a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vtrn.32 q2, q3\n"
5900a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vswp d1, d4\n"
5910a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vswp d3, d6\n"
5920a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vtrn.32 q4, q5\n"
5930a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vtrn.32 q6, q7\n"
5940a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vswp d9, d12\n"
5950a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vswp d11, d14\n"
5960a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vtrn.32 q8, q9\n"
5970a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vtrn.32 q10, q11\n"
5980a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vswp d17, d20\n"
5990a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vswp d19, d22\n"
6000a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // Store into the column-major destination matrix
6010a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "mov r1, %[dst_ptr]\n"
6020a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "mov r0, %[dst_col_stride]\n"
6030a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "sub r0, #32\n"
6040a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vst1.32 {d0,d1}, [r1]!\n"
6050a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vst1.32 {d8,d9}, [r1]!\n"
6060a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vst1.32 {d16,d17}, [r1], r0\n"
6070a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vst1.32 {d2,d3}, [r1]!\n"
6080a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vst1.32 {d10,d11}, [r1]!\n"
6090a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vst1.32 {d18,d19}, [r1], r0\n"
6100a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vst1.32 {d4,d5}, [r1]!\n"
6110a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vst1.32 {d12,d13}, [r1]!\n"
6120a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vst1.32 {d20,d21}, [r1], r0\n"
6130a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vst1.32 {d6,d7}, [r1]!\n"
6140a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vst1.32 {d14,d15}, [r1]!\n"
6150a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "vst1.32 {d22,d23}, [r1], r0\n"
61675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        :  // outputs
61775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
6180a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        [dst_ptr] "+r"(dst_ptr),
61975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        [run_depth] "+r"(run_depth)
62075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        :  // inputs
6210a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        [start_depth] "r"(start_depth), [dst_col_stride] "r"(dst_col_stride),
6220a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        [global_accumulators] "r"(&global_accumulators[0])
62375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        :  // clobbers
6240a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "cc", "memory", "r0", "r1",
62575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // note: someone on internet says that quad registers are
62675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        // unsupported in the clobber list!
62775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
62875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
62975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
63075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        "d31");
6317b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#undef GEMMLOWP_LOOP_NEON_32_KERNEL_12X4_DEPTH2_ASSUMING_12BIT_PRODUCTS
6327b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#undef GEMMLOWP_LOAD_GLOBAL_ACCUMULATORS_NEON_32_KERNEL_12X4_DEPTH2_12BIT
6337b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#undef GEMMLOWP_LABEL_32
6347b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#undef GEMMLOWP_LABEL_24
6357b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#undef GEMMLOWP_LABEL_16
6367b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#undef GEMMLOWP_LABEL_8
6377b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#undef GEMMLOWP_LABEL_2
63875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  }
63975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob};
64075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
6417b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#endif  // GEMMLOWP_NEON_32
642544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang
643544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang// The kernels here are specifically arm 64bit assembly, not arm 32bit.
6447b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#ifdef GEMMLOWP_NEON_64
645544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang
646544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang// Our main GEMM kernel.
6477b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangstruct NEON_64_Kernel12x8Depth2 : KernelBase {
648544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang  typedef KernelFormat<KernelSideFormat<CellFormat<4, 2>, 3>,
6497b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang                       KernelSideFormat<CellFormat<4, 2>, 2> >
6507b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang      Format;
651544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang
6520a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  const char* Name() const override { return "NEON, 12x8, depth 2"; }
653544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang
654544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang  // TODO(benoitjacob): reorder function arguments so dst comes last
6557b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride,
6567b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang           std::size_t dst_col_stride, const std::uint8_t* lhs_ptr,
6577b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang           const std::uint8_t* rhs_ptr, std::size_t start_depth,
6587b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang           std::size_t run_depth) const override {
6590a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    ScopedProfilingLabel label("optimized kernel (NEON 12x8)");
6607b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// See comments above for why we need local numerical labels in our asm.
6617b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#define GEMMLOWP_LOOP_NEON_64_KERNEL_12X8_DEPTH2 "1"
6627b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#define GEMMLOWP_STORE_RESULT_NEON_64_KERNEL_12x8_DEPTH2 "2"
663544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang
664544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang    assert(dst_row_stride == 1);
665544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang    asm volatile(
666544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        // Clear accumulator registers (see layout below)
667544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "dup v8.4s, wzr\n"
668544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "dup v9.4s, wzr\n"
669544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "dup v10.4s, wzr\n"
670544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "dup v11.4s, wzr\n"
671544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "dup v12.4s, wzr\n"
672544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "dup v13.4s, wzr\n"
673544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "dup v14.4s, wzr\n"
674544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "dup v15.4s, wzr\n"
6750a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "dup v16.4s, wzr\n"
6760a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "dup v17.4s, wzr\n"
6770a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "dup v18.4s, wzr\n"
6780a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "dup v19.4s, wzr\n"
6790a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "dup v20.4s, wzr\n"
6800a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "dup v21.4s, wzr\n"
6810a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "dup v22.4s, wzr\n"
6820a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "dup v23.4s, wzr\n"
6830a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "dup v24.4s, wzr\n"
6840a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "dup v25.4s, wzr\n"
6850a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "dup v26.4s, wzr\n"
6860a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "dup v27.4s, wzr\n"
6870a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "dup v28.4s, wzr\n"
6880a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "dup v29.4s, wzr\n"
6890a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "dup v30.4s, wzr\n"
6900a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "dup v31.4s, wzr\n"
691544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang
692544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        /* Main loop */
693544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang
6947b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        GEMMLOWP_LOOP_NEON_64_KERNEL_12X8_DEPTH2
6957b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        ":\n"
696544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang
697544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        // Overview of register layout:
698544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        //
6990a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // A 2x8 block of 2 2x4 cells of Rhs is stored in 16bit in v0--v1.
7000a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in v2--v4.
7010a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // A 12x8 block of accumulators is stored in 32bit in v8--v31.
702544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        //
7030a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        //                         +--------+--------+-----+--------+--------+
7040a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        //                         |v0.h[0] |v0.h[1] | ... |v1.h[2] |v1.h[3] |
7050a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        //                    Rhs  +--------+--------+-----+--------+--------+
7060a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        //                         |v0.h[4] |v0.h[5] | ... |v1.h[6] |v1.h[7] |
7070a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        //                         +--------+--------+-----+--------+--------+
708544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        //
7090a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        //                         |        |        |     |        |        |
710544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        //
7110a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        //    Lhs                  |        |        |     |        |        |
712544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        //
7130a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        //  +-------+-------+ - -  +--------+--------+-----+--------+--------+
7140a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        //  |v2.h[0]|v2.h[4]|      |v8.s[0] |v9.s[0] | ... |v14.s[0]|v15.s[0]|
7150a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        //  |v2.h[1]|v2.h[5]|      |v8.s[1] |v9.s[1] | ... |v14.s[1]|v15.s[1]|
7160a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        //  |v2.h[2]|v2.h[6]|      |v8.s[2] |v9.s[2] | ... |v14.s[2]|v15.s[2]|
7170a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        //  |v2.h[3]|v2.h[7]|      |v8.s[3] |v9.s[3] | ... |v14.s[3]|v15.s[3]|
7180a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        //  +-------+-------+ - -  +--------+--------+-----+--------+--------+
7190a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        //  |v3.h[0]|v3.h[4]|      |v16.s[0]|v17.s[0]| ... |v22.s[0]|v23.s[0]|
7200a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        //  |v3.h[1]|v3.h[5]|      |v16.s[1]|v17.s[1]| ... |v22.s[1]|v23.s[1]|
7210a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        //  |v3.h[2]|v3.h[6]|      |v16.s[2]|v17.s[2]| ... |v22.s[2]|v23.s[2]|
7220a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        //  |v3.h[3]|v3.h[7]|      |v16.s[3]|v17.s[3]| ... |v22.s[3]|v23.s[3]|
7230a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        //  +-------+-------+ - -  +--------+--------+-----+--------+--------+
7240a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        //  |v4.h[0]|v4.h[4]|      |v24.s[0]|v25.s[0]| ... |v30.s[0]|v31.s[0]|
7250a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        //  |v4.h[1]|v4.h[5]|      |v24.s[1]|v25.s[1]| ... |v30.s[1]|v31.s[1]|
7260a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        //  |v4.h[2]|v4.h[6]|      |v24.s[2]|v25.s[2]| ... |v30.s[2]|v31.s[2]|
7270a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        //  |v4.h[3]|v4.h[7]|      |v24.s[3]|v25.s[3]| ... |v30.s[3]|v31.s[3]|
7280a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        //  +-------+-------+ - -  +--------+--------+-----+--------+--------+
729544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        //
730544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        //                            Accumulator
731544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang
7320a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // Load 1 Rhs cell of size 2x8
733544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "ld1 {v0.8b}, [%[rhs_ptr]], #8\n"
7340a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "ld1 {v1.8b}, [%[rhs_ptr]], #8\n"
735544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang
736544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        // Load 3 Lhs cells of size 4x2 each
737544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "ld1 {v2.8b}, [%[lhs_ptr]], #8\n"
738544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "ld1 {v3.8b}, [%[lhs_ptr]], #8\n"
7390a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "ld1 {v4.8b}, [%[lhs_ptr]], #8\n"
740544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang
741544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        // Expand Lhs/Rhs cells to 16 bit.
742544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "uxtl v0.8h, v0.8b\n"
743544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "uxtl v1.8h, v1.8b\n"
744544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "uxtl v2.8h, v2.8b\n"
745544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "uxtl v3.8h, v3.8b\n"
7460a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "uxtl v4.8h, v4.8b\n"
747544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang
748544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        // Multiply-accumulate, level of depth 0
749544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "umlal v8.4s, v2.4h, v0.h[0]\n"
750544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "umlal v9.4s, v2.4h, v0.h[1]\n"
751544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "umlal v10.4s, v2.4h, v0.h[2]\n"
752544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "umlal v11.4s, v2.4h, v0.h[3]\n"
7530a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal v12.4s, v2.4h, v1.h[0]\n"
7540a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal v13.4s, v2.4h, v1.h[1]\n"
7550a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal v14.4s, v2.4h, v1.h[2]\n"
7560a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal v15.4s, v2.4h, v1.h[3]\n"
7570a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal v16.4s, v3.4h, v0.h[0]\n"
7580a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal v17.4s, v3.4h, v0.h[1]\n"
7590a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal v18.4s, v3.4h, v0.h[2]\n"
7600a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal v19.4s, v3.4h, v0.h[3]\n"
7610a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal v20.4s, v3.4h, v1.h[0]\n"
7620a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal v21.4s, v3.4h, v1.h[1]\n"
7630a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal v22.4s, v3.4h, v1.h[2]\n"
7640a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal v23.4s, v3.4h, v1.h[3]\n"
7650a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal v24.4s, v4.4h, v0.h[0]\n"
7660a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal v25.4s, v4.4h, v0.h[1]\n"
7670a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal v26.4s, v4.4h, v0.h[2]\n"
7680a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal v27.4s, v4.4h, v0.h[3]\n"
7690a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal v28.4s, v4.4h, v1.h[0]\n"
7700a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal v29.4s, v4.4h, v1.h[1]\n"
7710a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal v30.4s, v4.4h, v1.h[2]\n"
7720a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal v31.4s, v4.4h, v1.h[3]\n"
773544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang
774544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        // Multiply-accumulate, level of depth 1
775544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "umlal2 v8.4s, v2.8h, v0.h[4]\n"
776544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "umlal2 v9.4s, v2.8h, v0.h[5]\n"
777544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "umlal2 v10.4s, v2.8h, v0.h[6]\n"
778544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "umlal2 v11.4s, v2.8h, v0.h[7]\n"
7790a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal2 v12.4s, v2.8h, v1.h[4]\n"
7800a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal2 v13.4s, v2.8h, v1.h[5]\n"
7810a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal2 v14.4s, v2.8h, v1.h[6]\n"
7820a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal2 v15.4s, v2.8h, v1.h[7]\n"
7830a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal2 v16.4s, v3.8h, v0.h[4]\n"
7840a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal2 v17.4s, v3.8h, v0.h[5]\n"
7850a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal2 v18.4s, v3.8h, v0.h[6]\n"
7860a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal2 v19.4s, v3.8h, v0.h[7]\n"
7870a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal2 v20.4s, v3.8h, v1.h[4]\n"
7880a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal2 v21.4s, v3.8h, v1.h[5]\n"
7890a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal2 v22.4s, v3.8h, v1.h[6]\n"
7900a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal2 v23.4s, v3.8h, v1.h[7]\n"
7910a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal2 v24.4s, v4.8h, v0.h[4]\n"
7920a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal2 v25.4s, v4.8h, v0.h[5]\n"
7930a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal2 v26.4s, v4.8h, v0.h[6]\n"
7940a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal2 v27.4s, v4.8h, v0.h[7]\n"
7950a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal2 v28.4s, v4.8h, v1.h[4]\n"
7960a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal2 v29.4s, v4.8h, v1.h[5]\n"
7970a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal2 v30.4s, v4.8h, v1.h[6]\n"
7980a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "umlal2 v31.4s, v4.8h, v1.h[7]\n"
799544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang
800544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        // Loop. Decrement loop index (depth) by 2, since we just handled 2
801544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        // levels of depth (Kernel::kDepth=2).
802544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "subs %[run_depth], %[run_depth], #2\n"
8037b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "bne " GEMMLOWP_LOOP_NEON_64_KERNEL_12X8_DEPTH2
8047b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "b\n"
805544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang
806544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        /* end of main loop */
807544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang
808544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        /* Accumulate our local accumulator registers into the destination block
809544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang           */
810544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang
811544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        // Compute stride between consecutive columns, in bytes
812544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "mov x0, #4\n"  // multiply by 4 = sizeof(int32)
813544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "mul %[dst_col_stride], %[dst_col_stride], x0\n"
814544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang
815544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        // If start_depth == 0, then there is no preexisting accumulator
816544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        // to accumulate, so we can simply store our result.
817544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "cmp %[start_depth], #0\n"
8187b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "beq " GEMMLOWP_STORE_RESULT_NEON_64_KERNEL_12x8_DEPTH2
8197b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "f\n"
820544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang
821544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "mov x0, %[dst_ptr]\n"
822544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang
823544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        // Load a column
824544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "mov x1, x0\n"
825544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "ld1 {v0.4s}, [x1], #16\n"
826544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "ld1 {v1.4s}, [x1], #16\n"
827544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "ld1 {v2.4s}, [x1], #16\n"
828544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        // Accumulate a column
8290a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add v8.4s, v8.4s, v0.4s\n"
8300a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add v16.4s, v16.4s, v1.4s\n"
8310a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add v24.4s, v24.4s, v2.4s\n"
832544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang
833544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "add x0, x0, %[dst_col_stride]\n"
834544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        // Load a column
835544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "mov x1, x0\n"
836544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "ld1 {v0.4s}, [x1], #16\n"
837544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "ld1 {v1.4s}, [x1], #16\n"
838544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "ld1 {v2.4s}, [x1], #16\n"
839544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        // Accumulate a column
8400a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add v9.4s, v9.4s, v0.4s\n"
8410a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add v17.4s, v17.4s, v1.4s\n"
8420a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add v25.4s, v25.4s, v2.4s\n"
843544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang
844544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "add x0, x0, %[dst_col_stride]\n"
845544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        // Load a column
846544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "mov x1, x0\n"
847544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "ld1 {v0.4s}, [x1], #16\n"
848544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "ld1 {v1.4s}, [x1], #16\n"
849544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "ld1 {v2.4s}, [x1], #16\n"
850544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        // Accumulate a column
8510a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add v10.4s, v10.4s, v0.4s\n"
8520a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add v18.4s, v18.4s, v1.4s\n"
8530a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add v26.4s, v26.4s, v2.4s\n"
854544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang
855544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "add x0, x0, %[dst_col_stride]\n"
856544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        // Load a column
857544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "mov x1, x0\n"
858544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "ld1 {v0.4s}, [x1], #16\n"
859544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "ld1 {v1.4s}, [x1], #16\n"
860544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "ld1 {v2.4s}, [x1], #16\n"
861544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        // Accumulate a column
8620a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add v11.4s, v11.4s, v0.4s\n"
8630a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add v19.4s, v19.4s, v1.4s\n"
8640a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add v27.4s, v27.4s, v2.4s\n"
865544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang
8660a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add x0, x0, %[dst_col_stride]\n"
8670a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // Load a column
8680a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "mov x1, x0\n"
8690a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "ld1 {v0.4s}, [x1], #16\n"
8700a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "ld1 {v1.4s}, [x1], #16\n"
8710a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "ld1 {v2.4s}, [x1], #16\n"
8720a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // Accumulate a column
8730a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add v12.4s, v12.4s, v0.4s\n"
8740a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add v20.4s, v20.4s, v1.4s\n"
8750a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add v28.4s, v28.4s, v2.4s\n"
8760a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
8770a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add x0, x0, %[dst_col_stride]\n"
8780a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // Load a column
8790a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "mov x1, x0\n"
8800a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "ld1 {v0.4s}, [x1], #16\n"
8810a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "ld1 {v1.4s}, [x1], #16\n"
8820a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "ld1 {v2.4s}, [x1], #16\n"
8830a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // Accumulate a column
8840a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add v13.4s, v13.4s, v0.4s\n"
8850a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add v21.4s, v21.4s, v1.4s\n"
8860a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add v29.4s, v29.4s, v2.4s\n"
8870a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
8880a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add x0, x0, %[dst_col_stride]\n"
8890a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // Load a column
8900a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "mov x1, x0\n"
8910a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "ld1 {v0.4s}, [x1], #16\n"
8920a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "ld1 {v1.4s}, [x1], #16\n"
8930a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "ld1 {v2.4s}, [x1], #16\n"
8940a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // Accumulate a column
8950a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add v14.4s, v14.4s, v0.4s\n"
8960a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add v22.4s, v22.4s, v1.4s\n"
8970a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add v30.4s, v30.4s, v2.4s\n"
8980a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
8990a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add x0, x0, %[dst_col_stride]\n"
9000a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // Load a column
9010a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "mov x1, x0\n"
9020a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "ld1 {v0.4s}, [x1], #16\n"
9030a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "ld1 {v1.4s}, [x1], #16\n"
9040a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "ld1 {v2.4s}, [x1], #16\n"
9050a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // Accumulate a column
9060a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add v15.4s, v15.4s, v0.4s\n"
9070a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add v23.4s, v23.4s, v1.4s\n"
9080a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add v31.4s, v31.4s, v2.4s\n"
9090a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
9107b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        GEMMLOWP_STORE_RESULT_NEON_64_KERNEL_12x8_DEPTH2
9117b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        ":\n"
912544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang
913544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "mov x0, %[dst_ptr]\n"
914544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        // Store a column
915544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "mov x1, x0\n"
916544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "st1 {v8.4s}, [x1], #16\n"
9170a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "st1 {v16.4s}, [x1], #16\n"
9180a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "st1 {v24.4s}, [x1], #16\n"
919544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        // Store a column
920544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "add x0, x0, %[dst_col_stride]\n"
921544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "mov x1, x0\n"
922544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "st1 {v9.4s}, [x1], #16\n"
9230a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "st1 {v17.4s}, [x1], #16\n"
9240a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "st1 {v25.4s}, [x1], #16\n"
925544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        // Store a column
926544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "add x0, x0, %[dst_col_stride]\n"
927544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "mov x1, x0\n"
928544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "st1 {v10.4s}, [x1], #16\n"
9290a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "st1 {v18.4s}, [x1], #16\n"
9300a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "st1 {v26.4s}, [x1], #16\n"
931544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        // Store a column
932544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "add x0, x0, %[dst_col_stride]\n"
933544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "mov x1, x0\n"
934544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "st1 {v11.4s}, [x1], #16\n"
9350a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "st1 {v19.4s}, [x1], #16\n"
9360a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "st1 {v27.4s}, [x1], #16\n"
9370a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // Store a column
9380a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add x0, x0, %[dst_col_stride]\n"
9390a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "mov x1, x0\n"
9400a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "st1 {v12.4s}, [x1], #16\n"
9410a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "st1 {v20.4s}, [x1], #16\n"
9420a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "st1 {v28.4s}, [x1], #16\n"
9430a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // Store a column
9440a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add x0, x0, %[dst_col_stride]\n"
9450a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "mov x1, x0\n"
9460a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "st1 {v13.4s}, [x1], #16\n"
9470a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "st1 {v21.4s}, [x1], #16\n"
9480a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "st1 {v29.4s}, [x1], #16\n"
9490a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // Store a column
9500a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add x0, x0, %[dst_col_stride]\n"
9510a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "mov x1, x0\n"
9520a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "st1 {v14.4s}, [x1], #16\n"
9530a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "st1 {v22.4s}, [x1], #16\n"
9540a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "st1 {v30.4s}, [x1], #16\n"
9550a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // Store a column
9560a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "add x0, x0, %[dst_col_stride]\n"
9570a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "mov x1, x0\n"
958544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        "st1 {v15.4s}, [x1], #16\n"
9590a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "st1 {v23.4s}, [x1], #16\n"
9600a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "st1 {v31.4s}, [x1], #16\n"
961544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        :  // outputs
962544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
963544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        [dst_ptr] "+r"(dst_ptr),
964544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        [run_depth] "+r"(run_depth)
965544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        :  // inputs
966544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        [start_depth] "r"(start_depth),
967544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        [dst_col_stride] "r"(dst_col_stride)
968544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang        :  // clobbers
9690a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "cc", "memory", "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
9700a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16",
9710a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
9720a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "v27", "v28", "v29", "v30", "v31");
9737b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#undef GEMMLOWP_LOOP_NEON_64_KERNEL_12X8_DEPTH2
9747b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#undef GEMMLOWP_STORE_RESULT_NEON_64_KERNEL_12x8_DEPTH2
975544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang  }
976544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang};
977544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang
9787b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#endif  // GEMMLOWP_NEON_64
979544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang
9800a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// Our main GEMV kernel.
9810a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// Because our GEMV performance is low and not dominated by the kernel
9820a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// at the moment, it's not worth optimizing too hard yet.
9830a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// Using intrinsics allows us to write one implementation for both 32bit and
9840a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// 64bit ARM, and should also perform OK here because the register pressure
9850a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// is not so high in this GEMV kernel.
9860a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// When/if we get serious about GEMV performance, we will want to
9870a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// implement it to bypass packing altogether, and use source data in-place
9880a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// with different GEMV kernels for row-major and column-major LHS.
9890a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wangtemplate <int Cells>
9900a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wangstruct NEONKernel4Nx1Depth2 : KernelBase {
9910a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  typedef KernelFormat<KernelSideFormat<CellFormat<4, 2>, Cells>,
9927b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang                       KernelSideFormat<CellFormat<1, 2>, 1> >
9937b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang      Format;
9940a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
9950a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  const char* Name() const override { return "NEON intrinsics, 4Nx1, depth 2"; }
9960a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
9977b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride,
9987b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang           std::size_t dst_col_stride, const std::uint8_t* lhs_ptr,
9997b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang           const std::uint8_t* rhs_ptr, std::size_t start_depth,
10007b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang           std::size_t run_depth) const override {
10010a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    ScopedProfilingLabel label("optimized kernel (NEON 4Nx1)");
10020a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
10030a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    assert(dst_row_stride == 1);
10040a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
10050a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    // Clear accumulators
10060a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    uint32x4_t acc[Cells];
10070a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    for (int cell = 0; cell < Cells; cell++) {
10080a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      acc[cell] = vdupq_n_u32(0);
10090a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    }
10100a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    // Main loop
10117b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    for (std::size_t d = 0; d < run_depth; d += 2) {
10120a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      // Load LHS cells
10130a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      uint16x8_t lhs[Cells];
10140a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      for (int cell = 0; cell < Cells; cell++) {
10150a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        lhs[cell] = vmovl_u8(vld1_u8(lhs_ptr));
10160a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        lhs_ptr += 8;
10170a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      }
10180a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      // Load RHS cell
10190a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      uint16_t rhs0 = rhs_ptr[0];
10200a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      uint16_t rhs1 = rhs_ptr[1];
10210a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      rhs_ptr += 2;
10220a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      // Multiply-accumulate, level of depth 0
10230a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      for (int cell = 0; cell < Cells; cell++) {
10240a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        acc[cell] = vmlal_n_u16(acc[cell], vget_low_u16(lhs[cell]), rhs0);
10250a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      }
10260a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      // Multiply-accumulate, level of depth 1
10270a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      for (int cell = 0; cell < Cells; cell++) {
10280a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        acc[cell] = vmlal_n_u16(acc[cell], vget_high_u16(lhs[cell]), rhs1);
10290a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      }
10300a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    }
10310a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    // If start_depth is nonzero, accumulate with the existing accumulator
10320a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    if (start_depth) {
10330a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      for (int cell = 0; cell < Cells; cell++) {
10340a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        acc[cell] = vaddq_u32(
10350a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang            acc[cell], vreinterpretq_u32_s32(vld1q_s32(dst_ptr + 4 * cell)));
10360a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      }
10370a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    }
10380a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    // Store the accumulators
10390a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    for (int cell = 0; cell < Cells; cell++) {
10400a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      vst1q_s32(dst_ptr + 4 * cell, vreinterpretq_s32_u32(acc[cell]));
10410a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    }
10420a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  }
10430a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang};
10440a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
104575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob}  // namespace gemmlowp
104675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
104775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#endif  // GEMMLOWP_INTERNAL_KERNEL_NEON_H_
1048