10a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// Copyright 2015 Google Inc. All Rights Reserved.
20a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//
30a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// Licensed under the Apache License, Version 2.0 (the "License");
40a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// you may not use this file except in compliance with the License.
50a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// You may obtain a copy of the License at
60a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//
70a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//     http://www.apache.org/licenses/LICENSE-2.0
80a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//
90a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// Unless required by applicable law or agreed to in writing, software
100a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// distributed under the License is distributed on an "AS IS" BASIS,
110a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
120a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// See the License for the specific language governing permissions and
130a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// limitations under the License.
140a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
150a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// kernel_SSE.h: a collection of Intel SSE optimized kernels.
160a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// Check in kernel_default.h which one(s) are actually used by default.
170a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// Others are mere experiments; they are still covered by tests
180a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// in case they might be useful some day.
197b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang//
200a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
210a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang#ifndef GEMMLOWP_INTERNAL_KERNEL_SSE_H_
220a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang#define GEMMLOWP_INTERNAL_KERNEL_SSE_H_
230a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
240a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang#include "kernel.h"
250a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
260a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang#include <string.h>
277b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#include <cassert>
280a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
290a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wangnamespace gemmlowp {
300a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
317b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#ifdef GEMMLOWP_SSE4_32
327b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangstruct SSE4_32_Kernel4x4Depth2 : KernelBase {
330a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  typedef KernelFormat<
340a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1>,
357b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang      KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1> >
367b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang      Format;
370a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
380a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  const char* Name() const override { return "SSE, 4x4, depth 2"; }
390a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
407b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride,
417b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang           std::size_t dst_col_stride, const std::uint8_t* lhs_ptr,
427b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang           const std::uint8_t* rhs_ptr, std::size_t start_depth,
437b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang           std::size_t run_depth) const override {
440a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    ScopedProfilingLabel label("optimized kernel");
450a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    assert(dst_row_stride == 1);
460a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    std::int32_t run_depth_cells = run_depth / Format::kDepth;
470a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    /* Main loop */
480a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
490a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    // A 2x4 cell of Rhs is stored in 16bit in xmm1 .
500a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    // A 4x2 block Lhs is stored in 16bit in xmm0.
510a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    // A 4x4 block of accumulators is stored in 32bit in xmm4--xmm7.
520a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //
530a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //                   +-------+-------+-------+-------+
540a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //                   |xmm1[0]|xmm1[2]|xmm1[4]|xmm1[6]|
550a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //              Rhs  +-------+---------------+-------+
560a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //                   |xmm1[1]|xmm1[3]|xmm1[5]|xmm1[7]|
570a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //                   +-------+-------+-------+-------+
580a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //
590a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //                   |       |       |       |       |
600a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //
610a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //    Lhs            |       |       |       |       |
620a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //
630a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //  +--+--+ - - - -  +-------+-------+-------+-------+
640a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //  |xmm0 |          | xmm4  | xmm5  | xmm6  | xmm7  |
650a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //  |xmm0 | (Iter1)  | xmm4  | xmm5  | xmm6  | xmm7  |
660a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //  |xmm0 |          | xmm4  | xmm5  | xmm6  | xmm7  |
670a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //  |xmm0 |          | xmm4  | xmm5  | xmm6  | xmm7  |
680a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //  +--+--+ - - - -  +-------+-------+-------+-------+
690a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //
700a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //                              Accumulator
710a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
720a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    asm volatile(
730a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
740a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // set accumulators to zero.
750a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pxor %%xmm4  , %%xmm4 \n\t"
760a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pxor %%xmm5  , %%xmm5 \n\t"
770a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pxor %%xmm6  , %%xmm6 \n\t"
780a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pxor %%xmm7  , %%xmm7 \n\t"
797b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
800a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "movl  %[run_depth_cells], %%eax\n\t"
817b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "subl $2, %%eax\n\t"
827b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "js outerLoop1%=\n\t"
837b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
847b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        // Loop for K unrolled by 4
857b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "outerLoop2%=:\n\t"
860a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
877b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        // K = 1,2
880a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // RHS cell to xmm1
890a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t"
900a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
910a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // LHS cell
920a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t"
930a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
947b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm2         \n\t"
957b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm2, %%xmm4           \n\t"
960a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
977b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm3         \n\t"
987b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm3, %%xmm5           \n\t"
997b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
1007b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "prefetcht0 0x80(%[lhs_ptr]) \n\t"
1017b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
1027b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
1030a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pmaddwd %%xmm0, %%xmm2         \n\t"
1047b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm2, %%xmm6           \n\t"
1057b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
1060a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pmaddwd %%xmm0, %%xmm3         \n\t"
1077b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm3, %%xmm7           \n\t"
1087b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
1097b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "prefetcht0 0x80(%[rhs_ptr]) \n\t"
1107b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
1117b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        // K = 3,4
1127b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        // RHS cell to xmm1
1137b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmovzxbw 0x08(%[rhs_ptr]), %%xmm1\n\t"
1147b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
1157b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        // LHS cell
1167b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmovzxbw 0x08(%[lhs_ptr]), %%xmm0\n\t"
1177b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
1187b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm2         \n\t"
1190a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd %%xmm2, %%xmm4           \n\t"
1207b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
1217b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm3         \n\t"
1220a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd %%xmm3, %%xmm5           \n\t"
1237b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
1240a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
1257b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm2         \n\t"
1267b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm2, %%xmm6           \n\t"
1270a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
1287b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm3         \n\t"
1297b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm3, %%xmm7           \n\t"
1307b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
1317b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "addl $0x10, %[lhs_ptr]\n\t"
1327b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "addl $0x10, %[rhs_ptr]\n\t"
1337b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
1347b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "subl $2, %[run_depth_cells]\n\t"
1357b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "jnz outerLoop2%=\n\t"
1367b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
1377b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "movl %[run_depth_cells], %%eax\n\t"
1387b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "decl %%eax\n\t"
1397b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "js finish%=\n\t"
1407b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
1417b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        // Loop for K unrolled by 2
1427b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "outerLoop1%=:\n\t"
1437b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
1447b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        // RHS cell to xmm1
1457b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t"
1467b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
1477b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        // LHS cell
1487b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t"
1497b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
1500a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pmaddwd %%xmm0, %%xmm2         \n\t"
1517b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm2, %%xmm4           \n\t"
1527b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
1530a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pmaddwd %%xmm0, %%xmm3         \n\t"
1547b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm3, %%xmm5           \n\t"
1557b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
1567b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
1577b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm2         \n\t"
1580a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd %%xmm2, %%xmm6           \n\t"
1597b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
1607b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm3         \n\t"
1610a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd %%xmm3, %%xmm7           \n\t"
1620a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
1630a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "addl $0x08, %[lhs_ptr]\n\t"
1640a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "addl $0x08, %[rhs_ptr]\n\t"
1657b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
1667b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "decl %[run_depth_cells]\n\t"
1677b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "jnz outerLoop1%=\n\t"
1687b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
1697b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "finish%=:\n\t"
1700a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
1710a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "movl  %[dst_col_stride], %%eax\n\t"
1720a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "shll $2, %%eax\n\t"
1730a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
1740a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "movl  %[start_depth], %%ecx\n\t"
1750a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "test %%ecx, %%ecx\n\t"
1760a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "jz storeDst%=\n\t"
1770a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
1780a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "leal (%%eax,%%eax,0x2), %%ecx\n\t"
1790a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd 0x00(%[dst_ptr])           , %%xmm4 \n\t"
1800a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd 0x00(%[dst_ptr], %%eax, 1) , %%xmm5 \n\t"
1810a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd 0x00(%[dst_ptr], %%eax, 2) , %%xmm6 \n\t"
1820a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd 0x00(%[dst_ptr], %%ecx, 1) , %%xmm7 \n\t"
1830a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
1840a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "storeDst%=:\n\t"
1850a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
1860a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "leal (%%eax,%%eax,0x2), %%ecx\n\t"
1870a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "movdqu %%xmm4  , 0x00(%[dst_ptr])          \n\t"
1880a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "movdqu %%xmm5  , 0x00(%[dst_ptr], %%eax, 1)\n\t"
1890a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "movdqu %%xmm6  , 0x00(%[dst_ptr], %%eax, 2)\n\t"
1900a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "movdqu %%xmm7  , 0x00(%[dst_ptr], %%ecx, 1)\n\t"
1910a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
1920a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        :  // outputs
1930a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
1940a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        [dst_ptr] "+r"(dst_ptr)
1950a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        :  // inputs
1960a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        [start_depth] "g"(start_depth), [dst_col_stride] "g"(dst_col_stride),
1970a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        [run_depth_cells] "g"(run_depth_cells)
1980a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        :  // clobbers
1990a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "cc", "memory", "%xmm0", "%xmm1", "%xmm3", "%xmm2", "%xmm4", "%xmm5",
2000a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "%xmm6", "%xmm7", "%eax", "%ecx");
2010a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  }
2020a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang};
2030a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang#endif
2047b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#ifdef GEMMLOWP_SSE4_64
2057b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangstruct SSE4_64_Kernel12x4Depth2 : KernelBase {
2060a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  typedef KernelFormat<
2070a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 3>,
2087b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang      KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1> >
2097b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang      Format;
2100a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
2110a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  const char* Name() const override { return "SSE, 12x4, depth 2"; }
2120a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
2137b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride,
2147b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang           std::size_t dst_col_stride, const std::uint8_t* lhs_ptr,
2157b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang           const std::uint8_t* rhs_ptr, std::size_t start_depth,
2167b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang           std::size_t run_depth) const override {
2170a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    ScopedProfilingLabel label("optimized kernel");
2180a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    assert(dst_row_stride == 1);
2190a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    const std::int64_t run_depth_cells = run_depth / Format::kDepth;
2200a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    const std::int64_t dst_col_stride_q = dst_col_stride;
2210a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
2220a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    /* Main loop */
2230a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
2240a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    // A 2x4 cell of Rhs is stored in 16bit in xmm1 .
2250a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in xmm0, replaced
2260a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    // every Iteration.
2270a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    // A 12x4 block of accumulators is stored in 32bit in xmm4--xmm15.
2280a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //
2290a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //                   +-------+-------+-------+-------+
2300a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //                   |xmm1[0]|xmm1[2]|xmm1[4]|xmm1[6]|
2310a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //              Rhs  +-------+---------------+-------+
2320a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //                   |xmm1[1]|xmm1[3]|xmm1[5]|xmm1[7]|
2330a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //                   +-------+-------+-------+-------+
2340a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //
2350a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //                   |       |       |       |       |
2360a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //
2370a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //    Lhs            |       |       |       |       |
2380a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //
2390a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //  +--+--+ - - - -  +-------+-------+-------+-------+
2400a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //  |xmm0 |          | xmm4  | xmm5  | xmm6  | xmm7  |
2410a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //  |xmm0 | (Iter1)  | xmm4  | xmm5  | xmm6  | xmm7  |
2420a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //  |xmm0 |          | xmm4  | xmm5  | xmm6  | xmm7  |
2430a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //  |xmm0 |          | xmm4  | xmm5  | xmm6  | xmm7  |
2440a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //  +--+--+ - - - -  +-------+-------+-------+-------+
2450a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //  |xmm0 |          | xmm8  | xmm9  | xmm10 | xmm11 |
2460a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //  |xmm0 | (Iter2)  | xmm8  | xmm9  | xmm10 | xmm11 |
2470a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //  |xmm0 |          | xmm8  | xmm9  | xmm10 | xmm11 |
2480a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //  |xmm0 |          | xmm8  | xmm9  | xmm10 | xmm11 |
2490a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //  +--+--+ - - - -  +-------+-------+-------+-------+
2500a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //  |xmm0 |          | xmm12 | xmm13 | xmm14 | xmm15 |
2510a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //  |xmm0 | (Iter3)  | xmm12 | xmm13 | xmm14 | xmm15 |
2520a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //  |xmm0 |          | xmm12 | xmm13 | xmm14 | xmm15 |
2530a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //  |xmm0 |          | xmm12 | xmm13 | xmm14 | xmm15 |
2540a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //  +--+--+ - - - -  +-------+-------+-------+-------+
2550a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //
2560a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    //                              Accumulator
2570a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
2580a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    asm volatile(
2590a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
2607b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        // Set registers for destination
2617b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "movq  %[dst_col_stride_q], %%r12\n\t"
2627b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "shlq $2, %%r12\n\t"
2637b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "leaq (%%r12,%%r12,0x2), %%r13\n\t"
2647b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
2657b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        // Set accumulators to zero.
2660a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pxor %%xmm4  , %%xmm4 \n\t"
2670a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pxor %%xmm5  , %%xmm5 \n\t"
2680a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pxor %%xmm6  , %%xmm6 \n\t"
2690a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pxor %%xmm7  , %%xmm7 \n\t"
2700a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pxor %%xmm8  , %%xmm8 \n\t"
2710a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pxor %%xmm9  , %%xmm9 \n\t"
2720a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pxor %%xmm10 , %%xmm10\n\t"
2730a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pxor %%xmm11 , %%xmm11\n\t"
2740a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pxor %%xmm12 , %%xmm12\n\t"
2750a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pxor %%xmm13 , %%xmm13\n\t"
2760a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pxor %%xmm14 , %%xmm14\n\t"
2770a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pxor %%xmm15 , %%xmm15\n\t"
2780a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
2797b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "movq  %[run_depth_cells], %%r14\n\t"
2807b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "subq $2, %%r14\n\t"
2817b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "js outerLoop1%=\n\t"
2820a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
2837b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        // Loop for K unrolled by 4
2847b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "outerLoop2%=:\n\t"
2857b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
2867b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        // K = 1,2
2870a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // RHS cell to xmm1
2887b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
2890a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t"
2900a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
2910a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // LHS cell
2920a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t"
2930a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
2940a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pmaddwd %%xmm0, %%xmm2         \n\t"
2950a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd %%xmm2, %%xmm4           \n\t"
2967b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
2977b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm3         \n\t"
2980a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd %%xmm3, %%xmm5           \n\t"
2997b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
3007b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "prefetcht0 0x80(%[lhs_ptr]) \n\t"
3017b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
3020a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
3030a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pmaddwd %%xmm0, %%xmm2         \n\t"
3040a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd %%xmm2, %%xmm6           \n\t"
3057b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
3067b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm3         \n\t"
3070a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd %%xmm3, %%xmm7           \n\t"
3080a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
3090a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // next LHS cell
3100a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pmovzxbw 0x08(%[lhs_ptr]), %%xmm0\n\t"
3110a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
3120a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pmaddwd %%xmm0, %%xmm2         \n\t"
3130a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd %%xmm2, %%xmm8           \n\t"
3147b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
3157b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm3         \n\t"
3160a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd %%xmm3, %%xmm9           \n\t"
3177b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
3187b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "prefetcht0 0x80(%[rhs_ptr]) \n\t"
3197b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
3200a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
3210a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pmaddwd %%xmm0, %%xmm2         \n\t"
3220a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd %%xmm2, %%xmm10          \n\t"
3237b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
3247b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm3         \n\t"
3250a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd %%xmm3, %%xmm11          \n\t"
3260a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
3270a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        // next LHS cell
3280a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pmovzxbw 0x10(%[lhs_ptr]), %%xmm0\n\t"
3290a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
3307b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm2         \n\t"
3317b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm2, %%xmm12          \n\t"
3327b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
3337b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm3         \n\t"
3347b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm3, %%xmm13          \n\t"
3357b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
3367b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
3377b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm2         \n\t"
3387b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm2, %%xmm14          \n\t"
3397b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
3407b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm3         \n\t"
3417b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm3, %%xmm15          \n\t"
3427b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
3437b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        // K = 3,4
3447b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        // RHS cell to xmm1
3457b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmovzxbw 0x08(%[rhs_ptr]), %%xmm1\n\t"
3467b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
3477b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        // LHS cell
3487b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmovzxbw 0x18(%[lhs_ptr]), %%xmm0\n\t"
3497b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
3507b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm2         \n\t"
3517b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm2, %%xmm4           \n\t"
3520a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
3537b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm3         \n\t"
3547b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm3, %%xmm5           \n\t"
3557b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
3567b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
3577b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm2         \n\t"
3587b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm2, %%xmm6           \n\t"
3597b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
3607b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm3         \n\t"
3617b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm3, %%xmm7           \n\t"
3627b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
3637b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        // next LHS cell
3647b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmovzxbw 0x20(%[lhs_ptr]), %%xmm0\n\t"
3657b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
3660a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pmaddwd %%xmm0, %%xmm2         \n\t"
3677b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm2, %%xmm8           \n\t"
3687b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
3690a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pmaddwd %%xmm0, %%xmm3         \n\t"
3707b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm3, %%xmm9           \n\t"
3717b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
3727b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
3737b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm2         \n\t"
3747b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm2, %%xmm10          \n\t"
3757b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
3767b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm3         \n\t"
3777b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm3, %%xmm11          \n\t"
3787b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
3797b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        // next LHS cell
3807b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmovzxbw 0x28(%[lhs_ptr]), %%xmm0\n\t"
3817b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
3827b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm2         \n\t"
3830a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd %%xmm2, %%xmm12          \n\t"
3847b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
3857b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm3         \n\t"
3860a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd %%xmm3, %%xmm13          \n\t"
3877b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
3880a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
3897b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm2         \n\t"
3907b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm2, %%xmm14          \n\t"
3910a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
3927b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm3         \n\t"
3937b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm3, %%xmm15          \n\t"
3947b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
3957b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "addq $0x30, %[lhs_ptr]\n\t"
3967b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "addq $0x10, %[rhs_ptr]\n\t"
3977b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
3987b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "subq $2, %[run_depth_cells]\n\t"
3997b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "jnz outerLoop2%=\n\t"
4007b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
4017b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "movq %[run_depth_cells], %%r14\n\t"
4027b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "decq %%r14\n\t"
4037b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "js finish%=\n\t"
4047b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
4057b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        // Loop for K unrolled by 2
4067b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "outerLoop1%=:\n\t"
4077b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
4087b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        // RHS cell to xmm1
4097b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t"
4107b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
4117b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        // LHS cell
4127b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t"
4137b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
4147b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm2         \n\t"
4157b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm2, %%xmm4           \n\t"
4167b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
4177b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm3         \n\t"
4187b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm3, %%xmm5           \n\t"
4197b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
4207b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm2         \n\t"
4217b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm2, %%xmm6           \n\t"
4227b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
4237b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm3         \n\t"
4247b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm3, %%xmm7           \n\t"
4257b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
4267b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        // next LHS cell
4277b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmovzxbw 0x08(%[lhs_ptr]), %%xmm0\n\t"
4287b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
4297b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm2         \n\t"
4307b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm2, %%xmm8           \n\t"
4317b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
4327b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm3         \n\t"
4337b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm3, %%xmm9           \n\t"
4347b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
4350a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pmaddwd %%xmm0, %%xmm2         \n\t"
4367b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm2, %%xmm10          \n\t"
4377b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
4380a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "pmaddwd %%xmm0, %%xmm3         \n\t"
4397b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm3, %%xmm11          \n\t"
4407b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
4417b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        // next LHS cell
4427b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmovzxbw 0x10(%[lhs_ptr]), %%xmm0\n\t"
4437b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
4447b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm2         \n\t"
4457b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm2, %%xmm12          \n\t"
4467b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
4477b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm3         \n\t"
4487b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "paddd %%xmm3, %%xmm13          \n\t"
4497b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
4507b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm2         \n\t"
4510a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd %%xmm2, %%xmm14          \n\t"
4527b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
4537b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "pmaddwd %%xmm0, %%xmm3         \n\t"
4540a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd %%xmm3, %%xmm15          \n\t"
4550a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
4560a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "addq $0x18, %[lhs_ptr]\n\t"
4570a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "addq $0x08, %[rhs_ptr]\n\t"
4587b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
4590a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "decq %[run_depth_cells]\n\t"
4607b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "jnz outerLoop1%=\n\t"
4617b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
4627b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "finish%=:\n\t"
4630a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
4640a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "test %[start_depth], %[start_depth]\n\t"
4650a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "jz storeDst%=\n\t"
4660a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
4670a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd 0x00(%[dst_ptr])           , %%xmm4 \n\t"
4680a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd 0x10(%[dst_ptr])           , %%xmm8 \n\t"
4690a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd 0x20(%[dst_ptr])           , %%xmm12\n\t"
4700a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd 0x00(%[dst_ptr], %%r12, 1) , %%xmm5 \n\t"
4710a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd 0x10(%[dst_ptr], %%r12, 1) , %%xmm9 \n\t"
4720a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd 0x20(%[dst_ptr], %%r12, 1) , %%xmm13\n\t"
4730a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd 0x00(%[dst_ptr], %%r12, 2) , %%xmm6 \n\t"
4740a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd 0x10(%[dst_ptr], %%r12, 2) , %%xmm10\n\t"
4750a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd 0x20(%[dst_ptr], %%r12, 2) , %%xmm14\n\t"
4760a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd 0x00(%[dst_ptr], %%r13, 1) , %%xmm7 \n\t"
4770a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd 0x10(%[dst_ptr], %%r13, 1) , %%xmm11\n\t"
4780a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "paddd 0x20(%[dst_ptr], %%r13, 1) , %%xmm15\n\t"
4790a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
4800a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "storeDst%=:\n\t"
4810a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
4820a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "movdqu %%xmm4  , 0x00(%[dst_ptr])          \n\t"
4830a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "movdqu %%xmm8  , 0x10(%[dst_ptr])          \n\t"
4840a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "movdqu %%xmm12 , 0x20(%[dst_ptr])          \n\t"
4850a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "movdqu %%xmm5  , 0x00(%[dst_ptr], %%r12, 1)\n\t"
4860a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "movdqu %%xmm9  , 0x10(%[dst_ptr], %%r12, 1)\n\t"
4870a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "movdqu %%xmm13 , 0x20(%[dst_ptr], %%r12, 1)\n\t"
4880a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "movdqu %%xmm6  , 0x00(%[dst_ptr], %%r12, 2)\n\t"
4890a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "movdqu %%xmm10 , 0x10(%[dst_ptr], %%r12, 2)\n\t"
4900a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "movdqu %%xmm14 , 0x20(%[dst_ptr], %%r12, 2)\n\t"
4910a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "movdqu %%xmm7  , 0x00(%[dst_ptr], %%r13, 1)\n\t"
4920a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "movdqu %%xmm11 , 0x10(%[dst_ptr], %%r13, 1)\n\t"
4930a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "movdqu %%xmm15 , 0x20(%[dst_ptr], %%r13, 1)\n\t"
4940a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
4950a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        :  // outputs
4960a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
4970a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        [dst_ptr] "+r"(dst_ptr)
4980a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        :  // inputs
4990a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        [start_depth] "r"(start_depth),
5000a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        [dst_col_stride_q] "r"(dst_col_stride_q),
5010a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        [run_depth_cells] "r"(run_depth_cells)
5020a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        :  // clobbers
5030a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        "cc", "memory", "%xmm0", "%xmm1", "%xmm3", "%xmm2", "%xmm4", "%xmm5",
5047b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%r12", "%r13", "%r14",
5057b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15");
5060a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  }
5070a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang};
5080a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang#endif
5090a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
5100a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang}  // namespace gemmlowp
5110a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
5120a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang#endif  // GEMMLOWP_INTERNAL_KERNEL_SSE_H_
513