// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// kernel_SSE.h: a collection of Intel SSE optimized kernels.
// Check in kernel_default.h which one(s) are actually used by default.
// Others are mere experiments; they are still covered by tests
// in case they might be useful some day.
197b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// 200a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 210a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang#ifndef GEMMLOWP_INTERNAL_KERNEL_SSE_H_ 220a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang#define GEMMLOWP_INTERNAL_KERNEL_SSE_H_ 230a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 240a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang#include "kernel.h" 250a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 260a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang#include <string.h> 277b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#include <cassert> 280a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 290a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wangnamespace gemmlowp { 300a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 317b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#ifdef GEMMLOWP_SSE4_32 327b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangstruct SSE4_32_Kernel4x4Depth2 : KernelBase { 330a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang typedef KernelFormat< 340a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1>, 357b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1> > 367b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang Format; 370a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 380a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang const char* Name() const override { return "SSE, 4x4, depth 2"; } 390a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 407b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride, 417b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang std::size_t dst_col_stride, const std::uint8_t* lhs_ptr, 427b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang const std::uint8_t* rhs_ptr, std::size_t start_depth, 437b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang std::size_t run_depth) const override { 
440a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang ScopedProfilingLabel label("optimized kernel"); 450a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang assert(dst_row_stride == 1); 460a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang std::int32_t run_depth_cells = run_depth / Format::kDepth; 470a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang /* Main loop */ 480a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 490a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // A 2x4 cell of Rhs is stored in 16bit in xmm1 . 500a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // A 4x2 block Lhs is stored in 16bit in xmm0. 510a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // A 4x4 block of accumulators is stored in 32bit in xmm4--xmm7. 520a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // 530a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // +-------+-------+-------+-------+ 540a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // |xmm1[0]|xmm1[2]|xmm1[4]|xmm1[6]| 550a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // Rhs +-------+---------------+-------+ 560a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // |xmm1[1]|xmm1[3]|xmm1[5]|xmm1[7]| 570a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // +-------+-------+-------+-------+ 580a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // 590a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // | | | | | 600a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // 610a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // Lhs | | | | | 620a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // 630a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // +--+--+ - - - - +-------+-------+-------+-------+ 640a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // |xmm0 | | xmm4 | xmm5 | xmm6 | xmm7 | 650a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // |xmm0 | (Iter1) | xmm4 | xmm5 | xmm6 | xmm7 | 660a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // |xmm0 | | xmm4 | xmm5 | xmm6 | xmm7 | 670a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // |xmm0 | | xmm4 | 
xmm5 | xmm6 | xmm7 | 680a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // +--+--+ - - - - +-------+-------+-------+-------+ 690a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // 700a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // Accumulator 710a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 720a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang asm volatile( 730a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 740a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // set accumulators to zero. 750a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pxor %%xmm4 , %%xmm4 \n\t" 760a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pxor %%xmm5 , %%xmm5 \n\t" 770a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pxor %%xmm6 , %%xmm6 \n\t" 780a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pxor %%xmm7 , %%xmm7 \n\t" 797b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 800a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "movl %[run_depth_cells], %%eax\n\t" 817b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "subl $2, %%eax\n\t" 827b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "js outerLoop1%=\n\t" 837b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 847b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang // Loop for K unrolled by 4 857b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "outerLoop2%=:\n\t" 860a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 877b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang // K = 1,2 880a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // RHS cell to xmm1 890a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t" 900a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 910a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // LHS cell 920a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t" 930a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 947b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm2 \n\t" 
957b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm2, %%xmm4 \n\t" 960a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 977b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm3 \n\t" 987b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm3, %%xmm5 \n\t" 997b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 1007b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "prefetcht0 0x80(%[lhs_ptr]) \n\t" 1017b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 1027b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 1030a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pmaddwd %%xmm0, %%xmm2 \n\t" 1047b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm2, %%xmm6 \n\t" 1057b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 1060a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pmaddwd %%xmm0, %%xmm3 \n\t" 1077b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm3, %%xmm7 \n\t" 1087b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 1097b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "prefetcht0 0x80(%[rhs_ptr]) \n\t" 1107b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 1117b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang // K = 3,4 1127b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang // RHS cell to xmm1 1137b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmovzxbw 0x08(%[rhs_ptr]), %%xmm1\n\t" 1147b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 1157b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang // LHS cell 1167b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmovzxbw 0x08(%[lhs_ptr]), %%xmm0\n\t" 1177b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 1187b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm2 \n\t" 1190a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd %%xmm2, %%xmm4 \n\t" 1207b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 
1217b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm3 \n\t" 1220a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd %%xmm3, %%xmm5 \n\t" 1237b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 1240a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 1257b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm2 \n\t" 1267b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm2, %%xmm6 \n\t" 1270a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 1287b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm3 \n\t" 1297b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm3, %%xmm7 \n\t" 1307b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 1317b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "addl $0x10, %[lhs_ptr]\n\t" 1327b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "addl $0x10, %[rhs_ptr]\n\t" 1337b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 1347b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "subl $2, %[run_depth_cells]\n\t" 1357b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "jnz outerLoop2%=\n\t" 1367b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 1377b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "movl %[run_depth_cells], %%eax\n\t" 1387b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "decl %%eax\n\t" 1397b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "js finish%=\n\t" 1407b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 1417b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang // Loop for K unrolled by 2 1427b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "outerLoop1%=:\n\t" 1437b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 1447b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang // RHS cell to xmm1 1457b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t" 1467b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 1477b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang // LHS cell 
1487b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t" 1497b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 1500a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pmaddwd %%xmm0, %%xmm2 \n\t" 1517b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm2, %%xmm4 \n\t" 1527b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 1530a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pmaddwd %%xmm0, %%xmm3 \n\t" 1547b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm3, %%xmm5 \n\t" 1557b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 1567b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 1577b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm2 \n\t" 1580a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd %%xmm2, %%xmm6 \n\t" 1597b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 1607b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm3 \n\t" 1610a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd %%xmm3, %%xmm7 \n\t" 1620a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 1630a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "addl $0x08, %[lhs_ptr]\n\t" 1640a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "addl $0x08, %[rhs_ptr]\n\t" 1657b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 1667b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "decl %[run_depth_cells]\n\t" 1677b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "jnz outerLoop1%=\n\t" 1687b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 1697b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "finish%=:\n\t" 1700a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 1710a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "movl %[dst_col_stride], %%eax\n\t" 1720a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "shll $2, %%eax\n\t" 1730a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 
1740a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "movl %[start_depth], %%ecx\n\t" 1750a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "test %%ecx, %%ecx\n\t" 1760a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "jz storeDst%=\n\t" 1770a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 1780a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "leal (%%eax,%%eax,0x2), %%ecx\n\t" 1790a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd 0x00(%[dst_ptr]) , %%xmm4 \n\t" 1800a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd 0x00(%[dst_ptr], %%eax, 1) , %%xmm5 \n\t" 1810a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd 0x00(%[dst_ptr], %%eax, 2) , %%xmm6 \n\t" 1820a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd 0x00(%[dst_ptr], %%ecx, 1) , %%xmm7 \n\t" 1830a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 1840a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "storeDst%=:\n\t" 1850a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 1860a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "leal (%%eax,%%eax,0x2), %%ecx\n\t" 1870a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "movdqu %%xmm4 , 0x00(%[dst_ptr]) \n\t" 1880a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "movdqu %%xmm5 , 0x00(%[dst_ptr], %%eax, 1)\n\t" 1890a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "movdqu %%xmm6 , 0x00(%[dst_ptr], %%eax, 2)\n\t" 1900a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "movdqu %%xmm7 , 0x00(%[dst_ptr], %%ecx, 1)\n\t" 1910a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 1920a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang : // outputs 1930a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 1940a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang [dst_ptr] "+r"(dst_ptr) 1950a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang : // inputs 1960a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang [start_depth] "g"(start_depth), [dst_col_stride] "g"(dst_col_stride), 1970a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang [run_depth_cells] 
"g"(run_depth_cells) 1980a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang : // clobbers 1990a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "cc", "memory", "%xmm0", "%xmm1", "%xmm3", "%xmm2", "%xmm4", "%xmm5", 2000a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "%xmm6", "%xmm7", "%eax", "%ecx"); 2010a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang } 2020a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang}; 2030a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang#endif 2047b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#ifdef GEMMLOWP_SSE4_64 2057b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangstruct SSE4_64_Kernel12x4Depth2 : KernelBase { 2060a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang typedef KernelFormat< 2070a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 3>, 2087b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1> > 2097b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang Format; 2100a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 2110a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang const char* Name() const override { return "SSE, 12x4, depth 2"; } 2120a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 2137b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride, 2147b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang std::size_t dst_col_stride, const std::uint8_t* lhs_ptr, 2157b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang const std::uint8_t* rhs_ptr, std::size_t start_depth, 2167b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang std::size_t run_depth) const override { 2170a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang ScopedProfilingLabel label("optimized kernel"); 2180a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang assert(dst_row_stride == 1); 2190a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang const std::int64_t run_depth_cells = run_depth / Format::kDepth; 
2200a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang const std::int64_t dst_col_stride_q = dst_col_stride; 2210a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 2220a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang /* Main loop */ 2230a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 2240a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // A 2x4 cell of Rhs is stored in 16bit in xmm1 . 2250a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in xmm0, replaced 2260a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // every Iteration. 2270a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // A 12x4 block of accumulators is stored in 32bit in xmm4--xmm15. 2280a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // 2290a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // +-------+-------+-------+-------+ 2300a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // |xmm1[0]|xmm1[2]|xmm1[4]|xmm1[6]| 2310a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // Rhs +-------+---------------+-------+ 2320a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // |xmm1[1]|xmm1[3]|xmm1[5]|xmm1[7]| 2330a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // +-------+-------+-------+-------+ 2340a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // 2350a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // | | | | | 2360a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // 2370a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // Lhs | | | | | 2380a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // 2390a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // +--+--+ - - - - +-------+-------+-------+-------+ 2400a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // |xmm0 | | xmm4 | xmm5 | xmm6 | xmm7 | 2410a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // |xmm0 | (Iter1) | xmm4 | xmm5 | xmm6 | xmm7 | 2420a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // |xmm0 | | xmm4 | xmm5 | xmm6 | xmm7 | 2430a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // |xmm0 | | xmm4 | xmm5 | xmm6 | 
xmm7 | 2440a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // +--+--+ - - - - +-------+-------+-------+-------+ 2450a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // |xmm0 | | xmm8 | xmm9 | xmm10 | xmm11 | 2460a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // |xmm0 | (Iter2) | xmm8 | xmm9 | xmm10 | xmm11 | 2470a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // |xmm0 | | xmm8 | xmm9 | xmm10 | xmm11 | 2480a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // |xmm0 | | xmm8 | xmm9 | xmm10 | xmm11 | 2490a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // +--+--+ - - - - +-------+-------+-------+-------+ 2500a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // |xmm0 | | xmm12 | xmm13 | xmm14 | xmm15 | 2510a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // |xmm0 | (Iter3) | xmm12 | xmm13 | xmm14 | xmm15 | 2520a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // |xmm0 | | xmm12 | xmm13 | xmm14 | xmm15 | 2530a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // |xmm0 | | xmm12 | xmm13 | xmm14 | xmm15 | 2540a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // +--+--+ - - - - +-------+-------+-------+-------+ 2550a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // 2560a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // Accumulator 2570a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 2580a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang asm volatile( 2590a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 2607b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang // Set registers for destination 2617b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "movq %[dst_col_stride_q], %%r12\n\t" 2627b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "shlq $2, %%r12\n\t" 2637b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "leaq (%%r12,%%r12,0x2), %%r13\n\t" 2647b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 2657b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang // Set accumulators to zero. 
2660a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pxor %%xmm4 , %%xmm4 \n\t" 2670a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pxor %%xmm5 , %%xmm5 \n\t" 2680a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pxor %%xmm6 , %%xmm6 \n\t" 2690a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pxor %%xmm7 , %%xmm7 \n\t" 2700a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pxor %%xmm8 , %%xmm8 \n\t" 2710a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pxor %%xmm9 , %%xmm9 \n\t" 2720a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pxor %%xmm10 , %%xmm10\n\t" 2730a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pxor %%xmm11 , %%xmm11\n\t" 2740a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pxor %%xmm12 , %%xmm12\n\t" 2750a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pxor %%xmm13 , %%xmm13\n\t" 2760a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pxor %%xmm14 , %%xmm14\n\t" 2770a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pxor %%xmm15 , %%xmm15\n\t" 2780a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 2797b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "movq %[run_depth_cells], %%r14\n\t" 2807b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "subq $2, %%r14\n\t" 2817b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "js outerLoop1%=\n\t" 2820a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 2837b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang // Loop for K unrolled by 4 2847b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "outerLoop2%=:\n\t" 2857b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 2867b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang // K = 1,2 2870a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // RHS cell to xmm1 2887b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 2890a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t" 2900a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 2910a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // LHS cell 2920a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pmovzxbw 
0x00(%[lhs_ptr]), %%xmm0\n\t" 2930a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 2940a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pmaddwd %%xmm0, %%xmm2 \n\t" 2950a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd %%xmm2, %%xmm4 \n\t" 2967b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 2977b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm3 \n\t" 2980a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd %%xmm3, %%xmm5 \n\t" 2997b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 3007b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "prefetcht0 0x80(%[lhs_ptr]) \n\t" 3017b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 3020a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 3030a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pmaddwd %%xmm0, %%xmm2 \n\t" 3040a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd %%xmm2, %%xmm6 \n\t" 3057b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 3067b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm3 \n\t" 3070a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd %%xmm3, %%xmm7 \n\t" 3080a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 3090a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // next LHS cell 3100a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pmovzxbw 0x08(%[lhs_ptr]), %%xmm0\n\t" 3110a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 3120a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pmaddwd %%xmm0, %%xmm2 \n\t" 3130a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd %%xmm2, %%xmm8 \n\t" 3147b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 3157b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm3 \n\t" 3160a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd %%xmm3, %%xmm9 \n\t" 3177b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 
3187b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "prefetcht0 0x80(%[rhs_ptr]) \n\t" 3197b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 3200a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 3210a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pmaddwd %%xmm0, %%xmm2 \n\t" 3220a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd %%xmm2, %%xmm10 \n\t" 3237b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 3247b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm3 \n\t" 3250a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd %%xmm3, %%xmm11 \n\t" 3260a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 3270a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // next LHS cell 3280a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pmovzxbw 0x10(%[lhs_ptr]), %%xmm0\n\t" 3290a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 3307b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm2 \n\t" 3317b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm2, %%xmm12 \n\t" 3327b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 3337b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm3 \n\t" 3347b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm3, %%xmm13 \n\t" 3357b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 3367b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 3377b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm2 \n\t" 3387b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm2, %%xmm14 \n\t" 3397b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 3407b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm3 \n\t" 3417b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm3, %%xmm15 \n\t" 3427b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 
3437b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang // K = 3,4 3447b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang // RHS cell to xmm1 3457b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmovzxbw 0x08(%[rhs_ptr]), %%xmm1\n\t" 3467b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 3477b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang // LHS cell 3487b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmovzxbw 0x18(%[lhs_ptr]), %%xmm0\n\t" 3497b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 3507b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm2 \n\t" 3517b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm2, %%xmm4 \n\t" 3520a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 3537b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm3 \n\t" 3547b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm3, %%xmm5 \n\t" 3557b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 3567b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 3577b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm2 \n\t" 3587b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm2, %%xmm6 \n\t" 3597b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 3607b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm3 \n\t" 3617b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm3, %%xmm7 \n\t" 3627b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 3637b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang // next LHS cell 3647b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmovzxbw 0x20(%[lhs_ptr]), %%xmm0\n\t" 3657b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 3660a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pmaddwd %%xmm0, %%xmm2 \n\t" 3677b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm2, %%xmm8 \n\t" 
3687b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 3690a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pmaddwd %%xmm0, %%xmm3 \n\t" 3707b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm3, %%xmm9 \n\t" 3717b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 3727b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 3737b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm2 \n\t" 3747b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm2, %%xmm10 \n\t" 3757b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 3767b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm3 \n\t" 3777b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm3, %%xmm11 \n\t" 3787b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 3797b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang // next LHS cell 3807b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmovzxbw 0x28(%[lhs_ptr]), %%xmm0\n\t" 3817b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 3827b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm2 \n\t" 3830a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd %%xmm2, %%xmm12 \n\t" 3847b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 3857b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm3 \n\t" 3860a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd %%xmm3, %%xmm13 \n\t" 3877b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 3880a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 3897b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm2 \n\t" 3907b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm2, %%xmm14 \n\t" 3910a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 3927b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm3 
\n\t" 3937b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm3, %%xmm15 \n\t" 3947b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 3957b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "addq $0x30, %[lhs_ptr]\n\t" 3967b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "addq $0x10, %[rhs_ptr]\n\t" 3977b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 3987b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "subq $2, %[run_depth_cells]\n\t" 3997b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "jnz outerLoop2%=\n\t" 4007b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 4017b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "movq %[run_depth_cells], %%r14\n\t" 4027b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "decq %%r14\n\t" 4037b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "js finish%=\n\t" 4047b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 4057b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang // Loop for K unrolled by 2 4067b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "outerLoop1%=:\n\t" 4077b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 4087b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang // RHS cell to xmm1 4097b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t" 4107b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 4117b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang // LHS cell 4127b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t" 4137b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 4147b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm2 \n\t" 4157b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm2, %%xmm4 \n\t" 4167b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 4177b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm3 \n\t" 4187b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm3, %%xmm5 \n\t" 4197b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 
"pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 4207b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm2 \n\t" 4217b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm2, %%xmm6 \n\t" 4227b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 4237b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm3 \n\t" 4247b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm3, %%xmm7 \n\t" 4257b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 4267b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang // next LHS cell 4277b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmovzxbw 0x08(%[lhs_ptr]), %%xmm0\n\t" 4287b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 4297b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm2 \n\t" 4307b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm2, %%xmm8 \n\t" 4317b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 4327b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm3 \n\t" 4337b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm3, %%xmm9 \n\t" 4347b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 4350a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pmaddwd %%xmm0, %%xmm2 \n\t" 4367b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm2, %%xmm10 \n\t" 4377b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 4380a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "pmaddwd %%xmm0, %%xmm3 \n\t" 4397b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm3, %%xmm11 \n\t" 4407b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 4417b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang // next LHS cell 4427b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmovzxbw 0x10(%[lhs_ptr]), %%xmm0\n\t" 4437b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 
4447b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm2 \n\t" 4457b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm2, %%xmm12 \n\t" 4467b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 4477b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm3 \n\t" 4487b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "paddd %%xmm3, %%xmm13 \n\t" 4497b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 4507b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm2 \n\t" 4510a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd %%xmm2, %%xmm14 \n\t" 4527b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 4537b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "pmaddwd %%xmm0, %%xmm3 \n\t" 4540a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd %%xmm3, %%xmm15 \n\t" 4550a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 4560a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "addq $0x18, %[lhs_ptr]\n\t" 4570a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "addq $0x08, %[rhs_ptr]\n\t" 4587b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 4590a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "decq %[run_depth_cells]\n\t" 4607b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "jnz outerLoop1%=\n\t" 4617b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 4627b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "finish%=:\n\t" 4630a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 4640a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "test %[start_depth], %[start_depth]\n\t" 4650a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "jz storeDst%=\n\t" 4660a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 4670a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd 0x00(%[dst_ptr]) , %%xmm4 \n\t" 4680a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd 0x10(%[dst_ptr]) , %%xmm8 \n\t" 4690a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd 
0x20(%[dst_ptr]) , %%xmm12\n\t" 4700a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd 0x00(%[dst_ptr], %%r12, 1) , %%xmm5 \n\t" 4710a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd 0x10(%[dst_ptr], %%r12, 1) , %%xmm9 \n\t" 4720a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd 0x20(%[dst_ptr], %%r12, 1) , %%xmm13\n\t" 4730a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd 0x00(%[dst_ptr], %%r12, 2) , %%xmm6 \n\t" 4740a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd 0x10(%[dst_ptr], %%r12, 2) , %%xmm10\n\t" 4750a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd 0x20(%[dst_ptr], %%r12, 2) , %%xmm14\n\t" 4760a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd 0x00(%[dst_ptr], %%r13, 1) , %%xmm7 \n\t" 4770a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd 0x10(%[dst_ptr], %%r13, 1) , %%xmm11\n\t" 4780a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "paddd 0x20(%[dst_ptr], %%r13, 1) , %%xmm15\n\t" 4790a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 4800a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "storeDst%=:\n\t" 4810a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 4820a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "movdqu %%xmm4 , 0x00(%[dst_ptr]) \n\t" 4830a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "movdqu %%xmm8 , 0x10(%[dst_ptr]) \n\t" 4840a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "movdqu %%xmm12 , 0x20(%[dst_ptr]) \n\t" 4850a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "movdqu %%xmm5 , 0x00(%[dst_ptr], %%r12, 1)\n\t" 4860a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "movdqu %%xmm9 , 0x10(%[dst_ptr], %%r12, 1)\n\t" 4870a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "movdqu %%xmm13 , 0x20(%[dst_ptr], %%r12, 1)\n\t" 4880a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "movdqu %%xmm6 , 0x00(%[dst_ptr], %%r12, 2)\n\t" 4890a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "movdqu %%xmm10 , 0x10(%[dst_ptr], %%r12, 2)\n\t" 4900a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "movdqu %%xmm14 , 
0x20(%[dst_ptr], %%r12, 2)\n\t" 4910a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "movdqu %%xmm7 , 0x00(%[dst_ptr], %%r13, 1)\n\t" 4920a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "movdqu %%xmm11 , 0x10(%[dst_ptr], %%r13, 1)\n\t" 4930a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "movdqu %%xmm15 , 0x20(%[dst_ptr], %%r13, 1)\n\t" 4940a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 4950a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang : // outputs 4960a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 4970a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang [dst_ptr] "+r"(dst_ptr) 4980a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang : // inputs 4990a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang [start_depth] "r"(start_depth), 5000a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang [dst_col_stride_q] "r"(dst_col_stride_q), 5010a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang [run_depth_cells] "r"(run_depth_cells) 5020a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang : // clobbers 5030a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang "cc", "memory", "%xmm0", "%xmm1", "%xmm3", "%xmm2", "%xmm4", "%xmm5", 5047b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%r12", "%r13", "%r14", 5057b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"); 5060a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang } 5070a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang}; 5080a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang#endif 5090a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 5100a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang} // namespace gemmlowp 5110a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 5120a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang#endif // GEMMLOWP_INTERNAL_KERNEL_SSE_H_ 513