1// Copyright 2015 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// kernel_SSE.h: a collection of Intel SSE optimized kernels.
16// Check in kernel_default.h which one(s) are actually used by default.
17// Others are mere experiments; they are still covered by tests
18// in case they might be useful some day.
19//
20
21#ifndef GEMMLOWP_INTERNAL_KERNEL_SSE_H_
22#define GEMMLOWP_INTERNAL_KERNEL_SSE_H_
23
24#include "kernel.h"
25
26#include <string.h>
27#include <cassert>
28
29namespace gemmlowp {
30
31#ifdef GEMMLOWP_SSE4_32
32struct SSE4_32_Kernel4x4Depth2 : KernelBase {
33  typedef KernelFormat<
34      KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1>,
35      KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1> >
36      Format;
37
38  const char* Name() const override { return "SSE, 4x4, depth 2"; }
39
40  void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride,
41           std::size_t dst_col_stride, const std::uint8_t* lhs_ptr,
42           const std::uint8_t* rhs_ptr, std::size_t start_depth,
43           std::size_t run_depth) const override {
44    ScopedProfilingLabel label("optimized kernel");
45    assert(dst_row_stride == 1);
46    std::int32_t run_depth_cells = run_depth / Format::kDepth;
47    /* Main loop */
48
49    // A 2x4 cell of Rhs is stored in 16bit in xmm1 .
50    // A 4x2 block Lhs is stored in 16bit in xmm0.
51    // A 4x4 block of accumulators is stored in 32bit in xmm4--xmm7.
52    //
53    //                   +-------+-------+-------+-------+
54    //                   |xmm1[0]|xmm1[2]|xmm1[4]|xmm1[6]|
55    //              Rhs  +-------+---------------+-------+
56    //                   |xmm1[1]|xmm1[3]|xmm1[5]|xmm1[7]|
57    //                   +-------+-------+-------+-------+
58    //
59    //                   |       |       |       |       |
60    //
61    //    Lhs            |       |       |       |       |
62    //
63    //  +--+--+ - - - -  +-------+-------+-------+-------+
64    //  |xmm0 |          | xmm4  | xmm5  | xmm6  | xmm7  |
65    //  |xmm0 | (Iter1)  | xmm4  | xmm5  | xmm6  | xmm7  |
66    //  |xmm0 |          | xmm4  | xmm5  | xmm6  | xmm7  |
67    //  |xmm0 |          | xmm4  | xmm5  | xmm6  | xmm7  |
68    //  +--+--+ - - - -  +-------+-------+-------+-------+
69    //
70    //                              Accumulator
71
72    asm volatile(
73
74        // set accumulators to zero.
75        "pxor %%xmm4  , %%xmm4 \n\t"
76        "pxor %%xmm5  , %%xmm5 \n\t"
77        "pxor %%xmm6  , %%xmm6 \n\t"
78        "pxor %%xmm7  , %%xmm7 \n\t"
79
80        "movl  %[run_depth_cells], %%eax\n\t"
81        "subl $2, %%eax\n\t"
82        "js outerLoop1%=\n\t"
83
84        // Loop for K unrolled by 4
85        "outerLoop2%=:\n\t"
86
87        // K = 1,2
88        // RHS cell to xmm1
89        "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t"
90
91        // LHS cell
92        "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t"
93        "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
94        "pmaddwd %%xmm0, %%xmm2         \n\t"
95        "paddd %%xmm2, %%xmm4           \n\t"
96        "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
97        "pmaddwd %%xmm0, %%xmm3         \n\t"
98        "paddd %%xmm3, %%xmm5           \n\t"
99
100        "prefetcht0 0x80(%[lhs_ptr]) \n\t"
101
102        "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
103        "pmaddwd %%xmm0, %%xmm2         \n\t"
104        "paddd %%xmm2, %%xmm6           \n\t"
105        "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
106        "pmaddwd %%xmm0, %%xmm3         \n\t"
107        "paddd %%xmm3, %%xmm7           \n\t"
108
109        "prefetcht0 0x80(%[rhs_ptr]) \n\t"
110
111        // K = 3,4
112        // RHS cell to xmm1
113        "pmovzxbw 0x08(%[rhs_ptr]), %%xmm1\n\t"
114
115        // LHS cell
116        "pmovzxbw 0x08(%[lhs_ptr]), %%xmm0\n\t"
117        "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
118        "pmaddwd %%xmm0, %%xmm2         \n\t"
119        "paddd %%xmm2, %%xmm4           \n\t"
120        "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
121        "pmaddwd %%xmm0, %%xmm3         \n\t"
122        "paddd %%xmm3, %%xmm5           \n\t"
123
124        "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
125        "pmaddwd %%xmm0, %%xmm2         \n\t"
126        "paddd %%xmm2, %%xmm6           \n\t"
127        "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
128        "pmaddwd %%xmm0, %%xmm3         \n\t"
129        "paddd %%xmm3, %%xmm7           \n\t"
130
131        "addl $0x10, %[lhs_ptr]\n\t"
132        "addl $0x10, %[rhs_ptr]\n\t"
133
134        "subl $2, %[run_depth_cells]\n\t"
135        "jnz outerLoop2%=\n\t"
136
137        "movl %[run_depth_cells], %%eax\n\t"
138        "decl %%eax\n\t"
139        "js finish%=\n\t"
140
141        // Loop for K unrolled by 2
142        "outerLoop1%=:\n\t"
143
144        // RHS cell to xmm1
145        "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t"
146
147        // LHS cell
148        "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t"
149        "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
150        "pmaddwd %%xmm0, %%xmm2         \n\t"
151        "paddd %%xmm2, %%xmm4           \n\t"
152        "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
153        "pmaddwd %%xmm0, %%xmm3         \n\t"
154        "paddd %%xmm3, %%xmm5           \n\t"
155
156        "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
157        "pmaddwd %%xmm0, %%xmm2         \n\t"
158        "paddd %%xmm2, %%xmm6           \n\t"
159        "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
160        "pmaddwd %%xmm0, %%xmm3         \n\t"
161        "paddd %%xmm3, %%xmm7           \n\t"
162
163        "addl $0x08, %[lhs_ptr]\n\t"
164        "addl $0x08, %[rhs_ptr]\n\t"
165
166        "decl %[run_depth_cells]\n\t"
167        "jnz outerLoop1%=\n\t"
168
169        "finish%=:\n\t"
170
171        "movl  %[dst_col_stride], %%eax\n\t"
172        "shll $2, %%eax\n\t"
173
174        "movl  %[start_depth], %%ecx\n\t"
175        "test %%ecx, %%ecx\n\t"
176        "jz storeDst%=\n\t"
177
178        "leal (%%eax,%%eax,0x2), %%ecx\n\t"
179        "paddd 0x00(%[dst_ptr])           , %%xmm4 \n\t"
180        "paddd 0x00(%[dst_ptr], %%eax, 1) , %%xmm5 \n\t"
181        "paddd 0x00(%[dst_ptr], %%eax, 2) , %%xmm6 \n\t"
182        "paddd 0x00(%[dst_ptr], %%ecx, 1) , %%xmm7 \n\t"
183
184        "storeDst%=:\n\t"
185
186        "leal (%%eax,%%eax,0x2), %%ecx\n\t"
187        "movdqu %%xmm4  , 0x00(%[dst_ptr])          \n\t"
188        "movdqu %%xmm5  , 0x00(%[dst_ptr], %%eax, 1)\n\t"
189        "movdqu %%xmm6  , 0x00(%[dst_ptr], %%eax, 2)\n\t"
190        "movdqu %%xmm7  , 0x00(%[dst_ptr], %%ecx, 1)\n\t"
191
192        :  // outputs
193        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
194        [dst_ptr] "+r"(dst_ptr)
195        :  // inputs
196        [start_depth] "g"(start_depth), [dst_col_stride] "g"(dst_col_stride),
197        [run_depth_cells] "g"(run_depth_cells)
198        :  // clobbers
199        "cc", "memory", "%xmm0", "%xmm1", "%xmm3", "%xmm2", "%xmm4", "%xmm5",
200        "%xmm6", "%xmm7", "%eax", "%ecx");
201  }
202};
203#endif
204#ifdef GEMMLOWP_SSE4_64
205struct SSE4_64_Kernel12x4Depth2 : KernelBase {
206  typedef KernelFormat<
207      KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 3>,
208      KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1> >
209      Format;
210
211  const char* Name() const override { return "SSE, 12x4, depth 2"; }
212
213  void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride,
214           std::size_t dst_col_stride, const std::uint8_t* lhs_ptr,
215           const std::uint8_t* rhs_ptr, std::size_t start_depth,
216           std::size_t run_depth) const override {
217    ScopedProfilingLabel label("optimized kernel");
218    assert(dst_row_stride == 1);
219    const std::int64_t run_depth_cells = run_depth / Format::kDepth;
220    const std::int64_t dst_col_stride_q = dst_col_stride;
221
222    /* Main loop */
223
224    // A 2x4 cell of Rhs is stored in 16bit in xmm1 .
225    // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in xmm0, replaced
226    // every Iteration.
227    // A 12x4 block of accumulators is stored in 32bit in xmm4--xmm15.
228    //
229    //                   +-------+-------+-------+-------+
230    //                   |xmm1[0]|xmm1[2]|xmm1[4]|xmm1[6]|
231    //              Rhs  +-------+---------------+-------+
232    //                   |xmm1[1]|xmm1[3]|xmm1[5]|xmm1[7]|
233    //                   +-------+-------+-------+-------+
234    //
235    //                   |       |       |       |       |
236    //
237    //    Lhs            |       |       |       |       |
238    //
239    //  +--+--+ - - - -  +-------+-------+-------+-------+
240    //  |xmm0 |          | xmm4  | xmm5  | xmm6  | xmm7  |
241    //  |xmm0 | (Iter1)  | xmm4  | xmm5  | xmm6  | xmm7  |
242    //  |xmm0 |          | xmm4  | xmm5  | xmm6  | xmm7  |
243    //  |xmm0 |          | xmm4  | xmm5  | xmm6  | xmm7  |
244    //  +--+--+ - - - -  +-------+-------+-------+-------+
245    //  |xmm0 |          | xmm8  | xmm9  | xmm10 | xmm11 |
246    //  |xmm0 | (Iter2)  | xmm8  | xmm9  | xmm10 | xmm11 |
247    //  |xmm0 |          | xmm8  | xmm9  | xmm10 | xmm11 |
248    //  |xmm0 |          | xmm8  | xmm9  | xmm10 | xmm11 |
249    //  +--+--+ - - - -  +-------+-------+-------+-------+
250    //  |xmm0 |          | xmm12 | xmm13 | xmm14 | xmm15 |
251    //  |xmm0 | (Iter3)  | xmm12 | xmm13 | xmm14 | xmm15 |
252    //  |xmm0 |          | xmm12 | xmm13 | xmm14 | xmm15 |
253    //  |xmm0 |          | xmm12 | xmm13 | xmm14 | xmm15 |
254    //  +--+--+ - - - -  +-------+-------+-------+-------+
255    //
256    //                              Accumulator
257
258    asm volatile(
259
260        // Set registers for destination
261        "movq  %[dst_col_stride_q], %%r12\n\t"
262        "shlq $2, %%r12\n\t"
263        "leaq (%%r12,%%r12,0x2), %%r13\n\t"
264
265        // Set accumulators to zero.
266        "pxor %%xmm4  , %%xmm4 \n\t"
267        "pxor %%xmm5  , %%xmm5 \n\t"
268        "pxor %%xmm6  , %%xmm6 \n\t"
269        "pxor %%xmm7  , %%xmm7 \n\t"
270        "pxor %%xmm8  , %%xmm8 \n\t"
271        "pxor %%xmm9  , %%xmm9 \n\t"
272        "pxor %%xmm10 , %%xmm10\n\t"
273        "pxor %%xmm11 , %%xmm11\n\t"
274        "pxor %%xmm12 , %%xmm12\n\t"
275        "pxor %%xmm13 , %%xmm13\n\t"
276        "pxor %%xmm14 , %%xmm14\n\t"
277        "pxor %%xmm15 , %%xmm15\n\t"
278
279        "movq  %[run_depth_cells], %%r14\n\t"
280        "subq $2, %%r14\n\t"
281        "js outerLoop1%=\n\t"
282
283        // Loop for K unrolled by 4
284        "outerLoop2%=:\n\t"
285
286        // K = 1,2
287        // RHS cell to xmm1
288
289        "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t"
290
291        // LHS cell
292        "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t"
293        "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
294        "pmaddwd %%xmm0, %%xmm2         \n\t"
295        "paddd %%xmm2, %%xmm4           \n\t"
296        "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
297        "pmaddwd %%xmm0, %%xmm3         \n\t"
298        "paddd %%xmm3, %%xmm5           \n\t"
299
300        "prefetcht0 0x80(%[lhs_ptr]) \n\t"
301
302        "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
303        "pmaddwd %%xmm0, %%xmm2         \n\t"
304        "paddd %%xmm2, %%xmm6           \n\t"
305        "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
306        "pmaddwd %%xmm0, %%xmm3         \n\t"
307        "paddd %%xmm3, %%xmm7           \n\t"
308
309        // next LHS cell
310        "pmovzxbw 0x08(%[lhs_ptr]), %%xmm0\n\t"
311        "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
312        "pmaddwd %%xmm0, %%xmm2         \n\t"
313        "paddd %%xmm2, %%xmm8           \n\t"
314        "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
315        "pmaddwd %%xmm0, %%xmm3         \n\t"
316        "paddd %%xmm3, %%xmm9           \n\t"
317
318        "prefetcht0 0x80(%[rhs_ptr]) \n\t"
319
320        "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
321        "pmaddwd %%xmm0, %%xmm2         \n\t"
322        "paddd %%xmm2, %%xmm10          \n\t"
323        "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
324        "pmaddwd %%xmm0, %%xmm3         \n\t"
325        "paddd %%xmm3, %%xmm11          \n\t"
326
327        // next LHS cell
328        "pmovzxbw 0x10(%[lhs_ptr]), %%xmm0\n\t"
329        "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
330        "pmaddwd %%xmm0, %%xmm2         \n\t"
331        "paddd %%xmm2, %%xmm12          \n\t"
332        "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
333        "pmaddwd %%xmm0, %%xmm3         \n\t"
334        "paddd %%xmm3, %%xmm13          \n\t"
335
336        "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
337        "pmaddwd %%xmm0, %%xmm2         \n\t"
338        "paddd %%xmm2, %%xmm14          \n\t"
339        "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
340        "pmaddwd %%xmm0, %%xmm3         \n\t"
341        "paddd %%xmm3, %%xmm15          \n\t"
342
343        // K = 3,4
344        // RHS cell to xmm1
345        "pmovzxbw 0x08(%[rhs_ptr]), %%xmm1\n\t"
346
347        // LHS cell
348        "pmovzxbw 0x18(%[lhs_ptr]), %%xmm0\n\t"
349        "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
350        "pmaddwd %%xmm0, %%xmm2         \n\t"
351        "paddd %%xmm2, %%xmm4           \n\t"
352        "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
353        "pmaddwd %%xmm0, %%xmm3         \n\t"
354        "paddd %%xmm3, %%xmm5           \n\t"
355
356        "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
357        "pmaddwd %%xmm0, %%xmm2         \n\t"
358        "paddd %%xmm2, %%xmm6           \n\t"
359        "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
360        "pmaddwd %%xmm0, %%xmm3         \n\t"
361        "paddd %%xmm3, %%xmm7           \n\t"
362
363        // next LHS cell
364        "pmovzxbw 0x20(%[lhs_ptr]), %%xmm0\n\t"
365        "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
366        "pmaddwd %%xmm0, %%xmm2         \n\t"
367        "paddd %%xmm2, %%xmm8           \n\t"
368        "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
369        "pmaddwd %%xmm0, %%xmm3         \n\t"
370        "paddd %%xmm3, %%xmm9           \n\t"
371
372        "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
373        "pmaddwd %%xmm0, %%xmm2         \n\t"
374        "paddd %%xmm2, %%xmm10          \n\t"
375        "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
376        "pmaddwd %%xmm0, %%xmm3         \n\t"
377        "paddd %%xmm3, %%xmm11          \n\t"
378
379        // next LHS cell
380        "pmovzxbw 0x28(%[lhs_ptr]), %%xmm0\n\t"
381        "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
382        "pmaddwd %%xmm0, %%xmm2         \n\t"
383        "paddd %%xmm2, %%xmm12          \n\t"
384        "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
385        "pmaddwd %%xmm0, %%xmm3         \n\t"
386        "paddd %%xmm3, %%xmm13          \n\t"
387
388        "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
389        "pmaddwd %%xmm0, %%xmm2         \n\t"
390        "paddd %%xmm2, %%xmm14          \n\t"
391        "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
392        "pmaddwd %%xmm0, %%xmm3         \n\t"
393        "paddd %%xmm3, %%xmm15          \n\t"
394
395        "addq $0x30, %[lhs_ptr]\n\t"
396        "addq $0x10, %[rhs_ptr]\n\t"
397
398        "subq $2, %[run_depth_cells]\n\t"
399        "jnz outerLoop2%=\n\t"
400
401        "movq %[run_depth_cells], %%r14\n\t"
402        "decq %%r14\n\t"
403        "js finish%=\n\t"
404
405        // Loop for K unrolled by 2
406        "outerLoop1%=:\n\t"
407
408        // RHS cell to xmm1
409        "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t"
410
411        // LHS cell
412        "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t"
413        "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
414        "pmaddwd %%xmm0, %%xmm2         \n\t"
415        "paddd %%xmm2, %%xmm4           \n\t"
416        "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
417        "pmaddwd %%xmm0, %%xmm3         \n\t"
418        "paddd %%xmm3, %%xmm5           \n\t"
419        "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
420        "pmaddwd %%xmm0, %%xmm2         \n\t"
421        "paddd %%xmm2, %%xmm6           \n\t"
422        "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
423        "pmaddwd %%xmm0, %%xmm3         \n\t"
424        "paddd %%xmm3, %%xmm7           \n\t"
425
426        // next LHS cell
427        "pmovzxbw 0x08(%[lhs_ptr]), %%xmm0\n\t"
428        "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
429        "pmaddwd %%xmm0, %%xmm2         \n\t"
430        "paddd %%xmm2, %%xmm8           \n\t"
431        "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
432        "pmaddwd %%xmm0, %%xmm3         \n\t"
433        "paddd %%xmm3, %%xmm9           \n\t"
434        "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
435        "pmaddwd %%xmm0, %%xmm2         \n\t"
436        "paddd %%xmm2, %%xmm10          \n\t"
437        "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
438        "pmaddwd %%xmm0, %%xmm3         \n\t"
439        "paddd %%xmm3, %%xmm11          \n\t"
440
441        // next LHS cell
442        "pmovzxbw 0x10(%[lhs_ptr]), %%xmm0\n\t"
443        "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
444        "pmaddwd %%xmm0, %%xmm2         \n\t"
445        "paddd %%xmm2, %%xmm12          \n\t"
446        "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
447        "pmaddwd %%xmm0, %%xmm3         \n\t"
448        "paddd %%xmm3, %%xmm13          \n\t"
449        "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
450        "pmaddwd %%xmm0, %%xmm2         \n\t"
451        "paddd %%xmm2, %%xmm14          \n\t"
452        "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
453        "pmaddwd %%xmm0, %%xmm3         \n\t"
454        "paddd %%xmm3, %%xmm15          \n\t"
455
456        "addq $0x18, %[lhs_ptr]\n\t"
457        "addq $0x08, %[rhs_ptr]\n\t"
458
459        "decq %[run_depth_cells]\n\t"
460        "jnz outerLoop1%=\n\t"
461
462        "finish%=:\n\t"
463
464        "test %[start_depth], %[start_depth]\n\t"
465        "jz storeDst%=\n\t"
466
467        "paddd 0x00(%[dst_ptr])           , %%xmm4 \n\t"
468        "paddd 0x10(%[dst_ptr])           , %%xmm8 \n\t"
469        "paddd 0x20(%[dst_ptr])           , %%xmm12\n\t"
470        "paddd 0x00(%[dst_ptr], %%r12, 1) , %%xmm5 \n\t"
471        "paddd 0x10(%[dst_ptr], %%r12, 1) , %%xmm9 \n\t"
472        "paddd 0x20(%[dst_ptr], %%r12, 1) , %%xmm13\n\t"
473        "paddd 0x00(%[dst_ptr], %%r12, 2) , %%xmm6 \n\t"
474        "paddd 0x10(%[dst_ptr], %%r12, 2) , %%xmm10\n\t"
475        "paddd 0x20(%[dst_ptr], %%r12, 2) , %%xmm14\n\t"
476        "paddd 0x00(%[dst_ptr], %%r13, 1) , %%xmm7 \n\t"
477        "paddd 0x10(%[dst_ptr], %%r13, 1) , %%xmm11\n\t"
478        "paddd 0x20(%[dst_ptr], %%r13, 1) , %%xmm15\n\t"
479
480        "storeDst%=:\n\t"
481
482        "movdqu %%xmm4  , 0x00(%[dst_ptr])          \n\t"
483        "movdqu %%xmm8  , 0x10(%[dst_ptr])          \n\t"
484        "movdqu %%xmm12 , 0x20(%[dst_ptr])          \n\t"
485        "movdqu %%xmm5  , 0x00(%[dst_ptr], %%r12, 1)\n\t"
486        "movdqu %%xmm9  , 0x10(%[dst_ptr], %%r12, 1)\n\t"
487        "movdqu %%xmm13 , 0x20(%[dst_ptr], %%r12, 1)\n\t"
488        "movdqu %%xmm6  , 0x00(%[dst_ptr], %%r12, 2)\n\t"
489        "movdqu %%xmm10 , 0x10(%[dst_ptr], %%r12, 2)\n\t"
490        "movdqu %%xmm14 , 0x20(%[dst_ptr], %%r12, 2)\n\t"
491        "movdqu %%xmm7  , 0x00(%[dst_ptr], %%r13, 1)\n\t"
492        "movdqu %%xmm11 , 0x10(%[dst_ptr], %%r13, 1)\n\t"
493        "movdqu %%xmm15 , 0x20(%[dst_ptr], %%r13, 1)\n\t"
494
495        :  // outputs
496        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
497        [dst_ptr] "+r"(dst_ptr)
498        :  // inputs
499        [start_depth] "r"(start_depth),
500        [dst_col_stride_q] "r"(dst_col_stride_q),
501        [run_depth_cells] "r"(run_depth_cells)
502        :  // clobbers
503        "cc", "memory", "%xmm0", "%xmm1", "%xmm3", "%xmm2", "%xmm4", "%xmm5",
504        "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%r12", "%r13", "%r14",
505        "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15");
506  }
507};
508#endif
509
510}  // namespace gemmlowp
511
512#endif  // GEMMLOWP_INTERNAL_KERNEL_SSE_H_
513