1// Copyright 2016, VIXL authors
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions are met:
6//
7//   * Redistributions of source code must retain the above copyright notice,
8//     this list of conditions and the following disclaimer.
9//   * Redistributions in binary form must reproduce the above copyright notice,
10//     this list of conditions and the following disclaimer in the documentation
11//     and/or other materials provided with the distribution.
12//   * Neither the name of ARM Limited nor the names of its contributors may be
13//     used to endorse or promote products derived from this software without
14//     specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND
17// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
27#include <cfloat>
28#include <cmath>
29#include <cstdio>
30#include <cstdlib>
31#include <cstring>
32
33#include "test-runner.h"
34#include "test-utils-aarch64.h"
35
36#include "aarch64/cpu-aarch64.h"
37#include "aarch64/debugger-aarch64.h"
38#include "aarch64/disasm-aarch64.h"
39#include "aarch64/macro-assembler-aarch64.h"
40#include "aarch64/simulator-aarch64.h"
41
42namespace vixl {
43namespace aarch64 {
44// Trace tests can only work with the simulator.
45#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
46
// Shorthand for the generator functions in this file: `__ foo(...)` expands to
// `masm->foo(...)`, so any function using it needs a variable named `masm` in
// scope.
47#define __ masm->
// Declares a test via TEST_, with "TRACE_" pasted in front of the given name.
// TEST_ itself is not defined in this file (presumably it comes from
// test-runner.h - confirm).
48#define TEST(name) TEST_(TRACE_##name)
49
// Emits, through `masm`, the fixed sequence of general-purpose ("base")
// instructions used by the trace tests, followed by a few branch patterns.
//
// All instructions are emitted inside one ExactAssemblyScope that spans every
// byte still available in the buffer and uses the kMaximumSize policy
// (presumably meaning the size is an upper bound rather than an exact count -
// confirm against the ExactAssemblyScope documentation).
//
// Register usage visible in this function:
//   - data operands are drawn from registers 2-29 (w and x views);
//   - x0 only ever appears as an address operand (cache maintenance, and
//     memory accesses without writeback);
//   - x1 only ever appears as the base of pre-/post-indexed memory accesses.
// The caller is presumably expected to make x0 and x1 point at accessible
// memory before the generated code runs - TODO confirm at the call sites.
//
// Instructions are listed in alphabetical order of mnemonic. The emission
// order defines the generated code, so do not reorder or regroup the calls.
50static void GenerateTestSequenceBase(MacroAssembler* masm) {
51  ExactAssemblyScope guard(masm,
52                           masm->GetBuffer()->GetRemainingBytes(),
53                           ExactAssemblyScope::kMaximumSize);
54
  // Most instructions are emitted twice: once with 32-bit (w) registers and
  // once with 64-bit (x) registers.
55  __ adc(w3, w4, w5);
56  __ adc(x6, x7, x8);
57  __ adcs(w9, w10, w11);
58  __ adcs(x12, x13, x14);
59  __ add(w15, w16, w17);
60  __ add(x18, x19, x20);
61  __ adds(w21, w22, w23);
62  __ adds(x24, x25, x26);
63  __ and_(w27, w28, w29);
64  __ and_(x2, x3, x4);
65  __ ands(w5, w6, w7);
66  __ ands(x8, x9, x10);
67  __ asr(w11, w12, 0);
68  __ asr(x13, x14, 1);
69  __ asrv(w15, w16, w17);
70  __ asrv(x18, x19, x20);
71  __ bfm(w21, w22, 5, 6);
72  __ bfm(x23, x24, 7, 8);
73  __ bic(w25, w26, w27);
74  __ bic(x28, x29, x2);
75  __ bics(w3, w4, w5);
76  __ bics(x6, x7, x8);
  // Conditional instructions (here and further down) are emitted with pairs of
  // opposite conditions (eq/ne, cc/cs, hi/ls), and ccmn/ccmp also with `al`;
  // presumably so that both outcomes of each condition show up in the trace.
77  __ ccmn(w9, w10, NoFlag, al);
78  __ ccmn(w9, w10, NoFlag, eq);
79  __ ccmn(w9, w10, NoFlag, ne);
80  __ ccmn(x11, x12, CFlag, al);
81  __ ccmn(x11, x12, CFlag, cc);
82  __ ccmn(x11, x12, CFlag, cs);
83  __ ccmp(w13, w14, VFlag, al);
84  __ ccmp(w13, w14, VFlag, hi);
85  __ ccmp(w13, w14, VFlag, ls);
86  __ ccmp(x15, x16, CVFlag, al);
87  __ ccmp(x15, x16, CVFlag, eq);
88  __ ccmp(x15, x16, CVFlag, ne);
89  __ cinc(w17, w18, cc);
90  __ cinc(w17, w18, cs);
91  __ cinc(x19, x20, hi);
92  __ cinc(x19, x20, ls);
93  __ cinv(w21, w22, eq);
94  __ cinv(w21, w22, ne);
95  __ cinv(x23, x24, cc);
96  __ cinv(x23, x24, cs);
97  __ clrex();
98  __ cls(w25, w26);
99  __ cls(x27, x28);
100  __ clz(w29, w2);
101  __ clz(x3, x4);
102  __ cmn(w5, w6);
103  __ cmn(x7, x8);
104  __ cmp(w9, w10);
105  __ cmp(x11, x12);
106  __ cneg(w13, w14, hi);
107  __ cneg(w13, w14, ls);
108  __ cneg(x15, x16, eq);
109  __ cneg(x15, x16, ne);
110  __ crc32b(w17, w18, w19);
111  __ crc32cb(w20, w21, w22);
112  __ crc32ch(w23, w24, w25);
113  __ crc32cw(w26, w27, w28);
114  __ crc32h(w4, w5, w6);
115  __ crc32w(w7, w8, w9);
116  __ csel(w13, w14, w15, cc);
117  __ csel(w13, w14, w15, cs);
118  __ csel(x16, x17, x18, hi);
119  __ csel(x16, x17, x18, ls);
120  __ cset(w19, eq);
121  __ cset(w19, ne);
122  __ cset(x20, cc);
123  __ cset(x20, cs);
124  __ csetm(w21, hi);
125  __ csetm(w21, ls);
126  __ csetm(x22, eq);
127  __ csetm(x22, ne);
128  __ csinc(w23, w24, w25, cc);
129  __ csinc(w23, w24, w25, cs);
130  __ csinc(x26, x27, x28, hi);
131  __ csinc(x26, x27, x28, ls);
132  __ csinv(w29, w2, w3, eq);
133  __ csinv(w29, w2, w3, ne);
134  __ csinv(x4, x5, x6, cc);
135  __ csinv(x4, x5, x6, cs);
136  __ csneg(w7, w8, w9, hi);
137  __ csneg(w7, w8, w9, ls);
138  __ csneg(x10, x11, x12, eq);
139  __ csneg(x10, x11, x12, ne);
  // Cache maintenance and barriers; x0 supplies the address for dc/ic.
140  __ dc(CVAC, x0);
141  __ dmb(InnerShareable, BarrierAll);
142  __ dsb(InnerShareable, BarrierAll);
143  __ eon(w13, w14, w15);
144  __ eon(x16, x17, x18);
145  __ eor(w19, w20, w21);
146  __ eor(x22, x23, x24);
147  __ extr(w25, w26, w27, 9);
148  __ extr(x28, x29, x2, 10);
149  __ hint(NOP);
150  __ ic(IVAU, x0);
151  __ isb();
  // Loads. Accesses based on x0 use no writeback; accesses based on x1 use
  // PostIndex/PreIndex addressing, so x1 changes as the sequence executes.
152  __ ldar(w3, MemOperand(x0));
153  __ ldar(x4, MemOperand(x0));
154  __ ldarb(w5, MemOperand(x0));
155  __ ldarb(x6, MemOperand(x0));
156  __ ldarh(w7, MemOperand(x0));
157  __ ldarh(x8, MemOperand(x0));
158  __ ldaxp(w9, w10, MemOperand(x0));
159  __ ldaxp(x11, x12, MemOperand(x0));
160  __ ldaxr(w13, MemOperand(x0));
161  __ ldaxr(x14, MemOperand(x0));
162  __ ldaxrb(w15, MemOperand(x0));
163  __ ldaxrb(x16, MemOperand(x0));
164  __ ldaxrh(w17, MemOperand(x0));
165  __ ldaxrh(x18, MemOperand(x0));
166  __ ldnp(w19, w20, MemOperand(x0));
167  __ ldnp(x21, x22, MemOperand(x0));
168  __ ldp(w23, w24, MemOperand(x0));
169  __ ldp(w23, w24, MemOperand(x1, 8, PostIndex));
170  __ ldp(w23, w24, MemOperand(x1, 8, PreIndex));
171  __ ldp(x25, x26, MemOperand(x0));
172  __ ldp(x25, x26, MemOperand(x1, 16, PostIndex));
173  __ ldp(x25, x26, MemOperand(x1, 16, PreIndex));
174  __ ldpsw(x27, x28, MemOperand(x0));
175  __ ldpsw(x27, x28, MemOperand(x1, 8, PostIndex));
176  __ ldpsw(x27, x28, MemOperand(x1, 8, PreIndex));
177  __ ldr(w29, MemOperand(x0));
178  __ ldr(w29, MemOperand(x1, 4, PostIndex));
179  __ ldr(w29, MemOperand(x1, 4, PreIndex));
180  __ ldr(x2, MemOperand(x0));
181  __ ldr(x2, MemOperand(x1, 8, PostIndex));
182  __ ldr(x2, MemOperand(x1, 8, PreIndex));
183  __ ldrb(w3, MemOperand(x0));
184  __ ldrb(w3, MemOperand(x1, 1, PostIndex));
185  __ ldrb(w3, MemOperand(x1, 1, PreIndex));
186  __ ldrb(x4, MemOperand(x0));
187  __ ldrb(x4, MemOperand(x1, 1, PostIndex));
188  __ ldrb(x4, MemOperand(x1, 1, PreIndex));
189  __ ldrh(w5, MemOperand(x0));
190  __ ldrh(w5, MemOperand(x1, 2, PostIndex));
191  __ ldrh(w5, MemOperand(x1, 2, PreIndex));
192  __ ldrh(x6, MemOperand(x0));
193  __ ldrh(x6, MemOperand(x1, 2, PostIndex));
194  __ ldrh(x6, MemOperand(x1, 2, PreIndex));
195  __ ldrsb(w7, MemOperand(x0));
196  __ ldrsb(w7, MemOperand(x1, 1, PostIndex));
197  __ ldrsb(w7, MemOperand(x1, 1, PreIndex));
198  __ ldrsb(x8, MemOperand(x0));
199  __ ldrsb(x8, MemOperand(x1, 1, PostIndex));
200  __ ldrsb(x8, MemOperand(x1, 1, PreIndex));
201  __ ldrsh(w9, MemOperand(x0));
202  __ ldrsh(w9, MemOperand(x1, 2, PostIndex));
203  __ ldrsh(w9, MemOperand(x1, 2, PreIndex));
204  __ ldrsh(x10, MemOperand(x0));
205  __ ldrsh(x10, MemOperand(x1, 2, PostIndex));
206  __ ldrsh(x10, MemOperand(x1, 2, PreIndex));
207  __ ldrsw(x11, MemOperand(x0));
208  __ ldrsw(x11, MemOperand(x1, 4, PostIndex));
209  __ ldrsw(x11, MemOperand(x1, 4, PreIndex));
  // The ldur* forms all use odd immediate offsets from x0.
210  __ ldur(w12, MemOperand(x0, 7));
211  __ ldur(x13, MemOperand(x0, 15));
212  __ ldurb(w14, MemOperand(x0, 1));
213  __ ldurb(x15, MemOperand(x0, 1));
214  __ ldurh(w16, MemOperand(x0, 3));
215  __ ldurh(x17, MemOperand(x0, 3));
216  __ ldursb(w18, MemOperand(x0, 1));
217  __ ldursb(x19, MemOperand(x0, 1));
218  __ ldursh(w20, MemOperand(x0, 3));
219  __ ldursh(x21, MemOperand(x0, 3));
220  __ ldursw(x22, MemOperand(x0, 7));
221  __ ldxp(w23, w24, MemOperand(x0));
222  __ ldxp(x25, x26, MemOperand(x0));
223  __ ldxr(w27, MemOperand(x0));
224  __ ldxr(x28, MemOperand(x0));
225  __ ldxrb(w29, MemOperand(x0));
226  __ ldxrb(x2, MemOperand(x0));
227  __ ldxrh(w3, MemOperand(x0));
228  __ ldxrh(x4, MemOperand(x0));
229  __ lsl(w5, w6, 2);
230  __ lsl(x7, x8, 3);
231  __ lslv(w9, w10, w11);
232  __ lslv(x12, x13, x14);
233  __ lsr(w15, w16, 4);
234  __ lsr(x17, x18, 5);
235  __ lsrv(w19, w20, w21);
236  __ lsrv(x22, x23, x24);
237  __ madd(w25, w26, w27, w28);
238  __ madd(x29, x2, x3, x4);
239  __ mneg(w5, w6, w7);
240  __ mneg(x8, x9, x10);
241  __ mov(w11, w12);
242  __ mov(x13, x14);
243  __ movk(w15, 130);
244  __ movk(x16, 131);
245  __ movn(w17, 132);
246  __ movn(x18, 133);
247  __ movz(w19, 134);
248  __ movz(x20, 135);
249  __ msub(w22, w23, w24, w25);
250  __ msub(x26, x27, x28, x29);
251  __ mul(w2, w3, w4);
252  __ mul(x5, x6, x7);
253  __ mvn(w8, w9);
254  __ mvn(x10, x11);
255  __ neg(w12, w13);
256  __ neg(x14, x15);
257  __ negs(w16, w17);
258  __ negs(x18, x19);
259  __ ngc(w20, w21);
260  __ ngc(x22, x23);
261  __ ngcs(w24, w25);
262  __ ngcs(x26, x27);
263  __ nop();
264  __ orn(w28, w29, w2);
265  __ orn(x3, x4, x5);
266  __ orr(w6, w7, w8);
267  __ orr(x9, x10, x11);
268  __ prfm(PLDL1KEEP, MemOperand(x0, 4));
269  __ prfum(PLDL1KEEP, MemOperand(x0, 1));
270  __ rbit(w12, w13);
271  __ rbit(x14, x15);
272  __ rev(w16, w17);
273  __ rev(x18, x19);
274  __ rev16(w20, w21);
275  __ rev16(x22, x23);
276  __ rev32(x24, x25);
277  __ rorv(w26, w27, w28);
278  __ rorv(x29, x2, x3);
279  __ sbc(w4, w5, w6);
280  __ sbc(x7, x8, x9);
281  __ sbcs(w10, w11, w12);
282  __ sbcs(x13, x14, x15);
283  __ sbfiz(w16, w17, 2, 3);
284  __ sbfiz(x18, x19, 4, 5);
285  __ sbfx(w22, w23, 6, 7);
286  __ sbfx(x24, x25, 8, 9);
287  __ sdiv(w26, w27, w28);
288  __ sdiv(x29, x2, x3);
289  __ smulh(x12, x13, x14);
  // Stores. As with the loads, x0-based forms have no writeback and x1-based
  // forms use PostIndex/PreIndex addressing.
290  __ stlr(w18, MemOperand(x0));
291  __ stlr(x19, MemOperand(x0));
292  __ stlrb(w20, MemOperand(x0));
293  __ stlrb(x21, MemOperand(x0));
294  __ stlrh(w22, MemOperand(x0));
295  __ stlrh(x23, MemOperand(x0));
296  __ stlxp(w24, w25, w26, MemOperand(x0));
297  __ stlxp(x27, x28, x29, MemOperand(x0));
298  __ stlxr(w2, w3, MemOperand(x0));
299  __ stlxr(x4, x5, MemOperand(x0));
300  __ stlxrb(w6, w7, MemOperand(x0));
301  __ stlxrb(x8, x9, MemOperand(x0));
302  __ stlxrh(w10, w11, MemOperand(x0));
303  __ stlxrh(x12, x13, MemOperand(x0));
304  __ stnp(w14, w15, MemOperand(x0));
305  __ stnp(x16, x17, MemOperand(x0));
306  __ stp(w18, w19, MemOperand(x0));
307  __ stp(w18, w19, MemOperand(x1, 8, PostIndex));
308  __ stp(w18, w19, MemOperand(x1, 8, PreIndex));
309  __ stp(x20, x21, MemOperand(x0));
310  __ stp(x20, x21, MemOperand(x1, 16, PostIndex));
311  __ stp(x20, x21, MemOperand(x1, 16, PreIndex));
312  __ str(w22, MemOperand(x0));
313  __ str(w22, MemOperand(x1, 4, PostIndex));
314  __ str(w22, MemOperand(x1, 4, PreIndex));
315  __ str(x23, MemOperand(x0));
316  __ str(x23, MemOperand(x1, 8, PostIndex));
317  __ str(x23, MemOperand(x1, 8, PreIndex));
318  __ strb(w24, MemOperand(x0));
319  __ strb(w24, MemOperand(x1, 1, PostIndex));
320  __ strb(w24, MemOperand(x1, 1, PreIndex));
321  __ strb(x25, MemOperand(x0));
322  __ strb(x25, MemOperand(x1, 1, PostIndex));
323  __ strb(x25, MemOperand(x1, 1, PreIndex));
324  __ strh(w26, MemOperand(x0));
325  __ strh(w26, MemOperand(x1, 2, PostIndex));
326  __ strh(w26, MemOperand(x1, 2, PreIndex));
327  __ strh(x27, MemOperand(x0));
328  __ strh(x27, MemOperand(x1, 2, PostIndex));
329  __ strh(x27, MemOperand(x1, 2, PreIndex));
  // The stur* forms all use odd immediate offsets from x0.
330  __ stur(w28, MemOperand(x0, 7));
331  __ stur(x29, MemOperand(x0, 15));
332  __ sturb(w2, MemOperand(x0, 1));
333  __ sturb(x3, MemOperand(x0, 1));
334  __ sturh(w4, MemOperand(x0, 3));
335  __ sturh(x5, MemOperand(x0, 3));
336  __ stxp(w6, w7, w8, MemOperand(x0));
337  __ stxp(x9, x10, x11, MemOperand(x0));
338  __ stxr(w12, w13, MemOperand(x0));
339  __ stxr(x14, x15, MemOperand(x0));
340  __ stxrb(w16, w17, MemOperand(x0));
341  __ stxrb(x18, x19, MemOperand(x0));
342  __ stxrh(w20, w21, MemOperand(x0));
343  __ stxrh(x22, x23, MemOperand(x0));
344  __ sub(w24, w25, w26);
345  __ sub(x27, x28, x29);
346  __ subs(w2, w3, w4);
347  __ subs(x5, x6, x7);
348  __ sxtb(w8, w9);
349  __ sxtb(x10, x11);
350  __ sxth(w12, w13);
351  __ sxth(x14, x15);
352  __ sxtw(w16, w17);
353  __ sxtw(x18, x19);
354  __ tst(w20, w21);
355  __ tst(x22, x23);
356  __ ubfiz(w24, w25, 10, 11);
357  __ ubfiz(x26, x27, 12, 13);
358  __ ubfm(w28, w29, 14, 15);
359  __ ubfm(x2, x3, 1, 2);
360  __ ubfx(w4, w5, 3, 4);
361  __ ubfx(x6, x7, 5, 6);
362  __ udiv(w8, w9, w10);
363  __ udiv(x11, x12, x13);
364  __ umulh(x22, x23, x24);
365  __ uxtb(w28, w29);
366  __ uxtb(x2, x3);
367  __ uxth(w4, w5);
368  __ uxth(x6, x7);
369  __ uxtw(w8, w9);
370  __ uxtw(x10, x11);
371
372  // Branch tests.
373  {
374    Label end;
375    // Branch to the next instruction.
376    __ b(&end);
377    __ bind(&end);
378  }
379  {
380    Label loop, end;
    // x3 - x3 is zero, so this leaves x3 == 0 with the Z flag set; the `ne`
    // branch below therefore falls through on the first pass.
381    __ subs(x3, x3, x3);
382    __ bind(&loop);
383    // Not-taken branch (the first time).
384    // Taken branch (the second time).
385    __ b(&end, ne);
    // x3 is 0 here, so comparing with 1 clears Z and makes `ne` true for the
    // second pass through the loop.
386    __ cmp(x3, 1);
387    // Backwards branch.
388    __ b(&loop);
389    __ bind(&end);
390  }
391}
392
393
// Emits, through `masm`, the fixed sequence of scalar floating-point
// instructions used by the trace tests.
//
// All instructions are emitted inside one ExactAssemblyScope that spans every
// byte still available in the buffer and uses the kMaximumSize policy
// (presumably meaning the size is an upper bound rather than an exact count -
// confirm against the ExactAssemblyScope documentation).
//
// Only register and immediate operands are used; this sequence contains no
// memory operands. Instructions are listed in alphabetical order of mnemonic,
// usually in both a double-precision (d) and a single-precision (s) form. The
// emission order defines the generated code, so do not reorder the calls.
394static void GenerateTestSequenceFP(MacroAssembler* masm) {
395  ExactAssemblyScope guard(masm,
396                           masm->GetBuffer()->GetRemainingBytes(),
397                           ExactAssemblyScope::kMaximumSize);
398
399  // Scalar floating point instructions.
400  __ fabd(d13, d2, d19);
401  __ fabd(s8, s10, s30);
402  __ fabs(d1, d1);
403  __ fabs(s25, s7);
404  __ facge(d1, d23, d16);
405  __ facge(s4, s17, s1);
406  __ facgt(d2, d21, d24);
407  __ facgt(s12, s26, s12);
408  __ fadd(d13, d11, d22);
409  __ fadd(s27, s19, s8);
410  __ fccmp(d6, d10, NoFlag, hs);
411  __ fccmp(s29, s20, NZVFlag, ne);
412  __ fccmpe(d10, d2, NZCFlag, al);
413  __ fccmpe(s3, s3, NZVFlag, pl);
  // Comparisons are emitted in register-register form and, where a 0.0
  // operand is passed, in compare-with-zero form. fcmle and fcmlt are only
  // emitted in the compare-with-zero form.
414  __ fcmeq(d19, d8, d10);
415  __ fcmeq(d0, d18, 0.0);
416  __ fcmeq(s1, s4, s30);
417  __ fcmeq(s22, s29, 0.0);
418  __ fcmge(d27, d18, d1);
419  __ fcmge(d31, d28, 0.0);
420  __ fcmge(s31, s19, s9);
421  __ fcmge(s1, s25, 0.0);
422  __ fcmgt(d18, d1, d15);
423  __ fcmgt(d3, d31, 0.0);
424  __ fcmgt(s11, s25, s2);
425  __ fcmgt(s17, s16, 0.0);
426  __ fcmle(d24, d17, 0.0);
427  __ fcmle(s11, s8, 0.0);
428  __ fcmlt(d5, d31, 0.0);
429  __ fcmlt(s18, s23, 0.0);
430  __ fcmp(d10, d24);
431  __ fcmp(d13, 0.0);
432  __ fcmp(s18, s6);
433  __ fcmp(s16, 0.0);
434  __ fcmpe(d9, d17);
435  __ fcmpe(d29, 0.0);
436  __ fcmpe(s16, s17);
437  __ fcmpe(s22, 0.0);
438  __ fcsel(d10, d14, d19, gt);
439  __ fcsel(s22, s18, s2, ge);
  // fcvt is emitted for every ordered pair of distinct h, s and d registers.
440  __ fcvt(d4, h24);
441  __ fcvt(d11, s2);
442  __ fcvt(h8, d9);
443  __ fcvt(h12, s1);
444  __ fcvt(s12, d31);
445  __ fcvt(s27, h25);
  // Each fcvt{a,m,n,p}{s,u} conversion is emitted with FP scalar destinations
  // (d, s) and with general-purpose destinations (w, x) from both d and s
  // sources.
446  __ fcvtas(d28, d16);
447  __ fcvtas(s3, s5);
448  __ fcvtas(w18, d31);
449  __ fcvtas(w29, s24);
450  __ fcvtas(x9, d1);
451  __ fcvtas(x30, s2);
452  __ fcvtau(d14, d0);
453  __ fcvtau(s31, s14);
454  __ fcvtau(w16, d2);
455  __ fcvtau(w18, s0);
456  __ fcvtau(x26, d7);
457  __ fcvtau(x25, s19);
458  __ fcvtms(d30, d25);
459  __ fcvtms(s12, s15);
460  __ fcvtms(w9, d7);
461  __ fcvtms(w19, s6);
462  __ fcvtms(x6, d6);
463  __ fcvtms(x22, s7);
464  __ fcvtmu(d27, d0);
465  __ fcvtmu(s8, s22);
466  __ fcvtmu(w29, d19);
467  __ fcvtmu(w26, s0);
468  __ fcvtmu(x13, d5);
469  __ fcvtmu(x5, s18);
470  __ fcvtns(d30, d15);
471  __ fcvtns(s10, s11);
472  __ fcvtns(w21, d15);
473  __ fcvtns(w18, s10);
474  __ fcvtns(x8, d17);
475  __ fcvtns(x17, s12);
476  __ fcvtnu(d0, d21);
477  __ fcvtnu(s6, s25);
478  __ fcvtnu(w29, d11);
479  __ fcvtnu(w25, s31);
480  __ fcvtnu(x30, d11);
481  __ fcvtnu(x27, s18);
482  __ fcvtps(d11, d22);
483  __ fcvtps(s29, s20);
484  __ fcvtps(w15, d25);
485  __ fcvtps(w16, s7);
486  __ fcvtps(x13, d20);
487  __ fcvtps(x3, s23);
488  __ fcvtpu(d24, d1);
489  __ fcvtpu(s14, s24);
490  __ fcvtpu(w26, d29);
  // The only instruction in this sequence that targets the zero register.
491  __ fcvtpu(wzr, s26);
492  __ fcvtpu(x27, d6);
493  __ fcvtpu(x29, s14);
494  __ fcvtxn(s12, d12);
  // fcvtzs/fcvtzu are emitted both without and with a trailing integer
  // argument (presumably the number of fractional bits for the fixed-point
  // form - confirm against the assembler declaration).
495  __ fcvtzs(d15, d0);
496  __ fcvtzs(d13, d4, 42);
497  __ fcvtzs(s8, s11);
498  __ fcvtzs(s31, s6, 25);
499  __ fcvtzs(w6, d9);
500  __ fcvtzs(w25, d10, 20);
501  __ fcvtzs(w9, s1);
502  __ fcvtzs(w17, s29, 30);
503  __ fcvtzs(x19, d2);
504  __ fcvtzs(x22, d14, 1);
505  __ fcvtzs(x14, s20);
506  __ fcvtzs(x3, s30, 33);
507  __ fcvtzu(d28, d15);
508  __ fcvtzu(d0, d4, 3);
509  __ fcvtzu(s2, s5);
510  __ fcvtzu(s4, s0, 30);
511  __ fcvtzu(w11, d4);
512  __ fcvtzu(w7, d24, 32);
513  __ fcvtzu(w18, s24);
514  __ fcvtzu(w14, s27, 4);
515  __ fcvtzu(x22, d11);
516  __ fcvtzu(x8, d27, 52);
517  __ fcvtzu(x7, s20);
518  __ fcvtzu(x22, s7, 44);
519  __ fdiv(d6, d14, d15);
520  __ fdiv(s26, s5, s25);
521  __ fmadd(d18, d26, d12, d30);
522  __ fmadd(s13, s9, s28, s4);
523  __ fmax(d12, d5, d5);
524  __ fmax(s12, s28, s6);
525  __ fmaxnm(d28, d4, d2);
526  __ fmaxnm(s6, s10, s8);
527  __ fmin(d20, d20, d18);
528  __ fmin(s7, s13, s16);
529  __ fminnm(d19, d14, d30);
530  __ fminnm(s0, s1, s1);
  // fmov covers FP-to-FP, general-purpose-to-FP, immediate, and
  // FP-to-general-purpose forms.
531  __ fmov(d13, d6);
532  __ fmov(d2, x17);
533  __ fmov(d8, -2.5000);
534  __ fmov(s5, s3);
535  __ fmov(s25, w20);
536  __ fmov(s21, 2.8750f);
537  __ fmov(w18, s24);
538  __ fmov(x18, d2);
539  __ fmsub(d20, d30, d3, d19);
540  __ fmsub(s5, s19, s4, s12);
541  __ fmul(d30, d27, d23);
542  __ fmul(s25, s17, s15);
543  __ fmulx(d4, d17, d1);
544  __ fmulx(s14, s25, s4);
545  __ fneg(d15, d0);
546  __ fneg(s14, s15);
547  __ fnmadd(d0, d16, d22, d31);
548  __ fnmadd(s0, s18, s26, s18);
549  __ fnmsub(d19, d12, d15, d21);
550  __ fnmsub(s29, s0, s11, s26);
551  __ fnmul(d31, d19, d1);
552  __ fnmul(s18, s3, s17);
553  __ frecpe(d7, d21);
554  __ frecpe(s29, s17);
555  __ frecps(d11, d26, d17);
556  __ frecps(s18, s27, s1);
557  __ frecpx(d15, d18);
558  __ frecpx(s5, s10);
559  __ frinta(d16, d30);
560  __ frinta(s1, s22);
561  __ frinti(d19, d29);
562  __ frinti(s14, s21);
563  __ frintm(d20, d30);
564  __ frintm(s1, s16);
565  __ frintn(d30, d1);
566  __ frintn(s24, s10);
567  __ frintp(d4, d20);
568  __ frintp(s13, s3);
569  __ frintx(d13, d20);
570  __ frintx(s17, s7);
571  __ frintz(d0, d8);
572  __ frintz(s15, s29);
573  __ frsqrte(d21, d10);
574  __ frsqrte(s17, s25);
575  __ frsqrts(d4, d29, d17);
576  __ frsqrts(s14, s3, s24);
577  __ fsqrt(d14, d17);
578  __ fsqrt(s4, s14);
579  __ fsub(d13, d19, d7);
580  __ fsub(s3, s21, s27);
  // scvtf/ucvtf take their source from FP (d, s) or general-purpose (w, x)
  // registers, each without and with a trailing integer argument (presumably
  // fractional bits, as for fcvtzs/fcvtzu above - confirm).
581  __ scvtf(d31, d16);
582  __ scvtf(d26, d31, 24);
583  __ scvtf(d6, w16);
584  __ scvtf(d5, w20, 6);
585  __ scvtf(d16, x8);
586  __ scvtf(d15, x8, 10);
587  __ scvtf(s7, s4);
588  __ scvtf(s8, s15, 14);
589  __ scvtf(s29, w10);
590  __ scvtf(s15, w21, 11);
591  __ scvtf(s27, x26);
592  __ scvtf(s26, x12, 38);
593  __ ucvtf(d0, d9);
594  __ ucvtf(d5, d22, 47);
595  __ ucvtf(d30, w27);
596  __ ucvtf(d3, w19, 1);
597  __ ucvtf(d28, x21);
598  __ ucvtf(d27, x30, 35);
599  __ ucvtf(s11, s5);
600  __ ucvtf(s0, s23, 14);
601  __ ucvtf(s20, w19);
602  __ ucvtf(s21, w22, 18);
603  __ ucvtf(s6, x13);
604  __ ucvtf(s7, x2, 21);
605}
606
607
608static void GenerateTestSequenceNEON(MacroAssembler* masm) {
609  ExactAssemblyScope guard(masm,
610                           masm->GetBuffer()->GetRemainingBytes(),
611                           ExactAssemblyScope::kMaximumSize);
612
613  // NEON integer instructions.
614  __ abs(d19, d0);
615  __ abs(v16.V16B(), v11.V16B());
616  __ abs(v0.V2D(), v31.V2D());
617  __ abs(v27.V2S(), v25.V2S());
618  __ abs(v21.V4H(), v27.V4H());
619  __ abs(v16.V4S(), v1.V4S());
620  __ abs(v31.V8B(), v5.V8B());
621  __ abs(v29.V8H(), v13.V8H());
622  __ add(d10, d5, d17);
623  __ add(v31.V16B(), v15.V16B(), v23.V16B());
624  __ add(v10.V2D(), v31.V2D(), v14.V2D());
625  __ add(v15.V2S(), v14.V2S(), v19.V2S());
626  __ add(v27.V4H(), v23.V4H(), v17.V4H());
627  __ add(v25.V4S(), v28.V4S(), v29.V4S());
628  __ add(v13.V8B(), v7.V8B(), v18.V8B());
629  __ add(v4.V8H(), v2.V8H(), v1.V8H());
630  __ addhn(v10.V2S(), v14.V2D(), v15.V2D());
631  __ addhn(v10.V4H(), v30.V4S(), v26.V4S());
632  __ addhn(v31.V8B(), v12.V8H(), v22.V8H());
633  __ addhn2(v16.V16B(), v21.V8H(), v20.V8H());
634  __ addhn2(v0.V4S(), v2.V2D(), v17.V2D());
635  __ addhn2(v31.V8H(), v7.V4S(), v17.V4S());
636  __ addp(d14, v19.V2D());
637  __ addp(v3.V16B(), v8.V16B(), v28.V16B());
638  __ addp(v8.V2D(), v5.V2D(), v17.V2D());
639  __ addp(v22.V2S(), v30.V2S(), v26.V2S());
640  __ addp(v29.V4H(), v24.V4H(), v14.V4H());
641  __ addp(v30.V4S(), v26.V4S(), v24.V4S());
642  __ addp(v12.V8B(), v26.V8B(), v7.V8B());
643  __ addp(v17.V8H(), v8.V8H(), v12.V8H());
644  __ addv(b27, v23.V16B());
645  __ addv(b12, v20.V8B());
646  __ addv(h27, v30.V4H());
647  __ addv(h19, v14.V8H());
648  __ addv(s14, v27.V4S());
649  __ and_(v10.V16B(), v8.V16B(), v27.V16B());
650  __ and_(v5.V8B(), v1.V8B(), v16.V8B());
651  __ bic(v26.V16B(), v3.V16B(), v24.V16B());
652  __ bic(v7.V2S(), 0xe4, 16);
653  __ bic(v28.V4H(), 0x23, 8);
654  __ bic(v29.V4S(), 0xac);
655  __ bic(v12.V8B(), v31.V8B(), v21.V8B());
656  __ bic(v18.V8H(), 0x98);
657  __ bif(v12.V16B(), v26.V16B(), v8.V16B());
658  __ bif(v2.V8B(), v23.V8B(), v27.V8B());
659  __ bit(v8.V16B(), v3.V16B(), v13.V16B());
660  __ bit(v5.V8B(), v5.V8B(), v23.V8B());
661  __ bsl(v9.V16B(), v31.V16B(), v23.V16B());
662  __ bsl(v14.V8B(), v7.V8B(), v3.V8B());
663  __ cls(v29.V16B(), v5.V16B());
664  __ cls(v21.V2S(), v0.V2S());
665  __ cls(v1.V4H(), v12.V4H());
666  __ cls(v27.V4S(), v10.V4S());
667  __ cls(v19.V8B(), v4.V8B());
668  __ cls(v15.V8H(), v14.V8H());
669  __ clz(v1.V16B(), v4.V16B());
670  __ clz(v27.V2S(), v17.V2S());
671  __ clz(v9.V4H(), v9.V4H());
672  __ clz(v31.V4S(), v15.V4S());
673  __ clz(v14.V8B(), v19.V8B());
674  __ clz(v6.V8H(), v11.V8H());
675  __ cmeq(d18, d5, d29);
676  __ cmeq(d14, d31, 0);
677  __ cmeq(v19.V16B(), v3.V16B(), v22.V16B());
678  __ cmeq(v15.V16B(), v9.V16B(), 0);
679  __ cmeq(v12.V2D(), v16.V2D(), v10.V2D());
680  __ cmeq(v8.V2D(), v22.V2D(), 0);
681  __ cmeq(v2.V2S(), v3.V2S(), v9.V2S());
682  __ cmeq(v16.V2S(), v25.V2S(), 0);
683  __ cmeq(v6.V4H(), v23.V4H(), v20.V4H());
684  __ cmeq(v16.V4H(), v13.V4H(), 0);
685  __ cmeq(v21.V4S(), v17.V4S(), v2.V4S());
686  __ cmeq(v6.V4S(), v25.V4S(), 0);
687  __ cmeq(v16.V8B(), v13.V8B(), v2.V8B());
688  __ cmeq(v21.V8B(), v16.V8B(), 0);
689  __ cmeq(v20.V8H(), v7.V8H(), v25.V8H());
690  __ cmeq(v26.V8H(), v8.V8H(), 0);
691  __ cmge(d16, d13, d31);
692  __ cmge(d25, d24, 0);
693  __ cmge(v17.V16B(), v19.V16B(), v17.V16B());
694  __ cmge(v22.V16B(), v30.V16B(), 0);
695  __ cmge(v28.V2D(), v20.V2D(), v26.V2D());
696  __ cmge(v6.V2D(), v23.V2D(), 0);
697  __ cmge(v25.V2S(), v22.V2S(), v3.V2S());
698  __ cmge(v21.V2S(), v11.V2S(), 0);
699  __ cmge(v16.V4H(), v3.V4H(), v12.V4H());
700  __ cmge(v23.V4H(), v9.V4H(), 0);
701  __ cmge(v7.V4S(), v2.V4S(), v11.V4S());
702  __ cmge(v0.V4S(), v22.V4S(), 0);
703  __ cmge(v10.V8B(), v30.V8B(), v9.V8B());
704  __ cmge(v21.V8B(), v8.V8B(), 0);
705  __ cmge(v2.V8H(), v7.V8H(), v26.V8H());
706  __ cmge(v19.V8H(), v10.V8H(), 0);
707  __ cmgt(d6, d13, d1);
708  __ cmgt(d30, d24, 0);
709  __ cmgt(v20.V16B(), v25.V16B(), v27.V16B());
710  __ cmgt(v0.V16B(), v25.V16B(), 0);
711  __ cmgt(v22.V2D(), v25.V2D(), v1.V2D());
712  __ cmgt(v16.V2D(), v16.V2D(), 0);
713  __ cmgt(v5.V2S(), v9.V2S(), v15.V2S());
714  __ cmgt(v12.V2S(), v18.V2S(), 0);
715  __ cmgt(v28.V4H(), v18.V4H(), v11.V4H());
716  __ cmgt(v22.V4H(), v3.V4H(), 0);
717  __ cmgt(v5.V4S(), v11.V4S(), v27.V4S());
718  __ cmgt(v13.V4S(), v20.V4S(), 0);
719  __ cmgt(v27.V8B(), v31.V8B(), v7.V8B());
720  __ cmgt(v5.V8B(), v0.V8B(), 0);
721  __ cmgt(v22.V8H(), v28.V8H(), v13.V8H());
722  __ cmgt(v6.V8H(), v2.V8H(), 0);
723  __ cmhi(d21, d8, d22);
724  __ cmhi(v18.V16B(), v19.V16B(), v19.V16B());
725  __ cmhi(v7.V2D(), v0.V2D(), v21.V2D());
726  __ cmhi(v15.V2S(), v19.V2S(), v0.V2S());
727  __ cmhi(v31.V4H(), v7.V4H(), v12.V4H());
728  __ cmhi(v9.V4S(), v16.V4S(), v22.V4S());
729  __ cmhi(v7.V8B(), v24.V8B(), v28.V8B());
730  __ cmhi(v11.V8H(), v10.V8H(), v25.V8H());
731  __ cmhs(d1, d12, d17);
732  __ cmhs(v21.V16B(), v25.V16B(), v30.V16B());
733  __ cmhs(v8.V2D(), v2.V2D(), v26.V2D());
734  __ cmhs(v1.V2S(), v22.V2S(), v29.V2S());
735  __ cmhs(v26.V4H(), v30.V4H(), v30.V4H());
736  __ cmhs(v19.V4S(), v20.V4S(), v16.V4S());
737  __ cmhs(v1.V8B(), v3.V8B(), v26.V8B());
738  __ cmhs(v20.V8H(), v28.V8H(), v8.V8H());
739  __ cmle(d30, d24, 0);
740  __ cmle(v0.V16B(), v3.V16B(), 0);
741  __ cmle(v2.V2D(), v30.V2D(), 0);
742  __ cmle(v7.V2S(), v10.V2S(), 0);
743  __ cmle(v9.V4H(), v31.V4H(), 0);
744  __ cmle(v9.V4S(), v18.V4S(), 0);
745  __ cmle(v21.V8B(), v31.V8B(), 0);
746  __ cmle(v29.V8H(), v21.V8H(), 0);
747  __ cmlt(d25, d23, 0);
748  __ cmlt(v7.V16B(), v21.V16B(), 0);
749  __ cmlt(v7.V2D(), v30.V2D(), 0);
750  __ cmlt(v25.V2S(), v28.V2S(), 0);
751  __ cmlt(v0.V4H(), v11.V4H(), 0);
752  __ cmlt(v24.V4S(), v5.V4S(), 0);
753  __ cmlt(v26.V8B(), v11.V8B(), 0);
754  __ cmlt(v1.V8H(), v21.V8H(), 0);
755  __ cmtst(d28, d23, d30);
756  __ cmtst(v26.V16B(), v6.V16B(), v31.V16B());
757  __ cmtst(v1.V2D(), v21.V2D(), v4.V2D());
758  __ cmtst(v27.V2S(), v26.V2S(), v20.V2S());
759  __ cmtst(v26.V4H(), v0.V4H(), v18.V4H());
760  __ cmtst(v25.V4S(), v16.V4S(), v4.V4S());
761  __ cmtst(v11.V8B(), v10.V8B(), v9.V8B());
762  __ cmtst(v0.V8H(), v2.V8H(), v1.V8H());
763  __ cnt(v25.V16B(), v15.V16B());
764  __ cnt(v28.V8B(), v6.V8B());
765  __ dup(v6.V16B(), v7.B(), 7);
766  __ dup(v9.V16B(), w20);
767  __ dup(v12.V2D(), v13.D(), 1);
768  __ dup(v9.V2D(), xzr);
769  __ dup(v4.V2S(), v26.S(), 2);
770  __ dup(v3.V2S(), w12);
771  __ dup(v22.V4H(), v5.H(), 7);
772  __ dup(v16.V4H(), w25);
773  __ dup(v20.V4S(), v10.S(), 2);
774  __ dup(v10.V4S(), w7);
775  __ dup(v30.V8B(), v30.B(), 2);
776  __ dup(v31.V8B(), w15);
777  __ dup(v28.V8H(), v17.H(), 4);
778  __ dup(v2.V8H(), w3);
779  __ eor(v29.V16B(), v25.V16B(), v3.V16B());
780  __ eor(v3.V8B(), v16.V8B(), v28.V8B());
781  __ ext(v1.V16B(), v26.V16B(), v6.V16B(), 1);
782  __ ext(v2.V8B(), v30.V8B(), v1.V8B(), 1);
783  __ ld1(v18.V16B(), v19.V16B(), v20.V16B(), v21.V16B(), MemOperand(x0));
784  __ ld1(v23.V16B(),
785         v24.V16B(),
786         v25.V16B(),
787         v26.V16B(),
788         MemOperand(x1, x2, PostIndex));
789  __ ld1(v5.V16B(),
790         v6.V16B(),
791         v7.V16B(),
792         v8.V16B(),
793         MemOperand(x1, 64, PostIndex));
794  __ ld1(v18.V16B(), v19.V16B(), v20.V16B(), MemOperand(x0));
795  __ ld1(v13.V16B(), v14.V16B(), v15.V16B(), MemOperand(x1, x2, PostIndex));
796  __ ld1(v19.V16B(), v20.V16B(), v21.V16B(), MemOperand(x1, 48, PostIndex));
797  __ ld1(v17.V16B(), v18.V16B(), MemOperand(x0));
798  __ ld1(v20.V16B(), v21.V16B(), MemOperand(x1, x2, PostIndex));
799  __ ld1(v28.V16B(), v29.V16B(), MemOperand(x1, 32, PostIndex));
800  __ ld1(v29.V16B(), MemOperand(x0));
801  __ ld1(v21.V16B(), MemOperand(x1, x2, PostIndex));
802  __ ld1(v4.V16B(), MemOperand(x1, 16, PostIndex));
803  __ ld1(v4.V1D(), v5.V1D(), v6.V1D(), v7.V1D(), MemOperand(x0));
804  __ ld1(v17.V1D(),
805         v18.V1D(),
806         v19.V1D(),
807         v20.V1D(),
808         MemOperand(x1, x2, PostIndex));
809  __ ld1(v28.V1D(),
810         v29.V1D(),
811         v30.V1D(),
812         v31.V1D(),
813         MemOperand(x1, 32, PostIndex));
814  __ ld1(v20.V1D(), v21.V1D(), v22.V1D(), MemOperand(x0));
815  __ ld1(v19.V1D(), v20.V1D(), v21.V1D(), MemOperand(x1, x2, PostIndex));
816  __ ld1(v12.V1D(), v13.V1D(), v14.V1D(), MemOperand(x1, 24, PostIndex));
817  __ ld1(v29.V1D(), v30.V1D(), MemOperand(x0));
818  __ ld1(v31.V1D(), v0.V1D(), MemOperand(x1, x2, PostIndex));
819  __ ld1(v3.V1D(), v4.V1D(), MemOperand(x1, 16, PostIndex));
820  __ ld1(v28.V1D(), MemOperand(x0));
821  __ ld1(v11.V1D(), MemOperand(x1, x2, PostIndex));
822  __ ld1(v29.V1D(), MemOperand(x1, 8, PostIndex));
823  __ ld1(v28.V2D(), v29.V2D(), v30.V2D(), v31.V2D(), MemOperand(x0));
824  __ ld1(v8.V2D(),
825         v9.V2D(),
826         v10.V2D(),
827         v11.V2D(),
828         MemOperand(x1, x2, PostIndex));
829  __ ld1(v14.V2D(),
830         v15.V2D(),
831         v16.V2D(),
832         v17.V2D(),
833         MemOperand(x1, 64, PostIndex));
834  __ ld1(v26.V2D(), v27.V2D(), v28.V2D(), MemOperand(x0));
835  __ ld1(v5.V2D(), v6.V2D(), v7.V2D(), MemOperand(x1, x2, PostIndex));
836  __ ld1(v26.V2D(), v27.V2D(), v28.V2D(), MemOperand(x1, 48, PostIndex));
837  __ ld1(v18.V2D(), v19.V2D(), MemOperand(x0));
838  __ ld1(v21.V2D(), v22.V2D(), MemOperand(x1, x2, PostIndex));
839  __ ld1(v17.V2D(), v18.V2D(), MemOperand(x1, 32, PostIndex));
840  __ ld1(v5.V2D(), MemOperand(x0));
841  __ ld1(v6.V2D(), MemOperand(x1, x2, PostIndex));
842  __ ld1(v15.V2D(), MemOperand(x1, 16, PostIndex));
843  __ ld1(v30.V2S(), v31.V2S(), v0.V2S(), v1.V2S(), MemOperand(x0));
844  __ ld1(v24.V2S(),
845         v25.V2S(),
846         v26.V2S(),
847         v27.V2S(),
848         MemOperand(x1, x2, PostIndex));
849  __ ld1(v27.V2S(),
850         v28.V2S(),
851         v29.V2S(),
852         v30.V2S(),
853         MemOperand(x1, 32, PostIndex));
854  __ ld1(v11.V2S(), v12.V2S(), v13.V2S(), MemOperand(x0));
855  __ ld1(v8.V2S(), v9.V2S(), v10.V2S(), MemOperand(x1, x2, PostIndex));
856  __ ld1(v31.V2S(), v0.V2S(), v1.V2S(), MemOperand(x1, 24, PostIndex));
857  __ ld1(v0.V2S(), v1.V2S(), MemOperand(x0));
858  __ ld1(v13.V2S(), v14.V2S(), MemOperand(x1, x2, PostIndex));
859  __ ld1(v3.V2S(), v4.V2S(), MemOperand(x1, 16, PostIndex));
860  __ ld1(v26.V2S(), MemOperand(x0));
861  __ ld1(v0.V2S(), MemOperand(x1, x2, PostIndex));
862  __ ld1(v11.V2S(), MemOperand(x1, 8, PostIndex));
863  __ ld1(v16.V4H(), v17.V4H(), v18.V4H(), v19.V4H(), MemOperand(x0));
864  __ ld1(v24.V4H(),
865         v25.V4H(),
866         v26.V4H(),
867         v27.V4H(),
868         MemOperand(x1, x2, PostIndex));
869  __ ld1(v1.V4H(), v2.V4H(), v3.V4H(), v4.V4H(), MemOperand(x1, 32, PostIndex));
870  __ ld1(v30.V4H(), v31.V4H(), v0.V4H(), MemOperand(x0));
871  __ ld1(v25.V4H(), v26.V4H(), v27.V4H(), MemOperand(x1, x2, PostIndex));
872  __ ld1(v3.V4H(), v4.V4H(), v5.V4H(), MemOperand(x1, 24, PostIndex));
873  __ ld1(v3.V4H(), v4.V4H(), MemOperand(x0));
874  __ ld1(v3.V4H(), v4.V4H(), MemOperand(x1, x2, PostIndex));
875  __ ld1(v23.V4H(), v24.V4H(), MemOperand(x1, 16, PostIndex));
876  __ ld1(v26.V4H(), MemOperand(x0));
877  __ ld1(v1.V4H(), MemOperand(x1, x2, PostIndex));
878  __ ld1(v14.V4H(), MemOperand(x1, 8, PostIndex));
879  __ ld1(v26.V4S(), v27.V4S(), v28.V4S(), v29.V4S(), MemOperand(x0));
880  __ ld1(v28.V4S(),
881         v29.V4S(),
882         v30.V4S(),
883         v31.V4S(),
884         MemOperand(x1, x2, PostIndex));
885  __ ld1(v4.V4S(), v5.V4S(), v6.V4S(), v7.V4S(), MemOperand(x1, 64, PostIndex));
886  __ ld1(v2.V4S(), v3.V4S(), v4.V4S(), MemOperand(x0));
887  __ ld1(v22.V4S(), v23.V4S(), v24.V4S(), MemOperand(x1, x2, PostIndex));
888  __ ld1(v15.V4S(), v16.V4S(), v17.V4S(), MemOperand(x1, 48, PostIndex));
889  __ ld1(v20.V4S(), v21.V4S(), MemOperand(x0));
890  __ ld1(v30.V4S(), v31.V4S(), MemOperand(x1, x2, PostIndex));
891  __ ld1(v11.V4S(), v12.V4S(), MemOperand(x1, 32, PostIndex));
892  __ ld1(v15.V4S(), MemOperand(x0));
893  __ ld1(v12.V4S(), MemOperand(x1, x2, PostIndex));
894  __ ld1(v0.V4S(), MemOperand(x1, 16, PostIndex));
895  __ ld1(v17.V8B(), v18.V8B(), v19.V8B(), v20.V8B(), MemOperand(x0));
896  __ ld1(v5.V8B(), v6.V8B(), v7.V8B(), v8.V8B(), MemOperand(x1, x2, PostIndex));
897  __ ld1(v9.V8B(),
898         v10.V8B(),
899         v11.V8B(),
900         v12.V8B(),
901         MemOperand(x1, 32, PostIndex));
902  __ ld1(v4.V8B(), v5.V8B(), v6.V8B(), MemOperand(x0));
903  __ ld1(v2.V8B(), v3.V8B(), v4.V8B(), MemOperand(x1, x2, PostIndex));
904  __ ld1(v12.V8B(), v13.V8B(), v14.V8B(), MemOperand(x1, 24, PostIndex));
905  __ ld1(v10.V8B(), v11.V8B(), MemOperand(x0));
906  __ ld1(v11.V8B(), v12.V8B(), MemOperand(x1, x2, PostIndex));
907  __ ld1(v27.V8B(), v28.V8B(), MemOperand(x1, 16, PostIndex));
908  __ ld1(v31.V8B(), MemOperand(x0));
909  __ ld1(v10.V8B(), MemOperand(x1, x2, PostIndex));
910  __ ld1(v28.V8B(), MemOperand(x1, 8, PostIndex));
911  __ ld1(v5.V8H(), v6.V8H(), v7.V8H(), v8.V8H(), MemOperand(x0));
912  __ ld1(v2.V8H(), v3.V8H(), v4.V8H(), v5.V8H(), MemOperand(x1, x2, PostIndex));
913  __ ld1(v10.V8H(),
914         v11.V8H(),
915         v12.V8H(),
916         v13.V8H(),
917         MemOperand(x1, 64, PostIndex));
918  __ ld1(v26.V8H(), v27.V8H(), v28.V8H(), MemOperand(x0));
919  __ ld1(v3.V8H(), v4.V8H(), v5.V8H(), MemOperand(x1, x2, PostIndex));
920  __ ld1(v17.V8H(), v18.V8H(), v19.V8H(), MemOperand(x1, 48, PostIndex));
921  __ ld1(v4.V8H(), v5.V8H(), MemOperand(x0));
922  __ ld1(v21.V8H(), v22.V8H(), MemOperand(x1, x2, PostIndex));
923  __ ld1(v4.V8H(), v5.V8H(), MemOperand(x1, 32, PostIndex));
924  __ ld1(v9.V8H(), MemOperand(x0));
925  __ ld1(v27.V8H(), MemOperand(x1, x2, PostIndex));
926  __ ld1(v26.V8H(), MemOperand(x1, 16, PostIndex));
927  __ ld1(v19.B(), 1, MemOperand(x0));
928  __ ld1(v12.B(), 3, MemOperand(x1, x2, PostIndex));
929  __ ld1(v27.B(), 12, MemOperand(x1, 1, PostIndex));
930  __ ld1(v10.D(), 1, MemOperand(x0));
931  __ ld1(v26.D(), 1, MemOperand(x1, x2, PostIndex));
932  __ ld1(v7.D(), 1, MemOperand(x1, 8, PostIndex));
933  __ ld1(v19.H(), 5, MemOperand(x0));
934  __ ld1(v10.H(), 1, MemOperand(x1, x2, PostIndex));
935  __ ld1(v5.H(), 4, MemOperand(x1, 2, PostIndex));
936  __ ld1(v21.S(), 2, MemOperand(x0));
937  __ ld1(v13.S(), 2, MemOperand(x1, x2, PostIndex));
938  __ ld1(v1.S(), 2, MemOperand(x1, 4, PostIndex));
939  __ ld1r(v2.V16B(), MemOperand(x0));
940  __ ld1r(v2.V16B(), MemOperand(x1, x2, PostIndex));
941  __ ld1r(v22.V16B(), MemOperand(x1, 1, PostIndex));
942  __ ld1r(v25.V1D(), MemOperand(x0));
943  __ ld1r(v9.V1D(), MemOperand(x1, x2, PostIndex));
944  __ ld1r(v23.V1D(), MemOperand(x1, 8, PostIndex));
945  __ ld1r(v19.V2D(), MemOperand(x0));
946  __ ld1r(v21.V2D(), MemOperand(x1, x2, PostIndex));
947  __ ld1r(v30.V2D(), MemOperand(x1, 8, PostIndex));
948  __ ld1r(v24.V2S(), MemOperand(x0));
949  __ ld1r(v26.V2S(), MemOperand(x1, x2, PostIndex));
950  __ ld1r(v28.V2S(), MemOperand(x1, 4, PostIndex));
951  __ ld1r(v19.V4H(), MemOperand(x0));
952  __ ld1r(v1.V4H(), MemOperand(x1, x2, PostIndex));
953  __ ld1r(v21.V4H(), MemOperand(x1, 2, PostIndex));
954  __ ld1r(v15.V4S(), MemOperand(x0));
955  __ ld1r(v21.V4S(), MemOperand(x1, x2, PostIndex));
956  __ ld1r(v23.V4S(), MemOperand(x1, 4, PostIndex));
957  __ ld1r(v26.V8B(), MemOperand(x0));
958  __ ld1r(v14.V8B(), MemOperand(x1, x2, PostIndex));
959  __ ld1r(v19.V8B(), MemOperand(x1, 1, PostIndex));
960  __ ld1r(v13.V8H(), MemOperand(x0));
961  __ ld1r(v30.V8H(), MemOperand(x1, x2, PostIndex));
962  __ ld1r(v27.V8H(), MemOperand(x1, 2, PostIndex));
963  __ ld2(v21.V16B(), v22.V16B(), MemOperand(x0));
964  __ ld2(v21.V16B(), v22.V16B(), MemOperand(x1, x2, PostIndex));
965  __ ld2(v12.V16B(), v13.V16B(), MemOperand(x1, 32, PostIndex));
966  __ ld2(v14.V2D(), v15.V2D(), MemOperand(x0));
967  __ ld2(v0.V2D(), v1.V2D(), MemOperand(x1, x2, PostIndex));
968  __ ld2(v12.V2D(), v13.V2D(), MemOperand(x1, 32, PostIndex));
969  __ ld2(v27.V2S(), v28.V2S(), MemOperand(x0));
970  __ ld2(v2.V2S(), v3.V2S(), MemOperand(x1, x2, PostIndex));
971  __ ld2(v12.V2S(), v13.V2S(), MemOperand(x1, 16, PostIndex));
972  __ ld2(v9.V4H(), v10.V4H(), MemOperand(x0));
973  __ ld2(v23.V4H(), v24.V4H(), MemOperand(x1, x2, PostIndex));
974  __ ld2(v1.V4H(), v2.V4H(), MemOperand(x1, 16, PostIndex));
975  __ ld2(v20.V4S(), v21.V4S(), MemOperand(x0));
976  __ ld2(v10.V4S(), v11.V4S(), MemOperand(x1, x2, PostIndex));
977  __ ld2(v24.V4S(), v25.V4S(), MemOperand(x1, 32, PostIndex));
978  __ ld2(v17.V8B(), v18.V8B(), MemOperand(x0));
979  __ ld2(v13.V8B(), v14.V8B(), MemOperand(x1, x2, PostIndex));
980  __ ld2(v7.V8B(), v8.V8B(), MemOperand(x1, 16, PostIndex));
981  __ ld2(v30.V8H(), v31.V8H(), MemOperand(x0));
982  __ ld2(v4.V8H(), v5.V8H(), MemOperand(x1, x2, PostIndex));
983  __ ld2(v13.V8H(), v14.V8H(), MemOperand(x1, 32, PostIndex));
984  __ ld2(v5.B(), v6.B(), 12, MemOperand(x0));
985  __ ld2(v16.B(), v17.B(), 7, MemOperand(x1, x2, PostIndex));
986  __ ld2(v29.B(), v30.B(), 2, MemOperand(x1, 2, PostIndex));
987  __ ld2(v11.D(), v12.D(), 1, MemOperand(x0));
988  __ ld2(v26.D(), v27.D(), 0, MemOperand(x1, x2, PostIndex));
989  __ ld2(v25.D(), v26.D(), 0, MemOperand(x1, 16, PostIndex));
990  __ ld2(v18.H(), v19.H(), 7, MemOperand(x0));
991  __ ld2(v17.H(), v18.H(), 5, MemOperand(x1, x2, PostIndex));
992  __ ld2(v30.H(), v31.H(), 2, MemOperand(x1, 4, PostIndex));
993  __ ld2(v29.S(), v30.S(), 3, MemOperand(x0));
994  __ ld2(v28.S(), v29.S(), 0, MemOperand(x1, x2, PostIndex));
995  __ ld2(v6.S(), v7.S(), 1, MemOperand(x1, 8, PostIndex));
996  __ ld2r(v26.V16B(), v27.V16B(), MemOperand(x0));
997  __ ld2r(v21.V16B(), v22.V16B(), MemOperand(x1, x2, PostIndex));
998  __ ld2r(v5.V16B(), v6.V16B(), MemOperand(x1, 2, PostIndex));
999  __ ld2r(v26.V1D(), v27.V1D(), MemOperand(x0));
1000  __ ld2r(v14.V1D(), v15.V1D(), MemOperand(x1, x2, PostIndex));
1001  __ ld2r(v23.V1D(), v24.V1D(), MemOperand(x1, 16, PostIndex));
1002  __ ld2r(v11.V2D(), v12.V2D(), MemOperand(x0));
1003  __ ld2r(v29.V2D(), v30.V2D(), MemOperand(x1, x2, PostIndex));
1004  __ ld2r(v15.V2D(), v16.V2D(), MemOperand(x1, 16, PostIndex));
1005  __ ld2r(v26.V2S(), v27.V2S(), MemOperand(x0));
1006  __ ld2r(v22.V2S(), v23.V2S(), MemOperand(x1, x2, PostIndex));
1007  __ ld2r(v2.V2S(), v3.V2S(), MemOperand(x1, 8, PostIndex));
1008  __ ld2r(v2.V4H(), v3.V4H(), MemOperand(x0));
1009  __ ld2r(v9.V4H(), v10.V4H(), MemOperand(x1, x2, PostIndex));
1010  __ ld2r(v6.V4H(), v7.V4H(), MemOperand(x1, 4, PostIndex));
1011  __ ld2r(v7.V4S(), v8.V4S(), MemOperand(x0));
1012  __ ld2r(v19.V4S(), v20.V4S(), MemOperand(x1, x2, PostIndex));
1013  __ ld2r(v21.V4S(), v22.V4S(), MemOperand(x1, 8, PostIndex));
1014  __ ld2r(v26.V8B(), v27.V8B(), MemOperand(x0));
1015  __ ld2r(v20.V8B(), v21.V8B(), MemOperand(x1, x2, PostIndex));
1016  __ ld2r(v11.V8B(), v12.V8B(), MemOperand(x1, 2, PostIndex));
1017  __ ld2r(v12.V8H(), v13.V8H(), MemOperand(x0));
1018  __ ld2r(v6.V8H(), v7.V8H(), MemOperand(x1, x2, PostIndex));
1019  __ ld2r(v25.V8H(), v26.V8H(), MemOperand(x1, 4, PostIndex));
1020  __ ld3(v20.V16B(), v21.V16B(), v22.V16B(), MemOperand(x0));
1021  __ ld3(v28.V16B(), v29.V16B(), v30.V16B(), MemOperand(x1, x2, PostIndex));
1022  __ ld3(v20.V16B(), v21.V16B(), v22.V16B(), MemOperand(x1, 48, PostIndex));
1023  __ ld3(v21.V2D(), v22.V2D(), v23.V2D(), MemOperand(x0));
1024  __ ld3(v18.V2D(), v19.V2D(), v20.V2D(), MemOperand(x1, x2, PostIndex));
1025  __ ld3(v27.V2D(), v28.V2D(), v29.V2D(), MemOperand(x1, 48, PostIndex));
1026  __ ld3(v7.V2S(), v8.V2S(), v9.V2S(), MemOperand(x0));
1027  __ ld3(v20.V2S(), v21.V2S(), v22.V2S(), MemOperand(x1, x2, PostIndex));
1028  __ ld3(v26.V2S(), v27.V2S(), v28.V2S(), MemOperand(x1, 24, PostIndex));
1029  __ ld3(v27.V4H(), v28.V4H(), v29.V4H(), MemOperand(x0));
1030  __ ld3(v28.V4H(), v29.V4H(), v30.V4H(), MemOperand(x1, x2, PostIndex));
1031  __ ld3(v7.V4H(), v8.V4H(), v9.V4H(), MemOperand(x1, 24, PostIndex));
1032  __ ld3(v2.V4S(), v3.V4S(), v4.V4S(), MemOperand(x0));
1033  __ ld3(v24.V4S(), v25.V4S(), v26.V4S(), MemOperand(x1, x2, PostIndex));
1034  __ ld3(v11.V4S(), v12.V4S(), v13.V4S(), MemOperand(x1, 48, PostIndex));
1035  __ ld3(v29.V8B(), v30.V8B(), v31.V8B(), MemOperand(x0));
1036  __ ld3(v1.V8B(), v2.V8B(), v3.V8B(), MemOperand(x1, x2, PostIndex));
1037  __ ld3(v12.V8B(), v13.V8B(), v14.V8B(), MemOperand(x1, 24, PostIndex));
1038  __ ld3(v22.V8H(), v23.V8H(), v24.V8H(), MemOperand(x0));
1039  __ ld3(v13.V8H(), v14.V8H(), v15.V8H(), MemOperand(x1, x2, PostIndex));
1040  __ ld3(v28.V8H(), v29.V8H(), v30.V8H(), MemOperand(x1, 48, PostIndex));
1041  __ ld3(v21.B(), v22.B(), v23.B(), 11, MemOperand(x0));
1042  __ ld3(v5.B(), v6.B(), v7.B(), 9, MemOperand(x1, x2, PostIndex));
1043  __ ld3(v23.B(), v24.B(), v25.B(), 0, MemOperand(x1, 3, PostIndex));
1044  __ ld3(v16.D(), v17.D(), v18.D(), 0, MemOperand(x0));
1045  __ ld3(v30.D(), v31.D(), v0.D(), 0, MemOperand(x1, x2, PostIndex));
1046  __ ld3(v28.D(), v29.D(), v30.D(), 1, MemOperand(x1, 24, PostIndex));
1047  __ ld3(v13.H(), v14.H(), v15.H(), 2, MemOperand(x0));
1048  __ ld3(v22.H(), v23.H(), v24.H(), 7, MemOperand(x1, x2, PostIndex));
1049  __ ld3(v14.H(), v15.H(), v16.H(), 3, MemOperand(x1, 6, PostIndex));
1050  __ ld3(v22.S(), v23.S(), v24.S(), 3, MemOperand(x0));
1051  __ ld3(v30.S(), v31.S(), v0.S(), 2, MemOperand(x1, x2, PostIndex));
1052  __ ld3(v12.S(), v13.S(), v14.S(), 1, MemOperand(x1, 12, PostIndex));
1053  __ ld3r(v24.V16B(), v25.V16B(), v26.V16B(), MemOperand(x0));
1054  __ ld3r(v24.V16B(), v25.V16B(), v26.V16B(), MemOperand(x1, x2, PostIndex));
1055  __ ld3r(v3.V16B(), v4.V16B(), v5.V16B(), MemOperand(x1, 3, PostIndex));
1056  __ ld3r(v4.V1D(), v5.V1D(), v6.V1D(), MemOperand(x0));
1057  __ ld3r(v7.V1D(), v8.V1D(), v9.V1D(), MemOperand(x1, x2, PostIndex));
1058  __ ld3r(v17.V1D(), v18.V1D(), v19.V1D(), MemOperand(x1, 24, PostIndex));
1059  __ ld3r(v16.V2D(), v17.V2D(), v18.V2D(), MemOperand(x0));
1060  __ ld3r(v20.V2D(), v21.V2D(), v22.V2D(), MemOperand(x1, x2, PostIndex));
1061  __ ld3r(v14.V2D(), v15.V2D(), v16.V2D(), MemOperand(x1, 24, PostIndex));
1062  __ ld3r(v10.V2S(), v11.V2S(), v12.V2S(), MemOperand(x0));
1063  __ ld3r(v0.V2S(), v1.V2S(), v2.V2S(), MemOperand(x1, x2, PostIndex));
1064  __ ld3r(v23.V2S(), v24.V2S(), v25.V2S(), MemOperand(x1, 12, PostIndex));
1065  __ ld3r(v22.V4H(), v23.V4H(), v24.V4H(), MemOperand(x0));
1066  __ ld3r(v6.V4H(), v7.V4H(), v8.V4H(), MemOperand(x1, x2, PostIndex));
1067  __ ld3r(v7.V4H(), v8.V4H(), v9.V4H(), MemOperand(x1, 6, PostIndex));
1068  __ ld3r(v26.V4S(), v27.V4S(), v28.V4S(), MemOperand(x0));
1069  __ ld3r(v0.V4S(), v1.V4S(), v2.V4S(), MemOperand(x1, x2, PostIndex));
1070  __ ld3r(v30.V4S(), v31.V4S(), v0.V4S(), MemOperand(x1, 12, PostIndex));
1071  __ ld3r(v2.V8B(), v3.V8B(), v4.V8B(), MemOperand(x0));
1072  __ ld3r(v10.V8B(), v11.V8B(), v12.V8B(), MemOperand(x1, x2, PostIndex));
1073  __ ld3r(v28.V8B(), v29.V8B(), v30.V8B(), MemOperand(x1, 3, PostIndex));
1074  __ ld3r(v6.V8H(), v7.V8H(), v8.V8H(), MemOperand(x0));
1075  __ ld3r(v29.V8H(), v30.V8H(), v31.V8H(), MemOperand(x1, x2, PostIndex));
1076  __ ld3r(v7.V8H(), v8.V8H(), v9.V8H(), MemOperand(x1, 6, PostIndex));
1077  __ ld4(v3.V16B(), v4.V16B(), v5.V16B(), v6.V16B(), MemOperand(x0));
1078  __ ld4(v2.V16B(),
1079         v3.V16B(),
1080         v4.V16B(),
1081         v5.V16B(),
1082         MemOperand(x1, x2, PostIndex));
1083  __ ld4(v5.V16B(),
1084         v6.V16B(),
1085         v7.V16B(),
1086         v8.V16B(),
1087         MemOperand(x1, 64, PostIndex));
1088  __ ld4(v18.V2D(), v19.V2D(), v20.V2D(), v21.V2D(), MemOperand(x0));
1089  __ ld4(v4.V2D(), v5.V2D(), v6.V2D(), v7.V2D(), MemOperand(x1, x2, PostIndex));
1090  __ ld4(v29.V2D(),
1091         v30.V2D(),
1092         v31.V2D(),
1093         v0.V2D(),
1094         MemOperand(x1, 64, PostIndex));
1095  __ ld4(v27.V2S(), v28.V2S(), v29.V2S(), v30.V2S(), MemOperand(x0));
1096  __ ld4(v24.V2S(),
1097         v25.V2S(),
1098         v26.V2S(),
1099         v27.V2S(),
1100         MemOperand(x1, x2, PostIndex));
1101  __ ld4(v4.V2S(), v5.V2S(), v6.V2S(), v7.V2S(), MemOperand(x1, 32, PostIndex));
1102  __ ld4(v16.V4H(), v17.V4H(), v18.V4H(), v19.V4H(), MemOperand(x0));
1103  __ ld4(v23.V4H(),
1104         v24.V4H(),
1105         v25.V4H(),
1106         v26.V4H(),
1107         MemOperand(x1, x2, PostIndex));
1108  __ ld4(v2.V4H(), v3.V4H(), v4.V4H(), v5.V4H(), MemOperand(x1, 32, PostIndex));
1109  __ ld4(v7.V4S(), v8.V4S(), v9.V4S(), v10.V4S(), MemOperand(x0));
1110  __ ld4(v28.V4S(),
1111         v29.V4S(),
1112         v30.V4S(),
1113         v31.V4S(),
1114         MemOperand(x1, x2, PostIndex));
1115  __ ld4(v29.V4S(),
1116         v30.V4S(),
1117         v31.V4S(),
1118         v0.V4S(),
1119         MemOperand(x1, 64, PostIndex));
1120  __ ld4(v15.V8B(), v16.V8B(), v17.V8B(), v18.V8B(), MemOperand(x0));
1121  __ ld4(v27.V8B(),
1122         v28.V8B(),
1123         v29.V8B(),
1124         v30.V8B(),
1125         MemOperand(x1, x2, PostIndex));
1126  __ ld4(v5.V8B(), v6.V8B(), v7.V8B(), v8.V8B(), MemOperand(x1, 32, PostIndex));
1127  __ ld4(v25.V8H(), v26.V8H(), v27.V8H(), v28.V8H(), MemOperand(x0));
1128  __ ld4(v2.V8H(), v3.V8H(), v4.V8H(), v5.V8H(), MemOperand(x1, x2, PostIndex));
1129  __ ld4(v20.V8H(),
1130         v21.V8H(),
1131         v22.V8H(),
1132         v23.V8H(),
1133         MemOperand(x1, 64, PostIndex));
1134  __ ld4(v20.B(), v21.B(), v22.B(), v23.B(), 3, MemOperand(x0));
1135  __ ld4(v12.B(), v13.B(), v14.B(), v15.B(), 3, MemOperand(x1, x2, PostIndex));
1136  __ ld4(v27.B(), v28.B(), v29.B(), v30.B(), 6, MemOperand(x1, 4, PostIndex));
1137  __ ld4(v28.D(), v29.D(), v30.D(), v31.D(), 1, MemOperand(x0));
1138  __ ld4(v15.D(), v16.D(), v17.D(), v18.D(), 1, MemOperand(x1, x2, PostIndex));
1139  __ ld4(v16.D(), v17.D(), v18.D(), v19.D(), 1, MemOperand(x1, 32, PostIndex));
1140  __ ld4(v2.H(), v3.H(), v4.H(), v5.H(), 6, MemOperand(x0));
1141  __ ld4(v5.H(), v6.H(), v7.H(), v8.H(), 3, MemOperand(x1, x2, PostIndex));
1142  __ ld4(v7.H(), v8.H(), v9.H(), v10.H(), 6, MemOperand(x1, 8, PostIndex));
1143  __ ld4(v6.S(), v7.S(), v8.S(), v9.S(), 1, MemOperand(x0));
1144  __ ld4(v25.S(), v26.S(), v27.S(), v28.S(), 2, MemOperand(x1, x2, PostIndex));
1145  __ ld4(v8.S(), v9.S(), v10.S(), v11.S(), 3, MemOperand(x1, 16, PostIndex));
1146  __ ld4r(v14.V16B(), v15.V16B(), v16.V16B(), v17.V16B(), MemOperand(x0));
1147  __ ld4r(v13.V16B(),
1148          v14.V16B(),
1149          v15.V16B(),
1150          v16.V16B(),
1151          MemOperand(x1, x2, PostIndex));
1152  __ ld4r(v9.V16B(),
1153          v10.V16B(),
1154          v11.V16B(),
1155          v12.V16B(),
1156          MemOperand(x1, 4, PostIndex));
1157  __ ld4r(v8.V1D(), v9.V1D(), v10.V1D(), v11.V1D(), MemOperand(x0));
1158  __ ld4r(v4.V1D(),
1159          v5.V1D(),
1160          v6.V1D(),
1161          v7.V1D(),
1162          MemOperand(x1, x2, PostIndex));
1163  __ ld4r(v26.V1D(),
1164          v27.V1D(),
1165          v28.V1D(),
1166          v29.V1D(),
1167          MemOperand(x1, 32, PostIndex));
1168  __ ld4r(v19.V2D(), v20.V2D(), v21.V2D(), v22.V2D(), MemOperand(x0));
1169  __ ld4r(v28.V2D(),
1170          v29.V2D(),
1171          v30.V2D(),
1172          v31.V2D(),
1173          MemOperand(x1, x2, PostIndex));
1174  __ ld4r(v15.V2D(),
1175          v16.V2D(),
1176          v17.V2D(),
1177          v18.V2D(),
1178          MemOperand(x1, 32, PostIndex));
1179  __ ld4r(v31.V2S(), v0.V2S(), v1.V2S(), v2.V2S(), MemOperand(x0));
1180  __ ld4r(v28.V2S(),
1181          v29.V2S(),
1182          v30.V2S(),
1183          v31.V2S(),
1184          MemOperand(x1, x2, PostIndex));
1185  __ ld4r(v11.V2S(),
1186          v12.V2S(),
1187          v13.V2S(),
1188          v14.V2S(),
1189          MemOperand(x1, 16, PostIndex));
1190  __ ld4r(v19.V4H(), v20.V4H(), v21.V4H(), v22.V4H(), MemOperand(x0));
1191  __ ld4r(v22.V4H(),
1192          v23.V4H(),
1193          v24.V4H(),
1194          v25.V4H(),
1195          MemOperand(x1, x2, PostIndex));
1196  __ ld4r(v20.V4H(),
1197          v21.V4H(),
1198          v22.V4H(),
1199          v23.V4H(),
1200          MemOperand(x1, 8, PostIndex));
1201  __ ld4r(v16.V4S(), v17.V4S(), v18.V4S(), v19.V4S(), MemOperand(x0));
1202  __ ld4r(v25.V4S(),
1203          v26.V4S(),
1204          v27.V4S(),
1205          v28.V4S(),
1206          MemOperand(x1, x2, PostIndex));
1207  __ ld4r(v23.V4S(),
1208          v24.V4S(),
1209          v25.V4S(),
1210          v26.V4S(),
1211          MemOperand(x1, 16, PostIndex));
1212  __ ld4r(v22.V8B(), v23.V8B(), v24.V8B(), v25.V8B(), MemOperand(x0));
1213  __ ld4r(v27.V8B(),
1214          v28.V8B(),
1215          v29.V8B(),
1216          v30.V8B(),
1217          MemOperand(x1, x2, PostIndex));
1218  __ ld4r(v29.V8B(),
1219          v30.V8B(),
1220          v31.V8B(),
1221          v0.V8B(),
1222          MemOperand(x1, 4, PostIndex));
1223  __ ld4r(v28.V8H(), v29.V8H(), v30.V8H(), v31.V8H(), MemOperand(x0));
1224  __ ld4r(v25.V8H(),
1225          v26.V8H(),
1226          v27.V8H(),
1227          v28.V8H(),
1228          MemOperand(x1, x2, PostIndex));
1229  __ ld4r(v22.V8H(),
1230          v23.V8H(),
1231          v24.V8H(),
1232          v25.V8H(),
1233          MemOperand(x1, 8, PostIndex));
1234  __ mla(v29.V16B(), v7.V16B(), v26.V16B());
1235  __ mla(v6.V2S(), v4.V2S(), v14.V2S());
1236  __ mla(v9.V2S(), v11.V2S(), v0.S(), 2);
1237  __ mla(v5.V4H(), v17.V4H(), v25.V4H());
1238  __ mla(v24.V4H(), v7.V4H(), v11.H(), 3);
1239  __ mla(v12.V4S(), v3.V4S(), v4.V4S());
1240  __ mla(v10.V4S(), v7.V4S(), v7.S(), 3);
1241  __ mla(v3.V8B(), v16.V8B(), v9.V8B());
1242  __ mla(v19.V8H(), v22.V8H(), v18.V8H());
1243  __ mla(v6.V8H(), v2.V8H(), v0.H(), 0);
1244  __ mls(v23.V16B(), v10.V16B(), v11.V16B());
1245  __ mls(v14.V2S(), v31.V2S(), v22.V2S());
1246  __ mls(v28.V2S(), v13.V2S(), v1.S(), 3);
1247  __ mls(v2.V4H(), v19.V4H(), v13.V4H());
1248  __ mls(v18.V4H(), v15.V4H(), v12.H(), 6);
1249  __ mls(v6.V4S(), v11.V4S(), v16.V4S());
1250  __ mls(v23.V4S(), v16.V4S(), v10.S(), 2);
1251  __ mls(v26.V8B(), v13.V8B(), v23.V8B());
1252  __ mls(v10.V8H(), v10.V8H(), v12.V8H());
1253  __ mls(v14.V8H(), v0.V8H(), v14.H(), 7);
1254  __ mov(b22, v1.B(), 3);
1255  __ mov(d7, v13.D(), 1);
1256  __ mov(h26, v21.H(), 2);
1257  __ mov(s26, v19.S(), 0);
1258  __ mov(v26.V16B(), v11.V16B());
1259  __ mov(v20.V8B(), v0.V8B());
1260  __ mov(v19.B(), 13, v6.B(), 4);
1261  __ mov(v4.B(), 13, w19);
1262  __ mov(v11.D(), 1, v8.D(), 0);
1263  __ mov(v3.D(), 0, x30);
1264  __ mov(v29.H(), 4, v11.H(), 7);
1265  __ mov(v2.H(), 6, w6);
1266  __ mov(v22.S(), 0, v5.S(), 2);
1267  __ mov(v24.S(), 3, w8);
1268  __ mov(w18, v1.S(), 3);
1269  __ mov(x28, v21.D(), 0);
1270  __ movi(d24, 0xffff0000ffffff);
1271  __ movi(v29.V16B(), 0x80);
1272  __ movi(v12.V2D(), 0xffff00ff00ffff00);
1273  __ movi(v12.V2S(), 0xec, LSL, 24);
1274  __ movi(v10.V2S(), 0x4c, MSL, 16);
1275  __ movi(v26.V4H(), 0xc0, LSL);
1276  __ movi(v24.V4S(), 0x98, LSL, 16);
1277  __ movi(v1.V4S(), 0xde, MSL, 16);
1278  __ movi(v21.V8B(), 0x4d);
1279  __ movi(v29.V8H(), 0x69, LSL);
1280  __ mul(v1.V16B(), v15.V16B(), v17.V16B());
1281  __ mul(v21.V2S(), v19.V2S(), v29.V2S());
1282  __ mul(v19.V2S(), v5.V2S(), v3.S(), 0);
1283  __ mul(v29.V4H(), v11.V4H(), v2.V4H());
1284  __ mul(v2.V4H(), v7.V4H(), v0.H(), 0);
1285  __ mul(v25.V4S(), v26.V4S(), v16.V4S());
1286  __ mul(v26.V4S(), v6.V4S(), v15.S(), 2);
1287  __ mul(v11.V8B(), v15.V8B(), v31.V8B());
1288  __ mul(v20.V8H(), v31.V8H(), v15.V8H());
1289  __ mul(v29.V8H(), v5.V8H(), v9.H(), 4);
1290  __ mvn(v13.V16B(), v21.V16B());
1291  __ mvn(v28.V8B(), v19.V8B());
1292  __ mvni(v25.V2S(), 0xb8, LSL, 8);
1293  __ mvni(v17.V2S(), 0x6c, MSL, 16);
1294  __ mvni(v29.V4H(), 0x48, LSL);
1295  __ mvni(v20.V4S(), 0x7a, LSL, 16);
1296  __ mvni(v0.V4S(), 0x1e, MSL, 8);
1297  __ mvni(v31.V8H(), 0x3e, LSL);
1298  __ neg(d25, d11);
1299  __ neg(v4.V16B(), v9.V16B());
1300  __ neg(v11.V2D(), v25.V2D());
1301  __ neg(v7.V2S(), v18.V2S());
1302  __ neg(v7.V4H(), v15.V4H());
1303  __ neg(v17.V4S(), v18.V4S());
1304  __ neg(v20.V8B(), v17.V8B());
1305  __ neg(v0.V8H(), v11.V8H());
1306  __ orn(v13.V16B(), v11.V16B(), v31.V16B());
1307  __ orn(v22.V8B(), v16.V8B(), v22.V8B());
1308  __ orr(v17.V16B(), v17.V16B(), v23.V16B());
1309  __ orr(v8.V2S(), 0xe3);
1310  __ orr(v11.V4H(), 0x97, 8);
1311  __ orr(v7.V4S(), 0xab);
1312  __ orr(v8.V8B(), v4.V8B(), v3.V8B());
1313  __ orr(v31.V8H(), 0xb0, 8);
1314  __ pmul(v11.V16B(), v18.V16B(), v23.V16B());
1315  __ pmul(v8.V8B(), v24.V8B(), v5.V8B());
1316  __ pmull(v24.V8H(), v18.V8B(), v22.V8B());
1317  __ pmull2(v13.V8H(), v3.V16B(), v21.V16B());
1318  __ raddhn(v22.V2S(), v10.V2D(), v21.V2D());
1319  __ raddhn(v5.V4H(), v13.V4S(), v13.V4S());
1320  __ raddhn(v10.V8B(), v17.V8H(), v26.V8H());
1321  __ raddhn2(v9.V16B(), v29.V8H(), v13.V8H());
1322  __ raddhn2(v27.V4S(), v23.V2D(), v26.V2D());
1323  __ raddhn2(v0.V8H(), v29.V4S(), v7.V4S());
1324  __ rbit(v22.V16B(), v15.V16B());
1325  __ rbit(v30.V8B(), v3.V8B());
1326  __ rev16(v31.V16B(), v27.V16B());
1327  __ rev16(v12.V8B(), v26.V8B());
1328  __ rev32(v5.V16B(), v4.V16B());
1329  __ rev32(v16.V4H(), v26.V4H());
1330  __ rev32(v20.V8B(), v3.V8B());
1331  __ rev32(v20.V8H(), v28.V8H());
1332  __ rev64(v9.V16B(), v19.V16B());
1333  __ rev64(v5.V2S(), v16.V2S());
1334  __ rev64(v7.V4H(), v31.V4H());
1335  __ rev64(v15.V4S(), v26.V4S());
1336  __ rev64(v25.V8B(), v9.V8B());
1337  __ rev64(v11.V8H(), v5.V8H());
1338  __ rshrn(v18.V2S(), v13.V2D(), 1);
1339  __ rshrn(v25.V4H(), v30.V4S(), 2);
1340  __ rshrn(v13.V8B(), v9.V8H(), 8);
1341  __ rshrn2(v3.V16B(), v6.V8H(), 8);
1342  __ rshrn2(v0.V4S(), v29.V2D(), 25);
1343  __ rshrn2(v27.V8H(), v26.V4S(), 15);
1344  __ rsubhn(v15.V2S(), v25.V2D(), v4.V2D());
1345  __ rsubhn(v23.V4H(), v9.V4S(), v3.V4S());
1346  __ rsubhn(v6.V8B(), v30.V8H(), v24.V8H());
1347  __ rsubhn2(v4.V16B(), v24.V8H(), v20.V8H());
1348  __ rsubhn2(v1.V4S(), v23.V2D(), v22.V2D());
1349  __ rsubhn2(v19.V8H(), v2.V4S(), v20.V4S());
1350  __ saba(v28.V16B(), v9.V16B(), v25.V16B());
1351  __ saba(v9.V2S(), v28.V2S(), v20.V2S());
1352  __ saba(v17.V4H(), v22.V4H(), v22.V4H());
1353  __ saba(v29.V4S(), v5.V4S(), v27.V4S());
1354  __ saba(v20.V8B(), v21.V8B(), v18.V8B());
1355  __ saba(v27.V8H(), v17.V8H(), v30.V8H());
1356  __ sabal(v20.V2D(), v13.V2S(), v7.V2S());
1357  __ sabal(v4.V4S(), v12.V4H(), v4.V4H());
1358  __ sabal(v23.V8H(), v24.V8B(), v20.V8B());
1359  __ sabal2(v26.V2D(), v21.V4S(), v18.V4S());
1360  __ sabal2(v27.V4S(), v28.V8H(), v8.V8H());
1361  __ sabal2(v12.V8H(), v16.V16B(), v21.V16B());
1362  __ sabd(v0.V16B(), v15.V16B(), v13.V16B());
1363  __ sabd(v15.V2S(), v7.V2S(), v30.V2S());
1364  __ sabd(v17.V4H(), v17.V4H(), v12.V4H());
1365  __ sabd(v7.V4S(), v4.V4S(), v22.V4S());
1366  __ sabd(v23.V8B(), v3.V8B(), v26.V8B());
1367  __ sabd(v20.V8H(), v28.V8H(), v5.V8H());
1368  __ sabdl(v27.V2D(), v22.V2S(), v20.V2S());
1369  __ sabdl(v31.V4S(), v20.V4H(), v23.V4H());
1370  __ sabdl(v0.V8H(), v20.V8B(), v27.V8B());
1371  __ sabdl2(v31.V2D(), v11.V4S(), v3.V4S());
1372  __ sabdl2(v26.V4S(), v11.V8H(), v27.V8H());
1373  __ sabdl2(v6.V8H(), v8.V16B(), v18.V16B());
1374  __ sadalp(v8.V1D(), v26.V2S());
1375  __ sadalp(v12.V2D(), v26.V4S());
1376  __ sadalp(v12.V2S(), v26.V4H());
1377  __ sadalp(v4.V4H(), v1.V8B());
1378  __ sadalp(v15.V4S(), v17.V8H());
1379  __ sadalp(v21.V8H(), v25.V16B());
1380  __ saddl(v5.V2D(), v10.V2S(), v14.V2S());
1381  __ saddl(v18.V4S(), v3.V4H(), v15.V4H());
1382  __ saddl(v15.V8H(), v2.V8B(), v23.V8B());
1383  __ saddl2(v16.V2D(), v16.V4S(), v27.V4S());
1384  __ saddl2(v6.V4S(), v24.V8H(), v0.V8H());
1385  __ saddl2(v7.V8H(), v20.V16B(), v28.V16B());
1386  __ saddlp(v10.V1D(), v25.V2S());
1387  __ saddlp(v15.V2D(), v16.V4S());
1388  __ saddlp(v18.V2S(), v10.V4H());
1389  __ saddlp(v29.V4H(), v26.V8B());
1390  __ saddlp(v10.V4S(), v1.V8H());
1391  __ saddlp(v0.V8H(), v21.V16B());
1392  __ saddlv(d12, v7.V4S());
1393  __ saddlv(h14, v28.V16B());
1394  __ saddlv(h30, v30.V8B());
1395  __ saddlv(s27, v3.V4H());
1396  __ saddlv(s16, v16.V8H());
1397  __ saddw(v24.V2D(), v11.V2D(), v18.V2S());
1398  __ saddw(v13.V4S(), v12.V4S(), v6.V4H());
1399  __ saddw(v19.V8H(), v19.V8H(), v7.V8B());
1400  __ saddw2(v27.V2D(), v9.V2D(), v26.V4S());
1401  __ saddw2(v19.V4S(), v23.V4S(), v21.V8H());
1402  __ saddw2(v15.V8H(), v25.V8H(), v30.V16B());
1403  __ shadd(v7.V16B(), v4.V16B(), v9.V16B());
1404  __ shadd(v29.V2S(), v25.V2S(), v24.V2S());
1405  __ shadd(v31.V4H(), v10.V4H(), v13.V4H());
1406  __ shadd(v21.V4S(), v16.V4S(), v8.V4S());
1407  __ shadd(v14.V8B(), v29.V8B(), v22.V8B());
1408  __ shadd(v19.V8H(), v24.V8H(), v20.V8H());
1409  __ shl(d22, d25, 23);
1410  __ shl(v5.V16B(), v17.V16B(), 7);
1411  __ shl(v2.V2D(), v4.V2D(), 21);
1412  __ shl(v4.V2S(), v3.V2S(), 26);
1413  __ shl(v3.V4H(), v28.V4H(), 8);
1414  __ shl(v4.V4S(), v31.V4S(), 24);
1415  __ shl(v18.V8B(), v16.V8B(), 2);
1416  __ shl(v0.V8H(), v11.V8H(), 3);
1417  __ shll(v5.V2D(), v24.V2S(), 32);
1418  __ shll(v26.V4S(), v20.V4H(), 16);
1419  __ shll(v5.V8H(), v9.V8B(), 8);
1420  __ shll2(v21.V2D(), v28.V4S(), 32);
1421  __ shll2(v22.V4S(), v1.V8H(), 16);
1422  __ shll2(v30.V8H(), v25.V16B(), 8);
1423  __ shrn(v5.V2S(), v1.V2D(), 28);
1424  __ shrn(v29.V4H(), v18.V4S(), 7);
1425  __ shrn(v17.V8B(), v29.V8H(), 2);
1426  __ shrn2(v5.V16B(), v30.V8H(), 3);
1427  __ shrn2(v24.V4S(), v1.V2D(), 1);
1428  __ shrn2(v5.V8H(), v14.V4S(), 16);
1429  __ shsub(v30.V16B(), v22.V16B(), v23.V16B());
1430  __ shsub(v22.V2S(), v27.V2S(), v25.V2S());
1431  __ shsub(v13.V4H(), v22.V4H(), v1.V4H());
1432  __ shsub(v10.V4S(), v8.V4S(), v23.V4S());
1433  __ shsub(v6.V8B(), v9.V8B(), v31.V8B());
1434  __ shsub(v8.V8H(), v31.V8H(), v8.V8H());
1435  __ sli(d19, d29, 20);
1436  __ sli(v9.V16B(), v24.V16B(), 0);
1437  __ sli(v22.V2D(), v9.V2D(), 10);
1438  __ sli(v11.V2S(), v27.V2S(), 20);
1439  __ sli(v16.V4H(), v15.V4H(), 5);
1440  __ sli(v8.V4S(), v8.V4S(), 25);
1441  __ sli(v10.V8B(), v30.V8B(), 0);
1442  __ sli(v7.V8H(), v28.V8H(), 6);
1443  __ smax(v18.V16B(), v8.V16B(), v1.V16B());
1444  __ smax(v30.V2S(), v5.V2S(), v1.V2S());
1445  __ smax(v17.V4H(), v25.V4H(), v19.V4H());
1446  __ smax(v1.V4S(), v24.V4S(), v31.V4S());
1447  __ smax(v17.V8B(), v24.V8B(), v24.V8B());
1448  __ smax(v11.V8H(), v26.V8H(), v10.V8H());
1449  __ smaxp(v12.V16B(), v14.V16B(), v7.V16B());
1450  __ smaxp(v31.V2S(), v24.V2S(), v6.V2S());
1451  __ smaxp(v10.V4H(), v29.V4H(), v10.V4H());
1452  __ smaxp(v18.V4S(), v11.V4S(), v7.V4S());
1453  __ smaxp(v21.V8B(), v0.V8B(), v18.V8B());
1454  __ smaxp(v26.V8H(), v8.V8H(), v15.V8H());
1455  __ smaxv(b4, v5.V16B());
1456  __ smaxv(b23, v0.V8B());
1457  __ smaxv(h6, v0.V4H());
1458  __ smaxv(h24, v8.V8H());
1459  __ smaxv(s3, v16.V4S());
1460  __ smin(v24.V16B(), v8.V16B(), v18.V16B());
1461  __ smin(v29.V2S(), v8.V2S(), v23.V2S());
1462  __ smin(v6.V4H(), v11.V4H(), v21.V4H());
1463  __ smin(v24.V4S(), v23.V4S(), v15.V4S());
1464  __ smin(v8.V8B(), v16.V8B(), v4.V8B());
1465  __ smin(v12.V8H(), v1.V8H(), v10.V8H());
1466  __ sminp(v13.V16B(), v18.V16B(), v28.V16B());
1467  __ sminp(v22.V2S(), v28.V2S(), v16.V2S());
1468  __ sminp(v15.V4H(), v12.V4H(), v5.V4H());
1469  __ sminp(v15.V4S(), v17.V4S(), v8.V4S());
1470  __ sminp(v21.V8B(), v2.V8B(), v6.V8B());
1471  __ sminp(v21.V8H(), v12.V8H(), v6.V8H());
1472  __ sminv(b8, v6.V16B());
1473  __ sminv(b6, v18.V8B());
1474  __ sminv(h20, v1.V4H());
1475  __ sminv(h7, v17.V8H());
1476  __ sminv(s21, v4.V4S());
1477  __ smlal(v24.V2D(), v14.V2S(), v21.V2S());
1478  __ smlal(v31.V2D(), v3.V2S(), v14.S(), 2);
1479  __ smlal(v7.V4S(), v20.V4H(), v21.V4H());
1480  __ smlal(v19.V4S(), v16.V4H(), v9.H(), 3);
1481  __ smlal(v29.V8H(), v14.V8B(), v1.V8B());
1482  __ smlal2(v30.V2D(), v26.V4S(), v16.V4S());
1483  __ smlal2(v31.V2D(), v30.V4S(), v1.S(), 0);
1484  __ smlal2(v17.V4S(), v6.V8H(), v3.V8H());
1485  __ smlal2(v11.V4S(), v31.V8H(), v5.H(), 7);
1486  __ smlal2(v30.V8H(), v16.V16B(), v29.V16B());
1487  __ smlsl(v1.V2D(), v20.V2S(), v17.V2S());
1488  __ smlsl(v29.V2D(), v12.V2S(), v5.S(), 3);
1489  __ smlsl(v0.V4S(), v26.V4H(), v1.V4H());
1490  __ smlsl(v3.V4S(), v5.V4H(), v6.H(), 5);
1491  __ smlsl(v4.V8H(), v0.V8B(), v26.V8B());
1492  __ smlsl2(v14.V2D(), v14.V4S(), v5.V4S());
1493  __ smlsl2(v15.V2D(), v5.V4S(), v0.S(), 1);
1494  __ smlsl2(v29.V4S(), v17.V8H(), v31.V8H());
1495  __ smlsl2(v6.V4S(), v15.V8H(), v9.H(), 6);
1496  __ smlsl2(v30.V8H(), v15.V16B(), v15.V16B());
1497  __ smov(w21, v6.B(), 3);
1498  __ smov(w13, v26.H(), 7);
1499  __ smov(x24, v16.B(), 7);
1500  __ smov(x7, v4.H(), 3);
1501  __ smov(x29, v7.S(), 1);
1502  __ smull(v4.V2D(), v29.V2S(), v17.V2S());
1503  __ smull(v30.V2D(), v21.V2S(), v6.S(), 2);
1504  __ smull(v23.V4S(), v5.V4H(), v23.V4H());
1505  __ smull(v8.V4S(), v9.V4H(), v2.H(), 1);
1506  __ smull(v31.V8H(), v17.V8B(), v1.V8B());
1507  __ smull2(v3.V2D(), v3.V4S(), v23.V4S());
1508  __ smull2(v15.V2D(), v29.V4S(), v6.S(), 1);
1509  __ smull2(v19.V4S(), v20.V8H(), v30.V8H());
1510  __ smull2(v6.V4S(), v10.V8H(), v7.H(), 4);
1511  __ smull2(v25.V8H(), v8.V16B(), v27.V16B());
1512  __ sqabs(b3, b15);
1513  __ sqabs(d14, d9);
1514  __ sqabs(h31, h28);
1515  __ sqabs(s8, s0);
1516  __ sqabs(v14.V16B(), v7.V16B());
1517  __ sqabs(v23.V2D(), v19.V2D());
1518  __ sqabs(v10.V2S(), v24.V2S());
1519  __ sqabs(v31.V4H(), v19.V4H());
1520  __ sqabs(v23.V4S(), v0.V4S());
1521  __ sqabs(v29.V8B(), v23.V8B());
1522  __ sqabs(v17.V8H(), v21.V8H());
1523  __ sqadd(b9, b23, b13);
1524  __ sqadd(d2, d25, d26);
1525  __ sqadd(h7, h29, h25);
1526  __ sqadd(s11, s7, s24);
1527  __ sqadd(v20.V16B(), v16.V16B(), v29.V16B());
1528  __ sqadd(v23.V2D(), v30.V2D(), v28.V2D());
1529  __ sqadd(v8.V2S(), v19.V2S(), v2.V2S());
1530  __ sqadd(v20.V4H(), v12.V4H(), v31.V4H());
1531  __ sqadd(v14.V4S(), v15.V4S(), v17.V4S());
1532  __ sqadd(v2.V8B(), v29.V8B(), v13.V8B());
1533  __ sqadd(v7.V8H(), v19.V8H(), v14.V8H());
1534  __ sqdmlal(d15, s5, s30);
1535  __ sqdmlal(d24, s10, v2.S(), 3);
1536  __ sqdmlal(s9, h19, h8);
1537  __ sqdmlal(s14, h1, v12.H(), 3);
1538  __ sqdmlal(v30.V2D(), v5.V2S(), v31.V2S());
1539  __ sqdmlal(v25.V2D(), v14.V2S(), v10.S(), 1);
1540  __ sqdmlal(v19.V4S(), v17.V4H(), v16.V4H());
1541  __ sqdmlal(v8.V4S(), v5.V4H(), v8.H(), 1);
1542  __ sqdmlal2(v1.V2D(), v23.V4S(), v3.V4S());
1543  __ sqdmlal2(v19.V2D(), v0.V4S(), v9.S(), 0);
1544  __ sqdmlal2(v26.V4S(), v22.V8H(), v11.V8H());
1545  __ sqdmlal2(v6.V4S(), v28.V8H(), v13.H(), 4);
1546  __ sqdmlsl(d10, s29, s20);
1547  __ sqdmlsl(d10, s9, v10.S(), 1);
1548  __ sqdmlsl(s30, h9, h24);
1549  __ sqdmlsl(s13, h24, v6.H(), 1);
1550  __ sqdmlsl(v27.V2D(), v10.V2S(), v20.V2S());
1551  __ sqdmlsl(v23.V2D(), v23.V2S(), v3.S(), 3);
1552  __ sqdmlsl(v7.V4S(), v17.V4H(), v29.V4H());
1553  __ sqdmlsl(v22.V4S(), v21.V4H(), v3.H(), 4);
1554  __ sqdmlsl2(v12.V2D(), v7.V4S(), v22.V4S());
1555  __ sqdmlsl2(v20.V2D(), v25.V4S(), v8.S(), 0);
1556  __ sqdmlsl2(v25.V4S(), v26.V8H(), v18.V8H());
1557  __ sqdmlsl2(v25.V4S(), v19.V8H(), v5.H(), 0);
1558  __ sqdmulh(h17, h27, h12);
1559  __ sqdmulh(h16, h5, v11.H(), 0);
1560  __ sqdmulh(s1, s19, s16);
1561  __ sqdmulh(s1, s16, v2.S(), 0);
1562  __ sqdmulh(v28.V2S(), v1.V2S(), v8.V2S());
1563  __ sqdmulh(v28.V2S(), v8.V2S(), v3.S(), 0);
1564  __ sqdmulh(v11.V4H(), v25.V4H(), v5.V4H());
1565  __ sqdmulh(v30.V4H(), v14.V4H(), v8.H(), 5);
1566  __ sqdmulh(v25.V4S(), v21.V4S(), v13.V4S());
1567  __ sqdmulh(v23.V4S(), v2.V4S(), v10.S(), 3);
1568  __ sqdmulh(v26.V8H(), v5.V8H(), v23.V8H());
1569  __ sqdmulh(v4.V8H(), v22.V8H(), v4.H(), 3);
1570  __ sqdmull(d25, s2, s26);
1571  __ sqdmull(d30, s14, v5.S(), 1);
1572  __ sqdmull(s29, h18, h11);
1573  __ sqdmull(s11, h13, v7.H(), 6);
1574  __ sqdmull(v23.V2D(), v9.V2S(), v8.V2S());
1575  __ sqdmull(v18.V2D(), v29.V2S(), v4.S(), 1);
1576  __ sqdmull(v17.V4S(), v24.V4H(), v7.V4H());
1577  __ sqdmull(v8.V4S(), v15.V4H(), v5.H(), 1);
1578  __ sqdmull2(v28.V2D(), v14.V4S(), v2.V4S());
1579  __ sqdmull2(v1.V2D(), v24.V4S(), v13.S(), 2);
1580  __ sqdmull2(v11.V4S(), v17.V8H(), v31.V8H());
1581  __ sqdmull2(v1.V4S(), v20.V8H(), v11.H(), 3);
1582  __ sqneg(b2, b0);
1583  __ sqneg(d24, d2);
1584  __ sqneg(h29, h3);
1585  __ sqneg(s4, s9);
1586  __ sqneg(v14.V16B(), v29.V16B());
1587  __ sqneg(v30.V2D(), v12.V2D());
1588  __ sqneg(v28.V2S(), v26.V2S());
1589  __ sqneg(v4.V4H(), v4.V4H());
1590  __ sqneg(v9.V4S(), v8.V4S());
1591  __ sqneg(v20.V8B(), v20.V8B());
1592  __ sqneg(v27.V8H(), v10.V8H());
1593  __ sqrdmulh(h7, h24, h0);
1594  __ sqrdmulh(h14, h3, v4.H(), 6);
1595  __ sqrdmulh(s27, s19, s24);
1596  __ sqrdmulh(s31, s21, v4.S(), 0);
1597  __ sqrdmulh(v18.V2S(), v25.V2S(), v1.V2S());
1598  __ sqrdmulh(v22.V2S(), v5.V2S(), v13.S(), 0);
1599  __ sqrdmulh(v22.V4H(), v24.V4H(), v9.V4H());
1600  __ sqrdmulh(v13.V4H(), v2.V4H(), v12.H(), 6);
1601  __ sqrdmulh(v9.V4S(), v27.V4S(), v2.V4S());
1602  __ sqrdmulh(v3.V4S(), v23.V4S(), v7.S(), 1);
1603  __ sqrdmulh(v2.V8H(), v0.V8H(), v7.V8H());
1604  __ sqrdmulh(v16.V8H(), v9.V8H(), v8.H(), 2);
1605  __ sqrshl(b8, b21, b13);
1606  __ sqrshl(d29, d7, d20);
1607  __ sqrshl(h28, h14, h10);
1608  __ sqrshl(s26, s18, s2);
1609  __ sqrshl(v18.V16B(), v31.V16B(), v26.V16B());
1610  __ sqrshl(v28.V2D(), v4.V2D(), v0.V2D());
1611  __ sqrshl(v3.V2S(), v6.V2S(), v0.V2S());
1612  __ sqrshl(v1.V4H(), v18.V4H(), v22.V4H());
1613  __ sqrshl(v16.V4S(), v25.V4S(), v7.V4S());
1614  __ sqrshl(v0.V8B(), v21.V8B(), v5.V8B());
1615  __ sqrshl(v30.V8H(), v19.V8H(), v8.V8H());
1616  __ sqrshrn(b6, h21, 4);
1617  __ sqrshrn(h14, s17, 11);
1618  __ sqrshrn(s25, d27, 10);
1619  __ sqrshrn(v6.V2S(), v13.V2D(), 18);
1620  __ sqrshrn(v5.V4H(), v9.V4S(), 15);
1621  __ sqrshrn(v19.V8B(), v12.V8H(), 1);
1622  __ sqrshrn2(v19.V16B(), v21.V8H(), 7);
1623  __ sqrshrn2(v29.V4S(), v24.V2D(), 13);
1624  __ sqrshrn2(v12.V8H(), v2.V4S(), 10);
1625  __ sqrshrun(b16, h9, 5);
1626  __ sqrshrun(h3, s24, 15);
1627  __ sqrshrun(s16, d18, 8);
1628  __ sqrshrun(v28.V2S(), v23.V2D(), 8);
1629  __ sqrshrun(v31.V4H(), v25.V4S(), 10);
1630  __ sqrshrun(v19.V8B(), v23.V8H(), 2);
1631  __ sqrshrun2(v24.V16B(), v0.V8H(), 8);
1632  __ sqrshrun2(v22.V4S(), v1.V2D(), 23);
1633  __ sqrshrun2(v28.V8H(), v21.V4S(), 13);
1634  __ sqshl(b6, b21, b8);
1635  __ sqshl(b11, b26, 2);
1636  __ sqshl(d29, d0, d4);
1637  __ sqshl(d21, d7, 35);
1638  __ sqshl(h20, h25, h17);
1639  __ sqshl(h20, h0, 8);
1640  __ sqshl(s29, s13, s4);
1641  __ sqshl(s10, s11, 20);
1642  __ sqshl(v8.V16B(), v18.V16B(), v28.V16B());
1643  __ sqshl(v29.V16B(), v29.V16B(), 2);
1644  __ sqshl(v8.V2D(), v31.V2D(), v16.V2D());
1645  __ sqshl(v7.V2D(), v14.V2D(), 37);
1646  __ sqshl(v0.V2S(), v26.V2S(), v7.V2S());
1647  __ sqshl(v5.V2S(), v11.V2S(), 19);
1648  __ sqshl(v11.V4H(), v30.V4H(), v0.V4H());
1649  __ sqshl(v1.V4H(), v18.V4H(), 7);
1650  __ sqshl(v22.V4S(), v3.V4S(), v30.V4S());
1651  __ sqshl(v16.V4S(), v15.V4S(), 28);
1652  __ sqshl(v6.V8B(), v28.V8B(), v25.V8B());
1653  __ sqshl(v0.V8B(), v15.V8B(), 0);
1654  __ sqshl(v6.V8H(), v16.V8H(), v30.V8H());
1655  __ sqshl(v3.V8H(), v20.V8H(), 14);
1656  __ sqshlu(b13, b14, 6);
1657  __ sqshlu(d0, d16, 44);
1658  __ sqshlu(h5, h29, 15);
1659  __ sqshlu(s29, s8, 13);
1660  __ sqshlu(v27.V16B(), v20.V16B(), 2);
1661  __ sqshlu(v24.V2D(), v12.V2D(), 11);
1662  __ sqshlu(v12.V2S(), v19.V2S(), 22);
1663  __ sqshlu(v8.V4H(), v12.V4H(), 11);
1664  __ sqshlu(v18.V4S(), v3.V4S(), 8);
1665  __ sqshlu(v3.V8B(), v10.V8B(), 1);
1666  __ sqshlu(v30.V8H(), v24.V8H(), 4);
1667  __ sqshrn(b1, h28, 1);
1668  __ sqshrn(h31, s7, 10);
1669  __ sqshrn(s4, d10, 24);
1670  __ sqshrn(v10.V2S(), v1.V2D(), 29);
1671  __ sqshrn(v3.V4H(), v13.V4S(), 14);
1672  __ sqshrn(v27.V8B(), v6.V8H(), 7);
1673  __ sqshrn2(v14.V16B(), v23.V8H(), 1);
1674  __ sqshrn2(v25.V4S(), v22.V2D(), 27);
1675  __ sqshrn2(v31.V8H(), v12.V4S(), 10);
1676  __ sqshrun(b9, h0, 1);
1677  __ sqshrun(h11, s6, 7);
1678  __ sqshrun(s13, d12, 13);
1679  __ sqshrun(v10.V2S(), v30.V2D(), 1);
1680  __ sqshrun(v31.V4H(), v3.V4S(), 11);
1681  __ sqshrun(v28.V8B(), v30.V8H(), 8);
1682  __ sqshrun2(v16.V16B(), v27.V8H(), 3);
1683  __ sqshrun2(v27.V4S(), v14.V2D(), 18);
1684  __ sqshrun2(v23.V8H(), v14.V4S(), 1);
1685  __ sqsub(b19, b29, b11);
1686  __ sqsub(d21, d31, d6);
1687  __ sqsub(h18, h10, h19);
1688  __ sqsub(s6, s5, s0);
1689  __ sqsub(v21.V16B(), v22.V16B(), v0.V16B());
1690  __ sqsub(v22.V2D(), v10.V2D(), v17.V2D());
1691  __ sqsub(v8.V2S(), v21.V2S(), v2.V2S());
1692  __ sqsub(v18.V4H(), v25.V4H(), v27.V4H());
1693  __ sqsub(v13.V4S(), v3.V4S(), v6.V4S());
1694  __ sqsub(v28.V8B(), v29.V8B(), v16.V8B());
1695  __ sqsub(v17.V8H(), v6.V8H(), v10.V8H());
1696  __ sqxtn(b27, h26);
1697  __ sqxtn(h17, s11);
1698  __ sqxtn(s22, d31);
1699  __ sqxtn(v26.V2S(), v5.V2D());
1700  __ sqxtn(v13.V4H(), v7.V4S());
1701  __ sqxtn(v19.V8B(), v19.V8H());
1702  __ sqxtn2(v19.V16B(), v3.V8H());
1703  __ sqxtn2(v23.V4S(), v1.V2D());
1704  __ sqxtn2(v13.V8H(), v3.V4S());
1705  __ sqxtun(b26, h9);
1706  __ sqxtun(h19, s12);
1707  __ sqxtun(s3, d6);
1708  __ sqxtun(v29.V2S(), v26.V2D());
1709  __ sqxtun(v26.V4H(), v10.V4S());
1710  __ sqxtun(v7.V8B(), v29.V8H());
1711  __ sqxtun2(v21.V16B(), v14.V8H());
1712  __ sqxtun2(v24.V4S(), v15.V2D());
1713  __ sqxtun2(v30.V8H(), v1.V4S());
1714  __ srhadd(v21.V16B(), v17.V16B(), v15.V16B());
1715  __ srhadd(v28.V2S(), v21.V2S(), v29.V2S());
1716  __ srhadd(v9.V4H(), v1.V4H(), v30.V4H());
1717  __ srhadd(v24.V4S(), v0.V4S(), v2.V4S());
1718  __ srhadd(v6.V8B(), v17.V8B(), v15.V8B());
1719  __ srhadd(v5.V8H(), v7.V8H(), v21.V8H());
1720  __ sri(d14, d14, 49);
1721  __ sri(v23.V16B(), v8.V16B(), 4);
1722  __ sri(v20.V2D(), v13.V2D(), 20);
1723  __ sri(v16.V2S(), v2.V2S(), 24);
1724  __ sri(v5.V4H(), v23.V4H(), 11);
1725  __ sri(v27.V4S(), v15.V4S(), 23);
1726  __ sri(v19.V8B(), v29.V8B(), 4);
1727  __ sri(v7.V8H(), v29.V8H(), 3);
1728  __ srshl(d2, d9, d26);
1729  __ srshl(v29.V16B(), v17.V16B(), v11.V16B());
1730  __ srshl(v8.V2D(), v15.V2D(), v4.V2D());
1731  __ srshl(v25.V2S(), v17.V2S(), v8.V2S());
1732  __ srshl(v19.V4H(), v7.V4H(), v7.V4H());
1733  __ srshl(v13.V4S(), v2.V4S(), v17.V4S());
1734  __ srshl(v22.V8B(), v6.V8B(), v21.V8B());
1735  __ srshl(v10.V8H(), v17.V8H(), v4.V8H());
1736  __ srshr(d21, d18, 45);
1737  __ srshr(v3.V16B(), v11.V16B(), 7);
1738  __ srshr(v21.V2D(), v26.V2D(), 53);
1739  __ srshr(v11.V2S(), v5.V2S(), 28);
1740  __ srshr(v7.V4H(), v18.V4H(), 12);
1741  __ srshr(v7.V4S(), v3.V4S(), 30);
1742  __ srshr(v14.V8B(), v2.V8B(), 6);
1743  __ srshr(v21.V8H(), v20.V8H(), 3);
1744  __ srsra(d21, d30, 63);
1745  __ srsra(v27.V16B(), v30.V16B(), 6);
1746  __ srsra(v20.V2D(), v12.V2D(), 27);
1747  __ srsra(v0.V2S(), v17.V2S(), 5);
1748  __ srsra(v14.V4H(), v16.V4H(), 15);
1749  __ srsra(v18.V4S(), v3.V4S(), 20);
1750  __ srsra(v21.V8B(), v1.V8B(), 1);
1751  __ srsra(v31.V8H(), v25.V8H(), 2);
1752  __ sshl(d1, d13, d9);
1753  __ sshl(v17.V16B(), v31.V16B(), v15.V16B());
1754  __ sshl(v13.V2D(), v16.V2D(), v0.V2D());
1755  __ sshl(v0.V2S(), v7.V2S(), v22.V2S());
1756  __ sshl(v23.V4H(), v19.V4H(), v4.V4H());
1757  __ sshl(v5.V4S(), v5.V4S(), v11.V4S());
1758  __ sshl(v23.V8B(), v27.V8B(), v7.V8B());
1759  __ sshl(v29.V8H(), v10.V8H(), v5.V8H());
1760  __ sshll(v0.V2D(), v2.V2S(), 23);
1761  __ sshll(v11.V4S(), v8.V4H(), 8);
1762  __ sshll(v4.V8H(), v29.V8B(), 1);
1763  __ sshll2(v10.V2D(), v4.V4S(), 14);
1764  __ sshll2(v26.V4S(), v31.V8H(), 6);
1765  __ sshll2(v3.V8H(), v26.V16B(), 4);
1766  __ sshr(d19, d21, 20);
1767  __ sshr(v15.V16B(), v23.V16B(), 5);
1768  __ sshr(v17.V2D(), v14.V2D(), 38);
1769  __ sshr(v3.V2S(), v29.V2S(), 23);
1770  __ sshr(v23.V4H(), v27.V4H(), 4);
1771  __ sshr(v28.V4S(), v3.V4S(), 4);
1772  __ sshr(v14.V8B(), v2.V8B(), 6);
1773  __ sshr(v3.V8H(), v8.V8H(), 6);
1774  __ ssra(d12, d28, 44);
1775  __ ssra(v29.V16B(), v31.V16B(), 4);
1776  __ ssra(v3.V2D(), v0.V2D(), 24);
1777  __ ssra(v14.V2S(), v28.V2S(), 6);
1778  __ ssra(v18.V4H(), v8.V4H(), 7);
1779  __ ssra(v31.V4S(), v14.V4S(), 24);
1780  __ ssra(v28.V8B(), v26.V8B(), 5);
1781  __ ssra(v9.V8H(), v9.V8H(), 14);
1782  __ ssubl(v13.V2D(), v14.V2S(), v3.V2S());
1783  __ ssubl(v5.V4S(), v16.V4H(), v8.V4H());
1784  __ ssubl(v0.V8H(), v28.V8B(), v6.V8B());
1785  __ ssubl2(v5.V2D(), v13.V4S(), v25.V4S());
1786  __ ssubl2(v3.V4S(), v15.V8H(), v17.V8H());
1787  __ ssubl2(v15.V8H(), v15.V16B(), v14.V16B());
1788  __ ssubw(v25.V2D(), v23.V2D(), v26.V2S());
1789  __ ssubw(v21.V4S(), v18.V4S(), v24.V4H());
1790  __ ssubw(v30.V8H(), v22.V8H(), v3.V8B());
1791  __ ssubw2(v16.V2D(), v24.V2D(), v28.V4S());
1792  __ ssubw2(v31.V4S(), v11.V4S(), v15.V8H());
1793  __ ssubw2(v4.V8H(), v8.V8H(), v16.V16B());
1794  __ st1(v18.V16B(), v19.V16B(), v20.V16B(), v21.V16B(), MemOperand(x0));
1795  __ st1(v10.V16B(),
1796         v11.V16B(),
1797         v12.V16B(),
1798         v13.V16B(),
1799         MemOperand(x1, x2, PostIndex));
1800  __ st1(v27.V16B(),
1801         v28.V16B(),
1802         v29.V16B(),
1803         v30.V16B(),
1804         MemOperand(x1, 64, PostIndex));
1805  __ st1(v16.V16B(), v17.V16B(), v18.V16B(), MemOperand(x0));
1806  __ st1(v21.V16B(), v22.V16B(), v23.V16B(), MemOperand(x1, x2, PostIndex));
1807  __ st1(v9.V16B(), v10.V16B(), v11.V16B(), MemOperand(x1, 48, PostIndex));
1808  __ st1(v7.V16B(), v8.V16B(), MemOperand(x0));
1809  __ st1(v26.V16B(), v27.V16B(), MemOperand(x1, x2, PostIndex));
1810  __ st1(v22.V16B(), v23.V16B(), MemOperand(x1, 32, PostIndex));
1811  __ st1(v23.V16B(), MemOperand(x0));
1812  __ st1(v28.V16B(), MemOperand(x1, x2, PostIndex));
1813  __ st1(v2.V16B(), MemOperand(x1, 16, PostIndex));
1814  __ st1(v29.V1D(), v30.V1D(), v31.V1D(), v0.V1D(), MemOperand(x0));
1815  __ st1(v12.V1D(),
1816         v13.V1D(),
1817         v14.V1D(),
1818         v15.V1D(),
1819         MemOperand(x1, x2, PostIndex));
1820  __ st1(v30.V1D(),
1821         v31.V1D(),
1822         v0.V1D(),
1823         v1.V1D(),
1824         MemOperand(x1, 32, PostIndex));
1825  __ st1(v16.V1D(), v17.V1D(), v18.V1D(), MemOperand(x0));
1826  __ st1(v3.V1D(), v4.V1D(), v5.V1D(), MemOperand(x1, x2, PostIndex));
1827  __ st1(v14.V1D(), v15.V1D(), v16.V1D(), MemOperand(x1, 24, PostIndex));
1828  __ st1(v18.V1D(), v19.V1D(), MemOperand(x0));
1829  __ st1(v5.V1D(), v6.V1D(), MemOperand(x1, x2, PostIndex));
1830  __ st1(v2.V1D(), v3.V1D(), MemOperand(x1, 16, PostIndex));
1831  __ st1(v4.V1D(), MemOperand(x0));
1832  __ st1(v27.V1D(), MemOperand(x1, x2, PostIndex));
1833  __ st1(v23.V1D(), MemOperand(x1, 8, PostIndex));
1834  __ st1(v2.V2D(), v3.V2D(), v4.V2D(), v5.V2D(), MemOperand(x0));
1835  __ st1(v22.V2D(),
1836         v23.V2D(),
1837         v24.V2D(),
1838         v25.V2D(),
1839         MemOperand(x1, x2, PostIndex));
1840  __ st1(v28.V2D(),
1841         v29.V2D(),
1842         v30.V2D(),
1843         v31.V2D(),
1844         MemOperand(x1, 64, PostIndex));
1845  __ st1(v17.V2D(), v18.V2D(), v19.V2D(), MemOperand(x0));
1846  __ st1(v16.V2D(), v17.V2D(), v18.V2D(), MemOperand(x1, x2, PostIndex));
1847  __ st1(v22.V2D(), v23.V2D(), v24.V2D(), MemOperand(x1, 48, PostIndex));
1848  __ st1(v21.V2D(), v22.V2D(), MemOperand(x0));
1849  __ st1(v6.V2D(), v7.V2D(), MemOperand(x1, x2, PostIndex));
1850  __ st1(v27.V2D(), v28.V2D(), MemOperand(x1, 32, PostIndex));
1851  __ st1(v21.V2D(), MemOperand(x0));
1852  __ st1(v29.V2D(), MemOperand(x1, x2, PostIndex));
1853  __ st1(v20.V2D(), MemOperand(x1, 16, PostIndex));
1854  __ st1(v22.V2S(), v23.V2S(), v24.V2S(), v25.V2S(), MemOperand(x0));
1855  __ st1(v8.V2S(),
1856         v9.V2S(),
1857         v10.V2S(),
1858         v11.V2S(),
1859         MemOperand(x1, x2, PostIndex));
1860  __ st1(v15.V2S(),
1861         v16.V2S(),
1862         v17.V2S(),
1863         v18.V2S(),
1864         MemOperand(x1, 32, PostIndex));
1865  __ st1(v2.V2S(), v3.V2S(), v4.V2S(), MemOperand(x0));
1866  __ st1(v23.V2S(), v24.V2S(), v25.V2S(), MemOperand(x1, x2, PostIndex));
1867  __ st1(v7.V2S(), v8.V2S(), v9.V2S(), MemOperand(x1, 24, PostIndex));
1868  __ st1(v28.V2S(), v29.V2S(), MemOperand(x0));
1869  __ st1(v29.V2S(), v30.V2S(), MemOperand(x1, x2, PostIndex));
1870  __ st1(v23.V2S(), v24.V2S(), MemOperand(x1, 16, PostIndex));
1871  __ st1(v6.V2S(), MemOperand(x0));
1872  __ st1(v11.V2S(), MemOperand(x1, x2, PostIndex));
1873  __ st1(v17.V2S(), MemOperand(x1, 8, PostIndex));
1874  __ st1(v6.V4H(), v7.V4H(), v8.V4H(), v9.V4H(), MemOperand(x0));
1875  __ st1(v9.V4H(),
1876         v10.V4H(),
1877         v11.V4H(),
1878         v12.V4H(),
1879         MemOperand(x1, x2, PostIndex));
1880  __ st1(v25.V4H(),
1881         v26.V4H(),
1882         v27.V4H(),
1883         v28.V4H(),
1884         MemOperand(x1, 32, PostIndex));
1885  __ st1(v11.V4H(), v12.V4H(), v13.V4H(), MemOperand(x0));
1886  __ st1(v10.V4H(), v11.V4H(), v12.V4H(), MemOperand(x1, x2, PostIndex));
1887  __ st1(v12.V4H(), v13.V4H(), v14.V4H(), MemOperand(x1, 24, PostIndex));
1888  __ st1(v13.V4H(), v14.V4H(), MemOperand(x0));
1889  __ st1(v15.V4H(), v16.V4H(), MemOperand(x1, x2, PostIndex));
1890  __ st1(v21.V4H(), v22.V4H(), MemOperand(x1, 16, PostIndex));
1891  __ st1(v16.V4H(), MemOperand(x0));
1892  __ st1(v8.V4H(), MemOperand(x1, x2, PostIndex));
1893  __ st1(v30.V4H(), MemOperand(x1, 8, PostIndex));
1894  __ st1(v3.V4S(), v4.V4S(), v5.V4S(), v6.V4S(), MemOperand(x0));
1895  __ st1(v25.V4S(),
1896         v26.V4S(),
1897         v27.V4S(),
1898         v28.V4S(),
1899         MemOperand(x1, x2, PostIndex));
1900  __ st1(v5.V4S(), v6.V4S(), v7.V4S(), v8.V4S(), MemOperand(x1, 64, PostIndex));
1901  __ st1(v31.V4S(), v0.V4S(), v1.V4S(), MemOperand(x0));
1902  __ st1(v30.V4S(), v31.V4S(), v0.V4S(), MemOperand(x1, x2, PostIndex));
1903  __ st1(v6.V4S(), v7.V4S(), v8.V4S(), MemOperand(x1, 48, PostIndex));
1904  __ st1(v17.V4S(), v18.V4S(), MemOperand(x0));
1905  __ st1(v31.V4S(), v0.V4S(), MemOperand(x1, x2, PostIndex));
1906  __ st1(v1.V4S(), v2.V4S(), MemOperand(x1, 32, PostIndex));
1907  __ st1(v26.V4S(), MemOperand(x0));
1908  __ st1(v15.V4S(), MemOperand(x1, x2, PostIndex));
1909  __ st1(v13.V4S(), MemOperand(x1, 16, PostIndex));
1910  __ st1(v26.V8B(), v27.V8B(), v28.V8B(), v29.V8B(), MemOperand(x0));
1911  __ st1(v10.V8B(),
1912         v11.V8B(),
1913         v12.V8B(),
1914         v13.V8B(),
1915         MemOperand(x1, x2, PostIndex));
1916  __ st1(v15.V8B(),
1917         v16.V8B(),
1918         v17.V8B(),
1919         v18.V8B(),
1920         MemOperand(x1, 32, PostIndex));
1921  __ st1(v19.V8B(), v20.V8B(), v21.V8B(), MemOperand(x0));
1922  __ st1(v31.V8B(), v0.V8B(), v1.V8B(), MemOperand(x1, x2, PostIndex));
1923  __ st1(v9.V8B(), v10.V8B(), v11.V8B(), MemOperand(x1, 24, PostIndex));
1924  __ st1(v12.V8B(), v13.V8B(), MemOperand(x0));
1925  __ st1(v2.V8B(), v3.V8B(), MemOperand(x1, x2, PostIndex));
1926  __ st1(v0.V8B(), v1.V8B(), MemOperand(x1, 16, PostIndex));
1927  __ st1(v16.V8B(), MemOperand(x0));
1928  __ st1(v25.V8B(), MemOperand(x1, x2, PostIndex));
1929  __ st1(v31.V8B(), MemOperand(x1, 8, PostIndex));
1930  __ st1(v4.V8H(), v5.V8H(), v6.V8H(), v7.V8H(), MemOperand(x0));
1931  __ st1(v3.V8H(), v4.V8H(), v5.V8H(), v6.V8H(), MemOperand(x1, x2, PostIndex));
1932  __ st1(v26.V8H(),
1933         v27.V8H(),
1934         v28.V8H(),
1935         v29.V8H(),
1936         MemOperand(x1, 64, PostIndex));
1937  __ st1(v10.V8H(), v11.V8H(), v12.V8H(), MemOperand(x0));
1938  __ st1(v21.V8H(), v22.V8H(), v23.V8H(), MemOperand(x1, x2, PostIndex));
1939  __ st1(v18.V8H(), v19.V8H(), v20.V8H(), MemOperand(x1, 48, PostIndex));
1940  __ st1(v26.V8H(), v27.V8H(), MemOperand(x0));
1941  __ st1(v24.V8H(), v25.V8H(), MemOperand(x1, x2, PostIndex));
1942  __ st1(v17.V8H(), v18.V8H(), MemOperand(x1, 32, PostIndex));
1943  __ st1(v29.V8H(), MemOperand(x0));
1944  __ st1(v19.V8H(), MemOperand(x1, x2, PostIndex));
1945  __ st1(v23.V8H(), MemOperand(x1, 16, PostIndex));
1946  __ st1(v19.B(), 15, MemOperand(x0));
1947  __ st1(v25.B(), 9, MemOperand(x1, x2, PostIndex));
1948  __ st1(v4.B(), 8, MemOperand(x1, 1, PostIndex));
1949  __ st1(v13.D(), 0, MemOperand(x0));
1950  __ st1(v30.D(), 0, MemOperand(x1, x2, PostIndex));
1951  __ st1(v3.D(), 0, MemOperand(x1, 8, PostIndex));
1952  __ st1(v22.H(), 0, MemOperand(x0));
1953  __ st1(v31.H(), 7, MemOperand(x1, x2, PostIndex));
1954  __ st1(v23.H(), 3, MemOperand(x1, 2, PostIndex));
1955  __ st1(v0.S(), 0, MemOperand(x0));
1956  __ st1(v11.S(), 3, MemOperand(x1, x2, PostIndex));
1957  __ st1(v24.S(), 3, MemOperand(x1, 4, PostIndex));
1958  __ st2(v7.V16B(), v8.V16B(), MemOperand(x0));
1959  __ st2(v5.V16B(), v6.V16B(), MemOperand(x1, x2, PostIndex));
1960  __ st2(v18.V16B(), v19.V16B(), MemOperand(x1, 32, PostIndex));
1961  __ st2(v14.V2D(), v15.V2D(), MemOperand(x0));
1962  __ st2(v7.V2D(), v8.V2D(), MemOperand(x1, x2, PostIndex));
1963  __ st2(v24.V2D(), v25.V2D(), MemOperand(x1, 32, PostIndex));
1964  __ st2(v22.V2S(), v23.V2S(), MemOperand(x0));
1965  __ st2(v4.V2S(), v5.V2S(), MemOperand(x1, x2, PostIndex));
1966  __ st2(v2.V2S(), v3.V2S(), MemOperand(x1, 16, PostIndex));
1967  __ st2(v23.V4H(), v24.V4H(), MemOperand(x0));
1968  __ st2(v8.V4H(), v9.V4H(), MemOperand(x1, x2, PostIndex));
1969  __ st2(v7.V4H(), v8.V4H(), MemOperand(x1, 16, PostIndex));
1970  __ st2(v17.V4S(), v18.V4S(), MemOperand(x0));
1971  __ st2(v6.V4S(), v7.V4S(), MemOperand(x1, x2, PostIndex));
1972  __ st2(v26.V4S(), v27.V4S(), MemOperand(x1, 32, PostIndex));
1973  __ st2(v31.V8B(), v0.V8B(), MemOperand(x0));
1974  __ st2(v0.V8B(), v1.V8B(), MemOperand(x1, x2, PostIndex));
1975  __ st2(v21.V8B(), v22.V8B(), MemOperand(x1, 16, PostIndex));
1976  __ st2(v7.V8H(), v8.V8H(), MemOperand(x0));
1977  __ st2(v22.V8H(), v23.V8H(), MemOperand(x1, x2, PostIndex));
1978  __ st2(v4.V8H(), v5.V8H(), MemOperand(x1, 32, PostIndex));
1979  __ st2(v8.B(), v9.B(), 15, MemOperand(x0));
1980  __ st2(v8.B(), v9.B(), 15, MemOperand(x1, x2, PostIndex));
1981  __ st2(v7.B(), v8.B(), 4, MemOperand(x1, 2, PostIndex));
1982  __ st2(v25.D(), v26.D(), 0, MemOperand(x0));
1983  __ st2(v17.D(), v18.D(), 1, MemOperand(x1, x2, PostIndex));
1984  __ st2(v3.D(), v4.D(), 1, MemOperand(x1, 16, PostIndex));
1985  __ st2(v4.H(), v5.H(), 3, MemOperand(x0));
1986  __ st2(v0.H(), v1.H(), 5, MemOperand(x1, x2, PostIndex));
1987  __ st2(v22.H(), v23.H(), 2, MemOperand(x1, 4, PostIndex));
1988  __ st2(v14.S(), v15.S(), 3, MemOperand(x0));
1989  __ st2(v23.S(), v24.S(), 3, MemOperand(x1, x2, PostIndex));
1990  __ st2(v0.S(), v1.S(), 2, MemOperand(x1, 8, PostIndex));
1991  __ st3(v26.V16B(), v27.V16B(), v28.V16B(), MemOperand(x0));
1992  __ st3(v21.V16B(), v22.V16B(), v23.V16B(), MemOperand(x1, x2, PostIndex));
1993  __ st3(v24.V16B(), v25.V16B(), v26.V16B(), MemOperand(x1, 48, PostIndex));
1994  __ st3(v17.V2D(), v18.V2D(), v19.V2D(), MemOperand(x0));
1995  __ st3(v23.V2D(), v24.V2D(), v25.V2D(), MemOperand(x1, x2, PostIndex));
1996  __ st3(v10.V2D(), v11.V2D(), v12.V2D(), MemOperand(x1, 48, PostIndex));
1997  __ st3(v9.V2S(), v10.V2S(), v11.V2S(), MemOperand(x0));
1998  __ st3(v13.V2S(), v14.V2S(), v15.V2S(), MemOperand(x1, x2, PostIndex));
1999  __ st3(v22.V2S(), v23.V2S(), v24.V2S(), MemOperand(x1, 24, PostIndex));
2000  __ st3(v31.V4H(), v0.V4H(), v1.V4H(), MemOperand(x0));
2001  __ st3(v8.V4H(), v9.V4H(), v10.V4H(), MemOperand(x1, x2, PostIndex));
2002  __ st3(v19.V4H(), v20.V4H(), v21.V4H(), MemOperand(x1, 24, PostIndex));
2003  __ st3(v18.V4S(), v19.V4S(), v20.V4S(), MemOperand(x0));
2004  __ st3(v25.V4S(), v26.V4S(), v27.V4S(), MemOperand(x1, x2, PostIndex));
2005  __ st3(v16.V4S(), v17.V4S(), v18.V4S(), MemOperand(x1, 48, PostIndex));
2006  __ st3(v27.V8B(), v28.V8B(), v29.V8B(), MemOperand(x0));
2007  __ st3(v29.V8B(), v30.V8B(), v31.V8B(), MemOperand(x1, x2, PostIndex));
2008  __ st3(v30.V8B(), v31.V8B(), v0.V8B(), MemOperand(x1, 24, PostIndex));
2009  __ st3(v8.V8H(), v9.V8H(), v10.V8H(), MemOperand(x0));
2010  __ st3(v18.V8H(), v19.V8H(), v20.V8H(), MemOperand(x1, x2, PostIndex));
2011  __ st3(v18.V8H(), v19.V8H(), v20.V8H(), MemOperand(x1, 48, PostIndex));
2012  __ st3(v31.B(), v0.B(), v1.B(), 10, MemOperand(x0));
2013  __ st3(v4.B(), v5.B(), v6.B(), 5, MemOperand(x1, x2, PostIndex));
2014  __ st3(v5.B(), v6.B(), v7.B(), 1, MemOperand(x1, 3, PostIndex));
2015  __ st3(v5.D(), v6.D(), v7.D(), 0, MemOperand(x0));
2016  __ st3(v6.D(), v7.D(), v8.D(), 0, MemOperand(x1, x2, PostIndex));
2017  __ st3(v0.D(), v1.D(), v2.D(), 0, MemOperand(x1, 24, PostIndex));
2018  __ st3(v31.H(), v0.H(), v1.H(), 2, MemOperand(x0));
2019  __ st3(v14.H(), v15.H(), v16.H(), 5, MemOperand(x1, x2, PostIndex));
2020  __ st3(v21.H(), v22.H(), v23.H(), 6, MemOperand(x1, 6, PostIndex));
2021  __ st3(v21.S(), v22.S(), v23.S(), 0, MemOperand(x0));
2022  __ st3(v11.S(), v12.S(), v13.S(), 1, MemOperand(x1, x2, PostIndex));
2023  __ st3(v15.S(), v16.S(), v17.S(), 0, MemOperand(x1, 12, PostIndex));
2024  __ st4(v22.V16B(), v23.V16B(), v24.V16B(), v25.V16B(), MemOperand(x0));
2025  __ st4(v24.V16B(),
2026         v25.V16B(),
2027         v26.V16B(),
2028         v27.V16B(),
2029         MemOperand(x1, x2, PostIndex));
2030  __ st4(v15.V16B(),
2031         v16.V16B(),
2032         v17.V16B(),
2033         v18.V16B(),
2034         MemOperand(x1, 64, PostIndex));
2035  __ st4(v16.V2D(), v17.V2D(), v18.V2D(), v19.V2D(), MemOperand(x0));
2036  __ st4(v17.V2D(),
2037         v18.V2D(),
2038         v19.V2D(),
2039         v20.V2D(),
2040         MemOperand(x1, x2, PostIndex));
2041  __ st4(v9.V2D(),
2042         v10.V2D(),
2043         v11.V2D(),
2044         v12.V2D(),
2045         MemOperand(x1, 64, PostIndex));
2046  __ st4(v23.V2S(), v24.V2S(), v25.V2S(), v26.V2S(), MemOperand(x0));
2047  __ st4(v15.V2S(),
2048         v16.V2S(),
2049         v17.V2S(),
2050         v18.V2S(),
2051         MemOperand(x1, x2, PostIndex));
2052  __ st4(v24.V2S(),
2053         v25.V2S(),
2054         v26.V2S(),
2055         v27.V2S(),
2056         MemOperand(x1, 32, PostIndex));
2057  __ st4(v14.V4H(), v15.V4H(), v16.V4H(), v17.V4H(), MemOperand(x0));
2058  __ st4(v18.V4H(),
2059         v19.V4H(),
2060         v20.V4H(),
2061         v21.V4H(),
2062         MemOperand(x1, x2, PostIndex));
2063  __ st4(v1.V4H(), v2.V4H(), v3.V4H(), v4.V4H(), MemOperand(x1, 32, PostIndex));
2064  __ st4(v13.V4S(), v14.V4S(), v15.V4S(), v16.V4S(), MemOperand(x0));
2065  __ st4(v6.V4S(), v7.V4S(), v8.V4S(), v9.V4S(), MemOperand(x1, x2, PostIndex));
2066  __ st4(v15.V4S(),
2067         v16.V4S(),
2068         v17.V4S(),
2069         v18.V4S(),
2070         MemOperand(x1, 64, PostIndex));
2071  __ st4(v26.V8B(), v27.V8B(), v28.V8B(), v29.V8B(), MemOperand(x0));
2072  __ st4(v25.V8B(),
2073         v26.V8B(),
2074         v27.V8B(),
2075         v28.V8B(),
2076         MemOperand(x1, x2, PostIndex));
2077  __ st4(v19.V8B(),
2078         v20.V8B(),
2079         v21.V8B(),
2080         v22.V8B(),
2081         MemOperand(x1, 32, PostIndex));
2082  __ st4(v19.V8H(), v20.V8H(), v21.V8H(), v22.V8H(), MemOperand(x0));
2083  __ st4(v15.V8H(),
2084         v16.V8H(),
2085         v17.V8H(),
2086         v18.V8H(),
2087         MemOperand(x1, x2, PostIndex));
2088  __ st4(v31.V8H(),
2089         v0.V8H(),
2090         v1.V8H(),
2091         v2.V8H(),
2092         MemOperand(x1, 64, PostIndex));
2093  __ st4(v0.B(), v1.B(), v2.B(), v3.B(), 13, MemOperand(x0));
2094  __ st4(v4.B(), v5.B(), v6.B(), v7.B(), 10, MemOperand(x1, x2, PostIndex));
2095  __ st4(v9.B(), v10.B(), v11.B(), v12.B(), 9, MemOperand(x1, 4, PostIndex));
2096  __ st4(v2.D(), v3.D(), v4.D(), v5.D(), 1, MemOperand(x0));
2097  __ st4(v7.D(), v8.D(), v9.D(), v10.D(), 0, MemOperand(x1, x2, PostIndex));
2098  __ st4(v31.D(), v0.D(), v1.D(), v2.D(), 1, MemOperand(x1, 32, PostIndex));
2099  __ st4(v2.H(), v3.H(), v4.H(), v5.H(), 1, MemOperand(x0));
2100  __ st4(v27.H(), v28.H(), v29.H(), v30.H(), 3, MemOperand(x1, x2, PostIndex));
2101  __ st4(v24.H(), v25.H(), v26.H(), v27.H(), 4, MemOperand(x1, 8, PostIndex));
2102  __ st4(v18.S(), v19.S(), v20.S(), v21.S(), 2, MemOperand(x0));
2103  __ st4(v6.S(), v7.S(), v8.S(), v9.S(), 2, MemOperand(x1, x2, PostIndex));
2104  __ st4(v25.S(), v26.S(), v27.S(), v28.S(), 1, MemOperand(x1, 16, PostIndex));
2105  __ sub(d12, d17, d2);
2106  __ sub(v20.V16B(), v24.V16B(), v8.V16B());
2107  __ sub(v8.V2D(), v29.V2D(), v5.V2D());
2108  __ sub(v2.V2S(), v28.V2S(), v24.V2S());
2109  __ sub(v24.V4H(), v10.V4H(), v4.V4H());
2110  __ sub(v28.V4S(), v4.V4S(), v17.V4S());
2111  __ sub(v16.V8B(), v27.V8B(), v2.V8B());
2112  __ sub(v20.V8H(), v10.V8H(), v13.V8H());
2113  __ subhn(v5.V2S(), v14.V2D(), v13.V2D());
2114  __ subhn(v10.V4H(), v5.V4S(), v8.V4S());
2115  __ subhn(v6.V8B(), v10.V8H(), v22.V8H());
2116  __ subhn2(v11.V16B(), v6.V8H(), v9.V8H());
2117  __ subhn2(v25.V4S(), v18.V2D(), v24.V2D());
2118  __ subhn2(v20.V8H(), v21.V4S(), v1.V4S());
2119  __ suqadd(b25, b11);
2120  __ suqadd(d13, d1);
2121  __ suqadd(h0, h9);
2122  __ suqadd(s22, s8);
2123  __ suqadd(v24.V16B(), v27.V16B());
2124  __ suqadd(v26.V2D(), v14.V2D());
2125  __ suqadd(v7.V2S(), v10.V2S());
2126  __ suqadd(v25.V4H(), v12.V4H());
2127  __ suqadd(v4.V4S(), v3.V4S());
2128  __ suqadd(v14.V8B(), v18.V8B());
2129  __ suqadd(v31.V8H(), v8.V8H());
2130  __ sxtl(v16.V2D(), v20.V2S());
2131  __ sxtl(v27.V4S(), v28.V4H());
2132  __ sxtl(v0.V8H(), v22.V8B());
2133  __ sxtl2(v6.V2D(), v7.V4S());
2134  __ sxtl2(v9.V4S(), v27.V8H());
2135  __ sxtl2(v16.V8H(), v16.V16B());
2136  __ tbl(v25.V16B(),
2137         v17.V16B(),
2138         v18.V16B(),
2139         v19.V16B(),
2140         v20.V16B(),
2141         v22.V16B());
2142  __ tbl(v28.V16B(), v13.V16B(), v14.V16B(), v15.V16B(), v4.V16B());
2143  __ tbl(v3.V16B(), v0.V16B(), v1.V16B(), v2.V16B());
2144  __ tbl(v20.V16B(), v15.V16B(), v4.V16B());
2145  __ tbl(v7.V8B(), v23.V16B(), v24.V16B(), v25.V16B(), v26.V16B(), v20.V8B());
2146  __ tbl(v8.V8B(), v1.V16B(), v2.V16B(), v3.V16B(), v31.V8B());
2147  __ tbl(v8.V8B(), v25.V16B(), v26.V16B(), v16.V8B());
2148  __ tbl(v11.V8B(), v19.V16B(), v30.V8B());
2149  __ tbx(v25.V16B(), v25.V16B(), v26.V16B(), v27.V16B(), v28.V16B(), v5.V16B());
2150  __ tbx(v21.V16B(), v29.V16B(), v30.V16B(), v31.V16B(), v24.V16B());
2151  __ tbx(v6.V16B(), v16.V16B(), v17.V16B(), v1.V16B());
2152  __ tbx(v13.V16B(), v3.V16B(), v20.V16B());
2153  __ tbx(v24.V8B(), v29.V16B(), v30.V16B(), v31.V16B(), v0.V16B(), v9.V8B());
2154  __ tbx(v17.V8B(), v9.V16B(), v10.V16B(), v11.V16B(), v26.V8B());
2155  __ tbx(v5.V8B(), v3.V16B(), v4.V16B(), v21.V8B());
2156  __ tbx(v16.V8B(), v11.V16B(), v29.V8B());
2157  __ trn1(v19.V16B(), v24.V16B(), v12.V16B());
2158  __ trn1(v2.V2D(), v7.V2D(), v10.V2D());
2159  __ trn1(v22.V2S(), v0.V2S(), v21.V2S());
2160  __ trn1(v12.V4H(), v15.V4H(), v20.V4H());
2161  __ trn1(v30.V4S(), v17.V4S(), v9.V4S());
2162  __ trn1(v12.V8B(), v19.V8B(), v29.V8B());
2163  __ trn1(v23.V8H(), v8.V8H(), v9.V8H());
2164  __ trn2(v28.V16B(), v30.V16B(), v25.V16B());
2165  __ trn2(v7.V2D(), v27.V2D(), v7.V2D());
2166  __ trn2(v30.V2S(), v16.V2S(), v19.V2S());
2167  __ trn2(v24.V4H(), v6.V4H(), v25.V4H());
2168  __ trn2(v2.V4S(), v19.V4S(), v11.V4S());
2169  __ trn2(v25.V8B(), v27.V8B(), v18.V8B());
2170  __ trn2(v12.V8H(), v4.V8H(), v15.V8H());
2171  __ uaba(v31.V16B(), v12.V16B(), v28.V16B());
2172  __ uaba(v18.V2S(), v5.V2S(), v14.V2S());
2173  __ uaba(v9.V4H(), v20.V4H(), v21.V4H());
2174  __ uaba(v6.V4S(), v20.V4S(), v2.V4S());
2175  __ uaba(v16.V8B(), v12.V8B(), v5.V8B());
2176  __ uaba(v15.V8H(), v26.V8H(), v30.V8H());
2177  __ uabal(v10.V2D(), v18.V2S(), v15.V2S());
2178  __ uabal(v30.V4S(), v19.V4H(), v7.V4H());
2179  __ uabal(v4.V8H(), v27.V8B(), v0.V8B());
2180  __ uabal2(v19.V2D(), v12.V4S(), v2.V4S());
2181  __ uabal2(v26.V4S(), v5.V8H(), v12.V8H());
2182  __ uabal2(v19.V8H(), v20.V16B(), v28.V16B());
2183  __ uabd(v18.V16B(), v4.V16B(), v21.V16B());
2184  __ uabd(v30.V2S(), v21.V2S(), v16.V2S());
2185  __ uabd(v8.V4H(), v28.V4H(), v25.V4H());
2186  __ uabd(v28.V4S(), v12.V4S(), v21.V4S());
2187  __ uabd(v19.V8B(), v16.V8B(), v28.V8B());
2188  __ uabd(v9.V8H(), v12.V8H(), v29.V8H());
2189  __ uabdl(v26.V2D(), v0.V2S(), v8.V2S());
2190  __ uabdl(v29.V4S(), v31.V4H(), v25.V4H());
2191  __ uabdl(v27.V8H(), v29.V8B(), v14.V8B());
2192  __ uabdl2(v20.V2D(), v20.V4S(), v8.V4S());
2193  __ uabdl2(v22.V4S(), v15.V8H(), v18.V8H());
2194  __ uabdl2(v9.V8H(), v18.V16B(), v23.V16B());
2195  __ uadalp(v9.V1D(), v15.V2S());
2196  __ uadalp(v14.V2D(), v12.V4S());
2197  __ uadalp(v28.V2S(), v12.V4H());
2198  __ uadalp(v0.V4H(), v17.V8B());
2199  __ uadalp(v1.V4S(), v29.V8H());
2200  __ uadalp(v15.V8H(), v22.V16B());
2201  __ uaddl(v1.V2D(), v20.V2S(), v27.V2S());
2202  __ uaddl(v31.V4S(), v25.V4H(), v5.V4H());
2203  __ uaddl(v12.V8H(), v3.V8B(), v3.V8B());
2204  __ uaddl2(v5.V2D(), v23.V4S(), v6.V4S());
2205  __ uaddl2(v1.V4S(), v5.V8H(), v25.V8H());
2206  __ uaddl2(v22.V8H(), v30.V16B(), v28.V16B());
2207  __ uaddlp(v7.V1D(), v9.V2S());
2208  __ uaddlp(v26.V2D(), v4.V4S());
2209  __ uaddlp(v28.V2S(), v1.V4H());
2210  __ uaddlp(v20.V4H(), v31.V8B());
2211  __ uaddlp(v16.V4S(), v17.V8H());
2212  __ uaddlp(v6.V8H(), v2.V16B());
2213  __ uaddlv(d28, v22.V4S());
2214  __ uaddlv(h0, v19.V16B());
2215  __ uaddlv(h30, v30.V8B());
2216  __ uaddlv(s24, v18.V4H());
2217  __ uaddlv(s10, v0.V8H());
2218  __ uaddw(v9.V2D(), v17.V2D(), v14.V2S());
2219  __ uaddw(v9.V4S(), v25.V4S(), v3.V4H());
2220  __ uaddw(v18.V8H(), v1.V8H(), v0.V8B());
2221  __ uaddw2(v18.V2D(), v5.V2D(), v6.V4S());
2222  __ uaddw2(v17.V4S(), v15.V4S(), v11.V8H());
2223  __ uaddw2(v29.V8H(), v11.V8H(), v7.V16B());
2224  __ uhadd(v13.V16B(), v9.V16B(), v3.V16B());
2225  __ uhadd(v17.V2S(), v25.V2S(), v24.V2S());
2226  __ uhadd(v25.V4H(), v23.V4H(), v13.V4H());
2227  __ uhadd(v0.V4S(), v20.V4S(), v16.V4S());
2228  __ uhadd(v5.V8B(), v5.V8B(), v25.V8B());
2229  __ uhadd(v3.V8H(), v29.V8H(), v18.V8H());
2230  __ uhsub(v1.V16B(), v22.V16B(), v13.V16B());
2231  __ uhsub(v14.V2S(), v30.V2S(), v30.V2S());
2232  __ uhsub(v29.V4H(), v14.V4H(), v17.V4H());
2233  __ uhsub(v26.V4S(), v5.V4S(), v18.V4S());
2234  __ uhsub(v3.V8B(), v7.V8B(), v12.V8B());
2235  __ uhsub(v25.V8H(), v21.V8H(), v5.V8H());
2236  __ umax(v28.V16B(), v12.V16B(), v6.V16B());
2237  __ umax(v20.V2S(), v19.V2S(), v26.V2S());
2238  __ umax(v0.V4H(), v31.V4H(), v18.V4H());
2239  __ umax(v6.V4S(), v21.V4S(), v28.V4S());
2240  __ umax(v0.V8B(), v2.V8B(), v20.V8B());
2241  __ umax(v4.V8H(), v11.V8H(), v22.V8H());
2242  __ umaxp(v1.V16B(), v6.V16B(), v29.V16B());
2243  __ umaxp(v19.V2S(), v17.V2S(), v27.V2S());
2244  __ umaxp(v21.V4H(), v16.V4H(), v7.V4H());
2245  __ umaxp(v9.V4S(), v20.V4S(), v29.V4S());
2246  __ umaxp(v13.V8B(), v1.V8B(), v16.V8B());
2247  __ umaxp(v19.V8H(), v23.V8H(), v26.V8H());
2248  __ umaxv(b17, v30.V16B());
2249  __ umaxv(b23, v12.V8B());
2250  __ umaxv(h31, v15.V4H());
2251  __ umaxv(h15, v25.V8H());
2252  __ umaxv(s18, v21.V4S());
2253  __ umin(v22.V16B(), v0.V16B(), v18.V16B());
2254  __ umin(v1.V2S(), v21.V2S(), v16.V2S());
2255  __ umin(v17.V4H(), v4.V4H(), v25.V4H());
2256  __ umin(v24.V4S(), v26.V4S(), v13.V4S());
2257  __ umin(v20.V8B(), v1.V8B(), v5.V8B());
2258  __ umin(v26.V8H(), v25.V8H(), v23.V8H());
2259  __ uminp(v5.V16B(), v1.V16B(), v23.V16B());
2260  __ uminp(v7.V2S(), v26.V2S(), v30.V2S());
2261  __ uminp(v9.V4H(), v5.V4H(), v25.V4H());
2262  __ uminp(v23.V4S(), v10.V4S(), v1.V4S());
2263  __ uminp(v4.V8B(), v29.V8B(), v14.V8B());
2264  __ uminp(v21.V8H(), v0.V8H(), v14.V8H());
2265  __ uminv(b0, v17.V16B());
2266  __ uminv(b0, v31.V8B());
2267  __ uminv(h24, v0.V4H());
2268  __ uminv(h29, v14.V8H());
2269  __ uminv(s30, v3.V4S());
2270  __ umlal(v11.V2D(), v11.V2S(), v24.V2S());
2271  __ umlal(v30.V2D(), v16.V2S(), v11.S(), 3);
2272  __ umlal(v0.V4S(), v9.V4H(), v26.V4H());
2273  __ umlal(v20.V4S(), v24.V4H(), v12.H(), 4);
2274  __ umlal(v16.V8H(), v21.V8B(), v6.V8B());
2275  __ umlal2(v17.V2D(), v19.V4S(), v23.V4S());
2276  __ umlal2(v5.V2D(), v30.V4S(), v8.S(), 0);
2277  __ umlal2(v16.V4S(), v8.V8H(), v15.V8H());
2278  __ umlal2(v15.V4S(), v26.V8H(), v1.H(), 5);
2279  __ umlal2(v30.V8H(), v1.V16B(), v17.V16B());
2280  __ umlsl(v18.V2D(), v19.V2S(), v28.V2S());
2281  __ umlsl(v7.V2D(), v7.V2S(), v8.S(), 0);
2282  __ umlsl(v24.V4S(), v8.V4H(), v4.V4H());
2283  __ umlsl(v18.V4S(), v22.V4H(), v12.H(), 4);
2284  __ umlsl(v28.V8H(), v14.V8B(), v20.V8B());
2285  __ umlsl2(v11.V2D(), v0.V4S(), v9.V4S());
2286  __ umlsl2(v26.V2D(), v16.V4S(), v9.S(), 2);
2287  __ umlsl2(v3.V4S(), v11.V8H(), v9.V8H());
2288  __ umlsl2(v10.V4S(), v25.V8H(), v9.H(), 4);
2289  __ umlsl2(v24.V8H(), v16.V16B(), v28.V16B());
2290  __ umov(x30, v25.D(), 1);
2291  __ umull(v12.V2D(), v10.V2S(), v29.V2S());
2292  __ umull(v22.V2D(), v30.V2S(), v5.S(), 3);
2293  __ umull(v7.V4S(), v0.V4H(), v25.V4H());
2294  __ umull(v11.V4S(), v13.V4H(), v3.H(), 2);
2295  __ umull(v25.V8H(), v16.V8B(), v10.V8B());
2296  __ umull2(v17.V2D(), v3.V4S(), v26.V4S());
2297  __ umull2(v26.V2D(), v11.V4S(), v2.S(), 3);
2298  __ umull2(v12.V4S(), v17.V8H(), v23.V8H());
2299  __ umull2(v4.V4S(), v31.V8H(), v1.H(), 2);
2300  __ umull2(v5.V8H(), v12.V16B(), v17.V16B());
2301  __ uqadd(b30, b4, b28);
2302  __ uqadd(d27, d20, d16);
2303  __ uqadd(h7, h14, h28);
2304  __ uqadd(s28, s17, s4);
2305  __ uqadd(v19.V16B(), v22.V16B(), v21.V16B());
2306  __ uqadd(v16.V2D(), v4.V2D(), v11.V2D());
2307  __ uqadd(v20.V2S(), v14.V2S(), v4.V2S());
2308  __ uqadd(v5.V4H(), v0.V4H(), v16.V4H());
2309  __ uqadd(v21.V4S(), v31.V4S(), v9.V4S());
2310  __ uqadd(v23.V8B(), v24.V8B(), v3.V8B());
2311  __ uqadd(v17.V8H(), v27.V8H(), v11.V8H());
2312  __ uqrshl(b10, b22, b10);
2313  __ uqrshl(d29, d5, d11);
2314  __ uqrshl(h27, h24, h30);
2315  __ uqrshl(s10, s13, s8);
2316  __ uqrshl(v9.V16B(), v18.V16B(), v14.V16B());
2317  __ uqrshl(v24.V2D(), v15.V2D(), v17.V2D());
2318  __ uqrshl(v4.V2S(), v14.V2S(), v27.V2S());
2319  __ uqrshl(v15.V4H(), v5.V4H(), v8.V4H());
2320  __ uqrshl(v21.V4S(), v29.V4S(), v0.V4S());
2321  __ uqrshl(v16.V8B(), v24.V8B(), v9.V8B());
2322  __ uqrshl(v2.V8H(), v0.V8H(), v15.V8H());
2323  __ uqrshrn(b11, h26, 4);
2324  __ uqrshrn(h7, s30, 5);
2325  __ uqrshrn(s10, d8, 21);
2326  __ uqrshrn(v15.V2S(), v6.V2D(), 11);
2327  __ uqrshrn(v5.V4H(), v26.V4S(), 12);
2328  __ uqrshrn(v28.V8B(), v25.V8H(), 5);
2329  __ uqrshrn2(v25.V16B(), v30.V8H(), 2);
2330  __ uqrshrn2(v21.V4S(), v14.V2D(), 32);
2331  __ uqrshrn2(v13.V8H(), v7.V4S(), 2);
2332  __ uqshl(b13, b0, b23);
2333  __ uqshl(b9, b17, 4);
2334  __ uqshl(d23, d6, d4);
2335  __ uqshl(d8, d11, 44);
2336  __ uqshl(h19, h13, h15);
2337  __ uqshl(h25, h26, 6);
2338  __ uqshl(s4, s24, s10);
2339  __ uqshl(s19, s14, 1);
2340  __ uqshl(v14.V16B(), v30.V16B(), v25.V16B());
2341  __ uqshl(v6.V16B(), v10.V16B(), 5);
2342  __ uqshl(v18.V2D(), v8.V2D(), v7.V2D());
2343  __ uqshl(v25.V2D(), v14.V2D(), 18);
2344  __ uqshl(v25.V2S(), v16.V2S(), v23.V2S());
2345  __ uqshl(v13.V2S(), v15.V2S(), 31);
2346  __ uqshl(v28.V4H(), v24.V4H(), v15.V4H());
2347  __ uqshl(v4.V4H(), v17.V4H(), 1);
2348  __ uqshl(v9.V4S(), v31.V4S(), v23.V4S());
2349  __ uqshl(v18.V4S(), v28.V4S(), 31);
2350  __ uqshl(v31.V8B(), v21.V8B(), v15.V8B());
2351  __ uqshl(v6.V8B(), v21.V8B(), 1);
2352  __ uqshl(v28.V8H(), v2.V8H(), v17.V8H());
2353  __ uqshl(v24.V8H(), v8.V8H(), 14);
2354  __ uqshrn(b21, h27, 7);
2355  __ uqshrn(h28, s26, 11);
2356  __ uqshrn(s13, d31, 17);
2357  __ uqshrn(v21.V2S(), v16.V2D(), 8);
2358  __ uqshrn(v24.V4H(), v24.V4S(), 2);
2359  __ uqshrn(v5.V8B(), v1.V8H(), 8);
2360  __ uqshrn2(v16.V16B(), v29.V8H(), 6);
2361  __ uqshrn2(v2.V4S(), v6.V2D(), 1);
2362  __ uqshrn2(v16.V8H(), v10.V4S(), 14);
2363  __ uqsub(b28, b20, b26);
2364  __ uqsub(d0, d7, d10);
2365  __ uqsub(h26, h24, h7);
2366  __ uqsub(s23, s23, s16);
2367  __ uqsub(v14.V16B(), v16.V16B(), v24.V16B());
2368  __ uqsub(v11.V2D(), v17.V2D(), v6.V2D());
2369  __ uqsub(v10.V2S(), v10.V2S(), v8.V2S());
2370  __ uqsub(v9.V4H(), v15.V4H(), v12.V4H());
2371  __ uqsub(v23.V4S(), v18.V4S(), v7.V4S());
2372  __ uqsub(v9.V8B(), v19.V8B(), v17.V8B());
2373  __ uqsub(v20.V8H(), v2.V8H(), v6.V8H());
2374  __ uqxtn(b29, h19);
2375  __ uqxtn(h0, s13);
2376  __ uqxtn(s26, d22);
2377  __ uqxtn(v5.V2S(), v31.V2D());
2378  __ uqxtn(v30.V4H(), v19.V4S());
2379  __ uqxtn(v15.V8B(), v2.V8H());
2380  __ uqxtn2(v29.V16B(), v3.V8H());
2381  __ uqxtn2(v13.V4S(), v17.V2D());
2382  __ uqxtn2(v28.V8H(), v11.V4S());
2383  __ urecpe(v23.V2S(), v15.V2S());
2384  __ urecpe(v27.V4S(), v7.V4S());
2385  __ urhadd(v2.V16B(), v15.V16B(), v27.V16B());
2386  __ urhadd(v15.V2S(), v1.V2S(), v18.V2S());
2387  __ urhadd(v17.V4H(), v4.V4H(), v26.V4H());
2388  __ urhadd(v2.V4S(), v27.V4S(), v14.V4S());
2389  __ urhadd(v5.V8B(), v17.V8B(), v14.V8B());
2390  __ urhadd(v30.V8H(), v2.V8H(), v25.V8H());
2391  __ urshl(d4, d28, d30);
2392  __ urshl(v13.V16B(), v31.V16B(), v19.V16B());
2393  __ urshl(v14.V2D(), v23.V2D(), v21.V2D());
2394  __ urshl(v10.V2S(), v7.V2S(), v8.V2S());
2395  __ urshl(v15.V4H(), v21.V4H(), v28.V4H());
2396  __ urshl(v30.V4S(), v8.V4S(), v23.V4S());
2397  __ urshl(v31.V8B(), v20.V8B(), v5.V8B());
2398  __ urshl(v30.V8H(), v27.V8H(), v30.V8H());
2399  __ urshr(d4, d13, 49);
2400  __ urshr(v2.V16B(), v20.V16B(), 1);
2401  __ urshr(v13.V2D(), v11.V2D(), 51);
2402  __ urshr(v21.V2S(), v31.V2S(), 10);
2403  __ urshr(v21.V4H(), v17.V4H(), 11);
2404  __ urshr(v4.V4S(), v22.V4S(), 1);
2405  __ urshr(v0.V8B(), v1.V8B(), 7);
2406  __ urshr(v13.V8H(), v20.V8H(), 1);
2407  __ ursqrte(v20.V2S(), v16.V2S());
2408  __ ursqrte(v28.V4S(), v8.V4S());
2409  __ ursra(d27, d16, 45);
2410  __ ursra(v18.V16B(), v17.V16B(), 3);
2411  __ ursra(v26.V2D(), v28.V2D(), 58);
2412  __ ursra(v8.V2S(), v22.V2S(), 31);
2413  __ ursra(v31.V4H(), v4.V4H(), 7);
2414  __ ursra(v31.V4S(), v15.V4S(), 2);
2415  __ ursra(v3.V8B(), v1.V8B(), 5);
2416  __ ursra(v18.V8H(), v14.V8H(), 13);
2417  __ ushl(d31, d0, d16);
2418  __ ushl(v0.V16B(), v6.V16B(), v2.V16B());
2419  __ ushl(v18.V2D(), v1.V2D(), v18.V2D());
2420  __ ushl(v27.V2S(), v7.V2S(), v29.V2S());
2421  __ ushl(v14.V4H(), v14.V4H(), v13.V4H());
2422  __ ushl(v22.V4S(), v4.V4S(), v9.V4S());
2423  __ ushl(v23.V8B(), v22.V8B(), v27.V8B());
2424  __ ushl(v21.V8H(), v25.V8H(), v8.V8H());
2425  __ ushll(v11.V2D(), v0.V2S(), 21);
2426  __ ushll(v2.V4S(), v17.V4H(), 8);
2427  __ ushll(v11.V8H(), v14.V8B(), 1);
2428  __ ushll2(v8.V2D(), v29.V4S(), 7);
2429  __ ushll2(v29.V4S(), v9.V8H(), 2);
2430  __ ushll2(v5.V8H(), v24.V16B(), 6);
2431  __ ushr(d28, d27, 53);
2432  __ ushr(v1.V16B(), v9.V16B(), 7);
2433  __ ushr(v2.V2D(), v24.V2D(), 43);
2434  __ ushr(v30.V2S(), v25.V2S(), 11);
2435  __ ushr(v10.V4H(), v26.V4H(), 12);
2436  __ ushr(v4.V4S(), v5.V4S(), 30);
2437  __ ushr(v30.V8B(), v2.V8B(), 1);
2438  __ ushr(v6.V8H(), v12.V8H(), 2);
2439  __ usqadd(b19, b5);
2440  __ usqadd(d9, d2);
2441  __ usqadd(h2, h16);
2442  __ usqadd(s16, s3);
2443  __ usqadd(v31.V16B(), v29.V16B());
2444  __ usqadd(v8.V2D(), v10.V2D());
2445  __ usqadd(v18.V2S(), v9.V2S());
2446  __ usqadd(v24.V4H(), v14.V4H());
2447  __ usqadd(v10.V4S(), v30.V4S());
2448  __ usqadd(v16.V8B(), v20.V8B());
2449  __ usqadd(v12.V8H(), v16.V8H());
2450  __ usra(d28, d27, 37);
2451  __ usra(v5.V16B(), v22.V16B(), 5);
2452  __ usra(v2.V2D(), v19.V2D(), 33);
2453  __ usra(v0.V2S(), v0.V2S(), 21);
2454  __ usra(v7.V4H(), v6.V4H(), 12);
2455  __ usra(v4.V4S(), v17.V4S(), 9);
2456  __ usra(v9.V8B(), v12.V8B(), 7);
2457  __ usra(v3.V8H(), v27.V8H(), 14);
2458  __ usubl(v29.V2D(), v12.V2S(), v30.V2S());
2459  __ usubl(v29.V4S(), v28.V4H(), v6.V4H());
2460  __ usubl(v12.V8H(), v4.V8B(), v14.V8B());
2461  __ usubl2(v1.V2D(), v24.V4S(), v17.V4S());
2462  __ usubl2(v4.V4S(), v1.V8H(), v3.V8H());
2463  __ usubl2(v23.V8H(), v4.V16B(), v7.V16B());
2464  __ usubw(v9.V2D(), v20.V2D(), v30.V2S());
2465  __ usubw(v20.V4S(), v16.V4S(), v23.V4H());
2466  __ usubw(v25.V8H(), v8.V8H(), v29.V8B());
2467  __ usubw2(v18.V2D(), v29.V2D(), v6.V4S());
2468  __ usubw2(v6.V4S(), v6.V4S(), v20.V8H());
2469  __ usubw2(v18.V8H(), v4.V8H(), v16.V16B());
2470  __ uxtl(v27.V2D(), v21.V2S());
2471  __ uxtl(v0.V4S(), v31.V4H());
2472  __ uxtl(v27.V8H(), v10.V8B());
2473  __ uxtl2(v6.V2D(), v16.V4S());
2474  __ uxtl2(v22.V4S(), v20.V8H());
2475  __ uxtl2(v20.V8H(), v21.V16B());
2476  __ uzp1(v30.V16B(), v9.V16B(), v17.V16B());
2477  __ uzp1(v7.V2D(), v26.V2D(), v28.V2D());
2478  __ uzp1(v26.V2S(), v16.V2S(), v22.V2S());
2479  __ uzp1(v14.V4H(), v19.V4H(), v6.V4H());
2480  __ uzp1(v17.V4S(), v23.V4S(), v30.V4S());
2481  __ uzp1(v28.V8B(), v27.V8B(), v13.V8B());
2482  __ uzp1(v17.V8H(), v1.V8H(), v12.V8H());
2483  __ uzp2(v8.V16B(), v18.V16B(), v26.V16B());
2484  __ uzp2(v21.V2D(), v22.V2D(), v24.V2D());
2485  __ uzp2(v20.V2S(), v21.V2S(), v2.V2S());
2486  __ uzp2(v16.V4H(), v31.V4H(), v6.V4H());
2487  __ uzp2(v25.V4S(), v11.V4S(), v8.V4S());
2488  __ uzp2(v31.V8B(), v31.V8B(), v13.V8B());
2489  __ uzp2(v8.V8H(), v17.V8H(), v1.V8H());
2490  __ xtn(v17.V2S(), v26.V2D());
2491  __ xtn(v3.V4H(), v0.V4S());
2492  __ xtn(v18.V8B(), v8.V8H());
2493  __ xtn2(v0.V16B(), v0.V8H());
2494  __ xtn2(v15.V4S(), v4.V2D());
2495  __ xtn2(v31.V8H(), v18.V4S());
2496  __ zip1(v22.V16B(), v9.V16B(), v6.V16B());
2497  __ zip1(v23.V2D(), v11.V2D(), v2.V2D());
2498  __ zip1(v26.V2S(), v16.V2S(), v9.V2S());
2499  __ zip1(v1.V4H(), v9.V4H(), v7.V4H());
2500  __ zip1(v0.V4S(), v30.V4S(), v20.V4S());
2501  __ zip1(v30.V8B(), v17.V8B(), v15.V8B());
2502  __ zip1(v17.V8H(), v8.V8H(), v2.V8H());
2503  __ zip2(v23.V16B(), v10.V16B(), v11.V16B());
2504  __ zip2(v30.V2D(), v6.V2D(), v14.V2D());
2505  __ zip2(v9.V2S(), v10.V2S(), v21.V2S());
2506  __ zip2(v8.V4H(), v24.V4H(), v29.V4H());
2507  __ zip2(v0.V4S(), v21.V4S(), v23.V4S());
2508  __ zip2(v25.V8B(), v23.V8B(), v30.V8B());
2509  __ zip2(v7.V8H(), v10.V8H(), v30.V8H());
2510}  // NOLINT(readability/fn_size)
2511
2512
2513static void GenerateTestSequenceNEONFP(MacroAssembler* masm) {
2514  ExactAssemblyScope guard(masm,
2515                           masm->GetBuffer()->GetRemainingBytes(),
2516                           ExactAssemblyScope::kMaximumSize);
2517
2518  // NEON floating point instructions.
2519  __ fabd(v3.V2D(), v25.V2D(), v8.V2D());
2520  __ fabd(v14.V2S(), v27.V2S(), v11.V2S());
2521  __ fabd(v9.V4S(), v22.V4S(), v18.V4S());
2522  __ fabs(v1.V2D(), v29.V2D());
2523  __ fabs(v6.V2S(), v21.V2S());
2524  __ fabs(v12.V4S(), v25.V4S());
2525  __ facge(v18.V2D(), v5.V2D(), v0.V2D());
2526  __ facge(v15.V2S(), v11.V2S(), v6.V2S());
2527  __ facge(v30.V4S(), v10.V4S(), v25.V4S());
2528  __ facgt(v28.V2D(), v16.V2D(), v31.V2D());
2529  __ facgt(v15.V2S(), v1.V2S(), v4.V2S());
2530  __ facgt(v22.V4S(), v3.V4S(), v10.V4S());
2531  __ fadd(v7.V2D(), v10.V2D(), v24.V2D());
2532  __ fadd(v10.V2S(), v23.V2S(), v7.V2S());
2533  __ fadd(v16.V4S(), v22.V4S(), v11.V4S());
2534  __ faddp(d27, v28.V2D());
2535  __ faddp(s20, v23.V2S());
2536  __ faddp(v21.V2D(), v4.V2D(), v11.V2D());
2537  __ faddp(v31.V2S(), v26.V2S(), v1.V2S());
2538  __ faddp(v13.V4S(), v27.V4S(), v28.V4S());
2539  __ fcmeq(v17.V2D(), v13.V2D(), v20.V2D());
2540  __ fcmeq(v24.V2D(), v16.V2D(), 0.0);
2541  __ fcmeq(v26.V2S(), v17.V2S(), v10.V2S());
2542  __ fcmeq(v24.V2S(), v4.V2S(), 0.0);
2543  __ fcmeq(v8.V4S(), v4.V4S(), v14.V4S());
2544  __ fcmeq(v26.V4S(), v25.V4S(), 0.0);
2545  __ fcmge(v27.V2D(), v0.V2D(), v0.V2D());
2546  __ fcmge(v22.V2D(), v30.V2D(), 0.0);
2547  __ fcmge(v7.V2S(), v21.V2S(), v25.V2S());
2548  __ fcmge(v15.V2S(), v15.V2S(), 0.0);
2549  __ fcmge(v29.V4S(), v4.V4S(), v27.V4S());
2550  __ fcmge(v22.V4S(), v21.V4S(), 0.0);
2551  __ fcmgt(v1.V2D(), v26.V2D(), v15.V2D());
2552  __ fcmgt(v15.V2D(), v23.V2D(), 0.0);
2553  __ fcmgt(v21.V2S(), v16.V2S(), v6.V2S());
2554  __ fcmgt(v1.V2S(), v13.V2S(), 0.0);
2555  __ fcmgt(v14.V4S(), v0.V4S(), v25.V4S());
2556  __ fcmgt(v13.V4S(), v8.V4S(), 0.0);
2557  __ fcmle(v4.V2D(), v6.V2D(), 0.0);
2558  __ fcmle(v24.V2S(), v31.V2S(), 0.0);
2559  __ fcmle(v8.V4S(), v23.V4S(), 0.0);
2560  __ fcmlt(v7.V2D(), v3.V2D(), 0.0);
2561  __ fcmlt(v15.V2S(), v21.V2S(), 0.0);
2562  __ fcmlt(v1.V4S(), v2.V4S(), 0.0);
2563  __ fcvtas(v6.V2D(), v8.V2D());
2564  __ fcvtas(v1.V2S(), v9.V2S());
2565  __ fcvtas(v8.V4S(), v19.V4S());
2566  __ fcvtau(v5.V2D(), v31.V2D());
2567  __ fcvtau(v28.V2S(), v29.V2S());
2568  __ fcvtau(v11.V4S(), v26.V4S());
2569  __ fcvtl(v8.V2D(), v25.V2S());
2570  __ fcvtl(v27.V4S(), v14.V4H());
2571  __ fcvtl2(v1.V2D(), v6.V4S());
2572  __ fcvtl2(v24.V4S(), v9.V8H());
2573  __ fcvtms(v9.V2D(), v24.V2D());
2574  __ fcvtms(v7.V2S(), v11.V2S());
2575  __ fcvtms(v23.V4S(), v21.V4S());
2576  __ fcvtmu(v13.V2D(), v1.V2D());
2577  __ fcvtmu(v26.V2S(), v12.V2S());
2578  __ fcvtmu(v21.V4S(), v21.V4S());
2579  __ fcvtn(v11.V2S(), v1.V2D());
2580  __ fcvtn(v8.V4H(), v2.V4S());
2581  __ fcvtn2(v24.V4S(), v29.V2D());
2582  __ fcvtn2(v4.V8H(), v10.V4S());
2583  __ fcvtns(v25.V2D(), v10.V2D());
2584  __ fcvtns(v4.V2S(), v8.V2S());
2585  __ fcvtns(v29.V4S(), v27.V4S());
2586  __ fcvtnu(v18.V2D(), v27.V2D());
2587  __ fcvtnu(v11.V2S(), v14.V2S());
2588  __ fcvtnu(v27.V4S(), v21.V4S());
2589  __ fcvtps(v23.V2D(), v5.V2D());
2590  __ fcvtps(v24.V2S(), v15.V2S());
2591  __ fcvtps(v5.V4S(), v19.V4S());
2592  __ fcvtpu(v3.V2D(), v21.V2D());
2593  __ fcvtpu(v3.V2S(), v21.V2S());
2594  __ fcvtpu(v0.V4S(), v7.V4S());
2595  __ fcvtxn(v29.V2S(), v11.V2D());
2596  __ fcvtxn2(v31.V4S(), v25.V2D());
2597  __ fcvtzs(v19.V2D(), v17.V2D());
2598  __ fcvtzs(v12.V2D(), v24.V2D(), 64);
2599  __ fcvtzs(v9.V2S(), v2.V2S());
2600  __ fcvtzs(v5.V2S(), v20.V2S(), 29);
2601  __ fcvtzs(v21.V4S(), v25.V4S());
2602  __ fcvtzs(v26.V4S(), v1.V4S(), 6);
2603  __ fcvtzu(v13.V2D(), v25.V2D());
2604  __ fcvtzu(v28.V2D(), v13.V2D(), 32);
2605  __ fcvtzu(v26.V2S(), v6.V2S());
2606  __ fcvtzu(v9.V2S(), v10.V2S(), 15);
2607  __ fcvtzu(v30.V4S(), v6.V4S());
2608  __ fcvtzu(v19.V4S(), v22.V4S(), 18);
2609  __ fdiv(v15.V2D(), v8.V2D(), v15.V2D());
2610  __ fdiv(v12.V2S(), v9.V2S(), v26.V2S());
2611  __ fdiv(v19.V4S(), v22.V4S(), v19.V4S());
2612  __ fmax(v19.V2D(), v7.V2D(), v8.V2D());
2613  __ fmax(v25.V2S(), v12.V2S(), v29.V2S());
2614  __ fmax(v6.V4S(), v15.V4S(), v5.V4S());
2615  __ fmaxnm(v16.V2D(), v8.V2D(), v20.V2D());
2616  __ fmaxnm(v15.V2S(), v26.V2S(), v25.V2S());
2617  __ fmaxnm(v23.V4S(), v14.V4S(), v16.V4S());
2618  __ fmaxnmp(d6, v19.V2D());
2619  __ fmaxnmp(s27, v26.V2S());
2620  __ fmaxnmp(v8.V2D(), v12.V2D(), v23.V2D());
2621  __ fmaxnmp(v13.V2S(), v25.V2S(), v22.V2S());
2622  __ fmaxnmp(v15.V4S(), v11.V4S(), v17.V4S());
2623  __ fmaxnmv(s27, v19.V4S());
2624  __ fmaxp(d20, v14.V2D());
2625  __ fmaxp(s18, v2.V2S());
2626  __ fmaxp(v9.V2D(), v23.V2D(), v31.V2D());
2627  __ fmaxp(v7.V2S(), v22.V2S(), v31.V2S());
2628  __ fmaxp(v18.V4S(), v7.V4S(), v29.V4S());
2629  __ fmaxv(s31, v29.V4S());
2630  __ fmin(v2.V2D(), v5.V2D(), v2.V2D());
2631  __ fmin(v31.V2S(), v17.V2S(), v10.V2S());
2632  __ fmin(v10.V4S(), v4.V4S(), v16.V4S());
2633  __ fminnm(v21.V2D(), v6.V2D(), v5.V2D());
2634  __ fminnm(v22.V2S(), v18.V2S(), v14.V2S());
2635  __ fminnm(v25.V4S(), v31.V4S(), v3.V4S());
2636  __ fminnmp(d9, v1.V2D());
2637  __ fminnmp(s21, v20.V2S());
2638  __ fminnmp(v16.V2D(), v21.V2D(), v19.V2D());
2639  __ fminnmp(v16.V2S(), v31.V2S(), v25.V2S());
2640  __ fminnmp(v26.V4S(), v16.V4S(), v15.V4S());
2641  __ fminnmv(s3, v4.V4S());
2642  __ fminp(d24, v26.V2D());
2643  __ fminp(s7, v17.V2S());
2644  __ fminp(v23.V2D(), v19.V2D(), v3.V2D());
2645  __ fminp(v29.V2S(), v21.V2S(), v9.V2S());
2646  __ fminp(v0.V4S(), v24.V4S(), v21.V4S());
2647  __ fminv(s25, v8.V4S());
2648  __ fmla(d23, d0, v9.D(), 1);
2649  __ fmla(s23, s15, v7.S(), 0);
2650  __ fmla(v17.V2D(), v11.V2D(), v6.V2D());
2651  __ fmla(v30.V2D(), v30.V2D(), v11.D(), 0);
2652  __ fmla(v19.V2S(), v12.V2S(), v6.V2S());
2653  __ fmla(v24.V2S(), v17.V2S(), v9.S(), 0);
2654  __ fmla(v16.V4S(), v11.V4S(), v11.V4S());
2655  __ fmla(v27.V4S(), v23.V4S(), v9.S(), 2);
2656  __ fmls(d27, d30, v6.D(), 0);
2657  __ fmls(s21, s16, v2.S(), 0);
2658  __ fmls(v5.V2D(), v19.V2D(), v21.V2D());
2659  __ fmls(v18.V2D(), v30.V2D(), v12.D(), 0);
2660  __ fmls(v5.V2S(), v16.V2S(), v7.V2S());
2661  __ fmls(v3.V2S(), v18.V2S(), v11.S(), 1);
2662  __ fmls(v27.V4S(), v5.V4S(), v30.V4S());
2663  __ fmls(v26.V4S(), v20.V4S(), v4.S(), 3);
2664  __ fmov(v14.V2D(), -0.34375);
2665  __ fmov(v26.V2S(), 0.90625f);
2666  __ fmov(v31.V4S(), -5.0000f);
2667  __ fmov(v28.D(), 1, x25);
2668  __ fmov(x18, v2.D(), 1);
2669  __ fmul(d12, d4, v1.D(), 1);
2670  __ fmul(s30, s1, v15.S(), 3);
2671  __ fmul(v25.V2D(), v0.V2D(), v21.V2D());
2672  __ fmul(v10.V2D(), v24.V2D(), v10.D(), 1);
2673  __ fmul(v7.V2S(), v24.V2S(), v16.V2S());
2674  __ fmul(v1.V2S(), v16.V2S(), v4.S(), 2);
2675  __ fmul(v5.V4S(), v28.V4S(), v25.V4S());
2676  __ fmul(v11.V4S(), v3.V4S(), v8.S(), 0);
2677  __ fmulx(d28, d9, v3.D(), 1);
2678  __ fmulx(s25, s21, v15.S(), 1);
2679  __ fmulx(v31.V2D(), v28.V2D(), v8.V2D());
2680  __ fmulx(v3.V2D(), v21.V2D(), v6.D(), 0);
2681  __ fmulx(v9.V2S(), v1.V2S(), v0.V2S());
2682  __ fmulx(v16.V2S(), v27.V2S(), v6.S(), 0);
2683  __ fmulx(v2.V4S(), v4.V4S(), v5.V4S());
2684  __ fmulx(v18.V4S(), v7.V4S(), v4.S(), 0);
2685  __ fneg(v1.V2D(), v25.V2D());
2686  __ fneg(v14.V2S(), v31.V2S());
2687  __ fneg(v5.V4S(), v4.V4S());
2688  __ frecpe(v18.V2D(), v12.V2D());
2689  __ frecpe(v10.V2S(), v22.V2S());
2690  __ frecpe(v5.V4S(), v6.V4S());
2691  __ frecps(v22.V2D(), v7.V2D(), v26.V2D());
2692  __ frecps(v31.V2S(), v27.V2S(), v2.V2S());
2693  __ frecps(v18.V4S(), v6.V4S(), v27.V4S());
2694  __ frinta(v26.V2D(), v13.V2D());
2695  __ frinta(v15.V2S(), v26.V2S());
2696  __ frinta(v13.V4S(), v16.V4S());
2697  __ frinti(v9.V2D(), v12.V2D());
2698  __ frinti(v5.V2S(), v19.V2S());
2699  __ frinti(v15.V4S(), v11.V4S());
2700  __ frintm(v17.V2D(), v29.V2D());
2701  __ frintm(v30.V2S(), v11.V2S());
2702  __ frintm(v1.V4S(), v20.V4S());
2703  __ frintn(v24.V2D(), v6.V2D());
2704  __ frintn(v12.V2S(), v17.V2S());
2705  __ frintn(v29.V4S(), v11.V4S());
2706  __ frintp(v10.V2D(), v7.V2D());
2707  __ frintp(v12.V2S(), v18.V2S());
2708  __ frintp(v26.V4S(), v31.V4S());
2709  __ frintx(v24.V2D(), v13.V2D());
2710  __ frintx(v7.V2S(), v9.V2S());
2711  __ frintx(v18.V4S(), v21.V4S());
2712  __ frintz(v19.V2D(), v25.V2D());
2713  __ frintz(v15.V2S(), v8.V2S());
2714  __ frintz(v20.V4S(), v3.V4S());
2715  __ frsqrte(v23.V2D(), v5.V2D());
2716  __ frsqrte(v9.V2S(), v7.V2S());
2717  __ frsqrte(v3.V4S(), v9.V4S());
2718  __ frsqrts(v25.V2D(), v28.V2D(), v15.V2D());
2719  __ frsqrts(v9.V2S(), v26.V2S(), v10.V2S());
2720  __ frsqrts(v5.V4S(), v1.V4S(), v10.V4S());
2721  __ fsqrt(v6.V2D(), v18.V2D());
2722  __ fsqrt(v6.V2S(), v18.V2S());
2723  __ fsqrt(v0.V4S(), v31.V4S());
2724  __ fsub(v31.V2D(), v30.V2D(), v31.V2D());
2725  __ fsub(v11.V2S(), v8.V2S(), v6.V2S());
2726  __ fsub(v16.V4S(), v0.V4S(), v31.V4S());
2727  __ scvtf(v25.V2D(), v31.V2D());
2728  __ scvtf(v10.V2D(), v13.V2D(), 45);
2729  __ scvtf(v10.V2S(), v15.V2S());
2730  __ scvtf(v18.V2S(), v4.V2S(), 27);
2731  __ scvtf(v17.V4S(), v5.V4S());
2732  __ scvtf(v11.V4S(), v25.V4S(), 24);
2733  __ ucvtf(v9.V2D(), v3.V2D());
2734  __ ucvtf(v26.V2D(), v30.V2D(), 46);
2735  __ ucvtf(v11.V2S(), v4.V2S());
2736  __ ucvtf(v29.V2S(), v3.V2S(), 25);
2737  __ ucvtf(v22.V4S(), v23.V4S());
2738  __ ucvtf(v18.V4S(), v9.V4S(), 25);
2739}
2740
2741
2742static void MaskAddresses(const char* trace) {
2743// Hexadecimal expressions of the form `\xab` do not work out-of-the box with
2744// BSD `sed`. So we use ANSI-C quoting to have the regular expressions below
2745// work both on Linux and BSD (and macOS).
2746#ifdef __APPLE__
2747#define MAYBE_ANSI_C_QUOTE "$"
2748#define HEX(val) "\\x" #val
2749#define ESCAPE(c) "\\\\" #c
2750  const char* sed_options = "-i \"\" -E";
2751#else
2752#define MAYBE_ANSI_C_QUOTE
2753#define HEX(val) "\\x" #val
2754#define ESCAPE(c) "\\" #c
2755  const char* sed_options = "--in-place --regexp-extended";
2756#endif
2757#define COLOUR "(" HEX(1b) ESCAPE([) "[01];([0-9][0-9])?m)?"
2758  struct {
2759    const char* search;
2760    const char* replace;
2761  } patterns[] =
2762      {// Mask registers that hold addresses that change from run to run.
2763       {"((x0|x1|x2|sp): " COLOUR "0x)[0-9a-f]{16}",
2764        ESCAPE(1) "~~~~~~~~~~~~~~~~"},
2765       // Mask accessed memory addresses.
2766       {"((<-|->) " COLOUR "0x)[0-9a-f]{16}", ESCAPE(1) "~~~~~~~~~~~~~~~~"},
2767       // Mask instruction addresses.
2768       {"^0x[0-9a-f]{16}", "0x~~~~~~~~~~~~~~~~"},
2769       // Mask branch targets.
2770       {"(Branch" COLOUR " to 0x)[0-9a-f]{16}", ESCAPE(1) "~~~~~~~~~~~~~~~~"},
2771       {"addr 0x[0-9a-f]+", "addr 0x~~~~~~~~~~~~~~~~"}};
2772  const size_t patterns_length = sizeof(patterns) / sizeof(patterns[0]);
2773  // Rewrite `trace`, masking addresses and other values that legitimately vary
2774  // from run to run.
2775  char command[1024];
2776  for (size_t i = 0; i < patterns_length; i++) {
2777    size_t length = snprintf(command,
2778                             sizeof(command),
2779                             "sed %s " MAYBE_ANSI_C_QUOTE "'s/%s/%s/' '%s'",
2780                             sed_options,
2781                             patterns[i].search,
2782                             patterns[i].replace,
2783                             trace);
2784    VIXL_CHECK(length < sizeof(command));
2785    VIXL_CHECK(system(command) == 0);
2786  }
2787}
2788
2789
2790static void TraceTestHelper(bool coloured_trace,
2791                            TraceParameters trace_parameters,
2792                            const char* ref_file) {
2793  MacroAssembler masm(12 * KBytes);
2794
2795  char trace_stream_filename[] = "/tmp/vixl-test-trace-XXXXXX";
2796  FILE* trace_stream = fdopen(mkstemp(trace_stream_filename), "w");
2797
2798  Decoder decoder;
2799  Simulator simulator(&decoder, trace_stream);
2800  simulator.SetColouredTrace(coloured_trace);
2801  simulator.SetTraceParameters(trace_parameters);
2802  simulator.SilenceExclusiveAccessWarning();
2803
2804  // Set up a scratch buffer so we can test loads and stores.
2805  const int kScratchSize = 64 * KBytes;
2806  const int kScratchGuardSize = 128;
2807  char scratch_buffer[kScratchSize + kScratchGuardSize];
2808  for (size_t i = 0; i < (sizeof(scratch_buffer) / sizeof(scratch_buffer[0]));
2809       i++) {
2810    scratch_buffer[i] = i & 0xff;
2811  }
2812  // Used for offset addressing.
2813  simulator.WriteRegister(0, scratch_buffer);
2814  // Used for pre-/post-index addressing.
2815  simulator.WriteRegister(1, scratch_buffer);
2816
2817  const int kPostIndexRegisterStep = 13;  // Arbitrary interesting value.
2818  // Used for post-index offsets.
2819  simulator.WriteRegister(2, kPostIndexRegisterStep);
2820
2821  // Initialize the other registers with unique values.
2822  uint64_t initial_base_u64 = 0x0100001000100101;
2823  for (unsigned i = 3; i < kNumberOfRegisters; i++) {
2824    if (i == kLinkRegCode) continue;
2825    if (i == kZeroRegCode) continue;
2826    // NoRegLog suppresses the log now, but the registers will still be logged
2827    // before the first instruction is executed since they have been written but
2828    // not printed.
2829    simulator.WriteRegister(i, initial_base_u64 * i, Simulator::NoRegLog);
2830  }
2831  float initial_base_f32 = 1.2345f;
2832  double initial_base_f64 = 1.3456f;
2833  for (unsigned i = 0; i < kNumberOfVRegisters; i++) {
2834    // Try to initialise V registers with reasonable FP values.
2835    uint64_t low = (DoubleToRawbits(initial_base_f64 * i) & ~kSRegMask) |
2836                   FloatToRawbits(initial_base_f32 * i);
2837    uint64_t high = low ^ 0x0005555500555555;
2838    LogicVRegister reg(simulator.ReadVRegister(i));
2839    reg.SetUint(kFormat2D, 0, low);
2840    reg.SetUint(kFormat2D, 1, high);
2841  }
2842
2843  GenerateTestSequenceBase(&masm);
2844  GenerateTestSequenceFP(&masm);
2845  GenerateTestSequenceNEON(&masm);
2846  GenerateTestSequenceNEONFP(&masm);
2847  masm.Ret();
2848  masm.FinalizeCode();
2849
2850  simulator.RunFrom(masm.GetBuffer()->GetStartAddress<Instruction*>());
2851
2852  fclose(trace_stream);
2853  MaskAddresses(trace_stream_filename);
2854
2855  bool trace_matched_reference;
2856  if (Test::generate_test_trace()) {
2857    // Copy trace_stream to stdout.
2858    trace_stream = fopen(trace_stream_filename, "r");
2859    VIXL_ASSERT(trace_stream != NULL);
2860    fseek(trace_stream, 0, SEEK_SET);
2861    int c;
2862    while (1) {
2863      c = getc(trace_stream);
2864      if (c == EOF) break;
2865      putc(c, stdout);
2866    }
2867    fclose(trace_stream);
2868    trace_matched_reference = true;
2869  } else {
2870    // Check trace_stream against ref_file.
2871    char command[1024];
2872    size_t length = snprintf(command,
2873                             sizeof(command),
2874                             "diff -u %s %s",
2875                             ref_file,
2876                             trace_stream_filename);
2877    VIXL_CHECK(length < sizeof(command));
2878    trace_matched_reference = (system(command) == 0);
2879  }
2880
2881  uint64_t offset_base = simulator.ReadRegister<uint64_t>(0);
2882  uint64_t index_base = simulator.ReadRegister<uint64_t>(1);
2883
2884  // Clean up before checking the result; VIXL_CHECK aborts.
2885  remove(trace_stream_filename);
2886
2887  VIXL_CHECK(trace_matched_reference);
2888  VIXL_CHECK(index_base >= offset_base);
2889  VIXL_CHECK((index_base - offset_base) <= kScratchSize);
2890}
2891
2892
2893#define REF(name) "test/test-trace-reference/" name
2894
2895// Test individual options.
2896TEST(disasm) { TraceTestHelper(false, LOG_DISASM, REF("log-disasm")); }
2897TEST(regs) { TraceTestHelper(false, LOG_REGS, REF("log-regs")); }
2898TEST(vregs) { TraceTestHelper(false, LOG_VREGS, REF("log-vregs")); }
2899TEST(sysregs) { TraceTestHelper(false, LOG_SYSREGS, REF("log-sysregs")); }
2900TEST(write) { TraceTestHelper(false, LOG_WRITE, REF("log-write")); }
2901TEST(branch) { TraceTestHelper(false, LOG_WRITE, REF("log-branch")); }
2902
2903// Test standard combinations.
2904TEST(none) { TraceTestHelper(false, LOG_NONE, REF("log-none")); }
2905TEST(state) { TraceTestHelper(false, LOG_STATE, REF("log-state")); }
2906TEST(all) { TraceTestHelper(false, LOG_ALL, REF("log-all")); }
2907
2908
2909// Test individual options (with colour).
2910TEST(disasm_colour) {
2911  TraceTestHelper(true, LOG_DISASM, REF("log-disasm-colour"));
2912}
2913TEST(regs_colour) { TraceTestHelper(true, LOG_REGS, REF("log-regs-colour")); }
2914TEST(vregs_colour) {
2915  TraceTestHelper(true, LOG_VREGS, REF("log-vregs-colour"));
2916}
2917TEST(sysregs_colour) {
2918  TraceTestHelper(true, LOG_SYSREGS, REF("log-sysregs-colour"));
2919}
2920TEST(write_colour) {
2921  TraceTestHelper(true, LOG_WRITE, REF("log-write-colour"));
2922}
2923TEST(branch_colour) {
2924  TraceTestHelper(true, LOG_WRITE, REF("log-branch-colour"));
2925}
2926
2927// Test standard combinations (with colour).
2928TEST(none_colour) { TraceTestHelper(true, LOG_NONE, REF("log-none-colour")); }
2929TEST(state_colour) {
2930  TraceTestHelper(true, LOG_STATE, REF("log-state-colour"));
2931}
2932TEST(all_colour) { TraceTestHelper(true, LOG_ALL, REF("log-all-colour")); }
2933
2934
2935#endif  // VIXL_INCLUDE_SIMULATOR_AARCH64
2936}  // namespace aarch64
2937}  // namespace vixl
2938