1/*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "intrinsics_x86_64.h"
18
19#include <limits>
20
21#include "arch/x86_64/instruction_set_features_x86_64.h"
22#include "art_method-inl.h"
23#include "code_generator_x86_64.h"
24#include "entrypoints/quick/quick_entrypoints.h"
25#include "intrinsics.h"
26#include "mirror/array-inl.h"
27#include "mirror/string.h"
28#include "thread.h"
29#include "utils/x86_64/assembler_x86_64.h"
30#include "utils/x86_64/constants_x86_64.h"
31
32namespace art {
33
34namespace x86_64 {
35
36IntrinsicLocationsBuilderX86_64::IntrinsicLocationsBuilderX86_64(CodeGeneratorX86_64* codegen)
37  : arena_(codegen->GetGraph()->GetArena()), codegen_(codegen) {
38}
39
40
41X86_64Assembler* IntrinsicCodeGeneratorX86_64::GetAssembler() {
42  return reinterpret_cast<X86_64Assembler*>(codegen_->GetAssembler());
43}
44
45ArenaAllocator* IntrinsicCodeGeneratorX86_64::GetAllocator() {
46  return codegen_->GetGraph()->GetArena();
47}
48
49bool IntrinsicLocationsBuilderX86_64::TryDispatch(HInvoke* invoke) {
50  Dispatch(invoke);
51  const LocationSummary* res = invoke->GetLocations();
52  return res != nullptr && res->Intrinsified();
53}
54
55#define __ reinterpret_cast<X86_64Assembler*>(codegen->GetAssembler())->
56
57// TODO: trg as memory.
58static void MoveFromReturnRegister(Location trg,
59                                   Primitive::Type type,
60                                   CodeGeneratorX86_64* codegen) {
61  if (!trg.IsValid()) {
62    DCHECK(type == Primitive::kPrimVoid);
63    return;
64  }
65
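  // RAX (integer/reference) and XMM0 (floating point) are the return registers on x86-64; only
  // emit a move when the target register differs.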
66  switch (type) {
67    case Primitive::kPrimBoolean:
68    case Primitive::kPrimByte:
69    case Primitive::kPrimChar:
70    case Primitive::kPrimShort:
71    case Primitive::kPrimInt:
72    case Primitive::kPrimNot: {
73      CpuRegister trg_reg = trg.AsRegister<CpuRegister>();
74      if (trg_reg.AsRegister() != RAX) {
75        __ movl(trg_reg, CpuRegister(RAX));
76      }
77      break;
78    }
79    case Primitive::kPrimLong: {
80      CpuRegister trg_reg = trg.AsRegister<CpuRegister>();
81      if (trg_reg.AsRegister() != RAX) {
82        __ movq(trg_reg, CpuRegister(RAX));
83      }
84      break;
85    }
86
87    case Primitive::kPrimVoid:
88      LOG(FATAL) << "Unexpected void type for valid location " << trg;
89      UNREACHABLE();
90
91    case Primitive::kPrimDouble: {
92      XmmRegister trg_reg = trg.AsFpuRegister<XmmRegister>();
93      if (trg_reg.AsFloatRegister() != XMM0) {
94        __ movsd(trg_reg, XmmRegister(XMM0));
95      }
96      break;
97    }
98    case Primitive::kPrimFloat: {
99      XmmRegister trg_reg = trg.AsFpuRegister<XmmRegister>();
100      if (trg_reg.AsFloatRegister() != XMM0) {
101        __ movss(trg_reg, XmmRegister(XMM0));
102      }
103      break;
104    }
105  }
106}
107
108static void MoveArguments(HInvoke* invoke, CodeGeneratorX86_64* codegen) {
109  InvokeDexCallingConventionVisitorX86_64 calling_convention_visitor;
110  IntrinsicVisitor::MoveArguments(invoke, codegen, &calling_convention_visitor);
111}
112
113// Slow-path for fallback (calling the managed code to handle the intrinsic) in an intrinsified
114// call. This will copy the arguments into the positions for a regular call.
115//
116// Note: The actual parameters are required to be in the locations given by the invoke's location
117//       summary. If an intrinsic modifies those locations before a slowpath call, they must be
118//       restored!
119class IntrinsicSlowPathX86_64 : public SlowPathCodeX86_64 {
120 public:
121  explicit IntrinsicSlowPathX86_64(HInvoke* invoke) : invoke_(invoke) { }
122
123  void EmitNativeCode(CodeGenerator* codegen_in) OVERRIDE {
124    CodeGeneratorX86_64* codegen = down_cast<CodeGeneratorX86_64*>(codegen_in);
125    __ Bind(GetEntryLabel());
126
127    SaveLiveRegisters(codegen, invoke_->GetLocations());
128
129    MoveArguments(invoke_, codegen);
130
131    if (invoke_->IsInvokeStaticOrDirect()) {
132      codegen->GenerateStaticOrDirectCall(invoke_->AsInvokeStaticOrDirect(), CpuRegister(RDI));
133      RecordPcInfo(codegen, invoke_, invoke_->GetDexPc());
134    } else {
135      UNIMPLEMENTED(FATAL) << "Non-direct intrinsic slow-path not yet implemented";
136      UNREACHABLE();
137    }
138
139    // Copy the result back to the expected output.
140    Location out = invoke_->GetLocations()->Out();
141    if (out.IsValid()) {
142      DCHECK(out.IsRegister());  // TODO: Replace this when we support output in memory.
143      DCHECK(!invoke_->GetLocations()->GetLiveRegisters()->ContainsCoreRegister(out.reg()));
144      MoveFromReturnRegister(out, invoke_->GetType(), codegen);
145    }
146
147    RestoreLiveRegisters(codegen, invoke_->GetLocations());
148    __ jmp(GetExitLabel());
149  }
150
151 private:
152  // The instruction where this slow path is happening.
153  HInvoke* const invoke_;
154
155  DISALLOW_COPY_AND_ASSIGN(IntrinsicSlowPathX86_64);
156};
157
158#undef __
159#define __ assembler->
160
161static void CreateFPToIntLocations(ArenaAllocator* arena, HInvoke* invoke) {
162  LocationSummary* locations = new (arena) LocationSummary(invoke,
163                                                           LocationSummary::kNoCall,
164                                                           kIntrinsified);
165  locations->SetInAt(0, Location::RequiresFpuRegister());
166  locations->SetOut(Location::RequiresRegister());
167}
168
169static void CreateIntToFPLocations(ArenaAllocator* arena, HInvoke* invoke) {
170  LocationSummary* locations = new (arena) LocationSummary(invoke,
171                                                           LocationSummary::kNoCall,
172                                                           kIntrinsified);
173  locations->SetInAt(0, Location::RequiresRegister());
174  locations->SetOut(Location::RequiresFpuRegister());
175}
176
177static void MoveFPToInt(LocationSummary* locations, bool is64bit, X86_64Assembler* assembler) {
178  Location input = locations->InAt(0);
179  Location output = locations->Out();
180  __ movd(output.AsRegister<CpuRegister>(), input.AsFpuRegister<XmmRegister>(), is64bit);
181}
182
183static void MoveIntToFP(LocationSummary* locations, bool is64bit, X86_64Assembler* assembler) {
184  Location input = locations->InAt(0);
185  Location output = locations->Out();
186  __ movd(output.AsFpuRegister<XmmRegister>(), input.AsRegister<CpuRegister>(), is64bit);
187}
188
189void IntrinsicLocationsBuilderX86_64::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) {
190  CreateFPToIntLocations(arena_, invoke);
191}
192void IntrinsicLocationsBuilderX86_64::VisitDoubleLongBitsToDouble(HInvoke* invoke) {
193  CreateIntToFPLocations(arena_, invoke);
194}
195
196void IntrinsicCodeGeneratorX86_64::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) {
197  MoveFPToInt(invoke->GetLocations(), true, GetAssembler());
198}
199void IntrinsicCodeGeneratorX86_64::VisitDoubleLongBitsToDouble(HInvoke* invoke) {
200  MoveIntToFP(invoke->GetLocations(), true, GetAssembler());
201}
202
203void IntrinsicLocationsBuilderX86_64::VisitFloatFloatToRawIntBits(HInvoke* invoke) {
204  CreateFPToIntLocations(arena_, invoke);
205}
206void IntrinsicLocationsBuilderX86_64::VisitFloatIntBitsToFloat(HInvoke* invoke) {
207  CreateIntToFPLocations(arena_, invoke);
208}
209
210void IntrinsicCodeGeneratorX86_64::VisitFloatFloatToRawIntBits(HInvoke* invoke) {
211  MoveFPToInt(invoke->GetLocations(), false, GetAssembler());
212}
213void IntrinsicCodeGeneratorX86_64::VisitFloatIntBitsToFloat(HInvoke* invoke) {
214  MoveIntToFP(invoke->GetLocations(), false, GetAssembler());
215}
216
217static void CreateIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) {
218  LocationSummary* locations = new (arena) LocationSummary(invoke,
219                                                           LocationSummary::kNoCall,
220                                                           kIntrinsified);
221  locations->SetInAt(0, Location::RequiresRegister());
222  locations->SetOut(Location::SameAsFirstInput());
223}
224
225static void GenReverseBytes(LocationSummary* locations,
226                            Primitive::Type size,
227                            X86_64Assembler* assembler) {
228  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
229
230  switch (size) {
231    case Primitive::kPrimShort:
232      // TODO: Can be done with an xchg of 8b registers. This is straight from Quick.
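      // bswapl reverses all four bytes, leaving the reversed 16-bit value in the upper half of the
      // register; the arithmetic shift by 16 moves it back down and sign-extends it.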
233      __ bswapl(out);
234      __ sarl(out, Immediate(16));
235      break;
236    case Primitive::kPrimInt:
237      __ bswapl(out);
238      break;
239    case Primitive::kPrimLong:
240      __ bswapq(out);
241      break;
242    default:
243      LOG(FATAL) << "Unexpected size for reverse-bytes: " << size;
244      UNREACHABLE();
245  }
246}
247
248void IntrinsicLocationsBuilderX86_64::VisitIntegerReverseBytes(HInvoke* invoke) {
249  CreateIntToIntLocations(arena_, invoke);
250}
251
252void IntrinsicCodeGeneratorX86_64::VisitIntegerReverseBytes(HInvoke* invoke) {
253  GenReverseBytes(invoke->GetLocations(), Primitive::kPrimInt, GetAssembler());
254}
255
256void IntrinsicLocationsBuilderX86_64::VisitLongReverseBytes(HInvoke* invoke) {
257  CreateIntToIntLocations(arena_, invoke);
258}
259
260void IntrinsicCodeGeneratorX86_64::VisitLongReverseBytes(HInvoke* invoke) {
261  GenReverseBytes(invoke->GetLocations(), Primitive::kPrimLong, GetAssembler());
262}
263
264void IntrinsicLocationsBuilderX86_64::VisitShortReverseBytes(HInvoke* invoke) {
265  CreateIntToIntLocations(arena_, invoke);
266}
267
268void IntrinsicCodeGeneratorX86_64::VisitShortReverseBytes(HInvoke* invoke) {
269  GenReverseBytes(invoke->GetLocations(), Primitive::kPrimShort, GetAssembler());
270}
271
272
273// TODO: Consider Quick's way of doing Double abs through integer operations, as the immediate we
274//       need is 64b.
275
276static void CreateFloatToFloatPlusTemps(ArenaAllocator* arena, HInvoke* invoke) {
277  // TODO: Enable memory operations when the assembler supports them.
278  LocationSummary* locations = new (arena) LocationSummary(invoke,
279                                                           LocationSummary::kNoCall,
280                                                           kIntrinsified);
281  locations->SetInAt(0, Location::RequiresFpuRegister());
282  // TODO: Allow x86 to work with memory. This requires assembler support, see below.
283  // locations->SetInAt(0, Location::Any());               // X86 can work on memory directly.
284  locations->SetOut(Location::SameAsFirstInput());
285  locations->AddTemp(Location::RequiresFpuRegister());  // FP reg to hold mask.
286}
287
288static void MathAbsFP(LocationSummary* locations,
289                      bool is64bit,
290                      X86_64Assembler* assembler,
291                      CodeGeneratorX86_64* codegen) {
292  Location output = locations->Out();
293
294  if (output.IsFpuRegister()) {
295    // In-register
296    XmmRegister xmm_temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
297
298    // TODO: Can mask directly with constant area using pand if we can guarantee
299    // that the literal is aligned on a 16 byte boundary.  This will avoid a
300    // temporary.
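    // Each mask has every bit set except the IEEE-754 sign bit, so the 'and' below clears only the
    // sign and leaves |x| (a NaN stays a NaN).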
301    if (is64bit) {
302      __ movsd(xmm_temp, codegen->LiteralInt64Address(INT64_C(0x7FFFFFFFFFFFFFFF)));
303      __ andpd(output.AsFpuRegister<XmmRegister>(), xmm_temp);
304    } else {
305      __ movss(xmm_temp, codegen->LiteralInt32Address(INT32_C(0x7FFFFFFF)));
306      __ andps(output.AsFpuRegister<XmmRegister>(), xmm_temp);
307    }
308  } else {
309    // TODO: update when assembler support is available.
310    UNIMPLEMENTED(FATAL) << "Needs assembler support.";
311//  Once assembler support is available, in-memory operations look like this:
312//    if (is64bit) {
313//      DCHECK(output.IsDoubleStackSlot());
314//      // No 64-bit 'and' with an immediate literal; the mask has to go through a temp register.
315//      __ movq(cpu_temp, Immediate(INT64_C(0x7FFFFFFFFFFFFFFF)));
316//      __ andq(Address(CpuRegister(RSP), output.GetStackIndex()), cpu_temp);
317//    } else {
318//      DCHECK(output.IsStackSlot());
319//      // Can use and with a literal directly.
320//      __ andl(Address(CpuRegister(RSP), output.GetStackIndex()), Immediate(INT64_C(0x7FFFFFFF)));
321//    }
322  }
323}
324
325void IntrinsicLocationsBuilderX86_64::VisitMathAbsDouble(HInvoke* invoke) {
326  CreateFloatToFloatPlusTemps(arena_, invoke);
327}
328
329void IntrinsicCodeGeneratorX86_64::VisitMathAbsDouble(HInvoke* invoke) {
330  MathAbsFP(invoke->GetLocations(), true, GetAssembler(), codegen_);
331}
332
333void IntrinsicLocationsBuilderX86_64::VisitMathAbsFloat(HInvoke* invoke) {
334  CreateFloatToFloatPlusTemps(arena_, invoke);
335}
336
337void IntrinsicCodeGeneratorX86_64::VisitMathAbsFloat(HInvoke* invoke) {
338  MathAbsFP(invoke->GetLocations(), false, GetAssembler(), codegen_);
339}
340
341static void CreateIntToIntPlusTemp(ArenaAllocator* arena, HInvoke* invoke) {
342  LocationSummary* locations = new (arena) LocationSummary(invoke,
343                                                           LocationSummary::kNoCall,
344                                                           kIntrinsified);
345  locations->SetInAt(0, Location::RequiresRegister());
346  locations->SetOut(Location::SameAsFirstInput());
347  locations->AddTemp(Location::RequiresRegister());
348}
349
350static void GenAbsInteger(LocationSummary* locations, bool is64bit, X86_64Assembler* assembler) {
351  Location output = locations->Out();
352  CpuRegister out = output.AsRegister<CpuRegister>();
353  CpuRegister mask = locations->GetTemp(0).AsRegister<CpuRegister>();
354
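  // Branchless abs: mask = x >> 63 (or 31) is all ones for negative x and zero otherwise, so
  // (x + mask) ^ mask yields -x for negative inputs and x otherwise. MIN_VALUE maps to itself,
  // as Math.abs requires.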
355  if (is64bit) {
356    // Create mask.
357    __ movq(mask, out);
358    __ sarq(mask, Immediate(63));
359    // Add mask.
360    __ addq(out, mask);
361    __ xorq(out, mask);
362  } else {
363    // Create mask.
364    __ movl(mask, out);
365    __ sarl(mask, Immediate(31));
366    // Add mask.
367    __ addl(out, mask);
368    __ xorl(out, mask);
369  }
370}
371
372void IntrinsicLocationsBuilderX86_64::VisitMathAbsInt(HInvoke* invoke) {
373  CreateIntToIntPlusTemp(arena_, invoke);
374}
375
376void IntrinsicCodeGeneratorX86_64::VisitMathAbsInt(HInvoke* invoke) {
377  GenAbsInteger(invoke->GetLocations(), false, GetAssembler());
378}
379
380void IntrinsicLocationsBuilderX86_64::VisitMathAbsLong(HInvoke* invoke) {
381  CreateIntToIntPlusTemp(arena_, invoke);
382}
383
384void IntrinsicCodeGeneratorX86_64::VisitMathAbsLong(HInvoke* invoke) {
385  GenAbsInteger(invoke->GetLocations(), true, GetAssembler());
386}
387
388static void GenMinMaxFP(LocationSummary* locations,
389                        bool is_min,
390                        bool is_double,
391                        X86_64Assembler* assembler,
392                        CodeGeneratorX86_64* codegen) {
393  Location op1_loc = locations->InAt(0);
394  Location op2_loc = locations->InAt(1);
395  Location out_loc = locations->Out();
396  XmmRegister out = out_loc.AsFpuRegister<XmmRegister>();
397
398  // Shortcut for same input locations.
399  if (op1_loc.Equals(op2_loc)) {
400    DCHECK(out_loc.Equals(op1_loc));
401    return;
402  }
403
404  //  (out := op1)
405  //  out <=? op2
406  //  if NaN jmp NaN_label
407  //  if out is min jmp done
408  //  if op2 is min jmp op2_label
409  //  handle -0/+0
410  //  jmp done
411  // NaN_label:
412  //  out := NaN
413  // op2_label:
414  //  out := op2
415  // done:
416  //
417  // This removes one jmp, but needs to copy one input (op1) to out.
418  //
419  // TODO: This is straight from Quick. Make NaN an out-of-line slowpath?
420
421  XmmRegister op2 = op2_loc.AsFpuRegister<XmmRegister>();
422
423  Label nan, done, op2_label;
424  if (is_double) {
425    __ ucomisd(out, op2);
426  } else {
427    __ ucomiss(out, op2);
428  }
429
430  __ j(Condition::kParityEven, &nan);
431
432  __ j(is_min ? Condition::kAbove : Condition::kBelow, &op2_label);
433  __ j(is_min ? Condition::kBelow : Condition::kAbove, &done);
434
435  // Handle 0.0/-0.0.
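  // Falling through here means the operands compared equal. OR-ing the bit patterns yields -0.0
  // if either input is -0.0 (correct for min), AND-ing yields +0.0 (correct for max), and equal
  // non-zero inputs are left unchanged either way.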
436  if (is_min) {
437    if (is_double) {
438      __ orpd(out, op2);
439    } else {
440      __ orps(out, op2);
441    }
442  } else {
443    if (is_double) {
444      __ andpd(out, op2);
445    } else {
446      __ andps(out, op2);
447    }
448  }
449  __ jmp(&done);
450
451  // NaN handling.
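  // Load the canonical quiet NaN: Math.min/max return NaN whenever either input is NaN.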
452  __ Bind(&nan);
453  if (is_double) {
454    __ movsd(out, codegen->LiteralInt64Address(INT64_C(0x7FF8000000000000)));
455  } else {
456    __ movss(out, codegen->LiteralInt32Address(INT32_C(0x7FC00000)));
457  }
458  __ jmp(&done);
459
460  // out := op2;
461  __ Bind(&op2_label);
462  if (is_double) {
463    __ movsd(out, op2);
464  } else {
465    __ movss(out, op2);
466  }
467
468  // Done.
469  __ Bind(&done);
470}
471
472static void CreateFPFPToFP(ArenaAllocator* arena, HInvoke* invoke) {
473  LocationSummary* locations = new (arena) LocationSummary(invoke,
474                                                           LocationSummary::kNoCall,
475                                                           kIntrinsified);
476  locations->SetInAt(0, Location::RequiresFpuRegister());
477  locations->SetInAt(1, Location::RequiresFpuRegister());
478  // The following is sub-optimal, but all we can do for now. It would also be fine to accept
479  // the second input as the output (we could simply swap the inputs).
480  locations->SetOut(Location::SameAsFirstInput());
481}
482
483void IntrinsicLocationsBuilderX86_64::VisitMathMinDoubleDouble(HInvoke* invoke) {
484  CreateFPFPToFP(arena_, invoke);
485}
486
487void IntrinsicCodeGeneratorX86_64::VisitMathMinDoubleDouble(HInvoke* invoke) {
488  GenMinMaxFP(invoke->GetLocations(), true, true, GetAssembler(), codegen_);
489}
490
491void IntrinsicLocationsBuilderX86_64::VisitMathMinFloatFloat(HInvoke* invoke) {
492  CreateFPFPToFP(arena_, invoke);
493}
494
495void IntrinsicCodeGeneratorX86_64::VisitMathMinFloatFloat(HInvoke* invoke) {
496  GenMinMaxFP(invoke->GetLocations(), true, false, GetAssembler(), codegen_);
497}
498
499void IntrinsicLocationsBuilderX86_64::VisitMathMaxDoubleDouble(HInvoke* invoke) {
500  CreateFPFPToFP(arena_, invoke);
501}
502
503void IntrinsicCodeGeneratorX86_64::VisitMathMaxDoubleDouble(HInvoke* invoke) {
504  GenMinMaxFP(invoke->GetLocations(), false, true, GetAssembler(), codegen_);
505}
506
507void IntrinsicLocationsBuilderX86_64::VisitMathMaxFloatFloat(HInvoke* invoke) {
508  CreateFPFPToFP(arena_, invoke);
509}
510
511void IntrinsicCodeGeneratorX86_64::VisitMathMaxFloatFloat(HInvoke* invoke) {
512  GenMinMaxFP(invoke->GetLocations(), false, false, GetAssembler(), codegen_);
513}
514
515static void GenMinMax(LocationSummary* locations, bool is_min, bool is_long,
516                      X86_64Assembler* assembler) {
517  Location op1_loc = locations->InAt(0);
518  Location op2_loc = locations->InAt(1);
519
520  // Shortcut for same input locations.
521  if (op1_loc.Equals(op2_loc)) {
522    // Can return immediately, as op1_loc == out_loc.
523    // Note: if we ever support separate registers, e.g., output into memory, we need to check for
524    //       a copy here.
525    DCHECK(locations->Out().Equals(op1_loc));
526    return;
527  }
528
529  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
530  CpuRegister op2 = op2_loc.AsRegister<CpuRegister>();
531
532  //  (out := op1)
533  //  out <=? op2
534  //  if out is min jmp done
535  //  out := op2
536  // done:
537
538  if (is_long) {
539    __ cmpq(out, op2);
540  } else {
541    __ cmpl(out, op2);
542  }
543
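  // 'out' still holds op1; replace it with op2 when op1 loses: for min that is op1 > op2
  // (kGreater), for max op1 < op2 (kLess). The comparison is signed, as int/long min/max require.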
544  __ cmov(is_min ? Condition::kGreater : Condition::kLess, out, op2, is_long);
545}
546
547static void CreateIntIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) {
548  LocationSummary* locations = new (arena) LocationSummary(invoke,
549                                                           LocationSummary::kNoCall,
550                                                           kIntrinsified);
551  locations->SetInAt(0, Location::RequiresRegister());
552  locations->SetInAt(1, Location::RequiresRegister());
553  locations->SetOut(Location::SameAsFirstInput());
554}
555
556void IntrinsicLocationsBuilderX86_64::VisitMathMinIntInt(HInvoke* invoke) {
557  CreateIntIntToIntLocations(arena_, invoke);
558}
559
560void IntrinsicCodeGeneratorX86_64::VisitMathMinIntInt(HInvoke* invoke) {
561  GenMinMax(invoke->GetLocations(), true, false, GetAssembler());
562}
563
564void IntrinsicLocationsBuilderX86_64::VisitMathMinLongLong(HInvoke* invoke) {
565  CreateIntIntToIntLocations(arena_, invoke);
566}
567
568void IntrinsicCodeGeneratorX86_64::VisitMathMinLongLong(HInvoke* invoke) {
569  GenMinMax(invoke->GetLocations(), true, true, GetAssembler());
570}
571
572void IntrinsicLocationsBuilderX86_64::VisitMathMaxIntInt(HInvoke* invoke) {
573  CreateIntIntToIntLocations(arena_, invoke);
574}
575
576void IntrinsicCodeGeneratorX86_64::VisitMathMaxIntInt(HInvoke* invoke) {
577  GenMinMax(invoke->GetLocations(), false, false, GetAssembler());
578}
579
580void IntrinsicLocationsBuilderX86_64::VisitMathMaxLongLong(HInvoke* invoke) {
581  CreateIntIntToIntLocations(arena_, invoke);
582}
583
584void IntrinsicCodeGeneratorX86_64::VisitMathMaxLongLong(HInvoke* invoke) {
585  GenMinMax(invoke->GetLocations(), false, true, GetAssembler());
586}
587
588static void CreateFPToFPLocations(ArenaAllocator* arena, HInvoke* invoke) {
589  LocationSummary* locations = new (arena) LocationSummary(invoke,
590                                                           LocationSummary::kNoCall,
591                                                           kIntrinsified);
592  locations->SetInAt(0, Location::RequiresFpuRegister());
593  locations->SetOut(Location::RequiresFpuRegister());
594}
595
596void IntrinsicLocationsBuilderX86_64::VisitMathSqrt(HInvoke* invoke) {
597  CreateFPToFPLocations(arena_, invoke);
598}
599
600void IntrinsicCodeGeneratorX86_64::VisitMathSqrt(HInvoke* invoke) {
601  LocationSummary* locations = invoke->GetLocations();
602  XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
603  XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
604
605  GetAssembler()->sqrtsd(out, in);
606}
607
608static void InvokeOutOfLineIntrinsic(CodeGeneratorX86_64* codegen, HInvoke* invoke) {
609  MoveArguments(invoke, codegen);
610
611  DCHECK(invoke->IsInvokeStaticOrDirect());
612  codegen->GenerateStaticOrDirectCall(invoke->AsInvokeStaticOrDirect(), CpuRegister(RDI));
613  codegen->RecordPcInfo(invoke, invoke->GetDexPc());
614
615  // Copy the result back to the expected output.
616  Location out = invoke->GetLocations()->Out();
617  if (out.IsValid()) {
618    DCHECK(out.IsRegister());
619    MoveFromReturnRegister(out, invoke->GetType(), codegen);
620  }
621}
622
623static void CreateSSE41FPToFPLocations(ArenaAllocator* arena,
624                                      HInvoke* invoke,
625                                      CodeGeneratorX86_64* codegen) {
626  // Do we have instruction support?
627  if (codegen->GetInstructionSetFeatures().HasSSE4_1()) {
628    CreateFPToFPLocations(arena, invoke);
629    return;
630  }
631
632  // We have to fall back to a call to the intrinsic.
633  LocationSummary* locations = new (arena) LocationSummary(invoke,
634                                                           LocationSummary::kCall);
635  InvokeRuntimeCallingConvention calling_convention;
636  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0)));
637  locations->SetOut(Location::FpuRegisterLocation(XMM0));
638  // Needs to be RDI for the invoke: GenerateStaticOrDirectCall expects the method register there.
639  locations->AddTemp(Location::RegisterLocation(RDI));
640}
641
642static void GenSSE41FPToFPIntrinsic(CodeGeneratorX86_64* codegen,
643                                   HInvoke* invoke,
644                                   X86_64Assembler* assembler,
645                                   int round_mode) {
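  // round_mode is the SSE4.1 ROUNDSD immediate: 0 = round to nearest even (rint), 1 = round
  // toward negative infinity (floor), 2 = round toward positive infinity (ceil).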
646  LocationSummary* locations = invoke->GetLocations();
647  if (locations->WillCall()) {
648    InvokeOutOfLineIntrinsic(codegen, invoke);
649  } else {
650    XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
651    XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
652    __ roundsd(out, in, Immediate(round_mode));
653  }
654}
655
656void IntrinsicLocationsBuilderX86_64::VisitMathCeil(HInvoke* invoke) {
657  CreateSSE41FPToFPLocations(arena_, invoke, codegen_);
658}
659
660void IntrinsicCodeGeneratorX86_64::VisitMathCeil(HInvoke* invoke) {
661  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 2);
662}
663
664void IntrinsicLocationsBuilderX86_64::VisitMathFloor(HInvoke* invoke) {
665  CreateSSE41FPToFPLocations(arena_, invoke, codegen_);
666}
667
668void IntrinsicCodeGeneratorX86_64::VisitMathFloor(HInvoke* invoke) {
669  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 1);
670}
671
672void IntrinsicLocationsBuilderX86_64::VisitMathRint(HInvoke* invoke) {
673  CreateSSE41FPToFPLocations(arena_, invoke, codegen_);
674}
675
676void IntrinsicCodeGeneratorX86_64::VisitMathRint(HInvoke* invoke) {
677  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 0);
678}
679
680static void CreateSSE41FPToIntLocations(ArenaAllocator* arena,
681                                       HInvoke* invoke,
682                                       CodeGeneratorX86_64* codegen) {
683  // Do we have instruction support?
684  if (codegen->GetInstructionSetFeatures().HasSSE4_1()) {
685    LocationSummary* locations = new (arena) LocationSummary(invoke,
686                                                              LocationSummary::kNoCall,
687                                                              kIntrinsified);
688    locations->SetInAt(0, Location::RequiresFpuRegister());
689    locations->SetOut(Location::RequiresRegister());
690    locations->AddTemp(Location::RequiresFpuRegister());
691    return;
692  }
693
694  // We have to fall back to a call to the intrinsic.
695  LocationSummary* locations = new (arena) LocationSummary(invoke,
696                                                           LocationSummary::kCall);
697  InvokeRuntimeCallingConvention calling_convention;
698  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0)));
699  locations->SetOut(Location::RegisterLocation(RAX));
700  // Needs to be RDI for the invoke.
701  locations->AddTemp(Location::RegisterLocation(RDI));
702}
703
704void IntrinsicLocationsBuilderX86_64::VisitMathRoundFloat(HInvoke* invoke) {
705  CreateSSE41FPToIntLocations(arena_, invoke, codegen_);
706}
707
708void IntrinsicCodeGeneratorX86_64::VisitMathRoundFloat(HInvoke* invoke) {
709  LocationSummary* locations = invoke->GetLocations();
710  if (locations->WillCall()) {
711    InvokeOutOfLineIntrinsic(codegen_, invoke);
712    return;
713  }
714
715  // Implement RoundFloat as t1 = floor(input + 0.5f);  convert to int.
716  XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
717  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
718  XmmRegister inPlusPointFive = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
719  Label done, nan;
720  X86_64Assembler* assembler = GetAssembler();
721
722  // Load 0.5 into inPlusPointFive.
723  __ movss(inPlusPointFive, codegen_->LiteralFloatAddress(0.5f));
724
725  // Add in the input.
726  __ addss(inPlusPointFive, in);
727
728  // And round it down to an integer (floor).
729  __ roundss(inPlusPointFive, inPlusPointFive, Immediate(1));
730
731  // Load maxInt into out.
732  codegen_->Load64BitValue(out, kPrimIntMax);
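  // Pre-loading kPrimIntMax lets the saturating (>= maxInt) comparison below jump straight to
  // 'done' with the clamped result already in place.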
733
734  // if inPlusPointFive >= maxInt goto done
735  __ comiss(inPlusPointFive, codegen_->LiteralFloatAddress(static_cast<float>(kPrimIntMax)));
736  __ j(kAboveEqual, &done);
737
738  // if input == NaN goto nan
739  __ j(kUnordered, &nan);
740
741  // output = float-to-int-truncate(input)
742  __ cvttss2si(out, inPlusPointFive);
743  __ jmp(&done);
744  __ Bind(&nan);
745
746  //  output = 0
747  __ xorl(out, out);
748  __ Bind(&done);
749}
750
751void IntrinsicLocationsBuilderX86_64::VisitMathRoundDouble(HInvoke* invoke) {
752  CreateSSE41FPToIntLocations(arena_, invoke, codegen_);
753}
754
755void IntrinsicCodeGeneratorX86_64::VisitMathRoundDouble(HInvoke* invoke) {
756  LocationSummary* locations = invoke->GetLocations();
757  if (locations->WillCall()) {
758    InvokeOutOfLineIntrinsic(codegen_, invoke);
759    return;
760  }
761
762  // Implement RoundDouble as t1 = floor(input + 0.5);  convert to long.
763  XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
764  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
765  XmmRegister inPlusPointFive = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
766  Label done, nan;
767  X86_64Assembler* assembler = GetAssembler();
768
769  // Load 0.5 into inPlusPointFive.
770  __ movsd(inPlusPointFive, codegen_->LiteralDoubleAddress(0.5));
771
772  // Add in the input.
773  __ addsd(inPlusPointFive, in);
774
775  // And round it down to an integer (floor).
776  __ roundsd(inPlusPointFive, inPlusPointFive, Immediate(1));
777
778  // Load maxLong into out.
779  codegen_->Load64BitValue(out, kPrimLongMax);
780
781  // if inPlusPointFive >= maxLong goto done
782  __ comisd(inPlusPointFive, codegen_->LiteralDoubleAddress(static_cast<double>(kPrimLongMax)));
783  __ j(kAboveEqual, &done);
784
785  // if input == NaN goto nan
786  __ j(kUnordered, &nan);
787
788  // output = double-to-long-truncate(input)
789  __ cvttsd2si(out, inPlusPointFive, true);
790  __ jmp(&done);
791  __ Bind(&nan);
792
793  //  output = 0
794  __ xorl(out, out);
795  __ Bind(&done);
796}
797
798void IntrinsicLocationsBuilderX86_64::VisitStringCharAt(HInvoke* invoke) {
799  // The inputs plus one temp.
800  LocationSummary* locations = new (arena_) LocationSummary(invoke,
801                                                            LocationSummary::kCallOnSlowPath,
802                                                            kIntrinsified);
803  locations->SetInAt(0, Location::RequiresRegister());
804  locations->SetInAt(1, Location::RequiresRegister());
805  locations->SetOut(Location::SameAsFirstInput());
806  locations->AddTemp(Location::RequiresRegister());
807}
808
809void IntrinsicCodeGeneratorX86_64::VisitStringCharAt(HInvoke* invoke) {
810  LocationSummary* locations = invoke->GetLocations();
811
812  // Location of reference to data array
813  const int32_t value_offset = mirror::String::ValueOffset().Int32Value();
814  // Location of count
815  const int32_t count_offset = mirror::String::CountOffset().Int32Value();
816
817  CpuRegister obj = locations->InAt(0).AsRegister<CpuRegister>();
818  CpuRegister idx = locations->InAt(1).AsRegister<CpuRegister>();
819  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
820
821  // TODO: Maybe we can support range check elimination. Overall, though, I think it's not worth
822  //       the cost.
823  // TODO: For simplicity, the index parameter is requested in a register, so unlike Quick we do
824  //       not optimize the code for constants (which would save a register).
825
826  SlowPathCodeX86_64* slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke);
827  codegen_->AddSlowPath(slow_path);
828
829  X86_64Assembler* assembler = GetAssembler();
830
831  __ cmpl(idx, Address(obj, count_offset));
832  codegen_->MaybeRecordImplicitNullCheck(invoke);
833  __ j(kAboveEqual, slow_path->GetEntryLabel());
834
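  // 'out' is the same register as 'obj' (the output is SameAsFirstInput), so this reads the
  // UTF-16 unit at obj + value_offset + 2 * idx.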
835  // out = out[2*idx].
836  __ movzxw(out, Address(out, idx, ScaleFactor::TIMES_2, value_offset));
837
838  __ Bind(slow_path->GetExitLabel());
839}
840
841void IntrinsicLocationsBuilderX86_64::VisitStringCompareTo(HInvoke* invoke) {
842  LocationSummary* locations = new (arena_) LocationSummary(invoke,
843                                                            LocationSummary::kCall,
844                                                            kIntrinsified);
845  InvokeRuntimeCallingConvention calling_convention;
846  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
847  locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
848  locations->SetOut(Location::RegisterLocation(RAX));
849}
850
851void IntrinsicCodeGeneratorX86_64::VisitStringCompareTo(HInvoke* invoke) {
852  X86_64Assembler* assembler = GetAssembler();
853  LocationSummary* locations = invoke->GetLocations();
854
855  // Note that the null check must have been done earlier.
856  DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
857
858  CpuRegister argument = locations->InAt(1).AsRegister<CpuRegister>();
859  __ testl(argument, argument);
860  SlowPathCodeX86_64* slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke);
861  codegen_->AddSlowPath(slow_path);
862  __ j(kEqual, slow_path->GetEntryLabel());
863
864  __ gs()->call(Address::Absolute(
865        QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pStringCompareTo), true));
866  __ Bind(slow_path->GetExitLabel());
867}
868
869static void CreateStringIndexOfLocations(HInvoke* invoke,
870                                         ArenaAllocator* allocator,
871                                         bool start_at_zero) {
872  LocationSummary* locations = new (allocator) LocationSummary(invoke,
873                                                               LocationSummary::kCallOnSlowPath,
874                                                               kIntrinsified);
875  // The data needs to be in RDI for scasw. So request that the string is there, anyways.
876  locations->SetInAt(0, Location::RegisterLocation(RDI));
877  // If we look for a constant char, we'll still have to copy it into RAX. So just request the
878  // allocator to do that, anyways. We can still do the constant check by checking the parameter
879  // of the instruction explicitly.
880  // Note: This works as we don't clobber RAX anywhere.
881  locations->SetInAt(1, Location::RegisterLocation(RAX));
882  if (!start_at_zero) {
883    locations->SetInAt(2, Location::RequiresRegister());          // The starting index.
884  }
885  // As we clobber RDI during execution anyways, also use it as the output.
886  locations->SetOut(Location::SameAsFirstInput());
887
888  // repne scasw uses RCX as the counter.
889  locations->AddTemp(Location::RegisterLocation(RCX));
890  // Need another temporary to be able to compute the result.
891  locations->AddTemp(Location::RequiresRegister());
892}
893
894static void GenerateStringIndexOf(HInvoke* invoke,
895                                  X86_64Assembler* assembler,
896                                  CodeGeneratorX86_64* codegen,
897                                  ArenaAllocator* allocator,
898                                  bool start_at_zero) {
899  LocationSummary* locations = invoke->GetLocations();
900
901  // Note that the null check must have been done earlier.
902  DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
903
904  CpuRegister string_obj = locations->InAt(0).AsRegister<CpuRegister>();
905  CpuRegister search_value = locations->InAt(1).AsRegister<CpuRegister>();
906  CpuRegister counter = locations->GetTemp(0).AsRegister<CpuRegister>();
907  CpuRegister string_length = locations->GetTemp(1).AsRegister<CpuRegister>();
908  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
909
910  // Check our assumptions for registers.
911  DCHECK_EQ(string_obj.AsRegister(), RDI);
912  DCHECK_EQ(search_value.AsRegister(), RAX);
913  DCHECK_EQ(counter.AsRegister(), RCX);
914  DCHECK_EQ(out.AsRegister(), RDI);
915
916  // Check for code points > 0xFFFF. Either a slow-path check when we don't know statically,
917  // or directly dispatch if we have a constant.
918  SlowPathCodeX86_64* slow_path = nullptr;
919  if (invoke->InputAt(1)->IsIntConstant()) {
920    if (static_cast<uint32_t>(invoke->InputAt(1)->AsIntConstant()->GetValue()) >
921    std::numeric_limits<uint16_t>::max()) {
922      // Always needs the slow-path. We could directly dispatch to it, but this case should be
923      // rare, so for simplicity just put the full slow-path down and branch unconditionally.
924      slow_path = new (allocator) IntrinsicSlowPathX86_64(invoke);
925      codegen->AddSlowPath(slow_path);
926      __ jmp(slow_path->GetEntryLabel());
927      __ Bind(slow_path->GetExitLabel());
928      return;
929    }
930  } else {
931    __ cmpl(search_value, Immediate(std::numeric_limits<uint16_t>::max()));
932    slow_path = new (allocator) IntrinsicSlowPathX86_64(invoke);
933    codegen->AddSlowPath(slow_path);
934    __ j(kAbove, slow_path->GetEntryLabel());
935  }
936
937  // From here down, we know that we are looking for a char that fits in 16 bits.
938  // Location of reference to data array within the String object.
939  int32_t value_offset = mirror::String::ValueOffset().Int32Value();
940  // Location of count within the String object.
941  int32_t count_offset = mirror::String::CountOffset().Int32Value();
942
943  // Load string length, i.e., the count field of the string.
944  __ movl(string_length, Address(string_obj, count_offset));
945
946  // Do a length check.
947  // TODO: Support jecxz.
948  Label not_found_label;
949  __ testl(string_length, string_length);
950  __ j(kEqual, &not_found_label);
951
952  if (start_at_zero) {
953    // Number of chars to scan is the same as the string length.
954    __ movl(counter, string_length);
955
956    // Move to the start of the string.
957    __ addq(string_obj, Immediate(value_offset));
958  } else {
959    CpuRegister start_index = locations->InAt(2).AsRegister<CpuRegister>();
960
961    // Do a start_index check.
962    __ cmpl(start_index, string_length);
963    __ j(kGreaterEqual, &not_found_label);
964
965    // Ensure we have a start index >= 0.
966    __ xorl(counter, counter);
967    __ cmpl(start_index, Immediate(0));
968    __ cmov(kGreater, counter, start_index, false);  // 32-bit copy is enough.
969
970    // Move to the start of the string: string_obj + value_offset + 2 * start_index.
971    __ leaq(string_obj, Address(string_obj, counter, ScaleFactor::TIMES_2, value_offset));
972
973    // Now update RCX, the work counter: it becomes string.length - start_index.
974    __ negq(counter);  // Needs to be 64-bit negation, as the address computation is 64-bit.
975    __ leaq(counter, Address(string_length, counter, ScaleFactor::TIMES_1, 0));
976  }
977
978  // Everything is set up for repne scasw:
979  //   * Comparison address in RDI.
980  //   * Counter in ECX.
981  __ repne_scasw();
982
983  // Did we find a match?
984  __ j(kNotEqual, &not_found_label);
985
986  // Yes, we matched.  Compute the index of the result.
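  // repne scasw decremented RCX once per character compared, including the match, so
  // string_length - RCX - 1 is the index of the match. This also holds on the IndexOfAfter path,
  // where RCX started at string.length - start_index.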
987  __ subl(string_length, counter);
988  __ leal(out, Address(string_length, -1));
989
990  Label done;
991  __ jmp(&done);
992
993  // Failed to match; return -1.
994  __ Bind(&not_found_label);
995  __ movl(out, Immediate(-1));
996
997  // And join up at the end.
998  __ Bind(&done);
999  if (slow_path != nullptr) {
1000    __ Bind(slow_path->GetExitLabel());
1001  }
1002}
1003
1004void IntrinsicLocationsBuilderX86_64::VisitStringIndexOf(HInvoke* invoke) {
1005  CreateStringIndexOfLocations(invoke, arena_, true);
1006}
1007
1008void IntrinsicCodeGeneratorX86_64::VisitStringIndexOf(HInvoke* invoke) {
1009  GenerateStringIndexOf(invoke, GetAssembler(), codegen_, GetAllocator(), true);
1010}
1011
1012void IntrinsicLocationsBuilderX86_64::VisitStringIndexOfAfter(HInvoke* invoke) {
1013  CreateStringIndexOfLocations(invoke, arena_, false);
1014}
1015
1016void IntrinsicCodeGeneratorX86_64::VisitStringIndexOfAfter(HInvoke* invoke) {
1017  GenerateStringIndexOf(invoke, GetAssembler(), codegen_, GetAllocator(), false);
1018}
1019
1020void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromBytes(HInvoke* invoke) {
1021  LocationSummary* locations = new (arena_) LocationSummary(invoke,
1022                                                            LocationSummary::kCall,
1023                                                            kIntrinsified);
1024  InvokeRuntimeCallingConvention calling_convention;
1025  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
1026  locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
1027  locations->SetInAt(2, Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
1028  locations->SetInAt(3, Location::RegisterLocation(calling_convention.GetRegisterAt(3)));
1029  locations->SetOut(Location::RegisterLocation(RAX));
1030}
1031
1032void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromBytes(HInvoke* invoke) {
1033  X86_64Assembler* assembler = GetAssembler();
1034  LocationSummary* locations = invoke->GetLocations();
1035
1036  CpuRegister byte_array = locations->InAt(0).AsRegister<CpuRegister>();
1037  __ testl(byte_array, byte_array);
1038  SlowPathCodeX86_64* slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke);
1039  codegen_->AddSlowPath(slow_path);
1040  __ j(kEqual, slow_path->GetEntryLabel());
1041
1042  __ gs()->call(Address::Absolute(
1043        QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pAllocStringFromBytes), true));
1044  codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
1045  __ Bind(slow_path->GetExitLabel());
1046}
1047
1048void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromChars(HInvoke* invoke) {
1049  LocationSummary* locations = new (arena_) LocationSummary(invoke,
1050                                                            LocationSummary::kCall,
1051                                                            kIntrinsified);
1052  InvokeRuntimeCallingConvention calling_convention;
1053  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
1054  locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
1055  locations->SetInAt(2, Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
1056  locations->SetOut(Location::RegisterLocation(RAX));
1057}
1058
1059void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromChars(HInvoke* invoke) {
1060  X86_64Assembler* assembler = GetAssembler();
1061
1062  __ gs()->call(Address::Absolute(
1063        QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pAllocStringFromChars), true));
1064  codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
1065}
1066
1067void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromString(HInvoke* invoke) {
1068  LocationSummary* locations = new (arena_) LocationSummary(invoke,
1069                                                            LocationSummary::kCall,
1070                                                            kIntrinsified);
1071  InvokeRuntimeCallingConvention calling_convention;
1072  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
1073  locations->SetOut(Location::RegisterLocation(RAX));
1074}
1075
1076void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromString(HInvoke* invoke) {
1077  X86_64Assembler* assembler = GetAssembler();
1078  LocationSummary* locations = invoke->GetLocations();
1079
1080  CpuRegister string_to_copy = locations->InAt(0).AsRegister<CpuRegister>();
1081  __ testl(string_to_copy, string_to_copy);
1082  SlowPathCodeX86_64* slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke);
1083  codegen_->AddSlowPath(slow_path);
1084  __ j(kEqual, slow_path->GetEntryLabel());
1085
1086  __ gs()->call(Address::Absolute(
1087        QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pAllocStringFromString), true));
1088  codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
1089  __ Bind(slow_path->GetExitLabel());
1090}
1091
1092static void GenPeek(LocationSummary* locations, Primitive::Type size, X86_64Assembler* assembler) {
1093  CpuRegister address = locations->InAt(0).AsRegister<CpuRegister>();
1094  CpuRegister out = locations->Out().AsRegister<CpuRegister>();  // == address, here for clarity.
1095  // x86 allows unaligned access. We do not have to check the input or use specific instructions
1096  // to avoid a SIGBUS.
1097  switch (size) {
1098    case Primitive::kPrimByte:
1099      __ movsxb(out, Address(address, 0));
1100      break;
1101    case Primitive::kPrimShort:
1102      __ movsxw(out, Address(address, 0));
1103      break;
1104    case Primitive::kPrimInt:
1105      __ movl(out, Address(address, 0));
1106      break;
1107    case Primitive::kPrimLong:
1108      __ movq(out, Address(address, 0));
1109      break;
1110    default:
1111      LOG(FATAL) << "Type not recognized for peek: " << size;
1112      UNREACHABLE();
1113  }
1114}
1115
1116void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekByte(HInvoke* invoke) {
1117  CreateIntToIntLocations(arena_, invoke);
1118}
1119
1120void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekByte(HInvoke* invoke) {
1121  GenPeek(invoke->GetLocations(), Primitive::kPrimByte, GetAssembler());
1122}
1123
1124void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekIntNative(HInvoke* invoke) {
1125  CreateIntToIntLocations(arena_, invoke);
1126}
1127
1128void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekIntNative(HInvoke* invoke) {
1129  GenPeek(invoke->GetLocations(), Primitive::kPrimInt, GetAssembler());
1130}
1131
1132void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekLongNative(HInvoke* invoke) {
1133  CreateIntToIntLocations(arena_, invoke);
1134}
1135
1136void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekLongNative(HInvoke* invoke) {
1137  GenPeek(invoke->GetLocations(), Primitive::kPrimLong, GetAssembler());
1138}
1139
1140void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekShortNative(HInvoke* invoke) {
1141  CreateIntToIntLocations(arena_, invoke);
1142}
1143
1144void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekShortNative(HInvoke* invoke) {
1145  GenPeek(invoke->GetLocations(), Primitive::kPrimShort, GetAssembler());
1146}
1147
1148static void CreateIntIntToVoidLocations(ArenaAllocator* arena, HInvoke* invoke) {
1149  LocationSummary* locations = new (arena) LocationSummary(invoke,
1150                                                           LocationSummary::kNoCall,
1151                                                           kIntrinsified);
1152  locations->SetInAt(0, Location::RequiresRegister());
1153  locations->SetInAt(1, Location::RegisterOrInt32LongConstant(invoke->InputAt(1)));
1154}
1155
1156static void GenPoke(LocationSummary* locations, Primitive::Type size, X86_64Assembler* assembler) {
1157  CpuRegister address = locations->InAt(0).AsRegister<CpuRegister>();
1158  Location value = locations->InAt(1);
1159  // x86 allows unaligned access. We do not have to check the input or use specific instructions
1160  // to avoid a SIGBUS.
1161  switch (size) {
1162    case Primitive::kPrimByte:
1163      if (value.IsConstant()) {
1164        __ movb(Address(address, 0),
1165                Immediate(CodeGenerator::GetInt32ValueOf(value.GetConstant())));
1166      } else {
1167        __ movb(Address(address, 0), value.AsRegister<CpuRegister>());
1168      }
1169      break;
1170    case Primitive::kPrimShort:
1171      if (value.IsConstant()) {
1172        __ movw(Address(address, 0),
1173                Immediate(CodeGenerator::GetInt32ValueOf(value.GetConstant())));
1174      } else {
1175        __ movw(Address(address, 0), value.AsRegister<CpuRegister>());
1176      }
1177      break;
1178    case Primitive::kPrimInt:
1179      if (value.IsConstant()) {
1180        __ movl(Address(address, 0),
1181                Immediate(CodeGenerator::GetInt32ValueOf(value.GetConstant())));
1182      } else {
1183        __ movl(Address(address, 0), value.AsRegister<CpuRegister>());
1184      }
1185      break;
1186    case Primitive::kPrimLong:
1187      if (value.IsConstant()) {
1188        int64_t v = value.GetConstant()->AsLongConstant()->GetValue();
1189        DCHECK(IsInt<32>(v));
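        // movq only takes a sign-extended 32-bit immediate, which the locations builder guarantees
        // here by only allowing int32-representable long constants.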
1190        int32_t v_32 = v;
1191        __ movq(Address(address, 0), Immediate(v_32));
1192      } else {
1193        __ movq(Address(address, 0), value.AsRegister<CpuRegister>());
1194      }
1195      break;
1196    default:
1197      LOG(FATAL) << "Type not recognized for poke: " << size;
1198      UNREACHABLE();
1199  }
1200}
1201
1202void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeByte(HInvoke* invoke) {
1203  CreateIntIntToVoidLocations(arena_, invoke);
1204}
1205
1206void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeByte(HInvoke* invoke) {
1207  GenPoke(invoke->GetLocations(), Primitive::kPrimByte, GetAssembler());
1208}
1209
1210void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeIntNative(HInvoke* invoke) {
1211  CreateIntIntToVoidLocations(arena_, invoke);
1212}
1213
1214void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeIntNative(HInvoke* invoke) {
1215  GenPoke(invoke->GetLocations(), Primitive::kPrimInt, GetAssembler());
1216}
1217
1218void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeLongNative(HInvoke* invoke) {
1219  CreateIntIntToVoidLocations(arena_, invoke);
1220}
1221
1222void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeLongNative(HInvoke* invoke) {
1223  GenPoke(invoke->GetLocations(), Primitive::kPrimLong, GetAssembler());
1224}
1225
1226void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeShortNative(HInvoke* invoke) {
1227  CreateIntIntToVoidLocations(arena_, invoke);
1228}
1229
1230void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeShortNative(HInvoke* invoke) {
1231  GenPoke(invoke->GetLocations(), Primitive::kPrimShort, GetAssembler());
1232}
1233
1234void IntrinsicLocationsBuilderX86_64::VisitThreadCurrentThread(HInvoke* invoke) {
1235  LocationSummary* locations = new (arena_) LocationSummary(invoke,
1236                                                            LocationSummary::kNoCall,
1237                                                            kIntrinsified);
1238  locations->SetOut(Location::RequiresRegister());
1239}
1240
1241void IntrinsicCodeGeneratorX86_64::VisitThreadCurrentThread(HInvoke* invoke) {
1242  CpuRegister out = invoke->GetLocations()->Out().AsRegister<CpuRegister>();
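  // Load the managed java.lang.Thread peer from the runtime Thread object, which is reached
  // through the GS segment on x86-64.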
1243  GetAssembler()->gs()->movl(out, Address::Absolute(Thread::PeerOffset<kX86_64WordSize>(), true));
1244}
1245
1246static void GenUnsafeGet(LocationSummary* locations, Primitive::Type type,
1247                         bool is_volatile ATTRIBUTE_UNUSED, X86_64Assembler* assembler) {
1248  CpuRegister base = locations->InAt(1).AsRegister<CpuRegister>();
1249  CpuRegister offset = locations->InAt(2).AsRegister<CpuRegister>();
1250  CpuRegister trg = locations->Out().AsRegister<CpuRegister>();
1251
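  // Plain x86-64 loads already have acquire semantics, so a volatile get needs no extra fence;
  // that is why is_volatile is unused here.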
1252  switch (type) {
1253    case Primitive::kPrimInt:
1254    case Primitive::kPrimNot:
1255      __ movl(trg, Address(base, offset, ScaleFactor::TIMES_1, 0));
1256      break;
1257
1258    case Primitive::kPrimLong:
1259      __ movq(trg, Address(base, offset, ScaleFactor::TIMES_1, 0));
1260      break;
1261
1262    default:
1263      LOG(FATAL) << "Unsupported op size " << type;
1264      UNREACHABLE();
1265  }
1266}
1267
1268static void CreateIntIntIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) {
1269  LocationSummary* locations = new (arena) LocationSummary(invoke,
1270                                                           LocationSummary::kNoCall,
1271                                                           kIntrinsified);
1272  locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
1273  locations->SetInAt(1, Location::RequiresRegister());
1274  locations->SetInAt(2, Location::RequiresRegister());
1275  locations->SetOut(Location::RequiresRegister());
1276}
1277
1278void IntrinsicLocationsBuilderX86_64::VisitUnsafeGet(HInvoke* invoke) {
1279  CreateIntIntIntToIntLocations(arena_, invoke);
1280}
1281void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetVolatile(HInvoke* invoke) {
1282  CreateIntIntIntToIntLocations(arena_, invoke);
1283}
1284void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetLong(HInvoke* invoke) {
1285  CreateIntIntIntToIntLocations(arena_, invoke);
1286}
1287void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
1288  CreateIntIntIntToIntLocations(arena_, invoke);
1289}
1290void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetObject(HInvoke* invoke) {
1291  CreateIntIntIntToIntLocations(arena_, invoke);
1292}
1293void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
1294  CreateIntIntIntToIntLocations(arena_, invoke);
1295}
1296
1297
1298void IntrinsicCodeGeneratorX86_64::VisitUnsafeGet(HInvoke* invoke) {
1299  GenUnsafeGet(invoke->GetLocations(), Primitive::kPrimInt, false, GetAssembler());
1300}
1301void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetVolatile(HInvoke* invoke) {
1302  GenUnsafeGet(invoke->GetLocations(), Primitive::kPrimInt, true, GetAssembler());
1303}
1304void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetLong(HInvoke* invoke) {
1305  GenUnsafeGet(invoke->GetLocations(), Primitive::kPrimLong, false, GetAssembler());
1306}
1307void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
1308  GenUnsafeGet(invoke->GetLocations(), Primitive::kPrimLong, true, GetAssembler());
1309}
1310void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetObject(HInvoke* invoke) {
1311  GenUnsafeGet(invoke->GetLocations(), Primitive::kPrimNot, false, GetAssembler());
1312}
1313void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
1314  GenUnsafeGet(invoke->GetLocations(), Primitive::kPrimNot, true, GetAssembler());
1315}
1316
1317
1318static void CreateIntIntIntIntToVoidPlusTempsLocations(ArenaAllocator* arena,
1319                                                       Primitive::Type type,
1320                                                       HInvoke* invoke) {
1321  LocationSummary* locations = new (arena) LocationSummary(invoke,
1322                                                           LocationSummary::kNoCall,
1323                                                           kIntrinsified);
1324  locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
1325  locations->SetInAt(1, Location::RequiresRegister());
1326  locations->SetInAt(2, Location::RequiresRegister());
1327  locations->SetInAt(3, Location::RequiresRegister());
1328  if (type == Primitive::kPrimNot) {
1329    // Need temp registers for card-marking.
1330    locations->AddTemp(Location::RequiresRegister());
1331    locations->AddTemp(Location::RequiresRegister());
1332  }
1333}
1334
1335void IntrinsicLocationsBuilderX86_64::VisitUnsafePut(HInvoke* invoke) {
1336  CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimInt, invoke);
1337}
1338void IntrinsicLocationsBuilderX86_64::VisitUnsafePutOrdered(HInvoke* invoke) {
1339  CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimInt, invoke);
1340}
1341void IntrinsicLocationsBuilderX86_64::VisitUnsafePutVolatile(HInvoke* invoke) {
1342  CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimInt, invoke);
1343}
1344void IntrinsicLocationsBuilderX86_64::VisitUnsafePutObject(HInvoke* invoke) {
1345  CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimNot, invoke);
1346}
1347void IntrinsicLocationsBuilderX86_64::VisitUnsafePutObjectOrdered(HInvoke* invoke) {
1348  CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimNot, invoke);
1349}
1350void IntrinsicLocationsBuilderX86_64::VisitUnsafePutObjectVolatile(HInvoke* invoke) {
1351  CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimNot, invoke);
1352}
1353void IntrinsicLocationsBuilderX86_64::VisitUnsafePutLong(HInvoke* invoke) {
1354  CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimLong, invoke);
1355}
1356void IntrinsicLocationsBuilderX86_64::VisitUnsafePutLongOrdered(HInvoke* invoke) {
1357  CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimLong, invoke);
1358}
1359void IntrinsicLocationsBuilderX86_64::VisitUnsafePutLongVolatile(HInvoke* invoke) {
1360  CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimLong, invoke);
1361}
1362
1363// We don't care for ordered: it requires an AnyStore barrier, which is already given by the x86
1364// memory model.
static void GenUnsafePut(LocationSummary* locations, Primitive::Type type, bool is_volatile,
                         CodeGeneratorX86_64* codegen) {
  X86_64Assembler* assembler = reinterpret_cast<X86_64Assembler*>(codegen->GetAssembler());
  CpuRegister base = locations->InAt(1).AsRegister<CpuRegister>();
  CpuRegister offset = locations->InAt(2).AsRegister<CpuRegister>();
  CpuRegister value = locations->InAt(3).AsRegister<CpuRegister>();

  if (type == Primitive::kPrimLong) {
    __ movq(Address(base, offset, ScaleFactor::TIMES_1, 0), value);
  } else {
    __ movl(Address(base, offset, ScaleFactor::TIMES_1, 0), value);
  }

  if (is_volatile) {
    __ mfence();
  }

  if (type == Primitive::kPrimNot) {
    codegen->MarkGCCard(locations->GetTemp(0).AsRegister<CpuRegister>(),
                        locations->GetTemp(1).AsRegister<CpuRegister>(),
                        base,
                        value);
  }
}

void IntrinsicCodeGeneratorX86_64::VisitUnsafePut(HInvoke* invoke) {
  GenUnsafePut(invoke->GetLocations(), Primitive::kPrimInt, false, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafePutOrdered(HInvoke* invoke) {
  GenUnsafePut(invoke->GetLocations(), Primitive::kPrimInt, false, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafePutVolatile(HInvoke* invoke) {
  GenUnsafePut(invoke->GetLocations(), Primitive::kPrimInt, true, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafePutObject(HInvoke* invoke) {
  GenUnsafePut(invoke->GetLocations(), Primitive::kPrimNot, false, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafePutObjectOrdered(HInvoke* invoke) {
  GenUnsafePut(invoke->GetLocations(), Primitive::kPrimNot, false, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafePutObjectVolatile(HInvoke* invoke) {
  GenUnsafePut(invoke->GetLocations(), Primitive::kPrimNot, true, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafePutLong(HInvoke* invoke) {
  GenUnsafePut(invoke->GetLocations(), Primitive::kPrimLong, false, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafePutLongOrdered(HInvoke* invoke) {
  GenUnsafePut(invoke->GetLocations(), Primitive::kPrimLong, false, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafePutLongVolatile(HInvoke* invoke) {
  GenUnsafePut(invoke->GetLocations(), Primitive::kPrimLong, true, codegen_);
}

static void CreateIntIntIntIntIntToInt(ArenaAllocator* arena, Primitive::Type type,
                                       HInvoke* invoke) {
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kNoCall,
                                                           kIntrinsified);
  locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
  locations->SetInAt(1, Location::RequiresRegister());
  locations->SetInAt(2, Location::RequiresRegister());
  // The expected value must be in EAX/RAX.
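  // (This mirrors the hardware semantics of lock cmpxchg, which implicitly compares the
  //  destination with RAX and writes the old value back into RAX on failure; schematically:
  //    if (*mem == RAX) { *mem = value; ZF = 1; } else { RAX = *mem; ZF = 0; })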
  locations->SetInAt(3, Location::RegisterLocation(RAX));
  locations->SetInAt(4, Location::RequiresRegister());

  locations->SetOut(Location::RequiresRegister());
  if (type == Primitive::kPrimNot) {
    // Need temp registers for card-marking.
    locations->AddTemp(Location::RequiresRegister());
    locations->AddTemp(Location::RequiresRegister());
  }
}

void IntrinsicLocationsBuilderX86_64::VisitUnsafeCASInt(HInvoke* invoke) {
  CreateIntIntIntIntIntToInt(arena_, Primitive::kPrimInt, invoke);
}

void IntrinsicLocationsBuilderX86_64::VisitUnsafeCASLong(HInvoke* invoke) {
  CreateIntIntIntIntIntToInt(arena_, Primitive::kPrimLong, invoke);
}

void IntrinsicLocationsBuilderX86_64::VisitUnsafeCASObject(HInvoke* invoke) {
  CreateIntIntIntIntIntToInt(arena_, Primitive::kPrimNot, invoke);
}

static void GenCAS(Primitive::Type type, HInvoke* invoke, CodeGeneratorX86_64* codegen) {
  X86_64Assembler* assembler =
    reinterpret_cast<X86_64Assembler*>(codegen->GetAssembler());
  LocationSummary* locations = invoke->GetLocations();

  CpuRegister base = locations->InAt(1).AsRegister<CpuRegister>();
  CpuRegister offset = locations->InAt(2).AsRegister<CpuRegister>();
  CpuRegister expected = locations->InAt(3).AsRegister<CpuRegister>();
  DCHECK_EQ(expected.AsRegister(), RAX);
  CpuRegister value = locations->InAt(4).AsRegister<CpuRegister>();
  CpuRegister out = locations->Out().AsRegister<CpuRegister>();

  if (type == Primitive::kPrimLong) {
    __ LockCmpxchgq(Address(base, offset, TIMES_1, 0), value);
  } else {
    // Integer or object.
    if (type == Primitive::kPrimNot) {
      // Mark card for object assuming new value is stored.
      codegen->MarkGCCard(locations->GetTemp(0).AsRegister<CpuRegister>(),
                          locations->GetTemp(1).AsRegister<CpuRegister>(),
                          base,
                          value);
    }

    __ LockCmpxchgl(Address(base, offset, TIMES_1, 0), value);
  }

  // Locked cmpxchg has full barrier semantics, and we don't need scheduling
  // barriers at this time.

  // Convert ZF into the boolean result.
  __ setcc(kZero, out);
  __ movzxb(out, out);
}
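
// As a rough sketch (names illustrative, not exact codegen), the 32-bit case above boils down to:
//   lock cmpxchgl [base + offset], value   // implicitly compares against RAX (the expected value)
//   setz out
//   movzx out, out                         // out = 1 if the swap happened, 0 otherwise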

void IntrinsicCodeGeneratorX86_64::VisitUnsafeCASInt(HInvoke* invoke) {
  GenCAS(Primitive::kPrimInt, invoke, codegen_);
}

void IntrinsicCodeGeneratorX86_64::VisitUnsafeCASLong(HInvoke* invoke) {
  GenCAS(Primitive::kPrimLong, invoke, codegen_);
}

void IntrinsicCodeGeneratorX86_64::VisitUnsafeCASObject(HInvoke* invoke) {
  GenCAS(Primitive::kPrimNot, invoke, codegen_);
}

void IntrinsicLocationsBuilderX86_64::VisitIntegerReverse(HInvoke* invoke) {
  LocationSummary* locations = new (arena_) LocationSummary(invoke,
                                                            LocationSummary::kNoCall,
                                                            kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetOut(Location::SameAsFirstInput());
  locations->AddTemp(Location::RequiresRegister());
}

static void SwapBits(CpuRegister reg, CpuRegister temp, int32_t shift, int32_t mask,
                     X86_64Assembler* assembler) {
  Immediate imm_shift(shift);
  Immediate imm_mask(mask);
  __ movl(temp, reg);
  __ shrl(reg, imm_shift);
  __ andl(temp, imm_mask);
  __ andl(reg, imm_mask);
  __ shll(temp, imm_shift);
  __ orl(reg, temp);
}

void IntrinsicCodeGeneratorX86_64::VisitIntegerReverse(HInvoke* invoke) {
  X86_64Assembler* assembler =
    reinterpret_cast<X86_64Assembler*>(codegen_->GetAssembler());
  LocationSummary* locations = invoke->GetLocations();

  CpuRegister reg = locations->InAt(0).AsRegister<CpuRegister>();
  CpuRegister temp = locations->GetTemp(0).AsRegister<CpuRegister>();

  /*
   * Use a single bswap instruction to reverse the byte order first, and then use three
   * rounds of bit swapping to reverse the bits of a number x. Using bswap saves
   * instructions compared to the generic libcore (luni) implementation, which needs five
   * rounds of bit swapping:
   * x = bswap x
   * x = (x & 0x55555555) << 1 | (x >> 1) & 0x55555555;
   * x = (x & 0x33333333) << 2 | (x >> 2) & 0x33333333;
   * x = (x & 0x0F0F0F0F) << 4 | (x >> 4) & 0x0F0F0F0F;
   */
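  // Worked example (illustration only, not emitted code): reversing x = 0x00000001 yields
  // 0x80000000:
  //   bswap:               0x00000001 -> 0x01000000
  //   swap adjacent bits:  0x01000000 -> 0x02000000
  //   swap 2-bit groups:   0x02000000 -> 0x08000000
  //   swap nibbles:        0x08000000 -> 0x80000000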
  __ bswapl(reg);
  SwapBits(reg, temp, 1, 0x55555555, assembler);
  SwapBits(reg, temp, 2, 0x33333333, assembler);
  SwapBits(reg, temp, 4, 0x0f0f0f0f, assembler);
}

void IntrinsicLocationsBuilderX86_64::VisitLongReverse(HInvoke* invoke) {
  LocationSummary* locations = new (arena_) LocationSummary(invoke,
                                                            LocationSummary::kNoCall,
                                                            kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetOut(Location::SameAsFirstInput());
  locations->AddTemp(Location::RequiresRegister());
  locations->AddTemp(Location::RequiresRegister());
}

static void SwapBits64(CpuRegister reg, CpuRegister temp, CpuRegister temp_mask,
                       int32_t shift, int64_t mask, X86_64Assembler* assembler) {
  Immediate imm_shift(shift);
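  // Unlike the 32-bit SwapBits above, the 64-bit masks cannot be encoded as andq/orq
  // immediates (x86-64 ALU instructions only take 32-bit sign-extended immediates), so the
  // mask is first materialized in temp_mask.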
  __ movq(temp_mask, Immediate(mask));
  __ movq(temp, reg);
  __ shrq(reg, imm_shift);
  __ andq(temp, temp_mask);
  __ andq(reg, temp_mask);
  __ shlq(temp, imm_shift);
  __ orq(reg, temp);
}

void IntrinsicCodeGeneratorX86_64::VisitLongReverse(HInvoke* invoke) {
  X86_64Assembler* assembler =
    reinterpret_cast<X86_64Assembler*>(codegen_->GetAssembler());
  LocationSummary* locations = invoke->GetLocations();

  CpuRegister reg = locations->InAt(0).AsRegister<CpuRegister>();
  CpuRegister temp1 = locations->GetTemp(0).AsRegister<CpuRegister>();
  CpuRegister temp2 = locations->GetTemp(1).AsRegister<CpuRegister>();

  /*
   * Use a single bswap instruction to reverse the byte order first, and then use three
   * rounds of bit swapping to reverse the bits of a long number x. Using bswap saves
   * instructions compared to the generic libcore (luni) implementation, which needs five
   * rounds of bit swapping:
   * x = bswap x
   * x = (x & 0x5555555555555555) << 1 | (x >> 1) & 0x5555555555555555;
   * x = (x & 0x3333333333333333) << 2 | (x >> 2) & 0x3333333333333333;
   * x = (x & 0x0F0F0F0F0F0F0F0F) << 4 | (x >> 4) & 0x0F0F0F0F0F0F0F0F;
   */
  __ bswapq(reg);
  SwapBits64(reg, temp1, temp2, 1, INT64_C(0x5555555555555555), assembler);
  SwapBits64(reg, temp1, temp2, 2, INT64_C(0x3333333333333333), assembler);
  SwapBits64(reg, temp1, temp2, 4, INT64_C(0x0f0f0f0f0f0f0f0f), assembler);
}

// Unimplemented intrinsics.

#define UNIMPLEMENTED_INTRINSIC(Name)                                                   \
void IntrinsicLocationsBuilderX86_64::Visit ## Name(HInvoke* invoke ATTRIBUTE_UNUSED) { \
}                                                                                       \
void IntrinsicCodeGeneratorX86_64::Visit ## Name(HInvoke* invoke ATTRIBUTE_UNUSED) {    \
}

UNIMPLEMENTED_INTRINSIC(StringGetCharsNoCheck)
UNIMPLEMENTED_INTRINSIC(SystemArrayCopyChar)
UNIMPLEMENTED_INTRINSIC(ReferenceGetReferent)

}  // namespace x86_64
}  // namespace art