/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "intrinsics_x86_64.h"

#include <limits>

#include "arch/x86_64/instruction_set_features_x86_64.h"
#include "art_method-inl.h"
#include "base/bit_utils.h"
#include "code_generator_x86_64.h"
#include "entrypoints/quick/quick_entrypoints.h"
#include "intrinsics.h"
#include "intrinsics_utils.h"
#include "mirror/array-inl.h"
#include "mirror/string.h"
#include "thread.h"
#include "utils/x86_64/assembler_x86_64.h"
#include "utils/x86_64/constants_x86_64.h"

namespace art {

namespace x86_64 {

IntrinsicLocationsBuilderX86_64::IntrinsicLocationsBuilderX86_64(CodeGeneratorX86_64* codegen)
  : arena_(codegen->GetGraph()->GetArena()), codegen_(codegen) {
}


X86_64Assembler* IntrinsicCodeGeneratorX86_64::GetAssembler() {
  return down_cast<X86_64Assembler*>(codegen_->GetAssembler());
}

ArenaAllocator* IntrinsicCodeGeneratorX86_64::GetAllocator() {
  return codegen_->GetGraph()->GetArena();
}

bool IntrinsicLocationsBuilderX86_64::TryDispatch(HInvoke* invoke) {
  Dispatch(invoke);
  LocationSummary* res = invoke->GetLocations();
  if (res == nullptr) {
    return false;
  }
  if (kEmitCompilerReadBarrier && res->CanCall()) {
    // Generating an intrinsic for this HInvoke may produce an
    // IntrinsicSlowPathX86_64 slow path.  Currently this approach
    // does not work when using read barriers, as the emitted
    // calling sequence will make use of another slow path
    // (ReadBarrierForRootSlowPathX86_64 for HInvokeStaticOrDirect,
    // ReadBarrierSlowPathX86_64 for HInvokeVirtual).  So we bail
    // out in this case.
    //
    // TODO: Find a way to have intrinsics work with read barriers.
    invoke->SetLocations(nullptr);
    return false;
  }
  return res->Intrinsified();
}

static void MoveArguments(HInvoke* invoke, CodeGeneratorX86_64* codegen) {
  InvokeDexCallingConventionVisitorX86_64 calling_convention_visitor;
  IntrinsicVisitor::MoveArguments(invoke, codegen, &calling_convention_visitor);
}

using IntrinsicSlowPathX86_64 = IntrinsicSlowPath<InvokeDexCallingConventionVisitorX86_64>;

#define __ assembler->

static void CreateFPToIntLocations(ArenaAllocator* arena, HInvoke* invoke) {
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kNoCall,
                                                           kIntrinsified);
  locations->SetInAt(0, Location::RequiresFpuRegister());
  locations->SetOut(Location::RequiresRegister());
}

static void CreateIntToFPLocations(ArenaAllocator* arena, HInvoke* invoke) {
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kNoCall,
                                                           kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetOut(Location::RequiresFpuRegister());
}

static void MoveFPToInt(LocationSummary* locations, bool is64bit, X86_64Assembler* assembler) {
  Location input = locations->InAt(0);
  Location output = locations->Out();
  __ movd(output.AsRegister<CpuRegister>(), input.AsFpuRegister<XmmRegister>(), is64bit);
}

static void MoveIntToFP(LocationSummary* locations, bool is64bit, X86_64Assembler* assembler) {
  Location input = locations->InAt(0);
  Location output = locations->Out();
  __ movd(output.AsFpuRegister<XmmRegister>(), input.AsRegister<CpuRegister>(), is64bit);
}

void IntrinsicLocationsBuilderX86_64::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) {
  CreateFPToIntLocations(arena_, invoke);
}
void IntrinsicLocationsBuilderX86_64::VisitDoubleLongBitsToDouble(HInvoke* invoke) {
  CreateIntToFPLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) {
  MoveFPToInt(invoke->GetLocations(), /* is64bit */ true, GetAssembler());
}
void IntrinsicCodeGeneratorX86_64::VisitDoubleLongBitsToDouble(HInvoke* invoke) {
  MoveIntToFP(invoke->GetLocations(), /* is64bit */ true, GetAssembler());
}

void IntrinsicLocationsBuilderX86_64::VisitFloatFloatToRawIntBits(HInvoke* invoke) {
  CreateFPToIntLocations(arena_, invoke);
}
void IntrinsicLocationsBuilderX86_64::VisitFloatIntBitsToFloat(HInvoke* invoke) {
  CreateIntToFPLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitFloatFloatToRawIntBits(HInvoke* invoke) {
  MoveFPToInt(invoke->GetLocations(), /* is64bit */ false, GetAssembler());
}
void IntrinsicCodeGeneratorX86_64::VisitFloatIntBitsToFloat(HInvoke* invoke) {
  MoveIntToFP(invoke->GetLocations(), /* is64bit */ false, GetAssembler());
}

static void CreateIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) {
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kNoCall,
                                                           kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetOut(Location::SameAsFirstInput());
}

static void GenReverseBytes(LocationSummary* locations,
                            Primitive::Type size,
                            X86_64Assembler* assembler) {
  CpuRegister out = locations->Out().AsRegister<CpuRegister>();

  switch (size) {
    case Primitive::kPrimShort:
      // TODO: Can be done with an xchg of 8b registers. This is straight from Quick.
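      // Worked example (illustrative values): the short 0x1234 sign-extended
      // in the register is 0x00001234; bswapl gives 0x34120000 and the
      // arithmetic shift right by 16 gives 0x00003412, i.e. the byte-reversed
      // short, correctly sign-extended. This holds whatever the upper 16 bits
      // contained, since bswapl moves the short into the top half before the
      // shift brings it back down.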
      __ bswapl(out);
      __ sarl(out, Immediate(16));
      break;
    case Primitive::kPrimInt:
      __ bswapl(out);
      break;
    case Primitive::kPrimLong:
      __ bswapq(out);
      break;
    default:
      LOG(FATAL) << "Unexpected size for reverse-bytes: " << size;
      UNREACHABLE();
  }
}

void IntrinsicLocationsBuilderX86_64::VisitIntegerReverseBytes(HInvoke* invoke) {
  CreateIntToIntLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitIntegerReverseBytes(HInvoke* invoke) {
  GenReverseBytes(invoke->GetLocations(), Primitive::kPrimInt, GetAssembler());
}

void IntrinsicLocationsBuilderX86_64::VisitLongReverseBytes(HInvoke* invoke) {
  CreateIntToIntLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitLongReverseBytes(HInvoke* invoke) {
  GenReverseBytes(invoke->GetLocations(), Primitive::kPrimLong, GetAssembler());
}

void IntrinsicLocationsBuilderX86_64::VisitShortReverseBytes(HInvoke* invoke) {
  CreateIntToIntLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitShortReverseBytes(HInvoke* invoke) {
  GenReverseBytes(invoke->GetLocations(), Primitive::kPrimShort, GetAssembler());
}


// TODO: Consider Quick's way of doing Double abs through integer operations, as the immediate we
//       need is 64b.

static void CreateFloatToFloatPlusTemps(ArenaAllocator* arena, HInvoke* invoke) {
  // TODO: Enable memory operations when the assembler supports them.
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kNoCall,
                                                           kIntrinsified);
  locations->SetInAt(0, Location::RequiresFpuRegister());
  locations->SetOut(Location::SameAsFirstInput());
  locations->AddTemp(Location::RequiresFpuRegister());  // FP reg to hold mask.
}

static void MathAbsFP(LocationSummary* locations,
                      bool is64bit,
                      X86_64Assembler* assembler,
                      CodeGeneratorX86_64* codegen) {
  Location output = locations->Out();

  DCHECK(output.IsFpuRegister());
  XmmRegister xmm_temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();

  // TODO: Can mask directly with constant area using pand if we can guarantee
  // that the literal is aligned on a 16 byte boundary.  This will avoid a
  // temporary.
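  //
  // Worked example of the masking below: Math.abs(-2.5f) clears the IEEE-754
  // sign bit, 0xC0200000 & 0x7FFFFFFF == 0x40200000, i.e. 2.5f. The same mask
  // maps -0.0f to +0.0f, as Math.abs requires.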
  if (is64bit) {
    __ movsd(xmm_temp, codegen->LiteralInt64Address(INT64_C(0x7FFFFFFFFFFFFFFF)));
    __ andpd(output.AsFpuRegister<XmmRegister>(), xmm_temp);
  } else {
    __ movss(xmm_temp, codegen->LiteralInt32Address(INT32_C(0x7FFFFFFF)));
    __ andps(output.AsFpuRegister<XmmRegister>(), xmm_temp);
  }
}

void IntrinsicLocationsBuilderX86_64::VisitMathAbsDouble(HInvoke* invoke) {
  CreateFloatToFloatPlusTemps(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathAbsDouble(HInvoke* invoke) {
  MathAbsFP(invoke->GetLocations(), /* is64bit */ true, GetAssembler(), codegen_);
}

void IntrinsicLocationsBuilderX86_64::VisitMathAbsFloat(HInvoke* invoke) {
  CreateFloatToFloatPlusTemps(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathAbsFloat(HInvoke* invoke) {
  MathAbsFP(invoke->GetLocations(), /* is64bit */ false, GetAssembler(), codegen_);
}

static void CreateIntToIntPlusTemp(ArenaAllocator* arena, HInvoke* invoke) {
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kNoCall,
                                                           kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetOut(Location::SameAsFirstInput());
  locations->AddTemp(Location::RequiresRegister());
}

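// The branchless sequence below computes abs(x) as (x + mask) ^ mask with
// mask = x >> 31 (x >> 63 for longs, arithmetic shift). Worked example: for
// x = -5, mask = 0xFFFFFFFF, so x + mask = -6 and -6 ^ mask = 5; for x = 5,
// mask = 0 and x is unchanged. Note that Integer.MIN_VALUE maps to itself,
// matching Math.abs semantics.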
static void GenAbsInteger(LocationSummary* locations, bool is64bit, X86_64Assembler* assembler) {
  Location output = locations->Out();
  CpuRegister out = output.AsRegister<CpuRegister>();
  CpuRegister mask = locations->GetTemp(0).AsRegister<CpuRegister>();

  if (is64bit) {
    // Create mask.
    __ movq(mask, out);
    __ sarq(mask, Immediate(63));
    // Add mask.
    __ addq(out, mask);
    __ xorq(out, mask);
  } else {
    // Create mask.
    __ movl(mask, out);
    __ sarl(mask, Immediate(31));
    // Add mask.
    __ addl(out, mask);
    __ xorl(out, mask);
  }
}

void IntrinsicLocationsBuilderX86_64::VisitMathAbsInt(HInvoke* invoke) {
  CreateIntToIntPlusTemp(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathAbsInt(HInvoke* invoke) {
  GenAbsInteger(invoke->GetLocations(), /* is64bit */ false, GetAssembler());
}

void IntrinsicLocationsBuilderX86_64::VisitMathAbsLong(HInvoke* invoke) {
  CreateIntToIntPlusTemp(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathAbsLong(HInvoke* invoke) {
  GenAbsInteger(invoke->GetLocations(), /* is64bit */ true, GetAssembler());
}

static void GenMinMaxFP(LocationSummary* locations,
                        bool is_min,
                        bool is_double,
                        X86_64Assembler* assembler,
                        CodeGeneratorX86_64* codegen) {
  Location op1_loc = locations->InAt(0);
  Location op2_loc = locations->InAt(1);
  Location out_loc = locations->Out();
  XmmRegister out = out_loc.AsFpuRegister<XmmRegister>();

  // Shortcut for same input locations.
  if (op1_loc.Equals(op2_loc)) {
    DCHECK(out_loc.Equals(op1_loc));
    return;
  }

  //  (out := op1)
  //  out <=? op2
  //  if Nan jmp Nan_label
  //  if out is min jmp done
  //  if op2 is min jmp op2_label
  //  handle -0/+0
  //  jmp done
  // Nan_label:
  //  out := NaN
  // op2_label:
  //  out := op2
  // done:
  //
  // This removes one jmp, but needs to copy one input (op1) to out.
  //
  // TODO: This is straight from Quick. Make NaN an out-of-line slowpath?
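  //
  // The ±0 handling below relies on the bit patterns +0.0 == 0x0...0 and
  // -0.0 == 0x8...0: ORing the operands keeps a set sign bit, so
  // min(+0.0, -0.0) yields -0.0, while ANDing clears it, so max(+0.0, -0.0)
  // yields +0.0, matching the Java Math.min/max semantics.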

  XmmRegister op2 = op2_loc.AsFpuRegister<XmmRegister>();

  NearLabel nan, done, op2_label;
  if (is_double) {
    __ ucomisd(out, op2);
  } else {
    __ ucomiss(out, op2);
  }

  __ j(Condition::kParityEven, &nan);

  __ j(is_min ? Condition::kAbove : Condition::kBelow, &op2_label);
  __ j(is_min ? Condition::kBelow : Condition::kAbove, &done);

  // Handle 0.0/-0.0.
  if (is_min) {
    if (is_double) {
      __ orpd(out, op2);
    } else {
      __ orps(out, op2);
    }
  } else {
    if (is_double) {
      __ andpd(out, op2);
    } else {
      __ andps(out, op2);
    }
  }
  __ jmp(&done);

  // NaN handling.
  __ Bind(&nan);
  if (is_double) {
    __ movsd(out, codegen->LiteralInt64Address(INT64_C(0x7FF8000000000000)));
  } else {
    __ movss(out, codegen->LiteralInt32Address(INT32_C(0x7FC00000)));
  }
  __ jmp(&done);

  // out := op2;
  __ Bind(&op2_label);
  if (is_double) {
    __ movsd(out, op2);
  } else {
    __ movss(out, op2);
  }

  // Done.
  __ Bind(&done);
}

static void CreateFPFPToFP(ArenaAllocator* arena, HInvoke* invoke) {
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kNoCall,
                                                           kIntrinsified);
  locations->SetInAt(0, Location::RequiresFpuRegister());
  locations->SetInAt(1, Location::RequiresFpuRegister());
  // The following is sub-optimal, but it is all we can do for now. It would be fine to also
  // allow the second input to be the output (we could simply swap the inputs).
  locations->SetOut(Location::SameAsFirstInput());
}

void IntrinsicLocationsBuilderX86_64::VisitMathMinDoubleDouble(HInvoke* invoke) {
  CreateFPFPToFP(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathMinDoubleDouble(HInvoke* invoke) {
  GenMinMaxFP(
      invoke->GetLocations(), /* is_min */ true, /* is_double */ true, GetAssembler(), codegen_);
}

void IntrinsicLocationsBuilderX86_64::VisitMathMinFloatFloat(HInvoke* invoke) {
  CreateFPFPToFP(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathMinFloatFloat(HInvoke* invoke) {
  GenMinMaxFP(
      invoke->GetLocations(), /* is_min */ true, /* is_double */ false, GetAssembler(), codegen_);
}

void IntrinsicLocationsBuilderX86_64::VisitMathMaxDoubleDouble(HInvoke* invoke) {
  CreateFPFPToFP(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathMaxDoubleDouble(HInvoke* invoke) {
  GenMinMaxFP(
      invoke->GetLocations(), /* is_min */ false, /* is_double */ true, GetAssembler(), codegen_);
}

void IntrinsicLocationsBuilderX86_64::VisitMathMaxFloatFloat(HInvoke* invoke) {
  CreateFPFPToFP(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathMaxFloatFloat(HInvoke* invoke) {
  GenMinMaxFP(
      invoke->GetLocations(), /* is_min */ false, /* is_double */ false, GetAssembler(), codegen_);
}

static void GenMinMax(LocationSummary* locations, bool is_min, bool is_long,
                      X86_64Assembler* assembler) {
  Location op1_loc = locations->InAt(0);
  Location op2_loc = locations->InAt(1);

  // Shortcut for same input locations.
  if (op1_loc.Equals(op2_loc)) {
    // Can return immediately, as op1_loc == out_loc.
    // Note: if we ever support separate registers, e.g., output into memory, we need to check for
    //       a copy here.
    DCHECK(locations->Out().Equals(op1_loc));
    return;
  }

  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
  CpuRegister op2 = op2_loc.AsRegister<CpuRegister>();

  //  (out := op1)
  //  out <=? op2
  //  if out is min jmp done
  //  out := op2
  // done:
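  //
  // The conditional move below implements the select branchlessly: for min,
  // cmov on kGreater replaces out with op2 exactly when out > op2; for max,
  // cmov on kLess replaces out with op2 exactly when out < op2.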

  if (is_long) {
    __ cmpq(out, op2);
  } else {
    __ cmpl(out, op2);
  }

  __ cmov(is_min ? Condition::kGreater : Condition::kLess, out, op2, is_long);
}

static void CreateIntIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) {
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kNoCall,
                                                           kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetInAt(1, Location::RequiresRegister());
  locations->SetOut(Location::SameAsFirstInput());
}

void IntrinsicLocationsBuilderX86_64::VisitMathMinIntInt(HInvoke* invoke) {
  CreateIntIntToIntLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathMinIntInt(HInvoke* invoke) {
  GenMinMax(invoke->GetLocations(), /* is_min */ true, /* is_long */ false, GetAssembler());
}

void IntrinsicLocationsBuilderX86_64::VisitMathMinLongLong(HInvoke* invoke) {
  CreateIntIntToIntLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathMinLongLong(HInvoke* invoke) {
  GenMinMax(invoke->GetLocations(), /* is_min */ true, /* is_long */ true, GetAssembler());
}

void IntrinsicLocationsBuilderX86_64::VisitMathMaxIntInt(HInvoke* invoke) {
  CreateIntIntToIntLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathMaxIntInt(HInvoke* invoke) {
  GenMinMax(invoke->GetLocations(), /* is_min */ false, /* is_long */ false, GetAssembler());
}

void IntrinsicLocationsBuilderX86_64::VisitMathMaxLongLong(HInvoke* invoke) {
  CreateIntIntToIntLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathMaxLongLong(HInvoke* invoke) {
  GenMinMax(invoke->GetLocations(), /* is_min */ false, /* is_long */ true, GetAssembler());
}

static void CreateFPToFPLocations(ArenaAllocator* arena, HInvoke* invoke) {
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kNoCall,
                                                           kIntrinsified);
  locations->SetInAt(0, Location::RequiresFpuRegister());
  locations->SetOut(Location::RequiresFpuRegister());
}

void IntrinsicLocationsBuilderX86_64::VisitMathSqrt(HInvoke* invoke) {
  CreateFPToFPLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathSqrt(HInvoke* invoke) {
  LocationSummary* locations = invoke->GetLocations();
  XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
  XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();

  GetAssembler()->sqrtsd(out, in);
}

static void InvokeOutOfLineIntrinsic(CodeGeneratorX86_64* codegen, HInvoke* invoke) {
  MoveArguments(invoke, codegen);

  DCHECK(invoke->IsInvokeStaticOrDirect());
  codegen->GenerateStaticOrDirectCall(
      invoke->AsInvokeStaticOrDirect(), Location::RegisterLocation(RDI));
  codegen->RecordPcInfo(invoke, invoke->GetDexPc());

  // Copy the result back to the expected output.
  Location out = invoke->GetLocations()->Out();
  if (out.IsValid()) {
    DCHECK(out.IsRegister());
    codegen->MoveFromReturnRegister(out, invoke->GetType());
  }
}

static void CreateSSE41FPToFPLocations(ArenaAllocator* arena,
                                       HInvoke* invoke,
                                       CodeGeneratorX86_64* codegen) {
  // Do we have instruction support?
  if (codegen->GetInstructionSetFeatures().HasSSE4_1()) {
    CreateFPToFPLocations(arena, invoke);
    return;
  }

  // We have to fall back to a call to the intrinsic.
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kCall);
  InvokeRuntimeCallingConvention calling_convention;
  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0)));
  locations->SetOut(Location::FpuRegisterLocation(XMM0));
  // Needs to be RDI for the invoke.
  locations->AddTemp(Location::RegisterLocation(RDI));
}

static void GenSSE41FPToFPIntrinsic(CodeGeneratorX86_64* codegen,
                                    HInvoke* invoke,
                                    X86_64Assembler* assembler,
                                    int round_mode) {
  LocationSummary* locations = invoke->GetLocations();
  if (locations->WillCall()) {
    InvokeOutOfLineIntrinsic(codegen, invoke);
  } else {
    XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
    XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
    __ roundsd(out, in, Immediate(round_mode));
  }
}

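// The roundsd immediate selects the SSE4.1 rounding mode: 0 rounds to the
// nearest (even) value, 1 rounds down (floor) and 2 rounds up (ceil), which
// is why Ceil, Floor and Rint below pass 2, 1 and 0 respectively.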
void IntrinsicLocationsBuilderX86_64::VisitMathCeil(HInvoke* invoke) {
  CreateSSE41FPToFPLocations(arena_, invoke, codegen_);
}

void IntrinsicCodeGeneratorX86_64::VisitMathCeil(HInvoke* invoke) {
  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 2);
}

void IntrinsicLocationsBuilderX86_64::VisitMathFloor(HInvoke* invoke) {
  CreateSSE41FPToFPLocations(arena_, invoke, codegen_);
}

void IntrinsicCodeGeneratorX86_64::VisitMathFloor(HInvoke* invoke) {
  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 1);
}

void IntrinsicLocationsBuilderX86_64::VisitMathRint(HInvoke* invoke) {
  CreateSSE41FPToFPLocations(arena_, invoke, codegen_);
}

void IntrinsicCodeGeneratorX86_64::VisitMathRint(HInvoke* invoke) {
  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 0);
}

static void CreateSSE41FPToIntLocations(ArenaAllocator* arena,
                                        HInvoke* invoke,
                                        CodeGeneratorX86_64* codegen) {
  // Do we have instruction support?
  if (codegen->GetInstructionSetFeatures().HasSSE4_1()) {
    LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                             LocationSummary::kNoCall,
                                                             kIntrinsified);
    locations->SetInAt(0, Location::RequiresFpuRegister());
    locations->SetOut(Location::RequiresRegister());
    locations->AddTemp(Location::RequiresFpuRegister());
    return;
  }

  // We have to fall back to a call to the intrinsic.
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kCall);
  InvokeRuntimeCallingConvention calling_convention;
  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0)));
  locations->SetOut(Location::RegisterLocation(RAX));
  // Needs to be RDI for the invoke.
  locations->AddTemp(Location::RegisterLocation(RDI));
}

void IntrinsicLocationsBuilderX86_64::VisitMathRoundFloat(HInvoke* invoke) {
  // See intrinsics.h.
  if (kRoundIsPlusPointFive) {
    CreateSSE41FPToIntLocations(arena_, invoke, codegen_);
  }
}

void IntrinsicCodeGeneratorX86_64::VisitMathRoundFloat(HInvoke* invoke) {
  LocationSummary* locations = invoke->GetLocations();
  if (locations->WillCall()) {
    InvokeOutOfLineIntrinsic(codegen_, invoke);
    return;
  }

  // Implement RoundFloat as t1 = floor(input + 0.5f);  convert to int.
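  // Worked example: for input 2.3f, 2.3f + 0.5f == 2.8f, the floor is 2.0f
  // and the conversion yields 2; for input -2.5f, -2.5f + 0.5f == -2.0f, the
  // floor is -2.0f, yielding -2, which matches Math.round(-2.5f) == -2.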
  XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
  XmmRegister inPlusPointFive = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
  NearLabel done, nan;
  X86_64Assembler* assembler = GetAssembler();

  // Load 0.5 into inPlusPointFive.
  __ movss(inPlusPointFive, codegen_->LiteralFloatAddress(0.5f));

  // Add in the input.
  __ addss(inPlusPointFive, in);

  // And floor it to an integral value (round toward negative infinity).
  __ roundss(inPlusPointFive, inPlusPointFive, Immediate(1));

  // Load maxInt into out.
  codegen_->Load64BitValue(out, kPrimIntMax);

  // if inPlusPointFive >= maxInt goto done
  __ comiss(inPlusPointFive, codegen_->LiteralFloatAddress(static_cast<float>(kPrimIntMax)));
  __ j(kAboveEqual, &done);

  // if input is NaN goto nan
  __ j(kUnordered, &nan);

  // output = float-to-int-truncate(input)
  __ cvttss2si(out, inPlusPointFive);
  __ jmp(&done);
  __ Bind(&nan);

  //  output = 0
  __ xorl(out, out);
  __ Bind(&done);
}

void IntrinsicLocationsBuilderX86_64::VisitMathRoundDouble(HInvoke* invoke) {
  // See intrinsics.h.
  if (kRoundIsPlusPointFive) {
    CreateSSE41FPToIntLocations(arena_, invoke, codegen_);
  }
}

void IntrinsicCodeGeneratorX86_64::VisitMathRoundDouble(HInvoke* invoke) {
  LocationSummary* locations = invoke->GetLocations();
  if (locations->WillCall()) {
    InvokeOutOfLineIntrinsic(codegen_, invoke);
    return;
  }

  // Implement RoundDouble as t1 = floor(input + 0.5);  convert to long.
  XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
  XmmRegister inPlusPointFive = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
  NearLabel done, nan;
  X86_64Assembler* assembler = GetAssembler();

  // Load 0.5 into inPlusPointFive.
  __ movsd(inPlusPointFive, codegen_->LiteralDoubleAddress(0.5));

  // Add in the input.
  __ addsd(inPlusPointFive, in);

  // And floor it to an integral value (round toward negative infinity).
  __ roundsd(inPlusPointFive, inPlusPointFive, Immediate(1));

  // Load maxLong into out.
  codegen_->Load64BitValue(out, kPrimLongMax);

  // if inPlusPointFive >= maxLong goto done
  __ comisd(inPlusPointFive, codegen_->LiteralDoubleAddress(static_cast<double>(kPrimLongMax)));
  __ j(kAboveEqual, &done);

  // if input is NaN goto nan
  __ j(kUnordered, &nan);

  // output = double-to-long-truncate(input)
  __ cvttsd2si(out, inPlusPointFive, /* is64bit */ true);
  __ jmp(&done);
  __ Bind(&nan);

  //  output = 0
  __ xorl(out, out);
  __ Bind(&done);
}

static void CreateFPToFPCallLocations(ArenaAllocator* arena,
                                      HInvoke* invoke) {
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kCall,
                                                           kIntrinsified);
  InvokeRuntimeCallingConvention calling_convention;
  locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0)));
  locations->SetOut(Location::FpuRegisterLocation(XMM0));

  // We have to ensure that the native code doesn't clobber the XMM registers which are
  // non-volatile for ART, but volatile for Native calls.  This will ensure that they are
  // saved in the prologue and properly restored.
  for (auto fp_reg : non_volatile_xmm_regs) {
    locations->AddTemp(Location::FpuRegisterLocation(fp_reg));
  }
}

static void GenFPToFPCall(HInvoke* invoke, CodeGeneratorX86_64* codegen,
                          QuickEntrypointEnum entry) {
  LocationSummary* locations = invoke->GetLocations();
  DCHECK(locations->WillCall());
  DCHECK(invoke->IsInvokeStaticOrDirect());
  X86_64Assembler* assembler = codegen->GetAssembler();

  __ gs()->call(Address::Absolute(GetThreadOffset<kX86_64WordSize>(entry), true));
  codegen->RecordPcInfo(invoke, invoke->GetDexPc());
}

void IntrinsicLocationsBuilderX86_64::VisitMathCos(HInvoke* invoke) {
  CreateFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathCos(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickCos);
}

void IntrinsicLocationsBuilderX86_64::VisitMathSin(HInvoke* invoke) {
  CreateFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathSin(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickSin);
}

void IntrinsicLocationsBuilderX86_64::VisitMathAcos(HInvoke* invoke) {
  CreateFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathAcos(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickAcos);
}

void IntrinsicLocationsBuilderX86_64::VisitMathAsin(HInvoke* invoke) {
  CreateFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathAsin(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickAsin);
}

void IntrinsicLocationsBuilderX86_64::VisitMathAtan(HInvoke* invoke) {
  CreateFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathAtan(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickAtan);
}

void IntrinsicLocationsBuilderX86_64::VisitMathCbrt(HInvoke* invoke) {
  CreateFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathCbrt(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickCbrt);
}

void IntrinsicLocationsBuilderX86_64::VisitMathCosh(HInvoke* invoke) {
  CreateFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathCosh(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickCosh);
}

void IntrinsicLocationsBuilderX86_64::VisitMathExp(HInvoke* invoke) {
  CreateFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathExp(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickExp);
}

void IntrinsicLocationsBuilderX86_64::VisitMathExpm1(HInvoke* invoke) {
  CreateFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathExpm1(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickExpm1);
}

void IntrinsicLocationsBuilderX86_64::VisitMathLog(HInvoke* invoke) {
  CreateFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathLog(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickLog);
}

void IntrinsicLocationsBuilderX86_64::VisitMathLog10(HInvoke* invoke) {
  CreateFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathLog10(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickLog10);
}

void IntrinsicLocationsBuilderX86_64::VisitMathSinh(HInvoke* invoke) {
  CreateFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathSinh(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickSinh);
}

void IntrinsicLocationsBuilderX86_64::VisitMathTan(HInvoke* invoke) {
  CreateFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathTan(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickTan);
}

void IntrinsicLocationsBuilderX86_64::VisitMathTanh(HInvoke* invoke) {
  CreateFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathTanh(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickTanh);
}

static void CreateFPFPToFPCallLocations(ArenaAllocator* arena,
                                        HInvoke* invoke) {
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kCall,
                                                           kIntrinsified);
  InvokeRuntimeCallingConvention calling_convention;
  locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0)));
  locations->SetInAt(1, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(1)));
  locations->SetOut(Location::FpuRegisterLocation(XMM0));

  // We have to ensure that the native code doesn't clobber the XMM registers which are
  // non-volatile for ART, but volatile for Native calls.  This will ensure that they are
  // saved in the prologue and properly restored.
  for (auto fp_reg : non_volatile_xmm_regs) {
    locations->AddTemp(Location::FpuRegisterLocation(fp_reg));
  }
}

void IntrinsicLocationsBuilderX86_64::VisitMathAtan2(HInvoke* invoke) {
  CreateFPFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathAtan2(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickAtan2);
}

void IntrinsicLocationsBuilderX86_64::VisitMathHypot(HInvoke* invoke) {
  CreateFPFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathHypot(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickHypot);
}

void IntrinsicLocationsBuilderX86_64::VisitMathNextAfter(HInvoke* invoke) {
  CreateFPFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathNextAfter(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickNextAfter);
}

void IntrinsicLocationsBuilderX86_64::VisitStringCharAt(HInvoke* invoke) {
  // The inputs plus one temp.
  LocationSummary* locations = new (arena_) LocationSummary(invoke,
                                                            LocationSummary::kCallOnSlowPath,
                                                            kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetInAt(1, Location::RequiresRegister());
  locations->SetOut(Location::SameAsFirstInput());
  locations->AddTemp(Location::RequiresRegister());
}

void IntrinsicCodeGeneratorX86_64::VisitStringCharAt(HInvoke* invoke) {
  LocationSummary* locations = invoke->GetLocations();

  // Location of reference to data array.
  const int32_t value_offset = mirror::String::ValueOffset().Int32Value();
  // Location of count.
  const int32_t count_offset = mirror::String::CountOffset().Int32Value();

  CpuRegister obj = locations->InAt(0).AsRegister<CpuRegister>();
  CpuRegister idx = locations->InAt(1).AsRegister<CpuRegister>();
  CpuRegister out = locations->Out().AsRegister<CpuRegister>();

  // TODO: Maybe we can support range check elimination. Overall, though, I think it's not worth
  //       the cost.
  // TODO: For simplicity, the index parameter is requested in a register, so different from Quick
  //       we will not optimize the code for constants (which would save a register).

  SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke);
  codegen_->AddSlowPath(slow_path);

  X86_64Assembler* assembler = GetAssembler();

  __ cmpl(idx, Address(obj, count_offset));
  codegen_->MaybeRecordImplicitNullCheck(invoke);
  __ j(kAboveEqual, slow_path->GetEntryLabel());

  // out = out[2*idx].
  __ movzxw(out, Address(out, idx, ScaleFactor::TIMES_2, value_offset));

  __ Bind(slow_path->GetExitLabel());
}

void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopyChar(HInvoke* invoke) {
  // Check for known failures that would force us to bail out to the runtime;
  // in those cases, just generate the runtime call directly.
  HIntConstant* src_pos = invoke->InputAt(1)->AsIntConstant();
  HIntConstant* dest_pos = invoke->InputAt(3)->AsIntConstant();

  // The positions must be non-negative.
  if ((src_pos != nullptr && src_pos->GetValue() < 0) ||
      (dest_pos != nullptr && dest_pos->GetValue() < 0)) {
    // We will have to fail anyway.
    return;
  }

  // The length must be >= 0.
  HIntConstant* length = invoke->InputAt(4)->AsIntConstant();
  if (length != nullptr) {
    int32_t len = length->GetValue();
    if (len < 0) {
      // Just call as normal.
      return;
    }
  }

  LocationSummary* locations = new (arena_) LocationSummary(invoke,
                                                            LocationSummary::kCallOnSlowPath,
                                                            kIntrinsified);
  // arraycopy(Object src, int src_pos, Object dest, int dest_pos, int length).
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetInAt(1, Location::RegisterOrConstant(invoke->InputAt(1)));
  locations->SetInAt(2, Location::RequiresRegister());
  locations->SetInAt(3, Location::RegisterOrConstant(invoke->InputAt(3)));
  locations->SetInAt(4, Location::RegisterOrConstant(invoke->InputAt(4)));

  // And we need some temporaries.  We will use REP MOVSW, so we need fixed registers.
  locations->AddTemp(Location::RegisterLocation(RSI));
  locations->AddTemp(Location::RegisterLocation(RDI));
  locations->AddTemp(Location::RegisterLocation(RCX));
}

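// In the general (register) case, CheckPosition verifies that pos >= 0, that
// pos <= length(input), and that length(input) - pos >= length, jumping to
// the slow path otherwise. Worked example: for length(input) == 10, pos == 3
// and length == 8, it computes 10 - 3 == 7 < 8 and takes the slow path.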
static void CheckPosition(X86_64Assembler* assembler,
                          Location pos,
                          CpuRegister input,
                          Location length,
                          SlowPathCode* slow_path,
                          CpuRegister input_len,
                          CpuRegister temp,
                          bool length_is_input_length = false) {
  // Where is the length in the Array?
  const uint32_t length_offset = mirror::Array::LengthOffset().Uint32Value();

  if (pos.IsConstant()) {
    int32_t pos_const = pos.GetConstant()->AsIntConstant()->GetValue();
    if (pos_const == 0) {
      if (!length_is_input_length) {
        // Check that length(input) >= length.
        if (length.IsConstant()) {
          __ cmpl(Address(input, length_offset),
                  Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
        } else {
          __ cmpl(Address(input, length_offset), length.AsRegister<CpuRegister>());
        }
        __ j(kLess, slow_path->GetEntryLabel());
      }
    } else {
      // Check that length(input) >= pos.
      __ movl(input_len, Address(input, length_offset));
      __ cmpl(input_len, Immediate(pos_const));
      __ j(kLess, slow_path->GetEntryLabel());

      // Check that (length(input) - pos) >= length.
      __ leal(temp, Address(input_len, -pos_const));
      if (length.IsConstant()) {
        __ cmpl(temp, Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
      } else {
        __ cmpl(temp, length.AsRegister<CpuRegister>());
      }
      __ j(kLess, slow_path->GetEntryLabel());
    }
  } else if (length_is_input_length) {
    // The only way the copy can succeed is if pos is zero.
    CpuRegister pos_reg = pos.AsRegister<CpuRegister>();
    __ testl(pos_reg, pos_reg);
    __ j(kNotEqual, slow_path->GetEntryLabel());
  } else {
    // Check that pos >= 0.
    CpuRegister pos_reg = pos.AsRegister<CpuRegister>();
    __ testl(pos_reg, pos_reg);
    __ j(kLess, slow_path->GetEntryLabel());

    // Check that pos <= length(input).
    __ cmpl(Address(input, length_offset), pos_reg);
    __ j(kLess, slow_path->GetEntryLabel());

    // Check that (length(input) - pos) >= length.
    __ movl(temp, Address(input, length_offset));
    __ subl(temp, pos_reg);
    if (length.IsConstant()) {
      __ cmpl(temp, Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
    } else {
      __ cmpl(temp, length.AsRegister<CpuRegister>());
    }
    __ j(kLess, slow_path->GetEntryLabel());
  }
}

void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopyChar(HInvoke* invoke) {
  X86_64Assembler* assembler = GetAssembler();
  LocationSummary* locations = invoke->GetLocations();

  CpuRegister src = locations->InAt(0).AsRegister<CpuRegister>();
  Location src_pos = locations->InAt(1);
  CpuRegister dest = locations->InAt(2).AsRegister<CpuRegister>();
  Location dest_pos = locations->InAt(3);
  Location length = locations->InAt(4);

  // Temporaries that we need for MOVSW.
  CpuRegister src_base = locations->GetTemp(0).AsRegister<CpuRegister>();
  DCHECK_EQ(src_base.AsRegister(), RSI);
  CpuRegister dest_base = locations->GetTemp(1).AsRegister<CpuRegister>();
  DCHECK_EQ(dest_base.AsRegister(), RDI);
  CpuRegister count = locations->GetTemp(2).AsRegister<CpuRegister>();
  DCHECK_EQ(count.AsRegister(), RCX);

  SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke);
  codegen_->AddSlowPath(slow_path);

  // Bail out if the source and destination are the same.
  __ cmpl(src, dest);
  __ j(kEqual, slow_path->GetEntryLabel());

  // Bail out if the source is null.
  __ testl(src, src);
  __ j(kEqual, slow_path->GetEntryLabel());

  // Bail out if the destination is null.
  __ testl(dest, dest);
  __ j(kEqual, slow_path->GetEntryLabel());

  // If the length is negative, bail out.
  // We have already checked in the LocationsBuilder for the constant case.
  if (!length.IsConstant()) {
    __ testl(length.AsRegister<CpuRegister>(), length.AsRegister<CpuRegister>());
    __ j(kLess, slow_path->GetEntryLabel());
  }

  // Validity checks: source.
  CheckPosition(assembler, src_pos, src, length, slow_path, src_base, dest_base);

  // Validity checks: dest.
  CheckPosition(assembler, dest_pos, dest, length, slow_path, src_base, dest_base);

  // We need the count in RCX.
  if (length.IsConstant()) {
    __ movl(count, Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
  } else {
    __ movl(count, length.AsRegister<CpuRegister>());
  }

  // Okay, everything checks out.  Finally time to do the copy.
  // Check assumption that sizeof(Char) is 2 (used in scaling below).
  const size_t char_size = Primitive::ComponentSize(Primitive::kPrimChar);
  DCHECK_EQ(char_size, 2u);

  const uint32_t data_offset = mirror::Array::DataOffset(char_size).Uint32Value();

  if (src_pos.IsConstant()) {
    int32_t src_pos_const = src_pos.GetConstant()->AsIntConstant()->GetValue();
    __ leal(src_base, Address(src, char_size * src_pos_const + data_offset));
  } else {
    __ leal(src_base, Address(src, src_pos.AsRegister<CpuRegister>(),
                              ScaleFactor::TIMES_2, data_offset));
  }
  if (dest_pos.IsConstant()) {
    int32_t dest_pos_const = dest_pos.GetConstant()->AsIntConstant()->GetValue();
    __ leal(dest_base, Address(dest, char_size * dest_pos_const + data_offset));
  } else {
    __ leal(dest_base, Address(dest, dest_pos.AsRegister<CpuRegister>(),
                               ScaleFactor::TIMES_2, data_offset));
  }

  // Do the move.
  __ rep_movsw();

  __ Bind(slow_path->GetExitLabel());
}


void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopy(HInvoke* invoke) {
  CodeGenerator::CreateSystemArrayCopyLocationSummary(invoke);
}

// TODO: Implement read barriers in the SystemArrayCopy intrinsic.
// Note that this code path is not used (yet) because we do not
// intrinsify methods that can go into the IntrinsicSlowPathX86_64
// slow path.
void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) {
  X86_64Assembler* assembler = GetAssembler();
  LocationSummary* locations = invoke->GetLocations();

  uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
  uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
  uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
  uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();

  CpuRegister src = locations->InAt(0).AsRegister<CpuRegister>();
  Location src_pos = locations->InAt(1);
  CpuRegister dest = locations->InAt(2).AsRegister<CpuRegister>();
  Location dest_pos = locations->InAt(3);
  Location length = locations->InAt(4);
  CpuRegister temp1 = locations->GetTemp(0).AsRegister<CpuRegister>();
  CpuRegister temp2 = locations->GetTemp(1).AsRegister<CpuRegister>();
  CpuRegister temp3 = locations->GetTemp(2).AsRegister<CpuRegister>();

  SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke);
  codegen_->AddSlowPath(slow_path);

  NearLabel conditions_on_positions_validated;
  SystemArrayCopyOptimizations optimizations(invoke);

  // If source and destination are the same, we go to slow path if we need to do
  // forward copying.
  if (src_pos.IsConstant()) {
    int32_t src_pos_constant = src_pos.GetConstant()->AsIntConstant()->GetValue();
    if (dest_pos.IsConstant()) {
      int32_t dest_pos_constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
      if (optimizations.GetDestinationIsSource()) {
        // Checked when building locations.
        DCHECK_GE(src_pos_constant, dest_pos_constant);
      } else if (src_pos_constant < dest_pos_constant) {
        __ cmpl(src, dest);
        __ j(kEqual, slow_path->GetEntryLabel());
      }
    } else {
      if (!optimizations.GetDestinationIsSource()) {
        __ cmpl(src, dest);
        __ j(kNotEqual, &conditions_on_positions_validated);
      }
      __ cmpl(dest_pos.AsRegister<CpuRegister>(), Immediate(src_pos_constant));
      __ j(kGreater, slow_path->GetEntryLabel());
    }
  } else {
    if (!optimizations.GetDestinationIsSource()) {
      __ cmpl(src, dest);
      __ j(kNotEqual, &conditions_on_positions_validated);
    }
    if (dest_pos.IsConstant()) {
      int32_t dest_pos_constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
      __ cmpl(src_pos.AsRegister<CpuRegister>(), Immediate(dest_pos_constant));
      __ j(kLess, slow_path->GetEntryLabel());
    } else {
      __ cmpl(src_pos.AsRegister<CpuRegister>(), dest_pos.AsRegister<CpuRegister>());
      __ j(kLess, slow_path->GetEntryLabel());
    }
  }

  __ Bind(&conditions_on_positions_validated);

  if (!optimizations.GetSourceIsNotNull()) {
    // Bail out if the source is null.
    __ testl(src, src);
    __ j(kEqual, slow_path->GetEntryLabel());
  }

  if (!optimizations.GetDestinationIsNotNull() && !optimizations.GetDestinationIsSource()) {
    // Bail out if the destination is null.
    __ testl(dest, dest);
    __ j(kEqual, slow_path->GetEntryLabel());
  }

  // If the length is negative, bail out.
  // We have already checked in the LocationsBuilder for the constant case.
  if (!length.IsConstant() &&
      !optimizations.GetCountIsSourceLength() &&
      !optimizations.GetCountIsDestinationLength()) {
    __ testl(length.AsRegister<CpuRegister>(), length.AsRegister<CpuRegister>());
    __ j(kLess, slow_path->GetEntryLabel());
  }

  // Validity checks: source.
  CheckPosition(assembler,
                src_pos,
                src,
                length,
                slow_path,
                temp1,
                temp2,
                optimizations.GetCountIsSourceLength());

  // Validity checks: dest.
  CheckPosition(assembler,
                dest_pos,
                dest,
                length,
                slow_path,
                temp1,
                temp2,
                optimizations.GetCountIsDestinationLength());

  if (!optimizations.GetDoesNotNeedTypeCheck()) {
    // Check whether all elements of the source array are assignable to the component
    // type of the destination array. We do two checks: the classes are the same,
    // or the destination is Object[]. If none of these checks succeed, we go to the
    // slow path.
    __ movl(temp1, Address(dest, class_offset));
    __ movl(temp2, Address(src, class_offset));
    bool did_unpoison = false;
    if (!optimizations.GetDestinationIsNonPrimitiveArray() ||
        !optimizations.GetSourceIsNonPrimitiveArray()) {
      // One or two of the references need to be unpoisoned. Unpoison them
      // both to make the identity check valid.
      __ MaybeUnpoisonHeapReference(temp1);
      __ MaybeUnpoisonHeapReference(temp2);
      did_unpoison = true;
    }

    if (!optimizations.GetDestinationIsNonPrimitiveArray()) {
      // Bail out if the destination is not a non primitive array.
      // /* HeapReference<Class> */ TMP = temp1->component_type_
      __ movl(CpuRegister(TMP), Address(temp1, component_offset));
      __ testl(CpuRegister(TMP), CpuRegister(TMP));
      __ j(kEqual, slow_path->GetEntryLabel());
      __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
      __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot));
      __ j(kNotEqual, slow_path->GetEntryLabel());
    }

    if (!optimizations.GetSourceIsNonPrimitiveArray()) {
      // Bail out if the source is not a non primitive array.
      // /* HeapReference<Class> */ TMP = temp2->component_type_
      __ movl(CpuRegister(TMP), Address(temp2, component_offset));
      __ testl(CpuRegister(TMP), CpuRegister(TMP));
      __ j(kEqual, slow_path->GetEntryLabel());
      __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
      __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot));
      __ j(kNotEqual, slow_path->GetEntryLabel());
    }

    __ cmpl(temp1, temp2);

    if (optimizations.GetDestinationIsTypedObjectArray()) {
      NearLabel do_copy;
      __ j(kEqual, &do_copy);
      if (!did_unpoison) {
        __ MaybeUnpoisonHeapReference(temp1);
      }
      // /* HeapReference<Class> */ temp1 = temp1->component_type_
      __ movl(temp1, Address(temp1, component_offset));
      __ MaybeUnpoisonHeapReference(temp1);
      // /* HeapReference<Class> */ temp1 = temp1->super_class_
      __ movl(temp1, Address(temp1, super_offset));
      // No need to unpoison the result, we're comparing against null.
      __ testl(temp1, temp1);
      __ j(kNotEqual, slow_path->GetEntryLabel());
      __ Bind(&do_copy);
    } else {
      __ j(kNotEqual, slow_path->GetEntryLabel());
    }
  } else if (!optimizations.GetSourceIsNonPrimitiveArray()) {
    DCHECK(optimizations.GetDestinationIsNonPrimitiveArray());
    // Bail out if the source is not a non primitive array.
    // /* HeapReference<Class> */ temp1 = src->klass_
    __ movl(temp1, Address(src, class_offset));
    __ MaybeUnpoisonHeapReference(temp1);
    // /* HeapReference<Class> */ TMP = temp1->component_type_
    __ movl(CpuRegister(TMP), Address(temp1, component_offset));
    __ testl(CpuRegister(TMP), CpuRegister(TMP));
    __ j(kEqual, slow_path->GetEntryLabel());
    __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
    __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot));
    __ j(kNotEqual, slow_path->GetEntryLabel());
  }

  // Compute base source address, base destination address, and end source address.

  uint32_t element_size = sizeof(int32_t);
  uint32_t offset = mirror::Array::DataOffset(element_size).Uint32Value();
  if (src_pos.IsConstant()) {
    int32_t constant = src_pos.GetConstant()->AsIntConstant()->GetValue();
    __ leal(temp1, Address(src, element_size * constant + offset));
  } else {
    __ leal(temp1, Address(src, src_pos.AsRegister<CpuRegister>(), ScaleFactor::TIMES_4, offset));
  }

  if (dest_pos.IsConstant()) {
    int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
    __ leal(temp2, Address(dest, element_size * constant + offset));
  } else {
    __ leal(temp2, Address(dest, dest_pos.AsRegister<CpuRegister>(), ScaleFactor::TIMES_4, offset));
  }

  if (length.IsConstant()) {
    int32_t constant = length.GetConstant()->AsIntConstant()->GetValue();
    __ leal(temp3, Address(temp1, element_size * constant));
  } else {
    __ leal(temp3, Address(temp1, length.AsRegister<CpuRegister>(), ScaleFactor::TIMES_4, 0));
  }

  // Iterate over the arrays and do a raw copy of the objects. We don't need to
  // poison/unpoison, nor do any read barrier as the next uses of the destination
  // array will do it.
  NearLabel loop, done;
  __ cmpl(temp1, temp3);
  __ j(kEqual, &done);
  __ Bind(&loop);
  __ movl(CpuRegister(TMP), Address(temp1, 0));
  __ movl(Address(temp2, 0), CpuRegister(TMP));
  __ addl(temp1, Immediate(element_size));
  __ addl(temp2, Immediate(element_size));
  __ cmpl(temp1, temp3);
  __ j(kNotEqual, &loop);
  __ Bind(&done);

  // We only need one card marking on the destination array.
  codegen_->MarkGCCard(temp1,
                       temp2,
                       dest,
                       CpuRegister(kNoRegister),
                       /* value_can_be_null */ false);

  __ Bind(slow_path->GetExitLabel());
}

void IntrinsicLocationsBuilderX86_64::VisitStringCompareTo(HInvoke* invoke) {
  LocationSummary* locations = new (arena_) LocationSummary(invoke,
                                                            LocationSummary::kCall,
                                                            kIntrinsified);
  InvokeRuntimeCallingConvention calling_convention;
  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
  locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
  locations->SetOut(Location::RegisterLocation(RAX));
}

void IntrinsicCodeGeneratorX86_64::VisitStringCompareTo(HInvoke* invoke) {
  X86_64Assembler* assembler = GetAssembler();
  LocationSummary* locations = invoke->GetLocations();

  // Note that the null check must have been done earlier.
  DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));

  CpuRegister argument = locations->InAt(1).AsRegister<CpuRegister>();
  __ testl(argument, argument);
  SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke);
  codegen_->AddSlowPath(slow_path);
  __ j(kEqual, slow_path->GetEntryLabel());

  __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pStringCompareTo),
                                  /* no_rip */ true));
  __ Bind(slow_path->GetExitLabel());
}

void IntrinsicLocationsBuilderX86_64::VisitStringEquals(HInvoke* invoke) {
  LocationSummary* locations = new (arena_) LocationSummary(invoke,
                                                            LocationSummary::kNoCall,
                                                            kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetInAt(1, Location::RequiresRegister());

  // Request temporary registers; RCX and RDI are needed for the repe_cmpsq instruction.
  locations->AddTemp(Location::RegisterLocation(RCX));
  locations->AddTemp(Location::RegisterLocation(RDI));

  // Set the output; RSI is needed for the repe_cmpsq instruction anyway.
1399  locations->SetOut(Location::RegisterLocation(RSI), Location::kOutputOverlap);
}

void IntrinsicCodeGeneratorX86_64::VisitStringEquals(HInvoke* invoke) {
  X86_64Assembler* assembler = GetAssembler();
  LocationSummary* locations = invoke->GetLocations();

  CpuRegister str = locations->InAt(0).AsRegister<CpuRegister>();
  CpuRegister arg = locations->InAt(1).AsRegister<CpuRegister>();
  CpuRegister rcx = locations->GetTemp(0).AsRegister<CpuRegister>();
  CpuRegister rdi = locations->GetTemp(1).AsRegister<CpuRegister>();
  CpuRegister rsi = locations->Out().AsRegister<CpuRegister>();

  NearLabel end, return_true, return_false;

  // Get offsets of count, value, and class fields within a string object.
  const uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
  const uint32_t value_offset = mirror::String::ValueOffset().Uint32Value();
  const uint32_t class_offset = mirror::Object::ClassOffset().Uint32Value();

  // Note that the null check must have been done earlier.
  DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));

  // Check if input is null, return false if it is.
  __ testl(arg, arg);
  __ j(kEqual, &return_false);

  // Instanceof check for the argument by comparing class fields.
  // All string objects must have the same type since String cannot be subclassed.
  // The receiver is known to be a string object, so its class field equals every string's
  // class field; if the argument's class field matches it, the argument is also a string.
  __ movl(rcx, Address(str, class_offset));
  __ cmpl(rcx, Address(arg, class_offset));
  __ j(kNotEqual, &return_false);

  // Reference equality check, return true if same reference.
  __ cmpl(str, arg);
  __ j(kEqual, &return_true);

  // Load length of receiver string.
  __ movl(rcx, Address(str, count_offset));
  // Check if lengths are equal, return false if they're not.
  __ cmpl(rcx, Address(arg, count_offset));
  __ j(kNotEqual, &return_false);
  // Return true if both strings are empty; jrcxz branches when RCX (the length) is zero.
  __ jrcxz(&return_true);

  // Load starting addresses of string values into RSI/RDI as required for repe_cmpsq instruction.
  __ leal(rsi, Address(str, value_offset));
  __ leal(rdi, Address(arg, value_offset));

  // Divide the string length by 4, rounding up: repe_cmpsq compares one 8-byte quadword
  // (four 16-bit chars) per iteration.
  __ addl(rcx, Immediate(3));
  __ shrl(rcx, Immediate(2));
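  // RCX now holds ceil(length / 4); e.g. a length of 5 yields (5 + 3) >> 2 = 2 quadword
  // comparisons. Chars read past the end of a string are zero padding (see the assertions
  // below), so they compare equal for strings of equal length.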

  // Assertions that must hold in order to compare strings 4 characters at a time.
  DCHECK_ALIGNED(value_offset, 8);
  static_assert(IsAligned<8>(kObjectAlignment), "String is not zero padded");

  // Loop to compare strings four characters at a time starting at the beginning of the string.
  __ repe_cmpsq();
  // If strings are not equal, zero flag will be cleared.
  __ j(kNotEqual, &return_false);

  // Return true and exit the function.
  // If loop does not result in returning false, we return true.
  __ Bind(&return_true);
  __ movl(rsi, Immediate(1));
  __ jmp(&end);

  // Return false and exit the function.
  __ Bind(&return_false);
  __ xorl(rsi, rsi);
  __ Bind(&end);
}

static void CreateStringIndexOfLocations(HInvoke* invoke,
                                         ArenaAllocator* allocator,
                                         bool start_at_zero) {
  LocationSummary* locations = new (allocator) LocationSummary(invoke,
                                                               LocationSummary::kCallOnSlowPath,
                                                               kIntrinsified);
  // The data needs to be in RDI for repne scasw, so request that the string is there anyway.
  locations->SetInAt(0, Location::RegisterLocation(RDI));
  // If we look for a constant char, we'll still have to copy it into RAX, so just request
  // the allocator to do that anyway. We can still do the constant check by inspecting the
  // instruction's parameter explicitly.
  // Note: this works because we never clobber RAX.
  locations->SetInAt(1, Location::RegisterLocation(RAX));
  if (!start_at_zero) {
    locations->SetInAt(2, Location::RequiresRegister());          // The starting index.
  }
  // As we clobber RDI during execution anyway, also use it as the output.
  locations->SetOut(Location::SameAsFirstInput());

  // repne scasw uses RCX as the counter.
  locations->AddTemp(Location::RegisterLocation(RCX));
  // Need another temporary to be able to compute the result.
  locations->AddTemp(Location::RequiresRegister());
}

static void GenerateStringIndexOf(HInvoke* invoke,
                                  X86_64Assembler* assembler,
                                  CodeGeneratorX86_64* codegen,
                                  ArenaAllocator* allocator,
                                  bool start_at_zero) {
  LocationSummary* locations = invoke->GetLocations();

  // Note that the null check must have been done earlier.
  DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));

  CpuRegister string_obj = locations->InAt(0).AsRegister<CpuRegister>();
  CpuRegister search_value = locations->InAt(1).AsRegister<CpuRegister>();
  CpuRegister counter = locations->GetTemp(0).AsRegister<CpuRegister>();
  CpuRegister string_length = locations->GetTemp(1).AsRegister<CpuRegister>();
  CpuRegister out = locations->Out().AsRegister<CpuRegister>();

  // Check our assumptions for registers.
  DCHECK_EQ(string_obj.AsRegister(), RDI);
  DCHECK_EQ(search_value.AsRegister(), RAX);
  DCHECK_EQ(counter.AsRegister(), RCX);
  DCHECK_EQ(out.AsRegister(), RDI);

  // Check for code points > 0xFFFF. Either a slow-path check when we don't know statically,
  // or directly dispatch to the slow path if we have a constant that is known to be out of
  // range.
  SlowPathCode* slow_path = nullptr;
  if (invoke->InputAt(1)->IsIntConstant()) {
    if (static_cast<uint32_t>(invoke->InputAt(1)->AsIntConstant()->GetValue()) >
        std::numeric_limits<uint16_t>::max()) {
      // Always needs the slow-path. We could directly dispatch to it, but this case should be
      // rare, so for simplicity just put the full slow-path down and branch unconditionally.
      slow_path = new (allocator) IntrinsicSlowPathX86_64(invoke);
      codegen->AddSlowPath(slow_path);
      __ jmp(slow_path->GetEntryLabel());
      __ Bind(slow_path->GetExitLabel());
      return;
    }
  } else {
    __ cmpl(search_value, Immediate(std::numeric_limits<uint16_t>::max()));
    slow_path = new (allocator) IntrinsicSlowPathX86_64(invoke);
    codegen->AddSlowPath(slow_path);
    __ j(kAbove, slow_path->GetEntryLabel());
  }

  // From here down, we know that we are looking for a char that fits in 16 bits.
  // Location of reference to data array within the String object.
  int32_t value_offset = mirror::String::ValueOffset().Int32Value();
  // Location of count within the String object.
  int32_t count_offset = mirror::String::CountOffset().Int32Value();

  // Load string length, i.e., the count field of the string.
  __ movl(string_length, Address(string_obj, count_offset));

  // Do a length check.
  // TODO: Support jecxz.
  NearLabel not_found_label;
  __ testl(string_length, string_length);
  __ j(kEqual, &not_found_label);

  if (start_at_zero) {
    // Number of chars to scan is the same as the string length.
    __ movl(counter, string_length);

    // Move to the start of the string.
    __ addq(string_obj, Immediate(value_offset));
  } else {
    CpuRegister start_index = locations->InAt(2).AsRegister<CpuRegister>();

    // Do a start_index check.
    __ cmpl(start_index, string_length);
    __ j(kGreaterEqual, &not_found_label);

    // Ensure we have a start index >= 0.
    __ xorl(counter, counter);
    __ cmpl(start_index, Immediate(0));
    __ cmov(kGreater, counter, start_index, /* is64bit */ false);  // 32-bit copy is enough.

    // Move to the start of the string: string_obj + value_offset + 2 * start_index.
    __ leaq(string_obj, Address(string_obj, counter, ScaleFactor::TIMES_2, value_offset));

    // Now update RCX, the work counter: it will be string.length - start_index.
    __ negq(counter);  // Needs to be 64-bit negation, as the address computation is 64-bit.
    __ leaq(counter, Address(string_length, counter, ScaleFactor::TIMES_1, 0));
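    // The negq + leaq compute string_length - start_index without clobbering
    // string_length, which is still needed to compute the result index below.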
  }

  // Everything is set up for repne scasw:
  //   * Comparison address in RDI.
  //   * Counter in ECX.
  __ repne_scasw();

  // Did we find a match?
  __ j(kNotEqual, &not_found_label);

  // Yes, we matched.  Compute the index of the result.
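  // repne scasw decremented RCX once per comparison, including the matching one,
  // so the match index is string_length - RCX - 1.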
  __ subl(string_length, counter);
  __ leal(out, Address(string_length, -1));

  NearLabel done;
  __ jmp(&done);

  // Failed to match; return -1.
  __ Bind(&not_found_label);
  __ movl(out, Immediate(-1));

  // And join up at the end.
  __ Bind(&done);
  if (slow_path != nullptr) {
    __ Bind(slow_path->GetExitLabel());
  }
}

void IntrinsicLocationsBuilderX86_64::VisitStringIndexOf(HInvoke* invoke) {
  CreateStringIndexOfLocations(invoke, arena_, /* start_at_zero */ true);
}

void IntrinsicCodeGeneratorX86_64::VisitStringIndexOf(HInvoke* invoke) {
  GenerateStringIndexOf(invoke, GetAssembler(), codegen_, GetAllocator(), /* start_at_zero */ true);
}

void IntrinsicLocationsBuilderX86_64::VisitStringIndexOfAfter(HInvoke* invoke) {
  CreateStringIndexOfLocations(invoke, arena_, /* start_at_zero */ false);
}

void IntrinsicCodeGeneratorX86_64::VisitStringIndexOfAfter(HInvoke* invoke) {
  GenerateStringIndexOf(
      invoke, GetAssembler(), codegen_, GetAllocator(), /* start_at_zero */ false);
}

void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromBytes(HInvoke* invoke) {
  LocationSummary* locations = new (arena_) LocationSummary(invoke,
                                                            LocationSummary::kCall,
                                                            kIntrinsified);
  InvokeRuntimeCallingConvention calling_convention;
  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
  locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
  locations->SetInAt(2, Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
  locations->SetInAt(3, Location::RegisterLocation(calling_convention.GetRegisterAt(3)));
  locations->SetOut(Location::RegisterLocation(RAX));
}

void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromBytes(HInvoke* invoke) {
  X86_64Assembler* assembler = GetAssembler();
  LocationSummary* locations = invoke->GetLocations();

  CpuRegister byte_array = locations->InAt(0).AsRegister<CpuRegister>();
  __ testl(byte_array, byte_array);
  SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke);
  codegen_->AddSlowPath(slow_path);
  __ j(kEqual, slow_path->GetEntryLabel());

  __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pAllocStringFromBytes),
                                  /* no_rip */ true));
  CheckEntrypointTypes<kQuickAllocStringFromBytes, void*, void*, int32_t, int32_t, int32_t>();
  codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
  __ Bind(slow_path->GetExitLabel());
}

void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromChars(HInvoke* invoke) {
  LocationSummary* locations = new (arena_) LocationSummary(invoke,
                                                            LocationSummary::kCall,
                                                            kIntrinsified);
  InvokeRuntimeCallingConvention calling_convention;
  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
  locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
  locations->SetInAt(2, Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
  locations->SetOut(Location::RegisterLocation(RAX));
}

void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromChars(HInvoke* invoke) {
  X86_64Assembler* assembler = GetAssembler();

  // No need to emit code checking whether `locations->InAt(2)` is a null
  // pointer, as callers of the native method
  //
  //   java.lang.StringFactory.newStringFromChars(int offset, int charCount, char[] data)
  //
  // all include a null check on `data` before calling that method.
  __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pAllocStringFromChars),
                                  /* no_rip */ true));
  CheckEntrypointTypes<kQuickAllocStringFromChars, void*, int32_t, int32_t, void*>();
  codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
}

void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromString(HInvoke* invoke) {
  LocationSummary* locations = new (arena_) LocationSummary(invoke,
                                                            LocationSummary::kCall,
                                                            kIntrinsified);
  InvokeRuntimeCallingConvention calling_convention;
  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
  locations->SetOut(Location::RegisterLocation(RAX));
}

void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromString(HInvoke* invoke) {
  X86_64Assembler* assembler = GetAssembler();
  LocationSummary* locations = invoke->GetLocations();

  CpuRegister string_to_copy = locations->InAt(0).AsRegister<CpuRegister>();
  __ testl(string_to_copy, string_to_copy);
  SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke);
  codegen_->AddSlowPath(slow_path);
  __ j(kEqual, slow_path->GetEntryLabel());

  __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pAllocStringFromString),
                                  /* no_rip */ true));
  CheckEntrypointTypes<kQuickAllocStringFromString, void*, void*>();
  codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
  __ Bind(slow_path->GetExitLabel());
}

void IntrinsicLocationsBuilderX86_64::VisitStringGetCharsNoCheck(HInvoke* invoke) {
  // public void getChars(int srcBegin, int srcEnd, char[] dst, int dstBegin);
  LocationSummary* locations = new (arena_) LocationSummary(invoke,
                                                            LocationSummary::kNoCall,
                                                            kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetInAt(1, Location::RegisterOrConstant(invoke->InputAt(1)));
  locations->SetInAt(2, Location::RequiresRegister());
  locations->SetInAt(3, Location::RequiresRegister());
  locations->SetInAt(4, Location::RequiresRegister());

  // And we need some temporaries.  We will use REP MOVSW, so we need fixed registers.
  locations->AddTemp(Location::RegisterLocation(RSI));
  locations->AddTemp(Location::RegisterLocation(RDI));
  locations->AddTemp(Location::RegisterLocation(RCX));
}

void IntrinsicCodeGeneratorX86_64::VisitStringGetCharsNoCheck(HInvoke* invoke) {
  X86_64Assembler* assembler = GetAssembler();
  LocationSummary* locations = invoke->GetLocations();

  size_t char_component_size = Primitive::ComponentSize(Primitive::kPrimChar);
  // Location of data in char array buffer.
  const uint32_t data_offset = mirror::Array::DataOffset(char_component_size).Uint32Value();
  // Location of char array data in string.
  const uint32_t value_offset = mirror::String::ValueOffset().Uint32Value();

  // public void getChars(int srcBegin, int srcEnd, char[] dst, int dstBegin);
  CpuRegister obj = locations->InAt(0).AsRegister<CpuRegister>();
  Location srcBegin = locations->InAt(1);
  int srcBegin_value =
    srcBegin.IsConstant() ? srcBegin.GetConstant()->AsIntConstant()->GetValue() : 0;
  CpuRegister srcEnd = locations->InAt(2).AsRegister<CpuRegister>();
  CpuRegister dst = locations->InAt(3).AsRegister<CpuRegister>();
  CpuRegister dstBegin = locations->InAt(4).AsRegister<CpuRegister>();

  // Check assumption that sizeof(Char) is 2 (used in scaling below).
  const size_t char_size = Primitive::ComponentSize(Primitive::kPrimChar);
  DCHECK_EQ(char_size, 2u);

  // Compute the address of the destination buffer.
  __ leaq(CpuRegister(RDI), Address(dst, dstBegin, ScaleFactor::TIMES_2, data_offset));

  // Compute the address of the source string.
  if (srcBegin.IsConstant()) {
    // Compute the address of the source string by adding the number of chars from
    // the source beginning to the value offset of a string.
    __ leaq(CpuRegister(RSI), Address(obj, srcBegin_value * char_size + value_offset));
  } else {
    __ leaq(CpuRegister(RSI), Address(obj, srcBegin.AsRegister<CpuRegister>(),
                                      ScaleFactor::TIMES_2, value_offset));
  }

  // Compute the number of chars (words) to move.
  __ movl(CpuRegister(RCX), srcEnd);
  if (srcBegin.IsConstant()) {
    if (srcBegin_value != 0) {
      __ subl(CpuRegister(RCX), Immediate(srcBegin_value));
    }
  } else {
    DCHECK(srcBegin.IsRegister());
    __ subl(CpuRegister(RCX), srcBegin.AsRegister<CpuRegister>());
  }

  // Do the move.
  __ rep_movsw();
}

static void GenPeek(LocationSummary* locations, Primitive::Type size, X86_64Assembler* assembler) {
  CpuRegister address = locations->InAt(0).AsRegister<CpuRegister>();
  CpuRegister out = locations->Out().AsRegister<CpuRegister>();  // == address, here for clarity.
  // x86 allows unaligned access. We do not have to check the input or use specific instructions
  // to avoid a SIGBUS.
  switch (size) {
    case Primitive::kPrimByte:
      __ movsxb(out, Address(address, 0));
      break;
    case Primitive::kPrimShort:
      __ movsxw(out, Address(address, 0));
      break;
    case Primitive::kPrimInt:
      __ movl(out, Address(address, 0));
      break;
    case Primitive::kPrimLong:
      __ movq(out, Address(address, 0));
      break;
    default:
      LOG(FATAL) << "Type not recognized for peek: " << size;
      UNREACHABLE();
  }
}

void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekByte(HInvoke* invoke) {
  CreateIntToIntLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekByte(HInvoke* invoke) {
  GenPeek(invoke->GetLocations(), Primitive::kPrimByte, GetAssembler());
}

void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekIntNative(HInvoke* invoke) {
  CreateIntToIntLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekIntNative(HInvoke* invoke) {
  GenPeek(invoke->GetLocations(), Primitive::kPrimInt, GetAssembler());
}

void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekLongNative(HInvoke* invoke) {
  CreateIntToIntLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekLongNative(HInvoke* invoke) {
  GenPeek(invoke->GetLocations(), Primitive::kPrimLong, GetAssembler());
}

void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekShortNative(HInvoke* invoke) {
  CreateIntToIntLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekShortNative(HInvoke* invoke) {
  GenPeek(invoke->GetLocations(), Primitive::kPrimShort, GetAssembler());
}

static void CreateIntIntToVoidLocations(ArenaAllocator* arena, HInvoke* invoke) {
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kNoCall,
                                                           kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetInAt(1, Location::RegisterOrInt32Constant(invoke->InputAt(1)));
}

static void GenPoke(LocationSummary* locations, Primitive::Type size, X86_64Assembler* assembler) {
  CpuRegister address = locations->InAt(0).AsRegister<CpuRegister>();
  Location value = locations->InAt(1);
  // x86 allows unaligned access. We do not have to check the input or use specific instructions
  // to avoid a SIGBUS.
  switch (size) {
    case Primitive::kPrimByte:
      if (value.IsConstant()) {
        __ movb(Address(address, 0),
                Immediate(CodeGenerator::GetInt32ValueOf(value.GetConstant())));
      } else {
        __ movb(Address(address, 0), value.AsRegister<CpuRegister>());
      }
      break;
    case Primitive::kPrimShort:
      if (value.IsConstant()) {
        __ movw(Address(address, 0),
                Immediate(CodeGenerator::GetInt32ValueOf(value.GetConstant())));
      } else {
        __ movw(Address(address, 0), value.AsRegister<CpuRegister>());
      }
      break;
    case Primitive::kPrimInt:
      if (value.IsConstant()) {
        __ movl(Address(address, 0),
                Immediate(CodeGenerator::GetInt32ValueOf(value.GetConstant())));
      } else {
        __ movl(Address(address, 0), value.AsRegister<CpuRegister>());
      }
      break;
    case Primitive::kPrimLong:
      if (value.IsConstant()) {
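        // CreateIntIntToVoidLocations (RegisterOrInt32Constant) only allows long constants
        // that fit in 32 bits, so the value can be encoded as a sign-extended immediate.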
        int64_t v = value.GetConstant()->AsLongConstant()->GetValue();
        DCHECK(IsInt<32>(v));
        int32_t v_32 = v;
        __ movq(Address(address, 0), Immediate(v_32));
      } else {
        __ movq(Address(address, 0), value.AsRegister<CpuRegister>());
      }
      break;
    default:
      LOG(FATAL) << "Type not recognized for poke: " << size;
      UNREACHABLE();
  }
}

void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeByte(HInvoke* invoke) {
  CreateIntIntToVoidLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeByte(HInvoke* invoke) {
  GenPoke(invoke->GetLocations(), Primitive::kPrimByte, GetAssembler());
}

void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeIntNative(HInvoke* invoke) {
  CreateIntIntToVoidLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeIntNative(HInvoke* invoke) {
  GenPoke(invoke->GetLocations(), Primitive::kPrimInt, GetAssembler());
}

void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeLongNative(HInvoke* invoke) {
  CreateIntIntToVoidLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeLongNative(HInvoke* invoke) {
  GenPoke(invoke->GetLocations(), Primitive::kPrimLong, GetAssembler());
}

void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeShortNative(HInvoke* invoke) {
  CreateIntIntToVoidLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeShortNative(HInvoke* invoke) {
  GenPoke(invoke->GetLocations(), Primitive::kPrimShort, GetAssembler());
}

void IntrinsicLocationsBuilderX86_64::VisitThreadCurrentThread(HInvoke* invoke) {
  LocationSummary* locations = new (arena_) LocationSummary(invoke,
                                                            LocationSummary::kNoCall,
                                                            kIntrinsified);
  locations->SetOut(Location::RequiresRegister());
}

void IntrinsicCodeGeneratorX86_64::VisitThreadCurrentThread(HInvoke* invoke) {
  CpuRegister out = invoke->GetLocations()->Out().AsRegister<CpuRegister>();
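  // Load the java.lang.Thread peer object for the current thread from the gs:-relative
  // Thread slot.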
  GetAssembler()->gs()->movl(out, Address::Absolute(Thread::PeerOffset<kX86_64WordSize>(),
                                                    /* no_rip */ true));
}

static void GenUnsafeGet(HInvoke* invoke,
                         Primitive::Type type,
                         bool is_volatile ATTRIBUTE_UNUSED,
                         CodeGeneratorX86_64* codegen) {
  X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen->GetAssembler());
  LocationSummary* locations = invoke->GetLocations();
  Location base_loc = locations->InAt(1);
  CpuRegister base = base_loc.AsRegister<CpuRegister>();
  Location offset_loc = locations->InAt(2);
  CpuRegister offset = offset_loc.AsRegister<CpuRegister>();
  Location output_loc = locations->Out();
  CpuRegister output = output_loc.AsRegister<CpuRegister>();

  switch (type) {
    case Primitive::kPrimInt:
      __ movl(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
      break;

    case Primitive::kPrimNot: {
      if (kEmitCompilerReadBarrier) {
        if (kUseBakerReadBarrier) {
          Location temp = locations->GetTemp(0);
          codegen->GenerateArrayLoadWithBakerReadBarrier(
              invoke, output_loc, base, 0U, offset_loc, temp, /* needs_null_check */ false);
        } else {
          __ movl(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
          codegen->GenerateReadBarrierSlow(
              invoke, output_loc, output_loc, base_loc, 0U, offset_loc);
        }
      } else {
        __ movl(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
        __ MaybeUnpoisonHeapReference(output);
      }
      break;
    }

    case Primitive::kPrimLong:
      __ movq(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
      break;

    default:
      LOG(FATAL) << "Unsupported op size " << type;
      UNREACHABLE();
  }
}

static void CreateIntIntIntToIntLocations(ArenaAllocator* arena,
                                          HInvoke* invoke,
                                          Primitive::Type type) {
  bool can_call = kEmitCompilerReadBarrier &&
      (invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObject ||
       invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile);
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           can_call ?
                                                               LocationSummary::kCallOnSlowPath :
                                                               LocationSummary::kNoCall,
                                                           kIntrinsified);
  locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
  locations->SetInAt(1, Location::RequiresRegister());
  locations->SetInAt(2, Location::RequiresRegister());
  locations->SetOut(Location::RequiresRegister());
  if (type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
    // We need a temporary register for the read barrier marking slow
    // path in CodeGeneratorX86_64::GenerateArrayLoadWithBakerReadBarrier.
    locations->AddTemp(Location::RequiresRegister());
  }
}

void IntrinsicLocationsBuilderX86_64::VisitUnsafeGet(HInvoke* invoke) {
  CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimInt);
}
void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetVolatile(HInvoke* invoke) {
  CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimInt);
}
void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetLong(HInvoke* invoke) {
  CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimLong);
}
void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
  CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimLong);
}
void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetObject(HInvoke* invoke) {
  CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimNot);
}
void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
  CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimNot);
}


void IntrinsicCodeGeneratorX86_64::VisitUnsafeGet(HInvoke* invoke) {
  GenUnsafeGet(invoke, Primitive::kPrimInt, /* is_volatile */ false, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetVolatile(HInvoke* invoke) {
  GenUnsafeGet(invoke, Primitive::kPrimInt, /* is_volatile */ true, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetLong(HInvoke* invoke) {
  GenUnsafeGet(invoke, Primitive::kPrimLong, /* is_volatile */ false, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
  GenUnsafeGet(invoke, Primitive::kPrimLong, /* is_volatile */ true, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetObject(HInvoke* invoke) {
  GenUnsafeGet(invoke, Primitive::kPrimNot, /* is_volatile */ false, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
  GenUnsafeGet(invoke, Primitive::kPrimNot, /* is_volatile */ true, codegen_);
}


static void CreateIntIntIntIntToVoidPlusTempsLocations(ArenaAllocator* arena,
                                                       Primitive::Type type,
                                                       HInvoke* invoke) {
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kNoCall,
                                                           kIntrinsified);
  locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
  locations->SetInAt(1, Location::RequiresRegister());
  locations->SetInAt(2, Location::RequiresRegister());
  locations->SetInAt(3, Location::RequiresRegister());
  if (type == Primitive::kPrimNot) {
    // Need temp registers for card-marking.
    locations->AddTemp(Location::RequiresRegister());  // Possibly used for reference poisoning too.
    locations->AddTemp(Location::RequiresRegister());
  }
}

void IntrinsicLocationsBuilderX86_64::VisitUnsafePut(HInvoke* invoke) {
  CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimInt, invoke);
}
void IntrinsicLocationsBuilderX86_64::VisitUnsafePutOrdered(HInvoke* invoke) {
  CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimInt, invoke);
}
void IntrinsicLocationsBuilderX86_64::VisitUnsafePutVolatile(HInvoke* invoke) {
  CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimInt, invoke);
}
void IntrinsicLocationsBuilderX86_64::VisitUnsafePutObject(HInvoke* invoke) {
  CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimNot, invoke);
}
void IntrinsicLocationsBuilderX86_64::VisitUnsafePutObjectOrdered(HInvoke* invoke) {
  CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimNot, invoke);
}
void IntrinsicLocationsBuilderX86_64::VisitUnsafePutObjectVolatile(HInvoke* invoke) {
  CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimNot, invoke);
}
void IntrinsicLocationsBuilderX86_64::VisitUnsafePutLong(HInvoke* invoke) {
  CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimLong, invoke);
}
void IntrinsicLocationsBuilderX86_64::VisitUnsafePutLongOrdered(HInvoke* invoke) {
  CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimLong, invoke);
}
void IntrinsicLocationsBuilderX86_64::VisitUnsafePutLongVolatile(HInvoke* invoke) {
  CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimLong, invoke);
}

// We don't care about ordered: it requires an AnyStore barrier, which is already given by
// the x86 memory model.
static void GenUnsafePut(LocationSummary* locations, Primitive::Type type, bool is_volatile,
                         CodeGeneratorX86_64* codegen) {
  X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen->GetAssembler());
  CpuRegister base = locations->InAt(1).AsRegister<CpuRegister>();
  CpuRegister offset = locations->InAt(2).AsRegister<CpuRegister>();
  CpuRegister value = locations->InAt(3).AsRegister<CpuRegister>();

  if (type == Primitive::kPrimLong) {
    __ movq(Address(base, offset, ScaleFactor::TIMES_1, 0), value);
  } else if (kPoisonHeapReferences && type == Primitive::kPrimNot) {
    CpuRegister temp = locations->GetTemp(0).AsRegister<CpuRegister>();
    __ movl(temp, value);
    __ PoisonHeapReference(temp);
    __ movl(Address(base, offset, ScaleFactor::TIMES_1, 0), temp);
  } else {
    __ movl(Address(base, offset, ScaleFactor::TIMES_1, 0), value);
  }

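  // A volatile store additionally needs a StoreLoad barrier after the write; the other
  // orderings are already provided by the x86 memory model (see the comment above).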
  if (is_volatile) {
    codegen->MemoryFence();
  }

  if (type == Primitive::kPrimNot) {
    bool value_can_be_null = true;  // TODO: Worth finding out this information?
    codegen->MarkGCCard(locations->GetTemp(0).AsRegister<CpuRegister>(),
                        locations->GetTemp(1).AsRegister<CpuRegister>(),
                        base,
                        value,
                        value_can_be_null);
  }
}

void IntrinsicCodeGeneratorX86_64::VisitUnsafePut(HInvoke* invoke) {
  GenUnsafePut(invoke->GetLocations(), Primitive::kPrimInt, /* is_volatile */ false, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafePutOrdered(HInvoke* invoke) {
  GenUnsafePut(invoke->GetLocations(), Primitive::kPrimInt, /* is_volatile */ false, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafePutVolatile(HInvoke* invoke) {
  GenUnsafePut(invoke->GetLocations(), Primitive::kPrimInt, /* is_volatile */ true, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafePutObject(HInvoke* invoke) {
  GenUnsafePut(invoke->GetLocations(), Primitive::kPrimNot, /* is_volatile */ false, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafePutObjectOrdered(HInvoke* invoke) {
  GenUnsafePut(invoke->GetLocations(), Primitive::kPrimNot, /* is_volatile */ false, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafePutObjectVolatile(HInvoke* invoke) {
  GenUnsafePut(invoke->GetLocations(), Primitive::kPrimNot, /* is_volatile */ true, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafePutLong(HInvoke* invoke) {
  GenUnsafePut(invoke->GetLocations(), Primitive::kPrimLong, /* is_volatile */ false, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafePutLongOrdered(HInvoke* invoke) {
  GenUnsafePut(invoke->GetLocations(), Primitive::kPrimLong, /* is_volatile */ false, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafePutLongVolatile(HInvoke* invoke) {
  GenUnsafePut(invoke->GetLocations(), Primitive::kPrimLong, /* is_volatile */ true, codegen_);
}

static void CreateIntIntIntIntIntToInt(ArenaAllocator* arena, Primitive::Type type,
                                       HInvoke* invoke) {
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kNoCall,
                                                           kIntrinsified);
  locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
  locations->SetInAt(1, Location::RequiresRegister());
  locations->SetInAt(2, Location::RequiresRegister());
  // The expected value must be in RAX, as required by the CMPXCHG instruction.
  locations->SetInAt(3, Location::RegisterLocation(RAX));
  locations->SetInAt(4, Location::RequiresRegister());

  locations->SetOut(Location::RequiresRegister());
  if (type == Primitive::kPrimNot) {
    // Need temp registers for card-marking.
    locations->AddTemp(Location::RequiresRegister());  // Possibly used for reference poisoning too.
    locations->AddTemp(Location::RequiresRegister());
  }
}

void IntrinsicLocationsBuilderX86_64::VisitUnsafeCASInt(HInvoke* invoke) {
  CreateIntIntIntIntIntToInt(arena_, Primitive::kPrimInt, invoke);
}

void IntrinsicLocationsBuilderX86_64::VisitUnsafeCASLong(HInvoke* invoke) {
  CreateIntIntIntIntIntToInt(arena_, Primitive::kPrimLong, invoke);
}

void IntrinsicLocationsBuilderX86_64::VisitUnsafeCASObject(HInvoke* invoke) {
  // The UnsafeCASObject intrinsic is missing a read barrier, and
  // therefore sometimes does not work as expected (b/25883050).
  // Turn it off temporarily as a quick fix, until the read barrier is
  // implemented.
  //
  // TODO(rpl): Implement a read barrier in GenCAS below and re-enable
  // this intrinsic.
  if (kEmitCompilerReadBarrier) {
    return;
  }

  CreateIntIntIntIntIntToInt(arena_, Primitive::kPrimNot, invoke);
}

static void GenCAS(Primitive::Type type, HInvoke* invoke, CodeGeneratorX86_64* codegen) {
  X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen->GetAssembler());
  LocationSummary* locations = invoke->GetLocations();

  CpuRegister base = locations->InAt(1).AsRegister<CpuRegister>();
  CpuRegister offset = locations->InAt(2).AsRegister<CpuRegister>();
  CpuRegister expected = locations->InAt(3).AsRegister<CpuRegister>();
  // Ensure `expected` is in RAX (required by the CMPXCHG instruction).
  DCHECK_EQ(expected.AsRegister(), RAX);
  CpuRegister value = locations->InAt(4).AsRegister<CpuRegister>();
  CpuRegister out = locations->Out().AsRegister<CpuRegister>();

  if (type == Primitive::kPrimNot) {
    // Mark card for object assuming new value is stored.
    bool value_can_be_null = true;  // TODO: Worth finding out this information?
    codegen->MarkGCCard(locations->GetTemp(0).AsRegister<CpuRegister>(),
                        locations->GetTemp(1).AsRegister<CpuRegister>(),
                        base,
                        value,
                        value_can_be_null);

    bool base_equals_value = (base.AsRegister() == value.AsRegister());
    Register value_reg = value.AsRegister();
    if (kPoisonHeapReferences) {
      if (base_equals_value) {
        // If `base` and `value` are the same register location, move
        // `value_reg` to a temporary register.  This way, poisoning
        // `value_reg` won't invalidate `base`.
        value_reg = locations->GetTemp(0).AsRegister<CpuRegister>().AsRegister();
        __ movl(CpuRegister(value_reg), base);
      }

      // Check that the register allocator did not assign the location
      // of `expected` (RAX) to `value` nor to `base`, so that heap
      // poisoning (when enabled) works as intended below.
      // - If `value` were equal to `expected`, both references would
      //   be poisoned twice, meaning they would not be poisoned at
      //   all, as heap poisoning uses address negation.
      // - If `base` were equal to `expected`, poisoning `expected`
      //   would invalidate `base`.
      DCHECK_NE(value_reg, expected.AsRegister());
      DCHECK_NE(base.AsRegister(), expected.AsRegister());

      __ PoisonHeapReference(expected);
      __ PoisonHeapReference(CpuRegister(value_reg));
    }

    // TODO: Add a read barrier for the reference stored in the object
    // before attempting the CAS, similar to the one in the
    // art::Unsafe_compareAndSwapObject JNI implementation.
    //
    // Note that this code is not (yet) used when read barriers are
    // enabled (see IntrinsicLocationsBuilderX86_64::VisitUnsafeCASObject).
    DCHECK(!kEmitCompilerReadBarrier);
    __ LockCmpxchgl(Address(base, offset, TIMES_1, 0), CpuRegister(value_reg));

    // LOCK CMPXCHG has full barrier semantics, and we don't need
    // scheduling barriers at this time.

    // Convert ZF into the boolean result.
    __ setcc(kZero, out);
    __ movzxb(out, out);

    // If heap poisoning is enabled, we need to unpoison the values
    // that were poisoned earlier.
    if (kPoisonHeapReferences) {
      if (base_equals_value) {
        // `value_reg` has been moved to a temporary register, no need
        // to unpoison it.
      } else {
        // Ensure `value` is different from `out`, so that unpoisoning
        // the former does not invalidate the latter.
        DCHECK_NE(value_reg, out.AsRegister());
        __ UnpoisonHeapReference(CpuRegister(value_reg));
      }
      // Ensure `expected` is different from `out`, so that unpoisoning
      // the former does not invalidate the latter.
      DCHECK_NE(expected.AsRegister(), out.AsRegister());
      __ UnpoisonHeapReference(expected);
    }
  } else {
    if (type == Primitive::kPrimInt) {
      __ LockCmpxchgl(Address(base, offset, TIMES_1, 0), value);
    } else if (type == Primitive::kPrimLong) {
      __ LockCmpxchgq(Address(base, offset, TIMES_1, 0), value);
    } else {
      LOG(FATAL) << "Unexpected CAS type " << type;
    }

    // LOCK CMPXCHG has full barrier semantics, and we don't need
    // scheduling barriers at this time.

    // Convert ZF into the boolean result.
    __ setcc(kZero, out);
    __ movzxb(out, out);
  }
}

void IntrinsicCodeGeneratorX86_64::VisitUnsafeCASInt(HInvoke* invoke) {
  GenCAS(Primitive::kPrimInt, invoke, codegen_);
}

void IntrinsicCodeGeneratorX86_64::VisitUnsafeCASLong(HInvoke* invoke) {
  GenCAS(Primitive::kPrimLong, invoke, codegen_);
}

void IntrinsicCodeGeneratorX86_64::VisitUnsafeCASObject(HInvoke* invoke) {
  GenCAS(Primitive::kPrimNot, invoke, codegen_);
}

void IntrinsicLocationsBuilderX86_64::VisitIntegerReverse(HInvoke* invoke) {
  LocationSummary* locations = new (arena_) LocationSummary(invoke,
                                                            LocationSummary::kNoCall,
                                                            kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetOut(Location::SameAsFirstInput());
  locations->AddTemp(Location::RequiresRegister());
}

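// Emits code computing reg = ((reg >> shift) & mask) | ((reg & mask) << shift),
// using temp as scratch.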
static void SwapBits(CpuRegister reg, CpuRegister temp, int32_t shift, int32_t mask,
                     X86_64Assembler* assembler) {
  Immediate imm_shift(shift);
  Immediate imm_mask(mask);
  __ movl(temp, reg);
  __ shrl(reg, imm_shift);
  __ andl(temp, imm_mask);
  __ andl(reg, imm_mask);
  __ shll(temp, imm_shift);
  __ orl(reg, temp);
}

void IntrinsicCodeGeneratorX86_64::VisitIntegerReverse(HInvoke* invoke) {
  X86_64Assembler* assembler = GetAssembler();
  LocationSummary* locations = invoke->GetLocations();

  CpuRegister reg = locations->InAt(0).AsRegister<CpuRegister>();
  CpuRegister temp = locations->GetTemp(0).AsRegister<CpuRegister>();

  /*
   * Use one bswap instruction to reverse byte order first and then use 3 rounds of
   * swapping bits to reverse bits in a number x. Using bswap to save instructions
   * compared to generic luni implementation which has 5 rounds of swapping bits.
   * x = bswap x
   * x = (x & 0x55555555) << 1 | (x >> 1) & 0x55555555;
   * x = (x & 0x33333333) << 2 | (x >> 2) & 0x33333333;
   * x = (x & 0x0F0F0F0F) << 4 | (x >> 4) & 0x0F0F0F0F;
   */
  __ bswapl(reg);
  SwapBits(reg, temp, 1, 0x55555555, assembler);
  SwapBits(reg, temp, 2, 0x33333333, assembler);
  SwapBits(reg, temp, 4, 0x0f0f0f0f, assembler);
}

void IntrinsicLocationsBuilderX86_64::VisitLongReverse(HInvoke* invoke) {
  LocationSummary* locations = new (arena_) LocationSummary(invoke,
                                                            LocationSummary::kNoCall,
                                                            kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetOut(Location::SameAsFirstInput());
  locations->AddTemp(Location::RequiresRegister());
  locations->AddTemp(Location::RequiresRegister());
}

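// Same bit-swapping step as SwapBits above, but on 64-bit values; the 64-bit mask does
// not fit in an immediate operand, so it is materialized in temp_mask first.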
static void SwapBits64(CpuRegister reg, CpuRegister temp, CpuRegister temp_mask,
                       int32_t shift, int64_t mask, X86_64Assembler* assembler) {
  Immediate imm_shift(shift);
  __ movq(temp_mask, Immediate(mask));
  __ movq(temp, reg);
  __ shrq(reg, imm_shift);
  __ andq(temp, temp_mask);
  __ andq(reg, temp_mask);
  __ shlq(temp, imm_shift);
  __ orq(reg, temp);
}

void IntrinsicCodeGeneratorX86_64::VisitLongReverse(HInvoke* invoke) {
  X86_64Assembler* assembler = GetAssembler();
  LocationSummary* locations = invoke->GetLocations();

  CpuRegister reg = locations->InAt(0).AsRegister<CpuRegister>();
  CpuRegister temp1 = locations->GetTemp(0).AsRegister<CpuRegister>();
  CpuRegister temp2 = locations->GetTemp(1).AsRegister<CpuRegister>();

  /*
   * Use one bswap instruction to reverse byte order first and then use 3 rounds of
   * swapping bits to reverse bits in a long number x. Using bswap to save instructions
   * compared to generic luni implementation which has 5 rounds of swapping bits.
   * x = bswap x
   * x = (x & 0x5555555555555555) << 1 | (x >> 1) & 0x5555555555555555;
   * x = (x & 0x3333333333333333) << 2 | (x >> 2) & 0x3333333333333333;
   * x = (x & 0x0F0F0F0F0F0F0F0F) << 4 | (x >> 4) & 0x0F0F0F0F0F0F0F0F;
   */
  __ bswapq(reg);
  SwapBits64(reg, temp1, temp2, 1, INT64_C(0x5555555555555555), assembler);
  SwapBits64(reg, temp1, temp2, 2, INT64_C(0x3333333333333333), assembler);
  SwapBits64(reg, temp1, temp2, 4, INT64_C(0x0f0f0f0f0f0f0f0f), assembler);
}

static void CreateBitCountLocations(
    ArenaAllocator* arena, CodeGeneratorX86_64* codegen, HInvoke* invoke) {
  if (!codegen->GetInstructionSetFeatures().HasPopCnt()) {
    // Do nothing if there is no popcnt support. This results in generating
    // a call for the intrinsic rather than direct code.
    return;
  }
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kNoCall,
                                                           kIntrinsified);
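  // The input may be in a register or on the stack: POPCNT accepts a memory operand directly.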
  locations->SetInAt(0, Location::Any());
  locations->SetOut(Location::RequiresRegister());
}

static void GenBitCount(X86_64Assembler* assembler,
                        CodeGeneratorX86_64* codegen,
                        HInvoke* invoke,
                        bool is_long) {
  LocationSummary* locations = invoke->GetLocations();
  Location src = locations->InAt(0);
  CpuRegister out = locations->Out().AsRegister<CpuRegister>();

  if (invoke->InputAt(0)->IsConstant()) {
    // Evaluate this at compile time.
    int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
    int32_t result = is_long
        ? POPCOUNT(static_cast<uint64_t>(value))
        : POPCOUNT(static_cast<uint32_t>(value));
    codegen->Load32BitValue(out, result);
    return;
  }

  if (src.IsRegister()) {
    if (is_long) {
      __ popcntq(out, src.AsRegister<CpuRegister>());
    } else {
      __ popcntl(out, src.AsRegister<CpuRegister>());
    }
  } else if (is_long) {
    DCHECK(src.IsDoubleStackSlot());
    __ popcntq(out, Address(CpuRegister(RSP), src.GetStackIndex()));
  } else {
    DCHECK(src.IsStackSlot());
    __ popcntl(out, Address(CpuRegister(RSP), src.GetStackIndex()));
  }
}

void IntrinsicLocationsBuilderX86_64::VisitIntegerBitCount(HInvoke* invoke) {
  CreateBitCountLocations(arena_, codegen_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitIntegerBitCount(HInvoke* invoke) {
  GenBitCount(GetAssembler(), codegen_, invoke, /* is_long */ false);
}

void IntrinsicLocationsBuilderX86_64::VisitLongBitCount(HInvoke* invoke) {
  CreateBitCountLocations(arena_, codegen_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitLongBitCount(HInvoke* invoke) {
  GenBitCount(GetAssembler(), codegen_, invoke, /* is_long */ true);
}

static void CreateOneBitLocations(ArenaAllocator* arena, HInvoke* invoke, bool is_high) {
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kNoCall,
                                                           kIntrinsified);
  locations->SetInAt(0, Location::Any());
  locations->SetOut(Location::RequiresRegister());
  locations->AddTemp(is_high ? Location::RegisterLocation(RCX)  // needs CL
                             : Location::RequiresRegister());  // any will do
}

static void GenOneBit(X86_64Assembler* assembler,
                      CodeGeneratorX86_64* codegen,
                      HInvoke* invoke,
                      bool is_high, bool is_long) {
  LocationSummary* locations = invoke->GetLocations();
  Location src = locations->InAt(0);
  CpuRegister out = locations->Out().AsRegister<CpuRegister>();

  if (invoke->InputAt(0)->IsConstant()) {
    // Evaluate this at compile time.
    int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
    if (value == 0) {
      __ xorl(out, out);  // Clears upper bits too.
      return;
    }
    // Nonzero value.
    if (is_high) {
      value = is_long ? 63 - CLZ(static_cast<uint64_t>(value))
                      : 31 - CLZ(static_cast<uint32_t>(value));
    } else {
      value = is_long ? CTZ(static_cast<uint64_t>(value))
                      : CTZ(static_cast<uint32_t>(value));
    }
    if (is_long) {
      codegen->Load64BitValue(out, 1L << value);
    } else {
      codegen->Load32BitValue(out, 1 << value);
    }
    return;
  }

  // Handle the non-constant cases.
  CpuRegister tmp = locations->GetTemp(0).AsRegister<CpuRegister>();
  if (is_high) {
    // Use architectural support: basically 1 << bsr.
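    // E.g. an input of 0b0110 gives bsr = 2, so the highest one bit is 1 << 2 = 0b0100.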
    if (src.IsRegister()) {
      if (is_long) {
        __ bsrq(tmp, src.AsRegister<CpuRegister>());
      } else {
        __ bsrl(tmp, src.AsRegister<CpuRegister>());
      }
    } else if (is_long) {
      DCHECK(src.IsDoubleStackSlot());
      __ bsrq(tmp, Address(CpuRegister(RSP), src.GetStackIndex()));
    } else {
      DCHECK(src.IsStackSlot());
      __ bsrl(tmp, Address(CpuRegister(RSP), src.GetStackIndex()));
    }
    // BSR sets ZF if the input was zero.
    NearLabel is_zero, done;
    __ j(kEqual, &is_zero);
    __ movl(out, Immediate(1));  // Clears upper bits too.
    if (is_long) {
      __ shlq(out, tmp);
    } else {
      __ shll(out, tmp);
    }
    __ jmp(&done);
    __ Bind(&is_zero);
    __ xorl(out, out);  // Clears upper bits too.
    __ Bind(&done);
  } else {
    // Copy input into temporary.
    if (src.IsRegister()) {
      if (is_long) {
        __ movq(tmp, src.AsRegister<CpuRegister>());
      } else {
        __ movl(tmp, src.AsRegister<CpuRegister>());
      }
    } else if (is_long) {
      DCHECK(src.IsDoubleStackSlot());
      __ movq(tmp, Address(CpuRegister(RSP), src.GetStackIndex()));
    } else {
      DCHECK(src.IsStackSlot());
      __ movl(tmp, Address(CpuRegister(RSP), src.GetStackIndex()));
    }
    // Do the bit twiddling: tmp & -tmp isolates the lowest set bit
    // (e.g. tmp = 0b10100 gives -tmp = ...01100, so tmp & -tmp = 0b00100).
    if (is_long) {
      __ movq(out, tmp);
      __ negq(tmp);
      __ andq(out, tmp);
    } else {
      __ movl(out, tmp);
      __ negl(tmp);
      __ andl(out, tmp);
    }
  }
}

void IntrinsicLocationsBuilderX86_64::VisitIntegerHighestOneBit(HInvoke* invoke) {
  CreateOneBitLocations(arena_, invoke, /* is_high */ true);
}

void IntrinsicCodeGeneratorX86_64::VisitIntegerHighestOneBit(HInvoke* invoke) {
  GenOneBit(GetAssembler(), codegen_, invoke, /* is_high */ true, /* is_long */ false);
}

void IntrinsicLocationsBuilderX86_64::VisitLongHighestOneBit(HInvoke* invoke) {
  CreateOneBitLocations(arena_, invoke, /* is_high */ true);
}

void IntrinsicCodeGeneratorX86_64::VisitLongHighestOneBit(HInvoke* invoke) {
  GenOneBit(GetAssembler(), codegen_, invoke, /* is_high */ true, /* is_long */ true);
}

void IntrinsicLocationsBuilderX86_64::VisitIntegerLowestOneBit(HInvoke* invoke) {
  CreateOneBitLocations(arena_, invoke, /* is_high */ false);
}

void IntrinsicCodeGeneratorX86_64::VisitIntegerLowestOneBit(HInvoke* invoke) {
  GenOneBit(GetAssembler(), codegen_, invoke, /* is_high */ false, /* is_long */ false);
}

void IntrinsicLocationsBuilderX86_64::VisitLongLowestOneBit(HInvoke* invoke) {
  CreateOneBitLocations(arena_, invoke, /* is_high */ false);
}

void IntrinsicCodeGeneratorX86_64::VisitLongLowestOneBit(HInvoke* invoke) {
  GenOneBit(GetAssembler(), codegen_, invoke, /* is_high */ false, /* is_long */ true);
}

static void CreateLeadingZeroLocations(ArenaAllocator* arena, HInvoke* invoke) {
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kNoCall,
                                                           kIntrinsified);
  locations->SetInAt(0, Location::Any());
  locations->SetOut(Location::RequiresRegister());
}

static void GenLeadingZeros(X86_64Assembler* assembler,
                            CodeGeneratorX86_64* codegen,
                            HInvoke* invoke, bool is_long) {
  LocationSummary* locations = invoke->GetLocations();
  Location src = locations->InAt(0);
  CpuRegister out = locations->Out().AsRegister<CpuRegister>();

  int zero_value_result = is_long ? 64 : 32;
  if (invoke->InputAt(0)->IsConstant()) {
    // Evaluate this at compile time.
    int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
    if (value == 0) {
      value = zero_value_result;
    } else {
      value = is_long ? CLZ(static_cast<uint64_t>(value)) : CLZ(static_cast<uint32_t>(value));
    }
    codegen->Load32BitValue(out, value);
    return;
  }

  // Handle the non-constant cases.
  if (src.IsRegister()) {
    if (is_long) {
      __ bsrq(out, src.AsRegister<CpuRegister>());
    } else {
      __ bsrl(out, src.AsRegister<CpuRegister>());
    }
  } else if (is_long) {
    DCHECK(src.IsDoubleStackSlot());
    __ bsrq(out, Address(CpuRegister(RSP), src.GetStackIndex()));
  } else {
    DCHECK(src.IsStackSlot());
    __ bsrl(out, Address(CpuRegister(RSP), src.GetStackIndex()));
  }

  // BSR sets ZF if the input was zero, and the output is undefined.
  NearLabel is_zero, done;
  __ j(kEqual, &is_zero);

  // Correct the result from BSR to get the CLZ result: BSR returns the index of the
  // highest set bit, and CLZ = (bits - 1) - index, which equals index XOR (bits - 1)
  // since 0 <= index <= bits - 1.
  __ xorl(out, Immediate(zero_value_result - 1));
2630  __ jmp(&done);
2631
2632  // Fix the zero case with the expected result.
2633  __ Bind(&is_zero);
2634  __ movl(out, Immediate(zero_value_result));
2635
2636  __ Bind(&done);
2637}
2638
void IntrinsicLocationsBuilderX86_64::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) {
  CreateLeadingZeroLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) {
  GenLeadingZeros(GetAssembler(), codegen_, invoke, /* is_long */ false);
}

void IntrinsicLocationsBuilderX86_64::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
  CreateLeadingZeroLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
  GenLeadingZeros(GetAssembler(), codegen_, invoke, /* is_long */ true);
}

static void CreateTrailingZeroLocations(ArenaAllocator* arena, HInvoke* invoke) {
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kNoCall,
                                                           kIntrinsified);
  // BSF accepts a register or a memory operand, so the input may live anywhere.
  locations->SetInAt(0, Location::Any());
  locations->SetOut(Location::RequiresRegister());
}

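// BSF yields the index of the least significant set bit, which for x != 0 is
// exactly CTZ(x), so unlike the BSR/CLZ case no post-correction is needed;
// only the zero input (whose BSF output is undefined) must be patched below.
// Worked example (32-bit, illustrative value): x = 0x00F00000 -> BSF = CTZ = 20.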
static void GenTrailingZeros(X86_64Assembler* assembler,
                             CodeGeneratorX86_64* codegen,
                             HInvoke* invoke, bool is_long) {
  LocationSummary* locations = invoke->GetLocations();
  Location src = locations->InAt(0);
  CpuRegister out = locations->Out().AsRegister<CpuRegister>();

  int zero_value_result = is_long ? 64 : 32;
  if (invoke->InputAt(0)->IsConstant()) {
    // Evaluate this at compile time.
    int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
    if (value == 0) {
      value = zero_value_result;
    } else {
      value = is_long ? CTZ(static_cast<uint64_t>(value)) : CTZ(static_cast<uint32_t>(value));
    }
    codegen->Load32BitValue(out, value);
    return;
  }

  // Handle the non-constant cases.
  if (src.IsRegister()) {
    if (is_long) {
      __ bsfq(out, src.AsRegister<CpuRegister>());
    } else {
      __ bsfl(out, src.AsRegister<CpuRegister>());
    }
  } else if (is_long) {
    DCHECK(src.IsDoubleStackSlot());
    __ bsfq(out, Address(CpuRegister(RSP), src.GetStackIndex()));
  } else {
    DCHECK(src.IsStackSlot());
    __ bsfl(out, Address(CpuRegister(RSP), src.GetStackIndex()));
  }

  // BSF sets ZF if the input was zero; in that case the output register is undefined.
  NearLabel done;
  __ j(kNotEqual, &done);

  // Fix the zero case with the expected result.
  __ movl(out, Immediate(zero_value_result));

  __ Bind(&done);
}

void IntrinsicLocationsBuilderX86_64::VisitIntegerNumberOfTrailingZeros(HInvoke* invoke) {
  CreateTrailingZeroLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitIntegerNumberOfTrailingZeros(HInvoke* invoke) {
  GenTrailingZeros(GetAssembler(), codegen_, invoke, /* is_long */ false);
}

void IntrinsicLocationsBuilderX86_64::VisitLongNumberOfTrailingZeros(HInvoke* invoke) {
  CreateTrailingZeroLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitLongNumberOfTrailingZeros(HInvoke* invoke) {
  GenTrailingZeros(GetAssembler(), codegen_, invoke, /* is_long */ true);
}

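// Note: UNIMPLEMENTED_INTRINSIC expands to empty Visit methods (see
// intrinsics.h), so no locations are set and these invokes fall back to
// regular, non-intrinsified calls to the underlying Java method.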
UNIMPLEMENTED_INTRINSIC(X86_64, ReferenceGetReferent)
UNIMPLEMENTED_INTRINSIC(X86_64, FloatIsInfinite)
UNIMPLEMENTED_INTRINSIC(X86_64, DoubleIsInfinite)

// Java 1.8 Unsafe intrinsics.
UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndAddInt)
UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndAddLong)
UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndSetInt)
UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndSetLong)
UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndSetObject)

UNREACHABLE_INTRINSICS(X86_64)

#undef __

}  // namespace x86_64
}  // namespace art