1 /*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "intrinsics_x86_64.h"
18
19 #include <limits>
20
21 #include "arch/x86_64/instruction_set_features_x86_64.h"
22 #include "art_method.h"
23 #include "base/bit_utils.h"
24 #include "code_generator_x86_64.h"
25 #include "entrypoints/quick/quick_entrypoints.h"
26 #include "intrinsics.h"
27 #include "intrinsics_utils.h"
28 #include "lock_word.h"
29 #include "mirror/array-inl.h"
30 #include "mirror/object_array-inl.h"
31 #include "mirror/reference.h"
32 #include "mirror/string.h"
33 #include "scoped_thread_state_change-inl.h"
34 #include "thread-inl.h"
35 #include "utils/x86_64/assembler_x86_64.h"
36 #include "utils/x86_64/constants_x86_64.h"
37
38 namespace art {
39
40 namespace x86_64 {
41
42 IntrinsicLocationsBuilderX86_64::IntrinsicLocationsBuilderX86_64(CodeGeneratorX86_64* codegen)
43 : arena_(codegen->GetGraph()->GetArena()), codegen_(codegen) {
44 }
45
46 X86_64Assembler* IntrinsicCodeGeneratorX86_64::GetAssembler() {
47 return down_cast<X86_64Assembler*>(codegen_->GetAssembler());
48 }
49
50 ArenaAllocator* IntrinsicCodeGeneratorX86_64::GetAllocator() {
51 return codegen_->GetGraph()->GetArena();
52 }
53
54 bool IntrinsicLocationsBuilderX86_64::TryDispatch(HInvoke* invoke) {
55 Dispatch(invoke);
56 LocationSummary* res = invoke->GetLocations();
57 if (res == nullptr) {
58 return false;
59 }
60 return res->Intrinsified();
61 }
62
63 static void MoveArguments(HInvoke* invoke, CodeGeneratorX86_64* codegen) {
64 InvokeDexCallingConventionVisitorX86_64 calling_convention_visitor;
65 IntrinsicVisitor::MoveArguments(invoke, codegen, &calling_convention_visitor);
66 }
67
68 using IntrinsicSlowPathX86_64 = IntrinsicSlowPath<InvokeDexCallingConventionVisitorX86_64>;
69
70 // NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy.
71 #define __ down_cast<X86_64Assembler*>(codegen->GetAssembler())-> // NOLINT
72
73 // Slow path implementing the SystemArrayCopy intrinsic copy loop with read barriers.
74 class ReadBarrierSystemArrayCopySlowPathX86_64 : public SlowPathCode {
75 public:
76 explicit ReadBarrierSystemArrayCopySlowPathX86_64(HInstruction* instruction)
77 : SlowPathCode(instruction) {
78 DCHECK(kEmitCompilerReadBarrier);
79 DCHECK(kUseBakerReadBarrier);
80 }
81
82 void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
83 CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
84 LocationSummary* locations = instruction_->GetLocations();
85 DCHECK(locations->CanCall());
86 DCHECK(instruction_->IsInvokeStaticOrDirect())
87 << "Unexpected instruction in read barrier arraycopy slow path: "
88 << instruction_->DebugName();
89 DCHECK(instruction_->GetLocations()->Intrinsified());
90 DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kSystemArrayCopy);
91
92 int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot);
93
94 CpuRegister src_curr_addr = locations->GetTemp(0).AsRegister<CpuRegister>();
95 CpuRegister dst_curr_addr = locations->GetTemp(1).AsRegister<CpuRegister>();
96 CpuRegister src_stop_addr = locations->GetTemp(2).AsRegister<CpuRegister>();
97
98 __ Bind(GetEntryLabel());
99 NearLabel loop;
100 __ Bind(&loop);
101 __ movl(CpuRegister(TMP), Address(src_curr_addr, 0));
102 __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
103 // TODO: Inline the mark bit check before calling the runtime?
104 // TMP = ReadBarrier::Mark(TMP);
105 // No need to save live registers; it's taken care of by the
106 // entrypoint. Also, there is no need to update the stack mask,
107 // as this runtime call will not trigger a garbage collection.
108 int32_t entry_point_offset =
109 CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86_64PointerSize>(TMP);
110 // This runtime call does not require a stack map.
111 x86_64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
112 __ MaybePoisonHeapReference(CpuRegister(TMP));
113 __ movl(Address(dst_curr_addr, 0), CpuRegister(TMP));
114 __ addl(src_curr_addr, Immediate(element_size));
115 __ addl(dst_curr_addr, Immediate(element_size));
116 __ cmpl(src_curr_addr, src_stop_addr);
117 __ j(kNotEqual, &loop);
118 __ jmp(GetExitLabel());
119 }
120
121 const char* GetDescription() const OVERRIDE { return "ReadBarrierSystemArrayCopySlowPathX86_64"; }
122
123 private:
124 DISALLOW_COPY_AND_ASSIGN(ReadBarrierSystemArrayCopySlowPathX86_64);
125 };
126
127 #undef __
128
129 #define __ assembler->
130
131 static void CreateFPToIntLocations(ArenaAllocator* arena, HInvoke* invoke) {
132 LocationSummary* locations = new (arena) LocationSummary(invoke,
133 LocationSummary::kNoCall,
134 kIntrinsified);
135 locations->SetInAt(0, Location::RequiresFpuRegister());
136 locations->SetOut(Location::RequiresRegister());
137 }
138
139 static void CreateIntToFPLocations(ArenaAllocator* arena, HInvoke* invoke) {
140 LocationSummary* locations = new (arena) LocationSummary(invoke,
141 LocationSummary::kNoCall,
142 kIntrinsified);
143 locations->SetInAt(0, Location::RequiresRegister());
144 locations->SetOut(Location::RequiresFpuRegister());
145 }
146
147 static void MoveFPToInt(LocationSummary* locations, bool is64bit, X86_64Assembler* assembler) {
148 Location input = locations->InAt(0);
149 Location output = locations->Out();
150 __ movd(output.AsRegister<CpuRegister>(), input.AsFpuRegister<XmmRegister>(), is64bit);
151 }
152
153 static void MoveIntToFP(LocationSummary* locations, bool is64bit, X86_64Assembler* assembler) {
154 Location input = locations->InAt(0);
155 Location output = locations->Out();
156 __ movd(output.AsFpuRegister<XmmRegister>(), input.AsRegister<CpuRegister>(), is64bit);
157 }
158
159 void IntrinsicLocationsBuilderX86_64::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) {
160 CreateFPToIntLocations(arena_, invoke);
161 }
162 void IntrinsicLocationsBuilderX86_64::VisitDoubleLongBitsToDouble(HInvoke* invoke) {
163 CreateIntToFPLocations(arena_, invoke);
164 }
165
166 void IntrinsicCodeGeneratorX86_64::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) {
167 MoveFPToInt(invoke->GetLocations(), /* is64bit */ true, GetAssembler());
168 }
169 void IntrinsicCodeGeneratorX86_64::VisitDoubleLongBitsToDouble(HInvoke* invoke) {
170 MoveIntToFP(invoke->GetLocations(), /* is64bit */ true, GetAssembler());
171 }
172
173 void IntrinsicLocationsBuilderX86_64::VisitFloatFloatToRawIntBits(HInvoke* invoke) {
174 CreateFPToIntLocations(arena_, invoke);
175 }
176 void IntrinsicLocationsBuilderX86_64::VisitFloatIntBitsToFloat(HInvoke* invoke) {
177 CreateIntToFPLocations(arena_, invoke);
178 }
179
180 void IntrinsicCodeGeneratorX86_64::VisitFloatFloatToRawIntBits(HInvoke* invoke) {
181 MoveFPToInt(invoke->GetLocations(), /* is64bit */ false, GetAssembler());
182 }
183 void IntrinsicCodeGeneratorX86_64::VisitFloatIntBitsToFloat(HInvoke* invoke) {
184 MoveIntToFP(invoke->GetLocations(), /* is64bit */ false, GetAssembler());
185 }
186
187 static void CreateIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) {
188 LocationSummary* locations = new (arena) LocationSummary(invoke,
189 LocationSummary::kNoCall,
190 kIntrinsified);
191 locations->SetInAt(0, Location::RequiresRegister());
192 locations->SetOut(Location::SameAsFirstInput());
193 }
194
195 static void GenReverseBytes(LocationSummary* locations,
196 Primitive::Type size,
197 X86_64Assembler* assembler) {
198 CpuRegister out = locations->Out().AsRegister<CpuRegister>();
199
200 switch (size) {
201 case Primitive::kPrimShort:
202 // TODO: Can be done with an xchg of 8b registers. This is straight from Quick.
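// For illustration: a short held sign-extended in a 32-bit register, e.g.
// 0x00001234, becomes 0x34120000 after bswapl; the arithmetic shift right by 16
// then yields 0x00003412, the byte-reversed short, still sign-extended
// (0x00000089 -> 0x89000000 -> 0xFFFF8900).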
203 __ bswapl(out);
204 __ sarl(out, Immediate(16));
205 break;
206 case Primitive::kPrimInt:
207 __ bswapl(out);
208 break;
209 case Primitive::kPrimLong:
210 __ bswapq(out);
211 break;
212 default:
213 LOG(FATAL) << "Unexpected size for reverse-bytes: " << size;
214 UNREACHABLE();
215 }
216 }
217
218 void IntrinsicLocationsBuilderX86_64::VisitIntegerReverseBytes(HInvoke* invoke) {
219 CreateIntToIntLocations(arena_, invoke);
220 }
221
222 void IntrinsicCodeGeneratorX86_64::VisitIntegerReverseBytes(HInvoke* invoke) {
223 GenReverseBytes(invoke->GetLocations(), Primitive::kPrimInt, GetAssembler());
224 }
225
226 void IntrinsicLocationsBuilderX86_64::VisitLongReverseBytes(HInvoke* invoke) {
227 CreateIntToIntLocations(arena_, invoke);
228 }
229
230 void IntrinsicCodeGeneratorX86_64::VisitLongReverseBytes(HInvoke* invoke) {
231 GenReverseBytes(invoke->GetLocations(), Primitive::kPrimLong, GetAssembler());
232 }
233
234 void IntrinsicLocationsBuilderX86_64::VisitShortReverseBytes(HInvoke* invoke) {
235 CreateIntToIntLocations(arena_, invoke);
236 }
237
238 void IntrinsicCodeGeneratorX86_64::VisitShortReverseBytes(HInvoke* invoke) {
239 GenReverseBytes(invoke->GetLocations(), Primitive::kPrimShort, GetAssembler());
240 }
241
242
243 // TODO: Consider Quick's way of doing Double abs through integer operations, as the immediate we
244 // need is 64b.
245
246 static void CreateFloatToFloatPlusTemps(ArenaAllocator* arena, HInvoke* invoke) {
247 // TODO: Enable memory operations when the assembler supports them.
248 LocationSummary* locations = new (arena) LocationSummary(invoke,
249 LocationSummary::kNoCall,
250 kIntrinsified);
251 locations->SetInAt(0, Location::RequiresFpuRegister());
252 locations->SetOut(Location::SameAsFirstInput());
253 locations->AddTemp(Location::RequiresFpuRegister()); // FP reg to hold mask.
254 }
255
256 static void MathAbsFP(LocationSummary* locations,
257 bool is64bit,
258 X86_64Assembler* assembler,
259 CodeGeneratorX86_64* codegen) {
260 Location output = locations->Out();
261
262 DCHECK(output.IsFpuRegister());
263 XmmRegister xmm_temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
264
265 // TODO: Can mask directly with constant area using pand if we can guarantee
266 // that the literal is aligned on a 16 byte boundary. This will avoid a
267 // temporary.
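// Both masks keep every bit except the IEEE sign bit, so the AND maps -0.0 to
// +0.0 and leaves NaN payloads and finite magnitudes unchanged, which is what
// Math.abs requires for floating-point values.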
268 if (is64bit) {
269 __ movsd(xmm_temp, codegen->LiteralInt64Address(INT64_C(0x7FFFFFFFFFFFFFFF)));
270 __ andpd(output.AsFpuRegister<XmmRegister>(), xmm_temp);
271 } else {
272 __ movss(xmm_temp, codegen->LiteralInt32Address(INT32_C(0x7FFFFFFF)));
273 __ andps(output.AsFpuRegister<XmmRegister>(), xmm_temp);
274 }
275 }
276
277 void IntrinsicLocationsBuilderX86_64::VisitMathAbsDouble(HInvoke* invoke) {
278 CreateFloatToFloatPlusTemps(arena_, invoke);
279 }
280
281 void IntrinsicCodeGeneratorX86_64::VisitMathAbsDouble(HInvoke* invoke) {
282 MathAbsFP(invoke->GetLocations(), /* is64bit */ true, GetAssembler(), codegen_);
283 }
284
285 void IntrinsicLocationsBuilderX86_64::VisitMathAbsFloat(HInvoke* invoke) {
286 CreateFloatToFloatPlusTemps(arena_, invoke);
287 }
288
289 void IntrinsicCodeGeneratorX86_64::VisitMathAbsFloat(HInvoke* invoke) {
290 MathAbsFP(invoke->GetLocations(), /* is64bit */ false, GetAssembler(), codegen_);
291 }
292
293 static void CreateIntToIntPlusTemp(ArenaAllocator* arena, HInvoke* invoke) {
294 LocationSummary* locations = new (arena) LocationSummary(invoke,
295 LocationSummary::kNoCall,
296 kIntrinsified);
297 locations->SetInAt(0, Location::RequiresRegister());
298 locations->SetOut(Location::SameAsFirstInput());
299 locations->AddTemp(Location::RequiresRegister());
300 }
301
302 static void GenAbsInteger(LocationSummary* locations, bool is64bit, X86_64Assembler* assembler) {
303 Location output = locations->Out();
304 CpuRegister out = output.AsRegister<CpuRegister>();
305 CpuRegister mask = locations->GetTemp(0).AsRegister<CpuRegister>();
306
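// Branchless two's-complement abs: mask = value >> 31 (or 63 for long) is all
// ones for negative inputs and zero otherwise, and (value + mask) ^ mask is the
// absolute value. E.g. value = -5: mask = -1, -5 + -1 = -6, -6 ^ -1 = 5. As with
// Java's Math.abs, the most negative value maps to itself.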
307 if (is64bit) {
308 // Create mask.
309 __ movq(mask, out);
310 __ sarq(mask, Immediate(63));
311 // Add mask.
312 __ addq(out, mask);
313 __ xorq(out, mask);
314 } else {
315 // Create mask.
316 __ movl(mask, out);
317 __ sarl(mask, Immediate(31));
318 // Add mask.
319 __ addl(out, mask);
320 __ xorl(out, mask);
321 }
322 }
323
324 void IntrinsicLocationsBuilderX86_64::VisitMathAbsInt(HInvoke* invoke) {
325 CreateIntToIntPlusTemp(arena_, invoke);
326 }
327
328 void IntrinsicCodeGeneratorX86_64::VisitMathAbsInt(HInvoke* invoke) {
329 GenAbsInteger(invoke->GetLocations(), /* is64bit */ false, GetAssembler());
330 }
331
332 void IntrinsicLocationsBuilderX86_64::VisitMathAbsLong(HInvoke* invoke) {
333 CreateIntToIntPlusTemp(arena_, invoke);
334 }
335
336 void IntrinsicCodeGeneratorX86_64::VisitMathAbsLong(HInvoke* invoke) {
337 GenAbsInteger(invoke->GetLocations(), /* is64bit */ true, GetAssembler());
338 }
339
340 static void GenMinMaxFP(LocationSummary* locations,
341 bool is_min,
342 bool is_double,
343 X86_64Assembler* assembler,
344 CodeGeneratorX86_64* codegen) {
345 Location op1_loc = locations->InAt(0);
346 Location op2_loc = locations->InAt(1);
347 Location out_loc = locations->Out();
348 XmmRegister out = out_loc.AsFpuRegister<XmmRegister>();
349
350 // Shortcut for same input locations.
351 if (op1_loc.Equals(op2_loc)) {
352 DCHECK(out_loc.Equals(op1_loc));
353 return;
354 }
355
356 // (out := op1)
357 // out <=? op2
358 // if Nan jmp Nan_label
359 // if out is min jmp done
360 // if op2 is min jmp op2_label
361 // handle -0/+0
362 // jmp done
363 // Nan_label:
364 // out := NaN
365 // op2_label:
366 // out := op2
367 // done:
368 //
369 // This removes one jmp, but needs to copy one input (op1) to out.
370 //
371 // TODO: This is straight from Quick. Make NaN an out-of-line slowpath?
372
373 XmmRegister op2 = op2_loc.AsFpuRegister<XmmRegister>();
374
375 NearLabel nan, done, op2_label;
376 if (is_double) {
377 __ ucomisd(out, op2);
378 } else {
379 __ ucomiss(out, op2);
380 }
381
382 __ j(Condition::kParityEven, &nan);
383
384 __ j(is_min ? Condition::kAbove : Condition::kBelow, &op2_label);
385 __ j(is_min ? Condition::kBelow : Condition::kAbove, &done);
386
387 // Handle 0.0/-0.0.
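// This point is reached only when the operands compare equal, i.e. for signed
// zeros (or identical values). OR-ing the bit patterns yields -0.0 if either
// input is -0.0, the correct min; AND-ing yields +0.0 unless both are -0.0,
// the correct max.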
388 if (is_min) {
389 if (is_double) {
390 __ orpd(out, op2);
391 } else {
392 __ orps(out, op2);
393 }
394 } else {
395 if (is_double) {
396 __ andpd(out, op2);
397 } else {
398 __ andps(out, op2);
399 }
400 }
401 __ jmp(&done);
402
403 // NaN handling.
404 __ Bind(&nan);
405 if (is_double) {
406 __ movsd(out, codegen->LiteralInt64Address(INT64_C(0x7FF8000000000000)));
407 } else {
408 __ movss(out, codegen->LiteralInt32Address(INT32_C(0x7FC00000)));
409 }
410 __ jmp(&done);
411
412 // out := op2;
413 __ Bind(&op2_label);
414 if (is_double) {
415 __ movsd(out, op2);
416 } else {
417 __ movss(out, op2);
418 }
419
420 // Done.
421 __ Bind(&done);
422 }
423
424 static void CreateFPFPToFP(ArenaAllocator* arena, HInvoke* invoke) {
425 LocationSummary* locations = new (arena) LocationSummary(invoke,
426 LocationSummary::kNoCall,
427 kIntrinsified);
428 locations->SetInAt(0, Location::RequiresFpuRegister());
429 locations->SetInAt(1, Location::RequiresFpuRegister());
430 // The following is sub-optimal, but all we can do for now. It would be fine to also accept
431 // the second input to be the output (we can simply swap inputs).
432 locations->SetOut(Location::SameAsFirstInput());
433 }
434
435 void IntrinsicLocationsBuilderX86_64::VisitMathMinDoubleDouble(HInvoke* invoke) {
436 CreateFPFPToFP(arena_, invoke);
437 }
438
439 void IntrinsicCodeGeneratorX86_64::VisitMathMinDoubleDouble(HInvoke* invoke) {
440 GenMinMaxFP(
441 invoke->GetLocations(), /* is_min */ true, /* is_double */ true, GetAssembler(), codegen_);
442 }
443
444 void IntrinsicLocationsBuilderX86_64::VisitMathMinFloatFloat(HInvoke* invoke) {
445 CreateFPFPToFP(arena_, invoke);
446 }
447
448 void IntrinsicCodeGeneratorX86_64::VisitMathMinFloatFloat(HInvoke* invoke) {
449 GenMinMaxFP(
450 invoke->GetLocations(), /* is_min */ true, /* is_double */ false, GetAssembler(), codegen_);
451 }
452
453 void IntrinsicLocationsBuilderX86_64::VisitMathMaxDoubleDouble(HInvoke* invoke) {
454 CreateFPFPToFP(arena_, invoke);
455 }
456
457 void IntrinsicCodeGeneratorX86_64::VisitMathMaxDoubleDouble(HInvoke* invoke) {
458 GenMinMaxFP(
459 invoke->GetLocations(), /* is_min */ false, /* is_double */ true, GetAssembler(), codegen_);
460 }
461
462 void IntrinsicLocationsBuilderX86_64::VisitMathMaxFloatFloat(HInvoke* invoke) {
463 CreateFPFPToFP(arena_, invoke);
464 }
465
466 void IntrinsicCodeGeneratorX86_64::VisitMathMaxFloatFloat(HInvoke* invoke) {
467 GenMinMaxFP(
468 invoke->GetLocations(), /* is_min */ false, /* is_double */ false, GetAssembler(), codegen_);
469 }
470
471 static void GenMinMax(LocationSummary* locations, bool is_min, bool is_long,
472 X86_64Assembler* assembler) {
473 Location op1_loc = locations->InAt(0);
474 Location op2_loc = locations->InAt(1);
475
476 // Shortcut for same input locations.
477 if (op1_loc.Equals(op2_loc)) {
478 // Can return immediately, as op1_loc == out_loc.
479 // Note: if we ever support separate registers, e.g., output into memory, we need to check for
480 // a copy here.
481 DCHECK(locations->Out().Equals(op1_loc));
482 return;
483 }
484
485 CpuRegister out = locations->Out().AsRegister<CpuRegister>();
486 CpuRegister op2 = op2_loc.AsRegister<CpuRegister>();
487
488 // (out := op1)
489 // out <=? op2
490 // if out is min jmp done
491 // out := op2
492 // done:
493
494 if (is_long) {
495 __ cmpq(out, op2);
496 } else {
497 __ cmpl(out, op2);
498 }
499
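// A conditional move avoids a branch: for min, `out` is overwritten with op2
// only when out > op2 (kGreater); for max, only when out < op2 (kLess). The
// signed comparison matches Java int/long semantics; `is_long` selects the
// 64-bit form of the cmov.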
500 __ cmov(is_min ? Condition::kGreater : Condition::kLess, out, op2, is_long);
501 }
502
503 static void CreateIntIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) {
504 LocationSummary* locations = new (arena) LocationSummary(invoke,
505 LocationSummary::kNoCall,
506 kIntrinsified);
507 locations->SetInAt(0, Location::RequiresRegister());
508 locations->SetInAt(1, Location::RequiresRegister());
509 locations->SetOut(Location::SameAsFirstInput());
510 }
511
512 void IntrinsicLocationsBuilderX86_64::VisitMathMinIntInt(HInvoke* invoke) {
513 CreateIntIntToIntLocations(arena_, invoke);
514 }
515
516 void IntrinsicCodeGeneratorX86_64::VisitMathMinIntInt(HInvoke* invoke) {
517 GenMinMax(invoke->GetLocations(), /* is_min */ true, /* is_long */ false, GetAssembler());
518 }
519
520 void IntrinsicLocationsBuilderX86_64::VisitMathMinLongLong(HInvoke* invoke) {
521 CreateIntIntToIntLocations(arena_, invoke);
522 }
523
524 void IntrinsicCodeGeneratorX86_64::VisitMathMinLongLong(HInvoke* invoke) {
525 GenMinMax(invoke->GetLocations(), /* is_min */ true, /* is_long */ true, GetAssembler());
526 }
527
528 void IntrinsicLocationsBuilderX86_64::VisitMathMaxIntInt(HInvoke* invoke) {
529 CreateIntIntToIntLocations(arena_, invoke);
530 }
531
532 void IntrinsicCodeGeneratorX86_64::VisitMathMaxIntInt(HInvoke* invoke) {
533 GenMinMax(invoke->GetLocations(), /* is_min */ false, /* is_long */ false, GetAssembler());
534 }
535
536 void IntrinsicLocationsBuilderX86_64::VisitMathMaxLongLong(HInvoke* invoke) {
537 CreateIntIntToIntLocations(arena_, invoke);
538 }
539
540 void IntrinsicCodeGeneratorX86_64::VisitMathMaxLongLong(HInvoke* invoke) {
541 GenMinMax(invoke->GetLocations(), /* is_min */ false, /* is_long */ true, GetAssembler());
542 }
543
544 static void CreateFPToFPLocations(ArenaAllocator* arena, HInvoke* invoke) {
545 LocationSummary* locations = new (arena) LocationSummary(invoke,
546 LocationSummary::kNoCall,
547 kIntrinsified);
548 locations->SetInAt(0, Location::RequiresFpuRegister());
549 locations->SetOut(Location::RequiresFpuRegister());
550 }
551
552 void IntrinsicLocationsBuilderX86_64::VisitMathSqrt(HInvoke* invoke) {
553 CreateFPToFPLocations(arena_, invoke);
554 }
555
556 void IntrinsicCodeGeneratorX86_64::VisitMathSqrt(HInvoke* invoke) {
557 LocationSummary* locations = invoke->GetLocations();
558 XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
559 XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
560
561 GetAssembler()->sqrtsd(out, in);
562 }
563
564 static void InvokeOutOfLineIntrinsic(CodeGeneratorX86_64* codegen, HInvoke* invoke) {
565 MoveArguments(invoke, codegen);
566
567 DCHECK(invoke->IsInvokeStaticOrDirect());
568 codegen->GenerateStaticOrDirectCall(
569 invoke->AsInvokeStaticOrDirect(), Location::RegisterLocation(RDI));
570 codegen->RecordPcInfo(invoke, invoke->GetDexPc());
571
572 // Copy the result back to the expected output.
573 Location out = invoke->GetLocations()->Out();
574 if (out.IsValid()) {
575 DCHECK(out.IsRegister());
576 codegen->MoveFromReturnRegister(out, invoke->GetType());
577 }
578 }
579
580 static void CreateSSE41FPToFPLocations(ArenaAllocator* arena,
581 HInvoke* invoke,
582 CodeGeneratorX86_64* codegen) {
583 // Do we have instruction support?
584 if (codegen->GetInstructionSetFeatures().HasSSE4_1()) {
585 CreateFPToFPLocations(arena, invoke);
586 return;
587 }
588
589 // We have to fall back to a call to the intrinsic.
590 LocationSummary* locations = new (arena) LocationSummary(invoke,
591 LocationSummary::kCallOnMainOnly);
592 InvokeRuntimeCallingConvention calling_convention;
593 locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0)));
594 locations->SetOut(Location::FpuRegisterLocation(XMM0));
595 // Needs to be RDI for the invoke.
596 locations->AddTemp(Location::RegisterLocation(RDI));
597 }
598
599 static void GenSSE41FPToFPIntrinsic(CodeGeneratorX86_64* codegen,
600 HInvoke* invoke,
601 X86_64Assembler* assembler,
602 int round_mode) {
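// round_mode is the SSE4.1 ROUNDSD immediate: 0 rounds to nearest (even),
// 1 rounds toward negative infinity (floor), 2 rounds toward positive infinity
// (ceil), matching the Rint/Floor/Ceil call sites below.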
603 LocationSummary* locations = invoke->GetLocations();
604 if (locations->WillCall()) {
605 InvokeOutOfLineIntrinsic(codegen, invoke);
606 } else {
607 XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
608 XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
609 __ roundsd(out, in, Immediate(round_mode));
610 }
611 }
612
613 void IntrinsicLocationsBuilderX86_64::VisitMathCeil(HInvoke* invoke) {
614 CreateSSE41FPToFPLocations(arena_, invoke, codegen_);
615 }
616
617 void IntrinsicCodeGeneratorX86_64::VisitMathCeil(HInvoke* invoke) {
618 GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 2);
619 }
620
621 void IntrinsicLocationsBuilderX86_64::VisitMathFloor(HInvoke* invoke) {
622 CreateSSE41FPToFPLocations(arena_, invoke, codegen_);
623 }
624
625 void IntrinsicCodeGeneratorX86_64::VisitMathFloor(HInvoke* invoke) {
626 GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 1);
627 }
628
629 void IntrinsicLocationsBuilderX86_64::VisitMathRint(HInvoke* invoke) {
630 CreateSSE41FPToFPLocations(arena_, invoke, codegen_);
631 }
632
633 void IntrinsicCodeGeneratorX86_64::VisitMathRint(HInvoke* invoke) {
634 GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 0);
635 }
636
637 static void CreateSSE41FPToIntLocations(ArenaAllocator* arena,
638 HInvoke* invoke,
639 CodeGeneratorX86_64* codegen) {
640 // Do we have instruction support?
641 if (codegen->GetInstructionSetFeatures().HasSSE4_1()) {
642 LocationSummary* locations = new (arena) LocationSummary(invoke,
643 LocationSummary::kNoCall,
644 kIntrinsified);
645 locations->SetInAt(0, Location::RequiresFpuRegister());
646 locations->SetOut(Location::RequiresRegister());
647 locations->AddTemp(Location::RequiresFpuRegister());
648 locations->AddTemp(Location::RequiresFpuRegister());
649 return;
650 }
651
652 // We have to fall back to a call to the intrinsic.
653 LocationSummary* locations = new (arena) LocationSummary(invoke,
654 LocationSummary::kCallOnMainOnly);
655 InvokeRuntimeCallingConvention calling_convention;
656 locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0)));
657 locations->SetOut(Location::RegisterLocation(RAX));
658 // Needs to be RDI for the invoke.
659 locations->AddTemp(Location::RegisterLocation(RDI));
660 }
661
662 void IntrinsicLocationsBuilderX86_64::VisitMathRoundFloat(HInvoke* invoke) {
663 CreateSSE41FPToIntLocations(arena_, invoke, codegen_);
664 }
665
666 void IntrinsicCodeGeneratorX86_64::VisitMathRoundFloat(HInvoke* invoke) {
667 LocationSummary* locations = invoke->GetLocations();
668 if (locations->WillCall()) {
669 InvokeOutOfLineIntrinsic(codegen_, invoke);
670 return;
671 }
672
673 XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
674 CpuRegister out = locations->Out().AsRegister<CpuRegister>();
675 XmmRegister t1 = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
676 XmmRegister t2 = locations->GetTemp(1).AsFpuRegister<XmmRegister>();
677 NearLabel skip_incr, done;
678 X86_64Assembler* assembler = GetAssembler();
679
680 // Since no direct x86 rounding instruction matches the required semantics,
681 // this intrinsic is implemented as follows:
682 // result = floor(in);
683 // if (in - result >= 0.5f)
684 // result = result + 1.0f;
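// For example, 2.5f floors to 2.0f and 2.5f - 2.0f >= 0.5f, so the result is 3;
// -2.5f floors to -3.0f and the difference is again 0.5f, so the result is -2,
// matching Math.round's round-half-up behavior.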
685 __ movss(t2, in);
686 __ roundss(t1, in, Immediate(1));
687 __ subss(t2, t1);
688 __ comiss(t2, codegen_->LiteralFloatAddress(0.5f));
689 __ j(kBelow, &skip_incr);
690 __ addss(t1, codegen_->LiteralFloatAddress(1.0f));
691 __ Bind(&skip_incr);
692
693 // Final conversion to an integer. Unfortunately this also does not have a
694 // direct x86 instruction, since NaN should map to 0 and large positive
695 // values need to be clipped to the extreme value.
696 codegen_->Load32BitValue(out, kPrimIntMax);
697 __ cvtsi2ss(t2, out);
698 __ comiss(t1, t2);
699 __ j(kAboveEqual, &done); // clipped to max (already in out), does not jump on unordered
700 __ movl(out, Immediate(0)); // does not change flags
701 __ j(kUnordered, &done); // NaN mapped to 0 (just moved in out)
702 __ cvttss2si(out, t1);
703 __ Bind(&done);
704 }
705
706 void IntrinsicLocationsBuilderX86_64::VisitMathRoundDouble(HInvoke* invoke) {
707 CreateSSE41FPToIntLocations(arena_, invoke, codegen_);
708 }
709
710 void IntrinsicCodeGeneratorX86_64::VisitMathRoundDouble(HInvoke* invoke) {
711 LocationSummary* locations = invoke->GetLocations();
712 if (locations->WillCall()) {
713 InvokeOutOfLineIntrinsic(codegen_, invoke);
714 return;
715 }
716
717 XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
718 CpuRegister out = locations->Out().AsRegister<CpuRegister>();
719 XmmRegister t1 = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
720 XmmRegister t2 = locations->GetTemp(1).AsFpuRegister<XmmRegister>();
721 NearLabel skip_incr, done;
722 X86_64Assembler* assembler = GetAssembler();
723
724 // Since no direct x86 rounding instruction matches the required semantics,
725 // this intrinsic is implemented as follows:
726 // result = floor(in);
727 // if (in - result >= 0.5)
728 // result = result + 1.0f;
729 __ movsd(t2, in);
730 __ roundsd(t1, in, Immediate(1));
731 __ subsd(t2, t1);
732 __ comisd(t2, codegen_->LiteralDoubleAddress(0.5));
733 __ j(kBelow, &skip_incr);
734 __ addsd(t1, codegen_->LiteralDoubleAddress(1.0f));
735 __ Bind(&skip_incr);
736
737 // Final conversion to an integer. Unfortunately this also does not have a
738 // direct x86 instruction, since NaN should map to 0 and large positive
739 // values need to be clipped to the extreme value.
740 codegen_->Load64BitValue(out, kPrimLongMax);
741 __ cvtsi2sd(t2, out, /* is64bit */ true);
742 __ comisd(t1, t2);
743 __ j(kAboveEqual, &done); // clipped to max (already in out), does not jump on unordered
744 __ movl(out, Immediate(0)); // does not change flags, implicit zero extension to 64-bit
745 __ j(kUnordered, &done); // NaN mapped to 0 (just moved in out)
746 __ cvttsd2si(out, t1, /* is64bit */ true);
747 __ Bind(&done);
748 }
749
750 static void CreateFPToFPCallLocations(ArenaAllocator* arena,
751 HInvoke* invoke) {
752 LocationSummary* locations = new (arena) LocationSummary(invoke,
753 LocationSummary::kCallOnMainOnly,
754 kIntrinsified);
755 InvokeRuntimeCallingConvention calling_convention;
756 locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0)));
757 locations->SetOut(Location::FpuRegisterLocation(XMM0));
758
759 // We have to ensure that the native code doesn't clobber the XMM registers which are
760 // non-volatile for ART, but volatile for Native calls. This will ensure that they are
761 // saved in the prologue and properly restored.
762 for (auto fp_reg : non_volatile_xmm_regs) {
763 locations->AddTemp(Location::FpuRegisterLocation(fp_reg));
764 }
765 }
766
767 static void GenFPToFPCall(HInvoke* invoke, CodeGeneratorX86_64* codegen,
768 QuickEntrypointEnum entry) {
769 LocationSummary* locations = invoke->GetLocations();
770 DCHECK(locations->WillCall());
771 DCHECK(invoke->IsInvokeStaticOrDirect());
772
773 codegen->InvokeRuntime(entry, invoke, invoke->GetDexPc());
774 }
775
776 void IntrinsicLocationsBuilderX86_64::VisitMathCos(HInvoke* invoke) {
777 CreateFPToFPCallLocations(arena_, invoke);
778 }
779
780 void IntrinsicCodeGeneratorX86_64::VisitMathCos(HInvoke* invoke) {
781 GenFPToFPCall(invoke, codegen_, kQuickCos);
782 }
783
784 void IntrinsicLocationsBuilderX86_64::VisitMathSin(HInvoke* invoke) {
785 CreateFPToFPCallLocations(arena_, invoke);
786 }
787
788 void IntrinsicCodeGeneratorX86_64::VisitMathSin(HInvoke* invoke) {
789 GenFPToFPCall(invoke, codegen_, kQuickSin);
790 }
791
792 void IntrinsicLocationsBuilderX86_64::VisitMathAcos(HInvoke* invoke) {
793 CreateFPToFPCallLocations(arena_, invoke);
794 }
795
796 void IntrinsicCodeGeneratorX86_64::VisitMathAcos(HInvoke* invoke) {
797 GenFPToFPCall(invoke, codegen_, kQuickAcos);
798 }
799
800 void IntrinsicLocationsBuilderX86_64::VisitMathAsin(HInvoke* invoke) {
801 CreateFPToFPCallLocations(arena_, invoke);
802 }
803
804 void IntrinsicCodeGeneratorX86_64::VisitMathAsin(HInvoke* invoke) {
805 GenFPToFPCall(invoke, codegen_, kQuickAsin);
806 }
807
808 void IntrinsicLocationsBuilderX86_64::VisitMathAtan(HInvoke* invoke) {
809 CreateFPToFPCallLocations(arena_, invoke);
810 }
811
812 void IntrinsicCodeGeneratorX86_64::VisitMathAtan(HInvoke* invoke) {
813 GenFPToFPCall(invoke, codegen_, kQuickAtan);
814 }
815
816 void IntrinsicLocationsBuilderX86_64::VisitMathCbrt(HInvoke* invoke) {
817 CreateFPToFPCallLocations(arena_, invoke);
818 }
819
820 void IntrinsicCodeGeneratorX86_64::VisitMathCbrt(HInvoke* invoke) {
821 GenFPToFPCall(invoke, codegen_, kQuickCbrt);
822 }
823
824 void IntrinsicLocationsBuilderX86_64::VisitMathCosh(HInvoke* invoke) {
825 CreateFPToFPCallLocations(arena_, invoke);
826 }
827
828 void IntrinsicCodeGeneratorX86_64::VisitMathCosh(HInvoke* invoke) {
829 GenFPToFPCall(invoke, codegen_, kQuickCosh);
830 }
831
832 void IntrinsicLocationsBuilderX86_64::VisitMathExp(HInvoke* invoke) {
833 CreateFPToFPCallLocations(arena_, invoke);
834 }
835
836 void IntrinsicCodeGeneratorX86_64::VisitMathExp(HInvoke* invoke) {
837 GenFPToFPCall(invoke, codegen_, kQuickExp);
838 }
839
840 void IntrinsicLocationsBuilderX86_64::VisitMathExpm1(HInvoke* invoke) {
841 CreateFPToFPCallLocations(arena_, invoke);
842 }
843
844 void IntrinsicCodeGeneratorX86_64::VisitMathExpm1(HInvoke* invoke) {
845 GenFPToFPCall(invoke, codegen_, kQuickExpm1);
846 }
847
848 void IntrinsicLocationsBuilderX86_64::VisitMathLog(HInvoke* invoke) {
849 CreateFPToFPCallLocations(arena_, invoke);
850 }
851
852 void IntrinsicCodeGeneratorX86_64::VisitMathLog(HInvoke* invoke) {
853 GenFPToFPCall(invoke, codegen_, kQuickLog);
854 }
855
856 void IntrinsicLocationsBuilderX86_64::VisitMathLog10(HInvoke* invoke) {
857 CreateFPToFPCallLocations(arena_, invoke);
858 }
859
860 void IntrinsicCodeGeneratorX86_64::VisitMathLog10(HInvoke* invoke) {
861 GenFPToFPCall(invoke, codegen_, kQuickLog10);
862 }
863
864 void IntrinsicLocationsBuilderX86_64::VisitMathSinh(HInvoke* invoke) {
865 CreateFPToFPCallLocations(arena_, invoke);
866 }
867
868 void IntrinsicCodeGeneratorX86_64::VisitMathSinh(HInvoke* invoke) {
869 GenFPToFPCall(invoke, codegen_, kQuickSinh);
870 }
871
872 void IntrinsicLocationsBuilderX86_64::VisitMathTan(HInvoke* invoke) {
873 CreateFPToFPCallLocations(arena_, invoke);
874 }
875
876 void IntrinsicCodeGeneratorX86_64::VisitMathTan(HInvoke* invoke) {
877 GenFPToFPCall(invoke, codegen_, kQuickTan);
878 }
879
880 void IntrinsicLocationsBuilderX86_64::VisitMathTanh(HInvoke* invoke) {
881 CreateFPToFPCallLocations(arena_, invoke);
882 }
883
884 void IntrinsicCodeGeneratorX86_64::VisitMathTanh(HInvoke* invoke) {
885 GenFPToFPCall(invoke, codegen_, kQuickTanh);
886 }
887
888 static void CreateFPFPToFPCallLocations(ArenaAllocator* arena,
889 HInvoke* invoke) {
890 LocationSummary* locations = new (arena) LocationSummary(invoke,
891 LocationSummary::kCallOnMainOnly,
892 kIntrinsified);
893 InvokeRuntimeCallingConvention calling_convention;
894 locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0)));
895 locations->SetInAt(1, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(1)));
896 locations->SetOut(Location::FpuRegisterLocation(XMM0));
897
898 // We have to ensure that the native code doesn't clobber the XMM registers which are
899 // non-volatile for ART, but volatile for Native calls. This will ensure that they are
900 // saved in the prologue and properly restored.
901 for (auto fp_reg : non_volatile_xmm_regs) {
902 locations->AddTemp(Location::FpuRegisterLocation(fp_reg));
903 }
904 }
905
906 void IntrinsicLocationsBuilderX86_64::VisitMathAtan2(HInvoke* invoke) {
907 CreateFPFPToFPCallLocations(arena_, invoke);
908 }
909
910 void IntrinsicCodeGeneratorX86_64::VisitMathAtan2(HInvoke* invoke) {
911 GenFPToFPCall(invoke, codegen_, kQuickAtan2);
912 }
913
914 void IntrinsicLocationsBuilderX86_64::VisitMathHypot(HInvoke* invoke) {
915 CreateFPFPToFPCallLocations(arena_, invoke);
916 }
917
918 void IntrinsicCodeGeneratorX86_64::VisitMathHypot(HInvoke* invoke) {
919 GenFPToFPCall(invoke, codegen_, kQuickHypot);
920 }
921
922 void IntrinsicLocationsBuilderX86_64::VisitMathNextAfter(HInvoke* invoke) {
923 CreateFPFPToFPCallLocations(arena_, invoke);
924 }
925
926 void IntrinsicCodeGeneratorX86_64::VisitMathNextAfter(HInvoke* invoke) {
927 GenFPToFPCall(invoke, codegen_, kQuickNextAfter);
928 }
929
930 void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopyChar(HInvoke* invoke) {
931 // Check to see if we have known failures that will cause us to have to bail out
932 // to the runtime, and just generate the runtime call directly.
933 HIntConstant* src_pos = invoke->InputAt(1)->AsIntConstant();
934 HIntConstant* dest_pos = invoke->InputAt(3)->AsIntConstant();
935
936 // The positions must be non-negative.
937 if ((src_pos != nullptr && src_pos->GetValue() < 0) ||
938 (dest_pos != nullptr && dest_pos->GetValue() < 0)) {
939 // We will have to fail anyway.
940 return;
941 }
942
943 // The length must be >= 0.
944 HIntConstant* length = invoke->InputAt(4)->AsIntConstant();
945 if (length != nullptr) {
946 int32_t len = length->GetValue();
947 if (len < 0) {
948 // Just call as normal.
949 return;
950 }
951 }
952
953 LocationSummary* locations = new (arena_) LocationSummary(invoke,
954 LocationSummary::kCallOnSlowPath,
955 kIntrinsified);
956 // arraycopy(Object src, int src_pos, Object dest, int dest_pos, int length).
957 locations->SetInAt(0, Location::RequiresRegister());
958 locations->SetInAt(1, Location::RegisterOrConstant(invoke->InputAt(1)));
959 locations->SetInAt(2, Location::RequiresRegister());
960 locations->SetInAt(3, Location::RegisterOrConstant(invoke->InputAt(3)));
961 locations->SetInAt(4, Location::RegisterOrConstant(invoke->InputAt(4)));
962
963 // And we need some temporaries. We will use REP MOVSW, so we need fixed registers.
964 locations->AddTemp(Location::RegisterLocation(RSI));
965 locations->AddTemp(Location::RegisterLocation(RDI));
966 locations->AddTemp(Location::RegisterLocation(RCX));
967 }
968
969 static void CheckPosition(X86_64Assembler* assembler,
970 Location pos,
971 CpuRegister input,
972 Location length,
973 SlowPathCode* slow_path,
974 CpuRegister temp,
975 bool length_is_input_length = false) {
976 // Where is the length in the Array?
977 const uint32_t length_offset = mirror::Array::LengthOffset().Uint32Value();
978
979 if (pos.IsConstant()) {
980 int32_t pos_const = pos.GetConstant()->AsIntConstant()->GetValue();
981 if (pos_const == 0) {
982 if (!length_is_input_length) {
983 // Check that length(input) >= length.
984 if (length.IsConstant()) {
985 __ cmpl(Address(input, length_offset),
986 Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
987 } else {
988 __ cmpl(Address(input, length_offset), length.AsRegister<CpuRegister>());
989 }
990 __ j(kLess, slow_path->GetEntryLabel());
991 }
992 } else {
993 // Check that length(input) >= pos.
994 __ movl(temp, Address(input, length_offset));
995 __ subl(temp, Immediate(pos_const));
996 __ j(kLess, slow_path->GetEntryLabel());
997
998 // Check that (length(input) - pos) >= length.
999 if (length.IsConstant()) {
1000 __ cmpl(temp, Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
1001 } else {
1002 __ cmpl(temp, length.AsRegister<CpuRegister>());
1003 }
1004 __ j(kLess, slow_path->GetEntryLabel());
1005 }
1006 } else if (length_is_input_length) {
1007 // The only way the copy can succeed is if pos is zero.
1008 CpuRegister pos_reg = pos.AsRegister<CpuRegister>();
1009 __ testl(pos_reg, pos_reg);
1010 __ j(kNotEqual, slow_path->GetEntryLabel());
1011 } else {
1012 // Check that pos >= 0.
1013 CpuRegister pos_reg = pos.AsRegister<CpuRegister>();
1014 __ testl(pos_reg, pos_reg);
1015 __ j(kLess, slow_path->GetEntryLabel());
1016
1017 // Check that pos <= length(input).
1018 __ cmpl(Address(input, length_offset), pos_reg);
1019 __ j(kLess, slow_path->GetEntryLabel());
1020
1021 // Check that (length(input) - pos) >= length.
1022 __ movl(temp, Address(input, length_offset));
1023 __ subl(temp, pos_reg);
1024 if (length.IsConstant()) {
1025 __ cmpl(temp, Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
1026 } else {
1027 __ cmpl(temp, length.AsRegister<CpuRegister>());
1028 }
1029 __ j(kLess, slow_path->GetEntryLabel());
1030 }
1031 }
1032
1033 void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopyChar(HInvoke* invoke) {
1034 X86_64Assembler* assembler = GetAssembler();
1035 LocationSummary* locations = invoke->GetLocations();
1036
1037 CpuRegister src = locations->InAt(0).AsRegister<CpuRegister>();
1038 Location src_pos = locations->InAt(1);
1039 CpuRegister dest = locations->InAt(2).AsRegister<CpuRegister>();
1040 Location dest_pos = locations->InAt(3);
1041 Location length = locations->InAt(4);
1042
1043 // Temporaries that we need for MOVSW.
1044 CpuRegister src_base = locations->GetTemp(0).AsRegister<CpuRegister>();
1045 DCHECK_EQ(src_base.AsRegister(), RSI);
1046 CpuRegister dest_base = locations->GetTemp(1).AsRegister<CpuRegister>();
1047 DCHECK_EQ(dest_base.AsRegister(), RDI);
1048 CpuRegister count = locations->GetTemp(2).AsRegister<CpuRegister>();
1049 DCHECK_EQ(count.AsRegister(), RCX);
1050
1051 SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke);
1052 codegen_->AddSlowPath(slow_path);
1053
1054 // Bail out if the source and destination are the same.
1055 __ cmpl(src, dest);
1056 __ j(kEqual, slow_path->GetEntryLabel());
1057
1058 // Bail out if the source is null.
1059 __ testl(src, src);
1060 __ j(kEqual, slow_path->GetEntryLabel());
1061
1062 // Bail out if the destination is null.
1063 __ testl(dest, dest);
1064 __ j(kEqual, slow_path->GetEntryLabel());
1065
1066 // If the length is negative, bail out.
1067 // We have already checked in the LocationsBuilder for the constant case.
1068 if (!length.IsConstant()) {
1069 __ testl(length.AsRegister<CpuRegister>(), length.AsRegister<CpuRegister>());
1070 __ j(kLess, slow_path->GetEntryLabel());
1071 }
1072
1073 // Validity checks: source. Use src_base as a temporary register.
1074 CheckPosition(assembler, src_pos, src, length, slow_path, src_base);
1075
1076 // Validity checks: dest. Use src_base as a temporary register.
1077 CheckPosition(assembler, dest_pos, dest, length, slow_path, src_base);
1078
1079 // We need the count in RCX.
1080 if (length.IsConstant()) {
1081 __ movl(count, Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
1082 } else {
1083 __ movl(count, length.AsRegister<CpuRegister>());
1084 }
1085
1086 // Okay, everything checks out. Finally time to do the copy.
1087 // Check assumption that sizeof(Char) is 2 (used in scaling below).
1088 const size_t char_size = Primitive::ComponentSize(Primitive::kPrimChar);
1089 DCHECK_EQ(char_size, 2u);
1090
1091 const uint32_t data_offset = mirror::Array::DataOffset(char_size).Uint32Value();
1092
1093 if (src_pos.IsConstant()) {
1094 int32_t src_pos_const = src_pos.GetConstant()->AsIntConstant()->GetValue();
1095 __ leal(src_base, Address(src, char_size * src_pos_const + data_offset));
1096 } else {
1097 __ leal(src_base, Address(src, src_pos.AsRegister<CpuRegister>(),
1098 ScaleFactor::TIMES_2, data_offset));
1099 }
1100 if (dest_pos.IsConstant()) {
1101 int32_t dest_pos_const = dest_pos.GetConstant()->AsIntConstant()->GetValue();
1102 __ leal(dest_base, Address(dest, char_size * dest_pos_const + data_offset));
1103 } else {
1104 __ leal(dest_base, Address(dest, dest_pos.AsRegister<CpuRegister>(),
1105 ScaleFactor::TIMES_2, data_offset));
1106 }
1107
1108 // Do the move.
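// rep movsw copies RCX 16-bit units from [RSI] to [RDI], advancing both
// pointers (the direction flag is expected to be clear), which is why the three
// temporaries above are pinned to RSI, RDI and RCX.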
1109 __ rep_movsw();
1110
1111 __ Bind(slow_path->GetExitLabel());
1112 }
1113
1114
1115 void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopy(HInvoke* invoke) {
1116 // The only read barrier implementation supporting the
1117 // SystemArrayCopy intrinsic is the Baker-style read barriers.
1118 if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
1119 return;
1120 }
1121
1122 CodeGenerator::CreateSystemArrayCopyLocationSummary(invoke);
1123 }
1124
1125 // Compute base source address, base destination address, and end
1126 // source address for the System.arraycopy intrinsic in `src_base`,
1127 // `dst_base` and `src_end` respectively.
1128 static void GenSystemArrayCopyAddresses(X86_64Assembler* assembler,
1129 Primitive::Type type,
1130 const CpuRegister& src,
1131 const Location& src_pos,
1132 const CpuRegister& dst,
1133 const Location& dst_pos,
1134 const Location& copy_length,
1135 const CpuRegister& src_base,
1136 const CpuRegister& dst_base,
1137 const CpuRegister& src_end) {
1138 // This routine is only used by the SystemArrayCopy intrinsic.
1139 DCHECK_EQ(type, Primitive::kPrimNot);
1140 const int32_t element_size = Primitive::ComponentSize(type);
1141 const ScaleFactor scale_factor = static_cast<ScaleFactor>(Primitive::ComponentSizeShift(type));
1142 const uint32_t data_offset = mirror::Array::DataOffset(element_size).Uint32Value();
1143
1144 if (src_pos.IsConstant()) {
1145 int32_t constant = src_pos.GetConstant()->AsIntConstant()->GetValue();
1146 __ leal(src_base, Address(src, element_size * constant + data_offset));
1147 } else {
1148 __ leal(src_base, Address(src, src_pos.AsRegister<CpuRegister>(), scale_factor, data_offset));
1149 }
1150
1151 if (dst_pos.IsConstant()) {
1152 int32_t constant = dst_pos.GetConstant()->AsIntConstant()->GetValue();
1153 __ leal(dst_base, Address(dst, element_size * constant + data_offset));
1154 } else {
1155 __ leal(dst_base, Address(dst, dst_pos.AsRegister<CpuRegister>(), scale_factor, data_offset));
1156 }
1157
1158 if (copy_length.IsConstant()) {
1159 int32_t constant = copy_length.GetConstant()->AsIntConstant()->GetValue();
1160 __ leal(src_end, Address(src_base, element_size * constant));
1161 } else {
1162 __ leal(src_end, Address(src_base, copy_length.AsRegister<CpuRegister>(), scale_factor, 0));
1163 }
1164 }
1165
1166 void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) {
1167 // The only read barrier implementation supporting the
1168 // SystemArrayCopy intrinsic is the Baker-style read barriers.
1169 DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier);
1170
1171 X86_64Assembler* assembler = GetAssembler();
1172 LocationSummary* locations = invoke->GetLocations();
1173
1174 uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
1175 uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
1176 uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
1177 uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
1178 uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
1179
1180 CpuRegister src = locations->InAt(0).AsRegister<CpuRegister>();
1181 Location src_pos = locations->InAt(1);
1182 CpuRegister dest = locations->InAt(2).AsRegister<CpuRegister>();
1183 Location dest_pos = locations->InAt(3);
1184 Location length = locations->InAt(4);
1185 Location temp1_loc = locations->GetTemp(0);
1186 CpuRegister temp1 = temp1_loc.AsRegister<CpuRegister>();
1187 Location temp2_loc = locations->GetTemp(1);
1188 CpuRegister temp2 = temp2_loc.AsRegister<CpuRegister>();
1189 Location temp3_loc = locations->GetTemp(2);
1190 CpuRegister temp3 = temp3_loc.AsRegister<CpuRegister>();
1191 Location TMP_loc = Location::RegisterLocation(TMP);
1192
1193 SlowPathCode* intrinsic_slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke);
1194 codegen_->AddSlowPath(intrinsic_slow_path);
1195
1196 NearLabel conditions_on_positions_validated;
1197 SystemArrayCopyOptimizations optimizations(invoke);
1198
1199 // If source and destination are the same, we go to slow path if we need to do
1200 // forward copying.
1201 if (src_pos.IsConstant()) {
1202 int32_t src_pos_constant = src_pos.GetConstant()->AsIntConstant()->GetValue();
1203 if (dest_pos.IsConstant()) {
1204 int32_t dest_pos_constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
1205 if (optimizations.GetDestinationIsSource()) {
1206 // Checked when building locations.
1207 DCHECK_GE(src_pos_constant, dest_pos_constant);
1208 } else if (src_pos_constant < dest_pos_constant) {
1209 __ cmpl(src, dest);
1210 __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
1211 }
1212 } else {
1213 if (!optimizations.GetDestinationIsSource()) {
1214 __ cmpl(src, dest);
1215 __ j(kNotEqual, &conditions_on_positions_validated);
1216 }
1217 __ cmpl(dest_pos.AsRegister<CpuRegister>(), Immediate(src_pos_constant));
1218 __ j(kGreater, intrinsic_slow_path->GetEntryLabel());
1219 }
1220 } else {
1221 if (!optimizations.GetDestinationIsSource()) {
1222 __ cmpl(src, dest);
1223 __ j(kNotEqual, &conditions_on_positions_validated);
1224 }
1225 if (dest_pos.IsConstant()) {
1226 int32_t dest_pos_constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
1227 __ cmpl(src_pos.AsRegister<CpuRegister>(), Immediate(dest_pos_constant));
1228 __ j(kLess, intrinsic_slow_path->GetEntryLabel());
1229 } else {
1230 __ cmpl(src_pos.AsRegister<CpuRegister>(), dest_pos.AsRegister<CpuRegister>());
1231 __ j(kLess, intrinsic_slow_path->GetEntryLabel());
1232 }
1233 }
1234
1235 __ Bind(&conditions_on_positions_validated);
1236
1237 if (!optimizations.GetSourceIsNotNull()) {
1238 // Bail out if the source is null.
1239 __ testl(src, src);
1240 __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
1241 }
1242
1243 if (!optimizations.GetDestinationIsNotNull() && !optimizations.GetDestinationIsSource()) {
1244 // Bail out if the destination is null.
1245 __ testl(dest, dest);
1246 __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
1247 }
1248
1249 // If the length is negative, bail out.
1250 // We have already checked in the LocationsBuilder for the constant case.
1251 if (!length.IsConstant() &&
1252 !optimizations.GetCountIsSourceLength() &&
1253 !optimizations.GetCountIsDestinationLength()) {
1254 __ testl(length.AsRegister<CpuRegister>(), length.AsRegister<CpuRegister>());
1255 __ j(kLess, intrinsic_slow_path->GetEntryLabel());
1256 }
1257
1258 // Validity checks: source.
1259 CheckPosition(assembler,
1260 src_pos,
1261 src,
1262 length,
1263 intrinsic_slow_path,
1264 temp1,
1265 optimizations.GetCountIsSourceLength());
1266
1267 // Validity checks: dest.
1268 CheckPosition(assembler,
1269 dest_pos,
1270 dest,
1271 length,
1272 intrinsic_slow_path,
1273 temp1,
1274 optimizations.GetCountIsDestinationLength());
1275
1276 if (!optimizations.GetDoesNotNeedTypeCheck()) {
1277 // Check whether all elements of the source array are assignable to the component
1278 // type of the destination array. We do two checks: the classes are the same,
1279 // or the destination is Object[]. If none of these checks succeed, we go to the
1280 // slow path.
1281
1282 bool did_unpoison = false;
1283 if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
1284 // /* HeapReference<Class> */ temp1 = dest->klass_
1285 codegen_->GenerateFieldLoadWithBakerReadBarrier(
1286 invoke, temp1_loc, dest, class_offset, /* needs_null_check */ false);
1287 // Register `temp1` is not trashed by the read barrier emitted
1288 // by GenerateFieldLoadWithBakerReadBarrier below, as that
1289 // method produces a call to a ReadBarrierMarkRegX entry point,
1290 // which saves all potentially live registers, including
1291 // temporaries such as `temp1`.
1292 // /* HeapReference<Class> */ temp2 = src->klass_
1293 codegen_->GenerateFieldLoadWithBakerReadBarrier(
1294 invoke, temp2_loc, src, class_offset, /* needs_null_check */ false);
1295 // If heap poisoning is enabled, `temp1` and `temp2` have been
1296 // unpoisoned by the previous calls to
1297 // GenerateFieldLoadWithBakerReadBarrier.
1298 } else {
1299 // /* HeapReference<Class> */ temp1 = dest->klass_
1300 __ movl(temp1, Address(dest, class_offset));
1301 // /* HeapReference<Class> */ temp2 = src->klass_
1302 __ movl(temp2, Address(src, class_offset));
1303 if (!optimizations.GetDestinationIsNonPrimitiveArray() ||
1304 !optimizations.GetSourceIsNonPrimitiveArray()) {
1305 // One or two of the references need to be unpoisoned. Unpoison them
1306 // both to make the identity check valid.
1307 __ MaybeUnpoisonHeapReference(temp1);
1308 __ MaybeUnpoisonHeapReference(temp2);
1309 did_unpoison = true;
1310 }
1311 }
1312
1313 if (!optimizations.GetDestinationIsNonPrimitiveArray()) {
1314 // Bail out if the destination is not a non primitive array.
1315 if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
1316 // /* HeapReference<Class> */ TMP = temp1->component_type_
1317 codegen_->GenerateFieldLoadWithBakerReadBarrier(
1318 invoke, TMP_loc, temp1, component_offset, /* needs_null_check */ false);
1319 __ testl(CpuRegister(TMP), CpuRegister(TMP));
1320 __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
1321 // If heap poisoning is enabled, `TMP` has been unpoisoned by
1322 // the previous call to GenerateFieldLoadWithBakerReadBarrier.
1323 } else {
1324 // /* HeapReference<Class> */ TMP = temp1->component_type_
1325 __ movl(CpuRegister(TMP), Address(temp1, component_offset));
1326 __ testl(CpuRegister(TMP), CpuRegister(TMP));
1327 __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
1328 __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
1329 }
1330 __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot));
1331 __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
1332 }
1333
1334 if (!optimizations.GetSourceIsNonPrimitiveArray()) {
1335 // Bail out if the source is not a non-primitive array.
1336 if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
1337 // For the same reason given earlier, `temp1` is not trashed by the
1338 // read barrier emitted by GenerateFieldLoadWithBakerReadBarrier below.
1339 // /* HeapReference<Class> */ TMP = temp2->component_type_
1340 codegen_->GenerateFieldLoadWithBakerReadBarrier(
1341 invoke, TMP_loc, temp2, component_offset, /* needs_null_check */ false);
1342 __ testl(CpuRegister(TMP), CpuRegister(TMP));
1343 __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
1344 // If heap poisoning is enabled, `TMP` has been unpoisoned by
1345 // the previous call to GenerateFieldLoadWithBakerReadBarrier.
1346 } else {
1347 // /* HeapReference<Class> */ TMP = temp2->component_type_
1348 __ movl(CpuRegister(TMP), Address(temp2, component_offset));
1349 __ testl(CpuRegister(TMP), CpuRegister(TMP));
1350 __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
1351 __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
1352 }
1353 __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot));
1354 __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
1355 }
1356
1357 __ cmpl(temp1, temp2);
1358
1359 if (optimizations.GetDestinationIsTypedObjectArray()) {
1360 NearLabel do_copy;
1361 __ j(kEqual, &do_copy);
1362 if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
1363 // /* HeapReference<Class> */ temp1 = temp1->component_type_
1364 codegen_->GenerateFieldLoadWithBakerReadBarrier(
1365 invoke, temp1_loc, temp1, component_offset, /* needs_null_check */ false);
1366 // We do not need to emit a read barrier for the following
1367 // heap reference load, as `temp1` is only used in a
1368 // comparison with null below, and this reference is not
1369 // kept afterwards.
1370 __ cmpl(Address(temp1, super_offset), Immediate(0));
1371 } else {
1372 if (!did_unpoison) {
1373 __ MaybeUnpoisonHeapReference(temp1);
1374 }
1375 // /* HeapReference<Class> */ temp1 = temp1->component_type_
1376 __ movl(temp1, Address(temp1, component_offset));
1377 __ MaybeUnpoisonHeapReference(temp1);
1378 // No need to unpoison the following heap reference load, as
1379 // we're comparing against null.
1380 __ cmpl(Address(temp1, super_offset), Immediate(0));
1381 }
1382 __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
1383 __ Bind(&do_copy);
1384 } else {
1385 __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
1386 }
1387 } else if (!optimizations.GetSourceIsNonPrimitiveArray()) {
1388 DCHECK(optimizations.GetDestinationIsNonPrimitiveArray());
1389 // Bail out if the source is not a non-primitive array.
1390 if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
1391 // /* HeapReference<Class> */ temp1 = src->klass_
1392 codegen_->GenerateFieldLoadWithBakerReadBarrier(
1393 invoke, temp1_loc, src, class_offset, /* needs_null_check */ false);
1394 // /* HeapReference<Class> */ TMP = temp1->component_type_
1395 codegen_->GenerateFieldLoadWithBakerReadBarrier(
1396 invoke, TMP_loc, temp1, component_offset, /* needs_null_check */ false);
1397 __ testl(CpuRegister(TMP), CpuRegister(TMP));
1398 __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
1399 } else {
1400 // /* HeapReference<Class> */ temp1 = src->klass_
1401 __ movl(temp1, Address(src, class_offset));
1402 __ MaybeUnpoisonHeapReference(temp1);
1403 // /* HeapReference<Class> */ TMP = temp1->component_type_
1404 __ movl(CpuRegister(TMP), Address(temp1, component_offset));
1405 // No need to unpoison `TMP` now, as we're comparing against null.
1406 __ testl(CpuRegister(TMP), CpuRegister(TMP));
1407 __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
1408 __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
1409 }
1410 __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot));
1411 __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
1412 }
1413
1414 const Primitive::Type type = Primitive::kPrimNot;
1415 const int32_t element_size = Primitive::ComponentSize(type);
1416
1417 // Compute base source address, base destination address, and end
1418 // source address in `temp1`, `temp2` and `temp3` respectively.
1419 GenSystemArrayCopyAddresses(
1420 GetAssembler(), type, src, src_pos, dest, dest_pos, length, temp1, temp2, temp3);
1421
1422 if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
1423 // SystemArrayCopy implementation for Baker read barriers (see
1424 // also CodeGeneratorX86_64::GenerateReferenceLoadWithBakerReadBarrier):
1425 //
1426 // if (src_ptr != end_ptr) {
1427 // uint32_t rb_state = LockWord(src->monitor_).ReadBarrierState();
1428 // lfence; // Load fence or artificial data dependency to prevent load-load reordering
1429 // bool is_gray = (rb_state == ReadBarrier::GrayState());
1430 // if (is_gray) {
1431 // // Slow-path copy.
1432 // do {
1433 // *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++)));
1434 // } while (src_ptr != end_ptr)
1435 // } else {
1436 // // Fast-path copy.
1437 // do {
1438 // *dest_ptr++ = *src_ptr++;
1439 // } while (src_ptr != end_ptr)
1440 // }
1441 // }
1442
1443 NearLabel loop, done;
1444
1445 // Don't enter copy loop if `length == 0`.
1446 __ cmpl(temp1, temp3);
1447 __ j(kEqual, &done);
1448
1449 // Given the numeric representation, it's enough to check the low bit of the rb_state.
1450 static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0");
1451 static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
1452 constexpr uint32_t gray_byte_position = LockWord::kReadBarrierStateShift / kBitsPerByte;
1453 constexpr uint32_t gray_bit_position = LockWord::kReadBarrierStateShift % kBitsPerByte;
1454 constexpr int32_t test_value = static_cast<int8_t>(1 << gray_bit_position);
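// Illustrative arithmetic only (values not asserted here): if LockWord::kReadBarrierStateShift
// were 28, then gray_byte_position == 3, gray_bit_position == 4 and test_value == 0x10, so the
// testb below would probe bit 4 of the lock word's fourth byte.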
1455
1456 // if (rb_state == ReadBarrier::GrayState())
1457 // goto slow_path;
1458 // At this point, just do the "if" and make sure that flags are preserved until the branch.
1459 __ testb(Address(src, monitor_offset + gray_byte_position), Immediate(test_value));
1460
1461 // Load fence to prevent load-load reordering.
1462 // Note that this is a no-op, thanks to the x86-64 memory model.
1463 codegen_->GenerateMemoryBarrier(MemBarrierKind::kLoadAny);
1464
1465 // Slow path used to copy array when `src` is gray.
1466 SlowPathCode* read_barrier_slow_path =
1467 new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathX86_64(invoke);
1468 codegen_->AddSlowPath(read_barrier_slow_path);
1469
1470 // We have done the "if" of the gray bit check above, now branch based on the flags.
1471 __ j(kNotZero, read_barrier_slow_path->GetEntryLabel());
1472
1473 // Fast-path copy.
1474 // Iterate over the arrays and do a raw copy of the objects. We don't need to
1475 // poison/unpoison.
1476 __ Bind(&loop);
1477 __ movl(CpuRegister(TMP), Address(temp1, 0));
1478 __ movl(Address(temp2, 0), CpuRegister(TMP));
1479 __ addl(temp1, Immediate(element_size));
1480 __ addl(temp2, Immediate(element_size));
1481 __ cmpl(temp1, temp3);
1482 __ j(kNotEqual, &loop);
1483
1484 __ Bind(read_barrier_slow_path->GetExitLabel());
1485 __ Bind(&done);
1486 } else {
1487 // Non read barrier code.
1488
1489 // Iterate over the arrays and do a raw copy of the objects. We don't need to
1490 // poison/unpoison.
1491 NearLabel loop, done;
1492 __ cmpl(temp1, temp3);
1493 __ j(kEqual, &done);
1494 __ Bind(&loop);
1495 __ movl(CpuRegister(TMP), Address(temp1, 0));
1496 __ movl(Address(temp2, 0), CpuRegister(TMP));
1497 __ addl(temp1, Immediate(element_size));
1498 __ addl(temp2, Immediate(element_size));
1499 __ cmpl(temp1, temp3);
1500 __ j(kNotEqual, &loop);
1501 __ Bind(&done);
1502 }
1503
1504 // We only need one card marking on the destination array.
1505 codegen_->MarkGCCard(temp1, temp2, dest, CpuRegister(kNoRegister), /* value_can_be_null */ false);
1506
1507 __ Bind(intrinsic_slow_path->GetExitLabel());
1508 }
1509
1510 void IntrinsicLocationsBuilderX86_64::VisitStringCompareTo(HInvoke* invoke) {
1511 LocationSummary* locations = new (arena_) LocationSummary(invoke,
1512 LocationSummary::kCallOnMainAndSlowPath,
1513 kIntrinsified);
1514 InvokeRuntimeCallingConvention calling_convention;
1515 locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
1516 locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
1517 locations->SetOut(Location::RegisterLocation(RAX));
1518 }
1519
1520 void IntrinsicCodeGeneratorX86_64::VisitStringCompareTo(HInvoke* invoke) {
1521 X86_64Assembler* assembler = GetAssembler();
1522 LocationSummary* locations = invoke->GetLocations();
1523
1524 // Note that the null check must have been done earlier.
1525 DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
1526
1527 CpuRegister argument = locations->InAt(1).AsRegister<CpuRegister>();
1528 __ testl(argument, argument);
1529 SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke);
1530 codegen_->AddSlowPath(slow_path);
1531 __ j(kEqual, slow_path->GetEntryLabel());
1532
1533 codegen_->InvokeRuntime(kQuickStringCompareTo, invoke, invoke->GetDexPc(), slow_path);
1534 __ Bind(slow_path->GetExitLabel());
1535 }
1536
1537 void IntrinsicLocationsBuilderX86_64::VisitStringEquals(HInvoke* invoke) {
1538 LocationSummary* locations = new (arena_) LocationSummary(invoke,
1539 LocationSummary::kNoCall,
1540 kIntrinsified);
1541 locations->SetInAt(0, Location::RequiresRegister());
1542 locations->SetInAt(1, Location::RequiresRegister());
1543
1544 // Request temporary registers; RCX and RDI are needed for the repe_cmpsq instruction.
1545 locations->AddTemp(Location::RegisterLocation(RCX));
1546 locations->AddTemp(Location::RegisterLocation(RDI));
1547
1548 // Set the output; RSI is needed for the repe_cmpsq instruction anyway.
1549 locations->SetOut(Location::RegisterLocation(RSI), Location::kOutputOverlap);
1550 }
1551
1552 void IntrinsicCodeGeneratorX86_64::VisitStringEquals(HInvoke* invoke) {
1553 X86_64Assembler* assembler = GetAssembler();
1554 LocationSummary* locations = invoke->GetLocations();
1555
1556 CpuRegister str = locations->InAt(0).AsRegister<CpuRegister>();
1557 CpuRegister arg = locations->InAt(1).AsRegister<CpuRegister>();
1558 CpuRegister rcx = locations->GetTemp(0).AsRegister<CpuRegister>();
1559 CpuRegister rdi = locations->GetTemp(1).AsRegister<CpuRegister>();
1560 CpuRegister rsi = locations->Out().AsRegister<CpuRegister>();
1561
1562 NearLabel end, return_true, return_false;
1563
1564 // Get offsets of count, value, and class fields within a string object.
1565 const uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
1566 const uint32_t value_offset = mirror::String::ValueOffset().Uint32Value();
1567 const uint32_t class_offset = mirror::Object::ClassOffset().Uint32Value();
1568
1569 // Note that the null check must have been done earlier.
1570 DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
1571
1572 StringEqualsOptimizations optimizations(invoke);
1573 if (!optimizations.GetArgumentNotNull()) {
1574 // Check if input is null, return false if it is.
1575 __ testl(arg, arg);
1576 __ j(kEqual, &return_false);
1577 }
1578
1579 if (!optimizations.GetArgumentIsString()) {
1580 // Instanceof check for the argument by comparing class fields.
1581 // All string objects must have the same type since String cannot be subclassed.
1582 // Receiver must be a string object, so its class field is equal to all strings' class fields.
1583 // If the argument is a string object, its class field must be equal to receiver's class field.
1584 __ movl(rcx, Address(str, class_offset));
1585 __ cmpl(rcx, Address(arg, class_offset));
1586 __ j(kNotEqual, &return_false);
1587 }
1588
1589 // Reference equality check, return true if same reference.
1590 __ cmpl(str, arg);
1591 __ j(kEqual, &return_true);
1592
1593 // Load length and compression flag of receiver string.
1594 __ movl(rcx, Address(str, count_offset));
1595 // Check if lengths and compression flags are equal, return false if they're not.
1596 // Two identical strings will always have the same compression style since
1597 // the compression style is decided on allocation.
1598 __ cmpl(rcx, Address(arg, count_offset));
1599 __ j(kNotEqual, &return_false);
1600 // Return true if both strings are empty. Even with string compression `count == 0` means empty.
1601 static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
1602 "Expecting 0=compressed, 1=uncompressed");
1603 __ jrcxz(&return_true);
1604
1605 if (mirror::kUseStringCompression) {
1606 NearLabel string_uncompressed;
1607 // Extract the length and differentiate between both-compressed and both-uncompressed.
1608 // Strings with different compression styles were rejected by the count comparison above.
1609 __ shrl(rcx, Immediate(1));
1610 __ j(kCarrySet, &string_uncompressed);
1611 // Divide string length by 2, rounding up, and continue as if uncompressed.
1612 // Merge clearing the compression flag with +1 for rounding.
1613 __ addl(rcx, Immediate(1));
1614 __ shrl(rcx, Immediate(1));
1615 __ Bind(&string_uncompressed);
1616 }
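// Explanatory sketch of the count encoding this relies on (the static_assert above fixes the
// flag values): count = (length << 1) | flag, with 0 = compressed (8-bit chars) and
// 1 = uncompressed (16-bit chars). After the shrl/addl sequence above, RCX holds the length
// for uncompressed strings, or ceil(length / 2) char-equivalents for compressed strings, so
// both cases can share the 8-bytes-at-a-time repe_cmpsq loop below.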
1617 // Load starting addresses of string values into RSI/RDI as required for repe_cmpsq instruction.
1618 __ leal(rsi, Address(str, value_offset));
1619 __ leal(rdi, Address(arg, value_offset));
1620
1621 // Divide string length by 4 and adjust for lengths not divisible by 4.
1622 __ addl(rcx, Immediate(3));
1623 __ shrl(rcx, Immediate(2));
1624
1625 // Assertions that must hold in order to compare strings 4 characters (uncompressed)
1626 // or 8 characters (compressed) at a time.
1627 DCHECK_ALIGNED(value_offset, 8);
1628 static_assert(IsAligned<8>(kObjectAlignment), "String is not zero padded");
1629
1630 // Loop to compare 4 (uncompressed) or 8 (compressed) characters at a time, starting at the beginning.
1631 __ repe_cmpsq();
1632 // If strings are not equal, zero flag will be cleared.
1633 __ j(kNotEqual, &return_false);
1634
1635 // Return true and exit the function.
1636 // If loop does not result in returning false, we return true.
1637 __ Bind(&return_true);
1638 __ movl(rsi, Immediate(1));
1639 __ jmp(&end);
1640
1641 // Return false and exit the function.
1642 __ Bind(&return_false);
1643 __ xorl(rsi, rsi);
1644 __ Bind(&end);
1645 }
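// Roughly, the code above implements the following fast path (illustrative pseudocode only;
// register assignments and helpers omitted):
//
//   if (arg == null) return false;
//   if (arg.getClass() != String.class) return false;
//   if (str == arg) return true;
//   if (str.count != arg.count) return false;   // length and compression flag
//   return the value arrays are byte-wise equal (compared 8 bytes at a time).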
1646
1647 static void CreateStringIndexOfLocations(HInvoke* invoke,
1648 ArenaAllocator* allocator,
1649 bool start_at_zero) {
1650 LocationSummary* locations = new (allocator) LocationSummary(invoke,
1651 LocationSummary::kCallOnSlowPath,
1652 kIntrinsified);
1653 // The data needs to be in RDI for scasw. So request that the string is there, anyway.
1654 locations->SetInAt(0, Location::RegisterLocation(RDI));
1655 // If we look for a constant char, we'll still have to copy it into RAX. So just request the
1656 // allocator to do that, anyway. We can still do the constant check by checking the parameter
1657 // of the instruction explicitly.
1658 // Note: This works as we don't clobber RAX anywhere.
1659 locations->SetInAt(1, Location::RegisterLocation(RAX));
1660 if (!start_at_zero) {
1661 locations->SetInAt(2, Location::RequiresRegister()); // The starting index.
1662 }
1663 // As we clobber RDI during execution anyways, also use it as the output.
1664 locations->SetOut(Location::SameAsFirstInput());
1665
1666 // repne scasw uses RCX as the counter.
1667 locations->AddTemp(Location::RegisterLocation(RCX));
1668 // Need another temporary to be able to compute the result.
1669 locations->AddTemp(Location::RequiresRegister());
1670 }
1671
1672 static void GenerateStringIndexOf(HInvoke* invoke,
1673 X86_64Assembler* assembler,
1674 CodeGeneratorX86_64* codegen,
1675 ArenaAllocator* allocator,
1676 bool start_at_zero) {
1677 LocationSummary* locations = invoke->GetLocations();
1678
1679 // Note that the null check must have been done earlier.
1680 DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
1681
1682 CpuRegister string_obj = locations->InAt(0).AsRegister<CpuRegister>();
1683 CpuRegister search_value = locations->InAt(1).AsRegister<CpuRegister>();
1684 CpuRegister counter = locations->GetTemp(0).AsRegister<CpuRegister>();
1685 CpuRegister string_length = locations->GetTemp(1).AsRegister<CpuRegister>();
1686 CpuRegister out = locations->Out().AsRegister<CpuRegister>();
1687
1688 // Check our assumptions for registers.
1689 DCHECK_EQ(string_obj.AsRegister(), RDI);
1690 DCHECK_EQ(search_value.AsRegister(), RAX);
1691 DCHECK_EQ(counter.AsRegister(), RCX);
1692 DCHECK_EQ(out.AsRegister(), RDI);
1693
1694 // Check for code points > 0xFFFF. Either a slow-path check when we don't know statically,
1695 // or directly dispatch for a large constant, or omit slow-path for a small constant or a char.
1696 SlowPathCode* slow_path = nullptr;
1697 HInstruction* code_point = invoke->InputAt(1);
1698 if (code_point->IsIntConstant()) {
1699 if (static_cast<uint32_t>(code_point->AsIntConstant()->GetValue()) >
1700 std::numeric_limits<uint16_t>::max()) {
1701 // Always needs the slow-path. We could directly dispatch to it, but this case should be
1702 // rare, so for simplicity just put the full slow-path down and branch unconditionally.
1703 slow_path = new (allocator) IntrinsicSlowPathX86_64(invoke);
1704 codegen->AddSlowPath(slow_path);
1705 __ jmp(slow_path->GetEntryLabel());
1706 __ Bind(slow_path->GetExitLabel());
1707 return;
1708 }
1709 } else if (code_point->GetType() != Primitive::kPrimChar) {
1710 __ cmpl(search_value, Immediate(std::numeric_limits<uint16_t>::max()));
1711 slow_path = new (allocator) IntrinsicSlowPathX86_64(invoke);
1712 codegen->AddSlowPath(slow_path);
1713 __ j(kAbove, slow_path->GetEntryLabel());
1714 }
1715
1716 // From here down, we know that we are looking for a char that fits in
1717 // 16 bits (uncompressed) or 8 bits (compressed).
1718 // Location of reference to data array within the String object.
1719 int32_t value_offset = mirror::String::ValueOffset().Int32Value();
1720 // Location of count within the String object.
1721 int32_t count_offset = mirror::String::CountOffset().Int32Value();
1722
1723 // Load the count field of the string containing the length and compression flag.
1724 __ movl(string_length, Address(string_obj, count_offset));
1725
1726 // Do a zero-length check. Even with string compression `count == 0` means empty.
1727 // TODO: Support jecxz.
1728 NearLabel not_found_label;
1729 __ testl(string_length, string_length);
1730 __ j(kEqual, ¬_found_label);
1731
1732 if (mirror::kUseStringCompression) {
1733 // Use TMP to keep string_length_flagged.
1734 __ movl(CpuRegister(TMP), string_length);
1735 // Mask out first bit used as compression flag.
1736 __ shrl(string_length, Immediate(1));
1737 }
1738
1739 if (start_at_zero) {
1740 // Number of chars to scan is the same as the string length.
1741 __ movl(counter, string_length);
1742 // Move to the start of the string.
1743 __ addq(string_obj, Immediate(value_offset));
1744 } else {
1745 CpuRegister start_index = locations->InAt(2).AsRegister<CpuRegister>();
1746
1747 // Do a start_index check.
1748 __ cmpl(start_index, string_length);
1749 __ j(kGreaterEqual, ¬_found_label);
1750
1751 // Ensure we have a start index >= 0.
1752 __ xorl(counter, counter);
1753 __ cmpl(start_index, Immediate(0));
1754 __ cmov(kGreater, counter, start_index, /* is64bit */ false); // 32-bit copy is enough.
1755
1756 if (mirror::kUseStringCompression) {
1757 NearLabel modify_counter, offset_uncompressed_label;
1758 __ testl(CpuRegister(TMP), Immediate(1));
1759 __ j(kNotZero, &offset_uncompressed_label);
1760 __ leaq(string_obj, Address(string_obj, counter, ScaleFactor::TIMES_1, value_offset));
1761 __ jmp(&modify_counter);
1762 // Move to the start of the string: string_obj + value_offset + 2 * start_index.
1763 __ Bind(&offset_uncompressed_label);
1764 __ leaq(string_obj, Address(string_obj, counter, ScaleFactor::TIMES_2, value_offset));
1765 __ Bind(&modify_counter);
1766 } else {
1767 __ leaq(string_obj, Address(string_obj, counter, ScaleFactor::TIMES_2, value_offset));
1768 }
1769 // Now update RCX, the work counter: it will be string.length - start_index.
1770 __ negq(counter); // Needs to be 64-bit negation, as the address computation is 64-bit.
1771 __ leaq(counter, Address(string_length, counter, ScaleFactor::TIMES_1, 0));
1772 }
1773
1774 if (mirror::kUseStringCompression) {
1775 NearLabel uncompressed_string_comparison;
1776 NearLabel comparison_done;
1777 __ testl(CpuRegister(TMP), Immediate(1));
1778 __ j(kNotZero, &uncompressed_string_comparison);
1779 // Check if RAX (search_value) is ASCII.
1780 __ cmpl(search_value, Immediate(127));
1781 __ j(kGreater, ¬_found_label);
1782 // Comparing byte-per-byte.
1783 __ repne_scasb();
1784 __ jmp(&comparison_done);
1785 // Everything is set up for repne scasw:
1786 // * Comparison address in RDI.
1787 // * Counter in ECX.
1788 __ Bind(&uncompressed_string_comparison);
1789 __ repne_scasw();
1790 __ Bind(&comparison_done);
1791 } else {
1792 __ repne_scasw();
1793 }
1794 // Did we find a match?
1795 __ j(kNotEqual, ¬_found_label);
1796
1797 // Yes, we matched. Compute the index of the result.
1798 __ subl(string_length, counter);
1799 __ leal(out, Address(string_length, -1));
1800
1801 NearLabel done;
1802 __ jmp(&done);
1803
1804 // Failed to match; return -1.
1805 __ Bind(¬_found_label);
1806 __ movl(out, Immediate(-1));
1807
1808 // And join up at the end.
1809 __ Bind(&done);
1810 if (slow_path != nullptr) {
1811 __ Bind(slow_path->GetExitLabel());
1812 }
1813 }
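// In outline, the code generated above behaves like this illustrative pseudocode (names are
// descriptive only, not real helpers):
//
//   if (count == 0) return -1;                                   // empty string
//   length = kUseStringCompression ? (count >> 1) : count;
//   if (!start_at_zero && start_index >= length) return -1;
//   from = start_at_zero ? 0 : max(start_index, 0);
//   // compressed strings: a search value > 127 can never match and returns -1
//   scan [from, length) with repne scasb (compressed) or repne scasw (uncompressed);
//   return match_found ? match_index : -1;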
1814
1815 void IntrinsicLocationsBuilderX86_64::VisitStringIndexOf(HInvoke* invoke) {
1816 CreateStringIndexOfLocations(invoke, arena_, /* start_at_zero */ true);
1817 }
1818
1819 void IntrinsicCodeGeneratorX86_64::VisitStringIndexOf(HInvoke* invoke) {
1820 GenerateStringIndexOf(invoke, GetAssembler(), codegen_, GetAllocator(), /* start_at_zero */ true);
1821 }
1822
1823 void IntrinsicLocationsBuilderX86_64::VisitStringIndexOfAfter(HInvoke* invoke) {
1824 CreateStringIndexOfLocations(invoke, arena_, /* start_at_zero */ false);
1825 }
1826
1827 void IntrinsicCodeGeneratorX86_64::VisitStringIndexOfAfter(HInvoke* invoke) {
1828 GenerateStringIndexOf(
1829 invoke, GetAssembler(), codegen_, GetAllocator(), /* start_at_zero */ false);
1830 }
1831
1832 void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromBytes(HInvoke* invoke) {
1833 LocationSummary* locations = new (arena_) LocationSummary(invoke,
1834 LocationSummary::kCallOnMainAndSlowPath,
1835 kIntrinsified);
1836 InvokeRuntimeCallingConvention calling_convention;
1837 locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
1838 locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
1839 locations->SetInAt(2, Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
1840 locations->SetInAt(3, Location::RegisterLocation(calling_convention.GetRegisterAt(3)));
1841 locations->SetOut(Location::RegisterLocation(RAX));
1842 }
1843
1844 void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromBytes(HInvoke* invoke) {
1845 X86_64Assembler* assembler = GetAssembler();
1846 LocationSummary* locations = invoke->GetLocations();
1847
1848 CpuRegister byte_array = locations->InAt(0).AsRegister<CpuRegister>();
1849 __ testl(byte_array, byte_array);
1850 SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke);
1851 codegen_->AddSlowPath(slow_path);
1852 __ j(kEqual, slow_path->GetEntryLabel());
1853
1854 codegen_->InvokeRuntime(kQuickAllocStringFromBytes, invoke, invoke->GetDexPc());
1855 CheckEntrypointTypes<kQuickAllocStringFromBytes, void*, void*, int32_t, int32_t, int32_t>();
1856 __ Bind(slow_path->GetExitLabel());
1857 }
1858
1859 void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromChars(HInvoke* invoke) {
1860 LocationSummary* locations = new (arena_) LocationSummary(invoke,
1861 LocationSummary::kCallOnMainOnly,
1862 kIntrinsified);
1863 InvokeRuntimeCallingConvention calling_convention;
1864 locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
1865 locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
1866 locations->SetInAt(2, Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
1867 locations->SetOut(Location::RegisterLocation(RAX));
1868 }
1869
1870 void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromChars(HInvoke* invoke) {
1871 // No need to emit code checking whether `locations->InAt(2)` is a null
1872 // pointer, as callers of the native method
1873 //
1874 // java.lang.StringFactory.newStringFromChars(int offset, int charCount, char[] data)
1875 //
1876 // all include a null check on `data` before calling that method.
1877 codegen_->InvokeRuntime(kQuickAllocStringFromChars, invoke, invoke->GetDexPc());
1878 CheckEntrypointTypes<kQuickAllocStringFromChars, void*, int32_t, int32_t, void*>();
1879 }
1880
1881 void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromString(HInvoke* invoke) {
1882 LocationSummary* locations = new (arena_) LocationSummary(invoke,
1883 LocationSummary::kCallOnMainAndSlowPath,
1884 kIntrinsified);
1885 InvokeRuntimeCallingConvention calling_convention;
1886 locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
1887 locations->SetOut(Location::RegisterLocation(RAX));
1888 }
1889
1890 void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromString(HInvoke* invoke) {
1891 X86_64Assembler* assembler = GetAssembler();
1892 LocationSummary* locations = invoke->GetLocations();
1893
1894 CpuRegister string_to_copy = locations->InAt(0).AsRegister<CpuRegister>();
1895 __ testl(string_to_copy, string_to_copy);
1896 SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke);
1897 codegen_->AddSlowPath(slow_path);
1898 __ j(kEqual, slow_path->GetEntryLabel());
1899
1900 codegen_->InvokeRuntime(kQuickAllocStringFromString, invoke, invoke->GetDexPc());
1901 CheckEntrypointTypes<kQuickAllocStringFromString, void*, void*>();
1902 __ Bind(slow_path->GetExitLabel());
1903 }
1904
1905 void IntrinsicLocationsBuilderX86_64::VisitStringGetCharsNoCheck(HInvoke* invoke) {
1906 // public void getChars(int srcBegin, int srcEnd, char[] dst, int dstBegin);
1907 LocationSummary* locations = new (arena_) LocationSummary(invoke,
1908 LocationSummary::kNoCall,
1909 kIntrinsified);
1910 locations->SetInAt(0, Location::RequiresRegister());
1911 locations->SetInAt(1, Location::RegisterOrConstant(invoke->InputAt(1)));
1912 locations->SetInAt(2, Location::RequiresRegister());
1913 locations->SetInAt(3, Location::RequiresRegister());
1914 locations->SetInAt(4, Location::RequiresRegister());
1915
1916 // And we need some temporaries. We will use REP MOVSW, so we need fixed registers.
1917 locations->AddTemp(Location::RegisterLocation(RSI));
1918 locations->AddTemp(Location::RegisterLocation(RDI));
1919 locations->AddTemp(Location::RegisterLocation(RCX));
1920 }
1921
1922 void IntrinsicCodeGeneratorX86_64::VisitStringGetCharsNoCheck(HInvoke* invoke) {
1923 X86_64Assembler* assembler = GetAssembler();
1924 LocationSummary* locations = invoke->GetLocations();
1925
1926 size_t char_component_size = Primitive::ComponentSize(Primitive::kPrimChar);
1927 // Location of data in char array buffer.
1928 const uint32_t data_offset = mirror::Array::DataOffset(char_component_size).Uint32Value();
1929 // Location of char array data in string.
1930 const uint32_t value_offset = mirror::String::ValueOffset().Uint32Value();
1931
1932 // public void getChars(int srcBegin, int srcEnd, char[] dst, int dstBegin);
1933 CpuRegister obj = locations->InAt(0).AsRegister<CpuRegister>();
1934 Location srcBegin = locations->InAt(1);
1935 int srcBegin_value =
1936 srcBegin.IsConstant() ? srcBegin.GetConstant()->AsIntConstant()->GetValue() : 0;
1937 CpuRegister srcEnd = locations->InAt(2).AsRegister<CpuRegister>();
1938 CpuRegister dst = locations->InAt(3).AsRegister<CpuRegister>();
1939 CpuRegister dstBegin = locations->InAt(4).AsRegister<CpuRegister>();
1940
1941 // Check assumption that sizeof(Char) is 2 (used in scaling below).
1942 const size_t char_size = Primitive::ComponentSize(Primitive::kPrimChar);
1943 DCHECK_EQ(char_size, 2u);
1944
1945 NearLabel done;
1946 // Compute the number of chars (words) to move.
1947 __ movl(CpuRegister(RCX), srcEnd);
1948 if (srcBegin.IsConstant()) {
1949 __ subl(CpuRegister(RCX), Immediate(srcBegin_value));
1950 } else {
1951 DCHECK(srcBegin.IsRegister());
1952 __ subl(CpuRegister(RCX), srcBegin.AsRegister<CpuRegister>());
1953 }
1954 if (mirror::kUseStringCompression) {
1955 NearLabel copy_uncompressed, copy_loop;
1956 const size_t c_char_size = Primitive::ComponentSize(Primitive::kPrimByte);
1957 DCHECK_EQ(c_char_size, 1u);
1958 // Location of count in string.
1959 const uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
1960
1961 __ testl(Address(obj, count_offset), Immediate(1));
1962 static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
1963 "Expecting 0=compressed, 1=uncompressed");
1964 __ j(kNotZero, ©_uncompressed);
1965 // Compute the address of the source string by adding the number of chars from
1966 // the source beginning to the value offset of a string.
1967 __ leaq(CpuRegister(RSI),
1968 CodeGeneratorX86_64::ArrayAddress(obj, srcBegin, TIMES_1, value_offset));
1969 // Start the loop to copy String's value to Array of Char.
1970 __ leaq(CpuRegister(RDI), Address(dst, dstBegin, ScaleFactor::TIMES_2, data_offset));
1971
1972 __ Bind(©_loop);
1973 __ jrcxz(&done);
1974 // Use TMP as temporary (convert byte from RSI to word).
1975 // TODO: Selecting RAX as the temporary and using LODSB/STOSW.
1976 __ movzxb(CpuRegister(TMP), Address(CpuRegister(RSI), 0));
1977 __ movw(Address(CpuRegister(RDI), 0), CpuRegister(TMP));
1978 __ leaq(CpuRegister(RDI), Address(CpuRegister(RDI), char_size));
1979 __ leaq(CpuRegister(RSI), Address(CpuRegister(RSI), c_char_size));
1980 // TODO: Add support for LOOP to X86_64Assembler.
1981 __ subl(CpuRegister(RCX), Immediate(1));
1982 __ jmp(©_loop);
1983
1984 __ Bind(©_uncompressed);
1985 }
1986
1987 __ leaq(CpuRegister(RSI),
1988 CodeGeneratorX86_64::ArrayAddress(obj, srcBegin, TIMES_2, value_offset));
1989 // Compute the address of the destination buffer.
1990 __ leaq(CpuRegister(RDI), Address(dst, dstBegin, ScaleFactor::TIMES_2, data_offset));
1991 // Do the move.
1992 __ rep_movsw();
1993
1994 __ Bind(&done);
1995 }
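// Summary (illustrative): getChars copies RCX = srcEnd - srcBegin characters. Uncompressed
// strings use a single `rep movsw`; compressed (8-bit) strings instead use the explicit
// byte-load / word-store widening loop above.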
1996
1997 static void GenPeek(LocationSummary* locations, Primitive::Type size, X86_64Assembler* assembler) {
1998 CpuRegister address = locations->InAt(0).AsRegister<CpuRegister>();
1999 CpuRegister out = locations->Out().AsRegister<CpuRegister>(); // == address, here for clarity.
2000 // x86 allows unaligned access. We do not have to check the input or use specific instructions
2001 // to avoid a SIGBUS.
2002 switch (size) {
2003 case Primitive::kPrimByte:
2004 __ movsxb(out, Address(address, 0));
2005 break;
2006 case Primitive::kPrimShort:
2007 __ movsxw(out, Address(address, 0));
2008 break;
2009 case Primitive::kPrimInt:
2010 __ movl(out, Address(address, 0));
2011 break;
2012 case Primitive::kPrimLong:
2013 __ movq(out, Address(address, 0));
2014 break;
2015 default:
2016 LOG(FATAL) << "Type not recognized for peek: " << size;
2017 UNREACHABLE();
2018 }
2019 }
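// Hedged sketch of the peek semantics implemented above: Memory.peekByte(address) behaves like
// `*(int8_t*) address`, peekShortNative like `*(int16_t*) address`, and so on, with
// movsxb/movsxw providing the sign extension the Java return types require.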
2020
2021 void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekByte(HInvoke* invoke) {
2022 CreateIntToIntLocations(arena_, invoke);
2023 }
2024
2025 void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekByte(HInvoke* invoke) {
2026 GenPeek(invoke->GetLocations(), Primitive::kPrimByte, GetAssembler());
2027 }
2028
2029 void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekIntNative(HInvoke* invoke) {
2030 CreateIntToIntLocations(arena_, invoke);
2031 }
2032
2033 void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekIntNative(HInvoke* invoke) {
2034 GenPeek(invoke->GetLocations(), Primitive::kPrimInt, GetAssembler());
2035 }
2036
2037 void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekLongNative(HInvoke* invoke) {
2038 CreateIntToIntLocations(arena_, invoke);
2039 }
2040
2041 void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekLongNative(HInvoke* invoke) {
2042 GenPeek(invoke->GetLocations(), Primitive::kPrimLong, GetAssembler());
2043 }
2044
2045 void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekShortNative(HInvoke* invoke) {
2046 CreateIntToIntLocations(arena_, invoke);
2047 }
2048
2049 void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekShortNative(HInvoke* invoke) {
2050 GenPeek(invoke->GetLocations(), Primitive::kPrimShort, GetAssembler());
2051 }
2052
2053 static void CreateIntIntToVoidLocations(ArenaAllocator* arena, HInvoke* invoke) {
2054 LocationSummary* locations = new (arena) LocationSummary(invoke,
2055 LocationSummary::kNoCall,
2056 kIntrinsified);
2057 locations->SetInAt(0, Location::RequiresRegister());
2058 locations->SetInAt(1, Location::RegisterOrInt32Constant(invoke->InputAt(1)));
2059 }
2060
2061 static void GenPoke(LocationSummary* locations, Primitive::Type size, X86_64Assembler* assembler) {
2062 CpuRegister address = locations->InAt(0).AsRegister<CpuRegister>();
2063 Location value = locations->InAt(1);
2064 // x86 allows unaligned access. We do not have to check the input or use specific instructions
2065 // to avoid a SIGBUS.
2066 switch (size) {
2067 case Primitive::kPrimByte:
2068 if (value.IsConstant()) {
2069 __ movb(Address(address, 0),
2070 Immediate(CodeGenerator::GetInt32ValueOf(value.GetConstant())));
2071 } else {
2072 __ movb(Address(address, 0), value.AsRegister<CpuRegister>());
2073 }
2074 break;
2075 case Primitive::kPrimShort:
2076 if (value.IsConstant()) {
2077 __ movw(Address(address, 0),
2078 Immediate(CodeGenerator::GetInt32ValueOf(value.GetConstant())));
2079 } else {
2080 __ movw(Address(address, 0), value.AsRegister<CpuRegister>());
2081 }
2082 break;
2083 case Primitive::kPrimInt:
2084 if (value.IsConstant()) {
2085 __ movl(Address(address, 0),
2086 Immediate(CodeGenerator::GetInt32ValueOf(value.GetConstant())));
2087 } else {
2088 __ movl(Address(address, 0), value.AsRegister<CpuRegister>());
2089 }
2090 break;
2091 case Primitive::kPrimLong:
2092 if (value.IsConstant()) {
2093 int64_t v = value.GetConstant()->AsLongConstant()->GetValue();
2094 DCHECK(IsInt<32>(v));
2095 int32_t v_32 = v;
2096 __ movq(Address(address, 0), Immediate(v_32));
2097 } else {
2098 __ movq(Address(address, 0), value.AsRegister<CpuRegister>());
2099 }
2100 break;
2101 default:
2102 LOG(FATAL) << "Type not recognized for poke: " << size;
2103 UNREACHABLE();
2104 }
2105 }
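// Conversely, the poke intrinsics are plain stores, e.g. pokeByte(address, value) is essentially
// `*(int8_t*) address = value` (illustrative sketch). The 64-bit constant case is restricted to
// int32-representable values because `movq mem, imm` only accepts a sign-extended 32-bit
// immediate on x86-64.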
2106
2107 void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeByte(HInvoke* invoke) {
2108 CreateIntIntToVoidLocations(arena_, invoke);
2109 }
2110
2111 void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeByte(HInvoke* invoke) {
2112 GenPoke(invoke->GetLocations(), Primitive::kPrimByte, GetAssembler());
2113 }
2114
2115 void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeIntNative(HInvoke* invoke) {
2116 CreateIntIntToVoidLocations(arena_, invoke);
2117 }
2118
2119 void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeIntNative(HInvoke* invoke) {
2120 GenPoke(invoke->GetLocations(), Primitive::kPrimInt, GetAssembler());
2121 }
2122
2123 void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeLongNative(HInvoke* invoke) {
2124 CreateIntIntToVoidLocations(arena_, invoke);
2125 }
2126
2127 void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeLongNative(HInvoke* invoke) {
2128 GenPoke(invoke->GetLocations(), Primitive::kPrimLong, GetAssembler());
2129 }
2130
2131 void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeShortNative(HInvoke* invoke) {
2132 CreateIntIntToVoidLocations(arena_, invoke);
2133 }
2134
2135 void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeShortNative(HInvoke* invoke) {
2136 GenPoke(invoke->GetLocations(), Primitive::kPrimShort, GetAssembler());
2137 }
2138
2139 void IntrinsicLocationsBuilderX86_64::VisitThreadCurrentThread(HInvoke* invoke) {
2140 LocationSummary* locations = new (arena_) LocationSummary(invoke,
2141 LocationSummary::kNoCall,
2142 kIntrinsified);
2143 locations->SetOut(Location::RequiresRegister());
2144 }
2145
2146 void IntrinsicCodeGeneratorX86_64::VisitThreadCurrentThread(HInvoke* invoke) {
2147 CpuRegister out = invoke->GetLocations()->Out().AsRegister<CpuRegister>();
2148 GetAssembler()->gs()->movl(out, Address::Absolute(Thread::PeerOffset<kX86_64PointerSize>(),
2149 /* no_rip */ true));
2150 }
2151
2152 static void GenUnsafeGet(HInvoke* invoke,
2153 Primitive::Type type,
2154 bool is_volatile ATTRIBUTE_UNUSED,
2155 CodeGeneratorX86_64* codegen) {
2156 X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen->GetAssembler());
2157 LocationSummary* locations = invoke->GetLocations();
2158 Location base_loc = locations->InAt(1);
2159 CpuRegister base = base_loc.AsRegister<CpuRegister>();
2160 Location offset_loc = locations->InAt(2);
2161 CpuRegister offset = offset_loc.AsRegister<CpuRegister>();
2162 Location output_loc = locations->Out();
2163 CpuRegister output = output_loc.AsRegister<CpuRegister>();
2164
2165 switch (type) {
2166 case Primitive::kPrimInt:
2167 __ movl(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
2168 break;
2169
2170 case Primitive::kPrimNot: {
2171 if (kEmitCompilerReadBarrier) {
2172 if (kUseBakerReadBarrier) {
2173 Address src(base, offset, ScaleFactor::TIMES_1, 0);
2174 codegen->GenerateReferenceLoadWithBakerReadBarrier(
2175 invoke, output_loc, base, src, /* needs_null_check */ false);
2176 } else {
2177 __ movl(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
2178 codegen->GenerateReadBarrierSlow(
2179 invoke, output_loc, output_loc, base_loc, 0U, offset_loc);
2180 }
2181 } else {
2182 __ movl(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
2183 __ MaybeUnpoisonHeapReference(output);
2184 }
2185 break;
2186 }
2187
2188 case Primitive::kPrimLong:
2189 __ movq(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
2190 break;
2191
2192 default:
2193 LOG(FATAL) << "Unsupported op size " << type;
2194 UNREACHABLE();
2195 }
2196 }
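// Hedged sketch: Unsafe.getInt(obj, offset) is essentially `*(jint*) ((char*) obj + offset)`,
// getLong the 64-bit equivalent, and getObject the same load plus the read barrier / unpoisoning
// handling above. `is_volatile` needs no extra code here because x86-64 loads already provide
// the required acquire ordering.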
2197
2198 static void CreateIntIntIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) {
2199 bool can_call = kEmitCompilerReadBarrier &&
2200 (invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObject ||
2201 invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile);
2202 LocationSummary* locations = new (arena) LocationSummary(invoke,
2203 (can_call
2204 ? LocationSummary::kCallOnSlowPath
2205 : LocationSummary::kNoCall),
2206 kIntrinsified);
2207 if (can_call && kUseBakerReadBarrier) {
2208 locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty()); // No caller-save registers.
2209 }
2210 locations->SetInAt(0, Location::NoLocation()); // Unused receiver.
2211 locations->SetInAt(1, Location::RequiresRegister());
2212 locations->SetInAt(2, Location::RequiresRegister());
2213 locations->SetOut(Location::RequiresRegister(),
2214 (can_call ? Location::kOutputOverlap : Location::kNoOutputOverlap));
2215 }
2216
2217 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGet(HInvoke* invoke) {
2218 CreateIntIntIntToIntLocations(arena_, invoke);
2219 }
2220 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetVolatile(HInvoke* invoke) {
2221 CreateIntIntIntToIntLocations(arena_, invoke);
2222 }
2223 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetLong(HInvoke* invoke) {
2224 CreateIntIntIntToIntLocations(arena_, invoke);
2225 }
2226 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
2227 CreateIntIntIntToIntLocations(arena_, invoke);
2228 }
2229 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetObject(HInvoke* invoke) {
2230 CreateIntIntIntToIntLocations(arena_, invoke);
2231 }
2232 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
2233 CreateIntIntIntToIntLocations(arena_, invoke);
2234 }
2235
2236
2237 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGet(HInvoke* invoke) {
2238 GenUnsafeGet(invoke, Primitive::kPrimInt, /* is_volatile */ false, codegen_);
2239 }
2240 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetVolatile(HInvoke* invoke) {
2241 GenUnsafeGet(invoke, Primitive::kPrimInt, /* is_volatile */ true, codegen_);
2242 }
2243 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetLong(HInvoke* invoke) {
2244 GenUnsafeGet(invoke, Primitive::kPrimLong, /* is_volatile */ false, codegen_);
2245 }
2246 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
2247 GenUnsafeGet(invoke, Primitive::kPrimLong, /* is_volatile */ true, codegen_);
2248 }
2249 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetObject(HInvoke* invoke) {
2250 GenUnsafeGet(invoke, Primitive::kPrimNot, /* is_volatile */ false, codegen_);
2251 }
2252 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
2253 GenUnsafeGet(invoke, Primitive::kPrimNot, /* is_volatile */ true, codegen_);
2254 }
2255
2256
2257 static void CreateIntIntIntIntToVoidPlusTempsLocations(ArenaAllocator* arena,
2258 Primitive::Type type,
2259 HInvoke* invoke) {
2260 LocationSummary* locations = new (arena) LocationSummary(invoke,
2261 LocationSummary::kNoCall,
2262 kIntrinsified);
2263 locations->SetInAt(0, Location::NoLocation()); // Unused receiver.
2264 locations->SetInAt(1, Location::RequiresRegister());
2265 locations->SetInAt(2, Location::RequiresRegister());
2266 locations->SetInAt(3, Location::RequiresRegister());
2267 if (type == Primitive::kPrimNot) {
2268 // Need temp registers for card-marking.
2269 locations->AddTemp(Location::RequiresRegister()); // Possibly used for reference poisoning too.
2270 locations->AddTemp(Location::RequiresRegister());
2271 }
2272 }
2273
2274 void IntrinsicLocationsBuilderX86_64::VisitUnsafePut(HInvoke* invoke) {
2275 CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimInt, invoke);
2276 }
2277 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutOrdered(HInvoke* invoke) {
2278 CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimInt, invoke);
2279 }
2280 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutVolatile(HInvoke* invoke) {
2281 CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimInt, invoke);
2282 }
2283 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutObject(HInvoke* invoke) {
2284 CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimNot, invoke);
2285 }
2286 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutObjectOrdered(HInvoke* invoke) {
2287 CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimNot, invoke);
2288 }
2289 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutObjectVolatile(HInvoke* invoke) {
2290 CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimNot, invoke);
2291 }
2292 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutLong(HInvoke* invoke) {
2293 CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimLong, invoke);
2294 }
2295 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutLongOrdered(HInvoke* invoke) {
2296 CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimLong, invoke);
2297 }
2298 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutLongVolatile(HInvoke* invoke) {
2299 CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimLong, invoke);
2300 }
2301
2302 // Ordered puts need no special handling here: they require an AnyStore barrier, which is
2303 // already provided by the x86-64 memory model for regular stores.
2304 static void GenUnsafePut(LocationSummary* locations, Primitive::Type type, bool is_volatile,
2305 CodeGeneratorX86_64* codegen) {
2306 X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen->GetAssembler());
2307 CpuRegister base = locations->InAt(1).AsRegister<CpuRegister>();
2308 CpuRegister offset = locations->InAt(2).AsRegister<CpuRegister>();
2309 CpuRegister value = locations->InAt(3).AsRegister<CpuRegister>();
2310
2311 if (type == Primitive::kPrimLong) {
2312 __ movq(Address(base, offset, ScaleFactor::TIMES_1, 0), value);
2313 } else if (kPoisonHeapReferences && type == Primitive::kPrimNot) {
2314 CpuRegister temp = locations->GetTemp(0).AsRegister<CpuRegister>();
2315 __ movl(temp, value);
2316 __ PoisonHeapReference(temp);
2317 __ movl(Address(base, offset, ScaleFactor::TIMES_1, 0), temp);
2318 } else {
2319 __ movl(Address(base, offset, ScaleFactor::TIMES_1, 0), value);
2320 }
2321
2322 if (is_volatile) {
2323 codegen->MemoryFence();
2324 }
2325
2326 if (type == Primitive::kPrimNot) {
2327 bool value_can_be_null = true; // TODO: Worth finding out this information?
2328 codegen->MarkGCCard(locations->GetTemp(0).AsRegister<CpuRegister>(),
2329 locations->GetTemp(1).AsRegister<CpuRegister>(),
2330 base,
2331 value,
2332 value_can_be_null);
2333 }
2334 }
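// Note: for volatile puts, the MemoryFence() call above supplies the AnyAny barrier required
// after the store; how it is emitted (e.g. an mfence or a locked RMW no-op) is a code generator
// detail and is not assumed here.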
2335
2336 void IntrinsicCodeGeneratorX86_64::VisitUnsafePut(HInvoke* invoke) {
2337 GenUnsafePut(invoke->GetLocations(), Primitive::kPrimInt, /* is_volatile */ false, codegen_);
2338 }
2339 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutOrdered(HInvoke* invoke) {
2340 GenUnsafePut(invoke->GetLocations(), Primitive::kPrimInt, /* is_volatile */ false, codegen_);
2341 }
2342 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutVolatile(HInvoke* invoke) {
2343 GenUnsafePut(invoke->GetLocations(), Primitive::kPrimInt, /* is_volatile */ true, codegen_);
2344 }
2345 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutObject(HInvoke* invoke) {
2346 GenUnsafePut(invoke->GetLocations(), Primitive::kPrimNot, /* is_volatile */ false, codegen_);
2347 }
2348 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutObjectOrdered(HInvoke* invoke) {
2349 GenUnsafePut(invoke->GetLocations(), Primitive::kPrimNot, /* is_volatile */ false, codegen_);
2350 }
2351 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutObjectVolatile(HInvoke* invoke) {
2352 GenUnsafePut(invoke->GetLocations(), Primitive::kPrimNot, /* is_volatile */ true, codegen_);
2353 }
2354 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutLong(HInvoke* invoke) {
2355 GenUnsafePut(invoke->GetLocations(), Primitive::kPrimLong, /* is_volatile */ false, codegen_);
2356 }
2357 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutLongOrdered(HInvoke* invoke) {
2358 GenUnsafePut(invoke->GetLocations(), Primitive::kPrimLong, /* is_volatile */ false, codegen_);
2359 }
2360 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutLongVolatile(HInvoke* invoke) {
2361 GenUnsafePut(invoke->GetLocations(), Primitive::kPrimLong, /* is_volatile */ true, codegen_);
2362 }
2363
2364 static void CreateIntIntIntIntIntToInt(ArenaAllocator* arena,
2365 Primitive::Type type,
2366 HInvoke* invoke) {
2367 bool can_call = kEmitCompilerReadBarrier &&
2368 kUseBakerReadBarrier &&
2369 (invoke->GetIntrinsic() == Intrinsics::kUnsafeCASObject);
2370 LocationSummary* locations = new (arena) LocationSummary(invoke,
2371 (can_call
2372 ? LocationSummary::kCallOnSlowPath
2373 : LocationSummary::kNoCall),
2374 kIntrinsified);
2375 locations->SetInAt(0, Location::NoLocation()); // Unused receiver.
2376 locations->SetInAt(1, Location::RequiresRegister());
2377 locations->SetInAt(2, Location::RequiresRegister());
2378 // expected value must be in EAX/RAX.
2379 locations->SetInAt(3, Location::RegisterLocation(RAX));
2380 locations->SetInAt(4, Location::RequiresRegister());
2381
2382 locations->SetOut(Location::RequiresRegister());
2383 if (type == Primitive::kPrimNot) {
2384 // Need temporary registers for card-marking, and possibly for
2385 // (Baker) read barrier.
2386 locations->AddTemp(Location::RequiresRegister()); // Possibly used for reference poisoning too.
2387 locations->AddTemp(Location::RequiresRegister());
2388 }
2389 }
2390
2391 void IntrinsicLocationsBuilderX86_64::VisitUnsafeCASInt(HInvoke* invoke) {
2392 CreateIntIntIntIntIntToInt(arena_, Primitive::kPrimInt, invoke);
2393 }
2394
2395 void IntrinsicLocationsBuilderX86_64::VisitUnsafeCASLong(HInvoke* invoke) {
2396 CreateIntIntIntIntIntToInt(arena_, Primitive::kPrimLong, invoke);
2397 }
2398
2399 void IntrinsicLocationsBuilderX86_64::VisitUnsafeCASObject(HInvoke* invoke) {
2400 // The only read barrier implementation supporting the
2401 // UnsafeCASObject intrinsic is the Baker-style read barriers.
2402 if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
2403 return;
2404 }
2405
2406 CreateIntIntIntIntIntToInt(arena_, Primitive::kPrimNot, invoke);
2407 }
2408
2409 static void GenCAS(Primitive::Type type, HInvoke* invoke, CodeGeneratorX86_64* codegen) {
2410 X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen->GetAssembler());
2411 LocationSummary* locations = invoke->GetLocations();
2412
2413 CpuRegister base = locations->InAt(1).AsRegister<CpuRegister>();
2414 CpuRegister offset = locations->InAt(2).AsRegister<CpuRegister>();
2415 CpuRegister expected = locations->InAt(3).AsRegister<CpuRegister>();
2416 // Ensure `expected` is in RAX (required by the CMPXCHG instruction).
2417 DCHECK_EQ(expected.AsRegister(), RAX);
2418 CpuRegister value = locations->InAt(4).AsRegister<CpuRegister>();
2419 Location out_loc = locations->Out();
2420 CpuRegister out = out_loc.AsRegister<CpuRegister>();
2421
2422 if (type == Primitive::kPrimNot) {
2423 // The only read barrier implementation supporting the
2424 // UnsafeCASObject intrinsic is the Baker-style read barriers.
2425 DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier);
2426
2427 CpuRegister temp1 = locations->GetTemp(0).AsRegister<CpuRegister>();
2428 CpuRegister temp2 = locations->GetTemp(1).AsRegister<CpuRegister>();
2429
2430 // Mark card for object assuming new value is stored.
2431 bool value_can_be_null = true; // TODO: Worth finding out this information?
2432 codegen->MarkGCCard(temp1, temp2, base, value, value_can_be_null);
2433
2434 // The address of the field within the holding object.
2435 Address field_addr(base, offset, ScaleFactor::TIMES_1, 0);
2436
2437 if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
2438 // Need to make sure the reference stored in the field is a to-space
2439 // one before attempting the CAS or the CAS could fail incorrectly.
2440 codegen->GenerateReferenceLoadWithBakerReadBarrier(
2441 invoke,
2442 out_loc, // Unused, used only as a "temporary" within the read barrier.
2443 base,
2444 field_addr,
2445 /* needs_null_check */ false,
2446 /* always_update_field */ true,
2447 &temp1,
2448 &temp2);
2449 }
2450
2451 bool base_equals_value = (base.AsRegister() == value.AsRegister());
2452 Register value_reg = value.AsRegister();
2453 if (kPoisonHeapReferences) {
2454 if (base_equals_value) {
2455 // If `base` and `value` are the same register location, move
2456 // `value_reg` to a temporary register. This way, poisoning
2457 // `value_reg` won't invalidate `base`.
2458 value_reg = temp1.AsRegister();
2459 __ movl(CpuRegister(value_reg), base);
2460 }
2461
2462 // Check that the register allocator did not assign the location
2463 // of `expected` (RAX) to `value` nor to `base`, so that heap
2464 // poisoning (when enabled) works as intended below.
2465 // - If `value` were equal to `expected`, both references would
2466 // be poisoned twice, meaning they would not be poisoned at
2467 // all, as heap poisoning uses address negation.
2468 // - If `base` were equal to `expected`, poisoning `expected`
2469 // would invalidate `base`.
2470 DCHECK_NE(value_reg, expected.AsRegister());
2471 DCHECK_NE(base.AsRegister(), expected.AsRegister());
2472
2473 __ PoisonHeapReference(expected);
2474 __ PoisonHeapReference(CpuRegister(value_reg));
2475 }
2476
2477 __ LockCmpxchgl(field_addr, CpuRegister(value_reg));
2478
2479 // LOCK CMPXCHG has full barrier semantics, and we don't need
2480 // scheduling barriers at this time.
2481
2482 // Convert ZF into the Boolean result.
2483 __ setcc(kZero, out);
2484 __ movzxb(out, out);
2485
2486 // If heap poisoning is enabled, we need to unpoison the values
2487 // that were poisoned earlier.
2488 if (kPoisonHeapReferences) {
2489 if (base_equals_value) {
2490 // `value_reg` has been moved to a temporary register, no need
2491 // to unpoison it.
2492 } else {
2493 // Ensure `value` is different from `out`, so that unpoisoning
2494 // the former does not invalidate the latter.
2495 DCHECK_NE(value_reg, out.AsRegister());
2496 __ UnpoisonHeapReference(CpuRegister(value_reg));
2497 }
2498 // Ensure `expected` is different from `out`, so that unpoisoning
2499 // the former does not invalidate the latter.
2500 DCHECK_NE(expected.AsRegister(), out.AsRegister());
2501 __ UnpoisonHeapReference(expected);
2502 }
2503 } else {
2504 if (type == Primitive::kPrimInt) {
2505 __ LockCmpxchgl(Address(base, offset, TIMES_1, 0), value);
2506 } else if (type == Primitive::kPrimLong) {
2507 __ LockCmpxchgq(Address(base, offset, TIMES_1, 0), value);
2508 } else {
2509 LOG(FATAL) << "Unexpected CAS type " << type;
2510 }
2511
2512 // LOCK CMPXCHG has full barrier semantics, and we don't need
2513 // scheduling barriers at this time.
2514
2515 // Convert ZF into the Boolean result.
2516 __ setcc(kZero, out);
2517 __ movzxb(out, out);
2518 }
2519 }
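// Net effect (illustrative): compareAndSwapInt/Long/Object become a single `lock cmpxchg` on the
// field address, returning true iff the field still held `expected` (in RAX) and was replaced by
// `value`; ZF is materialized into the result with setcc/movzxb. LOCK CMPXCHG is a full barrier
// on x86-64, so no additional fences are emitted.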
2520
2521 void IntrinsicCodeGeneratorX86_64::VisitUnsafeCASInt(HInvoke* invoke) {
2522 GenCAS(Primitive::kPrimInt, invoke, codegen_);
2523 }
2524
2525 void IntrinsicCodeGeneratorX86_64::VisitUnsafeCASLong(HInvoke* invoke) {
2526 GenCAS(Primitive::kPrimLong, invoke, codegen_);
2527 }
2528
VisitUnsafeCASObject(HInvoke * invoke)2529 void IntrinsicCodeGeneratorX86_64::VisitUnsafeCASObject(HInvoke* invoke) {
2530 // The only read barrier implementation supporting the
2531 // UnsafeCASObject intrinsic is the Baker-style read barrier.
2532 DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier);
2533
2534 GenCAS(Primitive::kPrimNot, invoke, codegen_);
2535 }
2536
VisitIntegerReverse(HInvoke * invoke)2537 void IntrinsicLocationsBuilderX86_64::VisitIntegerReverse(HInvoke* invoke) {
2538 LocationSummary* locations = new (arena_) LocationSummary(invoke,
2539 LocationSummary::kNoCall,
2540 kIntrinsified);
2541 locations->SetInAt(0, Location::RequiresRegister());
2542 locations->SetOut(Location::SameAsFirstInput());
2543 locations->AddTemp(Location::RequiresRegister());
2544 }
2545
SwapBits(CpuRegister reg,CpuRegister temp,int32_t shift,int32_t mask,X86_64Assembler * assembler)2546 static void SwapBits(CpuRegister reg, CpuRegister temp, int32_t shift, int32_t mask,
2547 X86_64Assembler* assembler) {
2548 Immediate imm_shift(shift);
2549 Immediate imm_mask(mask);
2550 __ movl(temp, reg);
2551 __ shrl(reg, imm_shift);
2552 __ andl(temp, imm_mask);
2553 __ andl(reg, imm_mask);
2554 __ shll(temp, imm_shift);
2555 __ orl(reg, temp);
2556 }
2557
VisitIntegerReverse(HInvoke * invoke)2558 void IntrinsicCodeGeneratorX86_64::VisitIntegerReverse(HInvoke* invoke) {
2559 X86_64Assembler* assembler = GetAssembler();
2560 LocationSummary* locations = invoke->GetLocations();
2561
2562 CpuRegister reg = locations->InAt(0).AsRegister<CpuRegister>();
2563 CpuRegister temp = locations->GetTemp(0).AsRegister<CpuRegister>();
2564
2565 /*
2566  * Use one bswap instruction to reverse the byte order first, then 3 rounds of
2567  * bit swapping to reverse the bits in a number x. Using bswap saves instructions
2568  * compared to the generic luni implementation, which needs 5 rounds of bit swapping.
2569 * x = bswap x
2570 * x = (x & 0x55555555) << 1 | (x >> 1) & 0x55555555;
2571 * x = (x & 0x33333333) << 2 | (x >> 2) & 0x33333333;
2572 * x = (x & 0x0F0F0F0F) << 4 | (x >> 4) & 0x0F0F0F0F;
2573 */
2574 __ bswapl(reg);
2575 SwapBits(reg, temp, 1, 0x55555555, assembler);
2576 SwapBits(reg, temp, 2, 0x33333333, assembler);
2577 SwapBits(reg, temp, 4, 0x0f0f0f0f, assembler);
2578 }
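
// A standalone C++ sketch of the recipe used above (illustration only, not part of this
// file): reverse the byte order, then swap bits, bit pairs and nibbles within each byte.
//
//   #include <cstdint>
//
//   uint32_t ReverseBits32(uint32_t x) {
//     x = __builtin_bswap32(x);                                  // reverse byte order
//     x = ((x & 0x55555555u) << 1) | ((x >> 1) & 0x55555555u);   // swap adjacent bits
//     x = ((x & 0x33333333u) << 2) | ((x >> 2) & 0x33333333u);   // swap bit pairs
//     x = ((x & 0x0F0F0F0Fu) << 4) | ((x >> 4) & 0x0F0F0F0Fu);   // swap nibbles
//     return x;
//   }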
2579
VisitLongReverse(HInvoke * invoke)2580 void IntrinsicLocationsBuilderX86_64::VisitLongReverse(HInvoke* invoke) {
2581 LocationSummary* locations = new (arena_) LocationSummary(invoke,
2582 LocationSummary::kNoCall,
2583 kIntrinsified);
2584 locations->SetInAt(0, Location::RequiresRegister());
2585 locations->SetOut(Location::SameAsFirstInput());
2586 locations->AddTemp(Location::RequiresRegister());
2587 locations->AddTemp(Location::RequiresRegister());
2588 }
2589
SwapBits64(CpuRegister reg,CpuRegister temp,CpuRegister temp_mask,int32_t shift,int64_t mask,X86_64Assembler * assembler)2590 static void SwapBits64(CpuRegister reg, CpuRegister temp, CpuRegister temp_mask,
2591 int32_t shift, int64_t mask, X86_64Assembler* assembler) {
2592 Immediate imm_shift(shift);
2593 __ movq(temp_mask, Immediate(mask));
2594 __ movq(temp, reg);
2595 __ shrq(reg, imm_shift);
2596 __ andq(temp, temp_mask);
2597 __ andq(reg, temp_mask);
2598 __ shlq(temp, imm_shift);
2599 __ orq(reg, temp);
2600 }
2601
VisitLongReverse(HInvoke * invoke)2602 void IntrinsicCodeGeneratorX86_64::VisitLongReverse(HInvoke* invoke) {
2603 X86_64Assembler* assembler = GetAssembler();
2604 LocationSummary* locations = invoke->GetLocations();
2605
2606 CpuRegister reg = locations->InAt(0).AsRegister<CpuRegister>();
2607 CpuRegister temp1 = locations->GetTemp(0).AsRegister<CpuRegister>();
2608 CpuRegister temp2 = locations->GetTemp(1).AsRegister<CpuRegister>();
2609
2610 /*
2611  * Use one bswap instruction to reverse the byte order first, then 3 rounds of
2612  * bit swapping to reverse the bits in a long number x. Using bswap saves instructions
2613  * compared to the generic luni implementation, which needs 5 rounds of bit swapping.
2614 * x = bswap x
2615 * x = (x & 0x5555555555555555) << 1 | (x >> 1) & 0x5555555555555555;
2616 * x = (x & 0x3333333333333333) << 2 | (x >> 2) & 0x3333333333333333;
2617 * x = (x & 0x0F0F0F0F0F0F0F0F) << 4 | (x >> 4) & 0x0F0F0F0F0F0F0F0F;
2618 */
2619 __ bswapq(reg);
2620 SwapBits64(reg, temp1, temp2, 1, INT64_C(0x5555555555555555), assembler);
2621 SwapBits64(reg, temp1, temp2, 2, INT64_C(0x3333333333333333), assembler);
2622 SwapBits64(reg, temp1, temp2, 4, INT64_C(0x0f0f0f0f0f0f0f0f), assembler);
2623 }
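
// Note on the extra temporary: unlike the 32-bit path, the 64-bit masks cannot be encoded
// as instruction immediates (x86-64 AND only accepts a sign-extended 32-bit immediate), so
// SwapBits64 first materializes the mask in temp_mask with a 64-bit MOV. Each round then
// performs the same step as in the 32-bit sketch above:
//
//   x = ((x & mask) << shift) | ((x >> shift) & mask);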
2624
CreateBitCountLocations(ArenaAllocator * arena,CodeGeneratorX86_64 * codegen,HInvoke * invoke)2625 static void CreateBitCountLocations(
2626 ArenaAllocator* arena, CodeGeneratorX86_64* codegen, HInvoke* invoke) {
2627 if (!codegen->GetInstructionSetFeatures().HasPopCnt()) {
2628 // Do nothing if there is no popcnt support. This results in generating
2629 // a call for the intrinsic rather than direct code.
2630 return;
2631 }
2632 LocationSummary* locations = new (arena) LocationSummary(invoke,
2633 LocationSummary::kNoCall,
2634 kIntrinsified);
2635 locations->SetInAt(0, Location::Any());
2636 locations->SetOut(Location::RequiresRegister());
2637 }
2638
GenBitCount(X86_64Assembler * assembler,CodeGeneratorX86_64 * codegen,HInvoke * invoke,bool is_long)2639 static void GenBitCount(X86_64Assembler* assembler,
2640 CodeGeneratorX86_64* codegen,
2641 HInvoke* invoke,
2642 bool is_long) {
2643 LocationSummary* locations = invoke->GetLocations();
2644 Location src = locations->InAt(0);
2645 CpuRegister out = locations->Out().AsRegister<CpuRegister>();
2646
2647 if (invoke->InputAt(0)->IsConstant()) {
2648 // Evaluate this at compile time.
2649 int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
2650 int32_t result = is_long
2651 ? POPCOUNT(static_cast<uint64_t>(value))
2652 : POPCOUNT(static_cast<uint32_t>(value));
2653 codegen->Load32BitValue(out, result);
2654 return;
2655 }
2656
2657 if (src.IsRegister()) {
2658 if (is_long) {
2659 __ popcntq(out, src.AsRegister<CpuRegister>());
2660 } else {
2661 __ popcntl(out, src.AsRegister<CpuRegister>());
2662 }
2663 } else if (is_long) {
2664 DCHECK(src.IsDoubleStackSlot());
2665 __ popcntq(out, Address(CpuRegister(RSP), src.GetStackIndex()));
2666 } else {
2667 DCHECK(src.IsStackSlot());
2668 __ popcntl(out, Address(CpuRegister(RSP), src.GetStackIndex()));
2669 }
2670 }
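
// The constant path above folds the bit count at compile time; POPCOUNT comes from
// base/bit_utils.h (included at the top of this file). A rough C++ equivalent of the fold,
// with an illustrative helper name:
//
//   int32_t FoldBitCount(int64_t value, bool is_long) {
//     return is_long ? __builtin_popcountll(static_cast<uint64_t>(value))
//                    : __builtin_popcount(static_cast<uint32_t>(value));
//   }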
2671
VisitIntegerBitCount(HInvoke * invoke)2672 void IntrinsicLocationsBuilderX86_64::VisitIntegerBitCount(HInvoke* invoke) {
2673 CreateBitCountLocations(arena_, codegen_, invoke);
2674 }
2675
VisitIntegerBitCount(HInvoke * invoke)2676 void IntrinsicCodeGeneratorX86_64::VisitIntegerBitCount(HInvoke* invoke) {
2677 GenBitCount(GetAssembler(), codegen_, invoke, /* is_long */ false);
2678 }
2679
VisitLongBitCount(HInvoke * invoke)2680 void IntrinsicLocationsBuilderX86_64::VisitLongBitCount(HInvoke* invoke) {
2681 CreateBitCountLocations(arena_, codegen_, invoke);
2682 }
2683
VisitLongBitCount(HInvoke * invoke)2684 void IntrinsicCodeGeneratorX86_64::VisitLongBitCount(HInvoke* invoke) {
2685 GenBitCount(GetAssembler(), codegen_, invoke, /* is_long */ true);
2686 }
2687
CreateOneBitLocations(ArenaAllocator * arena,HInvoke * invoke,bool is_high)2688 static void CreateOneBitLocations(ArenaAllocator* arena, HInvoke* invoke, bool is_high) {
2689 LocationSummary* locations = new (arena) LocationSummary(invoke,
2690 LocationSummary::kNoCall,
2691 kIntrinsified);
2692 locations->SetInAt(0, Location::Any());
2693 locations->SetOut(Location::RequiresRegister());
2694 locations->AddTemp(is_high ? Location::RegisterLocation(RCX) // needs CL
2695 : Location::RequiresRegister()); // any will do
2696 }
2697
GenOneBit(X86_64Assembler * assembler,CodeGeneratorX86_64 * codegen,HInvoke * invoke,bool is_high,bool is_long)2698 static void GenOneBit(X86_64Assembler* assembler,
2699 CodeGeneratorX86_64* codegen,
2700 HInvoke* invoke,
2701 bool is_high, bool is_long) {
2702 LocationSummary* locations = invoke->GetLocations();
2703 Location src = locations->InAt(0);
2704 CpuRegister out = locations->Out().AsRegister<CpuRegister>();
2705
2706 if (invoke->InputAt(0)->IsConstant()) {
2707 // Evaluate this at compile time.
2708 int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
2709 if (value == 0) {
2710 __ xorl(out, out); // Clears upper bits too.
2711 return;
2712 }
2713 // Nonzero value.
2714 if (is_high) {
2715 value = is_long ? 63 - CLZ(static_cast<uint64_t>(value))
2716 : 31 - CLZ(static_cast<uint32_t>(value));
2717 } else {
2718 value = is_long ? CTZ(static_cast<uint64_t>(value))
2719 : CTZ(static_cast<uint32_t>(value));
2720 }
2721 if (is_long) {
2722 codegen->Load64BitValue(out, 1ULL << value);
2723 } else {
2724 codegen->Load32BitValue(out, 1 << value);
2725 }
2726 return;
2727 }
2728
2729 // Handle the non-constant cases.
2730 CpuRegister tmp = locations->GetTemp(0).AsRegister<CpuRegister>();
2731 if (is_high) {
2732 // Use architectural support: basically 1 << bsr.
2733 if (src.IsRegister()) {
2734 if (is_long) {
2735 __ bsrq(tmp, src.AsRegister<CpuRegister>());
2736 } else {
2737 __ bsrl(tmp, src.AsRegister<CpuRegister>());
2738 }
2739 } else if (is_long) {
2740 DCHECK(src.IsDoubleStackSlot());
2741 __ bsrq(tmp, Address(CpuRegister(RSP), src.GetStackIndex()));
2742 } else {
2743 DCHECK(src.IsStackSlot());
2744 __ bsrl(tmp, Address(CpuRegister(RSP), src.GetStackIndex()));
2745 }
2746 // BSR sets ZF if the input was zero.
2747 NearLabel is_zero, done;
2748 __ j(kEqual, &is_zero);
2749 __ movl(out, Immediate(1)); // Clears upper bits too.
2750 if (is_long) {
2751 __ shlq(out, tmp);
2752 } else {
2753 __ shll(out, tmp);
2754 }
2755 __ jmp(&done);
2756 __ Bind(&is_zero);
2757 __ xorl(out, out); // Clears upper bits too.
2758 __ Bind(&done);
2759 } else {
2760 // Copy input into temporary.
2761 if (src.IsRegister()) {
2762 if (is_long) {
2763 __ movq(tmp, src.AsRegister<CpuRegister>());
2764 } else {
2765 __ movl(tmp, src.AsRegister<CpuRegister>());
2766 }
2767 } else if (is_long) {
2768 DCHECK(src.IsDoubleStackSlot());
2769 __ movq(tmp, Address(CpuRegister(RSP), src.GetStackIndex()));
2770 } else {
2771 DCHECK(src.IsStackSlot());
2772 __ movl(tmp, Address(CpuRegister(RSP), src.GetStackIndex()));
2773 }
2774 // Do the bit twiddling: basically out = tmp & -tmp.
2775 if (is_long) {
2776 __ movq(out, tmp);
2777 __ negq(tmp);
2778 __ andq(out, tmp);
2779 } else {
2780 __ movl(out, tmp);
2781 __ negl(tmp);
2782 __ andl(out, tmp);
2783 }
2784 }
2785 }
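
// Both paths above compute the usual closed forms for highestOneBit/lowestOneBit.
// Illustrative C++ (not ART code), 32-bit case:
//
//   uint32_t HighestOneBit32(uint32_t x) {
//     return x == 0u ? 0u : 1u << (31 - __builtin_clz(x));  // i.e. 1 << bsr
//   }
//
//   uint32_t LowestOneBit32(uint32_t x) {
//     return x & (0u - x);  // x & -x isolates the lowest set bit
//   }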
2786
VisitIntegerHighestOneBit(HInvoke * invoke)2787 void IntrinsicLocationsBuilderX86_64::VisitIntegerHighestOneBit(HInvoke* invoke) {
2788 CreateOneBitLocations(arena_, invoke, /* is_high */ true);
2789 }
2790
VisitIntegerHighestOneBit(HInvoke * invoke)2791 void IntrinsicCodeGeneratorX86_64::VisitIntegerHighestOneBit(HInvoke* invoke) {
2792 GenOneBit(GetAssembler(), codegen_, invoke, /* is_high */ true, /* is_long */ false);
2793 }
2794
VisitLongHighestOneBit(HInvoke * invoke)2795 void IntrinsicLocationsBuilderX86_64::VisitLongHighestOneBit(HInvoke* invoke) {
2796 CreateOneBitLocations(arena_, invoke, /* is_high */ true);
2797 }
2798
VisitLongHighestOneBit(HInvoke * invoke)2799 void IntrinsicCodeGeneratorX86_64::VisitLongHighestOneBit(HInvoke* invoke) {
2800 GenOneBit(GetAssembler(), codegen_, invoke, /* is_high */ true, /* is_long */ true);
2801 }
2802
VisitIntegerLowestOneBit(HInvoke * invoke)2803 void IntrinsicLocationsBuilderX86_64::VisitIntegerLowestOneBit(HInvoke* invoke) {
2804 CreateOneBitLocations(arena_, invoke, /* is_high */ false);
2805 }
2806
VisitIntegerLowestOneBit(HInvoke * invoke)2807 void IntrinsicCodeGeneratorX86_64::VisitIntegerLowestOneBit(HInvoke* invoke) {
2808 GenOneBit(GetAssembler(), codegen_, invoke, /* is_high */ false, /* is_long */ false);
2809 }
2810
VisitLongLowestOneBit(HInvoke * invoke)2811 void IntrinsicLocationsBuilderX86_64::VisitLongLowestOneBit(HInvoke* invoke) {
2812 CreateOneBitLocations(arena_, invoke, /* is_high */ false);
2813 }
2814
VisitLongLowestOneBit(HInvoke * invoke)2815 void IntrinsicCodeGeneratorX86_64::VisitLongLowestOneBit(HInvoke* invoke) {
2816 GenOneBit(GetAssembler(), codegen_, invoke, /* is_high */ false, /* is_long */ true);
2817 }
2818
CreateLeadingZeroLocations(ArenaAllocator * arena,HInvoke * invoke)2819 static void CreateLeadingZeroLocations(ArenaAllocator* arena, HInvoke* invoke) {
2820 LocationSummary* locations = new (arena) LocationSummary(invoke,
2821 LocationSummary::kNoCall,
2822 kIntrinsified);
2823 locations->SetInAt(0, Location::Any());
2824 locations->SetOut(Location::RequiresRegister());
2825 }
2826
GenLeadingZeros(X86_64Assembler * assembler,CodeGeneratorX86_64 * codegen,HInvoke * invoke,bool is_long)2827 static void GenLeadingZeros(X86_64Assembler* assembler,
2828 CodeGeneratorX86_64* codegen,
2829 HInvoke* invoke, bool is_long) {
2830 LocationSummary* locations = invoke->GetLocations();
2831 Location src = locations->InAt(0);
2832 CpuRegister out = locations->Out().AsRegister<CpuRegister>();
2833
2834 int zero_value_result = is_long ? 64 : 32;
2835 if (invoke->InputAt(0)->IsConstant()) {
2836 // Evaluate this at compile time.
2837 int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
2838 if (value == 0) {
2839 value = zero_value_result;
2840 } else {
2841 value = is_long ? CLZ(static_cast<uint64_t>(value)) : CLZ(static_cast<uint32_t>(value));
2842 }
2843 codegen->Load32BitValue(out, value);
2844 return;
2845 }
2846
2847 // Handle the non-constant cases.
2848 if (src.IsRegister()) {
2849 if (is_long) {
2850 __ bsrq(out, src.AsRegister<CpuRegister>());
2851 } else {
2852 __ bsrl(out, src.AsRegister<CpuRegister>());
2853 }
2854 } else if (is_long) {
2855 DCHECK(src.IsDoubleStackSlot());
2856 __ bsrq(out, Address(CpuRegister(RSP), src.GetStackIndex()));
2857 } else {
2858 DCHECK(src.IsStackSlot());
2859 __ bsrl(out, Address(CpuRegister(RSP), src.GetStackIndex()));
2860 }
2861
2862 // BSR sets ZF if the input was zero; in that case the destination register is undefined.
2863 NearLabel is_zero, done;
2864 __ j(kEqual, &is_zero);
2865
2866 // Correct the result from BSR to get the CLZ result.
2867 __ xorl(out, Immediate(zero_value_result - 1));
2868 __ jmp(&done);
2869
2870 // Fix the zero case with the expected result.
2871 __ Bind(&is_zero);
2872 __ movl(out, Immediate(zero_value_result));
2873
2874 __ Bind(&done);
2875 }
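
// The XOR above relies on the identity 31 - i == 31 ^ i (respectively 63 - i == 63 ^ i)
// for i in [0, 31] (resp. [0, 63]), since 31 and 63 are all-ones masks. Sketch of the
// scalar computation (illustration only):
//
//   int Clz32(uint32_t x) {
//     if (x == 0u) return 32;            // the is_zero fix-up
//     int bsr = 31 - __builtin_clz(x);   // index of the highest set bit, as BSR reports it
//     return bsr ^ 31;                   // equals 31 - bsr, i.e. the CLZ result
//   }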
2876
VisitIntegerNumberOfLeadingZeros(HInvoke * invoke)2877 void IntrinsicLocationsBuilderX86_64::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) {
2878 CreateLeadingZeroLocations(arena_, invoke);
2879 }
2880
VisitIntegerNumberOfLeadingZeros(HInvoke * invoke)2881 void IntrinsicCodeGeneratorX86_64::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) {
2882 GenLeadingZeros(GetAssembler(), codegen_, invoke, /* is_long */ false);
2883 }
2884
VisitLongNumberOfLeadingZeros(HInvoke * invoke)2885 void IntrinsicLocationsBuilderX86_64::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
2886 CreateLeadingZeroLocations(arena_, invoke);
2887 }
2888
VisitLongNumberOfLeadingZeros(HInvoke * invoke)2889 void IntrinsicCodeGeneratorX86_64::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
2890 GenLeadingZeros(GetAssembler(), codegen_, invoke, /* is_long */ true);
2891 }
2892
CreateTrailingZeroLocations(ArenaAllocator * arena,HInvoke * invoke)2893 static void CreateTrailingZeroLocations(ArenaAllocator* arena, HInvoke* invoke) {
2894 LocationSummary* locations = new (arena) LocationSummary(invoke,
2895 LocationSummary::kNoCall,
2896 kIntrinsified);
2897 locations->SetInAt(0, Location::Any());
2898 locations->SetOut(Location::RequiresRegister());
2899 }
2900
GenTrailingZeros(X86_64Assembler * assembler,CodeGeneratorX86_64 * codegen,HInvoke * invoke,bool is_long)2901 static void GenTrailingZeros(X86_64Assembler* assembler,
2902 CodeGeneratorX86_64* codegen,
2903 HInvoke* invoke, bool is_long) {
2904 LocationSummary* locations = invoke->GetLocations();
2905 Location src = locations->InAt(0);
2906 CpuRegister out = locations->Out().AsRegister<CpuRegister>();
2907
2908 int zero_value_result = is_long ? 64 : 32;
2909 if (invoke->InputAt(0)->IsConstant()) {
2910 // Evaluate this at compile time.
2911 int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
2912 if (value == 0) {
2913 value = zero_value_result;
2914 } else {
2915 value = is_long ? CTZ(static_cast<uint64_t>(value)) : CTZ(static_cast<uint32_t>(value));
2916 }
2917 codegen->Load32BitValue(out, value);
2918 return;
2919 }
2920
2921 // Handle the non-constant cases.
2922 if (src.IsRegister()) {
2923 if (is_long) {
2924 __ bsfq(out, src.AsRegister<CpuRegister>());
2925 } else {
2926 __ bsfl(out, src.AsRegister<CpuRegister>());
2927 }
2928 } else if (is_long) {
2929 DCHECK(src.IsDoubleStackSlot());
2930 __ bsfq(out, Address(CpuRegister(RSP), src.GetStackIndex()));
2931 } else {
2932 DCHECK(src.IsStackSlot());
2933 __ bsfl(out, Address(CpuRegister(RSP), src.GetStackIndex()));
2934 }
2935
2936 // BSF sets ZF if the input was zero; in that case the destination register is undefined.
2937 NearLabel done;
2938 __ j(kNotEqual, &done);
2939
2940 // Fix the zero case with the expected result.
2941 __ movl(out, Immediate(zero_value_result));
2942
2943 __ Bind(&done);
2944 }
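
// Unlike BSR, BSF needs no post-correction: for a nonzero input it already yields the
// number of trailing zeros, so only the zero case is patched. Illustrative scalar form:
//
//   int Ctz32(uint32_t x) {
//     return x == 0u ? 32 : __builtin_ctz(x);  // __builtin_ctz matches BSF for nonzero x
//   }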
2945
VisitIntegerNumberOfTrailingZeros(HInvoke * invoke)2946 void IntrinsicLocationsBuilderX86_64::VisitIntegerNumberOfTrailingZeros(HInvoke* invoke) {
2947 CreateTrailingZeroLocations(arena_, invoke);
2948 }
2949
VisitIntegerNumberOfTrailingZeros(HInvoke * invoke)2950 void IntrinsicCodeGeneratorX86_64::VisitIntegerNumberOfTrailingZeros(HInvoke* invoke) {
2951 GenTrailingZeros(GetAssembler(), codegen_, invoke, /* is_long */ false);
2952 }
2953
VisitLongNumberOfTrailingZeros(HInvoke * invoke)2954 void IntrinsicLocationsBuilderX86_64::VisitLongNumberOfTrailingZeros(HInvoke* invoke) {
2955 CreateTrailingZeroLocations(arena_, invoke);
2956 }
2957
VisitLongNumberOfTrailingZeros(HInvoke * invoke)2958 void IntrinsicCodeGeneratorX86_64::VisitLongNumberOfTrailingZeros(HInvoke* invoke) {
2959 GenTrailingZeros(GetAssembler(), codegen_, invoke, /* is_long */ true);
2960 }
2961
VisitReferenceGetReferent(HInvoke * invoke)2962 void IntrinsicLocationsBuilderX86_64::VisitReferenceGetReferent(HInvoke* invoke) {
2963 if (kEmitCompilerReadBarrier) {
2964 // Do not intrinsify this call with the read barrier configuration.
2965 return;
2966 }
2967 LocationSummary* locations = new (arena_) LocationSummary(invoke,
2968 LocationSummary::kCallOnSlowPath,
2969 kIntrinsified);
2970 locations->SetInAt(0, Location::RequiresRegister());
2971 locations->SetOut(Location::SameAsFirstInput());
2972 locations->AddTemp(Location::RequiresRegister());
2973 }
2974
VisitReferenceGetReferent(HInvoke * invoke)2975 void IntrinsicCodeGeneratorX86_64::VisitReferenceGetReferent(HInvoke* invoke) {
2976 DCHECK(!kEmitCompilerReadBarrier);
2977 LocationSummary* locations = invoke->GetLocations();
2978 X86_64Assembler* assembler = GetAssembler();
2979
2980 CpuRegister obj = locations->InAt(0).AsRegister<CpuRegister>();
2981 CpuRegister out = locations->Out().AsRegister<CpuRegister>();
2982
2983 SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke);
2984 codegen_->AddSlowPath(slow_path);
2985
2986 // Load ArtMethod first.
2987 HInvokeStaticOrDirect* invoke_direct = invoke->AsInvokeStaticOrDirect();
2988 DCHECK(invoke_direct != nullptr);
2989 Location temp_loc = codegen_->GenerateCalleeMethodStaticOrDirectCall(
2990 invoke_direct, locations->GetTemp(0));
2991 DCHECK(temp_loc.Equals(locations->GetTemp(0)));
2992 CpuRegister temp = temp_loc.AsRegister<CpuRegister>();
2993
2994 // Now get declaring class.
2995 __ movl(temp, Address(temp, ArtMethod::DeclaringClassOffset().Int32Value()));
2996
2997 uint32_t slow_path_flag_offset = codegen_->GetReferenceSlowFlagOffset();
2998 uint32_t disable_flag_offset = codegen_->GetReferenceDisableFlagOffset();
2999 DCHECK_NE(slow_path_flag_offset, 0u);
3000 DCHECK_NE(disable_flag_offset, 0u);
3001 DCHECK_NE(slow_path_flag_offset, disable_flag_offset);
3002
3003 // Check the static flags that prevent us from using the intrinsic.
3004 if (slow_path_flag_offset == disable_flag_offset + 1) {
3005 __ cmpw(Address(temp, disable_flag_offset), Immediate(0));
3006 __ j(kNotEqual, slow_path->GetEntryLabel());
3007 } else {
3008 __ cmpb(Address(temp, disable_flag_offset), Immediate(0));
3009 __ j(kNotEqual, slow_path->GetEntryLabel());
3010 __ cmpb(Address(temp, slow_path_flag_offset), Immediate(0));
3011 __ j(kNotEqual, slow_path->GetEntryLabel());
3012 }
3013
3014 // Fast path.
3015 __ movl(out, Address(obj, mirror::Reference::ReferentOffset().Int32Value()));
3016 codegen_->MaybeRecordImplicitNullCheck(invoke);
3017 __ MaybeUnpoisonHeapReference(out);
3018 __ Bind(slow_path->GetExitLabel());
3019 }
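
// When the two byte-sized class flags are adjacent, the single 16-bit compare above tests
// both at once: on little-endian x86-64 the word at disable_flag_offset spans the disable
// byte (low) and the slow-path byte (high), and is zero iff both flags are clear.
// Conceptual C++ sketch (illustration only; names are assumed):
//
//   bool MustTakeSlowPath(const uint8_t* klass, uint32_t disable_off, uint32_t slow_off) {
//     return klass[disable_off] != 0u || klass[slow_off] != 0u;  // what the cmpw fuses
//   }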
3020
VisitIntegerValueOf(HInvoke * invoke)3021 void IntrinsicLocationsBuilderX86_64::VisitIntegerValueOf(HInvoke* invoke) {
3022 InvokeRuntimeCallingConvention calling_convention;
3023 IntrinsicVisitor::ComputeIntegerValueOfLocations(
3024 invoke,
3025 codegen_,
3026 Location::RegisterLocation(RAX),
3027 Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
3028 }
3029
VisitIntegerValueOf(HInvoke * invoke)3030 void IntrinsicCodeGeneratorX86_64::VisitIntegerValueOf(HInvoke* invoke) {
3031 IntrinsicVisitor::IntegerValueOfInfo info = IntrinsicVisitor::ComputeIntegerValueOfInfo();
3032 LocationSummary* locations = invoke->GetLocations();
3033 X86_64Assembler* assembler = GetAssembler();
3034
3035 CpuRegister out = locations->Out().AsRegister<CpuRegister>();
3036 InvokeRuntimeCallingConvention calling_convention;
3037 if (invoke->InputAt(0)->IsConstant()) {
3038 int32_t value = invoke->InputAt(0)->AsIntConstant()->GetValue();
3039 if (value >= info.low && value <= info.high) {
3040 // Just embed the j.l.Integer in the code.
3041 ScopedObjectAccess soa(Thread::Current());
3042 mirror::Object* boxed = info.cache->Get(value + (-info.low));
3043 DCHECK(boxed != nullptr && Runtime::Current()->GetHeap()->ObjectIsInBootImageSpace(boxed));
3044 uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(boxed));
3045 __ movl(out, Immediate(static_cast<int32_t>(address)));
3046 } else {
3047 // Allocate and initialize a new j.l.Integer.
3048 // TODO: If we JIT, we could allocate the j.l.Integer now, and store it in the
3049 // JIT object table.
3050 CpuRegister argument = CpuRegister(calling_convention.GetRegisterAt(0));
3051 uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.integer));
3052 __ movl(argument, Immediate(static_cast<int32_t>(address)));
3053 codegen_->InvokeRuntime(kQuickAllocObjectInitialized, invoke, invoke->GetDexPc());
3054 CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>();
3055 __ movl(Address(out, info.value_offset), Immediate(value));
3056 }
3057 } else {
3058 CpuRegister in = locations->InAt(0).AsRegister<CpuRegister>();
3059 // Check bounds of our cache.
3060 __ leal(out, Address(in, -info.low));
3061 __ cmpl(out, Immediate(info.high - info.low + 1));
3062 NearLabel allocate, done;
3063 __ j(kAboveEqual, &allocate);
3064 // If the value is within the bounds, load the j.l.Integer directly from the array.
3065 uint32_t data_offset = mirror::Array::DataOffset(kHeapReferenceSize).Uint32Value();
3066 uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.cache));
3067 if (data_offset + address <= std::numeric_limits<int32_t>::max()) {
3068 __ movl(out, Address(out, TIMES_4, data_offset + address));
3069 } else {
3070 CpuRegister temp = CpuRegister(calling_convention.GetRegisterAt(0));
3071 __ movl(temp, Immediate(static_cast<int32_t>(data_offset + address)));
3072 __ movl(out, Address(temp, out, TIMES_4, 0));
3073 }
3074 __ MaybeUnpoisonHeapReference(out);
3075 __ jmp(&done);
3076 __ Bind(&allocate);
3077 // Otherwise allocate and initialize a new j.l.Integer.
3078 CpuRegister argument = CpuRegister(calling_convention.GetRegisterAt(0));
3079 address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.integer));
3080 __ movl(argument, Immediate(static_cast<int32_t>(address)));
3081 codegen_->InvokeRuntime(kQuickAllocObjectInitialized, invoke, invoke->GetDexPc());
3082 CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>();
3083 __ movl(Address(out, info.value_offset), in);
3084 __ Bind(&done);
3085 }
3086 }
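
// The bounds check above uses the standard unsigned-compare trick: after out = in - low,
// one unsigned comparison against the cache length covers both in < low (which wraps to a
// large unsigned value) and in > high. Conceptual C++ sketch (illustration only):
//
//   bool IsInIntegerCache(int32_t in, int32_t low, int32_t high) {
//     uint32_t index = static_cast<uint32_t>(in) - static_cast<uint32_t>(low);
//     return index < static_cast<uint32_t>(high - low + 1);
//   }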
3087
3088 UNIMPLEMENTED_INTRINSIC(X86_64, FloatIsInfinite)
3089 UNIMPLEMENTED_INTRINSIC(X86_64, DoubleIsInfinite)
3090
3091 UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOf);
3092 UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOfAfter);
3093 UNIMPLEMENTED_INTRINSIC(X86_64, StringBufferAppend);
3094 UNIMPLEMENTED_INTRINSIC(X86_64, StringBufferLength);
3095 UNIMPLEMENTED_INTRINSIC(X86_64, StringBufferToString);
3096 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderAppend);
3097 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderLength);
3098 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderToString);
3099
3100 // 1.8.
3101 UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndAddInt)
3102 UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndAddLong)
3103 UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndSetInt)
3104 UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndSetLong)
3105 UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndSetObject)
3106
3107 UNREACHABLE_INTRINSICS(X86_64)
3108
3109 #undef __
3110
3111 } // namespace x86_64
3112 } // namespace art
3113