/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "intrinsics_arm64.h"

#include "arch/arm64/callee_save_frame_arm64.h"
#include "arch/arm64/instruction_set_features_arm64.h"
#include "art_method.h"
#include "base/bit_utils.h"
#include "code_generator_arm64.h"
#include "common_arm64.h"
#include "data_type-inl.h"
#include "entrypoints/quick/quick_entrypoints.h"
#include "heap_poisoning.h"
#include "intrinsic_objects.h"
#include "intrinsics.h"
#include "intrinsics_utils.h"
#include "lock_word.h"
#include "mirror/array-inl.h"
#include "mirror/object_array-inl.h"
#include "mirror/reference.h"
#include "mirror/string-inl.h"
#include "mirror/var_handle.h"
#include "scoped_thread_state_change-inl.h"
#include "thread-current-inl.h"
#include "utils/arm64/assembler_arm64.h"
#include "well_known_classes.h"

using namespace vixl::aarch64;  // NOLINT(build/namespaces)

// TODO(VIXL): Make VIXL compile with -Wshadow.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wshadow"
#include "aarch64/disasm-aarch64.h"
#include "aarch64/macro-assembler-aarch64.h"
#pragma GCC diagnostic pop

namespace art HIDDEN {

namespace arm64 {

using helpers::CPURegisterFrom;
using helpers::DRegisterFrom;
using helpers::HeapOperand;
using helpers::LocationFrom;
using helpers::InputCPURegisterAt;
using helpers::InputCPURegisterOrZeroRegAt;
using helpers::OperandFrom;
using helpers::RegisterFrom;
using helpers::SRegisterFrom;
using helpers::WRegisterFrom;
using helpers::XRegisterFrom;
using helpers::HRegisterFrom;
using helpers::InputRegisterAt;
using helpers::OutputRegister;

namespace {

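// Builds a MemOperand that addresses a raw 64-bit native address held in `location`.
// Used by the Memory.peekX/pokeX intrinsics below, which dereference the address directly.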
ALWAYS_INLINE inline MemOperand AbsoluteHeapOperandFrom(Location location, size_t offset = 0) {
  return MemOperand(XRegisterFrom(location), offset);
}

}  // namespace

MacroAssembler* IntrinsicCodeGeneratorARM64::GetVIXLAssembler() {
  return codegen_->GetVIXLAssembler();
}

ArenaAllocator* IntrinsicCodeGeneratorARM64::GetAllocator() {
  return codegen_->GetGraph()->GetAllocator();
}

using IntrinsicSlowPathARM64 = IntrinsicSlowPath<InvokeDexCallingConventionVisitorARM64,
                                                 SlowPathCodeARM64,
                                                 Arm64Assembler>;

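// Within the slow path below, `__` expands to the VIXL macro assembler of the
// code generator passed to EmitNativeCode().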
#define __ codegen->GetVIXLAssembler()->

// Slow path implementing the SystemArrayCopy intrinsic copy loop with read barriers.
class ReadBarrierSystemArrayCopySlowPathARM64 : public SlowPathCodeARM64 {
 public:
  ReadBarrierSystemArrayCopySlowPathARM64(HInstruction* instruction, Location tmp)
      : SlowPathCodeARM64(instruction), tmp_(tmp) {
  }

  void EmitNativeCode(CodeGenerator* codegen_in) override {
    DCHECK(codegen_in->EmitBakerReadBarrier());
    CodeGeneratorARM64* codegen = down_cast<CodeGeneratorARM64*>(codegen_in);
    LocationSummary* locations = instruction_->GetLocations();
    DCHECK(locations->CanCall());
    DCHECK(instruction_->IsInvokeStaticOrDirect())
        << "Unexpected instruction in read barrier arraycopy slow path: "
        << instruction_->DebugName();
    DCHECK(instruction_->GetLocations()->Intrinsified());
    DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kSystemArrayCopy);

    const int32_t element_size = DataType::Size(DataType::Type::kReference);

    Register src_curr_addr = XRegisterFrom(locations->GetTemp(0));
    Register dst_curr_addr = XRegisterFrom(locations->GetTemp(1));
    Register src_stop_addr = XRegisterFrom(locations->GetTemp(2));
    Register tmp_reg = WRegisterFrom(tmp_);

    __ Bind(GetEntryLabel());
    // The source range and destination pointer were initialized before entering the slow-path.
    vixl::aarch64::Label slow_copy_loop;
    __ Bind(&slow_copy_loop);
    __ Ldr(tmp_reg, MemOperand(src_curr_addr, element_size, PostIndex));
    codegen->GetAssembler()->MaybeUnpoisonHeapReference(tmp_reg);
    // TODO: Inline the mark bit check before calling the runtime?
    // tmp_reg = ReadBarrier::Mark(tmp_reg);
    // No need to save live registers; it's taken care of by the
    // entrypoint. Also, there is no need to update the stack mask,
    // as this runtime call will not trigger a garbage collection.
    // (See ReadBarrierMarkSlowPathARM64::EmitNativeCode for more
    // explanations.)
    DCHECK_NE(tmp_.reg(), LR);
    DCHECK_NE(tmp_.reg(), WSP);
    DCHECK_NE(tmp_.reg(), WZR);
    // IP0 is used internally by the ReadBarrierMarkRegX entry point
    // as a temporary (and not preserved).  It thus cannot be used by
    // any live register in this slow path.
    DCHECK_NE(LocationFrom(src_curr_addr).reg(), IP0);
    DCHECK_NE(LocationFrom(dst_curr_addr).reg(), IP0);
    DCHECK_NE(LocationFrom(src_stop_addr).reg(), IP0);
    DCHECK_NE(tmp_.reg(), IP0);
    DCHECK(0 <= tmp_.reg() && tmp_.reg() < kNumberOfWRegisters) << tmp_.reg();
    // TODO: Load the entrypoint once before the loop, instead of
    // loading it at every iteration.
    int32_t entry_point_offset =
        Thread::ReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(tmp_.reg());
    // This runtime call does not require a stack map.
    codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
    codegen->GetAssembler()->MaybePoisonHeapReference(tmp_reg);
    __ Str(tmp_reg, MemOperand(dst_curr_addr, element_size, PostIndex));
    __ Cmp(src_curr_addr, src_stop_addr);
    __ B(&slow_copy_loop, ne);
    __ B(GetExitLabel());
  }

  const char* GetDescription() const override { return "ReadBarrierSystemArrayCopySlowPathARM64"; }

 private:
  Location tmp_;

  DISALLOW_COPY_AND_ASSIGN(ReadBarrierSystemArrayCopySlowPathARM64);
};
#undef __

bool IntrinsicLocationsBuilderARM64::TryDispatch(HInvoke* invoke) {
  Dispatch(invoke);
  LocationSummary* res = invoke->GetLocations();
  if (res == nullptr) {
    return false;
  }
  return res->Intrinsified();
}

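// From here on, `__` expands to the `masm` parameter of the helper being defined.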
#define __ masm->

static void CreateFPToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresFpuRegister());
  locations->SetOut(Location::RequiresRegister());
}

static void CreateIntToFPLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetOut(Location::RequiresFpuRegister());
}

static void MoveFPToInt(LocationSummary* locations, bool is64bit, MacroAssembler* masm) {
  Location input = locations->InAt(0);
  Location output = locations->Out();
  __ Fmov(is64bit ? XRegisterFrom(output) : WRegisterFrom(output),
          is64bit ? DRegisterFrom(input) : SRegisterFrom(input));
}

static void MoveIntToFP(LocationSummary* locations, bool is64bit, MacroAssembler* masm) {
  Location input = locations->InAt(0);
  Location output = locations->Out();
  __ Fmov(is64bit ? DRegisterFrom(output) : SRegisterFrom(output),
          is64bit ? XRegisterFrom(input) : WRegisterFrom(input));
}

void IntrinsicLocationsBuilderARM64::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) {
  CreateFPToIntLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitDoubleLongBitsToDouble(HInvoke* invoke) {
  CreateIntToFPLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) {
  MoveFPToInt(invoke->GetLocations(), /* is64bit= */ true, GetVIXLAssembler());
}
void IntrinsicCodeGeneratorARM64::VisitDoubleLongBitsToDouble(HInvoke* invoke) {
  MoveIntToFP(invoke->GetLocations(), /* is64bit= */ true, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitFloatFloatToRawIntBits(HInvoke* invoke) {
  CreateFPToIntLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitFloatIntBitsToFloat(HInvoke* invoke) {
  CreateIntToFPLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitFloatFloatToRawIntBits(HInvoke* invoke) {
  MoveFPToInt(invoke->GetLocations(), /* is64bit= */ false, GetVIXLAssembler());
}
void IntrinsicCodeGeneratorARM64::VisitFloatIntBitsToFloat(HInvoke* invoke) {
  MoveIntToFP(invoke->GetLocations(), /* is64bit= */ false, GetVIXLAssembler());
}

static void CreateIntToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
}

static void CreateIntIntToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetInAt(1, Location::RequiresRegister());
  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
}

static void CreateIntIntToIntSlowPathCallLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kCallOnSlowPath, kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetInAt(1, Location::RequiresRegister());
  // Force kOutputOverlap; see comments in IntrinsicSlowPath::EmitNativeCode.
  locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
}

static void GenerateReverseBytes(MacroAssembler* masm,
                                 DataType::Type type,
                                 CPURegister in,
                                 CPURegister out) {
  switch (type) {
    case DataType::Type::kUint16:
      __ Rev16(out.W(), in.W());
      break;
    case DataType::Type::kInt16:
      __ Rev16(out.W(), in.W());
      __ Sxth(out.W(), out.W());
      break;
    case DataType::Type::kInt32:
      __ Rev(out.W(), in.W());
      break;
    case DataType::Type::kInt64:
      __ Rev(out.X(), in.X());
      break;
    case DataType::Type::kFloat32:
      __ Rev(in.W(), in.W());  // Note: Clobbers `in`.
      __ Fmov(out.S(), in.W());
      break;
    case DataType::Type::kFloat64:
      __ Rev(in.X(), in.X());  // Note: Clobbers `in`.
      __ Fmov(out.D(), in.X());
      break;
    default:
      LOG(FATAL) << "Unexpected type for reverse-bytes: " << type;
      UNREACHABLE();
  }
}

static void GenReverseBytes(LocationSummary* locations,
                            DataType::Type type,
                            MacroAssembler* masm) {
  Location in = locations->InAt(0);
  Location out = locations->Out();
  GenerateReverseBytes(masm, type, CPURegisterFrom(in, type), CPURegisterFrom(out, type));
}

void IntrinsicLocationsBuilderARM64::VisitIntegerReverseBytes(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitIntegerReverseBytes(HInvoke* invoke) {
  GenReverseBytes(invoke->GetLocations(), DataType::Type::kInt32, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitLongReverseBytes(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitLongReverseBytes(HInvoke* invoke) {
  GenReverseBytes(invoke->GetLocations(), DataType::Type::kInt64, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitShortReverseBytes(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitShortReverseBytes(HInvoke* invoke) {
  GenReverseBytes(invoke->GetLocations(), DataType::Type::kInt16, GetVIXLAssembler());
}

static void GenNumberOfLeadingZeros(LocationSummary* locations,
                                    DataType::Type type,
                                    MacroAssembler* masm) {
  DCHECK(type == DataType::Type::kInt32 || type == DataType::Type::kInt64);

  Location in = locations->InAt(0);
  Location out = locations->Out();

  __ Clz(RegisterFrom(out, type), RegisterFrom(in, type));
}

void IntrinsicLocationsBuilderARM64::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) {
  GenNumberOfLeadingZeros(invoke->GetLocations(), DataType::Type::kInt32, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
  GenNumberOfLeadingZeros(invoke->GetLocations(), DataType::Type::kInt64, GetVIXLAssembler());
}

static void GenNumberOfTrailingZeros(LocationSummary* locations,
                                     DataType::Type type,
                                     MacroAssembler* masm) {
  DCHECK(type == DataType::Type::kInt32 || type == DataType::Type::kInt64);

  Location in = locations->InAt(0);
  Location out = locations->Out();

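  // Count trailing zeros by reversing the bit order (RBIT) and then counting
  // leading zeros (CLZ) of the reversed value.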
  __ Rbit(RegisterFrom(out, type), RegisterFrom(in, type));
  __ Clz(RegisterFrom(out, type), RegisterFrom(out, type));
}

void IntrinsicLocationsBuilderARM64::VisitIntegerNumberOfTrailingZeros(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitIntegerNumberOfTrailingZeros(HInvoke* invoke) {
  GenNumberOfTrailingZeros(invoke->GetLocations(), DataType::Type::kInt32, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitLongNumberOfTrailingZeros(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitLongNumberOfTrailingZeros(HInvoke* invoke) {
  GenNumberOfTrailingZeros(invoke->GetLocations(), DataType::Type::kInt64, GetVIXLAssembler());
}

static void GenReverse(LocationSummary* locations,
                       DataType::Type type,
                       MacroAssembler* masm) {
  DCHECK(type == DataType::Type::kInt32 || type == DataType::Type::kInt64);

  Location in = locations->InAt(0);
  Location out = locations->Out();

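  // RBIT reverses the bit order of the whole register, which is exactly
  // Integer.reverse()/Long.reverse().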
  __ Rbit(RegisterFrom(out, type), RegisterFrom(in, type));
}

void IntrinsicLocationsBuilderARM64::VisitIntegerReverse(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitIntegerReverse(HInvoke* invoke) {
  GenReverse(invoke->GetLocations(), DataType::Type::kInt32, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitLongReverse(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitLongReverse(HInvoke* invoke) {
  GenReverse(invoke->GetLocations(), DataType::Type::kInt64, GetVIXLAssembler());
}

static void GenBitCount(HInvoke* instr, DataType::Type type, MacroAssembler* masm) {
  DCHECK(DataType::IsIntOrLongType(type)) << type;
  DCHECK_EQ(instr->GetType(), DataType::Type::kInt32);
  DCHECK_EQ(DataType::Kind(instr->InputAt(0)->GetType()), type);

  UseScratchRegisterScope temps(masm);

  Register src = InputRegisterAt(instr, 0);
  Register dst = RegisterFrom(instr->GetLocations()->Out(), type);
  VRegister fpr = (type == DataType::Type::kInt64) ? temps.AcquireD() : temps.AcquireS();

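  // The population count is computed in a SIMD register: CNT counts the bits in
  // each byte, ADDV sums the byte lanes, and FMOV moves the value in and out of
  // the vector unit.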
  __ Fmov(fpr, src);
  __ Cnt(fpr.V8B(), fpr.V8B());
  __ Addv(fpr.B(), fpr.V8B());
  __ Fmov(dst, fpr);
}

void IntrinsicLocationsBuilderARM64::VisitLongBitCount(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitLongBitCount(HInvoke* invoke) {
  GenBitCount(invoke, DataType::Type::kInt64, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitIntegerBitCount(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitIntegerBitCount(HInvoke* invoke) {
  GenBitCount(invoke, DataType::Type::kInt32, GetVIXLAssembler());
}

static void GenHighestOneBit(HInvoke* invoke, DataType::Type type, MacroAssembler* masm) {
  DCHECK(type == DataType::Type::kInt32 || type == DataType::Type::kInt64);

  UseScratchRegisterScope temps(masm);

  Register src = InputRegisterAt(invoke, 0);
  Register dst = RegisterFrom(invoke->GetLocations()->Out(), type);
  Register temp = (type == DataType::Type::kInt64) ? temps.AcquireX() : temps.AcquireW();
  size_t high_bit = (type == DataType::Type::kInt64) ? 63u : 31u;
  size_t clz_high_bit = (type == DataType::Type::kInt64) ? 6u : 5u;

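  // highestOneBit(src) == (1 << high_bit) >> CLZ(src) for src != 0, e.g.
  // 0b0011'0100 -> 0b0010'0000. For src == 0, CLZ yields 32/64; its single set
  // bit, shifted up by `high_bit - clz_high_bit`, lands on the top bit, so the
  // BIC below clears the result and highestOneBit(0) == 0.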
  __ Clz(temp, src);
  __ Mov(dst, UINT64_C(1) << high_bit);  // MOV (bitmask immediate)
  __ Bic(dst, dst, Operand(temp, LSL, high_bit - clz_high_bit));  // Clear dst if src was 0.
  __ Lsr(dst, dst, temp);
}

void IntrinsicLocationsBuilderARM64::VisitIntegerHighestOneBit(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitIntegerHighestOneBit(HInvoke* invoke) {
  GenHighestOneBit(invoke, DataType::Type::kInt32, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitLongHighestOneBit(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitLongHighestOneBit(HInvoke* invoke) {
  GenHighestOneBit(invoke, DataType::Type::kInt64, GetVIXLAssembler());
}

static void GenLowestOneBit(HInvoke* invoke, DataType::Type type, MacroAssembler* masm) {
  DCHECK(type == DataType::Type::kInt32 || type == DataType::Type::kInt64);

  UseScratchRegisterScope temps(masm);

  Register src = InputRegisterAt(invoke, 0);
  Register dst = RegisterFrom(invoke->GetLocations()->Out(), type);
  Register temp = (type == DataType::Type::kInt64) ? temps.AcquireX() : temps.AcquireW();

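  // lowestOneBit(src) == src & -src: in two's complement, negation flips every
  // bit above the lowest set bit, so the AND isolates that bit (and yields 0
  // for src == 0).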
  __ Neg(temp, src);
  __ And(dst, temp, src);
}

void IntrinsicLocationsBuilderARM64::VisitIntegerLowestOneBit(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitIntegerLowestOneBit(HInvoke* invoke) {
  GenLowestOneBit(invoke, DataType::Type::kInt32, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitLongLowestOneBit(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitLongLowestOneBit(HInvoke* invoke) {
  GenLowestOneBit(invoke, DataType::Type::kInt64, GetVIXLAssembler());
}

static void CreateFPToFPLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresFpuRegister());
  locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
}

void IntrinsicLocationsBuilderARM64::VisitMathSqrt(HInvoke* invoke) {
  CreateFPToFPLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMathSqrt(HInvoke* invoke) {
  LocationSummary* locations = invoke->GetLocations();
  MacroAssembler* masm = GetVIXLAssembler();
  __ Fsqrt(DRegisterFrom(locations->Out()), DRegisterFrom(locations->InAt(0)));
}

void IntrinsicLocationsBuilderARM64::VisitMathCeil(HInvoke* invoke) {
  CreateFPToFPLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMathCeil(HInvoke* invoke) {
  LocationSummary* locations = invoke->GetLocations();
  MacroAssembler* masm = GetVIXLAssembler();
  __ Frintp(DRegisterFrom(locations->Out()), DRegisterFrom(locations->InAt(0)));
}

void IntrinsicLocationsBuilderARM64::VisitMathFloor(HInvoke* invoke) {
  CreateFPToFPLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMathFloor(HInvoke* invoke) {
  LocationSummary* locations = invoke->GetLocations();
  MacroAssembler* masm = GetVIXLAssembler();
  __ Frintm(DRegisterFrom(locations->Out()), DRegisterFrom(locations->InAt(0)));
}

void IntrinsicLocationsBuilderARM64::VisitMathRint(HInvoke* invoke) {
  CreateFPToFPLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMathRint(HInvoke* invoke) {
  LocationSummary* locations = invoke->GetLocations();
  MacroAssembler* masm = GetVIXLAssembler();
  __ Frintn(DRegisterFrom(locations->Out()), DRegisterFrom(locations->InAt(0)));
}

static void CreateFPToIntPlusFPTempLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresFpuRegister());
  locations->SetOut(Location::RequiresRegister());
  locations->AddTemp(Location::RequiresFpuRegister());
}

static void GenMathRound(HInvoke* invoke, bool is_double, vixl::aarch64::MacroAssembler* masm) {
  // Java 8 API definition for Math.round():
  // Return the closest long or int to the argument, with ties rounding to positive infinity.
  //
  // No single ARMv8 instruction implements the definition above.
  // We use FCVTAS because it has the closest semantics: it rounds to the
  // nearest integer with ties away from zero. For most inputs (positive
  // values, zero or NaN) that is already the correct answer; only negative
  // ties (-x.5) need extra handling afterwards.
  //
  // We did not choose FCVTPS because, although it rounds toward positive
  // infinity, it does not round to nearest: for example, FCVTPS(-1.9) = -1
  // and FCVTPS(1.1) = 2. Using it would require extra handling for most inputs.
  LocationSummary* l = invoke->GetLocations();
  VRegister in_reg = is_double ? DRegisterFrom(l->InAt(0)) : SRegisterFrom(l->InAt(0));
  VRegister tmp_fp = is_double ? DRegisterFrom(l->GetTemp(0)) : SRegisterFrom(l->GetTemp(0));
  Register out_reg = is_double ? XRegisterFrom(l->Out()) : WRegisterFrom(l->Out());
  vixl::aarch64::Label done;

  // Round to nearest integer, ties away from zero.
  __ Fcvtas(out_reg, in_reg);

  // For positive values, zero or NaN inputs, rounding is done.
  __ Tbz(out_reg, out_reg.GetSizeInBits() - 1, &done);

  // Handle input < 0 cases.
  // If input is negative but not a tie, previous result (round to nearest) is valid.
  // If input is a negative tie, out_reg += 1.
  __ Frinta(tmp_fp, in_reg);
  __ Fsub(tmp_fp, in_reg, tmp_fp);
  __ Fcmp(tmp_fp, 0.5);
  __ Cinc(out_reg, out_reg, eq);

  __ Bind(&done);
}

void IntrinsicLocationsBuilderARM64::VisitMathRoundDouble(HInvoke* invoke) {
  CreateFPToIntPlusFPTempLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMathRoundDouble(HInvoke* invoke) {
  GenMathRound(invoke, /* is_double= */ true, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitMathRoundFloat(HInvoke* invoke) {
  CreateFPToIntPlusFPTempLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMathRoundFloat(HInvoke* invoke) {
  GenMathRound(invoke, /* is_double= */ false, GetVIXLAssembler());
}

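// The Memory.peekX/pokeX intrinsics below receive a raw native address as their
// first argument; it is used directly as the memory operand.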
void IntrinsicLocationsBuilderARM64::VisitMemoryPeekByte(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMemoryPeekByte(HInvoke* invoke) {
  MacroAssembler* masm = GetVIXLAssembler();
  __ Ldrsb(WRegisterFrom(invoke->GetLocations()->Out()),
          AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
}

void IntrinsicLocationsBuilderARM64::VisitMemoryPeekIntNative(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMemoryPeekIntNative(HInvoke* invoke) {
  MacroAssembler* masm = GetVIXLAssembler();
  __ Ldr(WRegisterFrom(invoke->GetLocations()->Out()),
         AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
}

void IntrinsicLocationsBuilderARM64::VisitMemoryPeekLongNative(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMemoryPeekLongNative(HInvoke* invoke) {
  MacroAssembler* masm = GetVIXLAssembler();
  __ Ldr(XRegisterFrom(invoke->GetLocations()->Out()),
         AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
}

void IntrinsicLocationsBuilderARM64::VisitMemoryPeekShortNative(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMemoryPeekShortNative(HInvoke* invoke) {
  MacroAssembler* masm = GetVIXLAssembler();
  __ Ldrsh(WRegisterFrom(invoke->GetLocations()->Out()),
           AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
}

static void CreateIntIntToVoidLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetInAt(1, Location::RequiresRegister());
}

void IntrinsicLocationsBuilderARM64::VisitMemoryPokeByte(HInvoke* invoke) {
  CreateIntIntToVoidLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMemoryPokeByte(HInvoke* invoke) {
  MacroAssembler* masm = GetVIXLAssembler();
  __ Strb(WRegisterFrom(invoke->GetLocations()->InAt(1)),
          AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
}

void IntrinsicLocationsBuilderARM64::VisitMemoryPokeIntNative(HInvoke* invoke) {
  CreateIntIntToVoidLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMemoryPokeIntNative(HInvoke* invoke) {
  MacroAssembler* masm = GetVIXLAssembler();
  __ Str(WRegisterFrom(invoke->GetLocations()->InAt(1)),
         AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
}

void IntrinsicLocationsBuilderARM64::VisitMemoryPokeLongNative(HInvoke* invoke) {
  CreateIntIntToVoidLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMemoryPokeLongNative(HInvoke* invoke) {
  MacroAssembler* masm = GetVIXLAssembler();
  __ Str(XRegisterFrom(invoke->GetLocations()->InAt(1)),
         AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
}

void IntrinsicLocationsBuilderARM64::VisitMemoryPokeShortNative(HInvoke* invoke) {
  CreateIntIntToVoidLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMemoryPokeShortNative(HInvoke* invoke) {
  MacroAssembler* masm = GetVIXLAssembler();
  __ Strh(WRegisterFrom(invoke->GetLocations()->InAt(1)),
          AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
}

void IntrinsicLocationsBuilderARM64::VisitThreadCurrentThread(HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetOut(Location::RequiresRegister());
}

void IntrinsicCodeGeneratorARM64::VisitThreadCurrentThread(HInvoke* invoke) {
  codegen_->Load(DataType::Type::kReference, WRegisterFrom(invoke->GetLocations()->Out()),
                 MemOperand(tr, Thread::PeerOffset<kArm64PointerSize>().Int32Value()));
}

static void GenUnsafeGet(HInvoke* invoke,
                         DataType::Type type,
                         bool is_volatile,
                         CodeGeneratorARM64* codegen) {
  LocationSummary* locations = invoke->GetLocations();
  DCHECK((type == DataType::Type::kInt8) ||
         (type == DataType::Type::kInt32) ||
         (type == DataType::Type::kInt64) ||
         (type == DataType::Type::kReference));
  Location base_loc = locations->InAt(1);
  Register base = WRegisterFrom(base_loc);      // Object pointer.
  Location offset_loc = locations->InAt(2);
  Register offset = XRegisterFrom(offset_loc);  // Long offset.
  Location trg_loc = locations->Out();
  Register trg = RegisterFrom(trg_loc, type);

  if (type == DataType::Type::kReference && codegen->EmitBakerReadBarrier()) {
    // UnsafeGetObject/UnsafeGetObjectVolatile with Baker's read barrier case.
    Register temp = WRegisterFrom(locations->GetTemp(0));
    MacroAssembler* masm = codegen->GetVIXLAssembler();
    // Piggy-back on the field load path using introspection for the Baker read barrier.
    __ Add(temp, base, offset.W());  // Offset should not exceed 32 bits.
    codegen->GenerateFieldLoadWithBakerReadBarrier(invoke,
                                                   trg_loc,
                                                   base,
                                                   MemOperand(temp.X()),
                                                   /* needs_null_check= */ false,
                                                   is_volatile);
  } else {
    // Other cases.
    MemOperand mem_op(base.X(), offset);
    if (is_volatile) {
      codegen->LoadAcquire(invoke, type, trg, mem_op, /* needs_null_check= */ true);
    } else {
      codegen->Load(type, trg, mem_op);
    }

    if (type == DataType::Type::kReference) {
      DCHECK(trg.IsW());
      codegen->MaybeGenerateReadBarrierSlow(invoke, trg_loc, trg_loc, base_loc, 0u, offset_loc);
    }
  }
}

static void CreateUnsafeGetLocations(ArenaAllocator* allocator,
                                     HInvoke* invoke,
                                     CodeGeneratorARM64* codegen) {
  bool can_call = codegen->EmitReadBarrier() && IsUnsafeGetReference(invoke);
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke,
                                      can_call
                                          ? LocationSummary::kCallOnSlowPath
                                          : LocationSummary::kNoCall,
                                      kIntrinsified);
  if (can_call && kUseBakerReadBarrier) {
    locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
    // We need a temporary register for the read barrier load in order to use
    // CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier().
    locations->AddTemp(FixedTempLocation());
  }
  locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
  locations->SetInAt(1, Location::RequiresRegister());
  locations->SetInAt(2, Location::RequiresRegister());
  locations->SetOut(Location::RequiresRegister(),
                    (can_call ? Location::kOutputOverlap : Location::kNoOutputOverlap));
}

void IntrinsicLocationsBuilderARM64::VisitUnsafeGet(HInvoke* invoke) {
  VisitJdkUnsafeGet(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafeGetVolatile(HInvoke* invoke) {
  VisitJdkUnsafeGetVolatile(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafeGetLong(HInvoke* invoke) {
  VisitJdkUnsafeGetLong(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
  VisitJdkUnsafeGetLongVolatile(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafeGetObject(HInvoke* invoke) {
  VisitJdkUnsafeGetReference(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
  VisitJdkUnsafeGetReferenceVolatile(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafeGetByte(HInvoke* invoke) {
  VisitJdkUnsafeGetByte(invoke);
}

void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeGet(HInvoke* invoke) {
  CreateUnsafeGetLocations(allocator_, invoke, codegen_);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeGetVolatile(HInvoke* invoke) {
  CreateUnsafeGetLocations(allocator_, invoke, codegen_);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeGetAcquire(HInvoke* invoke) {
  CreateUnsafeGetLocations(allocator_, invoke, codegen_);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeGetLong(HInvoke* invoke) {
  CreateUnsafeGetLocations(allocator_, invoke, codegen_);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeGetLongVolatile(HInvoke* invoke) {
  CreateUnsafeGetLocations(allocator_, invoke, codegen_);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeGetLongAcquire(HInvoke* invoke) {
  CreateUnsafeGetLocations(allocator_, invoke, codegen_);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeGetReference(HInvoke* invoke) {
  CreateUnsafeGetLocations(allocator_, invoke, codegen_);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeGetReferenceVolatile(HInvoke* invoke) {
  CreateUnsafeGetLocations(allocator_, invoke, codegen_);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeGetReferenceAcquire(HInvoke* invoke) {
  CreateUnsafeGetLocations(allocator_, invoke, codegen_);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeGetByte(HInvoke* invoke) {
  CreateUnsafeGetLocations(allocator_, invoke, codegen_);
}

void IntrinsicCodeGeneratorARM64::VisitUnsafeGet(HInvoke* invoke) {
  VisitJdkUnsafeGet(invoke);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafeGetVolatile(HInvoke* invoke) {
  VisitJdkUnsafeGetVolatile(invoke);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafeGetLong(HInvoke* invoke) {
  VisitJdkUnsafeGetLong(invoke);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
  VisitJdkUnsafeGetLongVolatile(invoke);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafeGetObject(HInvoke* invoke) {
  VisitJdkUnsafeGetReference(invoke);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
  VisitJdkUnsafeGetReferenceVolatile(invoke);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafeGetByte(HInvoke* invoke) {
  VisitJdkUnsafeGetByte(invoke);
}

void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeGet(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kInt32, /*is_volatile=*/ false, codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeGetVolatile(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kInt32, /*is_volatile=*/ true, codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeGetAcquire(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kInt32, /*is_volatile=*/ true, codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeGetLong(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kInt64, /*is_volatile=*/ false, codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeGetLongVolatile(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kInt64, /*is_volatile=*/ true, codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeGetLongAcquire(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kInt64, /*is_volatile=*/ true, codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeGetReference(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kReference, /*is_volatile=*/ false, codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeGetReferenceVolatile(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kReference, /*is_volatile=*/ true, codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeGetReferenceAcquire(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kReference, /*is_volatile=*/ true, codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeGetByte(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kInt8, /*is_volatile=*/ false, codegen_);
}

static void CreateUnsafePutLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
  locations->SetInAt(1, Location::RequiresRegister());
  locations->SetInAt(2, Location::RequiresRegister());
  locations->SetInAt(3, Location::RequiresRegister());
}

void IntrinsicLocationsBuilderARM64::VisitUnsafePut(HInvoke* invoke) {
  VisitJdkUnsafePut(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafePutOrdered(HInvoke* invoke) {
  VisitJdkUnsafePutOrdered(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafePutVolatile(HInvoke* invoke) {
  VisitJdkUnsafePutVolatile(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafePutObject(HInvoke* invoke) {
  VisitJdkUnsafePutReference(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafePutObjectOrdered(HInvoke* invoke) {
  VisitJdkUnsafePutObjectOrdered(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafePutObjectVolatile(HInvoke* invoke) {
  VisitJdkUnsafePutReferenceVolatile(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafePutLong(HInvoke* invoke) {
  VisitJdkUnsafePutLong(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafePutLongOrdered(HInvoke* invoke) {
  VisitJdkUnsafePutLongOrdered(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafePutLongVolatile(HInvoke* invoke) {
  VisitJdkUnsafePutLongVolatile(invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafePutByte(HInvoke* invoke) {
  VisitJdkUnsafePutByte(invoke);
}

void IntrinsicLocationsBuilderARM64::VisitJdkUnsafePut(HInvoke* invoke) {
  CreateUnsafePutLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafePutOrdered(HInvoke* invoke) {
  CreateUnsafePutLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafePutVolatile(HInvoke* invoke) {
  CreateUnsafePutLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafePutRelease(HInvoke* invoke) {
  CreateUnsafePutLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafePutReference(HInvoke* invoke) {
  CreateUnsafePutLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafePutObjectOrdered(HInvoke* invoke) {
  CreateUnsafePutLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafePutReferenceVolatile(HInvoke* invoke) {
  CreateUnsafePutLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafePutReferenceRelease(HInvoke* invoke) {
  CreateUnsafePutLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafePutLong(HInvoke* invoke) {
  CreateUnsafePutLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafePutLongOrdered(HInvoke* invoke) {
  CreateUnsafePutLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafePutLongVolatile(HInvoke* invoke) {
  CreateUnsafePutLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafePutLongRelease(HInvoke* invoke) {
  CreateUnsafePutLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitJdkUnsafePutByte(HInvoke* invoke) {
  CreateUnsafePutLocations(allocator_, invoke);
}

static void GenUnsafePut(HInvoke* invoke,
                         DataType::Type type,
                         bool is_volatile,
                         bool is_ordered,
                         CodeGeneratorARM64* codegen) {
  LocationSummary* locations = invoke->GetLocations();
  MacroAssembler* masm = codegen->GetVIXLAssembler();

  Register base = WRegisterFrom(locations->InAt(1));    // Object pointer.
  Register offset = XRegisterFrom(locations->InAt(2));  // Long offset.
  Register value = RegisterFrom(locations->InAt(3), type);
  Register source = value;
  MemOperand mem_op(base.X(), offset);

  {
    // We use a block to end the scratch scope before the write barrier, thus
    // freeing the temporary registers so they can be used in `MaybeMarkGCCard`.
    UseScratchRegisterScope temps(masm);

    if (kPoisonHeapReferences && type == DataType::Type::kReference) {
      DCHECK(value.IsW());
      Register temp = temps.AcquireW();
      __ Mov(temp.W(), value.W());
      codegen->GetAssembler()->PoisonHeapReference(temp.W());
      source = temp;
    }

    if (is_volatile || is_ordered) {
      codegen->StoreRelease(invoke, type, source, mem_op, /* needs_null_check= */ false);
    } else {
      codegen->Store(type, source, mem_op);
    }
  }

  if (type == DataType::Type::kReference) {
    bool value_can_be_null = true;  // TODO: Worth finding out this information?
    codegen->MaybeMarkGCCard(base, value, value_can_be_null);
  }
}

void IntrinsicCodeGeneratorARM64::VisitUnsafePut(HInvoke* invoke) {
  VisitJdkUnsafePut(invoke);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafePutOrdered(HInvoke* invoke) {
  VisitJdkUnsafePutOrdered(invoke);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafePutVolatile(HInvoke* invoke) {
  VisitJdkUnsafePutVolatile(invoke);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafePutObject(HInvoke* invoke) {
  VisitJdkUnsafePutReference(invoke);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafePutObjectOrdered(HInvoke* invoke) {
  VisitJdkUnsafePutObjectOrdered(invoke);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafePutObjectVolatile(HInvoke* invoke) {
  VisitJdkUnsafePutReferenceVolatile(invoke);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafePutLong(HInvoke* invoke) {
  VisitJdkUnsafePutLong(invoke);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafePutLongOrdered(HInvoke* invoke) {
  VisitJdkUnsafePutLongOrdered(invoke);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafePutLongVolatile(HInvoke* invoke) {
  VisitJdkUnsafePutLongVolatile(invoke);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafePutByte(HInvoke* invoke) {
  VisitJdkUnsafePutByte(invoke);
}

VisitJdkUnsafePut(HInvoke * invoke)1025 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafePut(HInvoke* invoke) {
1026   GenUnsafePut(invoke,
1027                DataType::Type::kInt32,
1028                /*is_volatile=*/ false,
1029                /*is_ordered=*/ false,
1030                codegen_);
1031 }
VisitJdkUnsafePutOrdered(HInvoke * invoke)1032 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafePutOrdered(HInvoke* invoke) {
1033   GenUnsafePut(invoke,
1034                DataType::Type::kInt32,
1035                /*is_volatile=*/ false,
1036                /*is_ordered=*/ true,
1037                codegen_);
1038 }
VisitJdkUnsafePutVolatile(HInvoke * invoke)1039 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafePutVolatile(HInvoke* invoke) {
1040   GenUnsafePut(invoke,
1041                DataType::Type::kInt32,
1042                /*is_volatile=*/ true,
1043                /*is_ordered=*/ false,
1044                codegen_);
1045 }
VisitJdkUnsafePutRelease(HInvoke * invoke)1046 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafePutRelease(HInvoke* invoke) {
1047   GenUnsafePut(invoke,
1048                DataType::Type::kInt32,
1049                /*is_volatile=*/ true,
1050                /*is_ordered=*/ false,
1051                codegen_);
1052 }
VisitJdkUnsafePutReference(HInvoke * invoke)1053 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafePutReference(HInvoke* invoke) {
1054   GenUnsafePut(invoke,
1055                DataType::Type::kReference,
1056                /*is_volatile=*/ false,
1057                /*is_ordered=*/ false,
1058                codegen_);
1059 }
VisitJdkUnsafePutObjectOrdered(HInvoke * invoke)1060 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafePutObjectOrdered(HInvoke* invoke) {
1061   GenUnsafePut(invoke,
1062                DataType::Type::kReference,
1063                /*is_volatile=*/ false,
1064                /*is_ordered=*/ true,
1065                codegen_);
1066 }
VisitJdkUnsafePutReferenceVolatile(HInvoke * invoke)1067 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafePutReferenceVolatile(HInvoke* invoke) {
1068   GenUnsafePut(invoke,
1069                DataType::Type::kReference,
1070                /*is_volatile=*/ true,
1071                /*is_ordered=*/ false,
1072                codegen_);
1073 }
VisitJdkUnsafePutReferenceRelease(HInvoke * invoke)1074 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafePutReferenceRelease(HInvoke* invoke) {
1075   GenUnsafePut(invoke,
1076                DataType::Type::kReference,
1077                /*is_volatile=*/ true,
1078                /*is_ordered=*/ false,
1079                codegen_);
1080 }
VisitJdkUnsafePutLong(HInvoke * invoke)1081 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafePutLong(HInvoke* invoke) {
1082   GenUnsafePut(invoke,
1083                DataType::Type::kInt64,
1084                /*is_volatile=*/ false,
1085                /*is_ordered=*/ false,
1086                codegen_);
1087 }
VisitJdkUnsafePutLongOrdered(HInvoke * invoke)1088 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafePutLongOrdered(HInvoke* invoke) {
1089   GenUnsafePut(invoke,
1090                DataType::Type::kInt64,
1091                /*is_volatile=*/ false,
1092                /*is_ordered=*/ true,
1093                codegen_);
1094 }
VisitJdkUnsafePutLongVolatile(HInvoke * invoke)1095 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafePutLongVolatile(HInvoke* invoke) {
1096   GenUnsafePut(invoke,
1097                DataType::Type::kInt64,
1098                /*is_volatile=*/ true,
1099                /*is_ordered=*/ false,
1100                codegen_);
1101 }
VisitJdkUnsafePutLongRelease(HInvoke * invoke)1102 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafePutLongRelease(HInvoke* invoke) {
1103   GenUnsafePut(invoke,
1104                DataType::Type::kInt64,
1105                /*is_volatile=*/ true,
1106                /*is_ordered=*/ false,
1107                codegen_);
1108 }
VisitJdkUnsafePutByte(HInvoke * invoke)1109 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafePutByte(HInvoke* invoke) {
1110   GenUnsafePut(invoke,
1111                DataType::Type::kInt8,
1112                /*is_volatile=*/ false,
1113                /*is_ordered=*/ false,
1114                codegen_);
1115 }
1116 
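// Shared LocationSummary setup for the Unsafe/JdkUnsafe compare-and-set intrinsics below:
// the inputs are (unused receiver, object, long offset, expected value, new value), the boolean
// result gets its own register, and a slow-path call is reserved only when a read barrier may
// be needed for a reference CAS.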
1117 static void CreateUnsafeCASLocations(ArenaAllocator* allocator,
1118                                      HInvoke* invoke,
1119                                      CodeGeneratorARM64* codegen) {
1120   const bool can_call = codegen->EmitReadBarrier() && IsUnsafeCASReference(invoke);
1121   LocationSummary* locations =
1122       new (allocator) LocationSummary(invoke,
1123                                       can_call
1124                                           ? LocationSummary::kCallOnSlowPath
1125                                           : LocationSummary::kNoCall,
1126                                       kIntrinsified);
1127   if (can_call && kUseBakerReadBarrier) {
1128     locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
1129   }
1130   locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
1131   locations->SetInAt(1, Location::RequiresRegister());
1132   locations->SetInAt(2, Location::RequiresRegister());
1133   locations->SetInAt(3, Location::RequiresRegister());
1134   locations->SetInAt(4, Location::RequiresRegister());
1135 
1136   locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
1137 }
1138 
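// Emits the load-exclusive half of an LL/SC sequence. The instruction width follows `type`
// (LDXRB/LDXRH/LDXR, or LDAXRB/LDAXRH/LDAXR when acquire semantics are requested), and the
// loaded value is then sign-extended for kInt8/kInt16 or unpoisoned for kReference.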
1139 static void EmitLoadExclusive(CodeGeneratorARM64* codegen,
1140                               DataType::Type type,
1141                               Register ptr,
1142                               Register old_value,
1143                               bool use_load_acquire) {
1144   Arm64Assembler* assembler = codegen->GetAssembler();
1145   MacroAssembler* masm = assembler->GetVIXLAssembler();
1146   switch (type) {
1147     case DataType::Type::kBool:
1148     case DataType::Type::kUint8:
1149     case DataType::Type::kInt8:
1150       if (use_load_acquire) {
1151         __ Ldaxrb(old_value, MemOperand(ptr));
1152       } else {
1153         __ Ldxrb(old_value, MemOperand(ptr));
1154       }
1155       break;
1156     case DataType::Type::kUint16:
1157     case DataType::Type::kInt16:
1158       if (use_load_acquire) {
1159         __ Ldaxrh(old_value, MemOperand(ptr));
1160       } else {
1161         __ Ldxrh(old_value, MemOperand(ptr));
1162       }
1163       break;
1164     case DataType::Type::kInt32:
1165     case DataType::Type::kInt64:
1166     case DataType::Type::kReference:
1167       if (use_load_acquire) {
1168         __ Ldaxr(old_value, MemOperand(ptr));
1169       } else {
1170         __ Ldxr(old_value, MemOperand(ptr));
1171       }
1172       break;
1173     default:
1174       LOG(FATAL) << "Unexpected type: " << type;
1175       UNREACHABLE();
1176   }
1177   switch (type) {
1178     case DataType::Type::kInt8:
1179       __ Sxtb(old_value, old_value);
1180       break;
1181     case DataType::Type::kInt16:
1182       __ Sxth(old_value, old_value);
1183       break;
1184     case DataType::Type::kReference:
1185       assembler->MaybeUnpoisonHeapReference(old_value);
1186       break;
1187     default:
1188       break;
1189   }
1190 }
1191 
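// Emits the store-exclusive half of an LL/SC sequence (STXRB/STXRH/STXR, or the release forms
// STLXRB/STLXRH/STLXR). A reference `new_value` is poisoned around the store and unpoisoned
// again afterwards, so the caller still sees a usable reference in that register.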
1192 static void EmitStoreExclusive(CodeGeneratorARM64* codegen,
1193                                DataType::Type type,
1194                                Register ptr,
1195                                Register store_result,
1196                                Register new_value,
1197                                bool use_store_release) {
1198   Arm64Assembler* assembler = codegen->GetAssembler();
1199   MacroAssembler* masm = assembler->GetVIXLAssembler();
1200   if (type == DataType::Type::kReference) {
1201     assembler->MaybePoisonHeapReference(new_value);
1202   }
1203   switch (type) {
1204     case DataType::Type::kBool:
1205     case DataType::Type::kUint8:
1206     case DataType::Type::kInt8:
1207       if (use_store_release) {
1208         __ Stlxrb(store_result, new_value, MemOperand(ptr));
1209       } else {
1210         __ Stxrb(store_result, new_value, MemOperand(ptr));
1211       }
1212       break;
1213     case DataType::Type::kUint16:
1214     case DataType::Type::kInt16:
1215       if (use_store_release) {
1216         __ Stlxrh(store_result, new_value, MemOperand(ptr));
1217       } else {
1218         __ Stxrh(store_result, new_value, MemOperand(ptr));
1219       }
1220       break;
1221     case DataType::Type::kInt32:
1222     case DataType::Type::kInt64:
1223     case DataType::Type::kReference:
1224       if (use_store_release) {
1225         __ Stlxr(store_result, new_value, MemOperand(ptr));
1226       } else {
1227         __ Stxr(store_result, new_value, MemOperand(ptr));
1228       }
1229       break;
1230     default:
1231       LOG(FATAL) << "Unexpected type: " << type;
1232       UNREACHABLE();
1233   }
1234   if (type == DataType::Type::kReference) {
1235     assembler->MaybeUnpoisonHeapReference(new_value);
1236   }
1237 }
1238 
1239 static void GenerateCompareAndSet(CodeGeneratorARM64* codegen,
1240                                   DataType::Type type,
1241                                   std::memory_order order,
1242                                   bool strong,
1243                                   vixl::aarch64::Label* cmp_failure,
1244                                   Register ptr,
1245                                   Register new_value,
1246                                   Register old_value,
1247                                   Register store_result,
1248                                   Register expected,
1249                                   Register expected2 = Register()) {
1250   // The `expected2` is valid only for reference slow path and represents the unmarked old value
1251   // from the main path attempt to emit CAS when the marked old value matched `expected`.
1252   DCHECK_IMPLIES(expected2.IsValid(), type == DataType::Type::kReference);
1253 
1254   DCHECK(ptr.IsX());
1255   DCHECK_EQ(new_value.IsX(), type == DataType::Type::kInt64);
1256   DCHECK_EQ(old_value.IsX(), type == DataType::Type::kInt64);
1257   DCHECK(store_result.IsW());
1258   DCHECK_EQ(expected.IsX(), type == DataType::Type::kInt64);
1259   DCHECK_IMPLIES(expected2.IsValid(), expected2.IsW());
1260 
1261   Arm64Assembler* assembler = codegen->GetAssembler();
1262   MacroAssembler* masm = assembler->GetVIXLAssembler();
1263 
1264   bool use_load_acquire =
1265       (order == std::memory_order_acquire) || (order == std::memory_order_seq_cst);
1266   bool use_store_release =
1267       (order == std::memory_order_release) || (order == std::memory_order_seq_cst);
1268   DCHECK(use_load_acquire || use_store_release || order == std::memory_order_relaxed);
1269 
1270   // repeat: {
1271   //   old_value = [ptr];  // Load exclusive.
1272   //   if (old_value != expected && old_value != expected2) goto cmp_failure;
1273   //   store_result = failed([ptr] <- new_value);  // Store exclusive.
1274   // }
1275   // if (strong) {
1276   //   if (store_result) goto repeat;  // Repeat until compare fails or store exclusive succeeds.
1277   // } else {
1278   //   store_result = store_result ^ 1;  // Report success as 1, failure as 0.
1279   // }
1280   //
1281   // Flag Z indicates whether `old_value == expected || old_value == expected2`.
1282   // (If `expected2` is not valid, the `old_value == expected2` part is not emitted.)
1283 
1284   vixl::aarch64::Label loop_head;
1285   if (strong) {
1286     __ Bind(&loop_head);
1287   }
1288   EmitLoadExclusive(codegen, type, ptr, old_value, use_load_acquire);
1289   __ Cmp(old_value, expected);
1290   if (expected2.IsValid()) {
1291     __ Ccmp(old_value, expected2, ZFlag, ne);
1292   }
1293   // If the comparison failed, the Z flag is cleared as we branch to the `cmp_failure` label.
1294   // If the comparison succeeded, the Z flag is set and remains set after the end of the
1295   // code emitted here, unless we retry the whole operation.
1296   __ B(cmp_failure, ne);
1297   EmitStoreExclusive(codegen, type, ptr, store_result, new_value, use_store_release);
1298   if (strong) {
1299     __ Cbnz(store_result, &loop_head);
1300   } else {
1301     // Flip the `store_result` register to indicate success by 1 and failure by 0.
1302     __ Eor(store_result, store_result, 1);
1303   }
1304 }
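// For illustration only: for a strong, seq_cst kInt32 compare-and-set the loop above emits
// roughly the following sequence (register names are placeholders, not the actual allocation):
//   loop_head: ldaxr w_old, [x_ptr]
//              cmp   w_old, w_expected
//              b.ne  cmp_failure
//              stlxr w_result, w_new, [x_ptr]
//              cbnz  w_result, loop_head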
1305 
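// Slow path for a reference CAS with read barriers enabled: it marks the old value loaded by
// the main path, and if the marked value matches `expected`, it retries the CAS while accepting
// both the to-space (`expected`) and from-space (unmarked old value) references of that object.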
1306 class ReadBarrierCasSlowPathARM64 : public SlowPathCodeARM64 {
1307  public:
1308   ReadBarrierCasSlowPathARM64(HInvoke* invoke,
1309                               std::memory_order order,
1310                               bool strong,
1311                               Register base,
1312                               Register offset,
1313                               Register expected,
1314                               Register new_value,
1315                               Register old_value,
1316                               Register old_value_temp,
1317                               Register store_result,
1318                               bool update_old_value,
1319                               CodeGeneratorARM64* arm64_codegen)
1320       : SlowPathCodeARM64(invoke),
1321         order_(order),
1322         strong_(strong),
1323         base_(base),
1324         offset_(offset),
1325         expected_(expected),
1326         new_value_(new_value),
1327         old_value_(old_value),
1328         old_value_temp_(old_value_temp),
1329         store_result_(store_result),
1330         update_old_value_(update_old_value),
1331         mark_old_value_slow_path_(nullptr),
1332         update_old_value_slow_path_(nullptr) {
1333     if (!kUseBakerReadBarrier) {
1334       // We need to add the slow path now, it is too late when emitting slow path code.
1335       mark_old_value_slow_path_ = arm64_codegen->AddReadBarrierSlowPath(
1336           invoke,
1337           Location::RegisterLocation(old_value_temp.GetCode()),
1338           Location::RegisterLocation(old_value.GetCode()),
1339           Location::RegisterLocation(base.GetCode()),
1340           /*offset=*/ 0u,
1341           /*index=*/ Location::RegisterLocation(offset.GetCode()));
1342       if (update_old_value_) {
1343         update_old_value_slow_path_ = arm64_codegen->AddReadBarrierSlowPath(
1344             invoke,
1345             Location::RegisterLocation(old_value.GetCode()),
1346             Location::RegisterLocation(old_value_temp.GetCode()),
1347             Location::RegisterLocation(base.GetCode()),
1348             /*offset=*/ 0u,
1349             /*index=*/ Location::RegisterLocation(offset.GetCode()));
1350       }
1351     }
1352   }
1353 
1354   const char* GetDescription() const override { return "ReadBarrierCasSlowPathARM64"; }
1355 
1356   void EmitNativeCode(CodeGenerator* codegen) override {
1357     CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen);
1358     Arm64Assembler* assembler = arm64_codegen->GetAssembler();
1359     MacroAssembler* masm = assembler->GetVIXLAssembler();
1360     __ Bind(GetEntryLabel());
1361 
1362     // Mark the `old_value_` from the main path and compare with `expected_`.
1363     if (kUseBakerReadBarrier) {
1364       DCHECK(mark_old_value_slow_path_ == nullptr);
1365       arm64_codegen->GenerateIntrinsicMoveWithBakerReadBarrier(old_value_temp_, old_value_);
1366     } else {
1367       DCHECK(mark_old_value_slow_path_ != nullptr);
1368       __ B(mark_old_value_slow_path_->GetEntryLabel());
1369       __ Bind(mark_old_value_slow_path_->GetExitLabel());
1370     }
1371     __ Cmp(old_value_temp_, expected_);
1372     if (update_old_value_) {
1373       // Update the old value if we're going to return from the slow path.
1374       __ Csel(old_value_, old_value_temp_, old_value_, ne);
1375     }
1376     __ B(GetExitLabel(), ne);  // If taken, Z=false indicates failure.
1377 
1378     // The `old_value` we have read did not match `expected` (which is always a to-space
1379     // reference) but after the read barrier the marked to-space value matched, so the
1380     // `old_value` must be a from-space reference to the same object. Do the same CAS loop
1381     // as the main path but check for both `expected` and the unmarked old value
1382     // representing the to-space and from-space references for the same object.
1383 
1384     UseScratchRegisterScope temps(masm);
1385     DCHECK_IMPLIES(store_result_.IsValid(), !temps.IsAvailable(store_result_));
1386     Register tmp_ptr = temps.AcquireX();
1387     Register store_result = store_result_.IsValid() ? store_result_ : temps.AcquireW();
1388 
1389     // Recalculate the `tmp_ptr` from main path clobbered by the read barrier above.
1390     __ Add(tmp_ptr, base_.X(), Operand(offset_));
1391 
1392     vixl::aarch64::Label mark_old_value;
1393     GenerateCompareAndSet(arm64_codegen,
1394                           DataType::Type::kReference,
1395                           order_,
1396                           strong_,
1397                           /*cmp_failure=*/ update_old_value_ ? &mark_old_value : GetExitLabel(),
1398                           tmp_ptr,
1399                           new_value_,
1400                           /*old_value=*/ old_value_temp_,
1401                           store_result,
1402                           expected_,
1403                           /*expected2=*/ old_value_);
1404     if (update_old_value_) {
1405       // To reach this point, the `old_value_temp_` must be either a from-space or a to-space
1406       // reference of the `expected_` object. Update the `old_value_` to the to-space reference.
1407       __ Mov(old_value_, expected_);
1408     }
1409 
1410     // Z=true from the CMP+CCMP in GenerateCompareAndSet() above indicates comparison success.
1411     // For strong CAS, that's the overall success. For weak CAS, the code also needs
1412     // to check the `store_result` after returning from the slow path.
1413     __ B(GetExitLabel());
1414 
1415     if (update_old_value_) {
1416       __ Bind(&mark_old_value);
1417       if (kUseBakerReadBarrier) {
1418         DCHECK(update_old_value_slow_path_ == nullptr);
1419         arm64_codegen->GenerateIntrinsicMoveWithBakerReadBarrier(old_value_, old_value_temp_);
1420       } else {
1421         // Note: We could redirect the `failure` above directly to the entry label and bind
1422         // the exit label in the main path, but the main path would need to access the
1423         // `update_old_value_slow_path_`. To keep the code simple, keep the extra jumps.
1424         DCHECK(update_old_value_slow_path_ != nullptr);
1425         __ B(update_old_value_slow_path_->GetEntryLabel());
1426         __ Bind(update_old_value_slow_path_->GetExitLabel());
1427       }
1428       __ B(GetExitLabel());
1429     }
1430   }
1431 
1432  private:
1433   std::memory_order order_;
1434   bool strong_;
1435   Register base_;
1436   Register offset_;
1437   Register expected_;
1438   Register new_value_;
1439   Register old_value_;
1440   Register old_value_temp_;
1441   Register store_result_;
1442   bool update_old_value_;
1443   SlowPathCodeARM64* mark_old_value_slow_path_;
1444   SlowPathCodeARM64* update_old_value_slow_path_;
1445 };
1446 
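// Common code generation for the compare-and-set intrinsics, covering Java-level calls of the
// shape `unsafe.compareAndSetInt(obj, offset, expected, newValue)` and the Long/Reference
// variants (illustrative shape only; the arguments arrive via the LocationSummary set up above).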
1447 static void GenUnsafeCas(HInvoke* invoke, DataType::Type type, CodeGeneratorARM64* codegen) {
1448   MacroAssembler* masm = codegen->GetVIXLAssembler();
1449   LocationSummary* locations = invoke->GetLocations();
1450 
1451   Register out = WRegisterFrom(locations->Out());                 // Boolean result.
1452   Register base = WRegisterFrom(locations->InAt(1));              // Object pointer.
1453   Register offset = XRegisterFrom(locations->InAt(2));            // Long offset.
1454   Register expected = RegisterFrom(locations->InAt(3), type);     // Expected.
1455   Register new_value = RegisterFrom(locations->InAt(4), type);    // New value.
1456 
1457   // This needs to be before the temp registers, as MarkGCCard also uses VIXL temps.
1458   if (type == DataType::Type::kReference) {
1459     // Mark card for object assuming new value is stored.
1460     bool new_value_can_be_null = true;  // TODO: Worth finding out this information?
1461     codegen->MaybeMarkGCCard(base, new_value, new_value_can_be_null);
1462   }
1463 
1464   UseScratchRegisterScope temps(masm);
1465   Register tmp_ptr = temps.AcquireX();                             // Pointer to actual memory.
1466   Register old_value;                                              // Value in memory.
1467 
1468   vixl::aarch64::Label exit_loop_label;
1469   vixl::aarch64::Label* exit_loop = &exit_loop_label;
1470   vixl::aarch64::Label* cmp_failure = &exit_loop_label;
1471 
1472   if (type == DataType::Type::kReference && codegen->EmitReadBarrier()) {
1473     // We need to store the `old_value` in a non-scratch register to make sure
1474     // the read barrier in the slow path does not clobber it.
1475     old_value = WRegisterFrom(locations->GetTemp(0));  // The old value from main path.
1476     // The `old_value_temp` is used first for the marked `old_value` and then for the unmarked
1477     // reloaded old value for subsequent CAS in the slow path. It cannot be a scratch register.
1478     Register old_value_temp = WRegisterFrom(locations->GetTemp(1));
1479     ReadBarrierCasSlowPathARM64* slow_path =
1480         new (codegen->GetScopedAllocator()) ReadBarrierCasSlowPathARM64(
1481             invoke,
1482             std::memory_order_seq_cst,
1483             /*strong=*/ true,
1484             base,
1485             offset,
1486             expected,
1487             new_value,
1488             old_value,
1489             old_value_temp,
1490             /*store_result=*/ Register(),  // Use a scratch register.
1491             /*update_old_value=*/ false,
1492             codegen);
1493     codegen->AddSlowPath(slow_path);
1494     exit_loop = slow_path->GetExitLabel();
1495     cmp_failure = slow_path->GetEntryLabel();
1496   } else {
1497     old_value = temps.AcquireSameSizeAs(new_value);
1498   }
1499 
1500   __ Add(tmp_ptr, base.X(), Operand(offset));
1501 
1502   GenerateCompareAndSet(codegen,
1503                         type,
1504                         std::memory_order_seq_cst,
1505                         /*strong=*/ true,
1506                         cmp_failure,
1507                         tmp_ptr,
1508                         new_value,
1509                         old_value,
1510                         /*store_result=*/ old_value.W(),  // Reuse `old_value` for ST*XR* result.
1511                         expected);
1512   __ Bind(exit_loop);
1513   __ Cset(out, eq);
1514 }
1515 
1516 void IntrinsicLocationsBuilderARM64::VisitUnsafeCASInt(HInvoke* invoke) {
1517   VisitJdkUnsafeCASInt(invoke);
1518 }
1519 void IntrinsicLocationsBuilderARM64::VisitUnsafeCASLong(HInvoke* invoke) {
1520   VisitJdkUnsafeCASLong(invoke);
1521 }
1522 void IntrinsicLocationsBuilderARM64::VisitUnsafeCASObject(HInvoke* invoke) {
1523   VisitJdkUnsafeCASObject(invoke);
1524 }
1525 
1526 void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeCASInt(HInvoke* invoke) {
1527   // `jdk.internal.misc.Unsafe.compareAndSwapInt` has compare-and-set semantics (see javadoc).
1528   VisitJdkUnsafeCompareAndSetInt(invoke);
1529 }
1530 void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeCASLong(HInvoke* invoke) {
1531   // `jdk.internal.misc.Unsafe.compareAndSwapLong` has compare-and-set semantics (see javadoc).
1532   VisitJdkUnsafeCompareAndSetLong(invoke);
1533 }
1534 void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeCASObject(HInvoke* invoke) {
1535   // `jdk.internal.misc.Unsafe.compareAndSwapObject` has compare-and-set semantics (see javadoc).
1536   VisitJdkUnsafeCompareAndSetReference(invoke);
1537 }
1538 
1539 void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeCompareAndSetInt(HInvoke* invoke) {
1540   CreateUnsafeCASLocations(allocator_, invoke, codegen_);
1541 }
1542 void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeCompareAndSetLong(HInvoke* invoke) {
1543   CreateUnsafeCASLocations(allocator_, invoke, codegen_);
1544 }
1545 void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeCompareAndSetReference(HInvoke* invoke) {
1546   // The only supported read barrier implementation is the Baker-style read barriers.
1547   if (codegen_->EmitNonBakerReadBarrier()) {
1548     return;
1549   }
1550 
1551   CreateUnsafeCASLocations(allocator_, invoke, codegen_);
1552   if (codegen_->EmitReadBarrier()) {
1553     // We need two non-scratch temporary registers for read barrier.
1554     LocationSummary* locations = invoke->GetLocations();
1555     if (kUseBakerReadBarrier) {
1556       locations->AddTemp(Location::RequiresRegister());
1557       locations->AddTemp(Location::RequiresRegister());
1558     } else {
1559       // To preserve the old value across the non-Baker read barrier
1560       // slow path, use a fixed callee-save register.
1561       constexpr int first_callee_save = CTZ(kArm64CalleeSaveRefSpills);
1562       locations->AddTemp(Location::RegisterLocation(first_callee_save));
1563       // To reduce the number of moves, request x0 as the second temporary.
1564       DCHECK(InvokeRuntimeCallingConvention().GetReturnLocation(DataType::Type::kReference).Equals(
1565                  Location::RegisterLocation(x0.GetCode())));
1566       locations->AddTemp(Location::RegisterLocation(x0.GetCode()));
1567     }
1568   }
1569 }
1570 
1571 void IntrinsicCodeGeneratorARM64::VisitUnsafeCASInt(HInvoke* invoke) {
1572   VisitJdkUnsafeCASInt(invoke);
1573 }
1574 void IntrinsicCodeGeneratorARM64::VisitUnsafeCASLong(HInvoke* invoke) {
1575   VisitJdkUnsafeCASLong(invoke);
1576 }
1577 void IntrinsicCodeGeneratorARM64::VisitUnsafeCASObject(HInvoke* invoke) {
1578   VisitJdkUnsafeCASObject(invoke);
1579 }
1580 
1581 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeCASInt(HInvoke* invoke) {
1582   // `jdk.internal.misc.Unsafe.compareAndSwapInt` has compare-and-set semantics (see javadoc).
1583   VisitJdkUnsafeCompareAndSetInt(invoke);
1584 }
1585 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeCASLong(HInvoke* invoke) {
1586   // `jdk.internal.misc.Unsafe.compareAndSwapLong` has compare-and-set semantics (see javadoc).
1587   VisitJdkUnsafeCompareAndSetLong(invoke);
1588 }
1589 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeCASObject(HInvoke* invoke) {
1590   // `jdk.internal.misc.Unsafe.compareAndSwapObject` has compare-and-set semantics (see javadoc).
1591   VisitJdkUnsafeCompareAndSetReference(invoke);
1592 }
1593 
1594 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeCompareAndSetInt(HInvoke* invoke) {
1595   GenUnsafeCas(invoke, DataType::Type::kInt32, codegen_);
1596 }
1597 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeCompareAndSetLong(HInvoke* invoke) {
1598   GenUnsafeCas(invoke, DataType::Type::kInt64, codegen_);
1599 }
1600 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeCompareAndSetReference(HInvoke* invoke) {
1601   // The only supported read barrier implementation is the Baker-style read barriers.
1602   DCHECK_IMPLIES(codegen_->EmitReadBarrier(), kUseBakerReadBarrier);
1603 
1604   GenUnsafeCas(invoke, DataType::Type::kReference, codegen_);
1605 }
1606 
1607 enum class GetAndUpdateOp {
1608   kSet,
1609   kAdd,
1610   kAddWithByteSwap,
1611   kAnd,
1612   kOr,
1613   kXor
1614 };
1615 
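// Emits an LL/SC loop that atomically replaces the value at `ptr` and leaves the previous value
// in `old_value`: load exclusive, compute the replacement according to `get_and_update_op`
// (plain set, integer or FP add via FMOV/FADD, AND/ORR/EOR, with an optional byte swap for
// kAddWithByteSwap), then store exclusive and retry until the store succeeds.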
1616 static void GenerateGetAndUpdate(CodeGeneratorARM64* codegen,
1617                                  GetAndUpdateOp get_and_update_op,
1618                                  DataType::Type load_store_type,
1619                                  std::memory_order order,
1620                                  Register ptr,
1621                                  CPURegister arg,
1622                                  CPURegister old_value) {
1623   MacroAssembler* masm = codegen->GetVIXLAssembler();
1624   UseScratchRegisterScope temps(masm);
1625   Register store_result = temps.AcquireW();
1626 
1627   DCHECK_EQ(old_value.GetSizeInBits(), arg.GetSizeInBits());
1628   Register old_value_reg;
1629   Register new_value;
1630   switch (get_and_update_op) {
1631     case GetAndUpdateOp::kSet:
1632       old_value_reg = old_value.IsX() ? old_value.X() : old_value.W();
1633       new_value = arg.IsX() ? arg.X() : arg.W();
1634       break;
1635     case GetAndUpdateOp::kAddWithByteSwap:
1636     case GetAndUpdateOp::kAdd:
1637       if (arg.IsVRegister()) {
1638         old_value_reg = arg.IsD() ? temps.AcquireX() : temps.AcquireW();
1639         new_value = old_value_reg;  // Use the same temporary.
1640         break;
1641       }
1642       FALLTHROUGH_INTENDED;
1643     case GetAndUpdateOp::kAnd:
1644     case GetAndUpdateOp::kOr:
1645     case GetAndUpdateOp::kXor:
1646       old_value_reg = old_value.IsX() ? old_value.X() : old_value.W();
1647       new_value = old_value.IsX() ? temps.AcquireX() : temps.AcquireW();
1648       break;
1649   }
1650 
1651   bool use_load_acquire =
1652       (order == std::memory_order_acquire) || (order == std::memory_order_seq_cst);
1653   bool use_store_release =
1654       (order == std::memory_order_release) || (order == std::memory_order_seq_cst);
1655   DCHECK(use_load_acquire || use_store_release);
1656 
1657   vixl::aarch64::Label loop_label;
1658   __ Bind(&loop_label);
1659   EmitLoadExclusive(codegen, load_store_type, ptr, old_value_reg, use_load_acquire);
1660   switch (get_and_update_op) {
1661     case GetAndUpdateOp::kSet:
1662       break;
1663     case GetAndUpdateOp::kAddWithByteSwap:
1664       // To avoid unnecessary sign extension before REV16, the caller must specify `kUint16`
1665       // instead of `kInt16` and do the sign-extension explicitly afterwards.
1666       DCHECK_NE(load_store_type, DataType::Type::kInt16);
1667       GenerateReverseBytes(masm, load_store_type, old_value_reg, old_value_reg);
1668       FALLTHROUGH_INTENDED;
1669     case GetAndUpdateOp::kAdd:
1670       if (arg.IsVRegister()) {
1671         VRegister old_value_vreg = old_value.IsD() ? old_value.D() : old_value.S();
1672         VRegister sum = temps.AcquireSameSizeAs(old_value_vreg);
1673         __ Fmov(old_value_vreg, old_value_reg);
1674         __ Fadd(sum, old_value_vreg, arg.IsD() ? arg.D() : arg.S());
1675         __ Fmov(new_value, sum);
1676       } else {
1677         __ Add(new_value, old_value_reg, arg.IsX() ? arg.X() : arg.W());
1678       }
1679       if (get_and_update_op == GetAndUpdateOp::kAddWithByteSwap) {
1680         GenerateReverseBytes(masm, load_store_type, new_value, new_value);
1681       }
1682       break;
1683     case GetAndUpdateOp::kAnd:
1684       __ And(new_value, old_value_reg, arg.IsX() ? arg.X() : arg.W());
1685       break;
1686     case GetAndUpdateOp::kOr:
1687       __ Orr(new_value, old_value_reg, arg.IsX() ? arg.X() : arg.W());
1688       break;
1689     case GetAndUpdateOp::kXor:
1690       __ Eor(new_value, old_value_reg, arg.IsX() ? arg.X() : arg.W());
1691       break;
1692   }
1693   EmitStoreExclusive(codegen, load_store_type, ptr, store_result, new_value, use_store_release);
1694   __ Cbnz(store_result, &loop_label);
1695 }
1696 
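// Shared LocationSummary setup for the Unsafe/JdkUnsafe get-and-add / get-and-set intrinsics:
// the inputs are (unused receiver, object, long offset, new value or addend), plus one temporary
// register that will hold the computed memory address.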
1697 static void CreateUnsafeGetAndUpdateLocations(ArenaAllocator* allocator,
1698                                               HInvoke* invoke,
1699                                               CodeGeneratorARM64* codegen) {
1700   const bool can_call = codegen->EmitReadBarrier() && IsUnsafeGetAndSetReference(invoke);
1701   LocationSummary* locations =
1702       new (allocator) LocationSummary(invoke,
1703                                       can_call
1704                                           ? LocationSummary::kCallOnSlowPath
1705                                           : LocationSummary::kNoCall,
1706                                       kIntrinsified);
1707   if (can_call && kUseBakerReadBarrier) {
1708     locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
1709   }
1710   locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
1711   locations->SetInAt(1, Location::RequiresRegister());
1712   locations->SetInAt(2, Location::RequiresRegister());
1713   locations->SetInAt(3, Location::RequiresRegister());
1714   locations->AddTemp(Location::RequiresRegister());
1715 
1716   locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
1717 }
1718 
1719 static void GenUnsafeGetAndUpdate(HInvoke* invoke,
1720                                   DataType::Type type,
1721                                   CodeGeneratorARM64* codegen,
1722                                   GetAndUpdateOp get_and_update_op) {
1723   MacroAssembler* masm = codegen->GetVIXLAssembler();
1724   LocationSummary* locations = invoke->GetLocations();
1725 
1726   Register out = RegisterFrom(locations->Out(), type);            // Result.
1727   Register base = WRegisterFrom(locations->InAt(1));              // Object pointer.
1728   Register offset = XRegisterFrom(locations->InAt(2));            // Long offset.
1729   Register arg = RegisterFrom(locations->InAt(3), type);          // New value or addend.
1730   Register tmp_ptr = XRegisterFrom(locations->GetTemp(0));        // Pointer to actual memory.
1731 
1732   // This needs to be before the temp registers, as MarkGCCard also uses VIXL temps.
1733   if (type == DataType::Type::kReference) {
1734     DCHECK(get_and_update_op == GetAndUpdateOp::kSet);
1735     // Mark card for object as a new value shall be stored.
1736     bool new_value_can_be_null = true;  // TODO: Worth finding out this information?
1737     codegen->MaybeMarkGCCard(base, /*value=*/arg, new_value_can_be_null);
1738   }
1739 
1740   __ Add(tmp_ptr, base.X(), Operand(offset));
1741   GenerateGetAndUpdate(codegen,
1742                        get_and_update_op,
1743                        type,
1744                        std::memory_order_seq_cst,
1745                        tmp_ptr,
1746                        arg,
1747                        /*old_value=*/ out);
1748 
1749   if (type == DataType::Type::kReference && codegen->EmitReadBarrier()) {
1750     DCHECK(get_and_update_op == GetAndUpdateOp::kSet);
1751     if (kUseBakerReadBarrier) {
1752       codegen->GenerateIntrinsicMoveWithBakerReadBarrier(out.W(), out.W());
1753     } else {
1754       codegen->GenerateReadBarrierSlow(
1755           invoke,
1756           Location::RegisterLocation(out.GetCode()),
1757           Location::RegisterLocation(out.GetCode()),
1758           Location::RegisterLocation(base.GetCode()),
1759           /*offset=*/ 0u,
1760           /*index=*/ Location::RegisterLocation(offset.GetCode()));
1761     }
1762   }
1763 }
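// The sequence above gives the seq_cst semantics of Java-level calls such as
// `unsafe.getAndAddInt(obj, offset, delta)` or `unsafe.getAndSetReference(obj, offset, value)`
// (illustrative shape only; the concrete type and operation are chosen by the visitors below).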
1764 
1765 void IntrinsicLocationsBuilderARM64::VisitUnsafeGetAndAddInt(HInvoke* invoke) {
1766   VisitJdkUnsafeGetAndAddInt(invoke);
1767 }
1768 void IntrinsicLocationsBuilderARM64::VisitUnsafeGetAndAddLong(HInvoke* invoke) {
1769   VisitJdkUnsafeGetAndAddLong(invoke);
1770 }
1771 void IntrinsicLocationsBuilderARM64::VisitUnsafeGetAndSetInt(HInvoke* invoke) {
1772   VisitJdkUnsafeGetAndSetInt(invoke);
1773 }
1774 void IntrinsicLocationsBuilderARM64::VisitUnsafeGetAndSetLong(HInvoke* invoke) {
1775   VisitJdkUnsafeGetAndSetLong(invoke);
1776 }
1777 void IntrinsicLocationsBuilderARM64::VisitUnsafeGetAndSetObject(HInvoke* invoke) {
1778   VisitJdkUnsafeGetAndSetReference(invoke);
1779 }
1780 
1781 void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeGetAndAddInt(HInvoke* invoke) {
1782   CreateUnsafeGetAndUpdateLocations(allocator_, invoke, codegen_);
1783 }
1784 void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeGetAndAddLong(HInvoke* invoke) {
1785   CreateUnsafeGetAndUpdateLocations(allocator_, invoke, codegen_);
1786 }
1787 void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeGetAndSetInt(HInvoke* invoke) {
1788   CreateUnsafeGetAndUpdateLocations(allocator_, invoke, codegen_);
1789 }
1790 void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeGetAndSetLong(HInvoke* invoke) {
1791   CreateUnsafeGetAndUpdateLocations(allocator_, invoke, codegen_);
1792 }
1793 void IntrinsicLocationsBuilderARM64::VisitJdkUnsafeGetAndSetReference(HInvoke* invoke) {
1794   CreateUnsafeGetAndUpdateLocations(allocator_, invoke, codegen_);
1795 }
1796 
1797 void IntrinsicCodeGeneratorARM64::VisitUnsafeGetAndAddInt(HInvoke* invoke) {
1798   VisitJdkUnsafeGetAndAddInt(invoke);
1799 }
1800 void IntrinsicCodeGeneratorARM64::VisitUnsafeGetAndAddLong(HInvoke* invoke) {
1801   VisitJdkUnsafeGetAndAddLong(invoke);
1802 }
1803 void IntrinsicCodeGeneratorARM64::VisitUnsafeGetAndSetInt(HInvoke* invoke) {
1804   VisitJdkUnsafeGetAndSetInt(invoke);
1805 }
1806 void IntrinsicCodeGeneratorARM64::VisitUnsafeGetAndSetLong(HInvoke* invoke) {
1807   VisitJdkUnsafeGetAndSetLong(invoke);
1808 }
1809 void IntrinsicCodeGeneratorARM64::VisitUnsafeGetAndSetObject(HInvoke* invoke) {
1810   VisitJdkUnsafeGetAndSetReference(invoke);
1811 }
1812 
1813 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeGetAndAddInt(HInvoke* invoke) {
1814   GenUnsafeGetAndUpdate(invoke, DataType::Type::kInt32, codegen_, GetAndUpdateOp::kAdd);
1815 }
1816 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeGetAndAddLong(HInvoke* invoke) {
1817   GenUnsafeGetAndUpdate(invoke, DataType::Type::kInt64, codegen_, GetAndUpdateOp::kAdd);
1818 }
1819 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeGetAndSetInt(HInvoke* invoke) {
1820   GenUnsafeGetAndUpdate(invoke, DataType::Type::kInt32, codegen_, GetAndUpdateOp::kSet);
1821 }
1822 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeGetAndSetLong(HInvoke* invoke) {
1823   GenUnsafeGetAndUpdate(invoke, DataType::Type::kInt64, codegen_, GetAndUpdateOp::kSet);
1824 }
1825 void IntrinsicCodeGeneratorARM64::VisitJdkUnsafeGetAndSetReference(HInvoke* invoke) {
1826   GenUnsafeGetAndUpdate(invoke, DataType::Type::kReference, codegen_, GetAndUpdateOp::kSet);
1827 }
1828 
1829 void IntrinsicLocationsBuilderARM64::VisitStringCompareTo(HInvoke* invoke) {
1830   LocationSummary* locations =
1831       new (allocator_) LocationSummary(invoke,
1832                                        invoke->InputAt(1)->CanBeNull()
1833                                            ? LocationSummary::kCallOnSlowPath
1834                                            : LocationSummary::kNoCall,
1835                                        kIntrinsified);
1836   locations->SetInAt(0, Location::RequiresRegister());
1837   locations->SetInAt(1, Location::RequiresRegister());
1838   locations->AddTemp(Location::RequiresRegister());
1839   locations->AddTemp(Location::RequiresRegister());
1840   locations->AddTemp(Location::RequiresRegister());
1841   // Need an extra temporary register for the String compression feature.
1842   if (mirror::kUseStringCompression) {
1843     locations->AddTemp(Location::RequiresRegister());
1844   }
1845   locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
1846 }
1847 
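// String.compareTo() code generation outline: return 0 for identical references, otherwise
// compute the length difference, compare the common prefix 8 bytes at a time, locate the first
// differing character with EOR+RBIT+CLZ, and fall back to a byte-vs-halfword loop when the two
// strings use different compression styles.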
1848 void IntrinsicCodeGeneratorARM64::VisitStringCompareTo(HInvoke* invoke) {
1849   MacroAssembler* masm = GetVIXLAssembler();
1850   LocationSummary* locations = invoke->GetLocations();
1851 
1852   Register str = InputRegisterAt(invoke, 0);
1853   Register arg = InputRegisterAt(invoke, 1);
1854   DCHECK(str.IsW());
1855   DCHECK(arg.IsW());
1856   Register out = OutputRegister(invoke);
1857 
1858   Register temp0 = WRegisterFrom(locations->GetTemp(0));
1859   Register temp1 = WRegisterFrom(locations->GetTemp(1));
1860   Register temp2 = WRegisterFrom(locations->GetTemp(2));
1861   Register temp3;
1862   if (mirror::kUseStringCompression) {
1863     temp3 = WRegisterFrom(locations->GetTemp(3));
1864   }
1865 
1866   vixl::aarch64::Label loop;
1867   vixl::aarch64::Label find_char_diff;
1868   vixl::aarch64::Label end;
1869   vixl::aarch64::Label different_compression;
1870 
1871   // Get offsets of count and value fields within a string object.
1872   const int32_t count_offset = mirror::String::CountOffset().Int32Value();
1873   const int32_t value_offset = mirror::String::ValueOffset().Int32Value();
1874 
1875   // Note that the null check must have been done earlier.
1876   DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
1877 
1878   // Take slow path and throw if input can be and is null.
1879   SlowPathCodeARM64* slow_path = nullptr;
1880   const bool can_slow_path = invoke->InputAt(1)->CanBeNull();
1881   if (can_slow_path) {
1882     slow_path = new (codegen_->GetScopedAllocator()) IntrinsicSlowPathARM64(invoke);
1883     codegen_->AddSlowPath(slow_path);
1884     __ Cbz(arg, slow_path->GetEntryLabel());
1885   }
1886 
1887   // Reference equality check, return 0 if same reference.
1888   __ Subs(out, str, arg);
1889   __ B(&end, eq);
1890 
1891   if (mirror::kUseStringCompression) {
1892     // Load `count` fields of this and argument strings.
1893     __ Ldr(temp3, HeapOperand(str, count_offset));
1894     __ Ldr(temp2, HeapOperand(arg, count_offset));
1895     // Clean out compression flag from lengths.
1896     __ Lsr(temp0, temp3, 1u);
1897     __ Lsr(temp1, temp2, 1u);
1898   } else {
1899     // Load lengths of this and argument strings.
1900     __ Ldr(temp0, HeapOperand(str, count_offset));
1901     __ Ldr(temp1, HeapOperand(arg, count_offset));
1902   }
1903   // out = length diff.
1904   __ Subs(out, temp0, temp1);
1905   // temp0 = min(len(str), len(arg)).
1906   __ Csel(temp0, temp1, temp0, ge);
1907   // Shorter string is empty?
1908   __ Cbz(temp0, &end);
1909 
1910   if (mirror::kUseStringCompression) {
1911     // Check if both strings use the same compression style, so this comparison loop can be used.
1912     __ Eor(temp2, temp2, Operand(temp3));
1913     // Interleave with the compression flag extraction, which is needed for both paths,
1914     // and also set the flags, which are needed only for the different-compression path.
1915     __ Ands(temp3.W(), temp3.W(), Operand(1));
1916     __ Tbnz(temp2, 0, &different_compression);  // Does not use flags.
1917   }
1918   // Store offset of string value in preparation for comparison loop.
1919   __ Mov(temp1, value_offset);
1920   if (mirror::kUseStringCompression) {
1921     // For string compression, calculate the number of bytes to compare (not chars).
1922     // This could in theory exceed INT32_MAX, so treat temp0 as unsigned.
1923     __ Lsl(temp0, temp0, temp3);
1924   }
1925 
1926   UseScratchRegisterScope scratch_scope(masm);
1927   Register temp4 = scratch_scope.AcquireX();
1928 
1929   // Assertions that must hold in order to compare strings 8 bytes at a time.
1930   DCHECK_ALIGNED(value_offset, 8);
1931   static_assert(IsAligned<8>(kObjectAlignment), "String of odd length is not zero padded");
1932 
1933   const size_t char_size = DataType::Size(DataType::Type::kUint16);
1934   DCHECK_EQ(char_size, 2u);
1935 
1936   // Promote temp2 to an X reg, ready for LDR.
1937   temp2 = temp2.X();
1938 
1939   // Loop to compare 4x16-bit characters at a time (ok because of string data alignment).
1940   __ Bind(&loop);
1941   __ Ldr(temp4, MemOperand(str.X(), temp1.X()));
1942   __ Ldr(temp2, MemOperand(arg.X(), temp1.X()));
1943   __ Cmp(temp4, temp2);
1944   __ B(ne, &find_char_diff);
1945   __ Add(temp1, temp1, char_size * 4);
1946   // With string compression, we have compared 8 bytes, otherwise 4 chars.
1947   __ Subs(temp0, temp0, (mirror::kUseStringCompression) ? 8 : 4);
1948   __ B(&loop, hi);
1949   __ B(&end);
1950 
1951   // Promote temp1 to an X reg, ready for EOR.
1952   temp1 = temp1.X();
1953 
1954   // Find the single character difference.
1955   __ Bind(&find_char_diff);
1956   // Get the bit position of the first character that differs.
1957   __ Eor(temp1, temp2, temp4);
1958   __ Rbit(temp1, temp1);
1959   __ Clz(temp1, temp1);
1960 
1961   // If the number of chars remaining <= the index where the difference occurs (0-3), then
1962   // the difference occurs outside the remaining string data, so just return length diff (out).
1963   // Unlike ARM, we're doing the comparison in one go here, without the subtraction at the
1964   // find_char_diff_2nd_cmp path, so it doesn't matter whether the comparison is signed or
1965   // unsigned when string compression is disabled.
1966   // When it's enabled, the comparison must be unsigned.
1967   __ Cmp(temp0, Operand(temp1.W(), LSR, (mirror::kUseStringCompression) ? 3 : 4));
1968   __ B(ls, &end);
1969 
1970   // Extract the characters and calculate the difference.
1971   if (mirror::kUseStringCompression) {
1972     __ Bic(temp1, temp1, 0x7);
1973     __ Bic(temp1, temp1, Operand(temp3.X(), LSL, 3u));
1974   } else {
1975     __ Bic(temp1, temp1, 0xf);
1976   }
1977   __ Lsr(temp2, temp2, temp1);
1978   __ Lsr(temp4, temp4, temp1);
1979   if (mirror::kUseStringCompression) {
1980     // Prioritize the case of compressed strings and calculate such result first.
1981     __ Uxtb(temp1, temp4);
1982     __ Sub(out, temp1.W(), Operand(temp2.W(), UXTB));
1983     __ Tbz(temp3, 0u, &end);  // If actually compressed, we're done.
1984   }
1985   __ Uxth(temp4, temp4);
1986   __ Sub(out, temp4.W(), Operand(temp2.W(), UXTH));
1987 
1988   if (mirror::kUseStringCompression) {
1989     __ B(&end);
1990     __ Bind(&different_compression);
1991 
1992     // Comparison for different compression style.
1993     const size_t c_char_size = DataType::Size(DataType::Type::kInt8);
1994     DCHECK_EQ(c_char_size, 1u);
1995     temp1 = temp1.W();
1996     temp2 = temp2.W();
1997     temp4 = temp4.W();
1998 
1999     // `temp1` will hold the compressed data pointer, `temp2` the uncompressed data pointer.
2000     // Note that flags have been set by the `str` compression flag extraction to `temp3`
2001     // before branching to the `different_compression` label.
2002     __ Csel(temp1, str, arg, eq);   // Pointer to the compressed string.
2003     __ Csel(temp2, str, arg, ne);   // Pointer to the uncompressed string.
2004 
2005     // We want to free up the temp3, currently holding `str` compression flag, for comparison.
2006     // So, we move it to the bottom bit of the iteration count `temp0` which we then need to treat
2007     // as unsigned. Start by freeing the bit with a LSL and continue further down by a SUB which
2008     // will allow `subs temp0, #2; bhi different_compression_loop` to serve as the loop condition.
2009     __ Lsl(temp0, temp0, 1u);
2010 
2011     // Adjust temp1 and temp2 from string pointers to data pointers.
2012     __ Add(temp1, temp1, Operand(value_offset));
2013     __ Add(temp2, temp2, Operand(value_offset));
2014 
2015     // Complete the move of the compression flag.
2016     __ Sub(temp0, temp0, Operand(temp3));
2017 
2018     vixl::aarch64::Label different_compression_loop;
2019     vixl::aarch64::Label different_compression_diff;
2020 
2021     __ Bind(&different_compression_loop);
2022     __ Ldrb(temp4, MemOperand(temp1.X(), c_char_size, PostIndex));
2023     __ Ldrh(temp3, MemOperand(temp2.X(), char_size, PostIndex));
2024     __ Subs(temp4, temp4, Operand(temp3));
2025     __ B(&different_compression_diff, ne);
2026     __ Subs(temp0, temp0, 2);
2027     __ B(&different_compression_loop, hi);
2028     __ B(&end);
2029 
2030     // Calculate the difference.
2031     __ Bind(&different_compression_diff);
2032     __ Tst(temp0, Operand(1));
2033     static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
2034                   "Expecting 0=compressed, 1=uncompressed");
2035     __ Cneg(out, temp4, ne);
2036   }
2037 
2038   __ Bind(&end);
2039 
2040   if (can_slow_path) {
2041     __ Bind(slow_path->GetExitLabel());
2042   }
2043 }
2044 
2045 // The cut off for unrolling the loop in String.equals() intrinsic for const strings.
2046 // The normal loop plus the pre-header is 9 instructions without string compression and 12
2047 // instructions with string compression. We can compare up to 8 bytes in 4 instructions
2048 // (LDR+LDR+CMP+BNE) and up to 16 bytes in 5 instructions (LDP+LDP+CMP+CCMP+BNE). Allow up
2049 // to 10 instructions for the unrolled loop.
2050 constexpr size_t kShortConstStringEqualsCutoffInBytes = 32;
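// For example, a compressed const string of 32 bytes (or a 16-character uncompressed one) is
// compared with two LDP+LDP+CMP+CCMP+B.NE groups, i.e. exactly the 10-instruction budget above.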
2051 
2052 static const char* GetConstString(HInstruction* candidate, uint32_t* utf16_length) {
2053   if (candidate->IsLoadString()) {
2054     HLoadString* load_string = candidate->AsLoadString();
2055     const DexFile& dex_file = load_string->GetDexFile();
2056     return dex_file.GetStringDataAndUtf16Length(load_string->GetStringIndex(), utf16_length);
2057   }
2058   return nullptr;
2059 }
2060 
2061 void IntrinsicLocationsBuilderARM64::VisitStringEquals(HInvoke* invoke) {
2062   LocationSummary* locations =
2063       new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
2064   locations->SetInAt(0, Location::RequiresRegister());
2065   locations->SetInAt(1, Location::RequiresRegister());
2066 
2067   // For the generic implementation and for long const strings we need a temporary.
2068   // We do not need it for short const strings, up to 8 bytes, see code generation below.
2069   uint32_t const_string_length = 0u;
2070   const char* const_string = GetConstString(invoke->InputAt(0), &const_string_length);
2071   if (const_string == nullptr) {
2072     const_string = GetConstString(invoke->InputAt(1), &const_string_length);
2073   }
2074   bool is_compressed =
2075       mirror::kUseStringCompression &&
2076       const_string != nullptr &&
2077       mirror::String::DexFileStringAllASCII(const_string, const_string_length);
2078   if (const_string == nullptr || const_string_length > (is_compressed ? 8u : 4u)) {
2079     locations->AddTemp(Location::RequiresRegister());
2080   }
2081 
2082   // TODO: If the String.equals() is used only for an immediately following HIf, we can
2083   // mark it as emitted-at-use-site and emit branches directly to the appropriate blocks.
2084   // Then we shall need an extra temporary register instead of the output register.
2085   locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
2086 }
2087 
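// String.equals() code generation outline: reject null and non-String arguments, accept equal
// references, compare the `count` fields (length plus compression flag), then either unroll the
// comparison for a short const string or loop over the contents 8 bytes at a time.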
2088 void IntrinsicCodeGeneratorARM64::VisitStringEquals(HInvoke* invoke) {
2089   MacroAssembler* masm = GetVIXLAssembler();
2090   LocationSummary* locations = invoke->GetLocations();
2091 
2092   Register str = WRegisterFrom(locations->InAt(0));
2093   Register arg = WRegisterFrom(locations->InAt(1));
2094   Register out = XRegisterFrom(locations->Out());
2095 
2096   UseScratchRegisterScope scratch_scope(masm);
2097   Register temp = scratch_scope.AcquireW();
2098   Register temp1 = scratch_scope.AcquireW();
2099 
2100   vixl::aarch64::Label loop;
2101   vixl::aarch64::Label end;
2102   vixl::aarch64::Label return_true;
2103   vixl::aarch64::Label return_false;
2104 
2105   // Get offsets of count, value, and class fields within a string object.
2106   const int32_t count_offset = mirror::String::CountOffset().Int32Value();
2107   const int32_t value_offset = mirror::String::ValueOffset().Int32Value();
2108   const int32_t class_offset = mirror::Object::ClassOffset().Int32Value();
2109 
2110   // Note that the null check must have been done earlier.
2111   DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
2112 
2113   StringEqualsOptimizations optimizations(invoke);
2114   if (!optimizations.GetArgumentNotNull()) {
2115     // Check if input is null, return false if it is.
2116     __ Cbz(arg, &return_false);
2117   }
2118 
2119   // Reference equality check, return true if same reference.
2120   __ Cmp(str, arg);
2121   __ B(&return_true, eq);
2122 
2123   if (!optimizations.GetArgumentIsString()) {
2124     // Instanceof check for the argument by comparing class fields.
2125     // All string objects must have the same type since String cannot be subclassed.
2126     // Receiver must be a string object, so its class field is equal to all strings' class fields.
2127     // If the argument is a string object, its class field must be equal to receiver's class field.
2128     //
2129     // As the String class is expected to be non-movable, we can read the class
2130     // field from String.equals' arguments without read barriers.
2131     AssertNonMovableStringClass();
2132     // /* HeapReference<Class> */ temp = str->klass_
2133     __ Ldr(temp, MemOperand(str.X(), class_offset));
2134     // /* HeapReference<Class> */ temp1 = arg->klass_
2135     __ Ldr(temp1, MemOperand(arg.X(), class_offset));
2136     // Also, because we use the previously loaded class references only in the
2137     // following comparison, we don't need to unpoison them.
2138     __ Cmp(temp, temp1);
2139     __ B(&return_false, ne);
2140   }
2141 
2142   // Check if one of the inputs is a const string. Do not special-case both strings
2143   // being const, such cases should be handled by constant folding if needed.
2144   uint32_t const_string_length = 0u;
2145   const char* const_string = GetConstString(invoke->InputAt(0), &const_string_length);
2146   if (const_string == nullptr) {
2147     const_string = GetConstString(invoke->InputAt(1), &const_string_length);
2148     if (const_string != nullptr) {
2149       std::swap(str, arg);  // Make sure the const string is in `str`.
2150     }
2151   }
2152   bool is_compressed =
2153       mirror::kUseStringCompression &&
2154       const_string != nullptr &&
2155       mirror::String::DexFileStringAllASCII(const_string, const_string_length);
2156 
2157   if (const_string != nullptr) {
2158     // Load `count` field of the argument string and check if it matches the const string.
2159     // This also compares the compression style; if it differs, return false.
2160     __ Ldr(temp, MemOperand(arg.X(), count_offset));
2161     // Temporarily release temp1 as we may not be able to embed the flagged count in CMP immediate.
2162     scratch_scope.Release(temp1);
2163     __ Cmp(temp, Operand(mirror::String::GetFlaggedCount(const_string_length, is_compressed)));
2164     temp1 = scratch_scope.AcquireW();
2165     __ B(&return_false, ne);
2166   } else {
2167     // Load `count` fields of this and argument strings.
2168     __ Ldr(temp, MemOperand(str.X(), count_offset));
2169     __ Ldr(temp1, MemOperand(arg.X(), count_offset));
2170     // Check if `count` fields are equal, return false if they're not.
2171     // This also compares the compression style; if it differs, return false.
2172     __ Cmp(temp, temp1);
2173     __ B(&return_false, ne);
2174   }
2175 
2176   // Assertions that must hold in order to compare strings 8 bytes at a time.
2177   // Ok to do this because strings are zero-padded to kObjectAlignment.
2178   DCHECK_ALIGNED(value_offset, 8);
2179   static_assert(IsAligned<8>(kObjectAlignment), "String of odd length is not zero padded");
2180 
2181   if (const_string != nullptr &&
2182       const_string_length <= (is_compressed ? kShortConstStringEqualsCutoffInBytes
2183                                             : kShortConstStringEqualsCutoffInBytes / 2u)) {
2184     // Load and compare the contents. Though we know the contents of the short const string
2185     // at compile time, materializing constants may be more code than loading from memory.
2186     int32_t offset = value_offset;
2187     size_t remaining_bytes =
2188         RoundUp(is_compressed ? const_string_length : const_string_length * 2u, 8u);
2189     temp = temp.X();
2190     temp1 = temp1.X();
2191     while (remaining_bytes > sizeof(uint64_t)) {
2192       Register temp2 = XRegisterFrom(locations->GetTemp(0));
2193       __ Ldp(temp, temp1, MemOperand(str.X(), offset));
2194       __ Ldp(temp2, out, MemOperand(arg.X(), offset));
2195       __ Cmp(temp, temp2);
2196       __ Ccmp(temp1, out, NoFlag, eq);
2197       __ B(&return_false, ne);
2198       offset += 2u * sizeof(uint64_t);
2199       remaining_bytes -= 2u * sizeof(uint64_t);
2200     }
2201     if (remaining_bytes != 0u) {
2202       __ Ldr(temp, MemOperand(str.X(), offset));
2203       __ Ldr(temp1, MemOperand(arg.X(), offset));
2204       __ Cmp(temp, temp1);
2205       __ B(&return_false, ne);
2206     }
2207   } else {
2208     // Return true if both strings are empty. Even with string compression `count == 0` means empty.
2209     static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
2210                   "Expecting 0=compressed, 1=uncompressed");
2211     __ Cbz(temp, &return_true);
2212 
2213     if (mirror::kUseStringCompression) {
2214       // For string compression, calculate the number of bytes to compare (not chars).
2215       // This could in theory exceed INT32_MAX, so treat temp as unsigned.
2216       __ And(temp1, temp, Operand(1));    // Extract compression flag.
2217       __ Lsr(temp, temp, 1u);             // Extract length.
2218       __ Lsl(temp, temp, temp1);          // Calculate number of bytes to compare.
2219     }
2220 
2221     // Store offset of string value in preparation for comparison loop
2222     __ Mov(temp1, value_offset);
2223 
2224     temp1 = temp1.X();
2225     Register temp2 = XRegisterFrom(locations->GetTemp(0));
2226     // Loop to compare strings 8 bytes at a time starting at the front of the string.
2227     __ Bind(&loop);
2228     __ Ldr(out, MemOperand(str.X(), temp1));
2229     __ Ldr(temp2, MemOperand(arg.X(), temp1));
2230     __ Add(temp1, temp1, Operand(sizeof(uint64_t)));
2231     __ Cmp(out, temp2);
2232     __ B(&return_false, ne);
2233     // With string compression, we have compared 8 bytes, otherwise 4 chars.
2234     __ Sub(temp, temp, Operand(mirror::kUseStringCompression ? 8 : 4), SetFlags);
2235     __ B(&loop, hi);
2236   }
2237 
2238   // Return true and exit the function.
2239   // If loop does not result in returning false, we return true.
2240   __ Bind(&return_true);
2241   __ Mov(out, 1);
2242   __ B(&end);
2243 
2244   // Return false and exit the function.
2245   __ Bind(&return_false);
2246   __ Mov(out, 0);
2247   __ Bind(&end);
2248 }
2249 
2250 static void GenerateVisitStringIndexOf(HInvoke* invoke,
2251                                        MacroAssembler* masm,
2252                                        CodeGeneratorARM64* codegen,
2253                                        bool start_at_zero) {
2254   LocationSummary* locations = invoke->GetLocations();
2255 
2256   // Note that the null check must have been done earlier.
2257   DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
2258 
2259   // Check for code points > 0xFFFF. Either a slow-path check when we don't know statically,
2260   // or directly dispatch for a large constant, or omit slow-path for a small constant or a char.
2261   SlowPathCodeARM64* slow_path = nullptr;
2262   HInstruction* code_point = invoke->InputAt(1);
2263   if (code_point->IsIntConstant()) {
2264     if (static_cast<uint32_t>(code_point->AsIntConstant()->GetValue()) > 0xFFFFU) {
2265       // Always needs the slow-path. We could directly dispatch to it, but this case should be
2266       // rare, so for simplicity just put the full slow-path down and branch unconditionally.
2267       slow_path = new (codegen->GetScopedAllocator()) IntrinsicSlowPathARM64(invoke);
2268       codegen->AddSlowPath(slow_path);
2269       __ B(slow_path->GetEntryLabel());
2270       __ Bind(slow_path->GetExitLabel());
2271       return;
2272     }
2273   } else if (code_point->GetType() != DataType::Type::kUint16) {
2274     Register char_reg = WRegisterFrom(locations->InAt(1));
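    // Any code point that does not fit in 16 bits (treated as unsigned) cannot be a char
    // and requires the slow path.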
2275     __ Tst(char_reg, 0xFFFF0000);
2276     slow_path = new (codegen->GetScopedAllocator()) IntrinsicSlowPathARM64(invoke);
2277     codegen->AddSlowPath(slow_path);
2278     __ B(ne, slow_path->GetEntryLabel());
2279   }
2280 
2281   if (start_at_zero) {
2282     // Start-index = 0.
2283     Register tmp_reg = WRegisterFrom(locations->GetTemp(0));
2284     __ Mov(tmp_reg, 0);
2285   }
2286 
2287   codegen->InvokeRuntime(kQuickIndexOf, invoke, invoke->GetDexPc(), slow_path);
2288   CheckEntrypointTypes<kQuickIndexOf, int32_t, void*, uint32_t, uint32_t>();
2289 
2290   if (slow_path != nullptr) {
2291     __ Bind(slow_path->GetExitLabel());
2292   }
2293 }
2294 
2295 void IntrinsicLocationsBuilderARM64::VisitStringIndexOf(HInvoke* invoke) {
2296   LocationSummary* locations = new (allocator_) LocationSummary(
2297       invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified);
2298   // We have a hand-crafted assembly stub that follows the runtime calling convention. So it's
2299   // best to align the inputs accordingly.
2300   InvokeRuntimeCallingConvention calling_convention;
2301   locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
2302   locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(1)));
2303   locations->SetOut(calling_convention.GetReturnLocation(DataType::Type::kInt32));
2304 
2305   // Need to send start_index=0.
2306   locations->AddTemp(LocationFrom(calling_convention.GetRegisterAt(2)));
2307 }
2308 
2309 void IntrinsicCodeGeneratorARM64::VisitStringIndexOf(HInvoke* invoke) {
2310   GenerateVisitStringIndexOf(invoke, GetVIXLAssembler(), codegen_, /* start_at_zero= */ true);
2311 }
2312 
2313 void IntrinsicLocationsBuilderARM64::VisitStringIndexOfAfter(HInvoke* invoke) {
2314   LocationSummary* locations = new (allocator_) LocationSummary(
2315       invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified);
2316   // We have a hand-crafted assembly stub that follows the runtime calling convention. So it's
2317   // best to align the inputs accordingly.
2318   InvokeRuntimeCallingConvention calling_convention;
2319   locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
2320   locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(1)));
2321   locations->SetInAt(2, LocationFrom(calling_convention.GetRegisterAt(2)));
2322   locations->SetOut(calling_convention.GetReturnLocation(DataType::Type::kInt32));
2323 }
2324 
2325 void IntrinsicCodeGeneratorARM64::VisitStringIndexOfAfter(HInvoke* invoke) {
2326   GenerateVisitStringIndexOf(invoke, GetVIXLAssembler(), codegen_, /* start_at_zero= */ false);
2327 }
2328 
2329 void IntrinsicLocationsBuilderARM64::VisitStringNewStringFromBytes(HInvoke* invoke) {
2330   LocationSummary* locations = new (allocator_) LocationSummary(
2331       invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified);
2332   InvokeRuntimeCallingConvention calling_convention;
2333   locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
2334   locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(1)));
2335   locations->SetInAt(2, LocationFrom(calling_convention.GetRegisterAt(2)));
2336   locations->SetInAt(3, LocationFrom(calling_convention.GetRegisterAt(3)));
2337   locations->SetOut(calling_convention.GetReturnLocation(DataType::Type::kReference));
2338 }
2339 
2340 void IntrinsicCodeGeneratorARM64::VisitStringNewStringFromBytes(HInvoke* invoke) {
2341   MacroAssembler* masm = GetVIXLAssembler();
2342   LocationSummary* locations = invoke->GetLocations();
2343 
2344   Register byte_array = WRegisterFrom(locations->InAt(0));
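  // A null byte[] is not handled by the entrypoint below; branch to the slow path, where the
  // regular (non-intrinsic) call deals with the null argument.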
2345   __ Cmp(byte_array, 0);
2346   SlowPathCodeARM64* slow_path =
2347       new (codegen_->GetScopedAllocator()) IntrinsicSlowPathARM64(invoke);
2348   codegen_->AddSlowPath(slow_path);
2349   __ B(eq, slow_path->GetEntryLabel());
2350 
2351   codegen_->InvokeRuntime(kQuickAllocStringFromBytes, invoke, invoke->GetDexPc(), slow_path);
2352   CheckEntrypointTypes<kQuickAllocStringFromBytes, void*, void*, int32_t, int32_t, int32_t>();
2353   __ Bind(slow_path->GetExitLabel());
2354 }
2355 
2356 void IntrinsicLocationsBuilderARM64::VisitStringNewStringFromChars(HInvoke* invoke) {
2357   LocationSummary* locations =
2358       new (allocator_) LocationSummary(invoke, LocationSummary::kCallOnMainOnly, kIntrinsified);
2359   InvokeRuntimeCallingConvention calling_convention;
2360   locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
2361   locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(1)));
2362   locations->SetInAt(2, LocationFrom(calling_convention.GetRegisterAt(2)));
2363   locations->SetOut(calling_convention.GetReturnLocation(DataType::Type::kReference));
2364 }
2365 
2366 void IntrinsicCodeGeneratorARM64::VisitStringNewStringFromChars(HInvoke* invoke) {
2367   // No need to emit code checking whether `locations->InAt(2)` is a null
2368   // pointer, as callers of the native method
2369   //
2370   //   java.lang.StringFactory.newStringFromChars(int offset, int charCount, char[] data)
2371   //
2372   // all include a null check on `data` before calling that method.
2373   codegen_->InvokeRuntime(kQuickAllocStringFromChars, invoke, invoke->GetDexPc());
2374   CheckEntrypointTypes<kQuickAllocStringFromChars, void*, int32_t, int32_t, void*>();
2375 }
2376 
2377 void IntrinsicLocationsBuilderARM64::VisitStringNewStringFromString(HInvoke* invoke) {
2378   LocationSummary* locations = new (allocator_) LocationSummary(
2379       invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified);
2380   InvokeRuntimeCallingConvention calling_convention;
2381   locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
2382   locations->SetOut(calling_convention.GetReturnLocation(DataType::Type::kReference));
2383 }
2384 
2385 void IntrinsicCodeGeneratorARM64::VisitStringNewStringFromString(HInvoke* invoke) {
2386   MacroAssembler* masm = GetVIXLAssembler();
2387   LocationSummary* locations = invoke->GetLocations();
2388 
2389   Register string_to_copy = WRegisterFrom(locations->InAt(0));
2390   __ Cmp(string_to_copy, 0);
2391   SlowPathCodeARM64* slow_path =
2392       new (codegen_->GetScopedAllocator()) IntrinsicSlowPathARM64(invoke);
2393   codegen_->AddSlowPath(slow_path);
2394   __ B(eq, slow_path->GetEntryLabel());
2395 
2396   codegen_->InvokeRuntime(kQuickAllocStringFromString, invoke, invoke->GetDexPc(), slow_path);
2397   CheckEntrypointTypes<kQuickAllocStringFromString, void*, void*>();
2398   __ Bind(slow_path->GetExitLabel());
2399 }
2400 
2401 static void CreateFPToFPCallLocations(ArenaAllocator* allocator, HInvoke* invoke) {
2402   DCHECK_EQ(invoke->GetNumberOfArguments(), 1U);
2403   DCHECK(DataType::IsFloatingPointType(invoke->InputAt(0)->GetType()));
2404   DCHECK(DataType::IsFloatingPointType(invoke->GetType()));
2405 
2406   LocationSummary* const locations =
2407       new (allocator) LocationSummary(invoke, LocationSummary::kCallOnMainOnly, kIntrinsified);
2408   InvokeRuntimeCallingConvention calling_convention;
2409 
2410   locations->SetInAt(0, LocationFrom(calling_convention.GetFpuRegisterAt(0)));
2411   locations->SetOut(calling_convention.GetReturnLocation(invoke->GetType()));
2412 }
2413 
2414 static void CreateFPFPToFPCallLocations(ArenaAllocator* allocator, HInvoke* invoke) {
2415   DCHECK_EQ(invoke->GetNumberOfArguments(), 2U);
2416   DCHECK(DataType::IsFloatingPointType(invoke->InputAt(0)->GetType()));
2417   DCHECK(DataType::IsFloatingPointType(invoke->InputAt(1)->GetType()));
2418   DCHECK(DataType::IsFloatingPointType(invoke->GetType()));
2419 
2420   LocationSummary* const locations =
2421       new (allocator) LocationSummary(invoke, LocationSummary::kCallOnMainOnly, kIntrinsified);
2422   InvokeRuntimeCallingConvention calling_convention;
2423 
2424   locations->SetInAt(0, LocationFrom(calling_convention.GetFpuRegisterAt(0)));
2425   locations->SetInAt(1, LocationFrom(calling_convention.GetFpuRegisterAt(1)));
2426   locations->SetOut(calling_convention.GetReturnLocation(invoke->GetType()));
2427 }
2428 
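// Unlike the two call-based helpers above, the three-operand FP shape needs no runtime call: it is
// used by intrinsics that are expanded inline (e.g. Math.fma, which maps to a single FMADD), so
// plain FPU registers are sufficient.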
2429 static void CreateFPFPFPToFPLocations(ArenaAllocator* allocator, HInvoke* invoke) {
2430   DCHECK_EQ(invoke->GetNumberOfArguments(), 3U);
2431   DCHECK(DataType::IsFloatingPointType(invoke->InputAt(0)->GetType()));
2432   DCHECK(DataType::IsFloatingPointType(invoke->InputAt(1)->GetType()));
2433   DCHECK(DataType::IsFloatingPointType(invoke->InputAt(2)->GetType()));
2434   DCHECK(DataType::IsFloatingPointType(invoke->GetType()));
2435 
2436   LocationSummary* const locations =
2437       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
2438 
2439   locations->SetInAt(0, Location::RequiresFpuRegister());
2440   locations->SetInAt(1, Location::RequiresFpuRegister());
2441   locations->SetInAt(2, Location::RequiresFpuRegister());
2442   locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
2443 }
2444 
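// The Math intrinsics below simply hand the computation to the corresponding quick entrypoint;
// the FP arguments and the result are already in the runtime calling-convention registers.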
2445 static void GenFPToFPCall(HInvoke* invoke,
2446                           CodeGeneratorARM64* codegen,
2447                           QuickEntrypointEnum entry) {
2448   codegen->InvokeRuntime(entry, invoke, invoke->GetDexPc());
2449 }
2450 
2451 void IntrinsicLocationsBuilderARM64::VisitMathCos(HInvoke* invoke) {
2452   CreateFPToFPCallLocations(allocator_, invoke);
2453 }
2454 
2455 void IntrinsicCodeGeneratorARM64::VisitMathCos(HInvoke* invoke) {
2456   GenFPToFPCall(invoke, codegen_, kQuickCos);
2457 }
2458 
2459 void IntrinsicLocationsBuilderARM64::VisitMathSin(HInvoke* invoke) {
2460   CreateFPToFPCallLocations(allocator_, invoke);
2461 }
2462 
2463 void IntrinsicCodeGeneratorARM64::VisitMathSin(HInvoke* invoke) {
2464   GenFPToFPCall(invoke, codegen_, kQuickSin);
2465 }
2466 
2467 void IntrinsicLocationsBuilderARM64::VisitMathAcos(HInvoke* invoke) {
2468   CreateFPToFPCallLocations(allocator_, invoke);
2469 }
2470 
2471 void IntrinsicCodeGeneratorARM64::VisitMathAcos(HInvoke* invoke) {
2472   GenFPToFPCall(invoke, codegen_, kQuickAcos);
2473 }
2474 
2475 void IntrinsicLocationsBuilderARM64::VisitMathAsin(HInvoke* invoke) {
2476   CreateFPToFPCallLocations(allocator_, invoke);
2477 }
2478 
2479 void IntrinsicCodeGeneratorARM64::VisitMathAsin(HInvoke* invoke) {
2480   GenFPToFPCall(invoke, codegen_, kQuickAsin);
2481 }
2482 
2483 void IntrinsicLocationsBuilderARM64::VisitMathAtan(HInvoke* invoke) {
2484   CreateFPToFPCallLocations(allocator_, invoke);
2485 }
2486 
2487 void IntrinsicCodeGeneratorARM64::VisitMathAtan(HInvoke* invoke) {
2488   GenFPToFPCall(invoke, codegen_, kQuickAtan);
2489 }
2490 
2491 void IntrinsicLocationsBuilderARM64::VisitMathCbrt(HInvoke* invoke) {
2492   CreateFPToFPCallLocations(allocator_, invoke);
2493 }
2494 
2495 void IntrinsicCodeGeneratorARM64::VisitMathCbrt(HInvoke* invoke) {
2496   GenFPToFPCall(invoke, codegen_, kQuickCbrt);
2497 }
2498 
2499 void IntrinsicLocationsBuilderARM64::VisitMathCosh(HInvoke* invoke) {
2500   CreateFPToFPCallLocations(allocator_, invoke);
2501 }
2502 
2503 void IntrinsicCodeGeneratorARM64::VisitMathCosh(HInvoke* invoke) {
2504   GenFPToFPCall(invoke, codegen_, kQuickCosh);
2505 }
2506 
2507 void IntrinsicLocationsBuilderARM64::VisitMathExp(HInvoke* invoke) {
2508   CreateFPToFPCallLocations(allocator_, invoke);
2509 }
2510 
2511 void IntrinsicCodeGeneratorARM64::VisitMathExp(HInvoke* invoke) {
2512   GenFPToFPCall(invoke, codegen_, kQuickExp);
2513 }
2514 
2515 void IntrinsicLocationsBuilderARM64::VisitMathExpm1(HInvoke* invoke) {
2516   CreateFPToFPCallLocations(allocator_, invoke);
2517 }
2518 
2519 void IntrinsicCodeGeneratorARM64::VisitMathExpm1(HInvoke* invoke) {
2520   GenFPToFPCall(invoke, codegen_, kQuickExpm1);
2521 }
2522 
2523 void IntrinsicLocationsBuilderARM64::VisitMathLog(HInvoke* invoke) {
2524   CreateFPToFPCallLocations(allocator_, invoke);
2525 }
2526 
2527 void IntrinsicCodeGeneratorARM64::VisitMathLog(HInvoke* invoke) {
2528   GenFPToFPCall(invoke, codegen_, kQuickLog);
2529 }
2530 
2531 void IntrinsicLocationsBuilderARM64::VisitMathLog10(HInvoke* invoke) {
2532   CreateFPToFPCallLocations(allocator_, invoke);
2533 }
2534 
2535 void IntrinsicCodeGeneratorARM64::VisitMathLog10(HInvoke* invoke) {
2536   GenFPToFPCall(invoke, codegen_, kQuickLog10);
2537 }
2538 
2539 void IntrinsicLocationsBuilderARM64::VisitMathSinh(HInvoke* invoke) {
2540   CreateFPToFPCallLocations(allocator_, invoke);
2541 }
2542 
2543 void IntrinsicCodeGeneratorARM64::VisitMathSinh(HInvoke* invoke) {
2544   GenFPToFPCall(invoke, codegen_, kQuickSinh);
2545 }
2546 
2547 void IntrinsicLocationsBuilderARM64::VisitMathTan(HInvoke* invoke) {
2548   CreateFPToFPCallLocations(allocator_, invoke);
2549 }
2550 
2551 void IntrinsicCodeGeneratorARM64::VisitMathTan(HInvoke* invoke) {
2552   GenFPToFPCall(invoke, codegen_, kQuickTan);
2553 }
2554 
2555 void IntrinsicLocationsBuilderARM64::VisitMathTanh(HInvoke* invoke) {
2556   CreateFPToFPCallLocations(allocator_, invoke);
2557 }
2558 
2559 void IntrinsicCodeGeneratorARM64::VisitMathTanh(HInvoke* invoke) {
2560   GenFPToFPCall(invoke, codegen_, kQuickTanh);
2561 }
2562 
2563 void IntrinsicLocationsBuilderARM64::VisitMathAtan2(HInvoke* invoke) {
2564   CreateFPFPToFPCallLocations(allocator_, invoke);
2565 }
2566 
2567 void IntrinsicCodeGeneratorARM64::VisitMathAtan2(HInvoke* invoke) {
2568   GenFPToFPCall(invoke, codegen_, kQuickAtan2);
2569 }
2570 
2571 void IntrinsicLocationsBuilderARM64::VisitMathPow(HInvoke* invoke) {
2572   CreateFPFPToFPCallLocations(allocator_, invoke);
2573 }
2574 
2575 void IntrinsicCodeGeneratorARM64::VisitMathPow(HInvoke* invoke) {
2576   GenFPToFPCall(invoke, codegen_, kQuickPow);
2577 }
2578 
2579 void IntrinsicLocationsBuilderARM64::VisitMathHypot(HInvoke* invoke) {
2580   CreateFPFPToFPCallLocations(allocator_, invoke);
2581 }
2582 
2583 void IntrinsicCodeGeneratorARM64::VisitMathHypot(HInvoke* invoke) {
2584   GenFPToFPCall(invoke, codegen_, kQuickHypot);
2585 }
2586 
2587 void IntrinsicLocationsBuilderARM64::VisitMathNextAfter(HInvoke* invoke) {
2588   CreateFPFPToFPCallLocations(allocator_, invoke);
2589 }
2590 
2591 void IntrinsicCodeGeneratorARM64::VisitMathNextAfter(HInvoke* invoke) {
2592   GenFPToFPCall(invoke, codegen_, kQuickNextAfter);
2593 }
2594 
2595 void IntrinsicLocationsBuilderARM64::VisitStringGetCharsNoCheck(HInvoke* invoke) {
2596   LocationSummary* locations =
2597       new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
2598   locations->SetInAt(0, Location::RequiresRegister());
2599   locations->SetInAt(1, Location::RequiresRegister());
2600   locations->SetInAt(2, Location::RequiresRegister());
2601   locations->SetInAt(3, Location::RequiresRegister());
2602   locations->SetInAt(4, Location::RequiresRegister());
2603 
2604   locations->AddTemp(Location::RequiresRegister());
2605   locations->AddTemp(Location::RequiresRegister());
2606   locations->AddTemp(Location::RequiresRegister());
2607 }
2608 
2609 void IntrinsicCodeGeneratorARM64::VisitStringGetCharsNoCheck(HInvoke* invoke) {
2610   MacroAssembler* masm = GetVIXLAssembler();
2611   LocationSummary* locations = invoke->GetLocations();
2612 
2613   // Check assumption that sizeof(Char) is 2 (used in scaling below).
2614   const size_t char_size = DataType::Size(DataType::Type::kUint16);
2615   DCHECK_EQ(char_size, 2u);
2616 
2617   // Location of data in char array buffer.
2618   const uint32_t data_offset = mirror::Array::DataOffset(char_size).Uint32Value();
2619 
2620   // Location of char array data in string.
2621   const uint32_t value_offset = mirror::String::ValueOffset().Uint32Value();
2622 
2623   // void getCharsNoCheck(int srcBegin, int srcEnd, char[] dst, int dstBegin);
2624   // Since getChars() calls getCharsNoCheck(), we use registers rather than constants.
2625   Register srcObj = XRegisterFrom(locations->InAt(0));
2626   Register srcBegin = XRegisterFrom(locations->InAt(1));
2627   Register srcEnd = XRegisterFrom(locations->InAt(2));
2628   Register dstObj = XRegisterFrom(locations->InAt(3));
2629   Register dstBegin = XRegisterFrom(locations->InAt(4));
2630 
2631   Register src_ptr = XRegisterFrom(locations->GetTemp(0));
2632   Register num_chr = XRegisterFrom(locations->GetTemp(1));
2633   Register tmp1 = XRegisterFrom(locations->GetTemp(2));
2634 
2635   UseScratchRegisterScope temps(masm);
2636   Register dst_ptr = temps.AcquireX();
2637   Register tmp2 = temps.AcquireX();
2638 
2639   vixl::aarch64::Label done;
2640   vixl::aarch64::Label compressed_string_vector_loop;
2641   vixl::aarch64::Label compressed_string_remainder;
2642   __ Sub(num_chr, srcEnd, srcBegin);
2643   // Early out for valid zero-length retrievals.
2644   __ Cbz(num_chr, &done);
2645 
2646   // dst address start to copy to.
2647   __ Add(dst_ptr, dstObj, Operand(data_offset));
2648   __ Add(dst_ptr, dst_ptr, Operand(dstBegin, LSL, 1));
2649 
2650   // src address to copy from.
2651   __ Add(src_ptr, srcObj, Operand(value_offset));
2652   vixl::aarch64::Label compressed_string_preloop;
2653   if (mirror::kUseStringCompression) {
2654     // Location of count in string.
2655     const uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
2656     // String's length.
2657     __ Ldr(tmp2, MemOperand(srcObj, count_offset));
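    // Bit 0 of `count` is the compression flag (0 means compressed), so branch to the
    // byte-wise preloop for compressed strings.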
2658     __ Tbz(tmp2, 0, &compressed_string_preloop);
2659   }
2660   __ Add(src_ptr, src_ptr, Operand(srcBegin, LSL, 1));
2661 
2662   // Do the copy.
2663   vixl::aarch64::Label loop;
2664   vixl::aarch64::Label remainder;
2665 
2666   // Save repairing the value of num_chr on the < 8 character path.
2667   __ Subs(tmp1, num_chr, 8);
2668   __ B(lt, &remainder);
2669 
2670   // Keep the result of the earlier subs, we are going to fetch at least 8 characters.
2671   __ Mov(num_chr, tmp1);
2672 
2673   // Main loop, used for longer lengths: loads and stores 8 x 16-bit characters at a time.
2674   // (Unaligned addresses are acceptable here and not worth inlining extra code to rectify.)
2675   __ Bind(&loop);
2676   __ Ldp(tmp1, tmp2, MemOperand(src_ptr, char_size * 8, PostIndex));
2677   __ Subs(num_chr, num_chr, 8);
2678   __ Stp(tmp1, tmp2, MemOperand(dst_ptr, char_size * 8, PostIndex));
2679   __ B(ge, &loop);
2680 
2681   __ Adds(num_chr, num_chr, 8);
2682   __ B(eq, &done);
2683 
2684   // Main loop for < 8 character case and remainder handling. Loads and stores one
2685   // 16-bit Java character at a time.
2686   __ Bind(&remainder);
2687   __ Ldrh(tmp1, MemOperand(src_ptr, char_size, PostIndex));
2688   __ Subs(num_chr, num_chr, 1);
2689   __ Strh(tmp1, MemOperand(dst_ptr, char_size, PostIndex));
2690   __ B(gt, &remainder);
2691   __ B(&done);
2692 
2693   if (mirror::kUseStringCompression) {
2694     // For compressed strings, acquire a SIMD temporary register.
2695     VRegister vtmp1 = temps.AcquireVRegisterOfSize(kQRegSize);
2696     const size_t c_char_size = DataType::Size(DataType::Type::kInt8);
2697     DCHECK_EQ(c_char_size, 1u);
2698     __ Bind(&compressed_string_preloop);
2699     __ Add(src_ptr, src_ptr, Operand(srcBegin));
2700 
2701     // Save repairing the value of num_chr on the < 8 character path.
2702     __ Subs(tmp1, num_chr, 8);
2703     __ B(lt, &compressed_string_remainder);
2704 
2705     // Keep the result of the earlier subs, we are going to fetch at least 8 characters.
2706     __ Mov(num_chr, tmp1);
2707 
2708     // Main loop for a compressed src: copies 8 characters at a time, widening each
2709     // from 8-bit to 16-bit using SIMD instructions.
2710     __ Bind(&compressed_string_vector_loop);
2711     __ Ld1(vtmp1.V8B(), MemOperand(src_ptr, c_char_size * 8, PostIndex));
2712     __ Subs(num_chr, num_chr, 8);
2713     __ Uxtl(vtmp1.V8H(), vtmp1.V8B());
2714     __ St1(vtmp1.V8H(), MemOperand(dst_ptr, char_size * 8, PostIndex));
2715     __ B(ge, &compressed_string_vector_loop);
2716 
2717     __ Adds(num_chr, num_chr, 8);
2718     __ B(eq, &done);
2719 
2720     // Loop for the < 8 character case and remainder handling with a compressed src.
2721     // Copies one character at a time, widening it from 8-bit to 16-bit.
2722     __ Bind(&compressed_string_remainder);
2723     __ Ldrb(tmp1, MemOperand(src_ptr, c_char_size, PostIndex));
2724     __ Strh(tmp1, MemOperand(dst_ptr, char_size, PostIndex));
2725     __ Subs(num_chr, num_chr, Operand(1));
2726     __ B(gt, &compressed_string_remainder);
2727   }
2728 
2729   __ Bind(&done);
2730 }
2731 
2732 // This value is greater than ARRAYCOPY_SHORT_CHAR_ARRAY_THRESHOLD in libcore,
2733 // so if we choose to jump to the slow path we will end up in the native implementation.
2734 static constexpr int32_t kSystemArrayCopyCharThreshold = 192;
2735 
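// Position/length inputs that are constants fitting in an ADD/SUB immediate are kept as
// constants; anything else is materialized in a register.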
2736 static Location LocationForSystemArrayCopyInput(HInstruction* input) {
2737   HIntConstant* const_input = input->AsIntConstantOrNull();
2738   if (const_input != nullptr && vixl::aarch64::Assembler::IsImmAddSub(const_input->GetValue())) {
2739     return Location::ConstantLocation(const_input);
2740   } else {
2741     return Location::RequiresRegister();
2742   }
2743 }
2744 
2745 void IntrinsicLocationsBuilderARM64::VisitSystemArrayCopyChar(HInvoke* invoke) {
2746   // Check to see if we have known failures that will cause us to have to bail out
2747   // to the runtime, and just generate the runtime call directly.
2748   HIntConstant* src_pos = invoke->InputAt(1)->AsIntConstantOrNull();
2749   HIntConstant* dst_pos = invoke->InputAt(3)->AsIntConstantOrNull();
2750 
2751   // The positions must be non-negative.
2752   if ((src_pos != nullptr && src_pos->GetValue() < 0) ||
2753       (dst_pos != nullptr && dst_pos->GetValue() < 0)) {
2754     // We will have to fail anyways.
2755     return;
2756   }
2757 
2758   // The length must be >= 0 and not so long that we would (currently) prefer libcore's
2759   // native implementation.
2760   HIntConstant* length = invoke->InputAt(4)->AsIntConstantOrNull();
2761   if (length != nullptr) {
2762     int32_t len = length->GetValue();
2763     if (len < 0 || len > kSystemArrayCopyCharThreshold) {
2764       // Just call as normal.
2765       return;
2766     }
2767   }
2768 
2769   ArenaAllocator* allocator = invoke->GetBlock()->GetGraph()->GetAllocator();
2770   LocationSummary* locations =
2771       new (allocator) LocationSummary(invoke, LocationSummary::kCallOnSlowPath, kIntrinsified);
2772   // arraycopy(char[] src, int src_pos, char[] dst, int dst_pos, int length).
2773   locations->SetInAt(0, Location::RequiresRegister());
2774   locations->SetInAt(1, LocationForSystemArrayCopyInput(invoke->InputAt(1)));
2775   locations->SetInAt(2, Location::RequiresRegister());
2776   locations->SetInAt(3, LocationForSystemArrayCopyInput(invoke->InputAt(3)));
2777   locations->SetInAt(4, LocationForSystemArrayCopyInput(invoke->InputAt(4)));
2778 
2779   locations->AddTemp(Location::RequiresRegister());
2780   locations->AddTemp(Location::RequiresRegister());
2781   locations->AddTemp(Location::RequiresRegister());
2782 }
2783 
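// Emits the range check for one array of a System.arraycopy call: unless already known, verifies
// that `pos >= 0` and that `length(array) - pos >= length`, branching to `slow_path` otherwise.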
2784 static void CheckSystemArrayCopyPosition(MacroAssembler* masm,
2785                                          Register array,
2786                                          Location pos,
2787                                          Location length,
2788                                          SlowPathCodeARM64* slow_path,
2789                                          Register temp,
2790                                          bool length_is_array_length,
2791                                          bool position_sign_checked) {
2792   const int32_t length_offset = mirror::Array::LengthOffset().Int32Value();
2793   if (pos.IsConstant()) {
2794     int32_t pos_const = pos.GetConstant()->AsIntConstant()->GetValue();
2795     if (pos_const == 0) {
2796       if (!length_is_array_length) {
2797         // Check that length(array) >= length.
2798         __ Ldr(temp, MemOperand(array, length_offset));
2799         __ Cmp(temp, OperandFrom(length, DataType::Type::kInt32));
2800         __ B(slow_path->GetEntryLabel(), lt);
2801       }
2802     } else {
2803       // Calculate length(array) - pos.
2804       // Both operands are known to be non-negative `int32_t`, so the difference cannot underflow
2805       // as `int32_t`. If the result is negative, the B.LT below shall go to the slow path.
2806       __ Ldr(temp, MemOperand(array, length_offset));
2807       __ Sub(temp, temp, pos_const);
2808 
2809       // Check that (length(array) - pos) >= length.
2810       __ Cmp(temp, OperandFrom(length, DataType::Type::kInt32));
2811       __ B(slow_path->GetEntryLabel(), lt);
2812     }
2813   } else if (length_is_array_length) {
2814     // The only way the copy can succeed is if pos is zero.
2815     __ Cbnz(WRegisterFrom(pos), slow_path->GetEntryLabel());
2816   } else {
2817     // Check that pos >= 0.
2818     Register pos_reg = WRegisterFrom(pos);
2819     if (!position_sign_checked) {
2820       __ Tbnz(pos_reg, pos_reg.GetSizeInBits() - 1, slow_path->GetEntryLabel());
2821     }
2822 
2823     // Calculate length(array) - pos.
2824     // Both operands are known to be non-negative `int32_t`, so the difference cannot underflow
2825     // as `int32_t`. If the result is negative, the B.LT below shall go to the slow path.
2826     __ Ldr(temp, MemOperand(array, length_offset));
2827     __ Sub(temp, temp, pos_reg);
2828 
2829     // Check that (length(array) - pos) >= length.
2830     __ Cmp(temp, OperandFrom(length, DataType::Type::kInt32));
2831     __ B(slow_path->GetEntryLabel(), lt);
2832   }
2833 }
2834 
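// Computes `dest = base + data_offset + pos * sizeof(type)`, folding a constant `pos` into the
// immediate offset.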
2835 static void GenArrayAddress(MacroAssembler* masm,
2836                             Register dest,
2837                             Register base,
2838                             Location pos,
2839                             DataType::Type type,
2840                             int32_t data_offset) {
2841   if (pos.IsConstant()) {
2842     int32_t constant = pos.GetConstant()->AsIntConstant()->GetValue();
2843     __ Add(dest, base, DataType::Size(type) * constant + data_offset);
2844   } else {
2845     if (data_offset != 0) {
2846       __ Add(dest, base, data_offset);
2847       base = dest;
2848     }
2849     __ Add(dest, base, Operand(XRegisterFrom(pos), LSL, DataType::SizeShift(type)));
2850   }
2851 }
2852 
2853 // Compute base source address, base destination address, and end
2854 // source address for System.arraycopy* intrinsics in `src_base`,
2855 // `dst_base` and `src_end` respectively.
2856 static void GenSystemArrayCopyAddresses(MacroAssembler* masm,
2857                                         DataType::Type type,
2858                                         Register src,
2859                                         Location src_pos,
2860                                         Register dst,
2861                                         Location dst_pos,
2862                                         Location copy_length,
2863                                         Register src_base,
2864                                         Register dst_base,
2865                                         Register src_end) {
2866   // This routine is used by the SystemArrayCopy and the SystemArrayCopyChar intrinsics.
2867   DCHECK(type == DataType::Type::kReference || type == DataType::Type::kUint16)
2868       << "Unexpected element type: " << type;
2869   const int32_t element_size = DataType::Size(type);
2870   const uint32_t data_offset = mirror::Array::DataOffset(element_size).Uint32Value();
2871 
2872   GenArrayAddress(masm, src_base, src, src_pos, type, data_offset);
2873   GenArrayAddress(masm, dst_base, dst, dst_pos, type, data_offset);
2874   if (src_end.IsValid()) {
2875     GenArrayAddress(masm, src_end, src_base, copy_length, type, /*data_offset=*/ 0);
2876   }
2877 }
2878 
2879 void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopyChar(HInvoke* invoke) {
2880   MacroAssembler* masm = GetVIXLAssembler();
2881   LocationSummary* locations = invoke->GetLocations();
2882   Register src = XRegisterFrom(locations->InAt(0));
2883   Location src_pos = locations->InAt(1);
2884   Register dst = XRegisterFrom(locations->InAt(2));
2885   Location dst_pos = locations->InAt(3);
2886   Location length = locations->InAt(4);
2887 
2888   SlowPathCodeARM64* slow_path =
2889       new (codegen_->GetScopedAllocator()) IntrinsicSlowPathARM64(invoke);
2890   codegen_->AddSlowPath(slow_path);
2891 
2892   // If source and destination are the same, take the slow path. Overlapping copy regions must be
2893   // copied in reverse and we can't know in all cases if it's needed.
2894   __ Cmp(src, dst);
2895   __ B(slow_path->GetEntryLabel(), eq);
2896 
2897   // Bail out if the source is null.
2898   __ Cbz(src, slow_path->GetEntryLabel());
2899 
2900   // Bail out if the destination is null.
2901   __ Cbz(dst, slow_path->GetEntryLabel());
2902 
2903   if (!length.IsConstant()) {
2904     // Merge the following two comparisons into one:
2905     //   If the length is negative, bail out (delegate to libcore's native implementation).
2906     //   If the length > kSystemArrayCopyCharThreshold then (currently) prefer libcore's
2907     //   native implementation.
2908     __ Cmp(WRegisterFrom(length), kSystemArrayCopyCharThreshold);
2909     __ B(slow_path->GetEntryLabel(), hi);
2910   } else {
2911     // We have already checked in the LocationsBuilder for the constant case.
2912     DCHECK_GE(length.GetConstant()->AsIntConstant()->GetValue(), 0);
2913     DCHECK_LE(length.GetConstant()->AsIntConstant()->GetValue(), kSystemArrayCopyCharThreshold);
2914   }
2915 
2916   Register src_curr_addr = WRegisterFrom(locations->GetTemp(0));
2917   Register dst_curr_addr = WRegisterFrom(locations->GetTemp(1));
2918   Register src_stop_addr = WRegisterFrom(locations->GetTemp(2));
2919 
2920   CheckSystemArrayCopyPosition(masm,
2921                                src,
2922                                src_pos,
2923                                length,
2924                                slow_path,
2925                                src_curr_addr,
2926                                /*length_is_array_length=*/ false,
2927                                /*position_sign_checked=*/ false);
2928 
2929   CheckSystemArrayCopyPosition(masm,
2930                                dst,
2931                                dst_pos,
2932                                length,
2933                                slow_path,
2934                                src_curr_addr,
2935                                /*length_is_array_length=*/ false,
2936                                /*position_sign_checked=*/ false);
2937 
2938   src_curr_addr = src_curr_addr.X();
2939   dst_curr_addr = dst_curr_addr.X();
2940   src_stop_addr = src_stop_addr.X();
2941 
2942   GenSystemArrayCopyAddresses(masm,
2943                               DataType::Type::kUint16,
2944                               src,
2945                               src_pos,
2946                               dst,
2947                               dst_pos,
2948                               length,
2949                               src_curr_addr,
2950                               dst_curr_addr,
2951                               Register());
2952 
2953   // Iterate over the arrays and do a raw copy of the chars.
2954   const int32_t char_size = DataType::Size(DataType::Type::kUint16);
2955   UseScratchRegisterScope temps(masm);
2956 
2957   // We split processing of the array in two parts: head and tail.
2958   // A first loop handles the head by copying a block of characters per
2959   // iteration (see: chars_per_block).
2960   // A second loop handles the tail by copying the remaining characters.
2961   // If the copy length is not constant, we copy them one-by-one.
2962   // If the copy length is constant, we optimize by always unrolling the tail
2963   // loop, and also unrolling the head loop when the copy length is small (see:
2964   // unroll_threshold).
2965   //
2966   // Both loops are inverted for better performance, meaning they are
2967   // implemented as conditional do-while loops.
2968   // Here, the loop condition is first checked to determine if there are
2969   // sufficient chars to run an iteration, then we enter the do-while: an
2970   // iteration is performed followed by a conditional branch only if another
2971   // iteration is necessary. As opposed to a standard while-loop, this inversion
2972   // can save some branching (e.g. we don't branch back to the initial condition
2973   // at the end of every iteration only to potentially immediately branch
2974   // again).
2975   //
2976   // A full block of chars is subtracted and added before and after the head
2977   // loop, respectively. This ensures that any remaining length after each
2978   // head loop iteration means there is a full block remaining, reducing the
2979   // number of conditional checks required on every iteration.
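  // For example, a non-constant copy of 11 chars runs the head loop twice (copying 8 chars)
  // and then the tail loop three times (copying the remaining 3 chars one at a time).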
2980   constexpr int32_t chars_per_block = 4;
2981   constexpr int32_t unroll_threshold = 2 * chars_per_block;
2982   vixl::aarch64::Label loop1, loop2, pre_loop2, done;
2983 
2984   Register length_tmp = src_stop_addr.W();
2985   Register tmp = temps.AcquireRegisterOfSize(char_size * chars_per_block * kBitsPerByte);
2986 
2987   auto emitHeadLoop = [&]() {
2988     __ Bind(&loop1);
2989     __ Ldr(tmp, MemOperand(src_curr_addr, char_size * chars_per_block, PostIndex));
2990     __ Subs(length_tmp, length_tmp, chars_per_block);
2991     __ Str(tmp, MemOperand(dst_curr_addr, char_size * chars_per_block, PostIndex));
2992     __ B(&loop1, ge);
2993   };
2994 
2995   auto emitTailLoop = [&]() {
2996     __ Bind(&loop2);
2997     __ Ldrh(tmp, MemOperand(src_curr_addr, char_size, PostIndex));
2998     __ Subs(length_tmp, length_tmp, 1);
2999     __ Strh(tmp, MemOperand(dst_curr_addr, char_size, PostIndex));
3000     __ B(&loop2, gt);
3001   };
3002 
3003   auto emitUnrolledTailLoop = [&](const int32_t tail_length) {
3004     DCHECK_LT(tail_length, 4);
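    // E.g. tail_length == 2 copies one 32-bit word; tail_length == 3 additionally copies a
    // halfword at byte offset 4.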
3005 
3006     // Don't use post-index addressing, and instead add a constant offset later.
3007     if ((tail_length & 2) != 0) {
3008       __ Ldr(tmp.W(), MemOperand(src_curr_addr));
3009       __ Str(tmp.W(), MemOperand(dst_curr_addr));
3010     }
3011     if ((tail_length & 1) != 0) {
3012       const int32_t offset = (tail_length & ~1) * char_size;
3013       __ Ldrh(tmp, MemOperand(src_curr_addr, offset));
3014       __ Strh(tmp, MemOperand(dst_curr_addr, offset));
3015     }
3016   };
3017 
3018   if (length.IsConstant()) {
3019     const int32_t constant_length = length.GetConstant()->AsIntConstant()->GetValue();
3020     if (constant_length >= unroll_threshold) {
3021       __ Mov(length_tmp, constant_length - chars_per_block);
3022       emitHeadLoop();
3023     } else {
3024       static_assert(unroll_threshold == 8, "The unroll_threshold must be 8.");
3025       // Fully unroll both the head and tail loops.
3026       if ((constant_length & 4) != 0) {
3027         __ Ldr(tmp, MemOperand(src_curr_addr, 4 * char_size, PostIndex));
3028         __ Str(tmp, MemOperand(dst_curr_addr, 4 * char_size, PostIndex));
3029       }
3030     }
3031     emitUnrolledTailLoop(constant_length % chars_per_block);
3032   } else {
3033     Register length_reg = WRegisterFrom(length);
3034     __ Subs(length_tmp, length_reg, chars_per_block);
3035     __ B(&pre_loop2, lt);
3036 
3037     emitHeadLoop();
3038 
3039     __ Bind(&pre_loop2);
3040     __ Adds(length_tmp, length_tmp, chars_per_block);
3041     __ B(&done, eq);
3042 
3043     emitTailLoop();
3044   }
3045 
3046   __ Bind(&done);
3047   __ Bind(slow_path->GetExitLabel());
3048 }
3049 
3050 // We choose to use the native implementation for longer copy lengths.
3051 static constexpr int32_t kSystemArrayCopyThreshold = 128;
3052 
3053 void IntrinsicLocationsBuilderARM64::VisitSystemArrayCopy(HInvoke* invoke) {
3054   // The only read barrier implementation supporting the
3055   // SystemArrayCopy intrinsic is the Baker-style read barriers.
3056   if (codegen_->EmitNonBakerReadBarrier()) {
3057     return;
3058   }
3059 
3060   constexpr size_t kInitialNumTemps = 2u;  // We need at least two temps.
3061   LocationSummary* locations = CodeGenerator::CreateSystemArrayCopyLocationSummary(
3062       invoke, kSystemArrayCopyThreshold, kInitialNumTemps);
3063   if (locations != nullptr) {
3064     locations->SetInAt(1, LocationForSystemArrayCopyInput(invoke->InputAt(1)));
3065     locations->SetInAt(3, LocationForSystemArrayCopyInput(invoke->InputAt(3)));
3066     locations->SetInAt(4, LocationForSystemArrayCopyInput(invoke->InputAt(4)));
3067     if (codegen_->EmitBakerReadBarrier()) {
3068       // Temporary register IP0, obtained from the VIXL scratch register
3069       // pool, cannot be used in ReadBarrierSystemArrayCopySlowPathARM64
3070       // (because that register is clobbered by ReadBarrierMarkRegX
3071       // entry points). It cannot be used in calls to
3072       // CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier
3073       // either. For these reasons, get a third extra temporary register
3074       // from the register allocator.
3075       locations->AddTemp(Location::RequiresRegister());
3076     } else {
3077       // Cases other than Baker read barriers: the third temporary will
3078       // be acquired from the VIXL scratch register pool.
3079     }
3080   }
3081 }
3082 
3083 void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) {
3084   // The only read barrier implementation supporting the
3085   // SystemArrayCopy intrinsic is the Baker-style read barriers.
3086   DCHECK_IMPLIES(codegen_->EmitReadBarrier(), kUseBakerReadBarrier);
3087 
3088   MacroAssembler* masm = GetVIXLAssembler();
3089   LocationSummary* locations = invoke->GetLocations();
3090 
3091   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
3092   uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
3093   uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
3094   uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
3095   uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
3096 
3097   Register src = XRegisterFrom(locations->InAt(0));
3098   Location src_pos = locations->InAt(1);
3099   Register dest = XRegisterFrom(locations->InAt(2));
3100   Location dest_pos = locations->InAt(3);
3101   Location length = locations->InAt(4);
3102   Register temp1 = WRegisterFrom(locations->GetTemp(0));
3103   Location temp1_loc = LocationFrom(temp1);
3104   Register temp2 = WRegisterFrom(locations->GetTemp(1));
3105   Location temp2_loc = LocationFrom(temp2);
3106 
3107   SlowPathCodeARM64* intrinsic_slow_path =
3108       new (codegen_->GetScopedAllocator()) IntrinsicSlowPathARM64(invoke);
3109   codegen_->AddSlowPath(intrinsic_slow_path);
3110 
3111   vixl::aarch64::Label conditions_on_positions_validated;
3112   SystemArrayCopyOptimizations optimizations(invoke);
3113 
3114   // If source and destination are the same array and dest_pos > src_pos, the overlapping region
3115   // must be copied backwards, so we go to the slow path. No check is needed when the positions are the same.
3116   if (!optimizations.GetSourcePositionIsDestinationPosition()) {
3117     if (src_pos.IsConstant()) {
3118       int32_t src_pos_constant = src_pos.GetConstant()->AsIntConstant()->GetValue();
3119       if (dest_pos.IsConstant()) {
3120         int32_t dest_pos_constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
3121         if (optimizations.GetDestinationIsSource()) {
3122           // Checked when building locations.
3123           DCHECK_GE(src_pos_constant, dest_pos_constant);
3124         } else if (src_pos_constant < dest_pos_constant) {
3125           __ Cmp(src, dest);
3126           __ B(intrinsic_slow_path->GetEntryLabel(), eq);
3127         }
3128       } else {
3129         if (!optimizations.GetDestinationIsSource()) {
3130           __ Cmp(src, dest);
3131           __ B(&conditions_on_positions_validated, ne);
3132         }
3133         __ Cmp(WRegisterFrom(dest_pos), src_pos_constant);
3134         __ B(intrinsic_slow_path->GetEntryLabel(), gt);
3135       }
3136     } else {
3137       if (!optimizations.GetDestinationIsSource()) {
3138         __ Cmp(src, dest);
3139         __ B(&conditions_on_positions_validated, ne);
3140       }
3141       __ Cmp(RegisterFrom(src_pos, invoke->InputAt(1)->GetType()),
3142              OperandFrom(dest_pos, invoke->InputAt(3)->GetType()));
3143       __ B(intrinsic_slow_path->GetEntryLabel(), lt);
3144     }
3145   }
3146 
3147   __ Bind(&conditions_on_positions_validated);
3148 
3149   if (!optimizations.GetSourceIsNotNull()) {
3150     // Bail out if the source is null.
3151     __ Cbz(src, intrinsic_slow_path->GetEntryLabel());
3152   }
3153 
3154   if (!optimizations.GetDestinationIsNotNull() && !optimizations.GetDestinationIsSource()) {
3155     // Bail out if the destination is null.
3156     __ Cbz(dest, intrinsic_slow_path->GetEntryLabel());
3157   }
3158 
3159   // We have already checked in the LocationsBuilder for the constant case.
3160   if (!length.IsConstant()) {
3161     // Merge the following two comparisons into one:
3162     //   If the length is negative, bail out (delegate to libcore's native implementation).
3163     //   If the length >= kSystemArrayCopyThreshold (128) then (currently) prefer libcore's native implementation.
3164     __ Cmp(WRegisterFrom(length), kSystemArrayCopyThreshold);
3165     __ B(intrinsic_slow_path->GetEntryLabel(), hs);
3166   }
3167   // Validity checks: source.
3168   CheckSystemArrayCopyPosition(masm,
3169                                src,
3170                                src_pos,
3171                                length,
3172                                intrinsic_slow_path,
3173                                temp1,
3174                                optimizations.GetCountIsSourceLength(),
3175                                /*position_sign_checked=*/ false);
3176 
3177   // Validity checks: dest.
3178   bool dest_position_sign_checked = optimizations.GetSourcePositionIsDestinationPosition();
3179   CheckSystemArrayCopyPosition(masm,
3180                                dest,
3181                                dest_pos,
3182                                length,
3183                                intrinsic_slow_path,
3184                                temp1,
3185                                optimizations.GetCountIsDestinationLength(),
3186                                dest_position_sign_checked);
3187 
3188   auto check_non_primitive_array_class = [&](Register klass, Register temp) {
3189     // No read barrier is needed for reading a chain of constant references for comparing
3190     // with null, or for reading a constant primitive value, see `ReadBarrierOption`.
3191     // /* HeapReference<Class> */ temp = klass->component_type_
3192     __ Ldr(temp, HeapOperand(klass, component_offset));
3193     codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp);
3194     // Check that the component type is not null.
3195     __ Cbz(temp, intrinsic_slow_path->GetEntryLabel());
3196     // Check that the component type is not a primitive.
3197     // /* uint16_t */ temp = static_cast<uint16>(klass->primitive_type_);
3198     __ Ldrh(temp, HeapOperand(temp, primitive_offset));
3199     static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
3200     __ Cbnz(temp, intrinsic_slow_path->GetEntryLabel());
3201   };
3202 
3203   if (!optimizations.GetDoesNotNeedTypeCheck()) {
3204     // Check whether all elements of the source array are assignable to the component
3205     // type of the destination array. We do two checks: the classes are the same,
3206     // or the destination is Object[]. If none of these checks succeed, we go to the
3207     // slow path.
3208 
3209     if (codegen_->EmitBakerReadBarrier()) {
3210       Location temp3_loc = locations->GetTemp(2);
3211       // /* HeapReference<Class> */ temp1 = dest->klass_
3212       codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke,
3213                                                       temp1_loc,
3214                                                       dest.W(),
3215                                                       class_offset,
3216                                                       temp3_loc,
3217                                                       /* needs_null_check= */ false,
3218                                                       /* use_load_acquire= */ false);
3219       // Register `temp1` is not trashed by the read barrier emitted
3220       // by GenerateFieldLoadWithBakerReadBarrier below, as that
3221       // method produces a call to a ReadBarrierMarkRegX entry point,
3222       // which saves all potentially live registers, including
3223       // temporaries such as `temp1`.
3224       // /* HeapReference<Class> */ temp2 = src->klass_
3225       codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke,
3226                                                       temp2_loc,
3227                                                       src.W(),
3228                                                       class_offset,
3229                                                       temp3_loc,
3230                                                       /* needs_null_check= */ false,
3231                                                       /* use_load_acquire= */ false);
3232     } else {
3233       // /* HeapReference<Class> */ temp1 = dest->klass_
3234       __ Ldr(temp1, MemOperand(dest, class_offset));
3235       codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1);
3236       // /* HeapReference<Class> */ temp2 = src->klass_
3237       __ Ldr(temp2, MemOperand(src, class_offset));
3238       codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp2);
3239     }
3240 
3241     __ Cmp(temp1, temp2);
3242     if (optimizations.GetDestinationIsTypedObjectArray()) {
3243       DCHECK(optimizations.GetDestinationIsNonPrimitiveArray());
3244       vixl::aarch64::Label do_copy;
3245       // For class match, we can skip the source type check regardless of the optimization flag.
3246       __ B(&do_copy, eq);
3247       // No read barrier is needed for reading a chain of constant references
3248       // for comparing with null, see `ReadBarrierOption`.
3249       // /* HeapReference<Class> */ temp1 = temp1->component_type_
3250       __ Ldr(temp1, HeapOperand(temp1, component_offset));
3251       codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1);
3252       // /* HeapReference<Class> */ temp1 = temp1->super_class_
3253       __ Ldr(temp1, HeapOperand(temp1, super_offset));
3254       // No need to unpoison the result, we're comparing against null.
3255       __ Cbnz(temp1, intrinsic_slow_path->GetEntryLabel());
3256       // Bail out if the source is not a non primitive array.
3257       if (!optimizations.GetSourceIsNonPrimitiveArray()) {
3258         check_non_primitive_array_class(temp2, temp2);
3259       }
3260       __ Bind(&do_copy);
3261     } else {
3262       DCHECK(!optimizations.GetDestinationIsTypedObjectArray());
3263       // For class match, we can skip the array type check completely if at least one of source
3264       // and destination is known to be a non primitive array, otherwise one check is enough.
3265       __ B(intrinsic_slow_path->GetEntryLabel(), ne);
3266       if (!optimizations.GetDestinationIsNonPrimitiveArray() &&
3267           !optimizations.GetSourceIsNonPrimitiveArray()) {
3268         check_non_primitive_array_class(temp2, temp2);
3269       }
3270     }
3271   } else if (!optimizations.GetSourceIsNonPrimitiveArray()) {
3272     DCHECK(optimizations.GetDestinationIsNonPrimitiveArray());
3273     // Bail out if the source is not a non primitive array.
3274     // No read barrier is needed for reading a chain of constant references for comparing
3275     // with null, or for reading a constant primitive value, see `ReadBarrierOption`.
3276     // /* HeapReference<Class> */ temp2 = src->klass_
3277     __ Ldr(temp2, MemOperand(src, class_offset));
3278     codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp2);
3279     check_non_primitive_array_class(temp2, temp2);
3280   }
3281 
3282   if (length.IsConstant() && length.GetConstant()->AsIntConstant()->GetValue() == 0) {
3283     // Zero constant length: no need to emit the loop code at all.
3284   } else {
3285     vixl::aarch64::Label skip_copy_and_write_barrier;
3286     if (length.IsRegister()) {
3287       // Don't enter the copy loop if the length is zero.
3288       __ Cbz(WRegisterFrom(length), &skip_copy_and_write_barrier);
3289     }
3290 
3291     {
3292       // We use a block to end the scratch scope before the write barrier, thus
3293       // freeing the temporary registers so they can be used in `MarkGCCard`.
3294       UseScratchRegisterScope temps(masm);
3295       bool emit_rb = codegen_->EmitBakerReadBarrier();
3296       Register temp3;
3297       Register tmp;
3298       if (emit_rb) {
3299         temp3 = WRegisterFrom(locations->GetTemp(2));
3300         // Make sure `tmp` is not IP0, as it is clobbered by ReadBarrierMarkRegX entry points
3301         // in ReadBarrierSystemArrayCopySlowPathARM64. Explicitly allocate the register IP1.
3302         DCHECK(temps.IsAvailable(ip1));
3303         temps.Exclude(ip1);
3304         tmp = ip1.W();
3305       } else {
3306         temp3 = temps.AcquireW();
3307         tmp = temps.AcquireW();
3308       }
3309 
3310       Register src_curr_addr = temp1.X();
3311       Register dst_curr_addr = temp2.X();
3312       Register src_stop_addr = temp3.X();
3313       const DataType::Type type = DataType::Type::kReference;
3314       const int32_t element_size = DataType::Size(type);
3315 
3316       SlowPathCodeARM64* read_barrier_slow_path = nullptr;
3317       if (emit_rb) {
3318         // TODO: Also convert this intrinsic to the IsGcMarking strategy?
3319 
3320         // SystemArrayCopy implementation for Baker read barriers (see
3321         // also CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier):
3322         //
3323         //   uint32_t rb_state = Lockword(src->monitor_).ReadBarrierState();
3324         //   lfence;  // Load fence or artificial data dependency to prevent load-load reordering
3325         //   bool is_gray = (rb_state == ReadBarrier::GrayState());
3326         //   if (is_gray) {
3327         //     // Slow-path copy.
3328         //     do {
3329         //       *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++)));
3330         //     } while (src_ptr != end_ptr)
3331         //   } else {
3332         //     // Fast-path copy.
3333         //     do {
3334         //       *dest_ptr++ = *src_ptr++;
3335         //     } while (src_ptr != end_ptr)
3336         //   }
3337 
3338         // /* int32_t */ monitor = src->monitor_
3339         __ Ldr(tmp, HeapOperand(src.W(), monitor_offset));
3340         // /* LockWord */ lock_word = LockWord(monitor)
3341         static_assert(sizeof(LockWord) == sizeof(int32_t),
3342                       "art::LockWord and int32_t have different sizes.");
3343 
3344         // Introduce a dependency on the lock_word including rb_state,
3345         // to prevent load-load reordering, and without using
3346         // a memory barrier (which would be more expensive).
3347         // `src` is unchanged by this operation, but its value now depends
3348         // on `tmp`.
3349         __ Add(src.X(), src.X(), Operand(tmp.X(), LSR, 32));
3350 
3351         // Slow path used to copy array when `src` is gray.
3352         read_barrier_slow_path =
3353             new (codegen_->GetScopedAllocator()) ReadBarrierSystemArrayCopySlowPathARM64(
3354                 invoke, LocationFrom(tmp));
3355         codegen_->AddSlowPath(read_barrier_slow_path);
3356       }
3357 
3358       // Compute base source address, base destination address, and end
3359       // source address for System.arraycopy* intrinsics in `src_base`,
3360       // `dst_base` and `src_end` respectively.
3361       // Note that `src_curr_addr` is computed from `src` (and
3362       // `src_pos`) here, and thus honors the artificial dependency
3363       // of `src` on `tmp`.
3364       GenSystemArrayCopyAddresses(masm,
3365                                   type,
3366                                   src,
3367                                   src_pos,
3368                                   dest,
3369                                   dest_pos,
3370                                   length,
3371                                   src_curr_addr,
3372                                   dst_curr_addr,
3373                                   src_stop_addr);
3374 
3375       if (emit_rb) {
3376         // Given the numeric representation, it's enough to check the low bit of the rb_state.
3377         static_assert(ReadBarrier::NonGrayState() == 0, "Expecting non-gray to have value 0");
3378         static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
3379         __ Tbnz(tmp, LockWord::kReadBarrierStateShift, read_barrier_slow_path->GetEntryLabel());
3380       }
3381 
3382       // Iterate over the arrays and do a raw copy of the objects. We don't need to
3383       // poison/unpoison.
3384       vixl::aarch64::Label loop;
3385       __ Bind(&loop);
3386       __ Ldr(tmp, MemOperand(src_curr_addr, element_size, PostIndex));
3387       __ Str(tmp, MemOperand(dst_curr_addr, element_size, PostIndex));
3388       __ Cmp(src_curr_addr, src_stop_addr);
3389       __ B(&loop, ne);
3390 
3391       if (emit_rb) {
3392         DCHECK(read_barrier_slow_path != nullptr);
3393         __ Bind(read_barrier_slow_path->GetExitLabel());
3394       }
3395     }
3396 
3397     // We only need one card marking on the destination array.
3398     codegen_->MarkGCCard(dest.W());
3399 
3400     __ Bind(&skip_copy_and_write_barrier);
3401   }
3402 
3403   __ Bind(intrinsic_slow_path->GetExitLabel());
3404 }
3405 
GenIsInfinite(LocationSummary * locations,bool is64bit,MacroAssembler * masm)3406 static void GenIsInfinite(LocationSummary* locations,
3407                           bool is64bit,
3408                           MacroAssembler* masm) {
3409   Operand infinity(0);
3410   Operand tst_mask(0);
3411   Register out;
3412 
3413   if (is64bit) {
3414     infinity = Operand(kPositiveInfinityDouble);
3415     tst_mask = MaskLeastSignificant<uint64_t>(63);
3416     out = XRegisterFrom(locations->Out());
3417   } else {
3418     infinity = Operand(kPositiveInfinityFloat);
3419     tst_mask = MaskLeastSignificant<uint32_t>(31);
3420     out = WRegisterFrom(locations->Out());
3421   }
3422 
3423   MoveFPToInt(locations, is64bit, masm);
3424   // Checks whether exponent bits are all 1 and fraction bits are all 0.
3425   __ Eor(out, out, infinity);
3426   // TST bitmask is used to mask out the sign bit: either 0x7fffffff or 0x7fffffffffffffff
3427   // depending on is64bit.
3428   __ Tst(out, tst_mask);
3429   __ Cset(out, eq);
3430 }
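
// The Eor/Tst/Cset sequence above is the standard bit-level infinity test. A rough scalar
// equivalent (illustrative sketch only; assumes IEEE-754 binary32, <cstring>, and the same
// 0x7f800000 pattern as kPositiveInfinityFloat):
//
//   static bool IsInfiniteFloat(float f) {
//     uint32_t bits;
//     std::memcpy(&bits, &f, sizeof(bits));                // MoveFPToInt
//     // XOR with +Inf zeroes the exponent/fraction only for the two infinities; masking out
//     // the sign bit then leaves zero exactly when f is +Inf or -Inf.
//     return ((bits ^ 0x7f800000u) & 0x7fffffffu) == 0u;   // Eor + Tst + Cset(eq)
//   }
//
// The double case is identical with kPositiveInfinityDouble and the 63-bit mask.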
3431 
VisitFloatIsInfinite(HInvoke * invoke)3432 void IntrinsicLocationsBuilderARM64::VisitFloatIsInfinite(HInvoke* invoke) {
3433   CreateFPToIntLocations(allocator_, invoke);
3434 }
3435 
VisitFloatIsInfinite(HInvoke * invoke)3436 void IntrinsicCodeGeneratorARM64::VisitFloatIsInfinite(HInvoke* invoke) {
3437   GenIsInfinite(invoke->GetLocations(), /* is64bit= */ false, GetVIXLAssembler());
3438 }
3439 
VisitDoubleIsInfinite(HInvoke * invoke)3440 void IntrinsicLocationsBuilderARM64::VisitDoubleIsInfinite(HInvoke* invoke) {
3441   CreateFPToIntLocations(allocator_, invoke);
3442 }
3443 
VisitDoubleIsInfinite(HInvoke * invoke)3444 void IntrinsicCodeGeneratorARM64::VisitDoubleIsInfinite(HInvoke* invoke) {
3445   GenIsInfinite(invoke->GetLocations(), /* is64bit= */ true, GetVIXLAssembler());
3446 }
3447 
3448 #define VISIT_INTRINSIC(name, low, high, type, start_index)                              \
3449   void IntrinsicLocationsBuilderARM64::Visit##name##ValueOf(HInvoke* invoke) {           \
3450     InvokeRuntimeCallingConvention calling_convention;                                   \
3451     IntrinsicVisitor::ComputeValueOfLocations(                                           \
3452         invoke,                                                                          \
3453         codegen_,                                                                        \
3454         low,                                                                             \
3455         (high) - (low) + 1,                                                              \
3456         calling_convention.GetReturnLocation(DataType::Type::kReference),                \
3457         Location::RegisterLocation(calling_convention.GetRegisterAt(0).GetCode()));      \
3458   }                                                                                      \
3459   void IntrinsicCodeGeneratorARM64::Visit##name##ValueOf(HInvoke* invoke) {              \
3460     IntrinsicVisitor::ValueOfInfo info =                                                 \
3461         IntrinsicVisitor::ComputeValueOfInfo(invoke,                                     \
3462                                              codegen_->GetCompilerOptions(),             \
3463                                              WellKnownClasses::java_lang_##name##_value, \
3464                                              low,                                        \
3465                                              (high) - (low) + 1,                         \
3466                                              start_index);                               \
3467     HandleValueOf(invoke, info, type);                                                   \
3468   }
BOXED_TYPES(VISIT_INTRINSIC)3469   BOXED_TYPES(VISIT_INTRINSIC)
3470 #undef VISIT_INTRINSIC
3471 
3472 void IntrinsicCodeGeneratorARM64::HandleValueOf(HInvoke* invoke,
3473                                                 const IntrinsicVisitor::ValueOfInfo& info,
3474                                                 DataType::Type type) {
3475   LocationSummary* locations = invoke->GetLocations();
3476   MacroAssembler* masm = GetVIXLAssembler();
3477 
3478   Register out = RegisterFrom(locations->Out(), DataType::Type::kReference);
3479   UseScratchRegisterScope temps(masm);
3480   Register temp = temps.AcquireW();
3481   auto allocate_instance = [&]() {
3482     DCHECK(out.X().Is(InvokeRuntimeCallingConvention().GetRegisterAt(0)));
3483     codegen_->LoadIntrinsicDeclaringClass(out, invoke);
3484     codegen_->InvokeRuntime(kQuickAllocObjectInitialized, invoke, invoke->GetDexPc());
3485     CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>();
3486   };
3487   if (invoke->InputAt(0)->IsIntConstant()) {
3488     int32_t value = invoke->InputAt(0)->AsIntConstant()->GetValue();
3489     if (static_cast<uint32_t>(value - info.low) < info.length) {
3490       // Just embed the object in the code.
3491       DCHECK_NE(info.value_boot_image_reference, ValueOfInfo::kInvalidReference);
3492       codegen_->LoadBootImageAddress(out, info.value_boot_image_reference);
3493     } else {
3494       DCHECK(locations->CanCall());
3495       // Allocate and initialize a new object.
3496       // TODO: If we JIT, we could allocate the object now, and store it in the
3497       // JIT object table.
3498       allocate_instance();
3499       __ Mov(temp.W(), value);
3500       codegen_->Store(type, temp.W(), HeapOperand(out.W(), info.value_offset));
3501       // Class pointer and `value` final field stores require a barrier before publication.
3502       codegen_->GenerateMemoryBarrier(MemBarrierKind::kStoreStore);
3503     }
3504   } else {
3505     DCHECK(locations->CanCall());
3506     Register in = RegisterFrom(locations->InAt(0), DataType::Type::kInt32);
3507     // Check bounds of our cache.
3508     __ Add(out.W(), in.W(), -info.low);
3509     __ Cmp(out.W(), info.length);
3510     vixl::aarch64::Label allocate, done;
3511     __ B(&allocate, hs);
3512     // If the value is within the bounds, load the object directly from the array.
3513     codegen_->LoadBootImageAddress(temp, info.array_data_boot_image_reference);
3514     MemOperand source = HeapOperand(
3515         temp, out.X(), LSL, DataType::SizeShift(DataType::Type::kReference));
3516     codegen_->Load(DataType::Type::kReference, out, source);
3517     codegen_->GetAssembler()->MaybeUnpoisonHeapReference(out);
3518     __ B(&done);
3519     __ Bind(&allocate);
3520     // Otherwise allocate and initialize a new object.
3521     allocate_instance();
3522     codegen_->Store(type, in.W(), HeapOperand(out.W(), info.value_offset));
3523     // Class pointer and `value` final field stores require a barrier before publication.
3524     codegen_->GenerateMemoryBarrier(MemBarrierKind::kStoreStore);
3525     __ Bind(&done);
3526   }
3527 }
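
// Illustrative sketch of the fast path generated above. The names `low`, `length`,
// `boot_image_cache`, `BoxedInteger` and `AllocateBoxed` are placeholders for this sketch,
// not real ART symbols:
//
//   BoxedInteger* ValueOf(int32_t value) {
//     uint32_t index = static_cast<uint32_t>(value - low);  // single unsigned bounds check
//     if (index < length) {
//       return boot_image_cache[index];  // cache hit: load the boxed object from the boot image
//     }
//     BoxedInteger* boxed = AllocateBoxed();  // cache miss: allocate through the runtime
//     boxed->value = value;                   // Store + StoreStore barrier before publication
//     return boxed;
//   }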
3528 
VisitReferenceGetReferent(HInvoke * invoke)3529 void IntrinsicLocationsBuilderARM64::VisitReferenceGetReferent(HInvoke* invoke) {
3530   IntrinsicVisitor::CreateReferenceGetReferentLocations(invoke, codegen_);
3531 
3532   if (codegen_->EmitBakerReadBarrier() && invoke->GetLocations() != nullptr) {
3533     invoke->GetLocations()->AddTemp(Location::RequiresRegister());
3534   }
3535 }
3536 
VisitReferenceGetReferent(HInvoke * invoke)3537 void IntrinsicCodeGeneratorARM64::VisitReferenceGetReferent(HInvoke* invoke) {
3538   MacroAssembler* masm = GetVIXLAssembler();
3539   LocationSummary* locations = invoke->GetLocations();
3540 
3541   Location obj = locations->InAt(0);
3542   Location out = locations->Out();
3543 
3544   SlowPathCodeARM64* slow_path = new (GetAllocator()) IntrinsicSlowPathARM64(invoke);
3545   codegen_->AddSlowPath(slow_path);
3546 
3547   if (codegen_->EmitReadBarrier()) {
3548     // Check self->GetWeakRefAccessEnabled().
3549     UseScratchRegisterScope temps(masm);
3550     Register temp = temps.AcquireW();
3551     __ Ldr(temp,
3552            MemOperand(tr, Thread::WeakRefAccessEnabledOffset<kArm64PointerSize>().Uint32Value()));
3553     static_assert(enum_cast<int32_t>(WeakRefAccessState::kVisiblyEnabled) == 0);
3554     __ Cbnz(temp, slow_path->GetEntryLabel());
3555   }
3556 
3557   {
3558     // Load the java.lang.ref.Reference class.
3559     UseScratchRegisterScope temps(masm);
3560     Register temp = temps.AcquireW();
3561     codegen_->LoadIntrinsicDeclaringClass(temp, invoke);
3562 
3563     // Check static fields java.lang.ref.Reference.{disableIntrinsic,slowPathEnabled} together.
3564     MemberOffset disable_intrinsic_offset = IntrinsicVisitor::GetReferenceDisableIntrinsicOffset();
3565     DCHECK_ALIGNED(disable_intrinsic_offset.Uint32Value(), 2u);
3566     DCHECK_EQ(disable_intrinsic_offset.Uint32Value() + 1u,
3567               IntrinsicVisitor::GetReferenceSlowPathEnabledOffset().Uint32Value());
3568     __ Ldrh(temp, HeapOperand(temp, disable_intrinsic_offset.Uint32Value()));
3569     __ Cbnz(temp, slow_path->GetEntryLabel());
3570   }
3571 
3572   // Load the value from the field.
3573   uint32_t referent_offset = mirror::Reference::ReferentOffset().Uint32Value();
3574   if (codegen_->EmitBakerReadBarrier()) {
3575     codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke,
3576                                                     out,
3577                                                     WRegisterFrom(obj),
3578                                                     referent_offset,
3579                                                     /*maybe_temp=*/ locations->GetTemp(0),
3580                                                     /*needs_null_check=*/ true,
3581                                                     /*use_load_acquire=*/ true);
3582   } else {
3583     MemOperand field = HeapOperand(WRegisterFrom(obj), referent_offset);
3584     codegen_->LoadAcquire(
3585         invoke, DataType::Type::kReference, WRegisterFrom(out), field, /*needs_null_check=*/ true);
3586     codegen_->MaybeGenerateReadBarrierSlow(invoke, out, out, obj, referent_offset);
3587   }
3588   __ Bind(slow_path->GetExitLabel());
3589 }
3590 
VisitReferenceRefersTo(HInvoke * invoke)3591 void IntrinsicLocationsBuilderARM64::VisitReferenceRefersTo(HInvoke* invoke) {
3592   IntrinsicVisitor::CreateReferenceRefersToLocations(invoke, codegen_);
3593 }
3594 
VisitReferenceRefersTo(HInvoke * invoke)3595 void IntrinsicCodeGeneratorARM64::VisitReferenceRefersTo(HInvoke* invoke) {
3596   LocationSummary* locations = invoke->GetLocations();
3597   MacroAssembler* masm = codegen_->GetVIXLAssembler();
3598   UseScratchRegisterScope temps(masm);
3599 
3600   Register obj = WRegisterFrom(locations->InAt(0));
3601   Register other = WRegisterFrom(locations->InAt(1));
3602   Register out = WRegisterFrom(locations->Out());
3603   Register tmp = temps.AcquireW();
3604 
3605   uint32_t referent_offset = mirror::Reference::ReferentOffset().Uint32Value();
3606   uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
3607 
3608   MemOperand field = HeapOperand(obj, referent_offset);
3609   codegen_->LoadAcquire(invoke, DataType::Type::kReference, tmp, field, /*needs_null_check=*/ true);
3610   codegen_->GetAssembler()->MaybeUnpoisonHeapReference(tmp);
3611 
3612   __ Cmp(tmp, other);
3613 
3614   if (codegen_->EmitReadBarrier()) {
3615     DCHECK(kUseBakerReadBarrier);
3616 
3617     vixl::aarch64::Label calculate_result;
3618 
3619     // If the GC is not marking, the comparison result is final.
3620     __ Cbz(mr, &calculate_result);
3621 
3622     __ B(&calculate_result, eq);  // ZF set if taken.
3623 
3624     // Check if the loaded reference is null.
3625     __ Cbz(tmp, &calculate_result);  // ZF clear if taken.
3626 
3627     // For correct memory visibility, we need a barrier before loading the lock word.
3628     codegen_->GenerateMemoryBarrier(MemBarrierKind::kLoadAny);
3629 
3630     // Load the lockword and check if it is a forwarding address.
3631     static_assert(LockWord::kStateShift == 30u);
3632     static_assert(LockWord::kStateForwardingAddress == 3u);
3633     __ Ldr(tmp, HeapOperand(tmp, monitor_offset));
3634     __ Cmp(tmp, Operand(0xc0000000));
3635     __ B(&calculate_result, lo);   // ZF clear if taken.
3636 
3637     // Extract the forwarding address and compare with `other`.
3638     __ Cmp(other, Operand(tmp, LSL, LockWord::kForwardingAddressShift));
3639 
3640     __ Bind(&calculate_result);
3641   }
3642 
3643   // Convert ZF into the Boolean result.
3644   __ Cset(out, eq);
3645 }
3646 
VisitThreadInterrupted(HInvoke * invoke)3647 void IntrinsicLocationsBuilderARM64::VisitThreadInterrupted(HInvoke* invoke) {
3648   LocationSummary* locations =
3649       new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
3650   locations->SetOut(Location::RequiresRegister());
3651 }
3652 
VisitThreadInterrupted(HInvoke * invoke)3653 void IntrinsicCodeGeneratorARM64::VisitThreadInterrupted(HInvoke* invoke) {
3654   MacroAssembler* masm = GetVIXLAssembler();
3655   Register out = RegisterFrom(invoke->GetLocations()->Out(), DataType::Type::kInt32);
3656   UseScratchRegisterScope temps(masm);
3657   Register temp = temps.AcquireX();
3658 
3659   __ Add(temp, tr, Thread::InterruptedOffset<kArm64PointerSize>().Int32Value());
3660   __ Ldar(out.W(), MemOperand(temp));
3661 
3662   vixl::aarch64::Label done;
3663   __ Cbz(out.W(), &done);
3664   __ Stlr(wzr, MemOperand(temp));
3665   __ Bind(&done);
3666 }
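
// Roughly equivalent C++ (sketch; `interrupted` stands in for the per-thread flag located at
// Thread::InterruptedOffset()):
//
//   std::atomic<int32_t> interrupted;
//
//   int32_t Interrupted() {
//     int32_t value = interrupted.load(std::memory_order_acquire);  // Ldar
//     if (value != 0) {
//       interrupted.store(0, std::memory_order_release);            // Stlr(wzr, ...)
//     }
//     return value;  // The flag is cleared only when it was observed to be set.
//   }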
3667 
VisitReachabilityFence(HInvoke * invoke)3668 void IntrinsicLocationsBuilderARM64::VisitReachabilityFence(HInvoke* invoke) {
3669   LocationSummary* locations =
3670       new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
3671   locations->SetInAt(0, Location::Any());
3672 }
3673 
VisitReachabilityFence(HInvoke * invoke)3674 void IntrinsicCodeGeneratorARM64::VisitReachabilityFence([[maybe_unused]] HInvoke* invoke) {}
3675 
VisitCRC32Update(HInvoke * invoke)3676 void IntrinsicLocationsBuilderARM64::VisitCRC32Update(HInvoke* invoke) {
3677   if (!codegen_->GetInstructionSetFeatures().HasCRC()) {
3678     return;
3679   }
3680 
3681   LocationSummary* locations = new (allocator_) LocationSummary(invoke,
3682                                                                 LocationSummary::kNoCall,
3683                                                                 kIntrinsified);
3684 
3685   locations->SetInAt(0, Location::RequiresRegister());
3686   locations->SetInAt(1, Location::RequiresRegister());
3687   locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
3688 }
3689 
3690 // Lower the invoke of CRC32.update(int crc, int b).
VisitCRC32Update(HInvoke * invoke)3691 void IntrinsicCodeGeneratorARM64::VisitCRC32Update(HInvoke* invoke) {
3692   DCHECK(codegen_->GetInstructionSetFeatures().HasCRC());
3693 
3694   MacroAssembler* masm = GetVIXLAssembler();
3695 
3696   Register crc = InputRegisterAt(invoke, 0);
3697   Register val = InputRegisterAt(invoke, 1);
3698   Register out = OutputRegister(invoke);
3699 
3700   // The general algorithm of the CRC32 calculation is:
3701   //   crc = ~crc
3702   //   result = crc32_for_byte(crc, b)
3703   //   crc = ~result
3704   // It is directly lowered to three instructions.
3705 
3706   UseScratchRegisterScope temps(masm);
3707   Register tmp = temps.AcquireSameSizeAs(out);
3708 
3709   __ Mvn(tmp, crc);
3710   __ Crc32b(tmp, tmp, val);
3711   __ Mvn(out, tmp);
3712 }
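
// With the ACLE intrinsics (sketch; assumes <arm_acle.h> and __ARM_FEATURE_CRC32) the same
// three-instruction sequence can be written as:
//
//   static uint32_t Crc32UpdateByte(uint32_t crc, uint8_t b) {
//     return ~__crc32b(~crc, b);  // Mvn + Crc32b + Mvn
//   }
//
// java.util.zip.CRC32 keeps its running state un-inverted, which is why the value is
// complemented before and after the CRC32B step.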
3713 
3714 // Generate code using CRC32 instructions which calculates
3715 // a CRC32 value of a byte.
3716 //
3717 // Parameters:
3718 //   masm   - VIXL macro assembler
3719 //   crc    - a register holding an initial CRC value
3720 //   ptr    - a register holding a memory address of bytes
3721 //   length - a register holding a number of bytes to process
3722 //   out    - a register to put a result of calculation
GenerateCodeForCalculationCRC32ValueOfBytes(MacroAssembler * masm,const Register & crc,const Register & ptr,const Register & length,const Register & out)3723 static void GenerateCodeForCalculationCRC32ValueOfBytes(MacroAssembler* masm,
3724                                                         const Register& crc,
3725                                                         const Register& ptr,
3726                                                         const Register& length,
3727                                                         const Register& out) {
3728   // The algorithm of CRC32 of bytes is:
3729   //   crc = ~crc
3730   //   process the first few bytes to make the array 8-byte aligned
3731   //   while array has 8 bytes do:
3732   //     crc = crc32_of_8bytes(crc, 8_bytes(array))
3733   //   if array has 4 bytes:
3734   //     crc = crc32_of_4bytes(crc, 4_bytes(array))
3735   //   if array has 2 bytes:
3736   //     crc = crc32_of_2bytes(crc, 2_bytes(array))
3737   //   if array has a byte:
3738   //     crc = crc32_of_byte(crc, 1_byte(array))
3739   //   crc = ~crc
3740 
3741   vixl::aarch64::Label loop, done;
3742   vixl::aarch64::Label process_4bytes, process_2bytes, process_1byte;
3743   vixl::aarch64::Label aligned2, aligned4, aligned8;
3744 
3745   // Use VIXL scratch registers as the VIXL macro assembler won't use them in
3746   // instructions below.
3747   UseScratchRegisterScope temps(masm);
3748   Register len = temps.AcquireW();
3749   Register array_elem = temps.AcquireW();
3750 
3751   __ Mvn(out, crc);
3752   __ Mov(len, length);
3753 
3754   __ Tbz(ptr, 0, &aligned2);
3755   __ Subs(len, len, 1);
3756   __ B(&done, lo);
3757   __ Ldrb(array_elem, MemOperand(ptr, 1, PostIndex));
3758   __ Crc32b(out, out, array_elem);
3759 
3760   __ Bind(&aligned2);
3761   __ Tbz(ptr, 1, &aligned4);
3762   __ Subs(len, len, 2);
3763   __ B(&process_1byte, lo);
3764   __ Ldrh(array_elem, MemOperand(ptr, 2, PostIndex));
3765   __ Crc32h(out, out, array_elem);
3766 
3767   __ Bind(&aligned4);
3768   __ Tbz(ptr, 2, &aligned8);
3769   __ Subs(len, len, 4);
3770   __ B(&process_2bytes, lo);
3771   __ Ldr(array_elem, MemOperand(ptr, 4, PostIndex));
3772   __ Crc32w(out, out, array_elem);
3773 
3774   __ Bind(&aligned8);
3775   __ Subs(len, len, 8);
3776   // If len < 8 go to process data by 4 bytes, 2 bytes and a byte.
3777   __ B(&process_4bytes, lo);
3778 
3779   // The main loop processing data by 8 bytes.
3780   __ Bind(&loop);
3781   __ Ldr(array_elem.X(), MemOperand(ptr, 8, PostIndex));
3782   __ Subs(len, len, 8);
3783   __ Crc32x(out, out, array_elem.X());
3784   // if len >= 8, process the next 8 bytes.
3785   __ B(&loop, hs);
3786 
3787   // Process the data which is less than 8 bytes.
3788   // The code generated below works with values of len
3789   // which come in the range [-8, 0).
3790   // The first three bits are used to detect whether 4 bytes or 2 bytes or
3791   // a byte can be processed.
3792   // The checking order is from bit 2 to bit 0:
3793   //  bit 2 is set: at least 4 bytes available
3794   //  bit 1 is set: at least 2 bytes available
3795   //  bit 0 is set: at least a byte available
3796   __ Bind(&process_4bytes);
3797   // Goto process_2bytes if less than four bytes available
3798   __ Tbz(len, 2, &process_2bytes);
3799   __ Ldr(array_elem, MemOperand(ptr, 4, PostIndex));
3800   __ Crc32w(out, out, array_elem);
3801 
3802   __ Bind(&process_2bytes);
3803   // Goto process_1byte if less than two bytes available
3804   __ Tbz(len, 1, &process_1byte);
3805   __ Ldrh(array_elem, MemOperand(ptr, 2, PostIndex));
3806   __ Crc32h(out, out, array_elem);
3807 
3808   __ Bind(&process_1byte);
3809   // Goto done if no bytes available
3810   __ Tbz(len, 0, &done);
3811   __ Ldrb(array_elem, MemOperand(ptr));
3812   __ Crc32b(out, out, array_elem);
3813 
3814   __ Bind(&done);
3815   __ Mvn(out, out);
3816 }
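
// A plain C++ model of the sequence above (illustrative sketch; assumes <arm_acle.h>,
// <cstring>, __ARM_FEATURE_CRC32 and little-endian loads, matching the Ldr/Ldrh/Ldrb above):
//
//   static uint32_t Crc32Bytes(uint32_t crc, const uint8_t* p, size_t len) {
//     crc = ~crc;
//     // Head: peel 1, 2 and 4 bytes until `p` is 8-byte aligned (aligned2/aligned4/aligned8).
//     if ((reinterpret_cast<uintptr_t>(p) & 1u) != 0u && len >= 1u) {
//       crc = __crc32b(crc, *p); p += 1; len -= 1;
//     }
//     if ((reinterpret_cast<uintptr_t>(p) & 2u) != 0u && len >= 2u) {
//       uint16_t v; std::memcpy(&v, p, 2); crc = __crc32h(crc, v); p += 2; len -= 2;
//     }
//     if ((reinterpret_cast<uintptr_t>(p) & 4u) != 0u && len >= 4u) {
//       uint32_t v; std::memcpy(&v, p, 4); crc = __crc32w(crc, v); p += 4; len -= 4;
//     }
//     // Body: the main 8-byte loop.
//     while (len >= 8u) {
//       uint64_t v; std::memcpy(&v, p, 8); crc = __crc32d(crc, v); p += 8; len -= 8;
//     }
//     // Tail: at most 4 + 2 + 1 bytes (process_4bytes/process_2bytes/process_1byte).
//     if ((len & 4u) != 0u) { uint32_t v; std::memcpy(&v, p, 4); crc = __crc32w(crc, v); p += 4; }
//     if ((len & 2u) != 0u) { uint16_t v; std::memcpy(&v, p, 2); crc = __crc32h(crc, v); p += 2; }
//     if ((len & 1u) != 0u) { crc = __crc32b(crc, *p); }
//     return ~crc;
//   }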
3817 
3818 // The threshold for sizes of arrays to use the library provided implementation
3819 // of CRC32.updateBytes instead of the intrinsic.
3820 static constexpr int32_t kCRC32UpdateBytesThreshold = 64 * 1024;
3821 
VisitCRC32UpdateBytes(HInvoke * invoke)3822 void IntrinsicLocationsBuilderARM64::VisitCRC32UpdateBytes(HInvoke* invoke) {
3823   if (!codegen_->GetInstructionSetFeatures().HasCRC()) {
3824     return;
3825   }
3826 
3827   LocationSummary* locations =
3828       new (allocator_) LocationSummary(invoke,
3829                                        LocationSummary::kCallOnSlowPath,
3830                                        kIntrinsified);
3831 
3832   locations->SetInAt(0, Location::RequiresRegister());
3833   locations->SetInAt(1, Location::RequiresRegister());
3834   locations->SetInAt(2, Location::RegisterOrConstant(invoke->InputAt(2)));
3835   locations->SetInAt(3, Location::RequiresRegister());
3836   locations->AddTemp(Location::RequiresRegister());
3837   locations->SetOut(Location::RequiresRegister());
3838 }
3839 
3840 // Lower the invoke of CRC32.updateBytes(int crc, byte[] b, int off, int len)
3841 //
3842 // Note: The intrinsic is not used if len exceeds a threshold.
VisitCRC32UpdateBytes(HInvoke * invoke)3843 void IntrinsicCodeGeneratorARM64::VisitCRC32UpdateBytes(HInvoke* invoke) {
3844   DCHECK(codegen_->GetInstructionSetFeatures().HasCRC());
3845 
3846   MacroAssembler* masm = GetVIXLAssembler();
3847   LocationSummary* locations = invoke->GetLocations();
3848 
3849   SlowPathCodeARM64* slow_path =
3850       new (codegen_->GetScopedAllocator()) IntrinsicSlowPathARM64(invoke);
3851   codegen_->AddSlowPath(slow_path);
3852 
3853   Register length = WRegisterFrom(locations->InAt(3));
3854   __ Cmp(length, kCRC32UpdateBytesThreshold);
3855   __ B(slow_path->GetEntryLabel(), hi);
3856 
3857   const uint32_t array_data_offset =
3858       mirror::Array::DataOffset(Primitive::kPrimByte).Uint32Value();
3859   Register ptr = XRegisterFrom(locations->GetTemp(0));
3860   Register array = XRegisterFrom(locations->InAt(1));
3861   Location offset = locations->InAt(2);
3862   if (offset.IsConstant()) {
3863     int32_t offset_value = offset.GetConstant()->AsIntConstant()->GetValue();
3864     __ Add(ptr, array, array_data_offset + offset_value);
3865   } else {
3866     __ Add(ptr, array, array_data_offset);
3867     __ Add(ptr, ptr, XRegisterFrom(offset));
3868   }
3869 
3870   Register crc = WRegisterFrom(locations->InAt(0));
3871   Register out = WRegisterFrom(locations->Out());
3872 
3873   GenerateCodeForCalculationCRC32ValueOfBytes(masm, crc, ptr, length, out);
3874 
3875   __ Bind(slow_path->GetExitLabel());
3876 }
3877 
VisitCRC32UpdateByteBuffer(HInvoke * invoke)3878 void IntrinsicLocationsBuilderARM64::VisitCRC32UpdateByteBuffer(HInvoke* invoke) {
3879   if (!codegen_->GetInstructionSetFeatures().HasCRC()) {
3880     return;
3881   }
3882 
3883   LocationSummary* locations =
3884       new (allocator_) LocationSummary(invoke,
3885                                        LocationSummary::kNoCall,
3886                                        kIntrinsified);
3887 
3888   locations->SetInAt(0, Location::RequiresRegister());
3889   locations->SetInAt(1, Location::RequiresRegister());
3890   locations->SetInAt(2, Location::RequiresRegister());
3891   locations->SetInAt(3, Location::RequiresRegister());
3892   locations->AddTemp(Location::RequiresRegister());
3893   locations->SetOut(Location::RequiresRegister());
3894 }
3895 
3896 // Lower the invoke of CRC32.updateByteBuffer(int crc, long addr, int off, int len)
3897 //
3898 // There is no need to generate code checking if addr is 0.
3899 // The method updateByteBuffer is a private method of java.util.zip.CRC32.
3900 // This guarantees no calls outside of the CRC32 class.
3901 // An address of DirectBuffer is always passed to the call of updateByteBuffer.
3902 // An empty DirectBuffer implementation may use a zero address, but in that case
3903 // its length must be zero as well. The generated code below handles a zero
3904 // length correctly.
VisitCRC32UpdateByteBuffer(HInvoke * invoke)3905 void IntrinsicCodeGeneratorARM64::VisitCRC32UpdateByteBuffer(HInvoke* invoke) {
3906   DCHECK(codegen_->GetInstructionSetFeatures().HasCRC());
3907 
3908   MacroAssembler* masm = GetVIXLAssembler();
3909   LocationSummary* locations = invoke->GetLocations();
3910 
3911   Register addr = XRegisterFrom(locations->InAt(1));
3912   Register ptr = XRegisterFrom(locations->GetTemp(0));
3913   __ Add(ptr, addr, XRegisterFrom(locations->InAt(2)));
3914 
3915   Register crc = WRegisterFrom(locations->InAt(0));
3916   Register length = WRegisterFrom(locations->InAt(3));
3917   Register out = WRegisterFrom(locations->Out());
3918   GenerateCodeForCalculationCRC32ValueOfBytes(masm, crc, ptr, length, out);
3919 }
3920 
VisitFP16ToFloat(HInvoke * invoke)3921 void IntrinsicLocationsBuilderARM64::VisitFP16ToFloat(HInvoke* invoke) {
3922   if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
3923     return;
3924   }
3925 
3926   LocationSummary* locations = new (allocator_) LocationSummary(invoke,
3927                                                                 LocationSummary::kNoCall,
3928                                                                 kIntrinsified);
3929   locations->SetInAt(0, Location::RequiresRegister());
3930   locations->SetOut(Location::RequiresFpuRegister());
3931 }
3932 
VisitFP16ToFloat(HInvoke * invoke)3933 void IntrinsicCodeGeneratorARM64::VisitFP16ToFloat(HInvoke* invoke) {
3934   DCHECK(codegen_->GetInstructionSetFeatures().HasFP16());
3935   MacroAssembler* masm = GetVIXLAssembler();
3936   UseScratchRegisterScope scratch_scope(masm);
3937   Register bits = InputRegisterAt(invoke, 0);
3938   VRegister out = SRegisterFrom(invoke->GetLocations()->Out());
3939   VRegister half = scratch_scope.AcquireH();
3940   __ Fmov(half, bits);  // ARMv8.2
3941   __ Fcvt(out, half);
3942 }
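
// Semantically this is Float.float16ToFloat: widen an IEEE-754 binary16 value to binary32.
// Sketch using the ARM __fp16 storage type (a Clang/GCC extension on AArch64) and <cstring>:
//
//   static float Fp16ToFloat(uint16_t bits) {
//     __fp16 h;
//     std::memcpy(&h, &bits, sizeof(h));  // reinterpret the short as a half (Fmov half, bits)
//     return static_cast<float>(h);       // the widening conversion is exact (Fcvt out, half)
//   }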
3943 
VisitFP16ToHalf(HInvoke * invoke)3944 void IntrinsicLocationsBuilderARM64::VisitFP16ToHalf(HInvoke* invoke) {
3945   if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
3946     return;
3947   }
3948 
3949   LocationSummary* locations = new (allocator_) LocationSummary(invoke,
3950                                                                 LocationSummary::kNoCall,
3951                                                                 kIntrinsified);
3952   locations->SetInAt(0, Location::RequiresFpuRegister());
3953   locations->SetOut(Location::RequiresRegister());
3954 }
3955 
VisitFP16ToHalf(HInvoke * invoke)3956 void IntrinsicCodeGeneratorARM64::VisitFP16ToHalf(HInvoke* invoke) {
3957   DCHECK(codegen_->GetInstructionSetFeatures().HasFP16());
3958   MacroAssembler* masm = GetVIXLAssembler();
3959   UseScratchRegisterScope scratch_scope(masm);
3960   VRegister in = SRegisterFrom(invoke->GetLocations()->InAt(0));
3961   VRegister half = scratch_scope.AcquireH();
3962   Register out = WRegisterFrom(invoke->GetLocations()->Out());
3963   __ Fcvt(half, in);
3964   __ Fmov(out, half);
3965   __ Sxth(out, out);  // sign extend due to returning a short type.
3966 }
3967 
3968 template<typename OP>
GenerateFP16Round(HInvoke * invoke,CodeGeneratorARM64 * const codegen_,MacroAssembler * masm,OP && roundOp)3969 void GenerateFP16Round(HInvoke* invoke,
3970                        CodeGeneratorARM64* const codegen_,
3971                        MacroAssembler* masm,
3972                        OP&& roundOp) {
3973   DCHECK(codegen_->GetInstructionSetFeatures().HasFP16());
3974   LocationSummary* locations = invoke->GetLocations();
3975   UseScratchRegisterScope scratch_scope(masm);
3976   Register out = WRegisterFrom(locations->Out());
3977   VRegister half = scratch_scope.AcquireH();
3978   __ Fmov(half, WRegisterFrom(locations->InAt(0)));
3979   roundOp(half, half);
3980   __ Fmov(out, half);
3981   __ Sxth(out, out);
3982 }
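
// Rounding a half to an integral value always produces a value that is exactly representable
// as a half, so a scalar model can round in float and narrow back (sketch for FP16Floor,
// assuming __fp16, <cmath> and <cstring>; Ceil and Rint only swap the rounding function,
// matching Frintp/Frintn):
//
//   static uint16_t Fp16Floor(uint16_t bits) {
//     __fp16 h;
//     std::memcpy(&h, &bits, sizeof(h));
//     __fp16 rounded = static_cast<__fp16>(std::floor(static_cast<float>(h)));  // Frintm
//     uint16_t out;
//     std::memcpy(&out, &rounded, sizeof(out));
//     return out;  // the generated code then sign-extends (Sxth) for the short return value
//   }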
3983 
VisitFP16Floor(HInvoke * invoke)3984 void IntrinsicLocationsBuilderARM64::VisitFP16Floor(HInvoke* invoke) {
3985   if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
3986     return;
3987   }
3988 
3989   CreateIntToIntLocations(allocator_, invoke);
3990 }
3991 
VisitFP16Floor(HInvoke * invoke)3992 void IntrinsicCodeGeneratorARM64::VisitFP16Floor(HInvoke* invoke) {
3993   MacroAssembler* masm = GetVIXLAssembler();
3994   auto roundOp = [masm](const VRegister& out, const VRegister& in) {
3995     __ Frintm(out, in);  // Round towards Minus infinity
3996   };
3997   GenerateFP16Round(invoke, codegen_, masm, roundOp);
3998 }
3999 
VisitFP16Ceil(HInvoke * invoke)4000 void IntrinsicLocationsBuilderARM64::VisitFP16Ceil(HInvoke* invoke) {
4001   if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
4002     return;
4003   }
4004 
4005   CreateIntToIntLocations(allocator_, invoke);
4006 }
4007 
VisitFP16Ceil(HInvoke * invoke)4008 void IntrinsicCodeGeneratorARM64::VisitFP16Ceil(HInvoke* invoke) {
4009   MacroAssembler* masm = GetVIXLAssembler();
4010   auto roundOp = [masm](const VRegister& out, const VRegister& in) {
4011     __ Frintp(out, in);  // Round towards Plus infinity
4012   };
4013   GenerateFP16Round(invoke, codegen_, masm, roundOp);
4014 }
4015 
VisitFP16Rint(HInvoke * invoke)4016 void IntrinsicLocationsBuilderARM64::VisitFP16Rint(HInvoke* invoke) {
4017   if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
4018     return;
4019   }
4020 
4021   CreateIntToIntLocations(allocator_, invoke);
4022 }
4023 
VisitFP16Rint(HInvoke * invoke)4024 void IntrinsicCodeGeneratorARM64::VisitFP16Rint(HInvoke* invoke) {
4025   MacroAssembler* masm = GetVIXLAssembler();
4026   auto roundOp = [masm](const VRegister& out, const VRegister& in) {
4027     __ Frintn(out, in);  // Round to nearest, with ties to even
4028   };
4029   GenerateFP16Round(invoke, codegen_, masm, roundOp);
4030 }
4031 
FP16ComparisonLocations(HInvoke * invoke,ArenaAllocator * allocator_,CodeGeneratorARM64 * codegen_,int requiredTemps)4032 void FP16ComparisonLocations(HInvoke* invoke,
4033                              ArenaAllocator* allocator_,
4034                              CodeGeneratorARM64* codegen_,
4035                              int requiredTemps) {
4036   if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
4037     return;
4038   }
4039 
4040   CreateIntIntToIntLocations(allocator_, invoke);
4041   for (int i = 0; i < requiredTemps; i++) {
4042     invoke->GetLocations()->AddTemp(Location::RequiresFpuRegister());
4043   }
4044 }
4045 
4046 template<typename OP>
GenerateFP16Compare(HInvoke * invoke,CodeGeneratorARM64 * codegen,MacroAssembler * masm,const OP compareOp)4047 void GenerateFP16Compare(HInvoke* invoke,
4048                          CodeGeneratorARM64* codegen,
4049                          MacroAssembler* masm,
4050                          const OP compareOp) {
4051   DCHECK(codegen->GetInstructionSetFeatures().HasFP16());
4052   LocationSummary* locations = invoke->GetLocations();
4053   Register out = WRegisterFrom(locations->Out());
4054   VRegister half0 = HRegisterFrom(locations->GetTemp(0));
4055   VRegister half1 = HRegisterFrom(locations->GetTemp(1));
4056   __ Fmov(half0, WRegisterFrom(locations->InAt(0)));
4057   __ Fmov(half1, WRegisterFrom(locations->InAt(1)));
4058   compareOp(out, half0, half1);
4059 }
4060 
GenerateFP16Compare(HInvoke * invoke,CodeGeneratorARM64 * codegen,MacroAssembler * masm,vixl::aarch64::Condition cond)4061 static inline void GenerateFP16Compare(HInvoke* invoke,
4062                                        CodeGeneratorARM64* codegen,
4063                                        MacroAssembler* masm,
4064                                        vixl::aarch64::Condition cond) {
4065   auto compareOp = [masm, cond](const Register out, const VRegister& in0, const VRegister& in1) {
4066     __ Fcmp(in0, in1);
4067     __ Cset(out, cond);
4068   };
4069   GenerateFP16Compare(invoke, codegen, masm, compareOp);
4070 }
4071 
VisitFP16Greater(HInvoke * invoke)4072 void IntrinsicLocationsBuilderARM64::VisitFP16Greater(HInvoke* invoke) {
4073   FP16ComparisonLocations(invoke, allocator_, codegen_, 2);
4074 }
4075 
VisitFP16Greater(HInvoke * invoke)4076 void IntrinsicCodeGeneratorARM64::VisitFP16Greater(HInvoke* invoke) {
4077   MacroAssembler* masm = GetVIXLAssembler();
4078   GenerateFP16Compare(invoke, codegen_, masm, gt);
4079 }
4080 
VisitFP16GreaterEquals(HInvoke * invoke)4081 void IntrinsicLocationsBuilderARM64::VisitFP16GreaterEquals(HInvoke* invoke) {
4082   FP16ComparisonLocations(invoke, allocator_, codegen_, 2);
4083 }
4084 
VisitFP16GreaterEquals(HInvoke * invoke)4085 void IntrinsicCodeGeneratorARM64::VisitFP16GreaterEquals(HInvoke* invoke) {
4086   MacroAssembler* masm = GetVIXLAssembler();
4087   GenerateFP16Compare(invoke, codegen_, masm, ge);
4088 }
4089 
VisitFP16Less(HInvoke * invoke)4090 void IntrinsicLocationsBuilderARM64::VisitFP16Less(HInvoke* invoke) {
4091   FP16ComparisonLocations(invoke, allocator_, codegen_, 2);
4092 }
4093 
VisitFP16Less(HInvoke * invoke)4094 void IntrinsicCodeGeneratorARM64::VisitFP16Less(HInvoke* invoke) {
4095   MacroAssembler* masm = GetVIXLAssembler();
4096   GenerateFP16Compare(invoke, codegen_, masm, mi);
4097 }
4098 
VisitFP16LessEquals(HInvoke * invoke)4099 void IntrinsicLocationsBuilderARM64::VisitFP16LessEquals(HInvoke* invoke) {
4100   FP16ComparisonLocations(invoke, allocator_, codegen_, 2);
4101 }
4102 
VisitFP16LessEquals(HInvoke * invoke)4103 void IntrinsicCodeGeneratorARM64::VisitFP16LessEquals(HInvoke* invoke) {
4104   MacroAssembler* masm = GetVIXLAssembler();
4105   GenerateFP16Compare(invoke, codegen_, masm, ls);
4106 }
4107 
VisitFP16Compare(HInvoke * invoke)4108 void IntrinsicLocationsBuilderARM64::VisitFP16Compare(HInvoke* invoke) {
4109   FP16ComparisonLocations(invoke, allocator_, codegen_, 2);
4110 }
4111 
VisitFP16Compare(HInvoke * invoke)4112 void IntrinsicCodeGeneratorARM64::VisitFP16Compare(HInvoke* invoke) {
4113   MacroAssembler* masm = GetVIXLAssembler();
4114   auto compareOp = [masm](const Register out,
4115                           const VRegister& in0,
4116                           const VRegister& in1) {
4117     vixl::aarch64::Label end;
4118     vixl::aarch64::Label equal;
4119     vixl::aarch64::Label normal;
4120 
4121     // The normal cases for this method are:
4122     // - in0 > in1 => out = 1
4123     // - in0 < in1 => out = -1
4124     // - in0 == in1 => out = 0
4125     // +/-Infinity are ordered by default so are handled by the normal case.
4126     // There are two special cases that Fcmp is insufficient for distinguishing:
4127     // - in0 and in1 are +0 and -0 => +0 > -0 so compare encoding instead of value
4128     // - in0 or in1 is NaN => manually compare with in0 and in1 separately
4129     __ Fcmp(in0, in1);
4130     __ B(eq, &equal);  // in0==in1 or +0 -0 case.
4131     __ B(vc, &normal);  // in0 and in1 are ordered (not NaN).
4132 
4133     // Either of the inputs is NaN.
4134     // NaN is equal to itself and greater than any other number so:
4135     // - if only in0 is NaN => return 1
4136     // - if only in1 is NaN => return -1
4137     // - if both in0 and in1 are NaN => return 0
4138     __ Fcmp(in0, 0.0);
4139     __ Mov(out, -1);
4140     __ B(vc, &end);  // in0 != NaN => out = -1.
4141     __ Fcmp(in1, 0.0);
4142     __ Cset(out, vc);  // if in1 != NaN => out = 1, otherwise both are NaNs => out = 0.
4143     __ B(&end);
4144 
4145     // in0 == in1 or if one of the inputs is +0 and the other is -0.
4146     __ Bind(&equal);
4147     // Compare encoding of in0 and in1 as the denormal fraction of single precision float.
4148     // Reverse operand order because -0 > +0 when compared as S registers.
4149     // The earlier Fmov(Hregister, Wregister) zeroes the rest of the vector register,
4150     // so bits [127:16] are 0 and do not affect the Fcmp below.
4152     __ Fcmp(in1.S(), in0.S());
4153 
4154     __ Bind(&normal);
4155     __ Cset(out, gt);  // if in0 > in1 => out = 1, otherwise out = 0.
4156                        // Note: the flags may come from the equal path or the original comparison.
4157     __ Csinv(out, out, wzr, pl);  // if in0 >= in1 out=out, otherwise out=-1.
4158 
4159     __ Bind(&end);
4160   };
4161 
4162   GenerateFP16Compare(invoke, codegen_, masm, compareOp);
4163 }
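
// A scalar model of the comparison above over raw binary16 encodings (illustrative sketch):
// NaN compares greater than everything and equal to another NaN, while -0 orders just below
// +0 by comparing encodings rather than values.
//
//   static int32_t HalfCompare(uint16_t x, uint16_t y) {
//     const bool x_nan = (x & 0x7fffu) > 0x7c00u;  // exponent all ones with a non-zero fraction
//     const bool y_nan = (y & 0x7fffu) > 0x7c00u;
//     if (x_nan || y_nan) {
//       return (x_nan ? 1 : 0) - (y_nan ? 1 : 0);  // one NaN: +/-1; both NaN: 0
//     }
//     auto key = [](uint16_t h) {
//       int32_t magnitude = h & 0x7fffu;
//       return (h & 0x8000u) != 0u ? -magnitude - 1 : magnitude;  // -0 maps just below +0
//     };
//     const int32_t kx = key(x);
//     const int32_t ky = key(y);
//     return (kx > ky) - (kx < ky);  // 1, 0 or -1
//   }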
4164 
4165 const int kFP16NaN = 0x7e00;
4166 
GenerateFP16MinMax(HInvoke * invoke,CodeGeneratorARM64 * codegen,MacroAssembler * masm,vixl::aarch64::Condition cond)4167 static inline void GenerateFP16MinMax(HInvoke* invoke,
4168                                        CodeGeneratorARM64* codegen,
4169                                        MacroAssembler* masm,
4170                                        vixl::aarch64::Condition cond) {
4171   DCHECK(codegen->GetInstructionSetFeatures().HasFP16());
4172   LocationSummary* locations = invoke->GetLocations();
4173 
4174   vixl::aarch64::Label equal;
4175   vixl::aarch64::Label end;
4176 
4177   UseScratchRegisterScope temps(masm);
4178 
4179   Register out = WRegisterFrom(locations->Out());
4180   Register in0 = WRegisterFrom(locations->InAt(0));
4181   Register in1 = WRegisterFrom(locations->InAt(1));
4182   VRegister half0 = HRegisterFrom(locations->GetTemp(0));
4183   VRegister half1 = temps.AcquireH();
4184 
4185   // The normal cases for this method are:
4186   // - in0.h == in1.h => out = in0 or in1
4187   // - in0.h <cond> in1.h => out = in0
4188   // - in0.h <!cond> in1.h => out = in1
4189   // +/-Infinity are ordered by default so are handled by the normal case.
4190   // There are two special cases that Fcmp is insufficient for distinguishing:
4191   // - in0 and in1 are +0 and -0 => +0 > -0 so compare encoding instead of value
4192   // - in0 or in1 is NaN => out = NaN
4193   __ Fmov(half0, in0);
4194   __ Fmov(half1, in1);
4195   __ Fcmp(half0, half1);
4196   __ B(eq, &equal);  // half0 = half1 or +0/-0 case.
4197   __ Csel(out, in0, in1, cond);  // if half0 <cond> half1 => out = in0, otherwise out = in1.
4198   __ B(vc, &end);  // None of the inputs were NaN.
4199 
4200   // At least one input was NaN.
4201   __ Mov(out, kFP16NaN);  // out=NaN.
4202   __ B(&end);
4203 
4204   // in0 == in1 or if one of the inputs is +0 and the other is -0.
4205   __ Bind(&equal);
4206   // Fcmp cannot normally distinguish +0 and -0 so compare encoding.
4207   // Encoding is compared as the denormal fraction of a Single.
4208   // Note: encoding of -0 > encoding of +0 despite +0 > -0 so in0 and in1 are swapped.
4209   // Note: the earlier Fmov(Hregister, Wregister) zeroes the rest of the vector register.
4210   __ Fcmp(half1.S(), half0.S());
4211 
4212   __ Csel(out, in0, in1, cond);  // if half0 <cond> half1 => out = in0, otherwise out = in1.
4213 
4214   __ Bind(&end);
4215 }
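
// The same encoding trick as a scalar model for FP16Min (FP16Max is symmetric): any NaN input
// produces the canonical NaN, and the +0/-0 tie is broken so that min(+0, -0) == -0. Sketch:
//
//   static uint16_t HalfMin(uint16_t x, uint16_t y) {
//     if ((x & 0x7fffu) > 0x7c00u || (y & 0x7fffu) > 0x7c00u) {
//       return 0x7e00u;  // kFP16NaN
//     }
//     auto key = [](uint16_t h) {
//       int32_t magnitude = h & 0x7fffu;
//       return (h & 0x8000u) != 0u ? -magnitude - 1 : magnitude;
//     };
//     return key(x) <= key(y) ? x : y;  // Csel(out, in0, in1, mi) plus the `equal` path
//   }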
4216 
VisitFP16Min(HInvoke * invoke)4217 void IntrinsicLocationsBuilderARM64::VisitFP16Min(HInvoke* invoke) {
4218   FP16ComparisonLocations(invoke, allocator_, codegen_, 1);
4219 }
4220 
VisitFP16Min(HInvoke * invoke)4221 void IntrinsicCodeGeneratorARM64::VisitFP16Min(HInvoke* invoke) {
4222   DCHECK(codegen_->GetInstructionSetFeatures().HasFP16());
4223   MacroAssembler* masm = GetVIXLAssembler();
4224   GenerateFP16MinMax(invoke, codegen_, masm, mi);
4225 }
4226 
VisitFP16Max(HInvoke * invoke)4227 void IntrinsicLocationsBuilderARM64::VisitFP16Max(HInvoke* invoke) {
4228   FP16ComparisonLocations(invoke, allocator_, codegen_, 1);
4229 }
4230 
VisitFP16Max(HInvoke * invoke)4231 void IntrinsicCodeGeneratorARM64::VisitFP16Max(HInvoke* invoke) {
4232   DCHECK(codegen_->GetInstructionSetFeatures().HasFP16());
4233   MacroAssembler* masm = GetVIXLAssembler();
4234   GenerateFP16MinMax(invoke, codegen_, masm, gt);
4235 }
4236 
GenerateDivideUnsigned(HInvoke * invoke,CodeGeneratorARM64 * codegen)4237 static void GenerateDivideUnsigned(HInvoke* invoke, CodeGeneratorARM64* codegen) {
4238   LocationSummary* locations = invoke->GetLocations();
4239   MacroAssembler* masm = codegen->GetVIXLAssembler();
4240   DataType::Type type = invoke->GetType();
4241   DCHECK(type == DataType::Type::kInt32 || type == DataType::Type::kInt64);
4242 
4243   Register dividend = RegisterFrom(locations->InAt(0), type);
4244   Register divisor = RegisterFrom(locations->InAt(1), type);
4245   Register out = RegisterFrom(locations->Out(), type);
4246 
4247   // Check if divisor is zero, bail to managed implementation to handle.
4248   SlowPathCodeARM64* slow_path =
4249       new (codegen->GetScopedAllocator()) IntrinsicSlowPathARM64(invoke);
4250   codegen->AddSlowPath(slow_path);
4251   __ Cbz(divisor, slow_path->GetEntryLabel());
4252 
4253   __ Udiv(out, dividend, divisor);
4254 
4255   __ Bind(slow_path->GetExitLabel());
4256 }
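
// Integer.divideUnsigned / Long.divideUnsigned reduce to a single UDIV; only a zero divisor
// (which must throw ArithmeticException) leaves the fast path. Sketch of the 64-bit case:
//
//   static uint64_t DivideUnsigned(uint64_t dividend, uint64_t divisor) {
//     if (divisor == 0u) {
//       return ThrowArithmeticException();  // placeholder for the runtime slow path
//     }
//     return dividend / divisor;            // Udiv
//   }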
4257 
VisitIntegerDivideUnsigned(HInvoke * invoke)4258 void IntrinsicLocationsBuilderARM64::VisitIntegerDivideUnsigned(HInvoke* invoke) {
4259   CreateIntIntToIntSlowPathCallLocations(allocator_, invoke);
4260 }
4261 
VisitIntegerDivideUnsigned(HInvoke * invoke)4262 void IntrinsicCodeGeneratorARM64::VisitIntegerDivideUnsigned(HInvoke* invoke) {
4263   GenerateDivideUnsigned(invoke, codegen_);
4264 }
4265 
VisitLongDivideUnsigned(HInvoke * invoke)4266 void IntrinsicLocationsBuilderARM64::VisitLongDivideUnsigned(HInvoke* invoke) {
4267   CreateIntIntToIntSlowPathCallLocations(allocator_, invoke);
4268 }
4269 
VisitLongDivideUnsigned(HInvoke * invoke)4270 void IntrinsicCodeGeneratorARM64::VisitLongDivideUnsigned(HInvoke* invoke) {
4271   GenerateDivideUnsigned(invoke, codegen_);
4272 }
4273 
VisitMathMultiplyHigh(HInvoke * invoke)4274 void IntrinsicLocationsBuilderARM64::VisitMathMultiplyHigh(HInvoke* invoke) {
4275   CreateIntIntToIntLocations(allocator_, invoke);
4276 }
4277 
VisitMathMultiplyHigh(HInvoke * invoke)4278 void IntrinsicCodeGeneratorARM64::VisitMathMultiplyHigh(HInvoke* invoke) {
4279   LocationSummary* locations = invoke->GetLocations();
4280   MacroAssembler* masm = codegen_->GetVIXLAssembler();
4281   DataType::Type type = invoke->GetType();
4282   DCHECK(type == DataType::Type::kInt64);
4283 
4284   Register x = RegisterFrom(locations->InAt(0), type);
4285   Register y = RegisterFrom(locations->InAt(1), type);
4286   Register out = RegisterFrom(locations->Out(), type);
4287 
4288   __ Smulh(out, x, y);
4289 }
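
// Math.multiplyHigh maps directly to SMULH. The equivalent C++, using the Clang/GCC __int128
// extension (sketch):
//
//   static int64_t MultiplyHigh(int64_t x, int64_t y) {
//     return static_cast<int64_t>((static_cast<__int128>(x) * y) >> 64);  // Smulh
//   }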
4290 
GenerateMathFma(HInvoke * invoke,CodeGeneratorARM64 * codegen)4291 static void GenerateMathFma(HInvoke* invoke, CodeGeneratorARM64* codegen) {
4292   MacroAssembler* masm = codegen->GetVIXLAssembler();
4293 
4294   VRegister n = helpers::InputFPRegisterAt(invoke, 0);
4295   VRegister m = helpers::InputFPRegisterAt(invoke, 1);
4296   VRegister a = helpers::InputFPRegisterAt(invoke, 2);
4297   VRegister out = helpers::OutputFPRegister(invoke);
4298 
4299   __ Fmadd(out, n, m, a);
4300 }
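
// Math.fma maps to a single FMADD, i.e. one rounding step for the whole expression, which is
// also the contract of std::fma (sketch, assuming <cmath>):
//
//   static double Fma(double n, double m, double a) {
//     return std::fma(n, m, a);  // not equivalent to n * m + a, which rounds twice
//   }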
4301 
VisitMathFmaDouble(HInvoke * invoke)4302 void IntrinsicLocationsBuilderARM64::VisitMathFmaDouble(HInvoke* invoke) {
4303   CreateFPFPFPToFPLocations(allocator_, invoke);
4304 }
4305 
VisitMathFmaDouble(HInvoke * invoke)4306 void IntrinsicCodeGeneratorARM64::VisitMathFmaDouble(HInvoke* invoke) {
4307   GenerateMathFma(invoke, codegen_);
4308 }
4309 
VisitMathFmaFloat(HInvoke * invoke)4310 void IntrinsicLocationsBuilderARM64::VisitMathFmaFloat(HInvoke* invoke) {
4311   CreateFPFPFPToFPLocations(allocator_, invoke);
4312 }
4313 
VisitMathFmaFloat(HInvoke * invoke)4314 void IntrinsicCodeGeneratorARM64::VisitMathFmaFloat(HInvoke* invoke) {
4315   GenerateMathFma(invoke, codegen_);
4316 }
4317 
4318 class VarHandleSlowPathARM64 : public IntrinsicSlowPathARM64 {
4319  public:
VarHandleSlowPathARM64(HInvoke * invoke,std::memory_order order)4320   VarHandleSlowPathARM64(HInvoke* invoke, std::memory_order order)
4321       : IntrinsicSlowPathARM64(invoke),
4322         order_(order),
4323         return_success_(false),
4324         strong_(false),
4325         get_and_update_op_(GetAndUpdateOp::kAdd) {
4326   }
4327 
GetByteArrayViewCheckLabel()4328   vixl::aarch64::Label* GetByteArrayViewCheckLabel() {
4329     return &byte_array_view_check_label_;
4330   }
4331 
GetNativeByteOrderLabel()4332   vixl::aarch64::Label* GetNativeByteOrderLabel() {
4333     return &native_byte_order_label_;
4334   }
4335 
SetCompareAndSetOrExchangeArgs(bool return_success,bool strong)4336   void SetCompareAndSetOrExchangeArgs(bool return_success, bool strong) {
4337     if (return_success) {
4338       DCHECK(GetAccessModeTemplate() == mirror::VarHandle::AccessModeTemplate::kCompareAndSet);
4339     } else {
4340       DCHECK(GetAccessModeTemplate() == mirror::VarHandle::AccessModeTemplate::kCompareAndExchange);
4341     }
4342     return_success_ = return_success;
4343     strong_ = strong;
4344   }
4345 
SetGetAndUpdateOp(GetAndUpdateOp get_and_update_op)4346   void SetGetAndUpdateOp(GetAndUpdateOp get_and_update_op) {
4347     DCHECK(GetAccessModeTemplate() == mirror::VarHandle::AccessModeTemplate::kGetAndUpdate);
4348     get_and_update_op_ = get_and_update_op;
4349   }
4350 
EmitNativeCode(CodeGenerator * codegen_in)4351   void EmitNativeCode(CodeGenerator* codegen_in) override {
4352     if (GetByteArrayViewCheckLabel()->IsLinked()) {
4353       EmitByteArrayViewCode(codegen_in);
4354     }
4355     IntrinsicSlowPathARM64::EmitNativeCode(codegen_in);
4356   }
4357 
4358  private:
GetInvoke() const4359   HInvoke* GetInvoke() const {
4360     return GetInstruction()->AsInvoke();
4361   }
4362 
GetAccessModeTemplate() const4363   mirror::VarHandle::AccessModeTemplate GetAccessModeTemplate() const {
4364     return mirror::VarHandle::GetAccessModeTemplateByIntrinsic(GetInvoke()->GetIntrinsic());
4365   }
4366 
4367   void EmitByteArrayViewCode(CodeGenerator* codegen_in);
4368 
4369   vixl::aarch64::Label byte_array_view_check_label_;
4370   vixl::aarch64::Label native_byte_order_label_;
4371   // Shared parameter for all VarHandle intrinsics.
4372   std::memory_order order_;
4373   // Extra arguments for GenerateVarHandleCompareAndSetOrExchange().
4374   bool return_success_;
4375   bool strong_;
4376   // Extra argument for GenerateVarHandleGetAndUpdate().
4377   GetAndUpdateOp get_and_update_op_;
4378 };
4379 
4380 // Generate subtype check without read barriers.
GenerateSubTypeObjectCheckNoReadBarrier(CodeGeneratorARM64 * codegen,SlowPathCodeARM64 * slow_path,Register object,Register type,bool object_can_be_null=true)4381 static void GenerateSubTypeObjectCheckNoReadBarrier(CodeGeneratorARM64* codegen,
4382                                                     SlowPathCodeARM64* slow_path,
4383                                                     Register object,
4384                                                     Register type,
4385                                                     bool object_can_be_null = true) {
4386   MacroAssembler* masm = codegen->GetVIXLAssembler();
4387 
4388   const MemberOffset class_offset = mirror::Object::ClassOffset();
4389   const MemberOffset super_class_offset = mirror::Class::SuperClassOffset();
4390 
4391   vixl::aarch64::Label success;
4392   if (object_can_be_null) {
4393     __ Cbz(object, &success);
4394   }
4395 
4396   UseScratchRegisterScope temps(masm);
4397   Register temp = temps.AcquireW();
4398 
4399   __ Ldr(temp, HeapOperand(object, class_offset.Int32Value()));
4400   codegen->GetAssembler()->MaybeUnpoisonHeapReference(temp);
4401   vixl::aarch64::Label loop;
4402   __ Bind(&loop);
4403   __ Cmp(type, temp);
4404   __ B(&success, eq);
4405   __ Ldr(temp, HeapOperand(temp, super_class_offset.Int32Value()));
4406   codegen->GetAssembler()->MaybeUnpoisonHeapReference(temp);
4407   __ Cbz(temp, slow_path->GetEntryLabel());
4408   __ B(&loop);
4409   __ Bind(&success);
4410 }
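
// Shape of the check above as a sketch (raw pointers and simplified accessors are used here
// for brevity). Because no read barrier is emitted, a stale super-class chain can yield a
// false negative, which is why the caller falls back to a slow path instead of failing:
//
//   bool IsSubtypeNoReadBarrier(mirror::Object* object, mirror::Class* type) {
//     if (object == nullptr) return true;    // only when object_can_be_null
//     mirror::Class* klass = object->GetClass();
//     while (true) {
//       if (klass == type) return true;      // Cmp + B(eq, &success)
//       klass = klass->GetSuperClass();      // walk up the super-class chain
//       if (klass == nullptr) return false;  // Cbz -> slow path (possible false negative)
//     }
//   }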
4411 
4412 // Check access mode and the primitive type from VarHandle.varType.
4413 // Check reference arguments against the VarHandle.varType; for references this is a subclass
4414 // check without read barrier, so it can have false negatives which we handle in the slow path.
GenerateVarHandleAccessModeAndVarTypeChecks(HInvoke * invoke,CodeGeneratorARM64 * codegen,SlowPathCodeARM64 * slow_path,DataType::Type type)4415 static void GenerateVarHandleAccessModeAndVarTypeChecks(HInvoke* invoke,
4416                                                         CodeGeneratorARM64* codegen,
4417                                                         SlowPathCodeARM64* slow_path,
4418                                                         DataType::Type type) {
4419   mirror::VarHandle::AccessMode access_mode =
4420       mirror::VarHandle::GetAccessModeByIntrinsic(invoke->GetIntrinsic());
4421   Primitive::Type primitive_type = DataTypeToPrimitive(type);
4422 
4423   MacroAssembler* masm = codegen->GetVIXLAssembler();
4424   Register varhandle = InputRegisterAt(invoke, 0);
4425 
4426   const MemberOffset var_type_offset = mirror::VarHandle::VarTypeOffset();
4427   const MemberOffset access_mode_bit_mask_offset = mirror::VarHandle::AccessModesBitMaskOffset();
4428   const MemberOffset primitive_type_offset = mirror::Class::PrimitiveTypeOffset();
4429 
4430   UseScratchRegisterScope temps(masm);
4431   Register var_type_no_rb = temps.AcquireW();
4432   Register temp2 = temps.AcquireW();
4433 
4434   // Check that the operation is permitted and the primitive type of varhandle.varType.
4435   // We do not need a read barrier when loading a reference only for loading constant
4436   // primitive field through the reference. Use LDP to load the fields together.
4437   DCHECK_EQ(var_type_offset.Int32Value() + 4, access_mode_bit_mask_offset.Int32Value());
4438   __ Ldp(var_type_no_rb, temp2, HeapOperand(varhandle, var_type_offset.Int32Value()));
4439   codegen->GetAssembler()->MaybeUnpoisonHeapReference(var_type_no_rb);
4440   __ Tbz(temp2, static_cast<uint32_t>(access_mode), slow_path->GetEntryLabel());
4441   __ Ldrh(temp2, HeapOperand(var_type_no_rb, primitive_type_offset.Int32Value()));
4442   if (primitive_type == Primitive::kPrimNot) {
4443     static_assert(Primitive::kPrimNot == 0);
4444     __ Cbnz(temp2, slow_path->GetEntryLabel());
4445   } else {
4446     __ Cmp(temp2, static_cast<uint16_t>(primitive_type));
4447     __ B(slow_path->GetEntryLabel(), ne);
4448   }
4449 
4450   temps.Release(temp2);
4451 
4452   if (type == DataType::Type::kReference) {
4453     // Check reference arguments against the varType.
4454     // False negatives due to varType being an interface or array type
4455     // or due to the missing read barrier are handled by the slow path.
4456     size_t expected_coordinates_count = GetExpectedVarHandleCoordinatesCount(invoke);
4457     uint32_t arguments_start = /* VarHandle object */ 1u + expected_coordinates_count;
4458     uint32_t number_of_arguments = invoke->GetNumberOfArguments();
4459     for (size_t arg_index = arguments_start; arg_index != number_of_arguments; ++arg_index) {
4460       HInstruction* arg = invoke->InputAt(arg_index);
4461       DCHECK_EQ(arg->GetType(), DataType::Type::kReference);
4462       if (!arg->IsNullConstant()) {
4463         Register arg_reg = WRegisterFrom(invoke->GetLocations()->InAt(arg_index));
4464         GenerateSubTypeObjectCheckNoReadBarrier(codegen, slow_path, arg_reg, var_type_no_rb);
4465       }
4466     }
4467   }
4468 }
4469 
GenerateVarHandleStaticFieldCheck(HInvoke * invoke,CodeGeneratorARM64 * codegen,SlowPathCodeARM64 * slow_path)4470 static void GenerateVarHandleStaticFieldCheck(HInvoke* invoke,
4471                                               CodeGeneratorARM64* codegen,
4472                                               SlowPathCodeARM64* slow_path) {
4473   MacroAssembler* masm = codegen->GetVIXLAssembler();
4474   Register varhandle = InputRegisterAt(invoke, 0);
4475 
4476   const MemberOffset coordinate_type0_offset = mirror::VarHandle::CoordinateType0Offset();
4477 
4478   UseScratchRegisterScope temps(masm);
4479   Register temp = temps.AcquireW();
4480 
4481   // Check that the VarHandle references a static field by checking that coordinateType0 == null.
4482   // Do not emit read barrier (or unpoison the reference) for comparing to null.
4483   __ Ldr(temp, HeapOperand(varhandle, coordinate_type0_offset.Int32Value()));
4484   __ Cbnz(temp, slow_path->GetEntryLabel());
4485 }
4486 
GenerateVarHandleInstanceFieldChecks(HInvoke * invoke,CodeGeneratorARM64 * codegen,SlowPathCodeARM64 * slow_path)4487 static void GenerateVarHandleInstanceFieldChecks(HInvoke* invoke,
4488                                                  CodeGeneratorARM64* codegen,
4489                                                  SlowPathCodeARM64* slow_path) {
4490   VarHandleOptimizations optimizations(invoke);
4491   MacroAssembler* masm = codegen->GetVIXLAssembler();
4492   Register varhandle = InputRegisterAt(invoke, 0);
4493   Register object = InputRegisterAt(invoke, 1);
4494 
4495   const MemberOffset coordinate_type0_offset = mirror::VarHandle::CoordinateType0Offset();
4496   const MemberOffset coordinate_type1_offset = mirror::VarHandle::CoordinateType1Offset();
4497 
4498   // Null-check the object.
4499   if (!optimizations.GetSkipObjectNullCheck()) {
4500     __ Cbz(object, slow_path->GetEntryLabel());
4501   }
4502 
4503   if (!optimizations.GetUseKnownImageVarHandle()) {
4504     UseScratchRegisterScope temps(masm);
4505     Register temp = temps.AcquireW();
4506     Register temp2 = temps.AcquireW();
4507 
4508     // Check that the VarHandle references an instance field by checking that
4509     // coordinateType1 == null. coordinateType0 should not be null, but this is handled by the
4510     // type compatibility check with the source object's type, which will fail for null.
4511     DCHECK_EQ(coordinate_type0_offset.Int32Value() + 4, coordinate_type1_offset.Int32Value());
4512     __ Ldp(temp, temp2, HeapOperand(varhandle, coordinate_type0_offset.Int32Value()));
4513     codegen->GetAssembler()->MaybeUnpoisonHeapReference(temp);
4514     // No need for read barrier or unpoisoning of coordinateType1 for comparison with null.
4515     __ Cbnz(temp2, slow_path->GetEntryLabel());
4516 
4517     // Check that the object has the correct type.
4518     // We deliberately avoid the read barrier, letting the slow path handle the false negatives.
4519     temps.Release(temp2);  // Needed by GenerateSubTypeObjectCheckNoReadBarrier().
4520     GenerateSubTypeObjectCheckNoReadBarrier(
4521         codegen, slow_path, object, temp, /*object_can_be_null=*/ false);
4522   }
4523 }
4524 
GenerateVarHandleArrayChecks(HInvoke * invoke,CodeGeneratorARM64 * codegen,VarHandleSlowPathARM64 * slow_path)4525 static void GenerateVarHandleArrayChecks(HInvoke* invoke,
4526                                          CodeGeneratorARM64* codegen,
4527                                          VarHandleSlowPathARM64* slow_path) {
4528   VarHandleOptimizations optimizations(invoke);
4529   MacroAssembler* masm = codegen->GetVIXLAssembler();
4530   Register varhandle = InputRegisterAt(invoke, 0);
4531   Register object = InputRegisterAt(invoke, 1);
4532   Register index = InputRegisterAt(invoke, 2);
4533   DataType::Type value_type =
4534       GetVarHandleExpectedValueType(invoke, /*expected_coordinates_count=*/ 2u);
4535   Primitive::Type primitive_type = DataTypeToPrimitive(value_type);
4536 
4537   const MemberOffset coordinate_type0_offset = mirror::VarHandle::CoordinateType0Offset();
4538   const MemberOffset coordinate_type1_offset = mirror::VarHandle::CoordinateType1Offset();
4539   const MemberOffset component_type_offset = mirror::Class::ComponentTypeOffset();
4540   const MemberOffset primitive_type_offset = mirror::Class::PrimitiveTypeOffset();
4541   const MemberOffset class_offset = mirror::Object::ClassOffset();
4542   const MemberOffset array_length_offset = mirror::Array::LengthOffset();
4543 
4544   // Null-check the object.
4545   if (!optimizations.GetSkipObjectNullCheck()) {
4546     __ Cbz(object, slow_path->GetEntryLabel());
4547   }
4548 
4549   UseScratchRegisterScope temps(masm);
4550   Register temp = temps.AcquireW();
4551   Register temp2 = temps.AcquireW();
4552 
4553   // Check that the VarHandle references an array, byte array view or ByteBuffer by checking
4554   // that coordinateType1 != null. If that's true, coordinateType1 shall be int.class and
4555   // coordinateType0 shall not be null but we do not explicitly verify that.
4556   DCHECK_EQ(coordinate_type0_offset.Int32Value() + 4, coordinate_type1_offset.Int32Value());
4557   __ Ldp(temp, temp2, HeapOperand(varhandle, coordinate_type0_offset.Int32Value()));
4558   codegen->GetAssembler()->MaybeUnpoisonHeapReference(temp);
4559   // No need for read barrier or unpoisoning of coordinateType1 for comparison with null.
4560   __ Cbz(temp2, slow_path->GetEntryLabel());
4561 
4562   // Check the object's class against coordinateType0.
4563   //
4564   // This is an exact check and we defer other cases to the runtime. This includes
4565   // conversion to array of superclass references, which is valid but subsequently
4566   // requires all update operations to check that the value can indeed be stored.
4567   // We do not want to perform such extra checks in the intrinsified code.
4568   //
4569   // We do this check without read barrier, so there can be false negatives which we
4570   // defer to the slow path. There shall be no false negatives for array classes in the
4571   // boot image (including Object[] and primitive arrays) because they are non-movable.
4572   __ Ldr(temp2, HeapOperand(object, class_offset.Int32Value()));
4573   codegen->GetAssembler()->MaybeUnpoisonHeapReference(temp2);
4574   __ Cmp(temp, temp2);
4575   __ B(slow_path->GetEntryLabel(), ne);
4576 
4577   // Check that the coordinateType0 is an array type. We do not need a read barrier
4578   // for loading constant reference fields (or chains of them) for comparison with null,
4579   // nor for finally loading a constant primitive field (primitive type) below.
4580   __ Ldr(temp2, HeapOperand(temp, component_type_offset.Int32Value()));
4581   codegen->GetAssembler()->MaybeUnpoisonHeapReference(temp2);
4582   __ Cbz(temp2, slow_path->GetEntryLabel());
4583 
4584   // Check that the array component type matches the primitive type.
4585   __ Ldrh(temp2, HeapOperand(temp2, primitive_type_offset.Int32Value()));
4586   if (primitive_type == Primitive::kPrimNot) {
4587     static_assert(Primitive::kPrimNot == 0);
4588     __ Cbnz(temp2, slow_path->GetEntryLabel());
4589   } else {
4590     // Except for `kPrimNot` (handled above), `kPrimByte` and `kPrimBoolean` (single-byte,
4591     // so no byte swap is needed), we shall check for a byte array view in the slow path.
4592     // The check requires the ByteArrayViewVarHandle.class to be in the boot image,
4593     // so we cannot emit that if we're JITting without boot image.
4594     bool boot_image_available =
4595         codegen->GetCompilerOptions().IsBootImage() ||
4596         !Runtime::Current()->GetHeap()->GetBootImageSpaces().empty();
4597     bool can_be_view = (DataType::Size(value_type) != 1u) && boot_image_available;
4598     vixl::aarch64::Label* slow_path_label =
4599         can_be_view ? slow_path->GetByteArrayViewCheckLabel() : slow_path->GetEntryLabel();
4600     __ Cmp(temp2, static_cast<uint16_t>(primitive_type));
4601     __ B(slow_path_label, ne);
4602   }
4603 
4604   // Check for array index out of bounds.
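  // The unsigned `hs` comparison below also rejects negative indices, since they compare
  // higher than any valid array length when treated as unsigned.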
4605   __ Ldr(temp, HeapOperand(object, array_length_offset.Int32Value()));
4606   __ Cmp(index, temp);
4607   __ B(slow_path->GetEntryLabel(), hs);
4608 }
4609 
4610 static void GenerateVarHandleCoordinateChecks(HInvoke* invoke,
4611                                               CodeGeneratorARM64* codegen,
4612                                               VarHandleSlowPathARM64* slow_path) {
4613   size_t expected_coordinates_count = GetExpectedVarHandleCoordinatesCount(invoke);
4614   if (expected_coordinates_count == 0u) {
4615     GenerateVarHandleStaticFieldCheck(invoke, codegen, slow_path);
4616   } else if (expected_coordinates_count == 1u) {
4617     GenerateVarHandleInstanceFieldChecks(invoke, codegen, slow_path);
4618   } else {
4619     DCHECK_EQ(expected_coordinates_count, 2u);
4620     GenerateVarHandleArrayChecks(invoke, codegen, slow_path);
4621   }
4622 }
4623 
4624 static VarHandleSlowPathARM64* GenerateVarHandleChecks(HInvoke* invoke,
4625                                                        CodeGeneratorARM64* codegen,
4626                                                        std::memory_order order,
4627                                                        DataType::Type type) {
4628   size_t expected_coordinates_count = GetExpectedVarHandleCoordinatesCount(invoke);
4629   VarHandleOptimizations optimizations(invoke);
4630   if (optimizations.GetUseKnownImageVarHandle()) {
4631     DCHECK_NE(expected_coordinates_count, 2u);
4632     if (expected_coordinates_count == 0u || optimizations.GetSkipObjectNullCheck()) {
4633       return nullptr;
4634     }
4635   }
4636 
4637   VarHandleSlowPathARM64* slow_path =
4638       new (codegen->GetScopedAllocator()) VarHandleSlowPathARM64(invoke, order);
4639   codegen->AddSlowPath(slow_path);
4640 
4641   if (!optimizations.GetUseKnownImageVarHandle()) {
4642     GenerateVarHandleAccessModeAndVarTypeChecks(invoke, codegen, slow_path, type);
4643   }
4644   GenerateVarHandleCoordinateChecks(invoke, codegen, slow_path);
4645 
4646   return slow_path;
4647 }
4648 
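// The code generators below address the value being operated on as `object` + `offset`:
// the holder object and field offset for instance fields, the declaring class and field
// offset for static fields, and, for array accesses, the array reference and
// `data_offset + (index << size_shift)` (see GenerateVarHandleTarget()).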
4649 struct VarHandleTarget {
4650   Register object;  // The object holding the value to operate on.
4651   Register offset;  // The offset of the value to operate on.
4652 };
4653 
4654 static VarHandleTarget GetVarHandleTarget(HInvoke* invoke) {
4655   size_t expected_coordinates_count = GetExpectedVarHandleCoordinatesCount(invoke);
4656   LocationSummary* locations = invoke->GetLocations();
4657 
4658   VarHandleTarget target;
4659   // The temporary allocated for loading the offset.
4660   target.offset = WRegisterFrom(locations->GetTemp(0u));
4661   // The reference to the object that holds the value to operate on.
4662   target.object = (expected_coordinates_count == 0u)
4663       ? WRegisterFrom(locations->GetTemp(1u))
4664       : InputRegisterAt(invoke, 1);
4665   return target;
4666 }
4667 
4668 static void GenerateVarHandleTarget(HInvoke* invoke,
4669                                     const VarHandleTarget& target,
4670                                     CodeGeneratorARM64* codegen) {
4671   MacroAssembler* masm = codegen->GetVIXLAssembler();
4672   Register varhandle = InputRegisterAt(invoke, 0);
4673   size_t expected_coordinates_count = GetExpectedVarHandleCoordinatesCount(invoke);
4674 
4675   if (expected_coordinates_count <= 1u) {
4676     if (VarHandleOptimizations(invoke).GetUseKnownImageVarHandle()) {
4677       ScopedObjectAccess soa(Thread::Current());
4678       ArtField* target_field = GetBootImageVarHandleField(invoke);
4679       if (expected_coordinates_count == 0u) {
4680         ObjPtr<mirror::Class> declaring_class = target_field->GetDeclaringClass();
4681         if (Runtime::Current()->GetHeap()->ObjectIsInBootImageSpace(declaring_class)) {
4682           uint32_t boot_image_offset = CodeGenerator::GetBootImageOffset(declaring_class);
4683           codegen->LoadBootImageRelRoEntry(target.object, boot_image_offset);
4684         } else {
4685           codegen->LoadTypeForBootImageIntrinsic(
4686               target.object,
4687               TypeReference(&declaring_class->GetDexFile(), declaring_class->GetDexTypeIndex()));
4688         }
4689       }
4690       __ Mov(target.offset, target_field->GetOffset().Uint32Value());
4691     } else {
4692       // For static fields, we need to fill the `target.object` with the declaring class,
4693       // so we can use `target.object` as a temporary for the `ArtField*`. For instance
4694       // fields, we do not need the declaring class and can forget the `ArtField*` once we
4695       // have loaded the `target.offset`; we therefore use `target.offset` to hold the `ArtField*`.
4696       Register field = (expected_coordinates_count == 0) ? target.object : target.offset;
4697 
4698       const MemberOffset art_field_offset = mirror::FieldVarHandle::ArtFieldOffset();
4699       const MemberOffset offset_offset = ArtField::OffsetOffset();
4700 
4701       // Load the ArtField*, the offset and, if needed, declaring class.
4702       __ Ldr(field.X(), HeapOperand(varhandle, art_field_offset.Int32Value()));
4703       __ Ldr(target.offset, MemOperand(field.X(), offset_offset.Int32Value()));
4704       if (expected_coordinates_count == 0u) {
4705         codegen->GenerateGcRootFieldLoad(invoke,
4706                                          LocationFrom(target.object),
4707                                          field.X(),
4708                                          ArtField::DeclaringClassOffset().Int32Value(),
4709                                          /*fixup_label=*/nullptr,
4710                                          codegen->GetCompilerReadBarrierOption());
4711       }
4712     }
4713   } else {
4714     DCHECK_EQ(expected_coordinates_count, 2u);
4715     DataType::Type value_type =
4716         GetVarHandleExpectedValueType(invoke, /*expected_coordinates_count=*/ 2u);
4717     size_t size_shift = DataType::SizeShift(value_type);
4718     MemberOffset data_offset = mirror::Array::DataOffset(DataType::Size(value_type));
4719 
4720     Register index = InputRegisterAt(invoke, 2);
4721     Register shifted_index = index;
4722     if (size_shift != 0u) {
4723       shifted_index = target.offset;
4724       __ Lsl(shifted_index, index, size_shift);
4725     }
4726     __ Add(target.offset, shifted_index, data_offset.Int32Value());
4727   }
4728 }
4729 
4730 static LocationSummary* CreateVarHandleCommonLocations(HInvoke* invoke,
4731                                                        CodeGeneratorARM64* codegen) {
4732   size_t expected_coordinates_count = GetExpectedVarHandleCoordinatesCount(invoke);
4733   DataType::Type return_type = invoke->GetType();
4734 
4735   ArenaAllocator* allocator = invoke->GetBlock()->GetGraph()->GetAllocator();
4736   LocationSummary* locations =
4737       new (allocator) LocationSummary(invoke, LocationSummary::kCallOnSlowPath, kIntrinsified);
4738   locations->SetInAt(0, Location::RequiresRegister());
4739   // Require coordinates in registers. These are the object holding the value
4740   // to operate on (except for static fields) and index (for arrays and views).
4741   for (size_t i = 0; i != expected_coordinates_count; ++i) {
4742     locations->SetInAt(/* VarHandle object */ 1u + i, Location::RequiresRegister());
4743   }
4744   if (return_type != DataType::Type::kVoid) {
4745     if (DataType::IsFloatingPointType(return_type)) {
4746       locations->SetOut(Location::RequiresFpuRegister());
4747     } else {
4748       locations->SetOut(Location::RequiresRegister());
4749     }
4750   }
4751   uint32_t arguments_start = /* VarHandle object */ 1u + expected_coordinates_count;
4752   uint32_t number_of_arguments = invoke->GetNumberOfArguments();
4753   for (size_t arg_index = arguments_start; arg_index != number_of_arguments; ++arg_index) {
4754     HInstruction* arg = invoke->InputAt(arg_index);
4755     if (IsZeroBitPattern(arg)) {
4756       locations->SetInAt(arg_index, Location::ConstantLocation(arg));
4757     } else if (DataType::IsFloatingPointType(arg->GetType())) {
4758       locations->SetInAt(arg_index, Location::RequiresFpuRegister());
4759     } else {
4760       locations->SetInAt(arg_index, Location::RequiresRegister());
4761     }
4762   }
4763 
4764   // Add a temporary for offset.
4765   if (codegen->EmitNonBakerReadBarrier() &&
4766       GetExpectedVarHandleCoordinatesCount(invoke) == 0u) {  // For static fields.
4767     // To preserve the offset value across the non-Baker read barrier slow path
4768     // for loading the declaring class, use a fixed callee-save register.
4769     constexpr int first_callee_save = CTZ(kArm64CalleeSaveRefSpills);
4770     locations->AddTemp(Location::RegisterLocation(first_callee_save));
4771   } else {
4772     locations->AddTemp(Location::RequiresRegister());
4773   }
4774   if (expected_coordinates_count == 0u) {
4775     // Add a temporary to hold the declaring class.
4776     locations->AddTemp(Location::RequiresRegister());
4777   }
4778 
4779   return locations;
4780 }
4781 
4782 static void CreateVarHandleGetLocations(HInvoke* invoke, CodeGeneratorARM64* codegen) {
4783   VarHandleOptimizations optimizations(invoke);
4784   if (optimizations.GetDoNotIntrinsify()) {
4785     return;
4786   }
4787 
4788   if (codegen->EmitNonBakerReadBarrier() &&
4789       invoke->GetType() == DataType::Type::kReference &&
4790       invoke->GetIntrinsic() != Intrinsics::kVarHandleGet &&
4791       invoke->GetIntrinsic() != Intrinsics::kVarHandleGetOpaque) {
4792     // Unsupported for non-Baker read barrier because the artReadBarrierSlow() ignores
4793     // the passed reference and reloads it from the field. This gets the memory visibility
4794     // wrong for Acquire/Volatile operations. b/173104084
4795     return;
4796   }
4797 
4798   CreateVarHandleCommonLocations(invoke, codegen);
4799 }
4800 
4801 static void GenerateVarHandleGet(HInvoke* invoke,
4802                                  CodeGeneratorARM64* codegen,
4803                                  std::memory_order order,
4804                                  bool byte_swap = false) {
4805   DataType::Type type = invoke->GetType();
4806   DCHECK_NE(type, DataType::Type::kVoid);
4807 
4808   LocationSummary* locations = invoke->GetLocations();
4809   MacroAssembler* masm = codegen->GetVIXLAssembler();
4810   CPURegister out = helpers::OutputCPURegister(invoke);
4811 
4812   VarHandleTarget target = GetVarHandleTarget(invoke);
4813   VarHandleSlowPathARM64* slow_path = nullptr;
4814   if (!byte_swap) {
4815     slow_path = GenerateVarHandleChecks(invoke, codegen, order, type);
4816     GenerateVarHandleTarget(invoke, target, codegen);
4817     if (slow_path != nullptr) {
4818       __ Bind(slow_path->GetNativeByteOrderLabel());
4819     }
4820   }
4821 
4822   // ARM64 load-acquire instructions are implicitly sequentially consistent.
4823   bool use_load_acquire =
4824       (order == std::memory_order_acquire) || (order == std::memory_order_seq_cst);
4825   DCHECK(use_load_acquire || order == std::memory_order_relaxed);
4826 
4827   // Load the value from the target location.
4828   if (type == DataType::Type::kReference && codegen->EmitBakerReadBarrier()) {
4829     // Piggy-back on the field load path using introspection for the Baker read barrier.
4830     // The `target.offset` is a temporary, use it for field address.
4831     Register tmp_ptr = target.offset.X();
4832     __ Add(tmp_ptr, target.object.X(), target.offset.X());
4833     codegen->GenerateFieldLoadWithBakerReadBarrier(invoke,
4834                                                    locations->Out(),
4835                                                    target.object,
4836                                                    MemOperand(tmp_ptr),
4837                                                    /*needs_null_check=*/ false,
4838                                                    use_load_acquire);
4839     DCHECK(!byte_swap);
4840   } else {
4841     MemOperand address(target.object.X(), target.offset.X());
4842     CPURegister load_reg = out;
4843     DataType::Type load_type = type;
4844     UseScratchRegisterScope temps(masm);
4845     if (byte_swap) {
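      // A byte-swapped load is performed via core registers (REV cannot operate on FP
      // registers); for FP types GenerateReverseBytes() moves the swapped value to `out`.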
4846       if (type == DataType::Type::kInt16) {
4847         // Avoid unnecessary sign extension before REV16.
4848         load_type = DataType::Type::kUint16;
4849       } else if (type == DataType::Type::kFloat32) {
4850         load_type = DataType::Type::kInt32;
4851         load_reg = target.offset.W();
4852       } else if (type == DataType::Type::kFloat64) {
4853         load_type = DataType::Type::kInt64;
4854         load_reg = target.offset.X();
4855       }
4856     }
4857     if (use_load_acquire) {
4858       codegen->LoadAcquire(invoke, load_type, load_reg, address, /*needs_null_check=*/ false);
4859     } else {
4860       codegen->Load(load_type, load_reg, address);
4861     }
4862     if (type == DataType::Type::kReference) {
4863       DCHECK(!byte_swap);
4864       DCHECK(out.IsW());
4865       Location out_loc = locations->Out();
4866       Location object_loc = LocationFrom(target.object);
4867       Location offset_loc = LocationFrom(target.offset);
4868       codegen->MaybeGenerateReadBarrierSlow(invoke, out_loc, out_loc, object_loc, 0u, offset_loc);
4869     } else if (byte_swap) {
4870       GenerateReverseBytes(masm, type, load_reg, out);
4871     }
4872   }
4873 
4874   if (slow_path != nullptr) {
4875     DCHECK(!byte_swap);
4876     __ Bind(slow_path->GetExitLabel());
4877   }
4878 }
4879 
4880 void IntrinsicLocationsBuilderARM64::VisitVarHandleGet(HInvoke* invoke) {
4881   CreateVarHandleGetLocations(invoke, codegen_);
4882 }
4883 
4884 void IntrinsicCodeGeneratorARM64::VisitVarHandleGet(HInvoke* invoke) {
4885   GenerateVarHandleGet(invoke, codegen_, std::memory_order_relaxed);
4886 }
4887 
4888 void IntrinsicLocationsBuilderARM64::VisitVarHandleGetOpaque(HInvoke* invoke) {
4889   CreateVarHandleGetLocations(invoke, codegen_);
4890 }
4891 
4892 void IntrinsicCodeGeneratorARM64::VisitVarHandleGetOpaque(HInvoke* invoke) {
4893   GenerateVarHandleGet(invoke, codegen_, std::memory_order_relaxed);
4894 }
4895 
4896 void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAcquire(HInvoke* invoke) {
4897   CreateVarHandleGetLocations(invoke, codegen_);
4898 }
4899 
4900 void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAcquire(HInvoke* invoke) {
4901   GenerateVarHandleGet(invoke, codegen_, std::memory_order_acquire);
4902 }
4903 
4904 void IntrinsicLocationsBuilderARM64::VisitVarHandleGetVolatile(HInvoke* invoke) {
4905   CreateVarHandleGetLocations(invoke, codegen_);
4906 }
4907 
4908 void IntrinsicCodeGeneratorARM64::VisitVarHandleGetVolatile(HInvoke* invoke) {
4909   GenerateVarHandleGet(invoke, codegen_, std::memory_order_seq_cst);
4910 }
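
// Summary of the Java-level access modes served above: `vh.get(obj)` and
// `vh.getOpaque(obj)` share the relaxed (plain load) path, while `vh.getAcquire(obj)`
// and `vh.getVolatile(obj)` both emit a load-acquire, which is sufficient for the
// volatile (seq_cst) case on ARM64 (see the comment in GenerateVarHandleGet()).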
4911 
4912 static void CreateVarHandleSetLocations(HInvoke* invoke, CodeGeneratorARM64* codegen) {
4913   VarHandleOptimizations optimizations(invoke);
4914   if (optimizations.GetDoNotIntrinsify()) {
4915     return;
4916   }
4917 
4918   CreateVarHandleCommonLocations(invoke, codegen);
4919 }
4920 
4921 static void GenerateVarHandleSet(HInvoke* invoke,
4922                                  CodeGeneratorARM64* codegen,
4923                                  std::memory_order order,
4924                                  bool byte_swap = false) {
4925   uint32_t value_index = invoke->GetNumberOfArguments() - 1;
4926   DataType::Type value_type = GetDataTypeFromShorty(invoke, value_index);
4927 
4928   MacroAssembler* masm = codegen->GetVIXLAssembler();
4929   CPURegister value = InputCPURegisterOrZeroRegAt(invoke, value_index);
4930 
4931   VarHandleTarget target = GetVarHandleTarget(invoke);
4932   VarHandleSlowPathARM64* slow_path = nullptr;
4933   if (!byte_swap) {
4934     slow_path = GenerateVarHandleChecks(invoke, codegen, order, value_type);
4935     GenerateVarHandleTarget(invoke, target, codegen);
4936     if (slow_path != nullptr) {
4937       __ Bind(slow_path->GetNativeByteOrderLabel());
4938     }
4939   }
4940 
4941   // ARM64 store-release instructions are implicitly sequentially consistent.
4942   bool use_store_release =
4943       (order == std::memory_order_release) || (order == std::memory_order_seq_cst);
4944   DCHECK(use_store_release || order == std::memory_order_relaxed);
4945 
4946   // Store the value to the target location.
4947   {
4948     CPURegister source = value;
4949     UseScratchRegisterScope temps(masm);
4950     if (kPoisonHeapReferences && value_type == DataType::Type::kReference) {
4951       DCHECK(value.IsW());
4952       Register temp = temps.AcquireW();
4953       __ Mov(temp, value.W());
4954       codegen->GetAssembler()->PoisonHeapReference(temp);
4955       source = temp;
4956     }
4957     if (byte_swap) {
4958       DCHECK(!source.IsZero());  // We use the main path for zero as it does not need a byte swap.
4959       Register temp = source.Is64Bits() ? temps.AcquireX() : temps.AcquireW();
4960       if (value_type == DataType::Type::kInt16) {
4961         // Avoid unnecessary sign extension before storing.
4962         value_type = DataType::Type::kUint16;
4963       } else if (DataType::IsFloatingPointType(value_type)) {
4964         __ Fmov(temp, source.Is64Bits() ? source.D() : source.S());
4965         value_type = source.Is64Bits() ? DataType::Type::kInt64 : DataType::Type::kInt32;
4966         source = temp;  // Source for the `GenerateReverseBytes()` below.
4967       }
4968       GenerateReverseBytes(masm, value_type, source, temp);
4969       source = temp;
4970     }
4971     MemOperand address(target.object.X(), target.offset.X());
4972     if (use_store_release) {
4973       codegen->StoreRelease(invoke, value_type, source, address, /*needs_null_check=*/ false);
4974     } else {
4975       codegen->Store(value_type, source, address);
4976     }
4977   }
4978 
4979   if (CodeGenerator::StoreNeedsWriteBarrier(value_type, invoke->InputAt(value_index))) {
4980     codegen->MaybeMarkGCCard(target.object, Register(value), /* emit_null_check= */ true);
4981   }
4982 
4983   if (slow_path != nullptr) {
4984     DCHECK(!byte_swap);
4985     __ Bind(slow_path->GetExitLabel());
4986   }
4987 }
4988 
4989 void IntrinsicLocationsBuilderARM64::VisitVarHandleSet(HInvoke* invoke) {
4990   CreateVarHandleSetLocations(invoke, codegen_);
4991 }
4992 
4993 void IntrinsicCodeGeneratorARM64::VisitVarHandleSet(HInvoke* invoke) {
4994   GenerateVarHandleSet(invoke, codegen_, std::memory_order_relaxed);
4995 }
4996 
4997 void IntrinsicLocationsBuilderARM64::VisitVarHandleSetOpaque(HInvoke* invoke) {
4998   CreateVarHandleSetLocations(invoke, codegen_);
4999 }
5000 
5001 void IntrinsicCodeGeneratorARM64::VisitVarHandleSetOpaque(HInvoke* invoke) {
5002   GenerateVarHandleSet(invoke, codegen_, std::memory_order_relaxed);
5003 }
5004 
5005 void IntrinsicLocationsBuilderARM64::VisitVarHandleSetRelease(HInvoke* invoke) {
5006   CreateVarHandleSetLocations(invoke, codegen_);
5007 }
5008 
5009 void IntrinsicCodeGeneratorARM64::VisitVarHandleSetRelease(HInvoke* invoke) {
5010   GenerateVarHandleSet(invoke, codegen_, std::memory_order_release);
5011 }
5012 
5013 void IntrinsicLocationsBuilderARM64::VisitVarHandleSetVolatile(HInvoke* invoke) {
5014   CreateVarHandleSetLocations(invoke, codegen_);
5015 }
5016 
5017 void IntrinsicCodeGeneratorARM64::VisitVarHandleSetVolatile(HInvoke* invoke) {
5018   GenerateVarHandleSet(invoke, codegen_, std::memory_order_seq_cst);
5019 }
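
// Summary of the Java-level access modes served above: `vh.set(obj, v)` and
// `vh.setOpaque(obj, v)` use the relaxed path, while `vh.setRelease(obj, v)` and
// `vh.setVolatile(obj, v)` both emit a store-release, which is sufficient for the
// volatile (seq_cst) case on ARM64 (see the comment in GenerateVarHandleSet()).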
5020 
5021 static void CreateVarHandleCompareAndSetOrExchangeLocations(HInvoke* invoke,
5022                                                             CodeGeneratorARM64* codegen,
5023                                                             bool return_success) {
5024   VarHandleOptimizations optimizations(invoke);
5025   if (optimizations.GetDoNotIntrinsify()) {
5026     return;
5027   }
5028 
5029   uint32_t number_of_arguments = invoke->GetNumberOfArguments();
5030   DataType::Type value_type = GetDataTypeFromShorty(invoke, number_of_arguments - 1u);
5031   if (value_type == DataType::Type::kReference && codegen->EmitNonBakerReadBarrier()) {
5032     // Unsupported for non-Baker read barrier because the artReadBarrierSlow() ignores
5033     // the passed reference and reloads it from the field. This breaks the read barriers
5034     // in slow path in different ways. The marked old value may not actually be a to-space
5035     // reference to the same object as `old_value`, breaking slow path assumptions. And
5036     // for CompareAndExchange, marking the old value after comparison failure may actually
5037     // return the reference to `expected`, erroneously indicating success even though we
5038     // did not set the new value. (And it also gets the memory visibility wrong.) b/173104084
5039     return;
5040   }
5041 
5042   LocationSummary* locations = CreateVarHandleCommonLocations(invoke, codegen);
5043 
5044   if (codegen->EmitNonBakerReadBarrier()) {
5045     // We need callee-save registers for both the class object and offset instead of
5046     // the temporaries reserved in CreateVarHandleCommonLocations().
5047     static_assert(POPCOUNT(kArm64CalleeSaveRefSpills) >= 2u);
5048     uint32_t first_callee_save = CTZ(kArm64CalleeSaveRefSpills);
5049     uint32_t second_callee_save = CTZ(kArm64CalleeSaveRefSpills ^ (1u << first_callee_save));
5050     if (GetExpectedVarHandleCoordinatesCount(invoke) == 0u) {  // For static fields.
5051       DCHECK_EQ(locations->GetTempCount(), 2u);
5052       DCHECK(locations->GetTemp(0u).Equals(Location::RequiresRegister()));
5053       DCHECK(locations->GetTemp(1u).Equals(Location::RegisterLocation(first_callee_save)));
5054       locations->SetTempAt(0u, Location::RegisterLocation(second_callee_save));
5055     } else {
5056       DCHECK_EQ(locations->GetTempCount(), 1u);
5057       DCHECK(locations->GetTemp(0u).Equals(Location::RequiresRegister()));
5058       locations->SetTempAt(0u, Location::RegisterLocation(first_callee_save));
5059     }
5060   }
5061   size_t old_temp_count = locations->GetTempCount();
5062   DCHECK_EQ(old_temp_count, (GetExpectedVarHandleCoordinatesCount(invoke) == 0) ? 2u : 1u);
5063   if (!return_success) {
5064     if (DataType::IsFloatingPointType(value_type)) {
5065       // Add a temporary for old value and exclusive store result if floating point
5066       // `expected` and/or `new_value` take scratch registers.
5067       size_t available_scratch_registers =
5068           (IsZeroBitPattern(invoke->InputAt(number_of_arguments - 1u)) ? 1u : 0u) +
5069           (IsZeroBitPattern(invoke->InputAt(number_of_arguments - 2u)) ? 1u : 0u);
5070       size_t temps_needed = /* pointer, old value, store result */ 3u - available_scratch_registers;
5071       // We can reuse the declaring class (if present) and offset temporary.
5072       if (temps_needed > old_temp_count) {
5073         locations->AddRegisterTemps(temps_needed - old_temp_count);
5074       }
5075     } else if ((value_type != DataType::Type::kReference && DataType::Size(value_type) != 1u) &&
5076                !IsZeroBitPattern(invoke->InputAt(number_of_arguments - 2u)) &&
5077                !IsZeroBitPattern(invoke->InputAt(number_of_arguments - 1u)) &&
5078                GetExpectedVarHandleCoordinatesCount(invoke) == 2u) {
5079       // Allocate a normal temporary for store result in the non-native byte order path
5080       // because scratch registers are used by the byte-swapped `expected` and `new_value`.
5081       DCHECK_EQ(old_temp_count, 1u);
5082       locations->AddTemp(Location::RequiresRegister());
5083     }
5084   }
5085   if (value_type == DataType::Type::kReference && codegen->EmitReadBarrier()) {
5086     // Add a temporary for the `old_value_temp` in slow path.
5087     locations->AddTemp(Location::RequiresRegister());
5088   }
5089 }
5090 
5091 static Register MoveToTempIfFpRegister(const CPURegister& cpu_reg,
5092                                        DataType::Type type,
5093                                        MacroAssembler* masm,
5094                                        UseScratchRegisterScope* temps) {
5095   if (cpu_reg.IsS()) {
5096     DCHECK_EQ(type, DataType::Type::kFloat32);
5097     Register reg = temps->AcquireW();
5098     __ Fmov(reg, cpu_reg.S());
5099     return reg;
5100   } else if (cpu_reg.IsD()) {
5101     DCHECK_EQ(type, DataType::Type::kFloat64);
5102     Register reg = temps->AcquireX();
5103     __ Fmov(reg, cpu_reg.D());
5104     return reg;
5105   } else {
5106     return DataType::Is64BitType(type) ? cpu_reg.X() : cpu_reg.W();
5107   }
5108 }
5109 
5110 static void GenerateVarHandleCompareAndSetOrExchange(HInvoke* invoke,
5111                                                      CodeGeneratorARM64* codegen,
5112                                                      std::memory_order order,
5113                                                      bool return_success,
5114                                                      bool strong,
5115                                                      bool byte_swap = false) {
5116   DCHECK(return_success || strong);
5117 
5118   uint32_t expected_index = invoke->GetNumberOfArguments() - 2;
5119   uint32_t new_value_index = invoke->GetNumberOfArguments() - 1;
5120   DataType::Type value_type = GetDataTypeFromShorty(invoke, new_value_index);
5121   DCHECK_EQ(value_type, GetDataTypeFromShorty(invoke, expected_index));
5122 
5123   MacroAssembler* masm = codegen->GetVIXLAssembler();
5124   LocationSummary* locations = invoke->GetLocations();
5125   CPURegister expected = InputCPURegisterOrZeroRegAt(invoke, expected_index);
5126   CPURegister new_value = InputCPURegisterOrZeroRegAt(invoke, new_value_index);
5127   CPURegister out = helpers::OutputCPURegister(invoke);
5128 
5129   VarHandleTarget target = GetVarHandleTarget(invoke);
5130   VarHandleSlowPathARM64* slow_path = nullptr;
5131   if (!byte_swap) {
5132     slow_path = GenerateVarHandleChecks(invoke, codegen, order, value_type);
5133     GenerateVarHandleTarget(invoke, target, codegen);
5134     if (slow_path != nullptr) {
5135       slow_path->SetCompareAndSetOrExchangeArgs(return_success, strong);
5136       __ Bind(slow_path->GetNativeByteOrderLabel());
5137     }
5138   }
5139 
5140   // This needs to be before the temp registers, as MarkGCCard also uses VIXL temps.
5141   if (CodeGenerator::StoreNeedsWriteBarrier(value_type, invoke->InputAt(new_value_index))) {
5142     // Mark card for object assuming new value is stored.
5143     bool new_value_can_be_null = true;  // TODO: Worth finding out this information?
5144     codegen->MaybeMarkGCCard(target.object, new_value.W(), new_value_can_be_null);
5145   }
5146 
5147   // Reuse the `offset` temporary for the pointer to the target location,
5148   // except for references that need the offset for the read barrier.
5149   UseScratchRegisterScope temps(masm);
5150   Register tmp_ptr = target.offset.X();
5151   if (value_type == DataType::Type::kReference && codegen->EmitReadBarrier()) {
5152     tmp_ptr = temps.AcquireX();
5153   }
5154   __ Add(tmp_ptr, target.object.X(), target.offset.X());
5155 
5156   // Move floating point values to scratch registers.
5157   // Note that float/double CAS uses bitwise comparison, rather than the operator==.
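  // (As a consequence, +0.0 and -0.0 do not compare equal here, and NaNs match only when
  // their bit patterns are identical.)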
5158   Register expected_reg = MoveToTempIfFpRegister(expected, value_type, masm, &temps);
5159   Register new_value_reg = MoveToTempIfFpRegister(new_value, value_type, masm, &temps);
5160   bool is_fp = DataType::IsFloatingPointType(value_type);
5161   DataType::Type cas_type = is_fp
5162       ? ((value_type == DataType::Type::kFloat64) ? DataType::Type::kInt64 : DataType::Type::kInt32)
5163       : value_type;
5164   // Avoid sign extension in the CAS loop by zero-extending `expected` before the loop. This adds
5165   // one instruction for CompareAndExchange as we shall need to sign-extend the returned value.
5166   if (value_type == DataType::Type::kInt16 && !expected.IsZero()) {
5167     Register temp = temps.AcquireW();
5168     __ Uxth(temp, expected_reg);
5169     expected_reg = temp;
5170     cas_type = DataType::Type::kUint16;
5171   } else if (value_type == DataType::Type::kInt8 && !expected.IsZero()) {
5172     Register temp = temps.AcquireW();
5173     __ Uxtb(temp, expected_reg);
5174     expected_reg = temp;
5175     cas_type = DataType::Type::kUint8;
5176   }
5177 
5178   if (byte_swap) {
5179     // Do the byte swap and move values to scratch registers if needed.
5180     // Non-zero FP values and non-zero `expected` for `kInt16` are already in scratch registers.
5181     DCHECK_NE(value_type, DataType::Type::kInt8);
5182     if (!expected.IsZero()) {
5183       bool is_scratch = is_fp || (value_type == DataType::Type::kInt16);
5184       Register temp = is_scratch ? expected_reg : temps.AcquireSameSizeAs(expected_reg);
5185       GenerateReverseBytes(masm, cas_type, expected_reg, temp);
5186       expected_reg = temp;
5187     }
5188     if (!new_value.IsZero()) {
5189       Register temp = is_fp ? new_value_reg : temps.AcquireSameSizeAs(new_value_reg);
5190       GenerateReverseBytes(masm, cas_type, new_value_reg, temp);
5191       new_value_reg = temp;
5192     }
5193   }
5194 
5195   // Prepare registers for old value and the result of the exclusive store.
5196   Register old_value;
5197   Register store_result;
5198   if (return_success) {
5199     // Use the output register for both old value and exclusive store result.
5200     old_value = (cas_type == DataType::Type::kInt64) ? out.X() : out.W();
5201     store_result = out.W();
5202   } else if (DataType::IsFloatingPointType(value_type)) {
5203     // We need two temporary registers but we have already used scratch registers for
5204     // holding the expected and new value unless they are zero bit pattern (+0.0f or
5205     // +0.0). We have allocated sufficient normal temporaries to handle that.
5206     size_t next_temp = 1u;
5207     if (expected.IsZero()) {
5208       old_value = (cas_type == DataType::Type::kInt64) ? temps.AcquireX() : temps.AcquireW();
5209     } else {
5210       Location temp = locations->GetTemp(next_temp);
5211       ++next_temp;
5212       old_value = (cas_type == DataType::Type::kInt64) ? XRegisterFrom(temp) : WRegisterFrom(temp);
5213     }
5214     store_result =
5215         new_value.IsZero() ? temps.AcquireW() : WRegisterFrom(locations->GetTemp(next_temp));
5216     DCHECK(!old_value.Is(tmp_ptr));
5217     DCHECK(!store_result.Is(tmp_ptr));
5218   } else {
5219     // Use the output register for the old value.
5220     old_value = (cas_type == DataType::Type::kInt64) ? out.X() : out.W();
5221     // Use scratch register for the store result, except when we have used up
5222     // scratch registers for byte-swapped `expected` and `new_value`.
5223     // In that case, we have allocated a normal temporary.
5224     store_result = (byte_swap && !expected.IsZero() && !new_value.IsZero())
5225         ? WRegisterFrom(locations->GetTemp(1))
5226         : temps.AcquireW();
5227     DCHECK(!store_result.Is(tmp_ptr));
5228   }
5229 
5230   vixl::aarch64::Label exit_loop_label;
5231   vixl::aarch64::Label* exit_loop = &exit_loop_label;
5232   vixl::aarch64::Label* cmp_failure = &exit_loop_label;
5233 
5234   if (value_type == DataType::Type::kReference && codegen->EmitReadBarrier()) {
5235     // The `old_value_temp` is used first for the marked `old_value` and then for the unmarked
5236     // reloaded old value for subsequent CAS in the slow path. It cannot be a scratch register.
5237     size_t expected_coordinates_count = GetExpectedVarHandleCoordinatesCount(invoke);
5238     Register old_value_temp =
5239         WRegisterFrom(locations->GetTemp((expected_coordinates_count == 0u) ? 2u : 1u));
5240     // For strong CAS, use a scratch register for the store result in slow path.
5241     // For weak CAS, we need to check the store result, so store it in `store_result`.
5242     Register slow_path_store_result = strong ? Register() : store_result;
5243     ReadBarrierCasSlowPathARM64* rb_slow_path =
5244         new (codegen->GetScopedAllocator()) ReadBarrierCasSlowPathARM64(
5245             invoke,
5246             order,
5247             strong,
5248             target.object,
5249             target.offset.X(),
5250             expected_reg,
5251             new_value_reg,
5252             old_value,
5253             old_value_temp,
5254             slow_path_store_result,
5255             /*update_old_value=*/ !return_success,
5256             codegen);
5257     codegen->AddSlowPath(rb_slow_path);
5258     exit_loop = rb_slow_path->GetExitLabel();
5259     cmp_failure = rb_slow_path->GetEntryLabel();
5260   }
5261 
5262   GenerateCompareAndSet(codegen,
5263                         cas_type,
5264                         order,
5265                         strong,
5266                         cmp_failure,
5267                         tmp_ptr,
5268                         new_value_reg,
5269                         old_value,
5270                         store_result,
5271                         expected_reg);
5272   __ Bind(exit_loop);
5273 
5274   if (return_success) {
5275     if (strong) {
5276       __ Cset(out.W(), eq);
5277     } else {
5278       // On success, the Z flag is set and the store result is 1, see GenerateCompareAndSet().
5279       // On failure, either the Z flag is clear or the store result is 0.
5280       // Determine the final success value with a CSEL.
5281       __ Csel(out.W(), store_result, wzr, eq);
5282     }
5283   } else if (byte_swap) {
5284     // Also handles moving to FP registers.
5285     GenerateReverseBytes(masm, value_type, old_value, out);
5286   } else if (DataType::IsFloatingPointType(value_type)) {
5287     __ Fmov((value_type == DataType::Type::kFloat64) ? out.D() : out.S(), old_value);
5288   } else if (value_type == DataType::Type::kInt8) {
5289     __ Sxtb(out.W(), old_value);
5290   } else if (value_type == DataType::Type::kInt16) {
5291     __ Sxth(out.W(), old_value);
5292   }
5293 
5294   if (slow_path != nullptr) {
5295     DCHECK(!byte_swap);
5296     __ Bind(slow_path->GetExitLabel());
5297   }
5298 }
5299 
5300 void IntrinsicLocationsBuilderARM64::VisitVarHandleCompareAndExchange(HInvoke* invoke) {
5301   CreateVarHandleCompareAndSetOrExchangeLocations(invoke, codegen_, /*return_success=*/ false);
5302 }
5303 
5304 void IntrinsicCodeGeneratorARM64::VisitVarHandleCompareAndExchange(HInvoke* invoke) {
5305   GenerateVarHandleCompareAndSetOrExchange(
5306       invoke, codegen_, std::memory_order_seq_cst, /*return_success=*/ false, /*strong=*/ true);
5307 }
5308 
5309 void IntrinsicLocationsBuilderARM64::VisitVarHandleCompareAndExchangeAcquire(HInvoke* invoke) {
5310   CreateVarHandleCompareAndSetOrExchangeLocations(invoke, codegen_, /*return_success=*/ false);
5311 }
5312 
5313 void IntrinsicCodeGeneratorARM64::VisitVarHandleCompareAndExchangeAcquire(HInvoke* invoke) {
5314   GenerateVarHandleCompareAndSetOrExchange(
5315       invoke, codegen_, std::memory_order_acquire, /*return_success=*/ false, /*strong=*/ true);
5316 }
5317 
5318 void IntrinsicLocationsBuilderARM64::VisitVarHandleCompareAndExchangeRelease(HInvoke* invoke) {
5319   CreateVarHandleCompareAndSetOrExchangeLocations(invoke, codegen_, /*return_success=*/ false);
5320 }
5321 
5322 void IntrinsicCodeGeneratorARM64::VisitVarHandleCompareAndExchangeRelease(HInvoke* invoke) {
5323   GenerateVarHandleCompareAndSetOrExchange(
5324       invoke, codegen_, std::memory_order_release, /*return_success=*/ false, /*strong=*/ true);
5325 }
5326 
5327 void IntrinsicLocationsBuilderARM64::VisitVarHandleCompareAndSet(HInvoke* invoke) {
5328   CreateVarHandleCompareAndSetOrExchangeLocations(invoke, codegen_, /*return_success=*/ true);
5329 }
5330 
5331 void IntrinsicCodeGeneratorARM64::VisitVarHandleCompareAndSet(HInvoke* invoke) {
5332   GenerateVarHandleCompareAndSetOrExchange(
5333       invoke, codegen_, std::memory_order_seq_cst, /*return_success=*/ true, /*strong=*/ true);
5334 }
5335 
5336 void IntrinsicLocationsBuilderARM64::VisitVarHandleWeakCompareAndSet(HInvoke* invoke) {
5337   CreateVarHandleCompareAndSetOrExchangeLocations(invoke, codegen_, /*return_success=*/ true);
5338 }
5339 
5340 void IntrinsicCodeGeneratorARM64::VisitVarHandleWeakCompareAndSet(HInvoke* invoke) {
5341   GenerateVarHandleCompareAndSetOrExchange(
5342       invoke, codegen_, std::memory_order_seq_cst, /*return_success=*/ true, /*strong=*/ false);
5343 }
5344 
5345 void IntrinsicLocationsBuilderARM64::VisitVarHandleWeakCompareAndSetAcquire(HInvoke* invoke) {
5346   CreateVarHandleCompareAndSetOrExchangeLocations(invoke, codegen_, /*return_success=*/ true);
5347 }
5348 
5349 void IntrinsicCodeGeneratorARM64::VisitVarHandleWeakCompareAndSetAcquire(HInvoke* invoke) {
5350   GenerateVarHandleCompareAndSetOrExchange(
5351       invoke, codegen_, std::memory_order_acquire, /*return_success=*/ true, /*strong=*/ false);
5352 }
5353 
5354 void IntrinsicLocationsBuilderARM64::VisitVarHandleWeakCompareAndSetPlain(HInvoke* invoke) {
5355   CreateVarHandleCompareAndSetOrExchangeLocations(invoke, codegen_, /*return_success=*/ true);
5356 }
5357 
5358 void IntrinsicCodeGeneratorARM64::VisitVarHandleWeakCompareAndSetPlain(HInvoke* invoke) {
5359   GenerateVarHandleCompareAndSetOrExchange(
5360       invoke, codegen_, std::memory_order_relaxed, /*return_success=*/ true, /*strong=*/ false);
5361 }
5362 
5363 void IntrinsicLocationsBuilderARM64::VisitVarHandleWeakCompareAndSetRelease(HInvoke* invoke) {
5364   CreateVarHandleCompareAndSetOrExchangeLocations(invoke, codegen_, /*return_success=*/ true);
5365 }
5366 
5367 void IntrinsicCodeGeneratorARM64::VisitVarHandleWeakCompareAndSetRelease(HInvoke* invoke) {
5368   GenerateVarHandleCompareAndSetOrExchange(
5369       invoke, codegen_, std::memory_order_release, /*return_success=*/ true, /*strong=*/ false);
5370 }
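
// Summary of the Java-level operations served above: `vh.compareAndSet()` returns only
// the success flag (return_success, strong), the `vh.compareAndExchange*()` family
// returns the witnessed old value (strong), and the `vh.weakCompareAndSet*()` family
// may fail spuriously (weak), which is why the generated code inspects the exclusive
// store result instead of assuming success on an equal comparison.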
5371 
5372 static void CreateVarHandleGetAndUpdateLocations(HInvoke* invoke,
5373                                                  CodeGeneratorARM64* codegen,
5374                                                  GetAndUpdateOp get_and_update_op) {
5375   VarHandleOptimizations optimizations(invoke);
5376   if (optimizations.GetDoNotIntrinsify()) {
5377     return;
5378   }
5379 
5380   if (invoke->GetType() == DataType::Type::kReference && codegen->EmitNonBakerReadBarrier()) {
5381     // Unsupported for non-Baker read barrier because the artReadBarrierSlow() ignores
5382     // the passed reference and reloads it from the field, thus seeing the new value
5383     // that we have just stored. (And it also gets the memory visibility wrong.) b/173104084
5384     return;
5385   }
5386 
5387   LocationSummary* locations = CreateVarHandleCommonLocations(invoke, codegen);
5388 
5389   size_t old_temp_count = locations->GetTempCount();
5390   DCHECK_EQ(old_temp_count, (GetExpectedVarHandleCoordinatesCount(invoke) == 0) ? 2u : 1u);
5391   if (DataType::IsFloatingPointType(invoke->GetType())) {
5392     if (get_and_update_op == GetAndUpdateOp::kAdd) {
5393       // For ADD, do not use ZR for zero bit pattern (+0.0f or +0.0).
5394       locations->SetInAt(invoke->GetNumberOfArguments() - 1u, Location::RequiresFpuRegister());
5395     } else {
5396       DCHECK(get_and_update_op == GetAndUpdateOp::kSet);
5397       // We can reuse the declaring class temporary if present.
5398       if (old_temp_count == 1u &&
5399           !IsZeroBitPattern(invoke->InputAt(invoke->GetNumberOfArguments() - 1u))) {
5400         // Add a temporary for `old_value` if floating point `new_value` takes a scratch register.
5401         locations->AddTemp(Location::RequiresRegister());
5402       }
5403     }
5404   }
5405   // We need a temporary for the byte-swap path for bitwise operations unless the argument is a
5406   // zero which does not need a byte-swap. We can reuse the declaring class temporary if present.
5407   if (old_temp_count == 1u &&
5408       (get_and_update_op != GetAndUpdateOp::kSet && get_and_update_op != GetAndUpdateOp::kAdd) &&
5409       GetExpectedVarHandleCoordinatesCount(invoke) == 2u &&
5410       !IsZeroBitPattern(invoke->InputAt(invoke->GetNumberOfArguments() - 1u))) {
5411     DataType::Type value_type =
5412         GetVarHandleExpectedValueType(invoke, /*expected_coordinates_count=*/ 2u);
5413     if (value_type != DataType::Type::kReference && DataType::Size(value_type) != 1u) {
5414       locations->AddTemp(Location::RequiresRegister());
5415     }
5416   }
5417 }
5418 
5419 static void GenerateVarHandleGetAndUpdate(HInvoke* invoke,
5420                                           CodeGeneratorARM64* codegen,
5421                                           GetAndUpdateOp get_and_update_op,
5422                                           std::memory_order order,
5423                                           bool byte_swap = false) {
5424   uint32_t arg_index = invoke->GetNumberOfArguments() - 1;
5425   DataType::Type value_type = GetDataTypeFromShorty(invoke, arg_index);
5426   bool is_fp = DataType::IsFloatingPointType(value_type);
5427 
5428   MacroAssembler* masm = codegen->GetVIXLAssembler();
5429   LocationSummary* locations = invoke->GetLocations();
5430   CPURegister arg = (is_fp && get_and_update_op == GetAndUpdateOp::kAdd)
5431       ? InputCPURegisterAt(invoke, arg_index)
5432       : InputCPURegisterOrZeroRegAt(invoke, arg_index);
5433   CPURegister out = helpers::OutputCPURegister(invoke);
5434 
5435   VarHandleTarget target = GetVarHandleTarget(invoke);
5436   VarHandleSlowPathARM64* slow_path = nullptr;
5437   if (!byte_swap) {
5438     slow_path = GenerateVarHandleChecks(invoke, codegen, order, value_type);
5439     GenerateVarHandleTarget(invoke, target, codegen);
5440     if (slow_path != nullptr) {
5441       slow_path->SetGetAndUpdateOp(get_and_update_op);
5442       __ Bind(slow_path->GetNativeByteOrderLabel());
5443     }
5444   }
5445 
5446   // This needs to be before the temp registers, as MarkGCCard also uses VIXL temps.
5447   if (CodeGenerator::StoreNeedsWriteBarrier(value_type, invoke->InputAt(arg_index))) {
5448     DCHECK(get_and_update_op == GetAndUpdateOp::kSet);
5449     // Mark card for object, the new value shall be stored.
5450     bool new_value_can_be_null = true;  // TODO: Worth finding out this information?
5451     codegen->MaybeMarkGCCard(target.object, arg.W(), new_value_can_be_null);
5452   }
5453 
5454   // Reuse the `target.offset` temporary for the pointer to the target location,
5455   // except for references that need the offset for the non-Baker read barrier.
5456   UseScratchRegisterScope temps(masm);
5457   Register tmp_ptr = target.offset.X();
5458   if (value_type == DataType::Type::kReference && codegen->EmitNonBakerReadBarrier()) {
5459     tmp_ptr = temps.AcquireX();
5460   }
5461   __ Add(tmp_ptr, target.object.X(), target.offset.X());
5462 
5463   // The load/store type is never floating point.
5464   DataType::Type load_store_type = is_fp
5465       ? ((value_type == DataType::Type::kFloat32) ? DataType::Type::kInt32 : DataType::Type::kInt64)
5466       : value_type;
5467   // Avoid sign extension in the CAS loop. Sign-extend after the loop.
5468   // Note: Using unsigned values yields the same value to store (we do not store higher bits).
5469   if (value_type == DataType::Type::kInt8) {
5470     load_store_type = DataType::Type::kUint8;
5471   } else if (value_type == DataType::Type::kInt16) {
5472     load_store_type = DataType::Type::kUint16;
5473   }
5474 
5475   // Prepare register for old value.
5476   CPURegister old_value = out;
5477   if (get_and_update_op == GetAndUpdateOp::kSet) {
5478     // For floating point GetAndSet, do the GenerateGetAndUpdate() with core registers,
5479     // rather than moving between core and FP registers in the loop.
5480     arg = MoveToTempIfFpRegister(arg, value_type, masm, &temps);
5481     if (is_fp && !arg.IsZero()) {
5482       // We need a temporary register but we have already used a scratch register for
5483       // the new value unless it is zero bit pattern (+0.0f or +0.0) and need another one
5484       // in GenerateGetAndUpdate(). We have allocated a normal temporary to handle that.
5485       old_value = CPURegisterFrom(locations->GetTemp(1u), load_store_type);
5486     } else if (value_type == DataType::Type::kReference && codegen->EmitBakerReadBarrier()) {
5487       // Load the old value initially to a scratch register.
5488       // We shall move it to `out` later with a read barrier.
5489       old_value = temps.AcquireW();
5490     }
5491   }
5492 
5493   if (byte_swap) {
5494     DCHECK_NE(value_type, DataType::Type::kReference);
5495     DCHECK_NE(DataType::Size(value_type), 1u);
5496     if (get_and_update_op == GetAndUpdateOp::kAdd) {
5497       // We need to do the byte swapping in the CAS loop for GetAndAdd.
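      // Byte reversal commutes with SET and the bitwise operations but not with addition,
      // so for GetAndAdd the swap has to happen around the ADD inside the loop.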
5498       get_and_update_op = GetAndUpdateOp::kAddWithByteSwap;
5499     } else if (!arg.IsZero()) {
5500       // For other operations, avoid byte swap inside the CAS loop by providing an adjusted `arg`.
5501       // For GetAndSet use a scratch register; FP argument is already in a scratch register.
5502       // For bitwise operations GenerateGetAndUpdate() needs both scratch registers;
5503       // we have allocated a normal temporary to handle that.
5504       CPURegister temp = (get_and_update_op == GetAndUpdateOp::kSet)
5505           ? (is_fp ? arg : (arg.Is64Bits() ? temps.AcquireX() : temps.AcquireW()))
5506           : CPURegisterFrom(locations->GetTemp(1u), load_store_type);
5507       GenerateReverseBytes(masm, load_store_type, arg, temp);
5508       arg = temp;
5509     }
5510   }
5511 
5512   GenerateGetAndUpdate(codegen, get_and_update_op, load_store_type, order, tmp_ptr, arg, old_value);
5513 
5514   if (get_and_update_op == GetAndUpdateOp::kAddWithByteSwap) {
5515     // The only adjustment needed is sign-extension for `kInt16`.
5516     // Everything else has been done by the `GenerateGetAndUpdate()`.
5517     DCHECK(byte_swap);
5518     if (value_type == DataType::Type::kInt16) {
5519       DCHECK_EQ(load_store_type, DataType::Type::kUint16);
5520       __ Sxth(out.W(), old_value.W());
5521     }
5522   } else if (byte_swap) {
5523     // Also handles moving to FP registers.
5524     GenerateReverseBytes(masm, value_type, old_value, out);
5525   } else if (get_and_update_op == GetAndUpdateOp::kSet && value_type == DataType::Type::kFloat64) {
5526     __ Fmov(out.D(), old_value.X());
5527   } else if (get_and_update_op == GetAndUpdateOp::kSet && value_type == DataType::Type::kFloat32) {
5528     __ Fmov(out.S(), old_value.W());
5529   } else if (value_type == DataType::Type::kInt8) {
5530     __ Sxtb(out.W(), old_value.W());
5531   } else if (value_type == DataType::Type::kInt16) {
5532     __ Sxth(out.W(), old_value.W());
5533   } else if (value_type == DataType::Type::kReference && codegen->EmitReadBarrier()) {
5534     if (kUseBakerReadBarrier) {
5535       codegen->GenerateIntrinsicMoveWithBakerReadBarrier(out.W(), old_value.W());
5536     } else {
5537       codegen->GenerateReadBarrierSlow(
5538           invoke,
5539           Location::RegisterLocation(out.GetCode()),
5540           Location::RegisterLocation(old_value.GetCode()),
5541           Location::RegisterLocation(target.object.GetCode()),
5542           /*offset=*/ 0u,
5543           /*index=*/ Location::RegisterLocation(target.offset.GetCode()));
5544     }
5545   }
5546 
5547   if (slow_path != nullptr) {
5548     DCHECK(!byte_swap);
5549     __ Bind(slow_path->GetExitLabel());
5550   }
5551 }
5552 
5553 void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAndSet(HInvoke* invoke) {
5554   CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kSet);
5555 }
5556 
5557 void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAndSet(HInvoke* invoke) {
5558   GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kSet, std::memory_order_seq_cst);
5559 }
5560 
5561 void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAndSetAcquire(HInvoke* invoke) {
5562   CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kSet);
5563 }
5564 
5565 void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAndSetAcquire(HInvoke* invoke) {
5566   GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kSet, std::memory_order_acquire);
5567 }
5568 
5569 void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAndSetRelease(HInvoke* invoke) {
5570   CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kSet);
5571 }
5572 
5573 void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAndSetRelease(HInvoke* invoke) {
5574   GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kSet, std::memory_order_release);
5575 }
5576 
5577 void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAndAdd(HInvoke* invoke) {
5578   CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kAdd);
5579 }
5580 
5581 void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAndAdd(HInvoke* invoke) {
5582   GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kAdd, std::memory_order_seq_cst);
5583 }
5584 
5585 void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAndAddAcquire(HInvoke* invoke) {
5586   CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kAdd);
5587 }
5588 
5589 void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAndAddAcquire(HInvoke* invoke) {
5590   GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kAdd, std::memory_order_acquire);
5591 }
5592 
VisitVarHandleGetAndAddRelease(HInvoke * invoke)5593 void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAndAddRelease(HInvoke* invoke) {
5594   CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kAdd);
5595 }
5596 
VisitVarHandleGetAndAddRelease(HInvoke * invoke)5597 void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAndAddRelease(HInvoke* invoke) {
5598   GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kAdd, std::memory_order_release);
5599 }
5600 
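// The bitwise variants differ only in the operation kind: kAnd, kOr and kXor.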
void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAndBitwiseAnd(HInvoke* invoke) {
  CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kAnd);
}

void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAndBitwiseAnd(HInvoke* invoke) {
  GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kAnd, std::memory_order_seq_cst);
}

void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAndBitwiseAndAcquire(HInvoke* invoke) {
  CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kAnd);
}

void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAndBitwiseAndAcquire(HInvoke* invoke) {
  GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kAnd, std::memory_order_acquire);
}

void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAndBitwiseAndRelease(HInvoke* invoke) {
  CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kAnd);
}

void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAndBitwiseAndRelease(HInvoke* invoke) {
  GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kAnd, std::memory_order_release);
}

void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAndBitwiseOr(HInvoke* invoke) {
  CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kOr);
}

void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAndBitwiseOr(HInvoke* invoke) {
  GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kOr, std::memory_order_seq_cst);
}

void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAndBitwiseOrAcquire(HInvoke* invoke) {
  CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kOr);
}

void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAndBitwiseOrAcquire(HInvoke* invoke) {
  GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kOr, std::memory_order_acquire);
}

void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAndBitwiseOrRelease(HInvoke* invoke) {
  CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kOr);
}

void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAndBitwiseOrRelease(HInvoke* invoke) {
  GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kOr, std::memory_order_release);
}

void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAndBitwiseXor(HInvoke* invoke) {
  CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kXor);
}

void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAndBitwiseXor(HInvoke* invoke) {
  GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kXor, std::memory_order_seq_cst);
}

void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAndBitwiseXorAcquire(HInvoke* invoke) {
  CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kXor);
}

void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAndBitwiseXorAcquire(HInvoke* invoke) {
  GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kXor, std::memory_order_acquire);
}

void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAndBitwiseXorRelease(HInvoke* invoke) {
  CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kXor);
}

void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAndBitwiseXorRelease(HInvoke* invoke) {
  GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kXor, std::memory_order_release);
}

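// Slow path taken when the value type does not match the byte array's component type, i.e.
// for a potential ByteArrayViewVarHandle access. It verifies that the VarHandle really is a
// ByteArrayViewVarHandle, performs bounds and alignment checks, and then either returns to
// the main path (native byte order) or generates the access with a byte swap.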
void VarHandleSlowPathARM64::EmitByteArrayViewCode(CodeGenerator* codegen_in) {
  DCHECK(GetByteArrayViewCheckLabel()->IsLinked());
  CodeGeneratorARM64* codegen = down_cast<CodeGeneratorARM64*>(codegen_in);
  MacroAssembler* masm = codegen->GetVIXLAssembler();
  HInvoke* invoke = GetInvoke();
  mirror::VarHandle::AccessModeTemplate access_mode_template = GetAccessModeTemplate();
  DataType::Type value_type =
      GetVarHandleExpectedValueType(invoke, /*expected_coordinates_count=*/ 2u);
  DCHECK_NE(value_type, DataType::Type::kReference);
  size_t size = DataType::Size(value_type);
  DCHECK_GT(size, 1u);
  Register varhandle = InputRegisterAt(invoke, 0);
  Register object = InputRegisterAt(invoke, 1);
  Register index = InputRegisterAt(invoke, 2);

  MemberOffset class_offset = mirror::Object::ClassOffset();
  MemberOffset array_length_offset = mirror::Array::LengthOffset();
  MemberOffset data_offset = mirror::Array::DataOffset(Primitive::kPrimByte);
  MemberOffset native_byte_order_offset = mirror::ByteArrayViewVarHandle::NativeByteOrderOffset();

  __ Bind(GetByteArrayViewCheckLabel());

  VarHandleTarget target = GetVarHandleTarget(invoke);
  {
    UseScratchRegisterScope temps(masm);
    Register temp = temps.AcquireW();
    Register temp2 = temps.AcquireW();

    // The main path checked that coordinateType0 is an array class that matches the class
    // of the actual coordinate argument, but its component type does not match the value type.
    // Check if the `varhandle` references a ByteArrayViewVarHandle instance.
    __ Ldr(temp, HeapOperand(varhandle, class_offset.Int32Value()));
    codegen->GetAssembler()->MaybeUnpoisonHeapReference(temp);
    codegen->LoadClassRootForIntrinsic(temp2, ClassRoot::kJavaLangInvokeByteArrayViewVarHandle);
    __ Cmp(temp, temp2);
    __ B(GetEntryLabel(), ne);

    // Check for array index out of bounds: `length - index` must leave room for `size` bytes.
    __ Ldr(temp, HeapOperand(object, array_length_offset.Int32Value()));
    __ Subs(temp, temp, index);
    __ Ccmp(temp, size, NoFlag, hs);  // If SUBS yields LO (C=false), keep the C flag clear.
    __ B(GetEntryLabel(), lo);

    // Construct the target: offset of byte `index` within the array object (data offset + index).
    __ Add(target.offset, index, data_offset.Int32Value());

    // Alignment check. For an unaligned access, go to the runtime.
    DCHECK(IsPowerOfTwo(size));
    if (size == 2u) {
      __ Tbnz(target.offset, 0, GetEntryLabel());
    } else {
      __ Tst(target.offset, size - 1u);
      __ B(GetEntryLabel(), ne);
    }

    // Byte order check. For native byte order, return to the main path.
    if (access_mode_template == mirror::VarHandle::AccessModeTemplate::kSet &&
        IsZeroBitPattern(invoke->InputAt(invoke->GetNumberOfArguments() - 1u))) {
      // There is no reason to differentiate between native byte order and byte swap
      // when setting a zero bit pattern. Just return to the main path.
      __ B(GetNativeByteOrderLabel());
      return;
    }
    __ Ldr(temp, HeapOperand(varhandle, native_byte_order_offset.Int32Value()));
    __ Cbnz(temp, GetNativeByteOrderLabel());
  }

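  // The byte order recorded in the VarHandle differs from the native byte order;
  // generate the access with an explicit byte swap.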
  switch (access_mode_template) {
    case mirror::VarHandle::AccessModeTemplate::kGet:
      GenerateVarHandleGet(invoke, codegen, order_, /*byte_swap=*/ true);
      break;
    case mirror::VarHandle::AccessModeTemplate::kSet:
      GenerateVarHandleSet(invoke, codegen, order_, /*byte_swap=*/ true);
      break;
    case mirror::VarHandle::AccessModeTemplate::kCompareAndSet:
    case mirror::VarHandle::AccessModeTemplate::kCompareAndExchange:
      GenerateVarHandleCompareAndSetOrExchange(
          invoke, codegen, order_, return_success_, strong_, /*byte_swap=*/ true);
      break;
    case mirror::VarHandle::AccessModeTemplate::kGetAndUpdate:
      GenerateVarHandleGetAndUpdate(
          invoke, codegen, get_and_update_op_, order_, /*byte_swap=*/ true);
      break;
  }
  __ B(GetExitLabel());
}

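// Intrinsics in the ARM64 unimplemented list get default (empty) visitor implementations,
// so the corresponding invokes are compiled as regular calls; UNREACHABLE_INTRINSICS()
// covers intrinsics that are never expected to reach this backend.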
#define MARK_UNIMPLEMENTED(Name) UNIMPLEMENTED_INTRINSIC(ARM64, Name)
UNIMPLEMENTED_INTRINSIC_LIST_ARM64(MARK_UNIMPLEMENTED);
#undef MARK_UNIMPLEMENTED

UNREACHABLE_INTRINSICS(ARM64)

#undef __

}  // namespace arm64
}  // namespace art