/*
 * Copyright (C) 2023 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "berberis/backend/x86_64/loop_guest_context_optimizer.h"

#include <algorithm>
#include <functional>
#include <tuple>

#include "berberis/backend/x86_64/machine_ir.h"
#include "berberis/base/algorithm.h"
#include "berberis/base/logging.h"
#include "berberis/guest_state/guest_state_opaque.h"

namespace berberis::x86_64 {

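// Replaces a Get insn (a load of a guest reg from the guest context at
// [RBP + disp]) with a PseudoCopy from the virtual register mapped to that
// displacement. Allocates the mapping on first use and remembers the mov
// type so the initializing Get can be materialized in the preloop.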
void ReplaceGetAndUpdateMap(MachineIR* ir,
                            const MachineInsnList::iterator insn_it,
                            MemRegMap& mem_reg_map) {
  auto* insn = AsMachineInsnX86_64(*insn_it);
  auto disp = insn->disp();

  MovType regtype;
  switch (insn->opcode()) {
    case kMachineOpMovqRegMemBaseDisp:
      regtype = MovType::kMovq;
      break;
    case kMachineOpMovdqaXRegMemBaseDisp:
      regtype = MovType::kMovdqa;
      break;
    case kMachineOpMovwRegMemBaseDisp:
      regtype = MovType::kMovw;
      break;
    case kMachineOpMovsdXRegMemBaseDisp:
      regtype = MovType::kMovsd;
      break;
    default:
      LOG_ALWAYS_FATAL("Unrecognized Get instruction opcode");
  }

  if (!mem_reg_map[disp].has_value()) {
    auto reg = ir->AllocVReg();
    mem_reg_map[disp] = {reg, regtype, false};
  }

  auto dst = insn->RegAt(0);
  auto copy_size = insn->opcode() == kMachineOpMovdqaXRegMemBaseDisp ? 16 : 8;
  auto* new_insn = ir->NewInsn<PseudoCopy>(dst, mem_reg_map[disp].value().reg, copy_size);
  *insn_it = new_insn;
}

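// Replaces a Put insn (a store of a guest reg to the guest context at
// [RBP + disp]) with a PseudoCopy into the mapped virtual register. Marks
// the mapping as modified so the value is flushed back in the postloop.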
void ReplacePutAndUpdateMap(MachineIR* ir,
                            const MachineInsnList::iterator insn_it,
                            MemRegMap& mem_reg_map) {
  auto* insn = AsMachineInsnX86_64(*insn_it);
  auto disp = insn->disp();

  MovType regtype;
  switch (insn->opcode()) {
    case kMachineOpMovqMemBaseDispReg:
      regtype = MovType::kMovq;
      break;
    case kMachineOpMovdqaMemBaseDispXReg:
      regtype = MovType::kMovdqa;
      break;
    case kMachineOpMovwMemBaseDispReg:
      regtype = MovType::kMovw;
      break;
    case kMachineOpMovsdMemBaseDispXReg:
      regtype = MovType::kMovsd;
      break;
    default:
      LOG_ALWAYS_FATAL("Unrecognized Put instruction opcode");
  }

  if (!mem_reg_map[disp].has_value()) {
    auto reg = ir->AllocVReg();
    mem_reg_map[disp] = {reg, regtype, true};
  } else {
    mem_reg_map[disp].value().is_modified = true;
  }

  auto src = insn->RegAt(1);
  auto copy_size = insn->opcode() == kMachineOpMovdqaMemBaseDispXReg ? 16 : 8;
  auto* new_insn = static_cast<MachineInsn*>(
      ir->NewInsn<PseudoCopy>(mem_reg_map[disp].value().reg, src, copy_size));
  *insn_it = new_insn;
}

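// Emits an initializing Get from the guest context for every mapped
// displacement, inserted just before the terminating insn of the given
// preloop block.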
void GenerateGetInsns(MachineIR* ir, MachineBasicBlock* bb, const MemRegMap& mem_reg_map) {
  // Check that there is no critical edge.
  CHECK_EQ(bb->out_edges().size(), 1);

  auto insert_it = std::prev(bb->insn_list().end());
  for (size_t disp = 0; disp < mem_reg_map.size(); disp++) {
    if (!mem_reg_map[disp].has_value()) {
      continue;
    }

    // It's tempting to generate a Get only if the loop contains a Get insn for the guest
    // register, but that would be incorrect: the loop can exit without ever writing the
    // mapped register, and the post-loop code would then read from an uninitialized
    // mapped register.

    // TODO(b/203826752) Do not generate the Get insn if the initialization of the mapped
    // register is not needed.
    auto reg_info = mem_reg_map[disp].value();
    MachineInsn* get_insn;
    switch (reg_info.mov_type) {
      case MovType::kMovq:
        get_insn = ir->NewInsn<MovqRegMemBaseDisp>(reg_info.reg, kMachineRegRBP, disp);
        break;
      case MovType::kMovdqa:
        get_insn = ir->NewInsn<MovdqaXRegMemBaseDisp>(reg_info.reg, kMachineRegRBP, disp);
        break;
      case MovType::kMovw:
        get_insn = ir->NewInsn<MovwRegMemBaseDisp>(reg_info.reg, kMachineRegRBP, disp);
        break;
      case MovType::kMovsd:
        get_insn = ir->NewInsn<MovsdXRegMemBaseDisp>(reg_info.reg, kMachineRegRBP, disp);
        break;
    }

    bb->insn_list().insert(insert_it, get_insn);
  }
}

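// Emits a flushing Put to the guest context for every mapped displacement
// that was modified in the loop, inserted at the beginning of the given
// postloop block.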
void GeneratePutInsns(MachineIR* ir, MachineBasicBlock* bb, const MemRegMap& mem_reg_map) {
  // Check that there is no critical edge.
  CHECK_EQ(bb->in_edges().size(), 1);

  auto insert_it = bb->insn_list().begin();
  for (size_t disp = 0; disp < mem_reg_map.size(); disp++) {
    if (!mem_reg_map[disp].has_value()) {
      continue;
    }

    auto reg_info = mem_reg_map[disp].value();
    if (!reg_info.is_modified) {
      continue;
    }

    MachineInsn* put_insn;
    switch (reg_info.mov_type) {
      case MovType::kMovq:
        put_insn = ir->NewInsn<MovqMemBaseDispReg>(kMachineRegRBP, disp, reg_info.reg);
        break;
      case MovType::kMovdqa:
        put_insn = ir->NewInsn<MovdqaMemBaseDispXReg>(kMachineRegRBP, disp, reg_info.reg);
        break;
      case MovType::kMovw:
        put_insn = ir->NewInsn<MovwMemBaseDispReg>(kMachineRegRBP, disp, reg_info.reg);
        break;
      case MovType::kMovsd:
        put_insn = ir->NewInsn<MovsdMemBaseDispXReg>(kMachineRegRBP, disp, reg_info.reg);
        break;
    }

    bb->insn_list().insert(insert_it, put_insn);
  }
}

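// Emits initializing Gets in every predecessor of the loop header that lies
// outside the loop (the preloop blocks).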
void GenerateGetsInPreloop(MachineIR* ir, const Loop* loop, const MemRegMap& mem_reg_map) {
  auto* header = (*loop)[0];
  CHECK_GE(header->in_edges().size(), 2);
  for (auto in_edge : header->in_edges()) {
    if (Contains(*loop, in_edge->src())) {
      // The source of the edge is inside the loop.
      continue;
    }

    GenerateGetInsns(ir, in_edge->src(), mem_reg_map);
  }
}

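// Emits flushing Puts in every block outside the loop that is a successor of
// a loop block (the postloop blocks).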
void GeneratePutsInPostloop(MachineIR* ir, const Loop* loop, const MemRegMap& mem_reg_map) {
  for (auto bb : *loop) {
    for (auto* out_edge : bb->out_edges()) {
      if (Contains(*loop, out_edge->dst())) {
        continue;
      }

      GeneratePutInsns(ir, out_edge->dst(), mem_reg_map);
    }
  }
}

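// Counts how many Get/Put insns in the loop access each guest context
// offset. The result is indexed by the displacement into CPUState.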
ArenaVector<int> CountGuestRegAccesses(const MachineIR* ir, const Loop* loop) {
  ArenaVector<int> guest_access_count(sizeof(CPUState), 0, ir->arena());
  for (auto* bb : *loop) {
    for (auto* base_insn : bb->insn_list()) {
      auto insn = AsMachineInsnX86_64(base_insn);
      if (insn->IsCPUStateGet() || insn->IsCPUStatePut()) {
        guest_access_count.at(insn->disp())++;
      }
    }
  }
  return guest_access_count;
}

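// Collects (offset, access count) pairs for all guest context offsets
// accessed in the loop, sorted by access count in descending order so that
// the most frequently accessed guest regs are considered first.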
OffsetCounterMap GetSortedOffsetCounters(MachineIR* ir, Loop* loop) {
  auto guest_access_count = CountGuestRegAccesses(ir, loop);

  OffsetCounterMap offset_counter_map(ir->arena());
  for (size_t offset = 0; offset < sizeof(CPUState); offset++) {
    int cnt = guest_access_count.at(offset);
    if (cnt > 0) {
      offset_counter_map.push_back({offset, cnt});
    }
  }

  std::sort(offset_counter_map.begin(), offset_counter_map.end(), [](auto pair1, auto pair2) {
    return std::get<1>(pair1) > std::get<1>(pair2);
  });

  return offset_counter_map;
}

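// Maps the most frequently accessed guest regs in the loop to virtual
// registers, up to the general and SIMD reg limits in params. Gets and Puts
// of the selected regs inside the loop become PseudoCopy insns; the mappings
// are initialized by Gets in the preloop and flushed by Puts in the postloop.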
void OptimizeLoop(MachineIR* machine_ir, Loop* loop, const OptimizeLoopParams& params) {
  OffsetCounterMap sorted_offsets = GetSortedOffsetCounters(machine_ir, loop);
  ArenaVector<bool> optimized_offsets(sizeof(CPUState), false, machine_ir->arena());

  size_t general_reg_count = 0;
  size_t simd_reg_count = 0;
  for (auto [offset, unused_counter] : sorted_offsets) {
    // TODO(b/232598137) Account for f and v register classes.
    // SIMD regs.
    if (IsSimdOffset(offset)) {
      if (simd_reg_count++ < params.simd_reg_limit) {
        optimized_offsets[offset] = true;
      }
      continue;
    }
    // General regs and flags.
    if (general_reg_count++ < params.general_reg_limit) {
      optimized_offsets[offset] = true;
    }
  }

  MemRegMap mem_reg_map(sizeof(CPUState), std::nullopt, machine_ir->arena());
  // Replace Gets and Puts with PseudoCopy insns and update mem_reg_map.
  for (auto* bb : *loop) {
    for (auto insn_it = bb->insn_list().begin(); insn_it != bb->insn_list().end(); insn_it++) {
      auto insn = AsMachineInsnX86_64(*insn_it);

      // Skip the insn if it accesses a guest reg that was not selected for optimization.
      if (insn->IsCPUStateGet() || insn->IsCPUStatePut()) {
        if (!optimized_offsets.at(insn->disp())) {
          continue;
        }
      }

      if (insn->IsCPUStateGet()) {
        ReplaceGetAndUpdateMap(machine_ir, insn_it, mem_reg_map);
      } else if (insn->IsCPUStatePut()) {
        ReplacePutAndUpdateMap(machine_ir, insn_it, mem_reg_map);
      }
    }
  }

  GenerateGetsInPreloop(machine_ir, loop, mem_reg_map);
  GeneratePutsInPostloop(machine_ir, loop, mem_reg_map);
}

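// Returns true if any insn in the loop is a call. Such loops are skipped by
// the optimizer, presumably because a callee may access the guest context
// directly.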
bool ContainsCall(Loop* loop) {
  for (auto* bb : *loop) {
    for (auto* insn : bb->insn_list()) {
      if (AsMachineInsnX86_64(insn)->opcode() == kMachineOpCallImm) {
        return true;
      }
    }
  }
  return false;
}

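// Walks the loop tree top-down and optimizes the outermost loops that satisfy
// the predicate. Inner loops of an optimized loop are not visited separately,
// since they are covered by the enclosing optimization.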
template <typename PredicateFunction>
void OptimizeLoopTree(MachineIR* machine_ir, LoopTreeNode* node, PredicateFunction predicate) {
  if (node->loop() && predicate(node)) {
    OptimizeLoop(machine_ir, node->loop());
    return;
  }

  for (size_t i = 0; i < node->NumInnerloops(); i++) {
    OptimizeLoopTree(machine_ir, node->GetInnerloopNode(i), predicate);
  }
}

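// Builds the loop tree for the whole IR and removes guest context accesses
// from all loops that do not contain calls.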
void RemoveLoopGuestContextAccesses(MachineIR* machine_ir) {
  // TODO(b/203826752): Provide a better heuristic for deciding which loop to optimize.
  auto loop_tree = BuildLoopTree(machine_ir);

  auto predicate = [](LoopTreeNode* node) -> bool {
    // TODO(b/203826752): Avoid repeatedly invoking ContainsCall for inner loops.
    return !ContainsCall(node->loop());
  };

  OptimizeLoopTree(machine_ir, loop_tree.root(), predicate);
}

}  // namespace berberis::x86_64