1 /*
2 * Copyright (C) 2023 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "berberis/backend/x86_64/loop_guest_context_optimizer.h"
18
19 #include <algorithm>
20 #include <functional>
21 #include <tuple>
22
23 #include "berberis/backend/x86_64/machine_ir.h"
24 #include "berberis/base/algorithm.h"
25 #include "berberis/base/logging.h"
26 #include "berberis/guest_state/guest_state_opaque.h"
27
28 namespace berberis::x86_64 {
29
ReplaceGetAndUpdateMap(MachineIR * ir,const MachineInsnList::iterator insn_it,MemRegMap & mem_reg_map)30 void ReplaceGetAndUpdateMap(MachineIR* ir,
31 const MachineInsnList::iterator insn_it,
32 MemRegMap& mem_reg_map) {
33 auto* insn = AsMachineInsnX86_64(*insn_it);
34 auto disp = insn->disp();
35
36 MovType regtype;
37 switch (insn->opcode()) {
38 case kMachineOpMovqRegMemBaseDisp:
39 regtype = MovType::kMovq;
40 break;
41 case kMachineOpMovdqaXRegMemBaseDisp:
42 regtype = MovType::kMovdqa;
43 break;
44 case kMachineOpMovwRegMemBaseDisp:
45 regtype = MovType::kMovw;
46 break;
47 case kMachineOpMovsdXRegMemBaseDisp:
48 regtype = MovType::kMovsd;
49 break;
50 default:
51 LOG_ALWAYS_FATAL("Unrecognized Get instruction opcode");
52 }
53
54 if (!mem_reg_map[disp].has_value()) {
55 auto reg = ir->AllocVReg();
56 mem_reg_map[disp] = {reg, regtype, false};
57 }
58
59 auto dst = insn->RegAt(0);
60 auto copy_size = insn->opcode() == kMachineOpMovdqaXRegMemBaseDisp ? 16 : 8;
61 auto* new_insn = ir->NewInsn<PseudoCopy>(dst, mem_reg_map[disp].value().reg, copy_size);
62 *insn_it = new_insn;
63 }
64
ReplacePutAndUpdateMap(MachineIR * ir,const MachineInsnList::iterator insn_it,MemRegMap & mem_reg_map)65 void ReplacePutAndUpdateMap(MachineIR* ir,
66 const MachineInsnList::iterator insn_it,
67 MemRegMap& mem_reg_map) {
68 auto* insn = AsMachineInsnX86_64(*insn_it);
69 auto disp = insn->disp();
70
71 MovType regtype;
72 switch (insn->opcode()) {
73 case kMachineOpMovqMemBaseDispReg:
74 regtype = MovType::kMovq;
75 break;
76 case kMachineOpMovdqaMemBaseDispXReg:
77 regtype = MovType::kMovdqa;
78 break;
79 case kMachineOpMovwMemBaseDispReg:
80 regtype = MovType::kMovw;
81 break;
82 case kMachineOpMovsdMemBaseDispXReg:
83 regtype = MovType::kMovsd;
84 break;
85 default:
86 LOG_ALWAYS_FATAL("Unrecognized Put instruction opcode");
87 }
88
89 if (!mem_reg_map[disp].has_value()) {
90 auto reg = ir->AllocVReg();
91 mem_reg_map[disp] = {reg, regtype, true};
92 } else {
93 mem_reg_map[disp].value().is_modified = true;
94 }
95
96 auto src = insn->RegAt(1);
97 auto copy_size = insn->opcode() == kMachineOpMovdqaMemBaseDispXReg ? 16 : 8;
98 auto* new_insn = static_cast<MachineInsn*>(
99 ir->NewInsn<PseudoCopy>(mem_reg_map[disp].value().reg, src, copy_size));
100 *insn_it = new_insn;
101 }
102
GenerateGetInsns(MachineIR * ir,MachineBasicBlock * bb,const MemRegMap & mem_reg_map)103 void GenerateGetInsns(MachineIR* ir, MachineBasicBlock* bb, const MemRegMap& mem_reg_map) {
104 // Check that there is no critical edge.
105 CHECK_EQ(bb->out_edges().size(), 1);
106
107 auto insert_it = std::prev(bb->insn_list().end());
108 for (unsigned long disp = 0; disp < mem_reg_map.size(); disp++) {
109 if (!mem_reg_map[disp].has_value()) {
110 continue;
111 }
112
113 // It's tempting to generate Get only if there is a Get insn for the guest register in the loop,
114 // but it would be incorrect because the loop can exit without updating
115 // the mapped register, making the afterloop loading from the uninitialized
116 // mapped register.
117
118 // TODO(b/203826752) Do not generate the Get insn if the initialization of the mapped
119 // register is not needed.
120 auto reg_info = mem_reg_map[disp].value();
121 MachineInsn* get_insn;
122 switch (reg_info.mov_type) {
123 case MovType::kMovq:
124 get_insn = ir->NewInsn<MovqRegMemBaseDisp>(reg_info.reg, kMachineRegRBP, disp);
125 break;
126 case MovType::kMovdqa:
127 get_insn = ir->NewInsn<MovdqaXRegMemBaseDisp>(reg_info.reg, kMachineRegRBP, disp);
128 break;
129 case MovType::kMovw:
130 get_insn = ir->NewInsn<MovwRegMemBaseDisp>(reg_info.reg, kMachineRegRBP, disp);
131 break;
132 case MovType::kMovsd:
133 get_insn = ir->NewInsn<MovsdXRegMemBaseDisp>(reg_info.reg, kMachineRegRBP, disp);
134 break;
135 }
136
137 bb->insn_list().insert(insert_it, get_insn);
138 }
139 }
140
GeneratePutInsns(MachineIR * ir,MachineBasicBlock * bb,const MemRegMap & mem_reg_map)141 void GeneratePutInsns(MachineIR* ir, MachineBasicBlock* bb, const MemRegMap& mem_reg_map) {
142 // Check that there is no critical edge.
143 CHECK_EQ(bb->in_edges().size(), 1);
144
145 auto insert_it = bb->insn_list().begin();
146 for (unsigned long disp = 0; disp < mem_reg_map.size(); disp++) {
147 if (!mem_reg_map[disp].has_value()) {
148 continue;
149 }
150
151 auto reg_info = mem_reg_map[disp].value();
152 if (!reg_info.is_modified) {
153 continue;
154 }
155
156 MachineInsn* put_insn;
157 switch (reg_info.mov_type) {
158 case MovType::kMovq:
159 put_insn = ir->NewInsn<MovqMemBaseDispReg>(kMachineRegRBP, disp, reg_info.reg);
160 break;
161 case MovType::kMovdqa:
162 put_insn = ir->NewInsn<MovdqaMemBaseDispXReg>(kMachineRegRBP, disp, reg_info.reg);
163 break;
164 case MovType::kMovw:
165 put_insn = ir->NewInsn<MovwMemBaseDispReg>(kMachineRegRBP, disp, reg_info.reg);
166 break;
167 case MovType::kMovsd:
168 put_insn = ir->NewInsn<MovsdMemBaseDispXReg>(kMachineRegRBP, disp, reg_info.reg);
169 break;
170 }
171
172 bb->insn_list().insert(insert_it, put_insn);
173 }
174 }
175
GenerateGetsInPreloop(MachineIR * ir,const Loop * loop,const MemRegMap & mem_reg_map)176 void GenerateGetsInPreloop(MachineIR* ir, const Loop* loop, const MemRegMap& mem_reg_map) {
177 auto* header = (*loop)[0];
178 CHECK_GE(header->in_edges().size(), 2);
179 for (auto in_edge : header->in_edges()) {
180 if (Contains(*loop, in_edge->src())) {
181 // The source of the edge is inside the loop.
182 continue;
183 }
184
185 GenerateGetInsns(ir, in_edge->src(), mem_reg_map);
186 }
187 }
188
GeneratePutsInPostloop(MachineIR * ir,const Loop * loop,const MemRegMap & mem_reg_map)189 void GeneratePutsInPostloop(MachineIR* ir, const Loop* loop, const MemRegMap& mem_reg_map) {
190 for (auto bb : *loop) {
191 for (auto* out_edge : bb->out_edges()) {
192 if (Contains(*loop, out_edge->dst())) {
193 continue;
194 }
195
196 GeneratePutInsns(ir, out_edge->dst(), mem_reg_map);
197 }
198 }
199 }
200
CountGuestRegAccesses(const MachineIR * ir,const Loop * loop)201 ArenaVector<int> CountGuestRegAccesses(const MachineIR* ir, const Loop* loop) {
202 ArenaVector<int> guest_access_count(sizeof(CPUState), 0, ir->arena());
203 for (auto* bb : *loop) {
204 for (auto* base_insn : bb->insn_list()) {
205 auto insn = AsMachineInsnX86_64(base_insn);
206 if (insn->IsCPUStateGet() || insn->IsCPUStatePut()) {
207 guest_access_count.at(insn->disp())++;
208 }
209 }
210 }
211 return guest_access_count;
212 }
213
GetSortedOffsetCounters(MachineIR * ir,Loop * loop)214 OffsetCounterMap GetSortedOffsetCounters(MachineIR* ir, Loop* loop) {
215 auto guest_access_count = CountGuestRegAccesses(ir, loop);
216
217 OffsetCounterMap offset_counter_map(ir->arena());
218 for (size_t offset = 0; offset < sizeof(CPUState); offset++) {
219 int cnt = guest_access_count.at(offset);
220 if (cnt > 0) {
221 offset_counter_map.push_back({offset, cnt});
222 }
223 }
224
225 std::sort(offset_counter_map.begin(), offset_counter_map.end(), [](auto pair1, auto pair2) {
226 return std::get<1>(pair1) > std::get<1>(pair2);
227 });
228
229 return offset_counter_map;
230 }
231
OptimizeLoop(MachineIR * machine_ir,Loop * loop,const OptimizeLoopParams & params)232 void OptimizeLoop(MachineIR* machine_ir, Loop* loop, const OptimizeLoopParams& params) {
233 OffsetCounterMap sorted_offsets = GetSortedOffsetCounters(machine_ir, loop);
234 ArenaVector<bool> optimized_offsets(sizeof(CPUState), false, machine_ir->arena());
235
236 size_t general_reg_count = 0;
237 size_t simd_reg_count = 0;
238 for (auto [offset, unused_counter] : sorted_offsets) {
239 // TODO(b/232598137) Account for f and v register classes.
240 // Simd regs.
241 if (IsSimdOffset(offset)) {
242 if (simd_reg_count++ < params.simd_reg_limit) {
243 optimized_offsets[offset] = true;
244 }
245 continue;
246 }
247 // General regs and flags.
248 if (general_reg_count++ < params.general_reg_limit) {
249 optimized_offsets[offset] = true;
250 }
251 }
252
253 MemRegMap mem_reg_map(sizeof(CPUState), std::nullopt, machine_ir->arena());
254 // Replace gets and puts with Pseudocopy and update mem_reg_map.
255 for (auto* bb : *loop) {
256 for (auto insn_it = bb->insn_list().begin(); insn_it != bb->insn_list().end(); insn_it++) {
257 auto insn = AsMachineInsnX86_64(*insn_it);
258
259 // Skip insn if it accesses regs with low priority
260 if (insn->IsCPUStateGet() || insn->IsCPUStatePut()) {
261 if (!optimized_offsets.at(insn->disp())) {
262 continue;
263 }
264 }
265
266 if (insn->IsCPUStateGet()) {
267 ReplaceGetAndUpdateMap(machine_ir, insn_it, mem_reg_map);
268 } else if (insn->IsCPUStatePut()) {
269 ReplacePutAndUpdateMap(machine_ir, insn_it, mem_reg_map);
270 }
271 }
272 }
273
274 GenerateGetsInPreloop(machine_ir, loop, mem_reg_map);
275 GeneratePutsInPostloop(machine_ir, loop, mem_reg_map);
276 }
277
ContainsCall(Loop * loop)278 bool ContainsCall(Loop* loop) {
279 for (auto* bb : *loop) {
280 for (auto* insn : bb->insn_list()) {
281 if (AsMachineInsnX86_64(insn)->opcode() == kMachineOpCallImm) {
282 return true;
283 }
284 }
285 }
286 return false;
287 }
288
289 template <typename PredicateFunction>
OptimizeLoopTree(MachineIR * machine_ir,LoopTreeNode * node,PredicateFunction predicate)290 void OptimizeLoopTree(MachineIR* machine_ir, LoopTreeNode* node, PredicateFunction predicate) {
291 if (node->loop() && predicate(node)) {
292 OptimizeLoop(machine_ir, node->loop());
293 return;
294 }
295
296 for (size_t i = 0; i < node->NumInnerloops(); i++) {
297 OptimizeLoopTree(machine_ir, node->GetInnerloopNode(i), predicate);
298 }
299 }
300
RemoveLoopGuestContextAccesses(MachineIR * machine_ir)301 void RemoveLoopGuestContextAccesses(MachineIR* machine_ir) {
302 // TODO(b/203826752): Provide a better heuristic for deciding which loop to optimize.
303 auto loop_tree = BuildLoopTree(machine_ir);
304
305 auto predicate = [](LoopTreeNode* node) -> bool {
306 // TODO(b/203826752): Avoid repeating invoking ContainsCall for innerloops.
307 return !ContainsCall(node->loop());
308 };
309
310 OptimizeLoopTree(machine_ir, loop_tree.root(), predicate);
311 }
312
313 } // namespace berberis::x86_64
314