1 //===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements the ARMSelectionDAGInfo class.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "ARMTargetMachine.h"
15 #include "llvm/CodeGen/SelectionDAG.h"
16 #include "llvm/IR/DerivedTypes.h"
17 using namespace llvm;
18
19 #define DEBUG_TYPE "arm-selectiondag-info"
20
21 // Emit, if possible, a specialized version of the given Libcall. Typically this
22 // means selecting the appropriately aligned version, but we also convert memset
23 // of 0 into memclr.
EmitSpecializedLibcall(SelectionDAG & DAG,const SDLoc & dl,SDValue Chain,SDValue Dst,SDValue Src,SDValue Size,unsigned Align,RTLIB::Libcall LC) const24 SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
25 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
26 SDValue Size, unsigned Align, RTLIB::Libcall LC) const {
27 const ARMSubtarget &Subtarget =
28 DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
29 const ARMTargetLowering *TLI = Subtarget.getTargetLowering();
30
31 // Only use a specialized AEABI function if the default version of this
32 // Libcall is an AEABI function.
33 if (std::strncmp(TLI->getLibcallName(LC), "__aeabi", 7) != 0)
34 return SDValue();
35
36 // Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be
37 // able to translate memset to memclr and use the value to index the function
38 // name array.
39 enum {
40 AEABI_MEMCPY = 0,
41 AEABI_MEMMOVE,
42 AEABI_MEMSET,
43 AEABI_MEMCLR
44 } AEABILibcall;
45 switch (LC) {
46 case RTLIB::MEMCPY:
47 AEABILibcall = AEABI_MEMCPY;
48 break;
49 case RTLIB::MEMMOVE:
50 AEABILibcall = AEABI_MEMMOVE;
51 break;
52 case RTLIB::MEMSET:
53 AEABILibcall = AEABI_MEMSET;
54 if (ConstantSDNode *ConstantSrc = dyn_cast<ConstantSDNode>(Src))
55 if (ConstantSrc->getZExtValue() == 0)
56 AEABILibcall = AEABI_MEMCLR;
57 break;
58 default:
59 return SDValue();
60 }
61
62 // Choose the most-aligned libcall variant that we can
63 enum {
64 ALIGN1 = 0,
65 ALIGN4,
66 ALIGN8
67 } AlignVariant;
68 if ((Align & 7) == 0)
69 AlignVariant = ALIGN8;
70 else if ((Align & 3) == 0)
71 AlignVariant = ALIGN4;
72 else
73 AlignVariant = ALIGN1;
74
75 TargetLowering::ArgListTy Args;
76 TargetLowering::ArgListEntry Entry;
77 Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
78 Entry.Node = Dst;
79 Args.push_back(Entry);
80 if (AEABILibcall == AEABI_MEMCLR) {
81 Entry.Node = Size;
82 Args.push_back(Entry);
83 } else if (AEABILibcall == AEABI_MEMSET) {
84 // Adjust parameters for memset, EABI uses format (ptr, size, value),
85 // GNU library uses (ptr, value, size)
86 // See RTABI section 4.3.4
87 Entry.Node = Size;
88 Args.push_back(Entry);
89
90 // Extend or truncate the argument to be an i32 value for the call.
91 if (Src.getValueType().bitsGT(MVT::i32))
92 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
93 else if (Src.getValueType().bitsLT(MVT::i32))
94 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);
95
96 Entry.Node = Src;
97 Entry.Ty = Type::getInt32Ty(*DAG.getContext());
98 Entry.IsSExt = false;
99 Args.push_back(Entry);
100 } else {
101 Entry.Node = Src;
102 Args.push_back(Entry);
103
104 Entry.Node = Size;
105 Args.push_back(Entry);
106 }
107
108 char const *FunctionNames[4][3] = {
109 { "__aeabi_memcpy", "__aeabi_memcpy4", "__aeabi_memcpy8" },
110 { "__aeabi_memmove", "__aeabi_memmove4", "__aeabi_memmove8" },
111 { "__aeabi_memset", "__aeabi_memset4", "__aeabi_memset8" },
112 { "__aeabi_memclr", "__aeabi_memclr4", "__aeabi_memclr8" }
113 };
114 TargetLowering::CallLoweringInfo CLI(DAG);
115 CLI.setDebugLoc(dl)
116 .setChain(Chain)
117 .setLibCallee(
118 TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()),
119 DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant],
120 TLI->getPointerTy(DAG.getDataLayout())),
121 std::move(Args))
122 .setDiscardResult();
123 std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
124
125 return CallResult.second;
126 }
127
EmitTargetCodeForMemcpy(SelectionDAG & DAG,const SDLoc & dl,SDValue Chain,SDValue Dst,SDValue Src,SDValue Size,unsigned Align,bool isVolatile,bool AlwaysInline,MachinePointerInfo DstPtrInfo,MachinePointerInfo SrcPtrInfo) const128 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
129 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
130 SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
131 MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
132 const ARMSubtarget &Subtarget =
133 DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
134 // Do repeated 4-byte loads and stores. To be improved.
135 // This requires 4-byte alignment.
136 if ((Align & 3) != 0)
137 return SDValue();
138 // This requires the copy size to be a constant, preferably
139 // within a subtarget-specific limit.
140 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
141 if (!ConstantSize)
142 return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
143 RTLIB::MEMCPY);
144 uint64_t SizeVal = ConstantSize->getZExtValue();
145 if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
146 return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
147 RTLIB::MEMCPY);
148
149 unsigned BytesLeft = SizeVal & 3;
150 unsigned NumMemOps = SizeVal >> 2;
151 unsigned EmittedNumMemOps = 0;
152 EVT VT = MVT::i32;
153 unsigned VTSize = 4;
154 unsigned i = 0;
155 // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
156 const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6;
157 SDValue TFOps[6];
158 SDValue Loads[6];
159 uint64_t SrcOff = 0, DstOff = 0;
160
161 // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to
162 // VLDM/VSTM and make this code emit it when appropriate. This would reduce
163 // pressure on the general purpose registers. However this seems harder to map
164 // onto the register allocator's view of the world.
165
166 // The number of MEMCPY pseudo-instructions to emit. We use up to
167 // MaxLoadsInLDM registers per mcopy, which will get lowered into ldm/stm
168 // later on. This is a lower bound on the number of MEMCPY operations we must
169 // emit.
170 unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;
171
172 // Code size optimisation: do not inline memcpy if expansion results in
173 // more instructions than the libary call.
174 if (NumMEMCPYs > 1 && DAG.getMachineFunction().getFunction().optForMinSize()) {
175 return SDValue();
176 }
177
178 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);
179
180 for (unsigned I = 0; I != NumMEMCPYs; ++I) {
181 // Evenly distribute registers among MEMCPY operations to reduce register
182 // pressure.
183 unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs;
184 unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps;
185
186 Dst = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src,
187 DAG.getConstant(NumRegs, dl, MVT::i32));
188 Src = Dst.getValue(1);
189 Chain = Dst.getValue(2);
190
191 DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize);
192 SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize);
193
194 EmittedNumMemOps = NextEmittedNumMemOps;
195 }
196
197 if (BytesLeft == 0)
198 return Chain;
199
200 // Issue loads / stores for the trailing (1 - 3) bytes.
201 auto getRemainingValueType = [](unsigned BytesLeft) {
202 return (BytesLeft >= 2) ? MVT::i16 : MVT::i8;
203 };
204 auto getRemainingSize = [](unsigned BytesLeft) {
205 return (BytesLeft >= 2) ? 2 : 1;
206 };
207
208 unsigned BytesLeftSave = BytesLeft;
209 i = 0;
210 while (BytesLeft) {
211 VT = getRemainingValueType(BytesLeft);
212 VTSize = getRemainingSize(BytesLeft);
213 Loads[i] = DAG.getLoad(VT, dl, Chain,
214 DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
215 DAG.getConstant(SrcOff, dl, MVT::i32)),
216 SrcPtrInfo.getWithOffset(SrcOff));
217 TFOps[i] = Loads[i].getValue(1);
218 ++i;
219 SrcOff += VTSize;
220 BytesLeft -= VTSize;
221 }
222 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
223 makeArrayRef(TFOps, i));
224
225 i = 0;
226 BytesLeft = BytesLeftSave;
227 while (BytesLeft) {
228 VT = getRemainingValueType(BytesLeft);
229 VTSize = getRemainingSize(BytesLeft);
230 TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
231 DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
232 DAG.getConstant(DstOff, dl, MVT::i32)),
233 DstPtrInfo.getWithOffset(DstOff));
234 ++i;
235 DstOff += VTSize;
236 BytesLeft -= VTSize;
237 }
238 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
239 makeArrayRef(TFOps, i));
240 }
241
EmitTargetCodeForMemmove(SelectionDAG & DAG,const SDLoc & dl,SDValue Chain,SDValue Dst,SDValue Src,SDValue Size,unsigned Align,bool isVolatile,MachinePointerInfo DstPtrInfo,MachinePointerInfo SrcPtrInfo) const242 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove(
243 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
244 SDValue Size, unsigned Align, bool isVolatile,
245 MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
246 return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
247 RTLIB::MEMMOVE);
248 }
249
EmitTargetCodeForMemset(SelectionDAG & DAG,const SDLoc & dl,SDValue Chain,SDValue Dst,SDValue Src,SDValue Size,unsigned Align,bool isVolatile,MachinePointerInfo DstPtrInfo) const250 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
251 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
252 SDValue Size, unsigned Align, bool isVolatile,
253 MachinePointerInfo DstPtrInfo) const {
254 return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
255 RTLIB::MEMSET);
256 }
257