1//===---------------------------------------------------------------------===//
2// Random ideas for the ARM backend (Thumb specific).
3//===---------------------------------------------------------------------===//
4
5* Add support for compiling functions in both ARM and Thumb mode, then taking
6  the smallest.
7
8* Add support for compiling individual basic blocks in thumb mode, when in a
9  larger ARM function.  This can be used for presumed cold code, like paths
10  to abort (failure path of asserts), EH handling code, etc.
11
12* Thumb doesn't have normal pre/post increment addressing modes, but you can
13  load/store 32-bit integers with pre/postinc by using load/store multiple
14  instrs with a single register.
15
16* Make better use of high registers r8, r10, r11, r12 (ip). Some variants of add
17  and cmp instructions can use high registers. Also, we can use them as
18  temporaries to spill values into.
19
20* In thumb mode, short, byte, and bool preferred alignments are currently set
21  to 4 to accommodate an ISA restriction (i.e. in "add sp, #imm", imm must be a
22  multiple of 4).
23
24//===---------------------------------------------------------------------===//
25
26Potential jumptable improvements:
27
28* If we know function size is less than (1 << 16) * 2 bytes, we can use 16-bit
29  jumptable entries (e.g. (L1 - L2) >> 1). Or even smaller entries if the
30  function is even smaller. This also applies to ARM.
31
32* Thumb jumptable codegen can improve given some help from the assembler. This
33  is what we generate right now:
34
35	.set PCRELV0, (LJTI1_0_0-(LPCRELL0+4))
36LPCRELL0:
37	mov r1, #PCRELV0
38	add r1, pc
39	ldr r0, [r0, r1]
40	mov pc, r0
41	.align	2
42LJTI1_0_0:
43	.long	 LBB1_3
44        ...
45
46Note there is another pc relative add that we can take advantage of.
47     add r1, pc, #imm_8 * 4
48
49We should be able to generate:
50
51LPCRELL0:
52	add r1, LJTI1_0_0
53	ldr r0, [r0, r1]
54	mov pc, r0
55	.align	2
56LJTI1_0_0:
57	.long	 LBB1_3
58
59if the assembler can translate the add to:
60       add r1, pc, #((LJTI1_0_0-(LPCRELL0+4))&0xfffffffc)
61
62Note the assembler also does something similar for constpool loads:
63LPCRELL0:
64     ldr r0, LCPI1_0
65=>
66     ldr r0, pc, #((LCPI1_0-(LPCRELL0+4))&0xfffffffc)
67
68
69//===---------------------------------------------------------------------===//
70
71We compile the following:
72
73define i16 @func_entry_2E_ce(i32 %i) {
74        switch i32 %i, label %bb12.exitStub [
75                 i32 0, label %bb4.exitStub
76                 i32 1, label %bb9.exitStub
77                 i32 2, label %bb4.exitStub
78                 i32 3, label %bb4.exitStub
79                 i32 7, label %bb9.exitStub
80                 i32 8, label %bb.exitStub
81                 i32 9, label %bb9.exitStub
82        ]
83
84bb12.exitStub:
85        ret i16 0
86
87bb4.exitStub:
88        ret i16 1
89
90bb9.exitStub:
91        ret i16 2
92
93bb.exitStub:
94        ret i16 3
95}
96
97into:
98
99_func_entry_2E_ce:
100        mov r2, #1
101        lsl r2, r0
102        cmp r0, #9
103        bhi LBB1_4      @bb12.exitStub
104LBB1_1: @newFuncRoot
105        mov r1, #13
106        tst r2, r1
107        bne LBB1_5      @bb4.exitStub
108LBB1_2: @newFuncRoot
109        ldr r1, LCPI1_0
110        tst r2, r1
111        bne LBB1_6      @bb9.exitStub
112LBB1_3: @newFuncRoot
113        mov r1, #1
114        lsl r1, r1, #8
115        tst r2, r1
116        bne LBB1_7      @bb.exitStub
117LBB1_4: @bb12.exitStub
118        mov r0, #0
119        bx lr
120LBB1_5: @bb4.exitStub
121        mov r0, #1
122        bx lr
123LBB1_6: @bb9.exitStub
124        mov r0, #2
125        bx lr
126LBB1_7: @bb.exitStub
127        mov r0, #3
128        bx lr
129LBB1_8:
130        .align  2
131LCPI1_0:
132        .long   642
133
134
135gcc compiles to:
136
137	cmp	r0, #9
138	@ lr needed for prologue
139	bhi	L2
140	ldr	r3, L11
141	mov	r2, #1
142	mov	r1, r2, asl r0
143	ands	r0, r3, r2, asl r0
144	movne	r0, #2
145	bxne	lr
146	tst	r1, #13
147	beq	L9
148L3:
149	mov	r0, r2
150	bx	lr
151L9:
152	tst	r1, #256
153	movne	r0, #3
154	bxne	lr
155L2:
156	mov	r0, #0
157	bx	lr
158L12:
159	.align 2
160L11:
161	.long	642
162
163
164GCC is doing a couple of clever things here:
165  1. It is predicating one of the returns.  This isn't a clear win though: in
166     cases where that return isn't taken, it is replacing one condbranch with
167     two 'ne' predicated instructions.
168  2. It is sinking the shift of "1 << i" into the tst, and using ands instead of
169     tst.  This will probably require whole function isel.
170  3. GCC emits:
171  	tst	r1, #256
172     we emit:
173        mov r1, #1
174        lsl r1, r1, #8
175        tst r2, r1
176
177
178//===---------------------------------------------------------------------===//
179
180When spilling in thumb mode and the sp offset is too large to fit in the ldr /
181str offset field, we load the offset from a constpool entry and add it to sp:
182
183ldr r2, LCPI
184add r2, sp
185ldr r2, [r2]
186
187These instructions preserve the condition code, which is important if the spill
188is between a cmp and a bcc instruction. However, we can use the (potentially)
189cheaper sequence below if we know it's ok to clobber the condition register.
190
191add r2, sp, #255 * 4
192add r2, #132
193ldr r2, [r2, #7 * 4]
194
195This is especially bad when dynamic alloca is used. Then all fixed size stack
196objects are referenced off the frame pointer with negative offsets. See
197oggenc for an example.
198
199
200//===---------------------------------------------------------------------===//
201
202Poor codegen for test/CodeGen/ARM/select.ll f7:
203
204	ldr r5, LCPI1_0
205LPC0:
206	add r5, pc
207	ldr r6, LCPI1_1
208	ldr r2, LCPI1_2
209	mov r3, r6
210	mov lr, pc
211	bx r5
212
213//===---------------------------------------------------------------------===//
214
215Make register allocator / spiller smarter so we can re-materialize "mov r, imm",
216etc. Almost all Thumb instructions clobber condition code.
217
218//===---------------------------------------------------------------------===//
219
220Add ldmia, stmia support.
221
222//===---------------------------------------------------------------------===//
223
224Thumb load / store address mode offsets are scaled. The values kept in the
225instruction operands are pre-scale values. This probably ought to be changed
226to avoid extra work when we convert Thumb2 instructions to Thumb1 instructions.
227
228//===---------------------------------------------------------------------===//
229
230We need to make (some of the) Thumb1 instructions predicable. That will allow
231shrinking of predicated Thumb2 instructions. To allow this, we need to be able
232to toggle the 's' bit since the instructions do not set CPSR inside IT blocks.
233
234//===---------------------------------------------------------------------===//
235
236Make use of hi register variants of cmp: tCMPhir / tCMPZhir.
237
238//===---------------------------------------------------------------------===//
239
240Thumb1 immediate fields sometimes keep pre-scaled values. See
241Thumb1RegisterInfo::eliminateFrameIndex. This is inconsistent with ARM and
242Thumb2.
243
244//===---------------------------------------------------------------------===//
245
246Rather than having tBR_JTr print a ".align 2" and the constant island pass pad
247it, add a target specific ALIGN instruction. That way, GetInstSizeInBytes
248won't have to over-estimate. It can also be used by a loop alignment pass.
249
250//===---------------------------------------------------------------------===//
251
252We generate conditional code for icmp when we don't need to. This code:
253
254  int foo(int s) {
255    return s == 1;
256  }
257
258produces:
259
260foo:
261        cmp     r0, #1
262        mov.w   r0, #0
263        it      eq
264        moveq   r0, #1
265        bx      lr
266
267when it could use subs + adcs. This is GCC PR46975.
268