; When optimising for minimum size, we don't want to expand a div to a mul
; and a shift sequence. As a result, a urem instruction, for example, is not
; expanded to a sequence of umull, lsrs, muls and sub instructions, but
; becomes just a call to __aeabi_uidivmod.
;
; When the processor features hardware division, UDIV + UREM can be turned
; into UDIV + MLS. This prevents the library function __aeabi_uidivmod from
; being pulled into the binary. The second run line (check prefix V7M) uses
; ARMv7-M for this; the first run line targets armv7a and uses the default
; check prefix.
;
; RUN: llc -mtriple=armv7a-eabi -mattr=-neon -verify-machineinstrs %s -o - | FileCheck %s
; RUN: llc -mtriple=thumbv7m-eabi -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=V7M

target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "thumbv7m-arm-none-eabi"

; Signed division by a constant at minsize: the armv7a run expects a call to
; the division helper and no smmul-based expansion.
define i32 @foo1() local_unnamed_addr #0 {
entry:
; CHECK-LABEL: foo1:
; CHECK:__aeabi_idiv
; CHECK-NOT: smmul
  %call = tail call i32 bitcast (i32 (...)* @GetValue to i32 ()*)()
  %div = sdiv i32 %call, 1000000
  ret i32 %div
}

; Unsigned division by a constant at minsize: the armv7a run expects a call to
; the division helper and no umull-based expansion.
define i32 @foo2() local_unnamed_addr #0 {
entry:
; CHECK-LABEL: foo2:
; CHECK: __aeabi_uidiv
; CHECK-NOT: umull
  %call = tail call i32 bitcast (i32 (...)* @GetValue to i32 ()*)()
  %div = udiv i32 %call, 1000000
  ret i32 %div
}

; Test for unsigned remainder
define i32 @foo3() local_unnamed_addr #0 {
entry:
; CHECK-LABEL: foo3:
; CHECK: __aeabi_uidivmod
; CHECK-NOT: umull
; V7M-LABEL: foo3:
; V7M: udiv [[R2:r[0-9]+]], [[R0:r[0-9]+]], [[R1:r[0-9]+]]
; V7M: mls {{r[0-9]+}}, [[R2]], [[R1]], [[R0]]
; V7M-NOT: __aeabi_uidivmod
  %call = tail call i32 bitcast (i32 (...)* @GetValue to i32 ()*)()
  %rem = urem i32 %call, 1000000
  %cmp = icmp eq i32 %rem, 0
  %conv = zext i1 %cmp to i32
  ret i32 %conv
}

; Test for signed remainder
define i32 @foo4() local_unnamed_addr #0 {
entry:
; CHECK-LABEL: foo4:
; CHECK:__aeabi_idivmod
; V7M-LABEL: foo4:
; V7M: sdiv [[R2:r[0-9]+]], [[R0:r[0-9]+]], [[R1:r[0-9]+]]
; V7M: mls {{r[0-9]+}}, [[R2]], [[R1]], [[R0]]
; V7M-NOT: __aeabi_idivmod
  %call = tail call i32 bitcast (i32 (...)* @GetValue to i32 ()*)()
  %rem = srem i32 %call, 1000000
  ret i32 %rem
}

; Check that doing a sdiv+srem has the same effect as only the srem,
; as the division needs to be computed anyway in order to calculate
; the remainder (i.e. make sure we don't end up with two divisions).
define i32 @foo5() local_unnamed_addr #0 {
entry:
; CHECK-LABEL: foo5:
; CHECK:__aeabi_idivmod
; V7M-LABEL: foo5:
; V7M: sdiv [[R2:r[0-9]+]], [[R0:r[0-9]+]], [[R1:r[0-9]+]]
; V7M-NOT: sdiv
; V7M: mls {{r[0-9]+}}, [[R2]], [[R1]], [[R0]]
; V7M-NOT: __aeabi_idivmod
  %call = tail call i32 bitcast (i32 (...)* @GetValue to i32 ()*)()
  %div = sdiv i32 %call, 1000000
  %rem = srem i32 %call, 1000000
  %add = add i32 %div, %rem
  ret i32 %add
}

; An early version of this patch caused isel to hang. The reason
; was that it shouldn't do the rewrite for i64 because that's not
; supported by hardware. Isel was stuck in a loop with type
; legalization and this optimisation.
; Function Attrs: norecurse nounwind
define i64 @isel_dont_hang(i32 %bar) local_unnamed_addr #4 {
entry:
; CHECK-LABEL: isel_dont_hang:
; CHECK: __aeabi_uldivmod
  %temp.0 = sext i32 %bar to i64
  %mul83 = shl i64 %temp.0, 1
  %add84 = add i64 %temp.0, 2
  %div85 = udiv i64 %mul83, %add84
  ret i64 %div85
}

; i16 types are promoted to i32, and we expect a normal udiv here:
define i16 @isel_dont_hang_2(i16 %bar) local_unnamed_addr #4 {
entry:
; CHECK-LABEL: isel_dont_hang_2:
; CHECK: udiv
; CHECK-NOT: __aeabi_
  %mul83 = shl i16 %bar, 1
  %add84 = add i16 %bar, 2
  %div85 = udiv i16 %mul83, %add84
  ret i16 %div85
}
declare i32 @GetValue(...) local_unnamed_addr

attributes #0 = { minsize nounwind optsize }
attributes #4 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-jump-tables"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a15" "target-features"="+dsp,+hwdiv,+hwdiv-arm,+neon,+vfp4" "use-soft-float"="false" }
