1; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s 2 3target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" 4target triple = "nvptx64-nvidia-cuda" 5 6; CHECK-LABEL: t1 7define <4 x float> @t1(i8* %p1) { 8; CHECK-NOT: ld.v4 9; CHECK-NOT: ld.v2 10; CHECK-NOT: ld.f32 11; CHECK: ld.u8 12 %cast = bitcast i8* %p1 to <4 x float>* 13 %r = load <4 x float>, <4 x float>* %cast, align 1 14 ret <4 x float> %r 15} 16 17; CHECK-LABEL: t2 18define <4 x float> @t2(i8* %p1) { 19; CHECK-NOT: ld.v4 20; CHECK-NOT: ld.v2 21; CHECK: ld.f32 22 %cast = bitcast i8* %p1 to <4 x float>* 23 %r = load <4 x float>, <4 x float>* %cast, align 4 24 ret <4 x float> %r 25} 26 27; CHECK-LABEL: t3 28define <4 x float> @t3(i8* %p1) { 29; CHECK-NOT: ld.v4 30; CHECK: ld.v2 31 %cast = bitcast i8* %p1 to <4 x float>* 32 %r = load <4 x float>, <4 x float>* %cast, align 8 33 ret <4 x float> %r 34} 35 36; CHECK-LABEL: t4 37define <4 x float> @t4(i8* %p1) { 38; CHECK: ld.v4 39 %cast = bitcast i8* %p1 to <4 x float>* 40 %r = load <4 x float>, <4 x float>* %cast, align 16 41 ret <4 x float> %r 42} 43 44 45; CHECK-LABEL: s1 46define void @s1(<4 x float>* %p1, <4 x float> %v) { 47; CHECK-NOT: st.v4 48; CHECK-NOT: st.v2 49; CHECK-NOT: st.f32 50; CHECK: st.u8 51 store <4 x float> %v, <4 x float>* %p1, align 1 52 ret void 53} 54 55; CHECK-LABEL: s2 56define void @s2(<4 x float>* %p1, <4 x float> %v) { 57; CHECK-NOT: st.v4 58; CHECK-NOT: st.v2 59; CHECK: st.f32 60 store <4 x float> %v, <4 x float>* %p1, align 4 61 ret void 62} 63 64; CHECK-LABEL: s3 65define void @s3(<4 x float>* %p1, <4 x float> %v) { 66; CHECK-NOT: st.v4 67 store <4 x float> %v, <4 x float>* %p1, align 8 68 ret void 69} 70 71; CHECK-LABEL: s4 72define void @s4(<4 x float>* %p1, <4 x float> %v) { 73; CHECK: st.v4 74 store <4 x float> %v, <4 x float>* %p1, align 16 75 ret void 76} 77 78