#include <clc/clc.h>

/*
 * VLOAD_VECTORIZE(ELEM, SPACE)
 *
 * Emits vload2/3/4/8/16 overloads for element type ELEM read through a
 * pointer in address space SPACE.
 *
 * For every width N a typedef `less_aligned_<SPACE><ELEM>N` is declared whose
 * alignment is lowered to sizeof(ELEM).  The 2/4/8/16-wide loads cast the
 * address of x[N * offset] to a pointer to that typedef and dereference it.
 *
 * vload3 fetches its first two lanes through the 2-wide typedef and its third
 * lane with a plain scalar read of x[3 * offset + 2].  The 3-wide typedef is
 * declared but is not referenced by any function generated here.
 */
#define VLOAD_VECTORIZE(ELEM, SPACE) \
  typedef ELEM##2 less_aligned_##SPACE##ELEM##2 \
      __attribute__((aligned(sizeof(ELEM)))); \
  _CLC_OVERLOAD _CLC_DEF ELEM##2 vload2(size_t offset, const SPACE ELEM *x) { \
    const SPACE less_aligned_##SPACE##ELEM##2 *src = \
        (const SPACE less_aligned_##SPACE##ELEM##2 *)(x + 2 * offset); \
    return *src; \
  } \
\
  typedef ELEM##3 less_aligned_##SPACE##ELEM##3 \
      __attribute__((aligned(sizeof(ELEM)))); \
  _CLC_OVERLOAD _CLC_DEF ELEM##3 vload3(size_t offset, const SPACE ELEM *x) { \
    const size_t base = 3 * offset; \
    ELEM##2 head = \
        *(const SPACE less_aligned_##SPACE##ELEM##2 *)(x + base); \
    return (ELEM##3)(head.x, head.y, x[base + 2]); \
  } \
\
  typedef ELEM##4 less_aligned_##SPACE##ELEM##4 \
      __attribute__((aligned(sizeof(ELEM)))); \
  _CLC_OVERLOAD _CLC_DEF ELEM##4 vload4(size_t offset, const SPACE ELEM *x) { \
    const SPACE less_aligned_##SPACE##ELEM##4 *src = \
        (const SPACE less_aligned_##SPACE##ELEM##4 *)(x + 4 * offset); \
    return *src; \
  } \
\
  typedef ELEM##8 less_aligned_##SPACE##ELEM##8 \
      __attribute__((aligned(sizeof(ELEM)))); \
  _CLC_OVERLOAD _CLC_DEF ELEM##8 vload8(size_t offset, const SPACE ELEM *x) { \
    const SPACE less_aligned_##SPACE##ELEM##8 *src = \
        (const SPACE less_aligned_##SPACE##ELEM##8 *)(x + 8 * offset); \
    return *src; \
  } \
\
  typedef ELEM##16 less_aligned_##SPACE##ELEM##16 \
      __attribute__((aligned(sizeof(ELEM)))); \
  _CLC_OVERLOAD _CLC_DEF ELEM##16 vload16(size_t offset, const SPACE ELEM *x) { \
    const SPACE less_aligned_##SPACE##ELEM##16 *src = \
        (const SPACE less_aligned_##SPACE##ELEM##16 *)(x + 16 * offset); \
    return *src; \
  }

/* Instantiate the vloadN family for one scalar type in each address space. */
#define VLOAD_ADDR_SPACES(SCALAR_T) \
  VLOAD_VECTORIZE(SCALAR_T, __private) \
  VLOAD_VECTORIZE(SCALAR_T, __local) \
  VLOAD_VECTORIZE(SCALAR_T, __constant) \
  VLOAD_VECTORIZE(SCALAR_T, __global)

/* Scalar types that are generated unconditionally. */
#define VLOAD_TYPES() \
  VLOAD_ADDR_SPACES(char) \
  VLOAD_ADDR_SPACES(uchar) \
  VLOAD_ADDR_SPACES(short) \
  VLOAD_ADDR_SPACES(ushort) \
  VLOAD_ADDR_SPACES(int) \
  VLOAD_ADDR_SPACES(uint) \
  VLOAD_ADDR_SPACES(long) \
  VLOAD_ADDR_SPACES(ulong) \
  VLOAD_ADDR_SPACES(float)

VLOAD_TYPES()

/* double and half overloads exist only when the matching extension does. */
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
  VLOAD_ADDR_SPACES(double)
#endif
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
  VLOAD_ADDR_SPACES(half)
#endif

/*
 * vload_half / vloada_half
 *
 * These are legal even without cl_khr_fp16, and there is no vload_half
 * variant producing double.
 *
 * VEC_LOAD1 converts a single half to float and stores it into `dst`.  It
 * expects identifiers `mem` and `offset` to be in scope at the expansion site
 * and post-increments `offset` on every use.  With clang older than 6 the
 * conversion goes through per-address-space helper functions that are only
 * declared in this file; otherwise __builtin_load_halff is used.
 */
#if __clang_major__ < 6
float __clc_vload_half_float_helper__constant(const __constant half *);
float __clc_vload_half_float_helper__global(const __global half *);
float __clc_vload_half_float_helper__local(const __local half *);
float __clc_vload_half_float_helper__private(const __private half *);

#define VEC_LOAD1(dst, SPACE) \
  dst = __clc_vload_half_float_helper##SPACE(mem + offset++);
#else
#define VEC_LOAD1(dst, SPACE) dst = __builtin_load_halff(mem + offset++);
#endif

/* Wider loads are composed from narrower ones, lowest lane first. */
#define VEC_LOAD2(dst, SPACE) \
  VEC_LOAD1(dst.s0, SPACE) \
  VEC_LOAD1(dst.s1, SPACE)
#define VEC_LOAD3(dst, SPACE) \
  VEC_LOAD1(dst.x, SPACE) \
  VEC_LOAD1(dst.y, SPACE) \
  VEC_LOAD1(dst.z, SPACE)
#define VEC_LOAD4(dst, SPACE) \
  VEC_LOAD2(dst.lo, SPACE) \
  VEC_LOAD2(dst.hi, SPACE)
#define VEC_LOAD8(dst, SPACE) \
  VEC_LOAD4(dst.lo, SPACE) \
  VEC_LOAD4(dst.hi, SPACE)
#define VEC_LOAD16(dst, SPACE) \
  VEC_LOAD8(dst.lo, SPACE) \
  VEC_LOAD8(dst.hi, SPACE)

/*
 * __FUNC emits one vload_half<SUFFIX> / vloada_half<SUFFIX> pair.
 *
 * Both read VEC_SIZE consecutive halves from `mem` and return them as TYPE.
 * They differ only in how the element offset is scaled: vload_half multiplies
 * it by VEC_SIZE, vloada_half by OFFSET_SIZE.  The actual argument values are
 * supplied by vload_half.inc, which is not part of this file.
 */
#define __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS) \
  _CLC_OVERLOAD _CLC_DEF TYPE vload_half##SUFFIX(size_t offset, \
                                                 const AS half *mem) { \
    TYPE __loaded; \
    offset *= VEC_SIZE; \
    VEC_LOAD##VEC_SIZE(__loaded, AS) \
    return __loaded; \
  } \
  _CLC_OVERLOAD _CLC_DEF TYPE vloada_half##SUFFIX(size_t offset, \
                                                  const AS half *mem) { \
    TYPE __loaded; \
    offset *= OFFSET_SIZE; \
    VEC_LOAD##VEC_SIZE(__loaded, AS) \
    return __loaded; \
  }

/*
 * FUNC forwards to __FUNC through one extra macro level, so its arguments are
 * macro-expanded before __FUNC applies ## to them.
 */
#define FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS) \
  __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS)

#define __CLC_BODY "vload_half.inc"
#include <clc/math/gentype.inc>

/* Drop every helper macro introduced above. */
#undef __CLC_BODY
#undef FUNC
#undef __FUNC
#undef VEC_LOAD16
#undef VEC_LOAD8
#undef VEC_LOAD4
#undef VEC_LOAD3
#undef VEC_LOAD2
#undef VEC_LOAD1
#undef VLOAD_TYPES
#undef VLOAD_ADDR_SPACES
#undef VLOAD_VECTORIZE