1 /*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===------------------------------------------------------------------------===
8  */
9 
10 #ifndef __IMMINTRIN_H
11 #error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
12 #endif /* __IMMINTRIN_H */
13 
14 #ifndef __AMXINTRIN_H
15 #define __AMXINTRIN_H
16 #ifdef __x86_64__
17 
/* Attribute set attached to the inline functions defined in this header:
 * force inlining, omit debug info, and enable the "amx-tile" target feature
 * for the function body. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
20 
21 /// Load tile configuration from a 64-byte memory location specified by
22 /// "mem_addr". The tile configuration includes the tile type palette, the
23 /// number of bytes per row, and the number of rows. If the specified
24 /// palette_id is zero, that signifies the init state for both the tile
25 /// config and the tile data, and the tiles are zeroed. Any invalid
26 /// configurations will result in #GP fault.
27 ///
28 /// \headerfile <x86intrin.h>
29 ///
30 /// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
31 ///
32 /// \param __config
33 ///    A pointer to 512-bits configuration
34 static __inline__ void __DEFAULT_FN_ATTRS
_tile_loadconfig(const void * __config)35 _tile_loadconfig(const void *__config)
36 {
37   __builtin_ia32_tile_loadconfig(__config);
38 }
39 
40 /// Stores the current tile configuration to a 64-byte memory location
41 /// specified by "mem_addr". The tile configuration includes the tile type
42 /// palette, the number of bytes per row, and the number of rows. If tiles
43 /// are not configured, all zeroes will be stored to memory.
44 ///
45 /// \headerfile <x86intrin.h>
46 ///
47 /// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
48 ///
49 /// \param __config
50 ///    A pointer to 512-bits configuration
51 static __inline__ void __DEFAULT_FN_ATTRS
_tile_storeconfig(void * __config)52 _tile_storeconfig(void *__config)
53 {
54   __builtin_ia32_tile_storeconfig(__config);
55 }
56 
57 /// Release the tile configuration to return to the init state, which
58 /// releases all storage it currently holds.
59 ///
60 /// \headerfile <x86intrin.h>
61 ///
62 /// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
63 static __inline__ void __DEFAULT_FN_ATTRS
_tile_release(void)64 _tile_release(void)
65 {
66   __builtin_ia32_tilerelease();
67 }
68 
/// Load tile rows from the memory specified by the "base" address and
/// "stride" into destination tile "dst", using the tile configuration
/// previously set up via "_tile_loadconfig".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param base
///    A pointer to the base address; it is passed on as a const void *.
/// \param stride
///    The stride between the rows' data to be loaded in memory; it is
///    converted to __SIZE_TYPE__.
#define _tile_loadd(dst, base, stride) \
  __builtin_ia32_tileloadd64((dst), \
                             ((const void *)(base)), \
                             (__SIZE_TYPE__)(stride))
85 
/// Load tile rows from the memory specified by the "base" address and
/// "stride" into destination tile "dst", using the tile configuration
/// previously set up via "_tile_loadconfig". This intrinsic hints to the
/// implementation that the data will likely not be reused in the near future,
/// so data caching can be optimized accordingly.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param base
///    A pointer to the base address; it is passed on as a const void *.
/// \param stride
///    The stride between the rows' data to be loaded in memory; it is
///    converted to __SIZE_TYPE__.
#define _tile_stream_loadd(dst, base, stride) \
  __builtin_ia32_tileloaddt164((dst), \
                               ((const void *)(base)), \
                               (__SIZE_TYPE__)(stride))
104 
/// Store the tile specified by "src" to the memory specified by the "base"
/// address and "stride", using the tile configuration previously set up via
/// "_tile_loadconfig".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
///
/// \param src
///    The source tile to be stored. Max size is 1024 Bytes.
/// \param base
///    A pointer to the base address; it is passed on as a void *.
/// \param stride
///    The stride between the rows' data to be stored in memory; it is
///    converted to __SIZE_TYPE__.
#define _tile_stored(src, base, stride) \
  __builtin_ia32_tilestored64((src), \
                              ((void *)(base)), \
                              (__SIZE_TYPE__)(stride))
121 
/// Zero the tile specified by "tile".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
///
/// \param tile
///    The destination tile to be zeroed. Max size is 1024 Bytes.
#define _tile_zero(tile) \
  __builtin_ia32_tilezero((tile))
131 
/// Compute the dot-product of bytes in tiles with a source/destination
/// accumulator. Groups of 4 adjacent signed 8-bit integers in "src0" are
/// multiplied with the corresponding signed 8-bit integers in "src1",
/// producing 4 intermediate 32-bit results. These 4 results are summed with
/// the corresponding 32-bit integer in "dst", and the 32-bit result is stored
/// back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
///
/// \param dst
///    The destination (accumulator) tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbssd(dst, src0, src1) \
  __builtin_ia32_tdpbssd((dst), (src0), (src1))
149 
/// Compute the dot-product of bytes in tiles with a source/destination
/// accumulator. Groups of 4 adjacent signed 8-bit integers in "src0" are
/// multiplied with the corresponding unsigned 8-bit integers in "src1",
/// producing 4 intermediate 32-bit results. These 4 results are summed with
/// the corresponding 32-bit integer in "dst", and the 32-bit result is stored
/// back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
///
/// \param dst
///    The destination (accumulator) tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbsud(dst, src0, src1) \
  __builtin_ia32_tdpbsud((dst), (src0), (src1))
167 
/// Compute the dot-product of bytes in tiles with a source/destination
/// accumulator. Groups of 4 adjacent unsigned 8-bit integers in "src0" are
/// multiplied with the corresponding signed 8-bit integers in "src1",
/// producing 4 intermediate 32-bit results. These 4 results are summed with
/// the corresponding 32-bit integer in "dst", and the 32-bit result is stored
/// back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
///
/// \param dst
///    The destination (accumulator) tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbusd(dst, src0, src1) \
  __builtin_ia32_tdpbusd((dst), (src0), (src1))
185 
/// Compute the dot-product of bytes in tiles with a source/destination
/// accumulator. Groups of 4 adjacent unsigned 8-bit integers in "src0" are
/// multiplied with the corresponding unsigned 8-bit integers in "src1",
/// producing 4 intermediate 32-bit results. These 4 results are summed with
/// the corresponding 32-bit integer in "dst", and the 32-bit result is stored
/// back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
///
/// \param dst
///    The destination (accumulator) tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbuud(dst, src0, src1) \
  __builtin_ia32_tdpbuud((dst), (src0), (src1))
203 
/// Compute the dot-product of BF16 (16-bit) floating-point pairs in tiles
/// "src0" and "src1", accumulating the intermediate single-precision (32-bit)
/// floating-point elements with the elements in "dst", and store the 32-bit
/// result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
///
/// \param dst
///    The destination (accumulator) tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbf16ps(dst, src0, src1) \
  __builtin_ia32_tdpbf16ps( \
      (dst), (src0), (src1))
221 
222 #undef __DEFAULT_FN_ATTRS
223 
224 #endif /* __x86_64__ */
225 #endif /* __AMXINTRIN_H */
226