1 // -*- C++ -*-
2 //===----------------------------------------------------------------------===//
3 //
4 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5 // See https://llvm.org/LICENSE.txt for license information.
6 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //
8 //===----------------------------------------------------------------------===//
9
10 #ifndef _PSTL_UNSEQ_BACKEND_SIMD_H
11 #define _PSTL_UNSEQ_BACKEND_SIMD_H
12
13 #include <type_traits>
14
15 #include "pstl_config.h"
16 #include "utils.h"
17
18 // This header defines the minimum set of vector routines required
19 // to support parallel STL.
20
21 _PSTL_HIDE_FROM_ABI_PUSH
22
23 namespace __pstl
24 {
25 namespace __unseq_backend
26 {
27
28 // Expect vector width up to 64 (or 512 bit)
29 const std::size_t __lane_size = 64;
30
31 template <class _Iterator, class _DifferenceType, class _Function>
32 _Iterator
__simd_walk_1(_Iterator __first,_DifferenceType __n,_Function __f)33 __simd_walk_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept
34 {
35 _PSTL_PRAGMA_SIMD
36 for (_DifferenceType __i = 0; __i < __n; ++__i)
37 __f(__first[__i]);
38
39 return __first + __n;
40 }
41
42 template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
43 _Iterator2
__simd_walk_2(_Iterator1 __first1,_DifferenceType __n,_Iterator2 __first2,_Function __f)44 __simd_walk_2(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f) noexcept
45 {
46 _PSTL_PRAGMA_SIMD
47 for (_DifferenceType __i = 0; __i < __n; ++__i)
48 __f(__first1[__i], __first2[__i]);
49 return __first2 + __n;
50 }
51
52 template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function>
53 _Iterator3
__simd_walk_3(_Iterator1 __first1,_DifferenceType __n,_Iterator2 __first2,_Iterator3 __first3,_Function __f)54 __simd_walk_3(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Iterator3 __first3,
55 _Function __f) noexcept
56 {
57 _PSTL_PRAGMA_SIMD
58 for (_DifferenceType __i = 0; __i < __n; ++__i)
59 __f(__first1[__i], __first2[__i], __first3[__i]);
60 return __first3 + __n;
61 }
62
63 // TODO: check whether __simd_first() can be used here
64 template <class _Index, class _DifferenceType, class _Pred>
65 bool
__simd_or(_Index __first,_DifferenceType __n,_Pred __pred)66 __simd_or(_Index __first, _DifferenceType __n, _Pred __pred) noexcept
67 {
68 #if _PSTL_EARLYEXIT_PRESENT
69 _DifferenceType __i;
70 _PSTL_PRAGMA_VECTOR_UNALIGNED
71 _PSTL_PRAGMA_SIMD_EARLYEXIT
72 for (__i = 0; __i < __n; ++__i)
73 if (__pred(__first[__i]))
74 break;
75 return __i < __n;
76 #else
77 _DifferenceType __block_size = 4 < __n ? 4 : __n;
78 const _Index __last = __first + __n;
79 while (__last != __first)
80 {
81 int32_t __flag = 1;
82 _PSTL_PRAGMA_SIMD_REDUCTION(& : __flag)
83 for (_DifferenceType __i = 0; __i < __block_size; ++__i)
84 if (__pred(*(__first + __i)))
85 __flag = 0;
86 if (!__flag)
87 return true;
88
89 __first += __block_size;
90 if (__last - __first >= __block_size << 1)
91 {
92 // Double the block _Size. Any unnecessary iterations can be amortized against work done so far.
93 __block_size <<= 1;
94 }
95 else
96 {
97 __block_size = __last - __first;
98 }
99 }
100 return false;
101 #endif
102 }
103
104 template <class _Index, class _DifferenceType, class _Compare>
105 _Index
__simd_first(_Index __first,_DifferenceType __begin,_DifferenceType __end,_Compare __comp)106 __simd_first(_Index __first, _DifferenceType __begin, _DifferenceType __end, _Compare __comp) noexcept
107 {
108 #if _PSTL_EARLYEXIT_PRESENT
109 _DifferenceType __i = __begin;
110 _PSTL_PRAGMA_VECTOR_UNALIGNED // Do not generate peel loop part
111 _PSTL_PRAGMA_SIMD_EARLYEXIT for (; __i < __end; ++__i)
112 {
113 if (__comp(__first, __i))
114 {
115 break;
116 }
117 }
118 return __first + __i;
119 #else
120 // Experiments show good block sizes like this
121 const _DifferenceType __block_size = 8;
122 alignas(__lane_size) _DifferenceType __lane[__block_size] = {0};
123 while (__end - __begin >= __block_size)
124 {
125 _DifferenceType __found = 0;
126 _PSTL_PRAGMA_VECTOR_UNALIGNED // Do not generate peel loop part
127 _PSTL_PRAGMA_SIMD_REDUCTION(|
128 : __found) for (_DifferenceType __i = __begin; __i < __begin + __block_size;
129 ++__i)
130 {
131 const _DifferenceType __t = __comp(__first, __i);
132 __lane[__i - __begin] = __t;
133 __found |= __t;
134 }
135 if (__found)
136 {
137 _DifferenceType __i;
138 // This will vectorize
139 for (__i = 0; __i < __block_size; ++__i)
140 {
141 if (__lane[__i])
142 {
143 break;
144 }
145 }
146 return __first + __begin + __i;
147 }
148 __begin += __block_size;
149 }
150
151 //Keep remainder scalar
152 while (__begin != __end)
153 {
154 if (__comp(__first, __begin))
155 {
156 return __first + __begin;
157 }
158 ++__begin;
159 }
160 return __first + __end;
161 #endif //_PSTL_EARLYEXIT_PRESENT
162 }
163
164 template <class _Index1, class _DifferenceType, class _Index2, class _Pred>
165 std::pair<_Index1, _Index2>
__simd_first(_Index1 __first1,_DifferenceType __n,_Index2 __first2,_Pred __pred)166 __simd_first(_Index1 __first1, _DifferenceType __n, _Index2 __first2, _Pred __pred) noexcept
167 {
168 #if _PSTL_EARLYEXIT_PRESENT
169 _DifferenceType __i = 0;
170 _PSTL_PRAGMA_VECTOR_UNALIGNED
171 _PSTL_PRAGMA_SIMD_EARLYEXIT
172 for (; __i < __n; ++__i)
173 if (__pred(__first1[__i], __first2[__i]))
174 break;
175 return std::make_pair(__first1 + __i, __first2 + __i);
176 #else
177 const _Index1 __last1 = __first1 + __n;
178 const _Index2 __last2 = __first2 + __n;
179 // Experiments show good block sizes like this
180 const _DifferenceType __block_size = 8;
181 alignas(__lane_size) _DifferenceType __lane[__block_size] = {0};
182 while (__last1 - __first1 >= __block_size)
183 {
184 _DifferenceType __found = 0;
185 _DifferenceType __i;
186 _PSTL_PRAGMA_VECTOR_UNALIGNED // Do not generate peel loop part
187 _PSTL_PRAGMA_SIMD_REDUCTION(|
188 : __found) for (__i = 0; __i < __block_size; ++__i)
189 {
190 const _DifferenceType __t = __pred(__first1[__i], __first2[__i]);
191 __lane[__i] = __t;
192 __found |= __t;
193 }
194 if (__found)
195 {
196 _DifferenceType __i2;
197 // This will vectorize
198 for (__i2 = 0; __i2 < __block_size; ++__i2)
199 {
200 if (__lane[__i2])
201 break;
202 }
203 return std::make_pair(__first1 + __i2, __first2 + __i2);
204 }
205 __first1 += __block_size;
206 __first2 += __block_size;
207 }
208
209 //Keep remainder scalar
210 for (; __last1 != __first1; ++__first1, ++__first2)
211 if (__pred(*(__first1), *(__first2)))
212 return std::make_pair(__first1, __first2);
213
214 return std::make_pair(__last1, __last2);
215 #endif //_PSTL_EARLYEXIT_PRESENT
216 }
217
218 template <class _Index, class _DifferenceType, class _Pred>
219 _DifferenceType
__simd_count(_Index __index,_DifferenceType __n,_Pred __pred)220 __simd_count(_Index __index, _DifferenceType __n, _Pred __pred) noexcept
221 {
222 _DifferenceType __count = 0;
223 _PSTL_PRAGMA_SIMD_REDUCTION(+ : __count)
224 for (_DifferenceType __i = 0; __i < __n; ++__i)
225 if (__pred(*(__index + __i)))
226 ++__count;
227
228 return __count;
229 }
230
231 template <class _InputIterator, class _DifferenceType, class _OutputIterator, class _BinaryPredicate>
232 _OutputIterator
__simd_unique_copy(_InputIterator __first,_DifferenceType __n,_OutputIterator __result,_BinaryPredicate __pred)233 __simd_unique_copy(_InputIterator __first, _DifferenceType __n, _OutputIterator __result,
234 _BinaryPredicate __pred) noexcept
235 {
236 if (__n == 0)
237 return __result;
238
239 _DifferenceType __cnt = 1;
240 __result[0] = __first[0];
241
242 _PSTL_PRAGMA_SIMD
243 for (_DifferenceType __i = 1; __i < __n; ++__i)
244 {
245 _PSTL_PRAGMA_SIMD_ORDERED_MONOTONIC(__cnt : 1)
246 if (!__pred(__first[__i], __first[__i - 1]))
247 {
248 __result[__cnt] = __first[__i];
249 ++__cnt;
250 }
251 }
252 return __result + __cnt;
253 }
254
255 template <class _InputIterator, class _DifferenceType, class _OutputIterator, class _Assigner>
256 _OutputIterator
__simd_assign(_InputIterator __first,_DifferenceType __n,_OutputIterator __result,_Assigner __assigner)257 __simd_assign(_InputIterator __first, _DifferenceType __n, _OutputIterator __result, _Assigner __assigner) noexcept
258 {
259 _PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED
260 _PSTL_PRAGMA_SIMD
261 for (_DifferenceType __i = 0; __i < __n; ++__i)
262 __assigner(__first + __i, __result + __i);
263 return __result + __n;
264 }
265
266 template <class _InputIterator, class _DifferenceType, class _OutputIterator, class _UnaryPredicate>
267 _OutputIterator
__simd_copy_if(_InputIterator __first,_DifferenceType __n,_OutputIterator __result,_UnaryPredicate __pred)268 __simd_copy_if(_InputIterator __first, _DifferenceType __n, _OutputIterator __result, _UnaryPredicate __pred) noexcept
269 {
270 _DifferenceType __cnt = 0;
271
272 _PSTL_PRAGMA_SIMD
273 for (_DifferenceType __i = 0; __i < __n; ++__i)
274 {
275 _PSTL_PRAGMA_SIMD_ORDERED_MONOTONIC(__cnt : 1)
276 if (__pred(__first[__i]))
277 {
278 __result[__cnt] = __first[__i];
279 ++__cnt;
280 }
281 }
282 return __result + __cnt;
283 }
284
285 template <class _InputIterator, class _DifferenceType, class _BinaryPredicate>
286 _DifferenceType
__simd_calc_mask_2(_InputIterator __first,_DifferenceType __n,bool * __mask,_BinaryPredicate __pred)287 __simd_calc_mask_2(_InputIterator __first, _DifferenceType __n, bool* __mask, _BinaryPredicate __pred) noexcept
288 {
289 _DifferenceType __count = 0;
290
291 _PSTL_PRAGMA_SIMD_REDUCTION(+ : __count)
292 for (_DifferenceType __i = 0; __i < __n; ++__i)
293 {
294 __mask[__i] = !__pred(__first[__i], __first[__i - 1]);
295 __count += __mask[__i];
296 }
297 return __count;
298 }
299
300 template <class _InputIterator, class _DifferenceType, class _UnaryPredicate>
301 _DifferenceType
__simd_calc_mask_1(_InputIterator __first,_DifferenceType __n,bool * __mask,_UnaryPredicate __pred)302 __simd_calc_mask_1(_InputIterator __first, _DifferenceType __n, bool* __mask, _UnaryPredicate __pred) noexcept
303 {
304 _DifferenceType __count = 0;
305
306 _PSTL_PRAGMA_SIMD_REDUCTION(+ : __count)
307 for (_DifferenceType __i = 0; __i < __n; ++__i)
308 {
309 __mask[__i] = __pred(__first[__i]);
310 __count += __mask[__i];
311 }
312 return __count;
313 }
314
315 template <class _InputIterator, class _DifferenceType, class _OutputIterator, class _Assigner>
316 void
__simd_copy_by_mask(_InputIterator __first,_DifferenceType __n,_OutputIterator __result,bool * __mask,_Assigner __assigner)317 __simd_copy_by_mask(_InputIterator __first, _DifferenceType __n, _OutputIterator __result, bool* __mask,
318 _Assigner __assigner) noexcept
319 {
320 _DifferenceType __cnt = 0;
321 _PSTL_PRAGMA_SIMD
322 for (_DifferenceType __i = 0; __i < __n; ++__i)
323 {
324 if (__mask[__i])
325 {
326 _PSTL_PRAGMA_SIMD_ORDERED_MONOTONIC(__cnt : 1)
327 {
328 __assigner(__first + __i, __result + __cnt);
329 ++__cnt;
330 }
331 }
332 }
333 }
334
335 template <class _InputIterator, class _DifferenceType, class _OutputIterator1, class _OutputIterator2>
336 void
__simd_partition_by_mask(_InputIterator __first,_DifferenceType __n,_OutputIterator1 __out_true,_OutputIterator2 __out_false,bool * __mask)337 __simd_partition_by_mask(_InputIterator __first, _DifferenceType __n, _OutputIterator1 __out_true,
338 _OutputIterator2 __out_false, bool* __mask) noexcept
339 {
340 _DifferenceType __cnt_true = 0, __cnt_false = 0;
341 _PSTL_PRAGMA_SIMD
342 for (_DifferenceType __i = 0; __i < __n; ++__i)
343 {
344 _PSTL_PRAGMA_SIMD_ORDERED_MONOTONIC_2ARGS(__cnt_true : 1, __cnt_false : 1)
345 if (__mask[__i])
346 {
347 __out_true[__cnt_true] = __first[__i];
348 ++__cnt_true;
349 }
350 else
351 {
352 __out_false[__cnt_false] = __first[__i];
353 ++__cnt_false;
354 }
355 }
356 }
357
358 template <class _Index, class _DifferenceType, class _Tp>
359 _Index
__simd_fill_n(_Index __first,_DifferenceType __n,const _Tp & __value)360 __simd_fill_n(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept
361 {
362 _PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED
363 _PSTL_PRAGMA_SIMD
364 for (_DifferenceType __i = 0; __i < __n; ++__i)
365 __first[__i] = __value;
366 return __first + __n;
367 }
368
369 template <class _Index, class _DifferenceType, class _Generator>
370 _Index
__simd_generate_n(_Index __first,_DifferenceType __size,_Generator __g)371 __simd_generate_n(_Index __first, _DifferenceType __size, _Generator __g) noexcept
372 {
373 _PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED
374 _PSTL_PRAGMA_SIMD
375 for (_DifferenceType __i = 0; __i < __size; ++__i)
376 __first[__i] = __g();
377 return __first + __size;
378 }
379
380 template <class _Index, class _BinaryPredicate>
381 _Index
__simd_adjacent_find(_Index __first,_Index __last,_BinaryPredicate __pred,bool __or_semantic)382 __simd_adjacent_find(_Index __first, _Index __last, _BinaryPredicate __pred, bool __or_semantic) noexcept
383 {
384 if (__last - __first < 2)
385 return __last;
386
387 typedef typename std::iterator_traits<_Index>::difference_type _DifferenceType;
388 _DifferenceType __i = 0;
389
390 #if _PSTL_EARLYEXIT_PRESENT
391 //Some compiler versions fail to compile the following loop when iterators are used. Indices are used instead
392 const _DifferenceType __n = __last - __first - 1;
393 _PSTL_PRAGMA_VECTOR_UNALIGNED
394 _PSTL_PRAGMA_SIMD_EARLYEXIT
395 for (; __i < __n; ++__i)
396 if (__pred(__first[__i], __first[__i + 1]))
397 break;
398
399 return __i < __n ? __first + __i : __last;
400 #else
401 // Experiments show good block sizes like this
402 //TODO: to consider tuning block_size for various data types
403 const _DifferenceType __block_size = 8;
404 alignas(__lane_size) _DifferenceType __lane[__block_size] = {0};
405 while (__last - __first >= __block_size)
406 {
407 _DifferenceType __found = 0;
408 _PSTL_PRAGMA_VECTOR_UNALIGNED // Do not generate peel loop part
409 _PSTL_PRAGMA_SIMD_REDUCTION(|
410 : __found) for (__i = 0; __i < __block_size - 1; ++__i)
411 {
412 //TODO: to improve SIMD vectorization
413 const _DifferenceType __t = __pred(*(__first + __i), *(__first + __i + 1));
414 __lane[__i] = __t;
415 __found |= __t;
416 }
417
418 //Process a pair of elements on a boundary of a data block
419 if (__first + __block_size < __last && __pred(*(__first + __i), *(__first + __i + 1)))
420 __lane[__i] = __found = 1;
421
422 if (__found)
423 {
424 if (__or_semantic)
425 return __first;
426
427 // This will vectorize
428 for (__i = 0; __i < __block_size; ++__i)
429 if (__lane[__i])
430 break;
431 return __first + __i; //As far as found is true a __result (__lane[__i] is true) is guaranteed
432 }
433 __first += __block_size;
434 }
435 //Process the rest elements
436 for (; __last - __first > 1; ++__first)
437 if (__pred(*__first, *(__first + 1)))
438 return __first;
439
440 return __last;
441 #endif
442 }
443
444 // It was created to reduce the code inside std::enable_if
445 template <typename _Tp, typename _BinaryOperation>
446 using is_arithmetic_plus = std::integral_constant<bool, std::is_arithmetic<_Tp>::value &&
447 std::is_same<_BinaryOperation, std::plus<_Tp>>::value>;
448
449 template <typename _DifferenceType, typename _Tp, typename _BinaryOperation, typename _UnaryOperation>
450 typename std::enable_if<is_arithmetic_plus<_Tp, _BinaryOperation>::value, _Tp>::type
__simd_transform_reduce(_DifferenceType __n,_Tp __init,_BinaryOperation,_UnaryOperation __f)451 __simd_transform_reduce(_DifferenceType __n, _Tp __init, _BinaryOperation, _UnaryOperation __f) noexcept
452 {
453 _PSTL_PRAGMA_SIMD_REDUCTION(+ : __init)
454 for (_DifferenceType __i = 0; __i < __n; ++__i)
455 __init += __f(__i);
456 return __init;
457 }
458
459 template <typename _Size, typename _Tp, typename _BinaryOperation, typename _UnaryOperation>
460 typename std::enable_if<!is_arithmetic_plus<_Tp, _BinaryOperation>::value, _Tp>::type
__simd_transform_reduce(_Size __n,_Tp __init,_BinaryOperation __binary_op,_UnaryOperation __f)461 __simd_transform_reduce(_Size __n, _Tp __init, _BinaryOperation __binary_op, _UnaryOperation __f) noexcept
462 {
463 const _Size __block_size = __lane_size / sizeof(_Tp);
464 if (__n > 2 * __block_size && __block_size > 1)
465 {
466 alignas(__lane_size) char __lane_[__lane_size];
467 _Tp* __lane = reinterpret_cast<_Tp*>(__lane_);
468
469 // initializer
470 _PSTL_PRAGMA_SIMD
471 for (_Size __i = 0; __i < __block_size; ++__i)
472 {
473 ::new (__lane + __i) _Tp(__binary_op(__f(__i), __f(__block_size + __i)));
474 }
475 // main loop
476 _Size __i = 2 * __block_size;
477 const _Size last_iteration = __block_size * (__n / __block_size);
478 for (; __i < last_iteration; __i += __block_size)
479 {
480 _PSTL_PRAGMA_SIMD
481 for (_Size __j = 0; __j < __block_size; ++__j)
482 {
483 __lane[__j] = __binary_op(__lane[__j], __f(__i + __j));
484 }
485 }
486 // remainder
487 _PSTL_PRAGMA_SIMD
488 for (_Size __j = 0; __j < __n - last_iteration; ++__j)
489 {
490 __lane[__j] = __binary_op(__lane[__j], __f(last_iteration + __j));
491 }
492 // combiner
493 for (_Size __j = 0; __j < __block_size; ++__j)
494 {
495 __init = __binary_op(__init, __lane[__j]);
496 }
497 // destroyer
498 _PSTL_PRAGMA_SIMD
499 for (_Size __j = 0; __j < __block_size; ++__j)
500 {
501 __lane[__j].~_Tp();
502 }
503 }
504 else
505 {
506 for (_Size __i = 0; __i < __n; ++__i)
507 {
508 __init = __binary_op(__init, __f(__i));
509 }
510 }
511 return __init;
512 }
513
514 // Exclusive scan for "+" and arithmetic types
515 template <class _InputIterator, class _Size, class _OutputIterator, class _UnaryOperation, class _Tp,
516 class _BinaryOperation>
517 typename std::enable_if<is_arithmetic_plus<_Tp, _BinaryOperation>::value, std::pair<_OutputIterator, _Tp>>::type
__simd_scan(_InputIterator __first,_Size __n,_OutputIterator __result,_UnaryOperation __unary_op,_Tp __init,_BinaryOperation,std::false_type)518 __simd_scan(_InputIterator __first, _Size __n, _OutputIterator __result, _UnaryOperation __unary_op, _Tp __init,
519 _BinaryOperation, /*Inclusive*/ std::false_type)
520 {
521 _PSTL_PRAGMA_SIMD_SCAN(+ : __init)
522 for (_Size __i = 0; __i < __n; ++__i)
523 {
524 __result[__i] = __init;
525 _PSTL_PRAGMA_SIMD_EXCLUSIVE_SCAN(__init)
526 __init += __unary_op(__first[__i]);
527 }
528 return std::make_pair(__result + __n, __init);
529 }
530
531 // As soon as we cannot call __binary_op in "combiner" we create a wrapper over _Tp to encapsulate __binary_op
532 template <typename _Tp, typename _BinaryOp>
533 struct _Combiner
534 {
535 _Tp __value;
536 _BinaryOp* __bin_op; // Here is a pointer to function because of default ctor
537
_Combiner_Combiner538 _Combiner() : __value{}, __bin_op(nullptr) {}
_Combiner_Combiner539 _Combiner(const _Tp& value, const _BinaryOp* bin_op) : __value(value), __bin_op(const_cast<_BinaryOp*>(bin_op)) {}
_Combiner_Combiner540 _Combiner(const _Combiner& __obj) : __value{}, __bin_op(__obj.__bin_op) {}
541
542 void
operator_Combiner543 operator()(const _Combiner& __obj)
544 {
545 __value = (*__bin_op)(__value, __obj.__value);
546 }
547 };
548
549 // Exclusive scan for other binary operations and types
550 template <class _InputIterator, class _Size, class _OutputIterator, class _UnaryOperation, class _Tp,
551 class _BinaryOperation>
552 typename std::enable_if<!is_arithmetic_plus<_Tp, _BinaryOperation>::value, std::pair<_OutputIterator, _Tp>>::type
__simd_scan(_InputIterator __first,_Size __n,_OutputIterator __result,_UnaryOperation __unary_op,_Tp __init,_BinaryOperation __binary_op,std::false_type)553 __simd_scan(_InputIterator __first, _Size __n, _OutputIterator __result, _UnaryOperation __unary_op, _Tp __init,
554 _BinaryOperation __binary_op, /*Inclusive*/ std::false_type)
555 {
556 typedef _Combiner<_Tp, _BinaryOperation> _CombinerType;
557 _CombinerType __init_{__init, &__binary_op};
558
559 _PSTL_PRAGMA_DECLARE_REDUCTION(__bin_op, _CombinerType)
560
561 _PSTL_PRAGMA_SIMD_SCAN(__bin_op : __init_)
562 for (_Size __i = 0; __i < __n; ++__i)
563 {
564 __result[__i] = __init_.__value;
565 _PSTL_PRAGMA_SIMD_EXCLUSIVE_SCAN(__init_)
566 _PSTL_PRAGMA_FORCEINLINE
567 __init_.__value = __binary_op(__init_.__value, __unary_op(__first[__i]));
568 }
569 return std::make_pair(__result + __n, __init_.__value);
570 }
571
572 // Inclusive scan for "+" and arithmetic types
573 template <class _InputIterator, class _Size, class _OutputIterator, class _UnaryOperation, class _Tp,
574 class _BinaryOperation>
575 typename std::enable_if<is_arithmetic_plus<_Tp, _BinaryOperation>::value, std::pair<_OutputIterator, _Tp>>::type
__simd_scan(_InputIterator __first,_Size __n,_OutputIterator __result,_UnaryOperation __unary_op,_Tp __init,_BinaryOperation,std::true_type)576 __simd_scan(_InputIterator __first, _Size __n, _OutputIterator __result, _UnaryOperation __unary_op, _Tp __init,
577 _BinaryOperation, /*Inclusive*/ std::true_type)
578 {
579 _PSTL_PRAGMA_SIMD_SCAN(+ : __init)
580 for (_Size __i = 0; __i < __n; ++__i)
581 {
582 __init += __unary_op(__first[__i]);
583 _PSTL_PRAGMA_SIMD_INCLUSIVE_SCAN(__init)
584 __result[__i] = __init;
585 }
586 return std::make_pair(__result + __n, __init);
587 }
588
589 // Inclusive scan for other binary operations and types
590 template <class _InputIterator, class _Size, class _OutputIterator, class _UnaryOperation, class _Tp,
591 class _BinaryOperation>
592 typename std::enable_if<!is_arithmetic_plus<_Tp, _BinaryOperation>::value, std::pair<_OutputIterator, _Tp>>::type
__simd_scan(_InputIterator __first,_Size __n,_OutputIterator __result,_UnaryOperation __unary_op,_Tp __init,_BinaryOperation __binary_op,std::true_type)593 __simd_scan(_InputIterator __first, _Size __n, _OutputIterator __result, _UnaryOperation __unary_op, _Tp __init,
594 _BinaryOperation __binary_op, std::true_type)
595 {
596 typedef _Combiner<_Tp, _BinaryOperation> _CombinerType;
597 _CombinerType __init_{__init, &__binary_op};
598
599 _PSTL_PRAGMA_DECLARE_REDUCTION(__bin_op, _CombinerType)
600
601 _PSTL_PRAGMA_SIMD_SCAN(__bin_op : __init_)
602 for (_Size __i = 0; __i < __n; ++__i)
603 {
604 _PSTL_PRAGMA_FORCEINLINE
605 __init_.__value = __binary_op(__init_.__value, __unary_op(__first[__i]));
606 _PSTL_PRAGMA_SIMD_INCLUSIVE_SCAN(__init_)
607 __result[__i] = __init_.__value;
608 }
609 return std::make_pair(__result + __n, __init_.__value);
610 }
611
612 // [restriction] - std::iterator_traits<_ForwardIterator>::value_type should be DefaultConstructible.
613 // complexity [violation] - We will have at most (__n-1 + number_of_lanes) comparisons instead of at most __n-1.
614 template <typename _ForwardIterator, typename _Size, typename _Compare>
615 _ForwardIterator
__simd_min_element(_ForwardIterator __first,_Size __n,_Compare __comp)616 __simd_min_element(_ForwardIterator __first, _Size __n, _Compare __comp) noexcept
617 {
618 if (__n == 0)
619 {
620 return __first;
621 }
622
623 typedef typename std::iterator_traits<_ForwardIterator>::value_type _ValueType;
624 struct _ComplexType
625 {
626 _ValueType __min_val;
627 _Size __min_ind;
628 _Compare* __min_comp;
629
630 _ComplexType() : __min_val{}, __min_ind{}, __min_comp(nullptr) {}
631 _ComplexType(const _ValueType& val, const _Compare* comp)
632 : __min_val(val), __min_ind(0), __min_comp(const_cast<_Compare*>(comp))
633 {
634 }
635 _ComplexType(const _ComplexType& __obj)
636 : __min_val(__obj.__min_val), __min_ind(__obj.__min_ind), __min_comp(__obj.__min_comp)
637 {
638 }
639
640 _PSTL_PRAGMA_DECLARE_SIMD
641 void
642 operator()(const _ComplexType& __obj)
643 {
644 if (!(*__min_comp)(__min_val, __obj.__min_val) &&
645 ((*__min_comp)(__obj.__min_val, __min_val) || __obj.__min_ind - __min_ind < 0))
646 {
647 __min_val = __obj.__min_val;
648 __min_ind = __obj.__min_ind;
649 }
650 }
651 };
652
653 _ComplexType __init{*__first, &__comp};
654
655 _PSTL_PRAGMA_DECLARE_REDUCTION(__min_func, _ComplexType)
656
657 _PSTL_PRAGMA_SIMD_REDUCTION(__min_func : __init)
658 for (_Size __i = 1; __i < __n; ++__i)
659 {
660 const _ValueType __min_val = __init.__min_val;
661 const _ValueType __current = __first[__i];
662 if (__comp(__current, __min_val))
663 {
664 __init.__min_val = __current;
665 __init.__min_ind = __i;
666 }
667 }
668 return __first + __init.__min_ind;
669 }
670
671 // [restriction] - std::iterator_traits<_ForwardIterator>::value_type should be DefaultConstructible.
672 // complexity [violation] - We will have at most (2*(__n-1) + 4*number_of_lanes) comparisons instead of at most [1.5*(__n-1)].
673 template <typename _ForwardIterator, typename _Size, typename _Compare>
674 std::pair<_ForwardIterator, _ForwardIterator>
__simd_minmax_element(_ForwardIterator __first,_Size __n,_Compare __comp)675 __simd_minmax_element(_ForwardIterator __first, _Size __n, _Compare __comp) noexcept
676 {
677 if (__n == 0)
678 {
679 return std::make_pair(__first, __first);
680 }
681 typedef typename std::iterator_traits<_ForwardIterator>::value_type _ValueType;
682
683 struct _ComplexType
684 {
685 _ValueType __min_val;
686 _ValueType __max_val;
687 _Size __min_ind;
688 _Size __max_ind;
689 _Compare* __minmax_comp;
690
691 _ComplexType() : __min_val{}, __max_val{}, __min_ind{}, __max_ind{}, __minmax_comp(nullptr) {}
692 _ComplexType(const _ValueType& min_val, const _ValueType& max_val, const _Compare* comp)
693 : __min_val(min_val), __max_val(max_val), __min_ind(0), __max_ind(0),
694 __minmax_comp(const_cast<_Compare*>(comp))
695 {
696 }
697 _ComplexType(const _ComplexType& __obj)
698 : __min_val(__obj.__min_val), __max_val(__obj.__max_val), __min_ind(__obj.__min_ind),
699 __max_ind(__obj.__max_ind), __minmax_comp(__obj.__minmax_comp)
700 {
701 }
702
703 void
704 operator()(const _ComplexType& __obj)
705 {
706 // min
707 if ((*__minmax_comp)(__obj.__min_val, __min_val))
708 {
709 __min_val = __obj.__min_val;
710 __min_ind = __obj.__min_ind;
711 }
712 else if (!(*__minmax_comp)(__min_val, __obj.__min_val))
713 {
714 __min_val = __obj.__min_val;
715 __min_ind = (__min_ind - __obj.__min_ind < 0) ? __min_ind : __obj.__min_ind;
716 }
717
718 // max
719 if ((*__minmax_comp)(__max_val, __obj.__max_val))
720 {
721 __max_val = __obj.__max_val;
722 __max_ind = __obj.__max_ind;
723 }
724 else if (!(*__minmax_comp)(__obj.__max_val, __max_val))
725 {
726 __max_val = __obj.__max_val;
727 __max_ind = (__max_ind - __obj.__max_ind < 0) ? __obj.__max_ind : __max_ind;
728 }
729 }
730 };
731
732 _ComplexType __init{*__first, *__first, &__comp};
733
734 _PSTL_PRAGMA_DECLARE_REDUCTION(__min_func, _ComplexType);
735
736 _PSTL_PRAGMA_SIMD_REDUCTION(__min_func : __init)
737 for (_Size __i = 1; __i < __n; ++__i)
738 {
739 auto __min_val = __init.__min_val;
740 auto __max_val = __init.__max_val;
741 auto __current = __first + __i;
742 if (__comp(*__current, __min_val))
743 {
744 __init.__min_val = *__current;
745 __init.__min_ind = __i;
746 }
747 else if (!__comp(*__current, __max_val))
748 {
749 __init.__max_val = *__current;
750 __init.__max_ind = __i;
751 }
752 }
753 return std::make_pair(__first + __init.__min_ind, __first + __init.__max_ind);
754 }
755
756 template <class _InputIterator, class _DifferenceType, class _OutputIterator1, class _OutputIterator2,
757 class _UnaryPredicate>
758 std::pair<_OutputIterator1, _OutputIterator2>
__simd_partition_copy(_InputIterator __first,_DifferenceType __n,_OutputIterator1 __out_true,_OutputIterator2 __out_false,_UnaryPredicate __pred)759 __simd_partition_copy(_InputIterator __first, _DifferenceType __n, _OutputIterator1 __out_true,
760 _OutputIterator2 __out_false, _UnaryPredicate __pred) noexcept
761 {
762 _DifferenceType __cnt_true = 0, __cnt_false = 0;
763
764 _PSTL_PRAGMA_SIMD
765 for (_DifferenceType __i = 0; __i < __n; ++__i)
766 {
767 _PSTL_PRAGMA_SIMD_ORDERED_MONOTONIC_2ARGS(__cnt_true : 1, __cnt_false : 1)
768 if (__pred(__first[__i]))
769 {
770 __out_true[__cnt_true] = __first[__i];
771 ++__cnt_true;
772 }
773 else
774 {
775 __out_false[__cnt_false] = __first[__i];
776 ++__cnt_false;
777 }
778 }
779 return std::make_pair(__out_true + __cnt_true, __out_false + __cnt_false);
780 }
781
782 template <class _ForwardIterator1, class _ForwardIterator2, class _BinaryPredicate>
783 _ForwardIterator1
__simd_find_first_of(_ForwardIterator1 __first,_ForwardIterator1 __last,_ForwardIterator2 __s_first,_ForwardIterator2 __s_last,_BinaryPredicate __pred)784 __simd_find_first_of(_ForwardIterator1 __first, _ForwardIterator1 __last, _ForwardIterator2 __s_first,
785 _ForwardIterator2 __s_last, _BinaryPredicate __pred) noexcept
786 {
787 typedef typename std::iterator_traits<_ForwardIterator1>::difference_type _DifferencType;
788
789 const _DifferencType __n1 = __last - __first;
790 const _DifferencType __n2 = __s_last - __s_first;
791 if (__n1 == 0 || __n2 == 0)
792 {
793 return __last; // according to the standard
794 }
795
796 // Common case
797 // If first sequence larger than second then we'll run simd_first with parameters of first sequence.
798 // Otherwise, vice versa.
799 if (__n1 < __n2)
800 {
801 for (; __first != __last; ++__first)
802 {
803 if (__unseq_backend::__simd_or(
804 __s_first, __n2,
805 __internal::__equal_value_by_pred<decltype(*__first), _BinaryPredicate>(*__first, __pred)))
806 {
807 return __first;
808 }
809 }
810 }
811 else
812 {
813 for (; __s_first != __s_last; ++__s_first)
814 {
815 const auto __result = __unseq_backend::__simd_first(
816 __first, _DifferencType(0), __n1, [__s_first, &__pred](_ForwardIterator1 __it, _DifferencType __i) {
817 return __pred(__it[__i], *__s_first);
818 });
819 if (__result != __last)
820 {
821 return __result;
822 }
823 }
824 }
825 return __last;
826 }
827
828 template <class _RandomAccessIterator, class _DifferenceType, class _UnaryPredicate>
829 _RandomAccessIterator
__simd_remove_if(_RandomAccessIterator __first,_DifferenceType __n,_UnaryPredicate __pred)830 __simd_remove_if(_RandomAccessIterator __first, _DifferenceType __n, _UnaryPredicate __pred) noexcept
831 {
832 // find first element we need to remove
833 auto __current = __unseq_backend::__simd_first(
834 __first, _DifferenceType(0), __n,
835 [&__pred](_RandomAccessIterator __it, _DifferenceType __i) { return __pred(__it[__i]); });
836 __n -= __current - __first;
837
838 // if we have in sequence only one element that pred(__current[1]) != false we can exit the function
839 if (__n < 2)
840 {
841 return __current;
842 }
843
844 _DifferenceType __cnt = 0;
845 _PSTL_PRAGMA_SIMD
846 for (_DifferenceType __i = 1; __i < __n; ++__i)
847 {
848 _PSTL_PRAGMA_SIMD_ORDERED_MONOTONIC(__cnt : 1)
849 if (!__pred(__current[__i]))
850 {
851 __current[__cnt] = std::move(__current[__i]);
852 ++__cnt;
853 }
854 }
855 return __current + __cnt;
856 }
857 } // namespace __unseq_backend
858 } // namespace __pstl
859
860 _PSTL_HIDE_FROM_ABI_POP
861
862 #endif /* _PSTL_UNSEQ_BACKEND_SIMD_H */
863