Lines Matching full:merge
124 .merge = {
232 hsg_merge_levels_init_shared(struct hsg_merge * const merge) in hsg_merge_levels_init_shared() argument
240 uint32_t const bs_threads = merge->warps << hsg_config.warp.lanes_log2; in hsg_merge_levels_init_shared()
244 …uint32_t const bs_rows_even = bs_kpt_mod & ~1; // must be even because flip merge only works on ro… in hsg_merge_levels_init_shared()
254 merge->rows_bs = MIN_MACRO(bs_rows_even, hsg_config.thread.regs); in hsg_merge_levels_init_shared()
267 // if merge->warps is not pow2 then we're going to skip creating a bc elsewhere in hsg_merge_levels_init_shared()
269 uint32_t const bc_warps_min = MAX_MACRO(merge->warps,hsg_config.block.warps_min); in hsg_merge_levels_init_shared()
281 merge->rows_bc = MIN_MACRO(bc_kpt_mod, hsg_config.thread.regs); in hsg_merge_levels_init_shared()
282 merge->skpw_bc = bc_keys / bc_warps_min; in hsg_merge_levels_init_shared()
292 hsg_merge_levels_init_1(struct hsg_merge * const merge, uint32_t const warps, uint32_t const level,… in hsg_merge_levels_init_1() argument
296 merge->levels[level].evenodds[even_odd]++; in hsg_merge_levels_init_1()
297 merge->levels[level].networks[even_odd] = warps; in hsg_merge_levels_init_1()
302 merge->levels[level].active.b64 |= BITS_TO_MASK_AT_64(warps,offset); in hsg_merge_levels_init_1()
304 uint32_t const count = merge->levels[level].count++; in hsg_merge_levels_init_1()
308 merge->levels[level].evenodd_masks[even_odd] |= bit; in hsg_merge_levels_init_1()
313 uint32_t const diff = offset - merge->offsets[index-1]; in hsg_merge_levels_init_1()
315 uint32_t const diff_0 = merge->levels[level].diffs[0]; in hsg_merge_levels_init_1()
316 uint32_t const diff_1 = merge->levels[level].diffs[1]; in hsg_merge_levels_init_1()
329 merge->levels[level].diffs [diff_idx] = diff; in hsg_merge_levels_init_1()
330 merge->levels[level].diff_masks[diff_idx] |= 1 << (count-1); in hsg_merge_levels_init_1()
333 merge->networks[index] = warps; in hsg_merge_levels_init_1()
334 merge->offsets [index] = offset; in hsg_merge_levels_init_1()
339 hsg_merge_levels_init_1(merge,l,level+1,offset); in hsg_merge_levels_init_1()
340 hsg_merge_levels_init_1(merge,r,level+1,offset+l); in hsg_merge_levels_init_1()
345 hsg_merge_levels_debug(struct hsg_merge * const merge) in hsg_merge_levels_debug() argument
349 uint32_t count = merge->levels[level].count; in hsg_merge_levels_debug()
357 merge->levels[level].active.b64); in hsg_merge_levels_debug()
362 merge->levels[level].diffs[0], in hsg_merge_levels_debug()
363 merge->levels[level].diff_masks[0], in hsg_merge_levels_debug()
364 POPCOUNT_MACRO(merge->levels[level].diff_masks[0]), in hsg_merge_levels_debug()
365 merge->levels[level].diffs[1], in hsg_merge_levels_debug()
366 merge->levels[level].diff_masks[1], in hsg_merge_levels_debug()
367 POPCOUNT_MACRO(merge->levels[level].diff_masks[1])); in hsg_merge_levels_debug()
372 merge->levels[level].evenodd_masks[0], in hsg_merge_levels_debug()
373 POPCOUNT_MACRO(merge->levels[level].evenodd_masks[0]), in hsg_merge_levels_debug()
374 merge->levels[level].evenodd_masks[1], in hsg_merge_levels_debug()
375 POPCOUNT_MACRO(merge->levels[level].evenodd_masks[1])); in hsg_merge_levels_debug()
379 if (merge->levels[level].networks[ii] > 1) in hsg_merge_levels_debug()
384 merge->levels[level].evenodds[ii], in hsg_merge_levels_debug()
385 merge->levels[level].networks[ii]); in hsg_merge_levels_debug()
395 merge->offsets [index], in hsg_merge_levels_debug()
396 merge->networks[index]); in hsg_merge_levels_debug()
407 hsg_merge_levels_hint(struct hsg_merge * const merge, bool const autotune) in hsg_merge_levels_hint() argument
409 // clamp against merge levels in hsg_merge_levels_hint()
413 uint32_t const n_max = MAX_MACRO(merge->levels[level].networks[0], in hsg_merge_levels_hint()
414 merge->levels[level].networks[1]); in hsg_merge_levels_hint()
416 if (n_max <= (merge->rows_bs + hsg_config.thread.xtra)) in hsg_merge_levels_hint()
421 hsg_config.thread.xtra = n_max - merge->rows_bs; in hsg_merge_levels_hint()
444 n_max - merge->rows_bs); in hsg_merge_levels_hint()
746 struct hsg_merge const * const merge, in hsg_bc_half_merge_level() argument
751 uint32_t const net_even = merge->levels[0].networks[0]; in hsg_bc_half_merge_level()
757 if (active < merge->warps) in hsg_bc_half_merge_level()
779 // merge all registers in hsg_bc_half_merge_level()
801 hsg_bc_half_merge(struct hsg_op * ops, struct hsg_merge const * const merge) in hsg_bc_half_merge() argument
804 // will only be called with merge->warps >= 2 in hsg_bc_half_merge()
806 uint32_t const warps = MAX_MACRO(merge->warps,hsg_config.block.warps_min); in hsg_bc_half_merge()
809 uint32_t const net_even = merge->levels[0].networks[0]; in hsg_bc_half_merge()
812 ops = hsg_op(ops,BC_MERGE_H_PREAMBLE(merge->index)); in hsg_bc_half_merge()
816 uint32_t const s_max = merge->rows_bc; in hsg_bc_half_merge()
829 // merge loop in hsg_bc_half_merge()
830 ops = hsg_bc_half_merge_level(ops,merge,r_lo,s_count); in hsg_bc_half_merge()
850 struct hsg_merge const * const merge, in hsg_bs_flip_merge_level() argument
855 // Note there are a number of ways to flip merge these warps. There in hsg_bs_flip_merge_level()
856 // is a magic number in the merge structure that indicates which in hsg_bs_flip_merge_level()
862 // to gather up the network associated with a row and merge them. in hsg_bs_flip_merge_level()
867 // If there are more warps than smem row pairs to merge then we in hsg_bs_flip_merge_level()
873 // Note that it takes two warps to flip merge two smem rows. in hsg_bs_flip_merge_level()
876 // attempt to balance the load>merge>store operations across the in hsg_bs_flip_merge_level()
883 // where are we in computed merge? in hsg_bs_flip_merge_level()
884 uint32_t const count = merge->levels[level].count; in hsg_bs_flip_merge_level()
892 uint32_t active = merge->warps; in hsg_bs_flip_merge_level()
895 if (merge->warps > s_rows) { in hsg_bs_flip_merge_level()
903 // how many equal number of rows to merge? in hsg_bs_flip_merge_level()
917 uint32_t const offset = merge->offsets [index+ii]; in hsg_bs_flip_merge_level()
918 uint32_t const network = merge->networks[index+ii]; in hsg_bs_flip_merge_level()
946 base += active * merge->warps; in hsg_bs_flip_merge_level()
958 hsg_bs_flip_merge(struct hsg_op * ops, struct hsg_merge const * const merge) in hsg_bs_flip_merge() argument
961 ops = hsg_op(ops,BS_MERGE_H_PREAMBLE(merge->index)); in hsg_bs_flip_merge()
963 // begin merge in hsg_bs_flip_merge()
968 uint32_t const count = merge->levels[level].count; in hsg_bs_flip_merge()
974 uint32_t const s_pairs_max = merge->rows_bs/2; // this is warp mod in hsg_bs_flip_merge()
987 ops = hsg_op(ops,BS_REG_SHARED_STORE_V(merge->index,r_lo+c,c*2+0)); in hsg_bs_flip_merge()
988 ops = hsg_op(ops,BS_REG_SHARED_STORE_V(merge->index,r_hi-c,c*2+1)); in hsg_bs_flip_merge()
994 // merge loop in hsg_bs_flip_merge()
995 ops = hsg_bs_flip_merge_level(ops,merge,level,s_pairs); in hsg_bs_flip_merge()
1003 ops = hsg_op(ops,BS_REG_SHARED_LOAD_V(merge->index,r_lo+c,c*2+0)); in hsg_bs_flip_merge()
1004 ops = hsg_op(ops,BS_REG_SHARED_LOAD_V(merge->index,r_hi-c,c*2+1)); in hsg_bs_flip_merge()
1009 if (merge->levels[level].active.b64 != BITS_TO_MASK_64(merge->warps)) in hsg_bs_flip_merge()
1010 ops = hsg_op(ops,BS_ACTIVE_PRED(merge->index,level)); in hsg_bs_flip_merge()
1029 hsg_bs_flip_merge_all(struct hsg_op * ops, const struct hsg_merge * const merge)
1033 const struct hsg_merge* const m = merge + merge_idx;
1054 hsg_bs_sort(struct hsg_op * ops, struct hsg_merge const * const merge) in hsg_bs_sort() argument
1057 ops = hsg_op(ops,BS_KERNEL_PROTO(merge->index)); in hsg_bs_sort()
1063 ops = hsg_op(ops,BS_KERNEL_PREAMBLE(merge->index)); in hsg_bs_sort()
1075 if (merge->warps > 1) in hsg_bs_sort()
1076 ops = hsg_bs_flip_merge(ops,merge); in hsg_bs_sort()
1116 hsg_bc_clean(struct hsg_op * ops, struct hsg_merge const * const merge) in hsg_bc_clean() argument
1119 ops = hsg_op(ops,BC_KERNEL_PROTO(merge->index)); in hsg_bc_clean()
1125 ops = hsg_op(ops,BC_KERNEL_PREAMBLE(merge->index)); in hsg_bc_clean()
1128 if (merge->warps == 1) in hsg_bc_clean()
1136 ops = hsg_bc_half_merge(ops,merge); in hsg_bc_clean()
1184 // GENERATE FLIP MERGE KERNEL
1284 // GENERATE HALF MERGE KERNELS
1338 // GENERATE MERGE KERNELS
1349 // GENERATE FLIP MERGE KERNELS in hsg_xm_merge_all()
1351 …for (uint32_t scale_log2=hsg_config.merge.flip.lo; scale_log2<=hsg_config.merge.flip.hi; scale_log… in hsg_xm_merge_all()
1355 // GENERATE HALF MERGE KERNELS in hsg_xm_merge_all()
1357 …for (uint32_t scale_log2=hsg_config.merge.half.lo; scale_log2<=hsg_config.merge.half.hi; scale_log… in hsg_xm_merge_all()
1372 struct hsg_merge const * const merge, in hsg_op_translate_depth() argument
1381 target_pfn(target,config,merge,ops,depth-1); in hsg_op_translate_depth()
1385 target_pfn(target,config,merge,ops,depth); in hsg_op_translate_depth()
1386 ops = hsg_op_translate_depth(target_pfn,target,config,merge,ops+1,depth+1); in hsg_op_translate_depth()
1390 target_pfn(target,config,merge,ops++,depth); in hsg_op_translate_depth()
1402 struct hsg_merge const * const merge, in hsg_op_translate() argument
1405 hsg_op_translate_depth(target_pfn,target,config,merge,ops,0); in hsg_op_translate()
1524 hsg_config.merge.flip.lo = atoi(optarg); in main()
1528 hsg_config.merge.flip.hi = atoi(optarg); in main()
1532 hsg_config.merge.half.lo = atoi(optarg); in main()
1536 hsg_config.merge.half.hi = atoi(optarg); in main()
1540 hsg_config.merge.flip.warps = atoi(optarg); in main()
1544 hsg_config.merge.half.warps = atoi(optarg); in main()
1558 // INIT MERGE in main()
1595 // INIT MERGE MAGIC in main()
1599 struct hsg_merge * const merge = hsg_merge + ii; in main() local
1601 if (merge->warps == 0) in main()
1609 merge->warps); in main()
1611 hsg_merge_levels_init_shared(merge); in main()
1613 hsg_merge_levels_init_1(merge,merge->warps,0,0); in main()
1615 hsg_merge_levels_hint(merge,autotune); in main()
1622 hsg_merge_levels_debug(merge); in main()
1652 // GENERATE MERGE KERNELS in main()