1 /*
2  * Copyright 2018 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  *
7  */
8 
9 //
10 //
11 //
12 
13 #include "transpose.h"
14 #include "common/macros.h"
15 
16 //
17 // Rows must be an even number.  This is enforced elsewhere.
18 //
19 // The transpose requires (cols_log2 * rows/2) row-pair blends.
20 //
21 void
hsg_transpose(uint32_t const cols_log2,uint32_t const rows,void (* pfn_blend)(uint32_t const cols_log2,uint32_t const row_ll,uint32_t const row_ur,void * blend),void * blend,void (* pfn_remap)(uint32_t const row_from,uint32_t const row_to,void * remap),void * remap)22 hsg_transpose(uint32_t                   const cols_log2,
23               uint32_t                   const rows,
24               void (*pfn_blend)(uint32_t const cols_log2,
25                                 uint32_t const row_ll, // lower-left
26                                 uint32_t const row_ur, // upper-right
27                                 void *         blend),
28               void *                           blend,
29               void (*pfn_remap)(uint32_t const row_from,
30                                 uint32_t const row_to,
31                                 void *         remap),
32               void *                           remap)
33 {
34   // get mapping array
35   uint32_t * map_curr = ALLOCA_MACRO(rows * sizeof(*map_curr));
36   uint32_t * map_next = ALLOCA_MACRO(rows * sizeof(*map_next));
37 
38   // init the mapping array
39   for (uint32_t ii=0; ii<rows; ii++)
40     map_curr[ii] = ii;
41 
42   // successively transpose rows using blends
43   for (uint32_t cc=1; cc<=cols_log2; cc++)
44     {
45       uint32_t const mask = BITS_TO_MASK(cc);
46 
47       for (uint32_t ii=0; ii<rows; ii++)
48         {
49           uint32_t const left = map_curr[ii];
50           uint32_t const stay = left & ~mask;
51 
52           if (left != stay) // will be swapped away
53             {
54               for (uint32_t jj=0; jj<rows; jj++)
55                 {
56                   if (map_curr[jj] == stay)
57                     {
58                       map_next[jj] = stay;
59                       map_next[ii] = stay + (rows << (cc-1));
60 
61                       pfn_blend(cc,ii,jj,blend); // log2,left,right,payload
62 
63                       break;
64                     }
65                 }
66             }
67         }
68 
69       uint32_t * tmp = map_curr;
70 
71       map_curr = map_next;
72       map_next = tmp;
73     }
74 
75   // write out the remapping
76   for (uint32_t ii=0; ii<rows; ii++)
77     pfn_remap(ii,map_curr[ii] >> cols_log2,remap);
78 }
79 
80 //
81 // test it!
82 //
83 
84 #ifdef HS_TRANSPOSE_DEBUG
85 
86 #include <stdio.h>
87 
88 static uint32_t cols; // implicit on SIMD/GPU
89 
90 static
91 void
hsg_debug_blend(uint32_t const cols_log2,uint32_t const row_ll,uint32_t const row_ur,uint32_t * b)92 hsg_debug_blend(uint32_t const cols_log2,
93                 uint32_t const row_ll, // lower-left
94                 uint32_t const row_ur, // upper-right
95                 uint32_t *     b)
96 {
97   fprintf(stdout,"BLEND( %u, %3u, %3u )\n",cols_log2,row_ll,row_ur);
98 
99   uint32_t * const ll = ALLOCA_MACRO(cols * sizeof(*b));
100   uint32_t * const ur = ALLOCA_MACRO(cols * sizeof(*b));
101 
102   memcpy(ll,b+row_ll*cols,cols * sizeof(*b));
103   memcpy(ur,b+row_ur*cols,cols * sizeof(*b));
104 
105   for (uint32_t ii=0; ii<cols; ii++)
106     b[row_ll*cols+ii] = ((ii >> cols_log2-1) & 1) ? ll[ii] : ur[ii^(1<<cols_log2-1)];
107 
108   for (uint32_t ii=0; ii<cols; ii++)
109     b[row_ur*cols+ii] = ((ii >> cols_log2-1) & 1) ? ll[ii^(1<<cols_log2-1)] : ur[ii];
110 }
111 
112 static
113 void
hsg_debug_remap(uint32_t const row_from,uint32_t const row_to,uint32_t * const r)114 hsg_debug_remap(uint32_t   const row_from,
115                 uint32_t   const row_to,
116                 uint32_t * const r)
117 {
118   fprintf(stdout,"REMAP( %3u, %3u )\n",row_from,row_to);
119 
120   r[row_to] = row_from;
121 }
122 
123 static
124 void
hsg_debug_print(uint32_t const rows,uint32_t const * const m,uint32_t const * const r)125 hsg_debug_print(uint32_t         const rows,
126                 uint32_t const * const m,
127                 uint32_t const * const r)
128 {
129   for (uint32_t rr=0; rr<rows; rr++) {
130     for (uint32_t cc=0; cc<cols; cc++)
131       fprintf(stdout,"%4u ",m[r[rr]*cols + cc]);
132     fprintf(stdout,"\n");
133   }
134 }
135 
136 int
main(int argc,char * argv[])137 main(int argc, char * argv[])
138 {
139   uint32_t const cols_log2 = (argc <= 1) ? 3 : strtoul(argv[1],NULL,0);
140   uint32_t const rows      = (argc <= 2) ? 6 : strtoul(argv[2],NULL,0);
141 
142   if (rows & 1)
143     return;
144 
145   cols = 1 << cols_log2;
146 
147   uint32_t * const b = ALLOCA_MACRO(cols * rows * sizeof(*b));
148   uint32_t * const r = ALLOCA_MACRO(       rows * sizeof(*r));
149 
150   for (uint32_t rr=0; rr<rows; rr++) {
151     r[rr] = rr;
152     for (uint32_t cc=0; cc<cols; cc++)
153       b[rr*cols+cc] = cc*rows+rr;
154   }
155 
156   hsg_debug_print(rows,b,r);
157 
158   hsg_transpose(cols_log2,rows,
159                 hsg_debug_blend,b,
160                 hsg_debug_remap,r);
161 
162   hsg_debug_print(rows,b,r);
163 
164   return EXIT_SUCCESS;
165 }
166 
167 #endif
168 
169 //
170 //
171 //
172