1 /*
2  * Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 
29 #include "mpdecimal.h"
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <limits.h>
34 #include <assert.h>
35 #include "bits.h"
36 #include "constants.h"
37 #include "typearith.h"
38 #include "transpose.h"
39 
40 
41 #define BUFSIZE 4096
42 #define SIDE 128
43 
44 
45 /* Bignum: The transpose functions are used for very large transforms
46    in sixstep.c and fourstep.c. */
47 
48 
49 /* Definition of the matrix transpose */
50 void
std_trans(mpd_uint_t dest[],mpd_uint_t src[],mpd_size_t rows,mpd_size_t cols)51 std_trans(mpd_uint_t dest[], mpd_uint_t src[], mpd_size_t rows, mpd_size_t cols)
52 {
53     mpd_size_t idest, isrc;
54     mpd_size_t r, c;
55 
56     for (r = 0; r < rows; r++) {
57         isrc = r * cols;
58         idest = r;
59         for (c = 0; c < cols; c++) {
60             dest[idest] = src[isrc];
61             isrc += 1;
62             idest += rows;
63         }
64     }
65 }
66 
67 /*
68  * Swap half-rows of 2^n * (2*2^n) matrix.
69  * FORWARD_CYCLE: even/odd permutation of the halfrows.
70  * BACKWARD_CYCLE: reverse the even/odd permutation.
71  */
72 static int
swap_halfrows_pow2(mpd_uint_t * matrix,mpd_size_t rows,mpd_size_t cols,int dir)73 swap_halfrows_pow2(mpd_uint_t *matrix, mpd_size_t rows, mpd_size_t cols, int dir)
74 {
75     mpd_uint_t buf1[BUFSIZE];
76     mpd_uint_t buf2[BUFSIZE];
77     mpd_uint_t *readbuf, *writebuf, *hp;
78     mpd_size_t *done, dbits;
79     mpd_size_t b = BUFSIZE, stride;
80     mpd_size_t hn, hmax; /* halfrow number */
81     mpd_size_t m, r=0;
82     mpd_size_t offset;
83     mpd_size_t next;
84 
85 
86     assert(cols == mul_size_t(2, rows));
87 
88     if (dir == FORWARD_CYCLE) {
89         r = rows;
90     }
91     else if (dir == BACKWARD_CYCLE) {
92         r = 2;
93     }
94     else {
95         abort(); /* GCOV_NOT_REACHED */
96     }
97 
98     m = cols - 1;
99     hmax = rows; /* cycles start at odd halfrows */
100     dbits = 8 * sizeof *done;
101     if ((done = mpd_calloc(hmax/(sizeof *done) + 1, sizeof *done)) == NULL) {
102         return 0;
103     }
104 
105     for (hn = 1; hn <= hmax; hn += 2) {
106 
107         if (done[hn/dbits] & mpd_bits[hn%dbits]) {
108             continue;
109         }
110 
111         readbuf = buf1; writebuf = buf2;
112 
113         for (offset = 0; offset < cols/2; offset += b) {
114 
115             stride = (offset + b < cols/2) ? b : cols/2-offset;
116 
117             hp = matrix + hn*cols/2;
118             memcpy(readbuf, hp+offset, stride*(sizeof *readbuf));
119             pointerswap(&readbuf, &writebuf);
120 
121             next = mulmod_size_t(hn, r, m);
122             hp = matrix + next*cols/2;
123 
124             while (next != hn) {
125 
126                 memcpy(readbuf, hp+offset, stride*(sizeof *readbuf));
127                 memcpy(hp+offset, writebuf, stride*(sizeof *writebuf));
128                 pointerswap(&readbuf, &writebuf);
129 
130                 done[next/dbits] |= mpd_bits[next%dbits];
131 
132                 next = mulmod_size_t(next, r, m);
133                     hp = matrix + next*cols/2;
134 
135             }
136 
137             memcpy(hp+offset, writebuf, stride*(sizeof *writebuf));
138 
139             done[hn/dbits] |= mpd_bits[hn%dbits];
140         }
141     }
142 
143     mpd_free(done);
144     return 1;
145 }
146 
147 /* In-place transpose of a square matrix */
148 static inline void
squaretrans(mpd_uint_t * buf,mpd_size_t cols)149 squaretrans(mpd_uint_t *buf, mpd_size_t cols)
150 {
151     mpd_uint_t tmp;
152     mpd_size_t idest, isrc;
153     mpd_size_t r, c;
154 
155     for (r = 0; r < cols; r++) {
156         c = r+1;
157         isrc = r*cols + c;
158         idest = c*cols + r;
159         for (c = r+1; c < cols; c++) {
160             tmp = buf[isrc];
161             buf[isrc] = buf[idest];
162             buf[idest] = tmp;
163             isrc += 1;
164             idest += cols;
165         }
166     }
167 }
168 
169 /*
170  * Transpose 2^n * 2^n matrix. For cache efficiency, the matrix is split into
171  * square blocks with side length 'SIDE'. First, the blocks are transposed,
172  * then a square transposition is done on each individual block.
173  */
174 static void
squaretrans_pow2(mpd_uint_t * matrix,mpd_size_t size)175 squaretrans_pow2(mpd_uint_t *matrix, mpd_size_t size)
176 {
177     mpd_uint_t buf1[SIDE*SIDE];
178     mpd_uint_t buf2[SIDE*SIDE];
179     mpd_uint_t *to, *from;
180     mpd_size_t b = size;
181     mpd_size_t r, c;
182     mpd_size_t i;
183 
184     while (b > SIDE) b >>= 1;
185 
186     for (r = 0; r < size; r += b) {
187 
188         for (c = r; c < size; c += b) {
189 
190             from = matrix + r*size + c;
191             to = buf1;
192             for (i = 0; i < b; i++) {
193                 memcpy(to, from, b*(sizeof *to));
194                 from += size;
195                 to += b;
196             }
197             squaretrans(buf1, b);
198 
199             if (r == c) {
200                 to = matrix + r*size + c;
201                 from = buf1;
202                 for (i = 0; i < b; i++) {
203                     memcpy(to, from, b*(sizeof *to));
204                     from += b;
205                     to += size;
206                 }
207                 continue;
208             }
209             else {
210                 from = matrix + c*size + r;
211                 to = buf2;
212                 for (i = 0; i < b; i++) {
213                     memcpy(to, from, b*(sizeof *to));
214                     from += size;
215                     to += b;
216                 }
217                 squaretrans(buf2, b);
218 
219                 to = matrix + c*size + r;
220                 from = buf1;
221                 for (i = 0; i < b; i++) {
222                     memcpy(to, from, b*(sizeof *to));
223                     from += b;
224                     to += size;
225                 }
226 
227                 to = matrix + r*size + c;
228                 from = buf2;
229                 for (i = 0; i < b; i++) {
230                     memcpy(to, from, b*(sizeof *to));
231                     from += b;
232                     to += size;
233                 }
234             }
235         }
236     }
237 
238 }
239 
240 /*
241  * In-place transposition of a 2^n x 2^n or a 2^n x (2*2^n)
242  * or a (2*2^n) x 2^n matrix.
243  */
244 int
transpose_pow2(mpd_uint_t * matrix,mpd_size_t rows,mpd_size_t cols)245 transpose_pow2(mpd_uint_t *matrix, mpd_size_t rows, mpd_size_t cols)
246 {
247     mpd_size_t size = mul_size_t(rows, cols);
248 
249     assert(ispower2(rows));
250     assert(ispower2(cols));
251 
252     if (cols == rows) {
253         squaretrans_pow2(matrix, rows);
254     }
255     else if (cols == mul_size_t(2, rows)) {
256         if (!swap_halfrows_pow2(matrix, rows, cols, FORWARD_CYCLE)) {
257             return 0;
258         }
259         squaretrans_pow2(matrix, rows);
260         squaretrans_pow2(matrix+(size/2), rows);
261     }
262     else if (rows == mul_size_t(2, cols)) {
263         squaretrans_pow2(matrix, cols);
264         squaretrans_pow2(matrix+(size/2), cols);
265         if (!swap_halfrows_pow2(matrix, cols, rows, BACKWARD_CYCLE)) {
266             return 0;
267         }
268     }
269     else {
270         abort(); /* GCOV_NOT_REACHED */
271     }
272 
273     return 1;
274 }
275 
276 
277