/* * Copyright (c) 2008-2012 Stefan Krah. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "mpdecimal.h" #include <stdio.h> #include <stdlib.h> #include <string.h> #include <limits.h> #include <assert.h> #include "bits.h" #include "constants.h" #include "typearith.h" #include "transpose.h" #define BUFSIZE 4096 #define SIDE 128 /* Bignum: The transpose functions are used for very large transforms in sixstep.c and fourstep.c. */ /* Definition of the matrix transpose */ void std_trans(mpd_uint_t dest[], mpd_uint_t src[], mpd_size_t rows, mpd_size_t cols) { mpd_size_t idest, isrc; mpd_size_t r, c; for (r = 0; r < rows; r++) { isrc = r * cols; idest = r; for (c = 0; c < cols; c++) { dest[idest] = src[isrc]; isrc += 1; idest += rows; } } } /* * Swap half-rows of 2^n * (2*2^n) matrix. * FORWARD_CYCLE: even/odd permutation of the halfrows. * BACKWARD_CYCLE: reverse the even/odd permutation. */ static int swap_halfrows_pow2(mpd_uint_t *matrix, mpd_size_t rows, mpd_size_t cols, int dir) { mpd_uint_t buf1[BUFSIZE]; mpd_uint_t buf2[BUFSIZE]; mpd_uint_t *readbuf, *writebuf, *hp; mpd_size_t *done, dbits; mpd_size_t b = BUFSIZE, stride; mpd_size_t hn, hmax; /* halfrow number */ mpd_size_t m, r=0; mpd_size_t offset; mpd_size_t next; assert(cols == mul_size_t(2, rows)); if (dir == FORWARD_CYCLE) { r = rows; } else if (dir == BACKWARD_CYCLE) { r = 2; } else { abort(); /* GCOV_NOT_REACHED */ } m = cols - 1; hmax = rows; /* cycles start at odd halfrows */ dbits = 8 * sizeof *done; if ((done = mpd_calloc(hmax/(sizeof *done) + 1, sizeof *done)) == NULL) { return 0; } for (hn = 1; hn <= hmax; hn += 2) { if (done[hn/dbits] & mpd_bits[hn%dbits]) { continue; } readbuf = buf1; writebuf = buf2; for (offset = 0; offset < cols/2; offset += b) { stride = (offset + b < cols/2) ? b : cols/2-offset; hp = matrix + hn*cols/2; memcpy(readbuf, hp+offset, stride*(sizeof *readbuf)); pointerswap(&readbuf, &writebuf); next = mulmod_size_t(hn, r, m); hp = matrix + next*cols/2; while (next != hn) { memcpy(readbuf, hp+offset, stride*(sizeof *readbuf)); memcpy(hp+offset, writebuf, stride*(sizeof *writebuf)); pointerswap(&readbuf, &writebuf); done[next/dbits] |= mpd_bits[next%dbits]; next = mulmod_size_t(next, r, m); hp = matrix + next*cols/2; } memcpy(hp+offset, writebuf, stride*(sizeof *writebuf)); done[hn/dbits] |= mpd_bits[hn%dbits]; } } mpd_free(done); return 1; } /* In-place transpose of a square matrix */ static inline void squaretrans(mpd_uint_t *buf, mpd_size_t cols) { mpd_uint_t tmp; mpd_size_t idest, isrc; mpd_size_t r, c; for (r = 0; r < cols; r++) { c = r+1; isrc = r*cols + c; idest = c*cols + r; for (c = r+1; c < cols; c++) { tmp = buf[isrc]; buf[isrc] = buf[idest]; buf[idest] = tmp; isrc += 1; idest += cols; } } } /* * Transpose 2^n * 2^n matrix. For cache efficiency, the matrix is split into * square blocks with side length 'SIDE'. First, the blocks are transposed, * then a square tranposition is done on each individual block. */ static void squaretrans_pow2(mpd_uint_t *matrix, mpd_size_t size) { mpd_uint_t buf1[SIDE*SIDE]; mpd_uint_t buf2[SIDE*SIDE]; mpd_uint_t *to, *from; mpd_size_t b = size; mpd_size_t r, c; mpd_size_t i; while (b > SIDE) b >>= 1; for (r = 0; r < size; r += b) { for (c = r; c < size; c += b) { from = matrix + r*size + c; to = buf1; for (i = 0; i < b; i++) { memcpy(to, from, b*(sizeof *to)); from += size; to += b; } squaretrans(buf1, b); if (r == c) { to = matrix + r*size + c; from = buf1; for (i = 0; i < b; i++) { memcpy(to, from, b*(sizeof *to)); from += b; to += size; } continue; } else { from = matrix + c*size + r; to = buf2; for (i = 0; i < b; i++) { memcpy(to, from, b*(sizeof *to)); from += size; to += b; } squaretrans(buf2, b); to = matrix + c*size + r; from = buf1; for (i = 0; i < b; i++) { memcpy(to, from, b*(sizeof *to)); from += b; to += size; } to = matrix + r*size + c; from = buf2; for (i = 0; i < b; i++) { memcpy(to, from, b*(sizeof *to)); from += b; to += size; } } } } } /* * In-place transposition of a 2^n x 2^n or a 2^n x (2*2^n) * or a (2*2^n) x 2^n matrix. */ int transpose_pow2(mpd_uint_t *matrix, mpd_size_t rows, mpd_size_t cols) { mpd_size_t size = mul_size_t(rows, cols); assert(ispower2(rows)); assert(ispower2(cols)); if (cols == rows) { squaretrans_pow2(matrix, rows); } else if (cols == mul_size_t(2, rows)) { if (!swap_halfrows_pow2(matrix, rows, cols, FORWARD_CYCLE)) { return 0; } squaretrans_pow2(matrix, rows); squaretrans_pow2(matrix+(size/2), rows); } else if (rows == mul_size_t(2, cols)) { squaretrans_pow2(matrix, cols); squaretrans_pow2(matrix+(size/2), cols); if (!swap_halfrows_pow2(matrix, cols, rows, BACKWARD_CYCLE)) { return 0; } } else { abort(); /* GCOV_NOT_REACHED */ } return 1; }