/************************************************************************** * * Subroutine P11 performs the matrix multiplication C = A x B * * Parameters: * * Provided by calling routine: * A = Matrix of multiple precision numbers * B = Matrix of multiple precision numbers * MODULUS = Large integer modulus, expressed as an array of integers * Q = The size of the grid of PEs, Q x Q * M = The size of the sub-matrices on each PE * NUMSIZE = The number of words used to store an MP integer * * Returned by this routine: * C = Matrix of multiple precision numbers * *************************************************************************** * * This version for non-shared-memory systems uses Fox`s algorithm for * matrix multiplication. * **************************************************************************/ /* CVS info */ /* $Date: 2005/09/19 18:48:18 $ */ /* $Revision: 1.3 $ */ /* $RCSfile: p11.c,v $ */ /* $Name: rel_5 $ */ #include "bench11.h" static char cvs_info[] = "BMARKGRP $Date: 2005/09/19 18:48:18 $ $Revision: 1.3 $ $RCSfile: p11.c,v $ $Name: rel_5 $"; #ifdef CRAY # include #else # include #endif long pSync[_SHMEM_REDUCE_SYNC_SIZE]; // use macros to address 3-dim arrays with variable dimension #define c(I,J,K) c[ (K) + numsize * ( (J) + m * (I) ) ] #define la(I,J,K) la[ (K) + numsize * ( (J) + m * (I) ) ] #define lb(I,J,K) lb[ (K) + numsize * ( (J) + m * (I) ) ] #define lc(I,J,K) lc[ (K) + numsize * ( (J) + m * (I) ) ] void p11 ( mplong a[], mplong b[], mplong c[], mplong modulus[], int q, int m, int numsize ) { // Multiply into tempmp, accumulate sums in tempc mplong tempmp [ 2*maxnumsize ]; mplong tempc [ 2*maxnumsize ]; int i, j, abcsize, tempsize; void mult(); // tempa, otherb are for shuffling submatrices for Fox's algorithm // multa, multb pointers passed to mult mplong * sendb , * recvb , * multa , * multb; mplong * tempa , * otherb; int row, col, root, whichb, step, matrixsize; int source; extern int npes, mype; if ( q > 1 ) { matrixsize = numsize * m * m; tempa = (mplong *) shmalloc ( 8 * matrixsize ); otherb = (mplong *) shmalloc ( 8 * matrixsize ); row = mype / q; col = mype % q; whichb = 0; } abcsize = numsize - MPHL; tempsize = 2*maxnumsize - MPHL; // initialize c to hold product a*b for ( i = 0; i < m; i++ ) { for ( j = 0; j < m; j++ ) { MPINIT ( &c(i,j,0), abcsize, 0 ); } } // initialize temporaries MPINIT ( tempmp, tempsize, 0 ); if (q == 1) { // M = N, ordinary multiply on 1 processor for timing comparison mult ( a, b, c, modulus, numsize, m, tempc, tempmp, tempsize ); } else { // q > 1 -- Fox's algorithm takes q steps for ( step = 0; step < q; step++ ) { // Spread a subblock of A across a row of PEs, then multiply // PE's on main diagonal send; next step, diag moves to right root = (row + step) % q; shmem_barrier_all(); shmem_broadcast64 ( tempa, a, matrixsize, root, q*row, 0, q, pSync ); if ( col == root ) multa = a; else multa = tempa; // active version of B switches between B and otherB if ( whichb == 0 ) multb = b; else multb = otherb; // multiply appropriate current subblocks mult ( multa, multb, c, modulus, numsize, m, tempc, tempmp, tempsize ); if ( step < q-1 ) { // Rotate rows of sub-matrices up one (B->otherB or vice versa) // Each PE pulls data from below if ( whichb == 0 ) { sendb = b; recvb = otherb; } else { sendb = otherb; recvb = b; } source = ( mype + q ) % npes; // one row down (wraps around) shmem_barrier_all(); shmem_get64 ( recvb, sendb, matrixsize, source ); shmem_barrier_all(); // Switch to "other" B - toggle between B, otherb in mult whichb = 1 - whichb; } // if ( step < q-1 ) } // loop on step to q-1 shfree ( tempa ); shfree ( otherb ); } // q > 1 } // p11 // Single subr to handle 4 combinations (A or tempA) X (B or otherB) void mult ( mplong la[], mplong lb[], mplong lc[], mplong modulus[], int numsize, int m, mplong tempc[], mplong tempmp[], int tempsize ) { int i, j, k; // extern int VERBOSE; #ifdef GMP // Must fix up pointer word since most A's and all B's have been moved for ( i = 0; i < m; i++ ) for ( j = 0; j < m; j++ ) { la(i,j,MPPTR) = (mplong) & la(i,j,MPDATA); lb(i,j,MPPTR) = (mplong) & lb(i,j,MPDATA); } #endif for ( i = 0; i < m; i++ ) for ( j = 0; j < m; j++ ) { // zero tempc - accumulate sum during loop // size = 0 is enough MPINIT ( tempc, tempsize, 0 ); /* VERBOSE = 2; */ /* it makes MPUTIL print lots of debug info */ for ( k = 0; k < m; k++ ) { mpmult ( tempmp, &la(i,k,0), &lb(k,j,0) ); mpadd ( tempc, tempc, tempmp ); } /* * Accumulate sums in lc(i,j) -- every entry has Q contributions * (one for each time P11 calls MULT) * With Q-1 extra calls to mpmod, C can be (numsize.. not (2*numsize.. * Work in double-size tempc, then copy back into single-size lc(i,j) */ mpadd ( tempc, tempc, &lc(i,j,0) ); mpmod ( tempc, tempc, modulus ); #ifdef GMP mpset ( &lc(i,j,0), tempc ); #else // MPUTIL lc(i,j,MPSIZE) = tempc[MPSIZE]; for ( k = MPDATA; k < MPDATA + tempc[MPSIZE]; k++ ) lc(i,j,k) = tempc[k]; #endif } }