SUBROUTINE P11 ( A, B, C, MODULUS, Q, M, NUMSIZE ) c cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc c CVS Info c $Date: 2005/01/10 21:56:21 $ c $Revision: 1.2 $ c $RCSfile: p11.F,v $ c $Name: rel_5 $ c c P11 calculates the matrix product C = A x B, reduced mod MODULUS. c It implements Fox's algorithm for distributing the work on a Q x Q grid c of processors, each with an M x M subblock of the global matrices. c c Parameters: c c Provided by calling routine: c A = Matrix of multiple precision numbers c B = Matrix of multiple precision numbers c MODULUS = Large integer modulus, stored as an array of integers c Q = The size of the grid of PE's Q x Q c M = The size of the sub-matrices on each PE c NUMSIZE = The number of words to store an MP integer c (passed in so P11 can declare arrays with exact dimensions) c c Returned by this routine: c C = Matrix of multiple precision numbers, C = A x B c cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc c IMPLICIT NONE c #include "bench11.h" #include c C--CVS variable declaration TYPE CVS sequence character( 160 ) string integer stringend END TYPE CVS C--CVS initilaize variables TYPE( CVS ),save :: CVS_INFO = $ CVS("BMARKGRP $Date: 2005/01/10 21:56:21 $ $Revision: 1.2 $" // $ "$RCSfile: p11.F,v $ $Name: rel_5 $", 0) C INTEGER Q, M, NUMSIZE c c These are allocated in main M11 c MPLONG A ( NUMSIZE, M, M ) MPLONG B ( NUMSIZE, M, M ) MPLONG C ( NUMSIZE, M, M ) MPLONG MODULUS ( NUMSIZE ) c c The following are allocated here c c TEMPA, OTHERB are for shuffling submatrices for Fox's algorithm c MPLONG TEMPA ( :, :, : ) MPLONG OTHERB( :, :, : ) ALLOCATABLE TEMPA, OTHERB c TARGET A, B, TEMPA, OTHERB MPLONG, POINTER :: MULTA(:,:,:), MULTB(:,:,:) c c Multiply into TEMPMP, accumulate sums in TEMPC c MPLONG TEMPMP ( 2*MAXNUMSIZE ) MPLONG TEMPC ( 2*MAXNUMSIZE ) c INTEGER ROW, COL, ROOT, WHICHB, STEP INTEGER I, J, ABCSIZE, TEMPSIZE, MATRIXSIZE INTEGER SOUR, DEST, TAG, IERROR c c MPI stuff c INTEGER STATUS(MPI_STATUS_SIZE) INTEGER REQUEST, WORLD, ROW_COMM c IF (Q .GT. 1) THEN c WORLD = MPI_COMM_WORLD ROW = MYPE/Q COL = MOD(MYPE, Q) c make row communicator for broadcasting A across row CALL MPI_COMM_SPLIT( WORLD, ROW, COL, ROW_COMM, IERROR ) c allocate space for temp matrices for Fox's algorithm ALLOCATE ( TEMPA( NUMSIZE, M, M ), OTHERB( NUMSIZE, M, M ) ) c MATRIXSIZE = NUMSIZE * M * M TAG = 8 WHICHB = 0 c ENDIF c c initialize C to hold product A*B c DO 10, I = 1, M DO 10, J = 1, M C(MPALLOC, I, J) = NUMSIZE - MPHL #if (GMP32 == 0) C(MPSIZE, I, J) = 0 #endif #ifdef GMP C(MPPTR, I, J) = LOC( C(MPDATA, I, J) ) #endif 10 CONTINUE c TEMPMP(MPALLOC) = 2*NUMSIZE - MPHL #if (GMP32 == 0) TEMPMP(MPSIZE) = 0 #endif TEMPC(MPALLOC) = 2*NUMSIZE - MPHL #ifdef GMP TEMPMP(MPPTR) = LOC( TEMPMP(MPDATA) ) TEMPC(MPPTR) = LOC( TEMPC(MPDATA) ) #endif c c IF (Q .EQ. 1) THEN c c ordinary multiply on 1 processor for timing comparison c CALL MULT ( A, B, C, MODULUS, NUMSIZE, M, TEMPC, TEMPMP ) c ELSE ! Q > 1 c c Fox`s algorithm takes Q steps c DO 200, STEP = 0, Q - 1 c c Spread a subblock of A across a row of PEs, then multiply c First PE's on main diagonal send, then diag moves one block to right c ROOT = MOD(ROW + STEP, Q) c CALL MPI_BARRIER(WORLD, IERROR) c IF ( COL .EQ. ROOT ) THEN c c Each PE on current diagonal broadcasts its A, multiplies its A c CALL MPI_BCAST ( A, MATRIXSIZE, MY_INTEGER, ROOT, * ROW_COMM, IERROR ) MULTA => A c ELSE ! COL .NE ROOT c c Each PE not on curr diag receives broadcast into TEMPA, multiplies TEMPA c CALL MPI_BCAST ( TEMPA, MATRIXSIZE, MY_INTEGER, ROOT, * ROW_COMM, IERROR ) MULTA => TEMPA c END IF c c Active version of B switches between B and otherB c IF ( WHICHB .EQ. 0 ) THEN MULTB => B ELSE MULTB => OTHERB END IF c CALL MULT ( MULTA, MULTB, C, MODULUS, NUMSIZE, M, * TEMPC, TEMPMP ) c IF (STEP .LT. (Q - 1)) THEN c c Rotate rows of sub-matrices up one (B->otherB or vice versa) c Each PE receives from below and sends to above c CALL MPI_BARRIER(WORLD, IERROR) c DEST = MOD ( MYPE + Q * (Q-1), NPES ) SOUR = MOD ( MYPE + Q, NPES ) c IF ( WHICHB .EQ. 0 ) THEN c CALL MPI_IRECV ( OTHERB, MATRIXSIZE, MY_INTEGER, * SOUR, TAG, WORLD, REQUEST, IERROR ) c CALL MPI_SEND ( B, MATRIXSIZE, MY_INTEGER, * DEST, TAG, WORLD, IERROR ) c CALL MPI_WAIT ( REQUEST, STATUS, IERROR ) c ELSE c CALL MPI_IRECV ( B, MATRIXSIZE, MY_INTEGER, * SOUR, TAG, WORLD, REQUEST, IERROR ) c CALL MPI_SEND ( OTHERB, MATRIXSIZE, MY_INTEGER, * DEST, TAG, WORLD, IERROR ) c CALL MPI_WAIT ( REQUEST, STATUS, IERROR ) c END IF c CALL MPI_BARRIER(WORLD, IERROR) c c Switch to "other" B - toggle between B, OTHERB in MULT c WHICHB = 1 - WHICHB c ENDIF ! not last STEP c 200 CONTINUE ! loop on STEP to Q-1 c ENDIF ! Q > 1 c RETURN END c c Single subr to handle 4 combinations (A or TEMPA) X (B or OTHERB) c SUBROUTINE MULT ( LA, LB, LC, MODULUS, NUMSIZE, M, TEMPC, TEMPMP ) #include "bench11.h" c MPLONG LA ( NUMSIZE, M, M ) MPLONG LB ( NUMSIZE, M, M ) MPLONG LC ( NUMSIZE, M, M ) MPLONG MODULUS ( NUMSIZE ) MPLONG TEMPMP ( 2 * NUMSIZE ) MPLONG TEMPC ( 2 * NUMSIZE ) c INTEGER M, I, J, K c #ifdef GMP c Must fix up pointer word since most A`s and all B`s have been moved DO I = 1, M DO J = 1, M LA(MPPTR, I, J) = LOC( LA(MPDATA, I, J) ) LB(MPPTR, I, J) = LOC( LB(MPDATA, I, J) ) ENDDO ENDDO #endif DO I = 1, M DO J = 1, M c #if (GMP32 == 0) TEMPC(MPSIZE) = 0 #else c erase upper 32 bits of dual-use word 1 TEMPC(MPALLOC) = IAND( TEMPC(MPALLOC), BITMASK32 ) #endif c DO K = 1, M CALL MPMULT ( TEMPMP, LA(1,I,K), LB(1,K,J) ) CALL MPADD ( TEMPC, TEMPMP, TEMPC ) END DO c c Accumulate sums in LC(I,J) -- every entry has Q contributions c (one for each time P11 calls MULT) c With Q-1 extra calls to MPMOD, C can be (NUMSIZE.. not (2*NUMSIZE.. c Work in double-size TEMPC, then copy back into single-size C(I,J) c CALL MPADD ( TEMPC, TEMPC, LC(1,I,J) ) CALL MPMOD ( TEMPC, TEMPC, MODULUS ) c #ifdef GMP CALL MPSET ( LC(1,I,J), TEMPC ) #else LC(MPSIZE, I,J) = TEMPC(MPSIZE) DO K = MPHL + 1, MPHL + TEMPC(MPSIZE) LC(K, I,J) = TEMPC(K) ENDDO #endif c END DO END DO c RETURN END