SUBROUTINE P8D(A,D,B,N,K,T,TIMES,AA,Z)
c
cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
c
c Benchmark #8 -- Dynamic Program
c Small dense matrices
c Parallel Version
c
c Parameters;
c
c Provided by the calling routine:
c A = K long array of N by N matrices
c D = T long array of integers between 1 and K used to
c select the appropriate A matrix
c N = The size of the A matrices
c K = The number of A matrices
c T = The length of D, and one less than the length of B
c TIMES = Related to the formation of A
c AA = With TIMES related to the formation of A
c
c Returned by this routine:
c B = T+1 long array containing the best path
c Z = The probability of this best path
c
cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
c
c Basic Algorithm
c
c The basic algorithm is given in the problem.
c
c There are a number of ideas to improve this algorithm. First, the
c floating point numbers involved tend to go out of bounds. This is
c solved by noting that the i which maximizes
c Y[t](j) = max Y[t-1](i)*A[k](i,j)
c also maximizes
c Log (Y[t](j)) = max Log (Y[t-1](i)) + Log (A[k](i,j))
c (The matrices are non-negative.) Thus, all work will be done in the
c log domain. Zeros in the matrices, if any, will be replaced with
c very small values.
c
c The second is the idea that we can pre-compute all possible products
c of pairs or even trios of matrices, and then double or even triple
c step through the Y vectors. Even if we do not wish to compute
c all possible trios of matrices, because of the nature of the matrices,
c they are the result of multiplying a single matrix by the K different
c vectors in array TIMES, we can still precompute A[D(t)]*A[D(t+1)]*A
c and triple step with work proportional to N*N + N.
c
c The third idea is to note that the Y vectors are not output. Thus,
c we need to keep only the last and current copies of Y at any time.
c If memory becomes a problem, we can also pack the P array, but this
c will not be done here.
c
c The fourth idea (and the difference between this version and the
c version in p8d.f, results in increased work and memory requirements,
c but also increased parallelism. Note that we can compute
c A[D(t)]*A[D(t+1)]*A[D(t+2]*...*A[D(t+j)]
c and then from Y[t-1] obtain Y[t+j] in one step. However, the
c work required to do this is N times greater (the work to compute
c A[D(t)]*A[D(t+1)] is order N^3, where the work to compute
c Y[t-1]*A[D(t)] is order N^2.) However, we can divide the range
c T into numproc sub-regions, and compute in parallel the product
c of the A's for each sub-region. In a final, relatively cheap
c but sequential, section of work, Y[T] can be computed in order
c N^2 * numproc operations. The computation of B(t) can be computed
c in parallel also.
c
cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
c
USE LIMITS
c The maximum size of the matrices and vectors
PARAMETER (MN=B8_MN)
c
c The maximum number of matrices
PARAMETER (MK=B8_MK)
c
c The maximum length of the D stream
PARAMETER (MT = B8_MT)
c
c The degree of parallelism desired
INTEGER VL
PARAMETER (VL = 128)
c
c The size of the matrices and vectors
INTEGER N
c
c The number of matrices
INTEGER K
c
c The K NxN matrices
REAL A(N,N,K)
c
c The number of steps, and number of steps in each sub-region
INTEGER T, TT
c
c the vector specifying which of the K matrices to use in each step
INTEGER D(T)
c
c The vectors formed after each step
REAL Y(MN,0:1)
c
c The "best" path through the steps
INTEGER B(0:T)
c
c The vectors of which element of the matrix formed the maximum
INTEGER P(MN,MT)
c
c The lookup table of precomputed pseudo products of all permutations of
c the K matrices taken in groups of two or three
REAL TAB(MN,MN,MK,MK)
c
c A table of the best path through each precomputed matrix
INTEGER TABP(MN,MN,MK,MK,2)
c
c A table of positions in each sub-region currently being computed,
c and the value in D around this position
INTEGER IPOS(VL), K1(VL), K2(VL), K3(VL), JPOS(VL)
c
c The partial tables of products of A over each sub-region, for
c parallelism. (Plays the role of Y in the MATRIX MATRIX products)
REAL REG(VL,MN,MN,0:1)
c
c The partial tables of pointers to best previous value. This takes the
c place of the P table in the MATRIX MATRIX products.
INTEGER REGP(VL,MN,MN,(MT/(3*VL) +1))
c
c Base matrix to form the K actual matrices
REAL TIMES(N,K)
c
c Second base matrix to form the K actual matrices
REAL AA(N,N)
c
c The user-supplied minimum allowed floating point number
c PARAMETER (FPMIN=1.0E-2465)
PARAMETER (FPMIN=1.0E-307)
c
c Used to time the various subportions of this program
REAL CPUTIME,WALL,X0,Y0
c
C--CVS variable declaration
TYPE CVS
sequence
character( 160 ) string
integer stringend
END TYPE CVS
C--CVS initilaize variables
TYPE( CVS ),save :: CVS_INFO =
$ CVS("BMARKGRP $Date: 2005/01/07 23:07:10 $ $Revision: 1.2 $" //
$ "$RCSfile: p8dp.f,v $ $Name: rel_5 $", 0)
c
c Check the input parameters to see that there is sufficient space
IF(MN .LT. N) THEN
PRINT 5,MN,N
5 FORMAT('Insufficient space in working arrays for P8D.',/,
$ 'Size available from parameter MN = ',I5,/,
$ 'Needed = ',I6)
STOP
ENDIF
c
IF(MK .LT. K) THEN
PRINT 10,MK,K
10 FORMAT('Insufficient space in working arrays for P8D.',/,
$ 'Size available from parameter MK = ',I5,/,
$ 'Needed = ',I6)
STOP
ENDIF
c
IF(MT .LT. T) THEN
PRINT 15,MT,T
15 FORMAT('Insufficient space in working arrays for P8D.',/,
$ 'Size available from parameter MT = ',I10,/,
$ 'Needed = ',I10)
STOP
ENDIF
c
c
c NOTE: The code for timings and region identification are
c provided for internal instrumentation only. To use, just
c uncomment the code. Do not include these times in the final
c report.
c
c
c Convert matrices to logs and generate tables for three-stepping.
c
Cc X0 = CPUTIME()
Cc Y0 = WALL()
c
CALL G8D(A,AA,TAB,TABP,TIMES,N,K,T,MN,MK,FPMIN)
c
Cc X0 = CPUTIME() - X0
Cc Y0 = WALL() -Y0
Cc PRINT 17, X0, Y0
Cc 17 FORMAT('Time to set up three-stepping tables',/,
Cc $ ' CPU = ',F12.4,' seconds.',/,
Cc $ ' Wall Clock = ',F12.4,' seconds.',/)
c
c
TT = T/(3*VL)
IF((T .LE. (3*K*K*N)) .OR. (TT .LT. 20)) THEN
PRINT 23, T, 3*K*K*N, 60*VL
23 FORMAT('T = ',I7,' is probably too small for this approach.',/,
$ 'It should be at least the larger of ',I7,' and ',I7)
STOP
ENDIF
c
Cc X0 = CPUTIME()
Cc Y0 = WALL()
c
c Generate a vector of start points for each sub-region, and the
c number of sequential steps left over.
c
DO 30 I = 1, VL
IPOS(I) = T + 1 - (1+VL-I)*3*TT
JPOS(I) = IPOS(I)
30 CONTINUE
c
II = IPOS(1) -1
c
c Generate Initial REG Values
DO 35 KK=1,VL
K1(KK) = D(IPOS(KK))
K2(KK) = D(1+IPOS(KK))
K3(KK) = D(2+IPOS(KK))
IPOS(KK) = IPOS(KK) + 3
35 CONTINUE
c
DO 50 I=1,N
DO 45 J=1,N
DO 40 KK=1,VL
REG(KK,I,J,0) = TAB(J,I,K1(KK),K2(KK)) +
$ TIMES(J,K3(KK))
40 CONTINUE
45 CONTINUE
50 CONTINUE
IBUF = 1
IBUF2 = 0
c
c Compute product of matrices in each sub-region
DO 90 I=2,TT
DO 55 KK=1,VL
K1(KK) = D(IPOS(KK))
K2(KK) = D(1+IPOS(KK))
K3(KK) = D(2+IPOS(KK))
IPOS(KK) = IPOS(KK) + 3
55 CONTINUE
c
DO 85 J=1,N
DO 80 JJ=1,N
DO 60 KK=1,VL
REG(KK,J,JJ,IBUF) = REG(KK,J,1,IBUF2) +
$ TAB(JJ,1,K1(KK),K2(KK))
REGP(KK,J,JJ,I) = 1
60 CONTINUE
c
DO 70 M=2,N
DO 65 KK=1,VL
TEMP = REG(KK,J,M,IBUF2) +
$ TAB(JJ,M,K1(KK),K2(KK))
IF(TEMP .GT. REG(KK,J,JJ,IBUF)) THEN
REG(KK,J,JJ,IBUF) = TEMP
REGP(KK,J,JJ,I) = M
ENDIF
65 CONTINUE
70 CONTINUE
c
DO 75 KK=1,VL
REG(KK,J,JJ,IBUF) = REG(KK,J,JJ,IBUF) + TIMES(JJ,K3(KK))
75 CONTINUE
80 CONTINUE
85 CONTINUE
c
IBUF2 = IBUF
IBUF = 1 - IBUF
90 CONTINUE
c
c
c Check time and publish results
Cc X0 = CPUTIME() - X0
Cc Y0 = WALL() -Y0
Cc PRINT 93, X0, Y0
Cc 93 FORMAT( 'Time to do Parallel Matrix Matrix Operations',/,
Cc $ ' CPU = ',F12.4,' seconds.',/,
Cc $ ' Wall Clock = ',F12.4,' seconds.',/)
c
Cc PRINT 97,II,3*TT,VL
Cc 97 FORMAT('Single step Region = ',I7,/,
Cc $ 'Parallel Sub-region = ',I7,/,
Cc $ 'Vector Length = ',I7)
c
c
c Now start the scaler portion
c Initialize the initial Y array
c
Cc X0 = CPUTIME()
Cc Y0 = WALL()
c
DO 200 I=1,N
Y(I,0) = 0
200 CONTINUE
JBUF = 1
JBUF2 = 0
c
IF(II .NE. 0) THEN
DO 240 I=1,II
DO 210 J=1,N
Y(J,JBUF) = Y(1,JBUF2) + A(1,J,D(I))
P(J,I) = 1
210 CONTINUE
c
DO 230 J=2,N
DO 220 KK=1,N
TEMP = Y(J,JBUF2) + A (J,KK,D(I))
IF(TEMP .GT. Y(KK,JBUF)) THEN
Y(KK,JBUF) = TEMP
P(KK, I) = J
ENDIF
220 CONTINUE
230 CONTINUE
c
JBUF2 = JBUF
JBUF = 1 - JBUF
240 CONTINUE
ENDIF
c
c Giant step over each sub-region
DO 280 I=1,VL
DO 250 J=1,N
Y(J,JBUF) = Y(1,JBUF2) + REG(I,1,J,IBUF2)
P(J,II+I) = 1
250 CONTINUE
c
DO 270 J=2,N
DO 260 KK = 1, N
TEMP = Y(J,JBUF2) + REG(I,J,KK,IBUF2)
IF(TEMP .GT. Y(KK,JBUF)) THEN
Y(KK,JBUF) = TEMP
P(KK,II+I) = J
ENDIF
260 CONTINUE
270 CONTINUE
c
c Update buffer pointers
JBUF2 = JBUF
JBUF = 1 - JBUF
280 CONTINUE
c
c
Cc X0 = CPUTIME() - X0
Cc Y0 = WALL() -Y0
Cc PRINT 283, X0, Y0
Cc 283 FORMAT( 'Time to do Sequential Operations',/,
Cc $ ' CPU = ',F12.4,' seconds.',/,
Cc $ ' Wall Clock = ',F12.4,' seconds.',/)
c
c
c Now find the end of the best path
X0 = CPUTIME()
Y0 = WALL()
c
Z = Y(1,JBUF2)
IJ = 1
DO 300,KK=2,N
IF(Y(KK,JBUF2) .GT. Z) THEN
Z = Y(KK,JBUF2)
IJ = KK
ENDIF
300 CONTINUE
B(T) = IJ
c
c Giant step over each parallel sub-region
DO 310, KK=VL,1,-1
IJ = P(IJ,II+KK)
B(JPOS(KK)-1) = IJ
310 CONTINUE
c
c Step over initial sequential region
IF(II .NE. 0) THEN
DO 320 I=II,1,-1
B(I-1) = P(B(I),I)
320 CONTINUE
ENDIF
c
c Now step through each parallel region
DO 330 KK=1,VL
K1(KK) = B(JPOS(KK)-1)
K2(KK) = B(JPOS(KK) + TT*3 - 1)
330 CONTINUE
c
DO 350 I=TT-1,1,-1
DO 340 KK=1,VL
JTEMP = JPOS(KK)-1
B(3*I + JTEMP) = REGP(KK,K1(KK),K2(KK),I+1)
K2(KK) = B(3*I + JTEMP)
340 CONTINUE
350 CONTINUE
c
DO 370 I=T-2,II+1,-3
IJ = B(I+2)
KK = D(I)
MM = D(I+1)
J = B(I-1)
B(I+1) = TABP(IJ,J,KK,MM,2)
B(I) = TABP(IJ,J,KK,MM,1)
370 CONTINUE
c
c
Cc X0 = CPUTIME() - X0
Cc Y0 = WALL() -Y0
Cc PRINT 373, X0, Y0
Cc 373 FORMAT( 'Time to do Backtracking',/,
Cc $ ' CPU = ',F12.4,' seconds.',/,
Cc
Cc $ ' Wall Clock = ',F12.4,' seconds.',/)
c
c
RETURN
END
c
c
c
c
SUBROUTINE G8D(A,AA,TAB,TABP,TIMES,N,K,T,MN,MK,FPMIN)
c
cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
c
c Subroutine to take the log of the A matrices, and to
c generate the tables to be used in the 3-stepping.
c
cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
c
c The maximum size of the matrices and vectors
PARAMETER (MM=30)
REAL A(N,N,K)
REAL AA(N,N)
REAL TAB(MN,MN,MK,MK)
INTEGER TABP(MN,MN,MK,MK,2)
REAL TIMES(N,K)
REAL VTEMP(MM,MM)
INTEGER PTEMP(MM,MM)
INTEGER T
c
c Some checking to make sure that arrays are large enough.
IF(MM .LT. N) THEN
PRINT 5,MM,N
5 FORMAT('Insufficient space in working arrays for G8D.',/,
$ 'Size available from parameter MM = ',I5,/,
$ 'Needed = ',I6)
STOP
ENDIF
c
c Scale all the matrices, and the array TIMES
DO 30 J = 1,K
DO 20 I = 1,N
IF(TIMES(I,J) .GT. FPMIN) THEN
TIMES(I,J) = ALOG(TIMES(I,J))
ELSE
TIMES(I,J) = ALOG(FPMIN)
ENDIF
c
DO 10 II = 1,N
IF(A(II,I,J) .GT. FPMIN) THEN
A(II,I,J) = ALOG(A(II,I,J))
ELSE
A(II,I,J) = ALOG(FPMIN)
ENDIF
10 CONTINUE
20 CONTINUE
30 CONTINUE
c
DO 50 I=1,N
DO 40 J=1,N
IF(AA(I,J) .GT. FPMIN) THEN
AA(I,J) = ALOG(AA(I,J))
ELSE
AA(I,J) = ALOG(FPMIN)
ENDIF
40 CONTINUE
50 CONTINUE
c
c If N, K and T warrant, multiply together all K*K permutations of the
c K input matrices.
c
IF(T .LE. (3*K*K*N)) RETURN
c
DO 190 I = 1,K
DO 180 J = 1,K
DO 130 KK = 1,N
DO 100 L = 1,N
VTEMP(KK,L) = A(KK,1,I) + A(1,L,J)
PTEMP(KK,L) = 1
100 CONTINUE
c
DO 120 M = 2,N
DO 110 L = 1,N
TMP = A(KK,M,I)+A(M,L,J)
IF(TMP .GT. VTEMP(KK,L)) THEN
VTEMP(KK,L) = TMP
PTEMP(KK,L) = M
ENDIF
110 CONTINUE
120 CONTINUE
130 CONTINUE
c
c Multiply all K*K matrices in the table by the first A matrix. Then,
c to step by three, one just has to multiply by the appropriate matrix
c in the table using the first two steps as an index, and then using
c the multiplication factors of the third step to readjust the columns
c of the output.
DO 170 KK = 1,N
DO 140 L = 1,N
TAB(L,KK,I,J) = VTEMP(KK,1) + AA(1,L)
TABP(L,KK,I,J,1) = PTEMP(KK,1)
TABP(L,KK,I,J,2) = 1
140 CONTINUE
DO 160 M = 2,N
DO 150 L = 1,N
TEMP = VTEMP(KK,M) + AA(M,L)
IF(TEMP .GT. TAB(L,KK,I,J)) THEN
TAB(L,KK,I,J) = TEMP
TABP(L,KK,I,J,1) = PTEMP(KK,M)
TABP(L,KK,I,J,2) = M
ENDIF
150 CONTINUE
160 CONTINUE
170 CONTINUE
180 CONTINUE
190 CONTINUE
c
RETURN
END