#define restrict /* Benchmark #8 -- Dynamic Program Large sparse matrices Parameters: Provided by the calling routine: A = K-long array of N by N matrices, packed into N by L arrays IA = N by L array of indices for the packing of A D = T-long array of integers between 1 and K used to select the appropriate A matrix N = The size of the A matrices K = The number of A matrices L = Number of non-zeros in each row and column of each A matrix T = The length of D, and one less than the length of B D2 = T = D1*D2, with neither of these equal to 1. Returned by this routine: B = T+1 long array containing the best path. Z = The log probability of this path cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc Basic Algorithm: The input matrices are assumed to be compressed along their columns. Thus there is an array containing the nonzero elements, compressed into L rows, and a same shaped array containing the row-numbers of these elements. The basic algorithm is then straight-forward: For i = 1,2,..,N Y[0](i) = 1.0 For t = 1,2,...,T k = D(t) for i = 1,...,N Y[t](i) = A[k](i,1)*Y[t-1](IA[k](i,1)) P[t](i) = IA[k](i,1) next i for j = 2,...,L for i = 1,...,N if A[k](i,j)*Y[t-1](IA[k](i,j)) > Y[t](i) Y[t](i) = A[k](i,j)*Y[t-1](IA[k](i,j)) P[t](i) = IA[k](i,j) end if next i next j next t B(T) = 1 YY = Y[T](1) For i = 2,...,N if Y[T](i) > YY YY = Y[T](i) B(T) = i end if Next i For t = T-1,T-2,...,1,0 set B(t) = P[t+1](B(t+1)) Modifications to the Basic Algorithm: The algorithm would be as given in the Basic Algorithm above except for two things: the numbers tend to underflow, and the amount of storage required for all the Y and P arrays would overflow memory on any existing computer. To solve the underflow problem, note that the i which maximizes Y[t](j) = max Y[t-1](i)*A[k](i,j) also maximizes Log (Y[t](j)) = max Log (Y[t-1](i)) + Log (A[k](i,j)) The matrices are non-negative. Thus, all work will be done in the log domain. (Zeros in the matrices, if any, will be replaced with very small values.) To solve the memory overflow problem, there are three options. One could a) store most of the Y and P arrays on secondary storage; b) compress the data so it does fit; or c) store only a portion of the data, from which the rest can be recomputed fairly quickly. It is this last option that is implemented here. Pick two numbers, D1 and D2, such that D1*D2=T. Then two passes can be made through the T steps. The first saves the Y arrays for every D1'th step. The second pass then takes the saved steps, starting with the last and working down, regenerating the P arrays for all the steps, and then generating the B array working backwards through the P's. The memory required for this solution is proportional to D1+D2, as opposed to D1*D2(=T). Obviously, memory used will be minimal if D1=D2=SQRT(T). For the parameter of T = 2000, this should fit on most modern high performance computers. And this is what is implemented here. As now coded, it is assumed that D1 * D2 = T, and that neither of these are equal to 1. Note that the work goes up as the number of passes. This work is needed to recompute the Y arrays. Thus, this requires roughly twice the work. */ #include #include #include "bench8sc.h" #include #include #include #include static char cvs_info[] = "BMARKGRP \$Date: 2005/01/10 21:04:37 \$ \$Revision: 1.2 \$ \$RCSfile: p8sc.c,v \$ \$Name: rel_5 \$"; #define min(A,B) ((A) < (B) ? (A) : (B)) #define max(A,B) ((A) < (B) ? (B) : (A)) int32 *spp; double y[2][MATRIXSIZE]; /* The vectors formed after each step */ double yim[PART][MATRIXSIZE]; /* The intermediate values of Y being stored at the end of each of the subdivisions of the first and second pass. The first D3 vectors are used for the first pass, and the remaining D2 for the second pass. */ void err(int errorcode, char *msg) { char errmsg[MPI_MAX_ERROR_STRING]; int strlen; MPI_Error_string(errorcode,errmsg,&strlen); printf("%s::%s\n",msg,errmsg); MPI_Abort(MPI_COMM_WORLD, 1); } void p8s(double *a, int32 ia[NONZERO][MATRIXSIZE], long d[DLEN], int32 b[DLEN+1], int n, int k, int l, int t, int d2, double *z, int penum, int npes) { int start; /* offset of starting point of this subdivision of steps */ int first; /* Flag which is initialized to zero, indicating that the first time FWBW is called, B(T) is to be calculated. Thereafter, FIRST is set to 1 */ int d1; extern double y[2][MATRIXSIZE]; extern double yim[PART][MATRIXSIZE]; void g8s(),fw(),fwbw(); int i,i2; double x0, y0; double cputime(), wall(); int mynum, myfirst, mylast, e_code; int32 pp[MATRIXSIZE*MD1]; /* The storage space for the D1 P vectors needed during each of the subdivisions of the last pass. */ double *scratch; scratch = (double * ) malloc(sizeof(double) * MATRIXSIZE / npes); if (scratch == (double *)0) { printf("Can't malloc scratch space \n"); MPI_Abort(MPI_COMM_WORLD, 1); exit(-1); } spp = (int32 *) malloc(sizeof(int32)*MATRIXSIZE/npes); if (scratch == (double *)0) { printf("Can't malloc scratch spp \n"); MPI_Abort(MPI_COMM_WORLD, 1); exit(-1); } d1 = t/d2; if ((d1 == 1) || (d2 == 1)) { printf(" In p8s, both d1 and d2 must be greater than 1\n"); printf(" d1 = %d, d2 = %d, t = %d\n", d1,d2,t); exit(-1); } mynum = n/npes; /* # of elements each pe is responsible for */ myfirst = penum * mynum; /* index of first element for this pe */ mylast = myfirst + mynum -1; /* index of last element for this pe */ x0 = -cputime(); y0 = -wall(); g8s(a,mynum,k,l,npes); x0 += cputime(); y0 += wall(); /* printf("Time to convert matrices to logs\n"); printf("CPU = %12.4f\n",x0); printf("WallCloc = %12.4f\n",y0); */ /* Initialize Y[1] to be all log(1.0). */ for(i=0; i scratch[i]) { scratch[i] = temp; spp[i] = ia[j][myfirst+i]; } } if((e_code = MPI_Allgather(scratch,mynum,MPI_DOUBLE,&(y[0][0]),mynum,MPI_DOUBLE,MPI_COMM_WORLD)) != MPI_SUCCESS) err(e_code, "Allgather"); if((e_code = MPI_Gather(spp,mynum,MPI_I32,&(pp[0][0]),mynum,MPI_I32,0,MPI_COMM_WORLD)) != MPI_SUCCESS) err(e_code, "Gather"); ibuf = 1; ibuf2 = 0; /* Do the remaining steps, going between Y(,1) and Y(,0) */ for(kk=1;kk scratch[i]) { scratch[i] = temp; spp[i] = ia[j][i+myfirst]; } } zero=0; /* FLOWMARK(&zero); */ } if((e_code = MPI_Allgather(scratch,mynum,MPI_DOUBLE,&(y[ibuf][0]),mynum,MPI_DOUBLE,MPI_COMM_WORLD)) != MPI_SUCCESS) err(e_code, "Allgather"); if((e_code = MPI_Gather(spp,mynum,MPI_I32,&(pp[kk][0]),mynum,MPI_I32,0,MPI_COMM_WORLD)) != MPI_SUCCESS) err(e_code, "Gather"); ibuf2 = ibuf; ibuf = 1 - ibuf; } /* * If these are the last steps, compute B(T) by taking the maximum * value in Y(T) and storing its position */ if(!penum) { if (!first) { temp = y[ibuf2][0]; ii = 0; for(i = 1;i temp) { temp = y[ibuf2][i]; ii = i; } b[t] = ii; *z = temp; } /* * Backtrack over all the steps computed in this subroutine */ for(i=t-1;i>-1;i--) b[i] = pp[i][b[i+1]]; } } /* * Subroutine to convert A matrices to logs, to eliminate the need to rescale. */ void g8s( double *a, int mynum, int k, int l, int npes) { double fpmin = 1.17549435e-38f; int i,j,kk; /* #ifdef FPMIN fpmin=FPMIN; #endif #ifdef FLT_MIN fpmin = FLT_MIN; #endif */ /* * Scale each matrix */ for(i=0;i