#define restrict
/*
Benchmark #8 -- Dynamic Program
Large sparse matrices
Parameters:
Provided by the calling routine:
A = K-long array of N by N matrices, packed into N by L arrays
IA = N by L array of indices for the packing of A
D = T-long array of integers between 1 and K used to
select the appropriate A matrix
N = The size of the A matrices
K = The number of A matrices
L = Number of non-zeros in each row and column of each A matrix
T = The length of D, and one less than the length of B
D2 = One of the two factors of T: T = D1*D2, with neither D1 nor D2 equal to 1.
Returned by this routine:
B = T+1 long array containing the best path.
Z = The log probability of this path
cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
Basic Algorithm:
The input matrices are assumed to be compressed along their
columns. Thus there is an array containing the nonzero elements,
compressed into L rows, and a same shaped array containing the
row-numbers of these elements.
The basic algorithm is then straight-forward:
For i = 1,2,...,N Y[0](i) = 1.0
For t = 1,2,...,T
    k = D(t)
    for i = 1,...,N
        Y[t](i) = A[k](i,1)*Y[t-1](IA[k](i,1))
        P[t](i) = IA[k](i,1)
    next i
    for j = 2,...,L
        for i = 1,...,N
            if A[k](i,j)*Y[t-1](IA[k](i,j)) > Y[t](i)
                Y[t](i) = A[k](i,j)*Y[t-1](IA[k](i,j))
                P[t](i) = IA[k](i,j)
            end if
        next i
    next j
next t
B(T) = 1
YY = Y[T](1)
For i = 2,...,N
    if Y[T](i) > YY
        YY = Y[T](i)
        B(T) = i
    end if
Next i
For t = T-1,T-2,...,1,0 set B(t) = P[t+1](B(t+1))
Modifications to the Basic Algorithm:
The algorithm would be as given in the Basic Algorithm above
except for two things: the numbers tend to underflow, and
the amount of storage required for all the Y and P arrays
would overflow memory on any existing computer.
To solve the underflow problem, note that the i which maximizes
Y[t](j) = max Y[t-1](i)*A[k](i,j)
also maximizes
Log (Y[t](j)) = max Log (Y[t-1](i)) + Log (A[k](i,j))
The matrices are non-negative. Thus, all work will be done
in the log domain. (Zeros in the matrices, if any, will be replaced
with very small values.)
To solve the memory overflow problem, there are three options. One
could
a) store most of the Y and P arrays on secondary storage;
b) compress the data so it does fit; or
c) store only a portion of the data, from which the rest can be
recomputed fairly quickly.
It is this last option that is implemented here.
Pick two numbers, D1 and D2, such that D1*D2=T. Then two passes can
be made through the T steps. The first saves the Y arrays for every
D1'th step. The second pass then takes the saved steps, starting
with the last and working down, regenerating the P arrays for all the
steps, and then generating the B array working backwards through
the P's.
The memory required for this solution is proportional to D1+D2, as
opposed to D1*D2(=T). Obviously, memory used will be minimal if
D1=D2=SQRT(T). For the parameter of T = 2000, this should fit on
most modern high performance computers. And this is what is
implemented here. As now coded, it is assumed that D1 * D2 = T, and
that neither D1 nor D2 is equal to 1.
Note that the work goes up as the number of passes. This work is
needed to recompute the Y arrays. Thus, this requires roughly
twice the work.
*/
#include
#include
#include "bench8sc.h"
#include
#include
#include
#include
/* CVS info */
/* $Date: 2005/01/10 21:15:00 $ */
/* $Revision: 1.2 $ */
/* $RCSfile: p8s_upc.c,v $ */
/* $Name: rel_5 $ */
static char cvs_info[] = "BMARKGRP $Date: 2005/01/10 21:15:00 $ $Revision: 1.2 $ $RCSfile: p8s_upc.c,v $ $Name: rel_5 $";
/*
 * index(N): evaluates to (N mod mynum)*THREADS + (N div mynum).
 * `mynum` is not a macro parameter; a variable of that name must be in
 * scope wherever the macro is used.  THREADS is not defined in this file
 * (presumably the UPC thread count -- confirm).
 * The argument is parenthesized so that an expression argument such as
 * index(i+1) expands with the intended precedence.
 */
#define index(N) (((N)%mynum)*THREADS + (N)/mynum)
/*
 * min/max: smaller / larger of the two arguments.  Each argument may be
 * evaluated twice, so do not pass expressions with side effects.
 */
#define min(A,B) ((A) < (B) ? (A) : (B))
#define max(A,B) ((A) < (B) ? (B) : (A))
shared double y[2][MATRIXSIZE]; /* The vectors formed after each step. The row
of y that is read is selected by ibuf2
further down in this file. */
shared double yim[PART][MATRIXSIZE]; /* The intermediate values of Y being stored
at the end of each of the subdivisions of
the first and second pass. The first D3
vectors are used for the first pass, and
the remaining D2 for the second pass.
NOTE(review): "D3" is not defined anywhere
in the visible part of this file -- confirm
which count is meant. */
/*
 * p8s -- entry point of the sparse dynamic-programming benchmark.
 *
 * Parameters (meanings taken from the header comment at the top of the file):
 *   a      packed A matrices; passed to g8s() before any other work is done
 *   ia     indices for the packing of A
 *   d      selects the A matrix to use at each step
 *   b      receives the best path (declared with DLEN+1 entries)
 *   n      size of the A matrices
 *   k      number of A matrices
 *   l      number of non-zeros in each row and column of each A matrix
 *   t      number of steps
 *   d2     one factor of t; the other factor is computed here as d1 = t/d2
 *   z      receives the log probability of the best path
 *   penum  number of this PE; penum * (n/npes) is the first element it owns
 *   npes   number of PEs; n/npes elements are assigned to each of them
 *
 * If d1 or d2 equals 1 the routine prints a diagnostic and calls exit(-1).
 *
 * NOTE(review): t/d2 is evaluated before d2 is checked, so d2 == 0 divides
 * by zero, and nothing visible here verifies that d1*d2 == t.
 *
 * NOTE(review): starting at the line `for(i=0; i localy[0][i])` the text of
 * this routine is damaged: loop headers and whole statements are missing,
 * the braces no longer balance, and names that are not declared anywhere in
 * the visible text (localy, localpp, temp, temp0..temp7, ibuf, ibuf2, kk, ii)
 * are used.  The surviving fragments refer to the `first` flag and to
 * backtracking "in this subroutine", so they probably belong to the fw() /
 * fwbw() helpers declared below rather than to p8s itself -- restore the
 * missing text from the original source before relying on any of it.
 */
void p8s(double *a,
int32 ia[NONZERO][MATRIXSIZE],
long d[DLEN],
int32 b[DLEN+1],
int n,
int k,
int l,
int t,
int d2,
double *z,
int penum,
int npes)
{
int start; /* offset of starting point of this subdivision of steps */
int first; /* Flag which is initialized to zero, indicating that the
first time FWBW is called, B(T) is to be calculated.
Thereafter, FIRST is set to 1 */
int d1; /* the other factor of t, computed below as t/d2 */
/*
extern double y[2][MATRIXSIZE];
extern double yim[PART][MATRIXSIZE];
*/
void g8s(),fw(),fwbw();
int i,j,i2;
double x0, y0;
double cputime(), wall();
int mynum, myfirst, mylast, e_code;
static shared int32 pp[MD1][MATRIXSIZE]; /* The storage space for the D1 P vectors
needed during each of the subdivisions of
the last pass. */
double *localyim;
/* Integer division: any remainder of t/d2 is silently dropped. */
d1 = t/d2;
if ((d1 == 1) || (d2 == 1))
{
printf(" In p8s, both d1 and d2 must be greater than 1\n");
printf(" d1 = %d, d2 = %d, t = %d\n", d1,d2,t);
exit(-1);
}
mynum = n/npes; /* # of elements each pe is responsible for */
myfirst = penum * mynum; /* index of first element for this pe */
mylast = myfirst + mynum -1; /* index of last element for this pe */
/*
x0 = -cputime();
y0 = -wall();
*/
/* Convert the A matrices to logs (see the comment above g8s). */
g8s(a,mynum,k,l,npes);
/*
x0 += cputime();
y0 += wall();
*/
/*
printf("Time to convert matrices to logs\n");
printf("CPU = %12.4f\n",x0);
printf("WallCloc = %12.4f\n",y0);
*/
/* Initialize Y[1] to be all log(1.0). */
j = 0;
/* Cast the address of yim[0][MYTHREAD] to an ordinary double pointer
   (presumably this thread's own part of row 0 -- confirm against the
   layout of yim). */
localyim = (double *)&(yim[0][MYTHREAD]);
/* NOTE(review): the text is damaged from the next line onward; see the
   note in the comment above this routine. */
for(i=0; i localy[0][i])
{
localy[0][i] = temp0;
localpp[i] = ia[j][myfirst+i];
}
if (temp1 > localy[0][i+1])
{
localy[0][i+1] = temp1;
localpp[i+1] = ia[j][myfirst+i+1];
}
if (temp2 > localy[0][i+2])
{
localy[0][i+2] = temp2;
localpp[i+2] = ia[j][myfirst+i+2];
}
if (temp3 > localy[0][i+3])
{
localy[0][i+3] = temp3;
localpp[i+3] = ia[j][myfirst+i+3];
}
if (temp4 > localy[0][i+4])
{
localy[0][i+4] = temp4;
localpp[i+4] = ia[j][myfirst+i+4];
}
if (temp5 > localy[0][i+5])
{
localy[0][i+5] = temp5;
localpp[i+5] = ia[j][myfirst+i+5];
}
if (temp6 > localy[0][i+6])
{
localy[0][i+6] = temp6;
localpp[i+6] = ia[j][myfirst+i+6];
}
if (temp7 > localy[0][i+7])
{
localy[0][i+7] = temp7;
localpp[i+7] = ia[j][myfirst+i+7];
}
}
BARRIER;
ibuf = 1;
ibuf2 = 0;
/* Do the remaining steps, going between Y(,1) and Y(,0) */
for(kk=1;kk localy[ibuf][i])
{
localy[ibuf][i] = temp;
localpp[i] = ia[j][i+myfirst];
}
}
*/
for(i=0;i localy[ibuf][i])
{
localy[ibuf][i] = temp0;
localpp[i] = ia[j][i+myfirst];
}
if (temp1 > localy[ibuf][i+1])
{
localy[ibuf][i+1] = temp1;
localpp[i+1] = ia[j][i+1+myfirst];
}
if (temp2 > localy[ibuf][i+2])
{
localy[ibuf][i+2] = temp2;
localpp[i+2] = ia[j][i+2+myfirst];
}
if (temp3 > localy[ibuf][i+3])
{
localy[ibuf][i+3] = temp3;
localpp[i+3] = ia[j][i+3+myfirst];
}
if (temp4 > localy[ibuf][i+4])
{
localy[ibuf][i+4] = temp4;
localpp[i+4] = ia[j][i+4+myfirst];
}
if (temp5 > localy[ibuf][i+5])
{
localy[ibuf][i+5] = temp5;
localpp[i+5] = ia[j][i+5+myfirst];
}
if (temp6 > localy[ibuf][i+6])
{
localy[ibuf][i+6] = temp6;
localpp[i+6] = ia[j][i+6+myfirst];
}
if (temp7 > localy[ibuf][i+7])
{
localy[ibuf][i+7] = temp7;
localpp[i+7] = ia[j][i+7+myfirst];
}
}
}
ibuf2 = ibuf;
ibuf = 1 - ibuf;
BARRIER;
}
/*
* If these are the last steps, compute B(T) by taking the maximum
* value in Y(T) and storing its position
*/
if(!penum)
{
if (!first)
{
temp = y[ibuf2][0];
ii = 0;
for(i = 1;i temp)
{
temp = y[ibuf2][index(i)];
ii = i;
}
b[t] = ii;
*z = temp;
}
/*
* Backtrack over all the steps computed in this subroutine
*/
for(i=t-1;i>-1;i--) {
b[i] = pp[i][index(b[i+1])];
}
}
}
/*
* Subroutine to convert A matrices to logs, to eliminate the need to rescale.
*/
void g8s( double *a,
int mynum,
int k,
int l,
int npes)
{
double fpmin = 1.17549435e-38f;
int i,j,kk;
/*
#ifdef FPMIN
fpmin=FPMIN;
#endif
#ifdef FLT_MIN
fpmin = FLT_MIN;
#endif
*/
/*
* Scale each matrix
*/
for(i=0;i