//#define Debug
#include "bench7.h"
// macros to treat B, C as 2-dim arrays as in Fortran version
// Fortran has EQUIVALENCE (B(1,1), BB(1)) but C can't
// B,C [BLOCK][LENBC=5]
#define B(I, J) B[ (J) + LENBC * (I) ]
#define C(I, J) C[ (J) + LENBC * (I) ]
/*
* Benchmark #7 -- Bit Twiddle
*
* This version assumes that the BIT MATRIX MULTIPLY
* functional unit is not available
*
p7 ( uint64 *A, int bufnum,
int mype, int npes, int myfirst, int mylast,
int64 *START, int64 *LENGTH, int64 *NN )
*
* Parameters: Provided by the calling routine:
* A = Buffer of input bit stream, BUFSIZE=11264000 words long
* Bufnum = Number of buffer
* mype
* npes
* Myfirst= first buffer for mype's chunk
* Mylast = last buffer for mype's chunk
*
* Returned by this routine:
* START = Starting position of each stretch of zeros
* LENGTH = Length of each stretch of zeros
* NN = Number of stretches of zeros found
*
*
* Basic algorithm
*
* The benchmark has three parts to it:
* 1) The calculation of streams B and C
* 2) The calculation of streams D and DXORC (to be defined later)
* 3) The calculation of stream E and the locating of sequences of zeros in E
*
* Each part of this problem has a different algorithm, to be described below.
*/
/* Number of bits in a word; this code assumes that this is 64 */
#define WS 64
/*
* Number of contiguous bits to take to form streams B and C; this code
* assumes that this is 5
*/
#define LENBC 5
#define INC 11
#define SKIPBC 6
/* Number of 64-bit words in a buffer of streams B, C, D, DXORC, and E */
#define BLEN (BUFSIZE*LENBC)/INC
/* Number of blocks of words. */
#define BLOCK BUFSIZE/INC
/* Distance between 1st and 2nd positions in DXORC for XORing together */
#define OFF1 37
/* Distance between 1st and 3rd positions in DXORC for XORing together */
#define OFF2 100
/* Buffer length */
#define LBUF 8192
#define NSAVE 3
#ifdef USE_SHMEM
// Copy first words of mype+1's chunk to mype
// Then put after end of mype's last buff.
// declare before p7 so not on stack so remotely accessible by shmem
uint64 BBCC[2*NSAVE], BBCCSAVE[2*NSAVE];
#endif
p7 ( uint64 A[BLOCK][INC], int bufnum,
int mype, int npes, int myfirst, int mylast,
int64 *START, int64 *LENGTH, int64 *NN )
{
long int I, J, K, DLIM;
uint64 IMSK, MSKIT, ZEROES;
uint64 TB, TC, TEMP1, TEMP2, TEMP3;
int OFF3;
int LOCNN = *NN;
int64 LEND, STRT, CUREND;
int64 EBITS = BLEN*WS;
static int64 CURSTART, CURLENGTH;
/* Buffer to hold the B and C streams before packing */
uint64 BBUF[LBUF][INC], CBUF[LBUF][INC];
/* fiddle with these arrays to copy the flexible array bounds in Fortran */
/* These are 4 words longer to hold last 4 words of previous buffer-full */
/* To hold the B stream */
static uint64 BBB[BLEN + 7];
uint64 * BB = &BBB[4]; /* so BB[-4:BLEN+2] */
uint64 * B = BB; /* use B(I,J) in Part 1 */
/* To hold the C stream */
static uint64 CCC[BLEN + 7];
uint64 * CC = &CCC[4]; /* so CC[-4:BLEN+2] */
uint64 * C = CC; /* use C(I,J) in Part 1 */
// save last words from one of my buffers to put before beg. of next buf.
static uint64 BSAVE[NSAVE], CSAVE[NSAVE];
#ifndef USE_SHMEM
// Copy first words of mype+1's chunk to mype
// Then put after end of mype's last buff.
static uint64 BBCC[2*NSAVE], BBCCSAVE[2*NSAVE];
#endif
/* To hold the (partial) DXORC stream */
static uint64 DDXORC[BLEN + 6];
uint64 *DXORC = &DDXORC[2]; /* so DXORC[-2:BLEN+3] */
/* To hold a segment of the full DXORC stream, and of the E stream */
uint64 TTDXORC[5];
uint64 *TDXORC = &TTDXORC[1]; /* so TDXORC[-1:3] */
uint64 EE[3];
uint64 *E = &EE[1]; /* so E[-1:1] */
/* Masks for Part 1 */
uint64 MASKB1[INC] = { 0xF81F03E07C0F81F0, 0x3E07C0F81F03E07C,
0x0F81F03E07C0F81F, 0x03E07C0F81F03E07, 0xC0F81F03E07C0F81,
0xF03E07C0F81F03E0, 0x7C0F81F03E07C0F8, 0x1F03E07C0F81F03E,
0x07C0F81F03E07C0F, 0x81F03E07C0F81F03, 0xE07C0F81F03E07C0 };
uint64 MASKB2[INC] = { 0xFFC003FF000FFC00, 0x3FF000FFC003FF00,
0x0FFC003FF000FFC0, 0x03FF000FFC003FC0, 0xFE001FF8007FE001,
0xFF8007FE001FF800, 0x7FE001FF8007FE00, 0x1FF8007FE001FF80,
0x07FE001FF8007FC0, 0xFC003FF000FFC003, 0xFF000FFC003FF000 };
uint64 MASKB3[INC] = { 0xFFFFF000000FFC00, 0x3FFFFC000003FF00,
0x0FFFFF000000FFC0, 0x03FFFFC000003FC0, 0xFFFF8000007FF000,
0xFFFFE000001FF800, 0x7FFFF8000007FE00, 0x1FFFFE000001FF80,
0x07FFFF8000007FC0, 0xFFFF000000FFF000, 0xFFFFC000003FF000 };
uint64 MASKB4[INC] = { 0xFFFFFFFC00000000, 0x3FFFFFFF00000000,
0x0FFFFFFFC0000000, 0x03FFFFFFC0000000, 0xFFFFFFF000000000,
0xFFFFFFF800000000, 0x7FFFFFFE00000000, 0x1FFFFFFF80000000,
0x07FFFFFFC0000000, 0xFFFFFFF000000000, 0xFFFFFFF000000000 };
uint64 MASKC1[INC] = { 0x07C0F81F03E07C0F, 0x81F03E07C0F81F03,
0xE07C0F81F03E07C0, 0xF81F03E07C0F81F0, 0x3E07C0F81F03E07C,
0x0F81F03E07C0F81F, 0x03E07C0F81F03E07, 0xC0F81F03E07C0F81,
0xF03E07C0F81F03E0, 0x7C0F81F03E07C0F8, 0x1F03E07C0F81F03E };
uint64 MASKC2[INC] = { 0x07FE001FF8007FC0, 0xFC003FF000FFC003,
0xFF000FFC003FF000, 0xFFC003FF000FFC00, 0x3FF000FFC003FF00,
0x0FFC003FF000FFC0, 0x03FF000FFC003FC0, 0xFE001FF8007FE001,
0xFF8007FE001FF800, 0x7FE001FF8007FE00, 0x1FF8007FE001FF80 };
uint64 MASKC3[INC] = { 0x07FFFF8000007FC0, 0xFFFF000000FFF000,
0xFFFFC000003FF000, 0xFFFFF000000FFC00, 0x3FFFFC000003FF00,
0x0FFFFF000000FFC0, 0x03FFFFC000003FC0, 0xFFFF8000007FF000,
0xFFFFE000001FF800, 0x7FFFF8000007FE00, 0x1FFFFE000001FF80 };
uint64 MASKC4[INC] = { 0x07FFFFFFC0000000, 0xFFFFFFF000000000,
0xFFFFFFF000000000, 0xFFFFFFFC00000000, 0x3FFFFFFF00000000,
0x0FFFFFFFC0000000, 0x03FFFFFFC0000000, 0xFFFFFFF000000000,
0xFFFFFFF800000000, 0x7FFFFFFE00000000, 0x1FFFFFFF80000000 };
int SRB[INC] = {0, 28, 56, 20, 54, 18, 46, 10, 38, 8, 36};
int SLB[INC] = {0, 0, 8, 0, 10, 0, 18, 0, 26, 0, 0};
int SRC[INC] = {0, 29, 57, 21, 49, 13, 41, 11, 39, 3, 31};
int SLC[INC] = {5, 0, 7, 0, 15, 0, 23, 0, 25, 0, 0};
#ifdef USE_MPI
MPI_Status status;
#endif
/* Generate mask for Part 3 */
/* Set up mask of valid bits in partial E stream */
OFF3 = OFF2 - WS;
IMSK = 0x7FFFFFFFFFFFFFFE;
MSKIT = IMSK & ( IMSK >> (WS-OFF1) );
MSKIT = MSKIT & ( IMSK >> (WS-OFF3) );
I = 2*WS - NZ - 2;
if (I < 0) I = 0;
MSKIT = MSKIT & ( IMSK >> I );
/* Check number of check bits in this mask */
if (popcnt (MSKIT) < 10)
{
printf ("Too few check bits error.\n\n");
printf ("There are too few check bits for OFF1, OFF2 and N2.\n");
printf ("Code assumes at least 10.\n");
printf ("OFF1 = %d\n", OFF1);
printf ("OFF2 = %d\n", OFF2);
printf ("NZ = %d\n", NZ);
printf ("MSKIT = %16lX\n", MSKIT);
printf ("Density = %d\n", popcnt (MSKIT));
exit (1);
}
if (bufnum == myfirst)
{
// To prevent problems at the beginning of the streams, set the first few
// words preceeding the actual data of both the B and C streams to 0
BB[-4] = 1;
BB[-3] = 1;
BB[-2] = 1;
BB[-1] = 1;
CC[-4] = 0;
CC[-3] = 0;
CC[-2] = 0;
CC[-1] = 0;
// also 2 dummy words at end
BB[BLEN ] = 1;
BB[BLEN+1] = 1;
CC[BLEN ] = 0;
CC[BLEN+1] = 0;
}
else
{
// Not the 1st buffer for this PE, insert the last words of prev buff
for (I = 0; I < NSAVE; I++)
{
BB[I-NSAVE] = BSAVE[I];
CC[I-NSAVE] = CSAVE[I];
}
}
/* Used to prevent positions past the end of the stream. */
LEND = (bufnum+1) * EBITS - NZ;
/*
* Basic algorithm for Part 1:
*
* Notice that for all machines, except ones whose word size is a multiple
* of 11, every eleventh word of A has the same bits moved to the same
* output locations. Therefore if the inner loop is unrolled so that each
* iteration works on 11 words of stream A, the masks and shift counts
* will be the same for each iteration, allowing vectorization and/or
* parallelization.
*
* The basic idea is to mask off the bits of a word going to either B or C,
* push them together, and then shift them into place for packing into the
* output words.
*
* There are up to 7 fields of 5 contiguous bits in each 64-bit word that
* need to be compressed to form the output stream. Using a BIT MATRIX
* MULTIPLY, one could mask and shift each one of these to the correct
* position for output. Without this operation, the most efficient
* approach is a divide and conquer approach whereby first neighboring
* fields are put together, and then all the bits from the word are put
* together near the front of the word. These bits are then shifted into
* place for output. The result is that only about 13 operations (instead
* of 20) are needed per word for a straight forward approach.
*
* This version assumes that the BIT MATRIX MULTIPLY functional unit is not
* available.
*
* Certain masks and shiftcounts are needed for Part 1 to perform its
* calculations. The first mask which needs to be generated is the one
* marking off the first 5 bits, skipping 6, marking the next 5, etc., for
* the B stream. It is contained in array MASKB1. MASKC1 contains a
* similar array for the C stream.
*
* Since we're using the divide and conquer approach, we need masks not just
* for the positions of the bits in the 11 original words, but also for
* 10-bit groups, for the 20-bit groups, and for the final up to 30-bit
* groups. These are contained in arrays MASKB2 through MASKB4 for the B
* stream, and in MASKC2 through MASKC4 for the C stream. These are
* created in data statements at the beginning of this subroutine.
*
* Generate the stream B by compressing and shifting into position the
* 5-bit groups from stream A. This loop assumes that LENBC = 5 and that
* the wordsize is 64.
*/
/* Process the current buffer of data */
/* Do full buffer of A stream, INC*LBUF at a time, so 125 loops */
/* For each buffer length. LBUF == 8192 */
for (K = 0; K < BLOCK; K += LBUF)
{
for (J = 0; J < INC; J++)
{
for (I = 0; I < LBUF; I++)
{
/* Make 5-bit groups */
TB = A[I+K][J] & MASKB1[J];
TC = A[I+K][J] & MASKC1[J];
/* Compress 5-bit groups together */
TB = (TB + (TB << SKIPBC)) & MASKB2[J];
TC = (TC + (TC << SKIPBC)) & MASKC2[J];
/* Now compress every other resulting 10-bit group together */
TB = (TB + (TB << (2*SKIPBC))) & MASKB3[J];
TC = (TC + (TC << (2*SKIPBC))) & MASKC3[J];
BBUF[I][J] = (TB + (TB << (4*SKIPBC))) & MASKB4[J];
CBUF[I][J] = (TC + (TC << (4*SKIPBC))) & MASKC4[J];
}
}
for (I = 0; I < LBUF; I++)
{
/* Now pack up the B stream into single words */
/* Move words 0 and 1 into position */
TB = BBUF[I][0] + ( BBUF[I][1] >> SRB[1] );
/* Move 1st part of word 2 into pos. and store word 0 of output*/
B(I+K, 0) = TB + ( BBUF[I][2] >> SRB[2] );
/* Move last part of word 2 into position */
TB = BBUF[I][2] << SLB[2];
/* Move word 3 into position */
TB = TB + ( BBUF[I][3] >> SRB[3]) ;
/* Move 1st part of word 4 into pos. and store word 1 of output*/
B(I+K, 1) = TB + ( BBUF[I][4] >> SRB[4] );
/* Move last part of word 4 into position */
TB = BBUF[I][4] << SLB[4];
/* Move word 5 into position */
TB = TB + ( BBUF[I][5] >> SRB[5]) ;
/* Move 1st part of word 6 into pos. and store word 2 of output*/
B(I+K, 2) = TB + ( BBUF[I][6] >> SRB[6] );
/* Move last part of word 6 into position */
TB = BBUF[I][6] << SLB[6];
/* Move word 7 into position */
TB = TB + ( BBUF[I][7] >> SRB[7]) ;
/* Move 1st part of word 8 into pos. and store word 3 of output*/
B(I+K, 3) = TB + ( BBUF[I][8] >> SRB[8] );
/* Move last part of word 8 into position */
TB = BBUF[I][8] << SLB[8];
/* Move word 9 into position */
TB = TB + ( BBUF[I][9] >> SRB[9]) ;
/* Move word 10 into position and store word 4 of output */
B(I+K, 4) = TB + ( BBUF[I][10] >> SRB[10] );
/* Now pack up the C stream into single words */
/* Move words 0 and 1 into position */
TC = ( CBUF[I][0] << SLC[0] ) + ( CBUF[I][1] >> SRC[1] );
/* Move 1st part of word 2 into pos. and store word 0 of output*/
C(I+K, 0) = TC + ( CBUF[I][2] >> SRC[2] );
/* Move last part of word 2 into position */
TC = CBUF[I][2] << SLC[2];
/* Move word 3 into position */
TC = TC + ( CBUF[I][3] >> SRC[3]) ;
/* Move 1st part of word 4 into pos. and store word 1 of output*/
C(I+K, 1) = TC + ( CBUF[I][4] >> SRC[4] );
/* Move last part of word 4 into position */
TC = CBUF[I][4] << SLC[4];
/* Move word 5 into position */
TC = TC + ( CBUF[I][5] >> SRC[5]) ;
/* Move 1st part of word 6 into pos. and store word 2 of output*/
C(I+K, 2) = TC + ( CBUF[I][6] >> SRC[6] );
/* Move last part of word 6 into position */
TC = CBUF[I][6] << SLC[6];
/* Move word 7 into position */
TC = TC + ( CBUF[I][7] >> SRC[7]) ;
/* Move 1st part of word 8 into pos. and store word 3 of output*/
C(I+K, 3) = TC + ( CBUF[I][8] >> SRC[8] );
/* Move last part of word 8 into position */
TC = CBUF[I][8] << SLC[8];
/* Move word 9 into position */
TC = TC + ( CBUF[I][9] >> SRC[9]) ;
/* Move word 10 into position and store word 4 of output */
C(I+K, 4) = TC + ( CBUF[I][10] >> SRC[10] );
} /* for (I = 0 to LBUF-1) */
} /* for (K = 0; K < BLOCK; K += LBUF) */
/* We have finished generating BLEN=5*1024000 words of BB, CC */
if ( (npes > 1) && (bufnum == myfirst) )
{
// First time through, get first 3 words of BB, CC from mype+1
// Then last time through all but npes-1 have the overlap area
BBCC[0] = BB[0];
BBCC[1] = BB[1];
BBCC[2] = BB[2];
BBCC[3] = CC[0];
BBCC[4] = CC[1];
BBCC[5] = CC[2];
#ifdef USE_SHMEM
shmem_barrier_all();
if (mype < npes-1)
shmem_get64 ( BBCCSAVE, BBCC, 6, mype+1 );
shmem_barrier_all();
#endif
#ifdef USE_MPI
MPI_Barrier(MPI_COMM_WORLD);
if (mype < npes-1)
MPI_Recv( BBCCSAVE, 6, MPI_INT64, mype+1,
123, MPI_COMM_WORLD, &status );
if (mype > 0)
MPI_Send( BBCC, 6, MPI_INT64, mype-1,
123, MPI_COMM_WORLD );
MPI_Barrier(MPI_COMM_WORLD);
#endif
}
/*
* Basic algorithm for Part 2:
*
* Define DXORC[i] = C[i] XOR D[i]. Because
* E[i] = C[i] XOR D[i] XOR C[i+37] XOR D[i+37] XOR C[i+100] XOR D[i+100]
* = DXORC[i] XOR DXORC[i+37] XOR DXORC[i+100] ,
* what we really want to compute in Part 2 is DXORC.
*
* DXORC[i] = C[i] XOR D[i]
* = C[i] XOR ( (C[i] ^ B[i+1]) XOR (~C[i] ^ B[i-1]) )
* = ( C[i] ^ ~B[i+1] ) XOR (~C[i] ^ B[i-1])
* The rest is relatively straightforward.
* Shifts are used to align bits within words for logical operations.
*
* CLEVER TIMESAVER:
* Since, in most cases, it's not necessary to compute every bit of the E
* stream (and thus of the DXORC stream) in order to reject the
* hypothesis of NZ consecutive zeros in the E stream (though it will
* be necessary to compute these bits when confirming this hyphothesis)
* only the middle 62 bits of each word of DXORC will be computed at
* this point. This results in a savings of four operations out of the
* 11 needed to compute DXORC completely.
*
*/
DLIM = BLEN;
// special handling if bufnum = mylast - do 3 more words
// --- but not if mype = npes-1
if ( (bufnum == mylast) && (mype < npes-1) )
{
DLIM = BLEN + 3;
BB[BLEN ] = BBCCSAVE[0];
BB[BLEN+1] = BBCCSAVE[1];
BB[BLEN+2] = BBCCSAVE[2];
CC[BLEN ] = BBCCSAVE[3];
CC[BLEN+1] = BBCCSAVE[4];
CC[BLEN+2] = BBCCSAVE[5];
}
/* Calculate D and DXORC */
for (I = -2; I < DLIM; I++)
{
TEMP1 = CC[I] & ( ~BB[I] << 1 );
TEMP2 = ~CC[I] & ( BB[I] >> 1 );
DXORC[I] = TEMP1 ^ TEMP2;
}
/*
* Basic algorithm for Part 3:
*
* Notice that for wordsize WS, NZ-WS+1 bits starting at any location in
* the word must be zero before there can be NZ zeroes in a row. Since
* E does not need to be saved, some work can be eliminated by only
* computing a portion of each word, looking for all zeroes in that
* portion. The portion should be longer than 10 bits, otherwise too
* many false starts will happen, and shorter than NZ-WS bits, otherwise
* some sequences might be missed. Once an all zero portion is found,
* local words of DXORC and E are computed in full to determine if
* there are NZ consecutive zeroes.
*
* This code assumes that NZ,OFF1 and OFF2 are such that at least 10 bits
* of check are obtained. It is also assumed that OFF1 and OFF2 - WS
* are both greater than 32. If not, it might be better to take the
* check bits from the left end of the word of E, rather than from the
* right (as is done in the following code). For the given values of
* OFF1 and OFF2, NZ must be at least 74, to provide 10 bits of check.
*
*
* Test for potential zero stretches. The "non-structured" coding is to
* increase efficiency when the probability of the if statement is very rare.
*/
I = -2;
label100:
for (I = I; I <= DLIM - 2; I++)
{
TEMP1 = DXORC[I+1] >> (WS - OFF1);
TEMP2 = DXORC[I+2] >> (WS - OFF3);
TEMP3 = TEMP1 ^ TEMP2 ^ DXORC[I];
if ((TEMP3 & MSKIT) == 0)
goto label120;
}
goto label150;
label120:
// An all-zero portion was found, check to see if NZ zeroes are there.
// Calculate the full values of DXORC for 5 words around this position.
for (J = -1; J <= 3; J++)
{
/* TEMP1 has B(i+1) -- bits start on left!! */
TEMP1 = (B[I+J] << 1) + (B[I+J+1] >> WS-1);
/* TEMP2 has B(i-1) */
TEMP2 = (B[I+J-1] << WS-1) + (B[I+J] >> 1);
/* each bit has C(i)*~B(i+1) ^ ~C(i)*B(i-1) */
TDXORC[J] = ( C[I+J] & (~TEMP1) ) ^ ( ~C[I+J] & TEMP2 );
}
// Now calculate the full E stream for 3 words around this position
for (J = -1; J <= 1; J++)
{
TEMP1 = (TDXORC[J ] << OFF1) + (TDXORC[J+1] >> (WS-OFF1));
TEMP2 = (TDXORC[J+1] << OFF3) + (TDXORC[J+2] >> (WS-OFF3));
E[J] = TDXORC[J] ^ TEMP1 ^ TEMP2;
}
if (E[0] == 0)
{
// Current word is all zeros, get trailing zeroes of previous word
if (E[-1] == 0)
ZEROES = 2*WS;
else
ZEROES = 2*WS - (1 + ( popcnt (E[-1] ^ (-E[-1])) ) );
}
else
{
// Else get trailing zeros of current word */
ZEROES = WS - (1 + ( popcnt (E[0] ^ (-E[0])) ) );
}
// Now get start position and add in leading zeros of following word
STRT = bufnum*EBITS + (I+1)*WS - ZEROES;
ZEROES = ZEROES + (leadz (E[1]));
// Store start position and length
// also look for overlapped regions of zeros and clean up the results
if (ZEROES >= NZ)
{
if (LOCNN >= 0)
CUREND = CURSTART + CURLENGTH;
else
CUREND = 0;
/* Check for overlapped regions in results, and collapse */
if (STRT <= CUREND)
{
CURLENGTH = STRT + ZEROES - CURSTART;
if (LOCNN < MAXANS)
LENGTH[LOCNN] = CURLENGTH;
}
else
{
// No overlap. Check that section is not off end of actual stream.
if (STRT <= LEND)
{
LOCNN += 1;
CURSTART = STRT;
CURLENGTH = ZEROES;
/* Check to prevent overwriting the START, LENGTH arrays */
if (LOCNN < MAXANS)
{
START[LOCNN] = STRT;
LENGTH[LOCNN] = ZEROES;
}
}
}
}
I++;
if (I <= DLIM - 1)
goto label100;
label150:
/* save the last words for next buffer pass */
for (I = 0; I < NSAVE; I++)
{
BSAVE[I] = BB[BLEN+I-NSAVE];
CSAVE[I] = CC[BLEN+I-NSAVE];
}
*NN = LOCNN;
return;
}