//#define Debug #include "bench7.h" // macros to treat B, C as 2-dim arrays as in Fortran version // Fortran has EQUIVALENCE (B(1,1), BB(1)) but C can't // B,C [BLOCK][LENBC=5] #define B(I, J) B[ (J) + LENBC * (I) ] #define C(I, J) C[ (J) + LENBC * (I) ] /* * Benchmark #7 -- Bit Twiddle * * This version assumes that the BIT MATRIX MULTIPLY * functional unit is not available * p7 ( uint64 *A, int bufnum, int mype, int npes, int myfirst, int mylast, int64 *START, int64 *LENGTH, int64 *NN ) * * Parameters: Provided by the calling routine: * A = Buffer of input bit stream, BUFSIZE=11264000 words long * Bufnum = Number of buffer * mype * npes * Myfirst= first buffer for mype's chunk * Mylast = last buffer for mype's chunk * * Returned by this routine: * START = Starting position of each stretch of zeros * LENGTH = Length of each stretch of zeros * NN = Number of stretches of zeros found * * * Basic algorithm * * The benchmark has three parts to it: * 1) The calculation of streams B and C * 2) The calculation of streams D and DXORC (to be defined later) * 3) The calculation of stream E and the locating of sequences of zeros in E * * Each part of this problem has a different algorithm, to be described below. */ /* Number of bits in a word; this code assumes that this is 64 */ #define WS 64 /* * Number of contiguous bits to take to form streams B and C; this code * assumes that this is 5 */ #define LENBC 5 #define INC 11 #define SKIPBC 6 /* Number of 64-bit words in a buffer of streams B, C, D, DXORC, and E */ #define BLEN (BUFSIZE*LENBC)/INC /* Number of blocks of words. */ #define BLOCK BUFSIZE/INC /* Distance between 1st and 2nd positions in DXORC for XORing together */ #define OFF1 37 /* Distance between 1st and 3rd positions in DXORC for XORing together */ #define OFF2 100 /* Buffer length */ #define LBUF 8192 #define NSAVE 3 #ifdef USE_SHMEM // Copy first words of mype+1's chunk to mype // Then put after end of mype's last buff. // declare before p7 so not on stack so remotely accessible by shmem uint64 BBCC[2*NSAVE], BBCCSAVE[2*NSAVE]; #endif p7 ( uint64 A[BLOCK][INC], int bufnum, int mype, int npes, int myfirst, int mylast, int64 *START, int64 *LENGTH, int64 *NN ) { long int I, J, K, DLIM; uint64 IMSK, MSKIT, ZEROES; uint64 TB, TC, TEMP1, TEMP2, TEMP3; int OFF3; int LOCNN = *NN; int64 LEND, STRT, CUREND; int64 EBITS = BLEN*WS; static int64 CURSTART, CURLENGTH; /* Buffer to hold the B and C streams before packing */ uint64 BBUF[LBUF][INC], CBUF[LBUF][INC]; /* fiddle with these arrays to copy the flexible array bounds in Fortran */ /* These are 4 words longer to hold last 4 words of previous buffer-full */ /* To hold the B stream */ static uint64 BBB[BLEN + 7]; uint64 * BB = &BBB[4]; /* so BB[-4:BLEN+2] */ uint64 * B = BB; /* use B(I,J) in Part 1 */ /* To hold the C stream */ static uint64 CCC[BLEN + 7]; uint64 * CC = &CCC[4]; /* so CC[-4:BLEN+2] */ uint64 * C = CC; /* use C(I,J) in Part 1 */ // save last words from one of my buffers to put before beg. of next buf. static uint64 BSAVE[NSAVE], CSAVE[NSAVE]; #ifndef USE_SHMEM // Copy first words of mype+1's chunk to mype // Then put after end of mype's last buff. static uint64 BBCC[2*NSAVE], BBCCSAVE[2*NSAVE]; #endif /* To hold the (partial) DXORC stream */ static uint64 DDXORC[BLEN + 6]; uint64 *DXORC = &DDXORC[2]; /* so DXORC[-2:BLEN+3] */ /* To hold a segment of the full DXORC stream, and of the E stream */ uint64 TTDXORC[5]; uint64 *TDXORC = &TTDXORC[1]; /* so TDXORC[-1:3] */ uint64 EE[3]; uint64 *E = &EE[1]; /* so E[-1:1] */ /* Masks for Part 1 */ uint64 MASKB1[INC] = { 0xF81F03E07C0F81F0, 0x3E07C0F81F03E07C, 0x0F81F03E07C0F81F, 0x03E07C0F81F03E07, 0xC0F81F03E07C0F81, 0xF03E07C0F81F03E0, 0x7C0F81F03E07C0F8, 0x1F03E07C0F81F03E, 0x07C0F81F03E07C0F, 0x81F03E07C0F81F03, 0xE07C0F81F03E07C0 }; uint64 MASKB2[INC] = { 0xFFC003FF000FFC00, 0x3FF000FFC003FF00, 0x0FFC003FF000FFC0, 0x03FF000FFC003FC0, 0xFE001FF8007FE001, 0xFF8007FE001FF800, 0x7FE001FF8007FE00, 0x1FF8007FE001FF80, 0x07FE001FF8007FC0, 0xFC003FF000FFC003, 0xFF000FFC003FF000 }; uint64 MASKB3[INC] = { 0xFFFFF000000FFC00, 0x3FFFFC000003FF00, 0x0FFFFF000000FFC0, 0x03FFFFC000003FC0, 0xFFFF8000007FF000, 0xFFFFE000001FF800, 0x7FFFF8000007FE00, 0x1FFFFE000001FF80, 0x07FFFF8000007FC0, 0xFFFF000000FFF000, 0xFFFFC000003FF000 }; uint64 MASKB4[INC] = { 0xFFFFFFFC00000000, 0x3FFFFFFF00000000, 0x0FFFFFFFC0000000, 0x03FFFFFFC0000000, 0xFFFFFFF000000000, 0xFFFFFFF800000000, 0x7FFFFFFE00000000, 0x1FFFFFFF80000000, 0x07FFFFFFC0000000, 0xFFFFFFF000000000, 0xFFFFFFF000000000 }; uint64 MASKC1[INC] = { 0x07C0F81F03E07C0F, 0x81F03E07C0F81F03, 0xE07C0F81F03E07C0, 0xF81F03E07C0F81F0, 0x3E07C0F81F03E07C, 0x0F81F03E07C0F81F, 0x03E07C0F81F03E07, 0xC0F81F03E07C0F81, 0xF03E07C0F81F03E0, 0x7C0F81F03E07C0F8, 0x1F03E07C0F81F03E }; uint64 MASKC2[INC] = { 0x07FE001FF8007FC0, 0xFC003FF000FFC003, 0xFF000FFC003FF000, 0xFFC003FF000FFC00, 0x3FF000FFC003FF00, 0x0FFC003FF000FFC0, 0x03FF000FFC003FC0, 0xFE001FF8007FE001, 0xFF8007FE001FF800, 0x7FE001FF8007FE00, 0x1FF8007FE001FF80 }; uint64 MASKC3[INC] = { 0x07FFFF8000007FC0, 0xFFFF000000FFF000, 0xFFFFC000003FF000, 0xFFFFF000000FFC00, 0x3FFFFC000003FF00, 0x0FFFFF000000FFC0, 0x03FFFFC000003FC0, 0xFFFF8000007FF000, 0xFFFFE000001FF800, 0x7FFFF8000007FE00, 0x1FFFFE000001FF80 }; uint64 MASKC4[INC] = { 0x07FFFFFFC0000000, 0xFFFFFFF000000000, 0xFFFFFFF000000000, 0xFFFFFFFC00000000, 0x3FFFFFFF00000000, 0x0FFFFFFFC0000000, 0x03FFFFFFC0000000, 0xFFFFFFF000000000, 0xFFFFFFF800000000, 0x7FFFFFFE00000000, 0x1FFFFFFF80000000 }; int SRB[INC] = {0, 28, 56, 20, 54, 18, 46, 10, 38, 8, 36}; int SLB[INC] = {0, 0, 8, 0, 10, 0, 18, 0, 26, 0, 0}; int SRC[INC] = {0, 29, 57, 21, 49, 13, 41, 11, 39, 3, 31}; int SLC[INC] = {5, 0, 7, 0, 15, 0, 23, 0, 25, 0, 0}; #ifdef USE_MPI MPI_Status status; #endif /* Generate mask for Part 3 */ /* Set up mask of valid bits in partial E stream */ OFF3 = OFF2 - WS; IMSK = 0x7FFFFFFFFFFFFFFE; MSKIT = IMSK & ( IMSK >> (WS-OFF1) ); MSKIT = MSKIT & ( IMSK >> (WS-OFF3) ); I = 2*WS - NZ - 2; if (I < 0) I = 0; MSKIT = MSKIT & ( IMSK >> I ); /* Check number of check bits in this mask */ if (popcnt (MSKIT) < 10) { printf ("Too few check bits error.\n\n"); printf ("There are too few check bits for OFF1, OFF2 and N2.\n"); printf ("Code assumes at least 10.\n"); printf ("OFF1 = %d\n", OFF1); printf ("OFF2 = %d\n", OFF2); printf ("NZ = %d\n", NZ); printf ("MSKIT = %16lX\n", MSKIT); printf ("Density = %d\n", popcnt (MSKIT)); exit (1); } if (bufnum == myfirst) { // To prevent problems at the beginning of the streams, set the first few // words preceeding the actual data of both the B and C streams to 0 BB[-4] = 1; BB[-3] = 1; BB[-2] = 1; BB[-1] = 1; CC[-4] = 0; CC[-3] = 0; CC[-2] = 0; CC[-1] = 0; // also 2 dummy words at end BB[BLEN ] = 1; BB[BLEN+1] = 1; CC[BLEN ] = 0; CC[BLEN+1] = 0; } else { // Not the 1st buffer for this PE, insert the last words of prev buff for (I = 0; I < NSAVE; I++) { BB[I-NSAVE] = BSAVE[I]; CC[I-NSAVE] = CSAVE[I]; } } /* Used to prevent positions past the end of the stream. */ LEND = (bufnum+1) * EBITS - NZ; /* * Basic algorithm for Part 1: * * Notice that for all machines, except ones whose word size is a multiple * of 11, every eleventh word of A has the same bits moved to the same * output locations. Therefore if the inner loop is unrolled so that each * iteration works on 11 words of stream A, the masks and shift counts * will be the same for each iteration, allowing vectorization and/or * parallelization. * * The basic idea is to mask off the bits of a word going to either B or C, * push them together, and then shift them into place for packing into the * output words. * * There are up to 7 fields of 5 contiguous bits in each 64-bit word that * need to be compressed to form the output stream. Using a BIT MATRIX * MULTIPLY, one could mask and shift each one of these to the correct * position for output. Without this operation, the most efficient * approach is a divide and conquer approach whereby first neighboring * fields are put together, and then all the bits from the word are put * together near the front of the word. These bits are then shifted into * place for output. The result is that only about 13 operations (instead * of 20) are needed per word for a straight forward approach. * * This version assumes that the BIT MATRIX MULTIPLY functional unit is not * available. * * Certain masks and shiftcounts are needed for Part 1 to perform its * calculations. The first mask which needs to be generated is the one * marking off the first 5 bits, skipping 6, marking the next 5, etc., for * the B stream. It is contained in array MASKB1. MASKC1 contains a * similar array for the C stream. * * Since we're using the divide and conquer approach, we need masks not just * for the positions of the bits in the 11 original words, but also for * 10-bit groups, for the 20-bit groups, and for the final up to 30-bit * groups. These are contained in arrays MASKB2 through MASKB4 for the B * stream, and in MASKC2 through MASKC4 for the C stream. These are * created in data statements at the beginning of this subroutine. * * Generate the stream B by compressing and shifting into position the * 5-bit groups from stream A. This loop assumes that LENBC = 5 and that * the wordsize is 64. */ /* Process the current buffer of data */ /* Do full buffer of A stream, INC*LBUF at a time, so 125 loops */ /* For each buffer length. LBUF == 8192 */ for (K = 0; K < BLOCK; K += LBUF) { for (J = 0; J < INC; J++) { for (I = 0; I < LBUF; I++) { /* Make 5-bit groups */ TB = A[I+K][J] & MASKB1[J]; TC = A[I+K][J] & MASKC1[J]; /* Compress 5-bit groups together */ TB = (TB + (TB << SKIPBC)) & MASKB2[J]; TC = (TC + (TC << SKIPBC)) & MASKC2[J]; /* Now compress every other resulting 10-bit group together */ TB = (TB + (TB << (2*SKIPBC))) & MASKB3[J]; TC = (TC + (TC << (2*SKIPBC))) & MASKC3[J]; BBUF[I][J] = (TB + (TB << (4*SKIPBC))) & MASKB4[J]; CBUF[I][J] = (TC + (TC << (4*SKIPBC))) & MASKC4[J]; } } for (I = 0; I < LBUF; I++) { /* Now pack up the B stream into single words */ /* Move words 0 and 1 into position */ TB = BBUF[I][0] + ( BBUF[I][1] >> SRB[1] ); /* Move 1st part of word 2 into pos. and store word 0 of output*/ B(I+K, 0) = TB + ( BBUF[I][2] >> SRB[2] ); /* Move last part of word 2 into position */ TB = BBUF[I][2] << SLB[2]; /* Move word 3 into position */ TB = TB + ( BBUF[I][3] >> SRB[3]) ; /* Move 1st part of word 4 into pos. and store word 1 of output*/ B(I+K, 1) = TB + ( BBUF[I][4] >> SRB[4] ); /* Move last part of word 4 into position */ TB = BBUF[I][4] << SLB[4]; /* Move word 5 into position */ TB = TB + ( BBUF[I][5] >> SRB[5]) ; /* Move 1st part of word 6 into pos. and store word 2 of output*/ B(I+K, 2) = TB + ( BBUF[I][6] >> SRB[6] ); /* Move last part of word 6 into position */ TB = BBUF[I][6] << SLB[6]; /* Move word 7 into position */ TB = TB + ( BBUF[I][7] >> SRB[7]) ; /* Move 1st part of word 8 into pos. and store word 3 of output*/ B(I+K, 3) = TB + ( BBUF[I][8] >> SRB[8] ); /* Move last part of word 8 into position */ TB = BBUF[I][8] << SLB[8]; /* Move word 9 into position */ TB = TB + ( BBUF[I][9] >> SRB[9]) ; /* Move word 10 into position and store word 4 of output */ B(I+K, 4) = TB + ( BBUF[I][10] >> SRB[10] ); /* Now pack up the C stream into single words */ /* Move words 0 and 1 into position */ TC = ( CBUF[I][0] << SLC[0] ) + ( CBUF[I][1] >> SRC[1] ); /* Move 1st part of word 2 into pos. and store word 0 of output*/ C(I+K, 0) = TC + ( CBUF[I][2] >> SRC[2] ); /* Move last part of word 2 into position */ TC = CBUF[I][2] << SLC[2]; /* Move word 3 into position */ TC = TC + ( CBUF[I][3] >> SRC[3]) ; /* Move 1st part of word 4 into pos. and store word 1 of output*/ C(I+K, 1) = TC + ( CBUF[I][4] >> SRC[4] ); /* Move last part of word 4 into position */ TC = CBUF[I][4] << SLC[4]; /* Move word 5 into position */ TC = TC + ( CBUF[I][5] >> SRC[5]) ; /* Move 1st part of word 6 into pos. and store word 2 of output*/ C(I+K, 2) = TC + ( CBUF[I][6] >> SRC[6] ); /* Move last part of word 6 into position */ TC = CBUF[I][6] << SLC[6]; /* Move word 7 into position */ TC = TC + ( CBUF[I][7] >> SRC[7]) ; /* Move 1st part of word 8 into pos. and store word 3 of output*/ C(I+K, 3) = TC + ( CBUF[I][8] >> SRC[8] ); /* Move last part of word 8 into position */ TC = CBUF[I][8] << SLC[8]; /* Move word 9 into position */ TC = TC + ( CBUF[I][9] >> SRC[9]) ; /* Move word 10 into position and store word 4 of output */ C(I+K, 4) = TC + ( CBUF[I][10] >> SRC[10] ); } /* for (I = 0 to LBUF-1) */ } /* for (K = 0; K < BLOCK; K += LBUF) */ /* We have finished generating BLEN=5*1024000 words of BB, CC */ if ( (npes > 1) && (bufnum == myfirst) ) { // First time through, get first 3 words of BB, CC from mype+1 // Then last time through all but npes-1 have the overlap area BBCC[0] = BB[0]; BBCC[1] = BB[1]; BBCC[2] = BB[2]; BBCC[3] = CC[0]; BBCC[4] = CC[1]; BBCC[5] = CC[2]; #ifdef USE_SHMEM shmem_barrier_all(); if (mype < npes-1) shmem_get64 ( BBCCSAVE, BBCC, 6, mype+1 ); shmem_barrier_all(); #endif #ifdef USE_MPI MPI_Barrier(MPI_COMM_WORLD); if (mype < npes-1) MPI_Recv( BBCCSAVE, 6, MPI_INT64, mype+1, 123, MPI_COMM_WORLD, &status ); if (mype > 0) MPI_Send( BBCC, 6, MPI_INT64, mype-1, 123, MPI_COMM_WORLD ); MPI_Barrier(MPI_COMM_WORLD); #endif } /* * Basic algorithm for Part 2: * * Define DXORC[i] = C[i] XOR D[i]. Because * E[i] = C[i] XOR D[i] XOR C[i+37] XOR D[i+37] XOR C[i+100] XOR D[i+100] * = DXORC[i] XOR DXORC[i+37] XOR DXORC[i+100] , * what we really want to compute in Part 2 is DXORC. * * DXORC[i] = C[i] XOR D[i] * = C[i] XOR ( (C[i] ^ B[i+1]) XOR (~C[i] ^ B[i-1]) ) * = ( C[i] ^ ~B[i+1] ) XOR (~C[i] ^ B[i-1]) * The rest is relatively straightforward. * Shifts are used to align bits within words for logical operations. * * CLEVER TIMESAVER: * Since, in most cases, it's not necessary to compute every bit of the E * stream (and thus of the DXORC stream) in order to reject the * hypothesis of NZ consecutive zeros in the E stream (though it will * be necessary to compute these bits when confirming this hyphothesis) * only the middle 62 bits of each word of DXORC will be computed at * this point. This results in a savings of four operations out of the * 11 needed to compute DXORC completely. * */ DLIM = BLEN; // special handling if bufnum = mylast - do 3 more words // --- but not if mype = npes-1 if ( (bufnum == mylast) && (mype < npes-1) ) { DLIM = BLEN + 3; BB[BLEN ] = BBCCSAVE[0]; BB[BLEN+1] = BBCCSAVE[1]; BB[BLEN+2] = BBCCSAVE[2]; CC[BLEN ] = BBCCSAVE[3]; CC[BLEN+1] = BBCCSAVE[4]; CC[BLEN+2] = BBCCSAVE[5]; } /* Calculate D and DXORC */ for (I = -2; I < DLIM; I++) { TEMP1 = CC[I] & ( ~BB[I] << 1 ); TEMP2 = ~CC[I] & ( BB[I] >> 1 ); DXORC[I] = TEMP1 ^ TEMP2; } /* * Basic algorithm for Part 3: * * Notice that for wordsize WS, NZ-WS+1 bits starting at any location in * the word must be zero before there can be NZ zeroes in a row. Since * E does not need to be saved, some work can be eliminated by only * computing a portion of each word, looking for all zeroes in that * portion. The portion should be longer than 10 bits, otherwise too * many false starts will happen, and shorter than NZ-WS bits, otherwise * some sequences might be missed. Once an all zero portion is found, * local words of DXORC and E are computed in full to determine if * there are NZ consecutive zeroes. * * This code assumes that NZ,OFF1 and OFF2 are such that at least 10 bits * of check are obtained. It is also assumed that OFF1 and OFF2 - WS * are both greater than 32. If not, it might be better to take the * check bits from the left end of the word of E, rather than from the * right (as is done in the following code). For the given values of * OFF1 and OFF2, NZ must be at least 74, to provide 10 bits of check. * * * Test for potential zero stretches. The "non-structured" coding is to * increase efficiency when the probability of the if statement is very rare. */ I = -2; label100: for (I = I; I <= DLIM - 2; I++) { TEMP1 = DXORC[I+1] >> (WS - OFF1); TEMP2 = DXORC[I+2] >> (WS - OFF3); TEMP3 = TEMP1 ^ TEMP2 ^ DXORC[I]; if ((TEMP3 & MSKIT) == 0) goto label120; } goto label150; label120: // An all-zero portion was found, check to see if NZ zeroes are there. // Calculate the full values of DXORC for 5 words around this position. for (J = -1; J <= 3; J++) { /* TEMP1 has B(i+1) -- bits start on left!! */ TEMP1 = (B[I+J] << 1) + (B[I+J+1] >> WS-1); /* TEMP2 has B(i-1) */ TEMP2 = (B[I+J-1] << WS-1) + (B[I+J] >> 1); /* each bit has C(i)*~B(i+1) ^ ~C(i)*B(i-1) */ TDXORC[J] = ( C[I+J] & (~TEMP1) ) ^ ( ~C[I+J] & TEMP2 ); } // Now calculate the full E stream for 3 words around this position for (J = -1; J <= 1; J++) { TEMP1 = (TDXORC[J ] << OFF1) + (TDXORC[J+1] >> (WS-OFF1)); TEMP2 = (TDXORC[J+1] << OFF3) + (TDXORC[J+2] >> (WS-OFF3)); E[J] = TDXORC[J] ^ TEMP1 ^ TEMP2; } if (E[0] == 0) { // Current word is all zeros, get trailing zeroes of previous word if (E[-1] == 0) ZEROES = 2*WS; else ZEROES = 2*WS - (1 + ( popcnt (E[-1] ^ (-E[-1])) ) ); } else { // Else get trailing zeros of current word */ ZEROES = WS - (1 + ( popcnt (E[0] ^ (-E[0])) ) ); } // Now get start position and add in leading zeros of following word STRT = bufnum*EBITS + (I+1)*WS - ZEROES; ZEROES = ZEROES + (leadz (E[1])); // Store start position and length // also look for overlapped regions of zeros and clean up the results if (ZEROES >= NZ) { if (LOCNN >= 0) CUREND = CURSTART + CURLENGTH; else CUREND = 0; /* Check for overlapped regions in results, and collapse */ if (STRT <= CUREND) { CURLENGTH = STRT + ZEROES - CURSTART; if (LOCNN < MAXANS) LENGTH[LOCNN] = CURLENGTH; } else { // No overlap. Check that section is not off end of actual stream. if (STRT <= LEND) { LOCNN += 1; CURSTART = STRT; CURLENGTH = ZEROES; /* Check to prevent overwriting the START, LENGTH arrays */ if (LOCNN < MAXANS) { START[LOCNN] = STRT; LENGTH[LOCNN] = ZEROES; } } } } I++; if (I <= DLIM - 1) goto label100; label150: /* save the last words for next buffer pass */ for (I = 0; I < NSAVE; I++) { BSAVE[I] = BB[BLEN+I-NSAVE]; CSAVE[I] = CC[BLEN+I-NSAVE]; } *NN = LOCNN; return; }