SUBROUTINE P7(A,L,NZ,START,LENGTH,NN,ERR) c cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc c c Benchmark #7 -- Bit Twiddle c c *** This version uses the BIT MATRIX MULTIPLY functional unit *** c c Parameters: c c Provided by the calling routine: c A = Input bit stream c L = Number of 64 bit words in bit stream A c NZ = Minimum length of zeros before reporting results c c Returned by this routine: c START = Starting position of each stretch of zeros c LENGTH = Length of each stretch of zeros c NN = Number of stretches of zeros found c ERR = Flag indicating whether there is array overflow c ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc c c c Basic algorithm c c The benchmark has three parts to it: c 1) The calculation of streams B and C c 2) The calculation of streams D and DXORC (to be defined later) c 3) The calculation of stream E and the locating of sequences of c zeros in E c c Each part of this problem has a different algorithm, which c will be described below. c cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc c IMPLICIT INTEGER (A-Z) c c c Cc Uncomment the following for machines without the POPCNT intrinsic. Cc Also, uncomment the code for INITPC and POPCNT in the UTIL routine Cc and the call to INITPC in M7. c C COMMON /POPS/POPS(0:65535) c c c Cc Uncomment the following for machines without the LEADZ intrinsic. Cc Also, uncomment the code for INITLZ in the UTIL routine Cc and the call to INITLZ in M7. c C COMMON /LZCNT/LZCNT(0:65535) c c c c Length of the A stream buffer in 64 bit words PARAMETER (MN=11 264 000) c c Number of buffers to use PARAMETER (NBUF=2) c c Masks to mask off the high and low order 32 bits, used in generation of c the B and C streams PARAMETER (BMASK = Z'FFFFFFFF00000000') PARAMETER (CMASK = Z'FFFFFFFF') c c Number of contiguous bits to take to form streams B and C; this c code assumes that this is 5 PARAMETER (LENBC=5) c c Number of bits between beginning bits of a LENB-bit or LENC-bit group; c this code assumes that this is 11 PARAMETER (INC=11) c c BMM matrix to extract 5 skip 6, starting in the first position DIMENSION IM(40), IMM(64,INC) c c Maximum number of 64-bit words in streams B, C, D, DXORC, and E PARAMETER (BLEN=(MN*LENBC)/INC) c c Number of bits in a word; this code assumes that this is 64 PARAMETER (WS=64) c c Number of blocks of INC words PARAMETER (BLOCK = MN/INC) c c Distance between 1st and 2nd positions in DXORC for XORing together PARAMETER (OFF1=37) c c Distance between 1st and 3rd positions in DXORC for XORing together PARAMETER (OFF2=100) c DIMENSION A(INC,BLOCK,NBUF) c c Buffer to hold the B and C streams before packing PARAMETER (LBUF = 8192) DIMENSION BUF(INC,LBUF) c c To hold the B stream DIMENSION B(LENBC,BLOCK), BB(-3:LENBC*BLOCK) EQUIVALENCE (B(1,1),BB(1)) c c To hold the C stream DIMENSION C(LENBC,BLOCK), CC(-3:LENBC*BLOCK) EQUIVALENCE (C(1,1),CC(1)) c c To hold the DXORC stream DIMENSION DXORC(-1:LENBC*BLOCK) c c To hold a segment of the full DXORC stream, and of the E stream DIMENSION TDXORC(-1:3) DIMENSION TE(-1:1) c c Arrays to hold the results DIMENSION START(1),LENGTH(1) c Dummy array to allow loading of BMM DIMENSION M@BXX(64) c c Variables used to switch between buffers INTEGER FGBUF, BGBUF, TMP c c Shift values for Part 1 DIMENSION SRB(INC), SRC(INC), SLB(INC), SLC(INC) DATA SRB /0, 30, 60, 26, 54, 18, 47, 13, 43, 8, 36/ DATA SLB /0, 0, 4, 0, 10, 0, 17, 0, 21, 0, 0/ DATA SRC /32, 3, 25, 11, 19, 15, 15, 21, 7, 28, 2/ DATA SLC /0, 0, 39, 0, 45, 0, 49, 0, 57, 0, 0/ c c Generate the BMM fills for Part 1 c Fill array IM IM(1) = Z'8000000000000000' DO 10 I=2,5 IM(I) = SHIFTR(IM(I-1), 1) 10 CONTINUE DO 20 I=1,7 DO 15 J=1,5 IM(5*I+J) = SHIFTR(IM(5*I+J-5), 11) 15 CONTINUE 20 CONTINUE c c Now use IM as a pattern and initialize IMM for each pattern DO 25 I=1,32 IMM(I,1) = IM(I) IMM(I+32,1) = SHIFTR(IM(I) , 5) 25 CONTINUE c DO 30 I=1,32 IMM(I,2) = SHIFTR(IM(I),2) IMM(I+32,2) = SHIFTL(IM(I+4),4) 30 CONTINUE IMM(59,2) = SHIFTR(IM(29),4) IMM(60,2) = SHIFTR(IM(30),4) DO 35 I=1,32 IMM(I,3) = SHIFTR(IM(I),4) IMM(I+32,3) = SHIFTL(IM(I+2),2) 35 CONTINUE c DO 40 I=1,32 IMM(I,4) = SHIFTR(IM(I), 6) IMM(I+32,4) = IM(I) 40 CONTINUE c DO 45 I=1,32 IMM(I,5) = SHIFTL(IM(I+3),3) IMM(I+32,5) = SHIFTR(IM(I), 2) 45 CONTINUE IMM(28,5) = SHIFTR(IM(26), 8) c DO 50 I=1,32 IMM(I,6) = SHIFTL(IM(I+1), 1) IMM(I+32,6) = SHIFTR(IM(I),4) 50 CONTINUE c DO 55 I=1,32 IMM(I,7) = SHIFTR(IM(I),1) IMM(I+32,7) = SHIFTR(IM(I),6) 55 CONTINUE c DO 60 I=1,32 IMM(I,8) = SHIFTR(IM(I),3) IMM(I+32,8) = SHIFTL(IM(I+3), 3) 60 CONTINUE IMM(60,8) = SHIFTR(IM(26), 8) c DO 65 I=1,32 IMM(I,9) = SHIFTR(IM(I),5) IMM(I+32,9) = SHIFTL(IM(I+1), 1) 65 CONTINUE c DO 70 I=1,32 IMM(I,10) = SHIFTL(IM(I+4), 4) IMM(I+32,10) = SHIFTR(IM(I),1) 70 CONTINUE IMM(27,10) = SHIFTR(IM(29), 4) IMM(28,10) = SHIFTR(IM(30), 4) c DO 75 I=1,32 IMM(I,11) = SHIFTL(IM(I+2), 2) IMM(I+32,11) = SHIFTR(IM(I),3) 75 CONTINUE c c Generate mask for Part 3 c Set up mask of valid bits in partial E stream OFF3 = OFF2 - WS IMSK = Z'7FFFFFFFFFFFFFFE' JMSK = SHIFTR(IMSK,WS-OFF1) MSKIT = IAND(IMSK, JMSK) JMSK = SHIFTR(IMSK,WS-OFF3) MSKIT = IAND(MSKIT, JMSK) KK = MAX(0, 2*WS - NZ - 2) JMSK = SHIFTR(IMSK,KK) MSKIT = IAND(MSKIT, JMSK) c NN = 0 c c Check number of check bits in this mask IF( POPCNT(MSKIT) .LT. 10) THEN PRINT 77, OFF1, OFF2, NZ, MSKIT, POPCNT(MSKIT) 77 FORMAT(//,'There are too few check bits for the chosen',/, \$ ' values of OFF1, OFF2, NZ. Code asumes at least 10',/, \$ 'OFF1 = ',I5,/, \$ 'OFF2 = ',I5,/, \$ 'NZ = ',I5,/, \$ 'MSKIT = ',Z16,/, \$ 'Density = ',I5) STOP ENDIF c c To prevent problems at the beginning of the streams, set the first few c words preceeding the actual data of both the B and C streams to 0 BB(-3) = 0 BB(-2) = 0 BB(-1) = 0 BB(0) = 0 CC(-3) = 0 CC(-2) = 0 CC(-1) = 0 CC(0) = 0 c Used to prevent positions past the end of the stream. LEND = L*LENBC*WS/INC - NZ c c ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc c c Basic algorithm for Part 1: c c Notice that for all machines except for ones whose word size is a c multiple of 11, every eleventh word of A has the same bits moved to c the same output locations. Therefore if the inner loop is unrolled c so that each iteration works on 11 words of stream A, the masks c and shift counts will be the same for each iteration, allowing c vectorization and/or parallelization. c c The basic idea of this part is to use the BIT MATRIX MULTIPLY to mask c off and push together the bits of a word which go to the B stream c into the high order 32 bits of the result, and those which go to c the C stream into the low order 32 bits. These are then shifted c into place and packed into the final B and C streams. c c cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc c c Generate the stream B by compressing and shifting into the high order c bits the 5-bit groups from stream A. At at the same time, generate c stream C by compressing and shifting into the low order bits the c 5-bit groups from stream A. Use the BIT MATRIX MULTIPLY to do this. c This loop assumes that LENBC = 5 and that the wordsize is 64. c SKIPBC = INC - LENBC c c There may be so little data that we didn't use the secondary storage IF(MN .GE. L) THEN MM = L MMNXT = 0 c Secondary storage was needed ELSE MM = MN MMNXT = MIN(MN, L-MN) ENDIF c c Rewind the secondary storage file IF (L .GT. MN) THEN CALL SKTMP(1,0) CALL SYNTMP(1) ENDIF c c Initialize buffer variables FGBUF = 1 BGBUF = 2 TMP = -1 c IF (L .GT. MN) THEN CALL RDTMP(1,A(1,1,FGBUF),MM) CALL SYNTMP(1) ENDIF c c Set flag to test for array overflow ERR = 0 c c For each buffer's worth of data DO 220 JBUF = 1,L,MN c c Try to overlap reads with the work IF(MMNXT .GT. 0) THEN CALL RDTMP(1,A(1,1,BGBUF),MMNXT) ENDIF c c Process the current data LL = 1 + (MM-1)/INC c c For each buffer length DO 140 K=0,LL-1,LBUF LLBUF = MIN(LBUF, LL-K) DO 120 J=1,INC c c Load the BMM DO 100 I=1,64 M@BXX(I) = M@LD(IMM(I,J)) 100 CONTINUE c c Do the multiply DO 110 I = 1, LLBUF BUF(J,I) = M@MX(A(J,I+K,FGBUF)) 110 CONTINUE 120 CONTINUE c DO 130 I=1, LLBUF c c Now pack up the B and C streams into single words c c Get word 1 TB = IAND(BUF(1,I), BMASK) TC = IAND(BUF(1,I), CMASK) TC = SHIFTL(TC, SRC(1)) c c Move word 2 into position TBB = IAND(BUF(2,I), BMASK) TCC = IAND(BUF(2,I), CMASK) TB = TB + SHIFTR(TBB, SRB(2)) TC = TC + SHIFTL(TCC, SRC(2)) c c Move first part of word 3 into position and output 1st word of output TBB = IAND(BUF(3,I), BMASK) TCC = IAND(BUF(3,I), CMASK) B(1,I+K) = TB + SHIFTR(TBB, SRB(3)) C(1,I+K) = TC + SHIFTR(TCC, SRC(3)) c c Move last part of word 3 into position TB = SHIFTL(TBB, SLB(3)) TC = SHIFTL(TCC, SLC(3)) c c Move word 4 into position TBB = IAND(BUF(4,I), BMASK) TCC = IAND(BUF(4,I), CMASK) TB = TB + SHIFTR(TBB, SRB(4)) TC = TC + SHIFTL(TCC, SRC(4)) c c Move first part of word 5 into position and output 2nd word of output TBB = IAND(BUF(5,I), BMASK) TCC = IAND(BUF(5,I), CMASK) B(2,I+K) = TB + SHIFTR(TBB, SRB(5)) C(2,I+K) = TC + SHIFTR(TCC, SRC(5)) c c Move last part of word 5 into position TB = SHIFTL(TBB, SLB(5)) TC = SHIFTL(TCC, SLC(5)) c c Move word 6 into position TBB = IAND(BUF(6,I), BMASK) TCC = IAND(BUF(6,I), CMASK) TB = TB + SHIFTR(TBB, SRB(6)) TC = TC + SHIFTL(TCC, SRC(6)) c c Move first part of word 7 into position and output 3rd word of output TBB = IAND(BUF(7,I), BMASK) TCC = IAND(BUF(7,I), CMASK) B(3,I+K) = TB + SHIFTR(TBB, SRB(7)) C(3,I+K) = TC + SHIFTR(TCC, SRC(7)) c c Move last part of word 7 into position TB = SHIFTL(TBB, SLB(7)) TC = SHIFTL(TCC, SLC(7)) c c Move word 8 into position TBB = IAND(BUF(8,I), BMASK) TCC = IAND(BUF(8,I), CMASK) TB = TB + SHIFTR(TBB, SRB(8)) TC = TC + SHIFTL(TCC, SRC(8)) c c Move first part of word 9 into position and output 4th word of output TBB = IAND(BUF(9,I), BMASK) TCC = IAND(BUF(9,I), CMASK) B(4,I+K) = TB + SHIFTR(TBB, SRB(9)) C(4,I+K) = TC + SHIFTR(TCC, SRC(9)) c c Move last part of word 9 into position TB = SHIFTL(TBB, SLB(9)) TC = SHIFTL(TCC, SLC(9)) c c Move word 10 into position TBB = IAND(BUF(10,I), BMASK) TCC = IAND(BUF(10,I), CMASK) TB = TB + SHIFTR(TBB, SRB(10)) TC = TC + SHIFTL(TCC, SRC(10)) c c Move word 11 into position and output 5th word TBB = IAND(BUF(11,I), BMASK) TCC = IAND(BUF(11,I), CMASK) B(5,I+K) = TB + SHIFTR(TBB, SRB(11)) C(5,I+K) = TC + SHIFTR(TCC, SRC(11)) c 130 CONTINUE 140 CONTINUE c c cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc c c Basic algorithm for Part 2: c c Assume DXORC[i] = C[i] XOR D[i]. Because c E[i] = C[i] XOR D[i] XOR C[i+37] XOR D[i+37] XOR C[i+100] XOR D[i+100] c = DXORC[i] XOR DXORC[i+37] XOR DXORC[i+100] , c what we really want to compute in Part 2 is DXORC. c c DXORC[i] = C[i] XOR D[i] c = C[i] XOR ( (C[i] ^ B[i+1]) XOR (~C[i] ^ B[i-1]) ) c = ( C[i] ^ ~B[i+1] ) XOR (~C[i] ^ B[i-1]) c The rest is relatively straightforward. Shifts are used to align c bits within words for logical operations. c c Since, in most cases, it is not necessary to compute every bit of c the E stream (and thus of the DXORC stream) in order to reject the c hypothesis of NZ consecutive zeros in the E stream (though it will c be necessary to compute these bits when confirming this hyphothesis), c only the middle 62 bits of each word of DXORC will be computed at c this point. This results in a savings of four operations out of c the 11 needed to compute DXORC completely. c cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc c c Calculate D and DXORC DO 150 I = -1,LENBC*LL TEMP1 = SHIFTL(BB(I),1) TEMP2 = IAND(CC(I),NOT(TEMP1)) TEMP3 = SHIFTR(BB(I),1) DXORC(I) = IEOR(TEMP2,IAND(NOT(CC(I)),TEMP3)) 150 CONTINUE c c cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc c c Basic algorithm for Part 3: c c Notice that for wordsize WS, NZ-WS+1 bits starting at any location in c the word must be zero before there can be NZ zeroes in a row. Since E c does not need to be saved, some work can be eliminated by only c computing a portion of each word, looking for all zeros in that c portion. The portion should be longer than 10 bits, otherwise too c many false starts will happen, and shorter than N-WS bits, otherwise c some sequences might be missed. Once an all zero portion is found, c local words of DXORC and E are computed in full to determine if c there are N consecutive zeroes. c c This code assumes that NZ, OFF1 and OFF2 are such that at least 10 c bits of check are obtained. It is also assumed that OFF1 and c OFF2 - WS are both greater than 32. If not, it might be better c to take the check bits from the left end of the word of E, rather c than from the right(as is done in the following code). For the given c values of OFF1 and OFF2, NZ must be at least 74, to provide 10 bits c of check. c cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc c c Test for potential zero stretches. The "non-structured" coding is c to increase efficiency when the probability of the IF statement c is very rare. c I=-1 160 CONTINUE DO 170 I = I,LENBC*LL - 2 TEMP1 = SHIFTR(DXORC(I+1),WS-OFF1) TEMP2 = SHIFTR(DXORC(I+2),WS-OFF3) TEMP3 = IEOR(TEMP2,IEOR(TEMP1,DXORC(I))) IF (IAND(TEMP3,MSKIT) .EQ. 0) GOTO 180 170 CONTINUE GOTO 210 c c If an all-zero portion was found, check to see if NZ zeroes are there. c Calculate the full values of DXORC for 5 words around this position. c 180 CONTINUE DO 190 J=-1,3 TEMP1 = SHIFTL(BB(I+J),1) + SHIFTR(BB(I+J+1),WS-1) TEMP2 = SHIFTR(BB(I+J),1) + SHIFTL(BB(I+J-1), WS-1) TDXORC(J) = IEOR(IAND(CC(I+J),NOT(TEMP1)), \$ IAND(NOT(CC(I+J)),TEMP2)) 190 CONTINUE c c Now calculate the full E stream for 3 words around this position c DO 200 J=-1,1 TEMP1 = SHIFTR(TDXORC(J+1),WS-OFF1) + SHIFTL(TDXORC(J),OFF1) TEMP2 = SHIFTR(TDXORC(J+2),WS-OFF3) + SHIFTL(TDXORC(J+1),OFF3) TE(J) = IEOR(TEMP2,IEOR(TEMP1,TDXORC(J))) 200 CONTINUE c c If the current word is all zeros, get trailing zero count of c previous word c ZEROES = WS - POPCNT(TE(0)) IF (ZEROES .EQ. WS) THEN ZEROES = WS - (1 + POPCNT(IEOR(TE(-1),-TE(-1)))) IF (ZEROES .EQ. WS) THEN ZEROES = 2*WS ELSE ZEROES = WS + ZEROES ENDIF c c Else get trailing zeros of current word c ELSE ZEROES = WS - (1 + POPCNT(IEOR(TE(0), -TE(0)))) ENDIF c c Now add in the leading zeros of the following word and get start c position c STRT = ((JBUF-1)*LENBC/INC + I)*WS - ZEROES ZEROES = ZEROES + LEADZ(TE(1)) c c Store start position and length; will also look for overlapped c regions of zeroes and clean up the results c IF (ZEROES .GE. NZ) THEN IF(NN .GT. 0) THEN JJ = START(NN) + LENGTH(NN) ELSE JJ = -1 ENDIF c c Check for overlapped regions in results, and collapse IF(STRT .LE. JJ) THEN LENGTH(NN) = STRT + ZEROES - START(NN) ELSE c No overlap c Check that section is not off the end of the actual stream. IF(STRT .LE. LEND) THEN NN = NN+1 c c Check to prevent overwriting an array IF(NN .GT. 1000) THEN ERR = 1 ELSE START(NN) = STRT LENGTH(NN) = ZEROES ENDIF ENDIF ENDIF ENDIF I = I+1 IF(I .LE. (LENBC*LL - 2)) GOTO 160 210 CONTINUE c c Handle possible shorter length for last pass MM = MMNXT IF(L+1 - (JBUF+2*MN) .LT. MN) THEN MMNXT = L+1 - (JBUF+2*MN) ENDIF c c Remember last two words of the B and C streams to next buffer BB(-1) = BB(LENBC*BLOCK - 1) BB(0) = BB(LENBC*BLOCK) CC(-1) = CC(LENBC*BLOCK - 1) CC(0) = CC(LENBC*BLOCK) c c Switch buffers and check to see that the read has completed TMP = FGBUF FGBUF = BGBUF BGBUF = TMP IF (L .GT. MN) THEN CALL SYNTMP(1) ENDIF 220 CONTINUE c RETURN END