SUBROUTINE P7(A,L,NZ,START,LENGTH,NN,ERR)
c
cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
c
c Benchmark #7 -- Bit Twiddle
c
c *** This version uses the BIT MATRIX MULTIPLY functional unit ***
c
c Parameters:
c
c Provided by the calling routine:
c A = Input bit stream
c L = Number of 64 bit words in bit stream A
c NZ = Minimum length of zeros before reporting results
c
c Returned by this routine:
c START = Starting position of each stretch of zeros
c LENGTH = Length of each stretch of zeros
c NN = Number of stretches of zeros found
c ERR = Flag indicating whether there is array overflow
c
ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
c
c
c Basic algorithm
c
c The benchmark has three parts to it:
c 1) The calculation of streams B and C
c 2) The calculation of streams D and DXORC (to be defined later)
c 3) The calculation of stream E and the locating of sequences of
c zeros in E
c
c Each part of this problem has a different algorithm, which
c will be described below.
c
cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
c
IMPLICIT INTEGER (A-Z)
c
c
c
Cc Uncomment the following for machines without the POPCNT intrinsic.
Cc Also, uncomment the code for INITPC and POPCNT in the UTIL routine
Cc and the call to INITPC in M7.
c
C COMMON /POPS/POPS(0:65535)
c
c
c
Cc Uncomment the following for machines without the LEADZ intrinsic.
Cc Also, uncomment the code for INITLZ in the UTIL routine
Cc and the call to INITLZ in M7.
c
C COMMON /LZCNT/LZCNT(0:65535)
c
c
c
c Length of the A stream buffer in 64 bit words
PARAMETER (MN=11 264 000)
c
c Number of buffers to use
PARAMETER (NBUF=2)
c
c Masks to mask off the high and low order 32 bits, used in generation of
c the B and C streams
PARAMETER (BMASK = Z'FFFFFFFF00000000')
PARAMETER (CMASK = Z'FFFFFFFF')
c
c Number of contiguous bits to take to form streams B and C; this
c code assumes that this is 5
PARAMETER (LENBC=5)
c
c Number of bits between beginning bits of a LENB-bit or LENC-bit group;
c this code assumes that this is 11
PARAMETER (INC=11)
c
c BMM matrix to extract 5 skip 6, starting in the first position
DIMENSION IM(40), IMM(64,INC)
c
c Maximum number of 64-bit words in streams B, C, D, DXORC, and E
PARAMETER (BLEN=(MN*LENBC)/INC)
c
c Number of bits in a word; this code assumes that this is 64
PARAMETER (WS=64)
c
c Number of blocks of INC words
PARAMETER (BLOCK = MN/INC)
c
c Distance between 1st and 2nd positions in DXORC for XORing together
PARAMETER (OFF1=37)
c
c Distance between 1st and 3rd positions in DXORC for XORing together
PARAMETER (OFF2=100)
c
DIMENSION A(INC,BLOCK,NBUF)
c
c Buffer to hold the B and C streams before packing
PARAMETER (LBUF = 8192)
DIMENSION BUF(INC,LBUF)
c
c To hold the B stream
DIMENSION B(LENBC,BLOCK), BB(-3:LENBC*BLOCK)
EQUIVALENCE (B(1,1),BB(1))
c
c To hold the C stream
DIMENSION C(LENBC,BLOCK), CC(-3:LENBC*BLOCK)
EQUIVALENCE (C(1,1),CC(1))
c
c To hold the DXORC stream
DIMENSION DXORC(-1:LENBC*BLOCK)
c
c To hold a segment of the full DXORC stream, and of the E stream
DIMENSION TDXORC(-1:3)
DIMENSION TE(-1:1)
c
c Arrays to hold the results
DIMENSION START(1),LENGTH(1)
c Dummy array to allow loading of BMM
DIMENSION M@BXX(64)
c
c Variables used to switch between buffers
INTEGER FGBUF, BGBUF, TMP
c
c Shift values for Part 1
DIMENSION SRB(INC), SRC(INC), SLB(INC), SLC(INC)
DATA SRB /0, 30, 60, 26, 54, 18, 47, 13, 43, 8, 36/
DATA SLB /0, 0, 4, 0, 10, 0, 17, 0, 21, 0, 0/
DATA SRC /32, 3, 25, 11, 19, 15, 15, 21, 7, 28, 2/
DATA SLC /0, 0, 39, 0, 45, 0, 49, 0, 57, 0, 0/
c
c Generate the BMM fills for Part 1
c Fill array IM
IM(1) = Z'8000000000000000'
DO 10 I=2,5
IM(I) = SHIFTR(IM(I-1), 1)
10 CONTINUE
DO 20 I=1,7
DO 15 J=1,5
IM(5*I+J) = SHIFTR(IM(5*I+J-5), 11)
15 CONTINUE
20 CONTINUE
c
c Now use IM as a pattern and initialize IMM for each pattern
DO 25 I=1,32
IMM(I,1) = IM(I)
IMM(I+32,1) = SHIFTR(IM(I) , 5)
25 CONTINUE
c
DO 30 I=1,32
IMM(I,2) = SHIFTR(IM(I),2)
IMM(I+32,2) = SHIFTL(IM(I+4),4)
30 CONTINUE
IMM(59,2) = SHIFTR(IM(29),4)
IMM(60,2) = SHIFTR(IM(30),4)
DO 35 I=1,32
IMM(I,3) = SHIFTR(IM(I),4)
IMM(I+32,3) = SHIFTL(IM(I+2),2)
35 CONTINUE
c
DO 40 I=1,32
IMM(I,4) = SHIFTR(IM(I), 6)
IMM(I+32,4) = IM(I)
40 CONTINUE
c
DO 45 I=1,32
IMM(I,5) = SHIFTL(IM(I+3),3)
IMM(I+32,5) = SHIFTR(IM(I), 2)
45 CONTINUE
IMM(28,5) = SHIFTR(IM(26), 8)
c
DO 50 I=1,32
IMM(I,6) = SHIFTL(IM(I+1), 1)
IMM(I+32,6) = SHIFTR(IM(I),4)
50 CONTINUE
c
DO 55 I=1,32
IMM(I,7) = SHIFTR(IM(I),1)
IMM(I+32,7) = SHIFTR(IM(I),6)
55 CONTINUE
c
DO 60 I=1,32
IMM(I,8) = SHIFTR(IM(I),3)
IMM(I+32,8) = SHIFTL(IM(I+3), 3)
60 CONTINUE
IMM(60,8) = SHIFTR(IM(26), 8)
c
DO 65 I=1,32
IMM(I,9) = SHIFTR(IM(I),5)
IMM(I+32,9) = SHIFTL(IM(I+1), 1)
65 CONTINUE
c
DO 70 I=1,32
IMM(I,10) = SHIFTL(IM(I+4), 4)
IMM(I+32,10) = SHIFTR(IM(I),1)
70 CONTINUE
IMM(27,10) = SHIFTR(IM(29), 4)
IMM(28,10) = SHIFTR(IM(30), 4)
c
DO 75 I=1,32
IMM(I,11) = SHIFTL(IM(I+2), 2)
IMM(I+32,11) = SHIFTR(IM(I),3)
75 CONTINUE
c
c Generate mask for Part 3
c Set up mask of valid bits in partial E stream
OFF3 = OFF2 - WS
IMSK = Z'7FFFFFFFFFFFFFFE'
JMSK = SHIFTR(IMSK,WS-OFF1)
MSKIT = IAND(IMSK, JMSK)
JMSK = SHIFTR(IMSK,WS-OFF3)
MSKIT = IAND(MSKIT, JMSK)
KK = MAX(0, 2*WS - NZ - 2)
JMSK = SHIFTR(IMSK,KK)
MSKIT = IAND(MSKIT, JMSK)
c
NN = 0
c
c Check number of check bits in this mask
IF( POPCNT(MSKIT) .LT. 10) THEN
PRINT 77, OFF1, OFF2, NZ, MSKIT, POPCNT(MSKIT)
77 FORMAT(//,'There are too few check bits for the chosen',/,
$ ' values of OFF1, OFF2, NZ. Code asumes at least 10',/,
$ 'OFF1 = ',I5,/,
$ 'OFF2 = ',I5,/,
$ 'NZ = ',I5,/,
$ 'MSKIT = ',Z16,/,
$ 'Density = ',I5)
STOP
ENDIF
c
c To prevent problems at the beginning of the streams, set the first few
c words preceeding the actual data of both the B and C streams to 0
BB(-3) = 0
BB(-2) = 0
BB(-1) = 0
BB(0) = 0
CC(-3) = 0
CC(-2) = 0
CC(-1) = 0
CC(0) = 0
c Used to prevent positions past the end of the stream.
LEND = L*LENBC*WS/INC - NZ
c
c
ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
c
c Basic algorithm for Part 1:
c
c Notice that for all machines except for ones whose word size is a
c multiple of 11, every eleventh word of A has the same bits moved to
c the same output locations. Therefore if the inner loop is unrolled
c so that each iteration works on 11 words of stream A, the masks
c and shift counts will be the same for each iteration, allowing
c vectorization and/or parallelization.
c
c The basic idea of this part is to use the BIT MATRIX MULTIPLY to mask
c off and push together the bits of a word which go to the B stream
c into the high order 32 bits of the result, and those which go to
c the C stream into the low order 32 bits. These are then shifted
c into place and packed into the final B and C streams.
c
c
cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
c
c Generate the stream B by compressing and shifting into the high order
c bits the 5-bit groups from stream A. At at the same time, generate
c stream C by compressing and shifting into the low order bits the
c 5-bit groups from stream A. Use the BIT MATRIX MULTIPLY to do this.
c This loop assumes that LENBC = 5 and that the wordsize is 64.
c
SKIPBC = INC - LENBC
c
c There may be so little data that we didn't use the secondary storage
IF(MN .GE. L) THEN
MM = L
MMNXT = 0
c Secondary storage was needed
ELSE
MM = MN
MMNXT = MIN(MN, L-MN)
ENDIF
c
c Rewind the secondary storage file
IF (L .GT. MN) THEN
CALL SKTMP(1,0)
CALL SYNTMP(1)
ENDIF
c
c Initialize buffer variables
FGBUF = 1
BGBUF = 2
TMP = -1
c
IF (L .GT. MN) THEN
CALL RDTMP(1,A(1,1,FGBUF),MM)
CALL SYNTMP(1)
ENDIF
c
c Set flag to test for array overflow
ERR = 0
c
c For each buffer's worth of data
DO 220 JBUF = 1,L,MN
c
c Try to overlap reads with the work
IF(MMNXT .GT. 0) THEN
CALL RDTMP(1,A(1,1,BGBUF),MMNXT)
ENDIF
c
c Process the current data
LL = 1 + (MM-1)/INC
c
c For each buffer length
DO 140 K=0,LL-1,LBUF
LLBUF = MIN(LBUF, LL-K)
DO 120 J=1,INC
c
c Load the BMM
DO 100 I=1,64
M@BXX(I) = M@LD(IMM(I,J))
100 CONTINUE
c
c Do the multiply
DO 110 I = 1, LLBUF
BUF(J,I) = M@MX(A(J,I+K,FGBUF))
110 CONTINUE
120 CONTINUE
c
DO 130 I=1, LLBUF
c
c Now pack up the B and C streams into single words
c
c Get word 1
TB = IAND(BUF(1,I), BMASK)
TC = IAND(BUF(1,I), CMASK)
TC = SHIFTL(TC, SRC(1))
c
c Move word 2 into position
TBB = IAND(BUF(2,I), BMASK)
TCC = IAND(BUF(2,I), CMASK)
TB = TB + SHIFTR(TBB, SRB(2))
TC = TC + SHIFTL(TCC, SRC(2))
c
c Move first part of word 3 into position and output 1st word of output
TBB = IAND(BUF(3,I), BMASK)
TCC = IAND(BUF(3,I), CMASK)
B(1,I+K) = TB + SHIFTR(TBB, SRB(3))
C(1,I+K) = TC + SHIFTR(TCC, SRC(3))
c
c Move last part of word 3 into position
TB = SHIFTL(TBB, SLB(3))
TC = SHIFTL(TCC, SLC(3))
c
c Move word 4 into position
TBB = IAND(BUF(4,I), BMASK)
TCC = IAND(BUF(4,I), CMASK)
TB = TB + SHIFTR(TBB, SRB(4))
TC = TC + SHIFTL(TCC, SRC(4))
c
c Move first part of word 5 into position and output 2nd word of output
TBB = IAND(BUF(5,I), BMASK)
TCC = IAND(BUF(5,I), CMASK)
B(2,I+K) = TB + SHIFTR(TBB, SRB(5))
C(2,I+K) = TC + SHIFTR(TCC, SRC(5))
c
c Move last part of word 5 into position
TB = SHIFTL(TBB, SLB(5))
TC = SHIFTL(TCC, SLC(5))
c
c Move word 6 into position
TBB = IAND(BUF(6,I), BMASK)
TCC = IAND(BUF(6,I), CMASK)
TB = TB + SHIFTR(TBB, SRB(6))
TC = TC + SHIFTL(TCC, SRC(6))
c
c Move first part of word 7 into position and output 3rd word of output
TBB = IAND(BUF(7,I), BMASK)
TCC = IAND(BUF(7,I), CMASK)
B(3,I+K) = TB + SHIFTR(TBB, SRB(7))
C(3,I+K) = TC + SHIFTR(TCC, SRC(7))
c
c Move last part of word 7 into position
TB = SHIFTL(TBB, SLB(7))
TC = SHIFTL(TCC, SLC(7))
c
c Move word 8 into position
TBB = IAND(BUF(8,I), BMASK)
TCC = IAND(BUF(8,I), CMASK)
TB = TB + SHIFTR(TBB, SRB(8))
TC = TC + SHIFTL(TCC, SRC(8))
c
c Move first part of word 9 into position and output 4th word of output
TBB = IAND(BUF(9,I), BMASK)
TCC = IAND(BUF(9,I), CMASK)
B(4,I+K) = TB + SHIFTR(TBB, SRB(9))
C(4,I+K) = TC + SHIFTR(TCC, SRC(9))
c
c Move last part of word 9 into position
TB = SHIFTL(TBB, SLB(9))
TC = SHIFTL(TCC, SLC(9))
c
c Move word 10 into position
TBB = IAND(BUF(10,I), BMASK)
TCC = IAND(BUF(10,I), CMASK)
TB = TB + SHIFTR(TBB, SRB(10))
TC = TC + SHIFTL(TCC, SRC(10))
c
c Move word 11 into position and output 5th word
TBB = IAND(BUF(11,I), BMASK)
TCC = IAND(BUF(11,I), CMASK)
B(5,I+K) = TB + SHIFTR(TBB, SRB(11))
C(5,I+K) = TC + SHIFTR(TCC, SRC(11))
c
130 CONTINUE
140 CONTINUE
c
c
cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
c
c Basic algorithm for Part 2:
c
c Assume DXORC[i] = C[i] XOR D[i]. Because
c E[i] = C[i] XOR D[i] XOR C[i+37] XOR D[i+37] XOR C[i+100] XOR D[i+100]
c = DXORC[i] XOR DXORC[i+37] XOR DXORC[i+100] ,
c what we really want to compute in Part 2 is DXORC.
c
c DXORC[i] = C[i] XOR D[i]
c = C[i] XOR ( (C[i] ^ B[i+1]) XOR (~C[i] ^ B[i-1]) )
c = ( C[i] ^ ~B[i+1] ) XOR (~C[i] ^ B[i-1])
c The rest is relatively straightforward. Shifts are used to align
c bits within words for logical operations.
c
c Since, in most cases, it is not necessary to compute every bit of
c the E stream (and thus of the DXORC stream) in order to reject the
c hypothesis of NZ consecutive zeros in the E stream (though it will
c be necessary to compute these bits when confirming this hyphothesis),
c only the middle 62 bits of each word of DXORC will be computed at
c this point. This results in a savings of four operations out of
c the 11 needed to compute DXORC completely.
c
cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
c
c Calculate D and DXORC
DO 150 I = -1,LENBC*LL
TEMP1 = SHIFTL(BB(I),1)
TEMP2 = IAND(CC(I),NOT(TEMP1))
TEMP3 = SHIFTR(BB(I),1)
DXORC(I) = IEOR(TEMP2,IAND(NOT(CC(I)),TEMP3))
150 CONTINUE
c
c
cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
c
c Basic algorithm for Part 3:
c
c Notice that for wordsize WS, NZ-WS+1 bits starting at any location in
c the word must be zero before there can be NZ zeroes in a row. Since E
c does not need to be saved, some work can be eliminated by only
c computing a portion of each word, looking for all zeros in that
c portion. The portion should be longer than 10 bits, otherwise too
c many false starts will happen, and shorter than N-WS bits, otherwise
c some sequences might be missed. Once an all zero portion is found,
c local words of DXORC and E are computed in full to determine if
c there are N consecutive zeroes.
c
c This code assumes that NZ, OFF1 and OFF2 are such that at least 10
c bits of check are obtained. It is also assumed that OFF1 and
c OFF2 - WS are both greater than 32. If not, it might be better
c to take the check bits from the left end of the word of E, rather
c than from the right(as is done in the following code). For the given
c values of OFF1 and OFF2, NZ must be at least 74, to provide 10 bits
c of check.
c
cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
c
c Test for potential zero stretches. The "non-structured" coding is
c to increase efficiency when the probability of the IF statement
c is very rare.
c
I=-1
160 CONTINUE
DO 170 I = I,LENBC*LL - 2
TEMP1 = SHIFTR(DXORC(I+1),WS-OFF1)
TEMP2 = SHIFTR(DXORC(I+2),WS-OFF3)
TEMP3 = IEOR(TEMP2,IEOR(TEMP1,DXORC(I)))
IF (IAND(TEMP3,MSKIT) .EQ. 0) GOTO 180
170 CONTINUE
GOTO 210
c
c If an all-zero portion was found, check to see if NZ zeroes are there.
c Calculate the full values of DXORC for 5 words around this position.
c
180 CONTINUE
DO 190 J=-1,3
TEMP1 = SHIFTL(BB(I+J),1) + SHIFTR(BB(I+J+1),WS-1)
TEMP2 = SHIFTR(BB(I+J),1) + SHIFTL(BB(I+J-1), WS-1)
TDXORC(J) = IEOR(IAND(CC(I+J),NOT(TEMP1)),
$ IAND(NOT(CC(I+J)),TEMP2))
190 CONTINUE
c
c Now calculate the full E stream for 3 words around this position
c
DO 200 J=-1,1
TEMP1 = SHIFTR(TDXORC(J+1),WS-OFF1) + SHIFTL(TDXORC(J),OFF1)
TEMP2 = SHIFTR(TDXORC(J+2),WS-OFF3) + SHIFTL(TDXORC(J+1),OFF3)
TE(J) = IEOR(TEMP2,IEOR(TEMP1,TDXORC(J)))
200 CONTINUE
c
c If the current word is all zeros, get trailing zero count of
c previous word
c
ZEROES = WS - POPCNT(TE(0))
IF (ZEROES .EQ. WS) THEN
ZEROES = WS - (1 + POPCNT(IEOR(TE(-1),-TE(-1))))
IF (ZEROES .EQ. WS) THEN
ZEROES = 2*WS
ELSE
ZEROES = WS + ZEROES
ENDIF
c
c Else get trailing zeros of current word
c
ELSE
ZEROES = WS - (1 + POPCNT(IEOR(TE(0), -TE(0))))
ENDIF
c
c Now add in the leading zeros of the following word and get start
c position
c
STRT = ((JBUF-1)*LENBC/INC + I)*WS - ZEROES
ZEROES = ZEROES + LEADZ(TE(1))
c
c Store start position and length; will also look for overlapped
c regions of zeroes and clean up the results
c
IF (ZEROES .GE. NZ) THEN
IF(NN .GT. 0) THEN
JJ = START(NN) + LENGTH(NN)
ELSE
JJ = -1
ENDIF
c
c Check for overlapped regions in results, and collapse
IF(STRT .LE. JJ) THEN
LENGTH(NN) = STRT + ZEROES - START(NN)
ELSE
c No overlap
c Check that section is not off the end of the actual stream.
IF(STRT .LE. LEND) THEN
NN = NN+1
c
c Check to prevent overwriting an array
IF(NN .GT. 1000) THEN
ERR = 1
ELSE
START(NN) = STRT
LENGTH(NN) = ZEROES
ENDIF
ENDIF
ENDIF
ENDIF
I = I+1
IF(I .LE. (LENBC*LL - 2)) GOTO 160
210 CONTINUE
c
c Handle possible shorter length for last pass
MM = MMNXT
IF(L+1 - (JBUF+2*MN) .LT. MN) THEN
MMNXT = L+1 - (JBUF+2*MN)
ENDIF
c
c Remember last two words of the B and C streams to next buffer
BB(-1) = BB(LENBC*BLOCK - 1)
BB(0) = BB(LENBC*BLOCK)
CC(-1) = CC(LENBC*BLOCK - 1)
CC(0) = CC(LENBC*BLOCK)
c
c Switch buffers and check to see that the read has completed
TMP = FGBUF
FGBUF = BGBUF
BGBUF = TMP
IF (L .GT. MN) THEN
CALL SYNTMP(1)
ENDIF
220 CONTINUE
c
RETURN
END