SUBROUTINE P7(A, BUFNUM, MYFIRST, MYLAST, START, LENGTH, NN,
* mype, npes )
c
ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
c
c Benchmark #7 -- Bit Twiddle
c
c *** This version assumes that the BIT MATRIX MULTIPLY ***
c *** functional unit is not available ***
c
c Parameters:
c Provided by the calling routine:
c A = Buffer of input bit stream, BUFSIZE=11264000 words long
c BUFNUM = Number of buffer for this call to P7
c
c Returned by this routine:
c START = Starting position of each stretch of zeros
c LENGTH = Length of each stretch of zeros
c NN = Number of stretches of zeros found
c
cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
c
c Basic algorithm
c
c The benchmark has three parts to it:
c 1) The calculation of streams B and C
c 2) The calculation of streams D and DXORC (to be defined later)
c 3) The calculation of stream E and the locating of seq's of zeros in E
c
c Each part of this problem has a different algorithm, which
c will be described below.
c
cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
c
IMPLICIT INTEGER (A-Z)
c
#include "bench7.h"
#include
#define NSAVE 3
c
INTEGER (kind=8) A, BBUF, CBUF, B, BB, C, CC, DXORC, TDXORC, TE
INTEGER (kind=8) LEND, EBITS, START, STRT, CURSTART, CUREND
INTEGER (kind=8) BSAVE, CSAVE, BBCC, BBCCSAVE
SAVE :: CURSTART, CURLENGTH, BSAVE, CSAVE, BBCCSAVE
c
c Number of contiguous bits to take to form streams B and C; this
c code assumes that this is 5
PARAMETER (LENBC = 5)
c
c Number of bits between beginning bits of a LENB-bit or LENC-bit group;
c this code assumes that this is 11
PARAMETER (INC = 11)
c
PARAMETER (SKIPBC = INC - LENBC)
c
c Number of 64-bit words in a buffer of streams B, C, D, DXORC, and E
PARAMETER (BLEN = (BUFSIZE*LENBC)/INC)
c
c Number of *bits* in a buffer of E stream
PARAMETER (EBITS = BLEN*64_8)
c
c Number of bits in a word; this code assumes that this is 64
PARAMETER (WS = 64)
c
c Number of blocks of INC words.
PARAMETER (BLOCK = BUFSIZE/INC)
c
c The buffer of stream data
DIMENSION A(INC,BLOCK)
c
c Distance between 1st and 2nd positions in DXORC for XORing together
PARAMETER (OFF1 = 37)
c
c Distance between 1st and 3rd positions in DXORC for XORing together
PARAMETER (OFF2 = 100)
c
c Buffer to hold the B and C streams before packing
PARAMETER (LBUF = 8192)
c
DIMENSION BBUF(INC,LBUF), CBUF(INC,LBUF)
c
c To hold the B stream - BLEN = LENBC*BLOCK
DIMENSION B(LENBC,BLOCK), BB(-3 : BLEN+NSAVE)
EQUIVALENCE ( B(1,1), BB(1) )
c
c To hold the C stream
DIMENSION C(LENBC,BLOCK), CC(-3 : BLEN+NSAVE)
EQUIVALENCE ( C(1,1), CC(1) )
c
c Save last words from one of my buffers to put before beg. of next buf.
DIMENSION BSAVE(NSAVE), CSAVE(NSAVE)
c
c Copy first words of mype+1's chunk to mype
c Then put after end of mype's last buff.
DIMENSION BBCC(2*NSAVE), BBCCSAVE(2*NSAVE)
c
c To hold the (approximate) DXORC stream
DIMENSION DXORC(-1 : BLEN+3)
c
c To hold a segment of the full DXORC stream, and of the E stream
DIMENSION TDXORC(-1:3)
DIMENSION TE(-1:1)
c
c Arrays to hold the results
DIMENSION START(MAXANS), LENGTH(MAXANS)
c
c Masks for Part 1 - assuming INC = 11, LENBC = 5
c
INTEGER (kind=8) MASKB1, MASKB2, MASKB3, MASKB4
INTEGER (kind=8) MASKC1, MASKC2, MASKC3, MASKC4
INTEGER (kind=8) IMSK, MSKIT, TB, TC, TEMP1, TEMP2, TEMP3
c
DIMENSION MASKB1(INC), MASKB2(INC), MASKB3(INC), MASKB4(INC)
c
DATA MASKB1 /Z'F81F03E07C0F81F0', Z'3E07C0F81F03E07C',
$ Z'0F81F03E07C0F81F', Z'03E07C0F81F03E07',
$ Z'C0F81F03E07C0F81', Z'F03E07C0F81F03E0',
$ Z'7C0F81F03E07C0F8', Z'1F03E07C0F81F03E',
$ Z'07C0F81F03E07C0F', Z'81F03E07C0F81F03',
$ Z'E07C0F81F03E07C0'/
c
DATA MASKB2 /Z'FFC003FF000FFC00', Z'3FF000FFC003FF00',
$ Z'0FFC003FF000FFC0', Z'03FF000FFC003FC0',
$ Z'FE001FF8007FE001', Z'FF8007FE001FF800',
$ Z'7FE001FF8007FE00', Z'1FF8007FE001FF80',
$ Z'07FE001FF8007FC0', Z'FC003FF000FFC003',
$ Z'FF000FFC003FF000'/
c
DATA MASKB3 /Z'FFFFF000000FFC00', Z'3FFFFC000003FF00',
$ Z'0FFFFF000000FFC0', Z'03FFFFC000003FC0',
$ Z'FFFF8000007FF000', Z'FFFFE000001FF800',
$ Z'7FFFF8000007FE00', Z'1FFFFE000001FF80',
$ Z'07FFFF8000007FC0', Z'FFFF000000FFF000',
$ Z'FFFFC000003FF000'/
c
DATA MASKB4 /Z'FFFFFFFC00000000', Z'3FFFFFFF00000000',
$ Z'0FFFFFFFC0000000', Z'03FFFFFFC0000000',
$ Z'FFFFFFF000000000', Z'FFFFFFF800000000',
$ Z'7FFFFFFE00000000', Z'1FFFFFFF80000000',
$ Z'07FFFFFFC0000000', Z'FFFFFFF000000000',
$ Z'FFFFFFF000000000'/
c
DIMENSION MASKC1(INC), MASKC2(INC), MASKC3(INC), MASKC4(INC)
c
DATA MASKC1 /Z'07C0F81F03E07C0F', Z'81F03E07C0F81F03',
$ Z'E07C0F81F03E07C0', Z'F81F03E07C0F81F0',
$ Z'3E07C0F81F03E07C', Z'0F81F03E07C0F81F',
$ Z'03E07C0F81F03E07', Z'C0F81F03E07C0F81',
$ Z'F03E07C0F81F03E0', Z'7C0F81F03E07C0F8',
$ Z'1F03E07C0F81F03E'/
c
DATA MASKC2 /Z'07FE001FF8007FC0', Z'FC003FF000FFC003',
$ Z'FF000FFC003FF000', Z'FFC003FF000FFC00',
$ Z'3FF000FFC003FF00', Z'0FFC003FF000FFC0',
$ Z'03FF000FFC003FC0', Z'FE001FF8007FE001',
$ Z'FF8007FE001FF800', Z'7FE001FF8007FE00',
$ Z'1FF8007FE001FF80'/
c
DATA MASKC3 /Z'07FFFF8000007FC0', Z'FFFF000000FFF000',
$ Z'FFFFC000003FF000', Z'FFFFF000000FFC00',
$ Z'3FFFFC000003FF00', Z'0FFFFF000000FFC0',
$ Z'03FFFFC000003FC0', Z'FFFF8000007FF000',
$ Z'FFFFE000001FF800', Z'7FFFF8000007FE00',
$ Z'1FFFFE000001FF80'/
c
DATA MASKC4 /Z'07FFFFFFC0000000', Z'FFFFFFF000000000',
$ Z'FFFFFFF000000000', Z'FFFFFFFC00000000',
$ Z'3FFFFFFF00000000', Z'0FFFFFFFC0000000',
$ Z'03FFFFFFC0000000', Z'FFFFFFF000000000',
$ Z'FFFFFFF800000000', Z'7FFFFFFE00000000',
$ Z'1FFFFFFF80000000'/
c
DIMENSION SRB(INC), SRC(INC), SLB(INC), SLC(INC)
c
DATA SRB /0, -28, -56, -20, -54, -18, -46, -10, -38, -8, -36/
DATA SLB /0, 0, 8, 0, 10, 0, 18, 0, 26, 0, 0/
DATA SRC /0, -29, -57, -21, -49, -13, -41, -11, -39, -3, -31/
DATA SLC /5, 0, 7, 0, 15, 0, 23, 0, 25, 0, 0/
c
INTEGER IER, STATUS(MPI_STATUS_SIZE)
c
c Generate mask for Part 3
c
c Set up mask of valid bits in partial E stream
OFF3 = OFF2 - WS
IMSK = Z'7FFFFFFFFFFFFFFE'
MSKIT = IAND(IMSK, ISHFT(IMSK,OFF1-WS))
MSKIT = IAND(MSKIT, ISHFT(IMSK,OFF3 - WS))
KK = MAX(0, 2*WS - NZ - 2)
MSKIT = IAND(MSKIT,ISHFT(IMSK,-KK))
c
c Check number of check bits in this mask
IF (POPCNT(MSKIT) .LT. 10) THEN
PRINT 5, OFF1, OFF2, NZ, MSKIT, POPCNT(MSKIT)
5 FORMAT(//,'There are too few check bits for the chosen',/,
$ ' values of OFF1, OFF2, NZ. Code asumes at least 10',/,
$ 'OFF1 = ',I5,/,
$ 'OFF2 = ',I5,/,
$ 'NZ = ',I5,/,
$ 'MSKIT = ',Z16,/,
$ 'Density = ',I5)
STOP
ENDIF
c
c To prevent problems at the beginning of the streams, set the first few
c words preceeding the actual data of both the B and C streams to 0
c print *, mype, 'in P7', bufnum
IF (BUFNUM .EQ. MYFIRST) THEN
BB(-3) = 1
BB(-2) = 1
BB(-1) = 1
BB(0) = 1
BB(BLEN+1) = 1
BB(BLEN+2) = 1
CC(-3) = 0
CC(-2) = 0
CC(-1) = 0
CC(0) = 0
CC(BLEN+1) = 0
CC(BLEN+2) = 0
ELSE
c
c Not the 1st buffer for this PE, insert last NSAVE words of prev. buff
c
DO I = 1, NSAVE
BB(I-NSAVE) = BSAVE(I)
CC(I-NSAVE) = CSAVE(I)
ENDDO
c
ENDIF
c
HOLDNN = NN
c
c Used to prevent positions past the end of the stream.
LEND = BUFNUM * EBITS - NZ
c
ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
c
c Basic algorithm for Part 1:
c
c Notice that for all machines except for ones whose word size is a
c multiple of 11, every eleventh word of A has the same bits moved to
c the same output locations. Therefore if the inner loop is unrolled
c so that each iteration works on 11 words of stream A, the masks
c and shift counts will be the same for each iteration, allowing
c vectorization and/or parallelization.
c
c The basic idea is to mask off the bits of a word going to either
c B or C, push them together, and then shift them into place
c for packing into the output words.
c
c There are up to seven fields of 5 contiguous bits in each 64-bit
c word that need to be compressed to form the output stream. Using a
c BIT MATRIX MULTIPLY, one could mask and shift each one of these to
c the correct position for output. Without this operation, the most
c efficient approach is a divide and conquer approach whereby first
c neighboring fields are put together, and then all the bits from the
c word are put together near the front of the word. These bits are
c then shifted into place for output. The result is that only about
c 13 operations (instead of 20) are needed per word for a straight
c forward approach.
c
c This version assumes that the BIT MATRIX MULTIPLY functional unit
c is not available.
c
c Certain masks and shiftcounts are needed for Part 1 to perform its
c calculations. The first mask which needs to be generated is the one
c marking off the first 5 bits, skipping 6, marking the next 5, etc.,
c for the B stream. It is contained in array MASKB1. MASKC1 contains
c a similar array for the C stream.
c
c Since we're using the divide and conquer approach, we need masks not
c just for the positions of the bits in the 11 original words, but
c also for 10-bit groups, for the 20-bit groups, and for the final
c up to 30-bit groups. These are contained in arrays MASKB2 through
c MASKB4 for the B stream, and in MASKC2 through MASKC4 for the C
c stream. These are created in data statements at the beginning of
c this subroutine.
c
cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
c
c Generate the streams B, C by compressing and shifting into position the
c 5-bit groups from stream A. This loop assumes that LENBC = 5 and that
c the wordsize is 64.
c
c Process the buffer of current data
c Do full buffer of A stream, INC*LBUF at a time, so 125 loops
c
c For each buffer length
c
DO 50 K = 0, BLOCK-1, LBUF
c
DO 30 J = 1, INC
c
DO 20 I = 1, LBUF
c
c Mask 5-bit groups
TB = IAND( A(J,I+K), MASKB1(J))
TC = IAND( A(J,I+K), MASKC1(J))
c
c Compress 5-bit groups together
TB = IAND(TB + ISHFT(TB, SKIPBC), MASKB2(J))
TC = IAND(TC + ISHFT(TC, SKIPBC), MASKC2(J))
c
c Now compress every other resulting 10-bit group together
TB = IAND(TB + ISHFT( TB, 2*SKIPBC), MASKB3(J))
TC = IAND(TC + ISHFT( TC, 2*SKIPBC), MASKC3(J))
c
BBUF(J,I) = IAND(TB + ISHFT( TB, 4*SKIPBC), MASKB4(J))
CBUF(J,I) = IAND(TC + ISHFT( TC, 4*SKIPBC), MASKC4(J))
c
20 CONTINUE
c
30 CONTINUE
c
c
DO 40 I=1, LBUF
c
c Now pack up the stuff in BBUF into packed words of B stream
c
c Move words 1 and 2 into position
TB = BBUF(1,I) + ISHFT(BBUF(2,I), SRB(2))
c
c Move first part of word 3 into position and store 1st word of output
B(1,I+K) = TB + ISHFT(BBUF(3,I), SRB(3))
c
c Move last part of word 3 into position
TB = ISHFT(BBUF(3,I), SLB(3))
c
c Move word 4 into position
TB = TB + ISHFT(BBUF(4,I), SRB(4))
c
c Move first part of word 5 into position and store 2nd word of output
B(2,I+K) = TB + ISHFT(BBUF(5,I), SRB(5))
c
c Move last part of word 5 into position
TB = ISHFT(BBUF(5,I), SLB(5))
c
c Move word 6 into position
TB = TB + ISHFT(BBUF(6,I), SRB(6))
c
c Move first part of word 7 into position and store 3rd word of output
B(3,I+K) = TB + ISHFT(BBUF(7,I), SRB(7))
c
c Move last part of word 7 into position
TB = ISHFT(BBUF(7,I), SLB(7))
c
c Move word 8 into position
TB = TB + ISHFT(BBUF(8,I), SRB(8))
c
c Move first part of word 9 into position and store 4th word of output
B(4,I+K) = TB + ISHFT(BBUF(9,I), SRB(9))
c
c Move last part of word 9 into position
TB = ISHFT(BBUF(9,I), SLB(9))
c
c Move word 10 into position
TB = TB + ISHFT(BBUF(10,I), SRB(10))
c
c Move word 11 into position and store 5th word of output
B(5,I+K) = TB + ISHFT(BBUF(11,I), SRB(11))
c
c
c Now pack up the stuff in CBUF into packed words of C stream
c
c Move word 1 into position
TC = ISHFT(CBUF(1,I), SLC(1))
c
c Move first part of word 2 into position
TC = TC + ISHFT(CBUF(2,I), SRC(2))
c
c Move word 3 into position into position and store 1st word of output
C(1,I+K) = TC + ISHFT(CBUF(3,I), SRC(3))
c
c Move last part of word 3 into position
TC = ISHFT(CBUF(3,I), SLC(3))
c
c Move word 4 into position
TC = TC + ISHFT(CBUF(4,I), SRC(4))
c
c Move first part of word 5 into position and store 2nd word of output
C(2,I+K) = TC + ISHFT(CBUF(5,I), SRC(5))
c
c Move last part of word 5 into position
TC = ISHFT(CBUF(5,I), SLC(5))
c
c Move word 6 into position
TC = TC + ISHFT(CBUF(6,I), SRC(6))
c
c Move first part of word 6 into position and store 3rd word of output
C(3,I+K) = TC + ISHFT(CBUF(7,I), SRC(7))
c
c Move last part of word 7 into position
TC = ISHFT(CBUF(7,I), SLC(7))
c
c Move word 8 into position
TC = TC + ISHFT(CBUF(8,I), SRC(8))
c
c Move first part of word 9 into position and store 4th word of output
C(4,I+K) = TC + ISHFT(CBUF(9,I), SRC(9))
c
c Move last part of word 9 into position
TC = ISHFT(CBUF(9,I), SLC(9))
c
c Move word 10 into position
TC = TC + ISHFT(CBUF(10,I), SRC(10))
c
c Move word 11 into position and store 5th word of output
C(5,I+K) = TC + ISHFT(CBUF(11,I), SRC(11))
c
40 CONTINUE
50 CONTINUE
c
c We have finished generating BLEN=5*1024000 words of BB, CC
c
IF ( (npes .GT. 1) .AND. (BUFNUM .EQ. MYFIRST) ) THEN
c
c First time through, send first 2 words of BB, CC to mype-1
c Then last time through all but npes-1 have the overlap area
c
tag = 21
c
c If npes < numbufs some pe's won't hit this barrier
c
BBCC(1) = BB(1)
BBCC(2) = BB(2)
BBCC(3) = BB(3)
BBCC(4) = CC(1)
BBCC(5) = CC(2)
BBCC(6) = CC(3)
c
call MPI_BARRIER(MPI_COMM_WORLD, ier)
c
IF (MYPE .NE. NPES-1) then
call MPI_RECV( BBCCSAVE, 6, MYMPI_INTEGER8, mype+1,
* tag, MPI_COMM_WORLD, status, ier)
endif
c
IF (MYPE .NE. 0) then
call MPI_SEND( BBCC, 6, MYMPI_INTEGER8, mype-1,
* tag, MPI_COMM_WORLD, ier)
endif
c
call MPI_BARRIER(MPI_COMM_WORLD, ier)
c
ENDIF
c
ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
c
c Basic algorithm for Part 2:
c
c Assume DXORC[i] = C[i] XOR D[i]. Because
c E[i] = C[i] XOR D[i] XOR C[i+37] XOR D[i+37] XOR C[i+100] XOR D[i+100]
c = DXORC[i] XOR DXORC[i+37] XOR DXORC[i+100] ,
c what we really want to compute in Part 2 is DXORC.
c
c DXORC[i] = C[i] XOR D[i]
c = C[i] XOR ( (C[i] ^ B[i+1]) XOR (~C[i] ^ B[i-1]) )
c = ( C[i] ^ ~B[i+1] ) XOR (~C[i] ^ B[i-1])
c The rest is relatively straightforward.
c Shifts are used to align bits within words for logical operations.
c
c Since, in most cases, it is not necessary to compute every bit of
c the E stream (and thus of the DXORC stream) in order to reject the
c hypothesis of NZ consecutive zeros in the E stream (though it will
c be necessary to compute these bits when confirming this hyphothesis)
c only the middle 62 bits of each word of DXORC will be computed at
c this point. This results in a savings of four operations out of
c the 11 needed to compute DXORC completely.
c
ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
c
c upper loop bound is blen = lenbc*11 = bufsize*5/11
c
DLIM = BLEN
c
c special handling if bufnum = mylast - do 2 more words
c --- but not if mype = npes-1
c
IF ( (BUFNUM .EQ. MYLAST) .AND. (MYPE .NE. NPES-1) ) THEN
DLIM = BLEN + 3
BB(BLEN+1) = BBCCSAVE(1)
BB(BLEN+2) = BBCCSAVE(2)
BB(BLEN+3) = BBCCSAVE(3)
CC(BLEN+1) = BBCCSAVE(4)
CC(BLEN+2) = BBCCSAVE(5)
CC(BLEN+3) = BBCCSAVE(6)
c if (mype.eq.0) then
c print '(Z16)', (BB(i), i=blen-3,blen+3)
c print '(Z16)', (CC(i), i=blen-3,blen+3)
c endif
ENDIF
c
c Calculate D and DXORC
c
DO 60 I = -1, DLIM
c
TEMP1 = IAND( CC(I), NOT(ISHFT(BB(I),1)) )
TEMP2 = IAND( NOT(CC(I)), ISHFT(BB(I),-1) )
DXORC(I) = IEOR( TEMP1, TEMP2 )
c
60 CONTINUE
c
c
ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
c
c Basic algorithm for Part 3:
c
c Notice that for wordsize WS, NZ-WS+1 bits starting at any location
c in the word must be zero before there can be NZ zeroes in a row.
c Since E does not need to be saved, some work can be eliminated by only
c computing a portion of each word, looking for all zeroes in that
c portion. The portion should be longer than 10 bits, otherwise too many
c false starts will happen, and shorter than N-WS bits, otherwise some
c sequences might be missed. Once an all zero portion is found, local
c words of DXORC and E are computed in full to determine if there
c are N consecutive zeroes.
c
c This code assumes that NZ, OFF1 and OFF2 are such that at least 10
c bits of check are obtained. It is also assumed that OFF1 and
c OFF2 - WS are both greater than 32. If not, it might be better to
c take the check bits from the left end of the word of E, rather than
c from the right(as is done in the following code). For the given
c values of OFF1 and OFF2, NZ must be at least 74, to provide 10
c bits of check.
c
ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
c
c Test for potential zero stretches. The "non-structured" coding is to
c increase efficiency when the probability of the IF statement is very rare.
c
I = -1
c
100 CONTINUE
c
DO 110 I = I, DLIM - 2
TEMP1 = ISHFT( DXORC(I+1), OFF1-WS )
TEMP2 = ISHFT( DXORC(I+2), OFF3-WS )
TEMP3 = IEOR( TEMP2, IEOR( TEMP1, DXORC(I) ) )
IF ( IAND( TEMP3, MSKIT ) .EQ. 0 )
$ GOTO 120
110 CONTINUE
c
GOTO 150 ! finished the buffer
c
120 CONTINUE
c
c An all-zero portion was found, check to see if NZ zeroes are there.
c Calculate the full values of DXORC for 5 words around this position.
c
c print *, NN, I
DO 130 J = -1, 3
TEMP1 = ISHFT(BB(I+J), 1) + ISHFT(BB(I+J+1), 1-WS)
TEMP2 = ISHFT(BB(I+J),-1) + ISHFT(BB(I+J-1), WS-1)
TDXORC(J) = IEOR( IAND( CC(I+J), NOT(TEMP1) ),
$ IAND( NOT(CC(I+J)), TEMP2 ) )
c print '(2X,Z16)', TDXORC(J)
130 CONTINUE
c
c Now calculate the full E stream for 3 words around this position
c
DO 140 J = -1, 1
TEMP1 = ISHFT(TDXORC(J+1), OFF1-WS) + ISHFT(TDXORC(J), OFF1)
TEMP2 = ISHFT(TDXORC(J+2), OFF3-WS) + ISHFT(TDXORC(J+1), OFF3)
TE(J) = IEOR( TEMP2, IEOR( TEMP1, TDXORC(J) ) )
c print '(Z16)', TE(J)
140 CONTINUE
c
c If the current word is all zeros, get trailing zeroes of previous word
c
IF (TE(0) .EQ. 0) THEN
c
IF (TE(-1) .EQ. 0) THEN
ZEROES = 2*WS
ELSE
ZEROES = 2*WS - (1 + POPCNT(IEOR(TE(-1),-TE(-1))))
ENDIF
c
c Else get trailing zeros of current word
c
ELSE
ZEROES = WS - (1 + POPCNT(IEOR(TE(0), -TE(0))))
ENDIF
c
c Now get start position and add in leading zeros of following word
c
STRT = (BUFNUM-1)*EBITS + I*WS - ZEROES
ZEROES = ZEROES + LEADZ(TE(1))
c
c Store start position and length; will also look for overlapped
c regions of zeros and clean up the results
c
IF (ZEROES .GE. NZ) THEN
c print *, NN, HOLDNN, STRT, ZEROES
c
IF (NN .GT. 0) THEN
CUREND = CURSTART + CURLENGTH
ELSE
CUREND = -1
ENDIF
c
c Check for overlapped regions in results, and collapse
c
c print *, nn, jj, strt
IF (STRT .LE. CUREND) THEN
c
CURLENGTH = STRT + ZEROES - CURSTART
c print *, 'OL', mype, NN, curstart, curlength
IF (NN .LE. MAXANS) LENGTH(NN) = CURLENGTH
c
ELSE
c
c No overlap
c Check that section is not off the end of the actual stream.
IF (STRT .LE. LEND) THEN
NN = NN+1
CURSTART = STRT
CURLENGTH = ZEROES
c print *, mype, NN, curstart, curlength
c
c Check to prevent overwriting the START, LENGTH arrays
IF (NN .LE. MAXANS) THEN
START(NN) = STRT
LENGTH(NN) = ZEROES
ENDIF
ENDIF
c
ENDIF
c
ENDIF
I = I+1
c
IF (I .LE. (DLIM - 2)) GOTO 100
c
150 CONTINUE
c
c Save last NSAVE words for next buffer pass
c
DO I = 1, NSAVE
BSAVE(I) = BB(BLEN+I-NSAVE)
CSAVE(I) = CC(BLEN+I-NSAVE)
ENDDO
c print '(Z16)', BSAVE(1),BSAVE(2),CSAVE(1),CSAVE(2)
c
RETURN
END