/*************************************************************************** * * Benchmark #11 -- Multiple Precision Arithmetic * * Given two NxN matrices with entries in the ring of integers modulo * some number MODULUS, compute the product of the matrices. * * Default Parameters: N = 225 MODULUS = (7**183) - 6 * **************************************************************************** * * Main Program for Benchmark #11 * * Call time parameters: * * -v = Verbose mode, invokes printing of intermediate information. * -V = Very verbose mode. * * N = The size of the matrices * Min allowed = 4 * Max allowed = 1024 * Default = 225 * * MODULUS = The modulus * Max allowed = 1024 bits * Default = 7^183-6 (514 bits) * A "-" indicates reading the modulus from standard-in. * * Usage: * bench11 32 123456789 (32x32 matrices modulo 123456789) * bench11 N MODULUS (NxN matrices modulo MODULUS) * bench11 N (use the default value for MODULUS) * bench11 N 0 (use the default value for MODULUS) * bench11 0 0 (use the default values for N, MODULUS) * bench11 (use the default values for N, MODULUS) * * bench11 N i (i=1,2,...,10) uses the modulus Mi from the list below * * echo "2^512-1" | bc | bench11 100 - (100x100 matrices mod 2^512-1) * **************************************************************************** * * The check routine is designed to verify the results for any * combination of N and MODULUS from the following list: * * N = 5, 25, 100, 225, 256, 512, 640, 1024 * * MODULUS: * M1 = 7^23-6 (65 bits) * M2 = 2^89-1 (89 bits) * M3 = 2^127-1 (127 bits) * M4 = 3^121-2 (192 bits) * M5 = 9^81-8 (257 bits) * M6 = 5^166-4 (386 bits) * M7 = 2^512-1 (512 bits) * M8 = 7^183-6 (514 bits) (the default) * M9 = 3^477-2 (757 bits) * M10 = 5^430-4 (999 bits) * * Other combinations cannot be verified. * **************************************************************************** * * NOTE: * This version for non-shared-memory systems uses Fox`s algorithm for * matrix multiplication. It assumes that: * (1) the number of PEs is a square Q*Q * (2) if N, the size of the matrix data, is NOT divisible by Q * then there is room to use the next larger size div. by Q. * **************************************************************************** */ /* CVS info */ /* $Date: 2005/09/19 18:48:18 $ */ /* $Revision: 1.3 $ */ /* $RCSfile: m11.c,v $ */ /* $Name: rel_5 $ */ /* global variables - set here, extern in other modules */ int verbose = 0, debug = 0, mype = 0, npes = 1; /* This includes an architecture-specific .h file */ #include "bench11.h" #if defined CRAY # include # include #elif defined __alpha # include # include # include #endif #ifdef _CRAYMPP long _num_pes(void); #else int _num_pes(void); #endif int _my_pe(void); long pSync[_SHMEM_REDUCE_SYNC_SIZE]; static char cvs_info[] = "BMARKGRP $Date: 2005/09/19 18:48:18 $ $Revision: 1.3 $ $RCSfile: m11.c,v $ $Name: rel_5 $"; /* on Cray, need to convert fcd returned for modulus into C string */ #ifdef _MPUTIL # include # ifdef _CRAYMPP # include # endif #endif /*----------------------------------------------------------------------------*/ void main ( int argc, char ** argv ) { /* Arrays to hold the matrices */ /* We will malloc them to be the size needed by each PE - m x m x numsize */ mplong * a, * b, * c; /* An array to hold the modulus */ mplong modulus[maxmodsize]; /* A 2-dim array with the 10 checkable moduli, defined and initialized according to number of bits per MP word */ #if BITSPERWORD == 23 static mplong defmod[24] = { 23, 2345073, 6268386, 7122414, 967733, 7782757, 4336732, 7036344, 5860704, 2506226, 8003024, 6255620, 7518812, 4150431, 729597, 25318, 660986, 8294841, 4294177, 4269410, 381337, 7312422, 5587860, 214 }; #endif #if BITSPERWORD == 32 /* Data for a machine with 32 bits per multiple precision word */ static mplong defmods[10][33] = { { 3, 0X30F0A771, 0X7BD152B3, 1 }, { 3, 0XFFFFFFFF, 0XFFFFFFFF, 0X01FFFFFF }, { 4, 0XFFFFFFFF, 0XFFFFFFFF, 0XFFFFFFFF, 0X7FFFFFFF }, { 6, 0X12CACD21, 0X22D2F6C3, 0X92AB1311, 0X9D135AC8, 0XCD3D4156, 0XDBDCFD23 }, { 9, 0XF8DB7C81, 0X786C0065, 0X95D05DC0, 0X5CC1C941, 0XA7987CBA, 0XD6FD8182, 0X278B4D09, 0XB2B6F77A, 1 }, { 13, 0X5BB45685, 0X2F03B89A, 0X0C238E65, 0XD0979D30, 0X7D1B506D, 0X334B46E8, 0XF63B8E74, 0XDC82E031, 0X6842B9A8, 0XBD586A9E, 0X80152BA6, 0XB69CB330, 2 }, { 16, 0XFFFFFFFF, 0XFFFFFFFF, 0XFFFFFFFF, 0XFFFFFFFF, 0XFFFFFFFF, 0XFFFFFFFF, 0XFFFFFFFF, 0XFFFFFFFF, 0XFFFFFFFF, 0XFFFFFFFF, 0XFFFFFFFF, 0XFFFFFFFF, 0XFFFFFFFF, 0XFFFFFFFF, 0XFFFFFFFF, 0XFFFFFFFF }, { 17, 0XF123C871, 0X2B7BAFD2, 0X51D886BB, 0X62E76C16, 0XAD76E211, 0XF2B2DAC1, 0X0EE8263D, 0X97DD013D, 0X49FE574B, 0X590FEBF5, 0XF4018B98, 0X91B9142B, 0XA0C310FE, 0X33304958, 0XF94260BA, 0X5AAA1CA6, 3 }, { 24, 0XBFB72E91, 0X3D890F36, 0XA692A7BB, 0XFCA7523A, 0XBD1C5E4D, 0X88CEDE98, 0X19C47C10, 0X302879A8, 0X98D73E3D, 0X1C54BA18, 0X7EFE1607, 0X5D42F516, 0X5290AE47, 0X4D35BA75, 0XC2A13902, 0XF567F6AB, 0X5C5D0361, 0X66153C40, 0XF9DC5FB4, 0X88E1811F, 0X7923A048, 0XDDCAF0C6, 0X6CCF51C0, 0X104DB4 }, { 32, 0X22E30765, 0XC81109B4, 0XC07755A8, 0XBAE3EC77, 0XAB13676F, 0X4E263E53, 0X3D1E4D7B, 0XD2E5D070, 0X5E140CBB, 0X79C1E1FB, 0X361F35F8, 0X826C9A87, 0X53A3D469, 0XDA1DFDE4, 0XFFCDA2F1, 0X9BC1690A, 0XD6468B5D, 0X856A8777, 0X79E33E50, 0X83412B3C, 0X3391E087, 0XD6C198C0, 0X04D38D4C, 0X34AF758C, 0XB09D292C, 0XE3E3BECA, 0X132CF817, 0X8BF9D3BB, 0X837B535A, 0X1C6AFDFA, 0X2B040960, 0X56 } }; #endif #if BITSPERWORD == 64 /* Data for a machine with 64 bits per multiple precision word */ static mplong defmods[10][17] = { { 2, 0X7BD152B330F0A771, 1 }, { 2, 0XFFFFFFFFFFFFFFFF, 0X01FFFFFF }, { 2, 0XFFFFFFFFFFFFFFFF, 0X7FFFFFFFFFFFFFFF }, { 3, 0X22D2F6C312CACD21, 0X9D135AC892AB1311, 0XDBDCFD23CD3D4156 }, { 5, 0X786C0065F8DB7C81, 0X5CC1C94195D05DC0, 0XD6FD8182A7987CBA, 0XB2B6F77A278B4D09, 1 }, { 7, 0X2F03B89A5BB45685, 0XD0979D300C238E65, 0X334B46E87D1B506D, 0XDC82E031F63B8E74, 0XBD586A9E6842B9A8, 0XB69CB33080152BA6, 2 }, { 8, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF }, { 9, 0X2B7BAFD2F123C871, 0X62E76C1651D886BB, 0XF2B2DAC1AD76E211, 0X97DD013D0EE8263D, 0X590FEBF549FE574B, 0X91B9142BF4018B98, 0X33304958A0C310FE, 0X5AAA1CA6F94260BA, 3 }, { 12, 0X3D890F36BFB72E91, 0XFCA7523AA692A7BB, 0X88CEDE98BD1C5E4D, 0X302879A819C47C10, 0X1C54BA1898D73E3D, 0X5D42F5167EFE1607, 0X4D35BA755290AE47, 0XF567F6ABC2A13902, 0X66153C405C5D0361, 0X88E1811FF9DC5FB4, 0XDDCAF0C67923A048, 0X00104DB46CCF51C0 }, { 16, 0XC81109B422E30765, 0XBAE3EC77C07755A8, 0X4E263E53AB13676F, 0XD2E5D0703D1E4D7B, 0X79C1E1FB5E140CBB, 0X826C9A87361F35F8, 0XDA1DFDE453A3D469, 0X9BC1690AFFCDA2F1, 0X856A8777D6468B5D, 0X83412B3C79E33E50, 0XD6C198C03391E087, 0X34AF758C04D38D4C, 0XE3E3BECAB09D292C, 0X8BF9D3BB132CF817, 0X1C6AFDFA837B535A, 0X000000562B040960 } }; #endif // user selects one of above moduli by using 1,2...,10 as 2nd arg. int defmodnum; // Variables for timing purposes double cputime(), wall(), x0, y0; double cset, wset, crun, wrun, ccheck, wcheck; // Other variables mplong checksum; int n, m, numsize, modbits, modsize; int q = 1; int o, argind = 1, i, err = 0; double sqrt (); #ifdef _MPUTIL char tmpmod[310]; int tmpmodlen; # ifdef _CRAYMPP _fcd FCD; # endif #endif // This should come before any other executed code #if (defined(__alpha) && !defined(CRAY)) shmem_init(); #endif npes = _num_pes(); mype = _my_pe(); q = sqrt ( npes + 0.5 ); if ( q*q != npes ) { printf( "Number of PE's must be a square\n" ); exit(0); } /* Read the command line arguments */ if (argc > 1) { if (argv[1][0] == '-') { if (argv[1][1] == 'v') verbose = 1; if (argv[1][1] == 'V') verbose = 2; argind = 2; } } #ifdef _MPUTIL /* pass flags to Fortran MPUTIL */ mputil(&verbose, &debug); #endif /* Read the dimension of the matrices, check in range */ if (argc > argind) n = atoi(argv[argind]); else n = 0; if (n <= 0) n = 225; /* default */ if (n < 4) { printf("Minimum allowed value for N is 4, N = %d \n", n); exit(0); } if (n > maxmatsize) { printf ( "Maximum allowed value for N is %d, N = %d \n", maxmatsize, n ); exit(0); } /* calculate m, size of subblocks of A, B, C on each PE */ if (q == 1) { m = n; } else { // If Q doesn't divide N, round up: // so the first Q-1 PEs in each row/col have full m x m data // and the PEs on the right and bottom edges // have a border of zeros after the data runs out m = (n+q-1) / q; } if (m > maxsubmatsize) { printf ( "Maximum submatrix size M is %d, M = %d \n", maxsubmatsize, m ); exit(0); } /* Prepare the structure for the modulus */ MPINIT ( modulus, maxmodsize-MPHL, 0 ); /* Read the modulus */ if (argc > argind+1) { if (argv[argind+1][0] == '-') { if (verbose) printf( "Reading MODULUS from stdin: \n" ); mpread(modulus); } else { #ifdef _MPUTIL /* must pass Fortran MPFROMSTRING a string ending with a blank */ memset(tmpmod, 0, 310); strcpy(tmpmod, argv[argind+1]); tmpmodlen = (int)strlen(tmpmod); tmpmod[tmpmodlen] = ' '; tmpmod[tmpmodlen+1] = 0; # if defined _CRAY1 || _CRAYMPP /* Cray (except X1) needs Fortran Char Descriptor (FCD) */ FCD = _cptofcd( tmpmod, tmpmodlen+1 ); mpfromstring(modulus, FCD); # else /* Cray X1 and others can just get C string */ mpfromstring(modulus, tmpmod); # endif #else /* call GMP's mpz_set_str with usual C string */ if (verbose) printf( "call gmp on %s\n", argv[argind+1]); mpfromstring(modulus, argv[argind+1]); if (verbose) printf( "called gmp's mpfrfomstring\n"); #endif } } if (verbose) printf("%lx %lx %ld\n", modulus,modulus[1],modulus[2]); modsize = MPGETSIZE ( modulus ); /* Check for default modulus or one of the 10 checkable moduli */ defmodnum = 0; if (modsize == 0) { defmodnum = 8; } else if ( (modsize == 1) && (modulus[MPDATA] <= 10) ) { defmodnum = modulus[MPDATA]; if (defmodnum == 0) defmodnum = 8; } if (defmodnum != 0) { modsize = defmods[defmodnum-1][0]; for (i = 0; i < modsize; i++ ) modulus[MPDATA+i] = defmods[defmodnum-1][i+1]; } numsize = modsize + MPHL; if (verbose) printf("modsize %d\n", modsize); // set modulus to actual size MPINIT ( modulus, modsize, modsize ); /* Calculate number of bits in the modulus; start with highest limb */ modbits = WORDLEN - leadz( modulus[numsize-1] ); /* Now add total bits in lower-order limbs */ modbits = modbits + BITSPERWORD * (modsize-1); if (verbose) { printf( "Using a modulus of: \n" ); mpwrite( modulus ); printf( " of %d bits \n", modbits ); printf( "We are multiplying two %d x %d matrices \n", n, n ); } /* now we're ready to call s11, p11, c11, r11 to do the work */ /* s11 builds the "random" matrices A and B */ x0 = cputime(); y0 = wall(); /* malloc the matrices A and B */ a = (mplong *) shmalloc ( 8 * m * m * numsize ); b = (mplong *) shmalloc ( 8 * m * m * numsize ); if (verbose) printf( "%d Calling subroutine S11 \n", mype ); s11 ( a, b, n, q, m, numsize, modbits ); if (verbose) printf( "%d Returned from subroutine S11 \n", mype ); /* wait till all PEs finish initializing their subblocks of A, B */ for (i=0; i < _SHMEM_COLLECT_SYNC_SIZE; i++) pSync[i] = _SHMEM_SYNC_VALUE; shmem_barrier_all(); /* malloc the matrix C used from now on */ c = (mplong *) shmalloc ( 8 * m * m * numsize ); cset = cputime() - x0; wset = wall() - y0; /* p11 does all the real work */ x0 = cputime(); y0 = wall(); if (verbose) printf( "%d Calling subroutine P11 \n", mype ); p11 ( a, b, c, modulus, q, m, numsize ); if (verbose) printf( "%d Returned from subroutine P11 \n", mype ); crun = cputime() - x0; wrun = wall() - y0; shfree ( a ); shfree ( b ); /* c11 checks the results */ x0 = cputime(); y0 = wall(); if (verbose) printf( "%d Calling subroutine C11 \n", mype ); err = c11 ( c, modulus, &checksum, n, m, numsize, modbits ); if (verbose) printf( "%d Returned from subroutine C11 %d \n", mype, err ); ccheck = cputime() - x0; wcheck = wall() - y0; /* r11 outputs the results */ if (verbose) printf( "%d Calling subroutine R11 \n", mype ); r11 ( c, modulus, checksum, n, q, m, numsize, err, cset, wset, crun, wrun, ccheck, wcheck ); if (verbose) printf( "%d Returned from subroutine R11 \n", mype ); shmem_barrier_all(); shfree ( c ); }