/*-----------------------------------------------------------------------*/
/* Program: VStream */
/* Revision: $Id: istream.c,v 1.0 2007/02/19 23:57:39 mccalpin Exp mccalpin $ */
/* Original code developed by John D. McCalpin */
/* Programmers: John D. McCalpin */
/* Joe R. Zagar */
/* Steve Poole */
/* */
/* This program measures memory transfer rates in MB/s for simple */
/* computational kernels coded in C. */
/*-----------------------------------------------------------------------*/
/* Copyright 1991-2005: John D. McCalpin */
/*-----------------------------------------------------------------------*/
/* */
/* This is a hack of the cstream program by John */
/* */
/* License: */
/* 1. You are free to use this program and/or to redistribute */
/* this program. */
/* 2. You are free to modify this program for your own use, */
/* including commercial use, subject to the publication */
/* restrictions in item 3. */
/* 3. You are free to publish results obtained from running this */
/* program, or from works that you derive from this program, */
/* with the following limitations: */
/* 3a. In order to be referred to as "STREAM benchmark results", */
/* published results must be in conformance to the STREAM */
/* Run Rules, (briefly reviewed below) published at */
/* http://www.cs.virginia.edu/stream/ref.html */
/* and incorporated herein by reference. */
/* As the copyright holder, John McCalpin retains the */
/* right to determine conformity with the Run Rules. */
/* 3b. Results based on modified source code or on runs not in */
/* accordance with the STREAM Run Rules must be clearly */
/* labelled whenever they are published. Examples of */
/* proper labelling include: */
/* "tuned STREAM benchmark results" */
/* "based on a variant of the STREAM benchmark code" */
/* Other comparable, clear and reasonable labelling is */
/* acceptable. */
/* 3c. Submission of results to the STREAM benchmark web site */
/* is encouraged, but not required. */
/* 4. Use of this program or creation of derived works based on this */
/* program constitutes acceptance of these licensing restrictions. */
/* 5. Absolutely no warranty is expressed or implied. */
/* */
/*-----------------------------------------------------------------------*/
/* */

/*
 * NOTE(review): this copy of the file appears to have lost text in several
 * places (each is marked with its own NOTE(review) below).  The damaged
 * spots are reproduced exactly as found and are not valid C; restore them
 * from the original file before building.
 */

/*
 * NOTE(review): the header names are missing from all seven directives
 * below.  The visible code uses printf/fprintf, getopt/opterr/optopt,
 * isprint, abort, FLT_MAX and gettimeofday, so presumably the originals
 * covered those -- TODO confirm against the original file.
 */
# include
# include
# include
# include
# include
# include
# include

/* INSTRUCTIONS:
 *
 * 1) Stream requires a good bit of memory to run. Adjust the
 *    value of 'N' (below) to give a 'timing calibration' of
 *    at least 20 clock-ticks. This will provide rate estimates
 *    that should be good to about 5% precision.
 */

/*
** These are defined for the "static" allocations
** Default is R*8
*/
/*
 * Element-type selection.  Exactly one of IBY4, IBY8, RBY4 or RBY8 is
 * expected to be defined at compile time; it picks the array element type
 * (MY_DATA_SIZE), its printable name (MY_DATA_TYPE) and typed constants
 * 0..3.  The two integer variants also define MY_OP as the shift operator
 * and set NUM_TESTS to 5; the two floating-point variants set it to 4.
 *
 * NOTE(review): there is no #else branch, so despite the "Default is R*8"
 * remark above, building with none of the four macros defined leaves
 * MY_DATA_SIZE and the other names undefined.
 */
#if defined (IBY4)
# define MY_MAX_N 2000000
# define NTIMES 10
# define OFFSET 0
# define NUM_TESTS 5
# define MY_DATA_SIZE long
# define MY_DATA_TYPE "long"
# define MY_OP <<
# define MY_ZERO 0L
# define MY_ONE 1L
# define MY_TWO 2L
# define MY_THREE 3L
#elif defined (IBY8)
# define MY_MAX_N 2000000
# define NTIMES 10
# define OFFSET 0
# define NUM_TESTS 5
# define MY_DATA_SIZE long long
# define MY_DATA_TYPE "long long"
# define MY_OP <<
# define MY_ZERO 0LL
# define MY_ONE 1LL
# define MY_TWO 2LL
# define MY_THREE 3LL
#elif defined (RBY4)
# define MY_MAX_N 2000000
# define NTIMES 10
# define OFFSET 0
# define NUM_TESTS 4
# define MY_DATA_SIZE float
# define MY_DATA_TYPE "float"
# define MY_ZERO 0.0e0
# define MY_ONE 1.0e0
# define MY_TWO 2.0e0
# define MY_THREE 3.0e0
#elif defined (RBY8)
# define MY_MAX_N 2000000
# define NTIMES 10
# define OFFSET 0
# define NUM_TESTS 4
# define MY_DATA_SIZE double
# define MY_DATA_TYPE "double"
# define MY_ZERO 0.0e0
# define MY_ONE 1.0e0
# define MY_TWO 2.0e0
# define MY_THREE 3.0e0
#endif

/*
 * 3) Compile the code with full optimization. Many compilers
 *    generate unreasonably bad code before the optimizer tightens
 *    things up. If the results are unreasonably good, on the
 *    other hand, the optimizer might be too smart for me!
 *
 *    Try compiling with:
 *        cc -O stream_omp.c -o stream_omp
 *
 *    This is known to work on Cray, SGI, IBM, and Sun machines.
 *
 *
 * 4) Mail the results to mccalpin@cs.virginia.edu and
 *    benchmarks@ornl.gov
 *    Be sure to include:
 *    a) computer hardware model number and software revision
 *    b) compiler used (-v or -version works on many compilers)
 *    c) the compiler flags
 *    d) if possible, as much info on the system as can be obtained.
 *       On most Linux boxes, /proc/meminfo and /proc/cpuinfo will give
 *       very useful information. hwinfo is also useful, if available.
 *       On a Mac, you can use "system_profiler >& my_info" and include
 *       that information in the mail.
 *    e) all of the output from the test case(s).
 *    Thanks!
 *
 */

/* Separator line printed between sections of the report. */
# define HLINE "-------------------------------------------------------------\n"

/* Classic min/max macros; each argument may be evaluated twice. */
# ifndef MIN
# define MIN(x,y) ((x)<(y)?(x):(y))
# endif
# ifndef MAX
# define MAX(x,y) ((x)>(y)?(x):(y))
# endif

/* The five statically allocated work arrays, MY_MAX_N + OFFSET elements each. */
static MY_DATA_SIZE a[ MY_MAX_N + OFFSET ],
                    b[ MY_MAX_N + OFFSET ],
                    c[ MY_MAX_N + OFFSET ],
                    d[ MY_MAX_N + OFFSET ],
                    e[ MY_MAX_N + OFFSET ];

/*
 * Per-test timing statistics.
 * NOTE(review): avgtime and mintime are sized 5 while maxtime is sized
 * NUM_TESTS (4 for the floating-point variants) -- confirm that the
 * reporting code never indexes maxtime[4] in those builds.
 */
static double avgtime[5] = {0},
              maxtime[NUM_TESTS] = {0},
              mintime[5] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};

/* Row labels for the results table, one per test. */
static char *label[5] = {"Copy: ", "Scale: ", "Add: ", "Triad: ", "SScale "};

/*
 * Bytes moved by one pass of each test: two arrays' worth for the first,
 * second and fifth entries, three arrays' worth for the third and fourth.
 * NOTE(review): the table is declared with the benchmark element type, so
 * in a float build these byte counts are held as float -- verify that is
 * intended.
 */
static MY_DATA_SIZE bytes[5] = {
    2 * sizeof(MY_DATA_SIZE) * MY_MAX_N,
    2 * sizeof(MY_DATA_SIZE) * MY_MAX_N,
    3 * sizeof(MY_DATA_SIZE) * MY_MAX_N,
    3 * sizeof(MY_DATA_SIZE) * MY_MAX_N,
    2 * sizeof(MY_DATA_SIZE) * MY_MAX_N
    };

/* Helpers declared here; checktick's definition is not present in this copy. */
extern double mysecond();
extern void checkSTREAMresults();
extern int checktick();

/* Number of array elements used by the loops; initialised to the static array length. */
long N=MY_MAX_N;

#ifdef TUNED
/* Kernel variants that are only declared when TUNED is defined. */
extern void tuned_STREAM_Copy();
extern void tuned_STREAM_Add();
extern void tuned_STREAM_Scale(MY_DATA_SIZE scalar);
extern void tuned_STREAM_SScale(MY_DATA_SIZE scalar);
extern void tuned_STREAM_Triad(MY_DATA_SIZE scalar);
#endif
#ifdef _OPENMP
extern int omp_get_num_threads();
#endif

/*
 * Program entry point: prints a banner, parses the command line, reports
 * the element type and array size, estimates how long one pass over an
 * array takes, and then (in text that is missing from this copy) runs the
 * timed kernels.
 *
 * Returns 1 when an option is rejected by getopt; the normal return path
 * is in the missing text.
 */
int
main(int argc, char** argv)
    {
    /* int quantum, checktick(); */
    int quantum;
    int BytesPerWord;
    /* NOTE(review): cvalue, index and rscalar are never used in the visible part of main. */
    char *cvalue = NULL;
    int aflag = 0;
    int bflag = 0;
    int index , g;
    register int j, k;
    register MY_DATA_SIZE scalar;
    double rscalar, t, times[NUM_TESTS][NTIMES];

    /* --- SETUP --- determine precision and check timing --- */

    printf(HLINE);
    printf("VSTREAM version $Revision: 1.0 $\n");
    printf(HLINE);

    /*
    ** Get the options
    */
    /* opterr = 0 silences getopt's own diagnostics; the '?' case prints ours. */
    opterr = 0;
    /*
     * NOTE(review): in "se:" the colon marks -e as taking an argument, yet
     * the 'e' case never reads optarg.  aflag and bflag are set here but
     * are not read anywhere in the visible code.
     */
    while ((g = getopt (argc, argv, "se:")) != -1)
        switch (g)
            {
            case 's':
                aflag = 1;
                break;
            case 'e':
                bflag = 1;
                break;
            case '?':
                if (isprint (optopt))
                    fprintf (stderr, "Unknown option `-%c'.\n", optopt);
                else
                    fprintf (stderr,
                             "Unknown option character `\\x%x'.\n",
                             optopt);
                /* Both branches above fall through to this return. */
                return 1;
            default:
                abort ();
            }
    /*
    **
    */
    BytesPerWord = sizeof(MY_DATA_SIZE);
    printf("The data type is %s\n",MY_DATA_TYPE);
    printf("This system uses %d bytes per Element being tested.\n",
        BytesPerWord);

    printf(HLINE);
    /* NOTE(review): N is a long but is printed with %d -- format/argument mismatch. */
    printf("Array size = %d, Offset = %d\n" , N, OFFSET);
    /* NOTE(review): the estimate multiplies by 3.0 although five arrays (a..e) are declared. */
    printf("Total memory required = %.1f MB.\n",
        (3.0 * BytesPerWord) * ( (float) N / 1048576.0));
    printf("Each test is run %d times, but only\n", NTIMES);
    printf("the *best* time for each is used.\n");

#ifdef _OPENMP
    printf(HLINE);
    /* Only the master thread queries and reports the team size. */
#pragma omp parallel
    {
#pragma omp master
        {
            k = omp_get_num_threads();
            printf ("OpenMP invoked: This is the MASTER Routine\n");
            printf ("Number of Threads requested = %i\n",k);
        }
    }
#endif

    printf(HLINE);
    /* Without OpenMP the pragma is ignored and the line is printed once. */
#pragma omp parallel
    {
    printf ("Printing one line per active thread....\n");
    }

    /* Get initial value for system clock. */
    /*
     * NOTE(review): text is missing between "j" and "= 1)" on the next
     * line.  Presumably it held the rest of the array-initialisation loop,
     * the assignment of quantum (checktick() is declared above) and the
     * start of an "if (... >= 1)" test -- TODO restore from the original.
     */
#pragma omp parallel for
    for (j=0; j= 1)
        printf("Your clock granularity/precision appears to be "
            "%d microseconds.\n", quantum);
    else
        {
        printf("Your clock granularity appears to be "
            "less than one microsecond.\n");
        /* Treat a sub-microsecond tick as 1 so the division below is safe. */
        quantum = 1;
        }

    /* Time one pass that doubles a[] to estimate the cost of a single test. */
    t = mysecond();
#pragma omp parallel for
    for (j = 0; j < N; j++)
        a[j] = MY_TWO * a[j];
    t = 1.0E6 * (mysecond() - t);

    /* NOTE(review): the next two calls pass a long to a %d conversion. */
    printf("Each test below will take on the order"
        " of %d microseconds.\n", (long) t );
    printf(" (= %d clock ticks)\n", (long) (t/quantum) );
    printf("Increase the size of the arrays if this shows that\n");
    printf("you are not getting at least 20 clock ticks per test.\n");

    printf(HLINE);

    printf("WARNING -- The above is only a rough guideline.\n");
    printf("For best results, please be sure you know the\n");
    printf("precision of your system timer.\n");
    printf(HLINE);

    /*
    **
    ** --- MAIN LOOP --- repeat test cases NTIMES times ---
    **
    */

    scalar = MY_THREE;
    /*
     * NOTE(review): everything from the loop condition below up to the
     * return type of mysecond() is missing from this copy.  Presumably
     * that covered the timed kernels, the results summary, the end of
     * main, the checktick() definition and an include for the timing
     * header -- TODO restore from the original.
     */
    for (k=0; k

/*
 * Returns the current time of day in seconds as a double: the seconds
 * field plus the microseconds field scaled by 1e-6, both as filled in by
 * gettimeofday().
 *
 * NOTE(review): the gettimeofday() return value is stored in i but never
 * checked.
 */
double mysecond()
{
        struct timeval tp;
        struct timezone tzp;
        int i;

        i = gettimeofday(&tp,&tzp);
        return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
}

/*
 * Validates the benchmark arrays: recomputes in scalar form the values the
 * arrays should hold, then compares them with observed values using a
 * relative tolerance of 1e-8.  Prints a failure message for the first
 * array that is out of tolerance, or "Solution Validates" if none is.
 */
void checkSTREAMresults ()
{
        double aj,bj,cj,scalar;
        double asum,bsum,csum;
        double epsilon;
        int     j,k;

    /* reproduce initialization */
        aj = (double) MY_ONE;
        bj = (double) MY_TWO;
        cj = (double) MY_ZERO;
    /* a[] is modified during timing check */
        aj = (double) MY_TWO * aj;
    /* now execute timing loop */
        scalar = (double) MY_THREE;
        /*
         * NOTE(review): text is missing between "k" and "= 0 ? (a) ..." on
         * the next line.  Presumably it held the replay of the kernels on
         * aj/bj/cj, the computation of asum/bsum/csum, and the opening of a
         * conditional definition of an abs(a) macro whose tail and #endif
         * are all that remain -- TODO restore from the original.
         */
        for (k=0; k= 0 ? (a) : -(a))
#endif
        epsilon = 1.e-8;

        /*
         * The else-if chain reports only the first failing array.
         * NOTE(review): abs is expected to be the macro whose tail is
         * visible above; if it resolved to the integer abs() instead, the
         * differences would be truncated -- confirm.
         */
        if (abs(aj-asum)/asum > epsilon) {
                printf ("Failed Validation on array a[]\n");
                printf (" Expected : %f \n",aj);
                printf (" Observed : %f \n",asum);
        }
        else if (abs(bj-bsum)/bsum > epsilon) {
                printf ("Failed Validation on array b[]\n");
                printf (" Expected : %f \n",bj);
                printf (" Observed : %f \n",bsum);
        }
        else if (abs(cj-csum)/csum > epsilon) {
                printf ("Failed Validation on array c[]\n");
                printf (" Expected : %f \n",cj);
                printf (" Observed : %f \n",csum);
        }
        else {
                printf ("Solution Validates\n");
        }
}

/*
 * OpenMP-parallel copy kernel.
 * NOTE(review): the definition is cut off after the loop initialiser in
 * this copy; the loop bound, the loop body and any following functions
 * are not visible.
 */
void tuned_STREAM_Copy()
{
        int j;
#pragma omp parallel for
        for (j=0; j