/* COPYRIGHT (2011-2012) by: Kevin Marco Erler (author), http://www.kevinerler.de AIU-FSU Jena (co-owner), http://www.astro.uni-jena.de SBSZ Jena-Göschwitz (co-owner), http://www.sbsz-jena.de BSZ-Hermsdorf (co-owner), http://www.bszh.de Advanced Licensing (dual license: COPYRIGHT and following licenses): License (international): CC-BY v3.0-unported or later - link: http://creativecommons.org/licenses/by/3.0/deed.en License (Germany): CC-BY v3.0-DE or later - link: http://creativecommons.org/licenses/by/3.0/de/ ------------------ Compilation requirements: Packages (x86-64): GCC >v4.2, compat. libstdc++ and GOMP v3.0 CUDA->v4.0-supported GPU-driver, compat. CUDA SDK >v4.0 (e.g. for CUPrintf), compat. CUDA Toolkit (nvcc-Compiler and other CUDA-Tools) NOTES: optimized for NVIDIA-GPU-architecture: FERMI! two compile-steps! 1.) <src.cu> ==> <src.cpp> 2.) <src.cpp> ==> <dest> Normal-Compile with nvcc- (for CUDA-GPU-Code) and g++-Compiler (for Host-C/C++-Code) (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested) 1.) nvcc -ccbin /usr/bin/g++ -Xcompiler "-m64 -fopenmp -lstdc++ -lm -lgomp -lcuda -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcuda -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -cuda <src.cu> -o <src.cpp> -v 2.) nvcc -x c++ -ccbin /usr/bin/g++ -Xcompiler "-std=c++0x -m64 -fopenmp -lstdc++ -lm -lgomp -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math <src.cpp> -o <dest> -v Release-Compile with nvcc- (for CUDA-GPU-Code) and g++-Compiler (for Host-C/C++-Code) (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested) 1.) nvcc -ccbin /usr/bin/g++ -Xcompiler "-m64 -fopenmp -lstdc++ -lm -lgomp -lcuda -lcudart -Wall -Wextra -O3" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcuda -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -O3 -cuda <src.cu> -o <src.cpp> -v 2.) nvcc -x c++ -ccbin /usr/bin/g++ -Xcompiler "-std=c++0x -m64 -fopenmp -lstdc++ -lm -lgomp -lcudart -Wall -Wextra -O3" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -O3 <src.cpp> -o <dest> -v Debug-Compile with nvcc- (for CUDA-GPU-Code) and g++-Compiler (for Host-C/C++-Code) (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested) 1.) nvcc -ccbin /usr/bin/g++ -Xcompiler "-m64 -fopenmp -lstdc++ -lm -lgomp -lcuda -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcuda -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -g -G3 -cuda <src.cu> -o <src.cpp> -v 2.) nvcc -x c++ -ccbin /usr/bin/g++ -Xcompiler "-std=c++0x -m64 -fopenmp -lstdc++ -lm -lgomp -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -g -G3 <src.cpp> -o <dest> -v */ // HOST: Includes of C/C++-Librarys for INTs, REAL/FLOATs, STRINGS, Math-Calc and I/O #include <climits> #include <stdint.h> #include <inttypes.h> #include <cfloat> #include <cwchar> #include <string> //std:string #include <string.h> #include <cstring> #include <cstdlib> #include <cstdio> #include <iostream> #include <sstream> #include <cmath> // HOST: Conditional compilation (conditional include) of the OpenMP-Mainlib for OpenMP-Support #ifdef _OPENMP #include <omp.h> #endif // HOST: Include of CUDA-Mainlib, CUDA-Runtimelib and CUDAPrintf-Lib for CUDA-Support #include <cuda.h> #include <cuda_runtime.h> #include <cuda_runtime_api.h> #include <math_functions.h> #include "./cuPrintf/cuPrintf.cu" using namespace std; #define free(x) free(x); *x=NULL //const uint64_t UINT64_MIN = 0; const uint64_t NF = 100000000ULL; // Anzahl Brüche (num fractions): 1000000000ULL // DEVICE: Prototype of the CUDA-kernel declaration (CUDA-kernel = CUDA-C/C++-function) __global__ void PiCuCalc(double *D_PiCu); __host__ int main(int argc, char *argv[]) { // Runtime manipulation of OpenMP-state variables //omp_set_num_threads(4); omp_set_dynamic(0); // data declarations and implementations double starttime = 0.00, sdelay = 0.00, pdelay1 = 0.00, pdelay2 = 0.00; long double PiSingle = 0.000000000000000000, PiParallel = 0.000000000000000000; double h_PiCu[NF] = {0.000000000000000000}, *d_PiCu=NULL; std::cout.precision(20); std::cout << "PiCalc (Leibniz-Reihe) (64-Bit)\n" << "=============================================================================\n" << "CPU-SERIELLE AUSFÜHRUNG:"; //--------------------------Begin: CPU-serial execution of algorithm------------------------------- starttime = omp_get_wtime(); //CPU-serial algorithm: for(uint64_t i = 0ULL;i<NF;++i) { PiSingle+=(pow(-1.000000000000000000,i)/((2.000000000000000000*i)+1.000000000000000000)); } PiSingle*=4.000000000000000000; sdelay = omp_get_wtime()-starttime; std::cout << " done\n"; //serial (CPU) //--------------------------End: CPU-serial execution of algorithm--------------------------------- //--------------------------Begin: CPU-parallel OpenMP-execution of algorithm---------------------- std::cout << "CPU-PARALLELE AUSFÜHRUNG mit "; //long double *Threads = new long double[omp_get_max_threads()]; // create parallel region: shared(Threads,...) if using "Threads"-variable #pragma omp parallel default(none) shared(std::cout, starttime, pdelay1, PiParallel) { #pragma omp master { std::cout << omp_get_num_threads() << " Threads:"; starttime = omp_get_wtime(); } //OpenMP-CPU-parallel algorithm with the Reduction-clause: #pragma omp flush #pragma omp for schedule(static) reduction(+: PiParallel) for(uint64_t k = 0ULL;k<NF;++k) { PiParallel+=(pow(-1.000000000000000000,k)/((2.000000000000000000*k)+1.000000000000000000)); //Threads[omp_get_thread_num()]=PiParallel; //Partial results } #pragma omp master { PiParallel*=4.000000000000000000; //endresult pdelay1 = omp_get_wtime()-starttime; if(omp_get_num_threads() >= 10) { std::cout << " done\n"; //parallel (CPU) } else { std::cout << " done\n"; //parallel (CPU) } /* //Partial results of each CPU-Thread for(int j=0;j<omp_get_num_threads();++j) { if(j<9) { if((Threads[j]*4.000000000000000000)>=0) { std::cout << "Teilergebnis von Thread " << (j+1) << ": " << (Threads[j]*4.000000000000000000) << '\n'; } else { std::cout << "Teilergebnis von Thread " << (j+1) << ":" << (Threads[j]*4.000000000000000000) << '\n'; } } else { if((Threads[j]*4.000000000000000000)>=0) { std::cout << "Teilergebnis von Thread " << (j+1) << ": " << (Threads[j]*4.000000000000000000) << '\n'; } else { std::cout << "Teilergebnis von Thread " << (j+1) << ":" << (Threads[j]*4.000000000000000000) << '\n'; } } } */ } } //--------------------------End: CPU-parallel OpenMP-execution of algorithm------------------------ //++++++++++++++++++++++++++Begin: GPU-parallel CUDA-execution of algorithm++++++++++++++++++++++++ std::cout << "GPU-PARALLELE AUSFÜHRUNG mit 1 GPU u. 14 SM:"; // begin of CUDAPrintf-section //cudaPrintfInit(); // allocate data on GPU memory cudaMalloc((void**)&d_PiCu,NF*sizeof(double)); // copy Host-Data into GPU-Data (memory) cudaMemcpy(d_PiCu,h_PiCu,NF*sizeof(double),cudaMemcpyHostToDevice); starttime = omp_get_wtime(); /* CUDA-kernel call with 112 Blöcke (8 per 1 SM) & 192 Threads per Block = 21504 Threads = 672 Warps (=14 SM = 1 Fermi-GPC/-GPU) */ PiCuCalc<<<112,192>>>(d_PiCu); // Part 1 of algorithm // CPU external GPU-Threads synchronisation cudaDeviceSynchronize(); pdelay2 = omp_get_wtime()-starttime; // CUDAPrintf output of CUDA-kernel call //cudaPrintfDisplay(stdout,true); // end of CUDAPrintf-section //cudaPrintfEnd(); // copy calculated device-data from device memory back into CPU-data (RAM) cudaMemcpy(h_PiCu,d_PiCu,NF*sizeof(double),cudaMemcpyDeviceToHost); // reduce (accumulate) partial data (results of algorithm) to one result: h_PiCu[0] for(uint64_t c=1ULL;c<NF;++c) { h_PiCu[0]+=h_PiCu[c]; } h_PiCu[0]*=4.000000000000000000; // endresult (PI) std::cout << " done\n"; //parallel (GPU) //++++++++++++++++++++++++++End: GPU-parallel CUDA-execution of algorithm++++++++++++++++++++++++++ //--------------------------Analysis of results---------------------------------------------------- std::cout << "\nAuswertung:\n" << "*****************************************************************************\n" << "Anzahl Brüche: " << NF << '\n' << "PI - SERIELL (CPU): " << PiSingle << '\n' << "PI - PARALLEL (CPU): " << PiParallel << '\n' << "PI - PARALLEL (GPU): " << h_PiCu[0] << '\n' << "Referenzwert: 3.141592653589793238462643383279502884197169399375105\n" << "Dauer - SERIELL: " << sdelay << " sec\n" << "Dauer - PARALLEL (CPU) " << pdelay1 << " sec\n" << "Dauer - PARALLEL (GPU): " << pdelay2 << " sec\n"; //delete []Threads; // free allocated data on GPU memory cudaFree(d_PiCu); getchar(); return 0; } // DEVICE: Implementation of CUDA-kernel "PiCuCalc()" __global__ void PiCuCalc(double *D_PiCu) { // Get GPU-Thread-ID by linearization of the Thread-ID in the GPU-(Grid-and-Block)-construct (Grid- & Block locale) int tid = threadIdx.x + blockIdx.x*blockDim.x; /* algorithm (Part 2) NOTE: normal algorithm ist SUM(powf(-1,k)/2*k+1)) ; k=0 to k=NF, but: powf(>-1<,k)-(CUDA)-function of the current CUDA-Version credited incorrectly with negative value "-1", therefore, manual determination with tmp-variable: */ __shared__ int64_t tmp; while(tid<NF) // iterate through all the data in GPU-memory - PART 1 { if(tid%2 == 0) { tmp = (1.000000000000000000); } else { tmp = (-1.000000000000000000); } D_PiCu[tid]+=(tmp/((2.000000000000000000*((double)tid))+1.000000000000000000)); //D_PiCu[tid]+=(powf((double)(-1.000000000000000000),(double)tid)/((2.000000000000000000*((double)tid))+1.000000000000000000)); // CUDAPrintf call //cuPrintf("%f\n",D_PiCu[tid]); // block locale GPU-Threads synchronization __syncthreads(); // iterate through all the data in GPU-memory - PART 2 tid+=(blockDim.x*gridDim.x); } }