/* COPYRIGHT (2011-2012) by: Kevin Marco Erler (author), http://www.kevinerler.de AIU-FSU Jena (co-owner), http://www.astro.uni-jena.de SBSZ Jena-Göschwitz (co-owner), http://www.sbsz-jena.de BSZ-Hermsdorf (co-owner), http://www.bszh.de Advanced Licensing (dual license: COPYRIGHT and following licenses): License (international): CC-BY v3.0-unported or later - link: http://creativecommons.org/licenses/by/3.0/deed.en License (Germany): CC-BY v3.0-DE or later - link: http://creativecommons.org/licenses/by/3.0/de/ ------------------ Compilation requirements: Packages (x86-64): GCC >v4.2, compat. libstdc++ and GOMP v3.0 CUDA->v4.0-supported GPU-driver, compat. CUDA SDK >v4.0 (e.g. for CUPrintf), compat. CUDA Toolkit (nvcc-Compiler and other CUDA-Tools) NOTES: optimized for NVIDIA-GPU-architecture: FERMI! two compile-steps! 1.) <src.cu> ==> <src.cpp> 2.) <src.cpp> ==> <dest> Normal-Compile with nvcc- (for CUDA-GPU-Code) and g++-Compiler (for Host-C/C++-Code) (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested) 1.) nvcc -ccbin /usr/bin/g++ -Xcompiler "-m64 -fopenmp -lstdc++ -lm -lgomp -lcuda -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcuda -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -cuda <src.cu> -o <src.cpp> -v 2.) nvcc -x c++ -ccbin /usr/bin/g++ -Xcompiler "-std=c++0x -m64 -fopenmp -lstdc++ -lm -lgomp -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math <src.cpp> -o <dest> -v Release-Compile with nvcc- (for CUDA-GPU-Code) and g++-Compiler (for Host-C/C++-Code) (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested) 1.) nvcc -ccbin /usr/bin/g++ -Xcompiler "-m64 -fopenmp -lstdc++ -lm -lgomp -lcuda -lcudart -Wall -Wextra -O3" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcuda -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -O3 -cuda <src.cu> -o <src.cpp> -v 2.) nvcc -x c++ -ccbin /usr/bin/g++ -Xcompiler "-std=c++0x -m64 -fopenmp -lstdc++ -lm -lgomp -lcudart -Wall -Wextra -O3" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -O3 <src.cpp> -o <dest> -v Debug-Compile with nvcc- (for CUDA-GPU-Code) and g++-Compiler (for Host-C/C++-Code) (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested) 1.) nvcc -ccbin /usr/bin/g++ -Xcompiler "-m64 -fopenmp -lstdc++ -lm -lgomp -lcuda -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcuda -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -g -G3 -cuda <src.cu> -o <src.cpp> -v 2.) nvcc -x c++ -ccbin /usr/bin/g++ -Xcompiler "-std=c++0x -m64 -fopenmp -lstdc++ -lm -lgomp -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -g -G3 <src.cpp> -o <dest> -v */ // HOST: Includes of C/C++-Librarys for INTs, REAL/FLOATs, STRINGS, Math-Calc and I/O #include <climits> #include <stdint.h> #include <inttypes.h> #include <cfloat> #include <cwchar> #include <string> //std:string #include <string.h> #include <cstring> #include <cstdlib> #include <cstdio> #include <iostream> #include <sstream> #include <iomanip> #include <cmath> // HOST: Conditional compilation (conditional include) of the OpenMP-Mainlib for OpenMP-Support #ifdef _OPENMP #include <omp.h> #endif // HOST: Include of CUDA-Mainlib, CUDA-Runtimelib and CUDAPrintf-Lib for CUDA-Support #include <cuda.h> #include <cuda_runtime.h> #include <cuda_runtime_api.h> #include <math_functions.h> #include "./cuPrintf/cuPrintf.cu" using namespace std; #define free(x) free(x); *x=NULL //NVdimN = NumValues of dimN const uint64_t NVsize = 1920ULL; const uint64_t NVdim1 = NVsize, //dim1 = x (col) NVdim2 = NVsize; //dim2 = y (row) // DEVICE: Prototype of the CUDA-kernel declaration (CUDA-kernel = CUDA-C/C++-function) __global__ void cu2DMatrixMul64ui(uint64_t *D_MA, uint64_t *D_MB, uint64_t *D_MC); __host__ int main(int argc, char *argv[]) { // Runtime manipulation of OpenMP-state variables //omp_set_num_threads(4); omp_set_dynamic(0); omp_set_nested(3); // important for nested parallelism // data declarations and implementations double starttime = 0.00, sdelay = 0.00, pdelay1 = 0.00, pdelay2 = 0.00; uint64_t MA_s[NVdim1][NVdim2] = {{0ULL}}, MB_s[NVdim1][NVdim2] = {{0ULL}}, MC_s[NVdim1][NVdim2] = {{0ULL}}, MA_p[NVdim1][NVdim2] = {{0ULL}}, MB_p[NVdim1][NVdim2] = {{0ULL}}, MC_p[NVdim1][NVdim2] = {{0ULL}}, h_MA_p[NVdim1*NVdim2]={0ULL}, h_MB_p[NVdim1*NVdim2]={0ULL}, h_MC_p[NVdim1*NVdim2]={0ULL}, *d_MA_p=NULL, *d_MB_p=NULL, *d_MC_p=NULL, x = 0ULL, y = 0ULL, i = 0ULL; // x: col, y: row, i: Initialisierung bool ResultsAreCorrect = true; std::cout << "Matrix-Multiplikation (64-Bit)\n" << "===================================================================\n" << "Initialisierung:"; //--------------------------Begin: Initialization of data------------------------------------------ x = y = i = 0ULL; //spaltenweise vektorielle Initialisierung for(x=0ULL;x<NVdim1;++x) //col { for(y=0ULL;y<NVdim2;++y) //row { MA_s[x][y] = MB_s[x][y] = MA_p[x][y] = MB_p[x][y] = h_MA_p[(x+y*NVsize)] = h_MB_p[(x+y*NVsize)] = (i+1ULL); ++i; } } //--------------------------End: Initialization of data-------------------------------------------- std::cout << " done\n" << "CPU-SERIELLE AUSFÜHRUNG:"; //--------------------------Begin: CPU-serial execution of algorithm------------------------------- x = y = i = 0ULL; starttime = omp_get_wtime(); //CPU-serial algorithm: for(x=0;x<NVdim1;++x) //col { for(y=0;y<NVdim2;++y) //row { for(i=0;i<NVsize;++i) { MC_s[x][y]+=(MA_s[x][i] * MB_s[i][y]); } } } sdelay = omp_get_wtime()-starttime; std::cout << " done\n"; //serial //--------------------------End: CPU-serial execution of algorithm--------------------------------- //--------------------------Begin: CPU-parallel OpenMP-execution of algorithm---------------------- std::cout << "CPU-PARALLELE AUSFÜHRUNG mit "; x = y = i = 0ULL; // create parallel region: #pragma omp parallel default(none) private(x, y, i) shared(std::cout, starttime, pdelay1, MA_p, MB_p, MC_p) { #pragma omp master { std::cout << omp_get_num_threads() << " Threads:"; starttime = omp_get_wtime(); } /* OpenMP-CPU-parallel algorithm with nested parallelism: Nested parallelism with collapse(k)-clause on OpenMP-for-directive (k = nesting deep of loops). */ #pragma omp flush #pragma omp for collapse(3) schedule(static) for(x=0;x<NVdim1;++x) //col { for(y=0;y<NVdim2;++y) //row { for(i=0;i<NVsize;++i) { MC_p[x][y]+=(MA_p[x][i] * MB_p[i][y]); } } } #pragma omp master { pdelay1 = omp_get_wtime()-starttime; if(omp_get_num_threads() >= 10) { std::cout << " done\n"; //parallel } else { std::cout << " done\n"; //parallel } } } x = y = 0ULL; //--------------------------End: CPU-parallel OpenMP-execution of algorithm------------------------ //++++++++++++++++++++++++++Begin: GPU-parallel CUDA-execution of algorithm++++++++++++++++++++++++ std::cout << "GPU-PARALLELE AUSFÜHRUNG mit 1 GPU u. 14 SM:"; cudaSetDevice(0); // choose GPU // allocate data on GPU memory cudaMalloc((void**)&d_MA_p,((NVdim1*NVdim2)*sizeof(uint64_t))); cudaMalloc((void**)&d_MB_p,((NVdim1*NVdim2)*sizeof(uint64_t))); cudaMalloc((void**)&d_MC_p,((NVdim1*NVdim2)*sizeof(uint64_t))); // copy Host-Data into GPU-Data (memory) cudaMemcpy(d_MA_p,(void*)h_MA_p,((NVdim1*NVdim2)*sizeof(uint64_t)),cudaMemcpyHostToDevice); cudaMemcpy(d_MB_p,(void*)h_MB_p,((NVdim1*NVdim2)*sizeof(uint64_t)),cudaMemcpyHostToDevice); // begin of CUDAPrintf-section cudaPrintfInit(); starttime = omp_get_wtime(); /* CUDA-kernel call with 112 Blöcke (8 per 1 SM) & 192 Threads per Block = 21504 Threads = 672 Warps (=14 SM = 1 Fermi-GPC/-GPU) */ cu2DMatrixMul64ui<<<112,192>>>(d_MA_p,d_MB_p,d_MC_p); //std::cout << "Fehler?: " << cudaGetErrorString(cudaGetLastError()) << '\n'; // CPU external GPU-Threads synchronisation cudaThreadSynchronize(); pdelay2=(omp_get_wtime()-starttime); // CUDAPrintf output of CUDA-kernel call cudaPrintfDisplay(stdout,true); // end of CUDAPrintf-section cudaPrintfEnd(); // copy calculated device-data from device memory back into CPU-data (RAM) cudaMemcpy((void*)h_MC_p,d_MC_p,((NVdim1*NVdim2)*sizeof(uint64_t)),cudaMemcpyDeviceToHost); /* // Control output for(uint64_t i=0ULL;i<NVdim1;++i) { for(uint64_t j=0ULL;j<NVdim2;++j) { printf("%d\n",h_MC_p[i+j*NVsize]); } printf("\n"); } */ /* // Control output std::cout << h_MC_p[0] << std::endl << h_MC_p[(NVdim1-1)+(NVdim2-1)*NVsize] << std::endl; */ std::cout << " done\n"; //parallel (GPU) //++++++++++++++++++++++++++End: GPU-parallel CUDA-execution of algorithm++++++++++++++++++++++++++ //--------------------------Analysis of results---------------------------------------------------- std::cout << "Überprüfe Ergebnisse:"; //spaltenweise vektorielle Überprüfung for(x=0ULL;x<NVdim1;++x) //col { for(y=0ULL;y<NVdim2;++y) //row { if((MC_p[x][y]!=MC_s[x][y])&&(h_MC_p[(x+y*NVsize)]!=MC_s[x][y])) { ResultsAreCorrect = false; break; } } } std::cout << " done\n"; std::cout << "\nAuswertung:\n" << "*******************************************************************\n" << "Anzahl Komponenten der 2D-Eingangsmatrix A: " << (NVdim1*NVdim2) << '\n' << "Anzahl Komponenten der 2D-Eingangsmatrix B: " << (NVdim1*NVdim2) << '\n' << "Anzahl Komponenten der 2D-Ergebnismatrix C: " << (NVdim1*NVdim2) << '\n' << "Seriell & parallel richtig gerechnet?: " << ((ResultsAreCorrect==true)?"yes\n":" no\n") << "Dauer - SERIELL (CPU): " << sdelay << " sec\n" << "Dauer - PARALLEL (CPU): " << pdelay1 << " sec\n" << "Dauer - PARALLEL (GPU): " << pdelay2 << " sec\n" << "__________________\n" << "==>Summenmatrix (CPU-seriell berechnet):\n" << " MA(" << MA_s[0][0] << ';' << MA_s[1][0] << ';' << MA_s[2][0] << ';' << MA_s[3][0] << ";...;" << MA_s[NVdim1-2][NVdim2-1] << ';' << MA_s[NVdim1-1][NVdim2-1] << ")\n+" << "MB(" << MB_s[0][0] << ';' << MB_s[1][0] << ';' << MB_s[2][0] << ';' << MB_s[3][0] << ";...;" << MB_s[NVdim1-2][NVdim2-1] << ';' << MB_s[NVdim1-1][NVdim2-1] << ")\n=" << "MC(" << MC_s[0][0] << ';' << MC_s[1][0] << ';' << MC_s[2][0] << ';' << MC_s[3][0] << ";...;" << MC_s[NVdim1-2][NVdim2-1] << ';' << MC_s[NVdim1-1][NVdim2-1] << ")\n" << "==>Summenmatrix (CPU-parallel berechnet):\n" << " MA(" << MA_p[0][0] << ';' << MA_p[1][0] << ';' << MA_p[2][0] << ';' << MA_p[3][0] << ";...;" << MA_p[NVdim1-2][NVdim2-1] << ';' << MA_p[NVdim1-1][NVdim2-1] << ")\n+" << "MB(" << MB_p[0][0] << ';' << MB_p[1][0] << ';' << MB_p[2][0] << ';' << MB_p[3][0] << ";...;" << MB_p[NVdim1-2][NVdim2-1] << ';' << MB_p[NVdim1-1][NVdim2-1] << ")\n=" << "MC(" << MC_p[0][0] << ';' << MC_p[1][0] << ';' << MC_p[2][0] << ';' << MC_p[3][0] << ";...;" << MC_p[NVdim1-2][NVdim2-1] << ';' << MC_p[NVdim1-1][NVdim2-1] << ")\n" << "==>Summenmatrix (GPU-parallel berechnet):\n" << " MA(" << h_MA_p[(0+0*NVsize)] << ';' << h_MA_p[(1+0*NVsize)] << ';' << h_MA_p[(2+0*NVsize)] << ';' << h_MA_p[(3+0*NVsize)] << ";...;" << h_MA_p[((NVdim1-2)+(NVdim2-1)*NVsize)] << ';' << h_MA_p[((NVdim1-1)+(NVdim2-1)*NVsize)] << ")\n+" << "MB(" << h_MB_p[(0+0*NVsize)] << ';' << h_MB_p[(1+0*NVsize)] << ';' << h_MB_p[(2+0*NVsize)] << ';' << h_MB_p[(3+0*NVsize)] << ";...;" << h_MB_p[((NVdim1-2)+(NVdim2-1)*NVsize)] << ';' << h_MB_p[((NVdim1-1)+(NVdim2-1)*NVsize)] << ")\n=" << "MC(" << h_MC_p[(0+0*NVsize)] << ';' << h_MC_p[(1+0*NVsize)] << ';' << h_MC_p[(2+0*NVsize)] << ';' << h_MC_p[(3+0*NVsize)] << ";...;" << h_MC_p[((NVdim1-2)+(NVdim2-1)*NVsize)] << ';' << h_MC_p[((NVdim1-1)+(NVdim2-1)*NVsize)] << ")\n"; /* // Detailed output std::cout << "__________________\n" << "Ergebnisliste:\n"; x = y = i = 0ULL; uint64_t j = 0ULL; for(x=0ULL;x<NVdim1;++x) //col { for(y=0ULL;y<NVdim2;++y) //row { for(i=0;i<NVsize;++i) { std::cout << "Seriell: MA" << (j+1ULL) << '[' << x << "][" << i << "](" << MA_s[x][i] << ") + " << "MB" << (j+1ULL) << '[' << i << "][" << y << "](" << MB_s[i][y] << ") = " << "MC" << (j+1ULL) << '[' << x << "][" << y << "](" << MC_s[x][y] << ")\n" << "Parallel: MA" << (j+1ULL) << '[' << x << "][" << i << "](" << MA_p[x][i] << ") + " << "MB" << (j+1ULL) << '[' << i << "][" << y << "](" << MB_p[i][y] << ") = " << "MC" << (j+1ULL) << '[' << x << "][" << y << "](" << MC_p[x][y] << ")\n"; ++j; } } } */ // free allocated data on GPU memory cudaFree(d_MA_p); cudaFree(d_MB_p); cudaFree(d_MC_p); getchar(); return 0; } // DEVICE: Implementation of CUDA-kernel "cu2DMatrixMul64ui()" __global__ void cu2DMatrixMul64ui(uint64_t *D_MA, uint64_t *D_MB, uint64_t *D_MC) { // Get GPU-Thread-ID (dim.x and dim.y) by linearization of the Thread-ID in the GPU-(Grid-and-Block)-construct (Grid- & Block locale) unsigned int tx = threadIdx.x + blockDim.x*blockIdx.x; unsigned int ty = threadIdx.y + blockDim.y*blockIdx.y; // algorithm uint64_t rC = 0; for(uint64_t z=0;z<NVsize;++z) { rC+=(D_MA[ty*NVsize+z]*D_MB[z*NVsize+tx]); // CUDAPrintf call //cuPrintf("%d\n",D_MA[ty*NVsize+z]); } D_MC[ty*NVsize+tx]=rC; // block locale GPU-Threads synchronization //__syncthreads(); }