/* COPYRIGHT (2011-2012) by:
     Kevin Marco Erler (author),     http://www.kevinerler.de
     AIU-FSU Jena (co-owner),        http://www.astro.uni-jena.de
     SBSZ Jena-Göschwitz (co-owner), http://www.sbsz-jena.de
     BSZ-Hermsdorf (co-owner),       http://www.bszh.de

   Advanced Licensing (dual license: COPYRIGHT and the following licenses):
     License (international): CC-BY v3.0-unported or later - link: http://creativecommons.org/licenses/by/3.0/deed.en
     License (Germany):       CC-BY v3.0-DE or later       - link: http://creativecommons.org/licenses/by/3.0/de/
   ------------------
   Compilation requirements - packages (x86-64):
     GCC >v4.2, compatible libstdc++ and GOMP v3.0
     GPU driver with CUDA >v4.0 support, compatible CUDA SDK >v4.0 (e.g. for cuPrintf),
     compatible CUDA Toolkit (nvcc compiler and other CUDA tools)

   NOTES: optimized for the NVIDIA FERMI GPU architecture!
          Two compile steps:
            1.) <src.cu>  ==> <src.cpp>
            2.) <src.cpp> ==> <dest>

   Normal compile with the nvcc (CUDA GPU code) and g++ (host C/C++ code) compilers
   (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested):
   1.)
   nvcc -ccbin /usr/bin/g++ -Xcompiler "-m64 -fopenmp -lstdc++ -lm -lgomp -lcuda -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcuda -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I/usr/local/cuda/include/ -use_fast_math -cuda <src.cu> -o <src.cpp> -v
   2.)
   nvcc -x c++ -ccbin /usr/bin/g++ -Xcompiler "-std=c++0x -m64 -fopenmp -lstdc++ -lm -lgomp -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I/usr/local/cuda/include/ -use_fast_math <src.cpp> -o <dest> -v

   Release compile with the nvcc (CUDA GPU code) and g++ (host C/C++ code) compilers
   (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested):
   1.)
   nvcc -ccbin /usr/bin/g++ -Xcompiler "-m64 -fopenmp -lstdc++ -lm -lgomp -lcuda -lcudart -Wall -Wextra -O3" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcuda -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I/usr/local/cuda/include/ -use_fast_math -O3 -cuda <src.cu> -o <src.cpp> -v
   2.)
   nvcc -x c++ -ccbin /usr/bin/g++ -Xcompiler "-std=c++0x -m64 -fopenmp -lstdc++ -lm -lgomp -lcudart -Wall -Wextra -O3" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I/usr/local/cuda/include/ -use_fast_math -O3 <src.cpp> -o <dest> -v

   Debug compile with the nvcc (CUDA GPU code) and g++ (host C/C++ code) compilers
   (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested):
   1.)
   nvcc -ccbin /usr/bin/g++ -Xcompiler "-m64 -fopenmp -lstdc++ -lm -lgomp -lcuda -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcuda -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I/usr/local/cuda/include/ -use_fast_math -g -G -cuda <src.cu> -o <src.cpp> -v
   2.)
   nvcc -x c++ -ccbin /usr/bin/g++ -Xcompiler "-std=c++0x -m64 -fopenmp -lstdc++ -lm -lgomp -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I/usr/local/cuda/include/ -use_fast_math -g -G <src.cpp> -o <dest> -v
*/

// HOST: includes of C/C++ libraries for integers, floats/reals, strings, math and I/O
#include <cxxabi.h>
#include <climits>
#include <stdint.h>
#include <inttypes.h>
#include <cfloat>
#include <cwchar>
#include <string>    // std::string
#include <string.h>
#include <cstring>
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <sstream>
#include <cmath>

// HOST: conditional include of the OpenMP main library for OpenMP support
#ifdef _OPENMP
  #include <omp.h>
#endif

// HOST: includes of the CUDA main library, the CUDA runtime and the cuPrintf library for CUDA support
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include "./cuPrintf/cuPrintf.cu"

using namespace std;

// Helper macro: free a pointer and reset it to NULL afterwards
// (the inner free() is not expanded again by the preprocessor, so it calls the real free())
#define free(x) do { free(x); (x) = NULL; } while (0)

const uint64_t NumValues = 400000000ULL, DataPartitionSize = (NumValues/2ULL);

// DEVICE: prototype of the CUDA kernel "cu1DVecAdd64" (CUDA kernel = CUDA C/C++ function)
__global__ void cu1DVecAdd64(uint64_t *D_A, uint64_t *D_B, uint64_t *D_C);
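// -------------------------------------------------------------------------
// Editor's sketch (not part of the original program): a minimal error-checking
// helper for CUDA runtime calls. The code below ignores the cudaError_t return
// values of cudaMalloc()/cudaMemcpy(); wrapping such calls as, e.g.,
//   CUDA_CHECK(cudaMalloc((void**)&d_A_p, DataPartitionSize*sizeof(uint64_t)));
// would report failures (out of memory, invalid copies, ...) instead of
// silently continuing. CUDA_CHECK is an illustrative name, not an existing API.
// -------------------------------------------------------------------------
#define CUDA_CHECK(call)                                                        \
    do {                                                                        \
        cudaError_t cuda_check_err = (call);                                    \
        if (cuda_check_err != cudaSuccess)                                      \
        {                                                                       \
            std::cerr << "CUDA error: " << cudaGetErrorString(cuda_check_err)   \
                      << " at " << __FILE__ << ":" << __LINE__ << '\n';         \
        }                                                                       \
    } while (0)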
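// -------------------------------------------------------------------------
// Editor's sketch (not part of the original program): a hypothetical helper
// showing how the launch configuration could be derived from the partition
// size instead of hard-coding <<<112,192>>> in main(). Because cu1DVecAdd64()
// uses a grid-stride loop, any grid size is correct; here the block count is
// capped at 112 (8 blocks per SM x 14 SMs, matching the original launch).
// cudaGetLastError()/cudaDeviceSynchronize() surface launch and execution
// errors that a fire-and-forget launch would hide. Not called from main().
// -------------------------------------------------------------------------
void launchCu1DVecAdd64Checked(uint64_t *d_A, uint64_t *d_B, uint64_t *d_C)
{
    const unsigned int threadsPerBlock = 192u;
    const uint64_t     neededBlocks    = (DataPartitionSize + threadsPerBlock - 1ULL) / threadsPerBlock;
    const unsigned int blocks          = (neededBlocks > 112ULL) ? 112u : (unsigned int)neededBlocks;

    cu1DVecAdd64<<<blocks, threadsPerBlock>>>(d_A, d_B, d_C);

    cudaError_t launchErr = cudaGetLastError();      // errors detected at launch time
    cudaError_t syncErr   = cudaDeviceSynchronize(); // errors raised while the kernel runs
    if (launchErr != cudaSuccess)
    {
        std::cerr << "Kernel launch failed: " << cudaGetErrorString(launchErr) << '\n';
    }
    else if (syncErr != cudaSuccess)
    {
        std::cerr << "Kernel execution failed: " << cudaGetErrorString(syncErr) << '\n';
    }
}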
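// -------------------------------------------------------------------------
// Editor's sketch (not part of the original program): the original kernel
// cu1DVecAdd64() (defined at the end of this file) is tied to the compile-time
// constant DataPartitionSize. A common alternative, shown here under the
// hypothetical name cu1DVecAdd64N, passes the element count as a kernel
// parameter so the same kernel works for partitions of any size; the
// grid-stride loop is otherwise the same.
// -------------------------------------------------------------------------
__global__ void cu1DVecAdd64N(uint64_t *D_A, uint64_t *D_B, uint64_t *D_C, uint64_t n)
{
    // linearized global thread ID
    uint64_t tid = (uint64_t)threadIdx.x + (uint64_t)blockIdx.x * blockDim.x;

    // grid-stride loop over all n elements
    while (tid < n)
    {
        D_C[tid] = D_A[tid] + D_B[tid];            // element-wise 64-bit addition
        tid += (uint64_t)blockDim.x * gridDim.x;   // stride = total number of threads in the grid
    }
}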
// HOST: main program
__host__ int main(int argc, char *argv[])
{
    // runtime manipulation of OpenMP state variables
    //omp_set_num_threads(4);
    omp_set_dynamic(0);

    // data declarations and definitions
    double starttime = 0.00, sdelay = 0.00, pdelay1 = 0.00, pdelay2 = 0.00;
    // NOTE: at NumValues = 400000000 each array occupies ~3.2 GB, so the nine host
    // arrays are allocated on the heap; automatic (stack) arrays of this size would
    // overflow the stack. "new uint64_t[NumValues]()" value-initializes all elements to 0.
    uint64_t *A_s   = new uint64_t[NumValues](),
             *B_s   = new uint64_t[NumValues](),
             *C_s   = new uint64_t[NumValues](),
             *A_p   = new uint64_t[NumValues](),
             *B_p   = new uint64_t[NumValues](),
             *C_p   = new uint64_t[NumValues](),
             *h_A_p = new uint64_t[NumValues](),
             *h_B_p = new uint64_t[NumValues](),
             *h_C_p = new uint64_t[NumValues](),
             *d_A_p = NULL, *d_B_p = NULL, *d_C_p = NULL;
    bool ResultsAreCorrect = false;

    std::cout << "Vector addition (1D) (64-bit)\n"
              << "===================================================================\n"
              << "Initialization:";

    //--------------------------Begin: initialization of data------------------------------------------
    for(uint64_t i=0ULL;i<NumValues;++i)
    {
        A_s[i] = A_p[i] = h_A_p[i] = B_s[i] = B_p[i] = h_B_p[i] = (i+1ULL);
    }
    //--------------------------End: initialization of data--------------------------------------------

    std::cout << " done\n"
              << "CPU-SERIAL EXECUTION:";

    //--------------------------Begin: CPU-serial execution of the algorithm---------------------------
    starttime = omp_get_wtime();
    // CPU-serial algorithm:
    for(uint64_t j=0ULL;j<NumValues;++j)
    {
        C_s[j] = A_s[j] + B_s[j];
    }
    sdelay = omp_get_wtime()-starttime;
    std::cout << " done\n"; //serial
    //--------------------------End: CPU-serial execution of the algorithm-----------------------------

    //--------------------------Begin: CPU-parallel OpenMP execution of the algorithm------------------
    std::cout << "CPU-PARALLEL EXECUTION with ";
    // create the parallel region:
    #pragma omp parallel default(none) shared(std::cout, starttime, pdelay1, A_p, B_p, C_p)
    {
        #pragma omp master
        {
            std::cout << omp_get_num_threads() << " threads:";
            starttime = omp_get_wtime();
        }
        // OpenMP CPU-parallel algorithm
        #pragma omp flush
        #pragma omp for schedule(static)
        for(uint64_t k=0ULL;k<NumValues;++k)
        {
            C_p[k] = A_p[k] + B_p[k];
        }
        #pragma omp master
        {
            pdelay1 = omp_get_wtime()-starttime;
            std::cout << " done\n"; //parallel (CPU)
        }
    }
    //--------------------------End: CPU-parallel OpenMP execution of the algorithm--------------------

    //++++++++++++++++++++++++++Begin: GPU-parallel CUDA execution of the algorithm++++++++++++++++++++
    std::cout << "GPU-PARALLEL EXECUTION with 2 GPUs and 14 SMs:";
    // choose the GPU
    cudaSetDevice(0);
    // allocate data in GPU memory
    cudaMalloc((void**)&d_A_p,(DataPartitionSize*sizeof(uint64_t)));
    cudaMalloc((void**)&d_B_p,(DataPartitionSize*sizeof(uint64_t)));
    cudaMalloc((void**)&d_C_p,(DataPartitionSize*sizeof(uint64_t)));
    // partitioning of the data (CPU memory is big, but GPU memory is (relatively) small)
    for(uint64_t partition=0ULL;partition<(NumValues/DataPartitionSize);++partition) //(NumValues/DataPartitionSize) = NumPartitions
    {
        // copy host data into GPU memory
        cudaMemcpy(d_A_p,(void*)(h_A_p+(partition*DataPartitionSize)),(DataPartitionSize*sizeof(uint64_t)),cudaMemcpyHostToDevice);
        cudaMemcpy(d_B_p,(void*)(h_B_p+(partition*DataPartitionSize)),(DataPartitionSize*sizeof(uint64_t)),cudaMemcpyHostToDevice);

        starttime = omp_get_wtime();
        /* CUDA kernel call with 112 blocks (8 per SM) & 192 threads per block
           = 21504 threads = 672 warps (= 14 SMs = 1 Fermi GPC/GPU) */
        cu1DVecAdd64<<<112,192>>>(d_A_p,d_B_p,d_C_p);
        //std::cout << "Error?: " << cudaGetErrorString(cudaGetLastError()) << '\n';
        // CPU-side synchronization with the GPU threads
        cudaDeviceSynchronize();
        pdelay2+=(omp_get_wtime()-starttime);

        // copy the calculated device data from device memory back into CPU data (RAM)
        cudaMemcpy((void*)(h_C_p+(partition*DataPartitionSize)),d_C_p,(DataPartitionSize*sizeof(uint64_t)),cudaMemcpyDeviceToHost);
    }
    std::cout << " done\n"; //parallel (GPU)
    //++++++++++++++++++++++++++End: GPU-parallel CUDA execution of the algorithm++++++++++++++++++++++

    //--------------------------Analysis of the results------------------------------------------------
    std::cout << "Checking results:";
    for(uint64_t l=0ULL;l<NumValues;++l)
    {
        if((C_p[l]==C_s[l])&&(h_C_p[l]==C_s[l]))
        {
            ResultsAreCorrect = true;
        }
        else
        {
            ResultsAreCorrect = false;
            break;
        }
    }
    std::cout << " done\n";

    std::cout << "\nEvaluation:\n"
              << "*******************************************************************\n"
              << "Number of 1D input vectors A:  " << NumValues << '\n'
              << "Number of 1D input vectors B:  " << NumValues << '\n'
              << "Number of 1D result vectors C: " << NumValues << '\n'
              << "Serial & parallel results correct?: " << ((ResultsAreCorrect==true)?"yes\n":" no\n")
              << "Time - SERIAL:         " << sdelay  << " sec\n"
              << "Time - PARALLEL (CPU): " << pdelay1 << " sec\n"
              << "Time - PARALLEL (GPU): " << pdelay2 << " sec\n"
              << "__________________\n"
              << "Examples:\n"
              << "==> 1st (1D) vector addition:\n"
              << "C1 = A1(" << A_p[0] << ") + B1(" << B_p[0] << ")\n"
              << "C1 = " << C_p[0] << "\n"
              << "==> " << NumValues << "th (1D) vector addition:\n"
              << "C" << NumValues << " = A" << NumValues << "(" << A_p[NumValues-1] << ") + B" << NumValues << "(" << B_p[NumValues-1] << ")\n"
              << "C" << NumValues << " = " << C_p[NumValues-1] << "\n";

    /*
    // detailed output
    std::cout << "__________________\n"
              << "Result list:\n";
    for(uint64_t m=0ULL;m<NumValues;++m)
    {
        std::cout << "Serial:         A" << (m+1) << " = " << A_s[m]   << " B" << (m+1) << " = " << B_s[m]   << " C" << (m+1) << " = " << C_s[m]   << '\n';
        std::cout << "Parallel (CPU): A" << (m+1) << " = " << A_p[m]   << " B" << (m+1) << " = " << B_p[m]   << " C" << (m+1) << " = " << C_p[m]   << '\n';
        std::cout << "Parallel (GPU): A" << (m+1) << " = " << h_A_p[m] << " B" << (m+1) << " = " << h_B_p[m] << " C" << (m+1) << " = " << h_C_p[m] << '\n';
    }
    */

    // free the allocated data in GPU memory
    cudaFree(d_A_p);
    cudaFree(d_B_p);
    cudaFree(d_C_p);

    // free the allocated host arrays
    delete[] A_s;   delete[] B_s;   delete[] C_s;
    delete[] A_p;   delete[] B_p;   delete[] C_p;
    delete[] h_A_p; delete[] h_B_p; delete[] h_C_p;

    getchar();
    return 0;
}

// DEVICE: implementation of the CUDA kernel "cu1DVecAdd64()"
__global__ void cu1DVecAdd64(uint64_t *D_A, uint64_t *D_B, uint64_t *D_C)
{
    // per-thread staging slots in the fast on-chip GPU shared memory, indexed by
    // threadIdx.x (192 matches the threads per block of the <<<112,192>>> launch);
    // a single shared variable written by every thread of a block would be a data race
    __shared__ uint64_t sD_A[192];
    __shared__ uint64_t sD_B[192];
    __shared__ uint64_t sD_C[192];

    // get the GPU thread ID by linearizing the thread ID within the grid-and-block construct
    uint64_t tid = (uint64_t)threadIdx.x + (uint64_t)blockIdx.x * blockDim.x;

    // algorithm: grid-stride loop over all data of the current partition in GPU memory
    while(tid < DataPartitionSize)
    {
        sD_A[threadIdx.x] = D_A[tid];
        sD_B[threadIdx.x] = D_B[tid];
        sD_C[threadIdx.x] = sD_A[threadIdx.x] + sD_B[threadIdx.x];
        D_C[tid] = sD_C[threadIdx.x];
        // no __syncthreads() here: the threads do not exchange data through the shared
        // arrays, and a block-wide barrier inside this (divergent) loop would be unsafe
        tid += ((uint64_t)blockDim.x*gridDim.x);  // stride = total number of threads in the grid
    }
}