/* COPYRIGHT (2011-2012) by: Kevin Marco Erler (author), http://www.kevinerler.de AIU-FSU Jena (co-owner), http://www.astro.uni-jena.de SBSZ Jena-Göschwitz (co-owner), http://www.sbsz-jena.de BSZ-Hermsdorf (co-owner), http://www.bszh.de Advanced Licensing (dual license: COPYRIGHT and following licenses): License (international): CC-BY v3.0-unported or later - link: http://creativecommons.org/licenses/by/3.0/deed.en License (Germany): CC-BY v3.0-DE or later - link: http://creativecommons.org/licenses/by/3.0/de/ ------------------ Compilation requirements: Packages (x86-64): GCC >v4.2, compat. libstdc++ and GOMP v3.0 CUDA->v4.0-supported GPU-driver, compat. CUDA SDK >v4.0 (e.g. for CUPrintf), compat. CUDA Toolkit (nvcc-Compiler and other CUDA-Tools) NOTES: optimized for NVIDIA-GPU-architecture: FERMI! two compile-steps! 1.) <src.cu> ==> <src.cpp> 2.) <src.cpp> ==> <dest> Normal-Compile with nvcc- (for CUDA-GPU-Code) and g++-Compiler (for Host-C/C++-Code) (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested) 1.) nvcc -ccbin /usr/bin/g++ -Xcompiler "-m64 -fopenmp -lstdc++ -lm -lgomp -lcuda -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcuda -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -cuda <src.cu> -o <src.cpp> -v 2.) nvcc -x c++ -ccbin /usr/bin/g++ -Xcompiler "-std=c++0x -m64 -fopenmp -lstdc++ -lm -lgomp -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math <src.cpp> -o <dest> -v Release-Compile with nvcc- (for CUDA-GPU-Code) and g++-Compiler (for Host-C/C++-Code) (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested) 1.) nvcc -ccbin /usr/bin/g++ -Xcompiler "-m64 -fopenmp -lstdc++ -lm -lgomp -lcuda -lcudart -Wall -Wextra -O3" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcuda -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -O3 -cuda <src.cu> -o <src.cpp> -v 2.) nvcc -x c++ -ccbin /usr/bin/g++ -Xcompiler "-std=c++0x -m64 -fopenmp -lstdc++ -lm -lgomp -lcudart -Wall -Wextra -O3" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -O3 <src.cpp> -o <dest> -v Debug-Compile with nvcc- (for CUDA-GPU-Code) and g++-Compiler (for Host-C/C++-Code) (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested) 1.) nvcc -ccbin /usr/bin/g++ -Xcompiler "-m64 -fopenmp -lstdc++ -lm -lgomp -lcuda -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcuda -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -g -G3 -cuda <src.cu> -o <src.cpp> -v 2.) nvcc -x c++ -ccbin /usr/bin/g++ -Xcompiler "-std=c++0x -m64 -fopenmp -lstdc++ -lm -lgomp -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -g -G3 <src.cpp> -o <dest> -v */ // HOST: Includes of C/C++-Librarys for INTs, REAL/FLOATs, STRINGS, Math-Calc and I/O #include <cxxabi.h> #include <climits> #include <stdint.h> #include <inttypes.h> #include <cfloat> #include <cwchar> #include <string> //std:string #include <string.h> #include <cstring> #include <cstdlib> #include <cstdio> #include <iostream> #include <sstream> #include <cmath> // HOST: Conditional compilation (conditional include) of the OpenMP-Mainlib for OpenMP-Support #ifdef _OPENMP #include <omp.h> #endif // HOST: Include of CUDA-Mainlib, CUDA-Runtimelib and CUDAPrintf-Lib for CUDA-Support #include <cuda.h> #include <cuda_runtime.h> #include <cuda_runtime_api.h> #include "./cuPrintf/cuPrintf.cu" using namespace std; #define free(x) free(x); *x=NULL // DEVICE: Prototype of the CUDA-kernel declaration (CUDA-kernel = CUDA-C/C++-function) __global__ void GPUthread(void); __host__ int main(int argc, char *argv[]) { // Runtime manipulation of OpenMP-state variables //omp_set_num_threads(4); omp_set_dynamic(false); omp_set_nested(true); // Get Number of GPU´s int NumGPUs = 0; cudaGetDeviceCount(&NumGPUs); std::cout << "Hello World (64-Bit)\n" << "==============================================================================================\n"; /* Create OpenMP-parallel region to use Multi-CPU and Multi-GPU. For CUDA: only one CPU-Thread can use one GPU-session at the same time. */ #pragma omp parallel default(none) shared(std::cout, stdout, NumGPUs) { // "Hello World" from (Multi-)CPU and get CPU information #pragma omp master { std::cout << "==>sagt die CPU (HOST, SERIELL): done\n"; if(omp_get_num_procs() > 1) { std::cout << "Multiprozessor: yes (" << omp_get_num_procs() << " CPU-Kerne)\n" << "Max. Anzahl möglicher Threads: " << omp_get_max_threads() << '\n' << "Max. Anzahl Threads im aktuellen Team: " << omp_get_num_threads() <<'\n'; } else { std::cout << "Multiprozessor: no (nur " << omp_get_num_procs() << "CPU-Kern)\n"; } } #pragma omp barrier // All CPU-Threads says "Hello World" if(omp_get_num_procs() > 1) { #pragma omp master { std::cout << "\n==>sagen alle CPU´s (" << omp_get_num_threads() << " HOST-Threads, PARALLEL mit OpenMP): done\n"; } #pragma omp barrier #pragma omp critical { if(omp_get_thread_num()==0) { std::cout << " CPU[-Thread]: " << (omp_get_thread_num()+1) << " (Master-Thread)\n"; } else { std::cout << " CPU[-Thread]: " << (omp_get_thread_num()+1) << '\n'; } } } #pragma omp barrier // Get GPU[´s] informations #pragma omp master { bool isFermi = false; // state variable for GPU architecture type (is FERMI = true, isn´t FERMI = false, default: false) if(NumGPUs!=0) { if(NumGPUs == 1) { std::cout << "\n==> sagt die CUDA-fähige GPU (Device): done\n"; } else { std::cout << "\n==> sagen alle " << NumGPUs << " CUDA-fähigen GPU´s (GPC, Devices): done\n"; } cudaDeviceProp dprop; // CUDA-Struct variable for CUDA-GPU-Settings for(int p=0;p<NumGPUs;++p) { std::cout << " GPU " << p << ":\n"; cudaGetDeviceProperties(&dprop,p); if((dprop.major==2)&&(dprop.minor==0)) { isFermi = true; } else { isFermi = false; } std::cout << " Name: NVIDIA® " << dprop.name << ((isFermi==true)?" (Fermi)\n":"\n") << " Typ: " << ((dprop.integrated==1)?"integrated (Onboard,MB)\n":"discrete (card)\n") << " Compute capability (revision numbers): " << dprop.major << '.' << dprop.minor << " (major.minor)\n" << " Clock frequency: " << dprop.clockRate << " kHz (~" << ((double)dprop.clockRate/1000.00) << " MHz / ~" << ((double)dprop.clockRate/1000000.00) << " GHz)\n" << " Total global memory (available): " << dprop.totalGlobalMem << " Bytes (~" << ((double)dprop.totalGlobalMem/pow(1024.00,3)) << " Gb)\n" << " Total constant memory (available): " << dprop.totalConstMem << " Bytes (~" << ((double)dprop.totalConstMem/1024.00) << " Kb)\n" << " Max memory pitch: " << dprop.memPitch << " Bytes (~" << ((double)dprop.memPitch/pow(1024.00,3)) << " Gb)\n" << " Texture Alignment: " << dprop.textureAlignment << " Bytes\n" << " Device copy overlap: " << ((dprop.deviceOverlap==1)?"Enabled\n":"Disabled\n") << " Host memory-mapping (hostm. map to devicem.): " << ((dprop.canMapHostMemory==1)?"Enabled\n":"Disabled\n") << " Kernel execution timeout: " << ((dprop.kernelExecTimeoutEnabled==1)?"Enabled\n":"Disabled\n") << " Compute Mode: " << ((dprop.computeMode==0)?"Default mode\n":\ (dprop.computeMode==1)?"Compute-exclusive mode\n":\ (dprop.computeMode==2)?"Compute-prohibited mode\n":\ "unknown\n") << " Streaming Multiprocessors (SM) count: " << dprop.multiProcessorCount << '\n'; if(isFermi==true) { std::cout << " Streamprocessors (SP, CUDA-Cores) count: " << (dprop.multiProcessorCount*32) << '\n' << " SP´s per SM: 32\n" << " Threads pro SM: 1536\n" << " Warps per SM: " << (1536/dprop.warpSize) << '\n'; } else { std::cout << " Streamprocessors (SP, CUDA-Cores) count: unknown\n" << " SP´s per SM: unknown\n" << " Threads pro SM: unknown\n"; } std::cout << " Shared mem per SM (available & shared by all blocks): " << dprop.sharedMemPerBlock << " Bytes (~" << ((double)dprop.sharedMemPerBlock/1024.00) << " Kb)\n" << " Registers per SM (available & shared by all blocks): " << dprop.regsPerBlock << " (32-bit width)\n" << " Warp size: " << dprop.warpSize << " Threads/Warp\n" << " Max threads per block: " << dprop.maxThreadsPerBlock << '\n' << " Max block dimensions (threads per blockdim): " << dprop.maxThreadsDim[0] << 'x' << dprop.maxThreadsDim[1] << 'x' << dprop.maxThreadsDim[2] << " (x*y*z)\n" << " Max grid dimensions (blocks per griddim): " << dprop.maxGridSize[0] << 'x' << dprop.maxGridSize[1] << 'x' << dprop.maxGridSize[2] << " (x*y*z)\n" << '\n'; } } else { } } /* All CUDA supported GPU´s says "Hello World" in the CUDA-kernel call. The OpenMP-loop: number of used CPU-Threads = Number of GPU´s. CUDAPrintf requires a CUDAPrintf-section by the following function calls in order: cudaSetDevice(<GPU-ID>); cudaPrintfInit(); <CUDA-kernel call>; cudaDeviceSynchronize(); cudaPrintfDisplay(stdout,true); cudaPrintfEnd(); */ #pragma omp for schedule(static) for(int GPU=0;GPU<NumGPUs;++GPU) { #pragma omp critical { cudaSetDevice(GPU); // choose GPU cudaPrintfInit(); // begin of CUDAPrintf-section printf("GPU %i:\n",GPU); // normal C-printf() GPUthread<<<1,1>>>(); // CUDA-kernel call with 1 Thread-Block and 1 GPU-Thread per block = 1 GPU-Thread total cudaDeviceSynchronize(); // CPU external GPU-Threads synchronisation cudaPrintfDisplay(stdout,true); // CUDAPrintf output of CUDA-kernel call cudaPrintfEnd(); // end of CUDAPrintf-section } } } getchar(); return 0; } // DEVICE: Implementation of CUDA-kernel "GPUthread()" __global__ void GPUthread(void) { // Get GPU-Thread-ID by linearization of the Thread-ID in the GPU-(Grid-and-Block)-construct (Grid- & Block locale) int idx = blockIdx.x * blockDim.x + threadIdx.x; // CUDAPrintf call cuPrintf("Thread %i: Hello, nice to meet you!\n",(idx+1)); // block locale GPU-Threads synchronization __syncthreads(); }