/*
COPYRIGHT (2011-2012) by:
Kevin Marco Erler (author), http://www.kevinerler.de
AIU-FSU Jena (co-owner), http://www.astro.uni-jena.de
SBSZ Jena-Göschwitz (co-owner), http://www.sbsz-jena.de
BSZ-Hermsdorf (co-owner), http://www.bszh.de
Advanced Licensing (dual license: COPYRIGHT and following licenses):
License (international): CC-BY v3.0-unported or later - link: http://creativecommons.org/licenses/by/3.0/deed.en
License (Germany):       CC-BY v3.0-DE       or later - link: http://creativecommons.org/licenses/by/3.0/de/
------------------
Compilation requirements:
Packages (x86-64):
  GCC >v4.2, compat. libstdc++ and GOMP v3.0
  CUDA->v4.0-supported GPU-driver, compat. CUDA SDK >v4.0 (e.g. for CUPrintf), compat. CUDA Toolkit (nvcc-Compiler and other CUDA-Tools)
NOTES: optimized for NVIDIA-GPU-architecture: FERMI!
       two compile-steps!
  1.) <src.cu>  ==> <src.cpp>
  2.) <src.cpp> ==> <dest>
Normal-Compile with nvcc- (for CUDA-GPU-Code) and g++-Compiler (for Host-C/C++-Code) (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested)
  1.) nvcc -ccbin /usr/bin/g++ -Xcompiler "-m64 -fopenmp -lstdc++ -lm -lgomp -lcuda -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcuda -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -cuda <src.cu> -o <src.cpp> -v
  2.) nvcc -x c++ -ccbin /usr/bin/g++ -Xcompiler "-std=c++0x -m64 -fopenmp -lstdc++ -lm -lgomp -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math <src.cpp> -o <dest> -v
Release-Compile with nvcc- (for CUDA-GPU-Code) and g++-Compiler (for Host-C/C++-Code) (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested)
  1.) nvcc -ccbin /usr/bin/g++ -Xcompiler "-m64 -fopenmp -lstdc++ -lm -lgomp -lcuda -lcudart -Wall -Wextra -O3" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcuda -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -O3 -cuda <src.cu> -o <src.cpp> -v
  2.) nvcc -x c++ -ccbin /usr/bin/g++ -Xcompiler "-std=c++0x -m64 -fopenmp -lstdc++ -lm -lgomp -lcudart -Wall -Wextra -O3" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -O3 <src.cpp> -o <dest> -v
Debug-Compile with nvcc- (for CUDA-GPU-Code) and g++-Compiler (for Host-C/C++-Code) (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested)
  1.) nvcc -ccbin /usr/bin/g++ -Xcompiler "-m64 -fopenmp -lstdc++ -lm -lgomp -lcuda -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcuda -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -g -G3 -cuda <src.cu> -o <src.cpp> -v
  2.) nvcc -x c++ -ccbin /usr/bin/g++ -Xcompiler "-std=c++0x -m64 -fopenmp -lstdc++ -lm -lgomp -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -g -G3 <src.cpp> -o <dest> -v
*/

// HOST: Includes of C/C++-Librarys for INTs, REAL/FLOATs, STRINGS, Math-Calc and I/O
#include <cxxabi.h>
#include <climits>
#include <stdint.h>
#include <inttypes.h>
#include <cfloat>
#include <cwchar>
#include <string>  //std:string
#include <string.h>
#include <cstring>
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <sstream>
#include <cmath>

// HOST: Conditional compilation (conditional include) of the OpenMP-Mainlib for OpenMP-Support
#ifdef _OPENMP
#include <omp.h>
#endif

// HOST: Include of CUDA-Mainlib, CUDA-Runtimelib and CUDAPrintf-Lib for CUDA-Support
#include <cuda.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include "./cuPrintf/cuPrintf.cu"

using namespace std;

#define free(x) free(x); *x=NULL
const uint64_t NumValues = 400000000ULL;

// DEVICE: Prototype of the CUDA-kernel declaration (CUDA-kernel = CUDA-C/C++-function)
__global__ void cu1DVecAdd64(uint64_t *D_A, uint64_t *D_B, uint64_t *D_C);

__host__ int main(int argc, char *argv[])
{
  // Runtime manipulation of OpenMP-state variables
  //omp_set_num_threads(4);
  omp_set_dynamic(0);

  // data declarations and implementations
  double starttime = 0.00, sdelay = 0.00, pdelay1 = 0.00, pdelay2 = 0.00;
  uint64_t A_s[NumValues] = {0ULL}, \
           B_s[NumValues] = {0ULL}, \
           C_s[NumValues] = {0ULL}, \
           A_p[NumValues] = {0ULL}, \
           B_p[NumValues] = {0ULL}, \
           C_p[NumValues] = {0ULL}, \
           h_A1_p[NumValues/2] = {0ULL}, \
           h_B1_p[NumValues/2] = {0ULL}, \
           h_C1_p[NumValues/2] = {0ULL}, \
           h_A2_p[NumValues/2] = {0ULL}, \
           h_B2_p[NumValues/2] = {0ULL}, \
           h_C2_p[NumValues/2] = {0ULL}, \
           h_C_p[NumValues] = {0ULL}, \
           *d_A1_p, *d_B1_p, *d_C1_p, \
           *d_A2_p, *d_B2_p, *d_C2_p;
  bool ResultsAreCorrect = false;

  std::cout << "Vektoraddition (1D)                                        (64-Bit)\n"
            << "===================================================================\n"
            << "Initialisierung:";

  //--------------------------Begin: Initialization of data------------------------------------------

  for(uint64_t i=0ULL;i<NumValues;++i)
  {
    A_s[i] = A_p[i] = B_s[i] = B_p[i] = (i+1ULL);
    if(i<(NumValues/2))
    {
      h_A1_p[i] = h_B1_p[i] = (i+1ULL);
      h_A2_p[i] = h_B2_p[i] = (i+(NumValues/2)+1ULL);
    }
  }

  //--------------------------End: Initialization of data--------------------------------------------

  std::cout << "                                               done\n"
            << "CPU-SERIELLE AUSFÜHRUNG:";

  //--------------------------Begin: CPU-serial execution of algorithm-------------------------------

  starttime = omp_get_wtime();
  //CPU-serial algorithm:
  for(uint64_t j=0ULL;j<NumValues;++j)
  {
    C_s[j] = A_s[j] + B_s[j];
  }
  sdelay = omp_get_wtime()-starttime;
  std::cout << "                                       done\n"; //serial

  //--------------------------End: CPU-serial execution of algorithm---------------------------------

  //--------------------------Begin: CPU-parallel OpenMP-execution of algorithm----------------------

  std::cout << "CPU-PARALLELE AUSFÜHRUNG mit ";

  // create parallel region:
  #pragma omp parallel default(none) shared(std::cout, starttime, pdelay1, A_p, B_p, C_p)
  {
    #pragma omp master
    {
      std::cout << omp_get_num_threads() << " Threads:";
      starttime = omp_get_wtime();
    }

    //OpenMP-CPU-parallel algorithm
    #pragma omp flush
    #pragma omp for schedule(static)
    for(uint64_t k=0ULL;k<NumValues;++k)
    {
      C_p[k] = A_p[k] + B_p[k];
    }

    #pragma omp master
    {
      pdelay1 = omp_get_wtime()-starttime;
      if(omp_get_num_threads() >= 10)
      {
        std::cout << "                       done\n";  //parallel
      }
      else
      {
        std::cout << "                        done\n"; //parallel
      }
    }
  }

  //--------------------------End: CPU-parallel OpenMP-execution of algorithm------------------------

  //++++++++++++++++++++++++++Begin: Multi-GPU-parallel CUDA-execution of algorithm++++++++++++++++++

  std::cout << "GPU-PARALLELE AUSFÜHRUNG mit 2 GPU u. 14 SM:";

  // allocate data on GPU memory of each GPU
  // choose GPU1
  cudaSetDevice(0);
  // allocate data on GPU memory of GPU1
  cudaMalloc((void**)&d_A1_p,((NumValues/2)*sizeof(uint64_t)));
  cudaMalloc((void**)&d_B1_p,((NumValues/2)*sizeof(uint64_t)));
  cudaMalloc((void**)&d_C1_p,((NumValues/2)*sizeof(uint64_t)));
  // choose GPU2
  cudaSetDevice(1);
  // allocate data on GPU memory of GPU2
  cudaMalloc((void**)&d_A2_p,((NumValues/2)*sizeof(uint64_t)));
  cudaMalloc((void**)&d_B2_p,((NumValues/2)*sizeof(uint64_t)));
  cudaMalloc((void**)&d_C2_p,((NumValues/2)*sizeof(uint64_t)));
  // copy partitioned Host-Data into GPU-Data (into memory of each GPU (partition1 into GPU1, partition2 into GPU2))
  // choose GPU1
  cudaSetDevice(0);
  // copy Host-Data into GPU-Data (memory of GPU1)
  cudaMemcpy(d_A1_p,h_A1_p,((NumValues/2)*sizeof(uint64_t)),cudaMemcpyHostToDevice);
  cudaMemcpy(d_B1_p,h_B1_p,((NumValues/2)*sizeof(uint64_t)),cudaMemcpyHostToDevice);
  // choose GPU2
  cudaSetDevice(1);
  // copy Host-Data into GPU-Data (memory of GPU2)
  cudaMemcpy(d_A2_p,h_A2_p,((NumValues/2)*sizeof(uint64_t)),cudaMemcpyHostToDevice);
  cudaMemcpy(d_B2_p,h_B2_p,((NumValues/2)*sizeof(uint64_t)),cudaMemcpyHostToDevice);

  starttime = omp_get_wtime();
  /*
  // serial execution of GPU´s: 1.) parallel execution with GPU1, 2.) parallel execution with GPU2
  // choose GPU 1
  cudaSetDevice(0);
  // CUDA-kernel call with
  // 112 Blöcke (8 per 1 SM) & 192 Threads per Block = 21504 Threads = 672 Warps (=14 SM = 1 Fermi-GPC/-GPU)
  cu1DVecAdd64<<<112,192>>>(d_A1_p,d_B1_p,d_C1_p);
  //std::cout << "Fehler?: " << cudaGetErrorString(cudaGetLastError()) << '\n';
  // CPU external GPU-Threads synchronisation
  cudaDeviceSynchronize();

  // choose GPU 2
  cudaSetDevice(1);
  // CUDA-kernel call with
  // 112 Blöcke (8 per 1 SM) & 192 Threads per Block = 21504 Threads = 672 Warps (=14 SM = 1 Fermi-GPC/-GPU)
  cu1DVecAdd64<<<112,192>>>(d_A2_p,d_B2_p,d_C2_p);
  //std::cout << "Fehler?: " << cudaGetErrorString(cudaGetLastError()) << '\n';
  // CPU external GPU-Threads synchronisation
  cudaDeviceSynchronize();
  */
  omp_set_num_threads(2); // 2 GPU requires max. 2 CPU-Threads
  /* Create OpenMP-parallel sections-region to use Multi-CPU and Multi-GPU.
     Num sections (2) = Num CPU-Threads (2).
     For CUDA: only one CPU-Thread can use one GPU-session at the same time. */
  #pragma omp sections
  {
    #pragma omp section
    {
      // choose GPU1
      cudaSetDevice(0);
      /* CUDA-kernel calls on GPU1 with
         112 Blöcke (8 per 1 SM) & 192 Threads per Block = 21504 Threads = 672 Warps (=14 SM = 1 Fermi-GPC/-GPU) */
      cu1DVecAdd64<<<112,192>>>(d_A1_p,d_B1_p,d_C1_p);
      //std::cout << "Fehler?: " << cudaGetErrorString(cudaGetLastError()) << '\n';
      // CPU external GPU-Threads synchronisation
      cudaDeviceSynchronize();
    }
    #pragma omp section
    {
      // choose GPU2
      cudaSetDevice(1);
      /* CUDA-kernel calls on GPU2 with
         112 Blöcke (8 per 1 SM) & 192 Threads per Block = 21504 Threads = 672 Warps (=14 SM = 1 Fermi-GPC/-GPU) */
      cu1DVecAdd64<<<112,192>>>(d_A2_p,d_B2_p,d_C2_p);
      //std::cout << "Fehler?: " << cudaGetErrorString(cudaGetLastError()) << '\n';
      // CPU external GPU-Threads synchronisation
      cudaDeviceSynchronize();
    }
  }
  pdelay2 = omp_get_wtime()-starttime;
  // choose GPU 1
  cudaSetDevice(0);
  // copy calculated device-data from device memory of GPU1 back into CPU-data partition1 (RAM)
  cudaMemcpy(h_C1_p,d_C1_p,((NumValues/2)*sizeof(uint64_t)),cudaMemcpyDeviceToHost);
  // choose GPU 2
  cudaSetDevice(1);
  // copy calculated device-data from device memory of GPU2 back into CPU-data partition2 (RAM)
  cudaMemcpy(h_C2_p,d_C2_p,((NumValues/2)*sizeof(uint64_t)),cudaMemcpyDeviceToHost);
  // Merging of partitions
  for(uint64_t n=0ULL;n<(NumValues/2);++n)
  {
    h_C_p[n] = h_C1_p[n];
    h_C_p[n+(NumValues/2)] = h_C2_p[n];
  }
  std::cout << "                   done\n"; //parallel (GPU)

  //++++++++++++++++++++++++++End: Multi-GPU-parallel CUDA-execution of algorithm++++++++++++++++++++

  //--------------------------Analysis of results----------------------------------------------------

  std::cout << "Überprüfe Ergebnisse:";
  for(uint64_t l=0ULL;l<NumValues;++l)
  {
    if((C_p[l]==C_s[l])&&(h_C_p[l]==C_s[l]))
    {
      ResultsAreCorrect = true;
    }
    else
    {
      ResultsAreCorrect = false;
      break;
    }
  }
  std::cout << "                                          done\n";

  std::cout << "\nAuswertung:\n"
            << "*******************************************************************\n"
            << "Anzahl 1D-Eingangsvektoren  A: " << NumValues << '\n'
            << "Anzahl 1D-Eingangsvektoren  B: " << NumValues << '\n'
            << "Anzahl 1D-Ergebnis-Vektoren C: " << NumValues << '\n'
            << "Seriell & parallel richtig gerechnet?:                          " << ((ResultsAreCorrect==true)?"yes\n":" no\n")
            << "Dauer - SERIELL:           " << sdelay << " sec\n"
            << "Dauer - PARALLEL (CPU):    " << pdelay1 << " sec\n"
            << "Dauer - PARALLEL (GPU):    " << pdelay2 << " sec\n"
            << "__________________\n"
            << "Beispiele:\n"
            << "==> 1.(1D)-Vektoraddition:\n"
            << "C1 = A1(" << A_p[0] << ") + B1(" << B_p[0] << ")\n"
            << "C1 = " << C_p[0] << "\n"
            << "==> " << NumValues << ".(1D)-Vektoraddition:\n"
            << "C" << NumValues << " = A" << NumValues << "(" << A_p[NumValues-1] << ") + B" << NumValues << "(" << B_p[NumValues-1] << ")\n"
            << "C" << NumValues << " = " << C_p[NumValues-1] << "\n";
  /*
  // Detailed output
  std::cout << "__________________\n"
            << "Ergebnisliste:\n";
  for(uint64_t m=0ULL;m<NumValues;++m)
  {
    std::cout << "Seriell:    A" << (m+1) << " = " << A_s[m] << "    B" << (m+1) << " = " << B_s[m] << "    C" << (m+1) << " = " << C_s[m] << '\n';
    std::cout << "Parallel (CPU):   A" << (m+1) << " = " << A_p[m] << "    B" << (m+1) << " = " << B_p[m] << "    C" << (m+1) << " = " << C_p[m] << '\n';
    std::cout << "Parallel (GPU):   A" << (m+1) << " = " << h_A_p[m] << "    B" << (m+1) << " = " << h_B_p[m] << "    C" << (m+1) << " = " << h_C_p[m] << '\n';
  }
  */
  // free allocated data on GPU memory
  cudaFree(d_A1_p);
  cudaFree(d_B1_p);
  cudaFree(d_C1_p);
  cudaFree(d_A2_p);
  cudaFree(d_B2_p);
  cudaFree(d_C2_p);

  getchar();
  return 0;
}

// DEVICE: Implementation of CUDA-kernel "cu1DVecAdd64()"
__global__ void cu1DVecAdd64(uint64_t *D_A, uint64_t *D_B, uint64_t *D_C)
{
  // Get GPU-Thread-ID by linearization of the Thread-ID in the GPU-(Grid-and-Block)-construct (Grid- & Block locale)
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  // algorithm
  while(tid < (NumValues/2)) // iterate through all the data in GPU-memory - PART 1
  {
    D_C[tid] = D_A[tid]+D_B[tid];
    tid+=(blockDim.x*gridDim.x);  // iterate through all the data in GPU-memory - PART 2
  }
  // block locale GPU-Threads synchronization
  __syncthreads();
}