// 3DMatrixAdd_Optimized1

/*
COPYRIGHT (2011-2012) by:
Kevin Marco Erler (author), http://www.kevinerler.de
AIU-FSU Jena (co-owner), http://www.astro.uni-jena.de
SBSZ Jena-Göschwitz (co-owner), http://www.sbsz-jena.de
BSZ-Hermsdorf (co-owner), http://www.bszh.de
Advanced Licensing (dual license: COPYRIGHT and following licenses):
License (international): CC-BY v3.0-unported or later - link: http://creativecommons.org/licenses/by/3.0/deed.en
License (Germany):       CC-BY v3.0-DE       or later - link: http://creativecommons.org/licenses/by/3.0/de/
------------------
Compilation requirements:
Packages (x86-64):
  GCC >v4.2, compat. libstdc++ and GOMP v3.0
Normal-Compile with g++-Compiler (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested)
  g++ -std=c++0x -m64 -fopenmp -Wall -Wextra -pedantic -pedantic-errors -lgomp -lm -s <source.cpp> -o <dest>
Release-Compile with g++-Compiler (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested)
  g++ -std=c++0x -m64 -fopenmp -Wall -Wextra -pedantic -pedantic-errors -lgomp -lm -O3 -s <source.cpp> -o <dest>
Debug-Compile with g++-Compiler (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested)
  g++ -std=c++0x -m64 -fopenmp -Wall -Wextra -pedantic -pedantic-errors -lgomp -lm -g -ggdb3 <source.cpp> -o <dest>
*/

// Includes of C/C++ libraries for integers, floating-point numbers, strings, math calculations and I/O
#include <cfloat>
#include <cinttypes>
#include <climits>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cwchar>
#include <string.h>
#include <iomanip>
#include <iostream>
#include <new>
#include <sstream>
#include <string>  //std:string

// Conditional compilation (conditional include) of the OpenMP-Mainlib for OpenMP-Support
#ifdef _OPENMP
#include <omp.h>
#endif

using namespace std;

// Releases the block that the pointer variable x refers to and then resets x
// itself to NULL, so the caller is not left with a dangling pointer.
// - x must be a modifiable pointer variable (it is assigned to).
// - The do/while(0) wrapper makes the expansion a single statement, so the
//   macro is safe in an unbraced if/else.
// - The inner free(x) is not expanded again (a macro is never re-expanded
//   inside its own replacement), so it calls the real library function.
#define free(x) do { free(x); (x) = NULL; } while(0)
// printf-style conversion specifiers for 128-bit integers. All three expand to
// "s": the value has to be converted to a C string first (presumably with
// INT128ToCSTR / UINT128ToCSTR) and is then printed as text.
#define PRId128 "s"
#define PRIi128 "s"
#define PRIu128 "s"

// Smallest value of uint64_t (the standard headers only provide UINT64_MAX).
const uint64_t UINT64_MIN     = 0;

// Limits of the 128-bit integer types, derived with integer arithmetic only.
// Going through double (53-bit mantissa) cannot represent 2^127-1 or 2^128-1;
// the nearest doubles are 2^127 and 2^128, which are out of range for the
// target types, so converting them back is undefined behaviour.
const __uint128_t UINT128_MAX = ~static_cast<__uint128_t>(0);              // all 128 bits set: 2^128-1
const __uint128_t UINT128_MIN = 0;
const __int128_t INT128_MAX   = static_cast<__int128_t>(UINT128_MAX >> 1); // 2^127-1
const __int128_t INT128_MIN   = -INT128_MAX - 1;                           // -2^127, computed without overflow

// Writes an unsigned 128-bit integer to a stream in decimal notation.
// The digits are collected least-significant first and then inserted one by
// one, most-significant first, each as a separate `unsigned` value.
// Returns the stream to allow chaining.
std::ostream &operator<<(std::ostream &out, __uint128_t x)
{
  unsigned digits[39]; // 2^128-1 has 39 decimal digits
  int count = 0;

  do
  {
    digits[count++] = static_cast<unsigned>(x % 10);
    x /= 10;
  }
  while(x != 0);

  // all leading digits first ...
  while(count > 1)
  {
    out << digits[--count];
  }
  // ... and the last (least significant) digit as the returned insertion
  return out << digits[0];
}

// Writes a signed 128-bit integer to a stream in decimal notation.
// A '-' is emitted for negative values, followed by the magnitude, which is
// printed through the __uint128_t stream operator.
// The magnitude is computed in unsigned arithmetic: negating the most negative
// value as a signed integer would overflow (undefined behaviour), whereas
// unsigned subtraction wraps and yields exactly 2^127 for that value.
// Returns the stream to allow chaining.
std::ostream &operator<<(std::ostream &out, __int128_t x)
{
  __uint128_t magnitude = static_cast<__uint128_t>(x);
  if(x < 0)
  {
    out << '-';
    magnitude = static_cast<__uint128_t>(0) - magnitude;
  }
  return out << magnitude;
}

// Returns the decimal text representation of a signed 128-bit integer.
// The value is formatted by inserting it into an in-memory output stream.
string INT128ToSTR(__int128_t x)
{
  std::ostringstream buffer;
  buffer << x;
  return buffer.str();
}
// C-string variant of INT128ToSTR. The pointer refers to the buffer of a
// temporary string and is only valid until the end of the full expression in
// which the macro is used (e.g. as a direct printf argument).
#define INT128ToCSTR(x) (INT128ToSTR(x)).c_str()

// Returns the decimal text representation of an unsigned 128-bit integer.
// The value is formatted by inserting it into an in-memory output stream.
string UINT128ToSTR(__uint128_t x)
{
  std::ostringstream buffer;
  buffer << x;
  return buffer.str();
}
// C-string variant of UINT128ToSTR. The pointer refers to the buffer of a
// temporary string and is only valid until the end of the full expression in
// which the macro is used (e.g. as a direct printf argument).
#define UINT128ToCSTR(x) (UINT128ToSTR(x)).c_str()

// NVdimN = number of values along dimension N of the 3D matrices
const uint64_t NVdim1 = 950ULL;    // dim1 = x (col)
const uint64_t NVdim2 = 950ULL;    // dim2 = y (row)
const uint64_t NVdim3 = 950ULL;    // dim3 = z (depth)

int main(int argc, char *argv[])
{
  // Runtime manipulation of OpenMP-state variables
  //omp_set_num_threads(12);
  omp_set_dynamic(0);
  omp_set_nested(3);  // important for nested parallelism

  // data declarations and implementations
  double starttime = 0.00, sdelay = 0.00, pdelay = 0.00;
  uint64_t MA_s[NVdim1][NVdim2][NVdim3] = {{{0ULL}}}, \
           MB_s[NVdim1][NVdim2][NVdim3] = {{{0ULL}}}, \
           MC_s[NVdim1][NVdim2][NVdim3] = {{{0ULL}}}, \
           MA_p[NVdim1][NVdim2][NVdim3] = {{{0ULL}}}, \
           MB_p[NVdim1][NVdim2][NVdim3] = {{{0ULL}}}, \
           MC_p[NVdim1][NVdim2][NVdim3] = {{{0ULL}}}, \
           x = 0ULL, y = 0ULL, z = 0ULL, i = 0ULL; // x: col, y: row, z: depth, i: Initialisierung
  bool ResultsAreCorrect = true;

  std::cout << "Matrix-Addition / Summenmatrix (3D)                        (64-Bit)\n"
            << "===================================================================\n"
            << "Initialisierung:";

  //--------------------------Begin: Initialization of data------------------------------------------

  x = y = z = i = 0ULL;
  //vektorielle Initialisierung
  for(x=0ULL;x<NVdim1;++x)     //col
  {
    for(y=0ULL;y<NVdim2;++y)   //row
    {
      for(z=0ULL;z<NVdim3;++z) //dep
      {
        MA_s[x][y][z] = MB_s[x][y][z] = MA_p[x][y][z] = MB_p[x][y][z] = (i+1ULL);
        ++i;
      }
    }
  }

  //--------------------------End: Initialization of data--------------------------------------------

  std::cout << "                                               done\n"
            << "SERIELLE AUSFÜHRUNG:";

  //--------------------------Begin: CPU-serial execution of algorithm-------------------------------

  x = y = z = 0ULL;
  starttime = omp_get_wtime();
  //CPU-serial algorithm (vektorielle Addition [Bildung der Summenmatrix]):
  for(x=0ULL;x<NVdim1;++x)     //col
  {
    for(y=0ULL;y<NVdim2;++y)   //row
    {
      for(z=0ULL;z<NVdim3;++z) //dep
      {
        MC_s[x][y][z] = MA_s[x][y][z] + MB_s[x][y][z];
      }
    }
  }
  sdelay = omp_get_wtime()-starttime;
  std::cout << "                                           done\n"; //serial

  //--------------------------End: CPU-serial execution of algorithm---------------------------------

  //--------------------------Begin: CPU-parallel OpenMP-execution of algorithm----------------------

  std::cout << "PARALLELE AUSFÜHRUNG mit ";

  x = y = z = 0ULL;
  // PRegion1: create parallel region: num threads = MaxThreads / 4 (quarter size of max)
  #pragma omp parallel if(omp_get_max_threads()>1) \
                       num_threads(((omp_get_max_threads()>=4)?(omp_get_max_threads()/4):1)) \
                       default(none) private(x, y, z) shared(std::cout, starttime, pdelay, MA_p, MB_p, MC_p)
  {
    #pragma omp master
    {
      std::cout << ((omp_get_num_threads()>1)?((omp_get_num_threads()*2)*2):1) << " Threads:";
      starttime = omp_get_wtime();
    }

    //OpenMP-CPU-parallel algorithm with nested parallelism (vektorielle Addition [Bildung der Summenmatrix]):
    #pragma omp flush
    #pragma omp for schedule(static)
    for(x=0ULL;x<NVdim1;++x)     //col
    {
      /* PRegion2: create nested parallel region with 2 threads (requires omp_set_nested(<!= 0>);)
         new num threads exec. = NumThreadsOfPRegion1 * NumThreadsOfPRegion2 (QuarterOfMaxThreads * 2 = HalfOfMaxThreads) */
      #pragma omp parallel if(omp_get_max_threads()>1) \
                           num_threads(((omp_get_num_threads()>3)?2:1)) \
                           default(none) shared(x, y, z, MA_p, MB_p, MC_p)
      {
        #pragma omp for schedule(static)
        for(y=0ULL;y<NVdim2;++y)   //row
        {
          /* PRegion3: create nested parallel region with 2 threads (requires omp_set_nested(<!= 0>);)
             num threads exec. total = NumThreadsOfPRegion1 * NumThreadsOfPRegion2 * NumThreadsOfPRegion3 (((1/4)*2)*2; HalfOfMaxThreads * 2 = MaxThreads) */
          #pragma omp parallel if(omp_get_max_threads()>1) \
                           num_threads(((omp_get_num_threads()>3)?2:1)) \
                           default(none) shared(x, y, z, MA_p, MB_p, MC_p)
          {
            #pragma omp for schedule(static)
            for(z=0ULL;z<NVdim3;++z) //dep
            {
              MC_p[x][y][z] = MA_p[x][y][z] + MB_p[x][y][z];
            }
          }
        }
      }
    }

    #pragma omp master
    {
      pdelay = omp_get_wtime()-starttime;
      if(((omp_get_num_threads()*2)*2) >= 10)
      {
        std::cout << "                           done\n";  //parallel
      }
      else
      {
        std::cout << "                            done\n"; //parallel
      }
    }
  }
  x = y = z = 0ULL;

  //--------------------------End: CPU-parallel OpenMP-execution of algorithm------------------------

  //--------------------------Analysis of results----------------------------------------------------

  std::cout << "Überprüfe Ergebnisse:"; //vektorielle Überprüfung
  for(x=0ULL;x<NVdim1;++x)     //col
  {
    for(y=0ULL;y<NVdim2;++y)   //row
    {
      for(z=0ULL;z<NVdim3;++z) //dep
      {
        if(MC_p[x][y][z]!=MC_s[x][y][z])
        {
          ResultsAreCorrect = false;
          break;
        }
      }
    }
  }
  std::cout << "                                          done\n";

  std::cout << "\nAuswertung:\n"
            << "*******************************************************************\n"
            << "Anzahl Komponenten der 3D-Eingangsmatrix MA: " << (NVdim1*NVdim2*NVdim3) << '\n'
            << "Anzahl Komponenten der 3D-Eingangsmatrix MB: " << (NVdim1*NVdim2*NVdim3) << '\n'
            << "Anzahl Komponenten der 3D-Ergebnismatrix MC: " << (NVdim1*NVdim2*NVdim3) << '\n'
            << "Seriell & parallel richtig gerechnet?:                          " << ((ResultsAreCorrect==true)?"yes\n":" no\n")
            << "Dauer - SERIELL:     " << sdelay << " sec\n"
            << "Dauer - PARALLEL:    " << pdelay << " sec\n"
            << "__________________\n"
            << "==>Summenmatrix (seriell berechnet):\n"
            << " MA(" << MA_s[0][0][0] << ';' << MA_s[0][0][1] << ';' << MA_s[0][0][2] << ';' << MA_s[0][0][3] << ";...;"
                     << MA_s[NVdim1-1][NVdim2-1][NVdim3-2] << ';' << MA_s[NVdim1-1][NVdim2-1][NVdim3-1] << ")\n+"
            << "MB(" << MB_s[0][0][0] << ';' << MB_s[0][0][1] << ';' << MB_s[0][0][2] << ';' << MB_s[0][0][3] << ";...;"
                     << MB_s[NVdim1-1][NVdim2-1][NVdim3-2] << ';' << MB_s[NVdim1-1][NVdim2-1][NVdim3-1] << ")\n="
            << "MC(" << MC_s[0][0][0] << ';' << MC_s[0][0][1] << ';' << MC_s[0][0][2] << ';' << MC_s[0][0][3] << ";...;"
                     << MC_s[NVdim1-1][NVdim2-1][NVdim3-2] << ';' << MC_s[NVdim1-1][NVdim2-1][NVdim3-1] << ")\n"
            << "==>Summenmatrix (parallel berechnet):\n"
            << " MA(" << MA_p[0][0][0] << ';' << MA_p[0][0][1] << ';' << MA_p[0][0][2] << ';' << MA_p[0][0][3] << ";...;"
                     << MA_p[NVdim1-1][NVdim2-1][NVdim3-2] << ';' << MA_p[NVdim1-1][NVdim2-1][NVdim3-1] << ")\n+"
            << "MB(" << MB_p[0][0][0] << ';' << MB_p[0][0][1] << ';' << MB_p[0][0][2] << ';' << MB_p[0][0][3] << ";...;"
                     << MB_p[NVdim1-1][NVdim2-1][NVdim3-2] << ';' << MB_p[NVdim1-1][NVdim2-1][NVdim3-1] << ")\n="
            << "MC(" << MC_p[0][0][0] << ';' << MC_p[0][0][1] << ';' << MC_p[0][0][2] << ';' << MC_p[0][0][3] << ";...;"
                     << MC_p[NVdim1-1][NVdim2-1][NVdim3-2] << ';' << MC_p[NVdim1-1][NVdim2-1][NVdim3-1] << ")\n"
            << "__________________"
            << "\n64-Bit-Werte:\n"
            << "INT64_MIN:                                    "  << INT64_MIN << '\n'
            << "INT64_MAX:                                     " << INT64_MAX << '\n'
            << "UINT64_MIN:                                    " << UINT64_MIN << '\n'
            << "UINT64_MAX:                                    " << UINT64_MAX << '\n';
  /*
  // Detailed output
  std::cout << "__________________\n"
            << "Ergebnisliste:\n";
  x = y = z = i = 0ULL;
  for(x=0ULL;x<NVdim1;++x)     //col
  {
    for(y=0ULL;y<NVdim2;++y)   //row
    {
      for(z=0ULL;z<NVdim3;++z) //dep
      {
        std::cout << "Seriell:    MA" << (i+1ULL) << '[' << x << "][" << y << "][" << z << "](" << MA_s[x][y][z] << ") + "
                  <<             "MB" << (i+1ULL) << '[' << x << "][" << y << "][" << z << "](" << MB_s[x][y][z] << ") = "
                  <<             "MC" << (i+1ULL) << '[' << x << "][" << y << "][" << z << "](" << MC_s[x][y][z] << ")\n"
                  << "Parallel:   MA" << (i+1ULL) << '[' << x << "][" << y << "][" << z << "](" << MA_p[x][y][z] << ") + "
                  <<             "MB" << (i+1ULL) << '[' << x << "][" << y << "][" << z << "](" << MB_p[x][y][z] << ") = "
                  <<             "MC" << (i+1ULL) << '[' << x << "][" << y << "][" << z << "](" << MC_p[x][y][z] << ")\n";
        ++i;
      }
    }
  }
  */
  getchar();
  return 0;
}