/*
COPYRIGHT (2011-2012) by:
Kevin Marco Erler (author), http://www.kevinerler.de
AIU-FSU Jena (co-owner), http://www.astro.uni-jena.de
SBSZ Jena-Göschwitz (co-owner), http://www.sbsz-jena.de
BSZ-Hermsdorf (co-owner), http://www.bszh.de
Advanced Licensing (dual license: COPYRIGHT and following licenses):
License (international): CC-BY v3.0-unported or later - link: http://creativecommons.org/licenses/by/3.0/deed.en
License (Germany):       CC-BY v3.0-DE       or later - link: http://creativecommons.org/licenses/by/3.0/de/
------------------
Compilation requirements:
Packages (x86-64):
  GCC >v4.2, compat. libstdc++ and GOMP v3.0
Normal-Compile with g++-Compiler (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested)
  g++ -std=c++0x -m64 -fopenmp -Wall -Wextra -pedantic -pedantic-errors -lgomp -lm -s <source.cpp> -o <dest>
Release-Compile with g++-Compiler (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested)
  g++ -std=c++0x -m64 -fopenmp -Wall -Wextra -pedantic -pedantic-errors -lgomp -lm -O3 -s <source.cpp> -o <dest>
Debug-Compile with g++-Compiler (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested)
  g++ -std=c++0x -m64 -fopenmp -Wall -Wextra -pedantic -pedantic-errors -lgomp -lm -g -ggdb3 <source.cpp> -o <dest>
*/

// Includes of C/C++ libraries for integer types, floating-point types, strings, math and I/O
#include <climits>
#include <cstdint>
#include <cinttypes>
#include <cfloat>
#include <cwchar>
#include <string>  //std:string
#include <string.h>
#include <cstring>
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <sstream>
#include <cmath>

// Conditional compilation (conditional include) of the OpenMP-Mainlib for OpenMP-Support
#ifdef _OPENMP
#include <omp.h>
#endif

using namespace std;

// NOTE(review): this function-like macro shadows the standard library's free()
// for all code below this line. It expands to TWO statements
// ("free(x); *x=NULL"), so under an unbraced if/else/loop only the first one is
// controlled by the condition, and the second one writes through the pointer
// that was just released (*x) rather than resetting the pointer itself.
// No use of it is visible in this source; confirm before relying on it.
#define free(x) free(x); *x=NULL
// printf-style conversion specifiers for 128-bit integers, named after the
// PRI* macros of <cinttypes>. All of them expand to "s", i.e. the value has to
// be passed as a C string (presumably via INT128ToCSTR / UINT128ToCSTR below).
#define PRId128 "s"
#define PRIi128 "s"
#define PRIu128 "s"

// Limits of the 64-/128-bit integer types used in this file.
//
// The 128-bit limits are derived with pure integer arithmetic. Going through
// double (decimal literal * pow(10,18)) cannot represent them: a double has
// only 53 mantissa bits, so the result is rounded, and converting a rounded
// value that lands on 2^127 / 2^128 back to a 128-bit integer is undefined.
const uint64_t    UINT64_MIN  = 0;
const __uint128_t UINT128_MIN = 0;
// All 128 bits set: 2^128 - 1.
const __uint128_t UINT128_MAX = ~static_cast<__uint128_t>(0);
// All bits set except the sign bit: 2^127 - 1.
const __int128_t  INT128_MAX  = static_cast<__int128_t>(UINT128_MAX >> 1);
// -(2^127); written as -MAX - 1 so that no intermediate value overflows.
const __int128_t  INT128_MIN  = -INT128_MAX - 1;

// Stream insertion for unsigned 128-bit integers (the standard library
// provides no overload for __uint128_t).
//
// The value is always written in decimal. The digits are collected in a local
// buffer and inserted as ONE string, so width/fill/alignment set on the stream
// apply to the whole number. Inserting digit by digit would pad only the
// leading digit and let flags such as std::showbase act on every single digit.
//
// out: destination stream
// x:   value to print
// Returns out, to allow chaining.
std::ostream &operator<<(std::ostream &out, __uint128_t x)
{
  // 2^128-1 has 39 decimal digits; one more byte for the terminating '\0'.
  char digits[40];
  char *pos = digits + sizeof(digits) - 1;
  *pos = '\0';

  // Fill the buffer back to front; do/while so that 0 still yields "0".
  do
  {
    *--pos = static_cast<char>('0' + static_cast<unsigned>(x % 10));
    x /= 10;
  } while(x != 0);

  return out << pos;
}

// Stream insertion for signed 128-bit integers.
//
// Writes a leading '-' for negative values and then hands the magnitude to the
// __uint128_t overload of operator<<.
//
// out: destination stream
// x:   value to print
// Returns out, to allow chaining.
std::ostream &operator<<(std::ostream &out, __int128_t x)
{
  __uint128_t magnitude = static_cast<__uint128_t>(x);
  if(x < 0)
  {
    out << '-';
    // Negate in unsigned arithmetic (defined modulo 2^128). Negating the
    // signed value itself overflows for the most negative number, whose
    // magnitude 2^127 does not fit into __int128_t.
    magnitude = 0 - magnitude;
  }
  return out << magnitude;
}

string INT128ToSTR(__int128_t x)
{
  std::stringstream sstr;
  sstr<<x;
  return sstr.str();
}
#define INT128ToCSTR(x) (INT128ToSTR(x)).c_str()

string UINT128ToSTR(__uint128_t x)
{
  std::stringstream sstr;
  sstr<<x;
  return sstr.str();
}
#define UINT128ToCSTR(x) (UINT128ToSTR(x)).c_str()

// Upper bound N of the summation: main() adds up all integers 0..N (inclusive)
// and compares the result against the closed form N*(N+1)/2.
const __uint128_t NumValues = 100000000000ULL;

/*
 * Program entry point.
 *
 * Sums the natural numbers 0..NumValues in three ways and reports results and
 * wall-clock durations:
 *   1. closed form (Gauss summation formula) N*(N+1)/2,
 *   2. serial loop,
 *   3. OpenMP parallel loop with a reduction.
 * Afterwards the 128-bit limit constants are printed and the program waits for
 * a character on stdin before returning 0.
 *
 * The omp_* runtime functions are called unconditionally, so the file has to be
 * built with OpenMP enabled (-fopenmp).
 *
 * argc, argv: unused.
 */
int main(int argc, char *argv[])
{
  // Command-line arguments are not evaluated; the casts silence the
  // unused-parameter warnings of -Wextra.
  (void)argc;
  (void)argv;

  // Runtime manipulation of OpenMP-state variables
  //omp_set_num_threads(4);
  omp_set_dynamic(0);

  // data declarations and implementations
  double starttime = 0.00, gaussdelay = 0.00, sdelay = 0.00, pdelay = 0.00;
  __uint128_t SumGauss = 0ULL, SumSingle = 0ULL, SumParallel = 0ULL;

  std::cout << "Akkumulation - Gaußsche Summenformel (kleiner Gauß)                          (128-Bit)\n"
            << "======================================================================================\n"
            << "AUSFÜHRUNG gaußsche Summenformel:";

  //--------------------------Begin: direct execution of algorithm-----------------------------------

  starttime = omp_get_wtime();
  // direct application of the Gauss summation formula:
  SumGauss = (NumValues*(NumValues+1ULL))/2ULL;
  gaussdelay = omp_get_wtime()-starttime;
  std::cout << "                                                 done\n"; // Gauss summation formula

  //--------------------------End: direct execution of algorithm-------------------------------------

  //--------------------------Begin: CPU-serial execution of algorithm-------------------------------

  // Flush explicitly: the message has no newline and the loop below runs for
  // a long time, so the text could otherwise stay in the stream buffer.
  std::cout << "SERIELLE AUSFÜHRUNG:" << std::flush;
  starttime = omp_get_wtime();
  // CPU-serial algorithm (iterative):
  for(__uint128_t i=0ULL;i<(NumValues+1ULL);++i)
  {
    SumSingle+=i;
  }
  sdelay = omp_get_wtime()-starttime;
  std::cout << "                                                              done\n"; //serial (iterative)

  //--------------------------End: CPU-serial execution of algorithm---------------------------------

  //--------------------------Begin: CPU-parallel OpenMP-execution of algorithm----------------------

  std::cout << "PARALLELE AUSFÜHRUNG mit ";

  //__uint128_t *Threads = new __uint128_t[omp_get_max_threads()];
  // create parallel region: shared(Threads,...) if using "Threads"-variable
  // NumValues is listed explicitly: since OpenMP 4.0 const-qualified variables
  // are no longer predetermined shared, so default(none) requires a clause for
  // them. firstprivate is accepted by OpenMP 3.x compilers as well.
  #pragma omp parallel default(none) shared(std::cout, starttime, pdelay, SumParallel) firstprivate(NumValues)
  {
    #pragma omp master
    {
      std::cout << omp_get_num_threads() << " Threads:" << std::flush;
      starttime = omp_get_wtime();
    }

    // "master" has no implied barrier; wait here so that no thread starts
    // summing before the start time has been recorded.
    #pragma omp barrier

    // OpenMP-CPU-parallel algorithm with the reduction clause:
    #pragma omp for schedule(static) reduction(+: SumParallel)
    for(__uint128_t k=0ULL;k<(NumValues+1ULL);++k)
    {
      SumParallel+=k;
      //Threads[omp_get_thread_num()]+=k;
    }

    // The implicit barrier at the end of the "for" construct guarantees that
    // all threads are done before the master takes the end time.
    #pragma omp master
    {
      pdelay = omp_get_wtime()-starttime;
      // One padding space less for a two-digit thread count keeps "done" aligned.
      if(omp_get_num_threads() >= 10)
      {
        std::cout << "                                              done\n";  //parallel (iterative)
      }
      else
      {
        std::cout << "                                               done\n"; //parallel (iterative)
      }
      /*
      //Partial results of each CPU-Thread
      for(int j=0;j<omp_get_num_threads();++j)
      {
        if(j<9)
        {
          std::cout << "Summe von Thread  " << (j+1) << ": " << Threads[j] << '\n';
        }
        else
        {
          std::cout << "Summe von Thread " << (j+1) << ": " << Threads[j] << '\n';
        }
      }*/
    }
  }

  //--------------------------End: CPU-parallel OpenMP-execution of algorithm------------------------

  //--------------------------Analysis of results----------------------------------------------------
  std::cout << "\nAuswertung:\n"
            << "**************************************************************************************\n"
            << "Anzahl aufeinanderfolgender natürliche Zahlen: " << NumValues << '\n'
            << "Summe - SERIELL:                               " << SumSingle << "\n"
            << "Summe - PARALLEL:                              " << SumParallel << "\n"
            << "Referenzwert (Gaußsche Summenformel):          " << SumGauss << '\n'
            << "Dauer - Gaußsche Summenformel:                 " << gaussdelay << " sec\n"
            << "Dauer - SERIELL:                               " << sdelay << " sec\n"
            << "Dauer - PARALLEL:                              " << pdelay << " sec\n"
            << "__________________"
            << "\n128-Bit-Werte:\n"
            << "INT128_MIN:                                   "  << INT128_MIN << '\n'
            << "INT128_MAX:                                    " << INT128_MAX << '\n'
            << "UINT128_MIN:                                   " << UINT128_MIN << '\n'
            << "UINT128_MAX:                                   " << UINT128_MAX << '\n';

  //delete []Threads;

  // Keep the console open until a character is read from stdin.
  getchar();
  return 0;
}