/*
  COPYRIGHT (2011-2012) by:
  Kevin Marco Erler (author), http://www.kevinerler.de
  AIU-FSU Jena (co-owner), http://www.astro.uni-jena.de
  SBSZ Jena-Göschwitz (co-owner), http://www.sbsz-jena.de
  BSZ-Hermsdorf (co-owner), http://www.bszh.de

  Advanced Licensing (dual license: COPYRIGHT and following licenses):
  License (international): CC-BY v3.0-unported or later - link: http://creativecommons.org/licenses/by/3.0/deed.en
  License (Germany): CC-BY v3.0-DE or later - link: http://creativecommons.org/licenses/by/3.0/de/
  ------------------
  Compilation requirements:
  Packages (x86-64): GCC >v4.2, compat. libstdc++ and GOMP v3.0

  Normal-Compile with g++-Compiler (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested)
  g++ -std=c++0x -m64 -fopenmp -Wall -Wextra -pedantic -pedantic-errors -lgomp -lm -s <source.cpp> -o <dest>

  Release-Compile with g++-Compiler (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested)
  g++ -std=c++0x -m64 -fopenmp -Wall -Wextra -pedantic -pedantic-errors -lgomp -lm -O3 -s <source.cpp> -o <dest>

  Debug-Compile with g++-Compiler (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested)
  g++ -std=c++0x -m64 -fopenmp -Wall -Wextra -pedantic -pedantic-errors -lgomp -lm -g -ggdb3 <source.cpp> -o <dest>
*/

// Includes of C/C++ libraries for INTs, REAL/FLOATs, STRINGS, math and I/O
#include <climits>
#include <cstdint>
#include <cinttypes>
#include <cfloat>
#include <cwchar>
#include <string>   // std::string
#include <string.h>
#include <cstring>
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <sstream>
#include <iomanip>
#include <cmath>
#include <cstddef>
#include <exception>
#include <vector>

// Conditional include of the OpenMP runtime header. Without OpenMP the
// program still builds and runs everything on one thread; <chrono> then
// supplies the wall clock that omp_get_wtime() would otherwise provide.
#ifdef _OPENMP
  #include <omp.h>
#else
  #include <chrono>
#endif

// printf-style conversion specifiers for 128-bit integers. The value has to be
// converted to text first (see INT128ToCSTR / UINT128ToCSTR), hence "%s".
#define PRId128 "s"
#define PRIi128 "s"
#define PRIu128 "s"

// Limits of the 64-bit and 128-bit integer types. The 128-bit limits are
// derived with pure integer arithmetic: a double cannot represent them
// exactly, and converting an out-of-range double to an integer is undefined.
const uint64_t    UINT64_MIN  = 0;
const __uint128_t UINT128_MIN = 0;
const __uint128_t UINT128_MAX = ~static_cast<__uint128_t>(0);
const __int128_t  INT128_MAX  = static_cast<__int128_t>(UINT128_MAX >> 1);
const __int128_t  INT128_MIN  = -INT128_MAX - 1;

namespace
{
  // Renders a 128-bit magnitude as decimal text, prefixed with '-' if requested.
  std::string decimalText128(__uint128_t magnitude, bool negative)
  {
    char buffer[41];                           // 39 digits + sign + terminator
    char *cursor = buffer + sizeof(buffer) - 1;
    *cursor = '\0';
    do
    {
      *--cursor = static_cast<char>('0' + static_cast<unsigned>(magnitude % 10));
      magnitude /= 10;
    } while(magnitude != 0);
    if(negative)
    {
      *--cursor = '-';
    }
    return std::string(cursor);
  }
}

// Converts an unsigned 128-bit integer to its decimal representation.
std::string UINT128ToSTR(__uint128_t x)
{
  return decimalText128(x, false);
}

// Converts a signed 128-bit integer to its decimal representation.
std::string INT128ToSTR(__int128_t x)
{
  const bool negative = (x < 0);
  // Negate in unsigned arithmetic so that INT128_MIN does not overflow.
  const __uint128_t bits = static_cast<__uint128_t>(x);
  return decimalText128(negative ? (0 - bits) : bits, negative);
}

// Stream insertion for unsigned 128-bit integers. The number is written as one
// string, so a field width set on the stream applies to the whole value.
std::ostream &operator<<(std::ostream &out, __uint128_t x)
{
  return out << UINT128ToSTR(x);
}

// Stream insertion for signed 128-bit integers (sign included in the field).
std::ostream &operator<<(std::ostream &out, __int128_t x)
{
  return out << INT128ToSTR(x);
}

// C-string views for use with the PRI?128 specifiers. The pointer belongs to a
// temporary std::string and is valid only until the end of the full expression
// in which the macro is used (e.g. the printf call itself).
#define INT128ToCSTR(x)  (INT128ToSTR(x)).c_str()
#define UINT128ToCSTR(x) (UINT128ToSTR(x)).c_str()

// NVdimN = number of values along dimension N
const uint64_t NVdim1 = 950ULL, // dim1 = x (col)
               NVdim2 = 950ULL, // dim2 = y (row)
               NVdim3 = 950ULL; // dim3 = z (depth)

namespace
{
  // Dense 3D matrix of 64-bit values stored contiguously on the heap, with z
  // as the fastest-running index. Six such matrices of 950^3 elements need
  // about 41 GB in total, which cannot live on the stack.
  class Matrix3D
  {
  public:
    // Creates a zero-filled matrix with the given extents.
    Matrix3D(uint64_t dim1, uint64_t dim2, uint64_t dim3)
      : dim2_(static_cast<std::size_t>(dim2)),
        dim3_(static_cast<std::size_t>(dim3)),
        values_(static_cast<std::size_t>(dim1) * dim2_ * dim3_, 0ULL)
    {
    }

    // Element access by (col, row, depth); no bounds checking.
    uint64_t &operator()(uint64_t x, uint64_t y, uint64_t z)
    {
      return values_[offset(x, y, z)];
    }

    uint64_t operator()(uint64_t x, uint64_t y, uint64_t z) const
    {
      return values_[offset(x, y, z)];
    }

    // Direct access to the flat storage (used for filling and comparing).
    std::vector<uint64_t> &values()             { return values_; }
    const std::vector<uint64_t> &values() const { return values_; }

  private:
    std::size_t offset(uint64_t x, uint64_t y, uint64_t z) const
    {
      return (static_cast<std::size_t>(x) * dim2_ + static_cast<std::size_t>(y)) * dim3_
             + static_cast<std::size_t>(z);
    }

    std::size_t           dim2_;
    std::size_t           dim3_;
    std::vector<uint64_t> values_;
  };

  // Number of threads requested at each of the three nesting levels.
  struct ThreadPlan
  {
    int outer;  // team size of the x loop
    int middle; // team size of each nested y loop
    int inner;  // team size of each nested z loop

    int total() const { return outer * middle * inner; }
  };

  // Splits the available threads over three nesting levels: a quarter of the
  // threads at the outer level and up to two at each nested level, so that the
  // product never exceeds maxThreads.
  ThreadPlan planThreads(int maxThreads)
  {
    ThreadPlan plan;
    plan.outer  = (maxThreads >= 4) ? (maxThreads / 4) : 1;
    plan.middle = (maxThreads >= 2) ? 2 : 1;
    plan.inner  = (maxThreads >= 4) ? 2 : 1;
    return plan;
  }

  // Maximum team size the runtime would use for a new parallel region
  // (1 when the program is built without OpenMP).
  int availableThreads()
  {
#ifdef _OPENMP
    return omp_get_max_threads();
#else
    return 1;
#endif
  }

  // Wall-clock time in seconds from an arbitrary fixed origin.
  double wallTimeSeconds()
  {
#ifdef _OPENMP
    return omp_get_wtime();
#else
    return std::chrono::duration<double>(
             std::chrono::steady_clock::now().time_since_epoch()).count();
#endif
  }

  // Runtime configuration of the OpenMP state: fixed team sizes and three
  // active nesting levels (required for the nested parallel regions below).
  void configureOpenMP()
  {
#ifdef _OPENMP
    //omp_set_num_threads(12);
    omp_set_dynamic(0);
  #if _OPENMP < 201811
    // Before OpenMP 5.0 nested parallelism has to be switched on separately;
    // from 5.0 on this call is deprecated and the level limit alone decides.
    omp_set_nested(1);
  #endif
    omp_set_max_active_levels(3);
#endif
  }

  // Assigns 1, 2, 3, ... to the elements in storage order, i.e. the same
  // values a nested x/y/z loop with a running counter would produce.
  void fillSequential(Matrix3D &matrix)
  {
    std::vector<uint64_t> &values = matrix.values();
    for(std::size_t k = 0; k < values.size(); ++k)
    {
      values[k] = static_cast<uint64_t>(k) + 1ULL;
    }
  }

  // CPU-serial algorithm: element-wise sum c = a + b.
  void addSerial(const Matrix3D &a, const Matrix3D &b, Matrix3D &c)
  {
    for(uint64_t x = 0ULL; x < NVdim1; ++x)     // col
    {
      for(uint64_t y = 0ULL; y < NVdim2; ++y)   // row
      {
        for(uint64_t z = 0ULL; z < NVdim3; ++z) // dep
        {
          c(x, y, z) = a(x, y, z) + b(x, y, z);
        }
      }
    }
  }

  // OpenMP-parallel algorithm with nested parallelism: element-wise sum
  // c = a + b. Every loop level opens its own parallel region with the team
  // size taken from the plan. The loop variables are declared inside the
  // loops and are therefore private; everything else is shared. The team
  // sizes are passed in explicitly because omp_get_num_threads() inside a
  // nested num_threads clause reports the size of the enclosing team only.
  void addParallelNested(const Matrix3D &a, const Matrix3D &b, Matrix3D &c,
                         const ThreadPlan &plan)
  {
#ifndef _OPENMP
    (void)plan; // only referenced by the OpenMP directives
#endif
    // Level 1: outer team.
    #pragma omp parallel for schedule(static) num_threads(plan.outer) if(plan.outer > 1)
    for(uint64_t x = 0ULL; x < NVdim1; ++x)     // col
    {
      // Level 2: nested team per x iteration.
      #pragma omp parallel for schedule(static) num_threads(plan.middle) if(plan.middle > 1)
      for(uint64_t y = 0ULL; y < NVdim2; ++y)   // row
      {
        // Level 3: nested team per (x, y) iteration.
        #pragma omp parallel for schedule(static) num_threads(plan.inner) if(plan.inner > 1)
        for(uint64_t z = 0ULL; z < NVdim3; ++z) // dep
        {
          c(x, y, z) = a(x, y, z) + b(x, y, z);
        }
      }
    }
  }

  // Prints the first four and the last two elements of a matrix in the form
  // label(v1;v2;v3;v4;...;vN-1;vN).
  void printPreview(std::ostream &out, const char *label, const Matrix3D &m)
  {
    const uint64_t lastX = NVdim1 - 1ULL,
                   lastY = NVdim2 - 1ULL,
                   lastZ = NVdim3 - 1ULL;
    out << label << '('
        << m(0, 0, 0) << ';' << m(0, 0, 1) << ';' << m(0, 0, 2) << ';' << m(0, 0, 3)
        << ";...;" << m(lastX, lastY, lastZ - 1ULL) << ';' << m(lastX, lastY, lastZ) << ')';
  }

#ifdef MATRIXADD_DETAILED_OUTPUT
  // Prints one "<name><i>[x][y][z](<value>)" term of the detailed result list.
  void printTerm(std::ostream &out, const char *name, uint64_t number,
                 uint64_t x, uint64_t y, uint64_t z, uint64_t value)
  {
    out << name << number << '[' << x << "][" << y << "][" << z << "](" << value << ')';
  }

  // Optional detailed output: one line per element and execution mode.
  // Disabled by default because it prints two lines for every element.
  void printDetailedResults(std::ostream &out,
                            const Matrix3D &MA_s, const Matrix3D &MB_s, const Matrix3D &MC_s,
                            const Matrix3D &MA_p, const Matrix3D &MB_p, const Matrix3D &MC_p)
  {
    out << "__________________\n"
        << "Ergebnisliste:\n";
    uint64_t i = 0ULL;
    for(uint64_t x = 0ULL; x < NVdim1; ++x)     // col
    {
      for(uint64_t y = 0ULL; y < NVdim2; ++y)   // row
      {
        for(uint64_t z = 0ULL; z < NVdim3; ++z) // dep
        {
          out << "Seriell: ";
          printTerm(out, "MA", i + 1ULL, x, y, z, MA_s(x, y, z)); out << " + ";
          printTerm(out, "MB", i + 1ULL, x, y, z, MB_s(x, y, z)); out << " = ";
          printTerm(out, "MC", i + 1ULL, x, y, z, MC_s(x, y, z)); out << "\n";
          out << "Parallel: ";
          printTerm(out, "MA", i + 1ULL, x, y, z, MA_p(x, y, z)); out << " + ";
          printTerm(out, "MB", i + 1ULL, x, y, z, MB_p(x, y, z)); out << " = ";
          printTerm(out, "MC", i + 1ULL, x, y, z, MC_p(x, y, z)); out << "\n";
          ++i;
        }
      }
    }
  }
#endif

  // Runs the complete benchmark: initialization, serial and parallel
  // addition, comparison of both results and the final report.
  // Throws whatever the matrix allocations throw (e.g. std::bad_alloc).
  int runMatrixAddition()
  {
    // The report below prints elements [0][0][0..3] and the last two elements.
    static_assert(NVdim1 >= 1ULL && NVdim2 >= 1ULL && NVdim3 >= 4ULL,
                  "the result preview needs at least 1 x 1 x 4 elements");

    std::cout << "Matrix-Addition / Summenmatrix (3D) (64-Bit)\n"
              << "===================================================================\n"
              << "Initialisierung:" << std::flush;

    //--------------------------Begin: Initialization of data------------------------------------------
    // Serial and parallel runs get their own, identically filled inputs.
    Matrix3D MA_s(NVdim1, NVdim2, NVdim3);
    fillSequential(MA_s);
    Matrix3D MB_s(MA_s);
    Matrix3D MA_p(MA_s);
    Matrix3D MB_p(MA_s);
    Matrix3D MC_s(NVdim1, NVdim2, NVdim3);
    Matrix3D MC_p(NVdim1, NVdim2, NVdim3);
    //--------------------------End: Initialization of data--------------------------------------------

    std::cout << " done\n"
              << "SERIELLE AUSFÜHRUNG:" << std::flush;

    //--------------------------Begin: CPU-serial execution of algorithm-------------------------------
    double starttime = wallTimeSeconds();
    addSerial(MA_s, MB_s, MC_s);
    const double sdelay = wallTimeSeconds() - starttime;
    std::cout << " done\n";
    //--------------------------End: CPU-serial execution of algorithm---------------------------------

    //--------------------------Begin: CPU-parallel OpenMP-execution of algorithm----------------------
    const ThreadPlan plan = planThreads(availableThreads());
    std::cout << "PARALLELE AUSFÜHRUNG mit " << plan.total() << " Threads:" << std::flush;
    // The clock is read outside the parallel regions so that no thread can
    // start working before the start time has been taken.
    starttime = wallTimeSeconds();
    addParallelNested(MA_p, MB_p, MC_p, plan);
    const double pdelay = wallTimeSeconds() - starttime;
    std::cout << " done\n";
    //--------------------------End: CPU-parallel OpenMP-execution of algorithm------------------------

    //--------------------------Analysis of results----------------------------------------------------
    std::cout << "Überprüfe Ergebnisse:" << std::flush;
    // Element-wise comparison; stops at the first difference.
    const bool ResultsAreCorrect = (MC_p.values() == MC_s.values());
    std::cout << " done\n";

    std::cout << "\nAuswertung:\n"
              << "*******************************************************************\n"
              << "Anzahl Komponenten der 3D-Eingangsmatrix MA: " << (NVdim1*NVdim2*NVdim3) << '\n'
              << "Anzahl Komponenten der 3D-Eingangsmatrix MB: " << (NVdim1*NVdim2*NVdim3) << '\n'
              << "Anzahl Komponenten der 3D-Ergebnismatrix MC: " << (NVdim1*NVdim2*NVdim3) << '\n'
              << "Seriell & parallel richtig gerechnet?: " << (ResultsAreCorrect ? "yes\n" : " no\n")
              << "Dauer - SERIELL: " << sdelay << " sec\n"
              << "Dauer - PARALLEL: " << pdelay << " sec\n"
              << "__________________\n";

    std::cout << "==>Summenmatrix (seriell berechnet):\n" << " ";
    printPreview(std::cout, "MA", MA_s); std::cout << "\n+";
    printPreview(std::cout, "MB", MB_s); std::cout << "\n=";
    printPreview(std::cout, "MC", MC_s); std::cout << "\n";

    std::cout << "==>Summenmatrix (parallel berechnet):\n" << " ";
    printPreview(std::cout, "MA", MA_p); std::cout << "\n+";
    printPreview(std::cout, "MB", MB_p); std::cout << "\n=";
    printPreview(std::cout, "MC", MC_p); std::cout << "\n";

    std::cout << "__________________"
              << "\n64-Bit-Werte:\n"
              << "INT64_MIN: "  << INT64_MIN  << '\n'
              << "INT64_MAX: "  << INT64_MAX  << '\n'
              << "UINT64_MIN: " << UINT64_MIN << '\n'
              << "UINT64_MAX: " << UINT64_MAX << '\n';

#ifdef MATRIXADD_DETAILED_OUTPUT
    printDetailedResults(std::cout, MA_s, MB_s, MC_s, MA_p, MB_p, MC_p);
#endif

    // Keep the console window open until a key is pressed.
    std::getchar();
    return 0;
  }
}

// Program entry point: configures OpenMP, runs the benchmark and reports a
// failed memory allocation instead of terminating abnormally.
int main(int /*argc*/, char * /*argv*/[])
{
  configureOpenMP();
  try
  {
    return runMatrixAddition();
  }
  catch(const std::exception &error)
  {
    std::cerr << "\nFehler: " << error.what()
              << "\nBenötigter Arbeitsspeicher: "
              << (6ULL * NVdim1 * NVdim2 * NVdim3 * sizeof(uint64_t)) << " Bytes\n";
    return EXIT_FAILURE;
  }
}