/// @file
////////////////////////////////////////////////////////////////////////////////////////////////////
///
/// Copyright (C) 2017/18      Christian Lessig, Otto-von-Guericke Universitaet Magdeburg
///
////////////////////////////////////////////////////////////////////////////////////////////////////
///
///  module     : tutorial 2
///
///  author     : lessig@isg.cs.ovgu.de
///
///  project    : GPU Programming
///
///  description: race conditions
///
////////////////////////////////////////////////////////////////////////////////////////////////////

#include <iostream>
#include <vector>
#include <thread>
#include <cstdlib>
#include <cmath>
#include <chrono>
#include <atomic>
#include <mutex>

std::atomic<int> sum_atomic;
std::mutex sum_mutex;
int sum_global = 0;

typedef std::chrono::milliseconds TimeT;

////////////////////////////////////////////////////////////////////////////////////////////////////
// worker function for nom computation
////////////////////////////////////////////////////////////////////////////////////////////////////
void
compute_norm_mutex( int task, int num_tasks, int* a, int n, int& sum) {

  int chunk_size = n / num_tasks;
  int i1 = task * chunk_size;
  int i2 = (task+1) * chunk_size;

  for( int i = i1; i < i2; ++i) {
    sum_mutex.lock();
    sum_global += a[i] * a[i];
    sum_mutex.unlock();
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////
// worker function for nom computation
////////////////////////////////////////////////////////////////////////////////////////////////////
void
compute_norm_atomic( int task, int num_tasks, int* a, int n, int& sum) {

  int chunk_size = n / num_tasks;
  int i1 = task * chunk_size;
  int i2 = (task+1) * chunk_size;

  for( int i = i1; i < i2; ++i) {
    sum_atomic += a[i] * a[i];
  }
}


////////////////////////////////////////////////////////////////////////////////////////////////////
// worker function for nom computation
////////////////////////////////////////////////////////////////////////////////////////////////////
void
compute_norm( int task, int num_tasks, int* a, int n, int& sum) {

  int chunk_size = n / num_tasks;
  int i1 = task * chunk_size;
  int i2 = (task+1) * chunk_size;

  for( int i = i1; i < i2; ++i) {
    sum += a[i] * a[i];
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////
//
////////////////////////////////////////////////////////////////////////////////////////////////////
void norm1() {

  const unsigned int n = 16 * 8192 * 8192;
  const unsigned int num_threads = std::thread::hardware_concurrency() / 2;

  // allocate memory
  int* a = (int*) malloc( n * sizeof(int));
  for( unsigned int i = 0; i < n; i++) {
    a[i] = 1;
  }

  int sum = 0;

  auto start = std::chrono::steady_clock::now();

  // create threads
  std::vector< std::thread > threads;
  std::vector<int> sums( num_threads, 0);

  for( unsigned int i = 0; i < num_threads; ++i) {
    threads.push_back( std::thread( compute_norm, i, num_threads, a, n,
                                    std::ref( sums[i]) ));
  }

 for( unsigned int i = 0; i < num_threads; ++i) {
    threads[i].join();
    sum += sums[i];
  }

  auto t = std::chrono::duration_cast<TimeT>(std::chrono::steady_clock::now()-start).count();
  std::cout << "time = " << t << std::endl;

  std::cerr << "norm(a) = " << std::sqrt( (float) sum) << std::endl;

  // cleanup memory
  free( a);

}

////////////////////////////////////////////////////////////////////////////////////////////////////
//
////////////////////////////////////////////////////////////////////////////////////////////////////
void norm2() {

  const unsigned int n = 16 * 8192 * 8192;
  const unsigned int num_threads = std::thread::hardware_concurrency() / 2;

  // allocate memory
  int* a = (int*) malloc( n * sizeof(int));
  for( unsigned int i = 0; i < n; i++) {
    a[i] = 1;
  }

  int sum = 0;
  int num_tasks = n / 2048;
  std::vector<int> sums( num_threads, 0);

  // worker function

  std::atomic<int> cnt(0);
  auto worker = [&]( const int tid) {

    int task = 0;
    while((task = cnt++) < num_tasks) {

      compute_norm( task, num_tasks, a, n, sums[tid]);
    }

  };

  // run parallel execution

  auto start = std::chrono::steady_clock::now();

  std::vector< std::thread > threads;
  for( unsigned int i = 0; i < num_threads; ++i) {
    threads.push_back( std::thread( worker, i));
  }

 for( unsigned int i = 0; i < num_threads; ++i) {
    threads[i].join();
    sum += sums[i];
  }

  auto t = std::chrono::duration_cast<TimeT>(std::chrono::steady_clock::now()-start).count();
  std::cout << "time (pool pattern) = " << t << std::endl;

  std::cerr << "norm(a) = " << std::sqrt( (float) sum) << std::endl;

  // cleanup memory
  free( a);
}

////////////////////////////////////////////////////////////////////////////////////////////////////
// program entry point
////////////////////////////////////////////////////////////////////////////////////////////////////
int
main( int /*argc*/, char** /*argv*/ ) {

  norm1();

  norm2();

  return EXIT_SUCCESS;
}
