#include <mkl.h>
#include <omp.h>

/*
 * Monte Carlo estimate of pi.
 *
 * Draws n points (x, y) with both coordinates uniform on the 0..1 range
 * requested from vsRngUniform, counts those with x*x + y*y < 1 (inside the
 * quarter unit circle) and returns 4 * hits / n.
 *
 * The work is split into blocks of BLOCK_SIZE samples that are distributed
 * over the OpenMP threads; a trailing partial block covers the case where n
 * is not a multiple of BLOCK_SIZE, so every requested sample is used.
 *
 * Parameters:
 *   n - number of random points to draw.
 *
 * Returns:
 *   The estimate of pi as a float, or 0.0f when n <= 0 (no samples to draw,
 *   which would otherwise produce a 0/0 division).
 *
 * Notes:
 *   - Each thread owns a private MT19937 stream seeded with its OpenMP thread
 *     number, so the result depends on the number of threads and on how the
 *     runtime schedules the blocks.
 *   - Status codes returned by the VSL calls are not checked.
 */
float ComputePi(int n) {
  /* Compile-time constant so the per-thread buffers are plain arrays, not VLAs. */
  enum { BLOCK_SIZE = 256 };

  if (n <= 0) {
    return 0.0f;
  }

  const int n_full = n / BLOCK_SIZE;          /* number of complete blocks    */
  const int tail = n % BLOCK_SIZE;            /* samples in the partial block */
  const int n_blocks = n_full + (tail != 0);  /* avoids overflow of n + 255   */

  int hits = 0;

#pragma omp parallel reduction(+: hits)
  {
    /* Declared inside the parallel region, hence private to each thread. */
    float x[BLOCK_SIZE], y[BLOCK_SIZE];

    VSLStreamStatePtr rnStream; /* Per-thread random number stream */
    vslNewStream(&rnStream, VSL_BRNG_MT19937, omp_get_thread_num());

#pragma omp for
    for (int block = 0; block < n_blocks; block++) {
      /* Only the last block can be short. */
      const int count = (block < n_full) ? BLOCK_SIZE : tail;

      vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, rnStream, count, x, 0.0f, 1.0f);
      vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, rnStream, count, y, 0.0f, 1.0f);

      for (int i = 0; i < count; i++) {
        const float r = x[i]*x[i] + y[i]*y[i]; /* squared distance from origin */
        if (r < 1.0f) {
          hits++;
        }
      }
    }

    /* Release the stream; without this every call leaks one stream per thread. */
    vslDeleteStream(&rnStream);
  }

  return 4.0f * (float)hits / (float)n;
}
