#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <iomanip>
#include "utility.h"
#include "cuda_runtime.h"
using namespace std;

/*
 * Element-wise vector addition: z[i] = x[i] + y[i] for every i in [0, N).
 *
 * Each thread starts at its flat global index and then advances by the total
 * number of launched threads, so any 1D launch configuration covers the whole
 * range. Only the x dimension of the grid and block is used.
 *
 * x, y : device-accessible pointers to at least N input elements
 * z    : device-accessible pointer to at least N output elements
 * N    : number of elements; nothing is read or written when N <= 0
 */
__global__ void vecAdd1(int *x, int *y, int *z, int N) {
    // blockDim/blockIdx/gridDim are 32-bit unsigned. Doing the index math in
    // 64 bits keeps the starting index from wrapping (and turning negative
    // once stored in an int) on very large launches, and keeps `i += stride`
    // from overflowing when N is close to INT_MAX.
    const long long stride = (long long)blockDim.x * gridDim.x;
    const long long first = (long long)blockDim.x * blockIdx.x + threadIdx.x;

    for (long long i = first; i < N; i += stride) {
        z[i] = x[i] + y[i];
    }
}

/*
 * Element-wise vector addition: z[i] = x[i] + y[i] for every i in [0, N).
 *
 * The index range is cut into one contiguous chunk per launched thread.
 * With T = total threads, every thread gets N / T elements and the first
 * N % T threads get one additional element, so chunk sizes differ by at most
 * one and the chunks tile [0, N) exactly. Only the x dimension of the grid
 * and block is used.
 *
 * x, y : device-accessible pointers to at least N input elements
 * z    : device-accessible pointer to at least N output elements
 * N    : number of elements; nothing is read or written when N <= 0
 */
__global__ void vecAdd2(int *x, int *y, int *z, int N) {
    if (N <= 0) {
        return;
    }

    // 64-bit arithmetic: the thread count is a product of two 32-bit unsigned
    // values and could otherwise wrap (even to 0, which would then be used as
    // a divisor).
    const long long nthreads = (long long)blockDim.x * gridDim.x;
    const long long tid = (long long)blockDim.x * blockIdx.x + threadIdx.x;

    const long long base = N / nthreads;   // elements every thread handles
    const long long extra = N % nthreads;  // leftover elements to spread out

    // Threads [0, extra) take base + 1 elements, the rest take base. The
    // number of one-element bonuses handed out before this thread is
    // min(tid, extra), which shifts its starting offset accordingly.
    const long long bonus_before = (tid < extra) ? tid : extra;
    const long long start = tid * base + bonus_before;
    const long long end = start + base + ((tid < extra) ? 1 : 0);

    for (long long i = start; i < end; ++i) {
        z[i] = x[i] + y[i];
    }
}

// Terminates the program with a diagnostic if a CUDA runtime call failed.
static void dieOnCudaError(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Prints one line for every index i in [0, N) where z[i] != x[i] + y[i].
static void reportMismatches(const int *x, const int *y, const int *z, int N) {
    for (int i = 0; i < N; i++) {
        if (z[i] != x[i] + y[i])
            printf("Error in code! For i = %d, %d != %d + %d\n", i, z[i], x[i], y[i]);
    }
}

/*
 * Benchmark driver: runs vecAdd1 and vecAdd2 on vectors of N ints, prints the
 * time each kernel took, and checks each result against a host-side sum.
 *
 * Usage: <program> N     (N is a positive integer element count)
 *
 * Returns 0 after both runs complete (mismatches are only reported on
 * stdout); exits with EXIT_FAILURE on bad arguments, allocation failure, or
 * any CUDA error.
 */
int main(int argc, const char **argv) {
    if (argc < 2) {
        fprintf(stderr, "Usage: %s N\n", (argc > 0 && argv[0]) ? argv[0] : "vecadd");
        return EXIT_FAILURE;
    }

    // strtol instead of atoi so that garbage, negative, and out-of-range
    // values are rejected rather than silently turned into an allocation size.
    char *parse_end = NULL;
    const long parsed = strtol(argv[1], &parse_end, 10);
    if (parse_end == argv[1] || *parse_end != '\0' || parsed <= 0 || parsed > INT_MAX) {
        fprintf(stderr, "N must be a positive integer no larger than %d (got \"%s\")\n",
                INT_MAX, argv[1]);
        return EXIT_FAILURE;
    }
    const int N = (int)parsed;
    const size_t bytes = (size_t)N * sizeof(int);

    // Launch configuration shared by both kernels.
    const int kBlocks = 1024;
    const int kThreadsPerBlock = 1024;

    // Host Memory
    int *x = (int *)malloc(bytes);
    int *y = (int *)malloc(bytes);
    int *z = (int *)malloc(bytes);
    if (x == NULL || y == NULL || z == NULL) {
        fprintf(stderr, "Failed to allocate %zu bytes of host memory per vector\n", bytes);
        free(x);
        free(y);
        free(z);
        return EXIT_FAILURE;
    }

    // init_mat comes from utility.h; the meaning of its arguments is not
    // visible in this file, so the calls are kept exactly as they were.
    init_mat(x, 1, N, 0);
    init_mat(y, 1, N, 1);
    init_mat(z, 1, N, -1);

    // GPU Memory
    int *x_d = NULL, *y_d = NULL, *z_d = NULL;
    dieOnCudaError(cudaMalloc((void**) &x_d, bytes), "cudaMalloc(x_d)");
    dieOnCudaError(cudaMalloc((void**) &y_d, bytes), "cudaMalloc(y_d)");
    dieOnCudaError(cudaMalloc((void**) &z_d, bytes), "cudaMalloc(z_d)");

    dieOnCudaError(cudaMemcpy(x_d, x, bytes, cudaMemcpyHostToDevice), "copy x to device");
    dieOnCudaError(cudaMemcpy(y_d, y, bytes, cudaMemcpyHostToDevice), "copy y to device");

    // Compute (vecAdd1)
    set_clock();
    vecAdd1<<<kBlocks, kThreadsPerBlock>>>(x_d, y_d, z_d, N);
    dieOnCudaError(cudaGetLastError(), "vecAdd1 launch");
    dieOnCudaError(cudaDeviceSynchronize(), "vecAdd1 execution");
    double time1 = elapsed_time();

    // Copy result back
    dieOnCudaError(cudaMemcpy(z, z_d, bytes, cudaMemcpyDeviceToHost), "copy vecAdd1 result");

    printf("Time taken for vecAdd1: %.4f secs\n", time1);

    reportMismatches(x, y, z, N);

    // z_d still holds vecAdd1's (correct) output. Overwrite it so that the
    // check after vecAdd2 actually exercises vecAdd2 instead of passing on
    // stale data. Done before the timer is sampled to keep it out of the
    // measurement.
    dieOnCudaError(cudaMemset(z_d, 0xFF, bytes), "reset z_d");

    // Compute (vecAdd2)
    // NOTE(review): this assumes elapsed_time() reports time since the
    // set_clock() call above, so the difference of two samples is the kernel
    // duration -- confirm against utility.h.
    double time2 = elapsed_time();
    vecAdd2<<<kBlocks, kThreadsPerBlock>>>(x_d, y_d, z_d, N);
    dieOnCudaError(cudaGetLastError(), "vecAdd2 launch");
    dieOnCudaError(cudaDeviceSynchronize(), "vecAdd2 execution");
    double time3 = elapsed_time();

    // Copy result back
    dieOnCudaError(cudaMemcpy(z, z_d, bytes, cudaMemcpyDeviceToHost), "copy vecAdd2 result");

    printf("Time taken for vecAdd2: %.4f secs\n", time3-time2);

    reportMismatches(x, y, z, N);

    // Release device and host buffers.
    dieOnCudaError(cudaFree(x_d), "cudaFree(x_d)");
    dieOnCudaError(cudaFree(y_d), "cudaFree(y_d)");
    dieOnCudaError(cudaFree(z_d), "cudaFree(z_d)");
    free(x);
    free(y);
    free(z);

    return 0;
}