#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <iomanip>
#include "utility.h"
#include "cuda_runtime.h"
using namespace std;

/*
 * Element-wise integer vector addition: z[i] = x[i] + y[i] for 0 <= i < N.
 *
 * Launch layout: 1D grid of 1D blocks (only the .x components are used).
 * Each thread starts at its global index and advances by the total number
 * of launched threads, so every element is processed by exactly one thread
 * whatever the launch configuration is. A <<<1, 1>>> launch degenerates to
 * a plain sequential loop over all N elements.
 *
 * Parameters:
 *   x, y  input vectors, dereferenced on the device, at least N ints each
 *   z     output vector, dereferenced on the device, at least N ints
 *   N     number of elements; values <= 0 result in no work
 *
 * No shared memory is used.
 */
__global__ void vecAdd(int *x, int *y, int *z, int N) {
    // Clamp before widening: a negative N cast straight to size_t would
    // become a huge element count.
    const size_t count = (N > 0) ? (size_t)N : 0;

    // Widen before multiplying so gridDim.x * blockDim.x cannot overflow.
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    const size_t first = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

    for (size_t i = first; i < count; i += stride) {
        z[i] = x[i] + y[i];
    }
}

/* Terminates the program with a readable message when a CUDA runtime call
 * reports anything other than cudaSuccess. `what` names the failed step. */
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Adds two N-element integer vectors on the GPU and checks the result on
 * the host.
 *
 * Usage: <program> N
 *
 * Steps: allocate and initialise host vectors, allocate device buffers,
 * copy the inputs to the device, run vecAdd, copy the result back, print
 * the elapsed time and report every element where z[i] != x[i] + y[i].
 *
 * The timed region (set_clock() .. elapsed_time()) covers device
 * allocation, both host-to-device copies, the kernel and the copy back.
 *
 * Returns 0 after a completed run (mismatches are only printed), or
 * EXIT_FAILURE on a bad command line or host allocation failure. CUDA
 * failures terminate the process via checkCuda().
 */
int main(int argc, const char **argv) {
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <N>\n", (argc > 0 && argv[0]) ? argv[0] : "vecAdd");
        return EXIT_FAILURE;
    }

    // strtol instead of atoi so that garbage, negative and out-of-range
    // arguments are rejected instead of silently misparsed.
    char *end = NULL;
    const long parsed = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || parsed < 0 || parsed > INT_MAX) {
        fprintf(stderr, "N must be a non-negative integer, got '%s'\n", argv[1]);
        return EXIT_FAILURE;
    }
    const int N = (int)parsed;

    // Computed once in size_t so large N cannot overflow int arithmetic.
    const size_t bytes = (size_t)N * sizeof(int);

    // Host Memory
    int *x = (int *)malloc(bytes);
    int *y = (int *)malloc(bytes);
    int *z = (int *)malloc(bytes);
    // malloc(0) may legitimately return NULL, so only treat NULL as a
    // failure when memory was actually requested.
    if (bytes > 0 && (x == NULL || y == NULL || z == NULL)) {
        fprintf(stderr, "Failed to allocate %zu bytes of host memory per vector\n", bytes);
        free(x); free(y); free(z);
        return EXIT_FAILURE;
    }

    // init_mat is declared outside this file (presumably utility.h); the
    // arguments are passed exactly as before - NOTE(review): looks like
    // (buffer, rows, cols, fill mode), confirm against its definition.
    init_mat(x, 1, N, 0);
    init_mat(y, 1, N, 1);
    init_mat(z, 1, N, -1);

    set_clock();

    // GPU Memory
    int *x_d = NULL, *y_d = NULL, *z_d = NULL;
    checkCuda(cudaMalloc((void **)&x_d, bytes), "cudaMalloc(x_d)");
    checkCuda(cudaMalloc((void **)&y_d, bytes), "cudaMalloc(y_d)");
    checkCuda(cudaMalloc((void **)&z_d, bytes), "cudaMalloc(z_d)");

    checkCuda(cudaMemcpy(x_d, x, bytes, cudaMemcpyHostToDevice), "copy of x to device");
    checkCuda(cudaMemcpy(y_d, y, bytes, cudaMemcpyHostToDevice), "copy of y to device");

    // Compute: one block with one thread, which walks all N elements itself.
    vecAdd<<<1, 1>>>(x_d, y_d, z_d, N);
    // A launch does not return a status; configuration errors show up in
    // cudaGetLastError() and execution faults at the next synchronisation.
    checkCuda(cudaGetLastError(), "vecAdd launch");
    checkCuda(cudaDeviceSynchronize(), "vecAdd execution");

    // Copy result back
    checkCuda(cudaMemcpy(z, z_d, bytes, cudaMemcpyDeviceToHost), "copy of z to host");

    // Named `elapsed` to avoid shadowing the C library's time().
    double elapsed = elapsed_time();
    printf("Time taken: %.4f secs\n", elapsed);

    // Verify against a host-side recomputation.
    for (int i = 0; i < N; i++) {
        if (z[i] != x[i] + y[i])
            printf("Error in code! For i = %d, %d != %d + %d\n", i, z[i], x[i], y[i]);
    }

    free(x); free(y); free(z);
    checkCuda(cudaFree(x_d), "cudaFree(x_d)");
    checkCuda(cudaFree(y_d), "cudaFree(y_d)");
    checkCuda(cudaFree(z_d), "cudaFree(z_d)");

    return 0;
}