CUDA

CUDA (Compute Unified Device Architecture) is a parallel computing platform and programming model created by NVIDIA.

It allows developers to use NVIDIA GPUs to perform general-purpose computations (not limited to graphics), dramatically accelerating applications that can be parallelized.


Key Features:

  • Massive parallelism — thousands of threads execute concurrently on the GPU.
  • Easy C/C++ extension — kernels are written as small additions to ordinary C/C++ code.
  • Unified Memory — a single allocation is visible to both CPU and GPU (see the sketch after this list).
  • Fast libraries — cuBLAS, cuDNN, and cuFFT provide highly tuned routines for linear algebra, deep learning, and FFTs.
  • Python friendly — integrates well with PyTorch, TensorFlow, and CuPy.
  • Modern improvements — CUDA Graphs, Tensor Cores, and multi-GPU support.
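
As a quick illustration of the Unified Memory bullet, here is a minimal sketch (not one of the job templates): cudaMallocManaged allocates a single buffer that both host and device code can dereference, so no explicit cudaMemcpy is needed.

#include <stdio.h>
#include <cuda_runtime.h>

// Increment every element of a managed buffer on the GPU
__global__ void addOne(float *data, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] += 1.0f;
}

int main() {
    const int n = 1024;
    float *data;
    cudaMallocManaged(&data, n * sizeof(float)); // one buffer, visible to CPU and GPU
    for (int i = 0; i < n; i++) data[i] = (float)i; // initialize on the host
    addOne<<<(n + 255) / 256, 256>>>(data, n);      // update on the device
    cudaDeviceSynchronize(); // wait before the host reads the results
    printf("data[0]=%f data[%d]=%f\n", data[0], n - 1, data[n - 1]);
    cudaFree(data);
    return 0;
}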

Use Cases:

  • Deep Learning & AI
  • Scientific Simulations
  • Data Science & Analytics
  • Robotics & Autonomous Systems
  • Cryptocurrency & Finance
  • Video & Image Processing
  • Signal Processing

Example source code (Hello World) path → /home/$USER/job_template/C/cuda_hello_world.cu


#include <stdio.h>
#include <cuda_runtime.h>

// CUDA kernel to print Hello World
__global__ void helloWorldKernel() {
    printf("Hello World from thread %d, block %d\n", threadIdx.x, blockIdx.x);
}

int main() {
    // Launch the kernel with 1 block of 10 threads
    helloWorldKernel<<<1, 10>>>();

    // Synchronize to ensure all printf calls are completed
    cudaDeviceSynchronize();

    // Check for any CUDA errors
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA Error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
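
To compile and run the example (assuming nvcc from the CUDA toolkit is on your PATH, e.g. after loading the site's CUDA module):

nvcc cuda_hello_world.cu -o cuda_hello_world
./cuda_hello_world

Each of the 10 threads prints its own line, so the output consists of ten "Hello World from thread X, block 0" messages; the order in which the threads print is not guaranteed.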

Example source code (Pi Calculation) path → /home/$USER/job_template/C/cuda_pi.cu
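
The program approximates pi = 4 * ∫(0→1) 1/(1 + x²) dx with a Riemann sum over N = 1E9 points of width d = 1E-9, i.e. pi ≈ 4 * d * Σ 1/(1 + (i*d)²) for i = 0 .. N-1. The work is split across all visible GPUs: each device fills a contiguous slice of the sum array, the slices are copied back to the host, and the host accumulates the total.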


#include <stdio.h>
#include <stdlib.h> // malloc, exit
#include <cuda_runtime.h>

#define N 1000000000 // 1E9 integration points
#define d 1E-9       // step width (1/N)
#define d2 1E-18     // d squared
// Macro for CUDA error checking
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err = call; \
        if (err != cudaSuccess) { \
            fprintf(stderr, "CUDA error in %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
            exit(1); \
        } \
    } while (0)
// CUDA kernel to compute one device's slice of the partial sums 1/(1 + x^2)
__global__ void compute_pi(double *sum, int workload, int offset) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Guard on the slice length: the grid is rounded up to whole blocks,
    // so idx can reach past the end of this device's buffer
    if (idx < workload) {
        int global_idx = idx + offset;
        double x2 = d2 * global_idx * global_idx; // x2 = (global_idx * d)^2
        sum[idx] = 1.0 / (1.0 + x2);
    }
}
int main() {
    int deviceCount;
    CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
    if (deviceCount == 0) {
        printf("No CUDA devices found!\n");
        return 1;
    }
    printf("Found %d CUDA device(s)\n", deviceCount);

    double pi = 0.0, total_sum = 0.0;
    cudaEvent_t start, stop;
    float total_seconds = 0.0f;

    // Allocate host memory for sums
    double *h_sums = (double*)malloc(N * sizeof(double));
    if (!h_sums) {
        fprintf(stderr, "Host memory allocation failed\n");
        return 1;
    }

    // Create CUDA events for global timing
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Record start time (use default device for global timing)
    CUDA_CHECK(cudaSetDevice(0));
    CUDA_CHECK(cudaEventRecord(start));

    // Calculate workload per GPU
    int base_workload = N / deviceCount;
    int remainder = N % deviceCount;
    int offset = 0;

    // Process each GPU
    for (int dev = 0; dev < deviceCount; dev++) {
        CUDA_CHECK(cudaSetDevice(dev));
        CUDA_CHECK(cudaDeviceSynchronize()); // Ensure device is ready

        // Calculate workload for this GPU
        int workload = base_workload + (dev < remainder ? 1 : 0);
        if (workload == 0) continue;

        float gpu_milliseconds = 0.0f;
        cudaEvent_t gpu_start, gpu_stop;

        // Create events on the current device
        CUDA_CHECK(cudaEventCreate(&gpu_start));
        CUDA_CHECK(cudaEventCreate(&gpu_stop));

        // Record GPU start time
        CUDA_CHECK(cudaEventRecord(gpu_start));

        // Allocate device memory
        double *d_sum;
        CUDA_CHECK(cudaMalloc(&d_sum, workload * sizeof(double)));

        // Set up grid and block dimensions
        int threadsPerBlock = 256;
        int blocks = (workload + threadsPerBlock - 1) / threadsPerBlock;

        // Launch kernel
        compute_pi<<<blocks, threadsPerBlock>>>(d_sum, workload, offset);
        CUDA_CHECK(cudaGetLastError());

        // Copy results back to host
        CUDA_CHECK(cudaMemcpy(&h_sums[offset], d_sum, workload * sizeof(double), cudaMemcpyDeviceToHost));

        // Record GPU end time
        CUDA_CHECK(cudaEventRecord(gpu_stop));
        CUDA_CHECK(cudaEventSynchronize(gpu_stop)); // Ensure GPU work is complete
        CUDA_CHECK(cudaEventElapsedTime(&gpu_milliseconds, gpu_start, gpu_stop));

        // Convert milliseconds to seconds for this GPU
        float gpu_seconds = gpu_milliseconds / 1000.0f;
        printf("GPU %d: Time=%f seconds, Workload=%d\n", dev, gpu_seconds, workload);

        // Update offset and accumulate time
        offset += workload;
        total_seconds += gpu_seconds;

        // Clean up device resources
        CUDA_CHECK(cudaFree(d_sum));
        CUDA_CHECK(cudaEventDestroy(gpu_start));
        CUDA_CHECK(cudaEventDestroy(gpu_stop));

        // Synchronize device before switching
        CUDA_CHECK(cudaDeviceSynchronize());
    }

    // Sum results on host
    for (int i = 0; i < N; i++) {
        total_sum += h_sums[i];
    }

    // Record end time on default device
    CUDA_CHECK(cudaSetDevice(0));
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    float event_milliseconds;
    CUDA_CHECK(cudaEventElapsedTime(&event_milliseconds, start, stop));
    float event_seconds = event_milliseconds / 1000.0f;

    // Calculate and print PI
    pi = 4 * d * total_sum;
    printf("Total Time (sum of GPU times)=%f seconds; Event-based Time=%f seconds; PI=%lf\n",
           total_seconds, event_seconds, pi);

    // Clean up
    free(h_sums);
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    return 0;
}
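
To compile and run it (assuming nvcc is available):

nvcc cuda_pi.cu -o cuda_pi
./cuda_pi

Note that the host buffer holds N doubles (roughly 8 GB), so the node needs at least that much free memory. The program prints one timing line per GPU, followed by the total of the per-GPU times, the wall-clock time measured with CUDA events, and PI ≈ 3.141593.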

For more information about CUDA, please refer to the NVIDIA CUDA official site.