CUDA

CUDA (Compute Unified Device Architecture) is a parallel computing platform and programming model created by NVIDIA.

It allows developers to use NVIDIA GPUs to perform general-purpose computations (not limited to graphics), dramatically accelerating applications that can be parallelized.


Key Features:

  • Massive parallelism — thousands of threads execute concurrently on the GPU.
  • Easy C/C++ extension — kernels are written as small additions to ordinary C/C++ code.
  • Unified Memory — a single allocation is visible to both CPU and GPU (see the sketch after this list).
  • Fast libraries — cuBLAS, cuDNN, and cuFFT provide highly tuned routines for linear algebra, deep learning, and FFTs.
  • Python friendly — integrates well with PyTorch, TensorFlow, and CuPy.
  • Modern improvements — CUDA Graphs, Tensor Cores, and multi-GPU support.
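
As a quick illustration of the Unified Memory bullet, here is a minimal sketch (not one of the job templates): cudaMallocManaged allocates a single buffer that both host and device code can dereference, so no explicit cudaMemcpy is needed.

#include <stdio.h>
#include <cuda_runtime.h>

// Increment every element of a managed buffer on the GPU
__global__ void addOne(float *data, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] += 1.0f;
}

int main() {
    const int n = 1024;
    float *data;
    cudaMallocManaged(&data, n * sizeof(float)); // one buffer, visible to CPU and GPU
    for (int i = 0; i < n; i++) data[i] = (float)i; // initialize on the host
    addOne<<<(n + 255) / 256, 256>>>(data, n);      // update on the device
    cudaDeviceSynchronize(); // wait before the host reads the results
    printf("data[0]=%f data[%d]=%f\n", data[0], n - 1, data[n - 1]);
    cudaFree(data);
    return 0;
}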

Use Cases:

  • Deep Learning & AI
  • Scientific Simulations
  • Data Science & Analytics
  • Robotics & Autonomous Systems
  • Cryptocurrency & Finance
  • Video & Image Processing
  • Signal Processing

Example source code (Hello World) path → /home/$USER/job_template/C/cuda_hello_world.cu


#include <stdio.h>
#include <cuda_runtime.h>

// CUDA kernel to print Hello World
__global__ void helloWorldKernel() {
    printf("Hello World from thread %d, block %d\n", threadIdx.x, blockIdx.x);
}

int main() {
    // Launch the kernel with 1 block of 10 threads
    helloWorldKernel<<<1, 10>>>();

    // Synchronize to ensure all printf calls are completed
    cudaDeviceSynchronize();

    // Check for any CUDA errors
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA Error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
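
To compile and run the example (assuming nvcc from the CUDA toolkit is on your PATH, e.g. after loading the site's CUDA module):

nvcc cuda_hello_world.cu -o cuda_hello_world
./cuda_hello_world

Each of the 10 threads prints its own line, so the output consists of ten "Hello World from thread X, block 0" messages; the order in which the threads print is not guaranteed.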

Example source code (Pi Calculation) path → /home/$USER/job_template/C/cuda_pi.cu
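
The program approximates pi = 4 * ∫(0→1) 1/(1 + x²) dx with a Riemann sum over N = 1E9 points of width d = 1E-9, i.e. pi ≈ 4 * d * Σ 1/(1 + (i*d)²) for i = 0 .. N-1. The work is split across all visible GPUs: each device fills a contiguous slice of the sum array, the slices are copied back to the host, and the host accumulates the total.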


#include <stdio.h>
#include <stdlib.h> // malloc, exit
#include <cuda_runtime.h>

#define N 1000000000 // 1E9 integration points
#define d 1E-9       // step width (1/N)
#define d2 1E-18     // d squared
// Macro for CUDA error checking
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err = call; \
        if (err != cudaSuccess) { \
            fprintf(stderr, "CUDA error in %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
            exit(1); \
        } \
    } while (0)
// CUDA kernel to compute one device's slice of the partial sums 1/(1 + x^2)
__global__ void compute_pi(double *sum, int workload, int offset) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Guard on the slice length: the grid is rounded up to whole blocks,
    // so idx can reach past the end of this device's buffer
    if (idx < workload) {
        int global_idx = idx + offset;
        double x2 = d2 * global_idx * global_idx; // x2 = (global_idx * d)^2
        sum[idx] = 1.0 / (1.0 + x2);
    }
}
int main() {
    int deviceCount;
    CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
    if (deviceCount == 0) {
        printf("No CUDA devices found!\n");
        return 1;
    }
    printf("Found %d CUDA device(s)\n", deviceCount);

    double pi = 0.0, total_sum = 0.0;
    cudaEvent_t start, stop;
    float total_seconds = 0.0f;

    // Allocate host memory for sums
    double *h_sums = (double*)malloc(N * sizeof(double));
    if (!h_sums) {
        fprintf(stderr, "Host memory allocation failed\n");
        return 1;
    }

    // Create CUDA events for global timing
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Record start time (use default device for global timing)
    CUDA_CHECK(cudaSetDevice(0));
    CUDA_CHECK(cudaEventRecord(start));

    // Calculate workload per GPU
    int base_workload = N / deviceCount;
    int remainder = N % deviceCount;
    int offset = 0;

    // Process each GPU
    for (int dev = 0; dev < deviceCount; dev++) {
        CUDA_CHECK(cudaSetDevice(dev));
        CUDA_CHECK(cudaDeviceSynchronize()); // Ensure device is ready

        // Calculate workload for this GPU
        int workload = base_workload + (dev < remainder ? 1 : 0);
        if (workload == 0) continue;

        float gpu_milliseconds = 0.0f;
        cudaEvent_t gpu_start, gpu_stop;

        // Create events on the current device
        CUDA_CHECK(cudaEventCreate(&gpu_start));
        CUDA_CHECK(cudaEventCreate(&gpu_stop));

        // Record GPU start time
        CUDA_CHECK(cudaEventRecord(gpu_start));

        // Allocate device memory
        double *d_sum;
        CUDA_CHECK(cudaMalloc(&d_sum, workload * sizeof(double)));

        // Set up grid and block dimensions
        int threadsPerBlock = 256;
        int blocks = (workload + threadsPerBlock - 1) / threadsPerBlock;

        // Launch kernel
        compute_pi<<<blocks, threadsPerBlock>>>(d_sum, workload, offset);
        CUDA_CHECK(cudaGetLastError());

        // Copy results back to host
        CUDA_CHECK(cudaMemcpy(&h_sums[offset], d_sum, workload * sizeof(double), cudaMemcpyDeviceToHost));

        // Record GPU end time
        CUDA_CHECK(cudaEventRecord(gpu_stop));
        CUDA_CHECK(cudaEventSynchronize(gpu_stop)); // Ensure GPU work is complete
        CUDA_CHECK(cudaEventElapsedTime(&gpu_milliseconds, gpu_start, gpu_stop));

        // Convert milliseconds to seconds for this GPU
        float gpu_seconds = gpu_milliseconds / 1000.0f;
        printf("GPU %d: Time=%f seconds, Workload=%d\n", dev, gpu_seconds, workload);

        // Update offset and accumulate time
        offset += workload;
        total_seconds += gpu_seconds;

        // Clean up device resources
        CUDA_CHECK(cudaFree(d_sum));
        CUDA_CHECK(cudaEventDestroy(gpu_start));
        CUDA_CHECK(cudaEventDestroy(gpu_stop));

        // Synchronize device before switching
        CUDA_CHECK(cudaDeviceSynchronize());
    }

    // Sum results on host
    for (int i = 0; i < N; i++) {
        total_sum += h_sums[i];
    }

    // Record end time on default device
    CUDA_CHECK(cudaSetDevice(0));
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    float event_milliseconds;
    CUDA_CHECK(cudaEventElapsedTime(&event_milliseconds, start, stop));
    float event_seconds = event_milliseconds / 1000.0f;

    // Calculate and print PI
    pi = 4 * d * total_sum;
    printf("Total Time (sum of GPU times)=%f seconds; Event-based Time=%f seconds; PI=%lf\n",
           total_seconds, event_seconds, pi);

    // Clean up
    free(h_sums);
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    return 0;
}
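
To compile and run it (assuming nvcc is available):

nvcc cuda_pi.cu -o cuda_pi
./cuda_pi

Note that the host buffer holds N doubles (roughly 8 GB), so the node needs at least that much free memory. The program prints one timing line per GPU, followed by the total of the per-GPU times, the wall-clock time measured with CUDA events, and PI ≈ 3.141593.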

For more information about CUDA, please refer to the NVIDIA CUDA official site.