CUDA
CUDA (Compute Unified Device Architecture) is a parallel computing platform and programming model created by NVIDIA.
It allows developers to use NVIDIA GPUs to perform general-purpose computations (not limited to graphics), dramatically accelerating applications that can be parallelized.
Key Features:
- Massive parallelism — Thousands of tasks run at once on GPU.
- Easy C/C++ extension — Small additions to normal code.
- Unified Memory — CPU and GPU share data easily.
- Fast libraries — cuBLAS, cuDNN, cuFFT give instant speed.
- Python friendly — Works great with PyTorch, TensorFlow, CuPy.
- Modern improvements — Graphs, Tensor Cores, multi-GPU support.
Use Cases:
- Deep Learning & AI
- Scientific Simulations
- Data Science & Analytics
- Robotics & Autonomous Systems
- Cryptocurrency & Finance
- Video & Image Processing
- Signal Processing
CUDA Program Source Code (Example)
Example source code (Hello World), path → /home/$USER/job_template/C/cuda_hello_world.cu
#include <stdio.h>
#include <cuda_runtime.h>
// Kernel: every launched thread prints its own thread and block coordinates.
// Device-side printf output is buffered and only guaranteed to appear after a
// host-side synchronization point.
__global__ void helloWorldKernel() {
    printf("Hello World from thread %d, block %d\n", threadIdx.x, blockIdx.x);
}
int main() {
    // One block of ten threads — each thread prints one greeting line.
    helloWorldKernel<<<1, 10>>>();

    // Block until the kernel finishes so all device printf output is flushed.
    cudaDeviceSynchronize();

    // A bad launch configuration or an in-kernel fault surfaces here.
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA Error: %s\n", cudaGetErrorString(status));
        return 1;
    }
    return 0;
}

Example source code (Pi Calculation), path → /home/$USER/job_template/C/cuda_pi.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#define N 1000000000 // number of integration steps (1E9)
#define d 1E-9       // step width: 1/N
#define d2 1E-18     // d*d, used by the kernel to form x^2 = (i*d)^2
// Macro for CUDA error checking: wraps any CUDA runtime call, reporting the
// failing file/line plus the error string and exiting on failure. The
// do/while(0) wrapper makes the macro expand to a single statement so it is
// safe inside unbraced if/else bodies.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err = call; \
        if (err != cudaSuccess) { \
            fprintf(stderr, "CUDA error in %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
            exit(1); \
        } \
    } while (0)
// CUDA kernel to compute partial sums of the series
//   pi/4 = integral_0^1 1/(1+x^2) dx  ≈  d * sum_i 1/(1 + (i*d)^2)
// Each thread evaluates one term for global index i = idx + offset and stores
// it at sum[idx]; the host multiplies the accumulated sum by 4*d afterwards.
// (d2 * global_idx) is evaluated first as a double, so the product never
// overflows int even for global_idx near 1e9.
//
// NOTE(review): the guard checks only global_idx < n. The launcher must
// guarantee that every thread passing this guard also has idx within the
// allocated length of `sum` — e.g. by passing n = offset + workload where
// workload is the element count of `sum`. Otherwise tail threads of the last
// block write past the end of the allocation. Verify at the call site.
__global__ void compute_pi(double *sum, int n, int offset) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int global_idx = idx + offset;
    double x2;

    if (global_idx < n) {
        x2 = d2 * global_idx * global_idx;
        sum[idx] = 1.0 / (1.0 + x2);
    }
}
// Multi-GPU pi computation: splits the N terms of the arctan series across all
// visible CUDA devices (processed sequentially), times each device with CUDA
// events, and reduces the partial terms on the host.
int main() {
    // Discover available CUDA devices; abort early if there are none.
    int deviceCount;
    CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
    if (deviceCount == 0) {
        printf("No CUDA devices found!\n");
        return 1;
    }
    printf("Found %d CUDA device(s)\n", deviceCount);

    double pi = 0.0, total_sum = 0.0;
    cudaEvent_t start, stop;
    float total_seconds = 0.0f;

    // Host buffer for all N partial terms.
    // NOTE: N = 1e9 doubles is ~8 GB of host RAM.
    double *h_sums = (double*)malloc(N * sizeof(double));
    if (!h_sums) {
        fprintf(stderr, "Host memory allocation failed\n");
        return 1;
    }

    // Events for wall-clock timing of the whole run, recorded on device 0.
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaSetDevice(0));
    CUDA_CHECK(cudaEventRecord(start));

    // Split N terms across devices; the first `remainder` devices take one
    // extra term so the whole range [0, N) is covered exactly once.
    int base_workload = N / deviceCount;
    int remainder = N % deviceCount;
    int offset = 0;

    for (int dev = 0; dev < deviceCount; dev++) {
        CUDA_CHECK(cudaSetDevice(dev));
        CUDA_CHECK(cudaDeviceSynchronize());  // ensure device is ready

        int workload = base_workload + (dev < remainder ? 1 : 0);
        if (workload == 0) continue;

        // Per-device timing events (created while this device is current).
        float gpu_milliseconds = 0.0f;
        cudaEvent_t gpu_start, gpu_stop;
        CUDA_CHECK(cudaEventCreate(&gpu_start));
        CUDA_CHECK(cudaEventCreate(&gpu_stop));
        CUDA_CHECK(cudaEventRecord(gpu_start));

        // Device buffer sized for this device's slice only.
        double *d_sum;
        CUDA_CHECK(cudaMalloc(&d_sum, workload * sizeof(double)));

        int threadsPerBlock = 256;
        int blocks = (workload + threadsPerBlock - 1) / threadsPerBlock;

        // BUG FIX: pass this device's upper bound (offset + workload), not N.
        // With N, tail threads of the final block on every device but the
        // last (idx >= workload but idx + offset < N) passed the kernel's
        // bound check and wrote past the end of the workload-sized d_sum
        // allocation — an out-of-bounds device write of up to 255 doubles.
        compute_pi<<<blocks, threadsPerBlock>>>(d_sum, offset + workload, offset);
        CUDA_CHECK(cudaGetLastError());

        // Blocking copy of this slice into its place in the host buffer.
        CUDA_CHECK(cudaMemcpy(&h_sums[offset], d_sum, workload * sizeof(double),
                              cudaMemcpyDeviceToHost));

        // Stop per-device timing and wait for all work to complete.
        CUDA_CHECK(cudaEventRecord(gpu_stop));
        CUDA_CHECK(cudaEventSynchronize(gpu_stop));
        CUDA_CHECK(cudaEventElapsedTime(&gpu_milliseconds, gpu_start, gpu_stop));

        float gpu_seconds = gpu_milliseconds / 1000.0f;
        printf("GPU %d: Time=%f seconds, Workload=%d\n", dev, gpu_seconds, workload);

        // Advance into the next device's slice; accumulate serial GPU time.
        offset += workload;
        total_seconds += gpu_seconds;

        // Release this device's resources before switching devices.
        CUDA_CHECK(cudaFree(d_sum));
        CUDA_CHECK(cudaEventDestroy(gpu_start));
        CUDA_CHECK(cudaEventDestroy(gpu_stop));
        CUDA_CHECK(cudaDeviceSynchronize());
    }

    // Reduce all partial terms on the host.
    for (int i = 0; i < N; i++) {
        total_sum += h_sums[i];
    }

    // Overall elapsed time between the start/stop events on device 0.
    CUDA_CHECK(cudaSetDevice(0));
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    float event_milliseconds;
    CUDA_CHECK(cudaEventElapsedTime(&event_milliseconds, start, stop));
    float event_seconds = event_milliseconds / 1000.0f;

    // pi = 4 * d * sum_i 1/(1 + (i*d)^2), a Riemann sum of 4/(1+x^2) on [0,1).
    pi = 4 * d * total_sum;
    printf("Total Time (sum of GPU times)=%f seconds; Event-based Time=%f seconds; PI=%lf\n", total_seconds, event_seconds, pi);

    // Clean up host memory and the global timing events.
    free(h_sums);
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    return 0;
}

For more information about CUDA, please refer to the Nvidia CUDA Official Site.