04 Introduction to GPUs and CUDA
Vishwesh Jatala
Assistant Professor
Department of CSE
Indian Institute of Technology Bhilai
[email protected]
2023-24 W
Course Outline
■ Introduction
■ Overview of Parallel Architectures
■ Performance
■ Parallel Programming
❑ GPUs and CUDA programming
❑ CUDA thread organization
❑ Instruction execution
❑ GPU memories
❑ Synchronization
❑ Unified memory
■ Case studies
■ Extracting Parallelism from Sequential Programs Automatically
Motivation
Why GPUs?
■ Multicore processors
❑ Exploit task-level parallelism
❑ Are not efficient for graphics applications
■ Graphics rendering is computationally expensive
GPU Architecture
Parallelizing Programs on GPUs
Programming Models
Introduction to CUDA Programming
[Figure: a CUDA program spans the CPU (Host) with its own memory and the GPU (Device) with multiple SMs and its own device memory; the host launches a kernel that executes on the SMs.]
Hello World
#include <stdio.h>
int main() {
    printf("Hello World.\n");
    return 0;
}

Compile: gcc hello.c
Run: ./a.out
Output: Hello World.
Hello World in GPU
#include <stdio.h>
#include <cuda.h>
__global__ void dkernel() {
    printf("Hello World.\n");
}
int main() {
    dkernel<<<1, 1>>>();
    cudaDeviceSynchronize();
    return 0;
}

Compile: nvcc hello.cu
Run: ./a.out
Output: Hello World.

__global__ marks dkernel as a function that executes on the GPU but is launched from the CPU; <<<1, 1>>> launches it with one block containing one thread.
Hello World in GPU
#include <stdio.h>
#include <cuda.h>
__global__ void dkernel() {
    printf("Hello World.\n");
}
int main() {
    dkernel<<<1, 1>>>();
    return 0;
}

Compile: nvcc hello.cu
Run: ./a.out
Output: (none)

GPU kernel launch is asynchronous! The host does not wait for the kernel, so main() can return and the process can exit before the kernel ever prints.
Hello World in GPU
#include <stdio.h>
#include <cuda.h>
__global__ void dkernel() {
    printf("Hello World.\n");
}
int main() {
    dkernel<<<1, 1>>>();
    cudaDeviceSynchronize();   // block until the kernel finishes
    return 0;
}

Compile: nvcc hello.cu
Run: ./a.out
Output: Hello World.

Restoring cudaDeviceSynchronize() makes the host wait for the kernel to complete, so the output appears.
Hello World in Parallel in GPU
#include <stdio.h>
#include <cuda.h>
__global__ void dkernel() {
    printf("Hello World.\n");
}
int main() {
    dkernel<<<1, 32>>>();
    cudaDeviceSynchronize();
    return 0;
}

Compile: nvcc hello.cu
Run: ./a.out
Output: Hello World. (printed 32 times, once per thread)
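To see which thread produced each line, a minimal variant (not from the slides) can print the built-in threadIdx.x:

#include <stdio.h>
#include <cuda.h>

// Each of the 32 threads prints its own index (0..31).
// Note: the print order across threads is not guaranteed.
__global__ void dkernel() {
    printf("Hello World from thread %d.\n", threadIdx.x);
}

int main() {
    dkernel<<<1, 32>>>();
    cudaDeviceSynchronize();
    return 0;
}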
Example-1
#include <stdio.h>
#define N 100
int main() {
    int i;
    for (i = 0; i < N; ++i)
        printf("%d\n", i * i);
    return 0;
}
Example-1 in GPU
#include <stdio.h>
#include <cuda.h>
#define N 100
__global__ void fun() {
    printf("%d\n", threadIdx.x * threadIdx.x);
}
int main() {
    fun<<<1, N>>>();
    cudaDeviceSynchronize();
    return 0;
}

Each of the N threads prints the square of its own index, replacing the sequential loop.
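A single thread block is limited to 1024 threads on current NVIDIA GPUs, so fun<<<1, N>>> stops working for large N. A minimal sketch of the standard fix (not from the slides; the block size of 256 is an arbitrary choice) uses multiple blocks and a computed global index:

#include <stdio.h>
#include <cuda.h>
#define N 5000

__global__ void fun() {
    // Global index of this thread across all blocks.
    unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < N)   // guard: the last block may contain surplus threads
        printf("%u\n", id * id);
}

int main() {
    unsigned threads = 256;
    unsigned blocks = (N + threads - 1) / threads;  // ceiling division
    fun<<<blocks, threads>>>();
    cudaDeviceSynchronize();
    return 0;
}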
GPU Hello World with a Global
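A minimal sketch of what such an example typically looks like, assuming a __device__ global variable written by the kernel and read back on the host with cudaMemcpyFromSymbol (the names dresult and hresult are hypothetical):

#include <stdio.h>
#include <cuda.h>

__device__ int dresult;   // hypothetical global variable in GPU memory

__global__ void dkernel() {
    dresult = 516;        // the kernel writes to the device global
}

int main() {
    int hresult;
    dkernel<<<1, 1>>>();
    // Copy the device global back by symbol; this synchronous copy
    // also waits for the preceding kernel in the default stream.
    cudaMemcpyFromSymbol(&hresult, dresult, sizeof(int));
    printf("%d\n", hresult);
    return 0;
}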
Separate Memories
[Figure: CPU and GPU each have their own DRAM, connected by the PCI Express bus.]

Host and device memories are physically separate, so data must be moved explicitly between them across the PCI Express bus.
CUDA Programs with Data Transfers
[Figure: the CPU (Host) with its memory and the GPU (Device) with multiple SMs and device memory; input data is copied to the device, (2) the kernel executes on the SMs, and results are copied back to the host.]
Data Transfer
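The core runtime calls are cudaMalloc to allocate device memory and cudaMemcpy to move data in either direction. A minimal sketch (array sizes and names are illustrative):

#include <stdio.h>
#include <cuda.h>

int main() {
    int h[4] = {1, 2, 3, 4};   // host array
    int *d;                     // device pointer

    cudaMalloc(&d, sizeof(h));                            // allocate GPU memory
    cudaMemcpy(d, h, sizeof(h), cudaMemcpyHostToDevice);  // host -> device
    // ... launch kernels that operate on d ...
    cudaMemcpy(h, d, sizeof(h), cudaMemcpyDeviceToHost);  // device -> host
    cudaFree(d);                                          // release GPU memory
    return 0;
}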
CPU-GPU Communication
#include <stdio.h>
#include <string.h>
#include <cuda.h>

__global__ void dkernel(char *arr, int arrlen) {
    unsigned id = threadIdx.x;
    if (id < arrlen) {      // only the first arrlen threads do work
        ++arr[id];
    }
}

int main() {
    char cpuarr[] = "CS516", *gpuarr;
    cudaMalloc(&gpuarr, sizeof(char) * (1 + strlen(cpuarr)));
    cudaMemcpy(gpuarr, cpuarr, sizeof(char) * (1 + strlen(cpuarr)), cudaMemcpyHostToDevice);
    dkernel<<<1, 32>>>(gpuarr, strlen(cpuarr));
    cudaDeviceSynchronize();    // unnecessary: the cudaMemcpy below synchronizes
    cudaMemcpy(cpuarr, gpuarr, sizeof(char) * (1 + strlen(cpuarr)), cudaMemcpyDeviceToHost);
    printf("%s\n", cpuarr);     // use a format string, not the data itself
    return 0;
}
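With 32 threads but arrlen = 5, only threads 0 through 4 increment a character; the expected output is DT627, each character of "CS516" advanced by one.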
Example
Sequential (CPU):

#include <stdio.h>
#define N 100
int main() {
    int a[N], i;
    for (i = 0; i < N; ++i)
        a[i] = i * i;
    return 0;
}

Parallel (GPU):

#include <stdio.h>
#include <cuda.h>
#define N 100
__global__ void fun(int *a) {
    a[threadIdx.x] = threadIdx.x * threadIdx.x;
}
int main() {
    int a[N], *da;
    int i;
    cudaMalloc(&da, N * sizeof(int));
    fun<<<1, N>>>(da);
    cudaMemcpy(a, da, N * sizeof(int), cudaMemcpyDeviceToHost);
    for (i = 0; i < N; ++i)
        printf("%d\n", a[i]);
    return 0;
}

Takeaway: each thread computes one array element; since the kernel only writes to da, no host-to-device copy is needed before the launch, and the blocking cudaMemcpy back to the host also serves as synchronization.
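None of these examples check for failures. CUDA runtime calls return a cudaError_t, and a common pattern (a sketch, not from the lecture) wraps each call in a checking macro:

#include <stdio.h>
#include <cuda.h>

// Check a CUDA runtime call's return status and report failures.
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err = (call);                                     \
        if (err != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error at %s:%d: %s\n",              \
                    __FILE__, __LINE__, cudaGetErrorString(err));     \
        }                                                             \
    } while (0)

__global__ void dkernel() { }

int main() {
    dkernel<<<1, 1>>>();
    CUDA_CHECK(cudaGetLastError());         // catch launch errors
    CUDA_CHECK(cudaDeviceSynchronize());    // catch execution errors
    return 0;
}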