0% found this document useful (0 votes)
19 views

PDC assignment

Uploaded by

maryamasad668
Copyright
© All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
19 views

PDC assignment

Uploaded by

maryamasad668
Copyright
© All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 9

Name: Maryam Asad

Reg. no.: COSC211101083

Course Instructor: Sir Ahsan Aslam

Due Date: 30th December, 2024 (14:00 hours)


1. Host to Device Code Conversions

Example 1: Basic Vector Addition

Host Code (CPU)

#include <iostream>

// Computes the element-wise sum C[i] = A[i] + B[i] for every i in [0, N).
// All three arrays must hold at least N floats; A and B are read-only.
void vectorAdd(float* A, float* B, float* C, int N) {
    int i = 0;
    while (i < N) {
        C[i] = A[i] + B[i];
        ++i;
    }
}

// Driver: builds two input vectors and sums them with vectorAdd.
int main() {
    // constexpr makes the stack arrays below legal standard C++; the
    // original used a runtime `int N`, which makes them non-standard VLAs.
    constexpr int N = 1000;
    float A[N], B[N], C[N];

    // Fix: the original only *said* it initialized A and B — reading them
    // in vectorAdd was undefined behavior. Fill them with known values.
    for (int i = 0; i < N; ++i) {
        A[i] = static_cast<float>(i);
        B[i] = static_cast<float>(2 * i);
    }

    vectorAdd(A, B, C, N);
    return 0;
}

Device Code (GPU)

#include <iostream>

// Kernel: element-wise sum C = A + B over N floats.
// Expects a 1-D launch whose total thread count is >= N; each thread
// handles exactly one element and out-of-range threads return early.
__global__ void vectorAddKernel(float* A, float* B, float* C, int N) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= N) {
        return;  // tail thread past the end of the arrays — nothing to do
    }
    C[i] = A[i] + B[i];
}

#ifndef CUDA_CHECK
// Abort-on-error wrapper: every CUDA runtime call returns a cudaError_t;
// the original ignored all of them, so failures were silent.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            std::cerr << "CUDA error: " << cudaGetErrorString(err_)         \
                      << std::endl;                                         \
            return 1;                                                       \
        }                                                                   \
    } while (0)
#endif

// Driver: vector addition on the GPU with checked allocation/copy/launch.
int main() {
    constexpr int N = 1000;            // constexpr avoids non-standard VLAs
    constexpr int threadsPerBlock = 256;
    float *d_A, *d_B, *d_C;
    float A[N], B[N], C[N];

    // Fix: the original never initialized A and B, so the kernel consumed
    // indeterminate values (undefined behavior).
    for (int i = 0; i < N; ++i) {
        A[i] = static_cast<float>(i);
        B[i] = static_cast<float>(2 * i);
    }

    // Allocate device memory.
    CUDA_CHECK(cudaMalloc((void**)&d_A, N * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_B, N * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_C, N * sizeof(float)));

    // Copy inputs to device.
    CUDA_CHECK(cudaMemcpy(d_A, A, N * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, B, N * sizeof(float), cudaMemcpyHostToDevice));

    // Launch: ceil(N / threadsPerBlock) blocks cover every element.
    vectorAddKernel<<<(N + threadsPerBlock - 1) / threadsPerBlock,
                      threadsPerBlock>>>(d_A, d_B, d_C, N);
    // Launches don't return an error directly — query it explicitly.
    CUDA_CHECK(cudaGetLastError());

    // Copy result back (blocking cudaMemcpy synchronizes with the kernel).
    CUDA_CHECK(cudaMemcpy(C, d_C, N * sizeof(float), cudaMemcpyDeviceToHost));

    // Free device memory.
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));

    return 0;
}

Example 2: Array Initialization and Summation

Host Code (CPU)

#include <iostream>

// Driver: fills an array with 0..N-1 and prints the sum of its elements.
int main() {
    const int N = 1000;
    int A[N];

    // A[k] = k for every index.
    for (int k = 0; k < N; ++k) {
        A[k] = k;
    }

    // Accumulate the total over all elements.
    int total = 0;
    for (int k = 0; k < N; ++k) {
        total += A[k];
    }

    std::cout << "Sum: " << total << std::endl;

    return 0;
}

Device Code (GPU)

#include <iostream>

// Kernel: accumulates the N elements of A into *sum.
// Each in-range thread contributes its element via one atomicAdd, so the
// host must zero *sum before the launch. Expects a 1-D grid covering N.
__global__ void sumKernel(int* A, int* sum, int N) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= N) {
        return;  // no element assigned to this thread
    }
    atomicAdd(sum, A[tid]);
}

#ifndef CUDA_CHECK
// Abort-on-error wrapper: every CUDA runtime call returns a cudaError_t;
// the original ignored all of them, so failures were silent.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            std::cerr << "CUDA error: " << cudaGetErrorString(err_)         \
                      << std::endl;                                         \
            return 1;                                                       \
        }                                                                   \
    } while (0)
#endif

// Driver: sums 0..N-1 on the GPU via atomicAdd and prints the result.
int main() {
    constexpr int N = 1000;
    constexpr int threadsPerBlock = 256;
    int *d_A, *d_sum;
    int A[N];
    int sum = 0;

    // Initialize array A with 0..N-1.
    for (int i = 0; i < N; ++i) {
        A[i] = i;
    }

    // Allocate device memory.
    CUDA_CHECK(cudaMalloc((void**)&d_A, N * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&d_sum, sizeof(int)));

    // Zero the device accumulator — required by sumKernel's atomicAdd.
    CUDA_CHECK(cudaMemcpy(d_sum, &sum, sizeof(int), cudaMemcpyHostToDevice));
    // Copy input data to the device.
    CUDA_CHECK(cudaMemcpy(d_A, A, N * sizeof(int), cudaMemcpyHostToDevice));

    // Launch: ceil-divide so every element gets a thread.
    sumKernel<<<(N + threadsPerBlock - 1) / threadsPerBlock,
                threadsPerBlock>>>(d_A, d_sum, N);
    // Launches don't return an error directly — query it explicitly.
    CUDA_CHECK(cudaGetLastError());

    // Copy result back (blocking cudaMemcpy synchronizes with the kernel).
    CUDA_CHECK(cudaMemcpy(&sum, d_sum, sizeof(int), cudaMemcpyDeviceToHost));

    std::cout << "Sum: " << sum << std::endl;

    // Free device memory.
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_sum));

    return 0;
}

Example 3: Parallel Array Scaling

Host Code (CPU)

#include <iostream>

// Multiplies each of the first N entries of A by scale, in place.
void scaleArray(int* A, int scale, int N) {
    for (int idx = 0; idx < N; ++idx) {
        A[idx] = A[idx] * scale;
    }
}

// Driver: fills an array with 0..N-1 and doubles every element in place.
int main() {
    const int N = 1000;
    int values[N];

    // values[k] = k for every index.
    for (int k = 0; k < N; ++k) {
        values[k] = k;
    }

    const int factor = 2;
    scaleArray(values, factor, N);

    return 0;
}

Device Code (GPU)

#include <iostream>

// Kernel: multiplies each of the N entries of A by scale, in place.
// Expects a 1-D launch with total thread count >= N; one element per thread.
__global__ void scaleArrayKernel(int* A, int scale, int N) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= N) {
        return;  // tail thread past the end of the array
    }
    A[i] *= scale;
}

#ifndef CUDA_CHECK
// Abort-on-error wrapper: every CUDA runtime call returns a cudaError_t;
// the original ignored all of them, so failures were silent.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            std::cerr << "CUDA error: " << cudaGetErrorString(err_)         \
                      << std::endl;                                         \
            return 1;                                                       \
        }                                                                   \
    } while (0)
#endif

// Driver: scales an integer array on the GPU with checked CUDA calls.
int main() {
    constexpr int N = 1000;
    constexpr int threadsPerBlock = 256;
    int *d_A;
    int A[N];

    // Initialize array A with 0..N-1.
    for (int i = 0; i < N; ++i) {
        A[i] = i;
    }

    int scale = 2;

    // Allocate device memory.
    CUDA_CHECK(cudaMalloc((void**)&d_A, N * sizeof(int)));

    // Copy data to device.
    CUDA_CHECK(cudaMemcpy(d_A, A, N * sizeof(int), cudaMemcpyHostToDevice));

    // Launch: ceil-divide so every element gets a thread.
    scaleArrayKernel<<<(N + threadsPerBlock - 1) / threadsPerBlock,
                       threadsPerBlock>>>(d_A, scale, N);
    // Launches don't return an error directly — query it explicitly.
    CUDA_CHECK(cudaGetLastError());

    // Copy result back (blocking cudaMemcpy synchronizes with the kernel).
    CUDA_CHECK(cudaMemcpy(A, d_A, N * sizeof(int), cudaMemcpyDeviceToHost));

    // Free device memory.
    CUDA_CHECK(cudaFree(d_A));

    return 0;
}

Example 4: Matrix Transposition

Host Code (CPU)

#include <iostream>

// Writes the transpose of the N x N row-major matrix A into B, i.e.
// B[j][i] = A[i][j]. NOTE(review): assumes A and B do not overlap —
// confirm at call sites.
void transposeMatrix(int* A, int* B, int N) {
    for (int col = 0; col < N; ++col) {
        for (int row = 0; row < N; ++row) {
            B[col * N + row] = A[row * N + col];
        }
    }
}

// Driver: transposes a fixed 3x3 matrix on the CPU.
int main() {
    const int N = 3;
    int src[N * N] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
    int dst[N * N];

    transposeMatrix(src, dst, N);

    return 0;
}

Device Code (GPU)

#include <iostream>

// Kernel: out-of-place transpose of the N x N row-major matrix A into B.
// Expects a 2-D launch covering at least N x N threads; each thread moves
// one element, and threads outside the matrix exit early.
__global__ void transposeMatrixKernel(int* A, int* B, int N) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (col >= N || row >= N) {
        return;  // outside the matrix
    }
    B[row * N + col] = A[col * N + row];
}

#ifndef CUDA_CHECK
// Abort-on-error wrapper: every CUDA runtime call returns a cudaError_t;
// the original ignored all of them, so failures were silent.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            std::cerr << "CUDA error: " << cudaGetErrorString(err_)         \
                      << std::endl;                                         \
            return 1;                                                       \
        }                                                                   \
    } while (0)
#endif

// Driver: transposes a 3x3 matrix on the GPU with checked CUDA calls.
int main() {
    constexpr int N = 3;
    int *d_A, *d_B;
    int A[N * N] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
    int B[N * N];

    // Allocate device memory.
    CUDA_CHECK(cudaMalloc((void**)&d_A, N * N * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&d_B, N * N * sizeof(int)));

    // Copy data to device.
    CUDA_CHECK(cudaMemcpy(d_A, A, N * N * sizeof(int),
                          cudaMemcpyHostToDevice));

    // Fix: the original launched N*N blocks of one thread each, leaving
    // 31 of every 32 warp lanes idle. Use 16x16-thread blocks with a
    // ceil-divided grid; the kernel's bounds check handles the overhang.
    constexpr int tile = 16;
    dim3 block(tile, tile);
    dim3 grid((N + tile - 1) / tile, (N + tile - 1) / tile);
    transposeMatrixKernel<<<grid, block>>>(d_A, d_B, N);
    // Launches don't return an error directly — query it explicitly.
    CUDA_CHECK(cudaGetLastError());

    // Copy result back (blocking cudaMemcpy synchronizes with the kernel).
    CUDA_CHECK(cudaMemcpy(B, d_B, N * N * sizeof(int),
                          cudaMemcpyDeviceToHost));

    // Free device memory.
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));

    return 0;
}

Example 5: Finding the Maximum Value

Host Code (CPU)

#include <iostream>

// Returns the largest of the first N elements of A.
// Precondition: N >= 1 — A[0] is read unconditionally.
int findMax(int* A, int N) {
    int best = A[0];
    for (int k = 1; k < N; ++k) {
        best = (A[k] > best) ? A[k] : best;
    }
    return best;
}

// Driver: fills an array with 0..N-1 and prints its maximum element.
int main() {
    const int N = 1000;
    int values[N];

    // values[k] = k for every index.
    for (int k = 0; k < N; ++k) {
        values[k] = k;
    }

    const int largest = findMax(values, N);

    std::cout << "Max Value: " << largest << std::endl;

    return 0;
}

Device Code (GPU)

#include <iostream>

// Kernel: folds the N elements of A into *maxVal via atomicMax.
// The host must seed *maxVal with a valid candidate (e.g. A[0]) before
// launch. Expects a 1-D grid with total thread count >= N.
__global__ void findMaxKernel(int* A, int* maxVal, int N) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= N) {
        return;  // no element assigned to this thread
    }
    atomicMax(maxVal, A[tid]);
}

#ifndef CUDA_CHECK
// Abort-on-error wrapper: every CUDA runtime call returns a cudaError_t;
// the original ignored all of them, so failures were silent.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            std::cerr << "CUDA error: " << cudaGetErrorString(err_)         \
                      << std::endl;                                         \
            return 1;                                                       \
        }                                                                   \
    } while (0)
#endif

// Driver: finds the maximum of an array on the GPU and prints it.
int main() {
    constexpr int N = 1000;
    constexpr int threadsPerBlock = 256;
    int *d_A, *d_maxVal;
    int A[N];

    // Initialize array A with 0..N-1.
    for (int i = 0; i < N; ++i) {
        A[i] = i;
    }

    // Fix: the original seeded the device maximum with 0, which silently
    // returns 0 whenever every element is negative. Seeding with A[0]
    // gives a valid candidate for any array contents.
    int maxVal = A[0];

    // Allocate device memory.
    CUDA_CHECK(cudaMalloc((void**)&d_A, N * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&d_maxVal, sizeof(int)));

    // Seed the device-side maximum.
    CUDA_CHECK(cudaMemcpy(d_maxVal, &maxVal, sizeof(int),
                          cudaMemcpyHostToDevice));

    // Copy data to device.
    CUDA_CHECK(cudaMemcpy(d_A, A, N * sizeof(int), cudaMemcpyHostToDevice));

    // Launch: ceil-divide so every element gets a thread.
    findMaxKernel<<<(N + threadsPerBlock - 1) / threadsPerBlock,
                    threadsPerBlock>>>(d_A, d_maxVal, N);
    // Launches don't return an error directly — query it explicitly.
    CUDA_CHECK(cudaGetLastError());

    // Copy result back (blocking cudaMemcpy synchronizes with the kernel).
    CUDA_CHECK(cudaMemcpy(&maxVal, d_maxVal, sizeof(int),
                          cudaMemcpyDeviceToHost));

    std::cout << "Max Value: " << maxVal << std::endl;

    // Free device memory.
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_maxVal));

    return 0;
}

2. Matrix Multiplication Examples in Device Code


Matrix Multiplication (Example 1)

Device Code (GPU)

#include <iostream>

// Kernel: C = A * B for N x N row-major integer matrices; one thread
// computes one output element. NOTE: the row index comes from the launch's
// x dimension and the column from y — the grid/block config must match.
__global__ void matrixMultiplyKernel(int* A, int* B, int* C, int N) {
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    const int col = blockIdx.y * blockDim.y + threadIdx.y;
    if (row >= N || col >= N) {
        return;  // outside the output matrix
    }
    int acc = 0;
    for (int k = 0; k < N; ++k) {
        acc += A[row * N + k] * B[k * N + col];
    }
    C[row * N + col] = acc;
}

#ifndef CUDA_CHECK
// Abort-on-error wrapper: every CUDA runtime call returns a cudaError_t;
// the original ignored all of them, so failures were silent.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            std::cerr << "CUDA error: " << cudaGetErrorString(err_)         \
                      << std::endl;                                         \
            return 1;                                                       \
        }                                                                   \
    } while (0)
#endif

// Driver: 3x3 integer matrix multiply on the GPU with checked CUDA calls.
int main() {
    constexpr int N = 3;
    int *d_A, *d_B, *d_C;
    int A[N * N] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
    int B[N * N] = {9, 8, 7, 6, 5, 4, 3, 2, 1};
    int C[N * N] = {0};

    // Allocate device memory.
    CUDA_CHECK(cudaMalloc((void**)&d_A, N * N * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&d_B, N * N * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&d_C, N * N * sizeof(int)));

    // Copy data to device.
    CUDA_CHECK(cudaMemcpy(d_A, A, N * N * sizeof(int),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, B, N * N * sizeof(int),
                          cudaMemcpyHostToDevice));

    // Fix: the original launched N*N blocks of a single thread each,
    // leaving 31 of every 32 warp lanes idle. Use 16x16-thread blocks with
    // a ceil-divided grid; the kernel's bounds check handles the overhang.
    constexpr int tile = 16;
    dim3 block(tile, tile);
    dim3 grid((N + tile - 1) / tile, (N + tile - 1) / tile);
    matrixMultiplyKernel<<<grid, block>>>(d_A, d_B, d_C, N);
    // Launches don't return an error directly — query it explicitly.
    CUDA_CHECK(cudaGetLastError());

    // Copy result back (blocking cudaMemcpy synchronizes with the kernel).
    CUDA_CHECK(cudaMemcpy(C, d_C, N * N * sizeof(int),
                          cudaMemcpyDeviceToHost));

    // Free device memory.
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));

    return 0;
}

Matrix Multiplication (Example 2)

Device Code (GPU)

#include <iostream>

// Kernel: one thread per output element of C = A * B (N x N, row-major
// integers). The row index is taken from the launch's x dimension and the
// column from y; threads outside the matrix exit immediately.
__global__ void matrixMultiplyKernel(int* A, int* B, int* C, int N) {
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    const int col = blockIdx.y * blockDim.y + threadIdx.y;
    if (row >= N || col >= N) {
        return;  // outside the output matrix
    }
    // Dot product of row `row` of A with column `col` of B.
    int dot = 0;
    for (int k = 0; k < N; ++k) {
        dot += A[row * N + k] * B[k * N + col];
    }
    C[row * N + col] = dot;
}

#ifndef CUDA_CHECK
// Abort-on-error wrapper: every CUDA runtime call returns a cudaError_t;
// the original ignored all of them, so failures were silent.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            std::cerr << "CUDA error: " << cudaGetErrorString(err_)         \
                      << std::endl;                                         \
            return 1;                                                       \
        }                                                                   \
    } while (0)
#endif

// Driver: 4x4 integer matrix multiply on the GPU with checked CUDA calls.
int main() {
    constexpr int N = 4;
    int *d_A, *d_B, *d_C;
    int A[N * N] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
    int B[N * N] = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
    int C[N * N] = {0};

    // Allocate device memory.
    CUDA_CHECK(cudaMalloc((void**)&d_A, N * N * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&d_B, N * N * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&d_C, N * N * sizeof(int)));

    // Copy data to device.
    CUDA_CHECK(cudaMemcpy(d_A, A, N * N * sizeof(int),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, B, N * N * sizeof(int),
                          cudaMemcpyHostToDevice));

    // Fix: the original grid was (N / 2, N / 2), which under-covers the
    // matrix for odd N (integer division truncates). Ceil-divide instead;
    // the kernel's bounds check discards the overhang threads, so the
    // result for N = 4 is unchanged.
    constexpr int tile = 2;
    dim3 block(tile, tile);
    dim3 grid((N + tile - 1) / tile, (N + tile - 1) / tile);
    matrixMultiplyKernel<<<grid, block>>>(d_A, d_B, d_C, N);
    // Launches don't return an error directly — query it explicitly.
    CUDA_CHECK(cudaGetLastError());

    // Copy result back (blocking cudaMemcpy synchronizes with the kernel).
    CUDA_CHECK(cudaMemcpy(C, d_C, N * N * sizeof(int),
                          cudaMemcpyDeviceToHost));

    // Free device memory.
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));

    return 0;
}

You might also like