GPU 编程第四次作业（实验五）

1 实验步骤一

1.1 代码

#include<stdio.h>
#include<stdlib.h>
#define N 4

/*
 * Prints a fixed N x N int matrix, then prints the same storage three more
 * times as a flat run of N*N ints, each time through a pointer to the first
 * element obtained with a different expression.
 *
 * Returns 0.
 */
int main(void)
{
    int arr[N][N] = {{1,2,3,4}, {5,6,7,8}, {9,10,11,12}, {13,14,15,16}};
    printf("Original 2D array: \n");
    for(int i=0; i<N; i++){
        for(int j=0; j<N; j++){
            printf("%d ", arr[i][j]);
        }
        printf("\n");
    }

    printf("\n Row-major layout: \n");

    int *p1 = NULL;
    int *p2 = NULL;
    int *p3 = NULL;

    // Approach 1: arr[0] is the first row (an int[N]); used as a value it
    // converts to a pointer to that row's first element.
    p1 = arr[0];
    // %p expects a void *, so the int * is cast explicitly.
    printf("Approach 1: (address:%p)\n", (void *)p1);
    for(int j=0; j<N*N; j++){
        printf("%d ", *(p1+j));
    }
    // End the line so the next heading does not run into the numbers.
    printf("\n");

    // Approach 2: *arr is the same first row as arr[0].
    p2 = *arr;
    printf("Approach 2: (address:%p)\n", (void *)p2);
    for(int j=0; j<N*N; j++){
        printf("%d ", *(p2+j));
    }
    printf("\n");

    // Approach 3: take the address of the first element directly.
    // (Assigning arr[0][0] itself would store the int value 1 in the pointer.)
    p3 = &arr[0][0];
    printf("Approach 3: (address:%p)\n", (void *)p3);
    for(int j=0; j<N*N; j++){
        printf("%d ", *(p3+j));
    }
    printf("\n");

    return 0;
}

1.2 运行结果

2 实验步骤二

2.1 代码

#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image/stb_image.h"
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image/stb_image_write.h"
#include"error_check.h"
#include"time_helper.h"

// Todo
// Implement the cuda kernel function ***rgb_to_sepia_gpu***
/*
 * Sepia-tone kernel: each thread converts at most one pixel.
 *
 * Expects a 2D launch in which x indexes columns and y indexes rows; threads
 * that fall outside the width x height image do nothing.
 *
 * input_image / output_image: interleaved pixel buffers of
 *     width * height * channels bytes each, indexed as
 *     (row * width + col) * channels + channel.
 * channels: for 3 or more, the first three bytes of each pixel are mixed with
 *     the sepia weights and clamped to 255; a fourth byte (when channels == 4)
 *     is copied through unchanged. For fewer than 3 channels there is no
 *     colour triple to mix, so the pixel is copied unchanged instead of
 *     touching bytes that belong to neighbouring pixels or lie past the end
 *     of the buffers.
 *
 * The arithmetic is deliberately kept in double precision with the same
 * expressions as rgb_to_sepia_cpu so both paths use identical formulas.
 */
__global__ void rgb_to_sepia_gpu(unsigned char *input_image, unsigned char *output_image, int width, int height, int channels) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (col >= width || row >= height || channels <= 0) {
        return;
    }

    // size_t arithmetic: row * width * channels can exceed INT_MAX on large images.
    const size_t offset = ((size_t)row * (size_t)width + (size_t)col) * (size_t)channels;
    const unsigned char *src = input_image + offset;
    unsigned char *dst = output_image + offset;

    if (channels < 3) {
        // Grayscale / gray+alpha: pass the pixel through untouched.
        for (int c = 0; c < channels; c++) {
            dst[c] = src[c];
        }
        return;
    }

    const unsigned char c1 = src[0];
    const unsigned char c2 = src[1];
    const unsigned char c3 = src[2];
    dst[0] = (unsigned char)fmin((c1 * 0.393 + c2 * 0.769 + c3 * 0.189), 255.0);
    dst[1] = (unsigned char)fmin((c1 * 0.349 + c2 * 0.686 + c3 * 0.168), 255.0);
    dst[2] = (unsigned char)fmin((c1 * 0.272 + c2 * 0.534 + c3 * 0.131), 255.0);
    if (channels == 4) {
        // Fourth channel is carried over as-is.
        dst[3] = src[3];
    }
}

/*
 * CPU reference implementation of the sepia filter.
 *
 * input_image / output_image: interleaved pixel buffers of
 *     width * height * channels bytes each, indexed as
 *     (row * width + col) * channels + channel.
 * channels: for 3 or more, the first three bytes of each pixel are mixed with
 *     the sepia weights and clamped to 255; a fourth byte (when channels == 4)
 *     is copied through unchanged. For fewer than 3 channels there is no
 *     colour triple to mix, so each pixel is copied unchanged rather than
 *     reading and writing bytes outside the pixel.
 */
void rgb_to_sepia_cpu(unsigned char *input_image, unsigned char *output_image, int width, int height, int channels)
{
    if(channels <= 0)
    {
        return; // nothing to convert
    }

    for(int row=0; row<height; row++)
    {
        for(int col=0; col<width; col++)
        {
            // size_t arithmetic: row * width * channels can exceed INT_MAX on large images.
            const size_t offset = ((size_t)row * (size_t)width + (size_t)col) * (size_t)channels;
            const unsigned char *src = input_image + offset;
            unsigned char *dst = output_image + offset;

            if(channels < 3)
            {
                // Grayscale / gray+alpha: pass the pixel through untouched.
                for(int c=0; c<channels; c++)
                {
                    dst[c] = src[c];
                }
                continue;
            }

            const unsigned char c1 = src[0];
            const unsigned char c2 = src[1];
            const unsigned char c3 = src[2];

            dst[0] = (unsigned char)fmin((c1 * 0.393 + c2 * 0.769 + c3 * 0.189), 255.0);
            dst[1] = (unsigned char)fmin((c1 * 0.349 + c2 * 0.686 + c3 * 0.168), 255.0);
            dst[2] = (unsigned char)fmin((c1 * 0.272 + c2 * 0.534 + c3 * 0.131), 255.0);

            if(channels==4)
            {
                // Fourth channel is carried over as-is.
                dst[3] = src[3];
            }
        }
    }
}

/*
 * Command-line driver: loads an image, applies the sepia filter on the CPU or
 * the GPU (selected by the third argument), reports the elapsed time of the
 * conversion and writes the result as a JPEG.
 *
 * Usage: <program> input-image-name output-image-name cpu|gpu
 *
 * Returns 0 on success, non-zero on bad arguments, load/allocation failure,
 * an unknown option, or a failed write.
 *
 * CHECK and cpuSecond come from error_check.h / time_helper.h, which are not
 * shown here; CHECK is presumably a macro that reports and aborts on a CUDA
 * error -- confirm against the header.
 */
int main(int argc, char *argv[])
{
    if(argc<4)
    {
        printf("Usage: %s input-image-name output-image-name option(cpu/gpu)\n", (argc > 0 && argv[0] != NULL) ? argv[0] : "command");
        return -1;
    }
    char *input_image_name = argv[1];
    char *output_image_name = argv[2];
    char *option = argv[3];

    int width, height, original_no_channels;
    int desired_no_channels = 0; // Pass 0 to load the image as is
    unsigned char *stbi_img = stbi_load(input_image_name, &width, &height, &original_no_channels, desired_no_channels);
    if(stbi_img!=NULL && original_no_channels<3)
    {
        // The sepia filter mixes three colour channels per pixel. Files with
        // 1 or 2 channels are reloaded with RGB requested so every pixel
        // really has a colour triple.
        stbi_image_free(stbi_img);
        desired_no_channels = 3;
        stbi_img = stbi_load(input_image_name, &width, &height, &original_no_channels, desired_no_channels);
    }
    if(stbi_img==NULL){ printf("Error in loading the image.\n"); return 1; }

    // Channel count of the buffer actually held in memory.
    const int channels = (desired_no_channels != 0) ? desired_no_channels : original_no_channels;
    printf("Loaded image with a width of %dpx, a height of %dpx. The original image had %d channels, the loaded image has %d channels.\n", width, height, original_no_channels, channels);

    // size_t: width * height * channels can overflow int for large images.
    const size_t img_mem_size = (size_t)width * (size_t)height * (size_t)channels * sizeof(unsigned char);
    int status = 0;
    double begin;
    if(strcmp(option, "cpu")==0)
    {
        printf("Processing with CPU!\n");
        unsigned char *sepia_img = (unsigned char *)malloc(img_mem_size);
        if(sepia_img==NULL)
        {
            printf("Unable to allocate memory for the sepia image. \n");
            stbi_image_free(stbi_img);
            return 1;
        }

        // Time stamp
        begin = cpuSecond();

        // CPU computation (for reference)
        rgb_to_sepia_cpu(stbi_img, sepia_img, width, height, channels);

        // Time stamp
        printf("Time cost [CPU]:%f s\n", cpuSecond()-begin);

        // Save to an image file (stbi_write_jpg returns 0 on failure)
        if(stbi_write_jpg(output_image_name, width, height, channels, sepia_img, 100)==0)
        {
            printf("Unable to write the output image.\n");
            status = 1;
        }

        free(sepia_img);
    }
    else if(strcmp(option, "gpu")==0)
    {
        printf("Processing with GPU!\n");

        // Host buffer for the result; allocated first so that a failure is
        // handled before any device work is started.
        unsigned char *sepia_img = (unsigned char *)malloc(img_mem_size);
        if(sepia_img==NULL)
        {
            printf("Unable to allocate memory for the sepia image. \n");
            stbi_image_free(stbi_img);
            return 1;
        }

        // 1. Allocate memory on GPU
        unsigned char *Input, *Output;
        CHECK(cudaMalloc((void**)&Input, img_mem_size));
        CHECK(cudaMalloc((void**)&Output, img_mem_size));

        // 2. Copy data from host memory to device memory
        CHECK(cudaMemcpy(Input, stbi_img, img_mem_size, cudaMemcpyHostToDevice));

        // 3. Launch configuration: 32x32 threads per block, and enough blocks
        //    (ceiling division) to cover every pixel.
        const int block_x = 32, block_y = 32;
        dim3 block(block_x, block_y);
        const int grid_x = (width - 1) / block_x + 1, grid_y = (height - 1) / block_y + 1;
        dim3 grid(grid_x, grid_y);

        // Time stamp; the interval covers the launch and the wait for completion.
        begin = cpuSecond();

        rgb_to_sepia_gpu<<<grid, block>>>(Input, Output, width, height, channels);
        // A kernel launch returns no status itself; launch errors are read here.
        CHECK(cudaGetLastError());
        // Wait for the kernel to finish before reading the clock.
        CHECK(cudaDeviceSynchronize());
        printf("Time cost [GPU]:%f s\n", cpuSecond()-begin);

        // 4. Copy data from device to host
        CHECK(cudaMemcpy(sepia_img, Output, img_mem_size, cudaMemcpyDeviceToHost));

        // 5. Save results as an image (stbi_write_jpg returns 0 on failure)
        if(stbi_write_jpg(output_image_name, width, height, channels, sepia_img, 100)==0)
        {
            printf("Unable to write the output image.\n");
            status = 1;
        }

        // 6. Release host memory and device memory
        CHECK(cudaFree(Input));
        CHECK(cudaFree(Output));
        free(sepia_img);
    }
    else
    {
        printf("Unexpected option (please use cpu/gpu) !\n");
        status = -1;
    }

    stbi_image_free(stbi_img);

    return status;
}

2.2 运行结果

3 实验步骤三

3.1 主要代码

/*
 * Box-blur kernel: each thread produces at most one output pixel as the
 * per-channel average of the input pixels in the
 * (2 * blur_size + 1) x (2 * blur_size + 1) window centred on it, clipped to
 * the image borders. The average is truncated to unsigned char.
 *
 * Expects a 2D launch in which x indexes columns and y indexes rows; threads
 * that fall outside the width x height image do nothing.
 *
 * input_image / output_image: interleaved pixel buffers of
 *     width * height * channels bytes each, indexed as
 *     (row * width + col) * channels + channel.
 * blur_size: window radius in pixels. Negative values are treated as 0
 *     (the pixel is copied), since an empty window would otherwise divide
 *     zero by zero.
 *
 * Per-channel sums are kept in a small fixed-size array instead of an
 * in-kernel malloc, so there is no per-thread allocation that can fail.
 * Channels are processed in groups of kChunk so any channel count works.
 */
__global__ void BLUR_gpu(unsigned char *input_image, unsigned char *output_image, int width, int height, int channels, int blur_size) {
    const int Col = blockIdx.x * blockDim.x + threadIdx.x;
    const int Row = blockIdx.y * blockDim.y + threadIdx.y;
    if ((Col >= width) || (Row >= height)) {
        return;
    }

    const int radius = (blur_size < 0) ? 0 : blur_size;

    // Window bounds clipped to the image. The upper bounds are written so
    // that Row + radius / Col + radius is never evaluated when it would
    // exceed the image (and possibly overflow int).
    const int rowBegin = (Row > radius) ? Row - radius : 0;
    const int rowEnd = (radius < height - 1 - Row) ? Row + radius : height - 1;
    const int colBegin = (Col > radius) ? Col - radius : 0;
    const int colEnd = (radius < width - 1 - Col) ? Col + radius : width - 1;

    // Number of in-image pixels in the window; always >= 1 because the window
    // contains (Row, Col) itself.
    const double cnt = (double)(rowEnd - rowBegin + 1) * (double)(colEnd - colBegin + 1);

    // size_t arithmetic: row * width * channels can exceed INT_MAX on large images.
    const size_t offset = ((size_t)Row * (size_t)width + (size_t)Col) * (size_t)channels;

    const int kChunk = 4;
    for (int base = 0; base < channels; base += kChunk) {
        const int n = (channels - base < kChunk) ? (channels - base) : kChunk;
        double sum[kChunk] = {0.0, 0.0, 0.0, 0.0};

        for (int r = rowBegin; r <= rowEnd; r++) {
            for (int c = colBegin; c <= colEnd; c++) {
                const size_t src = ((size_t)r * (size_t)width + (size_t)c) * (size_t)channels + (size_t)base;
                for (int i = 0; i < n; i++) {
                    sum[i] += (double)input_image[src + i];
                }
            }
        }

        for (int i = 0; i < n; i++) {
            output_image[offset + base + i] = (unsigned char)(sum[i] / cnt);
        }
    }
}

3.2 一些结果

blur_size=15

blur_size=20

posted @ 2023-04-06 20:41 缙云山车神 阅读(23) 评论(0) 编辑 收藏 举报

会员力量，点亮园子希望

刷新页面返回顶部

缙云山车神

我会飞

GPU 编程第四次作业（实验五）

GPU 编程第四次作业（实验五）

1 实验步骤一：

1.1 代码

1.2 运行结果

2 实验步骤二

2.1 代码

2.2 运行结果

3 实验步骤三

3.1 主要代码

3.2 一些结果

公告