SagiK-Repository / CUDA_CPP_DeepLearning

CUDA C++로 진행하는 DeepLearning의 모든 것!
0 stars 0 forks source link

Perceptron Forward Input #2

Open SAgiKPJH opened 1 year ago

SAgiKPJH commented 1 year ago
// AND 학습 - Only Weight
// nvcc -o "DeepLearning/Perceptron/Perceptron_Forward_Normal.cu" "DeepLearning/Perceptron/Perceptron_Forward_Normal.cu.cu" -lpng --expt-relaxed-constexpr -lcurand -lcuda -lcudart -lcublas
// "./DeepLearning/Perceptron/Perceptron_Forward_Normal.cu"

// Using CUDA library : -lcuda -lcudart -lcublas

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
#include <regex> // std::function

// Number of entries in MODEL_LAYER (the input layer is entry 0)
const int MODEL_COUNT = 2;
// Size of each layer, from input to output
const int MODEL_LAYER[MODEL_COUNT] = {2, 1}; // only the input size is meant to be changed

// First and last entries of MODEL_LAYER
const int INPUT_SIZE = MODEL_LAYER[0];
const int OUTPUT_SIZE = MODEL_LAYER[MODEL_COUNT-1];

// Number of input rows processed by one forward pass
const int BATCH_SIZE = 4;

/**
 * Forward pass of a single-output perceptron, one thread per batch sample.
 *
 * Launch layout: 1-D grid of 1-D blocks. Threads whose global index is not
 * below BATCH_SIZE return immediately, so the grid may be larger than the batch.
 *
 * @param inputs   Device pointer to BATCH_SIZE * INPUT_SIZE floats, one
 *                 contiguous row of INPUT_SIZE values per sample.
 * @param weights  Device pointer; the first INPUT_SIZE entries are read.
 * @param bias     Device pointer; only bias[0] is read.
 * @param output   Device pointer; output[s] receives the weighted sum plus
 *                 bias for sample s.
 */
__global__ void forward_kernel(float* inputs, float* weights, float *bias, float* output)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the grid tail: never touch memory beyond the batch
    if (id >= BATCH_SIZE)
        return;

    // Dot product over every input of this sample (not just the first two)
    float sum = 0.0f;
    for (int k = 0; k < INPUT_SIZE; k++) {
        sum += inputs[id * INPUT_SIZE + k] * weights[k];
    }

    // Index the output by the global id so multi-block launches stay correct
    output[id] = sum + bias[0];
}

/**
 * Counts the weights of a fully connected model described by its layer sizes.
 *
 * Each pair of adjacent layers (i, i+1) contributes array[i] * array[i+1]
 * weights, e.g. [2, 3, 4, 5, 1] -> 2*3 + 3*4 + 4*5 + 5*1 = 43.
 *
 * @param array  Layer sizes; `count` readable entries, each expected to be > 0.
 * @param count  Number of entries in `array`.
 * @return Total number of weights; 0 when there are fewer than two layers.
 *
 * Prints "MODEL_LAYER Setting Error" and terminates the process with
 * EXIT_FAILURE when any layer size is zero or negative.
 */
int array_weight_count(const int *array, int count) {
    // Validate every layer (including the last one) before multiplying
    for (int i = 0; i < count; i++) {
        if (array[i] <= 0) {
            std::cout << "MODEL_LAYER Setting Error" << std::endl;
            exit(EXIT_FAILURE);
        }
    }

    int sum = 0;
    // Stop one short of the end so array[i + 1] never reads past the array
    for (int i = 0; i + 1 < count; i++) {
        sum += array[i] * array[i + 1];
    }

    return sum;
}

/**
 * Counts the biases of a model described by its layer sizes.
 *
 * Every layer except the first contributes one bias per unit,
 * e.g. [2, 3, 4, 5, 1] -> 3 + 4 + 5 + 1 = 13.
 *
 * @param array  Layer sizes; `count` readable entries.
 * @param count  Number of entries in `array`.
 * @return Sum of array[1] .. array[count-1]; 0 when count <= 1.
 *
 * Prints "MODEL_LAYER Setting Error" and terminates the process with
 * EXIT_FAILURE when any summed layer size is zero or negative.
 */
int array_bias_count(const int *array, int count) {
    int sum = 0;
    for (int i = 1; i < count; i++) {
        // Validate the layer being summed; array[count] is past the end
        if (array[i] <= 0) {
            std::cout << "MODEL_LAYER Setting Error" << std::endl;
            exit(EXIT_FAILURE);
        }

        sum += array[i];
    }

    return sum;
}

// Usage: cuda_timmer("Execution time", []() { /* code to run */ });
/**
 * Times a callable with CUDA events and prints the elapsed milliseconds.
 *
 * Both events are recorded on stream 0, and the function waits on the stop
 * event before reading the elapsed time.
 *
 * @param msg  Label printed in front of " Elapsed time: ".
 * @param f    Callable executed between the start and stop events.
 */
void cuda_timmer(const std::string& msg, std::function<void()> f) {
    cudaEvent_t start, stop;
    float elapsedTime = 0.0f;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    f();

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);

    // Release the events; otherwise every call leaks two event handles
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    std::cout << msg << " Elapsed time: " << elapsedTime << " ms" << std::endl;
}

/**
 * Runs one forward pass of the perceptron on the GPU and prints the outputs.
 *
 * Steps: build zero-initialized host buffers, allocate device buffers, copy
 * the data to the device, launch forward_kernel with one thread per batch
 * sample, copy the result back, print it, and release all memory.
 * Any failing CUDA runtime call prints a message and exits with EXIT_FAILURE.
 *
 * @return 0 on success.
 */
int main()
{
    // Abort with a readable message as soon as a CUDA runtime call fails
    auto check = [](cudaError_t status, const char* what) {
        if (status != cudaSuccess) {
            std::cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << std::endl;
            exit(EXIT_FAILURE);
        }
    };

    // Initialize inputs, weights and biases (all zero for now)
    float inputs[BATCH_SIZE][INPUT_SIZE] = { 0.0f };
    int weight_count = array_weight_count(MODEL_LAYER, MODEL_COUNT);
    int bias_count = array_bias_count(MODEL_LAYER, MODEL_COUNT);
    if (weight_count <= 0 || bias_count <= 0) {
        // The kernel reads weights[0] and bias[0], so both buffers must be non-empty
        std::cout << "MODEL_LAYER Setting Error" << std::endl;
        return EXIT_FAILURE;
    }
    // Zero-filled heap arrays: runtime-sized stack arrays are not standard C++
    float* weights = new float[weight_count]();
    // initialize_weight(weights);
    float* bias = new float[bias_count]();
    // initialize_bias(bias);

    // Host array that receives the outputs
    float output[BATCH_SIZE] = {0.0f};

    const size_t input_bytes = BATCH_SIZE * INPUT_SIZE * sizeof(float);
    const size_t output_bytes = BATCH_SIZE * OUTPUT_SIZE * sizeof(float);
    const size_t weight_bytes = weight_count * sizeof(float);
    const size_t bias_bytes = bias_count * sizeof(float);

    // Allocate device memory
    float *d_inputs, *d_output, *d_weights, *d_bias;
    check(cudaMalloc(&d_inputs, input_bytes), "cudaMalloc d_inputs");
    check(cudaMalloc(&d_output, output_bytes), "cudaMalloc d_output");
    check(cudaMalloc(&d_weights, weight_bytes), "cudaMalloc d_weights");
    check(cudaMalloc(&d_bias, bias_bytes), "cudaMalloc d_bias");

    // Copy input data to the GPU
    check(cudaMemcpy(d_inputs, inputs, input_bytes, cudaMemcpyHostToDevice), "cudaMemcpy inputs");
    check(cudaMemcpy(d_weights, weights, weight_bytes, cudaMemcpyHostToDevice), "cudaMemcpy weights");
    check(cudaMemcpy(d_bias, bias, bias_bytes, cudaMemcpyHostToDevice), "cudaMemcpy bias");

    // Copy model parameters to the GPU (not consumed by the kernel in this version)
    int* d_model_layer;
    int* d_model_count;
    check(cudaMalloc(&d_model_layer, MODEL_COUNT * sizeof(int)), "cudaMalloc d_model_layer");
    check(cudaMalloc(&d_model_count, sizeof(int)), "cudaMalloc d_model_count");
    check(cudaMemcpy(d_model_layer, MODEL_LAYER, MODEL_COUNT * sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy model layer");
    check(cudaMemcpy(d_model_count, &MODEL_COUNT, sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy model count");

    // Launch the kernel with one thread per batch sample
    forward_kernel<<<1, BATCH_SIZE>>>(d_inputs, d_weights, d_bias, d_output);
    check(cudaGetLastError(), "forward_kernel launch");

    // Copy the outputs back to the host (blocking copy, also surfaces kernel faults)
    check(cudaMemcpy(output, d_output, BATCH_SIZE * sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy output");

    // Print the results
    for (int i = 0; i < BATCH_SIZE; i++) {
        std::cout << "AND(" << inputs[i][0] << ", " << inputs[i][1] << ") = " << output[i] << std::endl;
    }

    // Release device and host memory
    check(cudaFree(d_inputs), "cudaFree d_inputs");
    check(cudaFree(d_output), "cudaFree d_output");
    check(cudaFree(d_weights), "cudaFree d_weights");
    check(cudaFree(d_bias), "cudaFree d_bias");
    check(cudaFree(d_model_layer), "cudaFree d_model_layer");
    check(cudaFree(d_model_count), "cudaFree d_model_count");
    delete[] weights;
    delete[] bias;

    return 0;
}
SAgiKPJH commented 1 year ago

image

SAgiKPJH commented 1 year ago

2023-04-13

// AND 학습 - Only Weight
// nvcc -o "DeepLearning/Perceptron/Perceptron_Forward_Normal.cu" "DeepLearning/Perceptron/Perceptron_Forward_Normal.cu.cu" -lpng --expt-relaxed-constexpr -lcurand -lcuda -lcudart -lcublas
// "./DeepLearning/Perceptron/Perceptron_Forward_Normal.cu"

// Using CUDA library : -lcuda -lcudart -lcublas

// 1 Batch Size, 2 Model_count

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
#include <regex> // std::function

// Number of entries in MODEL_LAYER (the input layer is entry 0)
const int MODEL_COUNT = 2;
// Size of each layer, from input to output
const int MODEL_LAYER[MODEL_COUNT] = {2, 1}; // only the input size is meant to be changed

// First and last entries of MODEL_LAYER
const int INPUT_SIZE = MODEL_LAYER[0];
const int OUTPUT_SIZE = MODEL_LAYER[MODEL_COUNT-1];

// Number of input rows processed by one forward pass
const int BATCH_SIZE = 1;

/**
 * Forward pass of a single-output perceptron, one thread per batch sample.
 *
 * Launch layout: 1-D grid of 1-D blocks. Threads whose global index is not
 * below BATCH_SIZE return immediately, so the grid may be larger than the batch.
 *
 * @param inputs       Device pointer to BATCH_SIZE * INPUT_SIZE floats, one
 *                     contiguous row of INPUT_SIZE values per sample.
 * @param weights      Device pointer; the first INPUT_SIZE entries are read.
 * @param bias         Device pointer; only bias[0] is read.
 * @param output       Device pointer; output[s] receives the weighted sum
 *                     plus bias for sample s.
 * @param MODEL_LAYER  Device copy of the layer sizes; not read by this kernel.
 *                     Note that it shadows the file-level MODEL_LAYER.
 * @param MODEL_COUNT  Device copy of the layer count; not read by this kernel.
 *                     Note that it shadows the file-level MODEL_COUNT.
 */
__global__ void forward_kernel(float* inputs, float* weights, float *bias, float* output, int* MODEL_LAYER, int* MODEL_COUNT)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the grid tail: never touch memory beyond the batch
    if (id >= BATCH_SIZE)
        return;

    // Dot product over every input of this sample (not just the first two)
    float sum = 0.0f;
    for (int k = 0; k < INPUT_SIZE; k++) {
        sum += inputs[id * INPUT_SIZE + k] * weights[k];
    }

    // Index the output by the global id so multi-block launches stay correct
    output[id] = sum + bias[0];
}

/**
 * Counts the weights of a fully connected model described by its layer sizes.
 *
 * Each pair of adjacent layers (i, i+1) contributes array[i] * array[i+1]
 * weights, e.g. [2, 3, 4, 5, 1] -> 2*3 + 3*4 + 4*5 + 5*1 = 43.
 *
 * @param array  Layer sizes; `count` readable entries, each expected to be > 0.
 * @param count  Number of entries in `array`.
 * @return Total number of weights; 0 when there are fewer than two layers.
 *
 * Prints "MODEL_LAYER Setting Error" and terminates the process with
 * EXIT_FAILURE when any layer size is zero or negative.
 */
int array_weight_count(const int *array, int count) {
    // Validate every layer (including the last one) before multiplying
    for (int i = 0; i < count; i++) {
        if (array[i] <= 0) {
            std::cout << "MODEL_LAYER Setting Error" << std::endl;
            exit(EXIT_FAILURE);
        }
    }

    int sum = 0;
    // Stop one short of the end so array[i + 1] never reads past the array
    for (int i = 0; i + 1 < count; i++) {
        sum += array[i] * array[i + 1];
    }

    return sum;
}

/**
 * Counts the biases of a model described by its layer sizes.
 *
 * Every layer except the first contributes one bias per unit,
 * e.g. [2, 3, 4, 5, 1] -> 3 + 4 + 5 + 1 = 13.
 *
 * @param array  Layer sizes; `count` readable entries.
 * @param count  Number of entries in `array`.
 * @return Sum of array[1] .. array[count-1]; 0 when count <= 1.
 *
 * Prints "MODEL_LAYER Setting Error" and terminates the process with
 * EXIT_FAILURE when any summed layer size is zero or negative.
 */
int array_bias_count(const int *array, int count) {
    int sum = 0;
    for (int i = 1; i < count; i++) {
        // Validate the layer being summed; array[count] is past the end
        if (array[i] <= 0) {
            std::cout << "MODEL_LAYER Setting Error" << std::endl;
            exit(EXIT_FAILURE);
        }

        sum += array[i];
    }

    return sum;
}

// Usage: cuda_timmer("Execution time", []() { /* code to run */ });
/**
 * Times a callable with CUDA events and prints the elapsed milliseconds.
 *
 * Both events are recorded on stream 0, and the function waits on the stop
 * event before reading the elapsed time.
 *
 * @param msg  Label printed in front of " Elapsed time: ".
 * @param f    Callable executed between the start and stop events.
 */
void cuda_timmer(const std::string& msg, std::function<void()> f) {
    cudaEvent_t start, stop;
    float elapsedTime = 0.0f;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    f();

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);

    // Release the events; otherwise every call leaks two event handles
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    std::cout << msg << " Elapsed time: " << elapsedTime << " ms" << std::endl;
}

/**
 * Runs one forward pass of the perceptron on the GPU and prints the outputs.
 *
 * Steps: build zero-initialized host buffers, allocate device buffers, copy
 * the data and model parameters to the device, launch forward_kernel with one
 * thread per batch sample, copy the result back, print it, and release all
 * memory. Any failing CUDA runtime call prints a message and exits with
 * EXIT_FAILURE.
 *
 * @return 0 on success.
 */
int main()
{
    // Abort with a readable message as soon as a CUDA runtime call fails
    auto check = [](cudaError_t status, const char* what) {
        if (status != cudaSuccess) {
            std::cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << std::endl;
            exit(EXIT_FAILURE);
        }
    };

    // Initialize inputs, weights and biases (all zero for now)
    float inputs[BATCH_SIZE][INPUT_SIZE] = { 0.0f };
    int weight_count = array_weight_count(MODEL_LAYER, MODEL_COUNT);
    int bias_count = array_bias_count(MODEL_LAYER, MODEL_COUNT);
    if (weight_count <= 0 || bias_count <= 0) {
        // The kernel reads weights[0] and bias[0], so both buffers must be non-empty
        std::cout << "MODEL_LAYER Setting Error" << std::endl;
        return EXIT_FAILURE;
    }
    // Zero-filled heap arrays: runtime-sized stack arrays are not standard C++
    float* weights = new float[weight_count]();
    // initialize_weight(weights);
    float* bias = new float[bias_count]();
    // initialize_bias(bias);

    // Host array that receives the outputs
    float output[BATCH_SIZE] = {0.0f};

    const size_t input_bytes = BATCH_SIZE * INPUT_SIZE * sizeof(float);
    const size_t output_bytes = BATCH_SIZE * OUTPUT_SIZE * sizeof(float);
    const size_t weight_bytes = weight_count * sizeof(float);
    const size_t bias_bytes = bias_count * sizeof(float);

    // Allocate device memory
    float *d_inputs, *d_output, *d_weights, *d_bias;
    check(cudaMalloc(&d_inputs, input_bytes), "cudaMalloc d_inputs");
    check(cudaMalloc(&d_output, output_bytes), "cudaMalloc d_output");
    check(cudaMalloc(&d_weights, weight_bytes), "cudaMalloc d_weights");
    check(cudaMalloc(&d_bias, bias_bytes), "cudaMalloc d_bias");

    // Copy input data to the GPU
    check(cudaMemcpy(d_inputs, inputs, input_bytes, cudaMemcpyHostToDevice), "cudaMemcpy inputs");
    check(cudaMemcpy(d_weights, weights, weight_bytes, cudaMemcpyHostToDevice), "cudaMemcpy weights");
    check(cudaMemcpy(d_bias, bias, bias_bytes, cudaMemcpyHostToDevice), "cudaMemcpy bias");

    // Copy model parameters to the GPU
    int* d_model_layer;
    int* d_model_count;
    check(cudaMalloc(&d_model_layer, MODEL_COUNT * sizeof(int)), "cudaMalloc d_model_layer");
    check(cudaMalloc(&d_model_count, sizeof(int)), "cudaMalloc d_model_count");
    check(cudaMemcpy(d_model_layer, MODEL_LAYER, MODEL_COUNT * sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy model layer");
    check(cudaMemcpy(d_model_count, &MODEL_COUNT, sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy model count");

    // Launch the kernel with one thread per batch sample
    forward_kernel<<<1, BATCH_SIZE>>>(d_inputs, d_weights, d_bias, d_output, d_model_layer, d_model_count);
    check(cudaGetLastError(), "forward_kernel launch");

    // Copy the outputs back to the host (blocking copy, also surfaces kernel faults)
    check(cudaMemcpy(output, d_output, BATCH_SIZE * sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy output");

    // Print the results
    for (int i = 0; i < BATCH_SIZE; i++) {
        std::cout << "AND(" << inputs[i][0] << ", " << inputs[i][1] << ") = " << output[i] << std::endl;
    }

    // Release device and host memory
    check(cudaFree(d_inputs), "cudaFree d_inputs");
    check(cudaFree(d_output), "cudaFree d_output");
    check(cudaFree(d_weights), "cudaFree d_weights");
    check(cudaFree(d_bias), "cudaFree d_bias");
    check(cudaFree(d_model_layer), "cudaFree d_model_layer");
    check(cudaFree(d_model_count), "cudaFree d_model_count");
    delete[] weights;
    delete[] bias;

    return 0;
}