airockchip / rknn_model_zoo

Apache License 2.0
1.06k stars 195 forks source link

different execution times on the same rk3588 board with different operating systems (android, linux) #189

Open egormcobakaster opened 2 months ago

egormcobakaster commented 2 months ago

I slightly modified the code in examples/resnet/cpp/main to measure execution time. Measurements showed different results on Linux and Android; only the first run out of 100 took approximately the same time on both Android and Linux.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <chrono>
#include <fstream>
#include <iostream>

#include "resnet.h"
#include "image_utils.h"
#include "file_utils.h"

constexpr const char* Imagenet_classes_file_path = "./model/synset.txt";

/**
 * Benchmark driver: runs ResNet inference `num_repeat` times on one image and
 * reports first/last/average latency in milliseconds, then prints the top-k
 * classification results.
 *
 * Usage: <model_path> <image_path>
 * Returns 0 on success, -1 on any failure.
 */
int main(int argc, char **argv)
{
    if (argc != 3) {
        printf("%s <model_path> <image_path>\n", argv[0]);
        return -1;
    }

    const char* model_path = argv[1];
    const char* image_path = argv[2];

    // All locals are declared (and initialized) before the first goto so the
    // error-handling jumps below never bypass an initialization, which would
    // be ill-formed C++.
    constexpr int topk = 5;        // compile-time bound: the original `resnet_result result[topk]`
                                   // with a runtime `int topk` was a VLA, which is non-standard C++
    constexpr int num_repeat = 100;
    resnet_result result[topk];
    int line_count = 0;
    char** lines = NULL;
    int ret = 0;
    int exit_code = -1;            // flipped to 0 only when every step succeeds
    double first_run_time = 0.0;
    double last_run_time = 0.0;
    double total_time = 0.0;
    rknn_app_context_t rknn_app_ctx;
    image_buffer_t src_image;
    memset(&rknn_app_ctx, 0, sizeof(rknn_app_context_t));
    memset(&src_image, 0, sizeof(image_buffer_t));

    lines = read_lines_from_file(Imagenet_classes_file_path, &line_count);
    if (lines == NULL) {
        printf("read classes label file fail! path=%s\n", Imagenet_classes_file_path);
        return -1;
    }

    ret = init_resnet_model(model_path, &rknn_app_ctx);
    if (ret != 0) {
        printf("init_resnet_model fail! ret=%d model_path=%s\n", ret, model_path);
        goto out_lines;            // model never initialized: skip release_resnet_model()
    }

    ret = read_image(image_path, &src_image);
    if (ret != 0) {
        printf("read image fail! ret=%d image_path=%s\n", ret, image_path);
        goto out;                  // the original returned here, leaking the model and `lines`
    }

    // Inference loop with num_repeat repetitions.
    for (int i = 0; i < num_repeat; i++) {
        // steady_clock is monotonic and the correct clock for measuring
        // intervals; high_resolution_clock may alias a non-monotonic clock.
        const auto start_time = std::chrono::steady_clock::now();

        ret = inference_resnet_model(&rknn_app_ctx, &src_image, result, topk);

        const auto end_time = std::chrono::steady_clock::now();
        // duration<double, milli> keeps sub-millisecond precision; the old
        // duration_cast<milliseconds> truncated to whole milliseconds, making
        // the "%.2f" output misleadingly coarse.
        const double duration_ms =
            std::chrono::duration<double, std::milli>(end_time - start_time).count();
        if (ret != 0) {
            printf("inference_resnet_model fail! ret=%d\n", ret);
            goto out;
        }

        if (i == 0) {
            first_run_time = duration_ms;
        }
        if (i == num_repeat - 1) {
            last_run_time = duration_ms;
        }
        total_time += duration_ms;
    }

    printf("Inference - First run: %.2f ms, Last run: %.2f ms, Average: %.2f ms\n",
           first_run_time, last_run_time, total_time / num_repeat);

    for (int i = 0; i < topk; i++) {
        printf("[%d] score=%.6f class=%s\n", result[i].cls, result[i].score, lines[result[i].cls]);
    }

    exit_code = 0;

out:
    ret = release_resnet_model(&rknn_app_ctx);
    if (ret != 0) {
        printf("release_resnet_model fail! ret=%d\n", ret);
    }
    if (src_image.virt_addr != NULL) {
        free(src_image.virt_addr);
    }

out_lines:
    free_lines(lines, line_count); // `lines` is non-NULL on every path reaching here

    // The original returned 0 unconditionally, even after a failed inference.
    return exit_code;
}

linux output: Inference - First run: 25.00 ms, Last run: 21.00 ms, Average: 21.88 ms [155] score=0.879479 class=n02086240 Shih-Tzu [154] score=0.113574 class=n02086079 Pekinese, Pekingese, Peke [204] score=0.002490 class=n02098413 Lhasa, Lhasa apso [262] score=0.001698 class=n02112706 Brabancon griffon [254] score=0.000742 class=n02110958 pug, pug-dog

android output: Inference - First run: 22.00 ms, Last run: 37.00 ms, Average: 32.86 ms [155] score=0.879479 class=n02086240 Shih-Tzu [154] score=0.113574 class=n02086079 Pekinese, Pekingese, Peke [204] score=0.002490 class=n02098413 Lhasa, Lhasa apso [262] score=0.001698 class=n02112706 Brabancon griffon [254] score=0.000742 class=n02110958 pug, pug-dog

swdee commented 1 month ago

You will get different execution times on the same OS based on which CPU cores the program is running on. The RK3588 has 4 fast Cortex-A76 cores at 2.4GHz and 4 efficient Cortex-A55 cores at 1.8GHz, so depending on how the OS has scheduled the execution of your program, this will affect your inference timing.

To avoid this variation you need to set the CPU affinity of the program to run on the fast A76 cores only, eg:

#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>
#include <unistd.h>
#include <pthread.h>

int main() {
    // Restrict this process to the RK3588's big (Cortex-A76) cluster so the
    // scheduler cannot migrate it onto a slower A55 core mid-benchmark.
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);

    // On the RK3588 the A76 cores are typically exposed as CPUs 4-7.
    for (int core = 4; core <= 7; ++core) {
        CPU_SET(core, &cpuset);
    }

    // Pin the whole process. getpid() names the calling process explicitly
    // (passing 0 would mean the same thing).
    pid_t pid = getpid();
    if (sched_setaffinity(pid, sizeof(cpu_set_t), &cpuset) == -1) {
        perror("sched_setaffinity");
        return -1;
    }

    printf("CPU affinity set to A76 cores only (4, 5, 6, 7)\n");

    // Your program logic here...

    return 0;
}
zen-xingle commented 1 month ago

Have you set the CPU/NPU/DDR frequency? This script helps to do this https://github.com/airockchip/rknn_model_zoo/blob/main/scaling_frequency.sh