Maratyszcza / NNPACK

Acceleration package for neural networks on multi-core CPUs
BSD 2-Clause "Simplified" License
1.67k stars 315 forks source link

Why do more threads take longer? #194

Open csrdxbb opened 3 years ago

csrdxbb commented 3 years ago

Today I tested NNPACK on an ARMv8 machine, and I found that when I increase the thread count, the elapsed time increases as well. I am very confused and not sure what the problem is; the test program looks like this:

#include <cstdlib>
#include <iostream>
#include <sys/time.h>

#include <vector>
#include "nnpack.h"

using namespace std;

float test_nnpack(size_t bs, size_t threads)
{
    enum nnp_status init_status = nnp_initialize();
    if (init_status != nnp_status_success)
    {
        return 0;
    }

    enum nnp_convolution_algorithm algorithm;
    enum nnp_convolution_transform_strategy strategy=nnp_convolution_transform_strategy_tuple_based;
    const size_t batch_size = 1;
    const size_t input_channels = 16;
    const size_t output_channels = 16;
        const size_t kernel_num = 3;
    const struct nnp_padding input_padding = {1, 1, 1, 1};
    const struct nnp_size input_size = {224, 224};
    const struct nnp_size kernel_size = {3, 3};
    const struct nnp_size stride = {.width=1, .height=1};
    const struct nnp_size output_size = {
        .width = (input_padding.left + input_size.width + input_padding.right - kernel_size.width)/stride.width + 1,
        .height = (input_padding.top + input_size.height + input_padding.bottom - kernel_size.height)/stride.height + 1
    };
    float *input, *kernel, *output, *bias;

    input = (float *)malloc(batch_size * input_channels * input_size.height * input_size.width * sizeof(float));
    kernel = (float *)malloc(input_channels * output_channels * kernel_size.height * kernel_size.width * sizeof(float));
    output = (float *)malloc(batch_size * output_channels * output_size.height * output_size.width * sizeof(float));
    bias = (float *)malloc(output_channels * sizeof(float));

    pthreadpool_t threadpool = nullptr;
        if (true) {   
            threadpool = pthreadpool_create(32);  
            printf("Threads: %zu\n", pthreadpool_get_threads_count(threadpool));  
        }   

    struct nnp_profile computation_profile;

    int i, j, c, iter;
    struct timeval start, end;

    for (c = 0; c < input_channels; c++)
    {
        for (i = 0; i < input_size.height; i++)
        {
            for (j = 0; j < input_size.width; j++)
            {
                input[c * input_size.height * input_size.width + i * input_size.width + j] = (i * input_size.width + j) * 0.1;
            }
        }
    }

    for(i = 0; i < output_channels; i++)
    {
        for (j = 0; j < input_channels * kernel_size.height * kernel_size.width; j++)
        {
            kernel[i * input_channels * kernel_size.height * kernel_size.width + j] = 0.1;
        }
    }

    for (i = 0; i < output_channels; i++)
    {
        bias[i] = 1.0;
    }

        iter = 1;
    gettimeofday(&start, nullptr); 
    for (i = 0; i < iter; i++)
    {
        algorithm = nnp_convolution_algorithm_wt8x8;
        nnp_convolution_output(algorithm,
                                  batch_size,
                                  input_channels,
                                  output_channels,
                                  input_size,
                                  input_padding,
                                  kernel_size,
                                  input,
                                  kernel,
                                  bias,
                                  output,
                                  threadpool,
                                  nullptr);
    }
    gettimeofday(&end, nullptr);
    long second = end.tv_sec - start.tv_sec;
    long usecond = end.tv_usec - start.tv_usec;
    float mtime = (second * 1000 + usecond / 1000.0);
    cout << "Winograd convolution elapsed time:" << mtime << "ms"  << endl;
        cout << output[10] << endl;

    gettimeofday(&start, nullptr);
    for (i = 0; i < iter; i++)
    {
        algorithm = nnp_convolution_algorithm_ft8x8;
        nnp_convolution_output(algorithm,
                                  batch_size,
                                  input_channels,
                                  output_channels,
                                  input_size,
                                  input_padding,
                                  kernel_size,
                                  input,
                                  kernel,
                                  bias,
                                  output,
                                  threadpool,
                                  nullptr);
    }
    gettimeofday(&end, nullptr);
    second = end.tv_sec - start.tv_sec;
    usecond = end.tv_usec - start.tv_usec;
    mtime = (second * 1000 + usecond / 1000.0);
    cout << "FFT8x8 convolution elapsed time:" << mtime << "ms"  << endl;
        cout << output[10] << endl;        

    gettimeofday(&start, nullptr);
    for (i = 0; i < iter; i++)
    {
        algorithm = nnp_convolution_algorithm_implicit_gemm;
        nnp_convolution_output(algorithm,
                                  batch_size,
                                  input_channels,
                                  output_channels,
                                  input_size,
                                  input_padding,
                                  kernel_size,
                                  input,
                                  kernel,
                                  bias,
                                  output,
                                  threadpool,
                                  nullptr);
    }
    gettimeofday(&end, nullptr);
    second = end.tv_sec - start.tv_sec;
    usecond = end.tv_usec - start.tv_usec;
    mtime = (second * 1000 + usecond / 1000.0);
    cout << "GEMM convolution elapsed time:" << mtime << "ms"  << endl;
        cout << output[10] << endl; 

    gettimeofday(&start, nullptr);
    for (i = 0; i < iter; i++)
    {
        algorithm = nnp_convolution_algorithm_direct;
        nnp_convolution_output(algorithm,
                                  batch_size,
                                  input_channels,
                                  output_channels,
                                  input_size,
                                  input_padding,
                                  kernel_size,
                                  input,
                                  kernel,
                                  bias,
                                  output,
                                  threadpool,
                                  nullptr);
    }
    gettimeofday(&end, nullptr);
    second = end.tv_sec - start.tv_sec;
    usecond = end.tv_usec - start.tv_usec;
    mtime = (second * 1000 + usecond / 1000.0);
    cout << "Direct convolution elapsed time:" << mtime << "ms"  << endl;
        cout << output[10] << endl; 

    return 0;
}
int main(int argc, char* argv[])
{
        size_t batch_size = atoi(argv[1]);
        size_t thread_num = atoi(argv[2]);
    test_nnpack(batch_size,thread_num);
    return 0;
}

My machine has a NUMA architecture; however, I am sure all 32 threads run on the same node, so there is no NUMA remote-access issue. Please tell me how to improve the time. Thanks!