espressif / esp-dl

Espressif deep-learning library for AIoT applications
MIT License
519 stars 116 forks source link

Pnet valid box issue #19

Closed caleb221 closed 2 years ago

caleb221 commented 4 years ago

Hello! For the life of me I cannot figure out what is wrong with my implementation of PNet (the first and smallest network in MTCNN). The configuration is set to the same values as in the example code (min_face = 0.07 and pyramid = 80). The input is an XGA frame buffer converted to a dl_matrix3du_t, which is then normalized and turned into the proper floating-point dl_matrix3d_t. The code is shown below. The weights are obtained from a .h file whose functions write values into input->items. setOutput is a function that sets a global mtmn_net_t variable to the matrix given at the point of calling (0 for score, 1 for bounding boxes).

/**
 * FreeRTOS task body: runs a single PNet forward pass and publishes the result.
 *
 * The task takes `skeletonKey`, pushes the input matrix through three
 * conv -> max-pool -> PReLU stages, computes the score head (followed by
 * softmax) and the bounding-box head, hands both outputs to
 * initOut()/setOutput(), releases `skeletonKey`, waits one second and then
 * deletes itself. It never returns.
 *
 * @param pvParameter  Pointer to the input dl_matrix3d_t (cast from void *).
 *                     The input is not freed here.
 *
 * Ownership: every matrix allocated in this function is freed in this
 * function, including the layer-2 filter/bias and the bbox output, which the
 * previous version leaked on every run.
 *
 * NOTE(review): setOutput() is defined elsewhere. If it stores the pointer
 * instead of copying the data, the frees at the end leave the global output
 * dangling. Confirm that it copies.
 */
void pnetCore1(void *pvParameter)
{
    printf("\n[PNET] HELLO P net!\n");

    xSemaphoreTake(skeletonKey, portMAX_DELAY);

    dl_matrix3d_t *in = (dl_matrix3d_t *)pvParameter;

    //=====================================
    // PNET LAYER 1
    dl_matrix3d_t *filt_c1 = dl_matrix3d_alloc(10, 3, 3, 3);
    dl_matrix3d_t *bias1   = dl_matrix3d_alloc(1, 1, 1, 10);
    dl_matrix3d_t *prelu1  = dl_matrix3d_alloc(1, 1, 1, 10);

    // init weights (generated getters from pnetVals.h)
    getpnet_conv1_0(filt_c1);
    getpnet_conv1_1(bias1);
    getpnet_prelu1_0(prelu1);

    // calculate
    dl_matrix3d_t *out1 = dl_matrix3dff_conv_3x3(in, filt_c1, bias1, 1, 1, PADDING_VALID);
    dl_matrix3d_free(filt_c1);
    dl_matrix3d_free(bias1);

    // pool
    // NOTE(review): arguments 1,1,2,2 look like a 1x1 window with stride 2;
    // a 2x2 max-pool would presumably be 2,2,2,2. Verify against the
    // dl_matrix3d_pooling prototype.
    dl_matrix3d_t *outPool = dl_matrix3d_pooling(out1, 1, 1, 2, 2, PADDING_VALID, DL_POOLING_MAX);
    dl_matrix3d_free(out1);

    dl_matrix3d_p_relu(outPool, prelu1);
    dl_matrix3d_free(prelu1);

    //=====================================
    // PNET LAYER 2
    dl_matrix3d_t *filt_c2 = dl_matrix3d_alloc(16, 3, 3, 10);
    dl_matrix3d_t *bias2   = dl_matrix3d_alloc(1, 1, 1, 16);
    dl_matrix3d_t *prelu2  = dl_matrix3d_alloc(1, 1, 1, 16);

    // init weights
    getpnet_conv2_0(filt_c2);
    getpnet_conv2_1(bias2);
    getpnet_prelu2_0(prelu2);

    // calculate
    dl_matrix3d_t *out2 = dl_matrix3dff_conv_3x3(outPool, filt_c2, bias2, 1, 1, PADDING_VALID);
    dl_matrix3d_free(outPool);
    // These two were never released before, leaking heap on every run.
    dl_matrix3d_free(filt_c2);
    dl_matrix3d_free(bias2);

    // pool
    // NOTE(review): the reference PNet has a pooling stage only after the
    // first convolution; confirm that pooling here (and in layer 3) is intended.
    dl_matrix3d_t *pool2 = dl_matrix3d_pooling(out2, 1, 1, 2, 2, PADDING_VALID, DL_POOLING_MAX);
    dl_matrix3d_free(out2);

    dl_matrix3d_p_relu(pool2, prelu2);
    dl_matrix3d_free(prelu2);

    //=====================================
    // PNET LAYER 3
    dl_matrix3d_t *filt_c3 = dl_matrix3d_alloc(32, 3, 3, 16);
    dl_matrix3d_t *bias3   = dl_matrix3d_alloc(1, 1, 1, 32);

    // get weights
    getpnet_conv3_0(filt_c3);
    getpnet_conv3_1(bias3);

    // calculate
    dl_matrix3d_t *out3 = dl_matrix3dff_conv_3x3(pool2, filt_c3, bias3, 1, 1, PADDING_VALID);
    dl_matrix3d_free(pool2);
    dl_matrix3d_free(bias3);
    dl_matrix3d_free(filt_c3);

    // pool
    dl_matrix3d_t *pool3 = dl_matrix3d_pooling(out3, 1, 1, 2, 2, PADDING_VALID, DL_POOLING_MAX);
    dl_matrix3d_free(out3);

    // prelu
    dl_matrix3d_t *prelu3 = dl_matrix3d_alloc(1, 1, 1, 32);
    getpnet_prelu3_0(prelu3);
    dl_matrix3d_p_relu(pool3, prelu3);
    dl_matrix3d_free(prelu3);

    //=====================================
    // SCORE HEAD
    dl_matrix3d_t *score_filter = dl_matrix3d_alloc(2, 1, 1, 32);
    dl_matrix3d_t *score_bias   = dl_matrix3d_alloc(1, 1, 1, 2);

    getpnet_score_0(score_filter);
    getpnet_score_1(score_bias);

    // NOTE(review): the filter is allocated as 1x1 but passed to the 3x3
    // convolution routine. Check whether a 1x1 convolution function should be
    // used here; a 3x3 kernel may read past the 1x1 filter data.
    dl_matrix3d_t *score_out = dl_matrix3dff_conv_3x3(pool3, score_filter, score_bias, 1, 1, PADDING_VALID);
    dl_matrix3d_free(score_filter);
    dl_matrix3d_free(score_bias);

    if (score_out != NULL) {
        dl_matrix3d_softmax(score_out);
    }

    //=====================================
    // BOUNDING-BOX HEAD
    dl_matrix3d_t *bbox_filter = dl_matrix3d_alloc(4, 1, 1, 32);
    dl_matrix3d_t *bbox_bias   = dl_matrix3d_alloc(1, 1, 1, 4);

    getpnet_bbox_pred_0(bbox_filter);
    getpnet_bbox_pred_1(bbox_bias);

    // The convolution returns its own output matrix, so no pre-allocation is
    // needed. The old pre-allocated bbox_out was overwritten and leaked.
    // NOTE(review): same 1x1-filter / 3x3-routine concern as the score head.
    dl_matrix3d_t *bbox_out = dl_matrix3dff_conv_3x3(pool3, bbox_filter, bbox_bias, 1, 1, PADDING_VALID);
    dl_matrix3d_free(bbox_filter);
    dl_matrix3d_free(bbox_bias);
    dl_matrix3d_free(pool3);

    //=========================================
    // SET MEMORY
    if (score_out != NULL && bbox_out != NULL) {
        initOut(score_out->w, score_out->h, bbox_out->w, bbox_out->h);
        setOutput(bbox_out, 1);  // bounding boxes
        setOutput(score_out, 0); // score output
    } else {
        printf("\n[PNET] output allocation failed\n");
    }

    if (bbox_out != NULL) {
        dl_matrix3d_free(bbox_out);
    }
    if (score_out != NULL) {
        dl_matrix3d_free(score_out);
    }

    printf("\n[PNET] Bye PNET!\n");
    xSemaphoreGive(skeletonKey);
    vTaskDelay(1000 / portTICK_PERIOD_MS);

    // A FreeRTOS task must not return; deleting the calling task ends it here.
    // The former while(true) wrapper and the trailing fallback loop could
    // never execute a second time, so they are dropped.
    vTaskDelete(NULL);
}

XiaochaoGONG commented 4 years ago

What's wrong, concretely?

caleb221 commented 4 years ago

Oh! You're right, I hadn't stated the problem, sorry about that. While this code does compile and run, the behavior is strange in that the outputs come out as almost constant (when the input is coming from the camera). For example, this code provides a valid result on the second iteration of the model, but afterwards it no longer gives any such results. I am looking into the camera configuration and other parts of the code, but I need to make sure I am using the library correctly so that I can locate where the bug is coming from.

XiaochaoGONG commented 4 years ago

Can you get the right results from the convolutional network for each layer? If so, then take a look at how the results are being processed.