vladmandic / human

Human: AI-powered 3D Face Detection & Rotation Tracking, Face Description & Recognition, Body Pose Tracking, 3D Hand & Finger Tracking, Iris Analysis, Age & Gender & Emotion Prediction, Gaze Tracking, Gesture Recognition
https://vladmandic.github.io/human/demo/index.html
MIT License
2.38k stars 325 forks source link

Feature: Padding Input image to square improves face detection #475

Closed — vladmandic closed this issue 2 months ago

vladmandic commented 5 months ago

Discussed in https://github.com/vladmandic/human/discussions/468

Originally posted by **StillTravelling** May 20, 2024 I'm not sure if this is a bug or not but in order for face detection to work consistently I'm having to pad out an HD image. The incoming video is 25 fps. Using human 3.2.2 on node. Here's my code: ``` const Human = require('C:\\TestNode\\node_modules\\@vladmandic\\human\\dist\\human.node-gpu.js').default; const humanConfig = { modelBasePath: 'file://hmodels', debug: false, async: true, filter: { enabled: false }, cacheSensitivity : 0.9, //skipAllowed: true, //skipFrames: 200, face: { enabled: true, detector: { enabled: true, maxDetected: 1, rotation: false, minConfidence: 0.8 }, attention: { enabled: false }, mesh: { enabled: false }, iris: { enabled: false }, description: { enabled: false }, emotion: { enabled: false }, antispoof: { enabled: false }, liveness: { enabled: false }, }, gesture: { enabled: false }, hand: { enabled: false }, body: { enabled: false }, object: { enabled: false }, segmentation: { enabled: false} }; const human = new Human(humanConfig); ... 
async function tensorToBuffer(tensor) { const data = await tensor.data(); const buffer = Buffer.from(data); return buffer; } function resizeImage(imageTensor, width, height) { return human.tf.image.resizeBilinear(imageTensor, [height, width]); } // Function to extract a region from an image tensor function extractImage(imageTensor, left, top, width, height) { return imageTensor.slice([top, left, 0], [height, width, -1]); } function padImage(imageTensor, targetWidth, targetHeight) { const [height, width, channels] = imageTensor.shape; const top = Math.floor((targetHeight - height) / 2); const bottom = targetHeight - height - top; const left = Math.floor((targetWidth - width) / 2); const right = targetWidth - width - left; return human.tf.pad(imageTensor, [[top, bottom], [left, right], [0, 0]]); } let lastbox; let facedetect = false; async function processImage2(frameToSend, arW, arH, fW, fH){ let extractedImage; const tensor = await human.tf.node.decodeJpeg(frameToSend, 3); // decode jpeg buffer to raw tensor let current_f = img_dim2; let current_ar = use_aspect_ratio; //set it here because use_aspect_ratio might change mid processing when received as msg if(current_ar == 'cover'){ extractedImage = tensor; } else if (['32', '64', '128', '256','512'].includes(current_ar)) //select the center 32 pixels of the image { let nv = parseInt(current_ar); extractedImage = await human.tf.tidy(() => extractImage(tensor, (arW-nv) / 2, (arH-nv) / 2, nv, nv)); } else if (['32t', '64t', '128t', '256t','512t'].includes(current_ar)) //select the center 32 pixels offset height of the image { let nv = parseInt(current_ar.replace('t','')); let nvt = nv*2; extractedImage = await human.tf.tidy(() => extractImage(tensor, (arW-nv) / 2, (arH-nvt) / 2, nv, nv)); } else if(current_ar.includes('face')){ if(busy ) { human.tf.dispose(tensor); console.log("Busy"); return; } busy = true; let nW = arW / current_f; //arW = 1920 current_f = 4 nW = 480 let nH = arH / current_f; //arH = 1080 current_f = 
4 nH = 270 const tensor_r = await human.tf.tidy(() => resizeImage(tensor, nW,nH)); //resize to improve performance? const tensor_b = await human.tf.tidy(() => padImage(tensor_r, nW,nW)); //pad to a square to improve face rec -- this definitely improves detection human.tf.dispose(tensor_r); let res; if(current_ar == 'faceInterpolated'){ const res1 = await human.detect(tensor_b); res = await human.next(res1); } else{ res = await human.detect(tensor_b); } //console.log(human.tf.engine().memory()); //console.log(res.performance); if (res?.face?.[0]){ facedetect = true; thebox = res.face[0].box let left = thebox[0]; let top = thebox[1]; let width2 = thebox[2] ; let height2 = thebox[3]; lastbox = thebox; // Ensure the coordinates and dimensions are within the bounds of the original image if (left < 0) left = 0; if (top < 0) top = 0; if (left + thebox[2] > nW) width2 = nW - left; if (top + thebox[3] > nW) height2 = nW - top; extractedImage = await human.tf.tidy(() => extractImage(tensor_b, left, top, width2, height2)); human.tf.dispose(tensor_b); } else{ facedetect = false; extractedImage = tensor; } busy = false; } else{ //contain extractedImage = tensor; } const exBuffer = await tensorToBuffer(extractedImage); //set to buffer so can be used by sharp sharpImage = sharp(exBuffer, { raw: { width: extractedImage.shape[1], height: extractedImage.shape[0], channels: 3 }}); //export as sharp to resize later as tf resize bilinear seems to be horrible human.tf.dispose(tensor); human.tf.dispose(extractedImage); return sharpImage; } ``` Is this a bug? I can't find anywhere in the documentation that the input image should be cropped or padded into a square. Finally, performance is great when a face is found, but when a face isn't found, performance drops from 25fps to about 17fps.
vladmandic commented 2 months ago

Added in the 3.3 alpha release.