jaroslawkrol / vision-camera-realtime-object-detection

VisionCamera Frame Processor Plugin to detect objects using TensorFlow Lite Task Vision

Task :vision-camera-realtime-object-detection:compileDebugKotlin FAILED #13

bigg-S opened 5 months ago

bigg-S commented 5 months ago

After following the guide on how to set up this package, the build keeps failing with the error "Task :vision-camera-realtime-object-detection:compileDebugKotlin FAILED". The following is my package.json:

"dependencies": {
  "@babel/plugin-proposal-export-namespace-from": "^7.18.9",
  "@react-native-async-storage/async-storage": "^1.18.2",
  "@react-navigation/bottom-tabs": "^6.5.8",
  "@react-navigation/drawer": "^6.6.3",
  "@react-navigation/native": "^6.1.17",
  "@react-navigation/native-stack": "^6.9.26",
  "@react-navigation/stack": "^6.3.17",
  "@reduxjs/toolkit": "^1.9.5",
  "axios": "^1.4.0",
  "deprecated-react-native-prop-types": "^5.0.0",
  "eslint-plugin-simple-import-sort": "^8.0.0",
  "i18n-js": "3.9.2",
  "lodash": "^4.17.21",
  "lodash.memoize": "4.1.2",
  "lottie-react-native": "^5.1.6",
  "moment": "^2.29.4",
  "nativewind": "^2.0.11",
  "react": "18.2.0",
  "react-native": "^0.72.14",
  "react-native-fast-tflite": "^1.2.0",
  "react-native-gesture-handler": "^2.16.2",
  "react-native-image-picker": "^7.1.2",
  "react-native-localize": "^2.2.4",
  "react-native-permissions": "^4.1.5",
  "react-native-reanimated": "^3.3.0",
  "react-native-safe-area-context": "^4.10.1",
  "react-native-screens": "^3.31.1",
  "react-native-vector-icons": "^9.2.0",
  "react-native-vision-camera": "^4.0.5",
  "react-native-worklets-core": "^1.3.3",
  "react-redux": "^8.1.1",
  "reanimated-bottom-sheet": "^1.0.0-alpha.22",
  "redux-persist": "^6.0.0",
  "redux-thunk": "^2.4.2",
  "vision-camera-realtime-object-detection": "^0.5.1",
  "vision-camera-resize-plugin": "^3.1.0"
}

What might the issue be?

Orange9000 commented 3 months ago

It is incompatible with the latest version of VisionCamera. I've managed to get it working with react-native-vision-camera@4.5.0 by editing some of the native code, as well as working around the assets limitation on an Expo managed project. I can share the details if anyone would like.

anurgsrivastava commented 1 month ago

@Orange9000 Can you please share the details?

Orange9000 commented 1 month ago

> @Orange9000 Can you please share the details?

I ended up rewriting the Android part completely to use MediaPipe instead of plain TensorFlow Lite. Here is the patch. Note that I removed the rotation logic, which might still be necessary in some projects.

diff --git a/node_modules/vision-camera-realtime-object-detection/android/build.gradle b/node_modules/vision-camera-realtime-object-detection/android/build.gradle
index 7c37883..87ed609 100644
--- a/node_modules/vision-camera-realtime-object-detection/android/build.gradle
+++ b/node_modules/vision-camera-realtime-object-detection/android/build.gradle
@@ -72,6 +72,7 @@ dependencies {
   implementation "com.facebook.react:react-native"
   api project(":react-native-vision-camera")
   implementation "androidx.camera:camera-core:1.1.0-alpha06"
+  implementation 'com.google.mediapipe:tasks-vision:0.10.14'
   implementation "org.jetbrains.kotlin:kotlin-stdlib:$kotlin_version"
   implementation 'org.tensorflow:tensorflow-lite-task-vision:0.4.0'
 }
diff --git a/node_modules/vision-camera-realtime-object-detection/android/src/main/java/com/visioncamerarealtimeobjectdetection/RealtimeObjectDetectionProcessorPluginPackage.kt b/node_modules/vision-camera-realtime-object-detection/android/src/main/java/com/visioncamerarealtimeobjectdetection/RealtimeObjectDetectionProcessorPluginPackage.kt
index 58f83d6..dc9a250 100644
--- a/node_modules/vision-camera-realtime-object-detection/android/src/main/java/com/visioncamerarealtimeobjectdetection/RealtimeObjectDetectionProcessorPluginPackage.kt
+++ b/node_modules/vision-camera-realtime-object-detection/android/src/main/java/com/visioncamerarealtimeobjectdetection/RealtimeObjectDetectionProcessorPluginPackage.kt
@@ -4,12 +4,20 @@ import com.facebook.react.ReactPackage
 import com.facebook.react.bridge.NativeModule
 import com.facebook.react.bridge.ReactApplicationContext
 import com.facebook.react.uimanager.ViewManager
-import com.mrousavy.camera.frameprocessor.FrameProcessorPlugin
+import com.mrousavy.camera.frameprocessors.FrameProcessorPlugin
+import com.mrousavy.camera.frameprocessors.FrameProcessorPluginRegistry
 import com.visioncamerarealtimeobjectdetection.realtimeobjectdetectionprocessor.RealtimeObjectDetectionProcessorPlugin

 class RealtimeObjectDetectionProcessorPluginPackage : ReactPackage {
+  companion object {
+    init {
+      FrameProcessorPluginRegistry.addFrameProcessorPlugin("detectObjects") { proxy, options ->
+        RealtimeObjectDetectionProcessorPlugin(proxy, options)
+      }
+    }
+  }
+
   override fun createNativeModules(reactContext: ReactApplicationContext): List<NativeModule> {
-    FrameProcessorPlugin.register(RealtimeObjectDetectionProcessorPlugin(reactContext))
     return emptyList()
   }

diff --git a/node_modules/vision-camera-realtime-object-detection/android/src/main/java/com/visioncamerarealtimeobjectdetection/realtimeobjectdetectionprocessor/RealtimeObjectDetectionProcessorPlugin.kt b/node_modules/vision-camera-realtime-object-detection/android/src/main/java/com/visioncamerarealtimeobjectdetection/realtimeobjectdetectionprocessor/RealtimeObjectDetectionProcessorPlugin.kt
index 812784b..592d741 100644
--- a/node_modules/vision-camera-realtime-object-detection/android/src/main/java/com/visioncamerarealtimeobjectdetection/realtimeobjectdetectionprocessor/RealtimeObjectDetectionProcessorPlugin.kt
+++ b/node_modules/vision-camera-realtime-object-detection/android/src/main/java/com/visioncamerarealtimeobjectdetection/realtimeobjectdetectionprocessor/RealtimeObjectDetectionProcessorPlugin.kt
@@ -1,135 +1,87 @@
 package com.visioncamerarealtimeobjectdetection.realtimeobjectdetectionprocessor

-import kotlin.math.max
-import android.graphics.Matrix
-import android.graphics.RectF
-import androidx.camera.core.ImageProxy
 import com.facebook.react.bridge.ReactApplicationContext
-import com.facebook.react.bridge.ReadableMap
-import com.facebook.react.bridge.WritableNativeArray
-import com.facebook.react.bridge.WritableNativeMap
-import com.google.android.odml.image.MediaMlImageBuilder
-import com.mrousavy.camera.frameprocessor.FrameProcessorPlugin
-import org.tensorflow.lite.task.core.BaseOptions
-import org.tensorflow.lite.task.vision.detector.ObjectDetector
-
-class RealtimeObjectDetectionProcessorPlugin(reactContext: ReactApplicationContext) :
-    FrameProcessorPlugin("detectObjects") {
-    private val _context: ReactApplicationContext = reactContext
-    private var _detector: ObjectDetector? = null

-    fun rotateRect(rect: RectF, degrees: Int): RectF {
-        val matrix = Matrix()
-        matrix.postRotate(degrees.toFloat(), rect.centerX(), rect.centerY())
-        val rotatedRect = RectF(rect)
-        matrix.mapRect(rotatedRect)
-        return rotatedRect
-    }
+import com.google.mediapipe.tasks.core.BaseOptions
+import com.google.mediapipe.tasks.vision.core.RunningMode
+import com.google.mediapipe.framework.image.BitmapImageBuilder
+import com.google.mediapipe.tasks.components.containers.Detection
+import com.google.mediapipe.tasks.vision.objectdetector.ObjectDetector
+
+import com.mrousavy.camera.frameprocessors.Frame
+import com.mrousavy.camera.frameprocessors.VisionCameraProxy
+import com.mrousavy.camera.frameprocessors.FrameProcessorPlugin
+
+class RealtimeObjectDetectionProcessorPlugin(proxy: VisionCameraProxy, options: Map<String, Any>?): FrameProcessorPlugin() {
+    private val _context: ReactApplicationContext = proxy.context
+    private var _detector: ObjectDetector? = null

-    fun getDetectorWithModelFile(config: ReadableMap): ObjectDetector {
+    fun getDetectorWithModelFile(config: Map<String, Any>): ObjectDetector {
         if (_detector == null) {
-            val modelFile = config.getString("modelFile")
-
-            val scoreThreshold = config.getDouble("scoreThreshold").toFloat()
-            val maxResults = config.getInt("maxResults")
-            val numThreads = config.getInt("numThreads")
-
-            val baseOptionsBuilder = BaseOptions.builder().setNumThreads(numThreads)
-
-            val optionsBuilder =
-                ObjectDetector.ObjectDetectorOptions.builder()
-                    .setBaseOptions(baseOptionsBuilder.build())
-                    .setScoreThreshold(scoreThreshold)
-                    .setMaxResults(maxResults)
-
-            _detector =
-                ObjectDetector.createFromFileAndOptions(
-                    _context,
-                    "custom/$modelFile",
-                    optionsBuilder.build()
-                )
+            val modelFile = config["modelFile"].toString()
+
+            val maxResults = (config["maxResults"] as? Number)?.toInt()
+            val scoreThreshold = (config["scoreThreshold"] as? Number)?.toFloat()
+
+            val optionsBuilder = ObjectDetector.ObjectDetectorOptions.builder()
+                .setBaseOptions(BaseOptions.builder().setModelAssetPath(modelFile).build())
+                .setRunningMode(RunningMode.IMAGE)
+                .setMaxResults(maxResults)
+
+            if (scoreThreshold != null && scoreThreshold > 0) {
+                optionsBuilder.setScoreThreshold(scoreThreshold)
+            }
+
+            val options = optionsBuilder.build()
+
+            _detector = ObjectDetector.createFromOptions(_context, options);
         }
         return _detector!!
     }

-    override fun callback(frame: ImageProxy, params: Array<Any>): WritableNativeArray {
+    fun convertToConfigWithDefault(input: Map<String, Any>?): Map<String, Any> {
+        return input ?: emptyMap()
+    }
+
+    override fun callback(frame: Frame, arguments: Map<String, Any>?): Any? {
         val mediaImage = frame.image

+        val results: MutableList<Any> = arrayListOf()
+
         if (mediaImage == null) {
-            return WritableNativeArray()
+            return results
         }

-        val config = params[0] as ReadableMap
-
-        val mlImage = MediaMlImageBuilder(mediaImage).build()
+        val config = convertToConfigWithDefault(arguments)

-        val frameWidth = mlImage.width
-        val frameHeight = mlImage.height
+        val bitmap = frame.getImageProxy().toBitmap()
+        val mlImage = BitmapImageBuilder(bitmap).build()

-        // val ratio = max(mlImage.width.toFloat() / frameWidth, mlImage.height.toFloat() / frameHeight)
+        val detectedObjects = getDetectorWithModelFile(config).detect(mlImage)?.detections()

-        val results = WritableNativeArray()
-        val detectedObjects = getDetectorWithModelFile(config).detect(mlImage)
+        detectedObjects?.forEach { detectedObject ->
+            val labels: MutableList<Any> = arrayListOf()

-        for (detectedObject in detectedObjects) {
-            val labels = WritableNativeArray()
-
-            for (label in detectedObject.categories) {
-                val labelMap = WritableNativeMap()
-
-                labelMap.putInt("index", label.index)
-                labelMap.putString("label", label.label)
-                labelMap.putDouble("confidence", label.score.toDouble())
-
-                labels.pushMap(labelMap)
+            detectedObject.categories().forEach { label ->
+                labels.add(mapOf(
+                    "index" to label.index(),
+                    "label" to label.categoryName(),
+                    "confidence" to label.score().toDouble()
+                ))
             }

-            if (labels.size() > 0) {
-                val objectMap = WritableNativeMap()
-
-                objectMap.putArray("labels", labels)
-
-                val top = when (frame.imageInfo.rotationDegrees) {
-                    90 -> detectedObject.boundingBox.left / frameWidth
-                    180 -> (frameHeight - detectedObject.boundingBox.bottom) / frameHeight
-                    270 -> (frameWidth - detectedObject.boundingBox.right) / frameWidth
-                    else -> detectedObject.boundingBox.top / frameHeight
-                }
-
-                val height = when (frame.imageInfo.rotationDegrees) {
-                    90 -> (detectedObject.boundingBox.right - detectedObject.boundingBox.left) / frameWidth
-                    180 -> (detectedObject.boundingBox.bottom - detectedObject.boundingBox.top) / frameHeight
-                    270 -> (detectedObject.boundingBox.right - detectedObject.boundingBox.left) / frameWidth
-                    else -> (detectedObject.boundingBox.bottom - detectedObject.boundingBox.top) / frameHeight
-                }
-
-                val left = when (frame.imageInfo.rotationDegrees) {
-                    90 -> (frameHeight - detectedObject.boundingBox.bottom) / frameHeight
-                    180 -> (frameWidth - detectedObject.boundingBox.right) / frameWidth
-                    270 -> detectedObject.boundingBox.top / frameHeight
-                    else -> detectedObject.boundingBox.left / frameWidth
-                }
-
-                val width = when (frame.imageInfo.rotationDegrees) {
-                    90 -> (detectedObject.boundingBox.bottom - detectedObject.boundingBox.top) / frameHeight
-                    180 -> (detectedObject.boundingBox.right - detectedObject.boundingBox.left) / frameWidth
-                    270 -> (detectedObject.boundingBox.bottom - detectedObject.boundingBox.top) / frameHeight
-                    else -> (detectedObject.boundingBox.right - detectedObject.boundingBox.left) / frameWidth
-                }
-
-                println("abcde: ${top} ${left} ${width} ${height}")
-                println("xxxxx: ${mediaImage.width} ${mediaImage.height}")
-                println("xxxxx: ${frame.imageInfo.rotationDegrees}")
-
-                objectMap.putDouble("top", top.toDouble())
-                objectMap.putDouble("left", left.toDouble())
-                objectMap.putDouble("width", width.toDouble())
-                objectMap.putDouble("height", height.toDouble())
-
-                results.pushMap(objectMap)
+            if (labels.isNotEmpty()) {
+                results.add(mapOf(
+                    "labels" to labels,
+                    "top" to detectedObject.boundingBox().top.toDouble(),
+                    "left" to detectedObject.boundingBox().left.toDouble(),
+                    "width" to detectedObject.boundingBox().width().toDouble(),
+                    "height" to detectedObject.boundingBox().height().toDouble()
+                ))
             }
         }

         return results
     }
+
 }
diff --git a/node_modules/vision-camera-realtime-object-detection/src/index.tsx b/node_modules/vision-camera-realtime-object-detection/src/index.tsx
index 1911567..8e0450b 100644
--- a/node_modules/vision-camera-realtime-object-detection/src/index.tsx
+++ b/node_modules/vision-camera-realtime-object-detection/src/index.tsx
@@ -14,7 +14,6 @@ export interface ObjectLabel {
 }

 export interface DetectedObject {
-  frameRotation: number;
   labels: ObjectLabel[];

   /**
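
One behavioral difference worth flagging (my reading of the patch, not something the author states): the original plugin divided the bounding box by the frame size and returned normalized [0..1] coordinates, while MediaPipe's boundingBox() values are in pixels of the input bitmap. If your overlay code expects normalized values, a small worklet helper can convert them. A sketch, assuming frame.width/frame.height match the bitmap the detector saw (rotation handling was removed in the patch, so this may need adjustment):

import type {Frame} from 'react-native-vision-camera';
import type {DetectedObject} from 'vision-camera-realtime-object-detection';

// Hypothetical helper: convert the pixel-space box returned by the patched
// plugin back to the normalized [0..1] coordinates the original API used.
export function normalizeDetection(
  obj: DetectedObject,
  frame: Frame,
): DetectedObject {
  'worklet';
  return {
    labels: obj.labels,
    top: obj.top / frame.height,
    left: obj.left / frame.width,
    width: obj.width / frame.width,
    height: obj.height / frame.height,
  };
}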

It expects the model file to be at android/app/src/main/assets/trained.tflite.
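
On a bare React Native project you can simply create that folder and drop the file in. The "assets limitation" on an Expo managed project comes from not being able to keep files inside android/ across prebuilds. One possible workaround (my own sketch, not part of the patch above; the file name app.plugin.ts and the withTfliteModel name are hypothetical) is an Expo config plugin that copies the model into the native assets folder during npx expo prebuild:

// app.plugin.ts
import {withDangerousMod} from '@expo/config-plugins';
import type {ConfigPlugin} from '@expo/config-plugins';
import * as fs from 'fs';
import * as path from 'path';

// Copies a .tflite model from the project root into
// android/app/src/main/assets while the android project is generated.
const withTfliteModel: ConfigPlugin<{model: string}> = (config, {model}) =>
  withDangerousMod(config, [
    'android',
    async cfg => {
      const assetsDir = path.join(
        cfg.modRequest.platformProjectRoot,
        'app',
        'src',
        'main',
        'assets',
      );
      fs.mkdirSync(assetsDir, {recursive: true});
      // Copy the model next to any other bundled assets.
      fs.copyFileSync(
        path.join(cfg.modRequest.projectRoot, model),
        path.join(assetsDir, path.basename(model)),
      );
      return cfg;
    },
  ]);

export default withTfliteModel;

It would then be referenced from app.json as "plugins": [["./app.plugin", {"model": "trained.tflite"}]].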

Usage example:

import {
  Frame,
  runAsync,
  useSkiaFrameProcessor,
  VisionCameraProxy,
} from 'react-native-vision-camera';
import {
  DetectedObject,
  FrameProcessorConfig,
} from 'vision-camera-realtime-object-detection';

const plugin = VisionCameraProxy.initFrameProcessorPlugin('detectObjects', {});

export function scanObjects(
  frame: Frame,
  config: FrameProcessorConfig,
): DetectedObject[] {
  'worklet';

  if (plugin == null) {
    throw new Error('Failed to load Frame Processor Plugin "detectObjects"!');
  }

  // The plugin returns the array built on the native side; cast it to the typed shape.
  return plugin.call(frame, config) as unknown as DetectedObject[];
}
const frameProcessorConfig: FrameProcessorConfig = {
  // numThreads is kept for the original API shape; the MediaPipe rewrite above never reads it.
  numThreads: 1,
  maxResults: 1,
  scoreThreshold: 0.1,
  modelFile: 'trained.tflite',
};

const frameProcessor = useSkiaFrameProcessor(frame => {
  'worklet';

  frame.render();

  runAsync(frame, () => {
    'worklet';
    const detectedObjects: DetectedObject[] = scanObjects(
      frame,
      frameProcessorConfig,
    );

    const rects: DetectionRectangleInterface[] = [];
    for (const inFrameObject of detectedObjects) {
      const label = inFrameObject?.labels[0]?.label;
      const confidence = (inFrameObject?.labels[0]?.confidence ?? 0) * 100;

      rects.push({
        label,
        confidence,
        y: inFrameObject.top,
        x: inFrameObject.left,
        width: inFrameObject.width,
        height: inFrameObject.height,
      });
    }

    setRectJS(rects);
  });
}, []);
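
For completeness, here is a minimal sketch of the two pieces the example assumes but does not define, DetectionRectangleInterface and setRectJS. The Worklets.createRunOnJS wiring is my assumption based on react-native-worklets-core (already in the dependency list above), not part of the original comment:

import {useMemo, useState} from 'react';
import {Worklets} from 'react-native-worklets-core';

// Hypothetical shape, inferred from the fields pushed in the frame processor above.
export interface DetectionRectangleInterface {
  label?: string;
  confidence: number;
  x: number;
  y: number;
  width: number;
  height: number;
}

// Inside the component: state for the detected rects plus a JS-thread callback
// the worklet can invoke. createRunOnJS marshals the call from the frame
// processor thread back to the React JS thread, so the state update is safe.
function useDetectionRects() {
  const [rects, setRects] = useState<DetectionRectangleInterface[]>([]);
  const setRectJS = useMemo(() => Worklets.createRunOnJS(setRects), []);
  return {rects, setRectJS};
}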

I never got to test it with a decent model, only a poorly trained one, so I am not sure it detects everything correctly. But it is a fully working example, tested with react-native-vision-camera@4.5.0.

I also put together a Kaggle notebook, in case you need to train your own model. Let me know if you want me to share it.