homuler / MediaPipeUnityPlugin

Unity plugin to run MediaPipe
MIT License

Error when trying to get face landmarks using Holistic #700

Closed jcelsi closed 2 years ago

jcelsi commented 2 years ago

Plugin Version or Commit ID

v0.10.0

Unity Version

2021.3.3f1

Your Host OS

Ubuntu 22.04

Target Platform

UnityEditor

Description

I'm trying to implement the Holistic solution by replicating the steps from the Getting Started tutorial. I have already managed to combine the Hair Segmentation and Face Mesh solutions in a single graph, and now I want to do the same with Hair Segmentation and Holistic.

This is my implementation of the solution:

using System;
using System.Runtime.InteropServices;
using UnityEngine.Rendering;
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using UnityEngine.UI;
using Mediapipe.Unity.CoordinateSystem;

using Stopwatch = System.Diagnostics.Stopwatch;

namespace Mediapipe.Unity.Tutorial
{
  public class HairAndHolisticSolution : MonoBehaviour
  {
    [SerializeField] private TextAsset _configAsset;
    [SerializeField] private RawImage _screen;
    [SerializeField] private Texture _tree;
    [SerializeField] private int _width;
    [SerializeField] private int _height;
    [SerializeField] private int _fps;
    [SerializeField] private Material _HairMaterial;
    [SerializeField] private GenerateMesh generateMesh;

    [SerializeField] private bool Kinect;

    private CalculatorGraph _graph;
    private ResourceManager _resourceManager;
    private GraphicsBuffer _maskBuffer;

    private WebCamTexture _webCamTexture;
    private Texture2D _inputTexture;
    private Color32[] _inputPixelData;
    private Texture2D _outputTexture;
    private Color32[] _outputPixelData;
    private Texture liveTexture;

    private float[] _maskArray;
    private int _maskWidth;
    private int _maskHeight;

    private static readonly int[] _PointsMask ={
      4, 10, 338, 297, 332, 284, 251, 389, 356, 454, 323, 361, 288, 397, 365, 379, 378, 400, 377, 152, 148, 176, 149,
      150, 136, 172, 58, 132, 93, 234, 127, 162, 21, 54, 103, 67, 109
    };

    private IEnumerator Start()
    {
        if (WebCamTexture.devices.Length == 0)
        {
            throw new System.Exception("Web Camera devices are not found");
        }

        // Pick the first camera whose name matches the desired device.
        var cameraIndex = 0;
        foreach (var device in WebCamTexture.devices)
        {
          Debug.Log(device.name);
          if (device.name.Contains(Kinect ? "Kinect" : "Integrated"))
          {
            break;
          }
          cameraIndex += 1;
        }
        var webCamDevice = WebCamTexture.devices[cameraIndex];
        _webCamTexture = new WebCamTexture(webCamDevice.name, _width, _height, _fps);
        _webCamTexture.Play();

        yield return new WaitUntil(() => _webCamTexture.width > 16);
        yield return GpuManager.Initialize();

        if (!GpuManager.IsInitialized)
        {
            throw new System.Exception("Failed to initialize GPU resources");
        }

        _screen.rectTransform.sizeDelta = new Vector2(_width, _height);

        _inputTexture = new Texture2D(_width, _height, TextureFormat.RGBA32, false);
        _inputPixelData = new Color32[_width * _height];

        _outputTexture = new Texture2D(_width, _height, TextureFormat.RGBA32, false);
        _outputPixelData = new Color32[_width * _height];

        _screen.texture = _inputTexture;

        _resourceManager = new LocalResourceManager();
        yield return _resourceManager.PrepareAssetAsync("hair_segmentation.bytes");
        yield return _resourceManager.PrepareAssetAsync("face_landmark.bytes");
        yield return _resourceManager.PrepareAssetAsync("pose_landmark_full.bytes");

        var stopwatch = new Stopwatch();

        _graph = new CalculatorGraph(_configAsset.text);
        _graph.SetGpuResources(GpuManager.GpuResources).AssertOk();

        // Hair mask output stream
        var outputVideoStream = new OutputStream<ImageFramePacket, ImageFrame>(_graph, "hair_mask");
        // Face landmarks output stream
        var multiFaceLandmarksStream = new OutputStream<NormalizedLandmarkListVectorPacket, List<NormalizedLandmarkList>>(_graph, "face_landmarks");

        outputVideoStream.StartPolling().AssertOk();
        multiFaceLandmarksStream.StartPolling().AssertOk();
        _graph.StartRun().AssertOk();
        stopwatch.Start();

        _maskWidth = _width;
        _maskHeight = _height;
        _maskArray = new float[512 * 512];

        var screenRect = _screen.GetComponent<RectTransform>().rect;

        var stride = Marshal.SizeOf(typeof(float));
        _maskBuffer = new GraphicsBuffer(GraphicsBuffer.Target.Structured, 512 * 512, stride);
        _HairMaterial.SetBuffer("_MaskBuffer", _maskBuffer);

        var texture = new Texture2D(1, 1, TextureFormat.RGBA32, false);
        var textureColor = new Color32((byte)(255), (byte)(255), (byte)(255), (byte)(255));
        texture.SetPixels32(new Color32[] { textureColor });
        texture.Apply();

        _HairMaterial.SetTexture("_MaskTex", texture);

        while (true)
        {
            _inputTexture.SetPixels32(_webCamTexture.GetPixels32(_inputPixelData));
            var imageFrame = new ImageFrame(ImageFormat.Types.Format.Srgba, _width, _height, _width * 4, _inputTexture.GetRawTextureData<byte>());
            var currentTimestamp = stopwatch.ElapsedTicks / (System.TimeSpan.TicksPerMillisecond / 1000);
            _graph.AddPacketToInputStream("input_video", new ImageFramePacket(imageFrame, new Timestamp(currentTimestamp))).AssertOk();

            _inputTexture.SetPixels32(_inputPixelData);
            _inputTexture.Apply();

            yield return new WaitForEndOfFrame();

            if (outputVideoStream.TryGetNext(out var outputVideo))
            {
              if (outputVideo != null)
              {

                var _ = outputVideo.TryReadChannelNormalized(0, _maskArray);

                _maskBuffer.SetData(_maskArray);
                _HairMaterial.SetTexture("_MaskTex", _inputTexture);
                //_HairMaterial.SetTexture("_MainTex", _inputTexture);
              }  
            }

            if (multiFaceLandmarksStream.TryGetNext(out var multiFaceLandmarks))
            {
                if (multiFaceLandmarks != null && multiFaceLandmarks.Count > 0)
                {
                  foreach (var landmarks in multiFaceLandmarks)
                  {
                    // top of the head
                    var topOfHead = landmarks.Landmark[10];
                    //Debug.Log($"Unity Local Coordinates: {screenRect.GetPoint(topOfHead)}, Image Coordinates: {topOfHead}");

                    var vectores = new NormalizedLandmark[_PointsMask.Length];

                    for(var i = 0; i < _PointsMask.Length; i++){

                        var point = landmarks.Landmark[_PointsMask[i]];
                        vectores[i] = point;
                    }
                    generateMesh.Rebuild(vectores);
                  }
                }
            }
        }
    }

    private void OnDestroy()
    {
      if (_webCamTexture != null)
      {
        _webCamTexture.Stop();
      }

      if (_graph != null)
      {
        try
        {
          _graph.CloseInputStream("input_video").AssertOk();
          _graph.WaitUntilDone().AssertOk();
        }
        finally
        {

          _graph.Dispose();
        }
      }
      //_maskBuffer.Dispose();
      GpuManager.Shutdown();
    }
  }
}

And this is the mixed graph of hair segmentation and holistic.

# Copyright 2019 The MediaPipe Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Copied from mediapipe/graphs/hair_segmentation/hair_segmentation_mobile_gpu.pbtxt
#
# CHANGES:
#   - `input_video` is ImageFrame (ImageFrameToGpuBufferCalculator converts it into GpuBuffer)
#   - ImageTransformationCalculator rotates the input
#   - Remove RecolorCalculator
#   - output ImageFrame

# MediaPipe graph that performs hair segmentation with TensorFlow Lite on GPU.
# Used in the example in
# mediapipe/examples/android/src/java/com/mediapipe/apps/hairsegmentationgpu.

input_stream: "input_video"
output_stream: "hair_mask"
output_stream: "pose_landmarks"
output_stream: "pose_world_landmarks"
output_stream: "segmentation_mask"
output_stream: "pose_roi"
output_stream: "pose_detection"
output_stream: "face_landmarks"
output_stream: "left_hand_landmarks"
output_stream: "right_hand_landmarks"

# Throttles the images flowing downstream for flow control. It passes through
# the very first incoming image unaltered, and waits for
# TfLiteTensorsToSegmentationCalculator downstream in the graph to finish
# generating the corresponding hair mask before it passes through another
# image. All images that come in while waiting are dropped, limiting the number
# of in-flight images between this calculator and
# TfLiteTensorsToSegmentationCalculator to 1. This prevents the nodes in between
# from queuing up incoming images and data excessively, which leads to increased
# latency and memory usage, unwanted in real-time mobile applications. It also
# eliminates unnecessary computation, e.g., a transformed image produced by
# ImageTransformationCalculator may get dropped downstream if the subsequent
# TfLiteConverterCalculator or TfLiteInferenceCalculator is still busy
# processing previous inputs.
node {
  calculator: "FlowLimiterCalculator"
  input_stream: "input_video"
  input_stream: "FINISHED:face_landmarks"
  input_stream_info: {
    tag_index: "FINISHED"
    back_edge: true
  }
  output_stream: "throttled_input_video"
  node_options: {
    [type.googleapis.com/mediapipe.FlowLimiterCalculatorOptions] {
      max_in_flight: 1
      max_in_queue: 0
      # Timeout is disabled (set to 0) as first frame processing can take more
      # than 1 second.
      in_flight_timeout: 0
    }
  }
}

node: {
  calculator: "ImageFrameToGpuBufferCalculator"
  input_stream: "throttled_input_video"
  output_stream: "throttled_input_video_gpu"
}

node {
  calculator: "ConstantSidePacketCalculator"
  output_side_packet: "PACKET:0:input_rotation"
  output_side_packet: "PACKET:1:input_horizontally_flipped"
  output_side_packet: "PACKET:2:input_vertically_flipped"

  output_side_packet: "PACKET:3:output_rotation"
  output_side_packet: "PACKET:4:output_horizontally_flipped"
  output_side_packet: "PACKET:5:output_vertically_flipped"

  output_side_packet: "PACKET:6:num_faces"
  output_side_packet: "PACKET:7:with_attention"

  output_side_packet: "PACKET:8:model_complexity"
  output_side_packet: "PACKET:9:smooth_landmarks"
  output_side_packet: "PACKET:10:refine_face_landmarks"
  output_side_packet: "PACKET:11:enable_segmentation"
  output_side_packet: "PACKET:12:smooth_segmentation"
  node_options: {
    [type.googleapis.com/mediapipe.ConstantSidePacketCalculatorOptions]: {
      packet { int_value: 0 }
      packet { bool_value: false }
      packet { bool_value: true }
      packet { int_value: 0 }
      packet { bool_value: false }
      packet { bool_value: false }
      packet { int_value: 1 }
      packet { bool_value: true }

      packet { int_value: 1 }
      packet { bool_value: true }
      packet { bool_value: true }
      packet { bool_value: true }
      packet { bool_value: true }
    }
  }
}

# Transforms the input image on GPU to a 512x512 image. To scale the image, by
# default it uses the STRETCH scale mode that maps the entire input image to the
# entire transformed image. As a result, image aspect ratio may be changed and
# objects in the image may be deformed (stretched or squeezed), but the hair
# segmentation model used in this graph is agnostic to that deformation.
node: {
  calculator: "ImageTransformationCalculator"
  input_stream: "IMAGE_GPU:throttled_input_video_gpu"
  input_side_packet: "ROTATION_DEGREES:input_rotation"
  input_side_packet: "FLIP_HORIZONTALLY:input_horizontally_flipped"
  input_side_packet: "FLIP_VERTICALLY:input_vertically_flipped"
  output_stream: "IMAGE_GPU:transformed_input_video"
  node_options: {
    [type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] {
      output_width: 512
      output_height: 512
    }
  }
}

# Caches a mask fed back from the previous round of hair segmentation, and upon
# the arrival of the next input image sends out the cached mask with the
# timestamp replaced by that of the input image, essentially generating a packet
# that carries the previous mask. Note that upon the arrival of the very first
# input image, an empty packet is sent out to jump start the feedback loop.
node {
  calculator: "PreviousLoopbackCalculator"
  input_stream: "MAIN:throttled_input_video_gpu"
  input_stream: "LOOP:hair_mask_gpu"
  input_stream_info: {
    tag_index: "LOOP"
    back_edge: true
  }
  output_stream: "PREV_LOOP:previous_hair_mask"
}

# Embeds the hair mask generated from the previous round of hair segmentation
# as the alpha channel of the current input image.
node {
  calculator: "SetAlphaCalculator"
  input_stream: "IMAGE_GPU:transformed_input_video"
  input_stream: "ALPHA_GPU:previous_hair_mask"
  output_stream: "IMAGE_GPU:mask_embedded_input_video"
}

# Converts the transformed input image on GPU into an image tensor stored in
# tflite::gpu::GlBuffer. The zero_center option is set to false to normalize the
# pixel values to [0.f, 1.f] as opposed to [-1.f, 1.f]. With the
# max_num_channels option set to 4, all 4 RGBA channels are contained in the
# image tensor.
node {
  calculator: "TfLiteConverterCalculator"
  input_stream: "IMAGE_GPU:mask_embedded_input_video"
  output_stream: "TENSORS_GPU:image_tensor"
  node_options: {
    [type.googleapis.com/mediapipe.TfLiteConverterCalculatorOptions] {
      zero_center: false
      max_num_channels: 4
    }
  }
}

# Generates a single side packet containing a TensorFlow Lite op resolver that
# supports custom ops needed by the model used in this graph.
node {
  calculator: "TfLiteCustomOpResolverCalculator"
  output_side_packet: "op_resolver"
  node_options: {
    [type.googleapis.com/mediapipe.TfLiteCustomOpResolverCalculatorOptions] {
      use_gpu: true
    }
  }
}

# Runs a TensorFlow Lite model on GPU that takes an image tensor and outputs a
# tensor representing the hair segmentation, which has the same width and height
# as the input image tensor.
node {
  calculator: "TfLiteInferenceCalculator"
  input_stream: "TENSORS_GPU:image_tensor"
  output_stream: "TENSORS_GPU:segmentation_tensor"
  input_side_packet: "CUSTOM_OP_RESOLVER:op_resolver"
  node_options: {
    [type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] {
      model_path: "mediapipe/models/hair_segmentation.tflite"
      use_gpu: true
    }
  }
}

# Decodes the segmentation tensor generated by the TensorFlow Lite model into a
# mask of values in [0.f, 1.f], stored in the R channel of a GPU buffer. It also
# takes the mask generated previously as another input to improve the temporal
# consistency.
node {
  calculator: "TfLiteTensorsToSegmentationCalculator"
  input_stream: "TENSORS_GPU:segmentation_tensor"
  input_stream: "PREV_MASK_GPU:previous_hair_mask"
  output_stream: "MASK_GPU:hair_mask_gpu"
  node_options: {
    [type.googleapis.com/mediapipe.TfLiteTensorsToSegmentationCalculatorOptions] {
      tensor_width: 512
      tensor_height: 512
      tensor_channels: 2
      combine_with_previous_ratio: 0.9
      output_layer_index: 1
    }
  }
}

node: {
  calculator: "ImageTransformationCalculator"
  input_stream: "IMAGE_GPU:hair_mask_gpu"
  input_side_packet: "ROTATION_DEGREES:output_rotation"
  input_side_packet: "FLIP_HORIZONTALLY:output_horizontally_flipped"
  input_side_packet: "FLIP_VERTICALLY:output_vertically_flipped"
  output_stream: "IMAGE_GPU:hair_mask_unrotated_gpu"
}

node: {
  calculator: "GpuBufferToImageFrameCalculator"
  input_stream: "hair_mask_unrotated_gpu"
  output_stream: "hair_mask"
}

# ---------------------------------------------------------------------------

node: {
  calculator: "ImageFrameToGpuBufferCalculator"
  input_stream: "throttled_input_video"
  output_stream: "throttled_input_video_gpu_mesh"
}

node: {
  calculator: "ImageTransformationCalculator"
  input_stream: "IMAGE_GPU:throttled_input_video_gpu_mesh"
  input_side_packet: "ROTATION_DEGREES:input_rotation"
  input_side_packet: "FLIP_HORIZONTALLY:input_horizontally_flipped"
  input_side_packet: "FLIP_VERTICALLY:input_vertically_flipped"
  output_stream: "IMAGE_GPU:transformed_input_video_mesh"
  node_options: {
    [type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] {
      flip_vertically: true,
      flip_horizontally: false
    }
  }
}

# Subgraph that runs holistic landmark detection (pose, face, and hands).
node {
  calculator: "HolisticLandmarkGpu"
  input_stream: "IMAGE:transformed_input_video_mesh"
  input_side_packet: "MODEL_COMPLEXITY:model_complexity"
  input_side_packet: "SMOOTH_LANDMARKS:smooth_landmarks"
  input_side_packet: "REFINE_FACE_LANDMARKS:refine_face_landmarks"
  input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation"
  input_side_packet: "SMOOTH_SEGMENTATION:smooth_segmentation"
  output_stream: "POSE_LANDMARKS:pose_landmarks"
  output_stream: "WORLD_LANDMARKS:pose_world_landmarks"
  output_stream: "SEGMENTATION_MASK:segmentation_mask_gpu"
  output_stream: "POSE_ROI:pose_roi"
  output_stream: "POSE_DETECTION:pose_detection"
  output_stream: "FACE_LANDMARKS:face_landmarks"
  output_stream: "LEFT_HAND_LANDMARKS:left_hand_landmarks"
  output_stream: "RIGHT_HAND_LANDMARKS:right_hand_landmarks"
}

node: {
  calculator: "ImageTransformationCalculator"
  input_stream: "IMAGE_GPU:segmentation_mask_gpu"
  input_side_packet: "ROTATION_DEGREES:output_rotation"
  input_side_packet: "FLIP_HORIZONTALLY:output_horizontally_flipped"
  input_side_packet: "FLIP_VERTICALLY:output_vertically_flipped"
  output_stream: "IMAGE_GPU:segmentation_mask_unrotated_gpu"
}

node: {
  calculator: "GpuBufferToImageFrameCalculator"
  input_stream: "segmentation_mask_unrotated_gpu"
  output_stream: "segmentation_mask"
}

But when I try to run the code, it throws an error when polling the landmarks with multiFaceLandmarksStream.TryGetNext(out var multiFaceLandmarks).

I get the following error message:

MediaPipeException: MediaPipe Aborted, refer glog files for more details
Mediapipe.MpReturnCodeExtension.Assert (Mediapipe.MpReturnCode code) (at Packages/com.github.homuler.mediapipe/Runtime/Scripts/PInvoke/MpReturnCode.cs:48)
Mediapipe.NormalizedLandmarkListVectorPacket.Get () (at Packages/com.github.homuler.mediapipe/Runtime/Scripts/Framework/Packet/NormalizedLandmarkListVectorPacket.cs:29)
Mediapipe.Unity.OutputStream`2[TPacket,TValue].TryGetNext (TValue& value, System.Int64 timestampThreshold, System.Boolean allowBlock) (at Packages/com.github.homuler.mediapipe/Runtime/Scripts/Unity/OutputStream.cs:229)
Mediapipe.Unity.OutputStream`2[TPacket,TValue].TryGetNext (TValue& value, System.Boolean allowBlock) (at Packages/com.github.homuler.mediapipe/Runtime/Scripts/Unity/OutputStream.cs:235)
Mediapipe.Unity.Tutorial.HairAndHolisticSolution+<Start>d__22.MoveNext () (at Assets/MediaPipeUnity/Tutorial/Official Solution/Scripts/HairAndHolisticSolution.cs:150)
UnityEngine.SetupCoroutine.InvokeMoveNext (System.Collections.IEnumerator enumerator, System.IntPtr returnValueAddress) (at /home/bokken/buildslave/unity/build/Runtime/Export/Scripting/Coroutines.cs:17)
UnityEngine.GUIUtility:ProcessEvent(Int32, IntPtr, Boolean&) (at /home/bokken/buildslave/unity/build/Modules/IMGUI/GUIUtility.cs:189)

Code to Reproduce the issue

No response

Additional Context

No response

homuler commented 2 years ago

MediaPipeException: MediaPipe Aborted, refer glog files for more details

So please open Editor.log and read the Glog messages. I think Glog outputs its log to stderr (Editor.log) by default, but if you can't find them, see https://github.com/homuler/MediaPipeUnityPlugin/wiki/Getting-Started#glog.
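
If the Glog messages don't show up in Editor.log, you can try forcing Glog to write to stderr before the graph is built, roughly like this (a sketch based on the wiki page above; please double-check the exact property names there):

// Before constructing the CalculatorGraph:
Glog.Logtostderr = true;               // send Glog output to stderr, which ends up in Editor.log
Glog.Initialize("MediaPipeUnityPlugin");

// ... build and run the graph as usual ...

// And when tearing everything down:
Glog.Shutdown();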

var multiFaceLandmarksStream = new OutputStream<NormalizedLandmarkListVectorPacket, List<NormalizedLandmarkList>>(_graph, "face_landmarks");

At the very least, the output type of face_landmarks is NormalizedLandmarkListPacket (a single list, not a vector of lists), so this line should be fixed:

// The output contains at most one face landmark list (not multiple like the Face Mesh solution).
var faceLandmarksStream = new OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList>(_graph, "face_landmarks");
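
The polling side then receives a single NormalizedLandmarkList instead of a list of lists, so the block in your Start() loop would look roughly like this (an untested sketch based on your existing code, with the foreach removed):

            if (faceLandmarksStream.TryGetNext(out var faceLandmarks))
            {
              if (faceLandmarks != null)
              {
                // top of the head
                var topOfHead = faceLandmarks.Landmark[10];

                // Collect the face-oval points and rebuild the mesh from the single list.
                var vectores = new NormalizedLandmark[_PointsMask.Length];
                for (var i = 0; i < _PointsMask.Length; i++)
                {
                  vectores[i] = faceLandmarks.Landmark[_PointsMask[i]];
                }
                generateMesh.Rebuild(vectores);
              }
            }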
jcelsi commented 2 years ago

That worked. Thanks a lot!