[BUG] env["CUDA_VISIBLE_DEVICES"] = "0" doesn't work

zackees commented 2 months ago

In the following function setting the CUDA_VISIBLE_DEVICES does not work. All work goes to GPU 1 (no matter what combination of settings I use), which is my smallest graphics card of the two that I have.

It's notable that running parallel rembg on partitioned folders is way faster than running all of them through just one, despite the fact it's going to just one GPU. It appears that there's a lot of computation that is being done on the CPU that is delaying the content being sent to the GPU so that when two jobs run, there is greater utilization of the CPU.

I'm not sure what is necessary to make CUDA_VISIBLE_DEVICES work with rembg. But I hope that this can be specified so that at least I can run my faster graphics card (3060) instead of 1060.

def video_remove_background(
    video_path: Path,
    output_dir: Path,
    bitrate_megs: float,
    output_height: Optional[int] = None,
    fps_override: Optional[float] = None,
    model: str = MODEL,
    keep_files: bool = False,
    exposed_gpus: Optional[list[int]] = None,
    num_jobs: Optional[int] = None,
) -> None:
    output_dir.mkdir(parents=True, exist_ok=True)
    vidinfo: VidInfo = get_video_info(video_path)
    print(f"Video dimensions: {vidinfo.width}x{vidinfo.height}")
    cmd = f"static_ffmpeg -hide_banner -y -i {video_path} {output_dir}/%07d.png"
    rtn = os.system(cmd)
    if rtn != 0:
        raise OSError("Error converting video to images")
    print(f"Images saved to {output_dir}")

    if num_jobs is None:
        if exposed_gpus is None:
            num_jobs = 1
        else:
            num_jobs = len(exposed_gpus)

    if exposed_gpus is None or len(exposed_gpus) == 1:
        final_output_dir = output_dir / "video"
        final_output_dir.mkdir(parents=True, exist_ok=True)
        cmd = f'rembg p -a -ae 15 --post-process-mask -m {model} "{output_dir}" "{final_output_dir}"'
        print(f"Running: {cmd}")
        os.system(cmd)
        print(f"Images with background removed saved to {final_output_dir}")
    else:
        # Split the images into subfolders for parallel processing
        img_files = list(output_dir.glob("*.png"))
        img_files.sort()
        chunk_size = (len(img_files) + num_jobs - 1) // num_jobs
        img_chunks = [
            img_files[i : i + chunk_size] for i in range(0, len(img_files), chunk_size)
        ]

        def process_chunk(chunk, gpu_id, job_id):
            chunk_dir = output_dir / str(job_id)
            chunk_dir.mkdir(parents=True, exist_ok=True)
            for img in chunk:
                shutil.move(str(img), str(chunk_dir / img.name))

            final_output_dir = chunk_dir / "video"
            final_output_dir.mkdir(parents=True, exist_ok=True)
            env = {}
            env["NVIDIA_VISIBLE_DEVICES"] = str(gpu_id)
            env["CUDA_VISIBLE_DEVICES"] = "0"
            new_env = env.copy()
            env.update(os.environ)
            cmd = f'rembg p -a -ae 15 --post-process-mask -m {model} "{chunk_dir}" "{final_output_dir}"'
            print(f"Running: {cmd}, with updated environment: {new_env}")
            subprocess.run(cmd, shell=True, env=env, check=True)
            print(f"Images with background removed saved to {final_output_dir}")
            print()

        with concurrent.futures.ThreadPoolExecutor(max_workers=num_jobs) as executor:
            futures = [
                executor.submit(process_chunk, chunk, exposed_gpus[job_id % len(exposed_gpus)], job_id)
                for job_id, chunk in enumerate(img_chunks)
            ]
            concurrent.futures.wait(futures)

        # Merge the processed images back into the main video directory
        final_output_dir = output_dir / "video"
        final_output_dir.mkdir(parents=True, exist_ok=True)
        for gpu_id in range(num_jobs):
            chunk_output_dir = output_dir / str(gpu_id) / "video"
            for img in chunk_output_dir.glob("*.png"):
                shutil.move(str(img), str(final_output_dir / img.name))

    fps: float = fps_override if fps_override else vidinfo.fps
    out_vid_path = Path(str(video_path.with_suffix("")) + f"-nobg-{model}.webm")
    filter_stmt = ""
    if output_height is not None:
        filter_stmt = f'-vf "scale=-1:{output_height}"'
    cmd = (
        f"static_ffmpeg -hide_banner -y -framerate {vidinfo.fps}"
        f' -i "{final_output_dir}/%07d.png" {filter_stmt} -c:v libvpx-vp9 -b:v {bitrate_megs}M'
        f' -auto-alt-ref 0 -pix_fmt yuva420p -an -r {fps} "{out_vid_path}"'
    )
    print(f"Running: {cmd}")
    rtn = os.system(cmd)
    if rtn != 0:
        raise OSError("Error converting images to video")

    # Command to merge the audio from the original video with the new video
    final_output_path = Path(str(video_path.with_suffix("")) + "-nobackground.webm")
    print(f"Mixing audio from {video_path} into {final_output_path}")
    cmd = (
        f'static_ffmpeg -hide_banner -y -i "{out_vid_path}" -i "{video_path}"'
        f' -c:v copy -c:a libvorbis -map 0:v:0 -map 1:a:0 "{final_output_path}"'
    )
    print(f"Running: {cmd}")
    rtn = os.system(cmd)
    if rtn != 0:
        raise OSError("Error merging video and audio")

    # Delete intermediate files if --keep-files is not set
    if not keep_files:
        shutil.rmtree(output_dir, ignore_errors=True)
        os.remove(out_vid_path)

zackees commented 2 months ago

Update: I don't see any GPU usage at all going on with this package. However, I get a 2x increase when I simply partitioned the source image directory into two and run rembg p on both.

zackees commented 2 months ago

Yeah, the one runtime is not running in gpu mode

danielgatis / rembg

[BUG] env["CUDA_VISIBLE_DEVICES"] = "0" doesn't work #626