When the encoder of v4l2 uses the cbr mode to encode H264, the frame rate is low[BUG]

q5270114 commented 1 year ago

each frame consumes approximately 100ms.

respberry pi 4b. os:

this is my encode code :

//
// Created by admin on 2022/11/22.
//
#ifdef ENABLE_V4L2_HARDWARE_ENCODE
#include "V4l2Encoder.h"
#include "spdlog/spdlog.h"
int xioctl(int fd, unsigned long ctl, void *arg)
{
    int ret, num_tries = 10;
    do
    {
        ret = ioctl(fd, ctl, arg);
    } while (ret == -1 && errno == EINTR && num_tries-- > 0);
    return ret;
}

V4l2Encoder::V4l2Encoder(int width, int height, long bitrate, int fps, Callback cb) {
    this -> cb = cb;
    const char device_name[] = "/dev/video11";
    fd_ = open(device_name, O_RDWR, 0);
    if (fd_ < 0)
        throw std::runtime_error("failed to open V4L2 H264 encoder");
    spdlog::info("Opened V4L2 H264Encoder on {}  as fd {}", device_name, fd_);

    v4l2_control ctrl = {};
    // set bitrate mode
    ctrl.id = V4L2_CID_MPEG_VIDEO_BITRATE_MODE;
    ctrl.value = V4L2_MPEG_VIDEO_BITRATE_MODE_CBR;
    if (xioctl(fd_, VIDIOC_S_CTRL, &ctrl) < 0)
        throw std::runtime_error("failed to set bitrate");

    // set bitrate
    if (bitrate) {
        ctrl.id = V4L2_CID_MPEG_VIDEO_BITRATE;
        ctrl.value = bitrate;
        if (xioctl(fd_, VIDIOC_S_CTRL, &ctrl) < 0)
            throw std::runtime_error("failed to set bitrate");
    }

    // set profile
    ctrl.id = V4L2_CID_MPEG_VIDEO_H264_PROFILE;
    ctrl.value = V4L2_MPEG_VIDEO_H264_PROFILE_BASELINE;
    if (xioctl(fd_, VIDIOC_S_CTRL, &ctrl) < 0)
        throw std::runtime_error("failed to set profile");

    // set level
    ctrl.id = V4L2_CID_MPEG_VIDEO_H264_LEVEL;
    ctrl.value = V4L2_MPEG_VIDEO_H264_LEVEL_4_2;
    if (xioctl(fd_, VIDIOC_S_CTRL, &ctrl) < 0)
        throw std::runtime_error("failed to set level");

    // set i frame period
    ctrl.id = V4L2_CID_MPEG_VIDEO_H264_I_PERIOD;
    ctrl.value = fps;
    if (xioctl(fd_, VIDIOC_S_CTRL, &ctrl) < 0)
        throw std::runtime_error("failed to set intra period");

    // set seq header
    ctrl.id = V4L2_CID_MPEG_VIDEO_REPEAT_SEQ_HEADER;
    ctrl.value = 1;
    if (xioctl(fd_, VIDIOC_S_CTRL, &ctrl) < 0)
        throw std::runtime_error("failed to set inline headers");

    // set input frame fmt
    v4l2_format fmt = {};
    fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
    fmt.fmt.pix_mp.width = width;
    fmt.fmt.pix_mp.height = height;
    // We assume YUV420 here, but it would be nice if we could do something
    // like info.pixel_format.toV4L2Fourcc();
    fmt.fmt.pix_mp.pixelformat = V4L2_PIX_FMT_YUV420;
    fmt.fmt.pix_mp.plane_fmt[0].bytesperline = width; // stride
    fmt.fmt.pix_mp.field = V4L2_FIELD_ANY;
    fmt.fmt.pix_mp.colorspace = V4L2_COLORSPACE_SMPTE170M;
    fmt.fmt.pix_mp.num_planes = 1;
    if (xioctl(fd_, VIDIOC_S_FMT, &fmt) < 0)
        throw std::runtime_error("failed to set output format");

    v4l2_format outFmt = {};
    outFmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
    outFmt.fmt.pix_mp.width = width;
    outFmt.fmt.pix_mp.height = height;
    outFmt.fmt.pix_mp.pixelformat = V4L2_PIX_FMT_H264;
    outFmt.fmt.pix_mp.field = V4L2_FIELD_ANY;
    outFmt.fmt.pix_mp.colorspace = V4L2_COLORSPACE_DEFAULT;
    outFmt.fmt.pix_mp.num_planes = 1;
    outFmt.fmt.pix_mp.plane_fmt[0].bytesperline = 0;
    outFmt.fmt.pix_mp.plane_fmt[0].sizeimage = 512 << 10;
    if (xioctl(fd_, VIDIOC_S_FMT, &outFmt) < 0)
        throw std::runtime_error("failed to set capture format");

    struct v4l2_streamparm parm = {};
    parm.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
    parm.parm.output.timeperframe.numerator = 1000 / (double) fps; // input frame rate
    parm.parm.output.timeperframe.denominator = 1000;
    if (xioctl(fd_, VIDIOC_S_PARM, &parm) < 0)
        throw std::runtime_error("failed to set streamparm");

    v4l2_requestbuffers reqbufs = {};
    reqbufs.count = NUM_OUTPUT_BUFFERS;
    reqbufs.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
    reqbufs.memory = V4L2_MEMORY_MMAP;
    if (xioctl(fd_, VIDIOC_REQBUFS, &reqbufs) < 0)
        throw std::runtime_error("request for output buffers failed");
    spdlog::info("Got {} output buffers", reqbufs.count);

    for (unsigned int i = 0; i < reqbufs.count; i++) {
        input_buffers_available_.push(i);
        v4l2_plane planes[VIDEO_MAX_PLANES];
        v4l2_buffer buffer = {};
        buffer.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
        buffer.memory = V4L2_MEMORY_MMAP;
        buffer.index = i;
        buffer.length = 1;
        buffer.m.planes = planes;
        if (xioctl(fd_, VIDIOC_QUERYBUF, &buffer) < 0)
            throw std::runtime_error("failed to output query buffer " + std::to_string(i));
        buffers_out[i].mem = mmap(0, buffer.m.planes[0].length, PROT_READ | PROT_WRITE, MAP_SHARED, fd_,
                                  buffer.m.planes[0].m.mem_offset);
        if (buffers_out[i].mem == MAP_FAILED)
            throw std::runtime_error("failed to mmap output buffer " + std::to_string(i));
        buffers_out[i].size = buffer.m.planes[0].length;
//        if (xioctl(fd_, VIDIOC_DQBUF, &buffer) < 0)
//            throw std::runtime_error("failed to queue output buffer " + std::to_string(i));
    }

    reqbufs = {};
    reqbufs.count = NUM_CAPTURE_BUFFERS;
    reqbufs.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
    reqbufs.memory = V4L2_MEMORY_MMAP;
    if (xioctl(fd_, VIDIOC_REQBUFS, &reqbufs) < 0) {
        throw std::runtime_error("request for capture buffers failed");
    }
    spdlog::info("Got {} capture buffers", reqbufs.count);
    num_capture_buffers_ = reqbufs.count;

    for (unsigned int i = 0; i < reqbufs.count; i++) {
        v4l2_plane planes[VIDEO_MAX_PLANES];
        v4l2_buffer buffer = {};
        buffer.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
        buffer.memory = V4L2_MEMORY_MMAP;
        buffer.index = i;
        buffer.length = 1;
        buffer.m.planes = planes;
        if (xioctl(fd_, VIDIOC_QUERYBUF, &buffer) < 0)
            throw std::runtime_error("failed to capture query buffer " + std::to_string(i));
        buffers_[i].mem = mmap(0, buffer.m.planes[0].length, PROT_READ | PROT_WRITE, MAP_SHARED, fd_,
                               buffer.m.planes[0].m.mem_offset);
        if (buffers_[i].mem == MAP_FAILED)
            throw std::runtime_error("failed to mmap capture buffer " + std::to_string(i));
        buffers_[i].size = buffer.m.planes[0].length;
        // Whilst we're going through all the capture buffers, we may as well queue
        // them ready for the encoder to write into.
        if (xioctl(fd_, VIDIOC_QBUF, &buffer) < 0)
            throw std::runtime_error("failed to queue capture buffer " + std::to_string(i));
    }

    v4l2_buf_type type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
    if (xioctl(fd_, VIDIOC_STREAMON, &type) < 0)
        throw std::runtime_error("failed to start output streaming");
    type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;

    if (xioctl(fd_, VIDIOC_STREAMON, &type) < 0)
        throw std::runtime_error("failed to start capture streaming");
    std::cout << "V4L2 Codec streaming started" << std::endl;

    outputThread = std::thread(std::bind(&V4l2Encoder::outputFun, this));
    pollThread = std::thread(std::bind(&V4l2Encoder::pollFun, this));
}

V4l2Encoder::~V4l2Encoder() {
    abortPoll_ = true;
    pollThread.join();
    abortOutput_ = true;
    outputThread.join();

    // Turn off streaming on both the output and capture queues, and "free" the
    // buffers that we requested. The capture ones need to be "munmapped" first.

    v4l2_buf_type type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
    if (xioctl(fd_, VIDIOC_STREAMOFF, &type) < 0)
        std::cout << "V4L2 Failed to stop output streaming" << std::endl;
    type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
    if (xioctl(fd_, VIDIOC_STREAMOFF, &type) < 0)
        std::cout << "V4L2 Failed to stop capture streaming" << std::endl;

    v4l2_requestbuffers reqbufs = {};
    reqbufs.count = 0;
    reqbufs.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
    reqbufs.memory = V4L2_MEMORY_MMAP;
    if (xioctl(fd_, VIDIOC_REQBUFS, &reqbufs) < 0)
        std::cout << "V4L2 Request to free output buffers failed" << std::endl;

    for (int i = 0; i < num_capture_buffers_; i++)
        if (munmap(buffers_[i].mem, buffers_[i].size) < 0)
            std::cout << "V4L2 Failed to unmap buffer" << std::endl;
    reqbufs = {};
    reqbufs.count = 0;
    reqbufs.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
    reqbufs.memory = V4L2_MEMORY_MMAP;
    if (xioctl(fd_, VIDIOC_REQBUFS, &reqbufs) < 0)
        std::cout << "V4L2 Request to free capture buffers failed" << std::endl;

    close(fd_);
    std::cout << "V4L2 Encoder closed" << std::endl;
}

void V4l2Encoder::encode(uint8_t * data, size_t size, unsigned long timestamp_us) {
    int index;
    {
        std::lock_guard<std::mutex> lock(input_buffers_available_mutex_);
        if (input_buffers_available_.empty()) {
            // throw std::runtime_error("no buffers available to queue codec input");
            spdlog::warn("no buffers available to queue codec input");
            return;
        }
        index = input_buffers_available_.front();
        input_buffers_available_.pop();
        spdlog::debug("encode - output buf index={}", index);
    }
    v4l2_buffer buf = {};
    v4l2_plane planes[VIDEO_MAX_PLANES] = {};
    buf.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
    buf.index = index;
    buf.field = V4L2_FIELD_NONE;
    buf.memory = V4L2_MEMORY_MMAP;
    buf.length = 1;
    buf.timestamp.tv_sec = timestamp_us / 1000000;
    buf.timestamp.tv_usec = timestamp_us % 1000000;
    memcpy(buffers_out[index].mem, data, size);
    buf.m.planes = planes;
    buf.m.planes[0].bytesused = size;
    buf.m.planes[0].length = size;
    if (xioctl(fd_, VIDIOC_QBUF, &buf) < 0)
        throw std::runtime_error("failed to queue input to codec");
}

void V4l2Encoder::outputFun() {
    OutputItem item;
    while (true) {
        {
            std::unique_lock<std::mutex> lock(output_mutex_);
            while (true)
            {
                // Must check the abort first, to allow items in the output
                // queue to have a callback.
                if (abortOutput_ && output_queue_.empty())
                    return;

                if (!output_queue_.empty())
                {
                    item = output_queue_.front();
                    output_queue_.pop();
                    break;
                }
                else
                    output_cond_var_.wait_for(lock, std::chrono::milliseconds (200));
            }
        }

        // call back encoded data.
        cb((uint8_t*)item.mem, item.bytes_used, item.timestamp_us, item.keyframe);

//        spdlog::debug("item.bytes_used={}, item.timestamp_us={}, item.keyframe={}", item.bytes_used , item.timestamp_us , item.keyframe);
        v4l2_buffer buf = {};
        v4l2_plane planes[VIDEO_MAX_PLANES] = {};
        buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
        buf.memory = V4L2_MEMORY_MMAP;
        buf.index = item.index;
        buf.length = 1;
        buf.m.planes = planes;
        buf.m.planes[0].bytesused = 0;
        buf.m.planes[0].length = item.length;
        if (xioctl(fd_, VIDIOC_QBUF, &buf) < 0)
            throw std::runtime_error("failed to re-queue encoded buffer");
    }
}

void V4l2Encoder::pollFun() {
    while (true) {
        pollfd p = { fd_, POLLIN, 0 };
        int ret = poll(&p, 1, 200);
        {
            std::lock_guard<std::mutex> lock(input_buffers_available_mutex_);
            if (abortPoll_ && input_buffers_available_.size() == NUM_OUTPUT_BUFFERS)
                break;
        }
        if (ret == -1)
        {
            if (errno == EINTR)
                continue;
            throw std::runtime_error("unexpected errno " + std::to_string(errno) + " from poll");
        }

        if (p.revents & POLLIN)
        {
            v4l2_buffer buf = {};
            v4l2_plane planes[VIDEO_MAX_PLANES] = {};
            buf.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
            buf.memory = V4L2_MEMORY_MMAP;
            buf.length = 1;
            buf.m.planes = planes;
            int ret = xioctl(fd_, VIDIOC_DQBUF, &buf);
            if (ret == 0)
            {
                // Return this to the caller, first noting that this buffer, identified
                // by its index, is available for queueing up another frame.
                {
                    std::lock_guard<std::mutex> lock(input_buffers_available_mutex_);
                    input_buffers_available_.push(buf.index);
                }
                // input_done_callback_(nullptr);
            }

            buf = {};
            memset(planes, 0, sizeof(planes));
            buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
            buf.memory = V4L2_MEMORY_MMAP;
            buf.length = 1;
            buf.m.planes = planes;
            ret = xioctl(fd_, VIDIOC_DQBUF, &buf);
            if (ret == 0)
            {

                // We push this encoded buffer to another thread so that our
                // application can take its time with the data without blocking the
                // encode process.
                int64_t timestamp_us = (buf.timestamp.tv_sec * (int64_t)1000000) + buf.timestamp.tv_usec;
                OutputItem item = { buffers_[buf.index].mem,
                                    buf.m.planes[0].bytesused,
                                    buf.m.planes[0].length,
                                    buf.index,
                                    !!(buf.flags & V4L2_BUF_FLAG_KEYFRAME),
                                    timestamp_us };
                std::lock_guard<std::mutex> lock(output_mutex_);
                output_queue_.push(item);
                output_cond_var_.notify_one();
            }
        }
    }
}

void V4l2Encoder::setBitrate(long bitrate) {
    v4l2_control ctrl = {};
    // set bitrate
    if (bitrate) {
        ctrl.id = V4L2_CID_MPEG_VIDEO_BITRATE;
        ctrl.value = bitrate;
        if (xioctl(fd_, VIDIOC_S_CTRL, &ctrl) < 0)
            throw std::runtime_error("failed to set bitrate");
    }
}
#endif//ENABLE_V4L2_HARDWARE_ENCODE

6by9 commented 1 year ago

Your quoted code is insufficient for anyone to reproduce your test case. You've given no information as to what resolution you're asking to encode for a start.

The 2 queues should normally be treated independently, not with DQBUF being called on both based on the one event. Due to pipelining and internal buffering the input frame is finished with quite a while before the encoded frame is ready. At 1080p, memory says that the full frame encode takes around 45ms.

q5270114 commented 1 year ago

Sorry, my test case is based on 1080P encoding.When I set the encoder to VBR mode, this test case can encode H264 videos with 1080p close to 60fps. Do you mean that 1080p requires 45ms per frame in CBR mode? So, in CBR mode, the encoder can only encode 1080P 22FPS, right?

6by9 commented 1 year ago

Do you mean that 1080p requires 45ms per frame in CBR mode? So, in CBR mode, the encoder can only encode 1080P 22FPS, right?

No, it is pipelined. The hardware need images in a particular format, so the first stage is an image conversion. The second stage is generating motion estimation values for all potential options. The third is the choice of motion vector The fourth is entropy coding (CAVLC or CABAC)

Memory says that steps 2&3 are done as one job, but that still leaves a potential pipeline of 3 frames. Max frame rate is the time taken for the longest of the steps, not the total. What does fail badly with a pipelined codec is feeding one frame in and waiting for the output frame before feeding the next one in.

You've created this as an issue under libcamera-apps. Are you actually using libcamera-apps, or just the V4L2 encoder? If libcamera-apps, why the need to create your own wrapper around the H264 encoder?

q5270114 commented 1 year ago

Thanks for your reply, we have a requirement to use CBR mode hard encoder encoded h264 (1080P 30FPS) live broadcast, and we need to adjust the bit rate in real time. Since there are few hard coding data about raspberry pie, I can only choose to modify the examples in libcamera apps to support our own business implementation. Do you have any good suggestions for me to modify my test cases to achieve this goal? By the way, when I use the ffmpeg command line tool to code, the hard encoder's encoding speed is relatively fast, and the bit rate control is very stable

q5270114 commented 1 year ago

Do you mean that 1080p requires 45ms per frame in CBR mode? So, in CBR mode, the encoder can only encode 1080P 22FPS, right?

No, it is pipelined. The hardware need images in a particular format, so the first stage is an image conversion. The second stage is generating motion estimation values for all potential options. The third is the choice of motion vector The fourth is entropy coding (CAVLC or CABAC)

Memory says that steps 2&3 are done as one job, but that still leaves a potential pipeline of 3 frames. Max frame rate is the time taken for the longest of the steps, not the total. What does fail badly with a pipelined codec is feeding one frame in and waiting for the output frame before feeding the next one in.

You've created this as an issue under libcamera-apps. Are you actually using libcamera-apps, or just the V4L2 encoder? If libcamera-apps, why the need to create your own wrapper around the H264 encoder?

raspberrypi / rpicam-apps

When the encoder of v4l2 uses the cbr mode to encode H264, the frame rate is low[BUG] #509