framereader: replace swscale with libyuv, reduce cpu usage by half (#22992)

* use libyuv
* cleanup

old-commit-hash: 5ae51745097e3245c541456029fbc0fcce63a3c3
2021-11-26 21:41:14 +08:00
parent 89fcaad057
commit e308bc0a3f
4 changed files with 27 additions and 51 deletions
--- a/selfdrive/camerad/SConscript
+++ b/selfdrive/camerad/SConscript
@@ -21,7 +21,7 @@ else:
    if USE_FRAME_STREAM:
      cameras = ['cameras/camera_frame_stream.cc']
    else:
-      libs += ['avutil', 'avcodec', 'avformat', 'swscale', 'bz2', 'ssl', 'curl', 'crypto']
+      libs += ['avutil', 'avcodec', 'avformat', 'bz2', 'ssl', 'curl', 'crypto']
      # TODO: import replay_lib from root SConstruct
      cameras = ['cameras/camera_replay.cc', 
        env.Object('camera-util', '#/selfdrive/ui/replay/util.cc'),
--- a/selfdrive/ui/SConscript
+++ b/selfdrive/ui/SConscript
@@ -116,7 +116,7 @@ if arch in ['x86_64', 'Darwin'] or GetOption('extras'):
  replay_lib_src = ["replay/replay.cc", "replay/camera.cc", "replay/filereader.cc", "replay/logreader.cc", "replay/framereader.cc", "replay/route.cc", "replay/util.cc"]

  replay_lib = qt_env.Library("qt_replay", replay_lib_src, LIBS=base_libs)
-  replay_libs = [replay_lib, 'avutil', 'avcodec', 'avformat', 'bz2', 'curl', 'swscale', 'yuv'] + qt_libs
+  replay_libs = [replay_lib, 'avutil', 'avcodec', 'avformat', 'bz2', 'curl', 'yuv'] + qt_libs
  qt_env.Program("replay/replay", ["replay/main.cc"], LIBS=replay_libs)

  qt_env.Program("watch3", ["watch3.cc"], LIBS=qt_libs + ['common', 'json11'])
--- a/selfdrive/ui/replay/framereader.cc
+++ b/selfdrive/ui/replay/framereader.cc
@@ -34,8 +34,6 @@ enum AVPixelFormat get_hw_format(AVCodecContext *ctx, const enum AVPixelFormat *
 }  // namespace

 FrameReader::FrameReader(bool local_cache, int chunk_size, int retries) : FileReader(local_cache, chunk_size, retries) {
-  input_ctx = avformat_alloc_context();
-  sws_frame.reset(av_frame_alloc());
 }

 FrameReader::~FrameReader() {
@@ -47,9 +45,6 @@ FrameReader::~FrameReader() {
  if (input_ctx) avformat_close_input(&input_ctx);
  if (hw_device_ctx) av_buffer_unref(&hw_device_ctx);

-  if (rgb_sws_ctx_) sws_freeContext(rgb_sws_ctx_);
-  if (yuv_sws_ctx_) sws_freeContext(yuv_sws_ctx_);
-
  if (avio_ctx_) {
    av_freep(&avio_ctx_->buffer);
    avio_context_free(&avio_ctx_);
@@ -60,6 +55,9 @@ bool FrameReader::load(const std::string &url, bool no_cuda, std::atomic<bool> *
  std::string content = read(url, abort);
  if (content.empty()) return false;

+  input_ctx = avformat_alloc_context();
+  if (!input_ctx) return false;
+
  struct buffer_data bd = {
    .data = (uint8_t *)content.data(),
    .offset = 0,
@@ -99,18 +97,11 @@ bool FrameReader::load(const std::string &url, bool no_cuda, std::atomic<bool> *
  if (!no_cuda) {
    if (!initHardwareDecoder(AV_HWDEVICE_TYPE_CUDA)) {
      printf("No CUDA capable device was found. fallback to CPU decoding.\n");
+    } else {
+      nv12toyuv_buffer.resize(getYUVSize());
    }
  }

-  rgb_sws_ctx_ = sws_getContext(decoder_ctx->width, decoder_ctx->height, sws_src_format,
-                                width, height, AV_PIX_FMT_BGR24,
-                                SWS_BILINEAR, NULL, NULL, NULL);
-  if (!rgb_sws_ctx_) return false;
-  yuv_sws_ctx_ = sws_getContext(decoder_ctx->width, decoder_ctx->height, sws_src_format,
-                                width, height, AV_PIX_FMT_YUV420P,
-                                SWS_BILINEAR, NULL, NULL, NULL);
-  if (!yuv_sws_ctx_) return false;
-
  ret = avcodec_open2(decoder_ctx, decoder, NULL);
  if (ret < 0) return false;

@@ -149,17 +140,6 @@ bool FrameReader::initHardwareDecoder(AVHWDeviceType hw_device_type) {
    return false;
  }

-  // get sws source format
-  AVHWFramesConstraints *hw_frames_const = av_hwdevice_get_hwframe_constraints(hw_device_ctx, nullptr);
-  assert(hw_frames_const != 0);
-  for (AVPixelFormat *p = hw_frames_const->valid_sw_formats; *p != AV_PIX_FMT_NONE; p++) {
-    if (sws_isSupportedInput(*p)) {
-      sws_src_format = *p;
-      break;
-    }
-  }
-  av_hwframe_constraints_free(&hw_frames_const);
-
  decoder_ctx->hw_device_ctx = av_buffer_ref(hw_device_ctx);
  decoder_ctx->opaque = &hw_pix_fmt;
  decoder_ctx->get_format = get_hw_format;
@@ -228,27 +208,26 @@ AVFrame *FrameReader::decodeFrame(AVPacket *pkt) {
 }

 bool FrameReader::copyBuffers(AVFrame *f, uint8_t *rgb, uint8_t *yuv) {
-  if (yuv) {
-    if (sws_src_format == AV_PIX_FMT_NV12) {
-      // libswscale crash if height is not 16 bytes aligned for NV12->YUV420 conversion
-      assert(sws_src_format == AV_PIX_FMT_NV12);
+  if (hw_pix_fmt == AV_PIX_FMT_CUDA) {
+    uint8_t *y = yuv ? yuv : nv12toyuv_buffer.data();
+    uint8_t *u = y + width * height;
+    uint8_t *v = u + (width / 2) * (height / 2);
+    libyuv::NV12ToI420(f->data[0], f->linesize[0], f->data[1], f->linesize[1],
+                       y, width, u, width / 2, v, width / 2, width, height);
+    libyuv::I420ToRGB24(y, width, u, width / 2, v, width / 2,
+                        rgb, width * 3, width, height);
+  } else {
+    if (yuv) {
      uint8_t *u = yuv + width * height;
      uint8_t *v = u + (width / 2) * (height / 2);
-      libyuv::NV12ToI420(f->data[0], f->linesize[0],
-                         f->data[1], f->linesize[1],
-                         yuv, width,
-                         u, width / 2,
-                         v, width / 2,
-                         width, height);
-    } else {
-      av_image_fill_arrays(sws_frame->data, sws_frame->linesize, yuv, AV_PIX_FMT_YUV420P, width, height, 1);
-      int ret = sws_scale(yuv_sws_ctx_, (const uint8_t **)f->data, f->linesize, 0, f->height, sws_frame->data, sws_frame->linesize);
-      if (ret < 0) return false;
+      memcpy(yuv, f->data[0], width * height);
+      memcpy(u, f->data[1], width / 2 * height / 2);
+      memcpy(v, f->data[2], width / 2 * height / 2);
    }
+    libyuv::I420ToRGB24(f->data[0], f->linesize[0],
+                        f->data[1], f->linesize[1],
+                        f->data[2], f->linesize[2],
+                        rgb, width * 3, width, height);
  }
-
-  // images is going to be written to output buffers, no alignment (align = 1)
-  av_image_fill_arrays(sws_frame->data, sws_frame->linesize, rgb, AV_PIX_FMT_BGR24, width, height, 1);
-  int ret = sws_scale(rgb_sws_ctx_, (const uint8_t **)f->data, f->linesize, 0, f->height, sws_frame->data, sws_frame->linesize);
-  return ret >= 0;
+  return true;
 }
--- a/selfdrive/ui/replay/framereader.h
+++ b/selfdrive/ui/replay/framereader.h
@@ -9,8 +9,6 @@
 extern "C" {
 #include <libavcodec/avcodec.h>
 #include <libavformat/avformat.h>
-#include <libswscale/swscale.h>
-#include <libavutil/imgutils.h>
 }

 struct AVFrameDeleter {
@@ -42,9 +40,7 @@ private:
    bool failed = false;
  };
  std::vector<Frame> frames_;
-  AVPixelFormat sws_src_format = AV_PIX_FMT_YUV420P;
-  SwsContext *rgb_sws_ctx_ = nullptr, *yuv_sws_ctx_ = nullptr;
-  std::unique_ptr<AVFrame, AVFrameDeleter>av_frame_, sws_frame, hw_frame;
+  std::unique_ptr<AVFrame, AVFrameDeleter>av_frame_, hw_frame;
  AVFormatContext *input_ctx = nullptr;
  AVCodecContext *decoder_ctx = nullptr;
  int key_frames_count_ = 0;
@@ -53,4 +49,5 @@ private:

  AVPixelFormat hw_pix_fmt = AV_PIX_FMT_NONE;
  AVBufferRef *hw_device_ctx = nullptr;
+  std::vector<uint8_t> nv12toyuv_buffer;
 };