From 1f332405495f87951da89642d6ba845dea29268c Mon Sep 17 00:00:00 2001
From: Vincent Davis Jr
Date: Thu, 8 Dec 2022 11:22:01 -0600
Subject: [PATCH] rpidistro-ffmpeg: upgrade 4.3.2 -> 4.3.4

Upgrades ffmpeg to version 4.3.4.
* Not upgrading to 4.3.5 because all of the ported raspberrypi team
  patches may not be included in that version/commit.
* SRCREV set to 246e1a55a0eca931537d8706acd8b133c07beb05

Updates to PACKAGECONFIG (see the sketch after the patch list below):
* Only include the --enable-opengl flag when opengl is set in
  DISTRO_FEATURES.
* Add the new flag --enable-epoxy, required by vout-egl.
* vout-egl requires both libepoxy and x11, so only enable vout-egl if
  x11 is contained in DISTRO_FEATURES.
* The remaining RPI-Distro related flags added through patches are only
  enabled if vc4graphics is disabled and userland graphics are enabled,
  in an attempt to keep the ffmpeg ./configure generic unless specified
  otherwise.

Removes TARGET_CFLAGS:append, as the include flags are set in
./configure via the 2001-configure-setup-for-OE-core-usage.patch patch.

Replaces the existing patches with the updated patches used in the
actual commit.

Adds four new patches to fix ./configure, compile, and runtime bugs.

PATCHES:
- 2001-configure-setup-for-OE-core-usage.patch
  * The ./configure stage fails if neither x11 nor wayland is defined in
    DISTRO_FEATURES. When opengl is enabled, ./configure checks for the
    relevant headers. The last header it checks for is ES2/gl.h, which
    doesn't exist; neither do the others if certain parameters are not
    met. The patch adds a check for GLES2/gl2.h, which does exist, and
    we utilize GLESv2 to compile and link with. The patch also replaces
    where the compiler finds the mmal and omx headers and libs.
- 2002-libavdevice-opengl_enc-update-dynamic-function-loader.patch
  * After the configure stage succeeds, the compile stage fails because
    SelectedGetProcAddress isn't defined; it can't be defined if x11
    isn't enabled. The patch defines SelectedGetProcAddress as
    SDL_GL_GetProcAddress when x11 is not enabled but sdl2 is. If
    neither sdl2 nor x11 is enabled, the patch resolves the GL function
    pointers at compile time instead of loading them dynamically at
    runtime.
- 2003-libavcodec-fix-v4l2_req_devscan.patch
  * The function declarations in v4l2_req_devscan.h differed from the
    function implementations in v4l2_req_devscan.c.
- 2004-libavcodec-omx-replace-opt-vc-path-with-usr-lib.patch
  * Fixes where libbcm_host.so and libopenmaxil.so are loaded from.
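As a rough sketch of the DISTRO_FEATURES-driven PACKAGECONFIG handling
described above (illustrative only; the entry names, flags, and
dependency lists here are assumptions, not the exact lines added to
rpidistro-ffmpeg_4.3.4.bb):

    # Illustrative only: select config entries from DISTRO_FEATURES.
    PACKAGECONFIG ??= " \
        ${@bb.utils.contains('DISTRO_FEATURES', 'opengl', 'opengl', '', d)} \
        ${@bb.utils.contains('DISTRO_FEATURES', 'x11', 'vout-egl', '', d)} \
    "
    # Each entry maps to its ./configure flags and build dependencies
    # (names below are assumptions for this sketch).
    PACKAGECONFIG[opengl]   = "--enable-opengl,--disable-opengl,virtual/libgl"
    PACKAGECONFIG[vout-egl] = "--enable-vout-egl --enable-epoxy,--disable-vout-egl,libepoxy virtual/libx11"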
Signed-off-by: Vincent Davis Jr --- ...nc-avoid-callee-preserved-vfp-regist.patch | 40 +- .../0002-Fix-build-on-powerpc-and-ppc64.patch | 7 +- ...c-remove-monowhite-from-apng-formats.patch | 8 +- ...0.patch => 0004-ffmpeg-4.3.4-rpi_14.patch} | 11120 +++++++++++++--- ...005-fix_flags.diff => 0005-fix-flags.diff} | 7 +- ...01-configure-setup-for-OE-core-usage.patch | 82 + ...l_enc-update-dynamic-function-loader.patch | 111 + ...2003-libavcodec-fix-v4l2_req_devscan.patch | 45 + ...omx-replace-opt-vc-path-with-usr-lib.patch | 35 + ...peg_4.3.2.bb => rpidistro-ffmpeg_4.3.4.bb} | 35 +- 10 files changed, 9774 insertions(+), 1716 deletions(-) rename recipes-multimedia/rpidistro-ffmpeg/files/{0004-ffmpeg-4.3.2-rpi_10.patch => 0004-ffmpeg-4.3.4-rpi_14.patch} (84%) rename recipes-multimedia/rpidistro-ffmpeg/files/{0005-fix_flags.diff => 0005-fix-flags.diff} (89%) create mode 100644 recipes-multimedia/rpidistro-ffmpeg/files/2001-configure-setup-for-OE-core-usage.patch create mode 100644 recipes-multimedia/rpidistro-ffmpeg/files/2002-libavdevice-opengl_enc-update-dynamic-function-loader.patch create mode 100644 recipes-multimedia/rpidistro-ffmpeg/files/2003-libavcodec-fix-v4l2_req_devscan.patch create mode 100644 recipes-multimedia/rpidistro-ffmpeg/files/2004-libavcodec-omx-replace-opt-vc-path-with-usr-lib.patch rename recipes-multimedia/rpidistro-ffmpeg/{rpidistro-ffmpeg_4.3.2.bb => rpidistro-ffmpeg_4.3.4.bb} (89%) diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/0001-avcodec-arm-sbcenc-avoid-callee-preserved-vfp-regist.patch b/recipes-multimedia/rpidistro-ffmpeg/files/0001-avcodec-arm-sbcenc-avoid-callee-preserved-vfp-regist.patch index e9c9eb7..d9c07dd 100644 --- a/recipes-multimedia/rpidistro-ffmpeg/files/0001-avcodec-arm-sbcenc-avoid-callee-preserved-vfp-regist.patch +++ b/recipes-multimedia/rpidistro-ffmpeg/files/0001-avcodec-arm-sbcenc-avoid-callee-preserved-vfp-regist.patch @@ -2,6 +2,11 @@ From: James Cowgill Date: Sun, 11 Aug 2019 16:50:56 +0100 Subject: avcodec/arm/sbcenc: avoid callee preserved vfp registers +Upstream-Status: Inappropriate + +RPI-Distro repo clones original ffmpeg and applies patches to enable +raspiberry pi support. + When compiling FFmpeg with GCC-9, some very random segfaults were observed in code which had previously called down into the SBC encoder NEON assembly routines. This was caused by these functions clobbering @@ -19,8 +24,6 @@ sets of registers consecutively numbered which hopefully makes the code more easy to follow. Since this commit only reallocates registers, it should have no performance impact. -Upstream-status: Pending - Signed-off-by: James Cowgill --- libavcodec/arm/sbcdsp_neon.S | 220 +++++++++++++++++++++---------------------- @@ -38,7 +41,7 @@ index d83d21d..914abfb 100644 - vld1.16 {d8, d9}, [r2, :128]! + vld1.16 {d16, d17}, [r0, :64]! + vld1.16 {d20, d21}, [r2, :128]! - + - vmull.s16 q0, d4, d8 - vld1.16 {d6, d7}, [r0, :64]! - vmull.s16 q1, d5, d9 @@ -47,7 +50,7 @@ index d83d21d..914abfb 100644 + vld1.16 {d18, d19}, [r0, :64]! + vmull.s16 q1, d17, d21 + vld1.16 {d22, d23}, [r2, :128]! - + - vmlal.s16 q0, d6, d10 - vld1.16 {d4, d5}, [r0, :64]! - vmlal.s16 q1, d7, d11 @@ -56,7 +59,7 @@ index d83d21d..914abfb 100644 + vld1.16 {d16, d17}, [r0, :64]! + vmlal.s16 q1, d19, d23 + vld1.16 {d20, d21}, [r2, :128]! - + - vmlal.s16 q0, d4, d8 - vld1.16 {d6, d7}, [r0, :64]! - vmlal.s16 q1, d5, d9 @@ -65,7 +68,7 @@ index d83d21d..914abfb 100644 + vld1.16 {d18, d19}, [r0, :64]! + vmlal.s16 q1, d17, d21 + vld1.16 {d22, d23}, [r2, :128]! 
- + - vmlal.s16 q0, d6, d10 - vld1.16 {d4, d5}, [r0, :64]! - vmlal.s16 q1, d7, d11 @@ -74,23 +77,23 @@ index d83d21d..914abfb 100644 + vld1.16 {d16, d17}, [r0, :64]! + vmlal.s16 q1, d19, d23 + vld1.16 {d20, d21}, [r2, :128]! - + - vmlal.s16 q0, d4, d8 - vmlal.s16 q1, d5, d9 + vmlal.s16 q0, d16, d20 + vmlal.s16 q1, d17, d21 - + vpadd.s32 d0, d0, d1 vpadd.s32 d1, d2, d3 - + vrshrn.s32 d0, q0, SBC_PROTO_FIXED_SCALE - + - vld1.16 {d2, d3, d4, d5}, [r2, :128]! + vld1.16 {d16, d17, d18, d19}, [r2, :128]! - + vdup.i32 d1, d0[1] /* TODO: can be eliminated */ vdup.i32 d0, d0[0] /* TODO: can be eliminated */ - + - vmull.s16 q3, d2, d0 - vmull.s16 q4, d3, d0 - vmlal.s16 q3, d4, d1 @@ -99,14 +102,14 @@ index d83d21d..914abfb 100644 + vmull.s16 q11, d17, d0 + vmlal.s16 q10, d18, d1 + vmlal.s16 q11, d19, d1 - + - vpadd.s32 d0, d6, d7 /* TODO: can be eliminated */ - vpadd.s32 d1, d8, d9 /* TODO: can be eliminated */ + vpadd.s32 d0, d20, d21 /* TODO: can be eliminated */ + vpadd.s32 d1, d22, d23 /* TODO: can be eliminated */ - + vst1.32 {d0, d1}, [r1, :128] - + @@ -91,57 +91,57 @@ function ff_sbc_analyze_8_neon, export=1 /* TODO: merge even and odd cases (or even merge all four calls to this * function) in order to have only aligned reads from 'in' array @@ -213,13 +216,13 @@ index d83d21d..914abfb 100644 + vpadd.s32 d1, d26, d27 + vpadd.s32 d2, d28, d29 + vpadd.s32 d3, d30, d31 - + vrshr.s32 q0, q0, SBC_PROTO_FIXED_SCALE vrshr.s32 q1, q1, SBC_PROTO_FIXED_SCALE @@ -153,38 +153,38 @@ function ff_sbc_analyze_8_neon, export=1 vdup.i32 d1, d0[1] /* TODO: can be eliminated */ vdup.i32 d0, d0[0] /* TODO: can be eliminated */ - + - vld1.16 {d4, d5}, [r2, :128]! - vmull.s16 q6, d4, d0 - vld1.16 {d6, d7}, [r2, :128]! @@ -284,5 +287,6 @@ index d83d21d..914abfb 100644 + vpadd.s32 d1, d26, d27 /* TODO: can be eliminated */ + vpadd.s32 d2, d28, d29 /* TODO: can be eliminated */ + vpadd.s32 d3, d30, d31 /* TODO: can be eliminated */ - + vst1.32 {d0, d1, d2, d3}, [r1, :128] + diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/0002-Fix-build-on-powerpc-and-ppc64.patch b/recipes-multimedia/rpidistro-ffmpeg/files/0002-Fix-build-on-powerpc-and-ppc64.patch index 4d9c1b9..f398791 100644 --- a/recipes-multimedia/rpidistro-ffmpeg/files/0002-Fix-build-on-powerpc-and-ppc64.patch +++ b/recipes-multimedia/rpidistro-ffmpeg/files/0002-Fix-build-on-powerpc-and-ppc64.patch @@ -2,7 +2,10 @@ From: John Paul Adrian Glaubitz Date: Tue, 19 Jan 2021 20:35:29 +0100 Subject: Fix build on powerpc and ppc64 -Upstream-status: Pending +Upstream-Status: Inappropriate + +RPI-Distro repo clones original ffmpeg and applies patches to enable +raspiberry pi support. 
--- libswscale/ppc/yuv2rgb_altivec.c | 10 ++++++++++ @@ -15,7 +18,7 @@ index 5365452..930ef6b 100644 @@ -283,6 +283,16 @@ static inline void cvtyuvtoRGB(SwsContext *c, vector signed short Y, * ------------------------------------------------------------------------------ */ - + +#if !HAVE_VSX +static inline vector unsigned char vec_xl(signed long long offset, const ubyte *addr) +{ diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/0003-avcodec-pngenc-remove-monowhite-from-apng-formats.patch b/recipes-multimedia/rpidistro-ffmpeg/files/0003-avcodec-pngenc-remove-monowhite-from-apng-formats.patch index 38f3fd4..11e3383 100644 --- a/recipes-multimedia/rpidistro-ffmpeg/files/0003-avcodec-pngenc-remove-monowhite-from-apng-formats.patch +++ b/recipes-multimedia/rpidistro-ffmpeg/files/0003-avcodec-pngenc-remove-monowhite-from-apng-formats.patch @@ -2,13 +2,15 @@ From: Paul B Mahol Date: Sun, 14 Feb 2021 17:20:03 +0100 Subject: avcodec/pngenc: remove monowhite from apng formats +Upstream-Status: Inappropriate + +RPI-Distro repo clones original ffmpeg and applies patches to enable +raspiberry pi support. + Monowhite pixel format is not supported, and it does not make sense to add support for it. Fixes #7989 - -Upstream-status: Pending - --- libavcodec/pngenc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/0004-ffmpeg-4.3.2-rpi_10.patch b/recipes-multimedia/rpidistro-ffmpeg/files/0004-ffmpeg-4.3.4-rpi_14.patch similarity index 84% rename from recipes-multimedia/rpidistro-ffmpeg/files/0004-ffmpeg-4.3.2-rpi_10.patch rename to recipes-multimedia/rpidistro-ffmpeg/files/0004-ffmpeg-4.3.4-rpi_14.patch index 6bab0d0..740ac0e 100644 --- a/recipes-multimedia/rpidistro-ffmpeg/files/0004-ffmpeg-4.3.2-rpi_10.patch +++ b/recipes-multimedia/rpidistro-ffmpeg/files/0004-ffmpeg-4.3.4-rpi_14.patch @@ -1,16 +1,27 @@ -Upstream-status: Pending +Upstream-Status: Inappropriate + +RPI-Distro repo clones original ffmpeg and applies patches to enable +raspiberry pi support. 
--- a/configure +++ b/configure -@@ -274,6 +274,7 @@ External library support: +@@ -207,6 +207,7 @@ External library support: + --disable-bzlib disable bzlib [autodetect] + --disable-coreimage disable Apple CoreImage framework [autodetect] + --enable-chromaprint enable audio fingerprinting with chromaprint [no] ++ --disable-epoxy disable epoxy [autodetect] + --enable-frei0r enable frei0r video filtering [no] + --enable-gcrypt enable gcrypt, needed for rtmp(t)e support + if openssl, librtmp or gmp is not used [no] +@@ -274,6 +275,7 @@ External library support: --enable-libtls enable LibreSSL (via libtls), needed for https support if openssl, gnutls or mbedtls is not used [no] --enable-libtwolame enable MP2 encoding via libtwolame [no] -+ --enable-libudev enable libudev [no] ++ --disable-libudev disable libudev [autodetect] --enable-libv4l2 enable libv4l2/v4l-utils [no] --enable-libvidstab enable video stabilization using vid.stab [no] --enable-libvmaf enable vmaf filter via libvmaf [no] -@@ -336,12 +337,17 @@ External library support: +@@ -336,12 +338,17 @@ External library support: --enable-libmfx enable Intel MediaSDK (AKA Quick Sync Video) code via libmfx [no] --enable-libnpp enable Nvidia Performance Primitives-based code [no] --enable-mmal enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no] @@ -28,23 +39,17 @@ Upstream-status: Pending --disable-vaapi disable Video Acceleration API (mainly Unix/Intel) code [autodetect] --disable-vdpau disable Nvidia Video Decode and Presentation API for Unix code [autodetect] --disable-videotoolbox disable VideoToolbox code [autodetect] -@@ -1771,6 +1777,7 @@ EXTERNAL_LIBRARY_LIST=" - libdav1d - libdc1394 - libdrm +@@ -1699,7 +1706,9 @@ EXTERNAL_AUTODETECT_LIBRARY_LIST=" + avfoundation + bzlib + coreimage + epoxy - libflite - libfontconfig - libfreetype -@@ -1807,6 +1814,7 @@ EXTERNAL_LIBRARY_LIST=" - libtesseract - libtheora - libtwolame + iconv + libudev - libv4l2 - libvorbis - libvpx -@@ -1861,7 +1869,10 @@ HWACCEL_LIBRARY_LIST=" + libxcb + libxcb_shm + libxcb_shape +@@ -1861,7 +1870,10 @@ HWACCEL_LIBRARY_LIST=" mmal omx opencl @@ -53,9 +58,9 @@ Upstream-status: Pending + rpi4_8 + rpi4_10 " - + DOCUMENT_LIST=" -@@ -1877,12 +1888,16 @@ FEATURE_LIST=" +@@ -1877,12 +1889,16 @@ FEATURE_LIST=" gray hardcoded_tables omx_rpi @@ -70,17 +75,17 @@ Upstream-status: Pending + vout_drm + vout_egl " - + # this list should be kept in linking order -@@ -1923,6 +1938,7 @@ SUBSYSTEM_LIST=" +@@ -1923,6 +1939,7 @@ SUBSYSTEM_LIST=" pixelutils network rdft + rpi " - + # COMPONENT_LIST needs to come last to ensure correct dependency checking -@@ -2405,9 +2421,11 @@ CONFIG_EXTRA=" +@@ -2405,9 +2422,11 @@ CONFIG_EXTRA=" rangecoder riffdec riffenc @@ -92,7 +97,7 @@ Upstream-status: Pending scene_sad sinewin snappy -@@ -2737,6 +2755,8 @@ hap_decoder_select="snappy texturedsp" +@@ -2737,6 +2756,8 @@ hap_decoder_select="snappy texturedsp" hap_encoder_deps="libsnappy" hap_encoder_select="texturedspenc" hevc_decoder_select="bswapdsp cabac golomb hevcparse videodsp" @@ -101,7 +106,7 @@ Upstream-status: Pending huffyuv_decoder_select="bswapdsp huffyuvdsp llviddsp" huffyuv_encoder_select="bswapdsp huffman huffyuvencdsp llvidencdsp" hymt_decoder_select="huffyuv_decoder" -@@ -2903,6 +2923,7 @@ d3d11va_deps="dxva_h ID3D11VideoDecoder +@@ -2903,6 +2924,7 @@ d3d11va_deps="dxva_h ID3D11VideoDecoder dxva2_deps="dxva2api_h DXVA2_ConfigPictureDecode ole32 user32" ffnvcodec_deps_any="libdl LoadLibrary" nvdec_deps="ffnvcodec" @@ -109,7 +114,7 @@ Upstream-status: Pending 
vaapi_x11_deps="xlib" videotoolbox_hwaccel_deps="videotoolbox pthreads" videotoolbox_hwaccel_extralibs="-framework QuartzCore" -@@ -2934,6 +2955,12 @@ hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicP +@@ -2934,6 +2956,12 @@ hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicP hevc_dxva2_hwaccel_select="hevc_decoder" hevc_nvdec_hwaccel_deps="nvdec" hevc_nvdec_hwaccel_select="hevc_decoder" @@ -122,16 +127,15 @@ Upstream-status: Pending hevc_vaapi_hwaccel_deps="vaapi VAPictureParameterBufferHEVC" hevc_vaapi_hwaccel_select="hevc_decoder" hevc_vdpau_hwaccel_deps="vdpau VdpPictureInfoHEVC" -@@ -3401,8 +3428,14 @@ sndio_indev_deps="sndio" +@@ -3401,8 +3429,13 @@ sndio_indev_deps="sndio" sndio_outdev_deps="sndio" v4l2_indev_deps_any="linux_videodev2_h sys_videoio_h" v4l2_indev_suggest="libv4l2" +v4l2_outdev_deps="libdrm" v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h" v4l2_outdev_suggest="libv4l2" -+vout_drm_outdev_deps="libdrm vout_drm" -+vout_egl_outdev_deps="xlib" -+vout_egl_outdev_select="epoxy" ++vout_drm_outdev_deps="libdrm" ++vout_egl_outdev_deps="xlib epoxy" +vout_rpi_outdev_deps="rpi" +vout_rpi_outdev_select="sand" vfwcap_indev_deps="vfw32 vfwcap_defines" @@ -145,23 +149,20 @@ Upstream-status: Pending unsharp_opencl_filter_deps="opencl" uspp_filter_deps="gpl avcodec" vaguedenoiser_filter_deps="gpl" -@@ -6299,6 +6333,7 @@ enabled libdav1d && require_pkg - enabled libdavs2 && require_pkg_config libdavs2 "davs2 >= 1.6.0" davs2.h davs2_decoder_open - enabled libdc1394 && require_pkg_config libdc1394 libdc1394-2 dc1394/dc1394.h dc1394_new - enabled libdrm && require_pkg_config libdrm libdrm xf86drm.h drmGetVersion -+enabled epoxy && require_pkg_config epoxy epoxy epoxy/egl.h epoxy_egl_version - enabled libfdk_aac && { check_pkg_config libfdk_aac fdk-aac "fdk-aac/aacenc_lib.h" aacEncOpen || - { require libfdk_aac fdk-aac/aacenc_lib.h aacEncOpen -lfdk-aac && - warn "using libfdk without pkg-config"; } } -@@ -6376,6 +6411,7 @@ enabled libtls && require_pkg - enabled libtwolame && require libtwolame twolame.h twolame_init -ltwolame && - { check_lib libtwolame twolame.h twolame_encode_buffer_float32_interleaved -ltwolame || - die "ERROR: libtwolame must be installed and version must be >= 0.3.10"; } -+enabled libudev && require_pkg_config libudev libudev libudev.h udev_new - enabled libv4l2 && require_pkg_config libv4l2 libv4l2 libv4l2.h v4l2_ioctl - enabled libvidstab && require_pkg_config libvidstab "vidstab >= 0.98" vid.stab/libvidstab.h vsMotionDetectInit - enabled libvmaf && require_pkg_config libvmaf "libvmaf >= 1.3.9" libvmaf.h compute_vmaf -@@ -6430,11 +6466,12 @@ enabled mbedtls && { check_pkg +@@ -6102,6 +6136,12 @@ check_func_headers glob.h glob + enabled xlib && + check_lib xlib "X11/Xlib.h X11/extensions/Xvlib.h" XvGetPortAttribute -lXv -lX11 -lXext + ++enabled libudev && ++ check_pkg_config libudev libudev libudev.h udev_new ++ ++enabled epoxy && ++ check_pkg_config epoxy epoxy epoxy/egl.h epoxy_egl_version ++ + check_headers direct.h + check_headers dirent.h + check_headers dxgidebug.h +@@ -6430,11 +6470,12 @@ enabled mbedtls && { check_pkg check_lib mbedtls mbedtls/ssl.h mbedtls_ssl_init -lmbedtls -lmbedx509 -lmbedcrypto || die "ERROR: mbedTLS not found"; } enabled mediacodec && { enabled jni || die "ERROR: mediacodec requires --enable-jni"; } @@ -176,26 +177,32 @@ Upstream-status: Pending die "ERROR: mmal not found" && check_func_headers interface/mmal/mmal.h "MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS"; } enabled openal && { { for al_extralibs in "${OPENAL_LIBS}" "-lopenal" "-lOpenAL32"; do -@@ 
-6475,6 +6512,10 @@ enabled rkmpp && { require_p +@@ -6475,8 +6516,16 @@ enabled rkmpp && { require_p { enabled libdrm || die "ERROR: rkmpp requires --enable-libdrm"; } } +enabled v4l2_request && { enabled libdrm || + die "ERROR: v4l2-request requires --enable-libdrm"; } && + { enabled libudev || -+ die "ERROR: v4l2-request requires --enable-libudev"; } ++ die "ERROR: v4l2-request requires libudev"; } enabled vapoursynth && require_pkg_config vapoursynth "vapoursynth-script >= 42" VSScript.h vsscript_init - - -@@ -6556,6 +6597,8 @@ if enabled v4l2_m2m; then + ++enabled vout_drm && { enabled libdrm || die "ERROR: vout_drm requires --enable-libdrm"; } ++ ++enabled vout_egl && { enabled epoxy || die "ERROR: vout_egl requires epoxy"; } && ++ { enabled xlib || die "ERROR: vout_egl requires xlib"; } + + if enabled gcrypt; then + GCRYPT_CONFIG="${cross_prefix}libgcrypt-config" +@@ -6556,6 +6605,8 @@ if enabled v4l2_m2m; then check_cc vp9_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VP9;" fi - + +check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns +check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;" check_headers sys/videoio.h test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete - + --- a/fftools/ffmpeg.c +++ b/fftools/ffmpeg.c @@ -2119,8 +2119,8 @@ static int ifilter_send_frame(InputFilte @@ -208,11 +215,11 @@ Upstream-status: Pending + ifilter->height != av_frame_cropped_height(frame); break; } - + @@ -2131,6 +2131,9 @@ static int ifilter_send_frame(InputFilte (ifilter->hw_frames_ctx && ifilter->hw_frames_ctx->data != frame->hw_frames_ctx->data)) need_reinit = 1; - + + if (no_cvt_hw && fg->graph) + need_reinit = 0; + @@ -221,7 +228,7 @@ Upstream-status: Pending if (ret < 0) @@ -2401,8 +2404,7 @@ static int decode_video(InputStream *ist decoded_frame->top_field_first = ist->top_field_first; - + ist->frames_decoded++; - - if (ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) { @@ -229,7 +236,21 @@ Upstream-status: Pending err = ist->hwaccel_retrieve_data(ist->dec_ctx, decoded_frame); if (err < 0) goto fail; -@@ -2820,6 +2822,16 @@ static enum AVPixelFormat get_format(AVC +@@ -2600,7 +2602,12 @@ static int process_input_packet(InputStr + case AVMEDIA_TYPE_VIDEO: + ret = decode_video (ist, repeating ? NULL : &avpkt, &got_output, &duration_pts, !pkt, + &decode_failed); +- if (!repeating || !pkt || got_output) { ++ // Pi: Do not inc dts if no_cvt_hw set ++ // V4L2 H264 decode has long latency and sometimes spits out a long ++ // stream of output without input. In this case incrementing DTS is wrong. 
++ // There may be cases where the condition as written is correct so only ++ // "fix" in the cases which cause problems ++ if (!repeating || !pkt || (got_output && !no_cvt_hw)) { + if (pkt && pkt->duration) { + duration_dts = av_rescale_q(pkt->duration, ist->st->time_base, AV_TIME_BASE_Q); + } else if(ist->dec_ctx->framerate.num != 0 && ist->dec_ctx->framerate.den != 0) { +@@ -2820,6 +2827,16 @@ static enum AVPixelFormat get_format(AVC } else { const HWAccel *hwaccel = NULL; int i; @@ -246,10 +267,10 @@ Upstream-status: Pending for (i = 0; hwaccels[i].name; i++) { if (hwaccels[i].pix_fmt == *p) { hwaccel = &hwaccels[i]; -@@ -2914,6 +2926,15 @@ static int init_input_stream(int ist_ind +@@ -2914,6 +2931,15 @@ static int init_input_stream(int ist_ind return ret; } - + +#if CONFIG_HEVC_RPI_DECODER + ret = -1; + if (strcmp(codec->name, "hevc_rpi") == 0 && @@ -270,7 +291,7 @@ Upstream-status: Pending HWACCEL_QSV, + HWACCEL_RPI, }; - + typedef struct HWAccel { @@ -590,6 +591,7 @@ extern int video_sync_method; extern float frame_drop_threshold; @@ -283,15 +304,15 @@ Upstream-status: Pending --- a/fftools/ffmpeg_filter.c +++ b/fftools/ffmpeg_filter.c @@ -1186,8 +1186,8 @@ int ifilter_parameters_from_frame(InputF - + ifilter->format = frame->format; - + - ifilter->width = frame->width; - ifilter->height = frame->height; + ifilter->width = av_frame_cropped_width(frame); + ifilter->height = av_frame_cropped_height(frame); ifilter->sample_aspect_ratio = frame->sample_aspect_ratio; - + ifilter->sample_rate = frame->sample_rate; --- a/fftools/ffmpeg_hw.c +++ b/fftools/ffmpeg_hw.c @@ -309,7 +330,7 @@ Upstream-status: Pending @@ -130,6 +130,12 @@ static const char *opt_name_enc_time_bas }\ } - + +#if CONFIG_RPI +static int rpi_init(AVCodecContext *avctx) { + return 0; @@ -376,7 +397,7 @@ Upstream-status: Pending + v4l2_req_devscan.o weak_link.o OBJS-$(CONFIG_WMA_FREQS) += wma_freqs.o OBJS-$(CONFIG_WMV2DSP) += wmv2dsp.o - + @@ -391,6 +396,14 @@ OBJS-$(CONFIG_HEVC_QSV_DECODER) + OBJS-$(CONFIG_HEVC_QSV_ENCODER) += qsvenc_hevc.o hevc_ps_enc.o \ hevc_data.o @@ -399,7 +420,7 @@ Upstream-status: Pending +OBJS-$(CONFIG_HEVC_RPI4_8_HWACCEL) += rpivid_hevc.o +OBJS-$(CONFIG_HEVC_RPI4_10_HWACCEL) += rpivid_hevc.o +OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o v4l2_req_decode_q.o\ -+ v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o ++ v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o v4l2_req_hevc_v4.o OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o h265_profile_level.o OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o @@ -435,6 +456,1866 @@ Upstream-status: Pending +$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_transform8.h $(SUBDIR)rpi_hevc_transform10.h +$(SUBDIR)rpi_hevcdec.o $(SUBDIR)rpi_shader_template.o $(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_shader.h +endif +--- a/libavcodec/aarch64/Makefile ++++ b/libavcodec/aarch64/Makefile +@@ -44,10 +44,12 @@ NEON-OBJS-$(CONFIG_H264PRED) + NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \ + aarch64/hpeldsp_neon.o + NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o +-NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/simple_idct_neon.o ++NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_neon.o \ ++ aarch64/simple_idct_neon.o + NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o + NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o + NEON-OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_neon.o ++NEON-OBJS-$(CONFIG_VC1DSP) += aarch64/vc1dsp_neon.o + NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_neon.o + + # 
decoders/encoders +--- a/libavcodec/aarch64/idctdsp_init_aarch64.c ++++ b/libavcodec/aarch64/idctdsp_init_aarch64.c +@@ -27,19 +27,29 @@ + #include "libavcodec/idctdsp.h" + #include "idct.h" + ++void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); ++void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); ++void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); ++ + av_cold void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) + { + int cpu_flags = av_get_cpu_flags(); + +- if (have_neon(cpu_flags) && !avctx->lowres && !high_bit_depth) { +- if (avctx->idct_algo == FF_IDCT_AUTO || +- avctx->idct_algo == FF_IDCT_SIMPLEAUTO || +- avctx->idct_algo == FF_IDCT_SIMPLENEON) { +- c->idct_put = ff_simple_idct_put_neon; +- c->idct_add = ff_simple_idct_add_neon; +- c->idct = ff_simple_idct_neon; +- c->perm_type = FF_IDCT_PERM_PARTTRANS; ++ if (have_neon(cpu_flags)) { ++ if (!avctx->lowres && !high_bit_depth) { ++ if (avctx->idct_algo == FF_IDCT_AUTO || ++ avctx->idct_algo == FF_IDCT_SIMPLEAUTO || ++ avctx->idct_algo == FF_IDCT_SIMPLENEON) { ++ c->idct_put = ff_simple_idct_put_neon; ++ c->idct_add = ff_simple_idct_add_neon; ++ c->idct = ff_simple_idct_neon; ++ c->perm_type = FF_IDCT_PERM_PARTTRANS; ++ } + } ++ ++ c->add_pixels_clamped = ff_add_pixels_clamped_neon; ++ c->put_pixels_clamped = ff_put_pixels_clamped_neon; ++ c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon; + } + } +--- /dev/null ++++ b/libavcodec/aarch64/idctdsp_neon.S +@@ -0,0 +1,130 @@ ++/* ++ * IDCT AArch64 NEON optimisations ++ * ++ * Copyright (c) 2022 Ben Avison ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/aarch64/asm.S" ++ ++// Clamp 16-bit signed block coefficients to unsigned 8-bit ++// On entry: ++// x0 -> array of 64x 16-bit coefficients ++// x1 -> 8-bit results ++// x2 = row stride for results, bytes ++function ff_put_pixels_clamped_neon, export=1 ++ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 ++ ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0] ++ sqxtun v0.8b, v0.8h ++ sqxtun v1.8b, v1.8h ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ sqxtun v4.8b, v4.8h ++ st1 {v0.8b}, [x1], x2 ++ sqxtun v0.8b, v5.8h ++ st1 {v1.8b}, [x1], x2 ++ sqxtun v1.8b, v6.8h ++ st1 {v2.8b}, [x1], x2 ++ sqxtun v2.8b, v7.8h ++ st1 {v3.8b}, [x1], x2 ++ st1 {v4.8b}, [x1], x2 ++ st1 {v0.8b}, [x1], x2 ++ st1 {v1.8b}, [x1], x2 ++ st1 {v2.8b}, [x1] ++ ret ++endfunc ++ ++// Clamp 16-bit signed block coefficients to signed 8-bit (biased by 128) ++// On entry: ++// x0 -> array of 64x 16-bit coefficients ++// x1 -> 8-bit results ++// x2 = row stride for results, bytes ++function ff_put_signed_pixels_clamped_neon, export=1 ++ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 ++ movi v4.8b, #128 ++ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0] ++ sqxtn v0.8b, v0.8h ++ sqxtn v1.8b, v1.8h ++ sqxtn v2.8b, v2.8h ++ sqxtn v3.8b, v3.8h ++ sqxtn v5.8b, v16.8h ++ add v0.8b, v0.8b, v4.8b ++ sqxtn v6.8b, v17.8h ++ add v1.8b, v1.8b, v4.8b ++ sqxtn v7.8b, v18.8h ++ add v2.8b, v2.8b, v4.8b ++ sqxtn v16.8b, v19.8h ++ add v3.8b, v3.8b, v4.8b ++ st1 {v0.8b}, [x1], x2 ++ add v0.8b, v5.8b, v4.8b ++ st1 {v1.8b}, [x1], x2 ++ add v1.8b, v6.8b, v4.8b ++ st1 {v2.8b}, [x1], x2 ++ add v2.8b, v7.8b, v4.8b ++ st1 {v3.8b}, [x1], x2 ++ add v3.8b, v16.8b, v4.8b ++ st1 {v0.8b}, [x1], x2 ++ st1 {v1.8b}, [x1], x2 ++ st1 {v2.8b}, [x1], x2 ++ st1 {v3.8b}, [x1] ++ ret ++endfunc ++ ++// Add 16-bit signed block coefficients to unsigned 8-bit ++// On entry: ++// x0 -> array of 64x 16-bit coefficients ++// x1 -> 8-bit input and results ++// x2 = row stride for 8-bit input and results, bytes ++function ff_add_pixels_clamped_neon, export=1 ++ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 ++ mov x3, x1 ++ ld1 {v4.8b}, [x1], x2 ++ ld1 {v5.8b}, [x1], x2 ++ ld1 {v6.8b}, [x1], x2 ++ ld1 {v7.8b}, [x1], x2 ++ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0] ++ uaddw v0.8h, v0.8h, v4.8b ++ uaddw v1.8h, v1.8h, v5.8b ++ uaddw v2.8h, v2.8h, v6.8b ++ ld1 {v4.8b}, [x1], x2 ++ uaddw v3.8h, v3.8h, v7.8b ++ ld1 {v5.8b}, [x1], x2 ++ sqxtun v0.8b, v0.8h ++ ld1 {v6.8b}, [x1], x2 ++ sqxtun v1.8b, v1.8h ++ ld1 {v7.8b}, [x1] ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ uaddw v4.8h, v16.8h, v4.8b ++ st1 {v0.8b}, [x3], x2 ++ uaddw v0.8h, v17.8h, v5.8b ++ st1 {v1.8b}, [x3], x2 ++ uaddw v1.8h, v18.8h, v6.8b ++ st1 {v2.8b}, [x3], x2 ++ uaddw v2.8h, v19.8h, v7.8b ++ sqxtun v4.8b, v4.8h ++ sqxtun v0.8b, v0.8h ++ st1 {v3.8b}, [x3], x2 ++ sqxtun v1.8b, v1.8h ++ sqxtun v2.8b, v2.8h ++ st1 {v4.8b}, [x3], x2 ++ st1 {v0.8b}, [x3], x2 ++ st1 {v1.8b}, [x3], x2 ++ st1 {v2.8b}, [x3] ++ ret ++endfunc +--- a/libavcodec/aarch64/vc1dsp_init_aarch64.c ++++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c +@@ -21,10 +21,28 @@ + #include "libavutil/attributes.h" + #include "libavutil/cpu.h" + #include "libavutil/aarch64/cpu.h" ++#include "libavutil/intreadwrite.h" + #include "libavcodec/vc1dsp.h" + + #include "config.h" + ++void ff_vc1_inv_trans_8x8_neon(int16_t 
*block); ++void ff_vc1_inv_trans_8x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); ++void ff_vc1_inv_trans_4x8_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); ++void ff_vc1_inv_trans_4x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); ++ ++void ff_vc1_inv_trans_8x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); ++void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); ++void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); ++void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); ++ ++void ff_vc1_v_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq); ++void ff_vc1_h_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq); ++void ff_vc1_v_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq); ++void ff_vc1_h_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq); ++void ff_vc1_v_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq); ++void ff_vc1_h_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq); ++ + void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, + int h, int x, int y); + void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, +@@ -34,14 +52,90 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t + void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, + int h, int x, int y); + ++int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst); ++ ++static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst) ++{ ++ /* Dealing with starting and stopping, and removing escape bytes, are ++ * comparatively less time-sensitive, so are more clearly expressed using ++ * a C wrapper around the assembly inner loop. Note that we assume a ++ * little-endian machine that supports unaligned loads. 
*/ ++ int dsize = 0; ++ while (size >= 4) ++ { ++ int found = 0; ++ while (!found && (((uintptr_t) dst) & 7) && size >= 4) ++ { ++ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; ++ if (!found) ++ { ++ *dst++ = *src++; ++ --size; ++ ++dsize; ++ } ++ } ++ if (!found) ++ { ++ int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst); ++ dst += skip; ++ src += skip; ++ size -= skip; ++ dsize += skip; ++ while (!found && size >= 4) ++ { ++ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; ++ if (!found) ++ { ++ *dst++ = *src++; ++ --size; ++ ++dsize; ++ } ++ } ++ } ++ if (found) ++ { ++ *dst++ = *src++; ++ *dst++ = *src++; ++ ++src; ++ size -= 3; ++ dsize += 2; ++ } ++ } ++ while (size > 0) ++ { ++ *dst++ = *src++; ++ --size; ++ ++dsize; ++ } ++ return dsize; ++} ++ + av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp) + { + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { ++ dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_neon; ++ dsp->vc1_inv_trans_8x4 = ff_vc1_inv_trans_8x4_neon; ++ dsp->vc1_inv_trans_4x8 = ff_vc1_inv_trans_4x8_neon; ++ dsp->vc1_inv_trans_4x4 = ff_vc1_inv_trans_4x4_neon; ++ dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_neon; ++ dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon; ++ dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_neon; ++ dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon; ++ ++ dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_neon; ++ dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_neon; ++ dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_neon; ++ dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_neon; ++ dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon; ++ dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon; ++ + dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon; + dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon; + dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon; + dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon; ++ ++ dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon; + } + } +--- /dev/null ++++ b/libavcodec/aarch64/vc1dsp_neon.S +@@ -0,0 +1,1546 @@ ++/* ++ * VC1 AArch64 NEON optimisations ++ * ++ * Copyright (c) 2022 Ben Avison ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/aarch64/asm.S" ++ ++// VC-1 8x8 inverse transform ++// On entry: ++// x0 -> array of 16-bit inverse transform coefficients, in column-major order ++// On exit: ++// array at x0 updated to hold transformed block; also now held in row-major order ++function ff_vc1_inv_trans_8x8_neon, export=1 ++ ld1 {v1.16b, v2.16b}, [x0], #32 ++ ld1 {v3.16b, v4.16b}, [x0], #32 ++ ld1 {v5.16b, v6.16b}, [x0], #32 ++ shl v1.8h, v1.8h, #2 // 8/2 * src[0] ++ sub x1, x0, #3*32 ++ ld1 {v16.16b, v17.16b}, [x0] ++ shl v7.8h, v2.8h, #4 // 16 * src[8] ++ shl v18.8h, v2.8h, #2 // 4 * src[8] ++ shl v19.8h, v4.8h, #4 // 16 * src[24] ++ ldr d0, .Lcoeffs_it8 ++ shl v5.8h, v5.8h, #2 // 8/2 * src[32] ++ shl v20.8h, v6.8h, #4 // 16 * src[40] ++ shl v21.8h, v6.8h, #2 // 4 * src[40] ++ shl v22.8h, v17.8h, #4 // 16 * src[56] ++ ssra v20.8h, v19.8h, #2 // 4 * src[24] + 16 * src[40] ++ mul v23.8h, v3.8h, v0.h[0] // 6/2 * src[16] ++ sub v19.8h, v19.8h, v21.8h // 16 * src[24] - 4 * src[40] ++ ssra v7.8h, v22.8h, #2 // 16 * src[8] + 4 * src[56] ++ sub v18.8h, v22.8h, v18.8h // - 4 * src[8] + 16 * src[56] ++ shl v3.8h, v3.8h, #3 // 16/2 * src[16] ++ mls v20.8h, v2.8h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40] ++ ssra v1.8h, v1.8h, #1 // 12/2 * src[0] ++ ssra v5.8h, v5.8h, #1 // 12/2 * src[32] ++ mla v7.8h, v4.8h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56] ++ shl v21.8h, v16.8h, #3 // 16/2 * src[48] ++ mls v19.8h, v2.8h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40] ++ sub v2.8h, v23.8h, v21.8h // t4/2 = 6/2 * src[16] - 16/2 * src[48] ++ mla v18.8h, v4.8h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56] ++ add v4.8h, v1.8h, v5.8h // t1/2 = 12/2 * src[0] + 12/2 * src[32] ++ sub v1.8h, v1.8h, v5.8h // t2/2 = 12/2 * src[0] - 12/2 * src[32] ++ mla v3.8h, v16.8h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48] ++ mla v7.8h, v6.8h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56] ++ add v5.8h, v1.8h, v2.8h // t6/2 = t2/2 + t4/2 ++ sub v16.8h, v1.8h, v2.8h // t7/2 = t2/2 - t4/2 ++ mla v20.8h, v17.8h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56] ++ add v21.8h, v1.8h, v2.8h // t6/2 = t2/2 + t4/2 ++ add v22.8h, v4.8h, v3.8h // t5/2 = t1/2 + t3/2 ++ mls v19.8h, v17.8h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56] ++ sub v17.8h, v4.8h, v3.8h // t8/2 = t1/2 - t3/2 ++ add v23.8h, v4.8h, v3.8h // t5/2 = t1/2 + t3/2 ++ mls v18.8h, v6.8h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56] ++ sub v1.8h, v1.8h, v2.8h // t7/2 = t2/2 - t4/2 ++ sub v2.8h, v4.8h, v3.8h // t8/2 = t1/2 - t3/2 ++ neg v3.8h, v7.8h // -t1 ++ neg v4.8h, v20.8h // +t2 ++ neg v6.8h, v19.8h // +t3 ++ ssra v22.8h, v7.8h, #1 // (t5 + t1) >> 1 ++ ssra v1.8h, v19.8h, #1 // (t7 - t3) >> 1 ++ neg v7.8h, v18.8h // +t4 ++ ssra v5.8h, v4.8h, #1 // (t6 + t2) >> 1 ++ ssra v16.8h, v6.8h, #1 // (t7 + t3) >> 1 ++ ssra v2.8h, v18.8h, #1 // (t8 - t4) >> 1 ++ ssra v17.8h, v7.8h, #1 // (t8 + t4) >> 1 ++ ssra v21.8h, v20.8h, #1 // (t6 - t2) >> 1 ++ ssra v23.8h, v3.8h, #1 // (t5 - t1) >> 1 ++ srshr v3.8h, v22.8h, #2 // (t5 + t1 + 4) >> 3 ++ srshr v4.8h, v5.8h, #2 // (t6 + t2 + 4) >> 3 ++ srshr v5.8h, v16.8h, #2 // (t7 + t3 + 4) >> 3 ++ srshr v6.8h, v17.8h, #2 // (t8 + t4 + 4) >> 3 ++ srshr v2.8h, v2.8h, #2 // (t8 - t4 
+ 4) >> 3 ++ srshr v1.8h, v1.8h, #2 // (t7 - t3 + 4) >> 3 ++ srshr v7.8h, v21.8h, #2 // (t6 - t2 + 4) >> 3 ++ srshr v16.8h, v23.8h, #2 // (t5 - t1 + 4) >> 3 ++ trn2 v17.8h, v3.8h, v4.8h ++ trn2 v18.8h, v5.8h, v6.8h ++ trn2 v19.8h, v2.8h, v1.8h ++ trn2 v20.8h, v7.8h, v16.8h ++ trn1 v21.4s, v17.4s, v18.4s ++ trn2 v17.4s, v17.4s, v18.4s ++ trn1 v18.4s, v19.4s, v20.4s ++ trn2 v19.4s, v19.4s, v20.4s ++ trn1 v3.8h, v3.8h, v4.8h ++ trn2 v4.2d, v21.2d, v18.2d ++ trn1 v20.2d, v17.2d, v19.2d ++ trn1 v5.8h, v5.8h, v6.8h ++ trn1 v1.8h, v2.8h, v1.8h ++ trn1 v2.8h, v7.8h, v16.8h ++ trn1 v6.2d, v21.2d, v18.2d ++ trn2 v7.2d, v17.2d, v19.2d ++ shl v16.8h, v20.8h, #4 // 16 * src[24] ++ shl v17.8h, v4.8h, #4 // 16 * src[40] ++ trn1 v18.4s, v3.4s, v5.4s ++ trn1 v19.4s, v1.4s, v2.4s ++ shl v21.8h, v7.8h, #4 // 16 * src[56] ++ shl v22.8h, v6.8h, #2 // 4 * src[8] ++ shl v23.8h, v4.8h, #2 // 4 * src[40] ++ trn2 v3.4s, v3.4s, v5.4s ++ trn2 v1.4s, v1.4s, v2.4s ++ shl v2.8h, v6.8h, #4 // 16 * src[8] ++ sub v5.8h, v16.8h, v23.8h // 16 * src[24] - 4 * src[40] ++ ssra v17.8h, v16.8h, #2 // 4 * src[24] + 16 * src[40] ++ sub v16.8h, v21.8h, v22.8h // - 4 * src[8] + 16 * src[56] ++ trn1 v22.2d, v18.2d, v19.2d ++ trn2 v18.2d, v18.2d, v19.2d ++ trn1 v19.2d, v3.2d, v1.2d ++ ssra v2.8h, v21.8h, #2 // 16 * src[8] + 4 * src[56] ++ mls v17.8h, v6.8h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40] ++ shl v21.8h, v22.8h, #2 // 8/2 * src[0] ++ shl v18.8h, v18.8h, #2 // 8/2 * src[32] ++ mls v5.8h, v6.8h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40] ++ shl v6.8h, v19.8h, #3 // 16/2 * src[16] ++ trn2 v1.2d, v3.2d, v1.2d ++ mla v16.8h, v20.8h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56] ++ ssra v21.8h, v21.8h, #1 // 12/2 * src[0] ++ ssra v18.8h, v18.8h, #1 // 12/2 * src[32] ++ mul v3.8h, v19.8h, v0.h[0] // 6/2 * src[16] ++ shl v19.8h, v1.8h, #3 // 16/2 * src[48] ++ mla v2.8h, v20.8h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56] ++ add v20.8h, v21.8h, v18.8h // t1/2 = 12/2 * src[0] + 12/2 * src[32] ++ mla v6.8h, v1.8h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48] ++ sub v1.8h, v21.8h, v18.8h // t2/2 = 12/2 * src[0] - 12/2 * src[32] ++ sub v3.8h, v3.8h, v19.8h // t4/2 = 6/2 * src[16] - 16/2 * src[48] ++ mla v17.8h, v7.8h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56] ++ mls v5.8h, v7.8h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56] ++ add v7.8h, v1.8h, v3.8h // t6/2 = t2/2 + t4/2 ++ add v18.8h, v20.8h, v6.8h // t5/2 = t1/2 + t3/2 ++ mls v16.8h, v4.8h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56] ++ sub v19.8h, v1.8h, v3.8h // t7/2 = t2/2 - t4/2 ++ neg v21.8h, v17.8h // +t2 ++ mla v2.8h, v4.8h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56] ++ sub v0.8h, v20.8h, v6.8h // t8/2 = t1/2 - t3/2 ++ neg v4.8h, v5.8h // +t3 ++ sub v22.8h, v1.8h, v3.8h // t7/2 = t2/2 - t4/2 ++ sub v23.8h, v20.8h, v6.8h // t8/2 = t1/2 - t3/2 ++ neg v24.8h, v16.8h // +t4 ++ add v6.8h, v20.8h, v6.8h // t5/2 = t1/2 + t3/2 ++ add v1.8h, v1.8h, v3.8h // t6/2 = t2/2 + t4/2 ++ ssra v7.8h, v21.8h, #1 // (t6 + t2) >> 1 ++ neg v3.8h, v2.8h // -t1 ++ ssra v18.8h, v2.8h, #1 // (t5 + t1) >> 1 ++ ssra v19.8h, v4.8h, #1 // (t7 + t3) >> 1 ++ ssra v0.8h, v24.8h, #1 // (t8 + t4) >> 1 ++ srsra v23.8h, v16.8h, #1 // (t8 - t4 + 1) >> 1 ++ srsra v22.8h, v5.8h, #1 // (t7 - t3 + 1) >> 1 ++ srsra v1.8h, v17.8h, #1 // (t6 - t2 + 1) >> 1 ++ srsra v6.8h, v3.8h, #1 // (t5 - t1 + 1) >> 1 ++ srshr v2.8h, v18.8h, #6 // (t5 + t1 + 64) >> 7 ++ srshr v3.8h, 
v7.8h, #6 // (t6 + t2 + 64) >> 7 ++ srshr v4.8h, v19.8h, #6 // (t7 + t3 + 64) >> 7 ++ srshr v5.8h, v0.8h, #6 // (t8 + t4 + 64) >> 7 ++ srshr v16.8h, v23.8h, #6 // (t8 - t4 + 65) >> 7 ++ srshr v17.8h, v22.8h, #6 // (t7 - t3 + 65) >> 7 ++ st1 {v2.16b, v3.16b}, [x1], #32 ++ srshr v0.8h, v1.8h, #6 // (t6 - t2 + 65) >> 7 ++ srshr v1.8h, v6.8h, #6 // (t5 - t1 + 65) >> 7 ++ st1 {v4.16b, v5.16b}, [x1], #32 ++ st1 {v16.16b, v17.16b}, [x1], #32 ++ st1 {v0.16b, v1.16b}, [x1] ++ ret ++endfunc ++ ++// VC-1 8x4 inverse transform ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> array of 16-bit inverse transform coefficients, in row-major order ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_8x4_neon, export=1 ++ ld1 {v1.8b, v2.8b, v3.8b, v4.8b}, [x2], #32 ++ mov x3, x0 ++ ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x2] ++ ldr q0, .Lcoeffs_it8 // includes 4-point coefficients in upper half of vector ++ ld1 {v5.8b}, [x0], x1 ++ trn2 v6.4h, v1.4h, v3.4h ++ trn2 v7.4h, v2.4h, v4.4h ++ trn1 v1.4h, v1.4h, v3.4h ++ trn1 v2.4h, v2.4h, v4.4h ++ trn2 v3.4h, v16.4h, v18.4h ++ trn2 v4.4h, v17.4h, v19.4h ++ trn1 v16.4h, v16.4h, v18.4h ++ trn1 v17.4h, v17.4h, v19.4h ++ ld1 {v18.8b}, [x0], x1 ++ trn1 v19.2s, v6.2s, v3.2s ++ trn2 v3.2s, v6.2s, v3.2s ++ trn1 v6.2s, v7.2s, v4.2s ++ trn2 v4.2s, v7.2s, v4.2s ++ trn1 v7.2s, v1.2s, v16.2s ++ trn1 v20.2s, v2.2s, v17.2s ++ shl v21.4h, v19.4h, #4 // 16 * src[1] ++ trn2 v1.2s, v1.2s, v16.2s ++ shl v16.4h, v3.4h, #4 // 16 * src[3] ++ trn2 v2.2s, v2.2s, v17.2s ++ shl v17.4h, v6.4h, #4 // 16 * src[5] ++ ld1 {v22.8b}, [x0], x1 ++ shl v23.4h, v4.4h, #4 // 16 * src[7] ++ mul v24.4h, v1.4h, v0.h[0] // 6/2 * src[2] ++ ld1 {v25.8b}, [x0] ++ shl v26.4h, v19.4h, #2 // 4 * src[1] ++ shl v27.4h, v6.4h, #2 // 4 * src[5] ++ ssra v21.4h, v23.4h, #2 // 16 * src[1] + 4 * src[7] ++ ssra v17.4h, v16.4h, #2 // 4 * src[3] + 16 * src[5] ++ sub v23.4h, v23.4h, v26.4h // - 4 * src[1] + 16 * src[7] ++ sub v16.4h, v16.4h, v27.4h // 16 * src[3] - 4 * src[5] ++ shl v7.4h, v7.4h, #2 // 8/2 * src[0] ++ shl v20.4h, v20.4h, #2 // 8/2 * src[4] ++ mla v21.4h, v3.4h, v0.h[2] // 16 * src[1] + 15 * src[3] + 4 * src[7] ++ shl v1.4h, v1.4h, #3 // 16/2 * src[2] ++ mls v17.4h, v19.4h, v0.h[2] // - 15 * src[1] + 4 * src[3] + 16 * src[5] ++ ssra v7.4h, v7.4h, #1 // 12/2 * src[0] ++ mls v16.4h, v19.4h, v0.h[1] // - 9 * src[1] + 16 * src[3] - 4 * src[5] ++ ssra v20.4h, v20.4h, #1 // 12/2 * src[4] ++ mla v23.4h, v3.4h, v0.h[1] // - 4 * src[1] + 9 * src[3] + 16 * src[7] ++ shl v3.4h, v2.4h, #3 // 16/2 * src[6] ++ mla v1.4h, v2.4h, v0.h[0] // t3/2 = 16/2 * src[2] + 6/2 * src[6] ++ mla v21.4h, v6.4h, v0.h[1] // t1 = 16 * src[1] + 15 * src[3] + 9 * src[5] + 4 * src[7] ++ mla v17.4h, v4.4h, v0.h[1] // -t2 = - 15 * src[1] + 4 * src[3] + 16 * src[5] + 9 * src[7] ++ sub v2.4h, v24.4h, v3.4h // t4/2 = 6/2 * src[2] - 16/2 * src[6] ++ mls v16.4h, v4.4h, v0.h[2] // -t3 = - 9 * src[1] + 16 * src[3] - 4 * src[5] - 15 * src[7] ++ add v3.4h, v7.4h, v20.4h // t1/2 = 12/2 * src[0] + 12/2 * src[4] ++ mls v23.4h, v6.4h, v0.h[2] // -t4 = - 4 * src[1] + 9 * src[3] - 15 * src[5] + 16 * src[7] ++ sub v4.4h, v7.4h, v20.4h // t2/2 = 12/2 * src[0] - 12/2 * src[4] ++ neg v6.4h, v21.4h // -t1 ++ add v7.4h, v3.4h, v1.4h // t5/2 = t1/2 + t3/2 ++ sub v19.4h, v3.4h, v1.4h // t8/2 = t1/2 - t3/2 ++ add v20.4h, v4.4h, v2.4h // t6/2 = t2/2 + t4/2 ++ sub v24.4h, v4.4h, v2.4h // t7/2 = t2/2 - t4/2 ++ add v26.4h, v3.4h, 
v1.4h // t5/2 = t1/2 + t3/2 ++ add v27.4h, v4.4h, v2.4h // t6/2 = t2/2 + t4/2 ++ sub v2.4h, v4.4h, v2.4h // t7/2 = t2/2 - t4/2 ++ sub v1.4h, v3.4h, v1.4h // t8/2 = t1/2 - t3/2 ++ neg v3.4h, v17.4h // +t2 ++ neg v4.4h, v16.4h // +t3 ++ neg v28.4h, v23.4h // +t4 ++ ssra v7.4h, v21.4h, #1 // (t5 + t1) >> 1 ++ ssra v1.4h, v23.4h, #1 // (t8 - t4) >> 1 ++ ssra v20.4h, v3.4h, #1 // (t6 + t2) >> 1 ++ ssra v24.4h, v4.4h, #1 // (t7 + t3) >> 1 ++ ssra v19.4h, v28.4h, #1 // (t8 + t4) >> 1 ++ ssra v2.4h, v16.4h, #1 // (t7 - t3) >> 1 ++ ssra v27.4h, v17.4h, #1 // (t6 - t2) >> 1 ++ ssra v26.4h, v6.4h, #1 // (t5 - t1) >> 1 ++ trn1 v1.2d, v7.2d, v1.2d ++ trn1 v2.2d, v20.2d, v2.2d ++ trn1 v3.2d, v24.2d, v27.2d ++ trn1 v4.2d, v19.2d, v26.2d ++ srshr v1.8h, v1.8h, #2 // (t5 + t1 + 4) >> 3, (t8 - t4 + 4) >> 3 ++ srshr v2.8h, v2.8h, #2 // (t6 + t2 + 4) >> 3, (t7 - t3 + 4) >> 3 ++ srshr v3.8h, v3.8h, #2 // (t7 + t3 + 4) >> 3, (t6 - t2 + 4) >> 3 ++ srshr v4.8h, v4.8h, #2 // (t8 + t4 + 4) >> 3, (t5 - t1 + 4) >> 3 ++ trn2 v6.8h, v1.8h, v2.8h ++ trn1 v1.8h, v1.8h, v2.8h ++ trn2 v2.8h, v3.8h, v4.8h ++ trn1 v3.8h, v3.8h, v4.8h ++ trn2 v4.4s, v6.4s, v2.4s ++ trn1 v7.4s, v1.4s, v3.4s ++ trn2 v1.4s, v1.4s, v3.4s ++ mul v3.8h, v4.8h, v0.h[5] // 22/2 * src[24] ++ trn1 v2.4s, v6.4s, v2.4s ++ mul v4.8h, v4.8h, v0.h[4] // 10/2 * src[24] ++ mul v6.8h, v7.8h, v0.h[6] // 17 * src[0] ++ mul v1.8h, v1.8h, v0.h[6] // 17 * src[16] ++ mls v3.8h, v2.8h, v0.h[4] // t4/2 = - 10/2 * src[8] + 22/2 * src[24] ++ mla v4.8h, v2.8h, v0.h[5] // t3/2 = 22/2 * src[8] + 10/2 * src[24] ++ add v0.8h, v6.8h, v1.8h // t1 = 17 * src[0] + 17 * src[16] ++ sub v1.8h, v6.8h, v1.8h // t2 = 17 * src[0] - 17 * src[16] ++ neg v2.8h, v3.8h // -t4/2 ++ neg v6.8h, v4.8h // -t3/2 ++ ssra v4.8h, v0.8h, #1 // (t1 + t3) >> 1 ++ ssra v2.8h, v1.8h, #1 // (t2 - t4) >> 1 ++ ssra v3.8h, v1.8h, #1 // (t2 + t4) >> 1 ++ ssra v6.8h, v0.8h, #1 // (t1 - t3) >> 1 ++ srshr v0.8h, v4.8h, #6 // (t1 + t3 + 64) >> 7 ++ srshr v1.8h, v2.8h, #6 // (t2 - t4 + 64) >> 7 ++ srshr v2.8h, v3.8h, #6 // (t2 + t4 + 64) >> 7 ++ srshr v3.8h, v6.8h, #6 // (t1 - t3 + 64) >> 7 ++ uaddw v0.8h, v0.8h, v5.8b ++ uaddw v1.8h, v1.8h, v18.8b ++ uaddw v2.8h, v2.8h, v22.8b ++ uaddw v3.8h, v3.8h, v25.8b ++ sqxtun v0.8b, v0.8h ++ sqxtun v1.8b, v1.8h ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ st1 {v0.8b}, [x3], x1 ++ st1 {v1.8b}, [x3], x1 ++ st1 {v2.8b}, [x3], x1 ++ st1 {v3.8b}, [x3] ++ ret ++endfunc ++ ++// VC-1 4x8 inverse transform ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients) ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_4x8_neon, export=1 ++ mov x3, #16 ++ ldr q0, .Lcoeffs_it8 // includes 4-point coefficients in upper half of vector ++ mov x4, x0 ++ ld1 {v1.d}[0], [x2], x3 // 00 01 02 03 ++ ld1 {v2.d}[0], [x2], x3 // 10 11 12 13 ++ ld1 {v3.d}[0], [x2], x3 // 20 21 22 23 ++ ld1 {v4.d}[0], [x2], x3 // 30 31 32 33 ++ ld1 {v1.d}[1], [x2], x3 // 40 41 42 43 ++ ld1 {v2.d}[1], [x2], x3 // 50 51 52 53 ++ ld1 {v3.d}[1], [x2], x3 // 60 61 62 63 ++ ld1 {v4.d}[1], [x2] // 70 71 72 73 ++ ld1 {v5.s}[0], [x0], x1 ++ ld1 {v6.s}[0], [x0], x1 ++ ld1 {v7.s}[0], [x0], x1 ++ trn2 v16.8h, v1.8h, v2.8h // 01 11 03 13 41 51 43 53 ++ trn1 v1.8h, v1.8h, v2.8h // 00 10 02 12 40 50 42 52 ++ trn2 v2.8h, v3.8h, v4.8h // 21 31 23 33 61 71 63 73 ++ trn1 v3.8h, v3.8h, v4.8h // 20 30 22 
32 60 70 62 72 ++ ld1 {v4.s}[0], [x0], x1 ++ trn2 v17.4s, v16.4s, v2.4s // 03 13 23 33 43 53 63 73 ++ trn1 v18.4s, v1.4s, v3.4s // 00 10 20 30 40 50 60 70 ++ trn1 v2.4s, v16.4s, v2.4s // 01 11 21 31 41 51 61 71 ++ mul v16.8h, v17.8h, v0.h[4] // 10/2 * src[3] ++ ld1 {v5.s}[1], [x0], x1 ++ mul v17.8h, v17.8h, v0.h[5] // 22/2 * src[3] ++ ld1 {v6.s}[1], [x0], x1 ++ trn2 v1.4s, v1.4s, v3.4s // 02 12 22 32 42 52 62 72 ++ mul v3.8h, v18.8h, v0.h[6] // 17 * src[0] ++ ld1 {v7.s}[1], [x0], x1 ++ mul v1.8h, v1.8h, v0.h[6] // 17 * src[2] ++ ld1 {v4.s}[1], [x0] ++ mla v16.8h, v2.8h, v0.h[5] // t3/2 = 22/2 * src[1] + 10/2 * src[3] ++ mls v17.8h, v2.8h, v0.h[4] // t4/2 = - 10/2 * src[1] + 22/2 * src[3] ++ add v2.8h, v3.8h, v1.8h // t1 = 17 * src[0] + 17 * src[2] ++ sub v1.8h, v3.8h, v1.8h // t2 = 17 * src[0] - 17 * src[2] ++ neg v3.8h, v16.8h // -t3/2 ++ ssra v16.8h, v2.8h, #1 // (t1 + t3) >> 1 ++ neg v18.8h, v17.8h // -t4/2 ++ ssra v17.8h, v1.8h, #1 // (t2 + t4) >> 1 ++ ssra v3.8h, v2.8h, #1 // (t1 - t3) >> 1 ++ ssra v18.8h, v1.8h, #1 // (t2 - t4) >> 1 ++ srshr v1.8h, v16.8h, #2 // (t1 + t3 + 64) >> 3 ++ srshr v2.8h, v17.8h, #2 // (t2 + t4 + 64) >> 3 ++ srshr v3.8h, v3.8h, #2 // (t1 - t3 + 64) >> 3 ++ srshr v16.8h, v18.8h, #2 // (t2 - t4 + 64) >> 3 ++ trn2 v17.8h, v2.8h, v3.8h // 12 13 32 33 52 53 72 73 ++ trn2 v18.8h, v1.8h, v16.8h // 10 11 30 31 50 51 70 71 ++ trn1 v1.8h, v1.8h, v16.8h // 00 01 20 21 40 41 60 61 ++ trn1 v2.8h, v2.8h, v3.8h // 02 03 22 23 42 43 62 63 ++ trn1 v3.4s, v18.4s, v17.4s // 10 11 12 13 50 51 52 53 ++ trn2 v16.4s, v18.4s, v17.4s // 30 31 32 33 70 71 72 73 ++ trn1 v17.4s, v1.4s, v2.4s // 00 01 02 03 40 41 42 43 ++ mov d18, v3.d[1] // 50 51 52 53 ++ shl v19.4h, v3.4h, #4 // 16 * src[8] ++ mov d20, v16.d[1] // 70 71 72 73 ++ shl v21.4h, v16.4h, #4 // 16 * src[24] ++ mov d22, v17.d[1] // 40 41 42 43 ++ shl v23.4h, v3.4h, #2 // 4 * src[8] ++ shl v24.4h, v18.4h, #4 // 16 * src[40] ++ shl v25.4h, v20.4h, #4 // 16 * src[56] ++ shl v26.4h, v18.4h, #2 // 4 * src[40] ++ trn2 v1.4s, v1.4s, v2.4s // 20 21 22 23 60 61 62 63 ++ ssra v24.4h, v21.4h, #2 // 4 * src[24] + 16 * src[40] ++ sub v2.4h, v25.4h, v23.4h // - 4 * src[8] + 16 * src[56] ++ shl v17.4h, v17.4h, #2 // 8/2 * src[0] ++ sub v21.4h, v21.4h, v26.4h // 16 * src[24] - 4 * src[40] ++ shl v22.4h, v22.4h, #2 // 8/2 * src[32] ++ mov d23, v1.d[1] // 60 61 62 63 ++ ssra v19.4h, v25.4h, #2 // 16 * src[8] + 4 * src[56] ++ mul v25.4h, v1.4h, v0.h[0] // 6/2 * src[16] ++ shl v1.4h, v1.4h, #3 // 16/2 * src[16] ++ mls v24.4h, v3.4h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40] ++ ssra v17.4h, v17.4h, #1 // 12/2 * src[0] ++ mls v21.4h, v3.4h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40] ++ ssra v22.4h, v22.4h, #1 // 12/2 * src[32] ++ mla v2.4h, v16.4h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56] ++ shl v3.4h, v23.4h, #3 // 16/2 * src[48] ++ mla v19.4h, v16.4h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56] ++ mla v1.4h, v23.4h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48] ++ mla v24.4h, v20.4h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56] ++ add v16.4h, v17.4h, v22.4h // t1/2 = 12/2 * src[0] + 12/2 * src[32] ++ sub v3.4h, v25.4h, v3.4h // t4/2 = 6/2 * src[16] - 16/2 * src[48] ++ sub v17.4h, v17.4h, v22.4h // t2/2 = 12/2 * src[0] - 12/2 * src[32] ++ mls v21.4h, v20.4h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56] ++ mla v19.4h, v18.4h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56] ++ add v20.4h, v16.4h, v1.4h // t5/2 = t1/2 + t3/2 
++ mls v2.4h, v18.4h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56] ++ sub v0.4h, v16.4h, v1.4h // t8/2 = t1/2 - t3/2 ++ add v18.4h, v17.4h, v3.4h // t6/2 = t2/2 + t4/2 ++ sub v22.4h, v17.4h, v3.4h // t7/2 = t2/2 - t4/2 ++ neg v23.4h, v24.4h // +t2 ++ sub v25.4h, v17.4h, v3.4h // t7/2 = t2/2 - t4/2 ++ add v3.4h, v17.4h, v3.4h // t6/2 = t2/2 + t4/2 ++ neg v17.4h, v21.4h // +t3 ++ sub v26.4h, v16.4h, v1.4h // t8/2 = t1/2 - t3/2 ++ add v1.4h, v16.4h, v1.4h // t5/2 = t1/2 + t3/2 ++ neg v16.4h, v19.4h // -t1 ++ neg v27.4h, v2.4h // +t4 ++ ssra v20.4h, v19.4h, #1 // (t5 + t1) >> 1 ++ srsra v0.4h, v2.4h, #1 // (t8 - t4 + 1) >> 1 ++ ssra v18.4h, v23.4h, #1 // (t6 + t2) >> 1 ++ srsra v22.4h, v21.4h, #1 // (t7 - t3 + 1) >> 1 ++ ssra v25.4h, v17.4h, #1 // (t7 + t3) >> 1 ++ srsra v3.4h, v24.4h, #1 // (t6 - t2 + 1) >> 1 ++ ssra v26.4h, v27.4h, #1 // (t8 + t4) >> 1 ++ srsra v1.4h, v16.4h, #1 // (t5 - t1 + 1) >> 1 ++ trn1 v0.2d, v20.2d, v0.2d ++ trn1 v2.2d, v18.2d, v22.2d ++ trn1 v3.2d, v25.2d, v3.2d ++ trn1 v1.2d, v26.2d, v1.2d ++ srshr v0.8h, v0.8h, #6 // (t5 + t1 + 64) >> 7, (t8 - t4 + 65) >> 7 ++ srshr v2.8h, v2.8h, #6 // (t6 + t2 + 64) >> 7, (t7 - t3 + 65) >> 7 ++ srshr v3.8h, v3.8h, #6 // (t7 + t3 + 64) >> 7, (t6 - t2 + 65) >> 7 ++ srshr v1.8h, v1.8h, #6 // (t8 + t4 + 64) >> 7, (t5 - t1 + 65) >> 7 ++ uaddw v0.8h, v0.8h, v5.8b ++ uaddw v2.8h, v2.8h, v6.8b ++ uaddw v3.8h, v3.8h, v7.8b ++ uaddw v1.8h, v1.8h, v4.8b ++ sqxtun v0.8b, v0.8h ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ sqxtun v1.8b, v1.8h ++ st1 {v0.s}[0], [x4], x1 ++ st1 {v2.s}[0], [x4], x1 ++ st1 {v3.s}[0], [x4], x1 ++ st1 {v1.s}[0], [x4], x1 ++ st1 {v0.s}[1], [x4], x1 ++ st1 {v2.s}[1], [x4], x1 ++ st1 {v3.s}[1], [x4], x1 ++ st1 {v1.s}[1], [x4] ++ ret ++endfunc ++ ++// VC-1 4x4 inverse transform ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients) ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_4x4_neon, export=1 ++ mov x3, #16 ++ ldr d0, .Lcoeffs_it4 ++ mov x4, x0 ++ ld1 {v1.d}[0], [x2], x3 // 00 01 02 03 ++ ld1 {v2.d}[0], [x2], x3 // 10 11 12 13 ++ ld1 {v3.d}[0], [x2], x3 // 20 21 22 23 ++ ld1 {v4.d}[0], [x2] // 30 31 32 33 ++ ld1 {v5.s}[0], [x0], x1 ++ ld1 {v5.s}[1], [x0], x1 ++ ld1 {v6.s}[0], [x0], x1 ++ trn2 v7.4h, v1.4h, v2.4h // 01 11 03 13 ++ trn1 v1.4h, v1.4h, v2.4h // 00 10 02 12 ++ ld1 {v6.s}[1], [x0] ++ trn2 v2.4h, v3.4h, v4.4h // 21 31 23 33 ++ trn1 v3.4h, v3.4h, v4.4h // 20 30 22 32 ++ trn2 v4.2s, v7.2s, v2.2s // 03 13 23 33 ++ trn1 v16.2s, v1.2s, v3.2s // 00 10 20 30 ++ trn1 v2.2s, v7.2s, v2.2s // 01 11 21 31 ++ trn2 v1.2s, v1.2s, v3.2s // 02 12 22 32 ++ mul v3.4h, v4.4h, v0.h[0] // 10/2 * src[3] ++ mul v4.4h, v4.4h, v0.h[1] // 22/2 * src[3] ++ mul v7.4h, v16.4h, v0.h[2] // 17 * src[0] ++ mul v1.4h, v1.4h, v0.h[2] // 17 * src[2] ++ mla v3.4h, v2.4h, v0.h[1] // t3/2 = 22/2 * src[1] + 10/2 * src[3] ++ mls v4.4h, v2.4h, v0.h[0] // t4/2 = - 10/2 * src[1] + 22/2 * src[3] ++ add v2.4h, v7.4h, v1.4h // t1 = 17 * src[0] + 17 * src[2] ++ sub v1.4h, v7.4h, v1.4h // t2 = 17 * src[0] - 17 * src[2] ++ neg v7.4h, v3.4h // -t3/2 ++ neg v16.4h, v4.4h // -t4/2 ++ ssra v3.4h, v2.4h, #1 // (t1 + t3) >> 1 ++ ssra v4.4h, v1.4h, #1 // (t2 + t4) >> 1 ++ ssra v16.4h, v1.4h, #1 // (t2 - t4) >> 1 ++ ssra v7.4h, v2.4h, #1 // (t1 - t3) >> 1 ++ srshr v1.4h, v3.4h, #2 // (t1 + t3 + 
64) >> 3 ++ srshr v2.4h, v4.4h, #2 // (t2 + t4 + 64) >> 3 ++ srshr v3.4h, v16.4h, #2 // (t2 - t4 + 64) >> 3 ++ srshr v4.4h, v7.4h, #2 // (t1 - t3 + 64) >> 3 ++ trn2 v7.4h, v1.4h, v3.4h // 10 11 30 31 ++ trn1 v1.4h, v1.4h, v3.4h // 00 01 20 21 ++ trn2 v3.4h, v2.4h, v4.4h // 12 13 32 33 ++ trn1 v2.4h, v2.4h, v4.4h // 02 03 22 23 ++ trn2 v4.2s, v7.2s, v3.2s // 30 31 32 33 ++ trn1 v16.2s, v1.2s, v2.2s // 00 01 02 03 ++ trn1 v3.2s, v7.2s, v3.2s // 10 11 12 13 ++ trn2 v1.2s, v1.2s, v2.2s // 20 21 22 23 ++ mul v2.4h, v4.4h, v0.h[1] // 22/2 * src[24] ++ mul v4.4h, v4.4h, v0.h[0] // 10/2 * src[24] ++ mul v7.4h, v16.4h, v0.h[2] // 17 * src[0] ++ mul v1.4h, v1.4h, v0.h[2] // 17 * src[16] ++ mls v2.4h, v3.4h, v0.h[0] // t4/2 = - 10/2 * src[8] + 22/2 * src[24] ++ mla v4.4h, v3.4h, v0.h[1] // t3/2 = 22/2 * src[8] + 10/2 * src[24] ++ add v0.4h, v7.4h, v1.4h // t1 = 17 * src[0] + 17 * src[16] ++ sub v1.4h, v7.4h, v1.4h // t2 = 17 * src[0] - 17 * src[16] ++ neg v3.4h, v2.4h // -t4/2 ++ neg v7.4h, v4.4h // -t3/2 ++ ssra v4.4h, v0.4h, #1 // (t1 + t3) >> 1 ++ ssra v3.4h, v1.4h, #1 // (t2 - t4) >> 1 ++ ssra v2.4h, v1.4h, #1 // (t2 + t4) >> 1 ++ ssra v7.4h, v0.4h, #1 // (t1 - t3) >> 1 ++ trn1 v0.2d, v4.2d, v3.2d ++ trn1 v1.2d, v2.2d, v7.2d ++ srshr v0.8h, v0.8h, #6 // (t1 + t3 + 64) >> 7, (t2 - t4 + 64) >> 7 ++ srshr v1.8h, v1.8h, #6 // (t2 + t4 + 64) >> 7, (t1 - t3 + 64) >> 7 ++ uaddw v0.8h, v0.8h, v5.8b ++ uaddw v1.8h, v1.8h, v6.8b ++ sqxtun v0.8b, v0.8h ++ sqxtun v1.8b, v1.8h ++ st1 {v0.s}[0], [x4], x1 ++ st1 {v0.s}[1], [x4], x1 ++ st1 {v1.s}[0], [x4], x1 ++ st1 {v1.s}[1], [x4] ++ ret ++endfunc ++ ++// VC-1 8x8 inverse transform, DC case ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> 16-bit inverse transform DC coefficient ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_8x8_dc_neon, export=1 ++ ldrsh w2, [x2] ++ mov x3, x0 ++ ld1 {v0.8b}, [x0], x1 ++ ld1 {v1.8b}, [x0], x1 ++ ld1 {v2.8b}, [x0], x1 ++ add w2, w2, w2, lsl #1 ++ ld1 {v3.8b}, [x0], x1 ++ ld1 {v4.8b}, [x0], x1 ++ add w2, w2, #1 ++ ld1 {v5.8b}, [x0], x1 ++ asr w2, w2, #1 ++ ld1 {v6.8b}, [x0], x1 ++ add w2, w2, w2, lsl #1 ++ ld1 {v7.8b}, [x0] ++ add w0, w2, #16 ++ asr w0, w0, #5 ++ dup v16.8h, w0 ++ uaddw v0.8h, v16.8h, v0.8b ++ uaddw v1.8h, v16.8h, v1.8b ++ uaddw v2.8h, v16.8h, v2.8b ++ uaddw v3.8h, v16.8h, v3.8b ++ uaddw v4.8h, v16.8h, v4.8b ++ uaddw v5.8h, v16.8h, v5.8b ++ sqxtun v0.8b, v0.8h ++ uaddw v6.8h, v16.8h, v6.8b ++ sqxtun v1.8b, v1.8h ++ uaddw v7.8h, v16.8h, v7.8b ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ sqxtun v4.8b, v4.8h ++ st1 {v0.8b}, [x3], x1 ++ sqxtun v0.8b, v5.8h ++ st1 {v1.8b}, [x3], x1 ++ sqxtun v1.8b, v6.8h ++ st1 {v2.8b}, [x3], x1 ++ sqxtun v2.8b, v7.8h ++ st1 {v3.8b}, [x3], x1 ++ st1 {v4.8b}, [x3], x1 ++ st1 {v0.8b}, [x3], x1 ++ st1 {v1.8b}, [x3], x1 ++ st1 {v2.8b}, [x3] ++ ret ++endfunc ++ ++// VC-1 8x4 inverse transform, DC case ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> 16-bit inverse transform DC coefficient ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_8x4_dc_neon, export=1 ++ ldrsh w2, [x2] ++ mov x3, x0 ++ ld1 {v0.8b}, [x0], x1 ++ ld1 {v1.8b}, [x0], x1 ++ ld1 {v2.8b}, [x0], x1 ++ add w2, w2, w2, lsl #1 ++ ld1 {v3.8b}, [x0] ++ add w0, w2, #1 ++ asr w0, w0, #1 ++ add w0, w0, w0, lsl #4 ++ add w0, w0, #64 ++ asr 
w0, w0, #7 ++ dup v4.8h, w0 ++ uaddw v0.8h, v4.8h, v0.8b ++ uaddw v1.8h, v4.8h, v1.8b ++ uaddw v2.8h, v4.8h, v2.8b ++ uaddw v3.8h, v4.8h, v3.8b ++ sqxtun v0.8b, v0.8h ++ sqxtun v1.8b, v1.8h ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ st1 {v0.8b}, [x3], x1 ++ st1 {v1.8b}, [x3], x1 ++ st1 {v2.8b}, [x3], x1 ++ st1 {v3.8b}, [x3] ++ ret ++endfunc ++ ++// VC-1 4x8 inverse transform, DC case ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> 16-bit inverse transform DC coefficient ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_4x8_dc_neon, export=1 ++ ldrsh w2, [x2] ++ mov x3, x0 ++ ld1 {v0.s}[0], [x0], x1 ++ ld1 {v1.s}[0], [x0], x1 ++ ld1 {v2.s}[0], [x0], x1 ++ add w2, w2, w2, lsl #4 ++ ld1 {v3.s}[0], [x0], x1 ++ add w2, w2, #4 ++ asr w2, w2, #3 ++ add w2, w2, w2, lsl #1 ++ ld1 {v0.s}[1], [x0], x1 ++ add w2, w2, #16 ++ asr w2, w2, #5 ++ dup v4.8h, w2 ++ ld1 {v1.s}[1], [x0], x1 ++ ld1 {v2.s}[1], [x0], x1 ++ ld1 {v3.s}[1], [x0] ++ uaddw v0.8h, v4.8h, v0.8b ++ uaddw v1.8h, v4.8h, v1.8b ++ uaddw v2.8h, v4.8h, v2.8b ++ uaddw v3.8h, v4.8h, v3.8b ++ sqxtun v0.8b, v0.8h ++ sqxtun v1.8b, v1.8h ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ st1 {v0.s}[0], [x3], x1 ++ st1 {v1.s}[0], [x3], x1 ++ st1 {v2.s}[0], [x3], x1 ++ st1 {v3.s}[0], [x3], x1 ++ st1 {v0.s}[1], [x3], x1 ++ st1 {v1.s}[1], [x3], x1 ++ st1 {v2.s}[1], [x3], x1 ++ st1 {v3.s}[1], [x3] ++ ret ++endfunc ++ ++// VC-1 4x4 inverse transform, DC case ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> 16-bit inverse transform DC coefficient ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_4x4_dc_neon, export=1 ++ ldrsh w2, [x2] ++ mov x3, x0 ++ ld1 {v0.s}[0], [x0], x1 ++ ld1 {v1.s}[0], [x0], x1 ++ ld1 {v0.s}[1], [x0], x1 ++ add w2, w2, w2, lsl #4 ++ ld1 {v1.s}[1], [x0] ++ add w0, w2, #4 ++ asr w0, w0, #3 ++ add w0, w0, w0, lsl #4 ++ add w0, w0, #64 ++ asr w0, w0, #7 ++ dup v2.8h, w0 ++ uaddw v0.8h, v2.8h, v0.8b ++ uaddw v1.8h, v2.8h, v1.8b ++ sqxtun v0.8b, v0.8h ++ sqxtun v1.8b, v1.8h ++ st1 {v0.s}[0], [x3], x1 ++ st1 {v1.s}[0], [x3], x1 ++ st1 {v0.s}[1], [x3], x1 ++ st1 {v1.s}[1], [x3] ++ ret ++endfunc ++ ++.align 5 ++.Lcoeffs_it8: ++.quad 0x000F00090003 ++.Lcoeffs_it4: ++.quad 0x0011000B0005 ++.Lcoeffs: ++.quad 0x00050002 ++ ++// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks ++// On entry: ++// x0 -> top-left pel of lower block ++// x1 = row stride, bytes ++// w2 = PQUANT bitstream parameter ++function ff_vc1_v_loop_filter4_neon, export=1 ++ sub x3, x0, w1, sxtw #2 ++ ldr d0, .Lcoeffs ++ ld1 {v1.s}[0], [x0], x1 // P5 ++ ld1 {v2.s}[0], [x3], x1 // P1 ++ ld1 {v3.s}[0], [x3], x1 // P2 ++ ld1 {v4.s}[0], [x0], x1 // P6 ++ ld1 {v5.s}[0], [x3], x1 // P3 ++ ld1 {v6.s}[0], [x0], x1 // P7 ++ ld1 {v7.s}[0], [x3] // P4 ++ ld1 {v16.s}[0], [x0] // P8 ++ ushll v17.8h, v1.8b, #1 // 2*P5 ++ dup v18.8h, w2 // pq ++ ushll v2.8h, v2.8b, #1 // 2*P1 ++ uxtl v3.8h, v3.8b // P2 ++ uxtl v4.8h, v4.8b // P6 ++ uxtl v19.8h, v5.8b // P3 ++ mls v2.4h, v3.4h, v0.h[1] // 2*P1-5*P2 ++ uxtl v3.8h, v6.8b // P7 ++ mls v17.4h, v4.4h, v0.h[1] // 2*P5-5*P6 ++ ushll v5.8h, v5.8b, #1 // 2*P3 ++ uxtl v6.8h, v7.8b // P4 ++ mla v17.4h, v3.4h, v0.h[1] // 2*P5-5*P6+5*P7 ++ uxtl v3.8h, v16.8b // P8 ++ mla v2.4h, v19.4h, v0.h[1] // 2*P1-5*P2+5*P3 ++ uxtl 
v1.8h, v1.8b // P5 ++ mls v5.4h, v6.4h, v0.h[1] // 2*P3-5*P4 ++ mls v17.4h, v3.4h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8 ++ sub v3.4h, v6.4h, v1.4h // P4-P5 ++ mls v2.4h, v6.4h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4 ++ mla v5.4h, v1.4h, v0.h[1] // 2*P3-5*P4+5*P5 ++ mls v5.4h, v4.4h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6 ++ abs v4.4h, v3.4h ++ srshr v7.4h, v17.4h, #3 ++ srshr v2.4h, v2.4h, #3 ++ sshr v4.4h, v4.4h, #1 // clip ++ srshr v5.4h, v5.4h, #3 ++ abs v7.4h, v7.4h // a2 ++ sshr v3.4h, v3.4h, #8 // clip_sign ++ abs v2.4h, v2.4h // a1 ++ cmeq v16.4h, v4.4h, #0 // test clip == 0 ++ abs v17.4h, v5.4h // a0 ++ sshr v5.4h, v5.4h, #8 // a0_sign ++ cmhs v19.4h, v2.4h, v7.4h // test a1 >= a2 ++ cmhs v18.4h, v17.4h, v18.4h // test a0 >= pq ++ sub v3.4h, v3.4h, v5.4h // clip_sign - a0_sign ++ bsl v19.8b, v7.8b, v2.8b // a3 ++ orr v2.8b, v16.8b, v18.8b // test clip == 0 || a0 >= pq ++ uqsub v5.4h, v17.4h, v19.4h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ cmhs v7.4h, v19.4h, v17.4h // test a3 >= a0 ++ mul v0.4h, v5.4h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0 ++ orr v5.8b, v2.8b, v7.8b // test clip == 0 || a0 >= pq || a3 >= a0 ++ mov w0, v5.s[1] // move to gp reg ++ ushr v0.4h, v0.4h, #3 // a0 >= a3 ? (5*(a0-a3))>>3 : 0 ++ cmhs v5.4h, v0.4h, v4.4h ++ tbnz w0, #0, 1f // none of the 4 pixel pairs should be updated if this one is not filtered ++ bsl v5.8b, v4.8b, v0.8b // FFMIN(d, clip) ++ bic v0.8b, v5.8b, v2.8b // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) ++ mls v6.4h, v0.4h, v3.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ mla v1.4h, v0.4h, v3.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ sqxtun v0.8b, v6.8h ++ sqxtun v1.8b, v1.8h ++ st1 {v0.s}[0], [x3], x1 ++ st1 {v1.s}[0], [x3] ++1: ret ++endfunc ++ ++// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks ++// On entry: ++// x0 -> top-left pel of right block ++// x1 = row stride, bytes ++// w2 = PQUANT bitstream parameter ++function ff_vc1_h_loop_filter4_neon, export=1 ++ sub x3, x0, #4 // where to start reading ++ ldr d0, .Lcoeffs ++ ld1 {v1.8b}, [x3], x1 ++ sub x0, x0, #1 // where to start writing ++ ld1 {v2.8b}, [x3], x1 ++ ld1 {v3.8b}, [x3], x1 ++ ld1 {v4.8b}, [x3] ++ dup v5.8h, w2 // pq ++ trn1 v6.8b, v1.8b, v2.8b ++ trn2 v1.8b, v1.8b, v2.8b ++ trn1 v2.8b, v3.8b, v4.8b ++ trn2 v3.8b, v3.8b, v4.8b ++ trn1 v4.4h, v6.4h, v2.4h // P1, P5 ++ trn1 v7.4h, v1.4h, v3.4h // P2, P6 ++ trn2 v2.4h, v6.4h, v2.4h // P3, P7 ++ trn2 v1.4h, v1.4h, v3.4h // P4, P8 ++ ushll v3.8h, v4.8b, #1 // 2*P1, 2*P5 ++ uxtl v6.8h, v7.8b // P2, P6 ++ uxtl v7.8h, v2.8b // P3, P7 ++ uxtl v1.8h, v1.8b // P4, P8 ++ mls v3.8h, v6.8h, v0.h[1] // 2*P1-5*P2, 2*P5-5*P6 ++ ushll v2.8h, v2.8b, #1 // 2*P3, 2*P7 ++ uxtl v4.8h, v4.8b // P1, P5 ++ mla v3.8h, v7.8h, v0.h[1] // 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7 ++ mov d6, v6.d[1] // P6 ++ mls v3.8h, v1.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8 ++ mov d4, v4.d[1] // P5 ++ mls v2.4h, v1.4h, v0.h[1] // 2*P3-5*P4 ++ mla v2.4h, v4.4h, v0.h[1] // 2*P3-5*P4+5*P5 ++ sub v7.4h, v1.4h, v4.4h // P4-P5 ++ mls v2.4h, v6.4h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6 ++ srshr v3.8h, v3.8h, #3 ++ abs v6.4h, v7.4h ++ sshr v7.4h, v7.4h, #8 // clip_sign ++ srshr v2.4h, v2.4h, #3 ++ abs v3.8h, v3.8h // a1, a2 ++ sshr v6.4h, 
v6.4h, #1 // clip ++ mov d16, v3.d[1] // a2 ++ abs v17.4h, v2.4h // a0 ++ cmeq v18.4h, v6.4h, #0 // test clip == 0 ++ sshr v2.4h, v2.4h, #8 // a0_sign ++ cmhs v19.4h, v3.4h, v16.4h // test a1 >= a2 ++ cmhs v5.4h, v17.4h, v5.4h // test a0 >= pq ++ sub v2.4h, v7.4h, v2.4h // clip_sign - a0_sign ++ bsl v19.8b, v16.8b, v3.8b // a3 ++ orr v3.8b, v18.8b, v5.8b // test clip == 0 || a0 >= pq ++ uqsub v5.4h, v17.4h, v19.4h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ cmhs v7.4h, v19.4h, v17.4h // test a3 >= a0 ++ mul v0.4h, v5.4h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0 ++ orr v5.8b, v3.8b, v7.8b // test clip == 0 || a0 >= pq || a3 >= a0 ++ mov w2, v5.s[1] // move to gp reg ++ ushr v0.4h, v0.4h, #3 // a0 >= a3 ? (5*(a0-a3))>>3 : 0 ++ cmhs v5.4h, v0.4h, v6.4h ++ tbnz w2, #0, 1f // none of the 4 pixel pairs should be updated if this one is not filtered ++ bsl v5.8b, v6.8b, v0.8b // FFMIN(d, clip) ++ bic v0.8b, v5.8b, v3.8b // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) ++ mla v4.4h, v0.4h, v2.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ mls v1.4h, v0.4h, v2.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ sqxtun v3.8b, v4.8h ++ sqxtun v2.8b, v1.8h ++ st2 {v2.b, v3.b}[0], [x0], x1 ++ st2 {v2.b, v3.b}[1], [x0], x1 ++ st2 {v2.b, v3.b}[2], [x0], x1 ++ st2 {v2.b, v3.b}[3], [x0] ++1: ret ++endfunc ++ ++// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks ++// On entry: ++// x0 -> top-left pel of lower block ++// x1 = row stride, bytes ++// w2 = PQUANT bitstream parameter ++function ff_vc1_v_loop_filter8_neon, export=1 ++ sub x3, x0, w1, sxtw #2 ++ ldr d0, .Lcoeffs ++ ld1 {v1.8b}, [x0], x1 // P5 ++ movi v2.2d, #0x0000ffff00000000 ++ ld1 {v3.8b}, [x3], x1 // P1 ++ ld1 {v4.8b}, [x3], x1 // P2 ++ ld1 {v5.8b}, [x0], x1 // P6 ++ ld1 {v6.8b}, [x3], x1 // P3 ++ ld1 {v7.8b}, [x0], x1 // P7 ++ ushll v16.8h, v1.8b, #1 // 2*P5 ++ ushll v3.8h, v3.8b, #1 // 2*P1 ++ ld1 {v17.8b}, [x3] // P4 ++ uxtl v4.8h, v4.8b // P2 ++ ld1 {v18.8b}, [x0] // P8 ++ uxtl v5.8h, v5.8b // P6 ++ dup v19.8h, w2 // pq ++ uxtl v20.8h, v6.8b // P3 ++ mls v3.8h, v4.8h, v0.h[1] // 2*P1-5*P2 ++ uxtl v4.8h, v7.8b // P7 ++ ushll v6.8h, v6.8b, #1 // 2*P3 ++ mls v16.8h, v5.8h, v0.h[1] // 2*P5-5*P6 ++ uxtl v7.8h, v17.8b // P4 ++ uxtl v17.8h, v18.8b // P8 ++ mla v16.8h, v4.8h, v0.h[1] // 2*P5-5*P6+5*P7 ++ uxtl v1.8h, v1.8b // P5 ++ mla v3.8h, v20.8h, v0.h[1] // 2*P1-5*P2+5*P3 ++ sub v4.8h, v7.8h, v1.8h // P4-P5 ++ mls v6.8h, v7.8h, v0.h[1] // 2*P3-5*P4 ++ mls v16.8h, v17.8h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8 ++ abs v17.8h, v4.8h ++ sshr v4.8h, v4.8h, #8 // clip_sign ++ mls v3.8h, v7.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4 ++ sshr v17.8h, v17.8h, #1 // clip ++ mla v6.8h, v1.8h, v0.h[1] // 2*P3-5*P4+5*P5 ++ srshr v16.8h, v16.8h, #3 ++ mls v6.8h, v5.8h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6 ++ cmeq v5.8h, v17.8h, #0 // test clip == 0 ++ srshr v3.8h, v3.8h, #3 ++ abs v16.8h, v16.8h // a2 ++ abs v3.8h, v3.8h // a1 ++ srshr v6.8h, v6.8h, #3 ++ cmhs v18.8h, v3.8h, v16.8h // test a1 >= a2 ++ abs v20.8h, v6.8h // a0 ++ sshr v6.8h, v6.8h, #8 // a0_sign ++ bsl v18.16b, v16.16b, v3.16b // a3 ++ cmhs v3.8h, v20.8h, v19.8h // test a0 >= pq ++ sub v4.8h, v4.8h, v6.8h // clip_sign - a0_sign ++ uqsub v6.8h, v20.8h, v18.8h // a0 >= a3 ? 
a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ cmhs v16.8h, v18.8h, v20.8h // test a3 >= a0 ++ orr v3.16b, v5.16b, v3.16b // test clip == 0 || a0 >= pq ++ mul v0.8h, v6.8h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0 ++ orr v5.16b, v3.16b, v16.16b // test clip == 0 || a0 >= pq || a3 >= a0 ++ cmtst v2.2d, v5.2d, v2.2d // if 2nd of each group of is not filtered, then none of the others in the group should be either ++ mov w0, v5.s[1] // move to gp reg ++ ushr v0.8h, v0.8h, #3 // a0 >= a3 ? (5*(a0-a3))>>3 : 0 ++ mov w2, v5.s[3] ++ orr v2.16b, v3.16b, v2.16b ++ cmhs v3.8h, v0.8h, v17.8h ++ and w0, w0, w2 ++ bsl v3.16b, v17.16b, v0.16b // FFMIN(d, clip) ++ tbnz w0, #0, 1f // none of the 8 pixel pairs should be updated in this case ++ bic v0.16b, v3.16b, v2.16b // set each d to zero if it should not be filtered ++ mls v7.8h, v0.8h, v4.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ mla v1.8h, v0.8h, v4.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ sqxtun v0.8b, v7.8h ++ sqxtun v1.8b, v1.8h ++ st1 {v0.8b}, [x3], x1 ++ st1 {v1.8b}, [x3] ++1: ret ++endfunc ++ ++// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks ++// On entry: ++// x0 -> top-left pel of right block ++// x1 = row stride, bytes ++// w2 = PQUANT bitstream parameter ++function ff_vc1_h_loop_filter8_neon, export=1 ++ sub x3, x0, #4 // where to start reading ++ ldr d0, .Lcoeffs ++ ld1 {v1.8b}, [x3], x1 // P1[0], P2[0]... ++ sub x0, x0, #1 // where to start writing ++ ld1 {v2.8b}, [x3], x1 ++ add x4, x0, x1, lsl #2 ++ ld1 {v3.8b}, [x3], x1 ++ ld1 {v4.8b}, [x3], x1 ++ ld1 {v5.8b}, [x3], x1 ++ ld1 {v6.8b}, [x3], x1 ++ ld1 {v7.8b}, [x3], x1 ++ trn1 v16.8b, v1.8b, v2.8b // P1[0], P1[1], P3[0]... ++ ld1 {v17.8b}, [x3] ++ trn2 v1.8b, v1.8b, v2.8b // P2[0], P2[1], P4[0]... ++ trn1 v2.8b, v3.8b, v4.8b // P1[2], P1[3], P3[2]... ++ trn2 v3.8b, v3.8b, v4.8b // P2[2], P2[3], P4[2]... ++ dup v4.8h, w2 // pq ++ trn1 v18.8b, v5.8b, v6.8b // P1[4], P1[5], P3[4]... ++ trn2 v5.8b, v5.8b, v6.8b // P2[4], P2[5], P4[4]... ++ trn1 v6.4h, v16.4h, v2.4h // P1[0], P1[1], P1[2], P1[3], P5[0]... ++ trn1 v19.4h, v1.4h, v3.4h // P2[0], P2[1], P2[2], P2[3], P6[0]... ++ trn1 v20.8b, v7.8b, v17.8b // P1[6], P1[7], P3[6]... ++ trn2 v7.8b, v7.8b, v17.8b // P2[6], P2[7], P4[6]... ++ trn2 v2.4h, v16.4h, v2.4h // P3[0], P3[1], P3[2], P3[3], P7[0]... ++ trn2 v1.4h, v1.4h, v3.4h // P4[0], P4[1], P4[2], P4[3], P8[0]... ++ trn1 v3.4h, v18.4h, v20.4h // P1[4], P1[5], P1[6], P1[7], P5[4]... ++ trn1 v16.4h, v5.4h, v7.4h // P2[4], P2[5], P2[6], P2[7], P6[4]... ++ trn2 v17.4h, v18.4h, v20.4h // P3[4], P3[5], P3[6], P3[7], P7[4]... ++ trn2 v5.4h, v5.4h, v7.4h // P4[4], P4[5], P4[6], P4[7], P8[4]... 
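For reference, the 4-, 8- and 16-pair loop-filter entry points in this file all evaluate the same per-edge decision, which the inline comments spell out term by term (a0, a1, a2, a3, clip, d). A scalar sketch of that decision is given below; it is reconstructed from the comments only and is illustrative, not the upstream scalar implementation, and the >> on possibly-negative values stands for the arithmetic (srshr/sshr) shifts used in the assembly.

    #include <stdlib.h>

    /* One edge position: P[1]..P[8] run across the block edge, with P[4] and
     * P[5] straddling it; pq is the PQUANT parameter passed in w2/r2.
     * Returns nonzero when the filter-enable condition holds, i.e. the
     * per-position flag the NEON code tests. */
    static int vc1_edge_filter_sketch(int P[9], int pq)
    {
        int a1 = abs((2 * P[1] - 5 * P[2] + 5 * P[3] - 2 * P[4] + 4) >> 3);
        int a2 = abs((2 * P[5] - 5 * P[6] + 5 * P[7] - 2 * P[8] + 4) >> 3);
        int t  =      (2 * P[3] - 5 * P[4] + 5 * P[5] - 2 * P[6] + 4) >> 3;
        int a0 = abs(t);
        int a3 = a1 < a2 ? a1 : a2;                /* FFMIN(a1, a2) */
        int clip = abs(P[4] - P[5]) >> 1;

        if (clip == 0 || a0 >= pq || a3 >= a0)     /* edge left untouched */
            return 0;

        int d = (5 * (a0 - a3)) >> 3;
        if (d > clip)
            d = clip;                              /* FFMIN(d, clip) */

        /* "clip_sign - a0_sign": no change when the two signs match,
         * otherwise P4 and P5 are pulled towards each other by d */
        int clip_sign = (P[4] - P[5]) < 0 ? -1 : 0;
        int a0_sign   = t < 0 ? -1 : 0;
        int f = clip_sign - a0_sign;
        P[4] -= f * d;
        P[5] += f * d;
        return 1;
    }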
++ trn1 v7.2s, v6.2s, v3.2s // P1 ++ trn1 v18.2s, v19.2s, v16.2s // P2 ++ trn2 v3.2s, v6.2s, v3.2s // P5 ++ trn2 v6.2s, v19.2s, v16.2s // P6 ++ trn1 v16.2s, v2.2s, v17.2s // P3 ++ trn2 v2.2s, v2.2s, v17.2s // P7 ++ ushll v7.8h, v7.8b, #1 // 2*P1 ++ trn1 v17.2s, v1.2s, v5.2s // P4 ++ ushll v19.8h, v3.8b, #1 // 2*P5 ++ trn2 v1.2s, v1.2s, v5.2s // P8 ++ uxtl v5.8h, v18.8b // P2 ++ uxtl v6.8h, v6.8b // P6 ++ uxtl v18.8h, v16.8b // P3 ++ mls v7.8h, v5.8h, v0.h[1] // 2*P1-5*P2 ++ uxtl v2.8h, v2.8b // P7 ++ ushll v5.8h, v16.8b, #1 // 2*P3 ++ mls v19.8h, v6.8h, v0.h[1] // 2*P5-5*P6 ++ uxtl v16.8h, v17.8b // P4 ++ uxtl v1.8h, v1.8b // P8 ++ mla v19.8h, v2.8h, v0.h[1] // 2*P5-5*P6+5*P7 ++ uxtl v2.8h, v3.8b // P5 ++ mla v7.8h, v18.8h, v0.h[1] // 2*P1-5*P2+5*P3 ++ sub v3.8h, v16.8h, v2.8h // P4-P5 ++ mls v5.8h, v16.8h, v0.h[1] // 2*P3-5*P4 ++ mls v19.8h, v1.8h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8 ++ abs v1.8h, v3.8h ++ sshr v3.8h, v3.8h, #8 // clip_sign ++ mls v7.8h, v16.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4 ++ sshr v1.8h, v1.8h, #1 // clip ++ mla v5.8h, v2.8h, v0.h[1] // 2*P3-5*P4+5*P5 ++ srshr v17.8h, v19.8h, #3 ++ mls v5.8h, v6.8h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6 ++ cmeq v6.8h, v1.8h, #0 // test clip == 0 ++ srshr v7.8h, v7.8h, #3 ++ abs v17.8h, v17.8h // a2 ++ abs v7.8h, v7.8h // a1 ++ srshr v5.8h, v5.8h, #3 ++ cmhs v18.8h, v7.8h, v17.8h // test a1 >= a2 ++ abs v19.8h, v5.8h // a0 ++ sshr v5.8h, v5.8h, #8 // a0_sign ++ bsl v18.16b, v17.16b, v7.16b // a3 ++ cmhs v4.8h, v19.8h, v4.8h // test a0 >= pq ++ sub v3.8h, v3.8h, v5.8h // clip_sign - a0_sign ++ uqsub v5.8h, v19.8h, v18.8h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ cmhs v7.8h, v18.8h, v19.8h // test a3 >= a0 ++ orr v4.16b, v6.16b, v4.16b // test clip == 0 || a0 >= pq ++ mul v0.8h, v5.8h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0 ++ orr v5.16b, v4.16b, v7.16b // test clip == 0 || a0 >= pq || a3 >= a0 ++ mov w2, v5.s[1] // move to gp reg ++ ushr v0.8h, v0.8h, #3 // a0 >= a3 ? 
(5*(a0-a3))>>3 : 0 ++ mov w3, v5.s[3] ++ cmhs v5.8h, v0.8h, v1.8h ++ and w5, w2, w3 ++ bsl v5.16b, v1.16b, v0.16b // FFMIN(d, clip) ++ tbnz w5, #0, 2f // none of the 8 pixel pairs should be updated in this case ++ bic v0.16b, v5.16b, v4.16b // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) ++ mla v2.8h, v0.8h, v3.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ mls v16.8h, v0.8h, v3.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ sqxtun v1.8b, v2.8h ++ sqxtun v0.8b, v16.8h ++ tbnz w2, #0, 1f // none of the first 4 pixel pairs should be updated if so ++ st2 {v0.b, v1.b}[0], [x0], x1 ++ st2 {v0.b, v1.b}[1], [x0], x1 ++ st2 {v0.b, v1.b}[2], [x0], x1 ++ st2 {v0.b, v1.b}[3], [x0] ++1: tbnz w3, #0, 2f // none of the second 4 pixel pairs should be updated if so ++ st2 {v0.b, v1.b}[4], [x4], x1 ++ st2 {v0.b, v1.b}[5], [x4], x1 ++ st2 {v0.b, v1.b}[6], [x4], x1 ++ st2 {v0.b, v1.b}[7], [x4] ++2: ret ++endfunc ++ ++// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks ++// On entry: ++// x0 -> top-left pel of lower block ++// x1 = row stride, bytes ++// w2 = PQUANT bitstream parameter ++function ff_vc1_v_loop_filter16_neon, export=1 ++ sub x3, x0, w1, sxtw #2 ++ ldr d0, .Lcoeffs ++ ld1 {v1.16b}, [x0], x1 // P5 ++ movi v2.2d, #0x0000ffff00000000 ++ ld1 {v3.16b}, [x3], x1 // P1 ++ ld1 {v4.16b}, [x3], x1 // P2 ++ ld1 {v5.16b}, [x0], x1 // P6 ++ ld1 {v6.16b}, [x3], x1 // P3 ++ ld1 {v7.16b}, [x0], x1 // P7 ++ ushll v16.8h, v1.8b, #1 // 2*P5[0..7] ++ ushll v17.8h, v3.8b, #1 // 2*P1[0..7] ++ ld1 {v18.16b}, [x3] // P4 ++ uxtl v19.8h, v4.8b // P2[0..7] ++ ld1 {v20.16b}, [x0] // P8 ++ uxtl v21.8h, v5.8b // P6[0..7] ++ dup v22.8h, w2 // pq ++ ushll2 v3.8h, v3.16b, #1 // 2*P1[8..15] ++ mls v17.8h, v19.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7] ++ ushll2 v19.8h, v1.16b, #1 // 2*P5[8..15] ++ uxtl2 v4.8h, v4.16b // P2[8..15] ++ mls v16.8h, v21.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7] ++ uxtl2 v5.8h, v5.16b // P6[8..15] ++ uxtl v23.8h, v6.8b // P3[0..7] ++ uxtl v24.8h, v7.8b // P7[0..7] ++ mls v3.8h, v4.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15] ++ ushll v4.8h, v6.8b, #1 // 2*P3[0..7] ++ uxtl v25.8h, v18.8b // P4[0..7] ++ mls v19.8h, v5.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15] ++ uxtl2 v26.8h, v6.16b // P3[8..15] ++ mla v17.8h, v23.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7] ++ uxtl2 v7.8h, v7.16b // P7[8..15] ++ ushll2 v6.8h, v6.16b, #1 // 2*P3[8..15] ++ mla v16.8h, v24.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7] ++ uxtl2 v18.8h, v18.16b // P4[8..15] ++ uxtl v23.8h, v20.8b // P8[0..7] ++ mls v4.8h, v25.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7] ++ uxtl v24.8h, v1.8b // P5[0..7] ++ uxtl2 v20.8h, v20.16b // P8[8..15] ++ mla v3.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15] ++ uxtl2 v1.8h, v1.16b // P5[8..15] ++ sub v26.8h, v25.8h, v24.8h // P4[0..7]-P5[0..7] ++ mla v19.8h, v7.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15] ++ sub v7.8h, v18.8h, v1.8h // P4[8..15]-P5[8..15] ++ mls v6.8h, v18.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15] ++ abs v27.8h, v26.8h ++ sshr v26.8h, v26.8h, #8 // clip_sign[0..7] ++ mls v17.8h, v25.8h, v0.h[0] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7] ++ abs v28.8h, v7.8h ++ sshr v27.8h, v27.8h, #1 // clip[0..7] ++ mls v16.8h, v23.8h, v0.h[0] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7] ++ sshr v7.8h, v7.8h, #8 // clip_sign[8..15] ++ sshr v23.8h, v28.8h, #1 // 
clip[8..15] ++ mla v4.8h, v24.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7] ++ cmeq v28.8h, v27.8h, #0 // test clip[0..7] == 0 ++ srshr v17.8h, v17.8h, #3 ++ mls v3.8h, v18.8h, v0.h[0] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15] ++ cmeq v29.8h, v23.8h, #0 // test clip[8..15] == 0 ++ srshr v16.8h, v16.8h, #3 ++ mls v19.8h, v20.8h, v0.h[0] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15] ++ abs v17.8h, v17.8h // a1[0..7] ++ mla v6.8h, v1.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15] ++ srshr v3.8h, v3.8h, #3 ++ mls v4.8h, v21.8h, v0.h[0] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7] ++ abs v16.8h, v16.8h // a2[0..7] ++ srshr v19.8h, v19.8h, #3 ++ mls v6.8h, v5.8h, v0.h[0] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15] ++ cmhs v5.8h, v17.8h, v16.8h // test a1[0..7] >= a2[0..7] ++ abs v3.8h, v3.8h // a1[8..15] ++ srshr v4.8h, v4.8h, #3 ++ abs v19.8h, v19.8h // a2[8..15] ++ bsl v5.16b, v16.16b, v17.16b // a3[0..7] ++ srshr v6.8h, v6.8h, #3 ++ cmhs v16.8h, v3.8h, v19.8h // test a1[8..15] >= a2[8.15] ++ abs v17.8h, v4.8h // a0[0..7] ++ sshr v4.8h, v4.8h, #8 // a0_sign[0..7] ++ bsl v16.16b, v19.16b, v3.16b // a3[8..15] ++ uqsub v3.8h, v17.8h, v5.8h // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ abs v19.8h, v6.8h // a0[8..15] ++ cmhs v20.8h, v17.8h, v22.8h // test a0[0..7] >= pq ++ cmhs v5.8h, v5.8h, v17.8h // test a3[0..7] >= a0[0..7] ++ sub v4.8h, v26.8h, v4.8h // clip_sign[0..7] - a0_sign[0..7] ++ sshr v6.8h, v6.8h, #8 // a0_sign[8..15] ++ mul v3.8h, v3.8h, v0.h[1] // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0 ++ uqsub v17.8h, v19.8h, v16.8h // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ orr v20.16b, v28.16b, v20.16b // test clip[0..7] == 0 || a0[0..7] >= pq ++ cmhs v21.8h, v19.8h, v22.8h // test a0[8..15] >= pq ++ cmhs v16.8h, v16.8h, v19.8h // test a3[8..15] >= a0[8..15] ++ mul v0.8h, v17.8h, v0.h[1] // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0 ++ sub v6.8h, v7.8h, v6.8h // clip_sign[8..15] - a0_sign[8..15] ++ orr v5.16b, v20.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7] ++ ushr v3.8h, v3.8h, #3 // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0 ++ orr v7.16b, v29.16b, v21.16b // test clip[8..15] == 0 || a0[8..15] >= pq ++ cmtst v17.2d, v5.2d, v2.2d // if 2nd of each group of is not filtered, then none of the others in the group should be either ++ mov w0, v5.s[1] // move to gp reg ++ cmhs v19.8h, v3.8h, v27.8h ++ ushr v0.8h, v0.8h, #3 // a0[8..15] >= a3[8..15] ? 
(5*(a0[8..15]-a3[8..15]))>>3 : 0 ++ mov w2, v5.s[3] ++ orr v5.16b, v7.16b, v16.16b // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15] ++ orr v16.16b, v20.16b, v17.16b ++ bsl v19.16b, v27.16b, v3.16b // FFMIN(d[0..7], clip[0..7]) ++ cmtst v2.2d, v5.2d, v2.2d ++ cmhs v3.8h, v0.8h, v23.8h ++ mov w4, v5.s[1] ++ mov w5, v5.s[3] ++ and w0, w0, w2 ++ bic v5.16b, v19.16b, v16.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub) ++ orr v2.16b, v7.16b, v2.16b ++ bsl v3.16b, v23.16b, v0.16b // FFMIN(d[8..15], clip[8..15]) ++ mls v25.8h, v5.8h, v4.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7] ++ and w2, w4, w5 ++ bic v0.16b, v3.16b, v2.16b // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub) ++ mla v24.8h, v5.8h, v4.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7] ++ and w0, w0, w2 ++ mls v18.8h, v0.8h, v6.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15] ++ sqxtun v2.8b, v25.8h ++ tbnz w0, #0, 1f // none of the 16 pixel pairs should be updated in this case ++ mla v1.8h, v0.8h, v6.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15] ++ sqxtun v0.8b, v24.8h ++ sqxtun2 v2.16b, v18.8h ++ sqxtun2 v0.16b, v1.8h ++ st1 {v2.16b}, [x3], x1 ++ st1 {v0.16b}, [x3] ++1: ret ++endfunc ++ ++// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks ++// On entry: ++// x0 -> top-left pel of right block ++// x1 = row stride, bytes ++// w2 = PQUANT bitstream parameter ++function ff_vc1_h_loop_filter16_neon, export=1 ++ sub x3, x0, #4 // where to start reading ++ ldr d0, .Lcoeffs ++ ld1 {v1.8b}, [x3], x1 // P1[0], P2[0]... ++ sub x0, x0, #1 // where to start writing ++ ld1 {v2.8b}, [x3], x1 ++ add x4, x0, x1, lsl #3 ++ ld1 {v3.8b}, [x3], x1 ++ add x5, x0, x1, lsl #2 ++ ld1 {v4.8b}, [x3], x1 ++ add x6, x4, x1, lsl #2 ++ ld1 {v5.8b}, [x3], x1 ++ ld1 {v6.8b}, [x3], x1 ++ ld1 {v7.8b}, [x3], x1 ++ trn1 v16.8b, v1.8b, v2.8b // P1[0], P1[1], P3[0]... ++ ld1 {v17.8b}, [x3], x1 ++ trn2 v1.8b, v1.8b, v2.8b // P2[0], P2[1], P4[0]... ++ ld1 {v2.8b}, [x3], x1 ++ trn1 v18.8b, v3.8b, v4.8b // P1[2], P1[3], P3[2]... ++ ld1 {v19.8b}, [x3], x1 ++ trn2 v3.8b, v3.8b, v4.8b // P2[2], P2[3], P4[2]... ++ ld1 {v4.8b}, [x3], x1 ++ trn1 v20.8b, v5.8b, v6.8b // P1[4], P1[5], P3[4]... ++ ld1 {v21.8b}, [x3], x1 ++ trn2 v5.8b, v5.8b, v6.8b // P2[4], P2[5], P4[4]... ++ ld1 {v6.8b}, [x3], x1 ++ trn1 v22.8b, v7.8b, v17.8b // P1[6], P1[7], P3[6]... ++ ld1 {v23.8b}, [x3], x1 ++ trn2 v7.8b, v7.8b, v17.8b // P2[6], P2[7], P4[6]... ++ ld1 {v17.8b}, [x3], x1 ++ trn1 v24.8b, v2.8b, v19.8b // P1[8], P1[9], P3[8]... ++ ld1 {v25.8b}, [x3] ++ trn2 v2.8b, v2.8b, v19.8b // P2[8], P2[9], P4[8]... ++ trn1 v19.4h, v16.4h, v18.4h // P1[0], P1[1], P1[2], P1[3], P5[0]... ++ trn1 v26.8b, v4.8b, v21.8b // P1[10], P1[11], P3[10]... ++ trn2 v4.8b, v4.8b, v21.8b // P2[10], P2[11], P4[10]... ++ trn1 v21.4h, v1.4h, v3.4h // P2[0], P2[1], P2[2], P2[3], P6[0]... ++ trn1 v27.4h, v20.4h, v22.4h // P1[4], P1[5], P1[6], P1[7], P5[4]... ++ trn1 v28.8b, v6.8b, v23.8b // P1[12], P1[13], P3[12]... 
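In the 8- and 16-pair variants, the cmtst masks (0x0000ffff00000000) and tbnz tests used above implement the per-group rule noted in the comments: the filter-enable outcome at the third position (index 2) of each group of four gates the whole group, and the stores are skipped entirely when every group is gated off. Using the hypothetical vc1_edge_filter_sketch() from the earlier note, that control flow looks roughly like this (illustrative only):

    /* One decision per group of four edge positions: the outcome at index 2
     * determines whether the other three positions are processed at all. */
    static void vc1_loop_filter_sketch(int edges[][9], int n, int pq)
    {
        for (int g = 0; g < n; g += 4) {
            if (vc1_edge_filter_sketch(edges[g + 2], pq)) {
                vc1_edge_filter_sketch(edges[g + 0], pq);
                vc1_edge_filter_sketch(edges[g + 1], pq);
                vc1_edge_filter_sketch(edges[g + 3], pq);
            }
        }
    }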
++ trn2 v6.8b, v6.8b, v23.8b // P2[12], P2[13], P4[12]... ++ trn1 v23.4h, v5.4h, v7.4h // P2[4], P2[5], P2[6], P2[7], P6[4]... ++ trn1 v29.4h, v24.4h, v26.4h // P1[8], P1[9], P1[10], P1[11], P5[8]... ++ trn1 v30.8b, v17.8b, v25.8b // P1[14], P1[15], P3[14]... ++ trn2 v17.8b, v17.8b, v25.8b // P2[14], P2[15], P4[14]... ++ trn1 v25.4h, v2.4h, v4.4h // P2[8], P2[9], P2[10], P2[11], P6[8]... ++ trn1 v31.2s, v19.2s, v27.2s // P1[0..7] ++ trn2 v19.2s, v19.2s, v27.2s // P5[0..7] ++ trn1 v27.2s, v21.2s, v23.2s // P2[0..7] ++ trn2 v21.2s, v21.2s, v23.2s // P6[0..7] ++ trn1 v23.4h, v28.4h, v30.4h // P1[12], P1[13], P1[14], P1[15], P5[12]... ++ trn2 v16.4h, v16.4h, v18.4h // P3[0], P3[1], P3[2], P3[3], P7[0]... ++ trn1 v18.4h, v6.4h, v17.4h // P2[12], P2[13], P2[14], P2[15], P6[12]... ++ trn2 v20.4h, v20.4h, v22.4h // P3[4], P3[5], P3[6], P3[7], P7[4]... ++ trn2 v22.4h, v24.4h, v26.4h // P3[8], P3[9], P3[10], P3[11], P7[8]... ++ trn1 v24.2s, v29.2s, v23.2s // P1[8..15] ++ trn2 v23.2s, v29.2s, v23.2s // P5[8..15] ++ trn1 v26.2s, v25.2s, v18.2s // P2[8..15] ++ trn2 v18.2s, v25.2s, v18.2s // P6[8..15] ++ trn2 v25.4h, v28.4h, v30.4h // P3[12], P3[13], P3[14], P3[15], P7[12]... ++ trn2 v1.4h, v1.4h, v3.4h // P4[0], P4[1], P4[2], P4[3], P8[0]... ++ trn2 v3.4h, v5.4h, v7.4h // P4[4], P4[5], P4[6], P4[7], P8[4]... ++ trn2 v2.4h, v2.4h, v4.4h // P4[8], P4[9], P4[10], P4[11], P8[8]... ++ trn2 v4.4h, v6.4h, v17.4h // P4[12], P4[13], P4[14], P4[15], P8[12]... ++ ushll v5.8h, v31.8b, #1 // 2*P1[0..7] ++ ushll v6.8h, v19.8b, #1 // 2*P5[0..7] ++ trn1 v7.2s, v16.2s, v20.2s // P3[0..7] ++ uxtl v17.8h, v27.8b // P2[0..7] ++ trn2 v16.2s, v16.2s, v20.2s // P7[0..7] ++ uxtl v20.8h, v21.8b // P6[0..7] ++ trn1 v21.2s, v22.2s, v25.2s // P3[8..15] ++ ushll v24.8h, v24.8b, #1 // 2*P1[8..15] ++ trn2 v22.2s, v22.2s, v25.2s // P7[8..15] ++ ushll v25.8h, v23.8b, #1 // 2*P5[8..15] ++ trn1 v27.2s, v1.2s, v3.2s // P4[0..7] ++ uxtl v26.8h, v26.8b // P2[8..15] ++ mls v5.8h, v17.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7] ++ uxtl v17.8h, v18.8b // P6[8..15] ++ mls v6.8h, v20.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7] ++ trn1 v18.2s, v2.2s, v4.2s // P4[8..15] ++ uxtl v28.8h, v7.8b // P3[0..7] ++ mls v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15] ++ uxtl v16.8h, v16.8b // P7[0..7] ++ uxtl v26.8h, v21.8b // P3[8..15] ++ mls v25.8h, v17.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15] ++ uxtl v22.8h, v22.8b // P7[8..15] ++ ushll v7.8h, v7.8b, #1 // 2*P3[0..7] ++ uxtl v27.8h, v27.8b // P4[0..7] ++ trn2 v1.2s, v1.2s, v3.2s // P8[0..7] ++ ushll v3.8h, v21.8b, #1 // 2*P3[8..15] ++ trn2 v2.2s, v2.2s, v4.2s // P8[8..15] ++ uxtl v4.8h, v18.8b // P4[8..15] ++ mla v5.8h, v28.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7] ++ uxtl v1.8h, v1.8b // P8[0..7] ++ mla v6.8h, v16.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7] ++ uxtl v2.8h, v2.8b // P8[8..15] ++ uxtl v16.8h, v19.8b // P5[0..7] ++ mla v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15] ++ uxtl v18.8h, v23.8b // P5[8..15] ++ dup v19.8h, w2 // pq ++ mla v25.8h, v22.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15] ++ sub v21.8h, v27.8h, v16.8h // P4[0..7]-P5[0..7] ++ sub v22.8h, v4.8h, v18.8h // P4[8..15]-P5[8..15] ++ mls v7.8h, v27.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7] ++ abs v23.8h, v21.8h ++ mls v3.8h, v4.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15] ++ abs v26.8h, v22.8h ++ sshr v21.8h, v21.8h, #8 // clip_sign[0..7] ++ mls v5.8h, v27.8h, v0.h[0] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7] ++ sshr v23.8h, v23.8h, #1 // clip[0..7] ++ sshr v26.8h, v26.8h, #1 // clip[8..15] ++ mls v6.8h, v1.8h, v0.h[0] 
// 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7] ++ sshr v1.8h, v22.8h, #8 // clip_sign[8..15] ++ cmeq v22.8h, v23.8h, #0 // test clip[0..7] == 0 ++ mls v24.8h, v4.8h, v0.h[0] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15] ++ cmeq v28.8h, v26.8h, #0 // test clip[8..15] == 0 ++ srshr v5.8h, v5.8h, #3 ++ mls v25.8h, v2.8h, v0.h[0] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15] ++ srshr v2.8h, v6.8h, #3 ++ mla v7.8h, v16.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7] ++ srshr v6.8h, v24.8h, #3 ++ mla v3.8h, v18.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15] ++ abs v5.8h, v5.8h // a1[0..7] ++ srshr v24.8h, v25.8h, #3 ++ mls v3.8h, v17.8h, v0.h[0] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15] ++ abs v2.8h, v2.8h // a2[0..7] ++ abs v6.8h, v6.8h // a1[8..15] ++ mls v7.8h, v20.8h, v0.h[0] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7] ++ abs v17.8h, v24.8h // a2[8..15] ++ cmhs v20.8h, v5.8h, v2.8h // test a1[0..7] >= a2[0..7] ++ srshr v3.8h, v3.8h, #3 ++ cmhs v24.8h, v6.8h, v17.8h // test a1[8..15] >= a2[8.15] ++ srshr v7.8h, v7.8h, #3 ++ bsl v20.16b, v2.16b, v5.16b // a3[0..7] ++ abs v2.8h, v3.8h // a0[8..15] ++ sshr v3.8h, v3.8h, #8 // a0_sign[8..15] ++ bsl v24.16b, v17.16b, v6.16b // a3[8..15] ++ abs v5.8h, v7.8h // a0[0..7] ++ sshr v6.8h, v7.8h, #8 // a0_sign[0..7] ++ cmhs v7.8h, v2.8h, v19.8h // test a0[8..15] >= pq ++ sub v1.8h, v1.8h, v3.8h // clip_sign[8..15] - a0_sign[8..15] ++ uqsub v3.8h, v2.8h, v24.8h // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ cmhs v2.8h, v24.8h, v2.8h // test a3[8..15] >= a0[8..15] ++ uqsub v17.8h, v5.8h, v20.8h // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ cmhs v19.8h, v5.8h, v19.8h // test a0[0..7] >= pq ++ orr v7.16b, v28.16b, v7.16b // test clip[8..15] == 0 || a0[8..15] >= pq ++ sub v6.8h, v21.8h, v6.8h // clip_sign[0..7] - a0_sign[0..7] ++ mul v3.8h, v3.8h, v0.h[1] // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0 ++ cmhs v5.8h, v20.8h, v5.8h // test a3[0..7] >= a0[0..7] ++ orr v19.16b, v22.16b, v19.16b // test clip[0..7] == 0 || a0[0..7] >= pq ++ mul v0.8h, v17.8h, v0.h[1] // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0 ++ orr v2.16b, v7.16b, v2.16b // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15] ++ orr v5.16b, v19.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7] ++ ushr v3.8h, v3.8h, #3 // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0 ++ mov w7, v2.s[1] ++ mov w8, v2.s[3] ++ ushr v0.8h, v0.8h, #3 // a0[0..7] >= a3[0..7] ? 
(5*(a0[0..7]-a3[0..7]))>>3 : 0 ++ mov w2, v5.s[1] // move to gp reg ++ cmhs v2.8h, v3.8h, v26.8h ++ mov w3, v5.s[3] ++ cmhs v5.8h, v0.8h, v23.8h ++ bsl v2.16b, v26.16b, v3.16b // FFMIN(d[8..15], clip[8..15]) ++ and w9, w7, w8 ++ bsl v5.16b, v23.16b, v0.16b // FFMIN(d[0..7], clip[0..7]) ++ and w10, w2, w3 ++ bic v0.16b, v2.16b, v7.16b // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub) ++ and w9, w10, w9 ++ bic v2.16b, v5.16b, v19.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub) ++ mls v4.8h, v0.8h, v1.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4 ++ tbnz w9, #0, 4f // none of the 16 pixel pairs should be updated in this case ++ mls v27.8h, v2.8h, v6.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4 ++ mla v16.8h, v2.8h, v6.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5 ++ sqxtun v2.8b, v4.8h ++ mla v18.8h, v0.8h, v1.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5 ++ sqxtun v0.8b, v27.8h ++ sqxtun v1.8b, v16.8h ++ sqxtun v3.8b, v18.8h ++ tbnz w2, #0, 1f ++ st2 {v0.b, v1.b}[0], [x0], x1 ++ st2 {v0.b, v1.b}[1], [x0], x1 ++ st2 {v0.b, v1.b}[2], [x0], x1 ++ st2 {v0.b, v1.b}[3], [x0] ++1: tbnz w3, #0, 2f ++ st2 {v0.b, v1.b}[4], [x5], x1 ++ st2 {v0.b, v1.b}[5], [x5], x1 ++ st2 {v0.b, v1.b}[6], [x5], x1 ++ st2 {v0.b, v1.b}[7], [x5] ++2: tbnz w7, #0, 3f ++ st2 {v2.b, v3.b}[0], [x4], x1 ++ st2 {v2.b, v3.b}[1], [x4], x1 ++ st2 {v2.b, v3.b}[2], [x4], x1 ++ st2 {v2.b, v3.b}[3], [x4] ++3: tbnz w8, #0, 4f ++ st2 {v2.b, v3.b}[4], [x6], x1 ++ st2 {v2.b, v3.b}[5], [x6], x1 ++ st2 {v2.b, v3.b}[6], [x6], x1 ++ st2 {v2.b, v3.b}[7], [x6] ++4: ret ++endfunc ++ ++// Copy at most the specified number of bytes from source to destination buffer, ++// stopping at a multiple of 32 bytes, none of which are the start of an escape sequence ++// On entry: ++// x0 -> source buffer ++// w1 = max number of bytes to copy ++// x2 -> destination buffer, optimally 8-byte aligned ++// On exit: ++// w0 = number of bytes not copied ++function ff_vc1_unescape_buffer_helper_neon, export=1 ++ // Offset by 80 to screen out cases that are too short for us to handle, ++ // and also make it easy to test for loop termination, or to determine ++ // whether we need an odd number of half-iterations of the loop. 
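The escape test this helper vectorises is the same one the C wrapper later in this patch performs with AV_RL32: an escape sequence begins with the bytes 0x00 0x00 0x03 followed by a byte no greater than 0x03, and unescaping drops the 0x03 stuffing byte. A scalar form of the per-offset test is shown below for reference (illustrative only; the body that follows evaluates it at all 32 byte offsets of each iteration using the bic/eor/cmeq constant masks).

    #include <stdint.h>

    /* Nonzero if p points at the start of an escape sequence: 00 00 03 0x,
     * where the fourth byte is in the range 0x00..0x03. */
    static int is_escape_start(const uint8_t *p)
    {
        uint32_t v = (uint32_t)p[0]         | ((uint32_t)p[1] << 8) |
                     ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
        return (v & ~0x03000000u) == 0x00030000u;
    }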
++ subs w1, w1, #80 ++ b.mi 90f ++ ++ // Set up useful constants ++ movi v20.4s, #3, lsl #24 ++ movi v21.4s, #3, lsl #16 ++ ++ tst w1, #32 ++ b.ne 1f ++ ++ ld1 {v0.16b, v1.16b, v2.16b}, [x0], #48 ++ ext v25.16b, v0.16b, v1.16b, #1 ++ ext v26.16b, v0.16b, v1.16b, #2 ++ ext v27.16b, v0.16b, v1.16b, #3 ++ ext v29.16b, v1.16b, v2.16b, #1 ++ ext v30.16b, v1.16b, v2.16b, #2 ++ ext v31.16b, v1.16b, v2.16b, #3 ++ bic v24.16b, v0.16b, v20.16b ++ bic v25.16b, v25.16b, v20.16b ++ bic v26.16b, v26.16b, v20.16b ++ bic v27.16b, v27.16b, v20.16b ++ bic v28.16b, v1.16b, v20.16b ++ bic v29.16b, v29.16b, v20.16b ++ bic v30.16b, v30.16b, v20.16b ++ bic v31.16b, v31.16b, v20.16b ++ eor v24.16b, v24.16b, v21.16b ++ eor v25.16b, v25.16b, v21.16b ++ eor v26.16b, v26.16b, v21.16b ++ eor v27.16b, v27.16b, v21.16b ++ eor v28.16b, v28.16b, v21.16b ++ eor v29.16b, v29.16b, v21.16b ++ eor v30.16b, v30.16b, v21.16b ++ eor v31.16b, v31.16b, v21.16b ++ cmeq v24.4s, v24.4s, #0 ++ cmeq v25.4s, v25.4s, #0 ++ cmeq v26.4s, v26.4s, #0 ++ cmeq v27.4s, v27.4s, #0 ++ add w1, w1, #32 ++ b 3f ++ ++1: ld1 {v3.16b, v4.16b, v5.16b}, [x0], #48 ++ ext v25.16b, v3.16b, v4.16b, #1 ++ ext v26.16b, v3.16b, v4.16b, #2 ++ ext v27.16b, v3.16b, v4.16b, #3 ++ ext v29.16b, v4.16b, v5.16b, #1 ++ ext v30.16b, v4.16b, v5.16b, #2 ++ ext v31.16b, v4.16b, v5.16b, #3 ++ bic v24.16b, v3.16b, v20.16b ++ bic v25.16b, v25.16b, v20.16b ++ bic v26.16b, v26.16b, v20.16b ++ bic v27.16b, v27.16b, v20.16b ++ bic v28.16b, v4.16b, v20.16b ++ bic v29.16b, v29.16b, v20.16b ++ bic v30.16b, v30.16b, v20.16b ++ bic v31.16b, v31.16b, v20.16b ++ eor v24.16b, v24.16b, v21.16b ++ eor v25.16b, v25.16b, v21.16b ++ eor v26.16b, v26.16b, v21.16b ++ eor v27.16b, v27.16b, v21.16b ++ eor v28.16b, v28.16b, v21.16b ++ eor v29.16b, v29.16b, v21.16b ++ eor v30.16b, v30.16b, v21.16b ++ eor v31.16b, v31.16b, v21.16b ++ cmeq v24.4s, v24.4s, #0 ++ cmeq v25.4s, v25.4s, #0 ++ cmeq v26.4s, v26.4s, #0 ++ cmeq v27.4s, v27.4s, #0 ++ // Drop through... 
++2: mov v0.16b, v5.16b ++ ld1 {v1.16b, v2.16b}, [x0], #32 ++ cmeq v28.4s, v28.4s, #0 ++ cmeq v29.4s, v29.4s, #0 ++ cmeq v30.4s, v30.4s, #0 ++ cmeq v31.4s, v31.4s, #0 ++ orr v24.16b, v24.16b, v25.16b ++ orr v26.16b, v26.16b, v27.16b ++ orr v28.16b, v28.16b, v29.16b ++ orr v30.16b, v30.16b, v31.16b ++ ext v25.16b, v0.16b, v1.16b, #1 ++ orr v22.16b, v24.16b, v26.16b ++ ext v26.16b, v0.16b, v1.16b, #2 ++ ext v27.16b, v0.16b, v1.16b, #3 ++ ext v29.16b, v1.16b, v2.16b, #1 ++ orr v23.16b, v28.16b, v30.16b ++ ext v30.16b, v1.16b, v2.16b, #2 ++ ext v31.16b, v1.16b, v2.16b, #3 ++ bic v24.16b, v0.16b, v20.16b ++ bic v25.16b, v25.16b, v20.16b ++ bic v26.16b, v26.16b, v20.16b ++ orr v22.16b, v22.16b, v23.16b ++ bic v27.16b, v27.16b, v20.16b ++ bic v28.16b, v1.16b, v20.16b ++ bic v29.16b, v29.16b, v20.16b ++ bic v30.16b, v30.16b, v20.16b ++ bic v31.16b, v31.16b, v20.16b ++ addv s22, v22.4s ++ eor v24.16b, v24.16b, v21.16b ++ eor v25.16b, v25.16b, v21.16b ++ eor v26.16b, v26.16b, v21.16b ++ eor v27.16b, v27.16b, v21.16b ++ eor v28.16b, v28.16b, v21.16b ++ mov w3, v22.s[0] ++ eor v29.16b, v29.16b, v21.16b ++ eor v30.16b, v30.16b, v21.16b ++ eor v31.16b, v31.16b, v21.16b ++ cmeq v24.4s, v24.4s, #0 ++ cmeq v25.4s, v25.4s, #0 ++ cmeq v26.4s, v26.4s, #0 ++ cmeq v27.4s, v27.4s, #0 ++ cbnz w3, 90f ++ st1 {v3.16b, v4.16b}, [x2], #32 ++3: mov v3.16b, v2.16b ++ ld1 {v4.16b, v5.16b}, [x0], #32 ++ cmeq v28.4s, v28.4s, #0 ++ cmeq v29.4s, v29.4s, #0 ++ cmeq v30.4s, v30.4s, #0 ++ cmeq v31.4s, v31.4s, #0 ++ orr v24.16b, v24.16b, v25.16b ++ orr v26.16b, v26.16b, v27.16b ++ orr v28.16b, v28.16b, v29.16b ++ orr v30.16b, v30.16b, v31.16b ++ ext v25.16b, v3.16b, v4.16b, #1 ++ orr v22.16b, v24.16b, v26.16b ++ ext v26.16b, v3.16b, v4.16b, #2 ++ ext v27.16b, v3.16b, v4.16b, #3 ++ ext v29.16b, v4.16b, v5.16b, #1 ++ orr v23.16b, v28.16b, v30.16b ++ ext v30.16b, v4.16b, v5.16b, #2 ++ ext v31.16b, v4.16b, v5.16b, #3 ++ bic v24.16b, v3.16b, v20.16b ++ bic v25.16b, v25.16b, v20.16b ++ bic v26.16b, v26.16b, v20.16b ++ orr v22.16b, v22.16b, v23.16b ++ bic v27.16b, v27.16b, v20.16b ++ bic v28.16b, v4.16b, v20.16b ++ bic v29.16b, v29.16b, v20.16b ++ bic v30.16b, v30.16b, v20.16b ++ bic v31.16b, v31.16b, v20.16b ++ addv s22, v22.4s ++ eor v24.16b, v24.16b, v21.16b ++ eor v25.16b, v25.16b, v21.16b ++ eor v26.16b, v26.16b, v21.16b ++ eor v27.16b, v27.16b, v21.16b ++ eor v28.16b, v28.16b, v21.16b ++ mov w3, v22.s[0] ++ eor v29.16b, v29.16b, v21.16b ++ eor v30.16b, v30.16b, v21.16b ++ eor v31.16b, v31.16b, v21.16b ++ cmeq v24.4s, v24.4s, #0 ++ cmeq v25.4s, v25.4s, #0 ++ cmeq v26.4s, v26.4s, #0 ++ cmeq v27.4s, v27.4s, #0 ++ cbnz w3, 91f ++ st1 {v0.16b, v1.16b}, [x2], #32 ++ subs w1, w1, #64 ++ b.pl 2b ++ ++90: add w0, w1, #80 ++ ret ++ ++91: sub w1, w1, #32 ++ b 90b ++endfunc --- a/libavcodec/allcodecs.c +++ b/libavcodec/allcodecs.c @@ -149,6 +149,7 @@ extern AVCodec ff_hap_decoder; @@ -448,7 +2329,7 @@ Upstream-status: Pending @@ -890,6 +891,41 @@ static enum AVCodecID remap_deprecated_c } } - + +static int codec_supports_format(const AVCodec * const p, const enum AVPixelFormat fmt) +{ + const enum AVPixelFormat *pf = p->pix_fmts; @@ -528,7 +2409,7 @@ Upstream-status: Pending @@ -26,83 +26,209 @@ #include "libavutil/internal.h" #include "libavcodec/cabac.h" - + + #define get_cabac_inline get_cabac_inline_arm static av_always_inline int get_cabac_inline_arm(CABACContext *c, @@ -622,7 +2503,7 @@ Upstream-status: Pending + ); + return bit; +} - + - __asm__ volatile( - "ldrb %[bit] , [%[state]] \n\t" - "add %[r_b] , %[tables] , %[lps_off] \n\t" 
@@ -719,7 +2600,7 @@ Upstream-status: Pending +#endif + "lsls %[range] , %[low], #16 \n\t" + "bne 1f \n\t" - + - return bit & 1; + "str %[ptr] , [%[c], %[ptr_off]] \n\t" + "rev %[tmp] , %[tmp] \n\t" @@ -803,7 +2684,7 @@ Upstream-status: Pending +} + #endif /* HAVE_ARMV6T2_INLINE */ - + #endif /* AVCODEC_ARM_CABAC_H */ --- /dev/null +++ b/libavcodec/arm/rpi_hevc_cabac.h @@ -15211,6 +17092,883 @@ Upstream-status: Pending + bx lr + +endfunc +--- a/libavcodec/arm/vc1dsp_init_neon.c ++++ b/libavcodec/arm/vc1dsp_init_neon.c +@@ -19,6 +19,7 @@ + #include + + #include "libavutil/attributes.h" ++#include "libavutil/intreadwrite.h" + #include "libavcodec/vc1dsp.h" + #include "vc1dsp.h" + +@@ -32,6 +33,13 @@ void ff_vc1_inv_trans_4x8_dc_neon(uint8_ + void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); + void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); + ++void ff_vc1_v_loop_filter4_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_h_loop_filter4_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_v_loop_filter8_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_h_loop_filter8_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_v_loop_filter16_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_h_loop_filter16_neon(uint8_t *src, int stride, int pq); ++ + void ff_put_pixels8x8_neon(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int rnd); + +@@ -77,6 +85,64 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t + void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, + int h, int x, int y); + ++int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst); ++ ++static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst) ++{ ++ /* Dealing with starting and stopping, and removing escape bytes, are ++ * comparatively less time-sensitive, so are more clearly expressed using ++ * a C wrapper around the assembly inner loop. Note that we assume a ++ * little-endian machine that supports unaligned loads. 
*/ ++ int dsize = 0; ++ while (size >= 4) ++ { ++ int found = 0; ++ while (!found && (((uintptr_t) dst) & 7) && size >= 4) ++ { ++ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; ++ if (!found) ++ { ++ *dst++ = *src++; ++ --size; ++ ++dsize; ++ } ++ } ++ if (!found) ++ { ++ int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst); ++ dst += skip; ++ src += skip; ++ size -= skip; ++ dsize += skip; ++ while (!found && size >= 4) ++ { ++ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; ++ if (!found) ++ { ++ *dst++ = *src++; ++ --size; ++ ++dsize; ++ } ++ } ++ } ++ if (found) ++ { ++ *dst++ = *src++; ++ *dst++ = *src++; ++ ++src; ++ size -= 3; ++ dsize += 2; ++ } ++ } ++ while (size > 0) ++ { ++ *dst++ = *src++; ++ --size; ++ ++dsize; ++ } ++ return dsize; ++} ++ + #define FN_ASSIGN(X, Y) \ + dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_16_neon; \ + dsp->put_vc1_mspel_pixels_tab[1][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_neon +@@ -92,6 +158,13 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPC + dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon; + dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon; + ++ dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_neon; ++ dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_neon; ++ dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_neon; ++ dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_neon; ++ dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon; ++ dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon; ++ + dsp->put_vc1_mspel_pixels_tab[1][ 0] = ff_put_pixels8x8_neon; + FN_ASSIGN(1, 0); + FN_ASSIGN(2, 0); +@@ -116,4 +189,6 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPC + dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon; + dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon; + dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon; ++ ++ dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon; + } +--- a/libavcodec/arm/vc1dsp_neon.S ++++ b/libavcodec/arm/vc1dsp_neon.S +@@ -1161,3 +1161,764 @@ function ff_vc1_inv_trans_4x4_dc_neon, e + vst1.32 {d1[1]}, [r0,:32] + bx lr + endfunc ++ ++@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks ++@ On entry: ++@ r0 -> top-left pel of lower block ++@ r1 = row stride, bytes ++@ r2 = PQUANT bitstream parameter ++function ff_vc1_v_loop_filter4_neon, export=1 ++ sub r3, r0, r1, lsl #2 ++ vldr d0, .Lcoeffs ++ vld1.32 {d1[0]}, [r0], r1 @ P5 ++ vld1.32 {d2[0]}, [r3], r1 @ P1 ++ vld1.32 {d3[0]}, [r3], r1 @ P2 ++ vld1.32 {d4[0]}, [r0], r1 @ P6 ++ vld1.32 {d5[0]}, [r3], r1 @ P3 ++ vld1.32 {d6[0]}, [r0], r1 @ P7 ++ vld1.32 {d7[0]}, [r3] @ P4 ++ vld1.32 {d16[0]}, [r0] @ P8 ++ vshll.u8 q9, d1, #1 @ 2*P5 ++ vdup.16 d17, r2 @ pq ++ vshll.u8 q10, d2, #1 @ 2*P1 ++ vmovl.u8 q11, d3 @ P2 ++ vmovl.u8 q1, d4 @ P6 ++ vmovl.u8 q12, d5 @ P3 ++ vmls.i16 d20, d22, d0[1] @ 2*P1-5*P2 ++ vmovl.u8 q11, d6 @ P7 ++ vmls.i16 d18, d2, d0[1] @ 2*P5-5*P6 ++ vshll.u8 q2, d5, #1 @ 2*P3 ++ vmovl.u8 q3, d7 @ P4 ++ vmla.i16 d18, d22, d0[1] @ 2*P5-5*P6+5*P7 ++ vmovl.u8 q11, d16 @ P8 ++ vmla.u16 d20, d24, d0[1] @ 2*P1-5*P2+5*P3 ++ vmovl.u8 q12, d1 @ P5 ++ vmls.u16 d4, d6, d0[1] @ 2*P3-5*P4 ++ vmls.u16 d18, d22, d0[0] @ 2*P5-5*P6+5*P7-2*P8 ++ vsub.i16 d1, d6, d24 @ P4-P5 ++ vmls.i16 d20, d6, d0[0] @ 2*P1-5*P2+5*P3-2*P4 ++ vmla.i16 d4, d24, d0[1] @ 2*P3-5*P4+5*P5 ++ vmls.i16 d4, d2, d0[0] @ 2*P3-5*P4+5*P5-2*P6 ++ vabs.s16 d2, d1 ++ vrshr.s16 d3, d18, #3 ++ vrshr.s16 d5, d20, #3 ++ vshr.s16 d2, d2, #1 @ clip ++ vrshr.s16 d4, 
d4, #3 ++ vabs.s16 d3, d3 @ a2 ++ vshr.s16 d1, d1, #8 @ clip_sign ++ vabs.s16 d5, d5 @ a1 ++ vceq.i16 d7, d2, #0 @ test clip == 0 ++ vabs.s16 d16, d4 @ a0 ++ vshr.s16 d4, d4, #8 @ a0_sign ++ vcge.s16 d18, d5, d3 @ test a1 >= a2 ++ vcge.s16 d17, d16, d17 @ test a0 >= pq ++ vbsl d18, d3, d5 @ a3 ++ vsub.i16 d1, d1, d4 @ clip_sign - a0_sign ++ vorr d3, d7, d17 @ test clip == 0 || a0 >= pq ++ vqsub.u16 d4, d16, d18 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 d5, d18, d16 @ test a3 >= a0 ++ vmul.i16 d0, d4, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0 ++ vorr d4, d3, d5 @ test clip == 0 || a0 >= pq || a3 >= a0 ++ vmov.32 r0, d4[1] @ move to gp reg ++ vshr.u16 d0, d0, #3 @ a0 >= a3 ? (5*(a0-a3))>>3 : 0 ++ vcge.s16 d4, d0, d2 ++ tst r0, #1 ++ bne 1f @ none of the 4 pixel pairs should be updated if this one is not filtered ++ vbsl d4, d2, d0 @ FFMIN(d, clip) ++ vbic d0, d4, d3 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) ++ vmls.i16 d6, d0, d1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ vmla.i16 d24, d0, d1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ vqmovun.s16 d0, q3 ++ vqmovun.s16 d1, q12 ++ vst1.32 {d0[0]}, [r3], r1 ++ vst1.32 {d1[0]}, [r3] ++1: bx lr ++endfunc ++ ++@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks ++@ On entry: ++@ r0 -> top-left pel of right block ++@ r1 = row stride, bytes ++@ r2 = PQUANT bitstream parameter ++function ff_vc1_h_loop_filter4_neon, export=1 ++ sub r3, r0, #4 @ where to start reading ++ vldr d0, .Lcoeffs ++ vld1.32 {d2}, [r3], r1 ++ sub r0, r0, #1 @ where to start writing ++ vld1.32 {d4}, [r3], r1 ++ vld1.32 {d3}, [r3], r1 ++ vld1.32 {d5}, [r3] ++ vdup.16 d1, r2 @ pq ++ vtrn.8 q1, q2 ++ vtrn.16 d2, d3 @ P1, P5, P3, P7 ++ vtrn.16 d4, d5 @ P2, P6, P4, P8 ++ vshll.u8 q3, d2, #1 @ 2*P1, 2*P5 ++ vmovl.u8 q8, d4 @ P2, P6 ++ vmovl.u8 q9, d3 @ P3, P7 ++ vmovl.u8 q2, d5 @ P4, P8 ++ vmls.i16 q3, q8, d0[1] @ 2*P1-5*P2, 2*P5-5*P6 ++ vshll.u8 q10, d3, #1 @ 2*P3, 2*P7 ++ vmovl.u8 q1, d2 @ P1, P5 ++ vmla.i16 q3, q9, d0[1] @ 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7 ++ vmls.i16 q3, q2, d0[0] @ 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8 ++ vmov d2, d3 @ needs to be in an even-numbered vector for when we come to narrow it later ++ vmls.i16 d20, d4, d0[1] @ 2*P3-5*P4 ++ vmla.i16 d20, d3, d0[1] @ 2*P3-5*P4+5*P5 ++ vsub.i16 d3, d4, d2 @ P4-P5 ++ vmls.i16 d20, d17, d0[0] @ 2*P3-5*P4+5*P5-2*P6 ++ vrshr.s16 q3, q3, #3 ++ vabs.s16 d5, d3 ++ vshr.s16 d3, d3, #8 @ clip_sign ++ vrshr.s16 d16, d20, #3 ++ vabs.s16 q3, q3 @ a1, a2 ++ vshr.s16 d5, d5, #1 @ clip ++ vabs.s16 d17, d16 @ a0 ++ vceq.i16 d18, d5, #0 @ test clip == 0 ++ vshr.s16 d16, d16, #8 @ a0_sign ++ vcge.s16 d19, d6, d7 @ test a1 >= a2 ++ vcge.s16 d1, d17, d1 @ test a0 >= pq ++ vsub.i16 d16, d3, d16 @ clip_sign - a0_sign ++ vbsl d19, d7, d6 @ a3 ++ vorr d1, d18, d1 @ test clip == 0 || a0 >= pq ++ vqsub.u16 d3, d17, d19 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 d6, d19, d17 @ test a3 >= a0 @ ++ vmul.i16 d0, d3, d0[1] @ a0 >= a3 ? 
5*(a0-a3) : 0 ++ vorr d3, d1, d6 @ test clip == 0 || a0 >= pq || a3 >= a0 ++ vmov.32 r2, d3[1] @ move to gp reg ++ vshr.u16 d0, d0, #3 @ a0 >= a3 ? (5*(a0-a3))>>3 : 0 ++ vcge.s16 d3, d0, d5 ++ tst r2, #1 ++ bne 1f @ none of the 4 pixel pairs should be updated if this one is not filtered ++ vbsl d3, d5, d0 @ FFMIN(d, clip) ++ vbic d0, d3, d1 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) ++ vmla.i16 d2, d0, d16 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ vmls.i16 d4, d0, d16 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ vqmovun.s16 d1, q1 ++ vqmovun.s16 d0, q2 ++ vst2.8 {d0[0], d1[0]}, [r0], r1 ++ vst2.8 {d0[1], d1[1]}, [r0], r1 ++ vst2.8 {d0[2], d1[2]}, [r0], r1 ++ vst2.8 {d0[3], d1[3]}, [r0] ++1: bx lr ++endfunc ++ ++@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks ++@ On entry: ++@ r0 -> top-left pel of lower block ++@ r1 = row stride, bytes ++@ r2 = PQUANT bitstream parameter ++function ff_vc1_v_loop_filter8_neon, export=1 ++ sub r3, r0, r1, lsl #2 ++ vldr d0, .Lcoeffs ++ vld1.32 {d1}, [r0 :64], r1 @ P5 ++ vld1.32 {d2}, [r3 :64], r1 @ P1 ++ vld1.32 {d3}, [r3 :64], r1 @ P2 ++ vld1.32 {d4}, [r0 :64], r1 @ P6 ++ vld1.32 {d5}, [r3 :64], r1 @ P3 ++ vld1.32 {d6}, [r0 :64], r1 @ P7 ++ vshll.u8 q8, d1, #1 @ 2*P5 ++ vshll.u8 q9, d2, #1 @ 2*P1 ++ vld1.32 {d7}, [r3 :64] @ P4 ++ vmovl.u8 q1, d3 @ P2 ++ vld1.32 {d20}, [r0 :64] @ P8 ++ vmovl.u8 q11, d4 @ P6 ++ vdup.16 q12, r2 @ pq ++ vmovl.u8 q13, d5 @ P3 ++ vmls.i16 q9, q1, d0[1] @ 2*P1-5*P2 ++ vmovl.u8 q1, d6 @ P7 ++ vshll.u8 q2, d5, #1 @ 2*P3 ++ vmls.i16 q8, q11, d0[1] @ 2*P5-5*P6 ++ vmovl.u8 q3, d7 @ P4 ++ vmovl.u8 q10, d20 @ P8 ++ vmla.i16 q8, q1, d0[1] @ 2*P5-5*P6+5*P7 ++ vmovl.u8 q1, d1 @ P5 ++ vmla.i16 q9, q13, d0[1] @ 2*P1-5*P2+5*P3 ++ vsub.i16 q13, q3, q1 @ P4-P5 ++ vmls.i16 q2, q3, d0[1] @ 2*P3-5*P4 ++ vmls.i16 q8, q10, d0[0] @ 2*P5-5*P6+5*P7-2*P8 ++ vabs.s16 q10, q13 ++ vshr.s16 q13, q13, #8 @ clip_sign ++ vmls.i16 q9, q3, d0[0] @ 2*P1-5*P2+5*P3-2*P4 ++ vshr.s16 q10, q10, #1 @ clip ++ vmla.i16 q2, q1, d0[1] @ 2*P3-5*P4+5*P5 ++ vrshr.s16 q8, q8, #3 ++ vmls.i16 q2, q11, d0[0] @ 2*P3-5*P4+5*P5-2*P6 ++ vceq.i16 q11, q10, #0 @ test clip == 0 ++ vrshr.s16 q9, q9, #3 ++ vabs.s16 q8, q8 @ a2 ++ vabs.s16 q9, q9 @ a1 ++ vrshr.s16 q2, q2, #3 ++ vcge.s16 q14, q9, q8 @ test a1 >= a2 ++ vabs.s16 q15, q2 @ a0 ++ vshr.s16 q2, q2, #8 @ a0_sign ++ vbsl q14, q8, q9 @ a3 ++ vcge.s16 q8, q15, q12 @ test a0 >= pq ++ vsub.i16 q2, q13, q2 @ clip_sign - a0_sign ++ vqsub.u16 q9, q15, q14 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 q12, q14, q15 @ test a3 >= a0 ++ vorr q8, q11, q8 @ test clip == 0 || a0 >= pq ++ vmul.i16 q0, q9, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0 ++ vorr q9, q8, q12 @ test clip == 0 || a0 >= pq || a3 >= a0 ++ vshl.i64 q11, q9, #16 ++ vmov.32 r0, d18[1] @ move to gp reg ++ vshr.u16 q0, q0, #3 @ a0 >= a3 ? 
(5*(a0-a3))>>3 : 0 ++ vmov.32 r2, d19[1] ++ vshr.s64 q9, q11, #48 ++ vcge.s16 q11, q0, q10 ++ vorr q8, q8, q9 ++ and r0, r0, r2 ++ vbsl q11, q10, q0 @ FFMIN(d, clip) ++ tst r0, #1 ++ bne 1f @ none of the 8 pixel pairs should be updated in this case ++ vbic q0, q11, q8 @ set each d to zero if it should not be filtered ++ vmls.i16 q3, q0, q2 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ vmla.i16 q1, q0, q2 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ vqmovun.s16 d0, q3 ++ vqmovun.s16 d1, q1 ++ vst1.32 {d0}, [r3 :64], r1 ++ vst1.32 {d1}, [r3 :64] ++1: bx lr ++endfunc ++ ++.align 5 ++.Lcoeffs: ++.quad 0x00050002 ++ ++@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks ++@ On entry: ++@ r0 -> top-left pel of right block ++@ r1 = row stride, bytes ++@ r2 = PQUANT bitstream parameter ++function ff_vc1_h_loop_filter8_neon, export=1 ++ push {lr} ++ sub r3, r0, #4 @ where to start reading ++ vldr d0, .Lcoeffs ++ vld1.32 {d2}, [r3], r1 @ P1[0], P2[0]... ++ sub r0, r0, #1 @ where to start writing ++ vld1.32 {d4}, [r3], r1 ++ add r12, r0, r1, lsl #2 ++ vld1.32 {d3}, [r3], r1 ++ vld1.32 {d5}, [r3], r1 ++ vld1.32 {d6}, [r3], r1 ++ vld1.32 {d16}, [r3], r1 ++ vld1.32 {d7}, [r3], r1 ++ vld1.32 {d17}, [r3] ++ vtrn.8 q1, q2 @ P1[0], P1[1], P3[0]... P1[2], P1[3], P3[2]... P2[0], P2[1], P4[0]... P2[2], P2[3], P4[2]... ++ vdup.16 q9, r2 @ pq ++ vtrn.16 d2, d3 @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]... ++ vtrn.16 d4, d5 @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]... ++ vtrn.8 q3, q8 @ P1[4], P1[5], P3[4]... P1[6], P1[7], P3[6]... P2[4], P2[5], P4[4]... P2[6], P2[7], P4[6]... ++ vtrn.16 d6, d7 @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[5], P3[7], P7[4]... ++ vtrn.16 d16, d17 @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]... ++ vtrn.32 d2, d6 @ P1, P5 ++ vtrn.32 d4, d16 @ P2, P6 ++ vtrn.32 d3, d7 @ P3, P7 ++ vtrn.32 d5, d17 @ P4, P8 ++ vshll.u8 q10, d2, #1 @ 2*P1 ++ vshll.u8 q11, d6, #1 @ 2*P5 ++ vmovl.u8 q12, d4 @ P2 ++ vmovl.u8 q13, d16 @ P6 ++ vmovl.u8 q14, d3 @ P3 ++ vmls.i16 q10, q12, d0[1] @ 2*P1-5*P2 ++ vmovl.u8 q12, d7 @ P7 ++ vshll.u8 q1, d3, #1 @ 2*P3 ++ vmls.i16 q11, q13, d0[1] @ 2*P5-5*P6 ++ vmovl.u8 q2, d5 @ P4 ++ vmovl.u8 q8, d17 @ P8 ++ vmla.i16 q11, q12, d0[1] @ 2*P5-5*P6+5*P7 ++ vmovl.u8 q3, d6 @ P5 ++ vmla.i16 q10, q14, d0[1] @ 2*P1-5*P2+5*P3 ++ vsub.i16 q12, q2, q3 @ P4-P5 ++ vmls.i16 q1, q2, d0[1] @ 2*P3-5*P4 ++ vmls.i16 q11, q8, d0[0] @ 2*P5-5*P6+5*P7-2*P8 ++ vabs.s16 q8, q12 ++ vshr.s16 q12, q12, #8 @ clip_sign ++ vmls.i16 q10, q2, d0[0] @ 2*P1-5*P2+5*P3-2*P4 ++ vshr.s16 q8, q8, #1 @ clip ++ vmla.i16 q1, q3, d0[1] @ 2*P3-5*P4+5*P5 ++ vrshr.s16 q11, q11, #3 ++ vmls.i16 q1, q13, d0[0] @ 2*P3-5*P4+5*P5-2*P6 ++ vceq.i16 q13, q8, #0 @ test clip == 0 ++ vrshr.s16 q10, q10, #3 ++ vabs.s16 q11, q11 @ a2 ++ vabs.s16 q10, q10 @ a1 ++ vrshr.s16 q1, q1, #3 ++ vcge.s16 q14, q10, q11 @ test a1 >= a2 ++ vabs.s16 q15, q1 @ a0 ++ vshr.s16 q1, q1, #8 @ a0_sign ++ vbsl q14, q11, q10 @ a3 ++ vcge.s16 q9, q15, q9 @ test a0 >= pq ++ vsub.i16 q1, q12, q1 @ clip_sign - a0_sign ++ vqsub.u16 q10, q15, q14 @ a0 >= a3 ? 
a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 q11, q14, q15 @ test a3 >= a0 ++ vorr q9, q13, q9 @ test clip == 0 || a0 >= pq ++ vmul.i16 q0, q10, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0 ++ vorr q10, q9, q11 @ test clip == 0 || a0 >= pq || a3 >= a0 ++ vmov.32 r2, d20[1] @ move to gp reg ++ vshr.u16 q0, q0, #3 @ a0 >= a3 ? (5*(a0-a3))>>3 : 0 ++ vmov.32 r3, d21[1] ++ vcge.s16 q10, q0, q8 ++ and r14, r2, r3 ++ vbsl q10, q8, q0 @ FFMIN(d, clip) ++ tst r14, #1 ++ bne 2f @ none of the 8 pixel pairs should be updated in this case ++ vbic q0, q10, q9 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) ++ vmla.i16 q3, q0, q1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ vmls.i16 q2, q0, q1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ vqmovun.s16 d1, q3 ++ vqmovun.s16 d0, q2 ++ tst r2, #1 ++ bne 1f @ none of the first 4 pixel pairs should be updated if so ++ vst2.8 {d0[0], d1[0]}, [r0], r1 ++ vst2.8 {d0[1], d1[1]}, [r0], r1 ++ vst2.8 {d0[2], d1[2]}, [r0], r1 ++ vst2.8 {d0[3], d1[3]}, [r0] ++1: tst r3, #1 ++ bne 2f @ none of the second 4 pixel pairs should be updated if so ++ vst2.8 {d0[4], d1[4]}, [r12], r1 ++ vst2.8 {d0[5], d1[5]}, [r12], r1 ++ vst2.8 {d0[6], d1[6]}, [r12], r1 ++ vst2.8 {d0[7], d1[7]}, [r12] ++2: pop {pc} ++endfunc ++ ++@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks ++@ On entry: ++@ r0 -> top-left pel of lower block ++@ r1 = row stride, bytes ++@ r2 = PQUANT bitstream parameter ++function ff_vc1_v_loop_filter16_neon, export=1 ++ vpush {d8-d15} ++ sub r3, r0, r1, lsl #2 ++ vldr d0, .Lcoeffs ++ vld1.64 {q1}, [r0 :128], r1 @ P5 ++ vld1.64 {q2}, [r3 :128], r1 @ P1 ++ vld1.64 {q3}, [r3 :128], r1 @ P2 ++ vld1.64 {q4}, [r0 :128], r1 @ P6 ++ vld1.64 {q5}, [r3 :128], r1 @ P3 ++ vld1.64 {q6}, [r0 :128], r1 @ P7 ++ vshll.u8 q7, d2, #1 @ 2*P5[0..7] ++ vshll.u8 q8, d4, #1 @ 2*P1[0..7] ++ vld1.64 {q9}, [r3 :128] @ P4 ++ vmovl.u8 q10, d6 @ P2[0..7] ++ vld1.64 {q11}, [r0 :128] @ P8 ++ vmovl.u8 q12, d8 @ P6[0..7] ++ vdup.16 q13, r2 @ pq ++ vshll.u8 q2, d5, #1 @ 2*P1[8..15] ++ vmls.i16 q8, q10, d0[1] @ 2*P1[0..7]-5*P2[0..7] ++ vshll.u8 q10, d3, #1 @ 2*P5[8..15] ++ vmovl.u8 q3, d7 @ P2[8..15] ++ vmls.i16 q7, q12, d0[1] @ 2*P5[0..7]-5*P6[0..7] ++ vmovl.u8 q4, d9 @ P6[8..15] ++ vmovl.u8 q14, d10 @ P3[0..7] ++ vmovl.u8 q15, d12 @ P7[0..7] ++ vmls.i16 q2, q3, d0[1] @ 2*P1[8..15]-5*P2[8..15] ++ vshll.u8 q3, d10, #1 @ 2*P3[0..7] ++ vmls.i16 q10, q4, d0[1] @ 2*P5[8..15]-5*P6[8..15] ++ vmovl.u8 q6, d13 @ P7[8..15] ++ vmla.i16 q8, q14, d0[1] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7] ++ vmovl.u8 q14, d18 @ P4[0..7] ++ vmovl.u8 q9, d19 @ P4[8..15] ++ vmla.i16 q7, q15, d0[1] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7] ++ vmovl.u8 q15, d11 @ P3[8..15] ++ vshll.u8 q5, d11, #1 @ 2*P3[8..15] ++ vmls.i16 q3, q14, d0[1] @ 2*P3[0..7]-5*P4[0..7] ++ vmla.i16 q2, q15, d0[1] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15] ++ vmovl.u8 q15, d22 @ P8[0..7] ++ vmovl.u8 q11, d23 @ P8[8..15] ++ vmla.i16 q10, q6, d0[1] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15] ++ vmovl.u8 q6, d2 @ P5[0..7] ++ vmovl.u8 q1, d3 @ P5[8..15] ++ vmls.i16 q5, q9, d0[1] @ 2*P3[8..15]-5*P4[8..15] ++ vmls.i16 q8, q14, d0[0] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7] ++ vmls.i16 q7, q15, d0[0] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7] ++ vsub.i16 
q15, q14, q6 @ P4[0..7]-P5[0..7] ++ vmla.i16 q3, q6, d0[1] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7] ++ vrshr.s16 q8, q8, #3 ++ vmls.i16 q2, q9, d0[0] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15] ++ vrshr.s16 q7, q7, #3 ++ vmls.i16 q10, q11, d0[0] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15] ++ vabs.s16 q11, q15 ++ vabs.s16 q8, q8 @ a1[0..7] ++ vmla.i16 q5, q1, d0[1] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15] ++ vshr.s16 q15, q15, #8 @ clip_sign[0..7] ++ vrshr.s16 q2, q2, #3 ++ vmls.i16 q3, q12, d0[0] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7] ++ vabs.s16 q7, q7 @ a2[0..7] ++ vrshr.s16 q10, q10, #3 ++ vsub.i16 q12, q9, q1 @ P4[8..15]-P5[8..15] ++ vshr.s16 q11, q11, #1 @ clip[0..7] ++ vmls.i16 q5, q4, d0[0] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15] ++ vcge.s16 q4, q8, q7 @ test a1[0..7] >= a2[0..7] ++ vabs.s16 q2, q2 @ a1[8..15] ++ vrshr.s16 q3, q3, #3 ++ vabs.s16 q10, q10 @ a2[8..15] ++ vbsl q4, q7, q8 @ a3[0..7] ++ vabs.s16 q7, q12 ++ vshr.s16 q8, q12, #8 @ clip_sign[8..15] ++ vrshr.s16 q5, q5, #3 ++ vcge.s16 q12, q2, q10 @ test a1[8..15] >= a2[8.15] ++ vshr.s16 q7, q7, #1 @ clip[8..15] ++ vbsl q12, q10, q2 @ a3[8..15] ++ vabs.s16 q2, q3 @ a0[0..7] ++ vceq.i16 q10, q11, #0 @ test clip[0..7] == 0 ++ vshr.s16 q3, q3, #8 @ a0_sign[0..7] ++ vsub.i16 q3, q15, q3 @ clip_sign[0..7] - a0_sign[0..7] ++ vcge.s16 q15, q2, q13 @ test a0[0..7] >= pq ++ vorr q10, q10, q15 @ test clip[0..7] == 0 || a0[0..7] >= pq ++ vqsub.u16 q15, q2, q4 @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 q2, q4, q2 @ test a3[0..7] >= a0[0..7] ++ vabs.s16 q4, q5 @ a0[8..15] ++ vshr.s16 q5, q5, #8 @ a0_sign[8..15] ++ vmul.i16 q15, q15, d0[1] @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0 ++ vcge.s16 q13, q4, q13 @ test a0[8..15] >= pq ++ vorr q2, q10, q2 @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7] ++ vsub.i16 q5, q8, q5 @ clip_sign[8..15] - a0_sign[8..15] ++ vceq.i16 q8, q7, #0 @ test clip[8..15] == 0 ++ vshr.u16 q15, q15, #3 @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0 ++ vmov.32 r0, d4[1] @ move to gp reg ++ vorr q8, q8, q13 @ test clip[8..15] == 0 || a0[8..15] >= pq ++ vqsub.u16 q13, q4, q12 @ a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vmov.32 r2, d5[1] ++ vcge.s16 q4, q12, q4 @ test a3[8..15] >= a0[8..15] ++ vshl.i64 q2, q2, #16 ++ vcge.s16 q12, q15, q11 ++ vmul.i16 q0, q13, d0[1] @ a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0 ++ vorr q4, q8, q4 @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15] ++ vshr.s64 q2, q2, #48 ++ and r0, r0, r2 ++ vbsl q12, q11, q15 @ FFMIN(d[0..7], clip[0..7]) ++ vshl.i64 q11, q4, #16 ++ vmov.32 r2, d8[1] ++ vshr.u16 q0, q0, #3 @ a0[8..15] >= a3[8..15] ? 
(5*(a0[8..15]-a3[8..15]))>>3 : 0 ++ vorr q2, q10, q2 ++ vmov.32 r12, d9[1] ++ vshr.s64 q4, q11, #48 ++ vcge.s16 q10, q0, q7 ++ vbic q2, q12, q2 @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub) ++ vorr q4, q8, q4 ++ and r2, r2, r12 ++ vbsl q10, q7, q0 @ FFMIN(d[8..15], clip[8..15]) ++ vmls.i16 q14, q2, q3 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7] ++ and r0, r0, r2 ++ vbic q0, q10, q4 @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub) ++ tst r0, #1 ++ bne 1f @ none of the 16 pixel pairs should be updated in this case ++ vmla.i16 q6, q2, q3 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7] ++ vmls.i16 q9, q0, q5 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15] ++ vqmovun.s16 d4, q14 ++ vmla.i16 q1, q0, q5 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15] ++ vqmovun.s16 d0, q6 ++ vqmovun.s16 d5, q9 ++ vqmovun.s16 d1, q1 ++ vst1.64 {q2}, [r3 :128], r1 ++ vst1.64 {q0}, [r3 :128] ++1: vpop {d8-d15} ++ bx lr ++endfunc ++ ++@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks ++@ On entry: ++@ r0 -> top-left pel of right block ++@ r1 = row stride, bytes ++@ r2 = PQUANT bitstream parameter ++function ff_vc1_h_loop_filter16_neon, export=1 ++ push {r4-r6,lr} ++ vpush {d8-d15} ++ sub r3, r0, #4 @ where to start reading ++ vldr d0, .Lcoeffs ++ vld1.32 {d2}, [r3], r1 @ P1[0], P2[0]... ++ sub r0, r0, #1 @ where to start writing ++ vld1.32 {d3}, [r3], r1 ++ add r4, r0, r1, lsl #2 ++ vld1.32 {d10}, [r3], r1 ++ vld1.32 {d11}, [r3], r1 ++ vld1.32 {d16}, [r3], r1 ++ vld1.32 {d4}, [r3], r1 ++ vld1.32 {d8}, [r3], r1 ++ vtrn.8 d2, d3 @ P1[0], P1[1], P3[0]... P2[0], P2[1], P4[0]... ++ vld1.32 {d14}, [r3], r1 ++ vld1.32 {d5}, [r3], r1 ++ vtrn.8 d10, d11 @ P1[2], P1[3], P3[2]... P2[2], P2[3], P4[2]... ++ vld1.32 {d6}, [r3], r1 ++ vld1.32 {d12}, [r3], r1 ++ vtrn.8 d16, d4 @ P1[4], P1[5], P3[4]... P2[4], P2[5], P4[4]... ++ vld1.32 {d13}, [r3], r1 ++ vtrn.16 d2, d10 @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]... ++ vld1.32 {d1}, [r3], r1 ++ vtrn.8 d8, d14 @ P1[6], P1[7], P3[6]... P2[6], P2[7], P4[6]... ++ vld1.32 {d7}, [r3], r1 ++ vtrn.16 d3, d11 @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]... ++ vld1.32 {d9}, [r3], r1 ++ vtrn.8 d5, d6 @ P1[8], P1[9], P3[8]... P2[8], P2[9], P4[8]... ++ vld1.32 {d15}, [r3] ++ vtrn.16 d16, d8 @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[6], P3[7], P7[4]... ++ vtrn.16 d4, d14 @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]... ++ vtrn.8 d12, d13 @ P1[10], P1[11], P3[10]... P2[10], P2[11], P4[10]... ++ vdup.16 q9, r2 @ pq ++ vtrn.8 d1, d7 @ P1[12], P1[13], P3[12]... P2[12], P2[13], P4[12]... ++ vtrn.32 d2, d16 @ P1[0..7], P5[0..7] ++ vtrn.16 d5, d12 @ P1[8], P1[7], P1[10], P1[11], P5[8]... P3[8], P3[9], P3[10], P3[11], P7[8]... ++ vtrn.16 d6, d13 @ P2[8], P2[7], P2[10], P2[11], P6[8]... P4[8], P4[9], P4[10], P4[11], P8[8]... ++ vtrn.8 d9, d15 @ P1[14], P1[15], P3[14]... P2[14], P2[15], P4[14]... 
++ vtrn.32 d3, d4 @ P2[0..7], P6[0..7] ++ vshll.u8 q10, d2, #1 @ 2*P1[0..7] ++ vtrn.32 d10, d8 @ P3[0..7], P7[0..7] ++ vshll.u8 q11, d16, #1 @ 2*P5[0..7] ++ vtrn.32 d11, d14 @ P4[0..7], P8[0..7] ++ vtrn.16 d1, d9 @ P1[12], P1[13], P1[14], P1[15], P5[12]... P3[12], P3[13], P3[14], P3[15], P7[12]... ++ vtrn.16 d7, d15 @ P2[12], P2[13], P2[14], P2[15], P6[12]... P4[12], P4[13], P4[14], P4[15], P8[12]... ++ vmovl.u8 q1, d3 @ P2[0..7] ++ vmovl.u8 q12, d4 @ P6[0..7] ++ vtrn.32 d5, d1 @ P1[8..15], P5[8..15] ++ vtrn.32 d6, d7 @ P2[8..15], P6[8..15] ++ vtrn.32 d12, d9 @ P3[8..15], P7[8..15] ++ vtrn.32 d13, d15 @ P4[8..15], P8[8..15] ++ vmls.i16 q10, q1, d0[1] @ 2*P1[0..7]-5*P2[0..7] ++ vmovl.u8 q1, d10 @ P3[0..7] ++ vshll.u8 q2, d5, #1 @ 2*P1[8..15] ++ vshll.u8 q13, d1, #1 @ 2*P5[8..15] ++ vmls.i16 q11, q12, d0[1] @ 2*P5[0..7]-5*P6[0..7] ++ vmovl.u8 q14, d6 @ P2[8..15] ++ vmovl.u8 q3, d7 @ P6[8..15] ++ vmovl.u8 q15, d8 @ P7[0..7] ++ vmla.i16 q10, q1, d0[1] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7] ++ vmovl.u8 q1, d12 @ P3[8..15] ++ vmls.i16 q2, q14, d0[1] @ 2*P1[8..15]-5*P2[8..15] ++ vmovl.u8 q4, d9 @ P7[8..15] ++ vshll.u8 q14, d10, #1 @ 2*P3[0..7] ++ vmls.i16 q13, q3, d0[1] @ 2*P5[8..15]-5*P6[8..15] ++ vmovl.u8 q5, d11 @ P4[0..7] ++ vmla.i16 q11, q15, d0[1] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7] ++ vshll.u8 q15, d12, #1 @ 2*P3[8..15] ++ vmovl.u8 q6, d13 @ P4[8..15] ++ vmla.i16 q2, q1, d0[1] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15] ++ vmovl.u8 q1, d14 @ P8[0..7] ++ vmovl.u8 q7, d15 @ P8[8..15] ++ vmla.i16 q13, q4, d0[1] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15] ++ vmovl.u8 q4, d16 @ P5[0..7] ++ vmovl.u8 q8, d1 @ P5[8..15] ++ vmls.i16 q14, q5, d0[1] @ 2*P3[0..7]-5*P4[0..7] ++ vmls.i16 q15, q6, d0[1] @ 2*P3[8..15]-5*P4[8..15] ++ vmls.i16 q10, q5, d0[0] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7] ++ vmls.i16 q11, q1, d0[0] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7] ++ vsub.i16 q1, q5, q4 @ P4[0..7]-P5[0..7] ++ vmls.i16 q2, q6, d0[0] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15] ++ vrshr.s16 q10, q10, #3 ++ vmls.i16 q13, q7, d0[0] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15] ++ vsub.i16 q7, q6, q8 @ P4[8..15]-P5[8..15] ++ vrshr.s16 q11, q11, #3 ++ vmla.s16 q14, q4, d0[1] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7] ++ vrshr.s16 q2, q2, #3 ++ vmla.i16 q15, q8, d0[1] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15] ++ vabs.s16 q10, q10 @ a1[0..7] ++ vrshr.s16 q13, q13, #3 ++ vmls.i16 q15, q3, d0[0] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15] ++ vabs.s16 q3, q11 @ a2[0..7] ++ vabs.s16 q2, q2 @ a1[8..15] ++ vmls.i16 q14, q12, d0[0] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7] ++ vabs.s16 q11, q1 ++ vabs.s16 q12, q13 @ a2[8..15] ++ vcge.s16 q13, q10, q3 @ test a1[0..7] >= a2[0..7] ++ vshr.s16 q1, q1, #8 @ clip_sign[0..7] ++ vrshr.s16 q15, q15, #3 ++ vshr.s16 q11, q11, #1 @ clip[0..7] ++ vrshr.s16 q14, q14, #3 ++ vbsl q13, q3, q10 @ a3[0..7] ++ vcge.s16 q3, q2, q12 @ test a1[8..15] >= a2[8.15] ++ vabs.s16 q10, q15 @ a0[8..15] ++ vshr.s16 q15, q15, #8 @ a0_sign[8..15] ++ vbsl q3, q12, q2 @ a3[8..15] ++ vabs.s16 q2, q14 @ a0[0..7] ++ vabs.s16 q12, q7 ++ vshr.s16 q7, q7, #8 @ clip_sign[8..15] ++ vshr.s16 q14, q14, #8 @ a0_sign[0..7] ++ vshr.s16 q12, q12, #1 @ clip[8..15] ++ vsub.i16 q7, q7, q15 @ clip_sign[8..15] - a0_sign[8..15] ++ vqsub.u16 q15, q10, q3 @ a0[8..15] >= a3[8..15] ? 
a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 q3, q3, q10 @ test a3[8..15] >= a0[8..15] ++ vcge.s16 q10, q10, q9 @ test a0[8..15] >= pq ++ vcge.s16 q9, q2, q9 @ test a0[0..7] >= pq ++ vsub.i16 q1, q1, q14 @ clip_sign[0..7] - a0_sign[0..7] ++ vqsub.u16 q14, q2, q13 @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 q2, q13, q2 @ test a3[0..7] >= a0[0..7] ++ vmul.i16 q13, q15, d0[1] @ a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0 ++ vceq.i16 q15, q11, #0 @ test clip[0..7] == 0 ++ vmul.i16 q0, q14, d0[1] @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0 ++ vorr q9, q15, q9 @ test clip[0..7] == 0 || a0[0..7] >= pq ++ vceq.i16 q14, q12, #0 @ test clip[8..15] == 0 ++ vshr.u16 q13, q13, #3 @ a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0 ++ vorr q2, q9, q2 @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7] ++ vshr.u16 q0, q0, #3 @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0 ++ vorr q10, q14, q10 @ test clip[8..15] == 0 || a0[8..15] >= pq ++ vcge.s16 q14, q13, q12 ++ vmov.32 r2, d4[1] @ move to gp reg ++ vorr q3, q10, q3 @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15] ++ vmov.32 r3, d5[1] ++ vcge.s16 q2, q0, q11 ++ vbsl q14, q12, q13 @ FFMIN(d[8..15], clip[8..15]) ++ vbsl q2, q11, q0 @ FFMIN(d[0..7], clip[0..7]) ++ vmov.32 r5, d6[1] ++ vbic q0, q14, q10 @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub) ++ vmov.32 r6, d7[1] ++ and r12, r2, r3 ++ vbic q2, q2, q9 @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub) ++ vmls.i16 q6, q0, q7 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4 ++ vmls.i16 q5, q2, q1 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4 ++ and r14, r5, r6 ++ vmla.i16 q4, q2, q1 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5 ++ and r12, r12, r14 ++ vqmovun.s16 d4, q6 ++ vmla.i16 q8, q0, q7 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5 ++ tst r12, #1 ++ bne 4f @ none of the 16 pixel pairs should be updated in this case ++ vqmovun.s16 d2, q5 ++ vqmovun.s16 d3, q4 ++ vqmovun.s16 d5, q8 ++ tst r2, #1 ++ bne 1f ++ vst2.8 {d2[0], d3[0]}, [r0], r1 ++ vst2.8 {d2[1], d3[1]}, [r0], r1 ++ vst2.8 {d2[2], d3[2]}, [r0], r1 ++ vst2.8 {d2[3], d3[3]}, [r0] ++1: add r0, r4, r1, lsl #2 ++ tst r3, #1 ++ bne 2f ++ vst2.8 {d2[4], d3[4]}, [r4], r1 ++ vst2.8 {d2[5], d3[5]}, [r4], r1 ++ vst2.8 {d2[6], d3[6]}, [r4], r1 ++ vst2.8 {d2[7], d3[7]}, [r4] ++2: add r4, r0, r1, lsl #2 ++ tst r5, #1 ++ bne 3f ++ vst2.8 {d4[0], d5[0]}, [r0], r1 ++ vst2.8 {d4[1], d5[1]}, [r0], r1 ++ vst2.8 {d4[2], d5[2]}, [r0], r1 ++ vst2.8 {d4[3], d5[3]}, [r0] ++3: tst r6, #1 ++ bne 4f ++ vst2.8 {d4[4], d5[4]}, [r4], r1 ++ vst2.8 {d4[5], d5[5]}, [r4], r1 ++ vst2.8 {d4[6], d5[6]}, [r4], r1 ++ vst2.8 {d4[7], d5[7]}, [r4] ++4: vpop {d8-d15} ++ pop {r4-r6,pc} ++endfunc ++ ++@ Copy at most the specified number of bytes from source to destination buffer, ++@ 
stopping at a multiple of 16 bytes, none of which are the start of an escape sequence ++@ On entry: ++@ r0 -> source buffer ++@ r1 = max number of bytes to copy ++@ r2 -> destination buffer, optimally 8-byte aligned ++@ On exit: ++@ r0 = number of bytes not copied ++function ff_vc1_unescape_buffer_helper_neon, export=1 ++ @ Offset by 48 to screen out cases that are too short for us to handle, ++ @ and also make it easy to test for loop termination, or to determine ++ @ whether we need an odd number of half-iterations of the loop. ++ subs r1, r1, #48 ++ bmi 90f ++ ++ @ Set up useful constants ++ vmov.i32 q0, #0x3000000 ++ vmov.i32 q1, #0x30000 ++ ++ tst r1, #16 ++ bne 1f ++ ++ vld1.8 {q8, q9}, [r0]! ++ vbic q12, q8, q0 ++ vext.8 q13, q8, q9, #1 ++ vext.8 q14, q8, q9, #2 ++ vext.8 q15, q8, q9, #3 ++ veor q12, q12, q1 ++ vbic q13, q13, q0 ++ vbic q14, q14, q0 ++ vbic q15, q15, q0 ++ vceq.i32 q12, q12, #0 ++ veor q13, q13, q1 ++ veor q14, q14, q1 ++ veor q15, q15, q1 ++ vceq.i32 q13, q13, #0 ++ vceq.i32 q14, q14, #0 ++ vceq.i32 q15, q15, #0 ++ add r1, r1, #16 ++ b 3f ++ ++1: vld1.8 {q10, q11}, [r0]! ++ vbic q12, q10, q0 ++ vext.8 q13, q10, q11, #1 ++ vext.8 q14, q10, q11, #2 ++ vext.8 q15, q10, q11, #3 ++ veor q12, q12, q1 ++ vbic q13, q13, q0 ++ vbic q14, q14, q0 ++ vbic q15, q15, q0 ++ vceq.i32 q12, q12, #0 ++ veor q13, q13, q1 ++ veor q14, q14, q1 ++ veor q15, q15, q1 ++ vceq.i32 q13, q13, #0 ++ vceq.i32 q14, q14, #0 ++ vceq.i32 q15, q15, #0 ++ @ Drop through... ++2: vmov q8, q11 ++ vld1.8 {q9}, [r0]! ++ vorr q13, q12, q13 ++ vorr q15, q14, q15 ++ vbic q12, q8, q0 ++ vorr q3, q13, q15 ++ vext.8 q13, q8, q9, #1 ++ vext.8 q14, q8, q9, #2 ++ vext.8 q15, q8, q9, #3 ++ veor q12, q12, q1 ++ vorr d6, d6, d7 ++ vbic q13, q13, q0 ++ vbic q14, q14, q0 ++ vbic q15, q15, q0 ++ vceq.i32 q12, q12, #0 ++ vmov r3, r12, d6 ++ veor q13, q13, q1 ++ veor q14, q14, q1 ++ veor q15, q15, q1 ++ vceq.i32 q13, q13, #0 ++ vceq.i32 q14, q14, #0 ++ vceq.i32 q15, q15, #0 ++ orrs r3, r3, r12 ++ bne 90f ++ vst1.64 {q10}, [r2]! ++3: vmov q10, q9 ++ vld1.8 {q11}, [r0]! ++ vorr q13, q12, q13 ++ vorr q15, q14, q15 ++ vbic q12, q10, q0 ++ vorr q3, q13, q15 ++ vext.8 q13, q10, q11, #1 ++ vext.8 q14, q10, q11, #2 ++ vext.8 q15, q10, q11, #3 ++ veor q12, q12, q1 ++ vorr d6, d6, d7 ++ vbic q13, q13, q0 ++ vbic q14, q14, q0 ++ vbic q15, q15, q0 ++ vceq.i32 q12, q12, #0 ++ vmov r3, r12, d6 ++ veor q13, q13, q1 ++ veor q14, q14, q1 ++ veor q15, q15, q1 ++ vceq.i32 q13, q13, #0 ++ vceq.i32 q14, q14, #0 ++ vceq.i32 q15, q15, #0 ++ orrs r3, r3, r12 ++ bne 91f ++ vst1.64 {q8}, [r2]! ++ subs r1, r1, #32 ++ bpl 2b ++ ++90: add r0, r1, #48 ++ bx lr ++ ++91: sub r1, r1, #16 ++ b 90b ++endfunc --- a/libavcodec/avcodec.h +++ b/libavcodec/avcodec.h @@ -2567,6 +2567,17 @@ typedef struct AVHWAccel { @@ -15229,7 +17987,7 @@ Upstream-status: Pending + */ + void (*abort_frame)(AVCodecContext *avctx); } AVHWAccel; - + /** --- a/libavcodec/cabac.h +++ b/libavcodec/cabac.h @@ -15253,7 +18011,7 @@ Upstream-status: Pending +++ b/libavcodec/codec.h @@ -350,6 +350,17 @@ const AVCodec *av_codec_iterate(void **o AVCodec *avcodec_find_decoder(enum AVCodecID id); - + /** + * Find a registered decoder with a matching codec ID and pix_fmt. + * A decoder will pix_fmt set to NULL will match any fmt. @@ -15761,12 +18519,788 @@ Upstream-status: Pending +}; + +#endif +--- /dev/null ++++ b/libavcodec/hevc-ctrls-v3.h +@@ -0,0 +1,255 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * These are the HEVC state controls for use with stateless HEVC ++ * codec drivers. 
++ * ++ * It turns out that these structs are not stable yet and will undergo ++ * more changes. So keep them private until they are stable and ready to ++ * become part of the official public API. ++ */ ++ ++#ifndef _HEVC_CTRLS_H_ ++#define _HEVC_CTRLS_H_ ++ ++#include ++ ++/* The pixel format isn't stable at the moment and will likely be renamed. */ ++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ ++ ++#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_CODEC_BASE + 1008) ++#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_CODEC_BASE + 1009) ++#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_BASE + 1010) ++#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_BASE + 1011) ++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_BASE + 1012) ++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_CODEC_BASE + 1015) ++#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_CODEC_BASE + 1016) ++ ++/* enum v4l2_ctrl_type type values */ ++#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120 ++#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121 ++#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122 ++#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123 ++#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124 ++ ++enum v4l2_mpeg_video_hevc_decode_mode { ++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED, ++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED, ++}; ++ ++enum v4l2_mpeg_video_hevc_start_code { ++ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE, ++ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B, ++}; ++ ++#define V4L2_HEVC_SLICE_TYPE_B 0 ++#define V4L2_HEVC_SLICE_TYPE_P 1 ++#define V4L2_HEVC_SLICE_TYPE_I 2 ++ ++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) ++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) ++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) ++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) ++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) ++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) ++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) ++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) ++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) ++ ++/* The controls are not stable at the moment and will likely be reworked. */ ++struct v4l2_ctrl_hevc_sps { ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Sequence parameter set */ ++ __u16 pic_width_in_luma_samples; ++ __u16 pic_height_in_luma_samples; ++ __u8 bit_depth_luma_minus8; ++ __u8 bit_depth_chroma_minus8; ++ __u8 log2_max_pic_order_cnt_lsb_minus4; ++ __u8 sps_max_dec_pic_buffering_minus1; ++ __u8 sps_max_num_reorder_pics; ++ __u8 sps_max_latency_increase_plus1; ++ __u8 log2_min_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_luma_coding_block_size; ++ __u8 log2_min_luma_transform_block_size_minus2; ++ __u8 log2_diff_max_min_luma_transform_block_size; ++ __u8 max_transform_hierarchy_depth_inter; ++ __u8 max_transform_hierarchy_depth_intra; ++ __u8 pcm_sample_bit_depth_luma_minus1; ++ __u8 pcm_sample_bit_depth_chroma_minus1; ++ __u8 log2_min_pcm_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_pcm_luma_coding_block_size; ++ __u8 num_short_term_ref_pic_sets; ++ __u8 num_long_term_ref_pics_sps; ++ __u8 chroma_format_idc; ++ __u8 sps_max_sub_layers_minus1; ++ ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0) ++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) ++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) ++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) ++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) ++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) ++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) ++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) ++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) ++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) ++#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) ++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13) ++#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) ++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) ++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) ++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) ++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) ++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19) ++#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20) ++ ++struct v4l2_ctrl_hevc_pps { ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Picture parameter set */ ++ __u8 num_extra_slice_header_bits; ++ __u8 num_ref_idx_l0_default_active_minus1; ++ __u8 num_ref_idx_l1_default_active_minus1; ++ __s8 init_qp_minus26; ++ __u8 diff_cu_qp_delta_depth; ++ __s8 pps_cb_qp_offset; ++ __s8 pps_cr_qp_offset; ++ __u8 num_tile_columns_minus1; ++ __u8 num_tile_rows_minus1; ++ __u8 column_width_minus1[20]; ++ __u8 row_height_minus1[22]; ++ __s8 pps_beta_offset_div2; ++ __s8 pps_tc_offset_div2; ++ __u8 log2_parallel_merge_level_minus2; ++ ++ __u8 padding[4]; ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE 0x01 ++ ++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 ++ ++struct v4l2_hevc_dpb_entry { ++ __u64 timestamp; ++ __u8 flags; ++ __u8 field_pic; ++ __u16 pic_order_cnt[2]; ++ __u8 padding[2]; ++}; ++ ++struct v4l2_hevc_pred_weight_table { ++ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __u8 padding[6]; ++ ++ __u8 luma_log2_weight_denom; ++ __s8 delta_chroma_log2_weight_denom; ++}; ++ ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) ++ ++struct v4l2_ctrl_hevc_slice_params { ++ __u32 bit_size; ++ __u32 data_bit_offset; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u32 slice_segment_addr; ++ __u32 num_entry_point_offsets; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ ++ __u8 nal_unit_type; ++ __u8 nuh_temporal_id_plus1; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u8 slice_type; ++ __u8 colour_plane_id; ++ __u16 slice_pic_order_cnt; ++ __u8 num_ref_idx_l0_active_minus1; ++ __u8 num_ref_idx_l1_active_minus1; ++ __u8 collocated_ref_idx; ++ __u8 five_minus_max_num_merge_cand; ++ __s8 slice_qp_delta; ++ __s8 slice_cb_qp_offset; ++ __s8 slice_cr_qp_offset; ++ __s8 slice_act_y_qp_offset; ++ __s8 slice_act_cb_qp_offset; ++ __s8 slice_act_cr_qp_offset; ++ __s8 slice_beta_offset_div2; ++ __s8 slice_tc_offset_div2; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ ++ __u8 pic_struct; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ ++ __u8 padding[5]; ++ ++ __u32 entry_point_offset_minus1[256]; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Weighted prediction parameter */ ++ struct v4l2_hevc_pred_weight_table pred_weight_table; ++ ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1 ++#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2 ++#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4 ++ ++struct v4l2_ctrl_hevc_decode_params { ++ __s32 pic_order_cnt_val; ++ __u8 num_active_dpb_entries; ++ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 num_poc_st_curr_before; ++ __u8 num_poc_st_curr_after; ++ __u8 num_poc_lt_curr; ++ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u64 flags; ++}; ++ ++struct v4l2_ctrl_hevc_scaling_matrix { ++ __u8 scaling_list_4x4[6][16]; ++ __u8 scaling_list_8x8[6][64]; ++ __u8 scaling_list_16x16[6][64]; ++ __u8 scaling_list_32x32[2][64]; ++ __u8 scaling_list_dc_coef_16x16[6]; ++ __u8 scaling_list_dc_coef_32x32[2]; ++}; ++ ++/* MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */ ++#define V4L2_CID_CODEC_HANTRO_BASE (V4L2_CTRL_CLASS_CODEC | 0x1200) ++/* ++ * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP - ++ * the number of data (in bits) to skip in the ++ * slice segment header. ++ * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag" ++ * to before syntax element "slice_temporal_mvp_enabled_flag". ++ * If IDR, the skipped bits are just "pic_output_flag" ++ * (separate_colour_plane_flag is not supported). ++ */ ++#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP (V4L2_CID_CODEC_HANTRO_BASE + 0) ++ ++#endif +--- /dev/null ++++ b/libavcodec/hevc-ctrls-v4.h +@@ -0,0 +1,515 @@ ++/* SPDX-License-Identifier: ((GPL-2.0+ WITH Linux-syscall-note) OR BSD-3-Clause) */ ++/* ++ * Video for Linux Two controls header file ++ * ++ * Copyright (C) 1999-2012 the contributors ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * Alternatively you can redistribute this file under the terms of the ++ * BSD license as stated below: ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in ++ * the documentation and/or other materials provided with the ++ * distribution. ++ * 3. The names of its contributors may not be used to endorse or promote ++ * products derived from this software without specific prior written ++ * permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT ++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED ++ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * The contents of this header was split off from videodev2.h. All control ++ * definitions should be added to this header, which is included by ++ * videodev2.h. ++ */ ++ ++#ifndef AVCODEC_HEVC_CTRLS_V4_H ++#define AVCODEC_HEVC_CTRLS_V4_H ++ ++#include ++#include ++ ++#define V4L2_CID_STATELESS_HEVC_SPS (V4L2_CID_CODEC_STATELESS_BASE + 400) ++#define V4L2_CID_STATELESS_HEVC_PPS (V4L2_CID_CODEC_STATELESS_BASE + 401) ++#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 402) ++#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_STATELESS_BASE + 403) ++#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 404) ++#define V4L2_CID_STATELESS_HEVC_DECODE_MODE (V4L2_CID_CODEC_STATELESS_BASE + 405) ++#define V4L2_CID_STATELESS_HEVC_START_CODE (V4L2_CID_CODEC_STATELESS_BASE + 406) ++#define V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS (V4L2_CID_CODEC_STATELESS_BASE + 407) ++ ++enum v4l2_stateless_hevc_decode_mode { ++ V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED, ++ V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED, ++}; ++ ++enum v4l2_stateless_hevc_start_code { ++ V4L2_STATELESS_HEVC_START_CODE_NONE, ++ V4L2_STATELESS_HEVC_START_CODE_ANNEX_B, ++}; ++ ++#define V4L2_HEVC_SLICE_TYPE_B 0 ++#define V4L2_HEVC_SLICE_TYPE_P 1 ++#define V4L2_HEVC_SLICE_TYPE_I 2 ++ ++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) ++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) ++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) ++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) ++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) ++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) ++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) ++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) ++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) ++ ++/** ++ * struct v4l2_ctrl_hevc_sps - ITU-T Rec. 
H.265: Sequence parameter set ++ * ++ * @video_parameter_set_id: specifies the value of the ++ * vps_video_parameter_set_id of the active VPS ++ * @seq_parameter_set_id: provides an identifier for the SPS for ++ * reference by other syntax elements ++ * @pic_width_in_luma_samples: specifies the width of each decoded picture ++ * in units of luma samples ++ * @pic_height_in_luma_samples: specifies the height of each decoded picture ++ * in units of luma samples ++ * @bit_depth_luma_minus8: this value plus 8specifies the bit depth of the ++ * samples of the luma array ++ * @bit_depth_chroma_minus8: this value plus 8 specifies the bit depth of the ++ * samples of the chroma arrays ++ * @log2_max_pic_order_cnt_lsb_minus4: this value plus 4 specifies the value of ++ * the variable MaxPicOrderCntLsb ++ * @sps_max_dec_pic_buffering_minus1: this value plus 1 specifies the maximum ++ * required size of the decoded picture ++ * buffer for the codec video sequence ++ * @sps_max_num_reorder_pics: indicates the maximum allowed number of pictures ++ * @sps_max_latency_increase_plus1: not equal to 0 is used to compute the ++ * value of SpsMaxLatencyPictures array ++ * @log2_min_luma_coding_block_size_minus3: plus 3 specifies the minimum ++ * luma coding block size ++ * @log2_diff_max_min_luma_coding_block_size: specifies the difference between ++ * the maximum and minimum luma ++ * coding block size ++ * @log2_min_luma_transform_block_size_minus2: plus 2 specifies the minimum luma ++ * transform block size ++ * @log2_diff_max_min_luma_transform_block_size: specifies the difference between ++ * the maximum and minimum luma ++ * transform block size ++ * @max_transform_hierarchy_depth_inter: specifies the maximum hierarchy ++ * depth for transform units of ++ * coding units coded in inter ++ * prediction mode ++ * @max_transform_hierarchy_depth_intra: specifies the maximum hierarchy ++ * depth for transform units of ++ * coding units coded in intra ++ * prediction mode ++ * @pcm_sample_bit_depth_luma_minus1: this value plus 1 specifies the number of ++ * bits used to represent each of PCM sample ++ * values of the luma component ++ * @pcm_sample_bit_depth_chroma_minus1: this value plus 1 specifies the number ++ * of bits used to represent each of PCM ++ * sample values of the chroma components ++ * @log2_min_pcm_luma_coding_block_size_minus3: this value plus 3 specifies the ++ * minimum size of coding blocks ++ * @log2_diff_max_min_pcm_luma_coding_block_size: specifies the difference between ++ * the maximum and minimum size of ++ * coding blocks ++ * @num_short_term_ref_pic_sets: specifies the number of st_ref_pic_set() ++ * syntax structures included in the SPS ++ * @num_long_term_ref_pics_sps: specifies the number of candidate long-term ++ * reference pictures that are specified in the SPS ++ * @chroma_format_idc: specifies the chroma sampling ++ * @sps_max_sub_layers_minus1: this value plus 1 specifies the maximum number ++ * of temporal sub-layers ++ * @reserved: padding field. Should be zeroed by applications. 
++ * @flags: see V4L2_HEVC_SPS_FLAG_{} ++ */ ++struct v4l2_ctrl_hevc_sps { ++ __u8 video_parameter_set_id; ++ __u8 seq_parameter_set_id; ++ __u16 pic_width_in_luma_samples; ++ __u16 pic_height_in_luma_samples; ++ __u8 bit_depth_luma_minus8; ++ __u8 bit_depth_chroma_minus8; ++ __u8 log2_max_pic_order_cnt_lsb_minus4; ++ __u8 sps_max_dec_pic_buffering_minus1; ++ __u8 sps_max_num_reorder_pics; ++ __u8 sps_max_latency_increase_plus1; ++ __u8 log2_min_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_luma_coding_block_size; ++ __u8 log2_min_luma_transform_block_size_minus2; ++ __u8 log2_diff_max_min_luma_transform_block_size; ++ __u8 max_transform_hierarchy_depth_inter; ++ __u8 max_transform_hierarchy_depth_intra; ++ __u8 pcm_sample_bit_depth_luma_minus1; ++ __u8 pcm_sample_bit_depth_chroma_minus1; ++ __u8 log2_min_pcm_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_pcm_luma_coding_block_size; ++ __u8 num_short_term_ref_pic_sets; ++ __u8 num_long_term_ref_pics_sps; ++ __u8 chroma_format_idc; ++ __u8 sps_max_sub_layers_minus1; ++ ++ __u8 reserved[6]; ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0) ++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) ++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) ++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) ++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) ++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) ++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) ++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) ++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) ++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) ++#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) ++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13) ++#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) ++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) ++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) ++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) ++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) ++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19) ++#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20) ++ ++/** ++ * struct v4l2_ctrl_hevc_pps - ITU-T Rec. H.265: Picture parameter set ++ * ++ * @pic_parameter_set_id: identifies the PPS for reference by other ++ * syntax elements ++ * @num_extra_slice_header_bits: specifies the number of extra slice header ++ * bits that are present in the slice header RBSP ++ * for coded pictures referring to the PPS. 
++ * @num_ref_idx_l0_default_active_minus1: this value plus 1 specifies the ++ * inferred value of num_ref_idx_l0_active_minus1 ++ * @num_ref_idx_l1_default_active_minus1: this value plus 1 specifies the ++ * inferred value of num_ref_idx_l1_active_minus1 ++ * @init_qp_minus26: this value plus 26 specifies the initial value of SliceQp Y for ++ * each slice referring to the PPS ++ * @diff_cu_qp_delta_depth: specifies the difference between the luma coding ++ * tree block size and the minimum luma coding block ++ * size of coding units that convey cu_qp_delta_abs ++ * and cu_qp_delta_sign_flag ++ * @pps_cb_qp_offset: specify the offsets to the luma quantization parameter Cb ++ * @pps_cr_qp_offset: specify the offsets to the luma quantization parameter Cr ++ * @num_tile_columns_minus1: this value plus 1 specifies the number of tile columns ++ * partitioning the picture ++ * @num_tile_rows_minus1: this value plus 1 specifies the number of tile rows partitioning ++ * the picture ++ * @column_width_minus1: this value plus 1 specifies the width of the each tile column in ++ * units of coding tree blocks ++ * @row_height_minus1: this value plus 1 specifies the height of the each tile row in ++ * units of coding tree blocks ++ * @pps_beta_offset_div2: specify the default deblocking parameter offsets for ++ * beta divided by 2 ++ * @pps_tc_offset_div2: specify the default deblocking parameter offsets for tC ++ * divided by 2 ++ * @log2_parallel_merge_level_minus2: this value plus 2 specifies the value of ++ * the variable Log2ParMrgLevel ++ * @reserved: padding field. Should be zeroed by applications. ++ * @flags: see V4L2_HEVC_PPS_FLAG_{} ++ */ ++struct v4l2_ctrl_hevc_pps { ++ __u8 pic_parameter_set_id; ++ __u8 num_extra_slice_header_bits; ++ __u8 num_ref_idx_l0_default_active_minus1; ++ __u8 num_ref_idx_l1_default_active_minus1; ++ __s8 init_qp_minus26; ++ __u8 diff_cu_qp_delta_depth; ++ __s8 pps_cb_qp_offset; ++ __s8 pps_cr_qp_offset; ++ __u8 num_tile_columns_minus1; ++ __u8 num_tile_rows_minus1; ++ __u8 column_width_minus1[20]; ++ __u8 row_height_minus1[22]; ++ __s8 pps_beta_offset_div2; ++ __s8 pps_tc_offset_div2; ++ __u8 log2_parallel_merge_level_minus2; ++ __u8 reserved; ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE 0x01 ++ ++#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME 0 ++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_FIELD 1 ++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_FIELD 2 ++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM 3 ++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP 4 ++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM_TOP 5 ++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM 6 ++#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_DOUBLING 7 ++#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_TRIPLING 8 ++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_PREVIOUS_BOTTOM 9 ++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_PREVIOUS_TOP 10 ++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_NEXT_BOTTOM 11 ++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_NEXT_TOP 12 ++ ++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 ++ ++/** ++ * struct v4l2_hevc_dpb_entry - HEVC decoded picture buffer entry ++ * ++ * @timestamp: timestamp of the V4L2 capture buffer to use as reference. ++ * @flags: long term flag for the reference frame ++ * @field_pic: whether the reference is a field picture or a frame. ++ * @reserved: padding field. Should be zeroed by applications. ++ * @pic_order_cnt_val: the picture order count of the current picture. 
++ */ ++struct v4l2_hevc_dpb_entry { ++ __u64 timestamp; ++ __u8 flags; ++ __u8 field_pic; ++ __u16 reserved; ++ __s32 pic_order_cnt_val; ++}; ++ ++/** ++ * struct v4l2_hevc_pred_weight_table - HEVC weighted prediction parameters ++ * ++ * @delta_luma_weight_l0: the difference of the weighting factor applied ++ * to the luma prediction value for list 0 ++ * @luma_offset_l0: the additive offset applied to the luma prediction value ++ * for list 0 ++ * @delta_chroma_weight_l0: the difference of the weighting factor applied ++ * to the chroma prediction values for list 0 ++ * @chroma_offset_l0: the difference of the additive offset applied to ++ * the chroma prediction values for list 0 ++ * @delta_luma_weight_l1: the difference of the weighting factor applied ++ * to the luma prediction value for list 1 ++ * @luma_offset_l1: the additive offset applied to the luma prediction value ++ * for list 1 ++ * @delta_chroma_weight_l1: the difference of the weighting factor applied ++ * to the chroma prediction values for list 1 ++ * @chroma_offset_l1: the difference of the additive offset applied to ++ * the chroma prediction values for list 1 ++ * @luma_log2_weight_denom: the base 2 logarithm of the denominator for ++ * all luma weighting factors ++ * @delta_chroma_log2_weight_denom: the difference of the base 2 logarithm ++ * of the denominator for all chroma ++ * weighting factors ++ */ ++struct v4l2_hevc_pred_weight_table { ++ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __u8 luma_log2_weight_denom; ++ __s8 delta_chroma_log2_weight_denom; ++}; ++ ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) ++ ++/** ++ * struct v4l2_ctrl_hevc_slice_params - HEVC slice parameters ++ * ++ * This control is a dynamically sized 1-dimensional array, ++ * V4L2_CTRL_FLAG_DYNAMIC_ARRAY flag must be set when using it. ++ * ++ * @bit_size: size (in bits) of the current slice data ++ * @data_byte_offset: offset (in bytes) to the video data in the current slice data ++ * @num_entry_point_offsets: specifies the number of entry point offset syntax ++ * elements in the slice header. 
++ * @nal_unit_type: specifies the coding type of the slice (B, P or I) ++ * @nuh_temporal_id_plus1: minus 1 specifies a temporal identifier for the NAL unit ++ * @slice_type: see V4L2_HEVC_SLICE_TYPE_{} ++ * @colour_plane_id: specifies the colour plane associated with the current slice ++ * @slice_pic_order_cnt: specifies the picture order count ++ * @num_ref_idx_l0_active_minus1: this value plus 1 specifies the maximum ++ * reference index for reference picture list 0 ++ * that may be used to decode the slice ++ * @num_ref_idx_l1_active_minus1: this value plus 1 specifies the maximum ++ * reference index for reference picture list 1 ++ * that may be used to decode the slice ++ * @collocated_ref_idx: specifies the reference index of the collocated picture used ++ * for temporal motion vector prediction ++ * @five_minus_max_num_merge_cand: specifies the maximum number of merging ++ * motion vector prediction candidates supported in ++ * the slice subtracted from 5 ++ * @slice_qp_delta: specifies the initial value of QpY to be used for the coding ++ * blocks in the slice ++ * @slice_cb_qp_offset: specifies a difference to be added to the value of pps_cb_qp_offset ++ * @slice_cr_qp_offset: specifies a difference to be added to the value of pps_cr_qp_offset ++ * @slice_act_y_qp_offset: screen content extension parameters ++ * @slice_act_cb_qp_offset: screen content extension parameters ++ * @slice_act_cr_qp_offset: screen content extension parameters ++ * @slice_beta_offset_div2: specify the deblocking parameter offsets for beta divided by 2 ++ * @slice_tc_offset_div2: specify the deblocking parameter offsets for tC divided by 2 ++ * @pic_struct: indicates whether a picture should be displayed as a frame or as one or ++ * more fields ++ * @reserved0: padding field. Should be zeroed by applications. ++ * @slice_segment_addr: specifies the address of the first coding tree block in ++ * the slice segment ++ * @ref_idx_l0: the list of L0 reference elements as indices in the DPB ++ * @ref_idx_l1: the list of L1 reference elements as indices in the DPB ++ * @short_term_ref_pic_set_size: specifies the size of short-term reference ++ * pictures set included in the SPS ++ * @long_term_ref_pic_set_size: specifies the size of long-term reference ++ * pictures set include in the SPS ++ * @pred_weight_table: the prediction weight coefficients for inter-picture ++ * prediction ++ * @reserved1: padding field. Should be zeroed by applications. ++ * @flags: see V4L2_HEVC_SLICE_PARAMS_FLAG_{} ++ */ ++struct v4l2_ctrl_hevc_slice_params { ++ __u32 bit_size; ++ __u32 data_byte_offset; ++ __u32 num_entry_point_offsets; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ ++ __u8 nal_unit_type; ++ __u8 nuh_temporal_id_plus1; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u8 slice_type; ++ __u8 colour_plane_id; ++ __s32 slice_pic_order_cnt; ++ __u8 num_ref_idx_l0_active_minus1; ++ __u8 num_ref_idx_l1_active_minus1; ++ __u8 collocated_ref_idx; ++ __u8 five_minus_max_num_merge_cand; ++ __s8 slice_qp_delta; ++ __s8 slice_cb_qp_offset; ++ __s8 slice_cr_qp_offset; ++ __s8 slice_act_y_qp_offset; ++ __s8 slice_act_cb_qp_offset; ++ __s8 slice_act_cr_qp_offset; ++ __s8 slice_beta_offset_div2; ++ __s8 slice_tc_offset_div2; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ ++ __u8 pic_struct; ++ ++ __u8 reserved0[3]; ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: General slice segment header */ ++ __u32 slice_segment_addr; ++ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u16 short_term_ref_pic_set_size; ++ __u16 long_term_ref_pic_set_size; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */ ++ struct v4l2_hevc_pred_weight_table pred_weight_table; ++ ++ __u8 reserved1[2]; ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1 ++#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2 ++#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4 ++ ++/** ++ * struct v4l2_ctrl_hevc_decode_params - HEVC decode parameters ++ * ++ * @pic_order_cnt_val: picture order count ++ * @short_term_ref_pic_set_size: specifies the size of short-term reference ++ * pictures set included in the SPS of the first slice ++ * @long_term_ref_pic_set_size: specifies the size of long-term reference ++ * pictures set include in the SPS of the first slice ++ * @num_active_dpb_entries: the number of entries in dpb ++ * @num_poc_st_curr_before: the number of reference pictures in the short-term ++ * set that come before the current frame ++ * @num_poc_st_curr_after: the number of reference pictures in the short-term ++ * set that come after the current frame ++ * @num_poc_lt_curr: the number of reference pictures in the long-term set ++ * @poc_st_curr_before: provides the index of the short term before references ++ * in DPB array ++ * @poc_st_curr_after: provides the index of the short term after references ++ * in DPB array ++ * @poc_lt_curr: provides the index of the long term references in DPB array ++ * @reserved: padding field. Should be zeroed by applications. ++ * @dpb: the decoded picture buffer, for meta-data about reference frames ++ * @flags: see V4L2_HEVC_DECODE_PARAM_FLAG_{} ++ */ ++struct v4l2_ctrl_hevc_decode_params { ++ __s32 pic_order_cnt_val; ++ __u16 short_term_ref_pic_set_size; ++ __u16 long_term_ref_pic_set_size; ++ __u8 num_active_dpb_entries; ++ __u8 num_poc_st_curr_before; ++ __u8 num_poc_st_curr_after; ++ __u8 num_poc_lt_curr; ++ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 reserved[4]; ++ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u64 flags; ++}; ++ ++/** ++ * struct v4l2_ctrl_hevc_scaling_matrix - HEVC scaling lists parameters ++ * ++ * @scaling_list_4x4: scaling list is used for the scaling process for ++ * transform coefficients. The values on each scaling ++ * list are expected in raster scan order ++ * @scaling_list_8x8: scaling list is used for the scaling process for ++ * transform coefficients. The values on each scaling ++ * list are expected in raster scan order ++ * @scaling_list_16x16: scaling list is used for the scaling process for ++ * transform coefficients. The values on each scaling ++ * list are expected in raster scan order ++ * @scaling_list_32x32: scaling list is used for the scaling process for ++ * transform coefficients. The values on each scaling ++ * list are expected in raster scan order ++ * @scaling_list_dc_coef_16x16: scaling list is used for the scaling process ++ * for transform coefficients. The values on each ++ * scaling list are expected in raster scan order. ++ * @scaling_list_dc_coef_32x32: scaling list is used for the scaling process ++ * for transform coefficients. The values on each ++ * scaling list are expected in raster scan order. 
++ */ ++struct v4l2_ctrl_hevc_scaling_matrix { ++ __u8 scaling_list_4x4[6][16]; ++ __u8 scaling_list_8x8[6][64]; ++ __u8 scaling_list_16x16[6][64]; ++ __u8 scaling_list_32x32[2][64]; ++ __u8 scaling_list_dc_coef_16x16[6]; ++ __u8 scaling_list_dc_coef_32x32[2]; ++}; ++ ++#endif --- a/libavcodec/hevc_parser.c +++ b/libavcodec/hevc_parser.c @@ -98,6 +98,19 @@ static int hevc_parse_slice_header(AVCod avctx->profile = ps->sps->ptl.general_ptl.profile_idc; avctx->level = ps->sps->ptl.general_ptl.level_idc; - + + if (ps->sps->chroma_format_idc == 1) { + avctx->chroma_sample_location = ps->sps->vui.chroma_loc_info_present_flag ? + ps->sps->vui.chroma_sample_loc_type_top_field + 1 : @@ -15783,12 +19317,69 @@ Upstream-status: Pending if (ps->vps->vps_timing_info_present_flag) { num = ps->vps->vps_num_units_in_tick; den = ps->vps->vps_time_scale; +--- a/libavcodec/hevc_refs.c ++++ b/libavcodec/hevc_refs.c +@@ -96,18 +96,22 @@ static HEVCFrame *alloc_frame(HEVCContex + if (!frame->rpl_buf) + goto fail; + +- frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool); +- if (!frame->tab_mvf_buf) +- goto fail; +- frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data; ++ if (s->tab_mvf_pool) { ++ frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool); ++ if (!frame->tab_mvf_buf) ++ goto fail; ++ frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data; ++ } + +- frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool); +- if (!frame->rpl_tab_buf) +- goto fail; +- frame->rpl_tab = (RefPicListTab **)frame->rpl_tab_buf->data; +- frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height; +- for (j = 0; j < frame->ctb_count; j++) +- frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data; ++ if (s->rpl_tab_pool) { ++ frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool); ++ if (!frame->rpl_tab_buf) ++ goto fail; ++ frame->rpl_tab = (RefPicListTab **)frame->rpl_tab_buf->data; ++ frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height; ++ for (j = 0; j < frame->ctb_count; j++) ++ frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data; ++ } + + frame->frame->top_field_first = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD; + frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD); +@@ -276,14 +280,17 @@ static int init_slice_rpl(HEVCContext *s + int ctb_count = frame->ctb_count; + int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr]; + int i; ++ RefPicListTab * const tab = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx; + + if (s->slice_idx >= frame->rpl_buf->size / sizeof(RefPicListTab)) + return AVERROR_INVALIDDATA; + +- for (i = ctb_addr_ts; i < ctb_count; i++) +- frame->rpl_tab[i] = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx; ++ if (frame->rpl_tab) { ++ for (i = ctb_addr_ts; i < ctb_count; i++) ++ frame->rpl_tab[i] = tab; ++ } + +- frame->refPicList = (RefPicList *)frame->rpl_tab[ctb_addr_ts]; ++ frame->refPicList = tab->refPicList; + + return 0; + } --- a/libavcodec/hevcdec.c +++ b/libavcodec/hevcdec.c @@ -332,6 +332,19 @@ static void export_stream_params(HEVCCon - + ff_set_sar(avctx, sps->vui.sar); - + + if (sps->chroma_format_idc == 1) { + avctx->chroma_sample_location = sps->vui.chroma_loc_info_present_flag ? 
+ sps->vui.chroma_sample_loc_type_top_field + 1 : @@ -15816,7 +19407,7 @@ Upstream-status: Pending + CONFIG_HEVC_RPI4_10_HWACCEL + \ CONFIG_HEVC_VDPAU_HWACCEL) enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts; - + switch (sps->pix_fmt) { case AV_PIX_FMT_YUV420P: case AV_PIX_FMT_YUVJ420P: @@ -15851,7 +19442,43 @@ Upstream-status: Pending break; case AV_PIX_FMT_YUV444P: #if CONFIG_HEVC_VDPAU_HWACCEL -@@ -3230,7 +3258,14 @@ static int hevc_decode_frame(AVCodecCont +@@ -459,6 +487,16 @@ static int set_sps(HEVCContext *s, const + if (!sps) + return 0; + ++ // If hwaccel then we don't need all the s/w decode helper arrays ++ if (s->avctx->hwaccel) { ++ export_stream_params(s, sps); ++ ++ s->avctx->pix_fmt = pix_fmt; ++ s->ps.sps = sps; ++ s->ps.vps = (HEVCVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data; ++ return 0; ++ } ++ + ret = pic_arrays_init(s, sps); + if (ret < 0) + goto fail; +@@ -2809,11 +2847,13 @@ static int hevc_frame_start(HEVCContext + ((s->ps.sps->height >> s->ps.sps->log2_min_cb_size) + 1); + int ret; + +- memset(s->horizontal_bs, 0, s->bs_width * s->bs_height); +- memset(s->vertical_bs, 0, s->bs_width * s->bs_height); +- memset(s->cbf_luma, 0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height); +- memset(s->is_pcm, 0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1)); +- memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address)); ++ if (s->horizontal_bs) { ++ memset(s->horizontal_bs, 0, s->bs_width * s->bs_height); ++ memset(s->vertical_bs, 0, s->bs_width * s->bs_height); ++ memset(s->cbf_luma, 0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height); ++ memset(s->is_pcm, 0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1)); ++ memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address)); ++ } + + s->is_decoded = 0; + s->first_nal_type = s->nal_unit_type; +@@ -3230,7 +3270,14 @@ static int hevc_decode_frame(AVCodecCont s->ref = NULL; ret = decode_nal_units(s, avpkt->data, avpkt->size); if (ret < 0) @@ -15863,10 +19490,38 @@ Upstream-status: Pending + return ret; + } - + if (avctx->hwaccel) { if (s->ref && (ret = avctx->hwaccel->end_frame(avctx)) < 0) { -@@ -3585,6 +3620,15 @@ AVCodec ff_hevc_decoder = { +@@ -3273,15 +3320,19 @@ static int hevc_ref_frame(HEVCContext *s + if (ret < 0) + return ret; + +- dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf); +- if (!dst->tab_mvf_buf) +- goto fail; +- dst->tab_mvf = src->tab_mvf; ++ if (src->tab_mvf_buf) { ++ dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf); ++ if (!dst->tab_mvf_buf) ++ goto fail; ++ dst->tab_mvf = src->tab_mvf; ++ } + +- dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf); +- if (!dst->rpl_tab_buf) +- goto fail; +- dst->rpl_tab = src->rpl_tab; ++ if (src->rpl_tab_buf) { ++ dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf); ++ if (!dst->rpl_tab_buf) ++ goto fail; ++ dst->rpl_tab = src->rpl_tab; ++ } + + dst->rpl_buf = av_buffer_ref(src->rpl_buf); + if (!dst->rpl_buf) +@@ -3585,6 +3636,15 @@ AVCodec ff_hevc_decoder = { #if CONFIG_HEVC_NVDEC_HWACCEL HWACCEL_NVDEC(hevc), #endif @@ -15897,12 +19552,12 @@ Upstream-status: Pending --- a/libavcodec/hwconfig.h +++ b/libavcodec/hwconfig.h @@ -24,6 +24,7 @@ - - + + #define HWACCEL_CAP_ASYNC_SAFE (1 << 0) +#define HWACCEL_CAP_MT_SAFE (1 << 1) - - + + typedef struct AVCodecHWConfigInternal { @@ -70,6 +71,12 @@ typedef struct AVCodecHWConfigInternal { HW_CONFIG_HWACCEL(1, 1, 0, D3D11, D3D11VA, ff_ ## codec ## _d3d11va2_hwaccel) @@ -15922,7 +19577,7 @@ Upstream-status: Pending @@ -24,6 +24,9 @@ * 
MMAL Video Decoder */ - + +#pragma GCC diagnostic push +// Many many redundant decls in the header files +#pragma GCC diagnostic ignored "-Wredundant-decls" @@ -15935,12 +19590,12 @@ Upstream-status: Pending #include +#pragma GCC diagnostic pop #include - + #include "avcodec.h" --- a/libavcodec/pthread_frame.c +++ b/libavcodec/pthread_frame.c @@ -191,7 +191,8 @@ static attribute_align_arg void *frame_w - + /* if the previous thread uses hwaccel then we take the lock to ensure * the threads don't run concurrently */ - if (avctx->hwaccel) { @@ -15950,9 +19605,9 @@ Upstream-status: Pending p->hwaccel_serializing = 1; } @@ -614,7 +615,9 @@ void ff_thread_finish_setup(AVCodecConte - + if (!(avctx->active_thread_type&FF_THREAD_FRAME)) return; - + - if (avctx->hwaccel && !p->hwaccel_serializing) { + if (avctx->hwaccel && + !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE) && @@ -15965,7 +19620,7 @@ Upstream-status: Pending @@ -293,6 +293,12 @@ const PixelFormatTag ff_raw_pix_fmt_tags { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */ { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */ - + + /* RPI (Might as well define for everything) */ + { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') }, + { AV_PIX_FMT_RPI4_8, MKTAG('S', 'A', 'N', 'D') }, @@ -15974,13 +19629,13 @@ Upstream-status: Pending + { AV_PIX_FMT_NONE, 0 }, }; - + --- a/libavcodec/rawenc.c +++ b/libavcodec/rawenc.c @@ -24,6 +24,7 @@ * Raw Video Encoder */ - + +#include "config.h" #include "avcodec.h" #include "raw.h" @@ -15993,13 +19648,13 @@ Upstream-status: Pending +#if CONFIG_SAND +#include "libavutil/rpi_sand_fns.h" +#endif - + static av_cold int raw_encode_init(AVCodecContext *avctx) { @@ -49,22 +54,114 @@ FF_ENABLE_DEPRECATION_WARNINGS return 0; } - + +#if CONFIG_SAND +static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, + const AVFrame *frame) @@ -16080,7 +19735,7 @@ Upstream-status: Pending - frame->width, frame->height, 1); + int ret; + AVFrame * frame = NULL; - + - if (ret < 0) +#if CONFIG_SAND + if (av_rpi_is_sand_frame(src_frame)) { @@ -16104,7 +19759,7 @@ Upstream-status: Pending + frame->width, frame->height, 1); + if (ret < 0) + goto fail; - + if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0) - return ret; + goto fail; @@ -16114,7 +19769,7 @@ Upstream-status: Pending frame->width, frame->height, 1)) < 0) - return ret; + goto fail; - + if(avctx->codec_tag == AV_RL32("yuv2") && ret > 0 && frame->format == AV_PIX_FMT_YUYV422) { @@ -81,8 +178,14 @@ static int raw_encode(AVCodecContext *av @@ -16130,7 +19785,7 @@ Upstream-status: Pending + *got_packet = 0; + return ret; } - + AVCodec ff_rawvideo_encoder = { --- /dev/null +++ b/libavcodec/rpi_hevc_cabac.c @@ -17637,7 +21292,7 @@ Upstream-status: Pending + const xy_off_t * const scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2]; + + int use_vpu; -+#if RPI_COMPRESS_COEFFS ++#if RPI_COMPRESS_COEFFS + int num_nonzero = 0; + int use_compress = 0; + int *coeffs32; @@ -17979,7 +21634,7 @@ Upstream-status: Pending + } + use_compress = 0; + } -+#endif ++#endif + + if (nb_significant_coeff_flag != 0) { + const unsigned int gt1_idx_delta = (c_idx_nz << 2) | @@ -18055,7 +21710,7 @@ Upstream-status: Pending + scale, + i == 0 && xy_off->coeff == 0 ? 
dc_scale : scale_m, + shift); -+#if RPI_COMPRESS_COEFFS ++#if RPI_COMPRESS_COEFFS + if (use_compress) + coeffs32[num_nonzero++] = (res<<16) + (&blk_coeffs[xy_off->coeff] - coeffs); + else @@ -18247,11 +21902,11 @@ Upstream-status: Pending +#endif + + if (!use_dc) { -+#if RPI_COMPRESS_COEFFS ++#if RPI_COMPRESS_COEFFS + if (use_compress) { + coeffs32[num_nonzero] = 0; + } -+#endif ++#endif + rpi_add_residual(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs); + } +} @@ -29840,7 +33495,7 @@ Upstream-status: Pending + unsigned int i; + for (i = 0; i != 4; ++i) { + cf->s[i].n = 0; -+#if RPI_COMPRESS_COEFFS ++#if RPI_COMPRESS_COEFFS + cf->s[i].packed = 1; + cf->s[i].packed_n = 0; +#endif @@ -46104,76 +49759,104 @@ Upstream-status: Pending @@ -21,6 +21,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ - + +#include #include #include #include -@@ -30,12 +31,14 @@ +@@ -29,57 +30,82 @@ + #include #include "libavcodec/avcodec.h" #include "libavcodec/internal.h" ++#include "libavutil/avassert.h" #include "libavutil/pixdesc.h" +#include "libavutil/hwcontext.h" #include "v4l2_context.h" #include "v4l2_buffers.h" #include "v4l2_m2m.h" +#include "weak_link.h" - + #define USEC_PER_SEC 1000000 -static AVRational v4l2_timebase = { 1, USEC_PER_SEC }; +static const AVRational v4l2_timebase = { 1, USEC_PER_SEC }; - - static inline V4L2m2mContext *buf_to_m2mctx(V4L2Buffer *buf) + +-static inline V4L2m2mContext *buf_to_m2mctx(V4L2Buffer *buf) ++static inline V4L2m2mContext *buf_to_m2mctx(const V4L2Buffer * const buf) { -@@ -52,34 +55,44 @@ static inline AVCodecContext *logger(V4L - static inline AVRational v4l2_get_timebase(V4L2Buffer *avbuf) + return V4L2_TYPE_IS_OUTPUT(buf->context->type) ? + container_of(buf->context, V4L2m2mContext, output) : + container_of(buf->context, V4L2m2mContext, capture); + } + +-static inline AVCodecContext *logger(V4L2Buffer *buf) ++static inline AVCodecContext *logger(const V4L2Buffer * const buf) { - V4L2m2mContext *s = buf_to_m2mctx(avbuf); + return buf_to_m2mctx(buf)->avctx; + } + +-static inline AVRational v4l2_get_timebase(V4L2Buffer *avbuf) ++static inline AVRational v4l2_get_timebase(const V4L2Buffer * const avbuf) + { +- V4L2m2mContext *s = buf_to_m2mctx(avbuf); - - if (s->avctx->pkt_timebase.num) - return s->avctx->pkt_timebase; - return s->avctx->time_base; ++ const V4L2m2mContext *s = buf_to_m2mctx(avbuf); + const AVRational tb = s->avctx->pkt_timebase.num ? + s->avctx->pkt_timebase : + s->avctx->time_base; + return tb.num && tb.den ? tb : v4l2_timebase; } - + -static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts) -+static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts, int no_rescale) ++static inline struct timeval tv_from_int(const int64_t t) { - int64_t v4l2_pts; -- ++ return (struct timeval){ ++ .tv_usec = t % USEC_PER_SEC, ++ .tv_sec = t / USEC_PER_SEC ++ }; ++} + - if (pts == AV_NOPTS_VALUE) - pts = 0; -- ++static inline int64_t int_from_tv(const struct timeval t) ++{ ++ return (int64_t)t.tv_sec * USEC_PER_SEC + t.tv_usec; ++} + ++static inline void v4l2_set_pts(V4L2Buffer * const out, const int64_t pts) ++{ /* convert pts to v4l2 timebase */ - v4l2_pts = av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase); +- out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC; +- out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC; + const int64_t v4l2_pts = -+ no_rescale ? pts : + pts == AV_NOPTS_VALUE ? 
0 : + av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase); - out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC; - out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC; ++ out->buf.timestamp = tv_from_int(v4l2_pts); } - + -static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf) -+static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf, int no_rescale) ++static inline int64_t v4l2_get_pts(const V4L2Buffer * const avbuf) { - int64_t v4l2_pts; - ++ const int64_t v4l2_pts = int_from_tv(avbuf->buf.timestamp); ++ return v4l2_pts != 0 ? v4l2_pts : AV_NOPTS_VALUE; ++#if 0 /* convert pts back to encoder timebase */ - v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC + -+ const int64_t v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC + - avbuf->buf.timestamp.tv_usec; - -- return av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf)); +- avbuf->buf.timestamp.tv_usec; + return -+ no_rescale ? v4l2_pts : ++ avbuf->context->no_pts_rescale ? v4l2_pts : + v4l2_pts == 0 ? AV_NOPTS_VALUE : + av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf)); ++#endif +} -+ + +- return av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf)); +static void set_buf_length(V4L2Buffer *out, unsigned int plane, uint32_t bytesused, uint32_t length) +{ + if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) { @@ -46184,12 +49867,15 @@ Upstream-status: Pending + out->buf.length = length; + } } - + static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf) -@@ -116,6 +129,105 @@ static enum AVColorPrimaries v4l2_get_co +@@ -116,49 +142,176 @@ static enum AVColorPrimaries v4l2_get_co return AVCOL_PRI_UNSPECIFIED; } - + +-static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf) +-{ +- enum v4l2_quantization qt; +static void v4l2_set_color(V4L2Buffer *buf, + const enum AVColorPrimaries avcp, + const enum AVColorSpace avcs, @@ -46230,7 +49916,10 @@ Upstream-status: Pending + default: + break; + } -+ + +- qt = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ? +- buf->context->format.fmt.pix_mp.quantization : +- buf->context->format.fmt.pix.quantization; + switch (avcs) { + case AVCOL_SPC_RGB: + cs = V4L2_COLORSPACE_SRGB; @@ -46260,7 +49949,10 @@ Upstream-status: Pending + default: + break; + } -+ + +- switch (qt) { +- case V4L2_QUANTIZATION_LIM_RANGE: return AVCOL_RANGE_MPEG; +- case V4L2_QUANTIZATION_FULL_RANGE: return AVCOL_RANGE_JPEG; + switch (xfer) { + case AVCOL_TRC_BT709: + xfer = V4L2_XFER_FUNC_709; @@ -46274,10 +49966,11 @@ Upstream-status: Pending + case AVCOL_TRC_SMPTE2084: + xfer = V4L2_XFER_FUNC_SMPTE2084; + break; -+ default: -+ break; -+ } -+ + default: + break; + } + +- return AVCOL_RANGE_UNSPECIFIED; + if (V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type)) { + buf->context->format.fmt.pix_mp.colorspace = cs; + buf->context->format.fmt.pix_mp.ycbcr_enc = ycbcr; @@ -46287,15 +49980,58 @@ Upstream-status: Pending + buf->context->format.fmt.pix.ycbcr_enc = ycbcr; + buf->context->format.fmt.pix.xfer_func = xfer; + } + } + +-static enum AVColorSpace v4l2_get_color_space(V4L2Buffer *buf) ++static inline enum v4l2_quantization ++buf_quantization(const V4L2Buffer * const buf) + { +- enum v4l2_ycbcr_encoding ycbcr; +- enum v4l2_colorspace cs; ++ return V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ? ++ buf->context->format.fmt.pix_mp.quantization : ++ buf->context->format.fmt.pix.quantization; ++} + +- cs = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ? ++static inline enum v4l2_colorspace ++buf_colorspace(const V4L2Buffer * const buf) ++{ ++ return V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ? 
+ buf->context->format.fmt.pix_mp.colorspace : + buf->context->format.fmt.pix.colorspace; ++} + +- ycbcr = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ? ++static inline enum v4l2_ycbcr_encoding ++buf_ycbcr_enc(const V4L2Buffer * const buf) ++{ ++ return V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ? + buf->context->format.fmt.pix_mp.ycbcr_enc: + buf->context->format.fmt.pix.ycbcr_enc; ++} + +- switch(cs) { +- case V4L2_COLORSPACE_SRGB: return AVCOL_SPC_RGB; ++static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf) ++{ ++ switch (buf_quantization(buf)) { ++ case V4L2_QUANTIZATION_LIM_RANGE: ++ return AVCOL_RANGE_MPEG; ++ case V4L2_QUANTIZATION_FULL_RANGE: ++ return AVCOL_RANGE_JPEG; ++ case V4L2_QUANTIZATION_DEFAULT: ++ // If YUV (which we assume for all video decode) then, from the header ++ // comments, range is limited unless CS is JPEG ++ return buf_colorspace(buf) == V4L2_COLORSPACE_JPEG ? ++ AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG; ++ default: ++ break; ++ } ++ ++ return AVCOL_RANGE_UNSPECIFIED; +} + - static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf) - { - enum v4l2_quantization qt; -@@ -134,6 +246,20 @@ static enum AVColorRange v4l2_get_color_ - return AVCOL_RANGE_UNSPECIFIED; - } - +static void v4l2_set_color_range(V4L2Buffer *buf, const enum AVColorRange avcr) +{ + const enum v4l2_quantization q = @@ -46310,13 +50046,51 @@ Upstream-status: Pending + } +} + - static enum AVColorSpace v4l2_get_color_space(V4L2Buffer *buf) ++static enum AVColorSpace v4l2_get_color_space(V4L2Buffer *buf) ++{ ++ switch (buf_colorspace(buf)) { ++ case V4L2_COLORSPACE_JPEG: // JPEG -> SRGB ++ case V4L2_COLORSPACE_SRGB: ++ return AVCOL_SPC_RGB; + case V4L2_COLORSPACE_REC709: return AVCOL_SPC_BT709; + case V4L2_COLORSPACE_470_SYSTEM_M: return AVCOL_SPC_FCC; + case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_SPC_BT470BG; + case V4L2_COLORSPACE_SMPTE170M: return AVCOL_SPC_SMPTE170M; + case V4L2_COLORSPACE_SMPTE240M: return AVCOL_SPC_SMPTE240M; + case V4L2_COLORSPACE_BT2020: +- if (ycbcr == V4L2_YCBCR_ENC_BT2020_CONST_LUM) +- return AVCOL_SPC_BT2020_CL; +- else +- return AVCOL_SPC_BT2020_NCL; ++ return buf_ycbcr_enc(buf) == V4L2_YCBCR_ENC_BT2020_CONST_LUM ? ++ AVCOL_SPC_BT2020_CL : AVCOL_SPC_BT2020_NCL; + default: + break; + } +@@ -168,17 +321,9 @@ static enum AVColorSpace v4l2_get_color_ + + static enum AVColorTransferCharacteristic v4l2_get_color_trc(V4L2Buffer *buf) { - enum v4l2_ycbcr_encoding ycbcr; -@@ -210,73 +336,165 @@ static enum AVColorTransferCharacteristi +- enum v4l2_ycbcr_encoding ycbcr; ++ const enum v4l2_ycbcr_encoding ycbcr = buf_ycbcr_enc(buf); + enum v4l2_xfer_func xfer; +- enum v4l2_colorspace cs; +- +- cs = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ? +- buf->context->format.fmt.pix_mp.colorspace : +- buf->context->format.fmt.pix.colorspace; +- +- ycbcr = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ? +- buf->context->format.fmt.pix_mp.ycbcr_enc: +- buf->context->format.fmt.pix.ycbcr_enc; ++ const enum v4l2_colorspace cs = buf_colorspace(buf); + + xfer = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ? 
+ buf->context->format.fmt.pix_mp.xfer_func: +@@ -210,73 +355,165 @@ static enum AVColorTransferCharacteristi return AVCOL_TRC_UNSPECIFIED; } - + -static void v4l2_free_buffer(void *opaque, uint8_t *unused) +static int v4l2_buf_is_interlaced(const V4L2Buffer * const buf) { @@ -46327,7 +50101,7 @@ Upstream-status: Pending - atomic_fetch_sub_explicit(&s->refcount, 1, memory_order_acq_rel); + return V4L2_FIELD_IS_INTERLACED(buf->buf.field); +} - + - if (s->reinit) { - if (!atomic_load(&s->refcount)) - sem_post(&s->refsync); @@ -46343,7 +50117,7 @@ Upstream-status: Pending +{ + return buf->buf.field == V4L2_FIELD_INTERLACED_TB; +} - + - av_buffer_unref(&avbuf->context_ref); - } +static void v4l2_set_interlace(V4L2Buffer * const buf, const int is_interlaced, const int is_tff) @@ -46351,14 +50125,14 @@ Upstream-status: Pending + buf->buf.field = !is_interlaced ? V4L2_FIELD_NONE : + is_tff ? V4L2_FIELD_INTERLACED_TB : V4L2_FIELD_INTERLACED_BT; } - + -static int v4l2_buf_increase_ref(V4L2Buffer *in) +static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf) { - V4L2m2mContext *s = buf_to_m2mctx(in); + AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; + AVDRMLayerDescriptor *layer; - + - if (in->context_ref) - atomic_fetch_add(&in->context_refcount, 1); - else { @@ -46368,7 +50142,7 @@ Upstream-status: Pending + /* fill the DRM frame descriptor */ + drm_desc->nb_objects = avbuf->num_planes; + drm_desc->nb_layers = 1; - + - in->context_refcount = 1; + layer = &drm_desc->layers[0]; + layer->nb_planes = avbuf->num_planes; @@ -46378,7 +50152,7 @@ Upstream-status: Pending + layer->planes[i].offset = 0; + layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; } - + - in->status = V4L2BUF_RET_USER; - atomic_fetch_add_explicit(&s->refcount, 1, memory_order_relaxed); + switch (avbuf->context->av_pix_fmt) { @@ -46386,7 +50160,7 @@ Upstream-status: Pending + + layer->format = DRM_FORMAT_YUYV; + layer->nb_planes = 1; - + - return 0; + break; + @@ -46435,7 +50209,7 @@ Upstream-status: Pending + + return (uint8_t *) drm_desc; } - + -static int v4l2_buf_to_bufref(V4L2Buffer *in, int plane, AVBufferRef **buf) +static void v4l2_free_bufref(void *opaque, uint8_t *data) { @@ -46443,25 +50217,25 @@ Upstream-status: Pending + AVBufferRef * bufref = (AVBufferRef *)data; + V4L2Buffer *avbuf = (V4L2Buffer *)bufref->data; + struct V4L2Context *ctx = ff_weak_link_lock(&avbuf->context_wl); - + - if (plane >= in->num_planes) - return AVERROR(EINVAL); + if (ctx != NULL) { + // Buffer still attached to context + V4L2m2mContext *s = buf_to_m2mctx(avbuf); - + - /* even though most encoders return 0 in data_offset encoding vp8 does require this value */ - *buf = av_buffer_create((char *)in->plane_info[plane].mm_addr + in->planes[plane].data_offset, - in->plane_info[plane].length, v4l2_free_buffer, in, 0); - if (!*buf) - return AVERROR(ENOMEM); + ff_mutex_lock(&ctx->lock); - + - ret = v4l2_buf_increase_ref(in); - if (ret) - av_buffer_unref(buf); -+ avbuf->status = V4L2BUF_AVAILABLE; - ++ ff_v4l2_buffer_set_avail(avbuf); + - return ret; + if (s->draining && V4L2_TYPE_IS_OUTPUT(ctx->type)) { + av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer avail\n", ctx->name); @@ -46482,8 +50256,9 @@ Upstream-status: Pending + + ff_weak_link_unlock(avbuf->context_wl); + av_buffer_unref(&bufref); -+} -+ + } + +-static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset, AVBufferRef* bref) +static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) +{ + struct v4l2_exportbuffer expbuf; @@ -46514,20 +50289,19 @@ 
Upstream-status: Pending + } + + return 0; - } - --static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset, AVBufferRef* bref) ++} ++ +static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset) { unsigned int bytesused, length; + int rv = 0; - + if (plane >= out->num_planes) return AVERROR(EINVAL); -@@ -284,32 +502,57 @@ static int v4l2_bufref_to_buf(V4L2Buffer +@@ -284,32 +521,57 @@ static int v4l2_bufref_to_buf(V4L2Buffer length = out->plane_info[plane].length; bytesused = FFMIN(size+offset, length); - + - memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, FFMIN(size, length-offset)); - - if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) { @@ -46540,7 +50314,7 @@ Upstream-status: Pending + size = length - offset; + rv = AVERROR(ENOMEM); } - + - return 0; + memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, size); + @@ -46564,14 +50338,14 @@ Upstream-status: Pending + avbuf->status = V4L2BUF_RET_USER; + return newbuf; } - + static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) { - int i, ret; + int i; - + frame->format = avbuf->context->av_pix_fmt; - + - for (i = 0; i < avbuf->num_planes; i++) { - ret = v4l2_buf_to_bufref(avbuf, i, &frame->buf[i]); - if (ret) @@ -46579,7 +50353,7 @@ Upstream-status: Pending + frame->buf[0] = wrap_avbuf(avbuf); + if (frame->buf[0] == NULL) + return AVERROR(ENOMEM); - ++ + if (buf_to_m2mctx(avbuf)->output_drm) { + /* 1. get references to the actual data */ + frame->data[0] = (uint8_t *) v4l2_get_drm_frame(avbuf); @@ -46587,7 +50361,7 @@ Upstream-status: Pending + frame->hw_frames_ctx = av_buffer_ref(avbuf->context->frames_ref); + return 0; + } -+ + + + /* 1. get references to the actual data */ + for (i = 0; i < avbuf->num_planes; i++) { @@ -46595,9 +50369,9 @@ Upstream-status: Pending frame->linesize[i] = avbuf->plane_info[i].bytesperline; - frame->data[i] = frame->buf[i]->data; } - + /* fixup special cases */ -@@ -318,17 +561,17 @@ static int v4l2_buffer_buf_to_swframe(AV +@@ -318,17 +580,17 @@ static int v4l2_buffer_buf_to_swframe(AV case AV_PIX_FMT_NV21: if (avbuf->num_planes > 1) break; @@ -46606,7 +50380,7 @@ Upstream-status: Pending + frame->linesize[1] = frame->linesize[0]; + frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format); break; - + case AV_PIX_FMT_YUV420P: if (avbuf->num_planes > 1) break; @@ -46619,12 +50393,12 @@ Upstream-status: Pending + frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format); + frame->data[2] = frame->data[1] + frame->linesize[1] * ff_v4l2_get_format_height(&avbuf->context->format) / 2; break; - + default: -@@ -338,68 +581,95 @@ static int v4l2_buffer_buf_to_swframe(AV +@@ -338,68 +600,127 @@ static int v4l2_buffer_buf_to_swframe(AV return 0; } - + +static void cpy_2d(uint8_t * dst, int dst_stride, const uint8_t * src, int src_stride, int w, int h) +{ + if (dst_stride == src_stride && w + 32 >= dst_stride) { @@ -46643,6 +50417,38 @@ Upstream-status: Pending +{ + return i != 0 && !(i == num_planes - 1 && (desc->flags & AV_PIX_FMT_FLAG_ALPHA)); +} ++ ++static int v4l2_buffer_primeframe_to_buf(const AVFrame *frame, V4L2Buffer *out) ++{ ++ const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0]; ++ ++ if (frame->format != AV_PIX_FMT_DRM_PRIME || !src) ++ return AVERROR(EINVAL); ++ ++ av_assert0(out->buf.memory == V4L2_MEMORY_DMABUF); ++ ++ if 
(V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) { ++ // Only currently cope with single buffer types ++ if (out->buf.length != 1) ++ return AVERROR_PATCHWELCOME; ++ if (src->nb_objects != 1) ++ return AVERROR(EINVAL); ++ ++ out->planes[0].m.fd = src->objects[0].fd; ++ } ++ else { ++ if (src->nb_objects != 1) ++ return AVERROR(EINVAL); ++ ++ out->buf.m.fd = src->objects[0].fd; ++ } ++ ++ // No need to copy src AVDescriptor and if we did then we may confuse ++ // fd close on free ++ out->ref_buf = av_buffer_ref(frame->buf[0]); ++ ++ return 0; ++} + static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) { @@ -46756,7 +50562,7 @@ Upstream-status: Pending + av_log(NULL, AV_LOG_ERROR, "%s: Plane total %u > buffer size %zu\n", __func__, offset, out->plane_info[0].length); + return -1; + } - + - for (i = 0; i < out->num_planes; i++) { - ret = v4l2_bufref_to_buf(out, i, frame->buf[i]->data, frame->buf[i]->size, 0, frame->buf[i]); - if (ret) @@ -46770,42 +50576,60 @@ Upstream-status: Pending - return 0; } - -@@ -411,14 +681,22 @@ static int v4l2_buffer_swframe_to_buf(co - - int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out) + +@@ -409,16 +730,31 @@ static int v4l2_buffer_swframe_to_buf(co + * + ******************************************************************************/ + +-int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out) ++int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts) { - v4l2_set_pts(out, frame->pts); -+ out->buf.flags = frame->key_frame ? (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME) : (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME); +- +- return v4l2_buffer_swframe_to_buf(frame, out); ++ out->buf.flags = frame->key_frame ? ++ (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) : ++ (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME); + // Beware that colour info is held in format rather than the actual + // v4l2 buffer struct so this may not be as useful as you might hope + v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc); + v4l2_set_color_range(out, frame->color_range); + // PTS & interlace are buffer vars -+ v4l2_set_pts(out, frame->pts, 0); ++ if (track_ts) ++ out->buf.timestamp = tv_from_int(track_ts); ++ else ++ v4l2_set_pts(out, frame->pts); + v4l2_set_interlace(out, frame->interlaced_frame, frame->top_field_first); - - return v4l2_buffer_swframe_to_buf(frame, out); ++ ++ return frame->format == AV_PIX_FMT_DRM_PRIME ? ++ v4l2_buffer_primeframe_to_buf(frame, out) : ++ v4l2_buffer_swframe_to_buf(frame, out); } - --int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) -+int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf, int no_rescale_pts) + + int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) { int ret; + V4L2Context * const ctx = avbuf->context; - + av_frame_unref(frame); - -@@ -433,13 +711,24 @@ int ff_v4l2_buffer_buf_to_avframe(AVFram + +@@ -429,17 +765,32 @@ int ff_v4l2_buffer_buf_to_avframe(AVFram + + /* 2. get frame information */ + frame->key_frame = !!(avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME); ++ frame->pict_type = frame->key_frame ? AV_PICTURE_TYPE_I : ++ (avbuf->buf.flags & V4L2_BUF_FLAG_PFRAME) != 0 ? AV_PICTURE_TYPE_P : ++ (avbuf->buf.flags & V4L2_BUF_FLAG_BFRAME) != 0 ? 
AV_PICTURE_TYPE_B : ++ AV_PICTURE_TYPE_NONE; + frame->color_primaries = v4l2_get_color_primaries(avbuf); frame->colorspace = v4l2_get_color_space(avbuf); frame->color_range = v4l2_get_color_range(avbuf); frame->color_trc = v4l2_get_color_trc(avbuf); -- frame->pts = v4l2_get_pts(avbuf); -+ frame->pts = v4l2_get_pts(avbuf, no_rescale_pts); + frame->pts = v4l2_get_pts(avbuf); frame->pkt_dts = AV_NOPTS_VALUE; + frame->interlaced_frame = v4l2_buf_is_interlaced(avbuf); + frame->top_field_first = v4l2_buf_is_top_first(avbuf); - + /* these values are updated also during re-init in v4l2_process_driver_event */ - frame->height = avbuf->context->height; - frame->width = avbuf->context->width; @@ -46820,18 +50644,17 @@ Upstream-status: Pending + frame->crop_right = ctx->selection.left + ctx->selection.width < frame->width ? + frame->width - (ctx->selection.left + ctx->selection.width) : 0; + frame->crop_bottom = ctx->selection.top + ctx->selection.height < frame->height ? -+ frame->width - (ctx->selection.top + ctx->selection.height) : 0; ++ frame->height - (ctx->selection.top + ctx->selection.height) : 0; + } - + /* 3. report errors upstream */ if (avbuf->buf.flags & V4L2_BUF_FLAG_ERROR) { -@@ -452,15 +741,16 @@ int ff_v4l2_buffer_buf_to_avframe(AVFram - +@@ -452,15 +803,15 @@ int ff_v4l2_buffer_buf_to_avframe(AVFram + int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf) { - int ret; -+ av_log(logger(avbuf), AV_LOG_INFO, "%s\n", __func__); - +- av_packet_unref(pkt); - ret = v4l2_buf_to_bufref(avbuf, 0, &pkt->buf); - if (ret) @@ -46840,29 +50663,25 @@ Upstream-status: Pending + pkt->buf = wrap_avbuf(avbuf); + if (pkt->buf == NULL) + return AVERROR(ENOMEM); - + pkt->size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type) ? avbuf->buf.m.planes[0].bytesused : avbuf->buf.bytesused; - pkt->data = pkt->buf->data; + pkt->data = (uint8_t*)avbuf->plane_info[0].mm_addr + avbuf->planes[0].data_offset; - ++ pkt->flags = 0; + if (avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME) pkt->flags |= AV_PKT_FLAG_KEY; -@@ -470,36 +760,89 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket - pkt->flags |= AV_PKT_FLAG_CORRUPT; - } - -- pkt->dts = pkt->pts = v4l2_get_pts(avbuf); -+ pkt->dts = pkt->pts = v4l2_get_pts(avbuf, 0); - +@@ -475,31 +826,91 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket return 0; } - + -int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) -+int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, -+ const void *extdata, size_t extlen, int no_rescale_pts) ++int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out, ++ const void *extdata, size_t extlen, ++ const int64_t timestamp) { int ret; - + - ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, 0, pkt->buf); - if (ret) + if (extlen) { @@ -46874,18 +50693,23 @@ Upstream-status: Pending + ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, extlen); + if (ret && ret != AVERROR(ENOMEM)) return ret; - + - v4l2_set_pts(out, pkt->pts); -+ v4l2_set_pts(out, pkt->pts, no_rescale_pts); - - if (pkt->flags & AV_PKT_FLAG_KEY) - out->flags = V4L2_BUF_FLAG_KEYFRAME; - -- return 0; ++ if (timestamp) ++ out->buf.timestamp = tv_from_int(timestamp); ++ else ++ v4l2_set_pts(out, pkt->pts); ++ ++ out->buf.flags = (pkt->flags & AV_PKT_FLAG_KEY) != 0 ? 
++ (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) : ++ (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME); + +- if (pkt->flags & AV_PKT_FLAG_KEY) +- out->flags = V4L2_BUF_FLAG_KEYFRAME; + return ret; - } - --int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) ++} + +- return 0; +int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) +{ + return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0, 0); @@ -46908,23 +50732,27 @@ Upstream-status: Pending + close(avbuf->drm_frame.objects[i].fd); + } + ++ av_buffer_unref(&avbuf->ref_buf); ++ + ff_weak_link_unref(&avbuf->context_wl); + + av_free(avbuf); -+} + } + +-int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) + -+ -+int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx) ++int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx, enum v4l2_memory mem) { - V4L2Context *ctx = avbuf->context; int ret, i; + V4L2Buffer * const avbuf = av_mallocz(sizeof(*avbuf)); + AVBufferRef * bufref; -+ + +- avbuf->buf.memory = V4L2_MEMORY_MMAP; + *pbufref = NULL; + if (avbuf == NULL) + return AVERROR(ENOMEM); - ++ + bufref = av_buffer_create((uint8_t*)avbuf, sizeof(*avbuf), v4l2_buffer_buffer_free, NULL, 0); + if (bufref == NULL) { + av_free(avbuf); @@ -46932,10 +50760,10 @@ Upstream-status: Pending + } + + avbuf->context = ctx; - avbuf->buf.memory = V4L2_MEMORY_MMAP; ++ avbuf->buf.memory = mem; avbuf->buf.type = ctx->type; avbuf->buf.index = index; - + + for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) { + avbuf->drm_frame.objects[i].fd = -1; + } @@ -46945,43 +50773,48 @@ Upstream-status: Pending if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { avbuf->buf.length = VIDEO_MAX_PLANES; avbuf->buf.m.planes = avbuf->planes; -@@ -507,7 +850,7 @@ int ff_v4l2_buffer_initialize(V4L2Buffer - +@@ -507,7 +918,7 @@ int ff_v4l2_buffer_initialize(V4L2Buffer + ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QUERYBUF, &avbuf->buf); if (ret < 0) - return AVERROR(errno); + goto fail; - + if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { avbuf->num_planes = 0; -@@ -527,25 +870,33 @@ int ff_v4l2_buffer_initialize(V4L2Buffer - +@@ -520,6 +931,8 @@ int ff_v4l2_buffer_initialize(V4L2Buffer + avbuf->num_planes = 1; + + for (i = 0; i < avbuf->num_planes; i++) { ++ const int want_mmap = avbuf->buf.memory == V4L2_MEMORY_MMAP && ++ (V4L2_TYPE_IS_OUTPUT(ctx->type) || !buf_to_m2mctx(avbuf)->output_drm); + + avbuf->plane_info[i].bytesperline = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? 
+ ctx->format.fmt.pix_mp.plane_fmt[i].bytesperline : +@@ -527,25 +940,29 @@ int ff_v4l2_buffer_initialize(V4L2Buffer + if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length; - avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, - PROT_READ | PROT_WRITE, MAP_SHARED, - buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset); + -+ if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) || -+ !buf_to_m2mctx(avbuf)->output_drm) { ++ if (want_mmap) + avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, + PROT_READ | PROT_WRITE, MAP_SHARED, + buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset); -+ } } else { avbuf->plane_info[i].length = avbuf->buf.length; - avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length, - PROT_READ | PROT_WRITE, MAP_SHARED, - buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset); + -+ if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) || -+ !buf_to_m2mctx(avbuf)->output_drm) { ++ if (want_mmap) + avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length, + PROT_READ | PROT_WRITE, MAP_SHARED, + buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset); -+ } } - + - if (avbuf->plane_info[i].mm_addr == MAP_FAILED) - return AVERROR(ENOMEM); + if (avbuf->plane_info[i].mm_addr == MAP_FAILED) { @@ -46990,19 +50823,19 @@ Upstream-status: Pending + goto fail; + } } - + avbuf->status = V4L2BUF_AVAILABLE; - + - if (V4L2_TYPE_IS_OUTPUT(ctx->type)) - return 0; - if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { avbuf->buf.m.planes = avbuf->planes; avbuf->buf.length = avbuf->num_planes; -@@ -555,7 +906,20 @@ int ff_v4l2_buffer_initialize(V4L2Buffer +@@ -555,20 +972,51 @@ int ff_v4l2_buffer_initialize(V4L2Buffer avbuf->buf.length = avbuf->planes[0].length; } - + - return ff_v4l2_buffer_enqueue(avbuf); + if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) { + if (buf_to_m2mctx(avbuf)->output_drm) { @@ -47019,19 +50852,20 @@ Upstream-status: Pending + av_buffer_unref(&bufref); + return ret; } - + int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf) -@@ -564,9 +928,27 @@ int ff_v4l2_buffer_enqueue(V4L2Buffer* a - - avbuf->buf.flags = avbuf->flags; - + { + int ret; ++ int qc; + +- avbuf->buf.flags = avbuf->flags; + if (avbuf->buf.timestamp.tv_sec || avbuf->buf.timestamp.tv_usec) { + av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s pre VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n", + avbuf->context->name, avbuf->buf.index, + avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, + avbuf->context->q_count); + } -+ + ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QBUF, &avbuf->buf); - if (ret < 0) - return AVERROR(errno); @@ -47042,31 +50876,34 @@ Upstream-status: Pending + err, strerror(err)); + return AVERROR(err); + } + ++ // Lock not wanted - if called from buffer free then lock already obtained ++ qc = atomic_fetch_add(&avbuf->context->q_count, 1) + 1; + avbuf->status = V4L2BUF_IN_DRIVER; ++ pthread_cond_broadcast(&avbuf->context->cond); + -+ ++avbuf->context->q_count; + av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n", + avbuf->context->name, avbuf->buf.index, -+ avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, -+ avbuf->context->q_count); - - avbuf->status = V4L2BUF_IN_DRIVER; - ++ avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, qc); + + return 0; + } --- a/libavcodec/v4l2_buffers.h +++ b/libavcodec/v4l2_buffers.h -@@ -27,25 +27,34 @@ +@@ -27,25 +27,38 @@ #include #include - + +#include "libavutil/hwcontext_drm.h" 
#include "avcodec.h" - + enum V4L2Buffer_status { V4L2BUF_AVAILABLE, V4L2BUF_IN_DRIVER, + V4L2BUF_IN_USE, V4L2BUF_RET_USER, }; - + /** * V4L2Buffer (wrapper for v4l2_buffer management) */ @@ -47083,49 +50920,70 @@ Upstream-status: Pending + */ struct V4L2Context *context; + struct ff_weak_link_client *context_wl; - + - /* This object is refcounted per-plane, so we need to keep track - * of how many context-refs we are holding. */ - AVBufferRef *context_ref; - atomic_uint context_refcount; + /* DRM descriptor */ + AVDRMFrameDescriptor drm_frame; - ++ /* For DRM_PRIME encode - need to keep a ref to the source buffer till we ++ * are done ++ */ ++ AVBufferRef * ref_buf; + /* keep track of the mmap address and mmap length */ struct V4L2Plane_info { -@@ -70,11 +79,12 @@ typedef struct V4L2Buffer { - * - * @param[in] frame The AVFRame to push the information to - * @param[in] buf The V4L2Buffer to get the information from -+ * @param[in] no_rescale_pts If non-zero do not rescale PTS - * - * @returns 0 in case of success, AVERROR(EINVAL) if the number of planes is incorrect, - * AVERROR(ENOMEM) if the AVBufferRef can't be created. - */ --int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *buf); -+int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *buf, int no_rescale_pts); - - /** - * Extracts the data from a V4L2Buffer to an AVPacket -@@ -98,6 +108,9 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket +@@ -60,7 +73,6 @@ typedef struct V4L2Buffer { + struct v4l2_buffer buf; + struct v4l2_plane planes[VIDEO_MAX_PLANES]; + +- int flags; + enum V4L2Buffer_status status; + + } V4L2Buffer; +@@ -98,6 +110,10 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket */ int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out); - -+int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, -+ const void *extdata, size_t extlen, int no_rescale_pts); + ++int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out, ++ const void *extdata, size_t extlen, ++ const int64_t timestamp); + /** * Extracts the data from an AVFrame to a V4L2Buffer * -@@ -116,7 +129,7 @@ int ff_v4l2_buffer_avframe_to_buf(const +@@ -106,7 +122,7 @@ int ff_v4l2_buffer_avpkt_to_buf(const AV + * + * @returns 0 in case of success, a negative AVERROR code otherwise + */ +-int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out); ++int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts); + + /** + * Initializes a V4L2Buffer +@@ -116,7 +132,7 @@ int ff_v4l2_buffer_avframe_to_buf(const * * @returns 0 in case of success, a negative AVERROR code otherwise */ -int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index); -+int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx); - ++int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx, enum v4l2_memory mem); + /** * Enqueues a V4L2Buffer +@@ -127,5 +143,12 @@ int ff_v4l2_buffer_initialize(V4L2Buffer + */ + int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf); + ++static inline void ++ff_v4l2_buffer_set_avail(V4L2Buffer* const avbuf) ++{ ++ avbuf->status = V4L2BUF_AVAILABLE; ++ av_buffer_unref(&avbuf->ref_buf); ++} ++ + + #endif // AVCODEC_V4L2_BUFFERS_H --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -27,11 +27,13 @@ @@ -47139,41 +50997,233 @@ Upstream-status: Pending #include "v4l2_fmt.h" #include "v4l2_m2m.h" +#include "weak_link.h" - + struct v4l2_format_update { uint32_t v4l2_fmt; -@@ -53,16 +55,6 @@ static 
inline AVCodecContext *logger(V4L - return ctx_to_m2mctx(ctx)->avctx; - } - --static inline unsigned int v4l2_get_width(struct v4l2_format *fmt) --{ -- return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width; --} -- --static inline unsigned int v4l2_get_height(struct v4l2_format *fmt) --{ -- return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height; --} -- - static AVRational v4l2_get_sar(V4L2Context *ctx) +@@ -41,26 +43,168 @@ struct v4l2_format_update { + int update_avfmt; + }; + +-static inline V4L2m2mContext *ctx_to_m2mctx(V4L2Context *ctx) ++ ++static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n) { - struct AVRational sar = { 0, 1 }; -@@ -94,8 +86,8 @@ static inline unsigned int v4l2_resoluti +- return V4L2_TYPE_IS_OUTPUT(ctx->type) ? +- container_of(ctx, V4L2m2mContext, output) : +- container_of(ctx, V4L2m2mContext, capture); ++ return (int64_t)n; + } + +-static inline AVCodecContext *logger(V4L2Context *ctx) ++static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts) + { +- return ctx_to_m2mctx(ctx)->avctx; ++ return (unsigned int)pts; + } + +-static inline unsigned int v4l2_get_width(struct v4l2_format *fmt) ++// FFmpeg requires us to propagate a number of vars from the coded pkt into ++// the decoded frame. The only thing that tracks like that in V4L2 stateful ++// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no ++// guarantees about PTS being unique or specified for every frame so replace ++// the supplied PTS with a simple incrementing number and keep a circular ++// buffer of all the things we want preserved (including the original PTS) ++// indexed by the tracking no. ++static int64_t ++xlat_pts_pkt_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVPacket *const avpkt) + { +- return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width; ++ int64_t track_pts; ++ ++ // Avoid 0 ++ if (++x->track_no == 0) ++ x->track_no = 1; ++ ++ track_pts = track_to_pts(avctx, x->track_no); ++ ++ av_log(avctx, AV_LOG_TRACE, "In pkt PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no); ++ x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ ++ .discard = 0, ++ .pending = 1, ++ .pkt_size = avpkt->size, ++ .pts = avpkt->pts, ++ .dts = avpkt->dts, ++ .reordered_opaque = avctx->reordered_opaque, ++ .pkt_pos = avpkt->pos, ++ .pkt_duration = avpkt->duration, ++ .track_pts = track_pts ++ }; ++ return track_pts; + } + +-static inline unsigned int v4l2_get_height(struct v4l2_format *fmt) ++static int64_t ++xlat_pts_frame_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVFrame *const frame) + { +- return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? 
fmt->fmt.pix_mp.height : fmt->fmt.pix.height; ++ int64_t track_pts; ++ ++ // Avoid 0 ++ if (++x->track_no == 0) ++ x->track_no = 1; ++ ++ track_pts = track_to_pts(avctx, x->track_no); ++ ++ av_log(avctx, AV_LOG_TRACE, "In frame PTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", frame->pts, track_pts, x->track_no); ++ x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ ++ .discard = 0, ++ .pending = 1, ++ .pkt_size = 0, ++ .pts = frame->pts, ++ .dts = AV_NOPTS_VALUE, ++ .reordered_opaque = frame->reordered_opaque, ++ .pkt_pos = frame->pkt_pos, ++ .pkt_duration = frame->pkt_duration, ++ .track_pts = track_pts ++ }; ++ return track_pts; ++} ++ ++ ++// Returns -1 if we should discard the frame ++static int ++xlat_pts_frame_out(AVCodecContext *const avctx, ++ xlat_track_t * const x, ++ AVFrame *const frame) ++{ ++ unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE; ++ V4L2m2mTrackEl *const t = x->track_els + n; ++ if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts) ++ { ++ av_log(avctx, frame->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING, ++ "Frame tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); ++ frame->pts = AV_NOPTS_VALUE; ++ frame->pkt_dts = AV_NOPTS_VALUE; ++ frame->reordered_opaque = x->last_opaque; ++ frame->pkt_pos = -1; ++ frame->pkt_duration = 0; ++ frame->pkt_size = -1; ++ } ++ else if (!t->discard) ++ { ++ frame->pts = t->pending ? t->pts : AV_NOPTS_VALUE; ++ frame->pkt_dts = t->dts; ++ frame->reordered_opaque = t->reordered_opaque; ++ frame->pkt_pos = t->pkt_pos; ++ frame->pkt_duration = t->pkt_duration; ++ frame->pkt_size = t->pkt_size; ++ ++ x->last_opaque = x->track_els[n].reordered_opaque; ++ if (frame->pts != AV_NOPTS_VALUE) ++ x->last_pts = frame->pts; ++ t->pending = 0; ++ } ++ else ++ { ++ av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); ++ return -1; ++ } ++ ++ av_log(avctx, AV_LOG_TRACE, "Out frame PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n", ++ frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n); ++ return 0; ++} ++ ++// Returns -1 if we should discard the frame ++static int ++xlat_pts_pkt_out(AVCodecContext *const avctx, ++ xlat_track_t * const x, ++ AVPacket *const pkt) ++{ ++ unsigned int n = pts_to_track(avctx, pkt->pts) % FF_V4L2_M2M_TRACK_SIZE; ++ V4L2m2mTrackEl *const t = x->track_els + n; ++ if (pkt->pts == AV_NOPTS_VALUE || pkt->pts != t->track_pts) ++ { ++ av_log(avctx, pkt->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING, ++ "Pkt tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts); ++ pkt->pts = AV_NOPTS_VALUE; ++ } ++ else if (!t->discard) ++ { ++ pkt->pts = t->pending ? t->pts : AV_NOPTS_VALUE; ++ ++ x->last_opaque = x->track_els[n].reordered_opaque; ++ if (pkt->pts != AV_NOPTS_VALUE) ++ x->last_pts = pkt->pts; ++ t->pending = 0; ++ } ++ else ++ { ++ av_log(avctx, AV_LOG_DEBUG, "Discard packet (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts); ++ return -1; ++ } ++ ++ // * Would like something much better than this...xlat(offset + out_count)? ++ pkt->dts = pkt->pts; ++ av_log(avctx, AV_LOG_TRACE, "Out pkt PTS=%" PRId64 ", track=%"PRId64", n=%d\n", ++ pkt->pts, t->track_pts, n); ++ return 0; ++} ++ ++ ++static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx) ++{ ++ return V4L2_TYPE_IS_OUTPUT(ctx->type) ? 
++ container_of(ctx, V4L2m2mContext, output) : ++ container_of(ctx, V4L2m2mContext, capture); ++} ++ ++static inline AVCodecContext *logger(const V4L2Context *ctx) ++{ ++ return ctx_to_m2mctx(ctx)->avctx; + } + + static AVRational v4l2_get_sar(V4L2Context *ctx) +@@ -81,21 +225,29 @@ static AVRational v4l2_get_sar(V4L2Conte + return sar; + } + +-static inline unsigned int v4l2_resolution_changed(V4L2Context *ctx, struct v4l2_format *fmt2) ++static inline int ctx_buffers_alloced(const V4L2Context * const ctx) ++{ ++ return ctx->bufrefs != NULL; ++} ++ ++// Width/Height changed or we don't have an alloc in the first place? ++static int ctx_resolution_changed(const V4L2Context *ctx, const struct v4l2_format *fmt2) + { +- struct v4l2_format *fmt1 = &ctx->format; +- int ret = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? +- fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width || +- fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height +- : +- fmt1->fmt.pix.width != fmt2->fmt.pix.width || +- fmt1->fmt.pix.height != fmt2->fmt.pix.height; ++ const struct v4l2_format *fmt1 = &ctx->format; ++ int ret = !ctx_buffers_alloced(ctx) || ++ (V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ++ fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width || ++ fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height ++ : ++ fmt1->fmt.pix.width != fmt2->fmt.pix.width || ++ fmt1->fmt.pix.height != fmt2->fmt.pix.height); + if (ret) - av_log(logger(ctx), AV_LOG_DEBUG, "%s changed (%dx%d) -> (%dx%d)\n", +- av_log(logger(ctx), AV_LOG_DEBUG, "%s changed (%dx%d) -> (%dx%d)\n", ++ av_log(logger(ctx), AV_LOG_DEBUG, "V4L2 %s changed: alloc=%d (%dx%d) -> (%dx%d)\n", ctx->name, - v4l2_get_width(fmt1), v4l2_get_height(fmt1), - v4l2_get_width(fmt2), v4l2_get_height(fmt2)); ++ ctx_buffers_alloced(ctx), + ff_v4l2_get_format_width(fmt1), ff_v4l2_get_format_height(fmt1), + ff_v4l2_get_format_width(fmt2), ff_v4l2_get_format_height(fmt2)); - + return ret; } -@@ -153,58 +145,67 @@ static inline void v4l2_save_to_context( +@@ -153,90 +305,110 @@ static inline void v4l2_save_to_context( } } - + -/** - * handle resolution change event and end of stream event - * returns 1 if reinit was successful, negative if it failed @@ -47192,7 +51242,7 @@ Upstream-status: Pending + .type = V4L2_BUF_TYPE_VIDEO_CAPTURE, + .target = V4L2_SEL_TGT_COMPOSE + }; - + - ret = ioctl(s->fd, VIDIOC_DQEVENT, &evt); - if (ret < 0) { - av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_DQEVENT\n", ctx->name); @@ -47201,7 +51251,7 @@ Upstream-status: Pending + memset(r, 0, sizeof(*r)); + if (ioctl(s->fd, VIDIOC_G_SELECTION, &selection)) + return AVERROR(errno); - + - if (evt.type == V4L2_EVENT_EOS) { - ctx->done = 1; - return 0; @@ -47209,49 +51259,45 @@ Upstream-status: Pending + *r = selection.r; + return 0; +} - + - if (evt.type != V4L2_EVENT_SOURCE_CHANGE) - return 0; +static int do_source_change(V4L2m2mContext * const s) +{ + AVCodecContext *const avctx = s->avctx; -+ + +- ret = ioctl(s->fd, VIDIOC_G_FMT, &out_fmt); +- if (ret) { +- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->output.name); +- return 0; +- } + int ret; + int reinit; -+ int full_reinit; + struct v4l2_format cap_fmt = s->capture.format; -+ struct v4l2_format out_fmt = s->output.format; + -+ s->resize_pending = 0; + s->capture.done = 0; - - ret = ioctl(s->fd, VIDIOC_G_FMT, &out_fmt); - if (ret) { -- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->output.name); -+ av_log(avctx, AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", s->output.name); - return 0; - } - + ret = ioctl(s->fd, VIDIOC_G_FMT, &cap_fmt); if (ret) { - 
av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->capture.name); + av_log(avctx, AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", s->capture.name); return 0; } - - full_reinit = v4l2_resolution_changed(&s->output, &out_fmt); - if (full_reinit) { + +- full_reinit = v4l2_resolution_changed(&s->output, &out_fmt); +- if (full_reinit) { - s->output.height = v4l2_get_height(&out_fmt); - s->output.width = v4l2_get_width(&out_fmt); - s->output.sample_aspect_ratio = v4l2_get_sar(&s->output); -+ s->output.height = ff_v4l2_get_format_height(&out_fmt); -+ s->output.width = ff_v4l2_get_format_width(&out_fmt); - } -+ s->output.sample_aspect_ratio = v4l2_get_sar(&s->output); -+ +- } + get_default_selection(&s->capture, &s->capture.selection); - - reinit = v4l2_resolution_changed(&s->capture, &cap_fmt); ++ ++ reinit = ctx_resolution_changed(&s->capture, &cap_fmt); ++ if ((s->quirks & FF_V4L2_QUIRK_REINIT_ALWAYS) != 0) ++ reinit = 1; + +- reinit = v4l2_resolution_changed(&s->capture, &cap_fmt); ++ s->capture.format = cap_fmt; if (reinit) { - s->capture.height = v4l2_get_height(&cap_fmt); - s->capture.width = v4l2_get_width(&cap_fmt); @@ -47259,368 +51305,491 @@ Upstream-status: Pending + s->capture.height = ff_v4l2_get_format_height(&cap_fmt); + s->capture.width = ff_v4l2_get_format_width(&cap_fmt); } + +- if (full_reinit || reinit) +- s->reinit = 1; +- +- if (full_reinit) { +- ret = ff_v4l2_m2m_codec_full_reinit(s); +- if (ret) { +- av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_full_reinit\n"); +- return AVERROR(EINVAL); +- } +- goto reinit_run; ++ // If we don't support selection (or it is bust) and we obviously have HD then kludge ++ if ((s->capture.selection.width == 0 || s->capture.selection.height == 0) && ++ (s->capture.height == 1088 && s->capture.width == 1920)) { ++ s->capture.selection = (struct v4l2_rect){.width = 1920, .height = 1080}; + } + + s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture); + -+ av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, crop %dx%d @ %d,%d\n", ++ av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, wxh %dx%d crop %dx%d @ %d,%d, reinit=%d\n", + s->capture.sample_aspect_ratio.num, s->capture.sample_aspect_ratio.den, ++ s->capture.width, s->capture.height, + s->capture.selection.width, s->capture.selection.height, -+ s->capture.selection.left, s->capture.selection.top); - - if (full_reinit || reinit) - s->reinit = 1; -@@ -212,34 +213,88 @@ static int v4l2_handle_event(V4L2Context - if (full_reinit) { - ret = ff_v4l2_m2m_codec_full_reinit(s); - if (ret) { -- av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_full_reinit\n"); -+ av_log(avctx, AV_LOG_ERROR, "v4l2_m2m_codec_full_reinit failed\n"); - return AVERROR(EINVAL); - } - goto reinit_run; - } - ++ s->capture.selection.left, s->capture.selection.top, reinit); ++ if (reinit) { - if (s->avctx) +- ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height); + if (avctx) - ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height); ++ ret = ff_set_dimensions(s->avctx, ++ s->capture.selection.width != 0 ? s->capture.selection.width : s->capture.width, ++ s->capture.selection.height != 0 ? 
s->capture.selection.height : s->capture.height); if (ret < 0) - av_log(logger(ctx), AV_LOG_WARNING, "update avcodec height and width\n"); + av_log(avctx, AV_LOG_WARNING, "update avcodec height and width failed\n"); - + ret = ff_v4l2_m2m_codec_reinit(s); if (ret) { - av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_reinit\n"); + av_log(avctx, AV_LOG_ERROR, "v4l2_m2m_codec_reinit failed\n"); return AVERROR(EINVAL); } ++ ++ if (s->capture.width > ff_v4l2_get_format_width(&s->capture.format) || ++ s->capture.height > ff_v4l2_get_format_height(&s->capture.format)) { ++ av_log(avctx, AV_LOG_ERROR, "Format post reinit too small: wanted %dx%d > got %dx%d\n", ++ s->capture.width, s->capture.height, ++ ff_v4l2_get_format_width(&s->capture.format), ff_v4l2_get_format_height(&s->capture.format)); ++ return AVERROR(EINVAL); ++ } ++ ++ // Update pixel format - should only actually do something on initial change ++ s->capture.av_pix_fmt = ++ ff_v4l2_format_v4l2_to_avfmt(ff_v4l2_get_format_pixelformat(&s->capture.format), AV_CODEC_ID_RAWVIDEO); ++ if (s->output_drm) { ++ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; ++ avctx->sw_pix_fmt = s->capture.av_pix_fmt; ++ } ++ else ++ avctx->pix_fmt = s->capture.av_pix_fmt; ++ goto reinit_run; } - + - /* dummy event received */ - return 0; + /* Buffers are OK so just stream off to ack */ -+ av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only\n", __func__); ++ av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only - restart decode\n", __func__); + + ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF); + if (ret) + av_log(avctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF failed\n"); + s->draining = 0; - + /* reinit executed */ reinit_run: + ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMON); return 1; } - -+static int ctx_done(V4L2Context * const ctx) -+{ -+ int rv = 0; -+ V4L2m2mContext * const s = ctx_to_m2mctx(ctx); -+ -+ ctx->done = 1; -+ -+ if (s->resize_pending && !V4L2_TYPE_IS_OUTPUT(ctx->type)) -+ rv = do_source_change(s); -+ -+ return rv; -+} -+ -+/** -+ * handle resolution change event and end of stream event -+ * returns 1 if reinit was successful, negative if it failed -+ * returns 0 if reinit was not executed -+ */ -+static int v4l2_handle_event(V4L2Context *ctx) -+{ -+ V4L2m2mContext * const s = ctx_to_m2mctx(ctx); -+ struct v4l2_event evt = { 0 }; -+ int ret; -+ -+ ret = ioctl(s->fd, VIDIOC_DQEVENT, &evt); -+ if (ret < 0) { -+ av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_DQEVENT\n", ctx->name); -+ return 0; -+ } -+ -+ av_log(logger(ctx), AV_LOG_INFO, "Dq event %d\n", evt.type); -+ -+ if (evt.type == V4L2_EVENT_EOS) { -+// ctx->done = 1; -+ av_log(logger(ctx), AV_LOG_TRACE, "%s VIDIOC_EVENT_EOS\n", ctx->name); -+ return 0; -+ } -+ -+ if (evt.type != V4L2_EVENT_SOURCE_CHANGE) -+ return 0; -+ -+ s->resize_pending = 1; -+ if (!ctx->done) -+ return 0; -+ -+ return do_source_change(s); -+} -+ - static int v4l2_stop_decode(V4L2Context *ctx) - { - struct v4l2_decoder_cmd cmd = { -@@ -280,8 +335,26 @@ static int v4l2_stop_encode(V4L2Context + +@@ -280,171 +452,277 @@ static int v4l2_stop_encode(V4L2Context return 0; } - -+static int count_in_driver(const V4L2Context * const ctx) + +-static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout) +-{ +- struct v4l2_plane planes[VIDEO_MAX_PLANES]; +- struct v4l2_buffer buf = { 0 }; +- V4L2Buffer *avbuf; +- struct pollfd pfd = { +- .events = POLLIN | POLLRDNORM | POLLPRI | POLLOUT | POLLWRNORM, /* default blocking capture */ +- .fd = ctx_to_m2mctx(ctx)->fd, ++// DQ a buffer ++// Amalgamates all the 
various ways there are of signalling EOS/Event to ++// generate a consistant EPIPE. ++// ++// Sets ctx->flag_last if next dq would produce EPIPE (i.e. stream has stopped) ++// ++// Returns: ++// 0 Success ++// AVERROR(EPIPE) Nothing more to read ++// AVERROR(ENOSPC) No buffers in Q to put result in ++// * AVERROR(..) ++ ++ static int ++dq_buf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf) +{ -+ int i; -+ int n = 0; ++ V4L2m2mContext * const m = ctx_to_m2mctx(ctx); ++ AVCodecContext * const avctx = m->avctx; ++ V4L2Buffer * avbuf; ++ const int is_mp = V4L2_TYPE_IS_MULTIPLANAR(ctx->type); + -+ if (!ctx->bufrefs) -+ return -1; ++ struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}}; + -+ for (i = 0; i < ctx->num_buffers; ++i) { -+ V4L2Buffer *const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; -+ if (avbuf->status == V4L2BUF_IN_DRIVER) -+ ++n; -+ } -+ return n; -+} -+ - static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout) - { -+ V4L2m2mContext * const s = ctx_to_m2mctx(ctx); -+ const int is_capture = !V4L2_TYPE_IS_OUTPUT(ctx->type); - struct v4l2_plane planes[VIDEO_MAX_PLANES]; - struct v4l2_buffer buf = { 0 }; - V4L2Buffer *avbuf; -@@ -290,50 +363,84 @@ static V4L2Buffer* v4l2_dequeue_v4l2buf( - .fd = ctx_to_m2mctx(ctx)->fd, ++ struct v4l2_buffer buf = { ++ .type = ctx->type, ++ .memory = V4L2_MEMORY_MMAP, }; - int i, ret; -+ int no_rx_means_done = 0; - +- int i, ret; + - if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx->buffers) { -+ if (is_capture && ctx->bufrefs) { - for (i = 0; i < ctx->num_buffers; i++) { +- for (i = 0; i < ctx->num_buffers; i++) { - if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER) -+ avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; -+ if (avbuf->status == V4L2BUF_IN_DRIVER) - break; - } - if (i == ctx->num_buffers) +- break; +- } +- if (i == ctx->num_buffers) - av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers returned to " -+ av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers (%d) returned to " - "userspace. Increase num_capture_buffers " - "to prevent device deadlock or dropped " +- "userspace. 
Increase num_capture_buffers " +- "to prevent device deadlock or dropped " - "packets/frames.\n"); -+ "packets/frames.\n", i); - } - -+#if 0 -+ // I think this is true but pointless -+ // we will get some other form of EOF signal -+ - /* if we are draining and there are no more capture buffers queued in the driver we are done */ +- } +- +- /* if we are draining and there are no more capture buffers queued in the driver we are done */ - if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx_to_m2mctx(ctx)->draining) { -+ if (is_capture && ctx_to_m2mctx(ctx)->draining) { - for (i = 0; i < ctx->num_buffers; i++) { - /* capture buffer initialization happens during decode hence - * detection happens at runtime - */ +- for (i = 0; i < ctx->num_buffers; i++) { +- /* capture buffer initialization happens during decode hence +- * detection happens at runtime +- */ - if (!ctx->buffers) -+ if (!ctx->bufrefs) - break; - +- break; +- - if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER) -+ avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; -+ if (avbuf->status == V4L2BUF_IN_DRIVER) - goto start; - } - ctx->done = 1; - return NULL; - } -+#endif - - start: +- goto start; +- } +- ctx->done = 1; +- return NULL; +- } +- +-start: - if (V4L2_TYPE_IS_OUTPUT(ctx->type)) - pfd.events = POLLOUT | POLLWRNORM; - else { -+ if (is_capture) { - /* no need to listen to requests for more input while draining */ - if (ctx_to_m2mctx(ctx)->draining) - pfd.events = POLLIN | POLLRDNORM | POLLPRI; -+ } else { -+ pfd.events = POLLOUT | POLLWRNORM; +- /* no need to listen to requests for more input while draining */ +- if (ctx_to_m2mctx(ctx)->draining) +- pfd.events = POLLIN | POLLRDNORM | POLLPRI; ++ *ppavbuf = NULL; ++ ++ if (ctx->flag_last) ++ return AVERROR(EPIPE); ++ ++ if (is_mp) { ++ buf.length = VIDEO_MAX_PLANES; ++ buf.m.planes = planes; } -+ no_rx_means_done = s->resize_pending && is_capture; - - for (;;) { + +- for (;;) { - ret = poll(&pfd, 1, timeout); -+ // If we have a resize pending then all buffers should be Qed -+ // With a resize pending we should be in drain but evidence suggests -+ // that not all decoders do this so poll to clear -+ int t2 = no_rx_means_done ? 0 : timeout < 0 ? 3000 : timeout; -+ const int e = pfd.events; -+ -+ ret = poll(&pfd, 1, t2); -+ - if (ret > 0) - break; +- if (ret > 0) +- break; - if (errno == EINTR) - continue; +- return NULL; ++ while (ioctl(m->fd, VIDIOC_DQBUF, &buf) != 0) { ++ const int err = errno; ++ av_assert0(AVERROR(err) < 0); ++ if (err != EINTR) { ++ av_log(avctx, AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n", ++ ctx->name, av_err2str(AVERROR(err))); + -+ if (ret < 0) { -+ int err = errno; -+ if (err == EINTR) -+ continue; -+ av_log(logger(ctx), AV_LOG_ERROR, "=== poll error %d (%s): events=%#x, cap buffers=%d\n", -+ err, strerror(err), -+ e, count_in_driver(ctx)); -+ return NULL; -+ } ++ if (err == EPIPE) ++ ctx->flag_last = 1; + -+ // ret == 0 (timeout) -+ if (no_rx_means_done) { -+ av_log(logger(ctx), AV_LOG_DEBUG, "Ctx done on timeout\n"); -+ ret = ctx_done(ctx); -+ if (ret > 0) -+ goto start; ++ return AVERROR(err); + } -+ if (timeout == -1) -+ av_log(logger(ctx), AV_LOG_ERROR, "=== poll unexpected TIMEOUT: events=%#x, cap buffers=%d\n", e, count_in_driver(ctx));; - return NULL; } - -@@ -343,7 +450,8 @@ start: - no need to raise a warning */ - if (timeout == 0) { - for (i = 0; i < ctx->num_buffers; i++) { ++ atomic_fetch_sub(&ctx->q_count, 1); + +- /* 0. 
handle errors */ +- if (pfd.revents & POLLERR) { +- /* if we are trying to get free buffers but none have been queued yet +- no need to raise a warning */ +- if (timeout == 0) { +- for (i = 0; i < ctx->num_buffers; i++) { - if (ctx->buffers[i].status != V4L2BUF_AVAILABLE) -+ avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; -+ if (avbuf->status != V4L2BUF_AVAILABLE) - av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name); - } - } -@@ -361,22 +469,25 @@ start: - ctx->done = 1; - return NULL; +- av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name); +- } ++ avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data; ++ ff_v4l2_buffer_set_avail(avbuf); ++ avbuf->buf = buf; ++ if (is_mp) { ++ memcpy(avbuf->planes, planes, sizeof(planes)); ++ avbuf->buf.m.planes = avbuf->planes; ++ } ++ // Done with any attached buffer ++ av_buffer_unref(&avbuf->ref_buf); ++ ++ if (V4L2_TYPE_IS_CAPTURE(ctx->type)) { ++ // Zero length cap buffer return == EOS ++ if ((is_mp ? buf.m.planes[0].bytesused : buf.bytesused) == 0) { ++ av_log(avctx, AV_LOG_DEBUG, "Buffer empty - reQ\n"); ++ ++ // Must reQ so we don't leak ++ // May not matter if the next thing we do is release all the ++ // buffers but better to be tidy. ++ ff_v4l2_buffer_enqueue(avbuf); ++ ++ ctx->flag_last = 1; ++ return AVERROR(EPIPE); } +- else +- av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name); + +- return NULL; ++#ifdef V4L2_BUF_FLAG_LAST ++ // If flag_last set then this contains data but is the last frame ++ // so remember that but return OK ++ if ((buf.flags & V4L2_BUF_FLAG_LAST) != 0) ++ ctx->flag_last = 1; ++#endif + } + +- /* 1. handle resolution changes */ +- if (pfd.revents & POLLPRI) { +- ret = v4l2_handle_event(ctx); +- if (ret < 0) { +- /* if re-init failed, abort */ +- ctx->done = 1; +- return NULL; +- } - if (ret) { - /* if re-init was successful drop the buffer (if there was one) - * since we had to reconfigure capture (unmap all buffers) - */ - return NULL; -- } -+ if (ret > 0) -+ goto start; - } - - /* 2. dequeue the buffer */ - if (pfd.revents & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) { - -- if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) { -+ if (is_capture) { - /* there is a capture buffer ready */ - if (pfd.revents & (POLLIN | POLLRDNORM)) - goto dequeue; - -+ // CAPTURE Q drained -+ if (no_rx_means_done) { -+ if (ctx_done(ctx) > 0) -+ goto start; -+ return NULL; -+ } ++ *ppavbuf = avbuf; ++ return 0; ++} + - /* the driver is ready to accept more input; instead of waiting for the capture - * buffer to complete we return NULL so input can proceed (we are single threaded) - */ -@@ -394,37 +505,58 @@ dequeue: - buf.m.planes = planes; ++/** ++ * handle resolution change event and end of stream event ++ * Expects to be called after the stream has stopped ++ * ++ * returns 1 if reinit was successful, negative if it failed ++ * returns 0 if reinit was not executed ++ */ ++static int ++get_event(V4L2m2mContext * const m) ++{ ++ AVCodecContext * const avctx = m->avctx; ++ struct v4l2_event evt = { 0 }; ++ ++ while (ioctl(m->fd, VIDIOC_DQEVENT, &evt) != 0) { ++ const int rv = AVERROR(errno); ++ if (rv == AVERROR(EINTR)) ++ continue; ++ if (rv == AVERROR(EAGAIN)) { ++ av_log(avctx, AV_LOG_WARNING, "V4L2 failed to get expected event - assume EOS\n"); ++ return AVERROR_EOF; } - ++ av_log(avctx, AV_LOG_ERROR, "V4L2 VIDIOC_DQEVENT: %s\n", av_err2str(rv)); ++ return rv; + } + +- /* 2. 
dequeue the buffer */ +- if (pfd.revents & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) { ++ av_log(avctx, AV_LOG_DEBUG, "Dq event %d\n", evt.type); + +- if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) { +- /* there is a capture buffer ready */ +- if (pfd.revents & (POLLIN | POLLRDNORM)) +- goto dequeue; ++ if (evt.type == V4L2_EVENT_EOS) { ++ av_log(avctx, AV_LOG_TRACE, "V4L2 VIDIOC_EVENT_EOS\n"); ++ return AVERROR_EOF; ++ } ++ ++ if (evt.type == V4L2_EVENT_SOURCE_CHANGE) ++ return do_source_change(m); ++ ++ return 0; ++} ++ ++ ++// Get a buffer ++// If output then just gets the buffer in the expected way ++// If capture then runs the capture state m/c to deal with res change etc. ++// If return value == 0 then *ppavbuf != NULL ++ ++static int ++get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout) ++{ ++ V4L2m2mContext * const m = ctx_to_m2mctx(ctx); ++ AVCodecContext * const avctx = m->avctx; ++ const int is_cap = V4L2_TYPE_IS_CAPTURE(ctx->type); ++ ++ const unsigned int poll_cap = (POLLIN | POLLRDNORM); ++ const unsigned int poll_out = (POLLOUT | POLLWRNORM); ++ const unsigned int poll_event = POLLPRI; ++ ++ *ppavbuf = NULL; + +- /* the driver is ready to accept more input; instead of waiting for the capture +- * buffer to complete we return NULL so input can proceed (we are single threaded) +- */ +- if (pfd.revents & (POLLOUT | POLLWRNORM)) +- return NULL; ++ for (;;) { ++ struct pollfd pfd = { ++ .fd = m->fd, ++ // If capture && stream not started then assume we are waiting for the initial event ++ .events = !is_cap ? poll_out : ++ !ff_v4l2_ctx_eos(ctx) && ctx->streamon ? poll_cap : ++ poll_event, ++ }; ++ int ret; ++ ++ if (ctx->done) { ++ av_log(avctx, AV_LOG_TRACE, "V4L2 %s already done\n", ctx->name); ++ return AVERROR_EOF; + } + +-dequeue: +- memset(&buf, 0, sizeof(buf)); +- buf.memory = V4L2_MEMORY_MMAP; +- buf.type = ctx->type; +- if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { +- memset(planes, 0, sizeof(planes)); +- buf.length = VIDEO_MAX_PLANES; +- buf.m.planes = planes; ++ // If capture && timeout == -1 then also wait for rx buffer free ++ if (is_cap && timeout == -1 && m->output.streamon && !m->draining) ++ pfd.events |= poll_out; ++ ++ // If nothing Qed all we will get is POLLERR - avoid that ++ if ((pfd.events == poll_out && atomic_load(&m->output.q_count) == 0) || ++ (pfd.events == poll_cap && atomic_load(&m->capture.q_count) == 0) || ++ (pfd.events == (poll_cap | poll_out) && atomic_load(&m->capture.q_count) == 0 && atomic_load(&m->output.q_count) == 0)) { ++ av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name); ++ return AVERROR(ENOSPC); + } + - ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf); - if (ret) { - if (errno != EAGAIN) { - ctx->done = 1; - if (errno != EPIPE) -+ while ((ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf)) == -1) { -+ const int err = errno; -+ if (err == EINTR) -+ continue; -+ if (err != EAGAIN) { -+ // EPIPE on CAPTURE can be used instead of BUF_FLAG_LAST -+ if (err != EPIPE || !is_capture) - av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n", +- av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n", - ctx->name, av_err2str(AVERROR(errno))); -+ ctx->name, av_err2str(AVERROR(err))); -+ if (ctx_done(ctx) > 0) -+ goto start; - } - return NULL; - } -+ --ctx->q_count; -+ av_log(logger(ctx), AV_LOG_DEBUG, "--- %s VIDIOC_DQBUF OK: index=%d, ts=%ld.%06ld, count=%d, dq=%d\n", -+ ctx->name, buf.index, -+ buf.timestamp.tv_sec, buf.timestamp.tv_usec, -+ ctx->q_count, ++ctx->dq_count); -+ -+ 
avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data; -+ avbuf->status = V4L2BUF_AVAILABLE; -+ avbuf->buf = buf; -+ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { -+ memcpy(avbuf->planes, planes, sizeof(planes)); -+ avbuf->buf.m.planes = avbuf->planes; ++ // Timeout kludged s.t. "forever" eventually gives up & produces logging ++ // If waiting for an event when we have seen a last_frame then we expect ++ // it to be ready already so force a short timeout ++ ret = poll(&pfd, 1, ++ ff_v4l2_ctx_eos(ctx) ? 10 : ++ timeout == -1 ? 3000 : timeout); ++ if (ret < 0) { ++ ret = AVERROR(errno); // Remember errno before logging etc. ++ av_assert0(ret < 0); + } - -- if (ctx_to_m2mctx(ctx)->draining && !V4L2_TYPE_IS_OUTPUT(ctx->type)) { -+ if (ctx_to_m2mctx(ctx)->draining && is_capture) { - int bytesused = V4L2_TYPE_IS_MULTIPLANAR(buf.type) ? - buf.m.planes[0].bytesused : buf.bytesused; - if (bytesused == 0) { -- ctx->done = 1; -+ av_log(logger(ctx), AV_LOG_DEBUG, "Buffer empty - reQ\n"); + -+ // Must reQ so we don't leak -+ // May not matter if the next thing we do is release all the -+ // buffers but better to be tidy. -+ ff_v4l2_buffer_enqueue(avbuf); ++ av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s ret=%d, timeout=%d, events=%#x, revents=%#x\n", ++ ctx->name, ret, timeout, pfd.events, pfd.revents); + -+ if (ctx_done(ctx) > 0) -+ goto start; - return NULL; ++ if (ret < 0) { ++ if (ret == AVERROR(EINTR)) ++ continue; ++ av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll error %d (%s)\n", ctx->name, AVUNERROR(ret), av_err2str(ret)); ++ return ret; ++ } ++ ++ if (ret == 0) { ++ if (timeout == -1) ++ av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll unexpected timeout: events=%#x\n", ctx->name, pfd.events); ++ if (ff_v4l2_ctx_eos(ctx)) { ++ av_log(avctx, AV_LOG_WARNING, "V4L2 %s poll event timeout\n", ctx->name); ++ ret = get_event(m); ++ if (ret < 0) { ++ ctx->done = 1; ++ return ret; ++ } } - #ifdef V4L2_BUF_FLAG_LAST +- return NULL; ++ return AVERROR(EAGAIN); + } + +- if (ctx_to_m2mctx(ctx)->draining && !V4L2_TYPE_IS_OUTPUT(ctx->type)) { +- int bytesused = V4L2_TYPE_IS_MULTIPLANAR(buf.type) ? 
+- buf.m.planes[0].bytesused : buf.bytesused; +- if (bytesused == 0) { ++ if ((pfd.revents & POLLERR) != 0) { ++ av_log(avctx, AV_LOG_WARNING, "V4L2 %s POLLERR\n", ctx->name); ++ return AVERROR_UNKNOWN; ++ } ++ ++ if ((pfd.revents & poll_event) != 0) { ++ ret = get_event(m); ++ if (ret < 0) { + ctx->done = 1; +- return NULL; ++ return ret; + } +-#ifdef V4L2_BUF_FLAG_LAST - if (buf.flags & V4L2_BUF_FLAG_LAST) - ctx->done = 1; -+ if (buf.flags & V4L2_BUF_FLAG_LAST) { -+ av_log(logger(ctx), AV_LOG_TRACE, "FLAG_LAST set\n"); -+ avbuf->status = V4L2BUF_IN_USE; // Avoid flushing this buffer -+ ctx_done(ctx); -+ } - #endif +-#endif ++ continue; ++ } ++ ++ if ((pfd.revents & poll_cap) != 0) { ++ ret = dq_buf(ctx, ppavbuf); ++ if (ret == AVERROR(EPIPE)) ++ continue; ++ return ret; } - + - avbuf = &ctx->buffers[buf.index]; - avbuf->status = V4L2BUF_AVAILABLE; - avbuf->buf = buf; - if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { - memcpy(avbuf->planes, planes, sizeof(planes)); - avbuf->buf.m.planes = avbuf->planes; -- } - return avbuf; ++ if ((pfd.revents & poll_out) != 0) { ++ if (is_cap) ++ return AVERROR(EAGAIN); ++ return dq_buf(ctx, ppavbuf); + } +- return avbuf; ++ ++ av_log(avctx, AV_LOG_ERROR, "V4L2 poll unexpected events=%#x, revents=%#x\n", pfd.events, pfd.revents); ++ return AVERROR_UNKNOWN; } - -@@ -443,8 +575,9 @@ static V4L2Buffer* v4l2_getfree_v4l2buf( ++} + +- return NULL; ++// Clear out flags and timestamps that should should be set by the user ++// Returns the passed avbuf ++static V4L2Buffer * ++clean_v4l2_buffer(V4L2Buffer * const avbuf) ++{ ++ struct v4l2_buffer *const buf = &avbuf->buf; ++ ++ buf->flags = 0; ++ buf->field = V4L2_FIELD_ANY; ++ buf->timestamp = (struct timeval){0}; ++ buf->timecode = (struct v4l2_timecode){0}; ++ buf->sequence = 0; ++ ++ return avbuf; + } + + static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) + { +- int timeout = 0; /* return when no more buffers to dequeue */ + int i; + + /* get back as many output buffers as possible */ + if (V4L2_TYPE_IS_OUTPUT(ctx->type)) { +- do { +- } while (v4l2_dequeue_v4l2buf(ctx, timeout)); ++ V4L2Buffer * avbuf; ++ do { ++ get_qbuf(ctx, &avbuf, 0); ++ } while (avbuf); } - + for (i = 0; i < ctx->num_buffers; i++) { - if (ctx->buffers[i].status == V4L2BUF_AVAILABLE) - return &ctx->buffers[i]; + V4L2Buffer * const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; + if (avbuf->status == V4L2BUF_AVAILABLE) -+ return avbuf; ++ return clean_v4l2_buffer(avbuf); } - + return NULL; -@@ -452,25 +585,45 @@ static V4L2Buffer* v4l2_getfree_v4l2buf( - +@@ -452,25 +730,45 @@ static V4L2Buffer* v4l2_getfree_v4l2buf( + static int v4l2_release_buffers(V4L2Context* ctx) { - struct v4l2_requestbuffers req = { @@ -47632,12 +51801,12 @@ Upstream-status: Pending + int i; + int ret = 0; + const int fd = ctx_to_m2mctx(ctx)->fd; - + - for (i = 0; i < ctx->num_buffers; i++) { - V4L2Buffer *buffer = &ctx->buffers[i]; + // Orphan any buffers in the wild + ff_weak_link_break(&ctx->wl_master); - + - for (j = 0; j < buffer->num_planes; j++) { - struct V4L2Plane_info *p = &buffer->plane_info[j]; - if (p->mm_addr && p->length) @@ -47672,15 +51841,15 @@ Upstream-status: Pending + " 2. drmIoctl(.., DRM_IOCTL_GEM_CLOSE,... 
)\n"); } } -+ ctx->q_count = 0; - ++ atomic_store(&ctx->q_count, 0); + - return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_REQBUFS, &req); + return ret; } - + static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfmt) -@@ -499,6 +652,8 @@ static inline int v4l2_try_raw_format(V4 - +@@ -499,6 +797,8 @@ static inline int v4l2_try_raw_format(V4 + static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) { + V4L2m2mContext* s = ctx_to_m2mctx(ctx); @@ -47688,10 +51857,10 @@ Upstream-status: Pending enum AVPixelFormat pixfmt = ctx->av_pix_fmt; struct v4l2_fmtdesc fdesc; int ret; -@@ -517,6 +672,13 @@ static int v4l2_get_raw_format(V4L2Conte +@@ -517,6 +817,13 @@ static int v4l2_get_raw_format(V4L2Conte if (ret) return AVERROR(EINVAL); - + + if (priv->pix_fmt != AV_PIX_FMT_NONE) { + if (fdesc.pixelformat != ff_v4l2_format_avfmt_to_v4l2(priv->pix_fmt)) { + fdesc.index++; @@ -47702,10 +51871,10 @@ Upstream-status: Pending pixfmt = ff_v4l2_format_v4l2_to_avfmt(fdesc.pixelformat, AV_CODEC_ID_RAWVIDEO); ret = v4l2_try_raw_format(ctx, pixfmt); if (ret){ -@@ -569,18 +731,77 @@ static int v4l2_get_coded_format(V4L2Con +@@ -569,30 +876,99 @@ static int v4l2_get_coded_format(V4L2Con * *****************************************************************************/ - + + +static void flush_all_buffers_status(V4L2Context* const ctx) +{ @@ -47717,9 +51886,9 @@ Upstream-status: Pending + for (i = 0; i < ctx->num_buffers; ++i) { + struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data; + if (buf->status == V4L2BUF_IN_DRIVER) -+ buf->status = V4L2BUF_AVAILABLE; ++ ff_v4l2_buffer_set_avail(buf); + } -+ ctx->q_count = 0; ++ atomic_store(&ctx->q_count, 0); +} + +static int stuff_all_buffers(AVCodecContext * avctx, V4L2Context* ctx) @@ -47749,18 +51918,25 @@ Upstream-status: Pending int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd) { int type = ctx->type; - int ret; +- int ret; ++ int ret = 0; + AVCodecContext * const avctx = logger(ctx); -+ -+ ff_mutex_lock(&ctx->lock); -+ -+ if (cmd == VIDIOC_STREAMON && !V4L2_TYPE_IS_OUTPUT(ctx->type)) -+ stuff_all_buffers(avctx, ctx); - - ret = ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type); + +- ret = ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type); - if (ret < 0) - return AVERROR(errno); -+ if (ret < 0) { ++ // Avoid doing anything if there is nothing we can do ++ if (cmd == VIDIOC_STREAMOFF && !ctx_buffers_alloced(ctx) && !ctx->streamon) ++ return 0; + +- ctx->streamon = (cmd == VIDIOC_STREAMON); ++ ff_mutex_lock(&ctx->lock); + +- return 0; ++ if (cmd == VIDIOC_STREAMON && !V4L2_TYPE_IS_OUTPUT(ctx->type)) ++ stuff_all_buffers(avctx, ctx); ++ ++ if (ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type) < 0) { + const int err = errno; + av_log(avctx, AV_LOG_ERROR, "%s set status %d (%s) failed: err=%d\n", ctx->name, + cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF", err); @@ -47770,73 +51946,153 @@ Upstream-status: Pending + { + if (cmd == VIDIOC_STREAMOFF) + flush_all_buffers_status(ctx); - -- ctx->streamon = (cmd == VIDIOC_STREAMON); ++ else ++ ctx->first_buf = 1; ++ + ctx->streamon = (cmd == VIDIOC_STREAMON); + av_log(avctx, AV_LOG_DEBUG, "%s set status %d (%s) OK\n", ctx->name, + cmd, (cmd == VIDIOC_STREAMON) ? 
"ON" : "OFF"); + } - -- return 0; ++ ++ // Both stream off & on effectively clear flag_last ++ ctx->flag_last = 0; ++ + ff_mutex_unlock(&ctx->lock); + + return ret; } - + int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) -@@ -608,7 +829,8 @@ int ff_v4l2_context_enqueue_frame(V4L2Co + { +- V4L2m2mContext *s = ctx_to_m2mctx(ctx); ++ V4L2m2mContext *const s = ctx_to_m2mctx(ctx); ++ AVCodecContext *const avctx = s->avctx; ++ int64_t track_ts; + V4L2Buffer* avbuf; + int ret; + + if (!frame) { + ret = v4l2_stop_encode(ctx); + if (ret) +- av_log(logger(ctx), AV_LOG_ERROR, "%s stop_encode\n", ctx->name); ++ av_log(avctx, AV_LOG_ERROR, "%s stop_encode\n", ctx->name); + s->draining= 1; + return 0; + } +@@ -601,23 +977,29 @@ int ff_v4l2_context_enqueue_frame(V4L2Co + if (!avbuf) + return AVERROR(ENOMEM); + +- ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf); ++ track_ts = xlat_pts_frame_in(avctx, &s->xlat, frame); ++ ++ ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf, track_ts); + if (ret) + return ret; + return ff_v4l2_buffer_enqueue(avbuf); } - + -int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) +int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, -+ const void * extdata, size_t extlen, int no_rescale_pts) ++ const void * extdata, size_t extlen) { V4L2m2mContext *s = ctx_to_m2mctx(ctx); ++ AVCodecContext *const avctx = s->avctx; V4L2Buffer* avbuf; -@@ -616,8 +838,9 @@ int ff_v4l2_context_enqueue_packet(V4L2C - + int ret; ++ int64_t track_ts; + if (!pkt->size) { ret = v4l2_stop_decode(ctx); + // Log but otherwise ignore stop failure if (ret) - av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode\n", ctx->name); -+ av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret); ++ av_log(avctx, AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret); s->draining = 1; return 0; } -@@ -626,14 +849,17 @@ int ff_v4l2_context_enqueue_packet(V4L2C +@@ -626,8 +1008,13 @@ int ff_v4l2_context_enqueue_packet(V4L2C if (!avbuf) return AVERROR(EAGAIN); - + - ret = ff_v4l2_buffer_avpkt_to_buf(pkt, avbuf); - if (ret) -+ ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, no_rescale_pts); ++ track_ts = xlat_pts_pkt_in(avctx, &s->xlat, pkt); ++ ++ ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, track_ts); + if (ret == AVERROR(ENOMEM)) + av_log(logger(ctx), AV_LOG_ERROR, "Buffer overflow in %s: pkt->size=%d > buf->length=%d\n", + __func__, pkt->size, avbuf->planes[0].length); + else if (ret) return ret; - + return ff_v4l2_buffer_enqueue(avbuf); - } - --int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) -+int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout, int no_rescale_pts) +@@ -635,42 +1022,36 @@ int ff_v4l2_context_enqueue_packet(V4L2C + + int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) { ++ V4L2m2mContext *s = ctx_to_m2mctx(ctx); ++ AVCodecContext *const avctx = s->avctx; V4L2Buffer *avbuf; - -@@ -650,7 +876,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Co - return AVERROR(EAGAIN); - } - ++ int rv; + +- /* +- * timeout=-1 blocks until: +- * 1. decoded frame available +- * 2. 
an input buffer is ready to be dequeued +- */ +- avbuf = v4l2_dequeue_v4l2buf(ctx, timeout); +- if (!avbuf) { +- if (ctx->done) +- return AVERROR_EOF; +- +- return AVERROR(EAGAIN); +- } ++ do { ++ if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0) ++ return rv; ++ if ((rv = ff_v4l2_buffer_buf_to_avframe(frame, avbuf)) != 0) ++ return rv; ++ } while (xlat_pts_frame_out(avctx, &s->xlat, frame) != 0); + - return ff_v4l2_buffer_buf_to_avframe(frame, avbuf); -+ return ff_v4l2_buffer_buf_to_avframe(frame, avbuf, no_rescale_pts); ++ return 0; } - + int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) -@@ -702,78 +928,155 @@ int ff_v4l2_context_get_format(V4L2Conte - + { ++ V4L2m2mContext *s = ctx_to_m2mctx(ctx); ++ AVCodecContext *const avctx = s->avctx; + V4L2Buffer *avbuf; ++ int rv; + +- /* +- * blocks until: +- * 1. encoded packet available +- * 2. an input buffer ready to be dequeued +- */ +- avbuf = v4l2_dequeue_v4l2buf(ctx, -1); +- if (!avbuf) { +- if (ctx->done) +- return AVERROR_EOF; ++ do { ++ if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0) ++ return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv; // Caller not currently expecting ENOSPC ++ if ((rv = ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf)) != 0) ++ return rv; ++ } while (xlat_pts_pkt_out(avctx, &s->xlat, pkt) != 0); + +- return AVERROR(EAGAIN); +- } +- +- return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf); ++ return 0; + } + + int ff_v4l2_context_get_format(V4L2Context* ctx, int probe) +@@ -702,78 +1083,160 @@ int ff_v4l2_context_get_format(V4L2Conte + int ff_v4l2_context_set_format(V4L2Context* ctx) { - return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format); @@ -47864,29 +52120,30 @@ Upstream-status: Pending + ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format); + return ret; } - + void ff_v4l2_context_release(V4L2Context* ctx) { int ret; - + - if (!ctx->buffers) + if (!ctx->bufrefs) return; - + ret = v4l2_release_buffers(ctx); if (ret) av_log(logger(ctx), AV_LOG_WARNING, "V4L2 failed to unmap the %s buffers\n", ctx->name); - + - av_freep(&ctx->buffers); + av_freep(&ctx->bufrefs); + av_buffer_unref(&ctx->frames_ref); + + ff_mutex_destroy(&ctx->lock); ++ pthread_cond_destroy(&ctx->cond); } - + -int ff_v4l2_context_init(V4L2Context* ctx) + -+static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers) ++static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers, const enum v4l2_memory mem) { - V4L2m2mContext *s = ctx_to_m2mctx(ctx); + V4L2m2mContext * const s = ctx_to_m2mctx(ctx); @@ -47897,17 +52154,19 @@ Upstream-status: Pending - av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type); - return AVERROR_PATCHWELCOME; - } -- ++ int ret; ++ int i; + - ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format); - if (ret) - av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", ctx->name); -+ int ret; -+ int i; - ++ av_assert0(ctx->bufrefs == NULL); + memset(&req, 0, sizeof(req)); - req.count = ctx->num_buffers; +- req.memory = V4L2_MEMORY_MMAP; + req.count = req_buffers; - req.memory = V4L2_MEMORY_MMAP; ++ req.memory = mem; req.type = ctx->type; - ret = ioctl(s->fd, VIDIOC_REQBUFS, &req); - if (ret < 0) { @@ -47920,7 +52179,7 @@ Upstream-status: Pending + return ret; + } } - + ctx->num_buffers = req.count; - ctx->buffers = av_mallocz(ctx->num_buffers * sizeof(V4L2Buffer)); - if (!ctx->buffers) { @@ -47930,7 +52189,7 @@ Upstream-status: Pending - return AVERROR(ENOMEM); + goto fail_release; } - + - for (i = 0; i < req.count; i++) { - ctx->buffers[i].context = 
ctx; - ret = ff_v4l2_buffer_initialize(&ctx->buffers[i], i); @@ -47942,14 +52201,14 @@ Upstream-status: Pending + } + + for (i = 0; i < ctx->num_buffers; i++) { -+ ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx); ++ ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx, mem); + if (ret) { av_log(logger(ctx), AV_LOG_ERROR, "%s buffer[%d] initialization (%s)\n", ctx->name, i, av_err2str(ret)); - goto error; + goto fail_release; } } - + av_log(logger(ctx), AV_LOG_DEBUG, "%s: %s %02d buffers initialized: %04ux%04u, sizeimage %08u, bytesperline %08u\n", ctx->name, V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? av_fourcc2str(ctx->format.fmt.pix_mp.pixelformat) : av_fourcc2str(ctx->format.fmt.pix.pixelformat), req.count, @@ -47959,9 +52218,9 @@ Upstream-status: Pending + ff_v4l2_get_format_height(&ctx->format), V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage : ctx->format.fmt.pix.sizeimage, V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].bytesperline : ctx->format.fmt.pix.bytesperline); - + return 0; - + -error: +fail_release: v4l2_release_buffers(ctx); @@ -47976,14 +52235,16 @@ Upstream-status: Pending + + // It is not valid to reinit a context without a previous release + av_assert0(ctx->bufrefs == NULL); -+ + +- av_freep(&ctx->buffers); + if (!v4l2_type_supported(ctx)) { + av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type); + return AVERROR_PATCHWELCOME; + } - -- av_freep(&ctx->buffers); ++ + ff_mutex_init(&ctx->lock, NULL); ++ pthread_cond_init(&ctx->cond, NULL); ++ atomic_init(&ctx->q_count, 0); + + if (s->output_drm) { + AVHWFramesContext *hwframes; @@ -47997,8 +52258,8 @@ Upstream-status: Pending + hwframes = (AVHWFramesContext*)ctx->frames_ref->data; + hwframes->format = AV_PIX_FMT_DRM_PRIME; + hwframes->sw_format = ctx->av_pix_fmt; -+ hwframes->width = ctx->width; -+ hwframes->height = ctx->height; ++ hwframes->width = ctx->width != 0 ? ctx->width : s->avctx->width; ++ hwframes->height = ctx->height != 0 ? ctx->height : s->avctx->height; + ret = av_hwframe_ctx_init(ctx->frames_ref); + if (ret < 0) + goto fail_unref_hwframes; @@ -48011,12 +52272,12 @@ Upstream-status: Pending + goto fail_unref_hwframes; + } + -+ ret = create_buffers(ctx, ctx->num_buffers); ++ ret = create_buffers(ctx, ctx->num_buffers, ctx->buf_mem); + if (ret < 0) + goto fail_unref_hwframes; + + return 0; - + +fail_unref_hwframes: + av_buffer_unref(&ctx->frames_ref); +fail_unlock: @@ -48031,14 +52292,14 @@ Upstream-status: Pending #include "libavutil/buffer.h" +#include "libavutil/thread.h" #include "v4l2_buffers.h" - + typedef struct V4L2Context { @@ -70,11 +71,18 @@ typedef struct V4L2Context { */ int width, height; AVRational sample_aspect_ratio; + struct v4l2_rect selection; - + /** - * Indexed array of V4L2Buffers + * If the default size of buffer is less than this then try to @@ -48051,50 +52312,98 @@ Upstream-status: Pending + * Indexed array of pointers to V4L2Buffers + */ + AVBufferRef **bufrefs; - + /** * Readonly after init. -@@ -92,6 +100,12 @@ typedef struct V4L2Context { +@@ -82,16 +90,38 @@ typedef struct V4L2Context { + int num_buffers; + + /** ++ * Buffer memory type V4L2_MEMORY_MMAP or V4L2_MEMORY_DMABUF ++ */ ++ enum v4l2_memory buf_mem; ++ ++ /** + * Whether the stream has been started (VIDIOC_STREAMON has been sent). 
+ */ + int streamon; + ++ /* 1st buffer after stream on */ ++ int first_buf; ++ + /** + * Either no more buffers available or an unrecoverable error was notified + * by the V4L2 kernel driver: once set the context has to be exited. */ int done; - + ++ int flag_last; ++ ++ /** ++ * If NZ then when Qing frame/pkt use this rather than the ++ * "real" PTS ++ */ ++ uint64_t track_ts; ++ + AVBufferRef *frames_ref; -+ int q_count; -+ int dq_count; ++ atomic_int q_count; + struct ff_weak_link_master *wl_master; + + AVMutex lock; ++ pthread_cond_t cond; } V4L2Context; - + /** -@@ -156,9 +170,12 @@ int ff_v4l2_context_dequeue_packet(V4L2C +@@ -156,7 +186,10 @@ int ff_v4l2_context_dequeue_packet(V4L2C * @param[in] ctx The V4L2Context to dequeue from. * @param[inout] f The AVFrame to dequeue to. * @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds) -+ * @param[in] no_rescale_pts (0 rescale pts, 1 use pts as -+ * timestamp directly) + * * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error. ++ * AVERROR(ENOSPC) if no buffer availible to put ++ * the frame in */ --int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); -+int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout, int no_rescale_pts); - - /** - * Enqueues a buffer to a V4L2Context from an AVPacket -@@ -170,7 +187,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Co + int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); + +@@ -170,7 +203,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Co * @param[in] pkt A pointer to an AVPacket. * @return 0 in case of success, a negative error otherwise. */ -int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt); -+int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size, int no_rescale_pts); - ++int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size); + /** * Enqueues a buffer to a V4L2Context from an AVFrame --- a/libavcodec/v4l2_m2m.c +++ b/libavcodec/v4l2_m2m.c -@@ -215,13 +215,7 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mCont +@@ -36,6 +36,14 @@ + #include "v4l2_fmt.h" + #include "v4l2_m2m.h" + ++static void ++xlat_init(xlat_track_t * const x) ++{ ++ memset(x, 0, sizeof(*x)); ++ x->last_pts = AV_NOPTS_VALUE; ++} ++ ++ + static inline int v4l2_splane_video(struct v4l2_capability *cap) + { + if (cap->capabilities & (V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VIDEO_OUTPUT) && +@@ -68,7 +76,9 @@ static int v4l2_prepare_contexts(V4L2m2m + + s->capture.done = s->output.done = 0; + s->capture.name = "capture"; ++ s->capture.buf_mem = V4L2_MEMORY_MMAP; + s->output.name = "output"; ++ s->output.buf_mem = s->input_drm ? V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP; + atomic_init(&s->refcount, 0); + sem_init(&s->refsync, 0, 0); + +@@ -215,13 +225,7 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mCont av_log(log_ctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF\n"); - + /* 2. unmap the capture buffers (v4l2 and ffmpeg): - * we must wait for all references to be released before being allowed - * to queue new buffers. @@ -48104,30 +52413,49 @@ Upstream-status: Pending - while(sem_wait(&s->refsync) == -1 && errno == EINTR); - ff_v4l2_context_release(&s->capture); - + /* 3. get the new capture format */ -@@ -328,7 +322,10 @@ static void v4l2_m2m_destroy_context(voi +@@ -240,7 +244,6 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mCont + + /* 5. 
complete reinit */ + s->draining = 0; +- s->reinit = 0; + + return 0; + } +@@ -274,7 +277,6 @@ int ff_v4l2_m2m_codec_full_reinit(V4L2m2 + + /* start again now that we know the stream dimensions */ + s->draining = 0; +- s->reinit = 0; + + ret = ff_v4l2_context_get_format(&s->output, 0); + if (ret) { +@@ -328,7 +330,13 @@ static void v4l2_m2m_destroy_context(voi ff_v4l2_context_release(&s->capture); sem_destroy(&s->refsync); - + - close(s->fd); + if (s->fd != -1) + close(s->fd); + ++ av_packet_unref(&s->buf_pkt); ++ av_freep(&s->extdata_data); ++ + av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Context destroyed\n"); - + av_free(s); } -@@ -338,17 +335,34 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *p +@@ -338,17 +346,34 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *p V4L2m2mContext *s = priv->context; int ret; - + - ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF); - if (ret) - av_log(s->avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->output.name); + if (!s) + return 0; - + - ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF); - if (ret) - av_log(s->avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->capture.name); @@ -48145,9 +52473,9 @@ Upstream-status: Pending + if (ret) + av_log(s->avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->capture.name); + } - + ff_v4l2_context_release(&s->output); - + + close(s->fd); + s->fd = -1; + @@ -48157,20 +52485,65 @@ Upstream-status: Pending + s->avctx = NULL; + priv->context = NULL; av_buffer_unref(&priv->context_ref); - + return 0; +@@ -392,28 +417,33 @@ int ff_v4l2_m2m_codec_init(V4L2m2mPriv * + return v4l2_configure_contexts(s); + } + +-int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **s) ++int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **pps) + { +- *s = av_mallocz(sizeof(V4L2m2mContext)); +- if (!*s) ++ V4L2m2mContext * const s = av_mallocz(sizeof(V4L2m2mContext)); ++ ++ *pps = NULL; ++ if (!s) + return AVERROR(ENOMEM); + +- priv->context_ref = av_buffer_create((uint8_t *) *s, sizeof(V4L2m2mContext), ++ priv->context_ref = av_buffer_create((uint8_t *)s, sizeof(*s), + &v4l2_m2m_destroy_context, NULL, 0); + if (!priv->context_ref) { +- av_freep(s); ++ av_free(s); + return AVERROR(ENOMEM); + } + + /* assign the context */ +- priv->context = *s; +- (*s)->priv = priv; ++ priv->context = s; ++ s->priv = priv; + + /* populate it */ +- priv->context->capture.num_buffers = priv->num_capture_buffers; +- priv->context->output.num_buffers = priv->num_output_buffers; +- priv->context->self_ref = priv->context_ref; +- priv->context->fd = -1; ++ s->capture.num_buffers = priv->num_capture_buffers; ++ s->output.num_buffers = priv->num_output_buffers; ++ s->self_ref = priv->context_ref; ++ s->fd = -1; ++ ++ xlat_init(&s->xlat); + ++ *pps = s; + return 0; + } --- a/libavcodec/v4l2_m2m.h +++ b/libavcodec/v4l2_m2m.h @@ -30,6 +30,7 @@ #include - + #include "libavcodec/avcodec.h" +#include "libavutil/pixfmt.h" #include "v4l2_context.h" - + #define container_of(ptr, type, member) ({ \ -@@ -38,7 +39,18 @@ - +@@ -38,7 +39,37 @@ + #define V4L_M2M_DEFAULT_OPTS \ { "num_output_buffers", "Number of buffers in the output context",\ - OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 6, INT_MAX, FLAGS } @@ -48179,26 +52552,45 @@ Upstream-status: Pending +#define FF_V4L2_M2M_TRACK_SIZE 128 +typedef struct V4L2m2mTrackEl { + int discard; // If we see this buffer its been flushed, so discard ++ int pending; + int pkt_size; + int64_t pts; ++ int64_t dts; + int64_t reordered_opaque; + int64_t pkt_pos; + int64_t pkt_duration; + 
int64_t track_pts; +} V4L2m2mTrackEl; - ++ ++typedef struct pts_stats_s ++{ ++ void * logctx; ++ const char * name; // For debug ++ unsigned int last_count; ++ unsigned int last_interval; ++ int64_t last_pts; ++ int64_t guess; ++} pts_stats_t; ++ ++typedef struct xlat_track_s { ++ unsigned int track_no; ++ int64_t last_pts; ++ int64_t last_opaque; ++ V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE]; ++} xlat_track_t; + typedef struct V4L2m2mContext { char devname[PATH_MAX]; -@@ -53,6 +65,7 @@ typedef struct V4L2m2mContext { +@@ -52,7 +83,6 @@ typedef struct V4L2m2mContext { + AVCodecContext *avctx; sem_t refsync; atomic_uint refcount; - int reinit; -+ int resize_pending; - +- int reinit; + /* null frame/packet received */ int draining; -@@ -63,6 +76,23 @@ typedef struct V4L2m2mContext { - +@@ -63,6 +93,36 @@ typedef struct V4L2m2mContext { + /* reference back to V4L2m2mPriv */ void *priv; + @@ -48207,49 +52599,72 @@ Upstream-status: Pending + /* generate DRM frames */ + int output_drm; + ++ /* input frames are drmprime */ ++ int input_drm; ++ + /* Frame tracking */ -+ int64_t last_pkt_dts; -+ int64_t last_opaque; -+ unsigned int track_no; -+ V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE]; ++ xlat_track_t xlat; ++ int pending_hw; ++ int pending_n; ++ ++ pts_stats_t pts_stat; + + /* req pkt */ + int req_pkt; + + /* Ext data sent */ + int extdata_sent; ++ /* Ext data sent in packet - overrides ctx */ ++ uint8_t * extdata_data; ++ size_t extdata_size; ++ ++#define FF_V4L2_QUIRK_REINIT_ALWAYS 1 ++#define FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN 2 ++ /* Quirks */ ++ unsigned int quirks; ++ } V4L2m2mContext; - + typedef struct V4L2m2mPriv { -@@ -73,6 +103,7 @@ typedef struct V4L2m2mPriv { - +@@ -73,6 +133,7 @@ typedef struct V4L2m2mPriv { + int num_output_buffers; int num_capture_buffers; + enum AVPixelFormat pix_fmt; } V4L2m2mPriv; - + /** -@@ -126,4 +157,16 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mCont +@@ -126,4 +187,26 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mCont */ int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *ctx); - + + -+static inline unsigned int ff_v4l2_get_format_width(struct v4l2_format *fmt) ++static inline unsigned int ff_v4l2_get_format_width(const struct v4l2_format * const fmt) +{ + return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width; +} + -+static inline unsigned int ff_v4l2_get_format_height(struct v4l2_format *fmt) ++static inline unsigned int ff_v4l2_get_format_height(const struct v4l2_format * const fmt) +{ + return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height; +} + ++static inline uint32_t ff_v4l2_get_format_pixelformat(const struct v4l2_format * const fmt) ++{ ++ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? 
fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat; ++} ++ ++static inline int ff_v4l2_ctx_eos(const V4L2Context * const ctx) ++{ ++ return ctx->flag_last; ++} ++ + #endif /* AVCODEC_V4L2_M2M_H */ --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -23,6 +23,10 @@ - + #include #include + @@ -48259,10 +52674,10 @@ Upstream-status: Pending #include "libavutil/pixfmt.h" #include "libavutil/pixdesc.h" #include "libavutil/opt.h" -@@ -30,26 +34,51 @@ +@@ -30,75 +34,111 @@ #include "libavcodec/decode.h" #include "libavcodec/internal.h" - + +#include "libavcodec/hwaccels.h" +#include "libavcodec/internal.h" +#include "libavcodec/hwconfig.h" @@ -48270,7 +52685,80 @@ Upstream-status: Pending #include "v4l2_context.h" #include "v4l2_m2m.h" #include "v4l2_fmt.h" - + +-static int v4l2_try_start(AVCodecContext *avctx) ++// Pick 64 for max last count - that is >1sec at 60fps ++#define STATS_LAST_COUNT_MAX 64 ++#define STATS_INTERVAL_MAX (1 << 30) ++ ++#ifndef FF_API_BUFFER_SIZE_T ++#define FF_API_BUFFER_SIZE_T 1 ++#endif ++ ++static int64_t pts_stats_guess(const pts_stats_t * const stats) + { +- V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; +- V4L2Context *const capture = &s->capture; +- V4L2Context *const output = &s->output; +- struct v4l2_selection selection = { 0 }; +- int ret; ++ if (stats->last_pts == AV_NOPTS_VALUE || ++ stats->last_interval == 0 || ++ stats->last_count >= STATS_LAST_COUNT_MAX) ++ return AV_NOPTS_VALUE; ++ return stats->last_pts + (int64_t)(stats->last_count - 1) * (int64_t)stats->last_interval; ++} + +- /* 1. start the output process */ +- if (!output->streamon) { +- ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMON); +- if (ret < 0) { +- av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON on output context\n"); +- return ret; ++static void pts_stats_add(pts_stats_t * const stats, int64_t pts) ++{ ++ if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) { ++ if (stats->last_count < STATS_LAST_COUNT_MAX) ++ ++stats->last_count; ++ return; ++ } ++ ++ if (stats->last_pts != AV_NOPTS_VALUE) { ++ const int64_t interval = pts - stats->last_pts; ++ ++ if (interval < 0 || interval >= STATS_INTERVAL_MAX || ++ stats->last_count >= STATS_LAST_COUNT_MAX) { ++ if (stats->last_interval != 0) ++ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n", ++ __func__, stats->name, interval, stats->last_count); ++ stats->last_interval = 0; ++ } ++ else { ++ const int64_t frame_time = interval / (int64_t)stats->last_count; ++ ++ if (frame_time != stats->last_interval) ++ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n", ++ __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time); ++ stats->last_interval = frame_time; + } + } + +- if (capture->streamon) ++ stats->last_pts = pts; ++ stats->last_count = 1; ++} ++ ++static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name) ++{ ++ *stats = (pts_stats_t){ ++ .logctx = logctx, ++ .name = name, ++ .last_count = 1, ++ .last_interval = 0, ++ .last_pts = AV_NOPTS_VALUE ++ }; ++} ++ +static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *const s) +{ + int ret; @@ -48280,81 +52768,43 @@ Upstream-status: Pending + }; + + if (s->output.streamon) -+ return 0; -+ -+ ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMON); -+ if (ret < 0) -+ av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context\n"); -+ -+ if (!s->capture.streamon || ret < 0) -+ return ret; -+ -+ ret = 
ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd); -+ if (ret < 0) -+ av_log(avctx, AV_LOG_ERROR, "VIDIOC_DECODER_CMD start error: %d\n", errno); -+ else -+ av_log(avctx, AV_LOG_DEBUG, "VIDIOC_DECODER_CMD start OK\n"); -+ -+ return ret; -+} -+ - static int v4l2_try_start(AVCodecContext *avctx) - { - V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; - V4L2Context *const capture = &s->capture; -- V4L2Context *const output = &s->output; - struct v4l2_selection selection = { 0 }; - int ret; - - /* 1. start the output process */ -- if (!output->streamon) { -- ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMON); -- if (ret < 0) { -- av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON on output context\n"); -- return ret; -- } -- } -+ if ((ret = check_output_streamon(avctx, s)) != 0) -+ return ret; - - if (capture->streamon) return 0; -@@ -63,15 +92,29 @@ static int v4l2_try_start(AVCodecContext + +- /* 2. get the capture format */ +- capture->format.type = capture->type; +- ret = ioctl(s->fd, VIDIOC_G_FMT, &capture->format); +- if (ret) { +- av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_FMT ioctl\n"); ++ ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMON); ++ if (ret != 0) { ++ av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context: %s\n", av_err2str(ret)); + return ret; } - - /* 2.1 update the AVCodecContext */ + +- /* 2.1 update the AVCodecContext */ - avctx->pix_fmt = ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO); - capture->av_pix_fmt = avctx->pix_fmt; -+ capture->av_pix_fmt = -+ ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO); -+ if (s->output_drm) { -+ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; -+ avctx->sw_pix_fmt = capture->av_pix_fmt; -+ } -+ else -+ avctx->pix_fmt = capture->av_pix_fmt; - - /* 3. set the crop parameters */ -+#if 1 -+ selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; -+ selection.target = V4L2_SEL_TGT_CROP_DEFAULT; -+ ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection); -+ av_log(avctx, AV_LOG_INFO, "Post G selection ret=%d, err=%d %dx%d\n", ret, errno, selection.r.width, selection.r.height); -+#else - selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; - selection.r.height = avctx->coded_height; - selection.r.width = avctx->coded_width; -+ av_log(avctx, AV_LOG_INFO, "Try selection %dx%d\n", avctx->coded_width, avctx->coded_height); - ret = ioctl(s->fd, VIDIOC_S_SELECTION, &selection); +- +- /* 3. set the crop parameters */ +- selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; +- selection.r.height = avctx->coded_height; +- selection.r.width = avctx->coded_width; +- ret = ioctl(s->fd, VIDIOC_S_SELECTION, &selection); - if (!ret) { -+ av_log(avctx, AV_LOG_INFO, "Post S selection ret=%d, err=%d %dx%d\n", ret, errno, selection.r.width, selection.r.height); -+ if (1) { - ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection); - if (ret) { - av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION ioctl\n"); -@@ -82,15 +125,7 @@ static int v4l2_try_start(AVCodecContext - capture->width = selection.r.width; - } +- ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection); +- if (ret) { +- av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION ioctl\n"); +- } else { +- av_log(avctx, AV_LOG_DEBUG, "crop output %dx%d\n", selection.r.width, selection.r.height); +- /* update the size of the resulting frame */ +- capture->height = selection.r.height; +- capture->width = selection.r.width; +- } ++ // STREAMON should do implicit START so this just for those that don't. 
++ // It is optional so don't worry if it fails ++ if (ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd) < 0) { ++ ret = AVERROR(errno); ++ av_log(avctx, AV_LOG_WARNING, "VIDIOC_DECODER_CMD start error: %s\n", av_err2str(ret)); } - - /* 4. init the capture context now that we have the capture format */ @@ -48364,131 +52814,133 @@ Upstream-status: Pending - av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n"); - return AVERROR(ENOMEM); - } ++ else { ++ av_log(avctx, AV_LOG_TRACE, "VIDIOC_DECODER_CMD start OK\n"); + } ++ return 0; ++} + +- /* 5. start the capture process */ +- ret = ff_v4l2_context_set_status(capture, VIDIOC_STREAMON); +- if (ret) { +- av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON, on capture context\n"); +- return ret; - } -+#endif - - /* 5. start the capture process */ - ret = ff_v4l2_context_set_status(capture, VIDIOC_STREAMON); -@@ -133,52 +168,312 @@ static int v4l2_prepare_decoder(V4L2m2mC ++static int v4l2_try_start(AVCodecContext *avctx) ++{ ++ V4L2m2mContext * const s = ((V4L2m2mPriv*)avctx->priv_data)->context; ++ int ret; + ++ /* 1. start the output process */ ++ if ((ret = check_output_streamon(avctx, s)) != 0) ++ return ret; return 0; } - + +@@ -133,52 +173,525 @@ static int v4l2_prepare_decoder(V4L2m2mC + return 0; + } + -static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) -+static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n) -+{ -+ return (int64_t)n; -+} -+ -+static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts) -+{ -+ return (unsigned int)pts; -+} -+ -+// FFmpeg requires us to propagate a number of vars from the coded pkt into -+// the decoded frame. The only thing that tracks like that in V4L2 stateful -+// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no -+// guarantees about PTS being unique or specified for every frame so replace -+// the supplied PTS with a simple incrementing number and keep a circular -+// buffer of all the things we want preserved (including the original PTS) -+// indexed by the tracking no. 
+static void -+xlat_pts_in(AVCodecContext *const avctx, V4L2m2mContext *const s, AVPacket *const avpkt) ++set_best_effort_pts(AVCodecContext *const avctx, ++ pts_stats_t * const ps, ++ AVFrame *const frame) +{ -+ int64_t track_pts; -+ -+ // Avoid 0 -+ if (++s->track_no == 0) -+ s->track_no = 1; -+ -+ track_pts = track_to_pts(avctx, s->track_no); -+ -+ av_log(avctx, AV_LOG_TRACE, "In PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, s->track_no); -+ s->last_pkt_dts = avpkt->dts; -+ s->track_els[s->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ -+ .discard = 0, -+ .pkt_size = avpkt->size, -+ .pts = avpkt->pts, -+ .reordered_opaque = avctx->reordered_opaque, -+ .pkt_pos = avpkt->pos, -+ .pkt_duration = avpkt->duration, -+ .track_pts = track_pts -+ }; -+ avpkt->pts = track_pts; -+} -+ -+// Returns -1 if we should discard the frame -+static int -+xlat_pts_out(AVCodecContext *const avctx, V4L2m2mContext *const s, AVFrame *const frame) -+{ -+ unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE; -+ const V4L2m2mTrackEl *const t = s->track_els + n; -+ if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts) -+ { -+ av_log(avctx, AV_LOG_INFO, "Tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); -+ frame->pts = AV_NOPTS_VALUE; -+ frame->pkt_dts = s->last_pkt_dts; -+ frame->reordered_opaque = s->last_opaque; -+ frame->pkt_pos = -1; -+ frame->pkt_duration = 0; -+ frame->pkt_size = -1; -+ } -+ else if (!t->discard) -+ { -+ frame->pts = t->pts; -+ frame->pkt_dts = s->last_pkt_dts; -+ frame->reordered_opaque = t->reordered_opaque; -+ frame->pkt_pos = t->pkt_pos; -+ frame->pkt_duration = t->pkt_duration; -+ frame->pkt_size = t->pkt_size; -+ -+ s->last_opaque = s->track_els[n].reordered_opaque; -+ s->track_els[n].pts = AV_NOPTS_VALUE; // If we hit this again deny accurate knowledge of PTS -+ } -+ else -+ { -+ av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); -+ return -1; -+ } ++ pts_stats_add(ps, frame->pts); + +#if FF_API_PKT_PTS +FF_DISABLE_DEPRECATION_WARNINGS + frame->pkt_pts = frame->pts; +FF_ENABLE_DEPRECATION_WARNINGS +#endif -+ frame->best_effort_timestamp = frame->pts; -+ frame->pkt_dts = frame->pts; // We can't emulate what s/w does in a useful manner? -+ av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 ", DTS=%" PRId64 "\n", frame->pts, frame->pkt_dts); -+ return 0; ++ frame->best_effort_timestamp = pts_stats_guess(ps); ++ // If we can't guess from just PTS - try DTS ++ if (frame->best_effort_timestamp == AV_NOPTS_VALUE) ++ frame->best_effort_timestamp = frame->pkt_dts; ++ ++ // We can't emulate what s/w does in a useful manner and using the ++ // "correct" answer seems to just confuse things. 
++ frame->pkt_dts = frame->pts; ++ av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n", ++ frame->pts, frame->best_effort_timestamp, frame->pkt_dts); ++} ++ ++static void ++xlat_flush(xlat_track_t * const x) ++{ ++ unsigned int i; ++ for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) { ++ x->track_els[i].pending = 0; ++ x->track_els[i].discard = 1; ++ } ++ x->last_pts = AV_NOPTS_VALUE; ++} ++ ++static int ++xlat_pending(const xlat_track_t * const x) ++{ ++ unsigned int n = x->track_no % FF_V4L2_M2M_TRACK_SIZE; ++ unsigned int i; ++ int r = 0; ++ int64_t now = AV_NOPTS_VALUE; ++ ++ for (i = 0; i < 32; ++i, n = (n - 1) % FF_V4L2_M2M_TRACK_SIZE) { ++ const V4L2m2mTrackEl * const t = x->track_els + n; ++ ++ if (!t->pending) ++ continue; ++ ++ if (now == AV_NOPTS_VALUE) ++ now = t->dts; ++ ++ if (t->pts == AV_NOPTS_VALUE || ++ ((now == AV_NOPTS_VALUE || t->pts <= now) && ++ (x->last_pts == AV_NOPTS_VALUE || t->pts > x->last_pts))) ++ ++r; ++ } ++ ++ // If we never get any ideas about PTS vs DTS allow a lot more buffer ++ if (now == AV_NOPTS_VALUE) ++ r -= 16; ++ ++ return r; +} + +static inline int stream_started(const V4L2m2mContext * const s) { -+ return s->capture.streamon && s->output.streamon; ++ return s->output.streamon; +} + +#define NQ_OK 0 +#define NQ_Q_FULL 1 +#define NQ_SRC_EMPTY 2 -+#define NQ_DRAINING 3 -+#define NQ_DEAD 4 ++#define NQ_NONE 3 ++#define NQ_DRAINING 4 ++#define NQ_DEAD 5 + +#define TRY_DQ(nq_status) ((nq_status) >= NQ_OK && (nq_status) <= NQ_DRAINING) ++#define RETRY_NQ(nq_status) ((nq_status) == NQ_Q_FULL || (nq_status) == NQ_NONE) ++ ++// do_not_get If true then no new packet will be got but status will ++// be set appropriately + +// AVERROR_EOF Flushing an already flushed stream +// -ve Error (all errors except EOF are unexpected) +// NQ_OK (0) OK +// NQ_Q_FULL Dst full (retry if we think V4L2 Q has space now) +// NQ_SRC_EMPTY Src empty (do not retry) ++// NQ_NONE Enqueue not attempted +// NQ_DRAINING At EOS, dQ dest until EOS there too +// NQ_DEAD Not running (do not retry, do not attempt capture dQ) + -+static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s) ++static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s, const int do_not_get) { - V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; - V4L2Context *const capture = &s->capture; - V4L2Context *const output = &s->output; - AVPacket avpkt = {0}; int ret; - + - if (s->buf_pkt.size) { - avpkt = s->buf_pkt; - memset(&s->buf_pkt, 0, sizeof(AVPacket)); @@ -48498,8 +52950,50 @@ Upstream-status: Pending + // If we don't already have a coded packet - get a new one + // We will already have a coded pkt if the output Q was full last time we + // tried to Q it -+ if (!s->buf_pkt.size) { -+ ret = ff_decode_get_packet(avctx, &s->buf_pkt); ++ if (!s->buf_pkt.size && !do_not_get) { ++ unsigned int i; ++ ++ for (i = 0; i < 256; ++i) { ++ uint8_t * side_data; ++#if FF_API_BUFFER_SIZE_T ++ int side_size; ++#else ++ size_t side_size; ++#endif ++ ret = ff_decode_get_packet(avctx, &s->buf_pkt); ++ if (ret != 0) ++ break; ++ ++ // New extradata is the only side-data we undertand ++ side_data = av_packet_get_side_data(&s->buf_pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size); ++ if (side_data) { ++ av_log(avctx, AV_LOG_DEBUG, "New extradata\n"); ++ av_freep(&s->extdata_data); ++ if ((s->extdata_data = av_malloc(side_size ? 
side_size : 1)) == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to alloc %d bytes of extra data\n", (int)side_size); ++ return AVERROR(ENOMEM); ++ } ++ memcpy(s->extdata_data, side_data, side_size); ++ s->extdata_size = side_size; ++ s->extdata_sent = 0; ++ } ++ ++ if (s->buf_pkt.size != 0) ++ break; ++ ++ if (s->buf_pkt.side_data_elems == 0) { ++ av_log(avctx, AV_LOG_WARNING, "Empty pkt from ff_decode_get_packet - treating as EOF\n"); ++ ret = AVERROR_EOF; ++ break; ++ } ++ ++ // Retry a side-data only pkt ++ } ++ // If i >= 256 something has gone wrong ++ if (i >= 256) { ++ av_log(avctx, AV_LOG_ERROR, "Too many side-data only packets\n"); ++ return AVERROR(EIO); ++ } + + if (ret == AVERROR(EAGAIN)) { + if (!stream_started(s)) { @@ -48523,7 +53017,7 @@ Upstream-status: Pending + if (!s->draining) { + // Calling enqueue with an empty pkt starts drain + av_assert0(s->buf_pkt.size == 0); -+ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0, 1); ++ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0); + if (ret) { + av_log(avctx, AV_LOG_ERROR, "Failed to start drain: ret=%d\n", ret); + return ret; @@ -48536,22 +53030,37 @@ Upstream-status: Pending + av_log(avctx, AV_LOG_ERROR, "Failed to get coded packet: err=%d\n", ret); return ret; + } -+ -+ xlat_pts_in(avctx, s, &s->buf_pkt); } - + - if (s->draining) - goto dequeue; -+ if ((ret = check_output_streamon(avctx, s)) != 0) -+ return ret; - ++ if (s->draining) { ++ if (s->buf_pkt.size) { ++ av_log(avctx, AV_LOG_WARNING, "Unexpected input whilst draining\n"); ++ av_packet_unref(&s->buf_pkt); ++ } ++ return NQ_DRAINING; ++ } + - ret = ff_v4l2_context_enqueue_packet(output, &avpkt); - if (ret < 0) { - if (ret != AVERROR(EAGAIN)) - return ret; -+ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, -+ avctx->extradata, s->extdata_sent ? 0 : avctx->extradata_size, -+ 1); ++ if (!s->buf_pkt.size) ++ return NQ_NONE; + +- s->buf_pkt = avpkt; +- /* no input buffers available, continue dequeing */ +- } ++ if ((ret = check_output_streamon(avctx, s)) != 0) ++ return ret; ++ ++ if (s->extdata_sent) ++ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0); ++ else if (s->extdata_data) ++ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, s->extdata_data, s->extdata_size); ++ else ++ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, avctx->extradata, avctx->extradata_size); + + if (ret == AVERROR(EAGAIN)) { + // Out of input buffers - keep packet @@ -48561,19 +53070,19 @@ Upstream-status: Pending + // In all other cases we are done with this packet + av_packet_unref(&s->buf_pkt); + s->extdata_sent = 1; - -- s->buf_pkt = avpkt; -- /* no input buffers available, continue dequeing */ -+ if (ret) { + +- if (avpkt.size) { +- ret = v4l2_try_start(avctx); + if (ret) { +- av_packet_unref(&avpkt); + av_log(avctx, AV_LOG_ERROR, "Packet enqueue failure: err=%d\n", ret); + return ret; + } - } - -- if (avpkt.size) { -- ret = v4l2_try_start(avctx); -- if (ret) { -- av_packet_unref(&avpkt); ++ } + +- /* cant recover */ +- if (ret == AVERROR(ENOMEM)) +- return ret; + // Start if we haven't + { + const int ret2 = v4l2_try_start(avctx); @@ -48582,62 +53091,139 @@ Upstream-status: Pending + ret = (ret2 == AVERROR(ENOMEM)) ? 
ret2 : NQ_DEAD; + } + } - -- /* cant recover */ -- if (ret == AVERROR(ENOMEM)) -- return ret; ++ + return ret; +} - ++ ++static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx) ++{ ++ int rv = 0; + - return 0; ++ ff_mutex_lock(&ctx->lock); ++ ++ while (atomic_load(&ctx->q_count) == 0 && ctx->streamon) { ++ if (pthread_cond_wait(&ctx->cond, &ctx->lock) != 0) { ++ rv = AVERROR(errno); ++ av_log(avctx, AV_LOG_ERROR, "Cond wait failure: %s\n", av_err2str(rv)); ++ break; + } + } + +-dequeue: +- if (!s->buf_pkt.size) +- av_packet_unref(&avpkt); +- return ff_v4l2_context_dequeue_frame(capture, frame, -1); ++ ff_mutex_unlock(&ctx->lock); ++ return rv; ++} ++ ++// Number of frames over what xlat_pending returns that we keep *16 ++// This is a min value - if it appears to be too small the threshold should ++// adjust dynamically. ++#define PENDING_HW_MIN (3 * 16) ++// Offset to use when setting dynamically ++// Set to %16 == 15 to avoid the threshold changing immediately as we relax ++#define PENDING_HW_OFFSET (PENDING_HW_MIN - 1) ++// Number of consecutive times we've failed to get a frame when we prefer it ++// before we increase the prefer threshold (5ms * N = max expected decode ++// time) ++#define PENDING_N_THRESHOLD 6 ++ +static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) +{ + V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context; -+ int src_rv; ++ int src_rv = NQ_OK; + int dst_rv = 1; // Non-zero (done), non-negative (error) number ++ unsigned int i = 0; + + do { -+ src_rv = try_enqueue_src(avctx, s); ++ const int pending = xlat_pending(&s->xlat); ++ const int prefer_dq = (pending > s->pending_hw / 16); ++ const int last_src_rv = src_rv; + -+ // If we got a frame last time and we have nothing to enqueue then -+ // return now. rv will be AVERROR(EAGAIN) indicating that we want more input ++ // Enqueue another pkt for decode if ++ // (a) We don't have a lot of stuff in the buffer already OR ++ // (b) ... we (think we) do but we've failed to get a frame already OR ++ // (c) We've dequeued a lot of frames without asking for input ++ src_rv = try_enqueue_src(avctx, s, !(!prefer_dq || i != 0 || s->req_pkt > 2)); ++ ++ // If we got a frame last time or we've already tried to get a frame and ++ // we have nothing to enqueue then return now. rv will be AVERROR(EAGAIN) ++ // indicating that we want more input. 
+ // This should mean that once decode starts we enter a stable state where + // we alternately ask for input and produce output -+ if (s->req_pkt && src_rv == NQ_SRC_EMPTY) ++ if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY) + break; + -+ if (src_rv == NQ_Q_FULL && dst_rv == AVERROR(EAGAIN)) { -+ av_log(avctx, AV_LOG_WARNING, "Poll says src Q has space but enqueue fail"); -+ src_rv = NQ_SRC_EMPTY; // If we can't enqueue pretend that there is nothing to enqueue ++ if (src_rv == NQ_Q_FULL && last_src_rv == NQ_Q_FULL) { ++ av_log(avctx, AV_LOG_WARNING, "Poll thinks src Q has space; none found\n"); ++ break; + } + + // Try to get a new frame if + // (a) we haven't already got one AND + // (b) enqueue returned a status indicating that decode should be attempted + if (dst_rv != 0 && TRY_DQ(src_rv)) { -+ do { -+ // Dequeue frame will unref any previous contents of frame -+ // if it returns success so we don't need an explicit unref -+ // when discarding -+ // This returns AVERROR(EAGAIN) if there isn't a frame ready yet -+ // but there is room in the input Q -+ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, -1, 1); ++ // Pick a timeout depending on state ++ const int t = ++ src_rv == NQ_DRAINING ? 300 : ++ prefer_dq ? 5 : ++ src_rv == NQ_Q_FULL ? -1 : 0; + -+ if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) -+ av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n", -+ s->draining, s->capture.done); -+ else if (dst_rv && dst_rv != AVERROR(EAGAIN)) -+ av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n", -+ s->draining, s->capture.done, dst_rv); ++ // Dequeue frame will unref any previous contents of frame ++ // if it returns success so we don't need an explicit unref ++ // when discarding ++ // This returns AVERROR(EAGAIN) on timeout or if ++ // there is room in the input Q and timeout == -1 ++ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); + -+ // Go again if we got a frame that we need to discard -+ } while (dst_rv == 0 && xlat_pts_out(avctx, s, frame)); ++ // Failure due to no buffer in Q? 
++ if (dst_rv == AVERROR(ENOSPC)) { ++ // Wait & retry ++ if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) { ++ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); ++ } ++ } ++ ++ // Adjust dynamic pending threshold ++ if (dst_rv == 0) { ++ if (--s->pending_hw < PENDING_HW_MIN) ++ s->pending_hw = PENDING_HW_MIN; ++ s->pending_n = 0; ++ ++ set_best_effort_pts(avctx, &s->pts_stat, frame); ++ } ++ else if (dst_rv == AVERROR(EAGAIN)) { ++ if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) { ++ s->pending_hw = pending * 16 + PENDING_HW_OFFSET; ++ s->pending_n = 0; ++ } ++ } ++ ++ if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) { ++ av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF"); ++ dst_rv = AVERROR_EOF; ++ s->capture.done = 1; ++ } ++ else if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) ++ av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n", ++ s->draining, s->capture.done); ++ else if (dst_rv && dst_rv != AVERROR(EAGAIN)) ++ av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n", ++ s->draining, s->capture.done, dst_rv); ++ } ++ ++ ++i; ++ if (i >= 256) { ++ av_log(avctx, AV_LOG_ERROR, "Unexpectedly large retry count: %d\n", i); ++ src_rv = AVERROR(EIO); + } + + // Continue trying to enqueue packets if either + // (a) we succeeded last time OR -+ // (b) enqueue failed due to input Q full AND there is now room -+ } while (src_rv == NQ_OK || (src_rv == NQ_Q_FULL && dst_rv == AVERROR(EAGAIN)) ); ++ // (b) we didn't ret a frame and we can retry the input ++ } while (src_rv == NQ_OK || (dst_rv == AVERROR(EAGAIN) && RETRY_NQ(src_rv))); + + // Ensure that the frame contains nothing if we aren't returning a frame + // (might happen when discarding) @@ -48645,7 +53231,7 @@ Upstream-status: Pending + av_frame_unref(frame); + + // If we got a frame this time ask for a pkt next time -+ s->req_pkt = (dst_rv == 0); ++ s->req_pkt = (dst_rv == 0) ? s->req_pkt + 1 : 0; + +#if 0 + if (dst_rv == 0) @@ -48655,8 +53241,8 @@ Upstream-status: Pending + av_log(avctx, AV_LOG_ERROR, "Streamoff and die?\n"); + ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF); + return -1; - } - } ++ } ++ } +#endif + + return dst_rv == 0 ? 
0 : @@ -48687,18 +53273,113 @@ Upstream-status: Pending +} +#endif + ++static int ++check_size(AVCodecContext * const avctx, V4L2m2mContext * const s) ++{ ++ unsigned int i; ++ const uint32_t fcc = ff_v4l2_get_format_pixelformat(&s->capture.format); ++ const uint32_t w = avctx->coded_width; ++ const uint32_t h = avctx->coded_height; ++ ++ if (w == 0 || h == 0 || fcc == 0) { ++ av_log(avctx, AV_LOG_TRACE, "%s: Size %dx%d or fcc %s empty\n", __func__, w, h, av_fourcc2str(fcc)); ++ return 0; ++ } ++ if ((s->quirks & FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN) != 0) { ++ av_log(avctx, AV_LOG_TRACE, "%s: Skipped (quirk): Size %dx%d, fcc %s\n", __func__, w, h, av_fourcc2str(fcc)); ++ return 0; ++ } ++ ++ for (i = 0;; ++i) { ++ struct v4l2_frmsizeenum fs = { ++ .index = i, ++ .pixel_format = fcc, ++ }; ++ ++ while (ioctl(s->fd, VIDIOC_ENUM_FRAMESIZES, &fs) != 0) { ++ const int err = AVERROR(errno); ++ if (err == AVERROR(EINTR)) ++ continue; ++ if (i == 0 && err == AVERROR(ENOTTY)) { ++ av_log(avctx, AV_LOG_DEBUG, "Framesize enum not supported\n"); ++ return 0; ++ } ++ if (err != AVERROR(EINVAL)) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to enum framesizes: %s", av_err2str(err)); ++ return err; ++ } ++ av_log(avctx, AV_LOG_WARNING, "Failed to find Size=%dx%d, fmt=%s in %u frame size enums\n", ++ w, h, av_fourcc2str(fcc), i); ++ return err; ++ } ++ ++ switch (fs.type) { ++ case V4L2_FRMSIZE_TYPE_DISCRETE: ++ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Discrete: %dx%d\n", __func__, i, ++ fs.discrete.width,fs.discrete.height); ++ if (w == fs.discrete.width && h == fs.discrete.height) ++ return 0; ++ break; ++ case V4L2_FRMSIZE_TYPE_STEPWISE: ++ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Stepwise: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i, ++ fs.stepwise.min_width, fs.stepwise.min_height, ++ fs.stepwise.max_width, fs.stepwise.max_height, ++ fs.stepwise.step_width,fs.stepwise.step_height); ++ if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width && ++ h >= fs.stepwise.min_height && h <= fs.stepwise.max_height && ++ (w - fs.stepwise.min_width) % fs.stepwise.step_width == 0 && ++ (h - fs.stepwise.min_height) % fs.stepwise.step_height == 0) ++ return 0; ++ break; ++ case V4L2_FRMSIZE_TYPE_CONTINUOUS: ++ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Continuous: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i, ++ fs.stepwise.min_width, fs.stepwise.min_height, ++ fs.stepwise.max_width, fs.stepwise.max_height, ++ fs.stepwise.step_width,fs.stepwise.step_height); ++ if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width && ++ h >= fs.stepwise.min_height && h <= fs.stepwise.max_height) ++ return 0; ++ break; ++ default: ++ av_log(avctx, AV_LOG_ERROR, "Unexpected framesize enum: %d", fs.type); ++ return AVERROR(EINVAL); ++ } ++ } ++} ++ ++static int ++get_quirks(AVCodecContext * const avctx, V4L2m2mContext * const s) ++{ ++ struct v4l2_capability cap; ++ ++ memset(&cap, 0, sizeof(cap)); ++ while (ioctl(s->fd, VIDIOC_QUERYCAP, &cap) != 0) { ++ int err = errno; ++ if (err == EINTR) ++ continue; ++ av_log(avctx, AV_LOG_ERROR, "V4L2: Failed to get capabilities: %s\n", strerror(err)); ++ return AVERROR(err); ++ } ++ ++ // Could be made table driven if we have a few more but right now there ++ // seems no point ++ ++ // Meson (amlogic) always gives a resolution changed event after output ++ // streamon and userspace must (re)allocate capture buffers and streamon ++ // capture to clear the event even if the capture buffers were the right ++ // size in the first place. 
++ if (strcmp(cap.driver, "meson-vdec") == 0) ++ s->quirks |= FF_V4L2_QUIRK_REINIT_ALWAYS | FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN; ++ ++ av_log(avctx, AV_LOG_DEBUG, "Driver '%s': Quirks=%#x\n", cap.driver, s->quirks); ++ return 0; ++} ++ ++// This heuristic is for H264 but use for everything +static uint32_t max_coded_size(const AVCodecContext * const avctx) +{ + uint32_t wxh = avctx->coded_width * avctx->coded_height; + uint32_t size; - --dequeue: -- if (!s->buf_pkt.size) -- av_packet_unref(&avpkt); -- return ff_v4l2_context_dequeue_frame(capture, frame, -1); -+ // Currently the only thing we try to set our own limits for is H264 -+ if (avctx->codec_id != AV_CODEC_ID_H264) -+ return 0; + + size = wxh * 3 / 2; + // H.264 Annex A table A-1 gives minCR which is either 2 or 4 @@ -48711,27 +53392,53 @@ Upstream-status: Pending + // with small WxH + return size + (1 << 16); } - + static av_cold int v4l2_decode_init(AVCodecContext *avctx) -@@ -186,8 +481,12 @@ static av_cold int v4l2_decode_init(AVCo +@@ -186,12 +699,29 @@ static av_cold int v4l2_decode_init(AVCo V4L2Context *capture, *output; V4L2m2mContext *s; V4L2m2mPriv *priv = avctx->priv_data; + int gf_pix_fmt; int ret; - + + av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); + ++ if (avctx->codec_id == AV_CODEC_ID_H264) { ++ if (avctx->ticks_per_frame == 1) { ++ if(avctx->time_base.den < INT_MAX/2) { ++ avctx->time_base.den *= 2; ++ } else ++ avctx->time_base.num /= 2; ++ } ++ avctx->ticks_per_frame = 2; ++ } ++ + av_log(avctx, AV_LOG_INFO, "level=%d\n", avctx->level); ret = ff_v4l2_m2m_create_context(priv, &s); if (ret < 0) return ret; -@@ -204,17 +503,43 @@ static av_cold int v4l2_decode_init(AVCo - + ++ pts_stats_init(&s->pts_stat, avctx, "decoder"); ++ s->pending_hw = PENDING_HW_MIN; ++ + capture = &s->capture; + output = &s->output; + +@@ -199,34 +729,127 @@ static av_cold int v4l2_decode_init(AVCo + * by the v4l2 driver; this event will trigger a full pipeline reconfig and + * the proper values will be retrieved from the kernel driver. + */ +- output->height = capture->height = avctx->coded_height; +- output->width = capture->width = avctx->coded_width; ++// output->height = capture->height = avctx->coded_height; ++// output->width = capture->width = avctx->coded_width; ++ output->height = capture->height = 0; ++ output->width = capture->width = 0; + output->av_codec_id = avctx->codec_id; output->av_pix_fmt = AV_PIX_FMT_NONE; + output->min_buf_size = max_coded_size(avctx); - + capture->av_codec_id = AV_CODEC_ID_RAWVIDEO; capture->av_pix_fmt = avctx->pix_fmt; + capture->min_buf_size = 0; @@ -48743,15 +53450,21 @@ Upstream-status: Pending + * check the v4l2_get_drm_frame function. 
+ */ + ++ avctx->sw_pix_fmt = avctx->pix_fmt; + gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts); -+ av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s); get_format requested=%d (%s)\n", -+ avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt)); ++ av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s) %dx%d; get_format requested=%d (%s)\n", ++ avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), ++ avctx->coded_width, avctx->coded_height, ++ gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt)); + -+ s->output_drm = 0; + if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) { + avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; + s->output_drm = 1; + } ++ else { ++ capture->av_pix_fmt = gf_pix_fmt; ++ s->output_drm = 0; ++ } + + s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM); + if (!s->device_ref) { @@ -48762,7 +53475,7 @@ Upstream-status: Pending + ret = av_hwdevice_ctx_init(s->device_ref); + if (ret < 0) + return ret; - + s->avctx = avctx; ret = ff_v4l2_m2m_codec_init(priv); if (ret) { @@ -48772,15 +53485,24 @@ Upstream-status: Pending - return ret; } - -@@ -223,10 +548,53 @@ static av_cold int v4l2_decode_init(AVCo - + +- return v4l2_prepare_decoder(s); ++ if ((ret = v4l2_prepare_decoder(s)) < 0) ++ return ret; ++ ++ if ((ret = get_quirks(avctx, s)) != 0) ++ return ret; ++ ++ if ((ret = check_size(avctx, s)) != 0) ++ return ret; ++ ++ return 0; + } + static av_cold int v4l2_decode_close(AVCodecContext *avctx) { - V4L2m2mPriv *priv = avctx->priv_data; - V4L2m2mContext *s = priv->context; -- av_packet_unref(&s->buf_pkt); -- return ff_v4l2_m2m_codec_end(priv); + int rv; + av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); + rv = ff_v4l2_m2m_codec_end(avctx->priv_data); @@ -48802,7 +53524,6 @@ Upstream-status: Pending + V4L2m2mContext * const s = priv->context; + V4L2Context * const output = &s->output; + V4L2Context * const capture = &s->capture; -+ int ret, i; + + av_log(avctx, AV_LOG_TRACE, "<<< %s: streamon=%d\n", __func__, output->streamon); + @@ -48810,14 +53531,23 @@ Upstream-status: Pending + // states like EOS processing so don't try to optimize out (having got it + // wrong once) + -+ ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF); -+ if (ret < 0) -+ av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s error: %d\n", output->name, ret); ++ ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF); ++ ++ // Clear any buffered input packet + av_packet_unref(&s->buf_pkt); +- return ff_v4l2_m2m_codec_end(priv); ++ ++ // Clear a pending EOS ++ if (ff_v4l2_ctx_eos(capture)) { ++ // Arguably we could delay this but this is easy and doesn't require ++ // thought or extra vars ++ ff_v4l2_context_set_status(capture, VIDIOC_STREAMOFF); ++ ff_v4l2_context_set_status(capture, VIDIOC_STREAMON); ++ } + + // V4L2 makes no guarantees about whether decoded frames are flushed or not + // so mark all frames we are tracking to be discarded if they appear -+ for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) -+ s->track_els[i].discard = 1; ++ xlat_flush(&s->xlat); + + // resend extradata + s->extdata_sent = 0; @@ -48829,9 +53559,9 @@ Upstream-status: Pending + // Stream on will occur when we actually submit a new frame + av_log(avctx, AV_LOG_TRACE, ">>> %s\n", __func__); } - + #define OFFSET(x) offsetof(V4L2m2mPriv, x) -@@ -235,10 +603,16 @@ static av_cold int v4l2_decode_close(AVC +@@ -235,10 +858,16 @@ static av_cold int v4l2_decode_close(AVC static const AVOption options[] = { V4L_M2M_DEFAULT_OPTS, { "num_capture_buffers", "Number 
of buffers in the capture context", @@ -48840,7 +53570,7 @@ Upstream-status: Pending + { "pixel_format", "Pixel format to be used by the decoder", OFFSET(pix_fmt), AV_OPT_TYPE_PIXEL_FMT, {.i64 = AV_PIX_FMT_NONE}, AV_PIX_FMT_NONE, AV_PIX_FMT_NB, FLAGS }, { NULL}, }; - + +static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = { + HW_CONFIG_INTERNAL(DRM_PRIME), + NULL @@ -48849,7 +53579,7 @@ Upstream-status: Pending #define M2MDEC_CLASS(NAME) \ static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \ .class_name = #NAME "_v4l2m2m_decoder", \ -@@ -259,9 +633,15 @@ static const AVOption options[] = { +@@ -259,9 +888,15 @@ static const AVOption options[] = { .init = v4l2_decode_init, \ .receive_frame = v4l2_receive_frame, \ .close = v4l2_decode_close, \ @@ -48865,7 +53595,366 @@ Upstream-status: Pending + .hw_configs = v4l2_m2m_hw_configs, \ .wrapper_name = "v4l2m2m", \ } - + +--- a/libavcodec/v4l2_m2m_enc.c ++++ b/libavcodec/v4l2_m2m_enc.c +@@ -24,6 +24,8 @@ + #include + #include + #include ++#include ++ + #include "libavcodec/avcodec.h" + #include "libavcodec/internal.h" + #include "libavutil/pixdesc.h" +@@ -37,6 +39,34 @@ + #define MPEG_CID(x) V4L2_CID_MPEG_VIDEO_##x + #define MPEG_VIDEO(x) V4L2_MPEG_VIDEO_##x + ++// P030 should be defined in drm_fourcc.h and hopefully will be sometime ++// in the future but until then... ++#ifndef DRM_FORMAT_P030 ++#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') ++#endif ++ ++#ifndef DRM_FORMAT_NV15 ++#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5') ++#endif ++ ++#ifndef DRM_FORMAT_NV20 ++#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0') ++#endif ++ ++#ifndef V4L2_CID_CODEC_BASE ++#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE ++#endif ++ ++// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined ++// in videodev2.h hopefully will be sometime in the future but until then... ++#ifndef V4L2_PIX_FMT_NV12_10_COL128 ++#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0') ++#endif ++ ++#ifndef V4L2_PIX_FMT_NV12_COL128 ++#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */ ++#endif ++ + static inline void v4l2_set_timeperframe(V4L2m2mContext *s, unsigned int num, unsigned int den) + { + struct v4l2_streamparm parm = { 0 }; +@@ -147,15 +177,14 @@ static inline int v4l2_mpeg4_profile_fro + static int v4l2_check_b_frame_support(V4L2m2mContext *s) + { + if (s->avctx->max_b_frames) +- av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support b-frames yet\n"); ++ av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support %d b-frames yet\n", s->avctx->max_b_frames); + +- v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), 0, "number of B-frames", 0); ++ v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), s->avctx->max_b_frames, "number of B-frames", 1); + v4l2_get_ext_ctrl(s, MPEG_CID(B_FRAMES), &s->avctx->max_b_frames, "number of B-frames", 0); + if (s->avctx->max_b_frames == 0) + return 0; + + avpriv_report_missing_feature(s->avctx, "DTS/PTS calculation for V4L2 encoding"); +- + return AVERROR_PATCHWELCOME; + } + +@@ -270,13 +299,184 @@ static int v4l2_prepare_encoder(V4L2m2mC + return 0; + } + ++static int avdrm_to_v4l2(struct v4l2_format * const format, const AVFrame * const frame) ++{ ++ const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0]; ++ ++ const uint32_t drm_fmt = src->layers[0].format; ++ // Treat INVALID as LINEAR ++ const uint64_t mod = src->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID ? 
++ DRM_FORMAT_MOD_LINEAR : src->objects[0].format_modifier; ++ uint32_t pix_fmt = 0; ++ uint32_t w = 0; ++ uint32_t h = 0; ++ uint32_t bpl = src->layers[0].planes[0].pitch; ++ ++ // We really don't expect multiple layers ++ // All formats that we currently cope with are single object ++ ++ if (src->nb_layers != 1 || src->nb_objects != 1) ++ return AVERROR(EINVAL); ++ ++ switch (drm_fmt) { ++ case DRM_FORMAT_YUV420: ++ if (mod == DRM_FORMAT_MOD_LINEAR) { ++ if (src->layers[0].nb_planes != 3) ++ break; ++ pix_fmt = V4L2_PIX_FMT_YUV420; ++ h = src->layers[0].planes[1].offset / bpl; ++ w = bpl; ++ } ++ break; ++ ++ case DRM_FORMAT_NV12: ++ if (mod == DRM_FORMAT_MOD_LINEAR) { ++ if (src->layers[0].nb_planes != 2) ++ break; ++ pix_fmt = V4L2_PIX_FMT_NV12; ++ h = src->layers[0].planes[1].offset / bpl; ++ w = bpl; ++ } ++ else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { ++ if (src->layers[0].nb_planes != 2) ++ break; ++ pix_fmt = V4L2_PIX_FMT_NV12_COL128; ++ w = bpl; ++ h = src->layers[0].planes[1].offset / 128; ++ bpl = fourcc_mod_broadcom_param(mod); ++ } ++ break; ++ ++ case DRM_FORMAT_P030: ++ if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { ++ if (src->layers[0].nb_planes != 2) ++ break; ++ pix_fmt = V4L2_PIX_FMT_NV12_10_COL128; ++ w = bpl / 2; // Matching lie to how we construct this ++ h = src->layers[0].planes[1].offset / 128; ++ bpl = fourcc_mod_broadcom_param(mod); ++ } ++ break; ++ ++ default: ++ break; ++ } ++ ++ if (!pix_fmt) ++ return AVERROR(EINVAL); ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) { ++ struct v4l2_pix_format_mplane *const pix = &format->fmt.pix_mp; ++ ++ pix->width = w; ++ pix->height = h; ++ pix->pixelformat = pix_fmt; ++ pix->plane_fmt[0].bytesperline = bpl; ++ pix->num_planes = 1; ++ } ++ else { ++ struct v4l2_pix_format *const pix = &format->fmt.pix; ++ ++ pix->width = w; ++ pix->height = h; ++ pix->pixelformat = pix_fmt; ++ pix->bytesperline = bpl; ++ } ++ ++ return 0; ++} ++ ++// Do we have similar enough formats to be usable? 
++static int fmt_eq(const struct v4l2_format * const a, const struct v4l2_format * const b) ++{ ++ if (a->type != b->type) ++ return 0; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(a->type)) { ++ const struct v4l2_pix_format_mplane *const pa = &a->fmt.pix_mp; ++ const struct v4l2_pix_format_mplane *const pb = &b->fmt.pix_mp; ++ unsigned int i; ++ if (pa->pixelformat != pb->pixelformat || ++ pa->num_planes != pb->num_planes) ++ return 0; ++ for (i = 0; i != pa->num_planes; ++i) { ++ if (pa->plane_fmt[i].bytesperline != pb->plane_fmt[i].bytesperline) ++ return 0; ++ } ++ } ++ else { ++ const struct v4l2_pix_format *const pa = &a->fmt.pix; ++ const struct v4l2_pix_format *const pb = &b->fmt.pix; ++ if (pa->pixelformat != pb->pixelformat || ++ pa->bytesperline != pb->bytesperline) ++ return 0; ++ } ++ return 1; ++} ++ ++ + static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame) + { + V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; + V4L2Context *const output = &s->output; + ++ // Signal EOF if needed ++ if (!frame) { ++ return ff_v4l2_context_enqueue_frame(output, frame); ++ } ++ ++ if (s->input_drm && !output->streamon) { ++ int rv; ++ struct v4l2_format req_format = {.type = output->format.type}; ++ ++ // Set format when we first get a buffer ++ if ((rv = avdrm_to_v4l2(&req_format, frame)) != 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to get V4L2 format from DRM_PRIME frame\n"); ++ return rv; ++ } ++ ++ ff_v4l2_context_release(output); ++ ++ output->format = req_format; ++ ++ if ((rv = ff_v4l2_context_set_format(output)) != 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to set V4L2 format\n"); ++ return rv; ++ } ++ ++ if (!fmt_eq(&req_format, &output->format)) { ++ av_log(avctx, AV_LOG_ERROR, "Format mismatch after setup\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ output->selection.top = frame->crop_top; ++ output->selection.left = frame->crop_left; ++ output->selection.width = av_frame_cropped_width(frame); ++ output->selection.height = av_frame_cropped_height(frame); ++ ++ if ((rv = ff_v4l2_context_init(output)) != 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to (re)init context\n"); ++ return rv; ++ } ++ ++ { ++ struct v4l2_selection selection = { ++ .type = V4L2_BUF_TYPE_VIDEO_OUTPUT, ++ .target = V4L2_SEL_TGT_CROP, ++ .r = output->selection ++ }; ++ if (ioctl(s->fd, VIDIOC_S_SELECTION, &selection) != 0) { ++ av_log(avctx, AV_LOG_WARNING, "S_SELECTION (CROP) %dx%d @ %d,%d failed: %s\n", ++ selection.r.width, selection.r.height, selection.r.left, selection.r.top, ++ av_err2str(AVERROR(errno))); ++ } ++ av_log(avctx, AV_LOG_TRACE, "S_SELECTION (CROP) %dx%d @ %d,%d OK\n", ++ selection.r.width, selection.r.height, selection.r.left, selection.r.top); ++ } ++ } ++ + #ifdef V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME +- if (frame && frame->pict_type == AV_PICTURE_TYPE_I) ++ if (frame->pict_type == AV_PICTURE_TYPE_I) + v4l2_set_ext_ctrl(s, MPEG_CID(FORCE_KEY_FRAME), 0, "force key frame", 1); + #endif + +@@ -310,7 +510,70 @@ static int v4l2_receive_packet(AVCodecCo + } + + dequeue: +- return ff_v4l2_context_dequeue_packet(capture, avpkt); ++ if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0) ++ return ret; ++ ++ if (capture->first_buf == 1) { ++ uint8_t * data; ++ const int len = avpkt->size; ++ ++ // 1st buffer after streamon should be SPS/PPS ++ capture->first_buf = 2; ++ ++ // Clear both possible stores so there is no chance of confusion ++ av_freep(&s->extdata_data); ++ s->extdata_size = 0; ++ av_freep(&avctx->extradata); ++ avctx->extradata_size = 0; ++ ++ if ((data = 
av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) != NULL) ++ memcpy(data, avpkt->data, len); ++ ++ av_packet_unref(avpkt); ++ ++ if (data == NULL) ++ return AVERROR(ENOMEM); ++ ++ // We need to copy the header, but keep local if not global ++ if ((avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) != 0) { ++ avctx->extradata = data; ++ avctx->extradata_size = len; ++ } ++ else { ++ s->extdata_data = data; ++ s->extdata_size = len; ++ } ++ ++ if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0) ++ return ret; ++ } ++ ++ // First frame must be key so mark as such even if encoder forgot ++ if (capture->first_buf == 2) ++ avpkt->flags |= AV_PKT_FLAG_KEY; ++ ++ // Add SPS/PPS to the start of every key frame if non-global headers ++ if ((avpkt->flags & AV_PKT_FLAG_KEY) != 0 && s->extdata_size != 0) { ++ const size_t newlen = s->extdata_size + avpkt->size; ++ AVBufferRef * const buf = av_buffer_alloc(newlen + AV_INPUT_BUFFER_PADDING_SIZE); ++ ++ if (buf == NULL) { ++ av_packet_unref(avpkt); ++ return AVERROR(ENOMEM); ++ } ++ ++ memcpy(buf->data, s->extdata_data, s->extdata_size); ++ memcpy(buf->data + s->extdata_size, avpkt->data, avpkt->size); ++ ++ av_buffer_unref(&avpkt->buf); ++ avpkt->buf = buf; ++ avpkt->data = buf->data; ++ avpkt->size = newlen; ++ } ++ ++// av_log(avctx, AV_LOG_INFO, "%s: PTS out=%"PRId64", size=%d, ret=%d\n", __func__, avpkt->pts, avpkt->size, ret); ++ capture->first_buf = 0; ++ return 0; + } + + static av_cold int v4l2_encode_init(AVCodecContext *avctx) +@@ -322,6 +585,8 @@ static av_cold int v4l2_encode_init(AVCo + uint32_t v4l2_fmt_output; + int ret; + ++ av_log(avctx, AV_LOG_INFO, " <<< %s: fmt=%d/%d\n", __func__, avctx->pix_fmt, avctx->sw_pix_fmt); ++ + ret = ff_v4l2_m2m_create_context(priv, &s); + if (ret < 0) + return ret; +@@ -329,13 +594,17 @@ static av_cold int v4l2_encode_init(AVCo + capture = &s->capture; + output = &s->output; + ++ s->input_drm = (avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME); ++ + /* common settings output/capture */ + output->height = capture->height = avctx->height; + output->width = capture->width = avctx->width; + + /* output context */ + output->av_codec_id = AV_CODEC_ID_RAWVIDEO; +- output->av_pix_fmt = avctx->pix_fmt; ++ output->av_pix_fmt = !s->input_drm ? avctx->pix_fmt : ++ avctx->sw_pix_fmt != AV_PIX_FMT_NONE ? 
avctx->sw_pix_fmt : ++ AV_PIX_FMT_YUV420P; + + /* capture context */ + capture->av_codec_id = avctx->codec_id; +@@ -354,7 +623,7 @@ static av_cold int v4l2_encode_init(AVCo + v4l2_fmt_output = output->format.fmt.pix.pixelformat; + + pix_fmt_output = ff_v4l2_format_v4l2_to_avfmt(v4l2_fmt_output, AV_CODEC_ID_RAWVIDEO); +- if (pix_fmt_output != avctx->pix_fmt) { ++ if (!s->input_drm && pix_fmt_output != avctx->pix_fmt) { + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt_output); + av_log(avctx, AV_LOG_ERROR, "Encoder requires %s pixel format.\n", desc->name); + return AVERROR(EINVAL); --- /dev/null +++ b/libavcodec/v4l2_req_decode_q.c @@ -0,0 +1,84 @@ @@ -49780,16 +54869,26 @@ Upstream-status: Pending +#include "v4l2_req_hevc_vx.c" + --- /dev/null ++++ b/libavcodec/v4l2_req_hevc_v3.c +@@ -0,0 +1,3 @@ ++#define HEVC_CTRLS_VERSION 3 ++#include "v4l2_req_hevc_vx.c" ++ +--- /dev/null ++++ b/libavcodec/v4l2_req_hevc_v4.c +@@ -0,0 +1,3 @@ ++#define HEVC_CTRLS_VERSION 4 ++#include "v4l2_req_hevc_vx.c" ++ +--- /dev/null +++ b/libavcodec/v4l2_req_hevc_vx.c -@@ -0,0 +1,1188 @@ +@@ -0,0 +1,1365 @@ +// File included by v4l2_req_hevc_v* - not compiled on its own + +#include "decode.h" +#include "hevcdec.h" +#include "hwconfig.h" + -+#include "v4l2_request_hevc.h" -+ +#if HEVC_CTRLS_VERSION == 1 +#include "hevc-ctrls-v1.h" + @@ -49798,10 +54897,39 @@ Upstream-status: Pending + +#elif HEVC_CTRLS_VERSION == 2 +#include "hevc-ctrls-v2.h" ++#elif HEVC_CTRLS_VERSION == 3 ++#include "hevc-ctrls-v3.h" ++#elif HEVC_CTRLS_VERSION == 4 ++#include ++#if !defined(V4L2_CID_STATELESS_HEVC_SPS) ++#include "hevc-ctrls-v4.h" ++#endif +#else +#error Unknown HEVC_CTRLS_VERSION +#endif + ++#ifndef V4L2_CID_STATELESS_HEVC_SPS ++#define V4L2_CID_STATELESS_HEVC_SPS V4L2_CID_MPEG_VIDEO_HEVC_SPS ++#define V4L2_CID_STATELESS_HEVC_PPS V4L2_CID_MPEG_VIDEO_HEVC_PPS ++#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS ++#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX ++#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS ++#define V4L2_CID_STATELESS_HEVC_DECODE_MODE V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE ++#define V4L2_CID_STATELESS_HEVC_START_CODE V4L2_CID_MPEG_VIDEO_HEVC_START_CODE ++ ++#define V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED ++#define V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED ++#define V4L2_STATELESS_HEVC_START_CODE_NONE V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE ++#define V4L2_STATELESS_HEVC_START_CODE_ANNEX_B V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B ++#endif ++ ++// Should be in videodev2 but we might not have a good enough one ++#ifndef V4L2_PIX_FMT_HEVC_SLICE ++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ ++#endif ++ ++#include "v4l2_request_hevc.h" ++ +#include "libavutil/hwcontext_drm.h" + +#include @@ -49837,11 +54965,16 @@ Upstream-status: Pending + struct v4l2_ctrl_hevc_slice_params * slice_params; + struct slice_info * slices; + ++ size_t num_offsets; ++ size_t alloced_offsets; ++ uint32_t *offsets; ++ +} V4L2MediaReqDescriptor; + +struct slice_info { + const uint8_t * ptr; + size_t len; // bytes ++ size_t n_offsets; +}; + +// Handy container for accumulating controls before setting @@ -49929,6 +55062,7 @@ Upstream-status: Pending + } +} + ++#if HEVC_CTRLS_VERSION <= 2 +static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp) +{ + const HEVCFrame *frame; @@ 
-49954,6 +55088,7 @@ Upstream-status: Pending + + return 0; +} ++#endif + +static unsigned int +get_ref_pic_index(const HEVCContext *h, const HEVCFrame *frame, @@ -49998,7 +55133,7 @@ Upstream-status: Pending + if (rd->num_slices >= rd->alloced_slices) { + struct v4l2_ctrl_hevc_slice_params * p2; + struct slice_info * s2; -+ size_t n2 = rd->num_slices == 0 ? 8 : rd->num_slices * 2; ++ size_t n2 = rd->alloced_slices == 0 ? 8 : rd->alloced_slices * 2; + + p2 = av_realloc_array(rd->slice_params, n2, sizeof(*p2)); + if (p2 == NULL) @@ -50016,6 +55151,23 @@ Upstream-status: Pending + return 0; +} + ++static int offsets_add(V4L2MediaReqDescriptor *const rd, const size_t n, const unsigned * const offsets) ++{ ++ if (rd->num_offsets + n > rd->alloced_offsets) { ++ size_t n2 = rd->alloced_slices == 0 ? 128 : rd->alloced_slices * 2; ++ void * p2; ++ while (rd->num_offsets + n > n2) ++ n2 *= 2; ++ if ((p2 = av_realloc_array(rd->offsets, n2, sizeof(*rd->offsets))) == NULL) ++ return AVERROR(ENOMEM); ++ rd->offsets = p2; ++ rd->alloced_offsets = n2; ++ } ++ for (size_t i = 0; i != n; ++i) ++ rd->offsets[rd->num_offsets++] = offsets[i] - 1; ++ return 0; ++} ++ +static unsigned int +fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const entries) +{ @@ -50029,12 +55181,21 @@ Upstream-status: Pending + struct v4l2_hevc_dpb_entry * const entry = entries + n++; + + entry->timestamp = frame_capture_dpb(frame->frame); ++#if HEVC_CTRLS_VERSION <= 2 + entry->rps = find_frame_rps_type(h, entry->timestamp); ++#else ++ entry->flags = (frame->flags & HEVC_FRAME_FLAG_LONG_REF) == 0 ? 0 : ++ V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE; ++#endif + entry->field_pic = frame->frame->interlaced_frame; + ++#if HEVC_CTRLS_VERSION <= 3 + /* TODO: Interleaved: Get the POC for each field. */ + entry->pic_order_cnt[0] = frame->poc; + entry->pic_order_cnt[1] = frame->poc; ++#else ++ entry->pic_order_cnt_val = frame->poc; ++#endif + } + } + return n; @@ -50060,8 +55221,11 @@ Upstream-status: Pending + + *slice_params = (struct v4l2_ctrl_hevc_slice_params) { + .bit_size = bit_size, ++#if HEVC_CTRLS_VERSION <= 3 + .data_bit_offset = bit_offset, -+ ++#else ++ .data_byte_offset = bit_offset / 8 + 1, ++#endif + /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: General slice segment header */ + .slice_segment_addr = sh->slice_segment_addr, + @@ -50144,6 +55308,7 @@ Upstream-status: Pending + fill_pred_table(h, &slice_params->pred_weight_table); + + slice_params->num_entry_point_offsets = sh->num_entry_point_offsets; ++#if HEVC_CTRLS_VERSION <= 3 + if (slice_params->num_entry_point_offsets > 256) { + slice_params->num_entry_point_offsets = 256; + av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets); @@ -50151,6 +55316,7 @@ Upstream-status: Pending + + for (i = 0; i < slice_params->num_entry_point_offsets; i++) + slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1; ++#endif +} + +#if HEVC_CTRLS_VERSION >= 2 @@ -50526,51 +55692,66 @@ Upstream-status: Pending +#if HEVC_CTRLS_VERSION >= 2 + struct v4l2_ctrl_hevc_decode_params * const dec, +#endif -+ struct v4l2_ctrl_hevc_slice_params * const slices, -+ const unsigned int slice_no, -+ const unsigned int slice_count) ++ struct v4l2_ctrl_hevc_slice_params * const slices, const unsigned int slice_count, ++ void * const offsets, const size_t offset_count) +{ + int rv; ++#if HEVC_CTRLS_VERSION >= 2 ++ unsigned int n = 3; ++#else ++ unsigned int n = 2; ++#endif + -+ struct v4l2_ext_control control[] = { ++ struct v4l2_ext_control control[6] = { + { -+ .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS, ++ .id = V4L2_CID_STATELESS_HEVC_SPS, + .ptr = &controls->sps, + .size = sizeof(controls->sps), + }, + { -+ .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS, ++ .id = V4L2_CID_STATELESS_HEVC_PPS, + .ptr = &controls->pps, + .size = sizeof(controls->pps), + }, +#if HEVC_CTRLS_VERSION >= 2 + { -+ .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS, ++ .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS, + .ptr = dec, + .size = sizeof(*dec), + }, +#endif -+ { -+ .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, -+ .ptr = slices + slice_no, -+ .size = sizeof(*slices) * slice_count, -+ }, -+ // Optional -+ { -+ .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX, -+ .ptr = &controls->scaling_matrix, -+ .size = sizeof(controls->scaling_matrix), -+ }, + }; + -+ rv = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, mreq, control, -+ controls->has_scaling ? 
-+ FF_ARRAY_ELEMS(control) : -+ FF_ARRAY_ELEMS(control) - 1); ++ if (slices) ++ control[n++] = (struct v4l2_ext_control) { ++ .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, ++ .ptr = slices, ++ .size = sizeof(*slices) * slice_count, ++ }; ++ ++ if (controls->has_scaling) ++ control[n++] = (struct v4l2_ext_control) { ++ .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX, ++ .ptr = &controls->scaling_matrix, ++ .size = sizeof(controls->scaling_matrix), ++ }; ++ ++#if HEVC_CTRLS_VERSION >= 4 ++ if (offsets) ++ control[n++] = (struct v4l2_ext_control) { ++ .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS, ++ .ptr = offsets, ++ .size = sizeof(((struct V4L2MediaReqDescriptor *)0)->offsets[0]) * offset_count, ++ }; ++#endif ++ ++ rv = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, mreq, control, n); + + return rv; +} + ++// This only works because we started out from a single coded frame buffer ++// that will remain intact until after end_frame +static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) +{ + const HEVCContext * const h = avctx->priv_data; @@ -50579,18 +55760,45 @@ Upstream-status: Pending + int bcount = get_bits_count(&h->HEVClc->gb); + uint32_t boff = (ptr_from_index(buffer, bcount/8 + 1) - (buffer + bcount/8 + 1)) * 8 + bcount; + ++ const unsigned int n = rd->num_slices; ++ const unsigned int block_start = (n / ctx->max_slices) * ctx->max_slices; ++ + int rv; + struct slice_info * si; + ++ // This looks dodgy but we know that FFmpeg has parsed this from a buffer ++ // that contains the entire frame including the start code ++ if (ctx->start_code == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) { ++ buffer -= 3; ++ size += 3; ++ boff += 24; ++ if (buffer[0] != 0 || buffer[1] != 0 || buffer[2] != 1) { ++ av_log(avctx, AV_LOG_ERROR, "Start code requested but missing %02x:%02x:%02x\n", ++ buffer[0], buffer[1], buffer[2]); ++ } ++ } ++ ++ if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) { ++ if (rd->slices == NULL) { ++ if ((rd->slices = av_mallocz(sizeof(*rd->slices))) == NULL) ++ return AVERROR(ENOMEM); ++ rd->slices->ptr = buffer; ++ rd->num_slices = 1; ++ } ++ rd->slices->len = buffer - rd->slices->ptr + size; ++ return 0; ++ } ++ + if ((rv = slice_add(rd)) != 0) + return rv; + -+ si = rd->slices + rd->num_slices - 1; ++ si = rd->slices + n; + si->ptr = buffer; + si->len = size; ++ si->n_offsets = rd->num_offsets; + -+ if (ctx->multi_slice && rd->num_slices > 1) { -+ struct slice_info *const si0 = rd->slices; ++ if (n != block_start) { ++ struct slice_info *const si0 = rd->slices + block_start; + const size_t offset = (buffer - si0->ptr); + boff += offset * 8; + size += offset; @@ -50598,12 +55806,15 @@ Upstream-status: Pending + } + +#if HEVC_CTRLS_VERSION >= 2 -+ if (rd->num_slices == 1) ++ if (n == 0) + fill_decode_params(h, &rd->dec); -+ fill_slice_params(h, &rd->dec, rd->slice_params + rd->num_slices - 1, size * 8, boff); ++ fill_slice_params(h, &rd->dec, rd->slice_params + n, size * 8, boff); +#else -+ fill_slice_params(h, rd->slice_params + rd->num_slices - 1, size * 8, boff); ++ fill_slice_params(h, rd->slice_params + n, size * 8, boff); +#endif ++ if (ctx->max_offsets != 0 && ++ (rv = offsets_add(rd, h->sh.num_entry_point_offsets, h->sh.entry_point_offset)) != 0) ++ return rv; + + return 0; +} @@ -50629,10 +55840,13 @@ Upstream-status: Pending +{ + V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; + ++ const int is_last = (j == rd->num_slices); + struct slice_info *const si = rd->slices + i; + struct 
media_request * req = NULL; + struct qent_src * src = NULL; + MediaBufsStatus stat; ++ void * offsets = rd->offsets + rd->slices[i].n_offsets; ++ size_t n_offsets = (is_last ? rd->num_offsets : rd->slices[j].n_offsets) - rd->slices[i].n_offsets; + + if ((req = media_request_get(ctx->mpool)) == NULL) { + av_log(avctx, AV_LOG_ERROR, "%s: Failed to alloc media request\n", __func__); @@ -50644,8 +55858,8 @@ Upstream-status: Pending +#if HEVC_CTRLS_VERSION >= 2 + &rd->dec, +#endif -+ rd->slice_params, -+ i, j - i)) { ++ rd->slice_params + i, j - i, ++ offsets, n_offsets)) { + av_log(avctx, AV_LOG_ERROR, "%s: Failed to set req ctls\n", __func__); + goto fail1; + } @@ -50665,13 +55879,9 @@ Upstream-status: Pending + goto fail2; + } + -+#warning ANNEX_B start code -+// if (ctx->start_code == V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) { -+// } -+ + stat = mediabufs_start_request(ctx->mbufs, &req, &src, + i == 0 ? rd->qe_dst : NULL, -+ j == rd->num_slices); ++ is_last); + + if (stat != MEDIABUFS_STATUS_SUCCESS) { + av_log(avctx, AV_LOG_ERROR, "%s: Failed to start request\n", __func__); @@ -50736,18 +55946,11 @@ Upstream-status: Pending + } + + // Send as slices -+ if (ctx->multi_slice) -+ { -+ if ((rv = send_slice(avctx, rd, &rc, 0, rd->num_slices)) != 0) ++ for (i = 0; i < rd->num_slices; i += ctx->max_slices) { ++ const unsigned int e = FFMIN(rd->num_slices, i + ctx->max_slices); ++ if ((rv = send_slice(avctx, rd, &rc, i, e)) != 0) + goto fail; + } -+ else -+ { -+ for (i = 0; i != rd->num_slices; ++i) { -+ if ((rv = send_slice(avctx, rd, &rc, i, i + 1)) != 0) -+ goto fail; -+ } -+ } + + // Set the drm_prime desriptor + drm_from_format(&rd->drm, mediabufs_dst_fmt(ctx->mbufs)); @@ -50762,6 +55965,12 @@ Upstream-status: Pending + return rv; +} + ++static inline int ++ctrl_valid(const struct v4l2_query_ext_ctrl * const c, const int64_t v) ++{ ++ return v >= c->minimum && v <= c->maximum; ++} ++ +// Initial check & init +static int +probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) @@ -50773,17 +55982,19 @@ Upstream-status: Pending + + // Check for var slice array + struct v4l2_query_ext_ctrl qc[] = { -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS }, -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS }, -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS }, -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX }, ++ { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS }, ++ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, }, ++ { .id = V4L2_CID_STATELESS_HEVC_SPS }, ++ { .id = V4L2_CID_STATELESS_HEVC_PPS }, ++ { .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX }, +#if HEVC_CTRLS_VERSION >= 2 -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS }, ++ { .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS }, +#endif + }; + // Order & size must match! 
+ static const size_t ctrl_sizes[] = { + sizeof(struct v4l2_ctrl_hevc_slice_params), ++ sizeof(int32_t), + sizeof(struct v4l2_ctrl_hevc_sps), + sizeof(struct v4l2_ctrl_hevc_pps), + sizeof(struct v4l2_ctrl_hevc_scaling_matrix), @@ -50793,26 +56004,44 @@ Upstream-status: Pending + }; + const unsigned int noof_ctrls = FF_ARRAY_ELEMS(qc); + -+ if (mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls)) { -+ av_log(avctx, AV_LOG_DEBUG, "Probed V%d control missing\n", HEVC_CTRLS_VERSION); ++#if HEVC_CTRLS_VERSION == 2 ++ if (mediabufs_ctl_driver_version(ctx->mbufs) >= MEDIABUFS_DRIVER_VERSION(5, 18, 0)) + return AVERROR(EINVAL); -+ } -+ for (i = 0; i != noof_ctrls; ++i) { -+ if (ctrl_sizes[i] != qc[i].elem_size) { -+ av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %d size mismatch %u != %u\n", -+ HEVC_CTRLS_VERSION, i, ctrl_sizes[i], qc[i].elem_size); ++#elif HEVC_CTRLS_VERSION == 3 ++ if (mediabufs_ctl_driver_version(ctx->mbufs) < MEDIABUFS_DRIVER_VERSION(5, 18, 0)) ++ return AVERROR(EINVAL); ++#endif ++ ++ mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls); ++ i = 0; ++#if HEVC_CTRLS_VERSION >= 4 ++ // Skip slice check if no slice mode ++ if (qc[1].type != 0 && !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED)) ++ i = 1; ++#else ++ // Fail frame mode silently for anything prior to V4 ++ if (qc[1].type == 0 || !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED)) ++ return AVERROR(EINVAL); ++#endif ++ for (; i != noof_ctrls; ++i) { ++ if (qc[i].type == 0) { ++ av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %#x missing\n", HEVC_CTRLS_VERSION, qc[i].id); ++ return AVERROR(EINVAL); ++ } ++ if (ctrl_sizes[i] != (size_t)qc[i].elem_size) { ++ av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %d size mismatch %zu != %zu\n", ++ HEVC_CTRLS_VERSION, i, ctrl_sizes[i], (size_t)qc[i].elem_size); + return AVERROR(EINVAL); + } + } + + fill_sps(&ctrl_sps, sps); + -+ if (mediabufs_set_ext_ctrl(ctx->mbufs, NULL, V4L2_CID_MPEG_VIDEO_HEVC_SPS, &ctrl_sps, sizeof(ctrl_sps))) { ++ if (mediabufs_set_ext_ctrl(ctx->mbufs, NULL, V4L2_CID_STATELESS_HEVC_SPS, &ctrl_sps, sizeof(ctrl_sps))) { + av_log(avctx, AV_LOG_ERROR, "Failed to set initial SPS\n"); + return AVERROR(EINVAL); + } + -+ ctx->multi_slice = (qc[0].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) != 0; + return 0; +} + @@ -50823,38 +56052,63 @@ Upstream-status: Pending + int ret; + + struct v4l2_query_ext_ctrl querys[] = { -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, }, -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, }, -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, }, ++ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, }, ++ { .id = V4L2_CID_STATELESS_HEVC_START_CODE, }, ++ { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, }, ++#if HEVC_CTRLS_VERSION >= 4 ++ { .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS, }, ++#endif + }; + + struct v4l2_ext_control ctrls[] = { -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, }, -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, }, ++ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, }, ++ { .id = V4L2_CID_STATELESS_HEVC_START_CODE, }, + }; + + mediabufs_ctl_query_ext_ctrls(ctx->mbufs, querys, FF_ARRAY_ELEMS(querys)); + -+ ctx->decode_mode = querys[0].default_value; ++ ctx->max_slices = (!(querys[2].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) || ++ querys[2].nr_of_dims != 1 || querys[2].dims[0] == 0) ? 
++ 1 : querys[2].dims[0]; ++ av_log(avctx, AV_LOG_DEBUG, "%s: Max slices %d\n", __func__, ctx->max_slices); + -+ if (ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED && -+ ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED) { -+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode, %d\n", __func__, ctx->decode_mode); ++#if HEVC_CTRLS_VERSION >= 4 ++ ctx->max_offsets = (querys[3].type == 0 || querys[3].nr_of_dims != 1) ? ++ 0 : querys[3].dims[0]; ++ av_log(avctx, AV_LOG_DEBUG, "%s: Entry point offsets %d\n", __func__, ctx->max_offsets); ++#else ++ ctx->max_offsets = 0; ++#endif ++ ++ if (querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED || ++ querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) ++ ctx->decode_mode = querys[0].default_value; ++ else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED)) ++ ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED; ++ else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED)) ++ ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED; ++ else { ++ av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode\n", __func__); + return AVERROR(EINVAL); + } + -+ ctx->start_code = querys[1].default_value; -+ if (ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE && -+ ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) { -+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code, %d\n", __func__, ctx->start_code); ++ if (querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_NONE || ++ querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) ++ ctx->start_code = querys[1].default_value; ++ else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_ANNEX_B)) ++ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B; ++ else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE)) ++ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE; ++ else { ++ av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code\n", __func__); + return AVERROR(EINVAL); + } + -+ ctx->max_slices = querys[2].elems; -+ if (ctx->max_slices > MAX_SLICES) { -+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported max slices, %d\n", __func__, ctx->max_slices); -+ return AVERROR(EINVAL); -+ } ++ // If we are in slice mode & START_CODE_NONE supported then pick that ++ // as it doesn't require the slightly dodgy look backwards in our raw buffer ++ if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED && ++ ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE)) ++ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE; + + ctrls[0].value = ctx->decode_mode; + ctrls[1].value = ctx->start_code; @@ -50878,6 +56132,7 @@ Upstream-status: Pending + + av_freep(&rd->slices); + av_freep(&rd->slice_params); ++ av_freep(&rd->offsets); + + av_free(rd); +} @@ -50904,6 +56159,7 @@ Upstream-status: Pending + return ref; +} + ++#if 0 +static void v4l2_req_pool_free(void *opaque) +{ + av_log(NULL, AV_LOG_DEBUG, "%s: opaque=%p\n", __func__, opaque); @@ -50915,6 +56171,7 @@ Upstream-status: Pending + + av_buffer_pool_uninit(&hwfc->pool); +} ++#endif + +static int frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx) +{ @@ -50931,7 +56188,7 @@ Upstream-status: Pending + hwfc->width = vfmt->fmt.pix.width; + hwfc->height = vfmt->fmt.pix.height; + } -+ ++#if 0 + hwfc->pool = av_buffer_pool_init2(sizeof(V4L2MediaReqDescriptor), avctx, v4l2_req_frame_alloc, v4l2_req_pool_free); + if (!hwfc->pool) + return AVERROR(ENOMEM); @@ -50950,12 
+56207,32 @@ Upstream-status: Pending + default: + hwfc->initial_pool_size += 2; + } -+ ++#endif + av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p hw_frames_ctx=%p hwfc=%p pool=%p width=%d height=%d initial_pool_size=%d\n", __func__, avctx, ctx, hw_frames_ctx, hwfc, hwfc->pool, hwfc->width, hwfc->height, hwfc->initial_pool_size); + + return 0; +} + ++static int alloc_frame(AVCodecContext * avctx, AVFrame *frame) ++{ ++ int rv; ++ ++ frame->buf[0] = v4l2_req_frame_alloc(avctx, sizeof(V4L2MediaReqDescriptor)); ++ if (!frame->buf[0]) ++ return AVERROR(ENOMEM); ++ ++ frame->data[0] = frame->buf[0]->data; ++ ++ frame->hw_frames_ctx = av_buffer_ref(avctx->hw_frames_ctx); ++ ++ if ((rv = ff_attach_decode_data(frame)) != 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to attach decode data to frame\n"); ++ av_frame_unref(frame); ++ return rv; ++ } ++ ++ return 0; ++} + +const v4l2_req_decode_fns V(ff_v4l2_req_hevc) = { + .src_pix_fmt_v4l2 = V4L2_PIX_FMT_HEVC_SLICE, @@ -50968,11 +56245,12 @@ Upstream-status: Pending + .end_frame = v4l2_request_hevc_end_frame, + .abort_frame = v4l2_request_hevc_abort_frame, + .frame_params = frame_params, ++ .alloc_frame = alloc_frame, +}; + --- /dev/null +++ b/libavcodec/v4l2_req_media.c -@@ -0,0 +1,1569 @@ +@@ -0,0 +1,1601 @@ +/* + * Copyright (C) 2018 Paul Kocialkowski + * @@ -51388,7 +56666,7 @@ Upstream-status: Pending + free(be_dst); +} + -+static struct qent_dst * qe_dst_new(void) ++static struct qent_dst* qe_dst_new(struct ff_weak_link_master * const wl) +{ + struct qent_dst *const be_dst = malloc(sizeof(*be_dst)); + if (!be_dst) @@ -51396,7 +56674,8 @@ Upstream-status: Pending + *be_dst = (struct qent_dst){ + .base = QENT_BASE_INITIALIZER, + .lock = PTHREAD_MUTEX_INITIALIZER, -+ .cond = PTHREAD_COND_INITIALIZER ++ .cond = PTHREAD_COND_INITIALIZER, ++ .mbc_wl = ff_weak_link_ref(wl) + }; + return be_dst; +} @@ -51568,6 +56847,7 @@ Upstream-status: Pending + int vfd; + bool stream_on; + bool polling; ++ bool dst_fixed; // Dst Q is fixed size + pthread_mutex_t lock; + struct buf_pool * src; + struct buf_pool * dst; @@ -51577,6 +56857,7 @@ Upstream-status: Pending + + struct v4l2_format src_fmt; + struct v4l2_format dst_fmt; ++ struct v4l2_capability capability; +}; + +static int qe_v4l2_queue(struct qent_base *const be, @@ -51747,13 +57028,13 @@ Upstream-status: Pending +{ + if (!be->dh[0] || len > dmabuf_size(be->dh[0])) { + size_t newsize = round_up_size(len); -+ request_log("%s: Overrun %d > %d; trying %d\n", __func__, len, dmabuf_size(be->dh[0]), newsize); ++ request_log("%s: Overrun %zd > %zd; trying %zd\n", __func__, len, dmabuf_size(be->dh[0]), newsize); + if (!dbsc) { + request_log("%s: No dmbabuf_ctrl for realloc\n", __func__); + return -ENOMEM; + } + if ((be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], newsize)) == NULL) { -+ request_log("%s: Realloc %d failed\n", __func__, newsize); ++ request_log("%s: Realloc %zd failed\n", __func__, newsize); + return -ENOMEM; + } + } @@ -52069,10 +57350,13 @@ Upstream-status: Pending + return MEDIABUFS_STATUS_SUCCESS; +} + -+static int create_dst_buf(struct mediabufs_ctl *const mbc) ++// Returns noof buffers created, -ve for error ++static int create_dst_bufs(struct mediabufs_ctl *const mbc, unsigned int n, struct qent_dst * const qes[]) +{ ++ unsigned int i; ++ + struct v4l2_create_buffers cbuf = { -+ .count = 1, ++ .count = n, + .memory = V4L2_MEMORY_DMABUF, + .format = mbc->dst_fmt, + }; @@ -52084,7 +57368,14 @@ Upstream-status: Pending + return -err; + } + } -+ return cbuf.index; ++ ++ if (cbuf.count != n) ++ 
request_warn(mbc->dc, "%s: Created %d of %d V4L2 buffers requested\n", __func__, cbuf.count, n); ++ ++ for (i = 0; i != cbuf.count; ++i) ++ qes[i]->base.index = cbuf.index + i; ++ ++ return cbuf.count; +} + +struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struct dmabufs_ctl *const dbsc) @@ -52092,27 +57383,29 @@ Upstream-status: Pending + struct qent_dst * be_dst; + + if (mbc == NULL) { -+ be_dst = qe_dst_new(); ++ be_dst = qe_dst_new(NULL); + if (be_dst) + be_dst->base.status = QENT_IMPORT; + return be_dst; + } + -+ be_dst = base_to_dst(queue_tryget_free(mbc->dst)); -+ if (!be_dst) { -+ int index; -+ -+ be_dst = qe_dst_new(); ++ if (mbc->dst_fixed) { ++ be_dst = base_to_dst(queue_get_free(mbc->dst)); + if (!be_dst) + return NULL; ++ } ++ else { ++ be_dst = base_to_dst(queue_tryget_free(mbc->dst)); ++ if (!be_dst) { ++ be_dst = qe_dst_new(mbc->this_wlm); ++ if (!be_dst) ++ return NULL; + -+ if ((be_dst->mbc_wl = ff_weak_link_ref(mbc->this_wlm)) == NULL || -+ (index = create_dst_buf(mbc)) < 0) { -+ qe_dst_free(be_dst); -+ return NULL; ++ if (create_dst_bufs(mbc, 1, &be_dst) != 1) { ++ qe_dst_free(be_dst); ++ return NULL; ++ } + } -+ -+ be_dst->base.index = (uint32_t)index; + } + + if (qe_alloc_from_fmt(&be_dst->base, dbsc, &mbc->dst_fmt)) { @@ -52166,29 +57459,42 @@ Upstream-status: Pending + return status; +} + -+MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, unsigned int n) ++// ** This is a mess if we get partial alloc but without any way to remove ++// individual V4L2 Q members we are somewhat stuffed ++MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed) +{ -+ // **** request buffers + unsigned int i; ++ int a = 0; ++ unsigned int qc; ++ struct qent_dst * qes[32]; + -+ for (i = 0; i != n; ++i) ++ if (n > 32) ++ return MEDIABUFS_ERROR_ALLOCATION_FAILED; ++ ++ // Create qents first as it is hard to get rid of the V4L2 buffers on error ++ for (qc = 0; qc != n; ++qc) + { -+ int index; -+ struct qent_dst * const be_dst = qe_dst_new(); -+ if (!be_dst) -+ return MEDIABUFS_ERROR_OPERATION_FAILED; -+ -+ index = create_dst_buf(mbc); -+ if (index < 0) { -+ qe_dst_free(be_dst); -+ return MEDIABUFS_ERROR_OPERATION_FAILED; -+ } -+ -+ // Add index to free chain -+ be_dst->base.index = (uint32_t)index; -+ queue_put_free(mbc->dst, &be_dst->base); ++ if ((qes[qc] = qe_dst_new(mbc->this_wlm)) == NULL) ++ goto fail; + } ++ ++ if ((a = create_dst_bufs(mbc, n, qes)) < 0) ++ goto fail; ++ ++ for (i = 0; i != a; ++i) ++ queue_put_free(mbc->dst, &qes[i]->base); ++ ++ if (a != n) ++ goto fail; ++ ++ mbc->dst_fixed = fixed; + return MEDIABUFS_STATUS_SUCCESS; ++ ++fail: ++ for (i = (a < 0 ? 0 : a); i != qc; ++i) ++ qe_dst_free(qes[i]); ++ ++ return MEDIABUFS_ERROR_ALLOCATION_FAILED; +} + +struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc) @@ -52446,20 +57752,24 @@ Upstream-status: Pending + mediabufs_ctl_delete(mbc); +} + ++unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc) ++{ ++ return mbc->capability.version; ++} ++ +static int set_capabilities(struct mediabufs_ctl *const mbc) +{ -+ struct v4l2_capability capability = { 0 }; + uint32_t caps; + -+ if (ioctl(mbc->vfd, VIDIOC_QUERYCAP, &capability)) { ++ if (ioctl(mbc->vfd, VIDIOC_QUERYCAP, &mbc->capability)) { + int err = errno; + request_err(mbc->dc, "Failed to get capabilities: %s\n", strerror(err)); + return -err; + } + -+ caps = (capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0 ? 
-+ capability.device_caps : -+ capability.capabilities; ++ caps = (mbc->capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0 ? ++ mbc->capability.device_caps : ++ mbc->capability.capabilities; + + if ((caps & V4L2_CAP_VIDEO_M2M_MPLANE) != 0) { + mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; @@ -52544,7 +57854,7 @@ Upstream-status: Pending + --- /dev/null +++ b/libavcodec/v4l2_req_media.h -@@ -0,0 +1,148 @@ +@@ -0,0 +1,154 @@ +/* +e.h +* @@ -52646,11 +57956,14 @@ Upstream-status: Pending + struct qent_dst *const dst_be, + const bool is_final); +// Get / alloc a dst buffer & associate with a slot -+// * BEWARE * Currently has no alloc limit ++// If the dst pool is empty then behaviour depends on the fixed flag passed to ++// dst_slots_create. Default is !fixed = unlimited alloc +struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, + struct dmabufs_ctl *const dbsc); +// Create dst slots without alloc -+MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, unsigned int n); ++// If fixed true then qent_alloc will only get slots from this pool and will ++// block until a qent has been unrefed ++MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed); + +MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc); +MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc); @@ -52686,6 +57999,9 @@ Upstream-status: Pending + struct dmabufs_ctl * const dbsc, + unsigned int n); + ++#define MEDIABUFS_DRIVER_VERSION(a, b, c) (((a) << 16) | ((b) << 8) | (c)) ++unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc); ++ +struct mediabufs_ctl * mediabufs_ctl_new(void * const dc, + const char *vpath, struct pollqueue *const pq); +void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc); @@ -52695,7 +58011,7 @@ Upstream-status: Pending +#endif --- /dev/null +++ b/libavcodec/v4l2_req_pollqueue.c -@@ -0,0 +1,363 @@ +@@ -0,0 +1,361 @@ +#include +#include +#include @@ -52885,19 +58201,19 @@ Upstream-status: Pending + unsigned int i; + unsigned int n = 0; + struct polltask *pt; ++ struct polltask *pt_next; + uint64_t now = pollqueue_now(0); + int timeout = -1; + int rv; + -+ for (pt = pq->head; pt; pt = pt->next) { ++ for (pt = pq->head; pt; pt = pt_next) { + int64_t t; + ++ pt_next = pt->next; ++ + if (pt->state == POLLTASK_Q_KILL) { -+ struct polltask * const prev = pt->prev; + pollqueue_rem_task(pq, pt); + sem_post(&pt->kill_sem); -+ if ((pt = prev) == NULL) -+ break; + continue; + } + @@ -52936,8 +58252,8 @@ Upstream-status: Pending + * infinite looping + */ + pq->no_prod = true; -+ for (i = 0, pt = pq->head; i < n; ++i) { -+ struct polltask *const pt_next = pt->next; ++ for (i = 0, pt = pq->head; i < n; ++i, pt = pt_next) { ++ pt_next = pt->next; + + /* Pending? */ + if (a[i].revents || @@ -52961,8 +58277,6 @@ Upstream-status: Pending + if (pt->state == POLLTASK_RUN_KILL) + sem_post(&pt->kill_sem); + } -+ -+ pt = pt_next; + } + pq->no_prod = false; + @@ -53082,12 +58396,13 @@ Upstream-status: Pending +#endif /* POLLQUEUE_H_ */ --- /dev/null +++ b/libavcodec/v4l2_req_utils.h -@@ -0,0 +1,21 @@ +@@ -0,0 +1,22 @@ +#include "libavutil/log.h" + +#define request_log(...) av_log(NULL, AV_LOG_INFO, __VA_ARGS__) + +#define request_err(_ctx, ...) av_log(_ctx, AV_LOG_ERROR, __VA_ARGS__) ++#define request_warn(_ctx, ...) av_log(_ctx, AV_LOG_WARNING, __VA_ARGS__) +#define request_info(_ctx, ...) av_log(_ctx, AV_LOG_INFO, __VA_ARGS__) +#define request_debug(_ctx, ...) 
av_log(_ctx, AV_LOG_DEBUG, __VA_ARGS__) + @@ -53106,7 +58421,7 @@ Upstream-status: Pending + --- /dev/null +++ b/libavcodec/v4l2_request_hevc.c -@@ -0,0 +1,280 @@ +@@ -0,0 +1,315 @@ +/* + * This file is part of FFmpeg. + * @@ -53194,6 +58509,13 @@ Upstream-status: Pending + return ctx->fns->frame_params(avctx, hw_frames_ctx); +} + ++static int v4l2_req_hevc_alloc_frame(AVCodecContext * avctx, AVFrame *frame) ++{ ++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; ++ return ctx->fns->alloc_frame(avctx, frame); ++} ++ ++ +static int v4l2_request_hevc_uninit(AVCodecContext *avctx) +{ + V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; @@ -53248,6 +58570,17 @@ Upstream-status: Pending + + av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__); + ++ // Give up immediately if this is something that we have no code to deal with ++ if (h->ps.sps->chroma_format_idc != 1) { ++ av_log(avctx, AV_LOG_WARNING, "chroma_format_idc(%d) != 1: Not implemented\n", h->ps.sps->chroma_format_idc); ++ return AVERROR_PATCHWELCOME; ++ } ++ if (!(h->ps.sps->bit_depth == 10 || h->ps.sps->bit_depth == 8) || ++ h->ps.sps->bit_depth != h->ps.sps->bit_depth_chroma) { ++ av_log(avctx, AV_LOG_WARNING, "Bit depth Y:%d C:%d: Not implemented\n", h->ps.sps->bit_depth, h->ps.sps->bit_depth_chroma); ++ return AVERROR_PATCHWELCOME; ++ } ++ + if ((ret = devscan_build(avctx, &ctx->devscan)) != 0) { + av_log(avctx, AV_LOG_WARNING, "Failed to find any V4L2 devices\n"); + return (AVERROR(-ret)); @@ -53300,7 +58633,15 @@ Upstream-status: Pending + goto fail4; + } + -+ if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) { ++ if (V2(ff_v4l2_req_hevc, 4).probe(avctx, ctx) == 0) { ++ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 4 probed successfully\n"); ++ ctx->fns = &V2(ff_v4l2_req_hevc, 4); ++ } ++ else if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) { ++ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 3 probed successfully\n"); ++ ctx->fns = &V2(ff_v4l2_req_hevc, 3); ++ } ++ else if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) { + av_log(avctx, AV_LOG_DEBUG, "HEVC API version 2 probed successfully\n"); + ctx->fns = &V2(ff_v4l2_req_hevc, 2); + } @@ -53325,9 +58666,18 @@ Upstream-status: Pending + goto fail4; + } + -+ if (mediabufs_dst_slots_create(ctx->mbufs, 1)) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to create destination slots\n"); -+ goto fail4; ++ { ++ unsigned int dst_slots = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering + ++ avctx->thread_count + (avctx->extra_hw_frames > 0 ? 
avctx->extra_hw_frames : 6); ++ av_log(avctx, AV_LOG_DEBUG, "Slots=%d: Reordering=%d, threads=%d, hw+=%d\n", dst_slots, ++ sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering, ++ avctx->thread_count, avctx->extra_hw_frames); ++ ++ // extra_hw_frames is -1 if unset ++ if (mediabufs_dst_slots_create(ctx->mbufs, dst_slots, (avctx->extra_hw_frames > 0))) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to create destination slots\n"); ++ goto fail4; ++ } + } + + if (mediabufs_stream_on(ctx->mbufs)) { @@ -53376,7 +58726,7 @@ Upstream-status: Pending + .type = AVMEDIA_TYPE_VIDEO, + .id = AV_CODEC_ID_HEVC, + .pix_fmt = AV_PIX_FMT_DRM_PRIME, -+// .alloc_frame = v4l2_request_hevc_alloc_frame, ++ .alloc_frame = v4l2_req_hevc_alloc_frame, + .start_frame = v4l2_req_hevc_start_frame, + .decode_slice = v4l2_req_hevc_decode_slice, + .end_frame = v4l2_req_hevc_end_frame, @@ -53389,7 +58739,7 @@ Upstream-status: Pending +}; --- /dev/null +++ b/libavcodec/v4l2_request_hevc.h -@@ -0,0 +1,100 @@ +@@ -0,0 +1,101 @@ +#ifndef AVCODEC_V4L2_REQUEST_HEVC_H +#define AVCODEC_V4L2_REQUEST_HEVC_H + @@ -53437,8 +58787,6 @@ Upstream-status: Pending +#define V4L2_CTRL_FLAG_DYNAMIC_ARRAY 0x0800 +#endif + -+#define MAX_SLICES 128 -+ +#define VCAT(name, version) name##_v##version +#define V2(n,v) VCAT(n, v) +#define V(n) V2(n, HEVC_CTRLS_VERSION) @@ -53455,10 +58803,10 @@ Upstream-status: Pending + + unsigned int timestamp; // ?? maybe uint64_t + -+ int multi_slice; + int decode_mode; + int start_code; -+ int max_slices; ++ unsigned int max_slices; // 0 => not wanted (frame mode) ++ unsigned int max_offsets; // 0 => not wanted + + req_decode_q decode_q; + @@ -53483,16 +58831,121 @@ Upstream-status: Pending + int (*end_frame)(AVCodecContext *avctx); + void (*abort_frame)(AVCodecContext *avctx); + int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx); ++ int (*alloc_frame)(AVCodecContext * avctx, AVFrame *frame); +} v4l2_req_decode_fns; + + +extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 1); +extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 2); ++extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 3); ++extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 4); + +#endif +--- a/libavcodec/vc1dec.c ++++ b/libavcodec/vc1dec.c +@@ -486,7 +486,7 @@ static av_cold int vc1_decode_init(AVCod + size = next - start - 4; + if (size <= 0) + continue; +- buf2_size = vc1_unescape_buffer(start + 4, size, buf2); ++ buf2_size = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2); + init_get_bits(&gb, buf2, buf2_size * 8); + switch (AV_RB32(start)) { + case VC1_CODE_SEQHDR: +@@ -689,7 +689,7 @@ static int vc1_decode_frame(AVCodecConte + case VC1_CODE_FRAME: + if (avctx->hwaccel) + buf_start = start; +- buf_size2 = vc1_unescape_buffer(start + 4, size, buf2); ++ buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2); + break; + case VC1_CODE_FIELD: { + int buf_size3; +@@ -706,8 +706,8 @@ static int vc1_decode_frame(AVCodecConte + ret = AVERROR(ENOMEM); + goto err; + } +- buf_size3 = vc1_unescape_buffer(start + 4, size, +- slices[n_slices].buf); ++ buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, ++ slices[n_slices].buf); + init_get_bits(&slices[n_slices].gb, slices[n_slices].buf, + buf_size3 << 3); + slices[n_slices].mby_start = avctx->coded_height + 31 >> 5; +@@ -718,7 +718,7 @@ static int vc1_decode_frame(AVCodecConte + break; + } + case VC1_CODE_ENTRYPOINT: /* it should be before frame data */ +- buf_size2 = vc1_unescape_buffer(start + 4, size, buf2); ++ buf_size2 = 
v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2); + init_get_bits(&s->gb, buf2, buf_size2 * 8); + ff_vc1_decode_entry_point(avctx, v, &s->gb); + break; +@@ -735,8 +735,8 @@ static int vc1_decode_frame(AVCodecConte + ret = AVERROR(ENOMEM); + goto err; + } +- buf_size3 = vc1_unescape_buffer(start + 4, size, +- slices[n_slices].buf); ++ buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, ++ slices[n_slices].buf); + init_get_bits(&slices[n_slices].gb, slices[n_slices].buf, + buf_size3 << 3); + slices[n_slices].mby_start = get_bits(&slices[n_slices].gb, 9); +@@ -770,7 +770,7 @@ static int vc1_decode_frame(AVCodecConte + ret = AVERROR(ENOMEM); + goto err; + } +- buf_size3 = vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf); ++ buf_size3 = v->vc1dsp.vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf); + init_get_bits(&slices[n_slices].gb, slices[n_slices].buf, + buf_size3 << 3); + slices[n_slices].mby_start = s->mb_height + 1 >> 1; +@@ -779,9 +779,9 @@ static int vc1_decode_frame(AVCodecConte + n_slices1 = n_slices - 1; + n_slices++; + } +- buf_size2 = vc1_unescape_buffer(buf, divider - buf, buf2); ++ buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, divider - buf, buf2); + } else { +- buf_size2 = vc1_unescape_buffer(buf, buf_size, buf2); ++ buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, buf_size, buf2); + } + init_get_bits(&s->gb, buf2, buf_size2*8); + } else +--- a/libavcodec/vc1dsp.c ++++ b/libavcodec/vc1dsp.c +@@ -32,6 +32,7 @@ + #include "rnd_avg.h" + #include "vc1dsp.h" + #include "startcode.h" ++#include "vc1_common.h" + + /* Apply overlap transform to horizontal edge */ + static void vc1_v_overlap_c(uint8_t *src, int stride) +@@ -1028,6 +1029,7 @@ av_cold void ff_vc1dsp_init(VC1DSPContex + #endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */ + + dsp->startcode_find_candidate = ff_startcode_find_candidate_c; ++ dsp->vc1_unescape_buffer = vc1_unescape_buffer; + + if (ARCH_AARCH64) + ff_vc1dsp_init_aarch64(dsp); +--- a/libavcodec/vc1dsp.h ++++ b/libavcodec/vc1dsp.h +@@ -80,6 +80,9 @@ typedef struct VC1DSPContext { + * one or more further zero bytes and a one byte. 
+ */ + int (*startcode_find_candidate)(const uint8_t *buf, int size); ++ ++ /* Copy a buffer, removing startcode emulation escape bytes as we go */ ++ int (*vc1_unescape_buffer)(const uint8_t *src, int size, uint8_t *dst); + } VC1DSPContext; + + void ff_vc1dsp_init(VC1DSPContext* c); --- /dev/null +++ b/libavcodec/weak_link.c -@@ -0,0 +1,100 @@ +@@ -0,0 +1,102 @@ +#include +#include +#include @@ -53549,6 +59002,8 @@ Upstream-status: Pending + +struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w) +{ ++ if (!w) ++ return NULL; + atomic_fetch_add(&w->ref_count, 1); + return (struct ff_weak_link_client*)w; +} @@ -53630,7 +59085,7 @@ Upstream-status: Pending +OBJS-$(CONFIG_VOUT_RPI_OUTDEV) += rpi_vout.o OBJS-$(CONFIG_XCBGRAB_INDEV) += xcbgrab.o OBJS-$(CONFIG_XV_OUTDEV) += xv.o - + --- a/libavdevice/alldevices.c +++ b/libavdevice/alldevices.c @@ -52,6 +52,9 @@ extern AVOutputFormat ff_sndio_muxer; @@ -53642,7 +59097,7 @@ Upstream-status: Pending +extern AVOutputFormat ff_vout_rpi_muxer; extern AVInputFormat ff_xcbgrab_demuxer; extern AVOutputFormat ff_xv_muxer; - + --- /dev/null +++ b/libavdevice/drm_vout.c @@ -0,0 +1,643 @@ @@ -53856,7 +59311,7 @@ Upstream-status: Pending + + while (drmWaitVBlank(de->drm_fd, &vbl)) { + if (errno != EINTR) { -+ av_log(s, AV_LOG_WARNING, "drmWaitVBlank failed: %s\n", ERRSTR); ++// av_log(s, AV_LOG_WARNING, "drmWaitVBlank failed: %s\n", ERRSTR); + break; + } + } @@ -54291,7 +59746,7 @@ Upstream-status: Pending + --- /dev/null +++ b/libavdevice/egl_vout.c -@@ -0,0 +1,825 @@ +@@ -0,0 +1,816 @@ +/* + * Copyright (c) 2020 John Cox for Raspberry Pi Trading + * @@ -54334,16 +59789,8 @@ Upstream-status: Pending +#include +#include + -+#include "drm_fourcc.h" -+#include -+#include -+#include -+#include +#include +#include -+#include -+#include -+#include + +#include "libavutil/rpi_sand_fns.h" + @@ -54555,8 +60002,7 @@ Upstream-status: Pending + XMapWindow(dpy, win); + + { -+ EGLSurface surf = eglCreateWindowSurface(egl_dpy, config, -+ (void *)(uintptr_t)win, NULL); ++ EGLSurface surf = eglCreateWindowSurface(egl_dpy, config, (EGLNativeWindowType)win, NULL); + if (!surf) { + av_log(s, AV_LOG_ERROR, "Error: eglCreateWindowSurface failed\n"); + return -1; @@ -55656,7 +61102,15 @@ Upstream-status: Pending +}; --- a/libavfilter/Makefile +++ b/libavfilter/Makefile -@@ -434,6 +434,7 @@ OBJS-$(CONFIG_TRANSPOSE_OPENCL_FILTER) +@@ -218,6 +218,7 @@ OBJS-$(CONFIG_DEFLATE_FILTER) + OBJS-$(CONFIG_DEFLICKER_FILTER) += vf_deflicker.o + OBJS-$(CONFIG_DEINTERLACE_QSV_FILTER) += vf_deinterlace_qsv.o + OBJS-$(CONFIG_DEINTERLACE_VAAPI_FILTER) += vf_deinterlace_vaapi.o vaapi_vpp.o ++OBJS-$(CONFIG_DEINTERLACE_V4L2M2M_FILTER) += vf_deinterlace_v4l2m2m.o + OBJS-$(CONFIG_DEJUDDER_FILTER) += vf_dejudder.o + OBJS-$(CONFIG_DELOGO_FILTER) += vf_delogo.o + OBJS-$(CONFIG_DENOISE_VAAPI_FILTER) += vf_misc_vaapi.o vaapi_vpp.o +@@ -434,6 +435,7 @@ OBJS-$(CONFIG_TRANSPOSE_OPENCL_FILTER) OBJS-$(CONFIG_TRANSPOSE_VAAPI_FILTER) += vf_transpose_vaapi.o vaapi_vpp.o OBJS-$(CONFIG_TRIM_FILTER) += trim.o OBJS-$(CONFIG_UNPREMULTIPLY_FILTER) += vf_premultiply.o framesync.o @@ -55666,7 +61120,15 @@ Upstream-status: Pending opencl/unsharp.o --- a/libavfilter/allfilters.c +++ b/libavfilter/allfilters.c -@@ -414,6 +414,7 @@ extern AVFilter ff_vf_transpose_opencl; +@@ -204,6 +204,7 @@ extern AVFilter ff_vf_dedot; + extern AVFilter ff_vf_deflate; + extern AVFilter ff_vf_deflicker; + extern AVFilter ff_vf_deinterlace_qsv; ++extern AVFilter ff_vf_deinterlace_v4l2m2m; + extern AVFilter 
ff_vf_deinterlace_vaapi; + extern AVFilter ff_vf_dejudder; + extern AVFilter ff_vf_delogo; +@@ -414,6 +415,7 @@ extern AVFilter ff_vf_transpose_opencl; extern AVFilter ff_vf_transpose_vaapi; extern AVFilter ff_vf_trim; extern AVFilter ff_vf_unpremultiply; @@ -55683,13 +61145,13 @@ Upstream-status: Pending +#if CONFIG_UNSAND_FILTER +#include "libavutil/rpi_sand_fns.h" +#endif - + #define FF_INTERNAL_FIELDS 1 #include "framequeue.h" @@ -427,6 +430,19 @@ static int can_merge_formats(AVFilterFor } } - + +#if CONFIG_UNSAND_FILTER +static int has_sand_format(const AVFilterFormats * const ff) +{ @@ -55711,13 +61173,13 @@ Upstream-status: Pending AVFilterLink *link = filter->inputs[j]; int convert_needed = 0; + unsigned int extra_convert_tried = 0; - + if (!link) continue; @@ -514,11 +531,14 @@ static int query_formats(AVFilterGraph * ) #undef MERGE_DISPATCH - + - if (convert_needed) { + while (convert_needed) { AVFilterContext *convert; @@ -55727,7 +61189,7 @@ Upstream-status: Pending + int can_retry = 0; + + convert_needed = 0; - + if (graph->disable_auto_convert) { av_log(log_ctx, AV_LOG_ERROR, @@ -531,19 +551,45 @@ static int query_formats(AVFilterGraph * @@ -55760,7 +61222,7 @@ Upstream-status: Pending + inst_name, "", NULL, + graph)) < 0) + return ret; - + - if ((ret = avfilter_graph_create_filter(&convert, filter, - inst_name, graph->scale_sws_opts, NULL, - graph)) < 0) @@ -55813,7 +61275,7 @@ Upstream-status: Pending --- a/libavfilter/buffersrc.c +++ b/libavfilter/buffersrc.c @@ -210,7 +210,7 @@ static int av_buffersrc_add_frame_intern - + switch (ctx->outputs[0]->type) { case AVMEDIA_TYPE_VIDEO: - CHECK_VIDEO_PARAM_CHANGE(ctx, s, frame->width, frame->height, @@ -55822,6 +61284,1345 @@ Upstream-status: Pending break; case AVMEDIA_TYPE_AUDIO: --- /dev/null ++++ b/libavfilter/vf_deinterlace_v4l2m2m.c +@@ -0,0 +1,1336 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/** ++ * @file ++ * deinterlace video filter - V4L2 M2M ++ */ ++ ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "libavutil/avassert.h" ++#include "libavutil/avstring.h" ++#include "libavutil/common.h" ++#include "libavutil/hwcontext.h" ++#include "libavutil/hwcontext_drm.h" ++#include "libavutil/internal.h" ++#include "libavutil/mathematics.h" ++#include "libavutil/opt.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/time.h" ++ ++#define FF_INTERNAL_FIELDS 1 ++#include "framequeue.h" ++#include "filters.h" ++#include "avfilter.h" ++#include "formats.h" ++#include "internal.h" ++#include "video.h" ++ ++typedef struct V4L2Queue V4L2Queue; ++typedef struct DeintV4L2M2MContextShared DeintV4L2M2MContextShared; ++ ++typedef struct V4L2PlaneInfo { ++ int bytesperline; ++ size_t length; ++} V4L2PlaneInfo; ++ ++typedef struct V4L2Buffer { ++ int enqueued; ++ int reenqueue; ++ int fd; ++ struct v4l2_buffer buffer; ++ AVFrame frame; ++ struct v4l2_plane planes[VIDEO_MAX_PLANES]; ++ int num_planes; ++ V4L2PlaneInfo plane_info[VIDEO_MAX_PLANES]; ++ AVDRMFrameDescriptor drm_frame; ++ V4L2Queue *q; ++} V4L2Buffer; ++ ++typedef struct V4L2Queue { ++ struct v4l2_format format; ++ int num_buffers; ++ V4L2Buffer *buffers; ++ DeintV4L2M2MContextShared *ctx; ++} V4L2Queue; ++ ++typedef struct pts_stats_s ++{ ++ void * logctx; ++ const char * name; // For debug ++ unsigned int last_count; ++ unsigned int last_interval; ++ int64_t last_pts; ++} pts_stats_t; ++ ++#define PTS_TRACK_SIZE 32 ++typedef struct pts_track_el_s ++{ ++ uint32_t n; ++ unsigned int interval; ++ AVFrame * props; ++} pts_track_el_t; ++ ++typedef struct pts_track_s ++{ ++ uint32_t n; ++ uint32_t last_n; ++ int got_2; ++ void * logctx; ++ pts_stats_t stats; ++ pts_track_el_t a[PTS_TRACK_SIZE]; ++} pts_track_t; ++ ++typedef struct DeintV4L2M2MContextShared { ++ void * logctx; // For logging - will be NULL when done ++ ++ int fd; ++ int done; ++ int width; ++ int height; ++ int orig_width; ++ int orig_height; ++ atomic_uint refcount; ++ ++ AVBufferRef *hw_frames_ctx; ++ ++ unsigned int field_order; ++ ++ pts_track_t track; ++ ++ V4L2Queue output; ++ V4L2Queue capture; ++} DeintV4L2M2MContextShared; ++ ++typedef struct DeintV4L2M2MContext { ++ const AVClass *class; ++ ++ DeintV4L2M2MContextShared *shared; ++} DeintV4L2M2MContext; ++ ++static unsigned int pts_stats_interval(const pts_stats_t * const stats) ++{ ++ return stats->last_interval; ++} ++ ++// Pick 64 for max last count - that is >1sec at 60fps ++#define STATS_LAST_COUNT_MAX 64 ++#define STATS_INTERVAL_MAX (1 << 30) ++static void pts_stats_add(pts_stats_t * const stats, int64_t pts) ++{ ++ if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) { ++ if (stats->last_count < STATS_LAST_COUNT_MAX) ++ ++stats->last_count; ++ return; ++ } ++ ++ if (stats->last_pts != AV_NOPTS_VALUE) { ++ const int64_t interval = pts - stats->last_pts; ++ ++ if (interval < 0 || interval >= STATS_INTERVAL_MAX || ++ stats->last_count >= STATS_LAST_COUNT_MAX) { ++ if (stats->last_interval != 0) ++ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n", ++ __func__, stats->name, interval, stats->last_count); ++ stats->last_interval = 0; ++ } ++ else { ++ const 
int64_t frame_time = interval / (int64_t)stats->last_count; ++ ++ if (frame_time != stats->last_interval) ++ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n", ++ __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time); ++ stats->last_interval = frame_time; ++ } ++ } ++ ++ stats->last_pts = pts; ++ stats->last_count = 1; ++} ++ ++static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name) ++{ ++ *stats = (pts_stats_t){ ++ .logctx = logctx, ++ .name = name, ++ .last_count = 1, ++ .last_interval = 0, ++ .last_pts = AV_NOPTS_VALUE ++ }; ++} ++ ++static inline uint32_t pts_track_next_n(pts_track_t * const trk) ++{ ++ if (++trk->n == 0) ++ trk->n = 1; ++ return trk->n; ++} ++ ++static int pts_track_get_frame(pts_track_t * const trk, const struct timeval tv, AVFrame * const dst) ++{ ++ uint32_t n = (uint32_t)(tv.tv_usec / 2 + tv.tv_sec * 500000); ++ pts_track_el_t * t; ++ ++ // As a first guess assume that n==0 means last frame ++ if (n == 0) { ++ n = trk->last_n; ++ if (n == 0) ++ goto fail; ++ } ++ ++ t = trk->a + (n & (PTS_TRACK_SIZE - 1)); ++ ++ if (t->n != n) { ++ av_log(trk->logctx, AV_LOG_ERROR, "%s: track failure: got %u, expected %u\n", __func__, n, trk->n); ++ goto fail; ++ } ++ ++ // 1st frame is simple - just believe it ++ if (n != trk->last_n) { ++ trk->last_n = n; ++ trk->got_2 = 0; ++ return av_frame_copy_props(dst, t->props); ++ } ++ ++ // Only believe in a single interpolated frame ++ if (trk->got_2) ++ goto fail; ++ trk->got_2 = 1; ++ ++ av_frame_copy_props(dst, t->props); ++ ++ ++ // If we can't guess - don't ++ if (t->interval == 0) { ++ dst->best_effort_timestamp = AV_NOPTS_VALUE; ++ dst->pts = AV_NOPTS_VALUE; ++ dst->pkt_dts = AV_NOPTS_VALUE; ++ } ++ else { ++ if (dst->best_effort_timestamp != AV_NOPTS_VALUE) ++ dst->best_effort_timestamp += t->interval / 2; ++ if (dst->pts != AV_NOPTS_VALUE) ++ dst->pts += t->interval / 2; ++ if (dst->pkt_dts != AV_NOPTS_VALUE) ++ dst->pkt_dts += t->interval / 2; ++ } ++ ++ return 0; ++ ++fail: ++ trk->last_n = 0; ++ trk->got_2 = 0; ++ dst->pts = AV_NOPTS_VALUE; ++ dst->pkt_dts = AV_NOPTS_VALUE; ++ return 0; ++} ++ ++static struct timeval pts_track_add_frame(pts_track_t * const trk, const AVFrame * const src) ++{ ++ const uint32_t n = pts_track_next_n(trk); ++ pts_track_el_t * const t = trk->a + (n & (PTS_TRACK_SIZE - 1)); ++ ++ pts_stats_add(&trk->stats, src->pts); ++ ++ t->n = n; ++ t->interval = pts_stats_interval(&trk->stats); // guess that next interval is the same as the last ++ av_frame_unref(t->props); ++ av_frame_copy_props(t->props, src); ++ ++ // We now know what the previous interval was, rather than having to guess, ++ // so set it. There is a better than decent chance that this is before ++ // we use it. 
++ if (t->interval != 0) { ++ pts_track_el_t * const prev_t = trk->a + ((n - 1) & (PTS_TRACK_SIZE - 1)); ++ prev_t->interval = t->interval; ++ } ++ ++ // In case deinterlace interpolates frames use every other usec ++ return (struct timeval){.tv_sec = n / 500000, .tv_usec = (n % 500000) * 2}; ++} ++ ++static void pts_track_uninit(pts_track_t * const trk) ++{ ++ unsigned int i; ++ for (i = 0; i != PTS_TRACK_SIZE; ++i) { ++ trk->a[i].n = 0; ++ av_frame_free(&trk->a[i].props); ++ } ++} ++ ++static int pts_track_init(pts_track_t * const trk, void *logctx) ++{ ++ unsigned int i; ++ trk->n = 1; ++ pts_stats_init(&trk->stats, logctx, "track"); ++ for (i = 0; i != PTS_TRACK_SIZE; ++i) { ++ trk->a[i].n = 0; ++ if ((trk->a[i].props = av_frame_alloc()) == NULL) { ++ pts_track_uninit(trk); ++ return AVERROR(ENOMEM); ++ } ++ } ++ return 0; ++} ++ ++static int deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx) ++{ ++ struct v4l2_capability cap; ++ int ret; ++ ++ memset(&cap, 0, sizeof(cap)); ++ ret = ioctl(ctx->fd, VIDIOC_QUERYCAP, &cap); ++ if (ret < 0) ++ return ret; ++ ++ if (!(cap.capabilities & V4L2_CAP_STREAMING)) ++ return AVERROR(EINVAL); ++ ++ if (cap.capabilities & V4L2_CAP_VIDEO_M2M) { ++ ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; ++ ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT; ++ ++ return 0; ++ } ++ ++ if (cap.capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) { ++ ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; ++ ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; ++ ++ return 0; ++ } ++ ++ return AVERROR(EINVAL); ++} ++ ++static int deint_v4l2m2m_try_format(V4L2Queue *queue) ++{ ++ struct v4l2_format *fmt = &queue->format; ++ DeintV4L2M2MContextShared *ctx = queue->ctx; ++ int ret, field; ++ ++ ret = ioctl(ctx->fd, VIDIOC_G_FMT, fmt); ++ if (ret) ++ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_FMT failed: %d\n", ret); ++ ++ if (V4L2_TYPE_IS_OUTPUT(fmt->type)) ++ field = V4L2_FIELD_INTERLACED_TB; ++ else ++ field = V4L2_FIELD_NONE; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { ++ fmt->fmt.pix_mp.pixelformat = V4L2_PIX_FMT_YUV420; ++ fmt->fmt.pix_mp.field = field; ++ fmt->fmt.pix_mp.width = ctx->width; ++ fmt->fmt.pix_mp.height = ctx->height; ++ } else { ++ fmt->fmt.pix.pixelformat = V4L2_PIX_FMT_YUV420; ++ fmt->fmt.pix.field = field; ++ fmt->fmt.pix.width = ctx->width; ++ fmt->fmt.pix.height = ctx->height; ++ } ++ ++ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u pre\n", __func__, ++ fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height, ++ fmt->fmt.pix_mp.pixelformat, ++ fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline); ++ ++ ret = ioctl(ctx->fd, VIDIOC_TRY_FMT, fmt); ++ if (ret) ++ return AVERROR(EINVAL); ++ ++ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u post\n", __func__, ++ fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height, ++ fmt->fmt.pix_mp.pixelformat, ++ fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline); ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { ++ if ((fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_YUV420 && ++ fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_NV12) || ++ fmt->fmt.pix_mp.field != field) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); ++ ++ return AVERROR(EINVAL); ++ } ++ } else { ++ if ((fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_YUV420 && ++ fmt->fmt.pix.pixelformat != 
V4L2_PIX_FMT_NV12) || ++ fmt->fmt.pix.field != field) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); ++ ++ return AVERROR(EINVAL); ++ } ++ } ++ ++ return 0; ++} ++ ++static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t pixelformat, uint32_t field, int width, int height, int pitch, int ysize) ++{ ++ struct v4l2_format *fmt = &queue->format; ++ DeintV4L2M2MContextShared *ctx = queue->ctx; ++ int ret; ++ ++ struct v4l2_selection sel = { ++ .type = fmt->type, ++ .target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? V4L2_SEL_TGT_CROP_BOUNDS : V4L2_SEL_TGT_COMPOSE_BOUNDS, ++ }; ++ ++ // This works for most single object 4:2:0 types ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { ++ fmt->fmt.pix_mp.pixelformat = pixelformat; ++ fmt->fmt.pix_mp.field = field; ++ fmt->fmt.pix_mp.width = width; ++ fmt->fmt.pix_mp.height = ysize / pitch; ++ fmt->fmt.pix_mp.plane_fmt[0].bytesperline = pitch; ++ fmt->fmt.pix_mp.plane_fmt[0].sizeimage = ysize + (ysize >> 1); ++ } else { ++ fmt->fmt.pix.pixelformat = pixelformat; ++ fmt->fmt.pix.field = field; ++ fmt->fmt.pix.width = width; ++ fmt->fmt.pix.height = height; ++ fmt->fmt.pix.sizeimage = 0; ++ fmt->fmt.pix.bytesperline = 0; ++ } ++ ++ ret = ioctl(ctx->fd, VIDIOC_S_FMT, fmt); ++ if (ret) { ++ ret = AVERROR(errno); ++ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %d\n", ret); ++ return ret; ++ } ++ ++ if (pixelformat != fmt->fmt.pix.pixelformat) { ++ av_log(ctx->logctx, AV_LOG_ERROR, "Format not supported: %s; S_FMT returned %s\n", av_fourcc2str(pixelformat), av_fourcc2str(fmt->fmt.pix.pixelformat)); ++ return AVERROR(EINVAL); ++ } ++ ++ ret = ioctl(ctx->fd, VIDIOC_G_SELECTION, &sel); ++ if (ret) { ++ ret = AVERROR(errno); ++ av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION failed: %d\n", ret); ++ } ++ ++ sel.r.width = width; ++ sel.r.height = height; ++ sel.r.left = 0; ++ sel.r.top = 0; ++ sel.target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? 
V4L2_SEL_TGT_CROP : V4L2_SEL_TGT_COMPOSE, ++ sel.flags = V4L2_SEL_FLAG_LE; ++ ++ ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &sel); ++ if (ret) { ++ ret = AVERROR(errno); ++ av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_S_SELECTION failed: %d\n", ret); ++ } ++ ++ return 0; ++} ++ ++static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node) ++{ ++ int ret; ++ ++ ctx->fd = open(node, O_RDWR | O_NONBLOCK, 0); ++ if (ctx->fd < 0) ++ return AVERROR(errno); ++ ++ ret = deint_v4l2m2m_prepare_context(ctx); ++ if (ret) ++ goto fail; ++ ++ ret = deint_v4l2m2m_try_format(&ctx->capture); ++ if (ret) ++ goto fail; ++ ++ ret = deint_v4l2m2m_try_format(&ctx->output); ++ if (ret) ++ goto fail; ++ ++ return 0; ++ ++fail: ++ close(ctx->fd); ++ ctx->fd = -1; ++ ++ return ret; ++} ++ ++static int deint_v4l2m2m_find_device(DeintV4L2M2MContextShared *ctx) ++{ ++ int ret = AVERROR(EINVAL); ++ struct dirent *entry; ++ char node[PATH_MAX]; ++ DIR *dirp; ++ ++ dirp = opendir("/dev"); ++ if (!dirp) ++ return AVERROR(errno); ++ ++ for (entry = readdir(dirp); entry; entry = readdir(dirp)) { ++ ++ if (strncmp(entry->d_name, "video", 5)) ++ continue; ++ ++ snprintf(node, sizeof(node), "/dev/%s", entry->d_name); ++ av_log(ctx->logctx, AV_LOG_DEBUG, "probing device %s\n", node); ++ ret = deint_v4l2m2m_probe_device(ctx, node); ++ if (!ret) ++ break; ++ } ++ ++ closedir(dirp); ++ ++ if (ret) { ++ av_log(ctx->logctx, AV_LOG_ERROR, "Could not find a valid device\n"); ++ ctx->fd = -1; ++ ++ return ret; ++ } ++ ++ av_log(ctx->logctx, AV_LOG_INFO, "Using device %s\n", node); ++ ++ return 0; ++} ++ ++static int deint_v4l2m2m_enqueue_buffer(V4L2Buffer *buf) ++{ ++ int ret; ++ ++ ret = ioctl(buf->q->ctx->fd, VIDIOC_QBUF, &buf->buffer); ++ if (ret < 0) ++ return AVERROR(errno); ++ ++ buf->enqueued = 1; ++ ++ return 0; ++} ++ ++static int v4l2_buffer_export_drm(V4L2Buffer* avbuf, const uint32_t pixelformat) ++{ ++ struct v4l2_exportbuffer expbuf; ++ int i, ret; ++ uint64_t mod = DRM_FORMAT_MOD_LINEAR; ++ uint32_t fmt = 0; ++ ++ switch (pixelformat) { ++ case V4L2_PIX_FMT_NV12: ++ fmt = DRM_FORMAT_NV12; ++ break; ++ case V4L2_PIX_FMT_YUV420: ++ fmt = DRM_FORMAT_YUV420; ++ break; ++ default: ++ return AVERROR(EINVAL); ++ } ++ ++ avbuf->drm_frame.layers[0].format = fmt; ++ ++ for (i = 0; i < avbuf->num_planes; i++) { ++ memset(&expbuf, 0, sizeof(expbuf)); ++ ++ expbuf.index = avbuf->buffer.index; ++ expbuf.type = avbuf->buffer.type; ++ expbuf.plane = i; ++ ++ ret = ioctl(avbuf->q->ctx->fd, VIDIOC_EXPBUF, &expbuf); ++ if (ret < 0) ++ return AVERROR(errno); ++ ++ avbuf->fd = expbuf.fd; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buffer.type)) { ++ /* drm frame */ ++ avbuf->drm_frame.objects[i].size = avbuf->buffer.m.planes[i].length; ++ avbuf->drm_frame.objects[i].fd = expbuf.fd; ++ avbuf->drm_frame.objects[i].format_modifier = mod; ++ } else { ++ /* drm frame */ ++ avbuf->drm_frame.objects[0].size = avbuf->buffer.length; ++ avbuf->drm_frame.objects[0].fd = expbuf.fd; ++ avbuf->drm_frame.objects[0].format_modifier = mod; ++ } ++ } ++ ++ return 0; ++} ++ ++static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue) ++{ ++ struct v4l2_format *fmt = &queue->format; ++ DeintV4L2M2MContextShared *ctx = queue->ctx; ++ struct v4l2_requestbuffers req; ++ int ret, i, j, multiplanar; ++ uint32_t memory; ++ ++ memory = V4L2_TYPE_IS_OUTPUT(fmt->type) ? 
++ V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP; ++ ++ multiplanar = V4L2_TYPE_IS_MULTIPLANAR(fmt->type); ++ ++ memset(&req, 0, sizeof(req)); ++ req.count = queue->num_buffers; ++ req.memory = memory; ++ req.type = fmt->type; ++ ++ ret = ioctl(ctx->fd, VIDIOC_REQBUFS, &req); ++ if (ret < 0) { ++ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_REQBUFS failed: %s\n", strerror(errno)); ++ ++ return AVERROR(errno); ++ } ++ ++ queue->num_buffers = req.count; ++ queue->buffers = av_mallocz(queue->num_buffers * sizeof(V4L2Buffer)); ++ if (!queue->buffers) { ++ av_log(ctx->logctx, AV_LOG_ERROR, "malloc enomem\n"); ++ ++ return AVERROR(ENOMEM); ++ } ++ ++ for (i = 0; i < queue->num_buffers; i++) { ++ V4L2Buffer *buf = &queue->buffers[i]; ++ ++ buf->enqueued = 0; ++ buf->fd = -1; ++ buf->q = queue; ++ ++ buf->buffer.type = fmt->type; ++ buf->buffer.memory = memory; ++ buf->buffer.index = i; ++ ++ if (multiplanar) { ++ buf->buffer.length = VIDEO_MAX_PLANES; ++ buf->buffer.m.planes = buf->planes; ++ } ++ ++ ret = ioctl(ctx->fd, VIDIOC_QUERYBUF, &buf->buffer); ++ if (ret < 0) { ++ ret = AVERROR(errno); ++ ++ goto fail; ++ } ++ ++ if (multiplanar) ++ buf->num_planes = buf->buffer.length; ++ else ++ buf->num_planes = 1; ++ ++ for (j = 0; j < buf->num_planes; j++) { ++ V4L2PlaneInfo *info = &buf->plane_info[j]; ++ ++ if (multiplanar) { ++ info->bytesperline = fmt->fmt.pix_mp.plane_fmt[j].bytesperline; ++ info->length = buf->buffer.m.planes[j].length; ++ } else { ++ info->bytesperline = fmt->fmt.pix.bytesperline; ++ info->length = buf->buffer.length; ++ } ++ } ++ ++ if (!V4L2_TYPE_IS_OUTPUT(fmt->type)) { ++ ret = deint_v4l2m2m_enqueue_buffer(buf); ++ if (ret) ++ goto fail; ++ ++ ret = v4l2_buffer_export_drm(buf, multiplanar ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat); ++ if (ret) ++ goto fail; ++ } ++ } ++ ++ return 0; ++ ++fail: ++ for (i = 0; i < queue->num_buffers; i++) ++ if (queue->buffers[i].fd >= 0) ++ close(queue->buffers[i].fd); ++ av_free(queue->buffers); ++ queue->buffers = NULL; ++ ++ return ret; ++} ++ ++static int deint_v4l2m2m_streamon(V4L2Queue *queue) ++{ ++ DeintV4L2M2MContextShared * const ctx = queue->ctx; ++ int type = queue->format.type; ++ int ret; ++ ++ ret = ioctl(ctx->fd, VIDIOC_STREAMON, &type); ++ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, AVERROR(errno)); ++ if (ret < 0) ++ return AVERROR(errno); ++ ++ return 0; ++} ++ ++static int deint_v4l2m2m_streamoff(V4L2Queue *queue) ++{ ++ DeintV4L2M2MContextShared * const ctx = queue->ctx; ++ int type = queue->format.type; ++ int ret; ++ ++ ret = ioctl(ctx->fd, VIDIOC_STREAMOFF, &type); ++ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, AVERROR(errno)); ++ if (ret < 0) ++ return AVERROR(errno); ++ ++ return 0; ++} ++ ++// timeout in ms ++static V4L2Buffer* deint_v4l2m2m_dequeue_buffer(V4L2Queue *queue, int timeout) ++{ ++ struct v4l2_plane planes[VIDEO_MAX_PLANES]; ++ DeintV4L2M2MContextShared *ctx = queue->ctx; ++ struct v4l2_buffer buf = { 0 }; ++ V4L2Buffer* avbuf = NULL; ++ struct pollfd pfd; ++ short events; ++ int ret; ++ ++ if (V4L2_TYPE_IS_OUTPUT(queue->format.type)) ++ events = POLLOUT | POLLWRNORM; ++ else ++ events = POLLIN | POLLRDNORM; ++ ++ pfd.events = events; ++ pfd.fd = ctx->fd; ++ ++ for (;;) { ++ ret = poll(&pfd, 1, timeout); ++ if (ret > 0) ++ break; ++ if (errno == EINTR) ++ continue; ++ return NULL; ++ } ++ ++ if (pfd.revents & POLLERR) ++ return NULL; ++ ++ if (pfd.revents & events) { ++ memset(&buf, 0, sizeof(buf)); ++ 
buf.memory = V4L2_MEMORY_MMAP; ++ buf.type = queue->format.type; ++ if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) { ++ memset(planes, 0, sizeof(planes)); ++ buf.length = VIDEO_MAX_PLANES; ++ buf.m.planes = planes; ++ } ++ ++ ret = ioctl(ctx->fd, VIDIOC_DQBUF, &buf); ++ if (ret) { ++ if (errno != EAGAIN) ++ av_log(ctx->logctx, AV_LOG_DEBUG, "VIDIOC_DQBUF, errno (%s)\n", ++ av_err2str(AVERROR(errno))); ++ return NULL; ++ } ++ ++ avbuf = &queue->buffers[buf.index]; ++ avbuf->enqueued = 0; ++ avbuf->buffer = buf; ++ if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) { ++ memcpy(avbuf->planes, planes, sizeof(planes)); ++ avbuf->buffer.m.planes = avbuf->planes; ++ } ++ return avbuf; ++ } ++ ++ return NULL; ++} ++ ++static V4L2Buffer *deint_v4l2m2m_find_free_buf(V4L2Queue *queue) ++{ ++ int i; ++ V4L2Buffer *buf = NULL; ++ ++ for (i = 0; i < queue->num_buffers; i++) ++ if (!queue->buffers[i].enqueued) { ++ buf = &queue->buffers[i]; ++ break; ++ } ++ return buf; ++} ++ ++static void deint_v4l2m2m_unref_queued(V4L2Queue *queue) ++{ ++ int i; ++ V4L2Buffer *buf = NULL; ++ ++ if (!queue || !queue->buffers) ++ return; ++ for (i = 0; i < queue->num_buffers; i++) { ++ buf = &queue->buffers[i]; ++ if (queue->buffers[i].enqueued) ++ av_frame_unref(&buf->frame); ++ } ++} ++ ++static void recycle_q(V4L2Queue * const queue) ++{ ++ V4L2Buffer* avbuf; ++ while (avbuf = deint_v4l2m2m_dequeue_buffer(queue, 0), avbuf) { ++ av_frame_unref(&avbuf->frame); ++ } ++} ++ ++static int count_enqueued(V4L2Queue *queue) ++{ ++ int i; ++ int n = 0; ++ ++ if (queue->buffers == NULL) ++ return 0; ++ ++ for (i = 0; i < queue->num_buffers; i++) ++ if (queue->buffers[i].enqueued) ++ ++n; ++ return n; ++} ++ ++static int deint_v4l2m2m_enqueue_frame(V4L2Queue * const queue, AVFrame * const frame) ++{ ++ DeintV4L2M2MContextShared *const ctx = queue->ctx; ++ AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)frame->data[0]; ++ V4L2Buffer *buf; ++ int i; ++ ++ if (V4L2_TYPE_IS_OUTPUT(queue->format.type)) ++ recycle_q(queue); ++ ++ buf = deint_v4l2m2m_find_free_buf(queue); ++ if (!buf) { ++ av_log(ctx->logctx, AV_LOG_ERROR, "%s: error %d finding free buf\n", __func__, 0); ++ return AVERROR(EAGAIN); ++ } ++ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type)) ++ for (i = 0; i < drm_desc->nb_objects; i++) ++ buf->buffer.m.planes[i].m.fd = drm_desc->objects[i].fd; ++ else ++ buf->buffer.m.fd = drm_desc->objects[0].fd; ++ ++ buf->buffer.field = !frame->interlaced_frame ? V4L2_FIELD_NONE : ++ frame->top_field_first ? 
V4L2_FIELD_INTERLACED_TB : ++ V4L2_FIELD_INTERLACED_BT; ++ ++ if (ctx->field_order != buf->buffer.field) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Field changed: %d->%d\n", __func__, ctx->field_order, buf->buffer.field); ++ ctx->field_order = buf->buffer.field; ++ } ++ ++ buf->buffer.timestamp = pts_track_add_frame(&ctx->track, frame); ++ ++ buf->drm_frame.objects[0].fd = drm_desc->objects[0].fd; ++ ++ av_frame_move_ref(&buf->frame, frame); ++ ++ return deint_v4l2m2m_enqueue_buffer(buf); ++} ++ ++static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx) ++{ ++ if (atomic_fetch_sub(&ctx->refcount, 1) == 1) { ++ V4L2Queue *capture = &ctx->capture; ++ V4L2Queue *output = &ctx->output; ++ int i; ++ ++ av_log(NULL, AV_LOG_DEBUG, "%s - destroying context\n", __func__); ++ ++ if (ctx->fd >= 0) { ++ deint_v4l2m2m_streamoff(capture); ++ deint_v4l2m2m_streamoff(output); ++ } ++ ++ if (capture->buffers) ++ for (i = 0; i < capture->num_buffers; i++) { ++ capture->buffers[i].q = NULL; ++ if (capture->buffers[i].fd >= 0) ++ close(capture->buffers[i].fd); ++ } ++ ++ deint_v4l2m2m_unref_queued(output); ++ ++ av_buffer_unref(&ctx->hw_frames_ctx); ++ ++ if (capture->buffers) ++ av_free(capture->buffers); ++ ++ if (output->buffers) ++ av_free(output->buffers); ++ ++ if (ctx->fd >= 0) { ++ close(ctx->fd); ++ ctx->fd = -1; ++ } ++ ++ av_free(ctx); ++ } ++} ++ ++static void v4l2_free_buffer(void *opaque, uint8_t *unused) ++{ ++ V4L2Buffer *buf = opaque; ++ DeintV4L2M2MContextShared *ctx = buf->q->ctx; ++ ++ if (!ctx->done) ++ deint_v4l2m2m_enqueue_buffer(buf); ++ ++ deint_v4l2m2m_destroy_context(ctx); ++} ++ ++static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height) ++{ ++ AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; ++ AVDRMLayerDescriptor *layer; ++ ++ /* fill the DRM frame descriptor */ ++ drm_desc->nb_objects = avbuf->num_planes; ++ drm_desc->nb_layers = 1; ++ ++ layer = &drm_desc->layers[0]; ++ layer->nb_planes = avbuf->num_planes; ++ ++ for (int i = 0; i < avbuf->num_planes; i++) { ++ layer->planes[i].object_index = i; ++ layer->planes[i].offset = 0; ++ layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; ++ } ++ ++ switch (layer->format) { ++ case DRM_FORMAT_YUYV: ++ layer->nb_planes = 1; ++ break; ++ ++ case DRM_FORMAT_NV12: ++ case DRM_FORMAT_NV21: ++ if (avbuf->num_planes > 1) ++ break; ++ ++ layer->nb_planes = 2; ++ ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = avbuf->plane_info[0].bytesperline * ++ height; ++ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline; ++ break; ++ ++ case DRM_FORMAT_YUV420: ++ if (avbuf->num_planes > 1) ++ break; ++ ++ layer->nb_planes = 3; ++ ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = avbuf->plane_info[0].bytesperline * ++ height; ++ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1; ++ ++ layer->planes[2].object_index = 0; ++ layer->planes[2].offset = layer->planes[1].offset + ++ ((avbuf->plane_info[0].bytesperline * ++ height) >> 2); ++ layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1; ++ break; ++ ++ default: ++ drm_desc->nb_layers = 0; ++ break; ++ } ++ ++ return (uint8_t *) drm_desc; ++} ++ ++// timeout in ms ++static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int timeout) ++{ ++ DeintV4L2M2MContextShared *ctx = queue->ctx; ++ V4L2Buffer* avbuf; ++ ++ av_log(ctx->logctx, AV_LOG_TRACE, "<<< %s\n", __func__); ++ ++ avbuf = deint_v4l2m2m_dequeue_buffer(queue, timeout); ++ if (!avbuf) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: 
No buffer to dequeue (timeout=%d)\n", __func__, timeout); ++ return AVERROR(EAGAIN); ++ } ++ ++ // Fill in PTS and anciliary info from src frame ++ // we will want to overwrite some fields as only the pts/dts ++ // fields are updated with new timing in this fn ++ pts_track_get_frame(&ctx->track, avbuf->buffer.timestamp, frame); ++ ++ frame->buf[0] = av_buffer_create((uint8_t *) &avbuf->drm_frame, ++ sizeof(avbuf->drm_frame), v4l2_free_buffer, ++ avbuf, AV_BUFFER_FLAG_READONLY); ++ if (!frame->buf[0]) { ++ av_log(ctx->logctx, AV_LOG_ERROR, "%s: error %d creating buffer\n", __func__, 0); ++ return AVERROR(ENOMEM); ++ } ++ ++ atomic_fetch_add(&ctx->refcount, 1); ++ ++ frame->data[0] = (uint8_t *)v4l2_get_drm_frame(avbuf, ctx->orig_height); ++ frame->format = AV_PIX_FMT_DRM_PRIME; ++ if (ctx->hw_frames_ctx) ++ frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx); ++ frame->height = ctx->height; ++ frame->width = ctx->width; ++ ++ // Not interlaced now ++ frame->interlaced_frame = 0; ++ frame->top_field_first = 0; ++ // Pkt duration halved ++ frame->pkt_duration /= 2; ++ ++ if (avbuf->buffer.flags & V4L2_BUF_FLAG_ERROR) { ++ av_log(ctx->logctx, AV_LOG_ERROR, "driver decode error\n"); ++ frame->decode_error_flags |= FF_DECODE_ERROR_INVALID_BITSTREAM; ++ } ++ ++ av_log(ctx->logctx, AV_LOG_TRACE, ">>> %s: PTS=%"PRId64"\n", __func__, frame->pts); ++ return 0; ++} ++ ++static int deint_v4l2m2m_config_props(AVFilterLink *outlink) ++{ ++ AVFilterLink *inlink = outlink->src->inputs[0]; ++ AVFilterContext *avctx = outlink->src; ++ DeintV4L2M2MContext *priv = avctx->priv; ++ DeintV4L2M2MContextShared *ctx = priv->shared; ++ int ret; ++ ++ ctx->height = avctx->inputs[0]->h; ++ ctx->width = avctx->inputs[0]->w; ++ ++ av_log(priv, AV_LOG_DEBUG, "%s: %dx%d\n", __func__, ctx->width, ctx->height); ++ ++ outlink->time_base = inlink->time_base; ++ outlink->w = inlink->w; ++ outlink->h = inlink->h; ++ outlink->sample_aspect_ratio = inlink->sample_aspect_ratio; ++ outlink->format = inlink->format; ++ outlink->frame_rate = (AVRational) {1, 0}; // Deny knowledge of frame rate ++ ++ ret = deint_v4l2m2m_find_device(ctx); ++ if (ret) ++ return ret; ++ ++ if (inlink->hw_frames_ctx) { ++ ctx->hw_frames_ctx = av_buffer_ref(inlink->hw_frames_ctx); ++ if (!ctx->hw_frames_ctx) ++ return AVERROR(ENOMEM); ++ } ++ return 0; ++} ++ ++static int deint_v4l2m2m_query_formats(AVFilterContext *avctx) ++{ ++ static const enum AVPixelFormat pixel_formats[] = { ++ AV_PIX_FMT_DRM_PRIME, ++ AV_PIX_FMT_YUV420P, ++ AV_PIX_FMT_NONE, ++ }; ++ ++ return ff_set_common_formats(avctx, ff_make_format_list(pixel_formats)); ++} ++ ++static uint32_t desc_pixelformat(const AVDRMFrameDescriptor * const drm_desc) ++{ ++ const int is_linear = (drm_desc->objects[0].format_modifier == DRM_FORMAT_MOD_LINEAR || ++ drm_desc->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID); ++ ++ switch (drm_desc->layers[0].format) { ++ case DRM_FORMAT_YUV420: ++ if (is_linear) ++ return drm_desc->nb_objects == 1 ? V4L2_PIX_FMT_YUV420 : 0; ++ break; ++ case DRM_FORMAT_NV12: ++ if (is_linear) ++ return drm_desc->nb_objects == 1 ? 
V4L2_PIX_FMT_NV12 : 0; ++ break; ++ default: ++ break; ++ } ++ return 0; ++} ++ ++static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) ++{ ++ AVFilterContext *avctx = link->dst; ++ DeintV4L2M2MContext *priv = avctx->priv; ++ DeintV4L2M2MContextShared *ctx = priv->shared; ++ V4L2Queue *capture = &ctx->capture; ++ V4L2Queue *output = &ctx->output; ++ int ret; ++ ++ av_log(priv, AV_LOG_DEBUG, "<<< %s: input pts: %"PRId64" (%"PRId64") field :%d interlaced: %d aspect:%d/%d\n", ++ __func__, in->pts, AV_NOPTS_VALUE, in->top_field_first, in->interlaced_frame, in->sample_aspect_ratio.num, in->sample_aspect_ratio.den); ++ av_log(priv, AV_LOG_DEBUG, "--- %s: in status in %d/ot %d; out status in %d/out %d\n", __func__, ++ avctx->inputs[0]->status_in, avctx->inputs[0]->status_out, avctx->outputs[0]->status_in, avctx->outputs[0]->status_out); ++ ++ if (ctx->field_order == V4L2_FIELD_ANY) { ++ const AVDRMFrameDescriptor * const drm_desc = (AVDRMFrameDescriptor *)in->data[0]; ++ const uint32_t pixelformat = desc_pixelformat(drm_desc); ++ ++ if (pixelformat == 0) { ++ av_log(avctx, AV_LOG_ERROR, "Unsupported DRM format %s in %d objects, modifier %#" PRIx64 "\n", ++ av_fourcc2str(drm_desc->layers[0].format), ++ drm_desc->nb_objects, drm_desc->objects[0].format_modifier); ++ return AVERROR(EINVAL); ++ } ++ ++ ctx->orig_width = drm_desc->layers[0].planes[0].pitch; ++ ctx->orig_height = drm_desc->layers[0].planes[1].offset / ctx->orig_width; ++ ++ av_log(priv, AV_LOG_DEBUG, "%s: %dx%d (%td,%td)\n", __func__, ctx->width, ctx->height, ++ drm_desc->layers[0].planes[0].pitch, drm_desc->layers[0].planes[1].offset); ++ ++ ret = deint_v4l2m2m_set_format(output, pixelformat, ctx->field_order, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset); ++ if (ret) ++ return ret; ++ ++ ret = deint_v4l2m2m_set_format(capture, pixelformat, V4L2_FIELD_NONE, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset); ++ if (ret) ++ return ret; ++ ++ ret = deint_v4l2m2m_allocate_buffers(capture); ++ if (ret) ++ return ret; ++ ++ ret = deint_v4l2m2m_streamon(capture); ++ if (ret) ++ return ret; ++ ++ ret = deint_v4l2m2m_allocate_buffers(output); ++ if (ret) ++ return ret; ++ ++ ret = deint_v4l2m2m_streamon(output); ++ if (ret) ++ return ret; ++ ++ if (in->top_field_first) ++ ctx->field_order = V4L2_FIELD_INTERLACED_TB; ++ else ++ ctx->field_order = V4L2_FIELD_INTERLACED_BT; ++ ++ } ++ ++ ret = deint_v4l2m2m_enqueue_frame(output, in); ++ ++ av_log(priv, AV_LOG_TRACE, ">>> %s: %s\n", __func__, av_err2str(ret)); ++ return ret; ++} ++ ++static int deint_v4l2m2m_activate(AVFilterContext *avctx) ++{ ++ DeintV4L2M2MContext * const priv = avctx->priv; ++ DeintV4L2M2MContextShared *const s = priv->shared; ++ AVFilterLink * const outlink = avctx->outputs[0]; ++ AVFilterLink * const inlink = avctx->inputs[0]; ++ int n = 0; ++ int cn = 99; ++ int instatus = 0; ++ int64_t inpts = 0; ++ int did_something = 0; ++ ++ av_log(priv, AV_LOG_TRACE, "<<< %s\n", __func__); ++ ++ FF_FILTER_FORWARD_STATUS_BACK_ALL(outlink, avctx); ++ ++ ff_inlink_acknowledge_status(inlink, &instatus, &inpts); ++ ++ if (!ff_outlink_frame_wanted(outlink)) { ++ av_log(priv, AV_LOG_TRACE, "%s: Not wanted out\n", __func__); ++ } ++ else if (s->field_order != V4L2_FIELD_ANY) // Can't DQ if no setup! 
++ { ++ AVFrame * frame = av_frame_alloc(); ++ int rv; ++ ++again: ++ recycle_q(&s->output); ++ n = count_enqueued(&s->output); ++ ++ if (frame == NULL) { ++ av_log(priv, AV_LOG_ERROR, "%s: error allocating frame\n", __func__); ++ return AVERROR(ENOMEM); ++ } ++ ++ rv = deint_v4l2m2m_dequeue_frame(&s->capture, frame, n > 4 ? 300 : 0); ++ if (rv != 0) { ++ av_frame_free(&frame); ++ if (rv != AVERROR(EAGAIN)) { ++ av_log(priv, AV_LOG_ERROR, ">>> %s: DQ fail: %s\n", __func__, av_err2str(rv)); ++ return rv; ++ } ++ } ++ else { ++ frame->interlaced_frame = 0; ++ // frame is always consumed by filter_frame - even on error despite ++ // a somewhat confusing comment in the header ++ rv = ff_filter_frame(outlink, frame); ++ ++ if (instatus != 0) { ++ av_log(priv, AV_LOG_TRACE, "%s: eof loop\n", __func__); ++ goto again; ++ } ++ ++ av_log(priv, AV_LOG_TRACE, "%s: Filtered: %s\n", __func__, av_err2str(rv)); ++ did_something = 1; ++ } ++ ++ cn = count_enqueued(&s->capture); ++ } ++ ++ if (instatus != 0) { ++ ff_outlink_set_status(outlink, instatus, inpts); ++ av_log(priv, AV_LOG_TRACE, ">>> %s: Status done: %s\n", __func__, av_err2str(instatus)); ++ return 0; ++ } ++ ++ recycle_q(&s->output); ++ n = count_enqueued(&s->output); ++ ++ while (n < 6) { ++ AVFrame * frame; ++ int rv; ++ ++ if ((rv = ff_inlink_consume_frame(inlink, &frame)) < 0) { ++ av_log(priv, AV_LOG_ERROR, "%s: consume in failed: %s\n", __func__, av_err2str(rv)); ++ return rv; ++ } ++ ++ if (frame == NULL) { ++ av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__); ++ break; ++ } ++ ++ rv = deint_v4l2m2m_filter_frame(inlink, frame); ++ av_frame_free(&frame); ++ ++ if (rv != 0) ++ return rv; ++ ++ av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", __func__); ++ ++n; ++ } ++ ++ if (n < 6) { ++ ff_inlink_request_frame(inlink); ++ did_something = 1; ++ av_log(priv, AV_LOG_TRACE, "%s: req frame\n", __func__); ++ } ++ ++ if (n > 4 && ff_outlink_frame_wanted(outlink)) { ++ ff_filter_set_ready(avctx, 1); ++ did_something = 1; ++ av_log(priv, AV_LOG_TRACE, "%s: ready\n", __func__); ++ } ++ ++ av_log(priv, AV_LOG_TRACE, ">>> %s: OK (n=%d, cn=%d)\n", __func__, n, cn); ++ return did_something ? 
0 : FFERROR_NOT_READY; ++} ++ ++static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx) ++{ ++ DeintV4L2M2MContext * const priv = avctx->priv; ++ DeintV4L2M2MContextShared * const ctx = av_mallocz(sizeof(DeintV4L2M2MContextShared)); ++ ++ if (!ctx) { ++ av_log(priv, AV_LOG_ERROR, "%s: error %d allocating context\n", __func__, 0); ++ return AVERROR(ENOMEM); ++ } ++ priv->shared = ctx; ++ ctx->logctx = priv; ++ ctx->fd = -1; ++ ctx->output.ctx = ctx; ++ ctx->output.num_buffers = 8; ++ ctx->capture.ctx = ctx; ++ ctx->capture.num_buffers = 12; ++ ctx->done = 0; ++ ctx->field_order = V4L2_FIELD_ANY; ++ ++ pts_track_init(&ctx->track, priv); ++ ++ atomic_init(&ctx->refcount, 1); ++ ++ return 0; ++} ++ ++static void deint_v4l2m2m_uninit(AVFilterContext *avctx) ++{ ++ DeintV4L2M2MContext *priv = avctx->priv; ++ DeintV4L2M2MContextShared *ctx = priv->shared; ++ ++ ctx->done = 1; ++ ctx->logctx = NULL; // Log to NULL works, log to missing crashes ++ pts_track_uninit(&ctx->track); ++ deint_v4l2m2m_destroy_context(ctx); ++} ++ ++static const AVOption deinterlace_v4l2m2m_options[] = { ++ { NULL }, ++}; ++ ++AVFILTER_DEFINE_CLASS(deinterlace_v4l2m2m); ++ ++static const AVFilterPad deint_v4l2m2m_inputs[] = { ++ { ++ .name = "default", ++ .type = AVMEDIA_TYPE_VIDEO, ++ }, ++ { NULL } ++}; ++ ++static const AVFilterPad deint_v4l2m2m_outputs[] = { ++ { ++ .name = "default", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .config_props = deint_v4l2m2m_config_props, ++ }, ++ { NULL } ++}; ++ ++AVFilter ff_vf_deinterlace_v4l2m2m = { ++ .name = "deinterlace_v4l2m2m", ++ .description = NULL_IF_CONFIG_SMALL("V4L2 M2M deinterlacer"), ++ .priv_size = sizeof(DeintV4L2M2MContext), ++ .init = &deint_v4l2m2m_init, ++ .uninit = &deint_v4l2m2m_uninit, ++ .query_formats = &deint_v4l2m2m_query_formats, ++ .inputs = deint_v4l2m2m_inputs, ++ .outputs = deint_v4l2m2m_outputs, ++ .priv_class = &deinterlace_v4l2m2m_class, ++ .activate = deint_v4l2m2m_activate, ++}; +--- /dev/null +++ b/libavfilter/vf_unsand.c @@ -0,0 +1,234 @@ +/* @@ -56063,7 +62864,7 @@ Upstream-status: Pending @@ -3051,6 +3051,40 @@ static int has_codec_parameters(AVStream return 1; } - + +#if CONFIG_HEVC_RPI_DECODER && CONFIG_HEVC_DECODER +// This should be quite general purpose but avoid possible conflicts +// by limiting usage to cases wehere we know it works. 
@@ -56165,10 +62966,10 @@ Upstream-status: Pending sha512.h \ @@ -86,6 +87,7 @@ HEADERS = adler32.h tx.h \ - + HEADERS-$(CONFIG_LZO) += lzo.h +HEADERS-$(CONFIG-RPI) += rpi_sand_fn_pw.h - + ARCH_HEADERS = bswap.h \ intmath.h \ @@ -180,6 +182,7 @@ OBJS-$(CONFIG_LZO) @@ -56184,14 +62985,14 @@ Upstream-status: Pending @@ -1,4 +1,6 @@ OBJS += aarch64/cpu.o \ aarch64/float_dsp_init.o \ - + -NEON-OBJS += aarch64/float_dsp_neon.o +NEON-OBJS += aarch64/float_dsp_neon.o \ + aarch64/rpi_sand_neon.o \ + --- /dev/null +++ b/libavutil/aarch64/rpi_sand_neon.S -@@ -0,0 +1,676 @@ +@@ -0,0 +1,781 @@ +/* +Copyright (c) 2021 Michael Eiler + @@ -56252,7 +63053,7 @@ Upstream-status: Pending + + // this is the value we have to add to the src pointer after reading a complete block + // it will move the address to the start of the next block -+ // w10 = stride2 * stride1 - stride1 ++ // w10 = stride2 * stride1 - stride1 + mov w10, w4 + lsl w10, w10, #7 + sub w10, w10, #128 @@ -56279,7 +63080,7 @@ Upstream-status: Pending + // copy 128 bytes (a full block) into the vector registers v0-v7 and increase the src address by 128 + // fortunately these aren't callee saved ones, meaning we don't need to backup them + ld1 { v0.16b, v1.16b, v2.16b, v3.16b}, [x13], #64 -+ ld1 { v4.16b, v5.16b, v6.16b, v7.16b}, [x13], #64 ++ ld1 { v4.16b, v5.16b, v6.16b, v7.16b}, [x13], #64 + + // write these registers back to the destination vector and increase the dst address by 128 + st1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0], #64 @@ -56310,13 +63111,13 @@ Upstream-status: Pending + add w5, w5, #1 + b incomplete_block_loop_y8 +incomplete_block_loop_end_y8: -+ -+ -+ // increase the row offset by 128 (stride1) ++ ++ ++ // increase the row offset by 128 (stride1) + add w11, w11, #128 + // increment the row counter + add w12, w12, #1 -+ ++ + // process the next row if we haven't finished yet + cmp w15, w12 + bgt row_loop @@ -56380,7 +63181,7 @@ Upstream-status: Pending + beq no_main_c8 + +block_loop_c8: -+ // load the full block -> 128 bytes, the block contains 64 interleaved U and V values ++ // load the full block -> 128 bytes, the block contains 64 interleaved U and V values + ld2 { v0.16b, v1.16b }, [x13], #32 + ld2 { v2.16b, v3.16b }, [x13], #32 + ld2 { v4.16b, v5.16b }, [x13], #32 @@ -56403,14 +63204,14 @@ Upstream-status: Pending + // increment row counter and move src to the beginning of the next block + add w14, w14, #1 + add x13, x13, x10 -+ ++ + // jump to block_loop_c8 iff the block count is smaller than the number of full blocks + cmp w8, w14 + bgt block_loop_c8 + +no_main_c8: + // handle incomplete block at the end of every row -+ eor w5, w5, w5 // point counter, this might be ++ eor w5, w5, w5 // point counter, this might be +incomplete_block_loop_c8: + cmp w5, w9 + bge incomplete_block_loop_end_c8 @@ -56442,228 +63243,6 @@ Upstream-status: Pending + ret +endfunc + -+//void ff_rpi_sand30_lines_to_planar_y16( -+// uint8_t * dest, // [x0] -+// unsigned int dst_stride, // [w1] -> assumed to be equal to _w -+// const uint8_t * src, // [x2] -+// unsigned int src_stride1, // [w3] -> 128 -+// unsigned int src_stride2, // [w4] -+// unsigned int _x, // [w5] -+// unsigned int y, // [w6] -+// unsigned int _w, // [w7] -+// unsigned int h); // [sp, #0] -+ -+function ff_rpi_sand30_lines_to_planar_y16, export=1 -+ stp x19, x20, [sp, #-48]! 
-+ stp x21, x22, [sp, #16] -+ stp x23, x24, [sp, #32] -+ -+ // w6 = argument h -+ ldr w6, [sp, #48] -+ -+ // slice_inc = ((stride2 - 1) * stride1) -+ mov w5, w4 -+ sub w5, w5, #1 -+ lsl w5, w5, #7 -+ -+ // total number of bytes per row = (width / 3) * 4 -+ mov w8, w7 -+ mov w9, #3 -+ udiv w8, w8, w9 -+ lsl w8, w8, #2 -+ -+ // number of full 128 byte blocks to be processed -+ mov w9, #96 -+ udiv w9, w7, w9 // = (width * 4) / (3*128) = width/96 -+ -+ // w10 = number of full integers to process (4 bytes) -+ // w11 = remaning zero to two 10bit values still to copy over -+ mov w12, #96 -+ mul w12, w9, w12 -+ sub w12, w7, w12 // width - blocks*96 = remaining points per row -+ mov w11, #3 -+ udiv w10, w12, w11 // full integers to process = w12 / 3 -+ mul w11, w10, w11 // #integers *3 -+ sub w11, w12, w11 // remaining 0-2 points = remaining points - integers*3 -+ -+ // increase w9 by one if w10+w11 is not zero, and decrease the row count by one -+ // this is to efficiently copy incomplete blocks at the end of the rows -+ // the last row is handled explicitly to avoid writing out of bounds -+ add w22, w10, w11 -+ cmp w22, #0 -+ cset w22, ne // 1 iff w10+w11 not zero, 0 otherwise -+ add w9, w9, w22 -+ sub w6, w6, #1 -+ -+ // store the number of bytes in w20 which we copy too much for every row -+ // when the width of the frame is not a multiple of 96 (128bytes storing 96 10bit values) -+ mov w20, #96*2 -+ mul w20, w20, w9 -+ sub w20, w1, w20 -+ -+ mov w23, #0 // flag to check whether the last line had already been processed -+ -+ // bitmask to clear the uppper 6bits of the result values -+ mov x19, #0x03ff03ff03ff03ff -+ dup v22.2d, x19 -+ -+ // row counter = 0 -+ eor w12, w12, w12 -+row_loop_y16: -+ cmp w12, w6 // jump to row_loop_y16_fin if we processed all rows -+ bge row_loop_y16_fin -+ -+ mov x13, x2 // row src -+ eor w14, w14, w14 // full block counter -+block_loop_y16: -+ cmp w14, w9 -+ bge block_loop_y16_fin -+ -+ // load 64 bytes -+ ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x13], #64 -+ -+ // process v0 and v1 -+ xtn v16.4h, v0.4s -+ ushr v0.4s, v0.4s, #10 -+ xtn v17.4h, v0.4s -+ ushr v0.4s, v0.4s, #10 -+ xtn v18.4h, v0.4s -+ -+ xtn2 v16.8h, v1.4s -+ and v16.16b, v16.16b, v22.16b -+ ushr v1.4s, v1.4s, #10 -+ xtn2 v17.8h, v1.4s -+ and v17.16b, v17.16b, v22.16b -+ ushr v1.4s, v1.4s, #10 -+ xtn2 v18.8h, v1.4s -+ and v18.16b, v18.16b, v22.16b -+ -+ st3 { v16.8h, v17.8h, v18.8h }, [x0], #48 -+ -+ // process v2 and v3 -+ xtn v23.4h, v2.4s -+ ushr v2.4s, v2.4s, #10 -+ xtn v24.4h, v2.4s -+ ushr v2.4s, v2.4s, #10 -+ xtn v25.4h, v2.4s -+ -+ xtn2 v23.8h, v3.4s -+ and v23.16b, v23.16b, v22.16b -+ ushr v3.4s, v3.4s, #10 -+ xtn2 v24.8h, v3.4s -+ and v24.16b, v24.16b, v22.16b -+ ushr v3.4s, v3.4s, #10 -+ xtn2 v25.8h, v3.4s -+ and v25.16b, v25.16b, v22.16b -+ -+ st3 { v23.8h, v24.8h, v25.8h }, [x0], #48 -+ -+ // load the second half of the block -> 64 bytes into registers v4-v7 -+ ld1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x13], #64 -+ -+ // process v4 and v5 -+ xtn v16.4h, v4.4s -+ ushr v4.4s, v4.4s, #10 -+ xtn v17.4h, v4.4s -+ ushr v4.4s, v4.4s, #10 -+ xtn v18.4h, v4.4s -+ -+ xtn2 v16.8h, v5.4s -+ and v16.16b, v16.16b, v22.16b -+ ushr v5.4s, v5.4s, #10 -+ xtn2 v17.8h, v5.4s -+ and v17.16b, v17.16b, v22.16b -+ ushr v5.4s, v5.4s, #10 -+ xtn2 v18.8h, v5.4s -+ and v18.16b, v18.16b, v22.16b -+ -+ st3 { v16.8h, v17.8h, v18.8h }, [x0], #48 -+ -+ // v6 and v7 -+ xtn v23.4h, v6.4s -+ ushr v6.4s, v6.4s, #10 -+ xtn v24.4h, v6.4s -+ ushr v6.4s, v6.4s, #10 -+ xtn v25.4h, v6.4s -+ -+ xtn2 v23.8h, v7.4s -+ and v23.16b, v23.16b, 
v22.16b -+ ushr v7.4s, v7.4s, #10 -+ xtn2 v24.8h, v7.4s -+ and v24.16b, v24.16b, v22.16b -+ ushr v7.4s, v7.4s, #10 -+ xtn2 v25.8h, v7.4s -+ and v25.16b, v25.16b, v22.16b -+ -+ st3 { v23.8h, v24.8h, v25.8h }, [x0], #48 -+ -+ add x13, x13, x5 // row src += slice_inc -+ add w14, w14, #1 -+ b block_loop_y16 -+block_loop_y16_fin: -+ -+ -+ -+ -+ add x2, x2, #128 // src += stride1 (start of the next row) -+ add x0, x0, w20, sxtw // subtract the bytes we copied too much from dst -+ add w12, w12, #1 -+ b row_loop_y16 -+row_loop_y16_fin: -+ -+ // check whether we have incomplete blocks at the end of every row -+ // in that case decrease row block count by one -+ // change height back to it's original value (meaning increase it by 1) -+ // and jump back to another iteration of row_loop_y16 -+ -+ cmp w23, #1 -+ beq row_loop_y16_fin2 // don't continue here if we already processed the last row -+ add w6, w6, #1 // increase height to the original value -+ sub w9, w9, w22 // block count - 1 or 0, depending on the remaining bytes count -+ mov w23, #1 -+ b row_loop_y16 -+row_loop_y16_fin2: -+ -+ sub x0, x0, w20, sxtw // with the last row we didn't actually move the dst ptr to far ahead, therefore readd the diference -+ -+ // now we've got to handle the last block in the last row -+ eor w12, w12, w12 // w12 = 0 = counter -+integer_loop_y16: -+ cmp w12, w10 -+ bge integer_loop_y16_fin -+ ldr w14, [x13], #4 -+ and w15, w14, #0x3ff -+ strh w15, [x0], #2 -+ lsr w14, w14, #10 -+ and w15, w14, #0x3ff -+ strh w15, [x0], #2 -+ lsr w14, w14, #10 -+ and w15, w14, #0x3ff -+ strh w15, [x0], #2 -+ add w12, w12, #1 -+ b integer_loop_y16 -+integer_loop_y16_fin: -+ -+final_values_y16: -+ // remaining point count = w11 -+ ldr w14, [x13], #4 -+ cmp w11, #0 -+ beq final_values_y16_fin -+ and w15, w14, #0x3ff -+ strh w15, [x0], #2 -+ cmp w11, #1 -+ beq final_values_y16_fin -+ lsr w14, w14, #10 -+ and w15, w14, #0x3ff -+ strh w15, [x0], #2 -+final_values_y16_fin: -+ -+ ldp x23, x24, [sp, #32] -+ ldp x21, x22, [sp, #16] -+ ldp x19, x20, [sp], #48 -+ ret -+endfunc -+ +//void ff_rpi_sand30_lines_to_planar_c16( +// uint8_t * dst_u, // [x0] +// unsigned int dst_stride_u, // [w1] == _w*2 @@ -56671,7 +63250,7 @@ Upstream-status: Pending +// unsigned int dst_stride_v, // [w3] == _w*2 +// const uint8_t * src, // [x4] +// unsigned int stride1, // [w5] == 128 -+// unsigned int stride2, // [w6] ++// unsigned int stride2, // [w6] +// unsigned int _x, // [w7] == 0 +// unsigned int y, // [sp, #0] == 0 +// unsigned int _w, // [sp, #8] -> w3 @@ -56694,7 +63273,7 @@ Upstream-status: Pending + and v5.16b, v5.16b, v16.16b + and v6.16b, v6.16b, v16.16b + st3 { v4.8h, v5.8h, v6.8h }, [sp], #48 -+ ++ + xtn v4.4h, v2.4s + ushr v2.4s, v2.4s, #10 + xtn v5.4h, v2.4s @@ -56841,7 +63420,7 @@ Upstream-status: Pending + ldr w22, [x4], #4 + str w22, [x0], #2 + lsr w22, w22, #16 -+ str w22, [x2], #2 ++ str w22, [x2], #2 + + add w20, w20, #1 + b rem_pix_c16_loop @@ -56868,9 +63447,336 @@ Upstream-status: Pending +// unsigned int _w, +// unsigned int h); + ++// void ff_rpi_sand30_lines_to_planar_y8( ++// uint8_t * dest, : x0 ++// unsigned int dst_stride, : w1 ++// const uint8_t * src, : x2 ++// unsigned int src_stride1, : w3, always 128 ++// unsigned int src_stride2, : w4 ++// unsigned int _x, : w5 ++// unsigned int y, : w6 ++// unsigned int _w, : w7 ++// unsigned int h); : [sp, #0] ++// ++// Assumes that we are starting on a stripe boundary and that overreading ++// within the stripe is OK. 
However it does respect the dest size for wri ++ ++function ff_rpi_sand30_lines_to_planar_y16, export=1 ++ lsl w4, w4, #7 ++ sub w4, w4, #64 ++ sub w1, w1, w7, lsl #1 ++ uxtw x6, w6 ++ add x8, x2, x6, lsl #7 ++ ldr w6, [sp, #0] ++ ++10: ++ mov x2, x8 ++ mov w5, w7 ++1: ++ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 ++ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4 ++ ++ subs w5, w5, #96 ++ ++ // v0, v1 ++ ++ shrn v18.4h, v0.4s, #14 ++ xtn v16.4h, v0.4s ++ shrn v17.4h, v0.4s, #10 ++ ++ shrn2 v18.8h, v1.4s, #14 ++ xtn2 v16.8h, v1.4s ++ shrn2 v17.8h, v1.4s, #10 ++ ++ ushr v18.8h, v18.8h, #6 ++ bic v16.8h, #0xfc, lsl #8 ++ bic v17.8h, #0xfc, lsl #8 ++ ++ // v2, v3 ++ ++ shrn v21.4h, v2.4s, #14 ++ xtn v19.4h, v2.4s ++ shrn v20.4h, v2.4s, #10 ++ ++ shrn2 v21.8h, v3.4s, #14 ++ xtn2 v19.8h, v3.4s ++ shrn2 v20.8h, v3.4s, #10 ++ ++ ushr v21.8h, v21.8h, #6 ++ bic v19.8h, #0xfc, lsl #8 ++ bic v20.8h, #0xfc, lsl #8 ++ ++ // v4, v5 ++ ++ shrn v24.4h, v4.4s, #14 ++ xtn v22.4h, v4.4s ++ shrn v23.4h, v4.4s, #10 ++ ++ shrn2 v24.8h, v5.4s, #14 ++ xtn2 v22.8h, v5.4s ++ shrn2 v23.8h, v5.4s, #10 ++ ++ ushr v24.8h, v24.8h, #6 ++ bic v22.8h, #0xfc, lsl #8 ++ bic v23.8h, #0xfc, lsl #8 ++ ++ // v6, v7 ++ ++ shrn v27.4h, v6.4s, #14 ++ xtn v25.4h, v6.4s ++ shrn v26.4h, v6.4s, #10 ++ ++ shrn2 v27.8h, v7.4s, #14 ++ xtn2 v25.8h, v7.4s ++ shrn2 v26.8h, v7.4s, #10 ++ ++ ushr v27.8h, v27.8h, #6 ++ bic v25.8h, #0xfc, lsl #8 ++ bic v26.8h, #0xfc, lsl #8 ++ ++ blt 2f ++ ++ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48 ++ st3 {v19.8h, v20.8h, v21.8h}, [x0], #48 ++ st3 {v22.8h, v23.8h, v24.8h}, [x0], #48 ++ st3 {v25.8h, v26.8h, v27.8h}, [x0], #48 ++ ++ bne 1b ++ ++11: ++ subs w6, w6, #1 ++ add x0, x0, w1, uxtw ++ add x8, x8, #128 ++ bne 10b ++ ++ ret ++ ++// Partial final write ++2: ++ cmp w5, #48-96 ++ blt 1f ++ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48 ++ st3 {v19.8h, v20.8h, v21.8h}, [x0], #48 ++ beq 11b ++ mov v16.16b, v22.16b ++ mov v17.16b, v23.16b ++ sub w5, w5, #48 ++ mov v18.16b, v24.16b ++ mov v19.16b, v25.16b ++ mov v20.16b, v26.16b ++ mov v21.16b, v27.16b ++1: ++ cmp w5, #24-96 ++ blt 1f ++ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48 ++ beq 11b ++ mov v16.16b, v19.16b ++ mov v17.16b, v20.16b ++ sub w5, w5, #24 ++ mov v18.16b, v21.16b ++1: ++ cmp w5, #12-96 ++ blt 1f ++ st3 {v16.4h, v17.4h, v18.4h}, [x0], #24 ++ beq 11b ++ mov v16.2d[0], v16.2d[1] ++ sub w5, w5, #12 ++ mov v17.2d[0], v17.2d[1] ++ mov v18.2d[0], v18.2d[1] ++1: ++ cmp w5, #6-96 ++ blt 1f ++ st3 {v16.h, v17.h, v18.h}[0], [x0], #6 ++ st3 {v16.h, v17.h, v18.h}[1], [x0], #6 ++ beq 11b ++ mov v16.2s[0], v16.2s[1] ++ sub w5, w5, #6 ++ mov v17.2s[0], v17.2s[1] ++ mov v18.2s[0], v18.2s[1] ++1: ++ cmp w5, #3-96 ++ blt 1f ++ st3 {v16.h, v17.h, v18.h}[0], [x0], #6 ++ beq 11b ++ mov v16.4h[0], v16.4h[1] ++ sub w5, w5, #3 ++ mov v17.4h[0], v17.4h[1] ++1: ++ cmp w5, #2-96 ++ blt 1f ++ st2 {v16.h, v17.h}[0], [x0], #4 ++ b 11b ++1: ++ st1 {v16.h}[0], [x0], #2 ++ b 11b ++ ++endfunc ++ ++// void ff_rpi_sand30_lines_to_planar_y8( ++// uint8_t * dest, : x0 ++// unsigned int dst_stride, : w1 ++// const uint8_t * src, : x2 ++// unsigned int src_stride1, : w3, always 128 ++// unsigned int src_stride2, : w4 ++// unsigned int _x, : w5 ++// unsigned int y, : w6 ++// unsigned int _w, : w7 ++// unsigned int h); : [sp, #0] ++// ++// Assumes that we are starting on a stripe boundary and that overreading ++// within the stripe is OK. 
However it does respect the dest size for wri ++ ++function ff_rpi_sand30_lines_to_planar_y8, export=1 ++ lsl w4, w4, #7 ++ sub w4, w4, #64 ++ sub w1, w1, w7 ++ uxtw x6, w6 ++ add x8, x2, x6, lsl #7 ++ ldr w6, [sp, #0] ++ ++10: ++ mov x2, x8 ++ mov w5, w7 ++1: ++ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 ++ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4 ++ ++ subs w5, w5, #96 ++ ++ // v0, v1 ++ ++ shrn v18.4h, v0.4s, #16 ++ xtn v16.4h, v0.4s ++ shrn v17.4h, v0.4s, #12 ++ ++ shrn2 v18.8h, v1.4s, #16 ++ xtn2 v16.8h, v1.4s ++ shrn2 v17.8h, v1.4s, #12 ++ ++ shrn v18.8b, v18.8h, #6 ++ shrn v16.8b, v16.8h, #2 ++ xtn v17.8b, v17.8h ++ ++ // v2, v3 ++ ++ shrn v21.4h, v2.4s, #16 ++ xtn v19.4h, v2.4s ++ shrn v20.4h, v2.4s, #12 ++ ++ shrn2 v21.8h, v3.4s, #16 ++ xtn2 v19.8h, v3.4s ++ shrn2 v20.8h, v3.4s, #12 ++ ++ shrn2 v18.16b, v21.8h, #6 ++ shrn2 v16.16b, v19.8h, #2 ++ xtn2 v17.16b, v20.8h ++ ++ // v4, v5 ++ ++ shrn v24.4h, v4.4s, #16 ++ xtn v22.4h, v4.4s ++ shrn v23.4h, v4.4s, #12 ++ ++ shrn2 v24.8h, v5.4s, #16 ++ xtn2 v22.8h, v5.4s ++ shrn2 v23.8h, v5.4s, #12 ++ ++ shrn v21.8b, v24.8h, #6 ++ shrn v19.8b, v22.8h, #2 ++ xtn v20.8b, v23.8h ++ ++ // v6, v7 ++ ++ shrn v27.4h, v6.4s, #16 ++ xtn v25.4h, v6.4s ++ shrn v26.4h, v6.4s, #12 ++ ++ shrn2 v27.8h, v7.4s, #16 ++ xtn2 v25.8h, v7.4s ++ shrn2 v26.8h, v7.4s, #12 ++ ++ shrn2 v21.16b, v27.8h, #6 ++ shrn2 v19.16b, v25.8h, #2 ++ xtn2 v20.16b, v26.8h ++ ++ blt 2f ++ ++ st3 {v16.16b, v17.16b, v18.16b}, [x0], #48 ++ st3 {v19.16b, v20.16b, v21.16b}, [x0], #48 ++ ++ bne 1b ++ ++11: ++ subs w6, w6, #1 ++ add x0, x0, w1, uxtw ++ add x8, x8, #128 ++ bne 10b ++ ++ ret ++ ++// Partial final write ++2: ++ cmp w5, #48-96 ++ blt 1f ++ st3 {v16.16b, v17.16b, v18.16b}, [x0], #48 ++ beq 11b ++ mov v16.16b, v22.16b ++ mov v17.16b, v23.16b ++ sub w5, w5, #48 ++ mov v18.16b, v24.16b ++1: ++ cmp w5, #24-96 ++ blt 1f ++ st3 {v16.8b, v17.8b, v18.8b}, [x0], #24 ++ beq 11b ++ mov v16.2d[0], v16.2d[1] ++ sub w5, w5, #24 ++ mov v17.2d[0], v17.2d[1] ++ mov v18.2d[0], v18.2d[1] ++1: ++ cmp w5, #12-96 ++ blt 1f ++ st3 {v16.b, v17.b, v18.b}[0], [x0], #3 ++ st3 {v16.b, v17.b, v18.b}[1], [x0], #3 ++ st3 {v16.b, v17.b, v18.b}[2], [x0], #3 ++ st3 {v16.b, v17.b, v18.b}[3], [x0], #3 ++ beq 11b ++ mov v16.2s[0], v16.2s[1] ++ sub w5, w5, #12 ++ mov v17.2s[0], v17.2s[1] ++ mov v18.2s[0], v18.2s[1] ++1: ++ cmp w5, #6-96 ++ blt 1f ++ st3 {v16.b, v17.b, v18.b}[0], [x0], #3 ++ st3 {v16.b, v17.b, v18.b}[1], [x0], #3 ++ beq 11b ++ mov v16.4h[0], v16.4h[1] ++ sub w5, w5, #6 ++ mov v17.4h[0], v17.4h[1] ++ mov v18.4h[0], v18.4h[1] ++1: ++ cmp w5, #3-96 ++ blt 1f ++ st3 {v16.b, v17.b, v18.b}[0], [x0], #3 ++ beq 11b ++ mov v16.8b[0], v16.8b[1] ++ sub w5, w5, #3 ++ mov v17.8b[0], v17.8b[1] ++1: ++ cmp w5, #2-96 ++ blt 1f ++ st2 {v16.b, v17.b}[0], [x0], #2 ++ b 11b ++1: ++ st1 {v16.b}[0], [x0], #1 ++ b 11b ++ ++endfunc ++ --- /dev/null +++ b/libavutil/aarch64/rpi_sand_neon.h -@@ -0,0 +1,55 @@ +@@ -0,0 +1,59 @@ +/* +Copyright (c) 2021 Michael Eiler + @@ -56922,6 +63828,10 @@ Upstream-status: Pending + uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src, unsigned int stride1, + unsigned int stride2, unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); + ++void ff_rpi_sand30_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride, ++ const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2, ++ unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); ++ +#ifdef __cplusplus +} +#endif @@ -56929,13 +63839,13 @@ Upstream-status: Pending --- 
a/libavutil/arm/Makefile +++ b/libavutil/arm/Makefile @@ -6,3 +6,4 @@ VFP-OBJS += arm/float_dsp_init_vfp.o - + NEON-OBJS += arm/float_dsp_init_neon.o \ arm/float_dsp_neon.o \ + arm/rpi_sand_neon.o \ --- /dev/null +++ b/libavutil/arm/rpi_sand_neon.S -@@ -0,0 +1,768 @@ +@@ -0,0 +1,925 @@ +/* +Copyright (c) 2018 Raspberry Pi (Trading) Ltd. +All rights reserved. @@ -57298,7 +64208,6 @@ Upstream-status: Pending + ldr r6, [sp, #36] + ldr r7, [sp, #32] @ y + mov r12, #48 -+ vmov.u16 q15, #0x3ff + sub r3, #1 + lsl r3, #7 + sub r1, r1, r6, lsl #1 @@ -57314,37 +64223,33 @@ Upstream-status: Pending + vldm r2!, {q10-q13} + add lr, #64 + -+ vshr.u32 q14, q10, #20 @ Cannot vshrn.u32 #20! ++ vshrn.u32 d4 , q10, #14 @ Cannot vshrn.u32 #20! + ands lr, #127 + vshrn.u32 d2, q10, #10 + vmovn.u32 d0, q10 -+ vmovn.u32 d4, q14 + -+ vshr.u32 q14, q11, #20 ++ vshrn.u32 d5, q11, #14 + it eq + addeq r2, r3 + vshrn.u32 d3, q11, #10 + vmovn.u32 d1, q11 -+ vmovn.u32 d5, q14 + + subs r5, #48 -+ vand q0, q15 -+ vand q1, q15 -+ vand q2, q15 ++ vshr.u16 q2, #6 ++ vbic.u16 q0, #0xfc00 ++ vbic.u16 q1, #0xfc00 + -+ vshr.u32 q14, q12, #20 ++ vshrn.u32 d20, q12, #14 + vshrn.u32 d18, q12, #10 + vmovn.u32 d16, q12 -+ vmovn.u32 d20, q14 + -+ vshr.u32 q14, q13, #20 ++ vshrn.u32 d21, q13, #14 + vshrn.u32 d19, q13, #10 + vmovn.u32 d17, q13 -+ vmovn.u32 d21, q14 + -+ vand q8, q15 -+ vand q9, q15 -+ vand q10, q15 ++ vshr.u16 q10, #6 ++ vbic.u16 q8, #0xfc00 ++ vbic.u16 q9 , #0xfc00 + blt 2f + + vst3.16 {d0, d2, d4}, [r0], r12 @@ -57437,7 +64342,6 @@ Upstream-status: Pending + ldr r7, [sp, #48] + ldr r9, [sp, #52] + mov r12, #48 -+ vmov.u16 q15, #0x3ff + sub r8, #1 + lsl r8, #7 + add r5, r5, r7, lsl #7 @@ -57453,48 +64357,44 @@ Upstream-status: Pending + add lr, #64 + + @ N.B. unpack [0,1,2] -> (reg order) 1, 0, 2 -+ vshr.u32 q14, q0, #20 -+ vshrn.u32 d16, q0, #10 ++ vshrn.u32 d20, q0, #14 + vmovn.u32 d18, q0 ++ vshrn.u32 d0, q0, #10 + ands lr, #127 -+ vmovn.u32 d20, q14 + -+ vshr.u32 q14, q1, #20 -+ vshrn.u32 d17, q1, #10 ++ vshrn.u32 d21, q1, #14 + vmovn.u32 d19, q1 -+ vmovn.u32 d21, q14 ++ vshrn.u32 d1, q1, #10 + -+ vshr.u32 q14, q2, #20 + vshrn.u32 d22, q2, #10 -+ vmovn.u32 d24, q2 -+ vmovn.u32 d26, q14 ++ vmovn.u32 d2, q2 ++ vshrn.u32 d4, q2, #14 + -+ vshr.u32 q14, q3, #20 -+ vshrn.u32 d23, q3, #10 -+ vmovn.u32 d25, q3 + add r10, r0, #24 -+ vmovn.u32 d27, q14 ++ vshrn.u32 d23, q3, #10 ++ vmovn.u32 d3, q3 ++ vshrn.u32 d5, q3, #14 + + it eq + addeq r4, r8 -+ vuzp.16 q8, q11 -+ vuzp.16 q9, q12 -+ vuzp.16 q10, q13 ++ vuzp.16 q0, q11 ++ vuzp.16 q9, q1 ++ vuzp.16 q10, q2 + -+ @ q8 V0, V3,.. -> q0 ++ @ q0 V0, V3,.. + @ q9 U0, U3... + @ q10 U1, U4... + @ q11 U2, U5,.. -+ @ q12 V1, V4,.. -> q1 -+ @ q13 V2, V5,.. -> q2 ++ @ q1 V1, V4, ++ @ q2 V2, V5,.. 
+ + subs r6, #24 -+ vand q11, q15 -+ vand q9, q15 -+ vand q10, q15 -+ vand q0, q8, q15 -+ vand q1, q12, q15 -+ vand q2, q13, q15 ++ vbic.u16 q11, #0xfc00 ++ vbic.u16 q9, #0xfc00 ++ vshr.u16 q10, #6 ++ vshr.u16 q2, #6 ++ vbic.u16 q0, #0xfc00 ++ vbic.u16 q1, #0xfc00 + + blt 2f + @@ -57703,10 +64603,177 @@ Upstream-status: Pending +endfunc + + ++@ void ff_rpi_sand30_lines_to_planar_y8( ++@ uint8_t * dest, // [r0] ++@ unsigned int dst_stride, // [r1] ++@ const uint8_t * src, // [r2] ++@ unsigned int src_stride1, // [r3] Ignored - assumed 128 ++@ unsigned int src_stride2, // [sp, #0] -> r3 ++@ unsigned int _x, // [sp, #4] Ignored - 0 ++@ unsigned int y, // [sp, #8] (r7 in prefix) ++@ unsigned int _w, // [sp, #12] -> r6 (cur r5) ++@ unsigned int h); // [sp, #16] -> r7 ++@ ++@ Assumes that we are starting on a stripe boundary and that overreading ++@ within the stripe is OK. However it does respect the dest size for wri ++ ++function ff_rpi_sand30_lines_to_planar_y8, export=1 ++ push {r4-r8, lr} @ +24 ++ ldr r3, [sp, #24] ++ ldr r6, [sp, #36] ++ ldr r7, [sp, #32] @ y ++ mov r12, #48 ++ lsl r3, #7 ++ sub r1, r1, r6 ++ add r8, r2, r7, lsl #7 ++ ldr r7, [sp, #40] ++ ++10: ++ mov r2, r8 ++ add r4, r0, #24 ++ mov r5, r6 ++1: ++ vldm r2, {q8-q15} ++ ++ subs r5, #96 ++ ++ vmovn.u32 d0, q8 ++ vshrn.u32 d2, q8, #12 ++ vshrn.u32 d4, q8, #16 @ Cannot vshrn.u32 #20! ++ ++ add r2, r3 ++ ++ vmovn.u32 d1, q9 ++ vshrn.u32 d3, q9, #12 ++ vshrn.u32 d5, q9, #16 ++ ++ pld [r2, #0] ++ ++ vshrn.u16 d0, q0, #2 ++ vmovn.u16 d1, q1 ++ vshrn.u16 d2, q2, #6 ++ ++ vmovn.u32 d16, q10 ++ vshrn.u32 d18, q10, #12 ++ vshrn.u32 d20, q10, #16 ++ ++ vmovn.u32 d17, q11 ++ vshrn.u32 d19, q11, #12 ++ vshrn.u32 d21, q11, #16 ++ ++ pld [r2, #64] ++ ++ vshrn.u16 d4, q8, #2 ++ vmovn.u16 d5, q9 ++ vshrn.u16 d6, q10, #6 ++ ++ vmovn.u32 d16, q12 ++ vshrn.u32 d18, q12, #12 ++ vshrn.u32 d20, q12, #16 ++ ++ vmovn.u32 d17, q13 ++ vshrn.u32 d19, q13, #12 ++ vshrn.u32 d21, q13, #16 ++ ++ vshrn.u16 d16, q8, #2 ++ vmovn.u16 d17, q9 ++ vshrn.u16 d18, q10, #6 ++ ++ vmovn.u32 d20, q14 ++ vshrn.u32 d22, q14, #12 ++ vshrn.u32 d24, q14, #16 ++ ++ vmovn.u32 d21, q15 ++ vshrn.u32 d23, q15, #12 ++ vshrn.u32 d25, q15, #16 ++ ++ vshrn.u16 d20, q10, #2 ++ vmovn.u16 d21, q11 ++ vshrn.u16 d22, q12, #6 ++ ++ blt 2f ++ ++ vst3.8 {d0, d1, d2}, [r0], r12 ++ vst3.8 {d4, d5, d6}, [r4], r12 ++ vst3.8 {d16, d17, d18}, [r0], r12 ++ vst3.8 {d20, d21, d22}, [r4], r12 ++ ++ bne 1b ++ ++11: ++ subs r7, #1 ++ add r0, r1 ++ add r8, #128 ++ bne 10b ++ ++ pop {r4-r8, pc} ++ ++@ Partial final write ++2: ++ cmp r5, #48-96 ++ blt 1f ++ vst3.8 {d0, d1, d2}, [r0], r12 ++ vst3.8 {d4, d5, d6}, [r4], r12 ++ beq 11b ++ vmov q0, q8 ++ vmov q2, q10 ++ sub r5, #48 ++ vmov d2, d18 ++ vmov d6, d22 ++1: ++ cmp r5, #24-96 ++ blt 1f ++ vst3.8 {d0, d1, d2}, [r0]! ++ beq 11b ++ vmov q0, q2 ++ sub r5, #24 ++ vmov d2, d6 ++1: ++ cmp r5, #12-96 ++ blt 1f ++ vst3.8 {d0[0], d1[0], d2[0]}, [r0]! ++ vst3.8 {d0[1], d1[1], d2[1]}, [r0]! ++ vst3.8 {d0[2], d1[2], d2[2]}, [r0]! ++ vst3.8 {d0[3], d1[3], d2[3]}, [r0]! ++ beq 11b ++ vmov s0, s1 ++ sub r5, #12 ++ vmov s2, s3 ++ vmov s4, s5 ++1: ++ cmp r5, #6-96 ++ blt 1f ++ vst3.8 {d0[0], d1[0], d2[0]}, [r0]! ++ vst3.8 {d0[1], d1[1], d2[1]}, [r0]! ++ add r0, #12 ++ beq 11b ++ vshr.u32 d0, #16 ++ sub r5, #6 ++ vshr.u32 d1, #16 ++ vshr.u32 d2, #16 ++1: ++ cmp r5, #3-96 ++ blt 1f ++ vst3.8 {d0[0], d1[0], d2[0]}, [r0]! ++ beq 11b ++ sub r5, #3 ++ vshr.u32 d0, #8 ++ vshr.u32 d1, #8 ++1: ++ cmp r5, #2-96 ++ blt 1f ++ vst2.8 {d0[0], d1[0]}, [r0]! 
++ b 11b ++1: ++ vst1.8 {d0[0]}, [r0]! ++ b 11b ++ ++endfunc ++ + --- /dev/null +++ b/libavutil/arm/rpi_sand_neon.h -@@ -0,0 +1,99 @@ +@@ -0,0 +1,110 @@ +/* +Copyright (c) 2020 Raspberry Pi (Trading) Ltd. +All rights reserved. @@ -57804,6 +64871,17 @@ Upstream-status: Pending + unsigned int _w, // [sp, #12] -> r6 (cur r5) + unsigned int h); // [sp, #16] -> r7 + ++void ff_rpi_sand30_lines_to_planar_y8( ++ uint8_t * dest, // [r0] ++ unsigned int dst_stride, // [r1] ++ const uint8_t * src, // [r2] ++ unsigned int src_stride1, // [r3] Ignored - assumed 128 ++ unsigned int src_stride2, // [sp, #0] -> r3 ++ unsigned int _x, // [sp, #4] Ignored - 0 ++ unsigned int y, // [sp, #8] (r7 in prefix) ++ unsigned int _w, // [sp, #12] -> r6 (cur r5) ++ unsigned int h); // [sp, #16] -> r7 ++ +#endif // AVUTIL_ARM_SAND_NEON_H + --- a/libavutil/frame.c @@ -57811,7 +64889,7 @@ Upstream-status: Pending @@ -16,6 +16,8 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ - + +#include "config.h" + #include "channel_layout.h" @@ -57824,13 +64902,13 @@ Upstream-status: Pending +#if CONFIG_SAND +#include "rpi_sand_fns.h" +#endif - + #if FF_API_FRAME_GET_SET MAKE_ACCESSORS(AVFrame, frame, int64_t, best_effort_timestamp) @@ -902,6 +907,12 @@ int av_frame_apply_cropping(AVFrame *fra (frame->crop_top + frame->crop_bottom) >= frame->height) return AVERROR(ERANGE); - + +#if CONFIG_SAND + // Sand cannot be cropped - do not try + if (av_rpi_is_sand_format(frame->format)) @@ -57845,7 +64923,7 @@ Upstream-status: Pending @@ -968,6 +968,16 @@ int av_frame_apply_cropping(AVFrame *fra */ const char *av_frame_side_data_name(enum AVFrameSideDataType type); - + + +static inline int av_frame_cropped_width(const AVFrame * const frame) +{ @@ -57866,11 +64944,11 @@ Upstream-status: Pending #include #include +#include - + #include +#include #include - + #include "avassert.h" @@ -28,6 +30,11 @@ #include "hwcontext_drm.h" @@ -57881,13 +64959,13 @@ Upstream-status: Pending +#include +#include +#include - - + + static void drm_device_free(AVHWDeviceContext *hwdev) @@ -43,6 +50,11 @@ static int drm_device_create(AVHWDeviceC AVDRMDeviceContext *hwctx = hwdev->hwctx; drmVersionPtr version; - + + if (device == NULL) { + hwctx->fd = -1; + return 0; @@ -57905,7 +64983,7 @@ Upstream-status: Pending size_t length[AV_DRM_MAX_PLANES]; + int fds[AV_DRM_MAX_PLANES]; } DRMMapping; - + +static int dmasync(const int fd, const unsigned int flags) +{ + struct dma_buf_sync sync = { @@ -57926,19 +65004,19 @@ Upstream-status: Pending { DRMMapping *map = hwmap->priv; int i; - + - for (i = 0; i < map->nb_regions; i++) + for (i = 0; i < map->nb_regions; i++) { munmap(map->address[i], map->length[i]); + dmasync(map->fds[i], DMA_BUF_SYNC_END | map->dmaflags); + } - + av_free(map); } @@ -114,15 +145,28 @@ static int drm_map_frame(AVHWFramesConte if (!map) return AVERROR(ENOMEM); - + + for (i = 0; i < AV_DRM_MAX_PLANES; i++) + map->fds[i] = -1; + @@ -57956,7 +65034,7 @@ Upstream-status: Pending + + if (dst->format == AV_PIX_FMT_NONE) + dst->format = hwfc->sw_format; - + av_assert0(desc->nb_objects <= AV_DRM_MAX_PLANES); for (i = 0; i < desc->nb_objects; i++) { - addr = mmap(NULL, desc->objects[i].size, mmap_prot, MAP_SHARED, @@ -57968,7 +65046,7 @@ Upstream-status: Pending if (addr == MAP_FAILED) { err = AVERROR(errno); @@ -151,6 +195,23 @@ static int drm_map_frame(AVHWFramesConte - + dst->width = src->width; dst->height = src->height; + dst->crop_top = src->crop_top; @@ -57988,12 +65066,12 @@ Upstream-status: Pending + // *** Are we 
sure src->height is actually what we want ??? + } +#endif - + err = ff_hwframe_map_create(src->hw_frames_ctx, dst, src, &drm_unmap_frame, map); @@ -160,7 +221,9 @@ static int drm_map_frame(AVHWFramesConte return 0; - + fail: - for (i = 0; i < desc->nb_objects; i++) { + for (i = 0; i < AV_DRM_MAX_PLANES; i++) { @@ -58002,13 +65080,23 @@ Upstream-status: Pending if (map->address[i]) munmap(map->address[i], map->length[i]); } -@@ -178,7 +241,15 @@ static int drm_transfer_get_formats(AVHW - if (!pix_fmts) +@@ -172,16 +235,29 @@ static int drm_transfer_get_formats(AVHW + enum AVHWFrameTransferDirection dir, + enum AVPixelFormat **formats) + { +- enum AVPixelFormat *pix_fmts; ++ enum AVPixelFormat *p; + +- pix_fmts = av_malloc_array(2, sizeof(*pix_fmts)); +- if (!pix_fmts) ++ p = *formats = av_malloc_array(3, sizeof(*p)); ++ if (!p) return AVERROR(ENOMEM); - + - pix_fmts[0] = ctx->sw_format; +- pix_fmts[1] = AV_PIX_FMT_NONE; + // **** Offer native sand too ???? -+ pix_fmts[0] = ++ *p++ = +#if CONFIG_SAND + ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128 ? + AV_PIX_FMT_YUV420P : @@ -58016,21 +65104,30 @@ Upstream-status: Pending + AV_PIX_FMT_YUV420P10LE : +#endif + ctx->sw_format; - pix_fmts[1] = AV_PIX_FMT_NONE; - - *formats = pix_fmts; -@@ -197,18 +268,80 @@ static int drm_transfer_data_from(AVHWFr ++ ++#if CONFIG_SAND ++ if (ctx->sw_format == AV_PIX_FMT_RPI4_10 || ++ ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128) ++ *p++ = AV_PIX_FMT_NV12; ++#endif + +- *formats = pix_fmts; ++ *p = AV_PIX_FMT_NONE; + return 0; + } + +@@ -197,18 +273,63 @@ static int drm_transfer_data_from(AVHWFr map = av_frame_alloc(); if (!map) return AVERROR(ENOMEM); - map->format = dst->format; - + + // Map to default + map->format = AV_PIX_FMT_NONE; err = drm_map_frame(hwfc, map, src, AV_HWFRAME_MAP_READ); if (err) goto fail; - + - map->width = dst->width; - map->height = dst->height; +#if 0 @@ -58054,29 +65151,12 @@ Upstream-status: Pending + const unsigned int w = FFMIN(dst->width, map->width); + const unsigned int h = FFMIN(dst->height, map->height); + -+ if (map->format == AV_PIX_FMT_RPI4_8 && dst->format == AV_PIX_FMT_YUV420P) { -+ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], -+ map->data[0], -+ 128, stride2, -+ 0, 0, w, h); -+ av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1], -+ dst->data[2], dst->linesize[2], -+ map->data[1], -+ 128, stride2, -+ 0, 0, w / 2, h / 2); -+ } -+ else if (map->format == AV_PIX_FMT_RPI4_10 && dst->format == AV_PIX_FMT_YUV420P10LE) { -+ av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0], -+ map->data[0], -+ 128, stride2, -+ 0, 0, w, h); -+ av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1], -+ dst->data[2], dst->linesize[2], -+ map->data[1], -+ 128, stride2, -+ 0, 0, w / 2, h / 2); -+ } -+ else ++ map->crop_top = 0; ++ map->crop_bottom = 0; ++ map->crop_left = 0; ++ map->crop_right = 0; ++ ++ if (av_rpi_sand_to_planar_frame(dst, map) != 0) + { + av_log(hwfc, AV_LOG_ERROR, "%s: Incompatible output pixfmt for sand\n", __func__); + err = AVERROR(EINVAL); @@ -58094,30 +65174,30 @@ Upstream-status: Pending + map->height = dst->height; + err = av_frame_copy(dst, map); + } - + - err = av_frame_copy(dst, map); if (err) + { + av_log(hwfc, AV_LOG_ERROR, "%s: Copy fail\n", __func__); goto fail; + } - + err = 0; fail: -@@ -223,7 +356,10 @@ static int drm_transfer_data_to(AVHWFram +@@ -223,7 +344,10 @@ static int drm_transfer_data_to(AVHWFram int err; - + if (src->width > hwfc->width || src->height > 
hwfc->height) + { + av_log(hwfc, AV_LOG_ERROR, "%s: H/w mismatch: %d/%d, %d/%d\n", __func__, dst->width, hwfc->width, dst->height, hwfc->height); return AVERROR(EINVAL); + } - + map = av_frame_alloc(); if (!map) --- a/libavutil/pixdesc.c +++ b/libavutil/pixdesc.c -@@ -2371,6 +2371,38 @@ static const AVPixFmtDescriptor av_pix_f +@@ -2371,6 +2371,50 @@ static const AVPixFmtDescriptor av_pix_f .name = "vulkan", .flags = AV_PIX_FMT_FLAG_HWACCEL, }, @@ -58140,17 +65220,29 @@ Upstream-status: Pending + .log2_chroma_h = 1, + .comp = { + { 0, 2, 0, 0, 10, 0, 9, 1 }, /* Y */ -+ { 1, 4, 0, 0, 10, 1, 9, 1 }, /* U */ -+ { 1, 4, 1, 0, 10, 1, 9, 2 }, /* V */ ++ { 1, 4, 0, 0, 10, 3, 9, 1 }, /* U */ ++ { 1, 4, 2, 0, 10, 3, 9, 3 }, /* V */ ++ }, ++ .flags = 0, ++ }, ++ [AV_PIX_FMT_SAND64_16] = { ++ .name = "sand64_16", ++ .nb_components = 3, ++ .log2_chroma_w = 1, ++ .log2_chroma_h = 1, ++ .comp = { ++ { 0, 2, 0, 0, 16, 0, 15, 1 }, /* Y */ ++ { 1, 4, 0, 0, 16, 3, 15, 1 }, /* U */ ++ { 1, 4, 2, 0, 16, 3, 15, 3 }, /* V */ + }, + .flags = 0, + }, + [AV_PIX_FMT_RPI4_8] = { -+ .name = "rpi", ++ .name = "rpi4_8", + .flags = AV_PIX_FMT_FLAG_HWACCEL, + }, + [AV_PIX_FMT_RPI4_10] = { -+ .name = "rpi", ++ .name = "rpi4_10", + .flags = AV_PIX_FMT_FLAG_HWACCEL, + }, }; @@ -58159,7 +65251,7 @@ Upstream-status: Pending --- a/libavutil/pixfmt.h +++ b/libavutil/pixfmt.h @@ -357,6 +357,12 @@ enum AVPixelFormat { - + AV_PIX_FMT_Y210BE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, big-endian AV_PIX_FMT_Y210LE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, little-endian +// RPI - not on ifdef so can be got at by calling progs @@ -58168,7 +65260,7 @@ Upstream-status: Pending + AV_PIX_FMT_SAND64_16, ///< 4:2:0 16-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding + AV_PIX_FMT_RPI4_8, + AV_PIX_FMT_RPI4_10, - + AV_PIX_FMT_NB ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions }; --- /dev/null @@ -58403,7 +65495,7 @@ Upstream-status: Pending + --- /dev/null +++ b/libavutil/rpi_sand_fns.c -@@ -0,0 +1,356 @@ +@@ -0,0 +1,445 @@ +/* +Copyright (c) 2018 Raspberry Pi (Trading) Ltd. +All rights reserved. 
@@ -58635,6 +65727,75 @@ Upstream-status: Pending + } +} + ++// Fetches a single patch - offscreen fixup not done here ++// w <= stride1 ++// single lose bottom 2 bits truncation ++// _x & _w in pixels, strides in bytes ++void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word ++ const unsigned int xskip0 = _x - (x0 >> 2) * 3; ++ const unsigned int x1 = ((_x + _w) / 3) * 4; ++ const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3; ++ const unsigned int mask = stride1 - 1; ++ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2; ++ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words ++ ++#if HAVE_SAND_ASM ++ if (_x == 0) { ++ ff_rpi_sand30_lines_to_planar_y8(dst, dst_stride, src, stride1, stride2, _x, y, _w, h); ++ return; ++ } ++#endif ++ ++ if (x0 == x1) { ++ // ******************* ++ // Partial single word xfer ++ return; ++ } ++ ++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1) ++ { ++ unsigned int x = x0; ++ const uint32_t * p = (const uint32_t *)p0; ++ uint8_t * d = dst; ++ ++ if (xskip0 != 0) { ++ const uint32_t p3 = *p++; ++ ++ if (xskip0 == 1) ++ *d++ = (p3 >> 12) & 0xff; ++ *d++ = (p3 >> 22) & 0xff; ++ ++ if (((x += 4) & mask) == 0) ++ p += slice_inc; ++ } ++ ++ while (x != x1) { ++ const uint32_t p3 = *p++; ++ *d++ = (p3 >> 2) & 0xff; ++ *d++ = (p3 >> 12) & 0xff; ++ *d++ = (p3 >> 22) & 0xff; ++ ++ if (((x += 4) & mask) == 0) ++ p += slice_inc; ++ } ++ ++ if (xrem1 != 0) { ++ const uint32_t p3 = *p; ++ ++ *d++ = (p3 >> 2) & 0xff; ++ if (xrem1 == 2) ++ *d++ = (p3 >> 12) & 0xff; ++ } ++ } ++} ++ ++ + +// w/h in pixels +void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, @@ -58716,6 +65877,16 @@ Upstream-status: Pending + av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), + x/2, y/2, w/2, h/2); + break; ++ case AV_PIX_FMT_NV12: ++ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], ++ src->data[0], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x, y, w, h); ++ av_rpi_sand_to_planar_y8(dst->data[1], dst->linesize[1], ++ src->data[1], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x/2, y/2, w, h/2); ++ break; + default: + return -1; + } @@ -58750,6 +65921,16 @@ Upstream-status: Pending + av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), + x/2, y/2, w/2, h/2); + break; ++ case AV_PIX_FMT_NV12: ++ av_rpi_sand30_to_planar_y8(dst->data[0], dst->linesize[0], ++ src->data[0], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x, y, w, h); ++ av_rpi_sand30_to_planar_y8(dst->data[1], dst->linesize[1], ++ src->data[1], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x/2, y/2, w, h/2); ++ break; + default: + return -1; + } @@ -58762,7 +65943,7 @@ Upstream-status: Pending +} --- /dev/null +++ b/libavutil/rpi_sand_fns.h -@@ -0,0 +1,183 @@ +@@ -0,0 +1,188 @@ +/* +Copyright (c) 2018 Raspberry Pi (Trading) Ltd. +All rights reserved. 
@@ -58850,6 +66031,11 @@ Upstream-status: Pending + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h); + ++void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); + +// w/h in pixels +void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, @@ -58948,34 +66134,64 @@ Upstream-status: Pending + --- /dev/null +++ b/pi-util/BUILD.txt -@@ -0,0 +1,29 @@ +@@ -0,0 +1,59 @@ +Building Pi FFmpeg +================== + -+Configuration: -+============= ++Current only building on a Pi is supported. ++This builds ffmpeg the way I've tested it + -+These instructions work for cross compiles from Ubuntu 16.04 & Ubuntu -+18.04. I would expect most other linux environments to work but I haven't -+tried them. ++Get all dependencies - the current package dependencies are good enough + -+pi-util/conf_pi2.sh ++$ sudo apt-get build-dep ffmpeg + -+contains suitable options to build the code for Pi2/3. It expects to find -+git clones of ++Configure using the pi-util/conf_native.sh script ++------------------------------------------------- + -+https://github.com/raspberrypi/tools -+https://github.com/raspberrypi/firmware ++This sets the normal release options and creates an ouutput dir to build into ++The directory name will depend on system and options but will be under out/ + -+in the parent of the FFmpeg directory. I recommend using --depth 1 to avoid a -+lot of history you don't want. ++There are a few choices here ++ --mmal build including the legacy mmal-based decoders and zero-copy code ++ this requires appropriate libraries which currently will exist for ++ armv7 but not arm64 ++ --noshared ++ Build a static image rather than a shared library one. Static is ++ easier for testing as there is no need to worry about library ++ paths being confused and therefore running the wrong code, Shared ++ is what is needed, in most cases, when building for use by other ++ programs. + -+If you have a copy of qasm.py in ../local/bin then the .qasm sources will be -+rebuilt. Otherwise the prebuilt .c & .h files will be used. -+Likewise ../local/bin/vasmvidcore_std will enable VPU code rebuild ++So for a static build ++--------------------- + -+pi-util/conf_p1.sh should configure for Pi1. Beware that as of this time -+H265 QPU acceleration is broken on Pi1 and so it is disabled. ++$ pi-util/conf_native.sh --noshared ++ ++$ make -j8 -C out/ ++ ++You can now run ffmpeg directly from where it was built ++ ++For a shared build ++------------------ ++ ++$ pi-util/conf_native.sh ++ ++You will normally want an install target if shared. Note that the script has ++set this up to be generated in out//install, you don't have to worry ++about overwriting your system libs. ++ ++$ make -j8 -C out/ install ++ ++You can now set LD_LIBRARY_PATH appropriately and run ffmpeg from where it was ++built or install the image on the system - you have to be careful to get rid ++of all other ffmpeg libs or confusion may result. There is a little script ++that wipes all other versions - obviously use with care! 
++ ++$ sudo pi-util/clean_usr_libs.sh ++ ++Then simply copying from the install to /usr works ++ ++$ sudo cp -r out//install/* /usr + + --- /dev/null @@ -59137,29 +66353,32 @@ Upstream-status: Pending + --- /dev/null +++ b/pi-util/clean_usr_libs.sh -@@ -0,0 +1,23 @@ +@@ -0,0 +1,26 @@ +set -e +U=/usr/lib/arm-linux-gnueabihf +rm -f $U/libavcodec.* +rm -f $U/libavdevice.* +rm -f $U/libavfilter.* +rm -f $U/libavformat.* -+rm -f $U/libavresample.* +rm -f $U/libavutil.* ++rm -f $U/libswresample.* ++rm -f $U/libswscale.* +U=/usr/lib/arm-linux-gnueabihf/neon/vfp +rm -f $U/libavcodec.* +rm -f $U/libavdevice.* +rm -f $U/libavfilter.* +rm -f $U/libavformat.* -+rm -f $U/libavresample.* +rm -f $U/libavutil.* ++rm -f $U/libswresample.* ++rm -f $U/libswscale.* +U=/usr/lib/aarch64-linux-gnu +rm -f $U/libavcodec.* +rm -f $U/libavdevice.* +rm -f $U/libavfilter.* +rm -f $U/libavformat.* -+rm -f $U/libavresample.* +rm -f $U/libavutil.* ++rm -f $U/libswresample.* ++rm -f $U/libswscale.* + --- /dev/null +++ b/pi-util/conf_arm64_native.sh @@ -59706,57 +66925,90 @@ Upstream-status: Pending +1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 --- /dev/null +++ b/pi-util/conf_native.sh -@@ -0,0 +1,78 @@ +@@ -0,0 +1,106 @@ +echo "Configure for native build" + +FFSRC=`pwd` -+MC=`uname -m` ++MC=`dpkg --print-architecture` ++BUILDBASE=$FFSRC/out + +#RPI_KEEPS="-save-temps=obj" +RPI_KEEPS="" + -+if [ "$MC" == "aarch64" ]; then ++NOSHARED= ++MMAL= ++ ++while [ "$1" != "" ] ; do ++ case $1 in ++ --noshared) ++ NOSHARED=1 ++ ;; ++ --mmal) ++ MMAL=1 ++ ;; ++ *) ++ echo "Usage $0: [--noshared] [--mmal]" ++ exit 1 ++ ;; ++ esac ++ shift ++done ++ ++ ++MCOPTS= ++RPI_INCLUDES= ++RPI_LIBDIRS= ++RPI_DEFINES= ++RPI_EXTRALIBS= ++ ++if [ "$MC" == "arm64" ]; then + echo "M/C aarch64" + A=aarch64-linux-gnu + B=arm64 -+ MCOPTS= -+ RPI_INCLUDES= -+ RPI_LIBDIRS= -+ RPI_DEFINES= -+ RPI_EXTRALIBS= -+ RPIOPTS="--disable-mmal --enable-sand" -+else ++elif [ "$MC" == "armhf" ]; then + echo "M/C armv7" + A=arm-linux-gnueabihf + B=armv7 + MCOPTS="--arch=armv6t2 --cpu=cortex-a7" ++ RPI_DEFINES=-mfpu=neon-vfpv4 ++else ++ echo Unexpected architecture $MC ++ exit 1 ++fi ++ ++if [ $MMAL ]; then + RPI_OPT_VC=/opt/vc + RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" + RPI_LIBDIRS="-L$RPI_OPT_VC/lib" -+ RPI_DEFINES="-D__VCCOREVER__=0x4000000 -mfpu=neon-vfpv4" ++ RPI_DEFINES="$RPI_DEFINES -D__VCCOREVER__=0x4000000" + RPI_EXTRALIBS="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm -Wl,--end-group" + RPIOPTS="--enable-mmal --enable-rpi" ++else ++ RPIOPTS="--disable-mmal --enable-sand" +fi ++ +C=`lsb_release -sc` +V=`cat RELEASE` + +SHARED_LIBS="--enable-shared" -+if [ "$1" == "--noshared" ]; then ++if [ $NOSHARED ]; then + SHARED_LIBS="--disable-shared" -+ OUT=out/$B-$C-$V-static-rel ++ OUT=$BUILDBASE/$B-$C-$V-static-rel + echo Static libs +else + echo Shared libs -+ OUT=out/$B-$C-$V-shared-rel ++ OUT=$BUILDBASE/$B-$C-$V-shared-rel +fi + -+USR_PREFIX=$FFSRC/$OUT/install ++USR_PREFIX=$OUT/install +LIB_PREFIX=$USR_PREFIX/lib/$A +INC_PREFIX=$USR_PREFIX/include/$A + +echo Destination directory: $OUT -+mkdir -p $FFSRC/$OUT -+cd $FFSRC/$OUT ++mkdir -p $OUT ++# Nothing under here need worry git - including this .gitignore! 
++echo "**" > $BUILDBASE/.gitignore ++cd $OUT + +$FFSRC/configure \ + --prefix=$USR_PREFIX\ @@ -59767,10 +67019,8 @@ Upstream-status: Pending + --disable-thumb\ + --enable-v4l2-request\ + --enable-libdrm\ -+ --enable-epoxy\ -+ --enable-libudev\ -+ --enable-vout-drm\ + --enable-vout-egl\ ++ --enable-vout-drm\ + $SHARED_LIBS\ + $RPIOPTS\ + --extra-cflags="-ggdb $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\ @@ -59779,118 +67029,13 @@ Upstream-status: Pending + --extra-libs="$RPI_EXTRALIBS"\ + --extra-version="rpi" + -+# --enable-decoder=hevc_rpi\ -+# --enable-extra-warnings\ -+# --arch=armv71\ -+ -+# gcc option for getting asm listing -+# -Wa,-ahls ---- /dev/null -+++ b/pi-util/conf_pi1.sh -@@ -0,0 +1,39 @@ -+echo "Configure for Pi1" -+ -+RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf -+RPI_OPT_VC=`pwd`/../firmware/hardfp/opt/vc -+ -+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" -+RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib" -+#RPI_KEEPS="-save-temps=obj" -+RPI_KEEPS="" -+ -+SHARED_LIBS="--enable-shared" -+if [ "$1" == "--noshared" ]; then -+ SHARED_LIBS="--disable-shared" -+ echo Static libs -+else -+ echo Shared libs -+fi -+ -+./configure --enable-cross-compile\ -+ --cpu=arm1176jzf-s\ -+ --arch=arm\ -+ --disable-neon\ -+ --target-os=linux\ -+ --disable-stripping\ -+ --enable-mmal\ -+ $SHARED_LIBS\ -+ --extra-cflags="-g $RPI_KEEPS $RPI_INCLUDES"\ -+ --extra-cxxflags="$RPI_INCLUDES"\ -+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\ -+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ -+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- -+ -+ -+# --enable-extra-warnings\ -+# --arch=armv71\ -+# --enable-shared\ -+ -+# gcc option for getting asm listing -+# -Wa,-ahls ---- /dev/null -+++ b/pi-util/conf_pi2.sh -@@ -0,0 +1,57 @@ -+echo "Configure for Pi2/3" -+ -+FFSRC=`pwd` -+ -+RPI_TOOLROOT=$FFSRC/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf -+RPI_OPT_VC=$FFSRC/../firmware/hardfp/opt/vc -+ -+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" -+RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib" -+RPI_DEFINES="-D__VCCOREVER__=0x4000000 -mfpu=neon-vfpv4" -+#RPI_KEEPS="-save-temps=obj" -+RPI_KEEPS="" -+ -+SHARED_LIBS="--enable-shared" -+if [ "$1" == "--noshared" ]; then -+ SHARED_LIBS="--disable-shared" -+ OUT=out/x-armv7-static-rel -+ echo Static libs -+else -+ echo Shared libs -+ OUT=out/x-armv7-shared-rel -+fi -+ -+USR_PREFIX=$FFSRC/$OUT/install -+LIB_PREFIX=$USR_PREFIX/lib/arm-linux-gnueabihf -+INC_PREFIX=$USR_PREFIX/include/arm-linux-gnueabihf -+ -+mkdir -p $FFSRC/$OUT -+cd $FFSRC/$OUT -+ -+$FFSRC/configure --enable-cross-compile\ -+ --prefix=$USR_PREFIX\ -+ --libdir=$LIB_PREFIX\ -+ --incdir=$INC_PREFIX\ -+ --arch=armv6t2\ -+ --cpu=cortex-a7\ -+ --target-os=linux\ -+ --disable-stripping\ -+ --disable-thumb\ -+ --enable-mmal\ -+ --enable-rpi\ -+ $SHARED_LIBS\ -+ --extra-cflags="-ggdb $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\ -+ --extra-cxxflags="$RPI_DEFINES $RPI_INCLUDES"\ -+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\ -+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util 
-lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ -+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- -+ -+# --enable-shared\ -+ -+# --enable-decoder=hevc_rpi\ -+# --enable-extra-warnings\ -+# --arch=armv71\ -+# --enable-shared\ + +# gcc option for getting asm listing +# -Wa,-ahls --- /dev/null +++ b/pi-util/ffconf.py @@ -0,0 +1,215 @@ -+#!/usr/bin/env python ++#!/usr/bin/env python3 + +import string +import os @@ -59967,16 +67112,16 @@ Upstream-status: Pending + pass + + if m1 and m2 and m1.group() == m2.group(): -+ print >> flog, "Match: " + m1.group() ++ print("Match: " + m1.group(), file=flog) + rv = 0 + elif not m1: -+ print >> flog, "****** Cannot find m1" ++ print("****** Cannot find m1", file=flog) + rv = 3 + elif not m2: -+ print >> flog, "****** Cannot find m2" ++ print("****** Cannot find m2", file=flog) + rv = 2 + else: -+ print >> flog, "****** Mismatch: " + m1.group() + " != " + m2.group() ++ print("****** Mismatch: " + m1.group() + " != " + m2.group(), file=flog) + rv = 1 + flog.close() + return rv @@ -60022,7 +67167,7 @@ Upstream-status: Pending + exp_test = int(a[0]) + if (exp_test and runtest(a[1], tests)): + name = a[1] -+ print "==== ", name, ++ print ("==== ", name, end="") + sys.stdout.flush() + + rv = testone(os.path.join(test_root, name), name, a[2], a[3], a[4], dectype=dectype, vcodec=vcodec, ffmpeg_exec=ffmpeg_exec) @@ -60033,31 +67178,31 @@ Upstream-status: Pending + + if (rv == 0): + if exp_test == 2: -+ print ": * OK *" ++ print(": * OK *") + unx_success.append(name) + else: -+ print ": ok" ++ print(": ok") + elif exp_test == 2 and rv == 1: -+ print ": fail" ++ print(": fail") + elif exp_test == 3 and rv == 2: + # Call an expected "crash" an abort -+ print ": abort" ++ print(": abort") + else: + unx_failures.append(name) + if rv == 1: -+ print ": * FAIL *" ++ print(": * FAIL *") + elif (rv == 2) : -+ print ": * CRASH *" ++ print(": * CRASH *") + elif (rv == 3) : -+ print ": * MD5 MISSING *" ++ print(": * MD5 MISSING *") + else : -+ print ": * BANG *" ++ print(": * BANG *") + + if unx_failures or unx_success: -+ print "Unexpected Failures:", unx_failures -+ print "Unexpected Success: ", unx_success ++ print("Unexpected Failures:", unx_failures) ++ print("Unexpected Success: ", unx_success) + else: -+ print "All tests normal:", successes, "ok,", failures, "failed" ++ print("All tests normal:", successes, "ok,", failures, "failed") + + +class ConfCSVDialect(csv.Dialect): @@ -60567,3 +67712,630 @@ Upstream-status: Pending + + do_logparse(args.logfile) + +--- a/tests/checkasm/Makefile ++++ b/tests/checkasm/Makefile +@@ -9,8 +9,10 @@ AVCODECOBJS-$(CONFIG_G722DSP) + AVCODECOBJS-$(CONFIG_H264DSP) += h264dsp.o + AVCODECOBJS-$(CONFIG_H264PRED) += h264pred.o + AVCODECOBJS-$(CONFIG_H264QPEL) += h264qpel.o ++AVCODECOBJS-$(CONFIG_IDCTDSP) += idctdsp.o + AVCODECOBJS-$(CONFIG_LLVIDDSP) += llviddsp.o + AVCODECOBJS-$(CONFIG_LLVIDENCDSP) += llviddspenc.o ++AVCODECOBJS-$(CONFIG_VC1DSP) += vc1dsp.o + AVCODECOBJS-$(CONFIG_VP8DSP) += vp8dsp.o + AVCODECOBJS-$(CONFIG_VIDEODSP) += videodsp.o + +--- a/tests/checkasm/checkasm.c ++++ b/tests/checkasm/checkasm.c +@@ -121,6 +121,9 @@ static const struct { + #if CONFIG_HUFFYUV_DECODER + { "huffyuvdsp", checkasm_check_huffyuvdsp }, + #endif ++ #if CONFIG_IDCTDSP ++ { "idctdsp", checkasm_check_idctdsp }, ++ #endif + #if CONFIG_JPEG2000_DECODER + { "jpeg2000dsp", checkasm_check_jpeg2000dsp }, + #endif +@@ -145,6 +148,9 @@ static const struct { + #if CONFIG_V210_ENCODER + { "v210enc", checkasm_check_v210enc }, + #endif ++ #if CONFIG_VC1DSP 
++ { "vc1dsp", checkasm_check_vc1dsp }, ++ #endif + #if CONFIG_VP8DSP + { "vp8dsp", checkasm_check_vp8dsp }, + #endif +--- a/tests/checkasm/checkasm.h ++++ b/tests/checkasm/checkasm.h +@@ -60,6 +60,7 @@ void checkasm_check_hevc_add_res(void); + void checkasm_check_hevc_idct(void); + void checkasm_check_hevc_sao(void); + void checkasm_check_huffyuvdsp(void); ++void checkasm_check_idctdsp(void); + void checkasm_check_jpeg2000dsp(void); + void checkasm_check_llviddsp(void); + void checkasm_check_llviddspenc(void); +@@ -73,6 +74,7 @@ void checkasm_check_sw_scale(void); + void checkasm_check_utvideodsp(void); + void checkasm_check_v210dec(void); + void checkasm_check_v210enc(void); ++void checkasm_check_vc1dsp(void); + void checkasm_check_vf_eq(void); + void checkasm_check_vf_gblur(void); + void checkasm_check_vf_hflip(void); +--- /dev/null ++++ b/tests/checkasm/idctdsp.c +@@ -0,0 +1,98 @@ ++/* ++ * Copyright (c) 2022 Ben Avison ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with FFmpeg; if not, write to the Free Software Foundation, Inc., ++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ++ */ ++ ++#include ++ ++#include "checkasm.h" ++ ++#include "libavcodec/idctdsp.h" ++ ++#include "libavutil/common.h" ++#include "libavutil/internal.h" ++#include "libavutil/intreadwrite.h" ++#include "libavutil/mem_internal.h" ++ ++#define IDCTDSP_TEST(func) { #func, offsetof(IDCTDSPContext, func) }, ++ ++typedef struct { ++ const char *name; ++ size_t offset; ++} test; ++ ++#define RANDOMIZE_BUFFER16(name, size) \ ++ do { \ ++ int i; \ ++ for (i = 0; i < size; ++i) { \ ++ uint16_t r = rnd() % 0x201 - 0x100; \ ++ AV_WN16A(name##0 + i, r); \ ++ AV_WN16A(name##1 + i, r); \ ++ } \ ++ } while (0) ++ ++#define RANDOMIZE_BUFFER8(name, size) \ ++ do { \ ++ int i; \ ++ for (i = 0; i < size; ++i) { \ ++ uint8_t r = rnd(); \ ++ name##0[i] = r; \ ++ name##1[i] = r; \ ++ } \ ++ } while (0) ++ ++static void check_add_put_clamped(void) ++{ ++ /* Source buffers are only as big as needed, since any over-read won't affect results */ ++ LOCAL_ALIGNED_16(int16_t, src0, [64]); ++ LOCAL_ALIGNED_16(int16_t, src1, [64]); ++ /* Destination buffers have borders of one row above/below and 8 columns left/right to catch overflows */ ++ LOCAL_ALIGNED_8(uint8_t, dst0, [10 * 24]); ++ LOCAL_ALIGNED_8(uint8_t, dst1, [10 * 24]); ++ ++ AVCodecContext avctx = { 0 }; ++ IDCTDSPContext h; ++ ++ const test tests[] = { ++ IDCTDSP_TEST(add_pixels_clamped) ++ IDCTDSP_TEST(put_pixels_clamped) ++ IDCTDSP_TEST(put_signed_pixels_clamped) ++ }; ++ ++ ff_idctdsp_init(&h, &avctx); ++ ++ for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) { ++ void (*func)(const int16_t *, uint8_t * ptrdiff_t) = *(void **)((intptr_t) &h + tests[t].offset); ++ if (check_func(func, "idctdsp.%s", tests[t].name)) { ++ declare_func_emms(AV_CPU_FLAG_MMX, void, const int16_t *, uint8_t *, ptrdiff_t); ++ RANDOMIZE_BUFFER16(src, 64); ++ RANDOMIZE_BUFFER8(dst, 10 * 24); ++ call_ref(src0, 
dst0 + 24 + 8, 24); ++ call_new(src1, dst1 + 24 + 8, 24); ++ if (memcmp(dst0, dst1, 10 * 24)) ++ fail(); ++ bench_new(src1, dst1 + 24 + 8, 24); ++ } ++ } ++} ++ ++void checkasm_check_idctdsp(void) ++{ ++ check_add_put_clamped(); ++ report("idctdsp"); ++} +--- /dev/null ++++ b/tests/checkasm/vc1dsp.c +@@ -0,0 +1,452 @@ ++/* ++ * Copyright (c) 2022 Ben Avison ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with FFmpeg; if not, write to the Free Software Foundation, Inc., ++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ++ */ ++ ++#include ++ ++#include "checkasm.h" ++ ++#include "libavcodec/vc1dsp.h" ++ ++#include "libavutil/common.h" ++#include "libavutil/internal.h" ++#include "libavutil/intreadwrite.h" ++#include "libavutil/mem_internal.h" ++ ++#define VC1DSP_TEST(func) { #func, offsetof(VC1DSPContext, func) }, ++#define VC1DSP_SIZED_TEST(func, width, height) { #func, offsetof(VC1DSPContext, func), width, height }, ++ ++typedef struct { ++ const char *name; ++ size_t offset; ++ int width; ++ int height; ++} test; ++ ++typedef struct matrix { ++ size_t width; ++ size_t height; ++ float d[]; ++} matrix; ++ ++static const matrix T8 = { 8, 8, { ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 16, 15, 9, 4, -4, -9, -15, -16, ++ 16, 6, -6, -16, -16, -6, 6, 16, ++ 15, -4, -16, -9, 9, 16, 4, -15, ++ 12, -12, -12, 12, 12, -12, -12, 12, ++ 9, -16, 4, 15, -15, -4, 16, -9, ++ 6, -16, 16, -6, -6, 16, -16, 6, ++ 4, -9, 15, -16, 16, -15, 9, -4 ++} }; ++ ++static const matrix T4 = { 4, 4, { ++ 17, 17, 17, 17, ++ 22, 10, -10, -22, ++ 17, -17, -17, 17, ++ 10, -22, 22, -10 ++} }; ++ ++static const matrix T8t = { 8, 8, { ++ 12, 16, 16, 15, 12, 9, 6, 4, ++ 12, 15, 6, -4, -12, -16, -16, -9, ++ 12, 9, -6, -16, -12, 4, 16, 15, ++ 12, 4, -16, -9, 12, 15, -6, -16, ++ 12, -4, -16, 9, 12, -15, -6, 16, ++ 12, -9, -6, 16, -12, -4, 16, -15, ++ 12, -15, 6, 4, -12, 16, -16, 9, ++ 12, -16, 16, -15, 12, -9, 6, -4 ++} }; ++ ++static const matrix T4t = { 4, 4, { ++ 17, 22, 17, 10, ++ 17, 10, -17, -22, ++ 17, -10, -17, 22, ++ 17, -22, 17, -10 ++} }; ++ ++static matrix *new_matrix(size_t width, size_t height) ++{ ++ matrix *out = av_mallocz(sizeof (matrix) + height * width * sizeof (float)); ++ if (out == NULL) { ++ fprintf(stderr, "Memory allocation failure\n"); ++ exit(EXIT_FAILURE); ++ } ++ out->width = width; ++ out->height = height; ++ return out; ++} ++ ++static matrix *multiply(const matrix *a, const matrix *b) ++{ ++ matrix *out; ++ if (a->width != b->height) { ++ fprintf(stderr, "Incompatible multiplication\n"); ++ exit(EXIT_FAILURE); ++ } ++ out = new_matrix(b->width, a->height); ++ for (int j = 0; j < out->height; ++j) ++ for (int i = 0; i < out->width; ++i) { ++ float sum = 0; ++ for (int k = 0; k < a->width; ++k) ++ sum += a->d[j * a->width + k] * b->d[k * b->width + i]; ++ out->d[j * out->width + i] = sum; ++ } ++ return out; ++} ++ ++static void normalise(matrix *a) ++{ ++ for (int j = 0; j < a->height; ++j) ++ for (int i = 0; i 
< a->width; ++i) { ++ float *p = a->d + j * a->width + i; ++ *p *= 64; ++ if (a->height == 4) ++ *p /= (const unsigned[]) { 289, 292, 289, 292 } [j]; ++ else ++ *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [j]; ++ if (a->width == 4) ++ *p /= (const unsigned[]) { 289, 292, 289, 292 } [i]; ++ else ++ *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [i]; ++ } ++} ++ ++static void divide_and_round_nearest(matrix *a, float by) ++{ ++ for (int j = 0; j < a->height; ++j) ++ for (int i = 0; i < a->width; ++i) { ++ float *p = a->d + j * a->width + i; ++ *p = rintf(*p / by); ++ } ++} ++ ++static void tweak(matrix *a) ++{ ++ for (int j = 4; j < a->height; ++j) ++ for (int i = 0; i < a->width; ++i) { ++ float *p = a->d + j * a->width + i; ++ *p += 1; ++ } ++} ++ ++/* The VC-1 spec places restrictions on the values permitted at three ++ * different stages: ++ * - D: the input coefficients in frequency domain ++ * - E: the intermediate coefficients, inverse-transformed only horizontally ++ * - R: the fully inverse-transformed coefficients ++ * ++ * To fully cater for the ranges specified requires various intermediate ++ * values to be held to 17-bit precision; yet these conditions do not appear ++ * to be utilised in real-world streams. At least some assembly ++ * implementations have chosen to restrict these values to 16-bit precision, ++ * to accelerate the decoding of real-world streams at the cost of strict ++ * adherence to the spec. To avoid our test marking these as failures, ++ * reduce our random inputs. ++ */ ++#define ATTENUATION 4 ++ ++static matrix *generate_inverse_quantized_transform_coefficients(size_t width, size_t height) ++{ ++ matrix *raw, *tmp, *D, *E, *R; ++ raw = new_matrix(width, height); ++ for (int i = 0; i < width * height; ++i) ++ raw->d[i] = (int) (rnd() % (1024/ATTENUATION)) - 512/ATTENUATION; ++ tmp = multiply(height == 8 ? &T8 : &T4, raw); ++ D = multiply(tmp, width == 8 ? &T8t : &T4t); ++ normalise(D); ++ divide_and_round_nearest(D, 1); ++ for (int i = 0; i < width * height; ++i) { ++ if (D->d[i] < -2048/ATTENUATION || D->d[i] > 2048/ATTENUATION-1) { ++ /* Rare, so simply try again */ ++ av_free(raw); ++ av_free(tmp); ++ av_free(D); ++ return generate_inverse_quantized_transform_coefficients(width, height); ++ } ++ } ++ E = multiply(D, width == 8 ? &T8 : &T4); ++ divide_and_round_nearest(E, 8); ++ for (int i = 0; i < width * height; ++i) ++ if (E->d[i] < -4096/ATTENUATION || E->d[i] > 4096/ATTENUATION-1) { ++ /* Rare, so simply try again */ ++ av_free(raw); ++ av_free(tmp); ++ av_free(D); ++ av_free(E); ++ return generate_inverse_quantized_transform_coefficients(width, height); ++ } ++ R = multiply(height == 8 ? 
&T8t : &T4t, E); ++ tweak(R); ++ divide_and_round_nearest(R, 128); ++ for (int i = 0; i < width * height; ++i) ++ if (R->d[i] < -512/ATTENUATION || R->d[i] > 512/ATTENUATION-1) { ++ /* Rare, so simply try again */ ++ av_free(raw); ++ av_free(tmp); ++ av_free(D); ++ av_free(E); ++ av_free(R); ++ return generate_inverse_quantized_transform_coefficients(width, height); ++ } ++ av_free(raw); ++ av_free(tmp); ++ av_free(E); ++ av_free(R); ++ return D; ++} ++ ++#define RANDOMIZE_BUFFER16(name, size) \ ++ do { \ ++ int i; \ ++ for (i = 0; i < size; ++i) { \ ++ uint16_t r = rnd(); \ ++ AV_WN16A(name##0 + i, r); \ ++ AV_WN16A(name##1 + i, r); \ ++ } \ ++ } while (0) ++ ++#define RANDOMIZE_BUFFER8(name, size) \ ++ do { \ ++ int i; \ ++ for (i = 0; i < size; ++i) { \ ++ uint8_t r = rnd(); \ ++ name##0[i] = r; \ ++ name##1[i] = r; \ ++ } \ ++ } while (0) ++ ++#define RANDOMIZE_BUFFER8_MID_WEIGHTED(name, size) \ ++ do { \ ++ uint8_t *p##0 = name##0, *p##1 = name##1; \ ++ int i = (size); \ ++ while (i-- > 0) { \ ++ int x = 0x80 | (rnd() & 0x7F); \ ++ x >>= rnd() % 9; \ ++ if (rnd() & 1) \ ++ x = -x; \ ++ *p##1++ = *p##0++ = 0x80 + x; \ ++ } \ ++ } while (0) ++ ++static void check_inv_trans_inplace(void) ++{ ++ /* Inverse transform input coefficients are stored in a 16-bit buffer ++ * with row stride of 8 coefficients irrespective of transform size. ++ * vc1_inv_trans_8x8 differs from the others in two ways: coefficients ++ * are stored in column-major order, and the outputs are written back ++ * to the input buffer, so we oversize it slightly to catch overruns. */ ++ LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [10 * 8]); ++ LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [10 * 8]); ++ ++ VC1DSPContext h; ++ ++ ff_vc1dsp_init(&h); ++ ++ if (check_func(h.vc1_inv_trans_8x8, "vc1dsp.vc1_inv_trans_8x8")) { ++ matrix *coeffs; ++ declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *); ++ RANDOMIZE_BUFFER16(inv_trans_in, 10 * 8); ++ coeffs = generate_inverse_quantized_transform_coefficients(8, 8); ++ for (int j = 0; j < 8; ++j) ++ for (int i = 0; i < 8; ++i) { ++ int idx = 8 + i * 8 + j; ++ inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * 8 + i]; ++ } ++ call_ref(inv_trans_in0 + 8); ++ call_new(inv_trans_in1 + 8); ++ if (memcmp(inv_trans_in0, inv_trans_in1, 10 * 8 * sizeof (int16_t))) ++ fail(); ++ bench_new(inv_trans_in1 + 8); ++ av_free(coeffs); ++ } ++} ++ ++static void check_inv_trans_adding(void) ++{ ++ /* Inverse transform input coefficients are stored in a 16-bit buffer ++ * with row stride of 8 coefficients irrespective of transform size. */ ++ LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [8 * 8]); ++ LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [8 * 8]); ++ ++ /* For all but vc1_inv_trans_8x8, the inverse transform is narrowed and ++ * added with saturation to an array of unsigned 8-bit values. Oversize ++ * this by 8 samples left and right and one row above and below. 
*/ ++ LOCAL_ALIGNED_8(uint8_t, inv_trans_out0, [10 * 24]); ++ LOCAL_ALIGNED_8(uint8_t, inv_trans_out1, [10 * 24]); ++ ++ VC1DSPContext h; ++ ++ const test tests[] = { ++ VC1DSP_SIZED_TEST(vc1_inv_trans_8x4, 8, 4) ++ VC1DSP_SIZED_TEST(vc1_inv_trans_4x8, 4, 8) ++ VC1DSP_SIZED_TEST(vc1_inv_trans_4x4, 4, 4) ++ VC1DSP_SIZED_TEST(vc1_inv_trans_8x8_dc, 8, 8) ++ VC1DSP_SIZED_TEST(vc1_inv_trans_8x4_dc, 8, 4) ++ VC1DSP_SIZED_TEST(vc1_inv_trans_4x8_dc, 4, 8) ++ VC1DSP_SIZED_TEST(vc1_inv_trans_4x4_dc, 4, 4) ++ }; ++ ++ ff_vc1dsp_init(&h); ++ ++ for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) { ++ void (*func)(uint8_t *, ptrdiff_t, int16_t *) = *(void **)((intptr_t) &h + tests[t].offset); ++ if (check_func(func, "vc1dsp.%s", tests[t].name)) { ++ matrix *coeffs; ++ declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int16_t *); ++ RANDOMIZE_BUFFER16(inv_trans_in, 8 * 8); ++ RANDOMIZE_BUFFER8(inv_trans_out, 10 * 24); ++ coeffs = generate_inverse_quantized_transform_coefficients(tests[t].width, tests[t].height); ++ for (int j = 0; j < tests[t].height; ++j) ++ for (int i = 0; i < tests[t].width; ++i) { ++ int idx = j * 8 + i; ++ inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * tests[t].width + i]; ++ } ++ call_ref(inv_trans_out0 + 24 + 8, 24, inv_trans_in0); ++ call_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1); ++ if (memcmp(inv_trans_out0, inv_trans_out1, 10 * 24)) ++ fail(); ++ bench_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1 + 8); ++ av_free(coeffs); ++ } ++ } ++} ++ ++static void check_loop_filter(void) ++{ ++ /* Deblocking filter buffers are big enough to hold a 16x16 block, ++ * plus 16 columns left and 4 rows above to hold filter inputs ++ * (depending on whether v or h neighbouring block edge, oversized ++ * horizontally to maintain 16-byte alignment) plus 16 columns and ++ * 4 rows below to catch write overflows */ ++ LOCAL_ALIGNED_16(uint8_t, filter_buf0, [24 * 48]); ++ LOCAL_ALIGNED_16(uint8_t, filter_buf1, [24 * 48]); ++ ++ VC1DSPContext h; ++ ++ const test tests[] = { ++ VC1DSP_TEST(vc1_v_loop_filter4) ++ VC1DSP_TEST(vc1_h_loop_filter4) ++ VC1DSP_TEST(vc1_v_loop_filter8) ++ VC1DSP_TEST(vc1_h_loop_filter8) ++ VC1DSP_TEST(vc1_v_loop_filter16) ++ VC1DSP_TEST(vc1_h_loop_filter16) ++ }; ++ ++ ff_vc1dsp_init(&h); ++ ++ for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) { ++ void (*func)(uint8_t *, ptrdiff_t, int) = *(void **)((intptr_t) &h + tests[t].offset); ++ declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int); ++ if (check_func(func, "vc1dsp.%s", tests[t].name)) { ++ for (int count = 1000; count > 0; --count) { ++ int pq = rnd() % 31 + 1; ++ RANDOMIZE_BUFFER8_MID_WEIGHTED(filter_buf, 24 * 48); ++ call_ref(filter_buf0 + 4 * 48 + 16, 48, pq); ++ call_new(filter_buf1 + 4 * 48 + 16, 48, pq); ++ if (memcmp(filter_buf0, filter_buf1, 24 * 48)) ++ fail(); ++ } ++ } ++ for (int j = 0; j < 24; ++j) ++ for (int i = 0; i < 48; ++i) ++ filter_buf1[j * 48 + i] = 0x60 + 0x40 * (i >= 16 && j >= 4); ++ if (check_func(func, "vc1dsp.%s_bestcase", tests[t].name)) ++ bench_new(filter_buf1 + 4 * 48 + 16, 48, 1); ++ if (check_func(func, "vc1dsp.%s_worstcase", tests[t].name)) ++ bench_new(filter_buf1 + 4 * 48 + 16, 48, 31); ++ } ++} ++ ++#define TEST_UNESCAPE \ ++ do { \ ++ for (int count = 100; count > 0; --count) { \ ++ escaped_offset = rnd() & 7; \ ++ unescaped_offset = rnd() & 7; \ ++ escaped_len = (1u << (rnd() % 8) + 3) - (rnd() & 7); \ ++ RANDOMIZE_BUFFER8(unescaped, UNESCAPE_BUF_SIZE); \ ++ len0 = call_ref(escaped0 + escaped_offset, escaped_len, unescaped0 + 
unescaped_offset); \ ++ len1 = call_new(escaped1 + escaped_offset, escaped_len, unescaped1 + unescaped_offset); \ ++ if (len0 != len1 || memcmp(unescaped0, unescaped1, UNESCAPE_BUF_SIZE)) \ ++ fail(); \ ++ } \ ++ } while (0) ++ ++static void check_unescape(void) ++{ ++ /* This appears to be a typical length of buffer in use */ ++#define LOG2_UNESCAPE_BUF_SIZE 17 ++#define UNESCAPE_BUF_SIZE (1u< +Date: Thu, 8 Dec 2022 10:34:20 -0600 +Subject: [PATCH] configure: setup for OE-core usage + +Upstream-Status: Inappropriate + +RPI-Distro repo clones original ffmpeg and applies patches to enable +raspiberry pi support. + +Add global CFLAGS and LDFLAGS. So, that when +./configure runs test it's able to locate proper +headers and libs in a cross-compile environment. + +Add new check to opengl. None of the above headers +exists and we also should be using GLESv2. + +Update where compiler finds OMX_Core.h + +Only check that sdl2 version greater than 2.0.1 + +Signed-off-by: Vincent Davis Jr +--- + configure | 16 +++++++++------- + 1 file changed, 9 insertions(+), 7 deletions(-) + +diff --git a/configure b/configure +index 723b81f1..0c7f2654 100755 +--- a/configure ++++ b/configure +@@ -5746,6 +5746,9 @@ enable_weak_pic() { + } + + enabled pic && enable_weak_pic ++# Set CFLAGS and LDFLAGS globally ++add_cflags -I${sysroot}/usr/include/ -I${sysroot}/usr/include/IL -I${sysroot}/usr/include/drm ++add_ldflags -L${sysroot}/usr/lib/ + + test_cc <= 2.0.1 sdl2 < 2.1.0" SDL_events.h SDL_PollEvent ++ test_pkg_config sdl2 "sdl2 >= 2.0.1" SDL_events.h SDL_PollEvent + if disabled sdl2 && "${SDL2_CONFIG}" --version > /dev/null 2>&1; then + sdl2_cflags=$("${SDL2_CONFIG}" --cflags) + sdl2_extralibs=$("${SDL2_CONFIG}" --libs) +-- +2.38.1 + diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/2002-libavdevice-opengl_enc-update-dynamic-function-loader.patch b/recipes-multimedia/rpidistro-ffmpeg/files/2002-libavdevice-opengl_enc-update-dynamic-function-loader.patch new file mode 100644 index 0000000..43a9191 --- /dev/null +++ b/recipes-multimedia/rpidistro-ffmpeg/files/2002-libavdevice-opengl_enc-update-dynamic-function-loader.patch @@ -0,0 +1,111 @@ +From be426ad76c3e486f1364dd292cf8e1c633c80e91 Mon Sep 17 00:00:00 2001 +From: Vincent Davis Jr +Date: Thu, 8 Dec 2022 10:39:47 -0600 +Subject: [PATCH] libavdevice: opengl_enc.c update dynamic function loader + +Upstream-Status: Inappropriate + +RPI-Distro repo clones original ffmpeg and applies patches to enable +raspiberry pi support. + +For meta-raspberrypi ffmpeg builds, when opengl +is enabled do_compile will fail. Reasion is that +glGetProcAddress is undefined in either GLES2/gl2.h +or GLES2/gl2ext.h. + +define SelectedGetProcAddress to SDL_GL_GetProcAddress +if sdl2 is included. If not included, define function +pointers at compile time versus runtime. 
+
+Signed-off-by: Vincent Davis Jr
+---
+ libavdevice/opengl_enc.c | 44 ++++++++++++++++++++++++++++++++++++----
+ 1 file changed, 40 insertions(+), 4 deletions(-)
+
+diff --git a/libavdevice/opengl_enc.c b/libavdevice/opengl_enc.c
+index 2bdb8da7..eabc1bf8 100644
+--- a/libavdevice/opengl_enc.c
++++ b/libavdevice/opengl_enc.c
+@@ -37,12 +37,13 @@
+ #include
+ #elif HAVE_ES2_GL_H
+ #include
+-#else
+-#include
+-#include
+ #endif
+ #if HAVE_GLXGETPROCADDRESS
+ #include
++#else
++#define GL_GLEXT_PROTOTYPES
++#include
++#include
+ #endif
+ 
+ #if CONFIG_SDL2
+@@ -493,8 +494,14 @@ static int av_cold opengl_load_procedures(OpenGLContext *opengl)
+ 
+ #if HAVE_GLXGETPROCADDRESS
+ #define SelectedGetProcAddress glXGetProcAddress
++#define CAN_DYNAMIC_LOAD 1
+ #elif HAVE_WGLGETPROCADDRESS
+ #define SelectedGetProcAddress wglGetProcAddress
++#elif CONFIG_SDL2
++#define SelectedGetProcAddress SDL_GL_GetProcAddress
++#define CAN_DYNAMIC_LOAD 1
++#else
++#define CAN_DYNAMIC_LOAD 0
+ #endif
+ 
+ #define LOAD_OPENGL_FUN(name, type) \
+@@ -504,7 +511,8 @@ static int av_cold opengl_load_procedures(OpenGLContext *opengl)
+         return AVERROR(ENOSYS); \
+     }
+ 
+-#if CONFIG_SDL2
++#if CAN_DYNAMIC_LOAD
++#if CONFIG_SDL2
+     if (!opengl->no_window)
+         return opengl_sdl_load_procedures(opengl);
+ #endif
+@@ -534,9 +542,37 @@ static int av_cold opengl_load_procedures(OpenGLContext *opengl)
+     LOAD_OPENGL_FUN(glGetShaderInfoLog, FF_PFNGLGETSHADERINFOLOGPROC)
+     LOAD_OPENGL_FUN(glEnableVertexAttribArray, FF_PFNGLENABLEVERTEXATTRIBARRAYPROC)
+     LOAD_OPENGL_FUN(glVertexAttribPointer, FF_PFNGLVERTEXATTRIBPOINTERPROC)
++#else
++    procs->glActiveTexture = glActiveTexture;
++    procs->glGenBuffers = glGenBuffers;
++    procs->glDeleteBuffers = glDeleteBuffers;
++    procs->glBufferData = glBufferData;
++    procs->glBindBuffer = glBindBuffer;
++    procs->glGetAttribLocation = glGetAttribLocation;
++    procs->glGetUniformLocation = glGetUniformLocation;
++    procs->glUniform1f = glUniform1f;
++    procs->glUniform1i = glUniform1i;
++    procs->glUniformMatrix4fv = glUniformMatrix4fv;
++    procs->glCreateProgram = glCreateProgram;
++    procs->glDeleteProgram = glDeleteProgram;
++    procs->glUseProgram = glUseProgram;
++    procs->glLinkProgram = glLinkProgram;
++    procs->glGetProgramiv = glGetProgramiv;
++    procs->glGetProgramInfoLog = glGetProgramInfoLog;
++    procs->glAttachShader = glAttachShader;
++    procs->glCreateShader = glCreateShader;
++    procs->glDeleteShader = glDeleteShader;
++    procs->glCompileShader = glCompileShader;
++    procs->glShaderSource = glShaderSource;
++    procs->glGetShaderiv = glGetShaderiv;
++    procs->glGetShaderInfoLog = glGetShaderInfoLog;
++    procs->glEnableVertexAttribArray = glEnableVertexAttribArray;
++    procs->glVertexAttribPointer = (FF_PFNGLVERTEXATTRIBPOINTERPROC) glVertexAttribPointer;
++#endif
+ 
+     return 0;
+ 
++#undef CAN_DYNAMIC_LOAD
+ #undef SelectedGetProcAddress
+ #undef LOAD_OPENGL_FUN
+ }
+--
+2.38.1
+
diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/2003-libavcodec-fix-v4l2_req_devscan.patch b/recipes-multimedia/rpidistro-ffmpeg/files/2003-libavcodec-fix-v4l2_req_devscan.patch
new file mode 100644
index 0000000..2232c48
--- /dev/null
+++ b/recipes-multimedia/rpidistro-ffmpeg/files/2003-libavcodec-fix-v4l2_req_devscan.patch
@@ -0,0 +1,45 @@
+From 62c2f041890a6e20770350721a0a2138d0b38634 Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr
+Date: Sat, 3 Dec 2022 23:35:51 -0600
+Subject: [PATCH] libavcodec: fix v4l2_req_devscan.h
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo clones original ffmpeg and applies patches to enable
+raspberry pi support.
+
+Fixes minor differences between v4l2_req_devscan.c
+and v4l2_req_devscan.h after all patches have been
+applied.
+
+Signed-off-by: Vincent Davis Jr
+---
+ libavcodec/v4l2_req_devscan.h | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/libavcodec/v4l2_req_devscan.h b/libavcodec/v4l2_req_devscan.h
+index 0baef365..cd9c49ac 100644
+--- a/libavcodec/v4l2_req_devscan.h
++++ b/libavcodec/v4l2_req_devscan.h
+@@ -1,6 +1,8 @@
+ #ifndef _DEVSCAN_H_
+ #define _DEVSCAN_H_
+ 
++#include
++
+ struct devscan;
+ struct decdev;
+ enum v4l2_buf_type;
+@@ -13,7 +15,8 @@ const char *decdev_video_path(const struct decdev *const dev);
+ enum v4l2_buf_type decdev_src_type(const struct decdev *const dev);
+ uint32_t decdev_src_pixelformat(const struct decdev *const dev);
+ 
+-const struct decdev *devscan_find(struct devscan *const scan, const uint32_t src_fmt_v4l2);
++const struct decdev *devscan_find(struct devscan *const scan,
++                                  const uint32_t src_fmt_v4l2);
+ 
+ int devscan_build(void * const dc, struct devscan **pscan);
+ void devscan_delete(struct devscan **const pScan);
+--
+2.38.1
+
diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/2004-libavcodec-omx-replace-opt-vc-path-with-usr-lib.patch b/recipes-multimedia/rpidistro-ffmpeg/files/2004-libavcodec-omx-replace-opt-vc-path-with-usr-lib.patch
new file mode 100644
index 0000000..02c07de
--- /dev/null
+++ b/recipes-multimedia/rpidistro-ffmpeg/files/2004-libavcodec-omx-replace-opt-vc-path-with-usr-lib.patch
@@ -0,0 +1,35 @@
+From 0dfb56e12fa709794525cda1471091f6699905d5 Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr
+Date: Thu, 8 Dec 2022 10:49:03 -0600
+Subject: [PATCH] libavcodec: omx replace /opt/vc path with /usr/lib
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo clones original ffmpeg and applies patches to enable
+raspberry pi support.
+
+Configures omx.c for OE usage, as libbcm_host.so
+and libopenmaxil.so are located in a different
+location.
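
As a side note on the runtime effect of this path change, here is a small sketch of the candidate-path probing that omx.c performs with dlopen(). Only the two /usr/lib paths come from the patch; the helper, its output and the RTLD flags shown are invented for illustration (build with -ldl on glibc).

/* Probe a list of candidate shared objects and return the first that loads. */
#include <dlfcn.h>
#include <stdio.h>

static void *try_load_first(const char * const names[], int count)
{
    for (int i = 0; i < count; i++) {
        void *handle = dlopen(names[i], RTLD_NOW | RTLD_GLOBAL);
        if (handle) {
            printf("loaded %s\n", names[i]);
            return handle;
        }
        printf("could not load %s: %s\n", names[i], dlerror());
    }
    return NULL;
}

int main(void)
{
    /* after the patch these are looked up in /usr/lib instead of /opt/vc/lib */
    static const char * const libnames[] = {
        "/usr/lib/libopenmaxil.so",
        "/usr/lib/libbcm_host.so",
    };
    void *handle = try_load_first(libnames, 2);
    if (handle)
        dlclose(handle);
    return 0;
}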
+
+Signed-off-by: Vincent Davis Jr
+---
+ libavcodec/omx.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/libavcodec/omx.c b/libavcodec/omx.c
+index 0a6a3083..8c6e9193 100644
+--- a/libavcodec/omx.c
++++ b/libavcodec/omx.c
+@@ -141,7 +141,7 @@ static av_cold OMXContext *omx_init(void *logctx, const char *libname, const cha
+ {
+     static const char * const libnames[] = {
+ #if CONFIG_OMX_RPI
+-        "/opt/vc/lib/libopenmaxil.so", "/opt/vc/lib/libbcm_host.so",
++        "/usr/lib/libopenmaxil.so", "/usr/lib/libbcm_host.so",
+ #else
+         "libOMX_Core.so", NULL,
+         "libOmxCore.so", NULL,
+--
+2.38.1
+
diff --git a/recipes-multimedia/rpidistro-ffmpeg/rpidistro-ffmpeg_4.3.2.bb b/recipes-multimedia/rpidistro-ffmpeg/rpidistro-ffmpeg_4.3.4.bb
similarity index 89%
rename from recipes-multimedia/rpidistro-ffmpeg/rpidistro-ffmpeg_4.3.2.bb
rename to recipes-multimedia/rpidistro-ffmpeg/rpidistro-ffmpeg_4.3.4.bb
index de0d445..1720d57 100644
--- a/recipes-multimedia/rpidistro-ffmpeg/rpidistro-ffmpeg_4.3.2.bb
+++ b/recipes-multimedia/rpidistro-ffmpeg/rpidistro-ffmpeg_4.3.4.bb
@@ -33,23 +33,27 @@ RPROVIDES:${PN} = "${PROVIDES}"
 DEPENDS = "nasm-native"
 
 inherit autotools pkgconfig
 
-PACKAGECONFIG ??= "avdevice avfilter avcodec avformat swresample swscale postproc avresample \
-    opengl udev sdl2 ffplay alsa bzlib lzma pic pthreads shared theora zlib \
-    libvorbis x264 gpl sand rpi vout-drm vout-egl \
-    ${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', 'mmal', d)} \
+PACKAGECONFIG ??= "avdevice avfilter avcodec avformat swresample swscale postproc avresample ffplay \
+    v4l2 drm udev alsa bzlib lzma pic pthreads shared theora zlib libvorbis x264 gpl \
+    ${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', 'mmal rpi sand vout-drm', d)} \
     ${@bb.utils.contains('AVAILTUNES', 'mips32r2', 'mips32r2', '', d)} \
-    ${@bb.utils.contains('DISTRO_FEATURES', 'x11', 'xv xcb', '', d)}"
+    ${@bb.utils.contains('DISTRO_FEATURES', 'opengl', 'opengl', '', d)} \
+    ${@bb.utils.contains('DISTRO_FEATURES', 'x11', 'xv xcb vout-egl epoxy', '', d)}"
 
 SRC_URI = "\
     git://git@github.com/RPi-Distro/ffmpeg;protocol=https;branch=pios/bullseye \
     file://0001-avcodec-arm-sbcenc-avoid-callee-preserved-vfp-regist.patch \
     file://0002-Fix-build-on-powerpc-and-ppc64.patch \
     file://0003-avcodec-pngenc-remove-monowhite-from-apng-formats.patch \
-    file://0004-ffmpeg-4.3.2-rpi_10.patch \
-    file://0005-fix_flags.diff \
-"
+    file://0004-ffmpeg-4.3.4-rpi_14.patch \
+    file://0005-fix-flags.diff \
+    file://2001-configure-setup-for-OE-core-usage.patch \
+    file://2002-libavdevice-opengl_enc-update-dynamic-function-loader.patch \
+    file://2003-libavcodec-fix-v4l2_req_devscan.patch \
+    file://2004-libavcodec-omx-replace-opt-vc-path-with-usr-lib.patch \
+    "
 
-SRCREV = "ea72093f350f38edcd39c480b331c3219c377642"
+SRCREV = "246e1a55a0eca931537d8706acd8b133c07beb05"
 
 S = "${WORKDIR}/git"
@@ -70,7 +74,7 @@ PACKAGECONFIG[altivec] = "--enable-altivec,--disable-altivec,"
 PACKAGECONFIG[bzlib] = "--enable-bzlib,--disable-bzlib,bzip2"
 PACKAGECONFIG[fdk-aac] = "--enable-libfdk-aac --enable-nonfree,--disable-libfdk-aac,fdk-aac"
 PACKAGECONFIG[gpl] = "--enable-gpl,--disable-gpl"
-PACKAGECONFIG[opengl] = "--enable-opengl,--disable-opengl,virtual/libgl"
+PACKAGECONFIG[opengl] = "--enable-opengl,--disable-opengl,virtual/libgles2"
 PACKAGECONFIG[gsm] = "--enable-libgsm,--disable-libgsm,libgsm"
 PACKAGECONFIG[jack] = "--enable-indev=jack,--disable-indev=jack,jack"
 PACKAGECONFIG[libvorbis] = "--enable-libvorbis,--disable-libvorbis,libvorbis"
@@ -90,9 +94,11 @@ PACKAGECONFIG[x264] = "--enable-libx264,--disable-libx264,x264"
 PACKAGECONFIG[xcb] = "--enable-libxcb,--disable-libxcb,libxcb"
 PACKAGECONFIG[xv] = "--enable-outdev=xv,--disable-outdev=xv,libxv"
 PACKAGECONFIG[zlib] = "--enable-zlib,--disable-zlib,zlib"
-#PACKAGECONFIG[snappy] = "--enable-libsnappy,--enable-libsnappy,snappy"
+PACKAGECONFIG[snappy] = "--enable-libsnappy,--disable-libsnappy,snappy"
 PACKAGECONFIG[udev] = "--enable-libudev,--disable-libudev,udev"
-PACKAGECONFIG[v4l2] = "--enable-libv4l2 --enable-v4l2-request --enable-libdrm,,v4l-utils"
+PACKAGECONFIG[drm] = "--enable-libdrm,--disable-libdrm,libdrm"
+PACKAGECONFIG[epoxy] = "--enable-epoxy,--disable-epoxy,libepoxy"
+PACKAGECONFIG[v4l2] = "--enable-libv4l2 --enable-v4l2-m2m --enable-v4l2-request,,v4l-utils"
 PACKAGECONFIG[mmal] = "--enable-omx --enable-omx-rpi --enable-mmal,,userland"
 PACKAGECONFIG[sand] = "--enable-sand,,"
 PACKAGECONFIG[rpi] = "--enable-rpi,,"
@@ -138,11 +144,6 @@ EXTRA_OECONF = " \
 "
 EXTRA_OECONF:append:linux-gnux32 = " --disable-asm"
 
-# Directly specify the include directories the contain headers for
-# libdrm
-# openmaxil
-TARGET_CFLAGS:append = " -I${STAGING_INCDIR}/IL -I${STAGING_INCDIR}/drm"
-
 # gold crashes on x86, another solution is to --disable-asm but thats more hacky
 # ld.gold: internal error in relocate_section, at ../../gold/i386.cc:3684
 LDFLAGS:append:x86 = "${@bb.utils.contains('DISTRO_FEATURES', 'ld-is-gold', ' -fuse-ld=bfd ', '', d)}"
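
Closing note: the vc1dsp checkasm tests brought in by 0004-ffmpeg-4.3.4-rpi_14.patch earlier in this series follow the usual checkasm recipe of randomising inputs, running the reference and the optimised implementation, and comparing the outputs byte for byte. The sketch below shows that pattern in a self-contained form; every name in it is invented for the illustration and it is not part of the recipe or of the patches above.

/* Self-contained sketch of the checkasm-style ref-vs-new comparison. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void add_ref(uint8_t *dst, const uint8_t *src, int n)
{
    for (int i = 0; i < n; i++)
        dst[i] = (uint8_t)(dst[i] + src[i]);
}

/* stand-in for a SIMD version; deliberately identical in behaviour */
static void add_new(uint8_t *dst, const uint8_t *src, int n)
{
    for (int i = 0; i < n; i++)
        dst[i] = (uint8_t)(dst[i] + src[i]);
}

int main(void)
{
    uint8_t src[64], out0[64], out1[64];
    for (int iter = 0; iter < 1000; iter++) {
        for (int i = 0; i < 64; i++) {
            src[i]  = (uint8_t)rand();
            out0[i] = out1[i] = (uint8_t)rand();
        }
        add_ref(out0, src, 64);
        add_new(out1, src, 64);
        if (memcmp(out0, out1, sizeof(out0))) {
            fprintf(stderr, "mismatch at iteration %d\n", iter);
            return 1;
        }
    }
    printf("ok\n");
    return 0;
}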