From 1f332405495f87951da89642d6ba845dea29268c Mon Sep 17 00:00:00 2001
From: Vincent Davis Jr
Date: Thu, 8 Dec 2022 11:22:01 -0600
Subject: [PATCH] rpidistro-ffmpeg: upgrade 4.3.2 -> 4.3.4

Upgrades ffmpeg to version 4.3.4.
* Not upgrading to 4.3.5 because all of the ported raspberrypi team
  patches may not be included in that version/commit.
* SRCREV set to 246e1a55a0eca931537d8706acd8b133c07beb05

Updates to PACKAGECONFIG (see the sketch after the patch list below):
* Only include the --enable-opengl flag when opengl is set in
  DISTRO_FEATURES.
* Add the new flag --enable-epoxy, required by vout-egl.
* vout-egl requires both libepoxy and x11, so only enable vout-egl if
  x11 is contained in DISTRO_FEATURES.
* The remaining RPI-Distro related flags added through patches are only
  enabled if vc4graphics is disabled and userland graphics are enabled,
  in an attempt to keep the ffmpeg ./configure generic unless specified
  otherwise.

Removes TARGET_CFLAGS:append, as the include flags are set in
./configure via the 2001-configure-setup-for-OE-core-usage.patch patch.

Replaces the existing patches with the updated patches used in the
actual commit.

Adds four new patches to fix ./configure, compile, and runtime bugs.

PATCHES:
- 2001-configure-setup-for-OE-core-usage.patch
  * The ./configure stage fails if neither x11 nor wayland is defined in
    DISTRO_FEATURES. When opengl is enabled, ./configure checks for the
    relevant headers. The last header it checks for is ES2/gl.h, which
    doesn't exist; neither do the others if certain parameters are not
    met. The patch adds a check for GLES2/gl2.h, which does exist, and
    we utilize GLESv2 to compile and link with. The patch also replaces
    where the compiler finds the mmal and omx headers and libs.
- 2002-libavdevice-opengl_enc-update-dynamic-function-loader.patch
  * After the configure stage succeeds, the compile stage fails because
    SelectedGetProcAddress isn't defined; it can't be defined if x11
    isn't enabled. The patch defines SelectedGetProcAddress as
    SDL_GL_GetProcAddress when x11 is not enabled but sdl2 is. If
    neither sdl2 nor x11 is enabled, the patch resolves the GL function
    pointers at compile time instead of loading them dynamically at
    runtime.
- 2003-libavcodec-fix-v4l2_req_devscan.patch
  * The function declarations in v4l2_req_devscan.h differed from the
    function implementations in v4l2_req_devscan.c.
- 2004-libavcodec-omx-replace-opt-vc-path-with-usr-lib.patch
  * Fixes where libbcm_host.so and libopenmaxil.so are loaded from.
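As a rough sketch of the DISTRO_FEATURES-driven PACKAGECONFIG handling
described above (illustrative only; the entry names, flags, and
dependency lists here are assumptions, not the exact lines added to
rpidistro-ffmpeg_4.3.4.bb):

    # Illustrative only: select config entries from DISTRO_FEATURES.
    PACKAGECONFIG ??= " \
        ${@bb.utils.contains('DISTRO_FEATURES', 'opengl', 'opengl', '', d)} \
        ${@bb.utils.contains('DISTRO_FEATURES', 'x11', 'vout-egl', '', d)} \
    "
    # Each entry maps to its ./configure flags and build dependencies
    # (names below are assumptions for this sketch).
    PACKAGECONFIG[opengl]   = "--enable-opengl,--disable-opengl,virtual/libgl"
    PACKAGECONFIG[vout-egl] = "--enable-vout-egl --enable-epoxy,--disable-vout-egl,libepoxy virtual/libx11"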
Signed-off-by: Vincent Davis Jr --- ...nc-avoid-callee-preserved-vfp-regist.patch | 40 +- .../0002-Fix-build-on-powerpc-and-ppc64.patch | 7 +- ...c-remove-monowhite-from-apng-formats.patch | 8 +- ...0.patch => 0004-ffmpeg-4.3.4-rpi_14.patch} | 11120 +++++++++++++--- ...005-fix_flags.diff => 0005-fix-flags.diff} | 7 +- ...01-configure-setup-for-OE-core-usage.patch | 82 + ...l_enc-update-dynamic-function-loader.patch | 111 + ...2003-libavcodec-fix-v4l2_req_devscan.patch | 45 + ...omx-replace-opt-vc-path-with-usr-lib.patch | 35 + ...peg_4.3.2.bb => rpidistro-ffmpeg_4.3.4.bb} | 35 +- 10 files changed, 9774 insertions(+), 1716 deletions(-) rename recipes-multimedia/rpidistro-ffmpeg/files/{0004-ffmpeg-4.3.2-rpi_10.patch => 0004-ffmpeg-4.3.4-rpi_14.patch} (84%) rename recipes-multimedia/rpidistro-ffmpeg/files/{0005-fix_flags.diff => 0005-fix-flags.diff} (89%) create mode 100644 recipes-multimedia/rpidistro-ffmpeg/files/2001-configure-setup-for-OE-core-usage.patch create mode 100644 recipes-multimedia/rpidistro-ffmpeg/files/2002-libavdevice-opengl_enc-update-dynamic-function-loader.patch create mode 100644 recipes-multimedia/rpidistro-ffmpeg/files/2003-libavcodec-fix-v4l2_req_devscan.patch create mode 100644 recipes-multimedia/rpidistro-ffmpeg/files/2004-libavcodec-omx-replace-opt-vc-path-with-usr-lib.patch rename recipes-multimedia/rpidistro-ffmpeg/{rpidistro-ffmpeg_4.3.2.bb => rpidistro-ffmpeg_4.3.4.bb} (89%) diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/0001-avcodec-arm-sbcenc-avoid-callee-preserved-vfp-regist.patch b/recipes-multimedia/rpidistro-ffmpeg/files/0001-avcodec-arm-sbcenc-avoid-callee-preserved-vfp-regist.patch index e9c9eb7..d9c07dd 100644 --- a/recipes-multimedia/rpidistro-ffmpeg/files/0001-avcodec-arm-sbcenc-avoid-callee-preserved-vfp-regist.patch +++ b/recipes-multimedia/rpidistro-ffmpeg/files/0001-avcodec-arm-sbcenc-avoid-callee-preserved-vfp-regist.patch @@ -2,6 +2,11 @@ From: James Cowgill Date: Sun, 11 Aug 2019 16:50:56 +0100 Subject: avcodec/arm/sbcenc: avoid callee preserved vfp registers +Upstream-Status: Inappropriate + +RPI-Distro repo clones original ffmpeg and applies patches to enable +raspiberry pi support. + When compiling FFmpeg with GCC-9, some very random segfaults were observed in code which had previously called down into the SBC encoder NEON assembly routines. This was caused by these functions clobbering @@ -19,8 +24,6 @@ sets of registers consecutively numbered which hopefully makes the code more easy to follow. Since this commit only reallocates registers, it should have no performance impact. -Upstream-status: Pending - Signed-off-by: James Cowgill --- libavcodec/arm/sbcdsp_neon.S | 220 +++++++++++++++++++++---------------------- @@ -38,7 +41,7 @@ index d83d21d..914abfb 100644 - vld1.16 {d8, d9}, [r2, :128]! + vld1.16 {d16, d17}, [r0, :64]! + vld1.16 {d20, d21}, [r2, :128]! - + - vmull.s16 q0, d4, d8 - vld1.16 {d6, d7}, [r0, :64]! - vmull.s16 q1, d5, d9 @@ -47,7 +50,7 @@ index d83d21d..914abfb 100644 + vld1.16 {d18, d19}, [r0, :64]! + vmull.s16 q1, d17, d21 + vld1.16 {d22, d23}, [r2, :128]! - + - vmlal.s16 q0, d6, d10 - vld1.16 {d4, d5}, [r0, :64]! - vmlal.s16 q1, d7, d11 @@ -56,7 +59,7 @@ index d83d21d..914abfb 100644 + vld1.16 {d16, d17}, [r0, :64]! + vmlal.s16 q1, d19, d23 + vld1.16 {d20, d21}, [r2, :128]! - + - vmlal.s16 q0, d4, d8 - vld1.16 {d6, d7}, [r0, :64]! - vmlal.s16 q1, d5, d9 @@ -65,7 +68,7 @@ index d83d21d..914abfb 100644 + vld1.16 {d18, d19}, [r0, :64]! + vmlal.s16 q1, d17, d21 + vld1.16 {d22, d23}, [r2, :128]! 
- + - vmlal.s16 q0, d6, d10 - vld1.16 {d4, d5}, [r0, :64]! - vmlal.s16 q1, d7, d11 @@ -74,23 +77,23 @@ index d83d21d..914abfb 100644 + vld1.16 {d16, d17}, [r0, :64]! + vmlal.s16 q1, d19, d23 + vld1.16 {d20, d21}, [r2, :128]! - + - vmlal.s16 q0, d4, d8 - vmlal.s16 q1, d5, d9 + vmlal.s16 q0, d16, d20 + vmlal.s16 q1, d17, d21 - + vpadd.s32 d0, d0, d1 vpadd.s32 d1, d2, d3 - + vrshrn.s32 d0, q0, SBC_PROTO_FIXED_SCALE - + - vld1.16 {d2, d3, d4, d5}, [r2, :128]! + vld1.16 {d16, d17, d18, d19}, [r2, :128]! - + vdup.i32 d1, d0[1] /* TODO: can be eliminated */ vdup.i32 d0, d0[0] /* TODO: can be eliminated */ - + - vmull.s16 q3, d2, d0 - vmull.s16 q4, d3, d0 - vmlal.s16 q3, d4, d1 @@ -99,14 +102,14 @@ index d83d21d..914abfb 100644 + vmull.s16 q11, d17, d0 + vmlal.s16 q10, d18, d1 + vmlal.s16 q11, d19, d1 - + - vpadd.s32 d0, d6, d7 /* TODO: can be eliminated */ - vpadd.s32 d1, d8, d9 /* TODO: can be eliminated */ + vpadd.s32 d0, d20, d21 /* TODO: can be eliminated */ + vpadd.s32 d1, d22, d23 /* TODO: can be eliminated */ - + vst1.32 {d0, d1}, [r1, :128] - + @@ -91,57 +91,57 @@ function ff_sbc_analyze_8_neon, export=1 /* TODO: merge even and odd cases (or even merge all four calls to this * function) in order to have only aligned reads from 'in' array @@ -213,13 +216,13 @@ index d83d21d..914abfb 100644 + vpadd.s32 d1, d26, d27 + vpadd.s32 d2, d28, d29 + vpadd.s32 d3, d30, d31 - + vrshr.s32 q0, q0, SBC_PROTO_FIXED_SCALE vrshr.s32 q1, q1, SBC_PROTO_FIXED_SCALE @@ -153,38 +153,38 @@ function ff_sbc_analyze_8_neon, export=1 vdup.i32 d1, d0[1] /* TODO: can be eliminated */ vdup.i32 d0, d0[0] /* TODO: can be eliminated */ - + - vld1.16 {d4, d5}, [r2, :128]! - vmull.s16 q6, d4, d0 - vld1.16 {d6, d7}, [r2, :128]! @@ -284,5 +287,6 @@ index d83d21d..914abfb 100644 + vpadd.s32 d1, d26, d27 /* TODO: can be eliminated */ + vpadd.s32 d2, d28, d29 /* TODO: can be eliminated */ + vpadd.s32 d3, d30, d31 /* TODO: can be eliminated */ - + vst1.32 {d0, d1, d2, d3}, [r1, :128] + diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/0002-Fix-build-on-powerpc-and-ppc64.patch b/recipes-multimedia/rpidistro-ffmpeg/files/0002-Fix-build-on-powerpc-and-ppc64.patch index 4d9c1b9..f398791 100644 --- a/recipes-multimedia/rpidistro-ffmpeg/files/0002-Fix-build-on-powerpc-and-ppc64.patch +++ b/recipes-multimedia/rpidistro-ffmpeg/files/0002-Fix-build-on-powerpc-and-ppc64.patch @@ -2,7 +2,10 @@ From: John Paul Adrian Glaubitz Date: Tue, 19 Jan 2021 20:35:29 +0100 Subject: Fix build on powerpc and ppc64 -Upstream-status: Pending +Upstream-Status: Inappropriate + +RPI-Distro repo clones original ffmpeg and applies patches to enable +raspiberry pi support. 
--- libswscale/ppc/yuv2rgb_altivec.c | 10 ++++++++++ @@ -15,7 +18,7 @@ index 5365452..930ef6b 100644 @@ -283,6 +283,16 @@ static inline void cvtyuvtoRGB(SwsContext *c, vector signed short Y, * ------------------------------------------------------------------------------ */ - + +#if !HAVE_VSX +static inline vector unsigned char vec_xl(signed long long offset, const ubyte *addr) +{ diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/0003-avcodec-pngenc-remove-monowhite-from-apng-formats.patch b/recipes-multimedia/rpidistro-ffmpeg/files/0003-avcodec-pngenc-remove-monowhite-from-apng-formats.patch index 38f3fd4..11e3383 100644 --- a/recipes-multimedia/rpidistro-ffmpeg/files/0003-avcodec-pngenc-remove-monowhite-from-apng-formats.patch +++ b/recipes-multimedia/rpidistro-ffmpeg/files/0003-avcodec-pngenc-remove-monowhite-from-apng-formats.patch @@ -2,13 +2,15 @@ From: Paul B Mahol Date: Sun, 14 Feb 2021 17:20:03 +0100 Subject: avcodec/pngenc: remove monowhite from apng formats +Upstream-Status: Inappropriate + +RPI-Distro repo clones original ffmpeg and applies patches to enable +raspiberry pi support. + Monowhite pixel format is not supported, and it does not make sense to add support for it. Fixes #7989 - -Upstream-status: Pending - --- libavcodec/pngenc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/0004-ffmpeg-4.3.2-rpi_10.patch b/recipes-multimedia/rpidistro-ffmpeg/files/0004-ffmpeg-4.3.4-rpi_14.patch similarity index 84% rename from recipes-multimedia/rpidistro-ffmpeg/files/0004-ffmpeg-4.3.2-rpi_10.patch rename to recipes-multimedia/rpidistro-ffmpeg/files/0004-ffmpeg-4.3.4-rpi_14.patch index 6bab0d0..740ac0e 100644 --- a/recipes-multimedia/rpidistro-ffmpeg/files/0004-ffmpeg-4.3.2-rpi_10.patch +++ b/recipes-multimedia/rpidistro-ffmpeg/files/0004-ffmpeg-4.3.4-rpi_14.patch @@ -1,16 +1,27 @@ -Upstream-status: Pending +Upstream-Status: Inappropriate + +RPI-Distro repo clones original ffmpeg and applies patches to enable +raspiberry pi support. 
--- a/configure +++ b/configure -@@ -274,6 +274,7 @@ External library support: +@@ -207,6 +207,7 @@ External library support: + --disable-bzlib disable bzlib [autodetect] + --disable-coreimage disable Apple CoreImage framework [autodetect] + --enable-chromaprint enable audio fingerprinting with chromaprint [no] ++ --disable-epoxy disable epoxy [autodetect] + --enable-frei0r enable frei0r video filtering [no] + --enable-gcrypt enable gcrypt, needed for rtmp(t)e support + if openssl, librtmp or gmp is not used [no] +@@ -274,6 +275,7 @@ External library support: --enable-libtls enable LibreSSL (via libtls), needed for https support if openssl, gnutls or mbedtls is not used [no] --enable-libtwolame enable MP2 encoding via libtwolame [no] -+ --enable-libudev enable libudev [no] ++ --disable-libudev disable libudev [autodetect] --enable-libv4l2 enable libv4l2/v4l-utils [no] --enable-libvidstab enable video stabilization using vid.stab [no] --enable-libvmaf enable vmaf filter via libvmaf [no] -@@ -336,12 +337,17 @@ External library support: +@@ -336,12 +338,17 @@ External library support: --enable-libmfx enable Intel MediaSDK (AKA Quick Sync Video) code via libmfx [no] --enable-libnpp enable Nvidia Performance Primitives-based code [no] --enable-mmal enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no] @@ -28,23 +39,17 @@ Upstream-status: Pending --disable-vaapi disable Video Acceleration API (mainly Unix/Intel) code [autodetect] --disable-vdpau disable Nvidia Video Decode and Presentation API for Unix code [autodetect] --disable-videotoolbox disable VideoToolbox code [autodetect] -@@ -1771,6 +1777,7 @@ EXTERNAL_LIBRARY_LIST=" - libdav1d - libdc1394 - libdrm +@@ -1699,7 +1706,9 @@ EXTERNAL_AUTODETECT_LIBRARY_LIST=" + avfoundation + bzlib + coreimage + epoxy - libflite - libfontconfig - libfreetype -@@ -1807,6 +1814,7 @@ EXTERNAL_LIBRARY_LIST=" - libtesseract - libtheora - libtwolame + iconv + libudev - libv4l2 - libvorbis - libvpx -@@ -1861,7 +1869,10 @@ HWACCEL_LIBRARY_LIST=" + libxcb + libxcb_shm + libxcb_shape +@@ -1861,7 +1870,10 @@ HWACCEL_LIBRARY_LIST=" mmal omx opencl @@ -53,9 +58,9 @@ Upstream-status: Pending + rpi4_8 + rpi4_10 " - + DOCUMENT_LIST=" -@@ -1877,12 +1888,16 @@ FEATURE_LIST=" +@@ -1877,12 +1889,16 @@ FEATURE_LIST=" gray hardcoded_tables omx_rpi @@ -70,17 +75,17 @@ Upstream-status: Pending + vout_drm + vout_egl " - + # this list should be kept in linking order -@@ -1923,6 +1938,7 @@ SUBSYSTEM_LIST=" +@@ -1923,6 +1939,7 @@ SUBSYSTEM_LIST=" pixelutils network rdft + rpi " - + # COMPONENT_LIST needs to come last to ensure correct dependency checking -@@ -2405,9 +2421,11 @@ CONFIG_EXTRA=" +@@ -2405,9 +2422,11 @@ CONFIG_EXTRA=" rangecoder riffdec riffenc @@ -92,7 +97,7 @@ Upstream-status: Pending scene_sad sinewin snappy -@@ -2737,6 +2755,8 @@ hap_decoder_select="snappy texturedsp" +@@ -2737,6 +2756,8 @@ hap_decoder_select="snappy texturedsp" hap_encoder_deps="libsnappy" hap_encoder_select="texturedspenc" hevc_decoder_select="bswapdsp cabac golomb hevcparse videodsp" @@ -101,7 +106,7 @@ Upstream-status: Pending huffyuv_decoder_select="bswapdsp huffyuvdsp llviddsp" huffyuv_encoder_select="bswapdsp huffman huffyuvencdsp llvidencdsp" hymt_decoder_select="huffyuv_decoder" -@@ -2903,6 +2923,7 @@ d3d11va_deps="dxva_h ID3D11VideoDecoder +@@ -2903,6 +2924,7 @@ d3d11va_deps="dxva_h ID3D11VideoDecoder dxva2_deps="dxva2api_h DXVA2_ConfigPictureDecode ole32 user32" ffnvcodec_deps_any="libdl LoadLibrary" nvdec_deps="ffnvcodec" @@ -109,7 +114,7 @@ Upstream-status: Pending 
vaapi_x11_deps="xlib" videotoolbox_hwaccel_deps="videotoolbox pthreads" videotoolbox_hwaccel_extralibs="-framework QuartzCore" -@@ -2934,6 +2955,12 @@ hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicP +@@ -2934,6 +2956,12 @@ hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicP hevc_dxva2_hwaccel_select="hevc_decoder" hevc_nvdec_hwaccel_deps="nvdec" hevc_nvdec_hwaccel_select="hevc_decoder" @@ -122,16 +127,15 @@ Upstream-status: Pending hevc_vaapi_hwaccel_deps="vaapi VAPictureParameterBufferHEVC" hevc_vaapi_hwaccel_select="hevc_decoder" hevc_vdpau_hwaccel_deps="vdpau VdpPictureInfoHEVC" -@@ -3401,8 +3428,14 @@ sndio_indev_deps="sndio" +@@ -3401,8 +3429,13 @@ sndio_indev_deps="sndio" sndio_outdev_deps="sndio" v4l2_indev_deps_any="linux_videodev2_h sys_videoio_h" v4l2_indev_suggest="libv4l2" +v4l2_outdev_deps="libdrm" v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h" v4l2_outdev_suggest="libv4l2" -+vout_drm_outdev_deps="libdrm vout_drm" -+vout_egl_outdev_deps="xlib" -+vout_egl_outdev_select="epoxy" ++vout_drm_outdev_deps="libdrm" ++vout_egl_outdev_deps="xlib epoxy" +vout_rpi_outdev_deps="rpi" +vout_rpi_outdev_select="sand" vfwcap_indev_deps="vfw32 vfwcap_defines" @@ -145,23 +149,20 @@ Upstream-status: Pending unsharp_opencl_filter_deps="opencl" uspp_filter_deps="gpl avcodec" vaguedenoiser_filter_deps="gpl" -@@ -6299,6 +6333,7 @@ enabled libdav1d && require_pkg - enabled libdavs2 && require_pkg_config libdavs2 "davs2 >= 1.6.0" davs2.h davs2_decoder_open - enabled libdc1394 && require_pkg_config libdc1394 libdc1394-2 dc1394/dc1394.h dc1394_new - enabled libdrm && require_pkg_config libdrm libdrm xf86drm.h drmGetVersion -+enabled epoxy && require_pkg_config epoxy epoxy epoxy/egl.h epoxy_egl_version - enabled libfdk_aac && { check_pkg_config libfdk_aac fdk-aac "fdk-aac/aacenc_lib.h" aacEncOpen || - { require libfdk_aac fdk-aac/aacenc_lib.h aacEncOpen -lfdk-aac && - warn "using libfdk without pkg-config"; } } -@@ -6376,6 +6411,7 @@ enabled libtls && require_pkg - enabled libtwolame && require libtwolame twolame.h twolame_init -ltwolame && - { check_lib libtwolame twolame.h twolame_encode_buffer_float32_interleaved -ltwolame || - die "ERROR: libtwolame must be installed and version must be >= 0.3.10"; } -+enabled libudev && require_pkg_config libudev libudev libudev.h udev_new - enabled libv4l2 && require_pkg_config libv4l2 libv4l2 libv4l2.h v4l2_ioctl - enabled libvidstab && require_pkg_config libvidstab "vidstab >= 0.98" vid.stab/libvidstab.h vsMotionDetectInit - enabled libvmaf && require_pkg_config libvmaf "libvmaf >= 1.3.9" libvmaf.h compute_vmaf -@@ -6430,11 +6466,12 @@ enabled mbedtls && { check_pkg +@@ -6102,6 +6136,12 @@ check_func_headers glob.h glob + enabled xlib && + check_lib xlib "X11/Xlib.h X11/extensions/Xvlib.h" XvGetPortAttribute -lXv -lX11 -lXext + ++enabled libudev && ++ check_pkg_config libudev libudev libudev.h udev_new ++ ++enabled epoxy && ++ check_pkg_config epoxy epoxy epoxy/egl.h epoxy_egl_version ++ + check_headers direct.h + check_headers dirent.h + check_headers dxgidebug.h +@@ -6430,11 +6470,12 @@ enabled mbedtls && { check_pkg check_lib mbedtls mbedtls/ssl.h mbedtls_ssl_init -lmbedtls -lmbedx509 -lmbedcrypto || die "ERROR: mbedTLS not found"; } enabled mediacodec && { enabled jni || die "ERROR: mediacodec requires --enable-jni"; } @@ -176,26 +177,32 @@ Upstream-status: Pending die "ERROR: mmal not found" && check_func_headers interface/mmal/mmal.h "MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS"; } enabled openal && { { for al_extralibs in "${OPENAL_LIBS}" "-lopenal" "-lOpenAL32"; do -@@ 
-6475,6 +6512,10 @@ enabled rkmpp && { require_p +@@ -6475,8 +6516,16 @@ enabled rkmpp && { require_p { enabled libdrm || die "ERROR: rkmpp requires --enable-libdrm"; } } +enabled v4l2_request && { enabled libdrm || + die "ERROR: v4l2-request requires --enable-libdrm"; } && + { enabled libudev || -+ die "ERROR: v4l2-request requires --enable-libudev"; } ++ die "ERROR: v4l2-request requires libudev"; } enabled vapoursynth && require_pkg_config vapoursynth "vapoursynth-script >= 42" VSScript.h vsscript_init - - -@@ -6556,6 +6597,8 @@ if enabled v4l2_m2m; then + ++enabled vout_drm && { enabled libdrm || die "ERROR: vout_drm requires --enable-libdrm"; } ++ ++enabled vout_egl && { enabled epoxy || die "ERROR: vout_egl requires epoxy"; } && ++ { enabled xlib || die "ERROR: vout_egl requires xlib"; } + + if enabled gcrypt; then + GCRYPT_CONFIG="${cross_prefix}libgcrypt-config" +@@ -6556,6 +6605,8 @@ if enabled v4l2_m2m; then check_cc vp9_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VP9;" fi - + +check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns +check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;" check_headers sys/videoio.h test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete - + --- a/fftools/ffmpeg.c +++ b/fftools/ffmpeg.c @@ -2119,8 +2119,8 @@ static int ifilter_send_frame(InputFilte @@ -208,11 +215,11 @@ Upstream-status: Pending + ifilter->height != av_frame_cropped_height(frame); break; } - + @@ -2131,6 +2131,9 @@ static int ifilter_send_frame(InputFilte (ifilter->hw_frames_ctx && ifilter->hw_frames_ctx->data != frame->hw_frames_ctx->data)) need_reinit = 1; - + + if (no_cvt_hw && fg->graph) + need_reinit = 0; + @@ -221,7 +228,7 @@ Upstream-status: Pending if (ret < 0) @@ -2401,8 +2404,7 @@ static int decode_video(InputStream *ist decoded_frame->top_field_first = ist->top_field_first; - + ist->frames_decoded++; - - if (ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) { @@ -229,7 +236,21 @@ Upstream-status: Pending err = ist->hwaccel_retrieve_data(ist->dec_ctx, decoded_frame); if (err < 0) goto fail; -@@ -2820,6 +2822,16 @@ static enum AVPixelFormat get_format(AVC +@@ -2600,7 +2602,12 @@ static int process_input_packet(InputStr + case AVMEDIA_TYPE_VIDEO: + ret = decode_video (ist, repeating ? NULL : &avpkt, &got_output, &duration_pts, !pkt, + &decode_failed); +- if (!repeating || !pkt || got_output) { ++ // Pi: Do not inc dts if no_cvt_hw set ++ // V4L2 H264 decode has long latency and sometimes spits out a long ++ // stream of output without input. In this case incrementing DTS is wrong. 
++ // There may be cases where the condition as written is correct so only ++ // "fix" in the cases which cause problems ++ if (!repeating || !pkt || (got_output && !no_cvt_hw)) { + if (pkt && pkt->duration) { + duration_dts = av_rescale_q(pkt->duration, ist->st->time_base, AV_TIME_BASE_Q); + } else if(ist->dec_ctx->framerate.num != 0 && ist->dec_ctx->framerate.den != 0) { +@@ -2820,6 +2827,16 @@ static enum AVPixelFormat get_format(AVC } else { const HWAccel *hwaccel = NULL; int i; @@ -246,10 +267,10 @@ Upstream-status: Pending for (i = 0; hwaccels[i].name; i++) { if (hwaccels[i].pix_fmt == *p) { hwaccel = &hwaccels[i]; -@@ -2914,6 +2926,15 @@ static int init_input_stream(int ist_ind +@@ -2914,6 +2931,15 @@ static int init_input_stream(int ist_ind return ret; } - + +#if CONFIG_HEVC_RPI_DECODER + ret = -1; + if (strcmp(codec->name, "hevc_rpi") == 0 && @@ -270,7 +291,7 @@ Upstream-status: Pending HWACCEL_QSV, + HWACCEL_RPI, }; - + typedef struct HWAccel { @@ -590,6 +591,7 @@ extern int video_sync_method; extern float frame_drop_threshold; @@ -283,15 +304,15 @@ Upstream-status: Pending --- a/fftools/ffmpeg_filter.c +++ b/fftools/ffmpeg_filter.c @@ -1186,8 +1186,8 @@ int ifilter_parameters_from_frame(InputF - + ifilter->format = frame->format; - + - ifilter->width = frame->width; - ifilter->height = frame->height; + ifilter->width = av_frame_cropped_width(frame); + ifilter->height = av_frame_cropped_height(frame); ifilter->sample_aspect_ratio = frame->sample_aspect_ratio; - + ifilter->sample_rate = frame->sample_rate; --- a/fftools/ffmpeg_hw.c +++ b/fftools/ffmpeg_hw.c @@ -309,7 +330,7 @@ Upstream-status: Pending @@ -130,6 +130,12 @@ static const char *opt_name_enc_time_bas }\ } - + +#if CONFIG_RPI +static int rpi_init(AVCodecContext *avctx) { + return 0; @@ -376,7 +397,7 @@ Upstream-status: Pending + v4l2_req_devscan.o weak_link.o OBJS-$(CONFIG_WMA_FREQS) += wma_freqs.o OBJS-$(CONFIG_WMV2DSP) += wmv2dsp.o - + @@ -391,6 +396,14 @@ OBJS-$(CONFIG_HEVC_QSV_DECODER) + OBJS-$(CONFIG_HEVC_QSV_ENCODER) += qsvenc_hevc.o hevc_ps_enc.o \ hevc_data.o @@ -399,7 +420,7 @@ Upstream-status: Pending +OBJS-$(CONFIG_HEVC_RPI4_8_HWACCEL) += rpivid_hevc.o +OBJS-$(CONFIG_HEVC_RPI4_10_HWACCEL) += rpivid_hevc.o +OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o v4l2_req_decode_q.o\ -+ v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o ++ v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o v4l2_req_hevc_v4.o OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o h265_profile_level.o OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o @@ -435,6 +456,1866 @@ Upstream-status: Pending +$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_transform8.h $(SUBDIR)rpi_hevc_transform10.h +$(SUBDIR)rpi_hevcdec.o $(SUBDIR)rpi_shader_template.o $(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_shader.h +endif +--- a/libavcodec/aarch64/Makefile ++++ b/libavcodec/aarch64/Makefile +@@ -44,10 +44,12 @@ NEON-OBJS-$(CONFIG_H264PRED) + NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \ + aarch64/hpeldsp_neon.o + NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o +-NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/simple_idct_neon.o ++NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_neon.o \ ++ aarch64/simple_idct_neon.o + NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o + NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o + NEON-OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_neon.o ++NEON-OBJS-$(CONFIG_VC1DSP) += aarch64/vc1dsp_neon.o + NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_neon.o + + # 
decoders/encoders +--- a/libavcodec/aarch64/idctdsp_init_aarch64.c ++++ b/libavcodec/aarch64/idctdsp_init_aarch64.c +@@ -27,19 +27,29 @@ + #include "libavcodec/idctdsp.h" + #include "idct.h" + ++void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); ++void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); ++void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); ++ + av_cold void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) + { + int cpu_flags = av_get_cpu_flags(); + +- if (have_neon(cpu_flags) && !avctx->lowres && !high_bit_depth) { +- if (avctx->idct_algo == FF_IDCT_AUTO || +- avctx->idct_algo == FF_IDCT_SIMPLEAUTO || +- avctx->idct_algo == FF_IDCT_SIMPLENEON) { +- c->idct_put = ff_simple_idct_put_neon; +- c->idct_add = ff_simple_idct_add_neon; +- c->idct = ff_simple_idct_neon; +- c->perm_type = FF_IDCT_PERM_PARTTRANS; ++ if (have_neon(cpu_flags)) { ++ if (!avctx->lowres && !high_bit_depth) { ++ if (avctx->idct_algo == FF_IDCT_AUTO || ++ avctx->idct_algo == FF_IDCT_SIMPLEAUTO || ++ avctx->idct_algo == FF_IDCT_SIMPLENEON) { ++ c->idct_put = ff_simple_idct_put_neon; ++ c->idct_add = ff_simple_idct_add_neon; ++ c->idct = ff_simple_idct_neon; ++ c->perm_type = FF_IDCT_PERM_PARTTRANS; ++ } + } ++ ++ c->add_pixels_clamped = ff_add_pixels_clamped_neon; ++ c->put_pixels_clamped = ff_put_pixels_clamped_neon; ++ c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon; + } + } +--- /dev/null ++++ b/libavcodec/aarch64/idctdsp_neon.S +@@ -0,0 +1,130 @@ ++/* ++ * IDCT AArch64 NEON optimisations ++ * ++ * Copyright (c) 2022 Ben Avison ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/aarch64/asm.S" ++ ++// Clamp 16-bit signed block coefficients to unsigned 8-bit ++// On entry: ++// x0 -> array of 64x 16-bit coefficients ++// x1 -> 8-bit results ++// x2 = row stride for results, bytes ++function ff_put_pixels_clamped_neon, export=1 ++ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 ++ ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0] ++ sqxtun v0.8b, v0.8h ++ sqxtun v1.8b, v1.8h ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ sqxtun v4.8b, v4.8h ++ st1 {v0.8b}, [x1], x2 ++ sqxtun v0.8b, v5.8h ++ st1 {v1.8b}, [x1], x2 ++ sqxtun v1.8b, v6.8h ++ st1 {v2.8b}, [x1], x2 ++ sqxtun v2.8b, v7.8h ++ st1 {v3.8b}, [x1], x2 ++ st1 {v4.8b}, [x1], x2 ++ st1 {v0.8b}, [x1], x2 ++ st1 {v1.8b}, [x1], x2 ++ st1 {v2.8b}, [x1] ++ ret ++endfunc ++ ++// Clamp 16-bit signed block coefficients to signed 8-bit (biased by 128) ++// On entry: ++// x0 -> array of 64x 16-bit coefficients ++// x1 -> 8-bit results ++// x2 = row stride for results, bytes ++function ff_put_signed_pixels_clamped_neon, export=1 ++ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 ++ movi v4.8b, #128 ++ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0] ++ sqxtn v0.8b, v0.8h ++ sqxtn v1.8b, v1.8h ++ sqxtn v2.8b, v2.8h ++ sqxtn v3.8b, v3.8h ++ sqxtn v5.8b, v16.8h ++ add v0.8b, v0.8b, v4.8b ++ sqxtn v6.8b, v17.8h ++ add v1.8b, v1.8b, v4.8b ++ sqxtn v7.8b, v18.8h ++ add v2.8b, v2.8b, v4.8b ++ sqxtn v16.8b, v19.8h ++ add v3.8b, v3.8b, v4.8b ++ st1 {v0.8b}, [x1], x2 ++ add v0.8b, v5.8b, v4.8b ++ st1 {v1.8b}, [x1], x2 ++ add v1.8b, v6.8b, v4.8b ++ st1 {v2.8b}, [x1], x2 ++ add v2.8b, v7.8b, v4.8b ++ st1 {v3.8b}, [x1], x2 ++ add v3.8b, v16.8b, v4.8b ++ st1 {v0.8b}, [x1], x2 ++ st1 {v1.8b}, [x1], x2 ++ st1 {v2.8b}, [x1], x2 ++ st1 {v3.8b}, [x1] ++ ret ++endfunc ++ ++// Add 16-bit signed block coefficients to unsigned 8-bit ++// On entry: ++// x0 -> array of 64x 16-bit coefficients ++// x1 -> 8-bit input and results ++// x2 = row stride for 8-bit input and results, bytes ++function ff_add_pixels_clamped_neon, export=1 ++ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 ++ mov x3, x1 ++ ld1 {v4.8b}, [x1], x2 ++ ld1 {v5.8b}, [x1], x2 ++ ld1 {v6.8b}, [x1], x2 ++ ld1 {v7.8b}, [x1], x2 ++ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0] ++ uaddw v0.8h, v0.8h, v4.8b ++ uaddw v1.8h, v1.8h, v5.8b ++ uaddw v2.8h, v2.8h, v6.8b ++ ld1 {v4.8b}, [x1], x2 ++ uaddw v3.8h, v3.8h, v7.8b ++ ld1 {v5.8b}, [x1], x2 ++ sqxtun v0.8b, v0.8h ++ ld1 {v6.8b}, [x1], x2 ++ sqxtun v1.8b, v1.8h ++ ld1 {v7.8b}, [x1] ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ uaddw v4.8h, v16.8h, v4.8b ++ st1 {v0.8b}, [x3], x2 ++ uaddw v0.8h, v17.8h, v5.8b ++ st1 {v1.8b}, [x3], x2 ++ uaddw v1.8h, v18.8h, v6.8b ++ st1 {v2.8b}, [x3], x2 ++ uaddw v2.8h, v19.8h, v7.8b ++ sqxtun v4.8b, v4.8h ++ sqxtun v0.8b, v0.8h ++ st1 {v3.8b}, [x3], x2 ++ sqxtun v1.8b, v1.8h ++ sqxtun v2.8b, v2.8h ++ st1 {v4.8b}, [x3], x2 ++ st1 {v0.8b}, [x3], x2 ++ st1 {v1.8b}, [x3], x2 ++ st1 {v2.8b}, [x3] ++ ret ++endfunc +--- a/libavcodec/aarch64/vc1dsp_init_aarch64.c ++++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c +@@ -21,10 +21,28 @@ + #include "libavutil/attributes.h" + #include "libavutil/cpu.h" + #include "libavutil/aarch64/cpu.h" ++#include "libavutil/intreadwrite.h" + #include "libavcodec/vc1dsp.h" + + #include "config.h" + ++void ff_vc1_inv_trans_8x8_neon(int16_t 
*block); ++void ff_vc1_inv_trans_8x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); ++void ff_vc1_inv_trans_4x8_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); ++void ff_vc1_inv_trans_4x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); ++ ++void ff_vc1_inv_trans_8x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); ++void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); ++void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); ++void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); ++ ++void ff_vc1_v_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq); ++void ff_vc1_h_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq); ++void ff_vc1_v_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq); ++void ff_vc1_h_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq); ++void ff_vc1_v_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq); ++void ff_vc1_h_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq); ++ + void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, + int h, int x, int y); + void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, +@@ -34,14 +52,90 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t + void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, + int h, int x, int y); + ++int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst); ++ ++static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst) ++{ ++ /* Dealing with starting and stopping, and removing escape bytes, are ++ * comparatively less time-sensitive, so are more clearly expressed using ++ * a C wrapper around the assembly inner loop. Note that we assume a ++ * little-endian machine that supports unaligned loads. 
*/ ++ int dsize = 0; ++ while (size >= 4) ++ { ++ int found = 0; ++ while (!found && (((uintptr_t) dst) & 7) && size >= 4) ++ { ++ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; ++ if (!found) ++ { ++ *dst++ = *src++; ++ --size; ++ ++dsize; ++ } ++ } ++ if (!found) ++ { ++ int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst); ++ dst += skip; ++ src += skip; ++ size -= skip; ++ dsize += skip; ++ while (!found && size >= 4) ++ { ++ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; ++ if (!found) ++ { ++ *dst++ = *src++; ++ --size; ++ ++dsize; ++ } ++ } ++ } ++ if (found) ++ { ++ *dst++ = *src++; ++ *dst++ = *src++; ++ ++src; ++ size -= 3; ++ dsize += 2; ++ } ++ } ++ while (size > 0) ++ { ++ *dst++ = *src++; ++ --size; ++ ++dsize; ++ } ++ return dsize; ++} ++ + av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp) + { + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { ++ dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_neon; ++ dsp->vc1_inv_trans_8x4 = ff_vc1_inv_trans_8x4_neon; ++ dsp->vc1_inv_trans_4x8 = ff_vc1_inv_trans_4x8_neon; ++ dsp->vc1_inv_trans_4x4 = ff_vc1_inv_trans_4x4_neon; ++ dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_neon; ++ dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon; ++ dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_neon; ++ dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon; ++ ++ dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_neon; ++ dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_neon; ++ dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_neon; ++ dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_neon; ++ dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon; ++ dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon; ++ + dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon; + dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon; + dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon; + dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon; ++ ++ dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon; + } + } +--- /dev/null ++++ b/libavcodec/aarch64/vc1dsp_neon.S +@@ -0,0 +1,1546 @@ ++/* ++ * VC1 AArch64 NEON optimisations ++ * ++ * Copyright (c) 2022 Ben Avison ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/aarch64/asm.S" ++ ++// VC-1 8x8 inverse transform ++// On entry: ++// x0 -> array of 16-bit inverse transform coefficients, in column-major order ++// On exit: ++// array at x0 updated to hold transformed block; also now held in row-major order ++function ff_vc1_inv_trans_8x8_neon, export=1 ++ ld1 {v1.16b, v2.16b}, [x0], #32 ++ ld1 {v3.16b, v4.16b}, [x0], #32 ++ ld1 {v5.16b, v6.16b}, [x0], #32 ++ shl v1.8h, v1.8h, #2 // 8/2 * src[0] ++ sub x1, x0, #3*32 ++ ld1 {v16.16b, v17.16b}, [x0] ++ shl v7.8h, v2.8h, #4 // 16 * src[8] ++ shl v18.8h, v2.8h, #2 // 4 * src[8] ++ shl v19.8h, v4.8h, #4 // 16 * src[24] ++ ldr d0, .Lcoeffs_it8 ++ shl v5.8h, v5.8h, #2 // 8/2 * src[32] ++ shl v20.8h, v6.8h, #4 // 16 * src[40] ++ shl v21.8h, v6.8h, #2 // 4 * src[40] ++ shl v22.8h, v17.8h, #4 // 16 * src[56] ++ ssra v20.8h, v19.8h, #2 // 4 * src[24] + 16 * src[40] ++ mul v23.8h, v3.8h, v0.h[0] // 6/2 * src[16] ++ sub v19.8h, v19.8h, v21.8h // 16 * src[24] - 4 * src[40] ++ ssra v7.8h, v22.8h, #2 // 16 * src[8] + 4 * src[56] ++ sub v18.8h, v22.8h, v18.8h // - 4 * src[8] + 16 * src[56] ++ shl v3.8h, v3.8h, #3 // 16/2 * src[16] ++ mls v20.8h, v2.8h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40] ++ ssra v1.8h, v1.8h, #1 // 12/2 * src[0] ++ ssra v5.8h, v5.8h, #1 // 12/2 * src[32] ++ mla v7.8h, v4.8h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56] ++ shl v21.8h, v16.8h, #3 // 16/2 * src[48] ++ mls v19.8h, v2.8h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40] ++ sub v2.8h, v23.8h, v21.8h // t4/2 = 6/2 * src[16] - 16/2 * src[48] ++ mla v18.8h, v4.8h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56] ++ add v4.8h, v1.8h, v5.8h // t1/2 = 12/2 * src[0] + 12/2 * src[32] ++ sub v1.8h, v1.8h, v5.8h // t2/2 = 12/2 * src[0] - 12/2 * src[32] ++ mla v3.8h, v16.8h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48] ++ mla v7.8h, v6.8h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56] ++ add v5.8h, v1.8h, v2.8h // t6/2 = t2/2 + t4/2 ++ sub v16.8h, v1.8h, v2.8h // t7/2 = t2/2 - t4/2 ++ mla v20.8h, v17.8h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56] ++ add v21.8h, v1.8h, v2.8h // t6/2 = t2/2 + t4/2 ++ add v22.8h, v4.8h, v3.8h // t5/2 = t1/2 + t3/2 ++ mls v19.8h, v17.8h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56] ++ sub v17.8h, v4.8h, v3.8h // t8/2 = t1/2 - t3/2 ++ add v23.8h, v4.8h, v3.8h // t5/2 = t1/2 + t3/2 ++ mls v18.8h, v6.8h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56] ++ sub v1.8h, v1.8h, v2.8h // t7/2 = t2/2 - t4/2 ++ sub v2.8h, v4.8h, v3.8h // t8/2 = t1/2 - t3/2 ++ neg v3.8h, v7.8h // -t1 ++ neg v4.8h, v20.8h // +t2 ++ neg v6.8h, v19.8h // +t3 ++ ssra v22.8h, v7.8h, #1 // (t5 + t1) >> 1 ++ ssra v1.8h, v19.8h, #1 // (t7 - t3) >> 1 ++ neg v7.8h, v18.8h // +t4 ++ ssra v5.8h, v4.8h, #1 // (t6 + t2) >> 1 ++ ssra v16.8h, v6.8h, #1 // (t7 + t3) >> 1 ++ ssra v2.8h, v18.8h, #1 // (t8 - t4) >> 1 ++ ssra v17.8h, v7.8h, #1 // (t8 + t4) >> 1 ++ ssra v21.8h, v20.8h, #1 // (t6 - t2) >> 1 ++ ssra v23.8h, v3.8h, #1 // (t5 - t1) >> 1 ++ srshr v3.8h, v22.8h, #2 // (t5 + t1 + 4) >> 3 ++ srshr v4.8h, v5.8h, #2 // (t6 + t2 + 4) >> 3 ++ srshr v5.8h, v16.8h, #2 // (t7 + t3 + 4) >> 3 ++ srshr v6.8h, v17.8h, #2 // (t8 + t4 + 4) >> 3 ++ srshr v2.8h, v2.8h, #2 // (t8 - t4 
+ 4) >> 3 ++ srshr v1.8h, v1.8h, #2 // (t7 - t3 + 4) >> 3 ++ srshr v7.8h, v21.8h, #2 // (t6 - t2 + 4) >> 3 ++ srshr v16.8h, v23.8h, #2 // (t5 - t1 + 4) >> 3 ++ trn2 v17.8h, v3.8h, v4.8h ++ trn2 v18.8h, v5.8h, v6.8h ++ trn2 v19.8h, v2.8h, v1.8h ++ trn2 v20.8h, v7.8h, v16.8h ++ trn1 v21.4s, v17.4s, v18.4s ++ trn2 v17.4s, v17.4s, v18.4s ++ trn1 v18.4s, v19.4s, v20.4s ++ trn2 v19.4s, v19.4s, v20.4s ++ trn1 v3.8h, v3.8h, v4.8h ++ trn2 v4.2d, v21.2d, v18.2d ++ trn1 v20.2d, v17.2d, v19.2d ++ trn1 v5.8h, v5.8h, v6.8h ++ trn1 v1.8h, v2.8h, v1.8h ++ trn1 v2.8h, v7.8h, v16.8h ++ trn1 v6.2d, v21.2d, v18.2d ++ trn2 v7.2d, v17.2d, v19.2d ++ shl v16.8h, v20.8h, #4 // 16 * src[24] ++ shl v17.8h, v4.8h, #4 // 16 * src[40] ++ trn1 v18.4s, v3.4s, v5.4s ++ trn1 v19.4s, v1.4s, v2.4s ++ shl v21.8h, v7.8h, #4 // 16 * src[56] ++ shl v22.8h, v6.8h, #2 // 4 * src[8] ++ shl v23.8h, v4.8h, #2 // 4 * src[40] ++ trn2 v3.4s, v3.4s, v5.4s ++ trn2 v1.4s, v1.4s, v2.4s ++ shl v2.8h, v6.8h, #4 // 16 * src[8] ++ sub v5.8h, v16.8h, v23.8h // 16 * src[24] - 4 * src[40] ++ ssra v17.8h, v16.8h, #2 // 4 * src[24] + 16 * src[40] ++ sub v16.8h, v21.8h, v22.8h // - 4 * src[8] + 16 * src[56] ++ trn1 v22.2d, v18.2d, v19.2d ++ trn2 v18.2d, v18.2d, v19.2d ++ trn1 v19.2d, v3.2d, v1.2d ++ ssra v2.8h, v21.8h, #2 // 16 * src[8] + 4 * src[56] ++ mls v17.8h, v6.8h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40] ++ shl v21.8h, v22.8h, #2 // 8/2 * src[0] ++ shl v18.8h, v18.8h, #2 // 8/2 * src[32] ++ mls v5.8h, v6.8h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40] ++ shl v6.8h, v19.8h, #3 // 16/2 * src[16] ++ trn2 v1.2d, v3.2d, v1.2d ++ mla v16.8h, v20.8h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56] ++ ssra v21.8h, v21.8h, #1 // 12/2 * src[0] ++ ssra v18.8h, v18.8h, #1 // 12/2 * src[32] ++ mul v3.8h, v19.8h, v0.h[0] // 6/2 * src[16] ++ shl v19.8h, v1.8h, #3 // 16/2 * src[48] ++ mla v2.8h, v20.8h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56] ++ add v20.8h, v21.8h, v18.8h // t1/2 = 12/2 * src[0] + 12/2 * src[32] ++ mla v6.8h, v1.8h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48] ++ sub v1.8h, v21.8h, v18.8h // t2/2 = 12/2 * src[0] - 12/2 * src[32] ++ sub v3.8h, v3.8h, v19.8h // t4/2 = 6/2 * src[16] - 16/2 * src[48] ++ mla v17.8h, v7.8h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56] ++ mls v5.8h, v7.8h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56] ++ add v7.8h, v1.8h, v3.8h // t6/2 = t2/2 + t4/2 ++ add v18.8h, v20.8h, v6.8h // t5/2 = t1/2 + t3/2 ++ mls v16.8h, v4.8h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56] ++ sub v19.8h, v1.8h, v3.8h // t7/2 = t2/2 - t4/2 ++ neg v21.8h, v17.8h // +t2 ++ mla v2.8h, v4.8h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56] ++ sub v0.8h, v20.8h, v6.8h // t8/2 = t1/2 - t3/2 ++ neg v4.8h, v5.8h // +t3 ++ sub v22.8h, v1.8h, v3.8h // t7/2 = t2/2 - t4/2 ++ sub v23.8h, v20.8h, v6.8h // t8/2 = t1/2 - t3/2 ++ neg v24.8h, v16.8h // +t4 ++ add v6.8h, v20.8h, v6.8h // t5/2 = t1/2 + t3/2 ++ add v1.8h, v1.8h, v3.8h // t6/2 = t2/2 + t4/2 ++ ssra v7.8h, v21.8h, #1 // (t6 + t2) >> 1 ++ neg v3.8h, v2.8h // -t1 ++ ssra v18.8h, v2.8h, #1 // (t5 + t1) >> 1 ++ ssra v19.8h, v4.8h, #1 // (t7 + t3) >> 1 ++ ssra v0.8h, v24.8h, #1 // (t8 + t4) >> 1 ++ srsra v23.8h, v16.8h, #1 // (t8 - t4 + 1) >> 1 ++ srsra v22.8h, v5.8h, #1 // (t7 - t3 + 1) >> 1 ++ srsra v1.8h, v17.8h, #1 // (t6 - t2 + 1) >> 1 ++ srsra v6.8h, v3.8h, #1 // (t5 - t1 + 1) >> 1 ++ srshr v2.8h, v18.8h, #6 // (t5 + t1 + 64) >> 7 ++ srshr v3.8h, 
v7.8h, #6 // (t6 + t2 + 64) >> 7 ++ srshr v4.8h, v19.8h, #6 // (t7 + t3 + 64) >> 7 ++ srshr v5.8h, v0.8h, #6 // (t8 + t4 + 64) >> 7 ++ srshr v16.8h, v23.8h, #6 // (t8 - t4 + 65) >> 7 ++ srshr v17.8h, v22.8h, #6 // (t7 - t3 + 65) >> 7 ++ st1 {v2.16b, v3.16b}, [x1], #32 ++ srshr v0.8h, v1.8h, #6 // (t6 - t2 + 65) >> 7 ++ srshr v1.8h, v6.8h, #6 // (t5 - t1 + 65) >> 7 ++ st1 {v4.16b, v5.16b}, [x1], #32 ++ st1 {v16.16b, v17.16b}, [x1], #32 ++ st1 {v0.16b, v1.16b}, [x1] ++ ret ++endfunc ++ ++// VC-1 8x4 inverse transform ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> array of 16-bit inverse transform coefficients, in row-major order ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_8x4_neon, export=1 ++ ld1 {v1.8b, v2.8b, v3.8b, v4.8b}, [x2], #32 ++ mov x3, x0 ++ ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x2] ++ ldr q0, .Lcoeffs_it8 // includes 4-point coefficients in upper half of vector ++ ld1 {v5.8b}, [x0], x1 ++ trn2 v6.4h, v1.4h, v3.4h ++ trn2 v7.4h, v2.4h, v4.4h ++ trn1 v1.4h, v1.4h, v3.4h ++ trn1 v2.4h, v2.4h, v4.4h ++ trn2 v3.4h, v16.4h, v18.4h ++ trn2 v4.4h, v17.4h, v19.4h ++ trn1 v16.4h, v16.4h, v18.4h ++ trn1 v17.4h, v17.4h, v19.4h ++ ld1 {v18.8b}, [x0], x1 ++ trn1 v19.2s, v6.2s, v3.2s ++ trn2 v3.2s, v6.2s, v3.2s ++ trn1 v6.2s, v7.2s, v4.2s ++ trn2 v4.2s, v7.2s, v4.2s ++ trn1 v7.2s, v1.2s, v16.2s ++ trn1 v20.2s, v2.2s, v17.2s ++ shl v21.4h, v19.4h, #4 // 16 * src[1] ++ trn2 v1.2s, v1.2s, v16.2s ++ shl v16.4h, v3.4h, #4 // 16 * src[3] ++ trn2 v2.2s, v2.2s, v17.2s ++ shl v17.4h, v6.4h, #4 // 16 * src[5] ++ ld1 {v22.8b}, [x0], x1 ++ shl v23.4h, v4.4h, #4 // 16 * src[7] ++ mul v24.4h, v1.4h, v0.h[0] // 6/2 * src[2] ++ ld1 {v25.8b}, [x0] ++ shl v26.4h, v19.4h, #2 // 4 * src[1] ++ shl v27.4h, v6.4h, #2 // 4 * src[5] ++ ssra v21.4h, v23.4h, #2 // 16 * src[1] + 4 * src[7] ++ ssra v17.4h, v16.4h, #2 // 4 * src[3] + 16 * src[5] ++ sub v23.4h, v23.4h, v26.4h // - 4 * src[1] + 16 * src[7] ++ sub v16.4h, v16.4h, v27.4h // 16 * src[3] - 4 * src[5] ++ shl v7.4h, v7.4h, #2 // 8/2 * src[0] ++ shl v20.4h, v20.4h, #2 // 8/2 * src[4] ++ mla v21.4h, v3.4h, v0.h[2] // 16 * src[1] + 15 * src[3] + 4 * src[7] ++ shl v1.4h, v1.4h, #3 // 16/2 * src[2] ++ mls v17.4h, v19.4h, v0.h[2] // - 15 * src[1] + 4 * src[3] + 16 * src[5] ++ ssra v7.4h, v7.4h, #1 // 12/2 * src[0] ++ mls v16.4h, v19.4h, v0.h[1] // - 9 * src[1] + 16 * src[3] - 4 * src[5] ++ ssra v20.4h, v20.4h, #1 // 12/2 * src[4] ++ mla v23.4h, v3.4h, v0.h[1] // - 4 * src[1] + 9 * src[3] + 16 * src[7] ++ shl v3.4h, v2.4h, #3 // 16/2 * src[6] ++ mla v1.4h, v2.4h, v0.h[0] // t3/2 = 16/2 * src[2] + 6/2 * src[6] ++ mla v21.4h, v6.4h, v0.h[1] // t1 = 16 * src[1] + 15 * src[3] + 9 * src[5] + 4 * src[7] ++ mla v17.4h, v4.4h, v0.h[1] // -t2 = - 15 * src[1] + 4 * src[3] + 16 * src[5] + 9 * src[7] ++ sub v2.4h, v24.4h, v3.4h // t4/2 = 6/2 * src[2] - 16/2 * src[6] ++ mls v16.4h, v4.4h, v0.h[2] // -t3 = - 9 * src[1] + 16 * src[3] - 4 * src[5] - 15 * src[7] ++ add v3.4h, v7.4h, v20.4h // t1/2 = 12/2 * src[0] + 12/2 * src[4] ++ mls v23.4h, v6.4h, v0.h[2] // -t4 = - 4 * src[1] + 9 * src[3] - 15 * src[5] + 16 * src[7] ++ sub v4.4h, v7.4h, v20.4h // t2/2 = 12/2 * src[0] - 12/2 * src[4] ++ neg v6.4h, v21.4h // -t1 ++ add v7.4h, v3.4h, v1.4h // t5/2 = t1/2 + t3/2 ++ sub v19.4h, v3.4h, v1.4h // t8/2 = t1/2 - t3/2 ++ add v20.4h, v4.4h, v2.4h // t6/2 = t2/2 + t4/2 ++ sub v24.4h, v4.4h, v2.4h // t7/2 = t2/2 - t4/2 ++ add v26.4h, v3.4h, 
v1.4h // t5/2 = t1/2 + t3/2 ++ add v27.4h, v4.4h, v2.4h // t6/2 = t2/2 + t4/2 ++ sub v2.4h, v4.4h, v2.4h // t7/2 = t2/2 - t4/2 ++ sub v1.4h, v3.4h, v1.4h // t8/2 = t1/2 - t3/2 ++ neg v3.4h, v17.4h // +t2 ++ neg v4.4h, v16.4h // +t3 ++ neg v28.4h, v23.4h // +t4 ++ ssra v7.4h, v21.4h, #1 // (t5 + t1) >> 1 ++ ssra v1.4h, v23.4h, #1 // (t8 - t4) >> 1 ++ ssra v20.4h, v3.4h, #1 // (t6 + t2) >> 1 ++ ssra v24.4h, v4.4h, #1 // (t7 + t3) >> 1 ++ ssra v19.4h, v28.4h, #1 // (t8 + t4) >> 1 ++ ssra v2.4h, v16.4h, #1 // (t7 - t3) >> 1 ++ ssra v27.4h, v17.4h, #1 // (t6 - t2) >> 1 ++ ssra v26.4h, v6.4h, #1 // (t5 - t1) >> 1 ++ trn1 v1.2d, v7.2d, v1.2d ++ trn1 v2.2d, v20.2d, v2.2d ++ trn1 v3.2d, v24.2d, v27.2d ++ trn1 v4.2d, v19.2d, v26.2d ++ srshr v1.8h, v1.8h, #2 // (t5 + t1 + 4) >> 3, (t8 - t4 + 4) >> 3 ++ srshr v2.8h, v2.8h, #2 // (t6 + t2 + 4) >> 3, (t7 - t3 + 4) >> 3 ++ srshr v3.8h, v3.8h, #2 // (t7 + t3 + 4) >> 3, (t6 - t2 + 4) >> 3 ++ srshr v4.8h, v4.8h, #2 // (t8 + t4 + 4) >> 3, (t5 - t1 + 4) >> 3 ++ trn2 v6.8h, v1.8h, v2.8h ++ trn1 v1.8h, v1.8h, v2.8h ++ trn2 v2.8h, v3.8h, v4.8h ++ trn1 v3.8h, v3.8h, v4.8h ++ trn2 v4.4s, v6.4s, v2.4s ++ trn1 v7.4s, v1.4s, v3.4s ++ trn2 v1.4s, v1.4s, v3.4s ++ mul v3.8h, v4.8h, v0.h[5] // 22/2 * src[24] ++ trn1 v2.4s, v6.4s, v2.4s ++ mul v4.8h, v4.8h, v0.h[4] // 10/2 * src[24] ++ mul v6.8h, v7.8h, v0.h[6] // 17 * src[0] ++ mul v1.8h, v1.8h, v0.h[6] // 17 * src[16] ++ mls v3.8h, v2.8h, v0.h[4] // t4/2 = - 10/2 * src[8] + 22/2 * src[24] ++ mla v4.8h, v2.8h, v0.h[5] // t3/2 = 22/2 * src[8] + 10/2 * src[24] ++ add v0.8h, v6.8h, v1.8h // t1 = 17 * src[0] + 17 * src[16] ++ sub v1.8h, v6.8h, v1.8h // t2 = 17 * src[0] - 17 * src[16] ++ neg v2.8h, v3.8h // -t4/2 ++ neg v6.8h, v4.8h // -t3/2 ++ ssra v4.8h, v0.8h, #1 // (t1 + t3) >> 1 ++ ssra v2.8h, v1.8h, #1 // (t2 - t4) >> 1 ++ ssra v3.8h, v1.8h, #1 // (t2 + t4) >> 1 ++ ssra v6.8h, v0.8h, #1 // (t1 - t3) >> 1 ++ srshr v0.8h, v4.8h, #6 // (t1 + t3 + 64) >> 7 ++ srshr v1.8h, v2.8h, #6 // (t2 - t4 + 64) >> 7 ++ srshr v2.8h, v3.8h, #6 // (t2 + t4 + 64) >> 7 ++ srshr v3.8h, v6.8h, #6 // (t1 - t3 + 64) >> 7 ++ uaddw v0.8h, v0.8h, v5.8b ++ uaddw v1.8h, v1.8h, v18.8b ++ uaddw v2.8h, v2.8h, v22.8b ++ uaddw v3.8h, v3.8h, v25.8b ++ sqxtun v0.8b, v0.8h ++ sqxtun v1.8b, v1.8h ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ st1 {v0.8b}, [x3], x1 ++ st1 {v1.8b}, [x3], x1 ++ st1 {v2.8b}, [x3], x1 ++ st1 {v3.8b}, [x3] ++ ret ++endfunc ++ ++// VC-1 4x8 inverse transform ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients) ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_4x8_neon, export=1 ++ mov x3, #16 ++ ldr q0, .Lcoeffs_it8 // includes 4-point coefficients in upper half of vector ++ mov x4, x0 ++ ld1 {v1.d}[0], [x2], x3 // 00 01 02 03 ++ ld1 {v2.d}[0], [x2], x3 // 10 11 12 13 ++ ld1 {v3.d}[0], [x2], x3 // 20 21 22 23 ++ ld1 {v4.d}[0], [x2], x3 // 30 31 32 33 ++ ld1 {v1.d}[1], [x2], x3 // 40 41 42 43 ++ ld1 {v2.d}[1], [x2], x3 // 50 51 52 53 ++ ld1 {v3.d}[1], [x2], x3 // 60 61 62 63 ++ ld1 {v4.d}[1], [x2] // 70 71 72 73 ++ ld1 {v5.s}[0], [x0], x1 ++ ld1 {v6.s}[0], [x0], x1 ++ ld1 {v7.s}[0], [x0], x1 ++ trn2 v16.8h, v1.8h, v2.8h // 01 11 03 13 41 51 43 53 ++ trn1 v1.8h, v1.8h, v2.8h // 00 10 02 12 40 50 42 52 ++ trn2 v2.8h, v3.8h, v4.8h // 21 31 23 33 61 71 63 73 ++ trn1 v3.8h, v3.8h, v4.8h // 20 30 22 
32 60 70 62 72 ++ ld1 {v4.s}[0], [x0], x1 ++ trn2 v17.4s, v16.4s, v2.4s // 03 13 23 33 43 53 63 73 ++ trn1 v18.4s, v1.4s, v3.4s // 00 10 20 30 40 50 60 70 ++ trn1 v2.4s, v16.4s, v2.4s // 01 11 21 31 41 51 61 71 ++ mul v16.8h, v17.8h, v0.h[4] // 10/2 * src[3] ++ ld1 {v5.s}[1], [x0], x1 ++ mul v17.8h, v17.8h, v0.h[5] // 22/2 * src[3] ++ ld1 {v6.s}[1], [x0], x1 ++ trn2 v1.4s, v1.4s, v3.4s // 02 12 22 32 42 52 62 72 ++ mul v3.8h, v18.8h, v0.h[6] // 17 * src[0] ++ ld1 {v7.s}[1], [x0], x1 ++ mul v1.8h, v1.8h, v0.h[6] // 17 * src[2] ++ ld1 {v4.s}[1], [x0] ++ mla v16.8h, v2.8h, v0.h[5] // t3/2 = 22/2 * src[1] + 10/2 * src[3] ++ mls v17.8h, v2.8h, v0.h[4] // t4/2 = - 10/2 * src[1] + 22/2 * src[3] ++ add v2.8h, v3.8h, v1.8h // t1 = 17 * src[0] + 17 * src[2] ++ sub v1.8h, v3.8h, v1.8h // t2 = 17 * src[0] - 17 * src[2] ++ neg v3.8h, v16.8h // -t3/2 ++ ssra v16.8h, v2.8h, #1 // (t1 + t3) >> 1 ++ neg v18.8h, v17.8h // -t4/2 ++ ssra v17.8h, v1.8h, #1 // (t2 + t4) >> 1 ++ ssra v3.8h, v2.8h, #1 // (t1 - t3) >> 1 ++ ssra v18.8h, v1.8h, #1 // (t2 - t4) >> 1 ++ srshr v1.8h, v16.8h, #2 // (t1 + t3 + 64) >> 3 ++ srshr v2.8h, v17.8h, #2 // (t2 + t4 + 64) >> 3 ++ srshr v3.8h, v3.8h, #2 // (t1 - t3 + 64) >> 3 ++ srshr v16.8h, v18.8h, #2 // (t2 - t4 + 64) >> 3 ++ trn2 v17.8h, v2.8h, v3.8h // 12 13 32 33 52 53 72 73 ++ trn2 v18.8h, v1.8h, v16.8h // 10 11 30 31 50 51 70 71 ++ trn1 v1.8h, v1.8h, v16.8h // 00 01 20 21 40 41 60 61 ++ trn1 v2.8h, v2.8h, v3.8h // 02 03 22 23 42 43 62 63 ++ trn1 v3.4s, v18.4s, v17.4s // 10 11 12 13 50 51 52 53 ++ trn2 v16.4s, v18.4s, v17.4s // 30 31 32 33 70 71 72 73 ++ trn1 v17.4s, v1.4s, v2.4s // 00 01 02 03 40 41 42 43 ++ mov d18, v3.d[1] // 50 51 52 53 ++ shl v19.4h, v3.4h, #4 // 16 * src[8] ++ mov d20, v16.d[1] // 70 71 72 73 ++ shl v21.4h, v16.4h, #4 // 16 * src[24] ++ mov d22, v17.d[1] // 40 41 42 43 ++ shl v23.4h, v3.4h, #2 // 4 * src[8] ++ shl v24.4h, v18.4h, #4 // 16 * src[40] ++ shl v25.4h, v20.4h, #4 // 16 * src[56] ++ shl v26.4h, v18.4h, #2 // 4 * src[40] ++ trn2 v1.4s, v1.4s, v2.4s // 20 21 22 23 60 61 62 63 ++ ssra v24.4h, v21.4h, #2 // 4 * src[24] + 16 * src[40] ++ sub v2.4h, v25.4h, v23.4h // - 4 * src[8] + 16 * src[56] ++ shl v17.4h, v17.4h, #2 // 8/2 * src[0] ++ sub v21.4h, v21.4h, v26.4h // 16 * src[24] - 4 * src[40] ++ shl v22.4h, v22.4h, #2 // 8/2 * src[32] ++ mov d23, v1.d[1] // 60 61 62 63 ++ ssra v19.4h, v25.4h, #2 // 16 * src[8] + 4 * src[56] ++ mul v25.4h, v1.4h, v0.h[0] // 6/2 * src[16] ++ shl v1.4h, v1.4h, #3 // 16/2 * src[16] ++ mls v24.4h, v3.4h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40] ++ ssra v17.4h, v17.4h, #1 // 12/2 * src[0] ++ mls v21.4h, v3.4h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40] ++ ssra v22.4h, v22.4h, #1 // 12/2 * src[32] ++ mla v2.4h, v16.4h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56] ++ shl v3.4h, v23.4h, #3 // 16/2 * src[48] ++ mla v19.4h, v16.4h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56] ++ mla v1.4h, v23.4h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48] ++ mla v24.4h, v20.4h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56] ++ add v16.4h, v17.4h, v22.4h // t1/2 = 12/2 * src[0] + 12/2 * src[32] ++ sub v3.4h, v25.4h, v3.4h // t4/2 = 6/2 * src[16] - 16/2 * src[48] ++ sub v17.4h, v17.4h, v22.4h // t2/2 = 12/2 * src[0] - 12/2 * src[32] ++ mls v21.4h, v20.4h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56] ++ mla v19.4h, v18.4h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56] ++ add v20.4h, v16.4h, v1.4h // t5/2 = t1/2 + t3/2 
++ mls v2.4h, v18.4h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56] ++ sub v0.4h, v16.4h, v1.4h // t8/2 = t1/2 - t3/2 ++ add v18.4h, v17.4h, v3.4h // t6/2 = t2/2 + t4/2 ++ sub v22.4h, v17.4h, v3.4h // t7/2 = t2/2 - t4/2 ++ neg v23.4h, v24.4h // +t2 ++ sub v25.4h, v17.4h, v3.4h // t7/2 = t2/2 - t4/2 ++ add v3.4h, v17.4h, v3.4h // t6/2 = t2/2 + t4/2 ++ neg v17.4h, v21.4h // +t3 ++ sub v26.4h, v16.4h, v1.4h // t8/2 = t1/2 - t3/2 ++ add v1.4h, v16.4h, v1.4h // t5/2 = t1/2 + t3/2 ++ neg v16.4h, v19.4h // -t1 ++ neg v27.4h, v2.4h // +t4 ++ ssra v20.4h, v19.4h, #1 // (t5 + t1) >> 1 ++ srsra v0.4h, v2.4h, #1 // (t8 - t4 + 1) >> 1 ++ ssra v18.4h, v23.4h, #1 // (t6 + t2) >> 1 ++ srsra v22.4h, v21.4h, #1 // (t7 - t3 + 1) >> 1 ++ ssra v25.4h, v17.4h, #1 // (t7 + t3) >> 1 ++ srsra v3.4h, v24.4h, #1 // (t6 - t2 + 1) >> 1 ++ ssra v26.4h, v27.4h, #1 // (t8 + t4) >> 1 ++ srsra v1.4h, v16.4h, #1 // (t5 - t1 + 1) >> 1 ++ trn1 v0.2d, v20.2d, v0.2d ++ trn1 v2.2d, v18.2d, v22.2d ++ trn1 v3.2d, v25.2d, v3.2d ++ trn1 v1.2d, v26.2d, v1.2d ++ srshr v0.8h, v0.8h, #6 // (t5 + t1 + 64) >> 7, (t8 - t4 + 65) >> 7 ++ srshr v2.8h, v2.8h, #6 // (t6 + t2 + 64) >> 7, (t7 - t3 + 65) >> 7 ++ srshr v3.8h, v3.8h, #6 // (t7 + t3 + 64) >> 7, (t6 - t2 + 65) >> 7 ++ srshr v1.8h, v1.8h, #6 // (t8 + t4 + 64) >> 7, (t5 - t1 + 65) >> 7 ++ uaddw v0.8h, v0.8h, v5.8b ++ uaddw v2.8h, v2.8h, v6.8b ++ uaddw v3.8h, v3.8h, v7.8b ++ uaddw v1.8h, v1.8h, v4.8b ++ sqxtun v0.8b, v0.8h ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ sqxtun v1.8b, v1.8h ++ st1 {v0.s}[0], [x4], x1 ++ st1 {v2.s}[0], [x4], x1 ++ st1 {v3.s}[0], [x4], x1 ++ st1 {v1.s}[0], [x4], x1 ++ st1 {v0.s}[1], [x4], x1 ++ st1 {v2.s}[1], [x4], x1 ++ st1 {v3.s}[1], [x4], x1 ++ st1 {v1.s}[1], [x4] ++ ret ++endfunc ++ ++// VC-1 4x4 inverse transform ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients) ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_4x4_neon, export=1 ++ mov x3, #16 ++ ldr d0, .Lcoeffs_it4 ++ mov x4, x0 ++ ld1 {v1.d}[0], [x2], x3 // 00 01 02 03 ++ ld1 {v2.d}[0], [x2], x3 // 10 11 12 13 ++ ld1 {v3.d}[0], [x2], x3 // 20 21 22 23 ++ ld1 {v4.d}[0], [x2] // 30 31 32 33 ++ ld1 {v5.s}[0], [x0], x1 ++ ld1 {v5.s}[1], [x0], x1 ++ ld1 {v6.s}[0], [x0], x1 ++ trn2 v7.4h, v1.4h, v2.4h // 01 11 03 13 ++ trn1 v1.4h, v1.4h, v2.4h // 00 10 02 12 ++ ld1 {v6.s}[1], [x0] ++ trn2 v2.4h, v3.4h, v4.4h // 21 31 23 33 ++ trn1 v3.4h, v3.4h, v4.4h // 20 30 22 32 ++ trn2 v4.2s, v7.2s, v2.2s // 03 13 23 33 ++ trn1 v16.2s, v1.2s, v3.2s // 00 10 20 30 ++ trn1 v2.2s, v7.2s, v2.2s // 01 11 21 31 ++ trn2 v1.2s, v1.2s, v3.2s // 02 12 22 32 ++ mul v3.4h, v4.4h, v0.h[0] // 10/2 * src[3] ++ mul v4.4h, v4.4h, v0.h[1] // 22/2 * src[3] ++ mul v7.4h, v16.4h, v0.h[2] // 17 * src[0] ++ mul v1.4h, v1.4h, v0.h[2] // 17 * src[2] ++ mla v3.4h, v2.4h, v0.h[1] // t3/2 = 22/2 * src[1] + 10/2 * src[3] ++ mls v4.4h, v2.4h, v0.h[0] // t4/2 = - 10/2 * src[1] + 22/2 * src[3] ++ add v2.4h, v7.4h, v1.4h // t1 = 17 * src[0] + 17 * src[2] ++ sub v1.4h, v7.4h, v1.4h // t2 = 17 * src[0] - 17 * src[2] ++ neg v7.4h, v3.4h // -t3/2 ++ neg v16.4h, v4.4h // -t4/2 ++ ssra v3.4h, v2.4h, #1 // (t1 + t3) >> 1 ++ ssra v4.4h, v1.4h, #1 // (t2 + t4) >> 1 ++ ssra v16.4h, v1.4h, #1 // (t2 - t4) >> 1 ++ ssra v7.4h, v2.4h, #1 // (t1 - t3) >> 1 ++ srshr v1.4h, v3.4h, #2 // (t1 + t3 + 
64) >> 3 ++ srshr v2.4h, v4.4h, #2 // (t2 + t4 + 64) >> 3 ++ srshr v3.4h, v16.4h, #2 // (t2 - t4 + 64) >> 3 ++ srshr v4.4h, v7.4h, #2 // (t1 - t3 + 64) >> 3 ++ trn2 v7.4h, v1.4h, v3.4h // 10 11 30 31 ++ trn1 v1.4h, v1.4h, v3.4h // 00 01 20 21 ++ trn2 v3.4h, v2.4h, v4.4h // 12 13 32 33 ++ trn1 v2.4h, v2.4h, v4.4h // 02 03 22 23 ++ trn2 v4.2s, v7.2s, v3.2s // 30 31 32 33 ++ trn1 v16.2s, v1.2s, v2.2s // 00 01 02 03 ++ trn1 v3.2s, v7.2s, v3.2s // 10 11 12 13 ++ trn2 v1.2s, v1.2s, v2.2s // 20 21 22 23 ++ mul v2.4h, v4.4h, v0.h[1] // 22/2 * src[24] ++ mul v4.4h, v4.4h, v0.h[0] // 10/2 * src[24] ++ mul v7.4h, v16.4h, v0.h[2] // 17 * src[0] ++ mul v1.4h, v1.4h, v0.h[2] // 17 * src[16] ++ mls v2.4h, v3.4h, v0.h[0] // t4/2 = - 10/2 * src[8] + 22/2 * src[24] ++ mla v4.4h, v3.4h, v0.h[1] // t3/2 = 22/2 * src[8] + 10/2 * src[24] ++ add v0.4h, v7.4h, v1.4h // t1 = 17 * src[0] + 17 * src[16] ++ sub v1.4h, v7.4h, v1.4h // t2 = 17 * src[0] - 17 * src[16] ++ neg v3.4h, v2.4h // -t4/2 ++ neg v7.4h, v4.4h // -t3/2 ++ ssra v4.4h, v0.4h, #1 // (t1 + t3) >> 1 ++ ssra v3.4h, v1.4h, #1 // (t2 - t4) >> 1 ++ ssra v2.4h, v1.4h, #1 // (t2 + t4) >> 1 ++ ssra v7.4h, v0.4h, #1 // (t1 - t3) >> 1 ++ trn1 v0.2d, v4.2d, v3.2d ++ trn1 v1.2d, v2.2d, v7.2d ++ srshr v0.8h, v0.8h, #6 // (t1 + t3 + 64) >> 7, (t2 - t4 + 64) >> 7 ++ srshr v1.8h, v1.8h, #6 // (t2 + t4 + 64) >> 7, (t1 - t3 + 64) >> 7 ++ uaddw v0.8h, v0.8h, v5.8b ++ uaddw v1.8h, v1.8h, v6.8b ++ sqxtun v0.8b, v0.8h ++ sqxtun v1.8b, v1.8h ++ st1 {v0.s}[0], [x4], x1 ++ st1 {v0.s}[1], [x4], x1 ++ st1 {v1.s}[0], [x4], x1 ++ st1 {v1.s}[1], [x4] ++ ret ++endfunc ++ ++// VC-1 8x8 inverse transform, DC case ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> 16-bit inverse transform DC coefficient ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_8x8_dc_neon, export=1 ++ ldrsh w2, [x2] ++ mov x3, x0 ++ ld1 {v0.8b}, [x0], x1 ++ ld1 {v1.8b}, [x0], x1 ++ ld1 {v2.8b}, [x0], x1 ++ add w2, w2, w2, lsl #1 ++ ld1 {v3.8b}, [x0], x1 ++ ld1 {v4.8b}, [x0], x1 ++ add w2, w2, #1 ++ ld1 {v5.8b}, [x0], x1 ++ asr w2, w2, #1 ++ ld1 {v6.8b}, [x0], x1 ++ add w2, w2, w2, lsl #1 ++ ld1 {v7.8b}, [x0] ++ add w0, w2, #16 ++ asr w0, w0, #5 ++ dup v16.8h, w0 ++ uaddw v0.8h, v16.8h, v0.8b ++ uaddw v1.8h, v16.8h, v1.8b ++ uaddw v2.8h, v16.8h, v2.8b ++ uaddw v3.8h, v16.8h, v3.8b ++ uaddw v4.8h, v16.8h, v4.8b ++ uaddw v5.8h, v16.8h, v5.8b ++ sqxtun v0.8b, v0.8h ++ uaddw v6.8h, v16.8h, v6.8b ++ sqxtun v1.8b, v1.8h ++ uaddw v7.8h, v16.8h, v7.8b ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ sqxtun v4.8b, v4.8h ++ st1 {v0.8b}, [x3], x1 ++ sqxtun v0.8b, v5.8h ++ st1 {v1.8b}, [x3], x1 ++ sqxtun v1.8b, v6.8h ++ st1 {v2.8b}, [x3], x1 ++ sqxtun v2.8b, v7.8h ++ st1 {v3.8b}, [x3], x1 ++ st1 {v4.8b}, [x3], x1 ++ st1 {v0.8b}, [x3], x1 ++ st1 {v1.8b}, [x3], x1 ++ st1 {v2.8b}, [x3] ++ ret ++endfunc ++ ++// VC-1 8x4 inverse transform, DC case ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> 16-bit inverse transform DC coefficient ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_8x4_dc_neon, export=1 ++ ldrsh w2, [x2] ++ mov x3, x0 ++ ld1 {v0.8b}, [x0], x1 ++ ld1 {v1.8b}, [x0], x1 ++ ld1 {v2.8b}, [x0], x1 ++ add w2, w2, w2, lsl #1 ++ ld1 {v3.8b}, [x0] ++ add w0, w2, #1 ++ asr w0, w0, #1 ++ add w0, w0, w0, lsl #4 ++ add w0, w0, #64 ++ asr 
w0, w0, #7 ++ dup v4.8h, w0 ++ uaddw v0.8h, v4.8h, v0.8b ++ uaddw v1.8h, v4.8h, v1.8b ++ uaddw v2.8h, v4.8h, v2.8b ++ uaddw v3.8h, v4.8h, v3.8b ++ sqxtun v0.8b, v0.8h ++ sqxtun v1.8b, v1.8h ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ st1 {v0.8b}, [x3], x1 ++ st1 {v1.8b}, [x3], x1 ++ st1 {v2.8b}, [x3], x1 ++ st1 {v3.8b}, [x3] ++ ret ++endfunc ++ ++// VC-1 4x8 inverse transform, DC case ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> 16-bit inverse transform DC coefficient ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_4x8_dc_neon, export=1 ++ ldrsh w2, [x2] ++ mov x3, x0 ++ ld1 {v0.s}[0], [x0], x1 ++ ld1 {v1.s}[0], [x0], x1 ++ ld1 {v2.s}[0], [x0], x1 ++ add w2, w2, w2, lsl #4 ++ ld1 {v3.s}[0], [x0], x1 ++ add w2, w2, #4 ++ asr w2, w2, #3 ++ add w2, w2, w2, lsl #1 ++ ld1 {v0.s}[1], [x0], x1 ++ add w2, w2, #16 ++ asr w2, w2, #5 ++ dup v4.8h, w2 ++ ld1 {v1.s}[1], [x0], x1 ++ ld1 {v2.s}[1], [x0], x1 ++ ld1 {v3.s}[1], [x0] ++ uaddw v0.8h, v4.8h, v0.8b ++ uaddw v1.8h, v4.8h, v1.8b ++ uaddw v2.8h, v4.8h, v2.8b ++ uaddw v3.8h, v4.8h, v3.8b ++ sqxtun v0.8b, v0.8h ++ sqxtun v1.8b, v1.8h ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ st1 {v0.s}[0], [x3], x1 ++ st1 {v1.s}[0], [x3], x1 ++ st1 {v2.s}[0], [x3], x1 ++ st1 {v3.s}[0], [x3], x1 ++ st1 {v0.s}[1], [x3], x1 ++ st1 {v1.s}[1], [x3], x1 ++ st1 {v2.s}[1], [x3], x1 ++ st1 {v3.s}[1], [x3] ++ ret ++endfunc ++ ++// VC-1 4x4 inverse transform, DC case ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> 16-bit inverse transform DC coefficient ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_4x4_dc_neon, export=1 ++ ldrsh w2, [x2] ++ mov x3, x0 ++ ld1 {v0.s}[0], [x0], x1 ++ ld1 {v1.s}[0], [x0], x1 ++ ld1 {v0.s}[1], [x0], x1 ++ add w2, w2, w2, lsl #4 ++ ld1 {v1.s}[1], [x0] ++ add w0, w2, #4 ++ asr w0, w0, #3 ++ add w0, w0, w0, lsl #4 ++ add w0, w0, #64 ++ asr w0, w0, #7 ++ dup v2.8h, w0 ++ uaddw v0.8h, v2.8h, v0.8b ++ uaddw v1.8h, v2.8h, v1.8b ++ sqxtun v0.8b, v0.8h ++ sqxtun v1.8b, v1.8h ++ st1 {v0.s}[0], [x3], x1 ++ st1 {v1.s}[0], [x3], x1 ++ st1 {v0.s}[1], [x3], x1 ++ st1 {v1.s}[1], [x3] ++ ret ++endfunc ++ ++.align 5 ++.Lcoeffs_it8: ++.quad 0x000F00090003 ++.Lcoeffs_it4: ++.quad 0x0011000B0005 ++.Lcoeffs: ++.quad 0x00050002 ++ ++// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks ++// On entry: ++// x0 -> top-left pel of lower block ++// x1 = row stride, bytes ++// w2 = PQUANT bitstream parameter ++function ff_vc1_v_loop_filter4_neon, export=1 ++ sub x3, x0, w1, sxtw #2 ++ ldr d0, .Lcoeffs ++ ld1 {v1.s}[0], [x0], x1 // P5 ++ ld1 {v2.s}[0], [x3], x1 // P1 ++ ld1 {v3.s}[0], [x3], x1 // P2 ++ ld1 {v4.s}[0], [x0], x1 // P6 ++ ld1 {v5.s}[0], [x3], x1 // P3 ++ ld1 {v6.s}[0], [x0], x1 // P7 ++ ld1 {v7.s}[0], [x3] // P4 ++ ld1 {v16.s}[0], [x0] // P8 ++ ushll v17.8h, v1.8b, #1 // 2*P5 ++ dup v18.8h, w2 // pq ++ ushll v2.8h, v2.8b, #1 // 2*P1 ++ uxtl v3.8h, v3.8b // P2 ++ uxtl v4.8h, v4.8b // P6 ++ uxtl v19.8h, v5.8b // P3 ++ mls v2.4h, v3.4h, v0.h[1] // 2*P1-5*P2 ++ uxtl v3.8h, v6.8b // P7 ++ mls v17.4h, v4.4h, v0.h[1] // 2*P5-5*P6 ++ ushll v5.8h, v5.8b, #1 // 2*P3 ++ uxtl v6.8h, v7.8b // P4 ++ mla v17.4h, v3.4h, v0.h[1] // 2*P5-5*P6+5*P7 ++ uxtl v3.8h, v16.8b // P8 ++ mla v2.4h, v19.4h, v0.h[1] // 2*P1-5*P2+5*P3 ++ uxtl 
v1.8h, v1.8b // P5 ++ mls v5.4h, v6.4h, v0.h[1] // 2*P3-5*P4 ++ mls v17.4h, v3.4h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8 ++ sub v3.4h, v6.4h, v1.4h // P4-P5 ++ mls v2.4h, v6.4h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4 ++ mla v5.4h, v1.4h, v0.h[1] // 2*P3-5*P4+5*P5 ++ mls v5.4h, v4.4h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6 ++ abs v4.4h, v3.4h ++ srshr v7.4h, v17.4h, #3 ++ srshr v2.4h, v2.4h, #3 ++ sshr v4.4h, v4.4h, #1 // clip ++ srshr v5.4h, v5.4h, #3 ++ abs v7.4h, v7.4h // a2 ++ sshr v3.4h, v3.4h, #8 // clip_sign ++ abs v2.4h, v2.4h // a1 ++ cmeq v16.4h, v4.4h, #0 // test clip == 0 ++ abs v17.4h, v5.4h // a0 ++ sshr v5.4h, v5.4h, #8 // a0_sign ++ cmhs v19.4h, v2.4h, v7.4h // test a1 >= a2 ++ cmhs v18.4h, v17.4h, v18.4h // test a0 >= pq ++ sub v3.4h, v3.4h, v5.4h // clip_sign - a0_sign ++ bsl v19.8b, v7.8b, v2.8b // a3 ++ orr v2.8b, v16.8b, v18.8b // test clip == 0 || a0 >= pq ++ uqsub v5.4h, v17.4h, v19.4h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ cmhs v7.4h, v19.4h, v17.4h // test a3 >= a0 ++ mul v0.4h, v5.4h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0 ++ orr v5.8b, v2.8b, v7.8b // test clip == 0 || a0 >= pq || a3 >= a0 ++ mov w0, v5.s[1] // move to gp reg ++ ushr v0.4h, v0.4h, #3 // a0 >= a3 ? (5*(a0-a3))>>3 : 0 ++ cmhs v5.4h, v0.4h, v4.4h ++ tbnz w0, #0, 1f // none of the 4 pixel pairs should be updated if this one is not filtered ++ bsl v5.8b, v4.8b, v0.8b // FFMIN(d, clip) ++ bic v0.8b, v5.8b, v2.8b // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) ++ mls v6.4h, v0.4h, v3.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ mla v1.4h, v0.4h, v3.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ sqxtun v0.8b, v6.8h ++ sqxtun v1.8b, v1.8h ++ st1 {v0.s}[0], [x3], x1 ++ st1 {v1.s}[0], [x3] ++1: ret ++endfunc ++ ++// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks ++// On entry: ++// x0 -> top-left pel of right block ++// x1 = row stride, bytes ++// w2 = PQUANT bitstream parameter ++function ff_vc1_h_loop_filter4_neon, export=1 ++ sub x3, x0, #4 // where to start reading ++ ldr d0, .Lcoeffs ++ ld1 {v1.8b}, [x3], x1 ++ sub x0, x0, #1 // where to start writing ++ ld1 {v2.8b}, [x3], x1 ++ ld1 {v3.8b}, [x3], x1 ++ ld1 {v4.8b}, [x3] ++ dup v5.8h, w2 // pq ++ trn1 v6.8b, v1.8b, v2.8b ++ trn2 v1.8b, v1.8b, v2.8b ++ trn1 v2.8b, v3.8b, v4.8b ++ trn2 v3.8b, v3.8b, v4.8b ++ trn1 v4.4h, v6.4h, v2.4h // P1, P5 ++ trn1 v7.4h, v1.4h, v3.4h // P2, P6 ++ trn2 v2.4h, v6.4h, v2.4h // P3, P7 ++ trn2 v1.4h, v1.4h, v3.4h // P4, P8 ++ ushll v3.8h, v4.8b, #1 // 2*P1, 2*P5 ++ uxtl v6.8h, v7.8b // P2, P6 ++ uxtl v7.8h, v2.8b // P3, P7 ++ uxtl v1.8h, v1.8b // P4, P8 ++ mls v3.8h, v6.8h, v0.h[1] // 2*P1-5*P2, 2*P5-5*P6 ++ ushll v2.8h, v2.8b, #1 // 2*P3, 2*P7 ++ uxtl v4.8h, v4.8b // P1, P5 ++ mla v3.8h, v7.8h, v0.h[1] // 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7 ++ mov d6, v6.d[1] // P6 ++ mls v3.8h, v1.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8 ++ mov d4, v4.d[1] // P5 ++ mls v2.4h, v1.4h, v0.h[1] // 2*P3-5*P4 ++ mla v2.4h, v4.4h, v0.h[1] // 2*P3-5*P4+5*P5 ++ sub v7.4h, v1.4h, v4.4h // P4-P5 ++ mls v2.4h, v6.4h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6 ++ srshr v3.8h, v3.8h, #3 ++ abs v6.4h, v7.4h ++ sshr v7.4h, v7.4h, #8 // clip_sign ++ srshr v2.4h, v2.4h, #3 ++ abs v3.8h, v3.8h // a1, a2 ++ sshr v6.4h, 
v6.4h, #1 // clip ++ mov d16, v3.d[1] // a2 ++ abs v17.4h, v2.4h // a0 ++ cmeq v18.4h, v6.4h, #0 // test clip == 0 ++ sshr v2.4h, v2.4h, #8 // a0_sign ++ cmhs v19.4h, v3.4h, v16.4h // test a1 >= a2 ++ cmhs v5.4h, v17.4h, v5.4h // test a0 >= pq ++ sub v2.4h, v7.4h, v2.4h // clip_sign - a0_sign ++ bsl v19.8b, v16.8b, v3.8b // a3 ++ orr v3.8b, v18.8b, v5.8b // test clip == 0 || a0 >= pq ++ uqsub v5.4h, v17.4h, v19.4h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ cmhs v7.4h, v19.4h, v17.4h // test a3 >= a0 ++ mul v0.4h, v5.4h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0 ++ orr v5.8b, v3.8b, v7.8b // test clip == 0 || a0 >= pq || a3 >= a0 ++ mov w2, v5.s[1] // move to gp reg ++ ushr v0.4h, v0.4h, #3 // a0 >= a3 ? (5*(a0-a3))>>3 : 0 ++ cmhs v5.4h, v0.4h, v6.4h ++ tbnz w2, #0, 1f // none of the 4 pixel pairs should be updated if this one is not filtered ++ bsl v5.8b, v6.8b, v0.8b // FFMIN(d, clip) ++ bic v0.8b, v5.8b, v3.8b // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) ++ mla v4.4h, v0.4h, v2.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ mls v1.4h, v0.4h, v2.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ sqxtun v3.8b, v4.8h ++ sqxtun v2.8b, v1.8h ++ st2 {v2.b, v3.b}[0], [x0], x1 ++ st2 {v2.b, v3.b}[1], [x0], x1 ++ st2 {v2.b, v3.b}[2], [x0], x1 ++ st2 {v2.b, v3.b}[3], [x0] ++1: ret ++endfunc ++ ++// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks ++// On entry: ++// x0 -> top-left pel of lower block ++// x1 = row stride, bytes ++// w2 = PQUANT bitstream parameter ++function ff_vc1_v_loop_filter8_neon, export=1 ++ sub x3, x0, w1, sxtw #2 ++ ldr d0, .Lcoeffs ++ ld1 {v1.8b}, [x0], x1 // P5 ++ movi v2.2d, #0x0000ffff00000000 ++ ld1 {v3.8b}, [x3], x1 // P1 ++ ld1 {v4.8b}, [x3], x1 // P2 ++ ld1 {v5.8b}, [x0], x1 // P6 ++ ld1 {v6.8b}, [x3], x1 // P3 ++ ld1 {v7.8b}, [x0], x1 // P7 ++ ushll v16.8h, v1.8b, #1 // 2*P5 ++ ushll v3.8h, v3.8b, #1 // 2*P1 ++ ld1 {v17.8b}, [x3] // P4 ++ uxtl v4.8h, v4.8b // P2 ++ ld1 {v18.8b}, [x0] // P8 ++ uxtl v5.8h, v5.8b // P6 ++ dup v19.8h, w2 // pq ++ uxtl v20.8h, v6.8b // P3 ++ mls v3.8h, v4.8h, v0.h[1] // 2*P1-5*P2 ++ uxtl v4.8h, v7.8b // P7 ++ ushll v6.8h, v6.8b, #1 // 2*P3 ++ mls v16.8h, v5.8h, v0.h[1] // 2*P5-5*P6 ++ uxtl v7.8h, v17.8b // P4 ++ uxtl v17.8h, v18.8b // P8 ++ mla v16.8h, v4.8h, v0.h[1] // 2*P5-5*P6+5*P7 ++ uxtl v1.8h, v1.8b // P5 ++ mla v3.8h, v20.8h, v0.h[1] // 2*P1-5*P2+5*P3 ++ sub v4.8h, v7.8h, v1.8h // P4-P5 ++ mls v6.8h, v7.8h, v0.h[1] // 2*P3-5*P4 ++ mls v16.8h, v17.8h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8 ++ abs v17.8h, v4.8h ++ sshr v4.8h, v4.8h, #8 // clip_sign ++ mls v3.8h, v7.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4 ++ sshr v17.8h, v17.8h, #1 // clip ++ mla v6.8h, v1.8h, v0.h[1] // 2*P3-5*P4+5*P5 ++ srshr v16.8h, v16.8h, #3 ++ mls v6.8h, v5.8h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6 ++ cmeq v5.8h, v17.8h, #0 // test clip == 0 ++ srshr v3.8h, v3.8h, #3 ++ abs v16.8h, v16.8h // a2 ++ abs v3.8h, v3.8h // a1 ++ srshr v6.8h, v6.8h, #3 ++ cmhs v18.8h, v3.8h, v16.8h // test a1 >= a2 ++ abs v20.8h, v6.8h // a0 ++ sshr v6.8h, v6.8h, #8 // a0_sign ++ bsl v18.16b, v16.16b, v3.16b // a3 ++ cmhs v3.8h, v20.8h, v19.8h // test a0 >= pq ++ sub v4.8h, v4.8h, v6.8h // clip_sign - a0_sign ++ uqsub v6.8h, v20.8h, v18.8h // a0 >= a3 ? 
a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ cmhs v16.8h, v18.8h, v20.8h // test a3 >= a0 ++ orr v3.16b, v5.16b, v3.16b // test clip == 0 || a0 >= pq ++ mul v0.8h, v6.8h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0 ++ orr v5.16b, v3.16b, v16.16b // test clip == 0 || a0 >= pq || a3 >= a0 ++ cmtst v2.2d, v5.2d, v2.2d // if 2nd of each group of is not filtered, then none of the others in the group should be either ++ mov w0, v5.s[1] // move to gp reg ++ ushr v0.8h, v0.8h, #3 // a0 >= a3 ? (5*(a0-a3))>>3 : 0 ++ mov w2, v5.s[3] ++ orr v2.16b, v3.16b, v2.16b ++ cmhs v3.8h, v0.8h, v17.8h ++ and w0, w0, w2 ++ bsl v3.16b, v17.16b, v0.16b // FFMIN(d, clip) ++ tbnz w0, #0, 1f // none of the 8 pixel pairs should be updated in this case ++ bic v0.16b, v3.16b, v2.16b // set each d to zero if it should not be filtered ++ mls v7.8h, v0.8h, v4.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ mla v1.8h, v0.8h, v4.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ sqxtun v0.8b, v7.8h ++ sqxtun v1.8b, v1.8h ++ st1 {v0.8b}, [x3], x1 ++ st1 {v1.8b}, [x3] ++1: ret ++endfunc ++ ++// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks ++// On entry: ++// x0 -> top-left pel of right block ++// x1 = row stride, bytes ++// w2 = PQUANT bitstream parameter ++function ff_vc1_h_loop_filter8_neon, export=1 ++ sub x3, x0, #4 // where to start reading ++ ldr d0, .Lcoeffs ++ ld1 {v1.8b}, [x3], x1 // P1[0], P2[0]... ++ sub x0, x0, #1 // where to start writing ++ ld1 {v2.8b}, [x3], x1 ++ add x4, x0, x1, lsl #2 ++ ld1 {v3.8b}, [x3], x1 ++ ld1 {v4.8b}, [x3], x1 ++ ld1 {v5.8b}, [x3], x1 ++ ld1 {v6.8b}, [x3], x1 ++ ld1 {v7.8b}, [x3], x1 ++ trn1 v16.8b, v1.8b, v2.8b // P1[0], P1[1], P3[0]... ++ ld1 {v17.8b}, [x3] ++ trn2 v1.8b, v1.8b, v2.8b // P2[0], P2[1], P4[0]... ++ trn1 v2.8b, v3.8b, v4.8b // P1[2], P1[3], P3[2]... ++ trn2 v3.8b, v3.8b, v4.8b // P2[2], P2[3], P4[2]... ++ dup v4.8h, w2 // pq ++ trn1 v18.8b, v5.8b, v6.8b // P1[4], P1[5], P3[4]... ++ trn2 v5.8b, v5.8b, v6.8b // P2[4], P2[5], P4[4]... ++ trn1 v6.4h, v16.4h, v2.4h // P1[0], P1[1], P1[2], P1[3], P5[0]... ++ trn1 v19.4h, v1.4h, v3.4h // P2[0], P2[1], P2[2], P2[3], P6[0]... ++ trn1 v20.8b, v7.8b, v17.8b // P1[6], P1[7], P3[6]... ++ trn2 v7.8b, v7.8b, v17.8b // P2[6], P2[7], P4[6]... ++ trn2 v2.4h, v16.4h, v2.4h // P3[0], P3[1], P3[2], P3[3], P7[0]... ++ trn2 v1.4h, v1.4h, v3.4h // P4[0], P4[1], P4[2], P4[3], P8[0]... ++ trn1 v3.4h, v18.4h, v20.4h // P1[4], P1[5], P1[6], P1[7], P5[4]... ++ trn1 v16.4h, v5.4h, v7.4h // P2[4], P2[5], P2[6], P2[7], P6[4]... ++ trn2 v17.4h, v18.4h, v20.4h // P3[4], P3[5], P3[6], P3[7], P7[4]... ++ trn2 v5.4h, v5.4h, v7.4h // P4[4], P4[5], P4[6], P4[7], P8[4]... 
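For reference, the 4-, 8- and 16-pair loop-filter entry points in this file all evaluate the same per-edge decision, which the inline comments spell out term by term (a0, a1, a2, a3, clip, d). A scalar sketch of that decision is given below; it is reconstructed from the comments only and is illustrative, not the upstream scalar implementation, and the >> on possibly-negative values stands for the arithmetic (srshr/sshr) shifts used in the assembly.

    #include <stdlib.h>

    /* One edge position: P[1]..P[8] run across the block edge, with P[4] and
     * P[5] straddling it; pq is the PQUANT parameter passed in w2/r2.
     * Returns nonzero when the filter-enable condition holds, i.e. the
     * per-position flag the NEON code tests. */
    static int vc1_edge_filter_sketch(int P[9], int pq)
    {
        int a1 = abs((2 * P[1] - 5 * P[2] + 5 * P[3] - 2 * P[4] + 4) >> 3);
        int a2 = abs((2 * P[5] - 5 * P[6] + 5 * P[7] - 2 * P[8] + 4) >> 3);
        int t  =      (2 * P[3] - 5 * P[4] + 5 * P[5] - 2 * P[6] + 4) >> 3;
        int a0 = abs(t);
        int a3 = a1 < a2 ? a1 : a2;                /* FFMIN(a1, a2) */
        int clip = abs(P[4] - P[5]) >> 1;

        if (clip == 0 || a0 >= pq || a3 >= a0)     /* edge left untouched */
            return 0;

        int d = (5 * (a0 - a3)) >> 3;
        if (d > clip)
            d = clip;                              /* FFMIN(d, clip) */

        /* "clip_sign - a0_sign": no change when the two signs match,
         * otherwise P4 and P5 are pulled towards each other by d */
        int clip_sign = (P[4] - P[5]) < 0 ? -1 : 0;
        int a0_sign   = t < 0 ? -1 : 0;
        int f = clip_sign - a0_sign;
        P[4] -= f * d;
        P[5] += f * d;
        return 1;
    }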
++ trn1 v7.2s, v6.2s, v3.2s // P1 ++ trn1 v18.2s, v19.2s, v16.2s // P2 ++ trn2 v3.2s, v6.2s, v3.2s // P5 ++ trn2 v6.2s, v19.2s, v16.2s // P6 ++ trn1 v16.2s, v2.2s, v17.2s // P3 ++ trn2 v2.2s, v2.2s, v17.2s // P7 ++ ushll v7.8h, v7.8b, #1 // 2*P1 ++ trn1 v17.2s, v1.2s, v5.2s // P4 ++ ushll v19.8h, v3.8b, #1 // 2*P5 ++ trn2 v1.2s, v1.2s, v5.2s // P8 ++ uxtl v5.8h, v18.8b // P2 ++ uxtl v6.8h, v6.8b // P6 ++ uxtl v18.8h, v16.8b // P3 ++ mls v7.8h, v5.8h, v0.h[1] // 2*P1-5*P2 ++ uxtl v2.8h, v2.8b // P7 ++ ushll v5.8h, v16.8b, #1 // 2*P3 ++ mls v19.8h, v6.8h, v0.h[1] // 2*P5-5*P6 ++ uxtl v16.8h, v17.8b // P4 ++ uxtl v1.8h, v1.8b // P8 ++ mla v19.8h, v2.8h, v0.h[1] // 2*P5-5*P6+5*P7 ++ uxtl v2.8h, v3.8b // P5 ++ mla v7.8h, v18.8h, v0.h[1] // 2*P1-5*P2+5*P3 ++ sub v3.8h, v16.8h, v2.8h // P4-P5 ++ mls v5.8h, v16.8h, v0.h[1] // 2*P3-5*P4 ++ mls v19.8h, v1.8h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8 ++ abs v1.8h, v3.8h ++ sshr v3.8h, v3.8h, #8 // clip_sign ++ mls v7.8h, v16.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4 ++ sshr v1.8h, v1.8h, #1 // clip ++ mla v5.8h, v2.8h, v0.h[1] // 2*P3-5*P4+5*P5 ++ srshr v17.8h, v19.8h, #3 ++ mls v5.8h, v6.8h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6 ++ cmeq v6.8h, v1.8h, #0 // test clip == 0 ++ srshr v7.8h, v7.8h, #3 ++ abs v17.8h, v17.8h // a2 ++ abs v7.8h, v7.8h // a1 ++ srshr v5.8h, v5.8h, #3 ++ cmhs v18.8h, v7.8h, v17.8h // test a1 >= a2 ++ abs v19.8h, v5.8h // a0 ++ sshr v5.8h, v5.8h, #8 // a0_sign ++ bsl v18.16b, v17.16b, v7.16b // a3 ++ cmhs v4.8h, v19.8h, v4.8h // test a0 >= pq ++ sub v3.8h, v3.8h, v5.8h // clip_sign - a0_sign ++ uqsub v5.8h, v19.8h, v18.8h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ cmhs v7.8h, v18.8h, v19.8h // test a3 >= a0 ++ orr v4.16b, v6.16b, v4.16b // test clip == 0 || a0 >= pq ++ mul v0.8h, v5.8h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0 ++ orr v5.16b, v4.16b, v7.16b // test clip == 0 || a0 >= pq || a3 >= a0 ++ mov w2, v5.s[1] // move to gp reg ++ ushr v0.8h, v0.8h, #3 // a0 >= a3 ? 
(5*(a0-a3))>>3 : 0 ++ mov w3, v5.s[3] ++ cmhs v5.8h, v0.8h, v1.8h ++ and w5, w2, w3 ++ bsl v5.16b, v1.16b, v0.16b // FFMIN(d, clip) ++ tbnz w5, #0, 2f // none of the 8 pixel pairs should be updated in this case ++ bic v0.16b, v5.16b, v4.16b // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) ++ mla v2.8h, v0.8h, v3.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ mls v16.8h, v0.8h, v3.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ sqxtun v1.8b, v2.8h ++ sqxtun v0.8b, v16.8h ++ tbnz w2, #0, 1f // none of the first 4 pixel pairs should be updated if so ++ st2 {v0.b, v1.b}[0], [x0], x1 ++ st2 {v0.b, v1.b}[1], [x0], x1 ++ st2 {v0.b, v1.b}[2], [x0], x1 ++ st2 {v0.b, v1.b}[3], [x0] ++1: tbnz w3, #0, 2f // none of the second 4 pixel pairs should be updated if so ++ st2 {v0.b, v1.b}[4], [x4], x1 ++ st2 {v0.b, v1.b}[5], [x4], x1 ++ st2 {v0.b, v1.b}[6], [x4], x1 ++ st2 {v0.b, v1.b}[7], [x4] ++2: ret ++endfunc ++ ++// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks ++// On entry: ++// x0 -> top-left pel of lower block ++// x1 = row stride, bytes ++// w2 = PQUANT bitstream parameter ++function ff_vc1_v_loop_filter16_neon, export=1 ++ sub x3, x0, w1, sxtw #2 ++ ldr d0, .Lcoeffs ++ ld1 {v1.16b}, [x0], x1 // P5 ++ movi v2.2d, #0x0000ffff00000000 ++ ld1 {v3.16b}, [x3], x1 // P1 ++ ld1 {v4.16b}, [x3], x1 // P2 ++ ld1 {v5.16b}, [x0], x1 // P6 ++ ld1 {v6.16b}, [x3], x1 // P3 ++ ld1 {v7.16b}, [x0], x1 // P7 ++ ushll v16.8h, v1.8b, #1 // 2*P5[0..7] ++ ushll v17.8h, v3.8b, #1 // 2*P1[0..7] ++ ld1 {v18.16b}, [x3] // P4 ++ uxtl v19.8h, v4.8b // P2[0..7] ++ ld1 {v20.16b}, [x0] // P8 ++ uxtl v21.8h, v5.8b // P6[0..7] ++ dup v22.8h, w2 // pq ++ ushll2 v3.8h, v3.16b, #1 // 2*P1[8..15] ++ mls v17.8h, v19.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7] ++ ushll2 v19.8h, v1.16b, #1 // 2*P5[8..15] ++ uxtl2 v4.8h, v4.16b // P2[8..15] ++ mls v16.8h, v21.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7] ++ uxtl2 v5.8h, v5.16b // P6[8..15] ++ uxtl v23.8h, v6.8b // P3[0..7] ++ uxtl v24.8h, v7.8b // P7[0..7] ++ mls v3.8h, v4.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15] ++ ushll v4.8h, v6.8b, #1 // 2*P3[0..7] ++ uxtl v25.8h, v18.8b // P4[0..7] ++ mls v19.8h, v5.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15] ++ uxtl2 v26.8h, v6.16b // P3[8..15] ++ mla v17.8h, v23.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7] ++ uxtl2 v7.8h, v7.16b // P7[8..15] ++ ushll2 v6.8h, v6.16b, #1 // 2*P3[8..15] ++ mla v16.8h, v24.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7] ++ uxtl2 v18.8h, v18.16b // P4[8..15] ++ uxtl v23.8h, v20.8b // P8[0..7] ++ mls v4.8h, v25.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7] ++ uxtl v24.8h, v1.8b // P5[0..7] ++ uxtl2 v20.8h, v20.16b // P8[8..15] ++ mla v3.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15] ++ uxtl2 v1.8h, v1.16b // P5[8..15] ++ sub v26.8h, v25.8h, v24.8h // P4[0..7]-P5[0..7] ++ mla v19.8h, v7.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15] ++ sub v7.8h, v18.8h, v1.8h // P4[8..15]-P5[8..15] ++ mls v6.8h, v18.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15] ++ abs v27.8h, v26.8h ++ sshr v26.8h, v26.8h, #8 // clip_sign[0..7] ++ mls v17.8h, v25.8h, v0.h[0] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7] ++ abs v28.8h, v7.8h ++ sshr v27.8h, v27.8h, #1 // clip[0..7] ++ mls v16.8h, v23.8h, v0.h[0] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7] ++ sshr v7.8h, v7.8h, #8 // clip_sign[8..15] ++ sshr v23.8h, v28.8h, #1 // 
clip[8..15] ++ mla v4.8h, v24.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7] ++ cmeq v28.8h, v27.8h, #0 // test clip[0..7] == 0 ++ srshr v17.8h, v17.8h, #3 ++ mls v3.8h, v18.8h, v0.h[0] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15] ++ cmeq v29.8h, v23.8h, #0 // test clip[8..15] == 0 ++ srshr v16.8h, v16.8h, #3 ++ mls v19.8h, v20.8h, v0.h[0] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15] ++ abs v17.8h, v17.8h // a1[0..7] ++ mla v6.8h, v1.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15] ++ srshr v3.8h, v3.8h, #3 ++ mls v4.8h, v21.8h, v0.h[0] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7] ++ abs v16.8h, v16.8h // a2[0..7] ++ srshr v19.8h, v19.8h, #3 ++ mls v6.8h, v5.8h, v0.h[0] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15] ++ cmhs v5.8h, v17.8h, v16.8h // test a1[0..7] >= a2[0..7] ++ abs v3.8h, v3.8h // a1[8..15] ++ srshr v4.8h, v4.8h, #3 ++ abs v19.8h, v19.8h // a2[8..15] ++ bsl v5.16b, v16.16b, v17.16b // a3[0..7] ++ srshr v6.8h, v6.8h, #3 ++ cmhs v16.8h, v3.8h, v19.8h // test a1[8..15] >= a2[8.15] ++ abs v17.8h, v4.8h // a0[0..7] ++ sshr v4.8h, v4.8h, #8 // a0_sign[0..7] ++ bsl v16.16b, v19.16b, v3.16b // a3[8..15] ++ uqsub v3.8h, v17.8h, v5.8h // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ abs v19.8h, v6.8h // a0[8..15] ++ cmhs v20.8h, v17.8h, v22.8h // test a0[0..7] >= pq ++ cmhs v5.8h, v5.8h, v17.8h // test a3[0..7] >= a0[0..7] ++ sub v4.8h, v26.8h, v4.8h // clip_sign[0..7] - a0_sign[0..7] ++ sshr v6.8h, v6.8h, #8 // a0_sign[8..15] ++ mul v3.8h, v3.8h, v0.h[1] // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0 ++ uqsub v17.8h, v19.8h, v16.8h // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ orr v20.16b, v28.16b, v20.16b // test clip[0..7] == 0 || a0[0..7] >= pq ++ cmhs v21.8h, v19.8h, v22.8h // test a0[8..15] >= pq ++ cmhs v16.8h, v16.8h, v19.8h // test a3[8..15] >= a0[8..15] ++ mul v0.8h, v17.8h, v0.h[1] // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0 ++ sub v6.8h, v7.8h, v6.8h // clip_sign[8..15] - a0_sign[8..15] ++ orr v5.16b, v20.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7] ++ ushr v3.8h, v3.8h, #3 // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0 ++ orr v7.16b, v29.16b, v21.16b // test clip[8..15] == 0 || a0[8..15] >= pq ++ cmtst v17.2d, v5.2d, v2.2d // if 2nd of each group of is not filtered, then none of the others in the group should be either ++ mov w0, v5.s[1] // move to gp reg ++ cmhs v19.8h, v3.8h, v27.8h ++ ushr v0.8h, v0.8h, #3 // a0[8..15] >= a3[8..15] ? 
(5*(a0[8..15]-a3[8..15]))>>3 : 0 ++ mov w2, v5.s[3] ++ orr v5.16b, v7.16b, v16.16b // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15] ++ orr v16.16b, v20.16b, v17.16b ++ bsl v19.16b, v27.16b, v3.16b // FFMIN(d[0..7], clip[0..7]) ++ cmtst v2.2d, v5.2d, v2.2d ++ cmhs v3.8h, v0.8h, v23.8h ++ mov w4, v5.s[1] ++ mov w5, v5.s[3] ++ and w0, w0, w2 ++ bic v5.16b, v19.16b, v16.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub) ++ orr v2.16b, v7.16b, v2.16b ++ bsl v3.16b, v23.16b, v0.16b // FFMIN(d[8..15], clip[8..15]) ++ mls v25.8h, v5.8h, v4.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7] ++ and w2, w4, w5 ++ bic v0.16b, v3.16b, v2.16b // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub) ++ mla v24.8h, v5.8h, v4.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7] ++ and w0, w0, w2 ++ mls v18.8h, v0.8h, v6.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15] ++ sqxtun v2.8b, v25.8h ++ tbnz w0, #0, 1f // none of the 16 pixel pairs should be updated in this case ++ mla v1.8h, v0.8h, v6.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15] ++ sqxtun v0.8b, v24.8h ++ sqxtun2 v2.16b, v18.8h ++ sqxtun2 v0.16b, v1.8h ++ st1 {v2.16b}, [x3], x1 ++ st1 {v0.16b}, [x3] ++1: ret ++endfunc ++ ++// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks ++// On entry: ++// x0 -> top-left pel of right block ++// x1 = row stride, bytes ++// w2 = PQUANT bitstream parameter ++function ff_vc1_h_loop_filter16_neon, export=1 ++ sub x3, x0, #4 // where to start reading ++ ldr d0, .Lcoeffs ++ ld1 {v1.8b}, [x3], x1 // P1[0], P2[0]... ++ sub x0, x0, #1 // where to start writing ++ ld1 {v2.8b}, [x3], x1 ++ add x4, x0, x1, lsl #3 ++ ld1 {v3.8b}, [x3], x1 ++ add x5, x0, x1, lsl #2 ++ ld1 {v4.8b}, [x3], x1 ++ add x6, x4, x1, lsl #2 ++ ld1 {v5.8b}, [x3], x1 ++ ld1 {v6.8b}, [x3], x1 ++ ld1 {v7.8b}, [x3], x1 ++ trn1 v16.8b, v1.8b, v2.8b // P1[0], P1[1], P3[0]... ++ ld1 {v17.8b}, [x3], x1 ++ trn2 v1.8b, v1.8b, v2.8b // P2[0], P2[1], P4[0]... ++ ld1 {v2.8b}, [x3], x1 ++ trn1 v18.8b, v3.8b, v4.8b // P1[2], P1[3], P3[2]... ++ ld1 {v19.8b}, [x3], x1 ++ trn2 v3.8b, v3.8b, v4.8b // P2[2], P2[3], P4[2]... ++ ld1 {v4.8b}, [x3], x1 ++ trn1 v20.8b, v5.8b, v6.8b // P1[4], P1[5], P3[4]... ++ ld1 {v21.8b}, [x3], x1 ++ trn2 v5.8b, v5.8b, v6.8b // P2[4], P2[5], P4[4]... ++ ld1 {v6.8b}, [x3], x1 ++ trn1 v22.8b, v7.8b, v17.8b // P1[6], P1[7], P3[6]... ++ ld1 {v23.8b}, [x3], x1 ++ trn2 v7.8b, v7.8b, v17.8b // P2[6], P2[7], P4[6]... ++ ld1 {v17.8b}, [x3], x1 ++ trn1 v24.8b, v2.8b, v19.8b // P1[8], P1[9], P3[8]... ++ ld1 {v25.8b}, [x3] ++ trn2 v2.8b, v2.8b, v19.8b // P2[8], P2[9], P4[8]... ++ trn1 v19.4h, v16.4h, v18.4h // P1[0], P1[1], P1[2], P1[3], P5[0]... ++ trn1 v26.8b, v4.8b, v21.8b // P1[10], P1[11], P3[10]... ++ trn2 v4.8b, v4.8b, v21.8b // P2[10], P2[11], P4[10]... ++ trn1 v21.4h, v1.4h, v3.4h // P2[0], P2[1], P2[2], P2[3], P6[0]... ++ trn1 v27.4h, v20.4h, v22.4h // P1[4], P1[5], P1[6], P1[7], P5[4]... ++ trn1 v28.8b, v6.8b, v23.8b // P1[12], P1[13], P3[12]... 
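In the 8- and 16-pair variants, the cmtst masks (0x0000ffff00000000) and tbnz tests used above implement the per-group rule noted in the comments: the filter-enable outcome at the third position (index 2) of each group of four gates the whole group, and the stores are skipped entirely when every group is gated off. Using the hypothetical vc1_edge_filter_sketch() from the earlier note, that control flow looks roughly like this (illustrative only):

    /* One decision per group of four edge positions: the outcome at index 2
     * determines whether the other three positions are processed at all. */
    static void vc1_loop_filter_sketch(int edges[][9], int n, int pq)
    {
        for (int g = 0; g < n; g += 4) {
            if (vc1_edge_filter_sketch(edges[g + 2], pq)) {
                vc1_edge_filter_sketch(edges[g + 0], pq);
                vc1_edge_filter_sketch(edges[g + 1], pq);
                vc1_edge_filter_sketch(edges[g + 3], pq);
            }
        }
    }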
++ trn2 v6.8b, v6.8b, v23.8b // P2[12], P2[13], P4[12]... ++ trn1 v23.4h, v5.4h, v7.4h // P2[4], P2[5], P2[6], P2[7], P6[4]... ++ trn1 v29.4h, v24.4h, v26.4h // P1[8], P1[9], P1[10], P1[11], P5[8]... ++ trn1 v30.8b, v17.8b, v25.8b // P1[14], P1[15], P3[14]... ++ trn2 v17.8b, v17.8b, v25.8b // P2[14], P2[15], P4[14]... ++ trn1 v25.4h, v2.4h, v4.4h // P2[8], P2[9], P2[10], P2[11], P6[8]... ++ trn1 v31.2s, v19.2s, v27.2s // P1[0..7] ++ trn2 v19.2s, v19.2s, v27.2s // P5[0..7] ++ trn1 v27.2s, v21.2s, v23.2s // P2[0..7] ++ trn2 v21.2s, v21.2s, v23.2s // P6[0..7] ++ trn1 v23.4h, v28.4h, v30.4h // P1[12], P1[13], P1[14], P1[15], P5[12]... ++ trn2 v16.4h, v16.4h, v18.4h // P3[0], P3[1], P3[2], P3[3], P7[0]... ++ trn1 v18.4h, v6.4h, v17.4h // P2[12], P2[13], P2[14], P2[15], P6[12]... ++ trn2 v20.4h, v20.4h, v22.4h // P3[4], P3[5], P3[6], P3[7], P7[4]... ++ trn2 v22.4h, v24.4h, v26.4h // P3[8], P3[9], P3[10], P3[11], P7[8]... ++ trn1 v24.2s, v29.2s, v23.2s // P1[8..15] ++ trn2 v23.2s, v29.2s, v23.2s // P5[8..15] ++ trn1 v26.2s, v25.2s, v18.2s // P2[8..15] ++ trn2 v18.2s, v25.2s, v18.2s // P6[8..15] ++ trn2 v25.4h, v28.4h, v30.4h // P3[12], P3[13], P3[14], P3[15], P7[12]... ++ trn2 v1.4h, v1.4h, v3.4h // P4[0], P4[1], P4[2], P4[3], P8[0]... ++ trn2 v3.4h, v5.4h, v7.4h // P4[4], P4[5], P4[6], P4[7], P8[4]... ++ trn2 v2.4h, v2.4h, v4.4h // P4[8], P4[9], P4[10], P4[11], P8[8]... ++ trn2 v4.4h, v6.4h, v17.4h // P4[12], P4[13], P4[14], P4[15], P8[12]... ++ ushll v5.8h, v31.8b, #1 // 2*P1[0..7] ++ ushll v6.8h, v19.8b, #1 // 2*P5[0..7] ++ trn1 v7.2s, v16.2s, v20.2s // P3[0..7] ++ uxtl v17.8h, v27.8b // P2[0..7] ++ trn2 v16.2s, v16.2s, v20.2s // P7[0..7] ++ uxtl v20.8h, v21.8b // P6[0..7] ++ trn1 v21.2s, v22.2s, v25.2s // P3[8..15] ++ ushll v24.8h, v24.8b, #1 // 2*P1[8..15] ++ trn2 v22.2s, v22.2s, v25.2s // P7[8..15] ++ ushll v25.8h, v23.8b, #1 // 2*P5[8..15] ++ trn1 v27.2s, v1.2s, v3.2s // P4[0..7] ++ uxtl v26.8h, v26.8b // P2[8..15] ++ mls v5.8h, v17.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7] ++ uxtl v17.8h, v18.8b // P6[8..15] ++ mls v6.8h, v20.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7] ++ trn1 v18.2s, v2.2s, v4.2s // P4[8..15] ++ uxtl v28.8h, v7.8b // P3[0..7] ++ mls v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15] ++ uxtl v16.8h, v16.8b // P7[0..7] ++ uxtl v26.8h, v21.8b // P3[8..15] ++ mls v25.8h, v17.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15] ++ uxtl v22.8h, v22.8b // P7[8..15] ++ ushll v7.8h, v7.8b, #1 // 2*P3[0..7] ++ uxtl v27.8h, v27.8b // P4[0..7] ++ trn2 v1.2s, v1.2s, v3.2s // P8[0..7] ++ ushll v3.8h, v21.8b, #1 // 2*P3[8..15] ++ trn2 v2.2s, v2.2s, v4.2s // P8[8..15] ++ uxtl v4.8h, v18.8b // P4[8..15] ++ mla v5.8h, v28.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7] ++ uxtl v1.8h, v1.8b // P8[0..7] ++ mla v6.8h, v16.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7] ++ uxtl v2.8h, v2.8b // P8[8..15] ++ uxtl v16.8h, v19.8b // P5[0..7] ++ mla v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15] ++ uxtl v18.8h, v23.8b // P5[8..15] ++ dup v19.8h, w2 // pq ++ mla v25.8h, v22.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15] ++ sub v21.8h, v27.8h, v16.8h // P4[0..7]-P5[0..7] ++ sub v22.8h, v4.8h, v18.8h // P4[8..15]-P5[8..15] ++ mls v7.8h, v27.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7] ++ abs v23.8h, v21.8h ++ mls v3.8h, v4.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15] ++ abs v26.8h, v22.8h ++ sshr v21.8h, v21.8h, #8 // clip_sign[0..7] ++ mls v5.8h, v27.8h, v0.h[0] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7] ++ sshr v23.8h, v23.8h, #1 // clip[0..7] ++ sshr v26.8h, v26.8h, #1 // clip[8..15] ++ mls v6.8h, v1.8h, v0.h[0] 
// 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7] ++ sshr v1.8h, v22.8h, #8 // clip_sign[8..15] ++ cmeq v22.8h, v23.8h, #0 // test clip[0..7] == 0 ++ mls v24.8h, v4.8h, v0.h[0] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15] ++ cmeq v28.8h, v26.8h, #0 // test clip[8..15] == 0 ++ srshr v5.8h, v5.8h, #3 ++ mls v25.8h, v2.8h, v0.h[0] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15] ++ srshr v2.8h, v6.8h, #3 ++ mla v7.8h, v16.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7] ++ srshr v6.8h, v24.8h, #3 ++ mla v3.8h, v18.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15] ++ abs v5.8h, v5.8h // a1[0..7] ++ srshr v24.8h, v25.8h, #3 ++ mls v3.8h, v17.8h, v0.h[0] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15] ++ abs v2.8h, v2.8h // a2[0..7] ++ abs v6.8h, v6.8h // a1[8..15] ++ mls v7.8h, v20.8h, v0.h[0] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7] ++ abs v17.8h, v24.8h // a2[8..15] ++ cmhs v20.8h, v5.8h, v2.8h // test a1[0..7] >= a2[0..7] ++ srshr v3.8h, v3.8h, #3 ++ cmhs v24.8h, v6.8h, v17.8h // test a1[8..15] >= a2[8.15] ++ srshr v7.8h, v7.8h, #3 ++ bsl v20.16b, v2.16b, v5.16b // a3[0..7] ++ abs v2.8h, v3.8h // a0[8..15] ++ sshr v3.8h, v3.8h, #8 // a0_sign[8..15] ++ bsl v24.16b, v17.16b, v6.16b // a3[8..15] ++ abs v5.8h, v7.8h // a0[0..7] ++ sshr v6.8h, v7.8h, #8 // a0_sign[0..7] ++ cmhs v7.8h, v2.8h, v19.8h // test a0[8..15] >= pq ++ sub v1.8h, v1.8h, v3.8h // clip_sign[8..15] - a0_sign[8..15] ++ uqsub v3.8h, v2.8h, v24.8h // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ cmhs v2.8h, v24.8h, v2.8h // test a3[8..15] >= a0[8..15] ++ uqsub v17.8h, v5.8h, v20.8h // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ cmhs v19.8h, v5.8h, v19.8h // test a0[0..7] >= pq ++ orr v7.16b, v28.16b, v7.16b // test clip[8..15] == 0 || a0[8..15] >= pq ++ sub v6.8h, v21.8h, v6.8h // clip_sign[0..7] - a0_sign[0..7] ++ mul v3.8h, v3.8h, v0.h[1] // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0 ++ cmhs v5.8h, v20.8h, v5.8h // test a3[0..7] >= a0[0..7] ++ orr v19.16b, v22.16b, v19.16b // test clip[0..7] == 0 || a0[0..7] >= pq ++ mul v0.8h, v17.8h, v0.h[1] // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0 ++ orr v2.16b, v7.16b, v2.16b // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15] ++ orr v5.16b, v19.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7] ++ ushr v3.8h, v3.8h, #3 // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0 ++ mov w7, v2.s[1] ++ mov w8, v2.s[3] ++ ushr v0.8h, v0.8h, #3 // a0[0..7] >= a3[0..7] ? 
(5*(a0[0..7]-a3[0..7]))>>3 : 0 ++ mov w2, v5.s[1] // move to gp reg ++ cmhs v2.8h, v3.8h, v26.8h ++ mov w3, v5.s[3] ++ cmhs v5.8h, v0.8h, v23.8h ++ bsl v2.16b, v26.16b, v3.16b // FFMIN(d[8..15], clip[8..15]) ++ and w9, w7, w8 ++ bsl v5.16b, v23.16b, v0.16b // FFMIN(d[0..7], clip[0..7]) ++ and w10, w2, w3 ++ bic v0.16b, v2.16b, v7.16b // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub) ++ and w9, w10, w9 ++ bic v2.16b, v5.16b, v19.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub) ++ mls v4.8h, v0.8h, v1.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4 ++ tbnz w9, #0, 4f // none of the 16 pixel pairs should be updated in this case ++ mls v27.8h, v2.8h, v6.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4 ++ mla v16.8h, v2.8h, v6.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5 ++ sqxtun v2.8b, v4.8h ++ mla v18.8h, v0.8h, v1.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5 ++ sqxtun v0.8b, v27.8h ++ sqxtun v1.8b, v16.8h ++ sqxtun v3.8b, v18.8h ++ tbnz w2, #0, 1f ++ st2 {v0.b, v1.b}[0], [x0], x1 ++ st2 {v0.b, v1.b}[1], [x0], x1 ++ st2 {v0.b, v1.b}[2], [x0], x1 ++ st2 {v0.b, v1.b}[3], [x0] ++1: tbnz w3, #0, 2f ++ st2 {v0.b, v1.b}[4], [x5], x1 ++ st2 {v0.b, v1.b}[5], [x5], x1 ++ st2 {v0.b, v1.b}[6], [x5], x1 ++ st2 {v0.b, v1.b}[7], [x5] ++2: tbnz w7, #0, 3f ++ st2 {v2.b, v3.b}[0], [x4], x1 ++ st2 {v2.b, v3.b}[1], [x4], x1 ++ st2 {v2.b, v3.b}[2], [x4], x1 ++ st2 {v2.b, v3.b}[3], [x4] ++3: tbnz w8, #0, 4f ++ st2 {v2.b, v3.b}[4], [x6], x1 ++ st2 {v2.b, v3.b}[5], [x6], x1 ++ st2 {v2.b, v3.b}[6], [x6], x1 ++ st2 {v2.b, v3.b}[7], [x6] ++4: ret ++endfunc ++ ++// Copy at most the specified number of bytes from source to destination buffer, ++// stopping at a multiple of 32 bytes, none of which are the start of an escape sequence ++// On entry: ++// x0 -> source buffer ++// w1 = max number of bytes to copy ++// x2 -> destination buffer, optimally 8-byte aligned ++// On exit: ++// w0 = number of bytes not copied ++function ff_vc1_unescape_buffer_helper_neon, export=1 ++ // Offset by 80 to screen out cases that are too short for us to handle, ++ // and also make it easy to test for loop termination, or to determine ++ // whether we need an odd number of half-iterations of the loop. 
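The escape test this helper vectorises is the same one the C wrapper later in this patch performs with AV_RL32: an escape sequence begins with the bytes 0x00 0x00 0x03 followed by a byte no greater than 0x03, and unescaping drops the 0x03 stuffing byte. A scalar form of the per-offset test is shown below for reference (illustrative only; the body that follows evaluates it at all 32 byte offsets of each iteration using the bic/eor/cmeq constant masks).

    #include <stdint.h>

    /* Nonzero if p points at the start of an escape sequence: 00 00 03 0x,
     * where the fourth byte is in the range 0x00..0x03. */
    static int is_escape_start(const uint8_t *p)
    {
        uint32_t v = (uint32_t)p[0]         | ((uint32_t)p[1] << 8) |
                     ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
        return (v & ~0x03000000u) == 0x00030000u;
    }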
++ subs w1, w1, #80 ++ b.mi 90f ++ ++ // Set up useful constants ++ movi v20.4s, #3, lsl #24 ++ movi v21.4s, #3, lsl #16 ++ ++ tst w1, #32 ++ b.ne 1f ++ ++ ld1 {v0.16b, v1.16b, v2.16b}, [x0], #48 ++ ext v25.16b, v0.16b, v1.16b, #1 ++ ext v26.16b, v0.16b, v1.16b, #2 ++ ext v27.16b, v0.16b, v1.16b, #3 ++ ext v29.16b, v1.16b, v2.16b, #1 ++ ext v30.16b, v1.16b, v2.16b, #2 ++ ext v31.16b, v1.16b, v2.16b, #3 ++ bic v24.16b, v0.16b, v20.16b ++ bic v25.16b, v25.16b, v20.16b ++ bic v26.16b, v26.16b, v20.16b ++ bic v27.16b, v27.16b, v20.16b ++ bic v28.16b, v1.16b, v20.16b ++ bic v29.16b, v29.16b, v20.16b ++ bic v30.16b, v30.16b, v20.16b ++ bic v31.16b, v31.16b, v20.16b ++ eor v24.16b, v24.16b, v21.16b ++ eor v25.16b, v25.16b, v21.16b ++ eor v26.16b, v26.16b, v21.16b ++ eor v27.16b, v27.16b, v21.16b ++ eor v28.16b, v28.16b, v21.16b ++ eor v29.16b, v29.16b, v21.16b ++ eor v30.16b, v30.16b, v21.16b ++ eor v31.16b, v31.16b, v21.16b ++ cmeq v24.4s, v24.4s, #0 ++ cmeq v25.4s, v25.4s, #0 ++ cmeq v26.4s, v26.4s, #0 ++ cmeq v27.4s, v27.4s, #0 ++ add w1, w1, #32 ++ b 3f ++ ++1: ld1 {v3.16b, v4.16b, v5.16b}, [x0], #48 ++ ext v25.16b, v3.16b, v4.16b, #1 ++ ext v26.16b, v3.16b, v4.16b, #2 ++ ext v27.16b, v3.16b, v4.16b, #3 ++ ext v29.16b, v4.16b, v5.16b, #1 ++ ext v30.16b, v4.16b, v5.16b, #2 ++ ext v31.16b, v4.16b, v5.16b, #3 ++ bic v24.16b, v3.16b, v20.16b ++ bic v25.16b, v25.16b, v20.16b ++ bic v26.16b, v26.16b, v20.16b ++ bic v27.16b, v27.16b, v20.16b ++ bic v28.16b, v4.16b, v20.16b ++ bic v29.16b, v29.16b, v20.16b ++ bic v30.16b, v30.16b, v20.16b ++ bic v31.16b, v31.16b, v20.16b ++ eor v24.16b, v24.16b, v21.16b ++ eor v25.16b, v25.16b, v21.16b ++ eor v26.16b, v26.16b, v21.16b ++ eor v27.16b, v27.16b, v21.16b ++ eor v28.16b, v28.16b, v21.16b ++ eor v29.16b, v29.16b, v21.16b ++ eor v30.16b, v30.16b, v21.16b ++ eor v31.16b, v31.16b, v21.16b ++ cmeq v24.4s, v24.4s, #0 ++ cmeq v25.4s, v25.4s, #0 ++ cmeq v26.4s, v26.4s, #0 ++ cmeq v27.4s, v27.4s, #0 ++ // Drop through... 
++2: mov v0.16b, v5.16b ++ ld1 {v1.16b, v2.16b}, [x0], #32 ++ cmeq v28.4s, v28.4s, #0 ++ cmeq v29.4s, v29.4s, #0 ++ cmeq v30.4s, v30.4s, #0 ++ cmeq v31.4s, v31.4s, #0 ++ orr v24.16b, v24.16b, v25.16b ++ orr v26.16b, v26.16b, v27.16b ++ orr v28.16b, v28.16b, v29.16b ++ orr v30.16b, v30.16b, v31.16b ++ ext v25.16b, v0.16b, v1.16b, #1 ++ orr v22.16b, v24.16b, v26.16b ++ ext v26.16b, v0.16b, v1.16b, #2 ++ ext v27.16b, v0.16b, v1.16b, #3 ++ ext v29.16b, v1.16b, v2.16b, #1 ++ orr v23.16b, v28.16b, v30.16b ++ ext v30.16b, v1.16b, v2.16b, #2 ++ ext v31.16b, v1.16b, v2.16b, #3 ++ bic v24.16b, v0.16b, v20.16b ++ bic v25.16b, v25.16b, v20.16b ++ bic v26.16b, v26.16b, v20.16b ++ orr v22.16b, v22.16b, v23.16b ++ bic v27.16b, v27.16b, v20.16b ++ bic v28.16b, v1.16b, v20.16b ++ bic v29.16b, v29.16b, v20.16b ++ bic v30.16b, v30.16b, v20.16b ++ bic v31.16b, v31.16b, v20.16b ++ addv s22, v22.4s ++ eor v24.16b, v24.16b, v21.16b ++ eor v25.16b, v25.16b, v21.16b ++ eor v26.16b, v26.16b, v21.16b ++ eor v27.16b, v27.16b, v21.16b ++ eor v28.16b, v28.16b, v21.16b ++ mov w3, v22.s[0] ++ eor v29.16b, v29.16b, v21.16b ++ eor v30.16b, v30.16b, v21.16b ++ eor v31.16b, v31.16b, v21.16b ++ cmeq v24.4s, v24.4s, #0 ++ cmeq v25.4s, v25.4s, #0 ++ cmeq v26.4s, v26.4s, #0 ++ cmeq v27.4s, v27.4s, #0 ++ cbnz w3, 90f ++ st1 {v3.16b, v4.16b}, [x2], #32 ++3: mov v3.16b, v2.16b ++ ld1 {v4.16b, v5.16b}, [x0], #32 ++ cmeq v28.4s, v28.4s, #0 ++ cmeq v29.4s, v29.4s, #0 ++ cmeq v30.4s, v30.4s, #0 ++ cmeq v31.4s, v31.4s, #0 ++ orr v24.16b, v24.16b, v25.16b ++ orr v26.16b, v26.16b, v27.16b ++ orr v28.16b, v28.16b, v29.16b ++ orr v30.16b, v30.16b, v31.16b ++ ext v25.16b, v3.16b, v4.16b, #1 ++ orr v22.16b, v24.16b, v26.16b ++ ext v26.16b, v3.16b, v4.16b, #2 ++ ext v27.16b, v3.16b, v4.16b, #3 ++ ext v29.16b, v4.16b, v5.16b, #1 ++ orr v23.16b, v28.16b, v30.16b ++ ext v30.16b, v4.16b, v5.16b, #2 ++ ext v31.16b, v4.16b, v5.16b, #3 ++ bic v24.16b, v3.16b, v20.16b ++ bic v25.16b, v25.16b, v20.16b ++ bic v26.16b, v26.16b, v20.16b ++ orr v22.16b, v22.16b, v23.16b ++ bic v27.16b, v27.16b, v20.16b ++ bic v28.16b, v4.16b, v20.16b ++ bic v29.16b, v29.16b, v20.16b ++ bic v30.16b, v30.16b, v20.16b ++ bic v31.16b, v31.16b, v20.16b ++ addv s22, v22.4s ++ eor v24.16b, v24.16b, v21.16b ++ eor v25.16b, v25.16b, v21.16b ++ eor v26.16b, v26.16b, v21.16b ++ eor v27.16b, v27.16b, v21.16b ++ eor v28.16b, v28.16b, v21.16b ++ mov w3, v22.s[0] ++ eor v29.16b, v29.16b, v21.16b ++ eor v30.16b, v30.16b, v21.16b ++ eor v31.16b, v31.16b, v21.16b ++ cmeq v24.4s, v24.4s, #0 ++ cmeq v25.4s, v25.4s, #0 ++ cmeq v26.4s, v26.4s, #0 ++ cmeq v27.4s, v27.4s, #0 ++ cbnz w3, 91f ++ st1 {v0.16b, v1.16b}, [x2], #32 ++ subs w1, w1, #64 ++ b.pl 2b ++ ++90: add w0, w1, #80 ++ ret ++ ++91: sub w1, w1, #32 ++ b 90b ++endfunc --- a/libavcodec/allcodecs.c +++ b/libavcodec/allcodecs.c @@ -149,6 +149,7 @@ extern AVCodec ff_hap_decoder; @@ -448,7 +2329,7 @@ Upstream-status: Pending @@ -890,6 +891,41 @@ static enum AVCodecID remap_deprecated_c } } - + +static int codec_supports_format(const AVCodec * const p, const enum AVPixelFormat fmt) +{ + const enum AVPixelFormat *pf = p->pix_fmts; @@ -528,7 +2409,7 @@ Upstream-status: Pending @@ -26,83 +26,209 @@ #include "libavutil/internal.h" #include "libavcodec/cabac.h" - + + #define get_cabac_inline get_cabac_inline_arm static av_always_inline int get_cabac_inline_arm(CABACContext *c, @@ -622,7 +2503,7 @@ Upstream-status: Pending + ); + return bit; +} - + - __asm__ volatile( - "ldrb %[bit] , [%[state]] \n\t" - "add %[r_b] , %[tables] , %[lps_off] \n\t" 
@@ -719,7 +2600,7 @@ Upstream-status: Pending +#endif + "lsls %[range] , %[low], #16 \n\t" + "bne 1f \n\t" - + - return bit & 1; + "str %[ptr] , [%[c], %[ptr_off]] \n\t" + "rev %[tmp] , %[tmp] \n\t" @@ -803,7 +2684,7 @@ Upstream-status: Pending +} + #endif /* HAVE_ARMV6T2_INLINE */ - + #endif /* AVCODEC_ARM_CABAC_H */ --- /dev/null +++ b/libavcodec/arm/rpi_hevc_cabac.h @@ -15211,6 +17092,883 @@ Upstream-status: Pending + bx lr + +endfunc +--- a/libavcodec/arm/vc1dsp_init_neon.c ++++ b/libavcodec/arm/vc1dsp_init_neon.c +@@ -19,6 +19,7 @@ + #include + + #include "libavutil/attributes.h" ++#include "libavutil/intreadwrite.h" + #include "libavcodec/vc1dsp.h" + #include "vc1dsp.h" + +@@ -32,6 +33,13 @@ void ff_vc1_inv_trans_4x8_dc_neon(uint8_ + void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); + void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); + ++void ff_vc1_v_loop_filter4_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_h_loop_filter4_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_v_loop_filter8_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_h_loop_filter8_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_v_loop_filter16_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_h_loop_filter16_neon(uint8_t *src, int stride, int pq); ++ + void ff_put_pixels8x8_neon(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int rnd); + +@@ -77,6 +85,64 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t + void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, + int h, int x, int y); + ++int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst); ++ ++static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst) ++{ ++ /* Dealing with starting and stopping, and removing escape bytes, are ++ * comparatively less time-sensitive, so are more clearly expressed using ++ * a C wrapper around the assembly inner loop. Note that we assume a ++ * little-endian machine that supports unaligned loads. 
*/ ++ int dsize = 0; ++ while (size >= 4) ++ { ++ int found = 0; ++ while (!found && (((uintptr_t) dst) & 7) && size >= 4) ++ { ++ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; ++ if (!found) ++ { ++ *dst++ = *src++; ++ --size; ++ ++dsize; ++ } ++ } ++ if (!found) ++ { ++ int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst); ++ dst += skip; ++ src += skip; ++ size -= skip; ++ dsize += skip; ++ while (!found && size >= 4) ++ { ++ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; ++ if (!found) ++ { ++ *dst++ = *src++; ++ --size; ++ ++dsize; ++ } ++ } ++ } ++ if (found) ++ { ++ *dst++ = *src++; ++ *dst++ = *src++; ++ ++src; ++ size -= 3; ++ dsize += 2; ++ } ++ } ++ while (size > 0) ++ { ++ *dst++ = *src++; ++ --size; ++ ++dsize; ++ } ++ return dsize; ++} ++ + #define FN_ASSIGN(X, Y) \ + dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_16_neon; \ + dsp->put_vc1_mspel_pixels_tab[1][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_neon +@@ -92,6 +158,13 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPC + dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon; + dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon; + ++ dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_neon; ++ dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_neon; ++ dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_neon; ++ dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_neon; ++ dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon; ++ dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon; ++ + dsp->put_vc1_mspel_pixels_tab[1][ 0] = ff_put_pixels8x8_neon; + FN_ASSIGN(1, 0); + FN_ASSIGN(2, 0); +@@ -116,4 +189,6 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPC + dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon; + dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon; + dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon; ++ ++ dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon; + } +--- a/libavcodec/arm/vc1dsp_neon.S ++++ b/libavcodec/arm/vc1dsp_neon.S +@@ -1161,3 +1161,764 @@ function ff_vc1_inv_trans_4x4_dc_neon, e + vst1.32 {d1[1]}, [r0,:32] + bx lr + endfunc ++ ++@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks ++@ On entry: ++@ r0 -> top-left pel of lower block ++@ r1 = row stride, bytes ++@ r2 = PQUANT bitstream parameter ++function ff_vc1_v_loop_filter4_neon, export=1 ++ sub r3, r0, r1, lsl #2 ++ vldr d0, .Lcoeffs ++ vld1.32 {d1[0]}, [r0], r1 @ P5 ++ vld1.32 {d2[0]}, [r3], r1 @ P1 ++ vld1.32 {d3[0]}, [r3], r1 @ P2 ++ vld1.32 {d4[0]}, [r0], r1 @ P6 ++ vld1.32 {d5[0]}, [r3], r1 @ P3 ++ vld1.32 {d6[0]}, [r0], r1 @ P7 ++ vld1.32 {d7[0]}, [r3] @ P4 ++ vld1.32 {d16[0]}, [r0] @ P8 ++ vshll.u8 q9, d1, #1 @ 2*P5 ++ vdup.16 d17, r2 @ pq ++ vshll.u8 q10, d2, #1 @ 2*P1 ++ vmovl.u8 q11, d3 @ P2 ++ vmovl.u8 q1, d4 @ P6 ++ vmovl.u8 q12, d5 @ P3 ++ vmls.i16 d20, d22, d0[1] @ 2*P1-5*P2 ++ vmovl.u8 q11, d6 @ P7 ++ vmls.i16 d18, d2, d0[1] @ 2*P5-5*P6 ++ vshll.u8 q2, d5, #1 @ 2*P3 ++ vmovl.u8 q3, d7 @ P4 ++ vmla.i16 d18, d22, d0[1] @ 2*P5-5*P6+5*P7 ++ vmovl.u8 q11, d16 @ P8 ++ vmla.u16 d20, d24, d0[1] @ 2*P1-5*P2+5*P3 ++ vmovl.u8 q12, d1 @ P5 ++ vmls.u16 d4, d6, d0[1] @ 2*P3-5*P4 ++ vmls.u16 d18, d22, d0[0] @ 2*P5-5*P6+5*P7-2*P8 ++ vsub.i16 d1, d6, d24 @ P4-P5 ++ vmls.i16 d20, d6, d0[0] @ 2*P1-5*P2+5*P3-2*P4 ++ vmla.i16 d4, d24, d0[1] @ 2*P3-5*P4+5*P5 ++ vmls.i16 d4, d2, d0[0] @ 2*P3-5*P4+5*P5-2*P6 ++ vabs.s16 d2, d1 ++ vrshr.s16 d3, d18, #3 ++ vrshr.s16 d5, d20, #3 ++ vshr.s16 d2, d2, #1 @ clip ++ vrshr.s16 d4, 
d4, #3 ++ vabs.s16 d3, d3 @ a2 ++ vshr.s16 d1, d1, #8 @ clip_sign ++ vabs.s16 d5, d5 @ a1 ++ vceq.i16 d7, d2, #0 @ test clip == 0 ++ vabs.s16 d16, d4 @ a0 ++ vshr.s16 d4, d4, #8 @ a0_sign ++ vcge.s16 d18, d5, d3 @ test a1 >= a2 ++ vcge.s16 d17, d16, d17 @ test a0 >= pq ++ vbsl d18, d3, d5 @ a3 ++ vsub.i16 d1, d1, d4 @ clip_sign - a0_sign ++ vorr d3, d7, d17 @ test clip == 0 || a0 >= pq ++ vqsub.u16 d4, d16, d18 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 d5, d18, d16 @ test a3 >= a0 ++ vmul.i16 d0, d4, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0 ++ vorr d4, d3, d5 @ test clip == 0 || a0 >= pq || a3 >= a0 ++ vmov.32 r0, d4[1] @ move to gp reg ++ vshr.u16 d0, d0, #3 @ a0 >= a3 ? (5*(a0-a3))>>3 : 0 ++ vcge.s16 d4, d0, d2 ++ tst r0, #1 ++ bne 1f @ none of the 4 pixel pairs should be updated if this one is not filtered ++ vbsl d4, d2, d0 @ FFMIN(d, clip) ++ vbic d0, d4, d3 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) ++ vmls.i16 d6, d0, d1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ vmla.i16 d24, d0, d1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ vqmovun.s16 d0, q3 ++ vqmovun.s16 d1, q12 ++ vst1.32 {d0[0]}, [r3], r1 ++ vst1.32 {d1[0]}, [r3] ++1: bx lr ++endfunc ++ ++@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks ++@ On entry: ++@ r0 -> top-left pel of right block ++@ r1 = row stride, bytes ++@ r2 = PQUANT bitstream parameter ++function ff_vc1_h_loop_filter4_neon, export=1 ++ sub r3, r0, #4 @ where to start reading ++ vldr d0, .Lcoeffs ++ vld1.32 {d2}, [r3], r1 ++ sub r0, r0, #1 @ where to start writing ++ vld1.32 {d4}, [r3], r1 ++ vld1.32 {d3}, [r3], r1 ++ vld1.32 {d5}, [r3] ++ vdup.16 d1, r2 @ pq ++ vtrn.8 q1, q2 ++ vtrn.16 d2, d3 @ P1, P5, P3, P7 ++ vtrn.16 d4, d5 @ P2, P6, P4, P8 ++ vshll.u8 q3, d2, #1 @ 2*P1, 2*P5 ++ vmovl.u8 q8, d4 @ P2, P6 ++ vmovl.u8 q9, d3 @ P3, P7 ++ vmovl.u8 q2, d5 @ P4, P8 ++ vmls.i16 q3, q8, d0[1] @ 2*P1-5*P2, 2*P5-5*P6 ++ vshll.u8 q10, d3, #1 @ 2*P3, 2*P7 ++ vmovl.u8 q1, d2 @ P1, P5 ++ vmla.i16 q3, q9, d0[1] @ 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7 ++ vmls.i16 q3, q2, d0[0] @ 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8 ++ vmov d2, d3 @ needs to be in an even-numbered vector for when we come to narrow it later ++ vmls.i16 d20, d4, d0[1] @ 2*P3-5*P4 ++ vmla.i16 d20, d3, d0[1] @ 2*P3-5*P4+5*P5 ++ vsub.i16 d3, d4, d2 @ P4-P5 ++ vmls.i16 d20, d17, d0[0] @ 2*P3-5*P4+5*P5-2*P6 ++ vrshr.s16 q3, q3, #3 ++ vabs.s16 d5, d3 ++ vshr.s16 d3, d3, #8 @ clip_sign ++ vrshr.s16 d16, d20, #3 ++ vabs.s16 q3, q3 @ a1, a2 ++ vshr.s16 d5, d5, #1 @ clip ++ vabs.s16 d17, d16 @ a0 ++ vceq.i16 d18, d5, #0 @ test clip == 0 ++ vshr.s16 d16, d16, #8 @ a0_sign ++ vcge.s16 d19, d6, d7 @ test a1 >= a2 ++ vcge.s16 d1, d17, d1 @ test a0 >= pq ++ vsub.i16 d16, d3, d16 @ clip_sign - a0_sign ++ vbsl d19, d7, d6 @ a3 ++ vorr d1, d18, d1 @ test clip == 0 || a0 >= pq ++ vqsub.u16 d3, d17, d19 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 d6, d19, d17 @ test a3 >= a0 @ ++ vmul.i16 d0, d3, d0[1] @ a0 >= a3 ? 
5*(a0-a3) : 0 ++ vorr d3, d1, d6 @ test clip == 0 || a0 >= pq || a3 >= a0 ++ vmov.32 r2, d3[1] @ move to gp reg ++ vshr.u16 d0, d0, #3 @ a0 >= a3 ? (5*(a0-a3))>>3 : 0 ++ vcge.s16 d3, d0, d5 ++ tst r2, #1 ++ bne 1f @ none of the 4 pixel pairs should be updated if this one is not filtered ++ vbsl d3, d5, d0 @ FFMIN(d, clip) ++ vbic d0, d3, d1 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) ++ vmla.i16 d2, d0, d16 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ vmls.i16 d4, d0, d16 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ vqmovun.s16 d1, q1 ++ vqmovun.s16 d0, q2 ++ vst2.8 {d0[0], d1[0]}, [r0], r1 ++ vst2.8 {d0[1], d1[1]}, [r0], r1 ++ vst2.8 {d0[2], d1[2]}, [r0], r1 ++ vst2.8 {d0[3], d1[3]}, [r0] ++1: bx lr ++endfunc ++ ++@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks ++@ On entry: ++@ r0 -> top-left pel of lower block ++@ r1 = row stride, bytes ++@ r2 = PQUANT bitstream parameter ++function ff_vc1_v_loop_filter8_neon, export=1 ++ sub r3, r0, r1, lsl #2 ++ vldr d0, .Lcoeffs ++ vld1.32 {d1}, [r0 :64], r1 @ P5 ++ vld1.32 {d2}, [r3 :64], r1 @ P1 ++ vld1.32 {d3}, [r3 :64], r1 @ P2 ++ vld1.32 {d4}, [r0 :64], r1 @ P6 ++ vld1.32 {d5}, [r3 :64], r1 @ P3 ++ vld1.32 {d6}, [r0 :64], r1 @ P7 ++ vshll.u8 q8, d1, #1 @ 2*P5 ++ vshll.u8 q9, d2, #1 @ 2*P1 ++ vld1.32 {d7}, [r3 :64] @ P4 ++ vmovl.u8 q1, d3 @ P2 ++ vld1.32 {d20}, [r0 :64] @ P8 ++ vmovl.u8 q11, d4 @ P6 ++ vdup.16 q12, r2 @ pq ++ vmovl.u8 q13, d5 @ P3 ++ vmls.i16 q9, q1, d0[1] @ 2*P1-5*P2 ++ vmovl.u8 q1, d6 @ P7 ++ vshll.u8 q2, d5, #1 @ 2*P3 ++ vmls.i16 q8, q11, d0[1] @ 2*P5-5*P6 ++ vmovl.u8 q3, d7 @ P4 ++ vmovl.u8 q10, d20 @ P8 ++ vmla.i16 q8, q1, d0[1] @ 2*P5-5*P6+5*P7 ++ vmovl.u8 q1, d1 @ P5 ++ vmla.i16 q9, q13, d0[1] @ 2*P1-5*P2+5*P3 ++ vsub.i16 q13, q3, q1 @ P4-P5 ++ vmls.i16 q2, q3, d0[1] @ 2*P3-5*P4 ++ vmls.i16 q8, q10, d0[0] @ 2*P5-5*P6+5*P7-2*P8 ++ vabs.s16 q10, q13 ++ vshr.s16 q13, q13, #8 @ clip_sign ++ vmls.i16 q9, q3, d0[0] @ 2*P1-5*P2+5*P3-2*P4 ++ vshr.s16 q10, q10, #1 @ clip ++ vmla.i16 q2, q1, d0[1] @ 2*P3-5*P4+5*P5 ++ vrshr.s16 q8, q8, #3 ++ vmls.i16 q2, q11, d0[0] @ 2*P3-5*P4+5*P5-2*P6 ++ vceq.i16 q11, q10, #0 @ test clip == 0 ++ vrshr.s16 q9, q9, #3 ++ vabs.s16 q8, q8 @ a2 ++ vabs.s16 q9, q9 @ a1 ++ vrshr.s16 q2, q2, #3 ++ vcge.s16 q14, q9, q8 @ test a1 >= a2 ++ vabs.s16 q15, q2 @ a0 ++ vshr.s16 q2, q2, #8 @ a0_sign ++ vbsl q14, q8, q9 @ a3 ++ vcge.s16 q8, q15, q12 @ test a0 >= pq ++ vsub.i16 q2, q13, q2 @ clip_sign - a0_sign ++ vqsub.u16 q9, q15, q14 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 q12, q14, q15 @ test a3 >= a0 ++ vorr q8, q11, q8 @ test clip == 0 || a0 >= pq ++ vmul.i16 q0, q9, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0 ++ vorr q9, q8, q12 @ test clip == 0 || a0 >= pq || a3 >= a0 ++ vshl.i64 q11, q9, #16 ++ vmov.32 r0, d18[1] @ move to gp reg ++ vshr.u16 q0, q0, #3 @ a0 >= a3 ? 
(5*(a0-a3))>>3 : 0 ++ vmov.32 r2, d19[1] ++ vshr.s64 q9, q11, #48 ++ vcge.s16 q11, q0, q10 ++ vorr q8, q8, q9 ++ and r0, r0, r2 ++ vbsl q11, q10, q0 @ FFMIN(d, clip) ++ tst r0, #1 ++ bne 1f @ none of the 8 pixel pairs should be updated in this case ++ vbic q0, q11, q8 @ set each d to zero if it should not be filtered ++ vmls.i16 q3, q0, q2 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ vmla.i16 q1, q0, q2 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ vqmovun.s16 d0, q3 ++ vqmovun.s16 d1, q1 ++ vst1.32 {d0}, [r3 :64], r1 ++ vst1.32 {d1}, [r3 :64] ++1: bx lr ++endfunc ++ ++.align 5 ++.Lcoeffs: ++.quad 0x00050002 ++ ++@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks ++@ On entry: ++@ r0 -> top-left pel of right block ++@ r1 = row stride, bytes ++@ r2 = PQUANT bitstream parameter ++function ff_vc1_h_loop_filter8_neon, export=1 ++ push {lr} ++ sub r3, r0, #4 @ where to start reading ++ vldr d0, .Lcoeffs ++ vld1.32 {d2}, [r3], r1 @ P1[0], P2[0]... ++ sub r0, r0, #1 @ where to start writing ++ vld1.32 {d4}, [r3], r1 ++ add r12, r0, r1, lsl #2 ++ vld1.32 {d3}, [r3], r1 ++ vld1.32 {d5}, [r3], r1 ++ vld1.32 {d6}, [r3], r1 ++ vld1.32 {d16}, [r3], r1 ++ vld1.32 {d7}, [r3], r1 ++ vld1.32 {d17}, [r3] ++ vtrn.8 q1, q2 @ P1[0], P1[1], P3[0]... P1[2], P1[3], P3[2]... P2[0], P2[1], P4[0]... P2[2], P2[3], P4[2]... ++ vdup.16 q9, r2 @ pq ++ vtrn.16 d2, d3 @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]... ++ vtrn.16 d4, d5 @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]... ++ vtrn.8 q3, q8 @ P1[4], P1[5], P3[4]... P1[6], P1[7], P3[6]... P2[4], P2[5], P4[4]... P2[6], P2[7], P4[6]... ++ vtrn.16 d6, d7 @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[5], P3[7], P7[4]... ++ vtrn.16 d16, d17 @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]... ++ vtrn.32 d2, d6 @ P1, P5 ++ vtrn.32 d4, d16 @ P2, P6 ++ vtrn.32 d3, d7 @ P3, P7 ++ vtrn.32 d5, d17 @ P4, P8 ++ vshll.u8 q10, d2, #1 @ 2*P1 ++ vshll.u8 q11, d6, #1 @ 2*P5 ++ vmovl.u8 q12, d4 @ P2 ++ vmovl.u8 q13, d16 @ P6 ++ vmovl.u8 q14, d3 @ P3 ++ vmls.i16 q10, q12, d0[1] @ 2*P1-5*P2 ++ vmovl.u8 q12, d7 @ P7 ++ vshll.u8 q1, d3, #1 @ 2*P3 ++ vmls.i16 q11, q13, d0[1] @ 2*P5-5*P6 ++ vmovl.u8 q2, d5 @ P4 ++ vmovl.u8 q8, d17 @ P8 ++ vmla.i16 q11, q12, d0[1] @ 2*P5-5*P6+5*P7 ++ vmovl.u8 q3, d6 @ P5 ++ vmla.i16 q10, q14, d0[1] @ 2*P1-5*P2+5*P3 ++ vsub.i16 q12, q2, q3 @ P4-P5 ++ vmls.i16 q1, q2, d0[1] @ 2*P3-5*P4 ++ vmls.i16 q11, q8, d0[0] @ 2*P5-5*P6+5*P7-2*P8 ++ vabs.s16 q8, q12 ++ vshr.s16 q12, q12, #8 @ clip_sign ++ vmls.i16 q10, q2, d0[0] @ 2*P1-5*P2+5*P3-2*P4 ++ vshr.s16 q8, q8, #1 @ clip ++ vmla.i16 q1, q3, d0[1] @ 2*P3-5*P4+5*P5 ++ vrshr.s16 q11, q11, #3 ++ vmls.i16 q1, q13, d0[0] @ 2*P3-5*P4+5*P5-2*P6 ++ vceq.i16 q13, q8, #0 @ test clip == 0 ++ vrshr.s16 q10, q10, #3 ++ vabs.s16 q11, q11 @ a2 ++ vabs.s16 q10, q10 @ a1 ++ vrshr.s16 q1, q1, #3 ++ vcge.s16 q14, q10, q11 @ test a1 >= a2 ++ vabs.s16 q15, q1 @ a0 ++ vshr.s16 q1, q1, #8 @ a0_sign ++ vbsl q14, q11, q10 @ a3 ++ vcge.s16 q9, q15, q9 @ test a0 >= pq ++ vsub.i16 q1, q12, q1 @ clip_sign - a0_sign ++ vqsub.u16 q10, q15, q14 @ a0 >= a3 ? 
a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 q11, q14, q15 @ test a3 >= a0 ++ vorr q9, q13, q9 @ test clip == 0 || a0 >= pq ++ vmul.i16 q0, q10, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0 ++ vorr q10, q9, q11 @ test clip == 0 || a0 >= pq || a3 >= a0 ++ vmov.32 r2, d20[1] @ move to gp reg ++ vshr.u16 q0, q0, #3 @ a0 >= a3 ? (5*(a0-a3))>>3 : 0 ++ vmov.32 r3, d21[1] ++ vcge.s16 q10, q0, q8 ++ and r14, r2, r3 ++ vbsl q10, q8, q0 @ FFMIN(d, clip) ++ tst r14, #1 ++ bne 2f @ none of the 8 pixel pairs should be updated in this case ++ vbic q0, q10, q9 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) ++ vmla.i16 q3, q0, q1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ vmls.i16 q2, q0, q1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ vqmovun.s16 d1, q3 ++ vqmovun.s16 d0, q2 ++ tst r2, #1 ++ bne 1f @ none of the first 4 pixel pairs should be updated if so ++ vst2.8 {d0[0], d1[0]}, [r0], r1 ++ vst2.8 {d0[1], d1[1]}, [r0], r1 ++ vst2.8 {d0[2], d1[2]}, [r0], r1 ++ vst2.8 {d0[3], d1[3]}, [r0] ++1: tst r3, #1 ++ bne 2f @ none of the second 4 pixel pairs should be updated if so ++ vst2.8 {d0[4], d1[4]}, [r12], r1 ++ vst2.8 {d0[5], d1[5]}, [r12], r1 ++ vst2.8 {d0[6], d1[6]}, [r12], r1 ++ vst2.8 {d0[7], d1[7]}, [r12] ++2: pop {pc} ++endfunc ++ ++@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks ++@ On entry: ++@ r0 -> top-left pel of lower block ++@ r1 = row stride, bytes ++@ r2 = PQUANT bitstream parameter ++function ff_vc1_v_loop_filter16_neon, export=1 ++ vpush {d8-d15} ++ sub r3, r0, r1, lsl #2 ++ vldr d0, .Lcoeffs ++ vld1.64 {q1}, [r0 :128], r1 @ P5 ++ vld1.64 {q2}, [r3 :128], r1 @ P1 ++ vld1.64 {q3}, [r3 :128], r1 @ P2 ++ vld1.64 {q4}, [r0 :128], r1 @ P6 ++ vld1.64 {q5}, [r3 :128], r1 @ P3 ++ vld1.64 {q6}, [r0 :128], r1 @ P7 ++ vshll.u8 q7, d2, #1 @ 2*P5[0..7] ++ vshll.u8 q8, d4, #1 @ 2*P1[0..7] ++ vld1.64 {q9}, [r3 :128] @ P4 ++ vmovl.u8 q10, d6 @ P2[0..7] ++ vld1.64 {q11}, [r0 :128] @ P8 ++ vmovl.u8 q12, d8 @ P6[0..7] ++ vdup.16 q13, r2 @ pq ++ vshll.u8 q2, d5, #1 @ 2*P1[8..15] ++ vmls.i16 q8, q10, d0[1] @ 2*P1[0..7]-5*P2[0..7] ++ vshll.u8 q10, d3, #1 @ 2*P5[8..15] ++ vmovl.u8 q3, d7 @ P2[8..15] ++ vmls.i16 q7, q12, d0[1] @ 2*P5[0..7]-5*P6[0..7] ++ vmovl.u8 q4, d9 @ P6[8..15] ++ vmovl.u8 q14, d10 @ P3[0..7] ++ vmovl.u8 q15, d12 @ P7[0..7] ++ vmls.i16 q2, q3, d0[1] @ 2*P1[8..15]-5*P2[8..15] ++ vshll.u8 q3, d10, #1 @ 2*P3[0..7] ++ vmls.i16 q10, q4, d0[1] @ 2*P5[8..15]-5*P6[8..15] ++ vmovl.u8 q6, d13 @ P7[8..15] ++ vmla.i16 q8, q14, d0[1] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7] ++ vmovl.u8 q14, d18 @ P4[0..7] ++ vmovl.u8 q9, d19 @ P4[8..15] ++ vmla.i16 q7, q15, d0[1] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7] ++ vmovl.u8 q15, d11 @ P3[8..15] ++ vshll.u8 q5, d11, #1 @ 2*P3[8..15] ++ vmls.i16 q3, q14, d0[1] @ 2*P3[0..7]-5*P4[0..7] ++ vmla.i16 q2, q15, d0[1] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15] ++ vmovl.u8 q15, d22 @ P8[0..7] ++ vmovl.u8 q11, d23 @ P8[8..15] ++ vmla.i16 q10, q6, d0[1] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15] ++ vmovl.u8 q6, d2 @ P5[0..7] ++ vmovl.u8 q1, d3 @ P5[8..15] ++ vmls.i16 q5, q9, d0[1] @ 2*P3[8..15]-5*P4[8..15] ++ vmls.i16 q8, q14, d0[0] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7] ++ vmls.i16 q7, q15, d0[0] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7] ++ vsub.i16 
q15, q14, q6 @ P4[0..7]-P5[0..7] ++ vmla.i16 q3, q6, d0[1] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7] ++ vrshr.s16 q8, q8, #3 ++ vmls.i16 q2, q9, d0[0] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15] ++ vrshr.s16 q7, q7, #3 ++ vmls.i16 q10, q11, d0[0] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15] ++ vabs.s16 q11, q15 ++ vabs.s16 q8, q8 @ a1[0..7] ++ vmla.i16 q5, q1, d0[1] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15] ++ vshr.s16 q15, q15, #8 @ clip_sign[0..7] ++ vrshr.s16 q2, q2, #3 ++ vmls.i16 q3, q12, d0[0] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7] ++ vabs.s16 q7, q7 @ a2[0..7] ++ vrshr.s16 q10, q10, #3 ++ vsub.i16 q12, q9, q1 @ P4[8..15]-P5[8..15] ++ vshr.s16 q11, q11, #1 @ clip[0..7] ++ vmls.i16 q5, q4, d0[0] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15] ++ vcge.s16 q4, q8, q7 @ test a1[0..7] >= a2[0..7] ++ vabs.s16 q2, q2 @ a1[8..15] ++ vrshr.s16 q3, q3, #3 ++ vabs.s16 q10, q10 @ a2[8..15] ++ vbsl q4, q7, q8 @ a3[0..7] ++ vabs.s16 q7, q12 ++ vshr.s16 q8, q12, #8 @ clip_sign[8..15] ++ vrshr.s16 q5, q5, #3 ++ vcge.s16 q12, q2, q10 @ test a1[8..15] >= a2[8.15] ++ vshr.s16 q7, q7, #1 @ clip[8..15] ++ vbsl q12, q10, q2 @ a3[8..15] ++ vabs.s16 q2, q3 @ a0[0..7] ++ vceq.i16 q10, q11, #0 @ test clip[0..7] == 0 ++ vshr.s16 q3, q3, #8 @ a0_sign[0..7] ++ vsub.i16 q3, q15, q3 @ clip_sign[0..7] - a0_sign[0..7] ++ vcge.s16 q15, q2, q13 @ test a0[0..7] >= pq ++ vorr q10, q10, q15 @ test clip[0..7] == 0 || a0[0..7] >= pq ++ vqsub.u16 q15, q2, q4 @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 q2, q4, q2 @ test a3[0..7] >= a0[0..7] ++ vabs.s16 q4, q5 @ a0[8..15] ++ vshr.s16 q5, q5, #8 @ a0_sign[8..15] ++ vmul.i16 q15, q15, d0[1] @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0 ++ vcge.s16 q13, q4, q13 @ test a0[8..15] >= pq ++ vorr q2, q10, q2 @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7] ++ vsub.i16 q5, q8, q5 @ clip_sign[8..15] - a0_sign[8..15] ++ vceq.i16 q8, q7, #0 @ test clip[8..15] == 0 ++ vshr.u16 q15, q15, #3 @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0 ++ vmov.32 r0, d4[1] @ move to gp reg ++ vorr q8, q8, q13 @ test clip[8..15] == 0 || a0[8..15] >= pq ++ vqsub.u16 q13, q4, q12 @ a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vmov.32 r2, d5[1] ++ vcge.s16 q4, q12, q4 @ test a3[8..15] >= a0[8..15] ++ vshl.i64 q2, q2, #16 ++ vcge.s16 q12, q15, q11 ++ vmul.i16 q0, q13, d0[1] @ a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0 ++ vorr q4, q8, q4 @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15] ++ vshr.s64 q2, q2, #48 ++ and r0, r0, r2 ++ vbsl q12, q11, q15 @ FFMIN(d[0..7], clip[0..7]) ++ vshl.i64 q11, q4, #16 ++ vmov.32 r2, d8[1] ++ vshr.u16 q0, q0, #3 @ a0[8..15] >= a3[8..15] ? 
(5*(a0[8..15]-a3[8..15]))>>3 : 0 ++ vorr q2, q10, q2 ++ vmov.32 r12, d9[1] ++ vshr.s64 q4, q11, #48 ++ vcge.s16 q10, q0, q7 ++ vbic q2, q12, q2 @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub) ++ vorr q4, q8, q4 ++ and r2, r2, r12 ++ vbsl q10, q7, q0 @ FFMIN(d[8..15], clip[8..15]) ++ vmls.i16 q14, q2, q3 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7] ++ and r0, r0, r2 ++ vbic q0, q10, q4 @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub) ++ tst r0, #1 ++ bne 1f @ none of the 16 pixel pairs should be updated in this case ++ vmla.i16 q6, q2, q3 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7] ++ vmls.i16 q9, q0, q5 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15] ++ vqmovun.s16 d4, q14 ++ vmla.i16 q1, q0, q5 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15] ++ vqmovun.s16 d0, q6 ++ vqmovun.s16 d5, q9 ++ vqmovun.s16 d1, q1 ++ vst1.64 {q2}, [r3 :128], r1 ++ vst1.64 {q0}, [r3 :128] ++1: vpop {d8-d15} ++ bx lr ++endfunc ++ ++@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks ++@ On entry: ++@ r0 -> top-left pel of right block ++@ r1 = row stride, bytes ++@ r2 = PQUANT bitstream parameter ++function ff_vc1_h_loop_filter16_neon, export=1 ++ push {r4-r6,lr} ++ vpush {d8-d15} ++ sub r3, r0, #4 @ where to start reading ++ vldr d0, .Lcoeffs ++ vld1.32 {d2}, [r3], r1 @ P1[0], P2[0]... ++ sub r0, r0, #1 @ where to start writing ++ vld1.32 {d3}, [r3], r1 ++ add r4, r0, r1, lsl #2 ++ vld1.32 {d10}, [r3], r1 ++ vld1.32 {d11}, [r3], r1 ++ vld1.32 {d16}, [r3], r1 ++ vld1.32 {d4}, [r3], r1 ++ vld1.32 {d8}, [r3], r1 ++ vtrn.8 d2, d3 @ P1[0], P1[1], P3[0]... P2[0], P2[1], P4[0]... ++ vld1.32 {d14}, [r3], r1 ++ vld1.32 {d5}, [r3], r1 ++ vtrn.8 d10, d11 @ P1[2], P1[3], P3[2]... P2[2], P2[3], P4[2]... ++ vld1.32 {d6}, [r3], r1 ++ vld1.32 {d12}, [r3], r1 ++ vtrn.8 d16, d4 @ P1[4], P1[5], P3[4]... P2[4], P2[5], P4[4]... ++ vld1.32 {d13}, [r3], r1 ++ vtrn.16 d2, d10 @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]... ++ vld1.32 {d1}, [r3], r1 ++ vtrn.8 d8, d14 @ P1[6], P1[7], P3[6]... P2[6], P2[7], P4[6]... ++ vld1.32 {d7}, [r3], r1 ++ vtrn.16 d3, d11 @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]... ++ vld1.32 {d9}, [r3], r1 ++ vtrn.8 d5, d6 @ P1[8], P1[9], P3[8]... P2[8], P2[9], P4[8]... ++ vld1.32 {d15}, [r3] ++ vtrn.16 d16, d8 @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[6], P3[7], P7[4]... ++ vtrn.16 d4, d14 @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]... ++ vtrn.8 d12, d13 @ P1[10], P1[11], P3[10]... P2[10], P2[11], P4[10]... ++ vdup.16 q9, r2 @ pq ++ vtrn.8 d1, d7 @ P1[12], P1[13], P3[12]... P2[12], P2[13], P4[12]... ++ vtrn.32 d2, d16 @ P1[0..7], P5[0..7] ++ vtrn.16 d5, d12 @ P1[8], P1[7], P1[10], P1[11], P5[8]... P3[8], P3[9], P3[10], P3[11], P7[8]... ++ vtrn.16 d6, d13 @ P2[8], P2[7], P2[10], P2[11], P6[8]... P4[8], P4[9], P4[10], P4[11], P8[8]... ++ vtrn.8 d9, d15 @ P1[14], P1[15], P3[14]... P2[14], P2[15], P4[14]... 
++ vtrn.32 d3, d4 @ P2[0..7], P6[0..7] ++ vshll.u8 q10, d2, #1 @ 2*P1[0..7] ++ vtrn.32 d10, d8 @ P3[0..7], P7[0..7] ++ vshll.u8 q11, d16, #1 @ 2*P5[0..7] ++ vtrn.32 d11, d14 @ P4[0..7], P8[0..7] ++ vtrn.16 d1, d9 @ P1[12], P1[13], P1[14], P1[15], P5[12]... P3[12], P3[13], P3[14], P3[15], P7[12]... ++ vtrn.16 d7, d15 @ P2[12], P2[13], P2[14], P2[15], P6[12]... P4[12], P4[13], P4[14], P4[15], P8[12]... ++ vmovl.u8 q1, d3 @ P2[0..7] ++ vmovl.u8 q12, d4 @ P6[0..7] ++ vtrn.32 d5, d1 @ P1[8..15], P5[8..15] ++ vtrn.32 d6, d7 @ P2[8..15], P6[8..15] ++ vtrn.32 d12, d9 @ P3[8..15], P7[8..15] ++ vtrn.32 d13, d15 @ P4[8..15], P8[8..15] ++ vmls.i16 q10, q1, d0[1] @ 2*P1[0..7]-5*P2[0..7] ++ vmovl.u8 q1, d10 @ P3[0..7] ++ vshll.u8 q2, d5, #1 @ 2*P1[8..15] ++ vshll.u8 q13, d1, #1 @ 2*P5[8..15] ++ vmls.i16 q11, q12, d0[1] @ 2*P5[0..7]-5*P6[0..7] ++ vmovl.u8 q14, d6 @ P2[8..15] ++ vmovl.u8 q3, d7 @ P6[8..15] ++ vmovl.u8 q15, d8 @ P7[0..7] ++ vmla.i16 q10, q1, d0[1] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7] ++ vmovl.u8 q1, d12 @ P3[8..15] ++ vmls.i16 q2, q14, d0[1] @ 2*P1[8..15]-5*P2[8..15] ++ vmovl.u8 q4, d9 @ P7[8..15] ++ vshll.u8 q14, d10, #1 @ 2*P3[0..7] ++ vmls.i16 q13, q3, d0[1] @ 2*P5[8..15]-5*P6[8..15] ++ vmovl.u8 q5, d11 @ P4[0..7] ++ vmla.i16 q11, q15, d0[1] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7] ++ vshll.u8 q15, d12, #1 @ 2*P3[8..15] ++ vmovl.u8 q6, d13 @ P4[8..15] ++ vmla.i16 q2, q1, d0[1] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15] ++ vmovl.u8 q1, d14 @ P8[0..7] ++ vmovl.u8 q7, d15 @ P8[8..15] ++ vmla.i16 q13, q4, d0[1] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15] ++ vmovl.u8 q4, d16 @ P5[0..7] ++ vmovl.u8 q8, d1 @ P5[8..15] ++ vmls.i16 q14, q5, d0[1] @ 2*P3[0..7]-5*P4[0..7] ++ vmls.i16 q15, q6, d0[1] @ 2*P3[8..15]-5*P4[8..15] ++ vmls.i16 q10, q5, d0[0] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7] ++ vmls.i16 q11, q1, d0[0] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7] ++ vsub.i16 q1, q5, q4 @ P4[0..7]-P5[0..7] ++ vmls.i16 q2, q6, d0[0] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15] ++ vrshr.s16 q10, q10, #3 ++ vmls.i16 q13, q7, d0[0] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15] ++ vsub.i16 q7, q6, q8 @ P4[8..15]-P5[8..15] ++ vrshr.s16 q11, q11, #3 ++ vmla.s16 q14, q4, d0[1] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7] ++ vrshr.s16 q2, q2, #3 ++ vmla.i16 q15, q8, d0[1] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15] ++ vabs.s16 q10, q10 @ a1[0..7] ++ vrshr.s16 q13, q13, #3 ++ vmls.i16 q15, q3, d0[0] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15] ++ vabs.s16 q3, q11 @ a2[0..7] ++ vabs.s16 q2, q2 @ a1[8..15] ++ vmls.i16 q14, q12, d0[0] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7] ++ vabs.s16 q11, q1 ++ vabs.s16 q12, q13 @ a2[8..15] ++ vcge.s16 q13, q10, q3 @ test a1[0..7] >= a2[0..7] ++ vshr.s16 q1, q1, #8 @ clip_sign[0..7] ++ vrshr.s16 q15, q15, #3 ++ vshr.s16 q11, q11, #1 @ clip[0..7] ++ vrshr.s16 q14, q14, #3 ++ vbsl q13, q3, q10 @ a3[0..7] ++ vcge.s16 q3, q2, q12 @ test a1[8..15] >= a2[8.15] ++ vabs.s16 q10, q15 @ a0[8..15] ++ vshr.s16 q15, q15, #8 @ a0_sign[8..15] ++ vbsl q3, q12, q2 @ a3[8..15] ++ vabs.s16 q2, q14 @ a0[0..7] ++ vabs.s16 q12, q7 ++ vshr.s16 q7, q7, #8 @ clip_sign[8..15] ++ vshr.s16 q14, q14, #8 @ a0_sign[0..7] ++ vshr.s16 q12, q12, #1 @ clip[8..15] ++ vsub.i16 q7, q7, q15 @ clip_sign[8..15] - a0_sign[8..15] ++ vqsub.u16 q15, q10, q3 @ a0[8..15] >= a3[8..15] ? 
a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 q3, q3, q10 @ test a3[8..15] >= a0[8..15] ++ vcge.s16 q10, q10, q9 @ test a0[8..15] >= pq ++ vcge.s16 q9, q2, q9 @ test a0[0..7] >= pq ++ vsub.i16 q1, q1, q14 @ clip_sign[0..7] - a0_sign[0..7] ++ vqsub.u16 q14, q2, q13 @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 q2, q13, q2 @ test a3[0..7] >= a0[0..7] ++ vmul.i16 q13, q15, d0[1] @ a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0 ++ vceq.i16 q15, q11, #0 @ test clip[0..7] == 0 ++ vmul.i16 q0, q14, d0[1] @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0 ++ vorr q9, q15, q9 @ test clip[0..7] == 0 || a0[0..7] >= pq ++ vceq.i16 q14, q12, #0 @ test clip[8..15] == 0 ++ vshr.u16 q13, q13, #3 @ a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0 ++ vorr q2, q9, q2 @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7] ++ vshr.u16 q0, q0, #3 @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0 ++ vorr q10, q14, q10 @ test clip[8..15] == 0 || a0[8..15] >= pq ++ vcge.s16 q14, q13, q12 ++ vmov.32 r2, d4[1] @ move to gp reg ++ vorr q3, q10, q3 @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15] ++ vmov.32 r3, d5[1] ++ vcge.s16 q2, q0, q11 ++ vbsl q14, q12, q13 @ FFMIN(d[8..15], clip[8..15]) ++ vbsl q2, q11, q0 @ FFMIN(d[0..7], clip[0..7]) ++ vmov.32 r5, d6[1] ++ vbic q0, q14, q10 @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub) ++ vmov.32 r6, d7[1] ++ and r12, r2, r3 ++ vbic q2, q2, q9 @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub) ++ vmls.i16 q6, q0, q7 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4 ++ vmls.i16 q5, q2, q1 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4 ++ and r14, r5, r6 ++ vmla.i16 q4, q2, q1 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5 ++ and r12, r12, r14 ++ vqmovun.s16 d4, q6 ++ vmla.i16 q8, q0, q7 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5 ++ tst r12, #1 ++ bne 4f @ none of the 16 pixel pairs should be updated in this case ++ vqmovun.s16 d2, q5 ++ vqmovun.s16 d3, q4 ++ vqmovun.s16 d5, q8 ++ tst r2, #1 ++ bne 1f ++ vst2.8 {d2[0], d3[0]}, [r0], r1 ++ vst2.8 {d2[1], d3[1]}, [r0], r1 ++ vst2.8 {d2[2], d3[2]}, [r0], r1 ++ vst2.8 {d2[3], d3[3]}, [r0] ++1: add r0, r4, r1, lsl #2 ++ tst r3, #1 ++ bne 2f ++ vst2.8 {d2[4], d3[4]}, [r4], r1 ++ vst2.8 {d2[5], d3[5]}, [r4], r1 ++ vst2.8 {d2[6], d3[6]}, [r4], r1 ++ vst2.8 {d2[7], d3[7]}, [r4] ++2: add r4, r0, r1, lsl #2 ++ tst r5, #1 ++ bne 3f ++ vst2.8 {d4[0], d5[0]}, [r0], r1 ++ vst2.8 {d4[1], d5[1]}, [r0], r1 ++ vst2.8 {d4[2], d5[2]}, [r0], r1 ++ vst2.8 {d4[3], d5[3]}, [r0] ++3: tst r6, #1 ++ bne 4f ++ vst2.8 {d4[4], d5[4]}, [r4], r1 ++ vst2.8 {d4[5], d5[5]}, [r4], r1 ++ vst2.8 {d4[6], d5[6]}, [r4], r1 ++ vst2.8 {d4[7], d5[7]}, [r4] ++4: vpop {d8-d15} ++ pop {r4-r6,pc} ++endfunc ++ ++@ Copy at most the specified number of bytes from source to destination buffer, ++@ 
stopping at a multiple of 16 bytes, none of which are the start of an escape sequence ++@ On entry: ++@ r0 -> source buffer ++@ r1 = max number of bytes to copy ++@ r2 -> destination buffer, optimally 8-byte aligned ++@ On exit: ++@ r0 = number of bytes not copied ++function ff_vc1_unescape_buffer_helper_neon, export=1 ++ @ Offset by 48 to screen out cases that are too short for us to handle, ++ @ and also make it easy to test for loop termination, or to determine ++ @ whether we need an odd number of half-iterations of the loop. ++ subs r1, r1, #48 ++ bmi 90f ++ ++ @ Set up useful constants ++ vmov.i32 q0, #0x3000000 ++ vmov.i32 q1, #0x30000 ++ ++ tst r1, #16 ++ bne 1f ++ ++ vld1.8 {q8, q9}, [r0]! ++ vbic q12, q8, q0 ++ vext.8 q13, q8, q9, #1 ++ vext.8 q14, q8, q9, #2 ++ vext.8 q15, q8, q9, #3 ++ veor q12, q12, q1 ++ vbic q13, q13, q0 ++ vbic q14, q14, q0 ++ vbic q15, q15, q0 ++ vceq.i32 q12, q12, #0 ++ veor q13, q13, q1 ++ veor q14, q14, q1 ++ veor q15, q15, q1 ++ vceq.i32 q13, q13, #0 ++ vceq.i32 q14, q14, #0 ++ vceq.i32 q15, q15, #0 ++ add r1, r1, #16 ++ b 3f ++ ++1: vld1.8 {q10, q11}, [r0]! ++ vbic q12, q10, q0 ++ vext.8 q13, q10, q11, #1 ++ vext.8 q14, q10, q11, #2 ++ vext.8 q15, q10, q11, #3 ++ veor q12, q12, q1 ++ vbic q13, q13, q0 ++ vbic q14, q14, q0 ++ vbic q15, q15, q0 ++ vceq.i32 q12, q12, #0 ++ veor q13, q13, q1 ++ veor q14, q14, q1 ++ veor q15, q15, q1 ++ vceq.i32 q13, q13, #0 ++ vceq.i32 q14, q14, #0 ++ vceq.i32 q15, q15, #0 ++ @ Drop through... ++2: vmov q8, q11 ++ vld1.8 {q9}, [r0]! ++ vorr q13, q12, q13 ++ vorr q15, q14, q15 ++ vbic q12, q8, q0 ++ vorr q3, q13, q15 ++ vext.8 q13, q8, q9, #1 ++ vext.8 q14, q8, q9, #2 ++ vext.8 q15, q8, q9, #3 ++ veor q12, q12, q1 ++ vorr d6, d6, d7 ++ vbic q13, q13, q0 ++ vbic q14, q14, q0 ++ vbic q15, q15, q0 ++ vceq.i32 q12, q12, #0 ++ vmov r3, r12, d6 ++ veor q13, q13, q1 ++ veor q14, q14, q1 ++ veor q15, q15, q1 ++ vceq.i32 q13, q13, #0 ++ vceq.i32 q14, q14, #0 ++ vceq.i32 q15, q15, #0 ++ orrs r3, r3, r12 ++ bne 90f ++ vst1.64 {q10}, [r2]! ++3: vmov q10, q9 ++ vld1.8 {q11}, [r0]! ++ vorr q13, q12, q13 ++ vorr q15, q14, q15 ++ vbic q12, q10, q0 ++ vorr q3, q13, q15 ++ vext.8 q13, q10, q11, #1 ++ vext.8 q14, q10, q11, #2 ++ vext.8 q15, q10, q11, #3 ++ veor q12, q12, q1 ++ vorr d6, d6, d7 ++ vbic q13, q13, q0 ++ vbic q14, q14, q0 ++ vbic q15, q15, q0 ++ vceq.i32 q12, q12, #0 ++ vmov r3, r12, d6 ++ veor q13, q13, q1 ++ veor q14, q14, q1 ++ veor q15, q15, q1 ++ vceq.i32 q13, q13, #0 ++ vceq.i32 q14, q14, #0 ++ vceq.i32 q15, q15, #0 ++ orrs r3, r3, r12 ++ bne 91f ++ vst1.64 {q8}, [r2]! ++ subs r1, r1, #32 ++ bpl 2b ++ ++90: add r0, r1, #48 ++ bx lr ++ ++91: sub r1, r1, #16 ++ b 90b ++endfunc --- a/libavcodec/avcodec.h +++ b/libavcodec/avcodec.h @@ -2567,6 +2567,17 @@ typedef struct AVHWAccel { @@ -15229,7 +17987,7 @@ Upstream-status: Pending + */ + void (*abort_frame)(AVCodecContext *avctx); } AVHWAccel; - + /** --- a/libavcodec/cabac.h +++ b/libavcodec/cabac.h @@ -15253,7 +18011,7 @@ Upstream-status: Pending +++ b/libavcodec/codec.h @@ -350,6 +350,17 @@ const AVCodec *av_codec_iterate(void **o AVCodec *avcodec_find_decoder(enum AVCodecID id); - + /** + * Find a registered decoder with a matching codec ID and pix_fmt. + * A decoder will pix_fmt set to NULL will match any fmt. @@ -15761,12 +18519,788 @@ Upstream-status: Pending +}; + +#endif +--- /dev/null ++++ b/libavcodec/hevc-ctrls-v3.h +@@ -0,0 +1,255 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * These are the HEVC state controls for use with stateless HEVC ++ * codec drivers. 
++ * ++ * It turns out that these structs are not stable yet and will undergo ++ * more changes. So keep them private until they are stable and ready to ++ * become part of the official public API. ++ */ ++ ++#ifndef _HEVC_CTRLS_H_ ++#define _HEVC_CTRLS_H_ ++ ++#include ++ ++/* The pixel format isn't stable at the moment and will likely be renamed. */ ++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ ++ ++#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_CODEC_BASE + 1008) ++#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_CODEC_BASE + 1009) ++#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_BASE + 1010) ++#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_BASE + 1011) ++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_BASE + 1012) ++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_CODEC_BASE + 1015) ++#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_CODEC_BASE + 1016) ++ ++/* enum v4l2_ctrl_type type values */ ++#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120 ++#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121 ++#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122 ++#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123 ++#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124 ++ ++enum v4l2_mpeg_video_hevc_decode_mode { ++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED, ++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED, ++}; ++ ++enum v4l2_mpeg_video_hevc_start_code { ++ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE, ++ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B, ++}; ++ ++#define V4L2_HEVC_SLICE_TYPE_B 0 ++#define V4L2_HEVC_SLICE_TYPE_P 1 ++#define V4L2_HEVC_SLICE_TYPE_I 2 ++ ++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) ++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) ++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) ++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) ++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) ++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) ++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) ++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) ++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) ++ ++/* The controls are not stable at the moment and will likely be reworked. */ ++struct v4l2_ctrl_hevc_sps { ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Sequence parameter set */ ++ __u16 pic_width_in_luma_samples; ++ __u16 pic_height_in_luma_samples; ++ __u8 bit_depth_luma_minus8; ++ __u8 bit_depth_chroma_minus8; ++ __u8 log2_max_pic_order_cnt_lsb_minus4; ++ __u8 sps_max_dec_pic_buffering_minus1; ++ __u8 sps_max_num_reorder_pics; ++ __u8 sps_max_latency_increase_plus1; ++ __u8 log2_min_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_luma_coding_block_size; ++ __u8 log2_min_luma_transform_block_size_minus2; ++ __u8 log2_diff_max_min_luma_transform_block_size; ++ __u8 max_transform_hierarchy_depth_inter; ++ __u8 max_transform_hierarchy_depth_intra; ++ __u8 pcm_sample_bit_depth_luma_minus1; ++ __u8 pcm_sample_bit_depth_chroma_minus1; ++ __u8 log2_min_pcm_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_pcm_luma_coding_block_size; ++ __u8 num_short_term_ref_pic_sets; ++ __u8 num_long_term_ref_pics_sps; ++ __u8 chroma_format_idc; ++ __u8 sps_max_sub_layers_minus1; ++ ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0) ++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) ++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) ++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) ++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) ++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) ++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) ++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) ++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) ++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) ++#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) ++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13) ++#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) ++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) ++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) ++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) ++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) ++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19) ++#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20) ++ ++struct v4l2_ctrl_hevc_pps { ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Picture parameter set */ ++ __u8 num_extra_slice_header_bits; ++ __u8 num_ref_idx_l0_default_active_minus1; ++ __u8 num_ref_idx_l1_default_active_minus1; ++ __s8 init_qp_minus26; ++ __u8 diff_cu_qp_delta_depth; ++ __s8 pps_cb_qp_offset; ++ __s8 pps_cr_qp_offset; ++ __u8 num_tile_columns_minus1; ++ __u8 num_tile_rows_minus1; ++ __u8 column_width_minus1[20]; ++ __u8 row_height_minus1[22]; ++ __s8 pps_beta_offset_div2; ++ __s8 pps_tc_offset_div2; ++ __u8 log2_parallel_merge_level_minus2; ++ ++ __u8 padding[4]; ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE 0x01 ++ ++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 ++ ++struct v4l2_hevc_dpb_entry { ++ __u64 timestamp; ++ __u8 flags; ++ __u8 field_pic; ++ __u16 pic_order_cnt[2]; ++ __u8 padding[2]; ++}; ++ ++struct v4l2_hevc_pred_weight_table { ++ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __u8 padding[6]; ++ ++ __u8 luma_log2_weight_denom; ++ __s8 delta_chroma_log2_weight_denom; ++}; ++ ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) ++ ++struct v4l2_ctrl_hevc_slice_params { ++ __u32 bit_size; ++ __u32 data_bit_offset; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u32 slice_segment_addr; ++ __u32 num_entry_point_offsets; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ ++ __u8 nal_unit_type; ++ __u8 nuh_temporal_id_plus1; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u8 slice_type; ++ __u8 colour_plane_id; ++ __u16 slice_pic_order_cnt; ++ __u8 num_ref_idx_l0_active_minus1; ++ __u8 num_ref_idx_l1_active_minus1; ++ __u8 collocated_ref_idx; ++ __u8 five_minus_max_num_merge_cand; ++ __s8 slice_qp_delta; ++ __s8 slice_cb_qp_offset; ++ __s8 slice_cr_qp_offset; ++ __s8 slice_act_y_qp_offset; ++ __s8 slice_act_cb_qp_offset; ++ __s8 slice_act_cr_qp_offset; ++ __s8 slice_beta_offset_div2; ++ __s8 slice_tc_offset_div2; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ ++ __u8 pic_struct; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ ++ __u8 padding[5]; ++ ++ __u32 entry_point_offset_minus1[256]; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Weighted prediction parameter */ ++ struct v4l2_hevc_pred_weight_table pred_weight_table; ++ ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1 ++#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2 ++#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4 ++ ++struct v4l2_ctrl_hevc_decode_params { ++ __s32 pic_order_cnt_val; ++ __u8 num_active_dpb_entries; ++ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 num_poc_st_curr_before; ++ __u8 num_poc_st_curr_after; ++ __u8 num_poc_lt_curr; ++ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u64 flags; ++}; ++ ++struct v4l2_ctrl_hevc_scaling_matrix { ++ __u8 scaling_list_4x4[6][16]; ++ __u8 scaling_list_8x8[6][64]; ++ __u8 scaling_list_16x16[6][64]; ++ __u8 scaling_list_32x32[2][64]; ++ __u8 scaling_list_dc_coef_16x16[6]; ++ __u8 scaling_list_dc_coef_32x32[2]; ++}; ++ ++/* MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */ ++#define V4L2_CID_CODEC_HANTRO_BASE (V4L2_CTRL_CLASS_CODEC | 0x1200) ++/* ++ * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP - ++ * the number of data (in bits) to skip in the ++ * slice segment header. ++ * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag" ++ * to before syntax element "slice_temporal_mvp_enabled_flag". ++ * If IDR, the skipped bits are just "pic_output_flag" ++ * (separate_colour_plane_flag is not supported). ++ */ ++#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP (V4L2_CID_CODEC_HANTRO_BASE + 0) ++ ++#endif +--- /dev/null ++++ b/libavcodec/hevc-ctrls-v4.h +@@ -0,0 +1,515 @@ ++/* SPDX-License-Identifier: ((GPL-2.0+ WITH Linux-syscall-note) OR BSD-3-Clause) */ ++/* ++ * Video for Linux Two controls header file ++ * ++ * Copyright (C) 1999-2012 the contributors ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * Alternatively you can redistribute this file under the terms of the ++ * BSD license as stated below: ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in ++ * the documentation and/or other materials provided with the ++ * distribution. ++ * 3. The names of its contributors may not be used to endorse or promote ++ * products derived from this software without specific prior written ++ * permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT ++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED ++ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * The contents of this header was split off from videodev2.h. All control ++ * definitions should be added to this header, which is included by ++ * videodev2.h. ++ */ ++ ++#ifndef AVCODEC_HEVC_CTRLS_V4_H ++#define AVCODEC_HEVC_CTRLS_V4_H ++ ++#include ++#include ++ ++#define V4L2_CID_STATELESS_HEVC_SPS (V4L2_CID_CODEC_STATELESS_BASE + 400) ++#define V4L2_CID_STATELESS_HEVC_PPS (V4L2_CID_CODEC_STATELESS_BASE + 401) ++#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 402) ++#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_STATELESS_BASE + 403) ++#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 404) ++#define V4L2_CID_STATELESS_HEVC_DECODE_MODE (V4L2_CID_CODEC_STATELESS_BASE + 405) ++#define V4L2_CID_STATELESS_HEVC_START_CODE (V4L2_CID_CODEC_STATELESS_BASE + 406) ++#define V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS (V4L2_CID_CODEC_STATELESS_BASE + 407) ++ ++enum v4l2_stateless_hevc_decode_mode { ++ V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED, ++ V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED, ++}; ++ ++enum v4l2_stateless_hevc_start_code { ++ V4L2_STATELESS_HEVC_START_CODE_NONE, ++ V4L2_STATELESS_HEVC_START_CODE_ANNEX_B, ++}; ++ ++#define V4L2_HEVC_SLICE_TYPE_B 0 ++#define V4L2_HEVC_SLICE_TYPE_P 1 ++#define V4L2_HEVC_SLICE_TYPE_I 2 ++ ++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) ++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) ++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) ++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) ++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) ++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) ++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) ++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) ++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) ++ ++/** ++ * struct v4l2_ctrl_hevc_sps - ITU-T Rec. 
H.265: Sequence parameter set ++ * ++ * @video_parameter_set_id: specifies the value of the ++ * vps_video_parameter_set_id of the active VPS ++ * @seq_parameter_set_id: provides an identifier for the SPS for ++ * reference by other syntax elements ++ * @pic_width_in_luma_samples: specifies the width of each decoded picture ++ * in units of luma samples ++ * @pic_height_in_luma_samples: specifies the height of each decoded picture ++ * in units of luma samples ++ * @bit_depth_luma_minus8: this value plus 8specifies the bit depth of the ++ * samples of the luma array ++ * @bit_depth_chroma_minus8: this value plus 8 specifies the bit depth of the ++ * samples of the chroma arrays ++ * @log2_max_pic_order_cnt_lsb_minus4: this value plus 4 specifies the value of ++ * the variable MaxPicOrderCntLsb ++ * @sps_max_dec_pic_buffering_minus1: this value plus 1 specifies the maximum ++ * required size of the decoded picture ++ * buffer for the codec video sequence ++ * @sps_max_num_reorder_pics: indicates the maximum allowed number of pictures ++ * @sps_max_latency_increase_plus1: not equal to 0 is used to compute the ++ * value of SpsMaxLatencyPictures array ++ * @log2_min_luma_coding_block_size_minus3: plus 3 specifies the minimum ++ * luma coding block size ++ * @log2_diff_max_min_luma_coding_block_size: specifies the difference between ++ * the maximum and minimum luma ++ * coding block size ++ * @log2_min_luma_transform_block_size_minus2: plus 2 specifies the minimum luma ++ * transform block size ++ * @log2_diff_max_min_luma_transform_block_size: specifies the difference between ++ * the maximum and minimum luma ++ * transform block size ++ * @max_transform_hierarchy_depth_inter: specifies the maximum hierarchy ++ * depth for transform units of ++ * coding units coded in inter ++ * prediction mode ++ * @max_transform_hierarchy_depth_intra: specifies the maximum hierarchy ++ * depth for transform units of ++ * coding units coded in intra ++ * prediction mode ++ * @pcm_sample_bit_depth_luma_minus1: this value plus 1 specifies the number of ++ * bits used to represent each of PCM sample ++ * values of the luma component ++ * @pcm_sample_bit_depth_chroma_minus1: this value plus 1 specifies the number ++ * of bits used to represent each of PCM ++ * sample values of the chroma components ++ * @log2_min_pcm_luma_coding_block_size_minus3: this value plus 3 specifies the ++ * minimum size of coding blocks ++ * @log2_diff_max_min_pcm_luma_coding_block_size: specifies the difference between ++ * the maximum and minimum size of ++ * coding blocks ++ * @num_short_term_ref_pic_sets: specifies the number of st_ref_pic_set() ++ * syntax structures included in the SPS ++ * @num_long_term_ref_pics_sps: specifies the number of candidate long-term ++ * reference pictures that are specified in the SPS ++ * @chroma_format_idc: specifies the chroma sampling ++ * @sps_max_sub_layers_minus1: this value plus 1 specifies the maximum number ++ * of temporal sub-layers ++ * @reserved: padding field. Should be zeroed by applications. 
++ * @flags: see V4L2_HEVC_SPS_FLAG_{} ++ */ ++struct v4l2_ctrl_hevc_sps { ++ __u8 video_parameter_set_id; ++ __u8 seq_parameter_set_id; ++ __u16 pic_width_in_luma_samples; ++ __u16 pic_height_in_luma_samples; ++ __u8 bit_depth_luma_minus8; ++ __u8 bit_depth_chroma_minus8; ++ __u8 log2_max_pic_order_cnt_lsb_minus4; ++ __u8 sps_max_dec_pic_buffering_minus1; ++ __u8 sps_max_num_reorder_pics; ++ __u8 sps_max_latency_increase_plus1; ++ __u8 log2_min_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_luma_coding_block_size; ++ __u8 log2_min_luma_transform_block_size_minus2; ++ __u8 log2_diff_max_min_luma_transform_block_size; ++ __u8 max_transform_hierarchy_depth_inter; ++ __u8 max_transform_hierarchy_depth_intra; ++ __u8 pcm_sample_bit_depth_luma_minus1; ++ __u8 pcm_sample_bit_depth_chroma_minus1; ++ __u8 log2_min_pcm_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_pcm_luma_coding_block_size; ++ __u8 num_short_term_ref_pic_sets; ++ __u8 num_long_term_ref_pics_sps; ++ __u8 chroma_format_idc; ++ __u8 sps_max_sub_layers_minus1; ++ ++ __u8 reserved[6]; ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0) ++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) ++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) ++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) ++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) ++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) ++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) ++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) ++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) ++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) ++#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) ++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13) ++#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) ++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) ++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) ++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) ++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) ++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19) ++#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20) ++ ++/** ++ * struct v4l2_ctrl_hevc_pps - ITU-T Rec. H.265: Picture parameter set ++ * ++ * @pic_parameter_set_id: identifies the PPS for reference by other ++ * syntax elements ++ * @num_extra_slice_header_bits: specifies the number of extra slice header ++ * bits that are present in the slice header RBSP ++ * for coded pictures referring to the PPS. 
++ * @num_ref_idx_l0_default_active_minus1: this value plus 1 specifies the ++ * inferred value of num_ref_idx_l0_active_minus1 ++ * @num_ref_idx_l1_default_active_minus1: this value plus 1 specifies the ++ * inferred value of num_ref_idx_l1_active_minus1 ++ * @init_qp_minus26: this value plus 26 specifies the initial value of SliceQp Y for ++ * each slice referring to the PPS ++ * @diff_cu_qp_delta_depth: specifies the difference between the luma coding ++ * tree block size and the minimum luma coding block ++ * size of coding units that convey cu_qp_delta_abs ++ * and cu_qp_delta_sign_flag ++ * @pps_cb_qp_offset: specify the offsets to the luma quantization parameter Cb ++ * @pps_cr_qp_offset: specify the offsets to the luma quantization parameter Cr ++ * @num_tile_columns_minus1: this value plus 1 specifies the number of tile columns ++ * partitioning the picture ++ * @num_tile_rows_minus1: this value plus 1 specifies the number of tile rows partitioning ++ * the picture ++ * @column_width_minus1: this value plus 1 specifies the width of the each tile column in ++ * units of coding tree blocks ++ * @row_height_minus1: this value plus 1 specifies the height of the each tile row in ++ * units of coding tree blocks ++ * @pps_beta_offset_div2: specify the default deblocking parameter offsets for ++ * beta divided by 2 ++ * @pps_tc_offset_div2: specify the default deblocking parameter offsets for tC ++ * divided by 2 ++ * @log2_parallel_merge_level_minus2: this value plus 2 specifies the value of ++ * the variable Log2ParMrgLevel ++ * @reserved: padding field. Should be zeroed by applications. ++ * @flags: see V4L2_HEVC_PPS_FLAG_{} ++ */ ++struct v4l2_ctrl_hevc_pps { ++ __u8 pic_parameter_set_id; ++ __u8 num_extra_slice_header_bits; ++ __u8 num_ref_idx_l0_default_active_minus1; ++ __u8 num_ref_idx_l1_default_active_minus1; ++ __s8 init_qp_minus26; ++ __u8 diff_cu_qp_delta_depth; ++ __s8 pps_cb_qp_offset; ++ __s8 pps_cr_qp_offset; ++ __u8 num_tile_columns_minus1; ++ __u8 num_tile_rows_minus1; ++ __u8 column_width_minus1[20]; ++ __u8 row_height_minus1[22]; ++ __s8 pps_beta_offset_div2; ++ __s8 pps_tc_offset_div2; ++ __u8 log2_parallel_merge_level_minus2; ++ __u8 reserved; ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE 0x01 ++ ++#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME 0 ++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_FIELD 1 ++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_FIELD 2 ++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM 3 ++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP 4 ++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM_TOP 5 ++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM 6 ++#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_DOUBLING 7 ++#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_TRIPLING 8 ++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_PREVIOUS_BOTTOM 9 ++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_PREVIOUS_TOP 10 ++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_NEXT_BOTTOM 11 ++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_NEXT_TOP 12 ++ ++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 ++ ++/** ++ * struct v4l2_hevc_dpb_entry - HEVC decoded picture buffer entry ++ * ++ * @timestamp: timestamp of the V4L2 capture buffer to use as reference. ++ * @flags: long term flag for the reference frame ++ * @field_pic: whether the reference is a field picture or a frame. ++ * @reserved: padding field. Should be zeroed by applications. ++ * @pic_order_cnt_val: the picture order count of the current picture. 
++ */ ++struct v4l2_hevc_dpb_entry { ++ __u64 timestamp; ++ __u8 flags; ++ __u8 field_pic; ++ __u16 reserved; ++ __s32 pic_order_cnt_val; ++}; ++ ++/** ++ * struct v4l2_hevc_pred_weight_table - HEVC weighted prediction parameters ++ * ++ * @delta_luma_weight_l0: the difference of the weighting factor applied ++ * to the luma prediction value for list 0 ++ * @luma_offset_l0: the additive offset applied to the luma prediction value ++ * for list 0 ++ * @delta_chroma_weight_l0: the difference of the weighting factor applied ++ * to the chroma prediction values for list 0 ++ * @chroma_offset_l0: the difference of the additive offset applied to ++ * the chroma prediction values for list 0 ++ * @delta_luma_weight_l1: the difference of the weighting factor applied ++ * to the luma prediction value for list 1 ++ * @luma_offset_l1: the additive offset applied to the luma prediction value ++ * for list 1 ++ * @delta_chroma_weight_l1: the difference of the weighting factor applied ++ * to the chroma prediction values for list 1 ++ * @chroma_offset_l1: the difference of the additive offset applied to ++ * the chroma prediction values for list 1 ++ * @luma_log2_weight_denom: the base 2 logarithm of the denominator for ++ * all luma weighting factors ++ * @delta_chroma_log2_weight_denom: the difference of the base 2 logarithm ++ * of the denominator for all chroma ++ * weighting factors ++ */ ++struct v4l2_hevc_pred_weight_table { ++ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __u8 luma_log2_weight_denom; ++ __s8 delta_chroma_log2_weight_denom; ++}; ++ ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) ++ ++/** ++ * struct v4l2_ctrl_hevc_slice_params - HEVC slice parameters ++ * ++ * This control is a dynamically sized 1-dimensional array, ++ * V4L2_CTRL_FLAG_DYNAMIC_ARRAY flag must be set when using it. ++ * ++ * @bit_size: size (in bits) of the current slice data ++ * @data_byte_offset: offset (in bytes) to the video data in the current slice data ++ * @num_entry_point_offsets: specifies the number of entry point offset syntax ++ * elements in the slice header. 
++ * @nal_unit_type: specifies the coding type of the slice (B, P or I) ++ * @nuh_temporal_id_plus1: minus 1 specifies a temporal identifier for the NAL unit ++ * @slice_type: see V4L2_HEVC_SLICE_TYPE_{} ++ * @colour_plane_id: specifies the colour plane associated with the current slice ++ * @slice_pic_order_cnt: specifies the picture order count ++ * @num_ref_idx_l0_active_minus1: this value plus 1 specifies the maximum ++ * reference index for reference picture list 0 ++ * that may be used to decode the slice ++ * @num_ref_idx_l1_active_minus1: this value plus 1 specifies the maximum ++ * reference index for reference picture list 1 ++ * that may be used to decode the slice ++ * @collocated_ref_idx: specifies the reference index of the collocated picture used ++ * for temporal motion vector prediction ++ * @five_minus_max_num_merge_cand: specifies the maximum number of merging ++ * motion vector prediction candidates supported in ++ * the slice subtracted from 5 ++ * @slice_qp_delta: specifies the initial value of QpY to be used for the coding ++ * blocks in the slice ++ * @slice_cb_qp_offset: specifies a difference to be added to the value of pps_cb_qp_offset ++ * @slice_cr_qp_offset: specifies a difference to be added to the value of pps_cr_qp_offset ++ * @slice_act_y_qp_offset: screen content extension parameters ++ * @slice_act_cb_qp_offset: screen content extension parameters ++ * @slice_act_cr_qp_offset: screen content extension parameters ++ * @slice_beta_offset_div2: specify the deblocking parameter offsets for beta divided by 2 ++ * @slice_tc_offset_div2: specify the deblocking parameter offsets for tC divided by 2 ++ * @pic_struct: indicates whether a picture should be displayed as a frame or as one or ++ * more fields ++ * @reserved0: padding field. Should be zeroed by applications. ++ * @slice_segment_addr: specifies the address of the first coding tree block in ++ * the slice segment ++ * @ref_idx_l0: the list of L0 reference elements as indices in the DPB ++ * @ref_idx_l1: the list of L1 reference elements as indices in the DPB ++ * @short_term_ref_pic_set_size: specifies the size of short-term reference ++ * pictures set included in the SPS ++ * @long_term_ref_pic_set_size: specifies the size of long-term reference ++ * pictures set include in the SPS ++ * @pred_weight_table: the prediction weight coefficients for inter-picture ++ * prediction ++ * @reserved1: padding field. Should be zeroed by applications. ++ * @flags: see V4L2_HEVC_SLICE_PARAMS_FLAG_{} ++ */ ++struct v4l2_ctrl_hevc_slice_params { ++ __u32 bit_size; ++ __u32 data_byte_offset; ++ __u32 num_entry_point_offsets; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ ++ __u8 nal_unit_type; ++ __u8 nuh_temporal_id_plus1; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u8 slice_type; ++ __u8 colour_plane_id; ++ __s32 slice_pic_order_cnt; ++ __u8 num_ref_idx_l0_active_minus1; ++ __u8 num_ref_idx_l1_active_minus1; ++ __u8 collocated_ref_idx; ++ __u8 five_minus_max_num_merge_cand; ++ __s8 slice_qp_delta; ++ __s8 slice_cb_qp_offset; ++ __s8 slice_cr_qp_offset; ++ __s8 slice_act_y_qp_offset; ++ __s8 slice_act_cb_qp_offset; ++ __s8 slice_act_cr_qp_offset; ++ __s8 slice_beta_offset_div2; ++ __s8 slice_tc_offset_div2; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ ++ __u8 pic_struct; ++ ++ __u8 reserved0[3]; ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: General slice segment header */ ++ __u32 slice_segment_addr; ++ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u16 short_term_ref_pic_set_size; ++ __u16 long_term_ref_pic_set_size; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */ ++ struct v4l2_hevc_pred_weight_table pred_weight_table; ++ ++ __u8 reserved1[2]; ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1 ++#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2 ++#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4 ++ ++/** ++ * struct v4l2_ctrl_hevc_decode_params - HEVC decode parameters ++ * ++ * @pic_order_cnt_val: picture order count ++ * @short_term_ref_pic_set_size: specifies the size of short-term reference ++ * pictures set included in the SPS of the first slice ++ * @long_term_ref_pic_set_size: specifies the size of long-term reference ++ * pictures set include in the SPS of the first slice ++ * @num_active_dpb_entries: the number of entries in dpb ++ * @num_poc_st_curr_before: the number of reference pictures in the short-term ++ * set that come before the current frame ++ * @num_poc_st_curr_after: the number of reference pictures in the short-term ++ * set that come after the current frame ++ * @num_poc_lt_curr: the number of reference pictures in the long-term set ++ * @poc_st_curr_before: provides the index of the short term before references ++ * in DPB array ++ * @poc_st_curr_after: provides the index of the short term after references ++ * in DPB array ++ * @poc_lt_curr: provides the index of the long term references in DPB array ++ * @reserved: padding field. Should be zeroed by applications. ++ * @dpb: the decoded picture buffer, for meta-data about reference frames ++ * @flags: see V4L2_HEVC_DECODE_PARAM_FLAG_{} ++ */ ++struct v4l2_ctrl_hevc_decode_params { ++ __s32 pic_order_cnt_val; ++ __u16 short_term_ref_pic_set_size; ++ __u16 long_term_ref_pic_set_size; ++ __u8 num_active_dpb_entries; ++ __u8 num_poc_st_curr_before; ++ __u8 num_poc_st_curr_after; ++ __u8 num_poc_lt_curr; ++ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 reserved[4]; ++ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u64 flags; ++}; ++ ++/** ++ * struct v4l2_ctrl_hevc_scaling_matrix - HEVC scaling lists parameters ++ * ++ * @scaling_list_4x4: scaling list is used for the scaling process for ++ * transform coefficients. The values on each scaling ++ * list are expected in raster scan order ++ * @scaling_list_8x8: scaling list is used for the scaling process for ++ * transform coefficients. The values on each scaling ++ * list are expected in raster scan order ++ * @scaling_list_16x16: scaling list is used for the scaling process for ++ * transform coefficients. The values on each scaling ++ * list are expected in raster scan order ++ * @scaling_list_32x32: scaling list is used for the scaling process for ++ * transform coefficients. The values on each scaling ++ * list are expected in raster scan order ++ * @scaling_list_dc_coef_16x16: scaling list is used for the scaling process ++ * for transform coefficients. The values on each ++ * scaling list are expected in raster scan order. ++ * @scaling_list_dc_coef_32x32: scaling list is used for the scaling process ++ * for transform coefficients. The values on each ++ * scaling list are expected in raster scan order. 
++ */ ++struct v4l2_ctrl_hevc_scaling_matrix { ++ __u8 scaling_list_4x4[6][16]; ++ __u8 scaling_list_8x8[6][64]; ++ __u8 scaling_list_16x16[6][64]; ++ __u8 scaling_list_32x32[2][64]; ++ __u8 scaling_list_dc_coef_16x16[6]; ++ __u8 scaling_list_dc_coef_32x32[2]; ++}; ++ ++#endif --- a/libavcodec/hevc_parser.c +++ b/libavcodec/hevc_parser.c @@ -98,6 +98,19 @@ static int hevc_parse_slice_header(AVCod avctx->profile = ps->sps->ptl.general_ptl.profile_idc; avctx->level = ps->sps->ptl.general_ptl.level_idc; - + + if (ps->sps->chroma_format_idc == 1) { + avctx->chroma_sample_location = ps->sps->vui.chroma_loc_info_present_flag ? + ps->sps->vui.chroma_sample_loc_type_top_field + 1 : @@ -15783,12 +19317,69 @@ Upstream-status: Pending if (ps->vps->vps_timing_info_present_flag) { num = ps->vps->vps_num_units_in_tick; den = ps->vps->vps_time_scale; +--- a/libavcodec/hevc_refs.c ++++ b/libavcodec/hevc_refs.c +@@ -96,18 +96,22 @@ static HEVCFrame *alloc_frame(HEVCContex + if (!frame->rpl_buf) + goto fail; + +- frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool); +- if (!frame->tab_mvf_buf) +- goto fail; +- frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data; ++ if (s->tab_mvf_pool) { ++ frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool); ++ if (!frame->tab_mvf_buf) ++ goto fail; ++ frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data; ++ } + +- frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool); +- if (!frame->rpl_tab_buf) +- goto fail; +- frame->rpl_tab = (RefPicListTab **)frame->rpl_tab_buf->data; +- frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height; +- for (j = 0; j < frame->ctb_count; j++) +- frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data; ++ if (s->rpl_tab_pool) { ++ frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool); ++ if (!frame->rpl_tab_buf) ++ goto fail; ++ frame->rpl_tab = (RefPicListTab **)frame->rpl_tab_buf->data; ++ frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height; ++ for (j = 0; j < frame->ctb_count; j++) ++ frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data; ++ } + + frame->frame->top_field_first = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD; + frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD); +@@ -276,14 +280,17 @@ static int init_slice_rpl(HEVCContext *s + int ctb_count = frame->ctb_count; + int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr]; + int i; ++ RefPicListTab * const tab = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx; + + if (s->slice_idx >= frame->rpl_buf->size / sizeof(RefPicListTab)) + return AVERROR_INVALIDDATA; + +- for (i = ctb_addr_ts; i < ctb_count; i++) +- frame->rpl_tab[i] = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx; ++ if (frame->rpl_tab) { ++ for (i = ctb_addr_ts; i < ctb_count; i++) ++ frame->rpl_tab[i] = tab; ++ } + +- frame->refPicList = (RefPicList *)frame->rpl_tab[ctb_addr_ts]; ++ frame->refPicList = tab->refPicList; + + return 0; + } --- a/libavcodec/hevcdec.c +++ b/libavcodec/hevcdec.c @@ -332,6 +332,19 @@ static void export_stream_params(HEVCCon - + ff_set_sar(avctx, sps->vui.sar); - + + if (sps->chroma_format_idc == 1) { + avctx->chroma_sample_location = sps->vui.chroma_loc_info_present_flag ? 
+ sps->vui.chroma_sample_loc_type_top_field + 1 : @@ -15816,7 +19407,7 @@ Upstream-status: Pending + CONFIG_HEVC_RPI4_10_HWACCEL + \ CONFIG_HEVC_VDPAU_HWACCEL) enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts; - + switch (sps->pix_fmt) { case AV_PIX_FMT_YUV420P: case AV_PIX_FMT_YUVJ420P: @@ -15851,7 +19442,43 @@ Upstream-status: Pending break; case AV_PIX_FMT_YUV444P: #if CONFIG_HEVC_VDPAU_HWACCEL -@@ -3230,7 +3258,14 @@ static int hevc_decode_frame(AVCodecCont +@@ -459,6 +487,16 @@ static int set_sps(HEVCContext *s, const + if (!sps) + return 0; + ++ // If hwaccel then we don't need all the s/w decode helper arrays ++ if (s->avctx->hwaccel) { ++ export_stream_params(s, sps); ++ ++ s->avctx->pix_fmt = pix_fmt; ++ s->ps.sps = sps; ++ s->ps.vps = (HEVCVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data; ++ return 0; ++ } ++ + ret = pic_arrays_init(s, sps); + if (ret < 0) + goto fail; +@@ -2809,11 +2847,13 @@ static int hevc_frame_start(HEVCContext + ((s->ps.sps->height >> s->ps.sps->log2_min_cb_size) + 1); + int ret; + +- memset(s->horizontal_bs, 0, s->bs_width * s->bs_height); +- memset(s->vertical_bs, 0, s->bs_width * s->bs_height); +- memset(s->cbf_luma, 0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height); +- memset(s->is_pcm, 0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1)); +- memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address)); ++ if (s->horizontal_bs) { ++ memset(s->horizontal_bs, 0, s->bs_width * s->bs_height); ++ memset(s->vertical_bs, 0, s->bs_width * s->bs_height); ++ memset(s->cbf_luma, 0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height); ++ memset(s->is_pcm, 0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1)); ++ memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address)); ++ } + + s->is_decoded = 0; + s->first_nal_type = s->nal_unit_type; +@@ -3230,7 +3270,14 @@ static int hevc_decode_frame(AVCodecCont s->ref = NULL; ret = decode_nal_units(s, avpkt->data, avpkt->size); if (ret < 0) @@ -15863,10 +19490,38 @@ Upstream-status: Pending + return ret; + } - + if (avctx->hwaccel) { if (s->ref && (ret = avctx->hwaccel->end_frame(avctx)) < 0) { -@@ -3585,6 +3620,15 @@ AVCodec ff_hevc_decoder = { +@@ -3273,15 +3320,19 @@ static int hevc_ref_frame(HEVCContext *s + if (ret < 0) + return ret; + +- dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf); +- if (!dst->tab_mvf_buf) +- goto fail; +- dst->tab_mvf = src->tab_mvf; ++ if (src->tab_mvf_buf) { ++ dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf); ++ if (!dst->tab_mvf_buf) ++ goto fail; ++ dst->tab_mvf = src->tab_mvf; ++ } + +- dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf); +- if (!dst->rpl_tab_buf) +- goto fail; +- dst->rpl_tab = src->rpl_tab; ++ if (src->rpl_tab_buf) { ++ dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf); ++ if (!dst->rpl_tab_buf) ++ goto fail; ++ dst->rpl_tab = src->rpl_tab; ++ } + + dst->rpl_buf = av_buffer_ref(src->rpl_buf); + if (!dst->rpl_buf) +@@ -3585,6 +3636,15 @@ AVCodec ff_hevc_decoder = { #if CONFIG_HEVC_NVDEC_HWACCEL HWACCEL_NVDEC(hevc), #endif @@ -15897,12 +19552,12 @@ Upstream-status: Pending --- a/libavcodec/hwconfig.h +++ b/libavcodec/hwconfig.h @@ -24,6 +24,7 @@ - - + + #define HWACCEL_CAP_ASYNC_SAFE (1 << 0) +#define HWACCEL_CAP_MT_SAFE (1 << 1) - - + + typedef struct AVCodecHWConfigInternal { @@ -70,6 +71,12 @@ typedef struct AVCodecHWConfigInternal { HW_CONFIG_HWACCEL(1, 1, 0, D3D11, D3D11VA, ff_ ## codec ## _d3d11va2_hwaccel) @@ -15922,7 +19577,7 @@ Upstream-status: Pending @@ -24,6 +24,9 @@ * 
MMAL Video Decoder */ - + +#pragma GCC diagnostic push +// Many many redundant decls in the header files +#pragma GCC diagnostic ignored "-Wredundant-decls" @@ -15935,12 +19590,12 @@ Upstream-status: Pending #include +#pragma GCC diagnostic pop #include - + #include "avcodec.h" --- a/libavcodec/pthread_frame.c +++ b/libavcodec/pthread_frame.c @@ -191,7 +191,8 @@ static attribute_align_arg void *frame_w - + /* if the previous thread uses hwaccel then we take the lock to ensure * the threads don't run concurrently */ - if (avctx->hwaccel) { @@ -15950,9 +19605,9 @@ Upstream-status: Pending p->hwaccel_serializing = 1; } @@ -614,7 +615,9 @@ void ff_thread_finish_setup(AVCodecConte - + if (!(avctx->active_thread_type&FF_THREAD_FRAME)) return; - + - if (avctx->hwaccel && !p->hwaccel_serializing) { + if (avctx->hwaccel && + !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE) && @@ -15965,7 +19620,7 @@ Upstream-status: Pending @@ -293,6 +293,12 @@ const PixelFormatTag ff_raw_pix_fmt_tags { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */ { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */ - + + /* RPI (Might as well define for everything) */ + { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') }, + { AV_PIX_FMT_RPI4_8, MKTAG('S', 'A', 'N', 'D') }, @@ -15974,13 +19629,13 @@ Upstream-status: Pending + { AV_PIX_FMT_NONE, 0 }, }; - + --- a/libavcodec/rawenc.c +++ b/libavcodec/rawenc.c @@ -24,6 +24,7 @@ * Raw Video Encoder */ - + +#include "config.h" #include "avcodec.h" #include "raw.h" @@ -15993,13 +19648,13 @@ Upstream-status: Pending +#if CONFIG_SAND +#include "libavutil/rpi_sand_fns.h" +#endif - + static av_cold int raw_encode_init(AVCodecContext *avctx) { @@ -49,22 +54,114 @@ FF_ENABLE_DEPRECATION_WARNINGS return 0; } - + +#if CONFIG_SAND +static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, + const AVFrame *frame) @@ -16080,7 +19735,7 @@ Upstream-status: Pending - frame->width, frame->height, 1); + int ret; + AVFrame * frame = NULL; - + - if (ret < 0) +#if CONFIG_SAND + if (av_rpi_is_sand_frame(src_frame)) { @@ -16104,7 +19759,7 @@ Upstream-status: Pending + frame->width, frame->height, 1); + if (ret < 0) + goto fail; - + if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0) - return ret; + goto fail; @@ -16114,7 +19769,7 @@ Upstream-status: Pending frame->width, frame->height, 1)) < 0) - return ret; + goto fail; - + if(avctx->codec_tag == AV_RL32("yuv2") && ret > 0 && frame->format == AV_PIX_FMT_YUYV422) { @@ -81,8 +178,14 @@ static int raw_encode(AVCodecContext *av @@ -16130,7 +19785,7 @@ Upstream-status: Pending + *got_packet = 0; + return ret; } - + AVCodec ff_rawvideo_encoder = { --- /dev/null +++ b/libavcodec/rpi_hevc_cabac.c @@ -17637,7 +21292,7 @@ Upstream-status: Pending + const xy_off_t * const scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2]; + + int use_vpu; -+#if RPI_COMPRESS_COEFFS ++#if RPI_COMPRESS_COEFFS + int num_nonzero = 0; + int use_compress = 0; + int *coeffs32; @@ -17979,7 +21634,7 @@ Upstream-status: Pending + } + use_compress = 0; + } -+#endif ++#endif + + if (nb_significant_coeff_flag != 0) { + const unsigned int gt1_idx_delta = (c_idx_nz << 2) | @@ -18055,7 +21710,7 @@ Upstream-status: Pending + scale, + i == 0 && xy_off->coeff == 0 ? 
dc_scale : scale_m, + shift); -+#if RPI_COMPRESS_COEFFS ++#if RPI_COMPRESS_COEFFS + if (use_compress) + coeffs32[num_nonzero++] = (res<<16) + (&blk_coeffs[xy_off->coeff] - coeffs); + else @@ -18247,11 +21902,11 @@ Upstream-status: Pending +#endif + + if (!use_dc) { -+#if RPI_COMPRESS_COEFFS ++#if RPI_COMPRESS_COEFFS + if (use_compress) { + coeffs32[num_nonzero] = 0; + } -+#endif ++#endif + rpi_add_residual(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs); + } +} @@ -29840,7 +33495,7 @@ Upstream-status: Pending + unsigned int i; + for (i = 0; i != 4; ++i) { + cf->s[i].n = 0; -+#if RPI_COMPRESS_COEFFS ++#if RPI_COMPRESS_COEFFS + cf->s[i].packed = 1; + cf->s[i].packed_n = 0; +#endif @@ -46104,76 +49759,104 @@ Upstream-status: Pending @@ -21,6 +21,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ - + +#include #include #include #include -@@ -30,12 +31,14 @@ +@@ -29,57 +30,82 @@ + #include #include "libavcodec/avcodec.h" #include "libavcodec/internal.h" ++#include "libavutil/avassert.h" #include "libavutil/pixdesc.h" +#include "libavutil/hwcontext.h" #include "v4l2_context.h" #include "v4l2_buffers.h" #include "v4l2_m2m.h" +#include "weak_link.h" - + #define USEC_PER_SEC 1000000 -static AVRational v4l2_timebase = { 1, USEC_PER_SEC }; +static const AVRational v4l2_timebase = { 1, USEC_PER_SEC }; - - static inline V4L2m2mContext *buf_to_m2mctx(V4L2Buffer *buf) + +-static inline V4L2m2mContext *buf_to_m2mctx(V4L2Buffer *buf) ++static inline V4L2m2mContext *buf_to_m2mctx(const V4L2Buffer * const buf) { -@@ -52,34 +55,44 @@ static inline AVCodecContext *logger(V4L - static inline AVRational v4l2_get_timebase(V4L2Buffer *avbuf) + return V4L2_TYPE_IS_OUTPUT(buf->context->type) ? + container_of(buf->context, V4L2m2mContext, output) : + container_of(buf->context, V4L2m2mContext, capture); + } + +-static inline AVCodecContext *logger(V4L2Buffer *buf) ++static inline AVCodecContext *logger(const V4L2Buffer * const buf) { - V4L2m2mContext *s = buf_to_m2mctx(avbuf); + return buf_to_m2mctx(buf)->avctx; + } + +-static inline AVRational v4l2_get_timebase(V4L2Buffer *avbuf) ++static inline AVRational v4l2_get_timebase(const V4L2Buffer * const avbuf) + { +- V4L2m2mContext *s = buf_to_m2mctx(avbuf); - - if (s->avctx->pkt_timebase.num) - return s->avctx->pkt_timebase; - return s->avctx->time_base; ++ const V4L2m2mContext *s = buf_to_m2mctx(avbuf); + const AVRational tb = s->avctx->pkt_timebase.num ? + s->avctx->pkt_timebase : + s->avctx->time_base; + return tb.num && tb.den ? tb : v4l2_timebase; } - + -static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts) -+static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts, int no_rescale) ++static inline struct timeval tv_from_int(const int64_t t) { - int64_t v4l2_pts; -- ++ return (struct timeval){ ++ .tv_usec = t % USEC_PER_SEC, ++ .tv_sec = t / USEC_PER_SEC ++ }; ++} + - if (pts == AV_NOPTS_VALUE) - pts = 0; -- ++static inline int64_t int_from_tv(const struct timeval t) ++{ ++ return (int64_t)t.tv_sec * USEC_PER_SEC + t.tv_usec; ++} + ++static inline void v4l2_set_pts(V4L2Buffer * const out, const int64_t pts) ++{ /* convert pts to v4l2 timebase */ - v4l2_pts = av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase); +- out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC; +- out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC; + const int64_t v4l2_pts = -+ no_rescale ? pts : + pts == AV_NOPTS_VALUE ? 
0 : + av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase); - out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC; - out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC; ++ out->buf.timestamp = tv_from_int(v4l2_pts); } - + -static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf) -+static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf, int no_rescale) ++static inline int64_t v4l2_get_pts(const V4L2Buffer * const avbuf) { - int64_t v4l2_pts; - ++ const int64_t v4l2_pts = int_from_tv(avbuf->buf.timestamp); ++ return v4l2_pts != 0 ? v4l2_pts : AV_NOPTS_VALUE; ++#if 0 /* convert pts back to encoder timebase */ - v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC + -+ const int64_t v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC + - avbuf->buf.timestamp.tv_usec; - -- return av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf)); +- avbuf->buf.timestamp.tv_usec; + return -+ no_rescale ? v4l2_pts : ++ avbuf->context->no_pts_rescale ? v4l2_pts : + v4l2_pts == 0 ? AV_NOPTS_VALUE : + av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf)); ++#endif +} -+ + +- return av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf)); +static void set_buf_length(V4L2Buffer *out, unsigned int plane, uint32_t bytesused, uint32_t length) +{ + if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) { @@ -46184,12 +49867,15 @@ Upstream-status: Pending + out->buf.length = length; + } } - + static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf) -@@ -116,6 +129,105 @@ static enum AVColorPrimaries v4l2_get_co +@@ -116,49 +142,176 @@ static enum AVColorPrimaries v4l2_get_co return AVCOL_PRI_UNSPECIFIED; } - + +-static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf) +-{ +- enum v4l2_quantization qt; +static void v4l2_set_color(V4L2Buffer *buf, + const enum AVColorPrimaries avcp, + const enum AVColorSpace avcs, @@ -46230,7 +49916,10 @@ Upstream-status: Pending + default: + break; + } -+ + +- qt = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ? +- buf->context->format.fmt.pix_mp.quantization : +- buf->context->format.fmt.pix.quantization; + switch (avcs) { + case AVCOL_SPC_RGB: + cs = V4L2_COLORSPACE_SRGB; @@ -46260,7 +49949,10 @@ Upstream-status: Pending + default: + break; + } -+ + +- switch (qt) { +- case V4L2_QUANTIZATION_LIM_RANGE: return AVCOL_RANGE_MPEG; +- case V4L2_QUANTIZATION_FULL_RANGE: return AVCOL_RANGE_JPEG; + switch (xfer) { + case AVCOL_TRC_BT709: + xfer = V4L2_XFER_FUNC_709; @@ -46274,10 +49966,11 @@ Upstream-status: Pending + case AVCOL_TRC_SMPTE2084: + xfer = V4L2_XFER_FUNC_SMPTE2084; + break; -+ default: -+ break; -+ } -+ + default: + break; + } + +- return AVCOL_RANGE_UNSPECIFIED; + if (V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type)) { + buf->context->format.fmt.pix_mp.colorspace = cs; + buf->context->format.fmt.pix_mp.ycbcr_enc = ycbcr; @@ -46287,15 +49980,58 @@ Upstream-status: Pending + buf->context->format.fmt.pix.ycbcr_enc = ycbcr; + buf->context->format.fmt.pix.xfer_func = xfer; + } + } + +-static enum AVColorSpace v4l2_get_color_space(V4L2Buffer *buf) ++static inline enum v4l2_quantization ++buf_quantization(const V4L2Buffer * const buf) + { +- enum v4l2_ycbcr_encoding ycbcr; +- enum v4l2_colorspace cs; ++ return V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ? ++ buf->context->format.fmt.pix_mp.quantization : ++ buf->context->format.fmt.pix.quantization; ++} + +- cs = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ? ++static inline enum v4l2_colorspace ++buf_colorspace(const V4L2Buffer * const buf) ++{ ++ return V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ? 
+ buf->context->format.fmt.pix_mp.colorspace : + buf->context->format.fmt.pix.colorspace; ++} + +- ycbcr = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ? ++static inline enum v4l2_ycbcr_encoding ++buf_ycbcr_enc(const V4L2Buffer * const buf) ++{ ++ return V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ? + buf->context->format.fmt.pix_mp.ycbcr_enc: + buf->context->format.fmt.pix.ycbcr_enc; ++} + +- switch(cs) { +- case V4L2_COLORSPACE_SRGB: return AVCOL_SPC_RGB; ++static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf) ++{ ++ switch (buf_quantization(buf)) { ++ case V4L2_QUANTIZATION_LIM_RANGE: ++ return AVCOL_RANGE_MPEG; ++ case V4L2_QUANTIZATION_FULL_RANGE: ++ return AVCOL_RANGE_JPEG; ++ case V4L2_QUANTIZATION_DEFAULT: ++ // If YUV (which we assume for all video decode) then, from the header ++ // comments, range is limited unless CS is JPEG ++ return buf_colorspace(buf) == V4L2_COLORSPACE_JPEG ? ++ AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG; ++ default: ++ break; ++ } ++ ++ return AVCOL_RANGE_UNSPECIFIED; +} + - static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf) - { - enum v4l2_quantization qt; -@@ -134,6 +246,20 @@ static enum AVColorRange v4l2_get_color_ - return AVCOL_RANGE_UNSPECIFIED; - } - +static void v4l2_set_color_range(V4L2Buffer *buf, const enum AVColorRange avcr) +{ + const enum v4l2_quantization q = @@ -46310,13 +50046,51 @@ Upstream-status: Pending + } +} + - static enum AVColorSpace v4l2_get_color_space(V4L2Buffer *buf) ++static enum AVColorSpace v4l2_get_color_space(V4L2Buffer *buf) ++{ ++ switch (buf_colorspace(buf)) { ++ case V4L2_COLORSPACE_JPEG: // JPEG -> SRGB ++ case V4L2_COLORSPACE_SRGB: ++ return AVCOL_SPC_RGB; + case V4L2_COLORSPACE_REC709: return AVCOL_SPC_BT709; + case V4L2_COLORSPACE_470_SYSTEM_M: return AVCOL_SPC_FCC; + case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_SPC_BT470BG; + case V4L2_COLORSPACE_SMPTE170M: return AVCOL_SPC_SMPTE170M; + case V4L2_COLORSPACE_SMPTE240M: return AVCOL_SPC_SMPTE240M; + case V4L2_COLORSPACE_BT2020: +- if (ycbcr == V4L2_YCBCR_ENC_BT2020_CONST_LUM) +- return AVCOL_SPC_BT2020_CL; +- else +- return AVCOL_SPC_BT2020_NCL; ++ return buf_ycbcr_enc(buf) == V4L2_YCBCR_ENC_BT2020_CONST_LUM ? ++ AVCOL_SPC_BT2020_CL : AVCOL_SPC_BT2020_NCL; + default: + break; + } +@@ -168,17 +321,9 @@ static enum AVColorSpace v4l2_get_color_ + + static enum AVColorTransferCharacteristic v4l2_get_color_trc(V4L2Buffer *buf) { - enum v4l2_ycbcr_encoding ycbcr; -@@ -210,73 +336,165 @@ static enum AVColorTransferCharacteristi +- enum v4l2_ycbcr_encoding ycbcr; ++ const enum v4l2_ycbcr_encoding ycbcr = buf_ycbcr_enc(buf); + enum v4l2_xfer_func xfer; +- enum v4l2_colorspace cs; +- +- cs = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ? +- buf->context->format.fmt.pix_mp.colorspace : +- buf->context->format.fmt.pix.colorspace; +- +- ycbcr = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ? +- buf->context->format.fmt.pix_mp.ycbcr_enc: +- buf->context->format.fmt.pix.ycbcr_enc; ++ const enum v4l2_colorspace cs = buf_colorspace(buf); + + xfer = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ? 
+ buf->context->format.fmt.pix_mp.xfer_func: +@@ -210,73 +355,165 @@ static enum AVColorTransferCharacteristi return AVCOL_TRC_UNSPECIFIED; } - + -static void v4l2_free_buffer(void *opaque, uint8_t *unused) +static int v4l2_buf_is_interlaced(const V4L2Buffer * const buf) { @@ -46327,7 +50101,7 @@ Upstream-status: Pending - atomic_fetch_sub_explicit(&s->refcount, 1, memory_order_acq_rel); + return V4L2_FIELD_IS_INTERLACED(buf->buf.field); +} - + - if (s->reinit) { - if (!atomic_load(&s->refcount)) - sem_post(&s->refsync); @@ -46343,7 +50117,7 @@ Upstream-status: Pending +{ + return buf->buf.field == V4L2_FIELD_INTERLACED_TB; +} - + - av_buffer_unref(&avbuf->context_ref); - } +static void v4l2_set_interlace(V4L2Buffer * const buf, const int is_interlaced, const int is_tff) @@ -46351,14 +50125,14 @@ Upstream-status: Pending + buf->buf.field = !is_interlaced ? V4L2_FIELD_NONE : + is_tff ? V4L2_FIELD_INTERLACED_TB : V4L2_FIELD_INTERLACED_BT; } - + -static int v4l2_buf_increase_ref(V4L2Buffer *in) +static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf) { - V4L2m2mContext *s = buf_to_m2mctx(in); + AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; + AVDRMLayerDescriptor *layer; - + - if (in->context_ref) - atomic_fetch_add(&in->context_refcount, 1); - else { @@ -46368,7 +50142,7 @@ Upstream-status: Pending + /* fill the DRM frame descriptor */ + drm_desc->nb_objects = avbuf->num_planes; + drm_desc->nb_layers = 1; - + - in->context_refcount = 1; + layer = &drm_desc->layers[0]; + layer->nb_planes = avbuf->num_planes; @@ -46378,7 +50152,7 @@ Upstream-status: Pending + layer->planes[i].offset = 0; + layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; } - + - in->status = V4L2BUF_RET_USER; - atomic_fetch_add_explicit(&s->refcount, 1, memory_order_relaxed); + switch (avbuf->context->av_pix_fmt) { @@ -46386,7 +50160,7 @@ Upstream-status: Pending + + layer->format = DRM_FORMAT_YUYV; + layer->nb_planes = 1; - + - return 0; + break; + @@ -46435,7 +50209,7 @@ Upstream-status: Pending + + return (uint8_t *) drm_desc; } - + -static int v4l2_buf_to_bufref(V4L2Buffer *in, int plane, AVBufferRef **buf) +static void v4l2_free_bufref(void *opaque, uint8_t *data) { @@ -46443,25 +50217,25 @@ Upstream-status: Pending + AVBufferRef * bufref = (AVBufferRef *)data; + V4L2Buffer *avbuf = (V4L2Buffer *)bufref->data; + struct V4L2Context *ctx = ff_weak_link_lock(&avbuf->context_wl); - + - if (plane >= in->num_planes) - return AVERROR(EINVAL); + if (ctx != NULL) { + // Buffer still attached to context + V4L2m2mContext *s = buf_to_m2mctx(avbuf); - + - /* even though most encoders return 0 in data_offset encoding vp8 does require this value */ - *buf = av_buffer_create((char *)in->plane_info[plane].mm_addr + in->planes[plane].data_offset, - in->plane_info[plane].length, v4l2_free_buffer, in, 0); - if (!*buf) - return AVERROR(ENOMEM); + ff_mutex_lock(&ctx->lock); - + - ret = v4l2_buf_increase_ref(in); - if (ret) - av_buffer_unref(buf); -+ avbuf->status = V4L2BUF_AVAILABLE; - ++ ff_v4l2_buffer_set_avail(avbuf); + - return ret; + if (s->draining && V4L2_TYPE_IS_OUTPUT(ctx->type)) { + av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer avail\n", ctx->name); @@ -46482,8 +50256,9 @@ Upstream-status: Pending + + ff_weak_link_unlock(avbuf->context_wl); + av_buffer_unref(&bufref); -+} -+ + } + +-static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset, AVBufferRef* bref) +static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) +{ + struct v4l2_exportbuffer expbuf; @@ -46514,20 +50289,19 @@ 
Upstream-status: Pending + } + + return 0; - } - --static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset, AVBufferRef* bref) ++} ++ +static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset) { unsigned int bytesused, length; + int rv = 0; - + if (plane >= out->num_planes) return AVERROR(EINVAL); -@@ -284,32 +502,57 @@ static int v4l2_bufref_to_buf(V4L2Buffer +@@ -284,32 +521,57 @@ static int v4l2_bufref_to_buf(V4L2Buffer length = out->plane_info[plane].length; bytesused = FFMIN(size+offset, length); - + - memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, FFMIN(size, length-offset)); - - if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) { @@ -46540,7 +50314,7 @@ Upstream-status: Pending + size = length - offset; + rv = AVERROR(ENOMEM); } - + - return 0; + memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, size); + @@ -46564,14 +50338,14 @@ Upstream-status: Pending + avbuf->status = V4L2BUF_RET_USER; + return newbuf; } - + static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) { - int i, ret; + int i; - + frame->format = avbuf->context->av_pix_fmt; - + - for (i = 0; i < avbuf->num_planes; i++) { - ret = v4l2_buf_to_bufref(avbuf, i, &frame->buf[i]); - if (ret) @@ -46579,7 +50353,7 @@ Upstream-status: Pending + frame->buf[0] = wrap_avbuf(avbuf); + if (frame->buf[0] == NULL) + return AVERROR(ENOMEM); - ++ + if (buf_to_m2mctx(avbuf)->output_drm) { + /* 1. get references to the actual data */ + frame->data[0] = (uint8_t *) v4l2_get_drm_frame(avbuf); @@ -46587,7 +50361,7 @@ Upstream-status: Pending + frame->hw_frames_ctx = av_buffer_ref(avbuf->context->frames_ref); + return 0; + } -+ + + + /* 1. get references to the actual data */ + for (i = 0; i < avbuf->num_planes; i++) { @@ -46595,9 +50369,9 @@ Upstream-status: Pending frame->linesize[i] = avbuf->plane_info[i].bytesperline; - frame->data[i] = frame->buf[i]->data; } - + /* fixup special cases */ -@@ -318,17 +561,17 @@ static int v4l2_buffer_buf_to_swframe(AV +@@ -318,17 +580,17 @@ static int v4l2_buffer_buf_to_swframe(AV case AV_PIX_FMT_NV21: if (avbuf->num_planes > 1) break; @@ -46606,7 +50380,7 @@ Upstream-status: Pending + frame->linesize[1] = frame->linesize[0]; + frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format); break; - + case AV_PIX_FMT_YUV420P: if (avbuf->num_planes > 1) break; @@ -46619,12 +50393,12 @@ Upstream-status: Pending + frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format); + frame->data[2] = frame->data[1] + frame->linesize[1] * ff_v4l2_get_format_height(&avbuf->context->format) / 2; break; - + default: -@@ -338,68 +581,95 @@ static int v4l2_buffer_buf_to_swframe(AV +@@ -338,68 +600,127 @@ static int v4l2_buffer_buf_to_swframe(AV return 0; } - + +static void cpy_2d(uint8_t * dst, int dst_stride, const uint8_t * src, int src_stride, int w, int h) +{ + if (dst_stride == src_stride && w + 32 >= dst_stride) { @@ -46643,6 +50417,38 @@ Upstream-status: Pending +{ + return i != 0 && !(i == num_planes - 1 && (desc->flags & AV_PIX_FMT_FLAG_ALPHA)); +} ++ ++static int v4l2_buffer_primeframe_to_buf(const AVFrame *frame, V4L2Buffer *out) ++{ ++ const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0]; ++ ++ if (frame->format != AV_PIX_FMT_DRM_PRIME || !src) ++ return AVERROR(EINVAL); ++ ++ av_assert0(out->buf.memory == V4L2_MEMORY_DMABUF); ++ ++ if 
(V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) { ++ // Only currently cope with single buffer types ++ if (out->buf.length != 1) ++ return AVERROR_PATCHWELCOME; ++ if (src->nb_objects != 1) ++ return AVERROR(EINVAL); ++ ++ out->planes[0].m.fd = src->objects[0].fd; ++ } ++ else { ++ if (src->nb_objects != 1) ++ return AVERROR(EINVAL); ++ ++ out->buf.m.fd = src->objects[0].fd; ++ } ++ ++ // No need to copy src AVDescriptor and if we did then we may confuse ++ // fd close on free ++ out->ref_buf = av_buffer_ref(frame->buf[0]); ++ ++ return 0; ++} + static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) { @@ -46756,7 +50562,7 @@ Upstream-status: Pending + av_log(NULL, AV_LOG_ERROR, "%s: Plane total %u > buffer size %zu\n", __func__, offset, out->plane_info[0].length); + return -1; + } - + - for (i = 0; i < out->num_planes; i++) { - ret = v4l2_bufref_to_buf(out, i, frame->buf[i]->data, frame->buf[i]->size, 0, frame->buf[i]); - if (ret) @@ -46770,42 +50576,60 @@ Upstream-status: Pending - return 0; } - -@@ -411,14 +681,22 @@ static int v4l2_buffer_swframe_to_buf(co - - int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out) + +@@ -409,16 +730,31 @@ static int v4l2_buffer_swframe_to_buf(co + * + ******************************************************************************/ + +-int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out) ++int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts) { - v4l2_set_pts(out, frame->pts); -+ out->buf.flags = frame->key_frame ? (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME) : (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME); +- +- return v4l2_buffer_swframe_to_buf(frame, out); ++ out->buf.flags = frame->key_frame ? ++ (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) : ++ (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME); + // Beware that colour info is held in format rather than the actual + // v4l2 buffer struct so this may not be as useful as you might hope + v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc); + v4l2_set_color_range(out, frame->color_range); + // PTS & interlace are buffer vars -+ v4l2_set_pts(out, frame->pts, 0); ++ if (track_ts) ++ out->buf.timestamp = tv_from_int(track_ts); ++ else ++ v4l2_set_pts(out, frame->pts); + v4l2_set_interlace(out, frame->interlaced_frame, frame->top_field_first); - - return v4l2_buffer_swframe_to_buf(frame, out); ++ ++ return frame->format == AV_PIX_FMT_DRM_PRIME ? ++ v4l2_buffer_primeframe_to_buf(frame, out) : ++ v4l2_buffer_swframe_to_buf(frame, out); } - --int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) -+int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf, int no_rescale_pts) + + int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) { int ret; + V4L2Context * const ctx = avbuf->context; - + av_frame_unref(frame); - -@@ -433,13 +711,24 @@ int ff_v4l2_buffer_buf_to_avframe(AVFram + +@@ -429,17 +765,32 @@ int ff_v4l2_buffer_buf_to_avframe(AVFram + + /* 2. get frame information */ + frame->key_frame = !!(avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME); ++ frame->pict_type = frame->key_frame ? AV_PICTURE_TYPE_I : ++ (avbuf->buf.flags & V4L2_BUF_FLAG_PFRAME) != 0 ? AV_PICTURE_TYPE_P : ++ (avbuf->buf.flags & V4L2_BUF_FLAG_BFRAME) != 0 ? 
AV_PICTURE_TYPE_B : ++ AV_PICTURE_TYPE_NONE; + frame->color_primaries = v4l2_get_color_primaries(avbuf); frame->colorspace = v4l2_get_color_space(avbuf); frame->color_range = v4l2_get_color_range(avbuf); frame->color_trc = v4l2_get_color_trc(avbuf); -- frame->pts = v4l2_get_pts(avbuf); -+ frame->pts = v4l2_get_pts(avbuf, no_rescale_pts); + frame->pts = v4l2_get_pts(avbuf); frame->pkt_dts = AV_NOPTS_VALUE; + frame->interlaced_frame = v4l2_buf_is_interlaced(avbuf); + frame->top_field_first = v4l2_buf_is_top_first(avbuf); - + /* these values are updated also during re-init in v4l2_process_driver_event */ - frame->height = avbuf->context->height; - frame->width = avbuf->context->width; @@ -46820,18 +50644,17 @@ Upstream-status: Pending + frame->crop_right = ctx->selection.left + ctx->selection.width < frame->width ? + frame->width - (ctx->selection.left + ctx->selection.width) : 0; + frame->crop_bottom = ctx->selection.top + ctx->selection.height < frame->height ? -+ frame->width - (ctx->selection.top + ctx->selection.height) : 0; ++ frame->height - (ctx->selection.top + ctx->selection.height) : 0; + } - + /* 3. report errors upstream */ if (avbuf->buf.flags & V4L2_BUF_FLAG_ERROR) { -@@ -452,15 +741,16 @@ int ff_v4l2_buffer_buf_to_avframe(AVFram - +@@ -452,15 +803,15 @@ int ff_v4l2_buffer_buf_to_avframe(AVFram + int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf) { - int ret; -+ av_log(logger(avbuf), AV_LOG_INFO, "%s\n", __func__); - +- av_packet_unref(pkt); - ret = v4l2_buf_to_bufref(avbuf, 0, &pkt->buf); - if (ret) @@ -46840,29 +50663,25 @@ Upstream-status: Pending + pkt->buf = wrap_avbuf(avbuf); + if (pkt->buf == NULL) + return AVERROR(ENOMEM); - + pkt->size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type) ? avbuf->buf.m.planes[0].bytesused : avbuf->buf.bytesused; - pkt->data = pkt->buf->data; + pkt->data = (uint8_t*)avbuf->plane_info[0].mm_addr + avbuf->planes[0].data_offset; - ++ pkt->flags = 0; + if (avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME) pkt->flags |= AV_PKT_FLAG_KEY; -@@ -470,36 +760,89 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket - pkt->flags |= AV_PKT_FLAG_CORRUPT; - } - -- pkt->dts = pkt->pts = v4l2_get_pts(avbuf); -+ pkt->dts = pkt->pts = v4l2_get_pts(avbuf, 0); - +@@ -475,31 +826,91 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket return 0; } - + -int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) -+int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, -+ const void *extdata, size_t extlen, int no_rescale_pts) ++int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out, ++ const void *extdata, size_t extlen, ++ const int64_t timestamp) { int ret; - + - ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, 0, pkt->buf); - if (ret) + if (extlen) { @@ -46874,18 +50693,23 @@ Upstream-status: Pending + ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, extlen); + if (ret && ret != AVERROR(ENOMEM)) return ret; - + - v4l2_set_pts(out, pkt->pts); -+ v4l2_set_pts(out, pkt->pts, no_rescale_pts); - - if (pkt->flags & AV_PKT_FLAG_KEY) - out->flags = V4L2_BUF_FLAG_KEYFRAME; - -- return 0; ++ if (timestamp) ++ out->buf.timestamp = tv_from_int(timestamp); ++ else ++ v4l2_set_pts(out, pkt->pts); ++ ++ out->buf.flags = (pkt->flags & AV_PKT_FLAG_KEY) != 0 ? 
++ (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) : ++ (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME); + +- if (pkt->flags & AV_PKT_FLAG_KEY) +- out->flags = V4L2_BUF_FLAG_KEYFRAME; + return ret; - } - --int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) ++} + +- return 0; +int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) +{ + return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0, 0); @@ -46908,23 +50732,27 @@ Upstream-status: Pending + close(avbuf->drm_frame.objects[i].fd); + } + ++ av_buffer_unref(&avbuf->ref_buf); ++ + ff_weak_link_unref(&avbuf->context_wl); + + av_free(avbuf); -+} + } + +-int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) + -+ -+int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx) ++int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx, enum v4l2_memory mem) { - V4L2Context *ctx = avbuf->context; int ret, i; + V4L2Buffer * const avbuf = av_mallocz(sizeof(*avbuf)); + AVBufferRef * bufref; -+ + +- avbuf->buf.memory = V4L2_MEMORY_MMAP; + *pbufref = NULL; + if (avbuf == NULL) + return AVERROR(ENOMEM); - ++ + bufref = av_buffer_create((uint8_t*)avbuf, sizeof(*avbuf), v4l2_buffer_buffer_free, NULL, 0); + if (bufref == NULL) { + av_free(avbuf); @@ -46932,10 +50760,10 @@ Upstream-status: Pending + } + + avbuf->context = ctx; - avbuf->buf.memory = V4L2_MEMORY_MMAP; ++ avbuf->buf.memory = mem; avbuf->buf.type = ctx->type; avbuf->buf.index = index; - + + for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) { + avbuf->drm_frame.objects[i].fd = -1; + } @@ -46945,43 +50773,48 @@ Upstream-status: Pending if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { avbuf->buf.length = VIDEO_MAX_PLANES; avbuf->buf.m.planes = avbuf->planes; -@@ -507,7 +850,7 @@ int ff_v4l2_buffer_initialize(V4L2Buffer - +@@ -507,7 +918,7 @@ int ff_v4l2_buffer_initialize(V4L2Buffer + ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QUERYBUF, &avbuf->buf); if (ret < 0) - return AVERROR(errno); + goto fail; - + if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { avbuf->num_planes = 0; -@@ -527,25 +870,33 @@ int ff_v4l2_buffer_initialize(V4L2Buffer - +@@ -520,6 +931,8 @@ int ff_v4l2_buffer_initialize(V4L2Buffer + avbuf->num_planes = 1; + + for (i = 0; i < avbuf->num_planes; i++) { ++ const int want_mmap = avbuf->buf.memory == V4L2_MEMORY_MMAP && ++ (V4L2_TYPE_IS_OUTPUT(ctx->type) || !buf_to_m2mctx(avbuf)->output_drm); + + avbuf->plane_info[i].bytesperline = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? 
+ ctx->format.fmt.pix_mp.plane_fmt[i].bytesperline : +@@ -527,25 +940,29 @@ int ff_v4l2_buffer_initialize(V4L2Buffer + if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length; - avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, - PROT_READ | PROT_WRITE, MAP_SHARED, - buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset); + -+ if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) || -+ !buf_to_m2mctx(avbuf)->output_drm) { ++ if (want_mmap) + avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, + PROT_READ | PROT_WRITE, MAP_SHARED, + buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset); -+ } } else { avbuf->plane_info[i].length = avbuf->buf.length; - avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length, - PROT_READ | PROT_WRITE, MAP_SHARED, - buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset); + -+ if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) || -+ !buf_to_m2mctx(avbuf)->output_drm) { ++ if (want_mmap) + avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length, + PROT_READ | PROT_WRITE, MAP_SHARED, + buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset); -+ } } - + - if (avbuf->plane_info[i].mm_addr == MAP_FAILED) - return AVERROR(ENOMEM); + if (avbuf->plane_info[i].mm_addr == MAP_FAILED) { @@ -46990,19 +50823,19 @@ Upstream-status: Pending + goto fail; + } } - + avbuf->status = V4L2BUF_AVAILABLE; - + - if (V4L2_TYPE_IS_OUTPUT(ctx->type)) - return 0; - if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { avbuf->buf.m.planes = avbuf->planes; avbuf->buf.length = avbuf->num_planes; -@@ -555,7 +906,20 @@ int ff_v4l2_buffer_initialize(V4L2Buffer +@@ -555,20 +972,51 @@ int ff_v4l2_buffer_initialize(V4L2Buffer avbuf->buf.length = avbuf->planes[0].length; } - + - return ff_v4l2_buffer_enqueue(avbuf); + if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) { + if (buf_to_m2mctx(avbuf)->output_drm) { @@ -47019,19 +50852,20 @@ Upstream-status: Pending + av_buffer_unref(&bufref); + return ret; } - + int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf) -@@ -564,9 +928,27 @@ int ff_v4l2_buffer_enqueue(V4L2Buffer* a - - avbuf->buf.flags = avbuf->flags; - + { + int ret; ++ int qc; + +- avbuf->buf.flags = avbuf->flags; + if (avbuf->buf.timestamp.tv_sec || avbuf->buf.timestamp.tv_usec) { + av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s pre VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n", + avbuf->context->name, avbuf->buf.index, + avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, + avbuf->context->q_count); + } -+ + ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QBUF, &avbuf->buf); - if (ret < 0) - return AVERROR(errno); @@ -47042,31 +50876,34 @@ Upstream-status: Pending + err, strerror(err)); + return AVERROR(err); + } + ++ // Lock not wanted - if called from buffer free then lock already obtained ++ qc = atomic_fetch_add(&avbuf->context->q_count, 1) + 1; + avbuf->status = V4L2BUF_IN_DRIVER; ++ pthread_cond_broadcast(&avbuf->context->cond); + -+ ++avbuf->context->q_count; + av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n", + avbuf->context->name, avbuf->buf.index, -+ avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, -+ avbuf->context->q_count); - - avbuf->status = V4L2BUF_IN_DRIVER; - ++ avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, qc); + + return 0; + } --- a/libavcodec/v4l2_buffers.h +++ b/libavcodec/v4l2_buffers.h -@@ -27,25 +27,34 @@ +@@ -27,25 +27,38 @@ #include #include - + +#include "libavutil/hwcontext_drm.h" 
#include "avcodec.h" - + enum V4L2Buffer_status { V4L2BUF_AVAILABLE, V4L2BUF_IN_DRIVER, + V4L2BUF_IN_USE, V4L2BUF_RET_USER, }; - + /** * V4L2Buffer (wrapper for v4l2_buffer management) */ @@ -47083,49 +50920,70 @@ Upstream-status: Pending + */ struct V4L2Context *context; + struct ff_weak_link_client *context_wl; - + - /* This object is refcounted per-plane, so we need to keep track - * of how many context-refs we are holding. */ - AVBufferRef *context_ref; - atomic_uint context_refcount; + /* DRM descriptor */ + AVDRMFrameDescriptor drm_frame; - ++ /* For DRM_PRIME encode - need to keep a ref to the source buffer till we ++ * are done ++ */ ++ AVBufferRef * ref_buf; + /* keep track of the mmap address and mmap length */ struct V4L2Plane_info { -@@ -70,11 +79,12 @@ typedef struct V4L2Buffer { - * - * @param[in] frame The AVFRame to push the information to - * @param[in] buf The V4L2Buffer to get the information from -+ * @param[in] no_rescale_pts If non-zero do not rescale PTS - * - * @returns 0 in case of success, AVERROR(EINVAL) if the number of planes is incorrect, - * AVERROR(ENOMEM) if the AVBufferRef can't be created. - */ --int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *buf); -+int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *buf, int no_rescale_pts); - - /** - * Extracts the data from a V4L2Buffer to an AVPacket -@@ -98,6 +108,9 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket +@@ -60,7 +73,6 @@ typedef struct V4L2Buffer { + struct v4l2_buffer buf; + struct v4l2_plane planes[VIDEO_MAX_PLANES]; + +- int flags; + enum V4L2Buffer_status status; + + } V4L2Buffer; +@@ -98,6 +110,10 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket */ int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out); - -+int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, -+ const void *extdata, size_t extlen, int no_rescale_pts); + ++int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out, ++ const void *extdata, size_t extlen, ++ const int64_t timestamp); + /** * Extracts the data from an AVFrame to a V4L2Buffer * -@@ -116,7 +129,7 @@ int ff_v4l2_buffer_avframe_to_buf(const +@@ -106,7 +122,7 @@ int ff_v4l2_buffer_avpkt_to_buf(const AV + * + * @returns 0 in case of success, a negative AVERROR code otherwise + */ +-int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out); ++int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts); + + /** + * Initializes a V4L2Buffer +@@ -116,7 +132,7 @@ int ff_v4l2_buffer_avframe_to_buf(const * * @returns 0 in case of success, a negative AVERROR code otherwise */ -int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index); -+int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx); - ++int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx, enum v4l2_memory mem); + /** * Enqueues a V4L2Buffer +@@ -127,5 +143,12 @@ int ff_v4l2_buffer_initialize(V4L2Buffer + */ + int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf); + ++static inline void ++ff_v4l2_buffer_set_avail(V4L2Buffer* const avbuf) ++{ ++ avbuf->status = V4L2BUF_AVAILABLE; ++ av_buffer_unref(&avbuf->ref_buf); ++} ++ + + #endif // AVCODEC_V4L2_BUFFERS_H --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -27,11 +27,13 @@ @@ -47139,41 +50997,233 @@ Upstream-status: Pending #include "v4l2_fmt.h" #include "v4l2_m2m.h" +#include "weak_link.h" - + struct v4l2_format_update { uint32_t v4l2_fmt; -@@ -53,16 +55,6 @@ static 
inline AVCodecContext *logger(V4L - return ctx_to_m2mctx(ctx)->avctx; - } - --static inline unsigned int v4l2_get_width(struct v4l2_format *fmt) --{ -- return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width; --} -- --static inline unsigned int v4l2_get_height(struct v4l2_format *fmt) --{ -- return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height; --} -- - static AVRational v4l2_get_sar(V4L2Context *ctx) +@@ -41,26 +43,168 @@ struct v4l2_format_update { + int update_avfmt; + }; + +-static inline V4L2m2mContext *ctx_to_m2mctx(V4L2Context *ctx) ++ ++static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n) { - struct AVRational sar = { 0, 1 }; -@@ -94,8 +86,8 @@ static inline unsigned int v4l2_resoluti +- return V4L2_TYPE_IS_OUTPUT(ctx->type) ? +- container_of(ctx, V4L2m2mContext, output) : +- container_of(ctx, V4L2m2mContext, capture); ++ return (int64_t)n; + } + +-static inline AVCodecContext *logger(V4L2Context *ctx) ++static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts) + { +- return ctx_to_m2mctx(ctx)->avctx; ++ return (unsigned int)pts; + } + +-static inline unsigned int v4l2_get_width(struct v4l2_format *fmt) ++// FFmpeg requires us to propagate a number of vars from the coded pkt into ++// the decoded frame. The only thing that tracks like that in V4L2 stateful ++// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no ++// guarantees about PTS being unique or specified for every frame so replace ++// the supplied PTS with a simple incrementing number and keep a circular ++// buffer of all the things we want preserved (including the original PTS) ++// indexed by the tracking no. ++static int64_t ++xlat_pts_pkt_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVPacket *const avpkt) + { +- return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width; ++ int64_t track_pts; ++ ++ // Avoid 0 ++ if (++x->track_no == 0) ++ x->track_no = 1; ++ ++ track_pts = track_to_pts(avctx, x->track_no); ++ ++ av_log(avctx, AV_LOG_TRACE, "In pkt PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no); ++ x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ ++ .discard = 0, ++ .pending = 1, ++ .pkt_size = avpkt->size, ++ .pts = avpkt->pts, ++ .dts = avpkt->dts, ++ .reordered_opaque = avctx->reordered_opaque, ++ .pkt_pos = avpkt->pos, ++ .pkt_duration = avpkt->duration, ++ .track_pts = track_pts ++ }; ++ return track_pts; + } + +-static inline unsigned int v4l2_get_height(struct v4l2_format *fmt) ++static int64_t ++xlat_pts_frame_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVFrame *const frame) + { +- return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? 
fmt->fmt.pix_mp.height : fmt->fmt.pix.height; ++ int64_t track_pts; ++ ++ // Avoid 0 ++ if (++x->track_no == 0) ++ x->track_no = 1; ++ ++ track_pts = track_to_pts(avctx, x->track_no); ++ ++ av_log(avctx, AV_LOG_TRACE, "In frame PTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", frame->pts, track_pts, x->track_no); ++ x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ ++ .discard = 0, ++ .pending = 1, ++ .pkt_size = 0, ++ .pts = frame->pts, ++ .dts = AV_NOPTS_VALUE, ++ .reordered_opaque = frame->reordered_opaque, ++ .pkt_pos = frame->pkt_pos, ++ .pkt_duration = frame->pkt_duration, ++ .track_pts = track_pts ++ }; ++ return track_pts; ++} ++ ++ ++// Returns -1 if we should discard the frame ++static int ++xlat_pts_frame_out(AVCodecContext *const avctx, ++ xlat_track_t * const x, ++ AVFrame *const frame) ++{ ++ unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE; ++ V4L2m2mTrackEl *const t = x->track_els + n; ++ if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts) ++ { ++ av_log(avctx, frame->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING, ++ "Frame tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); ++ frame->pts = AV_NOPTS_VALUE; ++ frame->pkt_dts = AV_NOPTS_VALUE; ++ frame->reordered_opaque = x->last_opaque; ++ frame->pkt_pos = -1; ++ frame->pkt_duration = 0; ++ frame->pkt_size = -1; ++ } ++ else if (!t->discard) ++ { ++ frame->pts = t->pending ? t->pts : AV_NOPTS_VALUE; ++ frame->pkt_dts = t->dts; ++ frame->reordered_opaque = t->reordered_opaque; ++ frame->pkt_pos = t->pkt_pos; ++ frame->pkt_duration = t->pkt_duration; ++ frame->pkt_size = t->pkt_size; ++ ++ x->last_opaque = x->track_els[n].reordered_opaque; ++ if (frame->pts != AV_NOPTS_VALUE) ++ x->last_pts = frame->pts; ++ t->pending = 0; ++ } ++ else ++ { ++ av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); ++ return -1; ++ } ++ ++ av_log(avctx, AV_LOG_TRACE, "Out frame PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n", ++ frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n); ++ return 0; ++} ++ ++// Returns -1 if we should discard the frame ++static int ++xlat_pts_pkt_out(AVCodecContext *const avctx, ++ xlat_track_t * const x, ++ AVPacket *const pkt) ++{ ++ unsigned int n = pts_to_track(avctx, pkt->pts) % FF_V4L2_M2M_TRACK_SIZE; ++ V4L2m2mTrackEl *const t = x->track_els + n; ++ if (pkt->pts == AV_NOPTS_VALUE || pkt->pts != t->track_pts) ++ { ++ av_log(avctx, pkt->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING, ++ "Pkt tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts); ++ pkt->pts = AV_NOPTS_VALUE; ++ } ++ else if (!t->discard) ++ { ++ pkt->pts = t->pending ? t->pts : AV_NOPTS_VALUE; ++ ++ x->last_opaque = x->track_els[n].reordered_opaque; ++ if (pkt->pts != AV_NOPTS_VALUE) ++ x->last_pts = pkt->pts; ++ t->pending = 0; ++ } ++ else ++ { ++ av_log(avctx, AV_LOG_DEBUG, "Discard packet (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts); ++ return -1; ++ } ++ ++ // * Would like something much better than this...xlat(offset + out_count)? ++ pkt->dts = pkt->pts; ++ av_log(avctx, AV_LOG_TRACE, "Out pkt PTS=%" PRId64 ", track=%"PRId64", n=%d\n", ++ pkt->pts, t->track_pts, n); ++ return 0; ++} ++ ++ ++static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx) ++{ ++ return V4L2_TYPE_IS_OUTPUT(ctx->type) ? 
++ container_of(ctx, V4L2m2mContext, output) : ++ container_of(ctx, V4L2m2mContext, capture); ++} ++ ++static inline AVCodecContext *logger(const V4L2Context *ctx) ++{ ++ return ctx_to_m2mctx(ctx)->avctx; + } + + static AVRational v4l2_get_sar(V4L2Context *ctx) +@@ -81,21 +225,29 @@ static AVRational v4l2_get_sar(V4L2Conte + return sar; + } + +-static inline unsigned int v4l2_resolution_changed(V4L2Context *ctx, struct v4l2_format *fmt2) ++static inline int ctx_buffers_alloced(const V4L2Context * const ctx) ++{ ++ return ctx->bufrefs != NULL; ++} ++ ++// Width/Height changed or we don't have an alloc in the first place? ++static int ctx_resolution_changed(const V4L2Context *ctx, const struct v4l2_format *fmt2) + { +- struct v4l2_format *fmt1 = &ctx->format; +- int ret = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? +- fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width || +- fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height +- : +- fmt1->fmt.pix.width != fmt2->fmt.pix.width || +- fmt1->fmt.pix.height != fmt2->fmt.pix.height; ++ const struct v4l2_format *fmt1 = &ctx->format; ++ int ret = !ctx_buffers_alloced(ctx) || ++ (V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ++ fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width || ++ fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height ++ : ++ fmt1->fmt.pix.width != fmt2->fmt.pix.width || ++ fmt1->fmt.pix.height != fmt2->fmt.pix.height); + if (ret) - av_log(logger(ctx), AV_LOG_DEBUG, "%s changed (%dx%d) -> (%dx%d)\n", +- av_log(logger(ctx), AV_LOG_DEBUG, "%s changed (%dx%d) -> (%dx%d)\n", ++ av_log(logger(ctx), AV_LOG_DEBUG, "V4L2 %s changed: alloc=%d (%dx%d) -> (%dx%d)\n", ctx->name, - v4l2_get_width(fmt1), v4l2_get_height(fmt1), - v4l2_get_width(fmt2), v4l2_get_height(fmt2)); ++ ctx_buffers_alloced(ctx), + ff_v4l2_get_format_width(fmt1), ff_v4l2_get_format_height(fmt1), + ff_v4l2_get_format_width(fmt2), ff_v4l2_get_format_height(fmt2)); - + return ret; } -@@ -153,58 +145,67 @@ static inline void v4l2_save_to_context( +@@ -153,90 +305,110 @@ static inline void v4l2_save_to_context( } } - + -/** - * handle resolution change event and end of stream event - * returns 1 if reinit was successful, negative if it failed @@ -47192,7 +51242,7 @@ Upstream-status: Pending + .type = V4L2_BUF_TYPE_VIDEO_CAPTURE, + .target = V4L2_SEL_TGT_COMPOSE + }; - + - ret = ioctl(s->fd, VIDIOC_DQEVENT, &evt); - if (ret < 0) { - av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_DQEVENT\n", ctx->name); @@ -47201,7 +51251,7 @@ Upstream-status: Pending + memset(r, 0, sizeof(*r)); + if (ioctl(s->fd, VIDIOC_G_SELECTION, &selection)) + return AVERROR(errno); - + - if (evt.type == V4L2_EVENT_EOS) { - ctx->done = 1; - return 0; @@ -47209,49 +51259,45 @@ Upstream-status: Pending + *r = selection.r; + return 0; +} - + - if (evt.type != V4L2_EVENT_SOURCE_CHANGE) - return 0; +static int do_source_change(V4L2m2mContext * const s) +{ + AVCodecContext *const avctx = s->avctx; -+ + +- ret = ioctl(s->fd, VIDIOC_G_FMT, &out_fmt); +- if (ret) { +- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->output.name); +- return 0; +- } + int ret; + int reinit; -+ int full_reinit; + struct v4l2_format cap_fmt = s->capture.format; -+ struct v4l2_format out_fmt = s->output.format; + -+ s->resize_pending = 0; + s->capture.done = 0; - - ret = ioctl(s->fd, VIDIOC_G_FMT, &out_fmt); - if (ret) { -- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->output.name); -+ av_log(avctx, AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", s->output.name); - return 0; - } - + ret = ioctl(s->fd, VIDIOC_G_FMT, &cap_fmt); if (ret) { - 
av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->capture.name); + av_log(avctx, AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", s->capture.name); return 0; } - - full_reinit = v4l2_resolution_changed(&s->output, &out_fmt); - if (full_reinit) { + +- full_reinit = v4l2_resolution_changed(&s->output, &out_fmt); +- if (full_reinit) { - s->output.height = v4l2_get_height(&out_fmt); - s->output.width = v4l2_get_width(&out_fmt); - s->output.sample_aspect_ratio = v4l2_get_sar(&s->output); -+ s->output.height = ff_v4l2_get_format_height(&out_fmt); -+ s->output.width = ff_v4l2_get_format_width(&out_fmt); - } -+ s->output.sample_aspect_ratio = v4l2_get_sar(&s->output); -+ +- } + get_default_selection(&s->capture, &s->capture.selection); - - reinit = v4l2_resolution_changed(&s->capture, &cap_fmt); ++ ++ reinit = ctx_resolution_changed(&s->capture, &cap_fmt); ++ if ((s->quirks & FF_V4L2_QUIRK_REINIT_ALWAYS) != 0) ++ reinit = 1; + +- reinit = v4l2_resolution_changed(&s->capture, &cap_fmt); ++ s->capture.format = cap_fmt; if (reinit) { - s->capture.height = v4l2_get_height(&cap_fmt); - s->capture.width = v4l2_get_width(&cap_fmt); @@ -47259,368 +51305,491 @@ Upstream-status: Pending + s->capture.height = ff_v4l2_get_format_height(&cap_fmt); + s->capture.width = ff_v4l2_get_format_width(&cap_fmt); } + +- if (full_reinit || reinit) +- s->reinit = 1; +- +- if (full_reinit) { +- ret = ff_v4l2_m2m_codec_full_reinit(s); +- if (ret) { +- av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_full_reinit\n"); +- return AVERROR(EINVAL); +- } +- goto reinit_run; ++ // If we don't support selection (or it is bust) and we obviously have HD then kludge ++ if ((s->capture.selection.width == 0 || s->capture.selection.height == 0) && ++ (s->capture.height == 1088 && s->capture.width == 1920)) { ++ s->capture.selection = (struct v4l2_rect){.width = 1920, .height = 1080}; + } + + s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture); + -+ av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, crop %dx%d @ %d,%d\n", ++ av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, wxh %dx%d crop %dx%d @ %d,%d, reinit=%d\n", + s->capture.sample_aspect_ratio.num, s->capture.sample_aspect_ratio.den, ++ s->capture.width, s->capture.height, + s->capture.selection.width, s->capture.selection.height, -+ s->capture.selection.left, s->capture.selection.top); - - if (full_reinit || reinit) - s->reinit = 1; -@@ -212,34 +213,88 @@ static int v4l2_handle_event(V4L2Context - if (full_reinit) { - ret = ff_v4l2_m2m_codec_full_reinit(s); - if (ret) { -- av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_full_reinit\n"); -+ av_log(avctx, AV_LOG_ERROR, "v4l2_m2m_codec_full_reinit failed\n"); - return AVERROR(EINVAL); - } - goto reinit_run; - } - ++ s->capture.selection.left, s->capture.selection.top, reinit); ++ if (reinit) { - if (s->avctx) +- ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height); + if (avctx) - ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height); ++ ret = ff_set_dimensions(s->avctx, ++ s->capture.selection.width != 0 ? s->capture.selection.width : s->capture.width, ++ s->capture.selection.height != 0 ? 
s->capture.selection.height : s->capture.height); if (ret < 0) - av_log(logger(ctx), AV_LOG_WARNING, "update avcodec height and width\n"); + av_log(avctx, AV_LOG_WARNING, "update avcodec height and width failed\n"); - + ret = ff_v4l2_m2m_codec_reinit(s); if (ret) { - av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_reinit\n"); + av_log(avctx, AV_LOG_ERROR, "v4l2_m2m_codec_reinit failed\n"); return AVERROR(EINVAL); } ++ ++ if (s->capture.width > ff_v4l2_get_format_width(&s->capture.format) || ++ s->capture.height > ff_v4l2_get_format_height(&s->capture.format)) { ++ av_log(avctx, AV_LOG_ERROR, "Format post reinit too small: wanted %dx%d > got %dx%d\n", ++ s->capture.width, s->capture.height, ++ ff_v4l2_get_format_width(&s->capture.format), ff_v4l2_get_format_height(&s->capture.format)); ++ return AVERROR(EINVAL); ++ } ++ ++ // Update pixel format - should only actually do something on initial change ++ s->capture.av_pix_fmt = ++ ff_v4l2_format_v4l2_to_avfmt(ff_v4l2_get_format_pixelformat(&s->capture.format), AV_CODEC_ID_RAWVIDEO); ++ if (s->output_drm) { ++ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; ++ avctx->sw_pix_fmt = s->capture.av_pix_fmt; ++ } ++ else ++ avctx->pix_fmt = s->capture.av_pix_fmt; ++ goto reinit_run; } - + - /* dummy event received */ - return 0; + /* Buffers are OK so just stream off to ack */ -+ av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only\n", __func__); ++ av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only - restart decode\n", __func__); + + ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF); + if (ret) + av_log(avctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF failed\n"); + s->draining = 0; - + /* reinit executed */ reinit_run: + ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMON); return 1; } - -+static int ctx_done(V4L2Context * const ctx) -+{ -+ int rv = 0; -+ V4L2m2mContext * const s = ctx_to_m2mctx(ctx); -+ -+ ctx->done = 1; -+ -+ if (s->resize_pending && !V4L2_TYPE_IS_OUTPUT(ctx->type)) -+ rv = do_source_change(s); -+ -+ return rv; -+} -+ -+/** -+ * handle resolution change event and end of stream event -+ * returns 1 if reinit was successful, negative if it failed -+ * returns 0 if reinit was not executed -+ */ -+static int v4l2_handle_event(V4L2Context *ctx) -+{ -+ V4L2m2mContext * const s = ctx_to_m2mctx(ctx); -+ struct v4l2_event evt = { 0 }; -+ int ret; -+ -+ ret = ioctl(s->fd, VIDIOC_DQEVENT, &evt); -+ if (ret < 0) { -+ av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_DQEVENT\n", ctx->name); -+ return 0; -+ } -+ -+ av_log(logger(ctx), AV_LOG_INFO, "Dq event %d\n", evt.type); -+ -+ if (evt.type == V4L2_EVENT_EOS) { -+// ctx->done = 1; -+ av_log(logger(ctx), AV_LOG_TRACE, "%s VIDIOC_EVENT_EOS\n", ctx->name); -+ return 0; -+ } -+ -+ if (evt.type != V4L2_EVENT_SOURCE_CHANGE) -+ return 0; -+ -+ s->resize_pending = 1; -+ if (!ctx->done) -+ return 0; -+ -+ return do_source_change(s); -+} -+ - static int v4l2_stop_decode(V4L2Context *ctx) - { - struct v4l2_decoder_cmd cmd = { -@@ -280,8 +335,26 @@ static int v4l2_stop_encode(V4L2Context + +@@ -280,171 +452,277 @@ static int v4l2_stop_encode(V4L2Context return 0; } - -+static int count_in_driver(const V4L2Context * const ctx) + +-static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout) +-{ +- struct v4l2_plane planes[VIDEO_MAX_PLANES]; +- struct v4l2_buffer buf = { 0 }; +- V4L2Buffer *avbuf; +- struct pollfd pfd = { +- .events = POLLIN | POLLRDNORM | POLLPRI | POLLOUT | POLLWRNORM, /* default blocking capture */ +- .fd = ctx_to_m2mctx(ctx)->fd, ++// DQ a buffer ++// Amalgamates all the 
various ways there are of signalling EOS/Event to ++// generate a consistant EPIPE. ++// ++// Sets ctx->flag_last if next dq would produce EPIPE (i.e. stream has stopped) ++// ++// Returns: ++// 0 Success ++// AVERROR(EPIPE) Nothing more to read ++// AVERROR(ENOSPC) No buffers in Q to put result in ++// * AVERROR(..) ++ ++ static int ++dq_buf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf) +{ -+ int i; -+ int n = 0; ++ V4L2m2mContext * const m = ctx_to_m2mctx(ctx); ++ AVCodecContext * const avctx = m->avctx; ++ V4L2Buffer * avbuf; ++ const int is_mp = V4L2_TYPE_IS_MULTIPLANAR(ctx->type); + -+ if (!ctx->bufrefs) -+ return -1; ++ struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}}; + -+ for (i = 0; i < ctx->num_buffers; ++i) { -+ V4L2Buffer *const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; -+ if (avbuf->status == V4L2BUF_IN_DRIVER) -+ ++n; -+ } -+ return n; -+} -+ - static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout) - { -+ V4L2m2mContext * const s = ctx_to_m2mctx(ctx); -+ const int is_capture = !V4L2_TYPE_IS_OUTPUT(ctx->type); - struct v4l2_plane planes[VIDEO_MAX_PLANES]; - struct v4l2_buffer buf = { 0 }; - V4L2Buffer *avbuf; -@@ -290,50 +363,84 @@ static V4L2Buffer* v4l2_dequeue_v4l2buf( - .fd = ctx_to_m2mctx(ctx)->fd, ++ struct v4l2_buffer buf = { ++ .type = ctx->type, ++ .memory = V4L2_MEMORY_MMAP, }; - int i, ret; -+ int no_rx_means_done = 0; - +- int i, ret; + - if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx->buffers) { -+ if (is_capture && ctx->bufrefs) { - for (i = 0; i < ctx->num_buffers; i++) { +- for (i = 0; i < ctx->num_buffers; i++) { - if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER) -+ avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; -+ if (avbuf->status == V4L2BUF_IN_DRIVER) - break; - } - if (i == ctx->num_buffers) +- break; +- } +- if (i == ctx->num_buffers) - av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers returned to " -+ av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers (%d) returned to " - "userspace. Increase num_capture_buffers " - "to prevent device deadlock or dropped " +- "userspace. 
Increase num_capture_buffers " +- "to prevent device deadlock or dropped " - "packets/frames.\n"); -+ "packets/frames.\n", i); - } - -+#if 0 -+ // I think this is true but pointless -+ // we will get some other form of EOF signal -+ - /* if we are draining and there are no more capture buffers queued in the driver we are done */ +- } +- +- /* if we are draining and there are no more capture buffers queued in the driver we are done */ - if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx_to_m2mctx(ctx)->draining) { -+ if (is_capture && ctx_to_m2mctx(ctx)->draining) { - for (i = 0; i < ctx->num_buffers; i++) { - /* capture buffer initialization happens during decode hence - * detection happens at runtime - */ +- for (i = 0; i < ctx->num_buffers; i++) { +- /* capture buffer initialization happens during decode hence +- * detection happens at runtime +- */ - if (!ctx->buffers) -+ if (!ctx->bufrefs) - break; - +- break; +- - if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER) -+ avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; -+ if (avbuf->status == V4L2BUF_IN_DRIVER) - goto start; - } - ctx->done = 1; - return NULL; - } -+#endif - - start: +- goto start; +- } +- ctx->done = 1; +- return NULL; +- } +- +-start: - if (V4L2_TYPE_IS_OUTPUT(ctx->type)) - pfd.events = POLLOUT | POLLWRNORM; - else { -+ if (is_capture) { - /* no need to listen to requests for more input while draining */ - if (ctx_to_m2mctx(ctx)->draining) - pfd.events = POLLIN | POLLRDNORM | POLLPRI; -+ } else { -+ pfd.events = POLLOUT | POLLWRNORM; +- /* no need to listen to requests for more input while draining */ +- if (ctx_to_m2mctx(ctx)->draining) +- pfd.events = POLLIN | POLLRDNORM | POLLPRI; ++ *ppavbuf = NULL; ++ ++ if (ctx->flag_last) ++ return AVERROR(EPIPE); ++ ++ if (is_mp) { ++ buf.length = VIDEO_MAX_PLANES; ++ buf.m.planes = planes; } -+ no_rx_means_done = s->resize_pending && is_capture; - - for (;;) { + +- for (;;) { - ret = poll(&pfd, 1, timeout); -+ // If we have a resize pending then all buffers should be Qed -+ // With a resize pending we should be in drain but evidence suggests -+ // that not all decoders do this so poll to clear -+ int t2 = no_rx_means_done ? 0 : timeout < 0 ? 3000 : timeout; -+ const int e = pfd.events; -+ -+ ret = poll(&pfd, 1, t2); -+ - if (ret > 0) - break; +- if (ret > 0) +- break; - if (errno == EINTR) - continue; +- return NULL; ++ while (ioctl(m->fd, VIDIOC_DQBUF, &buf) != 0) { ++ const int err = errno; ++ av_assert0(AVERROR(err) < 0); ++ if (err != EINTR) { ++ av_log(avctx, AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n", ++ ctx->name, av_err2str(AVERROR(err))); + -+ if (ret < 0) { -+ int err = errno; -+ if (err == EINTR) -+ continue; -+ av_log(logger(ctx), AV_LOG_ERROR, "=== poll error %d (%s): events=%#x, cap buffers=%d\n", -+ err, strerror(err), -+ e, count_in_driver(ctx)); -+ return NULL; -+ } ++ if (err == EPIPE) ++ ctx->flag_last = 1; + -+ // ret == 0 (timeout) -+ if (no_rx_means_done) { -+ av_log(logger(ctx), AV_LOG_DEBUG, "Ctx done on timeout\n"); -+ ret = ctx_done(ctx); -+ if (ret > 0) -+ goto start; ++ return AVERROR(err); + } -+ if (timeout == -1) -+ av_log(logger(ctx), AV_LOG_ERROR, "=== poll unexpected TIMEOUT: events=%#x, cap buffers=%d\n", e, count_in_driver(ctx));; - return NULL; } - -@@ -343,7 +450,8 @@ start: - no need to raise a warning */ - if (timeout == 0) { - for (i = 0; i < ctx->num_buffers; i++) { ++ atomic_fetch_sub(&ctx->q_count, 1); + +- /* 0. 
handle errors */ +- if (pfd.revents & POLLERR) { +- /* if we are trying to get free buffers but none have been queued yet +- no need to raise a warning */ +- if (timeout == 0) { +- for (i = 0; i < ctx->num_buffers; i++) { - if (ctx->buffers[i].status != V4L2BUF_AVAILABLE) -+ avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; -+ if (avbuf->status != V4L2BUF_AVAILABLE) - av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name); - } - } -@@ -361,22 +469,25 @@ start: - ctx->done = 1; - return NULL; +- av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name); +- } ++ avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data; ++ ff_v4l2_buffer_set_avail(avbuf); ++ avbuf->buf = buf; ++ if (is_mp) { ++ memcpy(avbuf->planes, planes, sizeof(planes)); ++ avbuf->buf.m.planes = avbuf->planes; ++ } ++ // Done with any attached buffer ++ av_buffer_unref(&avbuf->ref_buf); ++ ++ if (V4L2_TYPE_IS_CAPTURE(ctx->type)) { ++ // Zero length cap buffer return == EOS ++ if ((is_mp ? buf.m.planes[0].bytesused : buf.bytesused) == 0) { ++ av_log(avctx, AV_LOG_DEBUG, "Buffer empty - reQ\n"); ++ ++ // Must reQ so we don't leak ++ // May not matter if the next thing we do is release all the ++ // buffers but better to be tidy. ++ ff_v4l2_buffer_enqueue(avbuf); ++ ++ ctx->flag_last = 1; ++ return AVERROR(EPIPE); } +- else +- av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name); + +- return NULL; ++#ifdef V4L2_BUF_FLAG_LAST ++ // If flag_last set then this contains data but is the last frame ++ // so remember that but return OK ++ if ((buf.flags & V4L2_BUF_FLAG_LAST) != 0) ++ ctx->flag_last = 1; ++#endif + } + +- /* 1. handle resolution changes */ +- if (pfd.revents & POLLPRI) { +- ret = v4l2_handle_event(ctx); +- if (ret < 0) { +- /* if re-init failed, abort */ +- ctx->done = 1; +- return NULL; +- } - if (ret) { - /* if re-init was successful drop the buffer (if there was one) - * since we had to reconfigure capture (unmap all buffers) - */ - return NULL; -- } -+ if (ret > 0) -+ goto start; - } - - /* 2. dequeue the buffer */ - if (pfd.revents & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) { - -- if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) { -+ if (is_capture) { - /* there is a capture buffer ready */ - if (pfd.revents & (POLLIN | POLLRDNORM)) - goto dequeue; - -+ // CAPTURE Q drained -+ if (no_rx_means_done) { -+ if (ctx_done(ctx) > 0) -+ goto start; -+ return NULL; -+ } ++ *ppavbuf = avbuf; ++ return 0; ++} + - /* the driver is ready to accept more input; instead of waiting for the capture - * buffer to complete we return NULL so input can proceed (we are single threaded) - */ -@@ -394,37 +505,58 @@ dequeue: - buf.m.planes = planes; ++/** ++ * handle resolution change event and end of stream event ++ * Expects to be called after the stream has stopped ++ * ++ * returns 1 if reinit was successful, negative if it failed ++ * returns 0 if reinit was not executed ++ */ ++static int ++get_event(V4L2m2mContext * const m) ++{ ++ AVCodecContext * const avctx = m->avctx; ++ struct v4l2_event evt = { 0 }; ++ ++ while (ioctl(m->fd, VIDIOC_DQEVENT, &evt) != 0) { ++ const int rv = AVERROR(errno); ++ if (rv == AVERROR(EINTR)) ++ continue; ++ if (rv == AVERROR(EAGAIN)) { ++ av_log(avctx, AV_LOG_WARNING, "V4L2 failed to get expected event - assume EOS\n"); ++ return AVERROR_EOF; } - ++ av_log(avctx, AV_LOG_ERROR, "V4L2 VIDIOC_DQEVENT: %s\n", av_err2str(rv)); ++ return rv; + } + +- /* 2. 
dequeue the buffer */ +- if (pfd.revents & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) { ++ av_log(avctx, AV_LOG_DEBUG, "Dq event %d\n", evt.type); + +- if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) { +- /* there is a capture buffer ready */ +- if (pfd.revents & (POLLIN | POLLRDNORM)) +- goto dequeue; ++ if (evt.type == V4L2_EVENT_EOS) { ++ av_log(avctx, AV_LOG_TRACE, "V4L2 VIDIOC_EVENT_EOS\n"); ++ return AVERROR_EOF; ++ } ++ ++ if (evt.type == V4L2_EVENT_SOURCE_CHANGE) ++ return do_source_change(m); ++ ++ return 0; ++} ++ ++ ++// Get a buffer ++// If output then just gets the buffer in the expected way ++// If capture then runs the capture state m/c to deal with res change etc. ++// If return value == 0 then *ppavbuf != NULL ++ ++static int ++get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout) ++{ ++ V4L2m2mContext * const m = ctx_to_m2mctx(ctx); ++ AVCodecContext * const avctx = m->avctx; ++ const int is_cap = V4L2_TYPE_IS_CAPTURE(ctx->type); ++ ++ const unsigned int poll_cap = (POLLIN | POLLRDNORM); ++ const unsigned int poll_out = (POLLOUT | POLLWRNORM); ++ const unsigned int poll_event = POLLPRI; ++ ++ *ppavbuf = NULL; + +- /* the driver is ready to accept more input; instead of waiting for the capture +- * buffer to complete we return NULL so input can proceed (we are single threaded) +- */ +- if (pfd.revents & (POLLOUT | POLLWRNORM)) +- return NULL; ++ for (;;) { ++ struct pollfd pfd = { ++ .fd = m->fd, ++ // If capture && stream not started then assume we are waiting for the initial event ++ .events = !is_cap ? poll_out : ++ !ff_v4l2_ctx_eos(ctx) && ctx->streamon ? poll_cap : ++ poll_event, ++ }; ++ int ret; ++ ++ if (ctx->done) { ++ av_log(avctx, AV_LOG_TRACE, "V4L2 %s already done\n", ctx->name); ++ return AVERROR_EOF; + } + +-dequeue: +- memset(&buf, 0, sizeof(buf)); +- buf.memory = V4L2_MEMORY_MMAP; +- buf.type = ctx->type; +- if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { +- memset(planes, 0, sizeof(planes)); +- buf.length = VIDEO_MAX_PLANES; +- buf.m.planes = planes; ++ // If capture && timeout == -1 then also wait for rx buffer free ++ if (is_cap && timeout == -1 && m->output.streamon && !m->draining) ++ pfd.events |= poll_out; ++ ++ // If nothing Qed all we will get is POLLERR - avoid that ++ if ((pfd.events == poll_out && atomic_load(&m->output.q_count) == 0) || ++ (pfd.events == poll_cap && atomic_load(&m->capture.q_count) == 0) || ++ (pfd.events == (poll_cap | poll_out) && atomic_load(&m->capture.q_count) == 0 && atomic_load(&m->output.q_count) == 0)) { ++ av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name); ++ return AVERROR(ENOSPC); + } + - ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf); - if (ret) { - if (errno != EAGAIN) { - ctx->done = 1; - if (errno != EPIPE) -+ while ((ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf)) == -1) { -+ const int err = errno; -+ if (err == EINTR) -+ continue; -+ if (err != EAGAIN) { -+ // EPIPE on CAPTURE can be used instead of BUF_FLAG_LAST -+ if (err != EPIPE || !is_capture) - av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n", +- av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n", - ctx->name, av_err2str(AVERROR(errno))); -+ ctx->name, av_err2str(AVERROR(err))); -+ if (ctx_done(ctx) > 0) -+ goto start; - } - return NULL; - } -+ --ctx->q_count; -+ av_log(logger(ctx), AV_LOG_DEBUG, "--- %s VIDIOC_DQBUF OK: index=%d, ts=%ld.%06ld, count=%d, dq=%d\n", -+ ctx->name, buf.index, -+ buf.timestamp.tv_sec, buf.timestamp.tv_usec, -+ ctx->q_count, ++ctx->dq_count); -+ -+ 
avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data; -+ avbuf->status = V4L2BUF_AVAILABLE; -+ avbuf->buf = buf; -+ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { -+ memcpy(avbuf->planes, planes, sizeof(planes)); -+ avbuf->buf.m.planes = avbuf->planes; ++ // Timeout kludged s.t. "forever" eventually gives up & produces logging ++ // If waiting for an event when we have seen a last_frame then we expect ++ // it to be ready already so force a short timeout ++ ret = poll(&pfd, 1, ++ ff_v4l2_ctx_eos(ctx) ? 10 : ++ timeout == -1 ? 3000 : timeout); ++ if (ret < 0) { ++ ret = AVERROR(errno); // Remember errno before logging etc. ++ av_assert0(ret < 0); + } - -- if (ctx_to_m2mctx(ctx)->draining && !V4L2_TYPE_IS_OUTPUT(ctx->type)) { -+ if (ctx_to_m2mctx(ctx)->draining && is_capture) { - int bytesused = V4L2_TYPE_IS_MULTIPLANAR(buf.type) ? - buf.m.planes[0].bytesused : buf.bytesused; - if (bytesused == 0) { -- ctx->done = 1; -+ av_log(logger(ctx), AV_LOG_DEBUG, "Buffer empty - reQ\n"); + -+ // Must reQ so we don't leak -+ // May not matter if the next thing we do is release all the -+ // buffers but better to be tidy. -+ ff_v4l2_buffer_enqueue(avbuf); ++ av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s ret=%d, timeout=%d, events=%#x, revents=%#x\n", ++ ctx->name, ret, timeout, pfd.events, pfd.revents); + -+ if (ctx_done(ctx) > 0) -+ goto start; - return NULL; ++ if (ret < 0) { ++ if (ret == AVERROR(EINTR)) ++ continue; ++ av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll error %d (%s)\n", ctx->name, AVUNERROR(ret), av_err2str(ret)); ++ return ret; ++ } ++ ++ if (ret == 0) { ++ if (timeout == -1) ++ av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll unexpected timeout: events=%#x\n", ctx->name, pfd.events); ++ if (ff_v4l2_ctx_eos(ctx)) { ++ av_log(avctx, AV_LOG_WARNING, "V4L2 %s poll event timeout\n", ctx->name); ++ ret = get_event(m); ++ if (ret < 0) { ++ ctx->done = 1; ++ return ret; ++ } } - #ifdef V4L2_BUF_FLAG_LAST +- return NULL; ++ return AVERROR(EAGAIN); + } + +- if (ctx_to_m2mctx(ctx)->draining && !V4L2_TYPE_IS_OUTPUT(ctx->type)) { +- int bytesused = V4L2_TYPE_IS_MULTIPLANAR(buf.type) ? 
+- buf.m.planes[0].bytesused : buf.bytesused; +- if (bytesused == 0) { ++ if ((pfd.revents & POLLERR) != 0) { ++ av_log(avctx, AV_LOG_WARNING, "V4L2 %s POLLERR\n", ctx->name); ++ return AVERROR_UNKNOWN; ++ } ++ ++ if ((pfd.revents & poll_event) != 0) { ++ ret = get_event(m); ++ if (ret < 0) { + ctx->done = 1; +- return NULL; ++ return ret; + } +-#ifdef V4L2_BUF_FLAG_LAST - if (buf.flags & V4L2_BUF_FLAG_LAST) - ctx->done = 1; -+ if (buf.flags & V4L2_BUF_FLAG_LAST) { -+ av_log(logger(ctx), AV_LOG_TRACE, "FLAG_LAST set\n"); -+ avbuf->status = V4L2BUF_IN_USE; // Avoid flushing this buffer -+ ctx_done(ctx); -+ } - #endif +-#endif ++ continue; ++ } ++ ++ if ((pfd.revents & poll_cap) != 0) { ++ ret = dq_buf(ctx, ppavbuf); ++ if (ret == AVERROR(EPIPE)) ++ continue; ++ return ret; } - + - avbuf = &ctx->buffers[buf.index]; - avbuf->status = V4L2BUF_AVAILABLE; - avbuf->buf = buf; - if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { - memcpy(avbuf->planes, planes, sizeof(planes)); - avbuf->buf.m.planes = avbuf->planes; -- } - return avbuf; ++ if ((pfd.revents & poll_out) != 0) { ++ if (is_cap) ++ return AVERROR(EAGAIN); ++ return dq_buf(ctx, ppavbuf); + } +- return avbuf; ++ ++ av_log(avctx, AV_LOG_ERROR, "V4L2 poll unexpected events=%#x, revents=%#x\n", pfd.events, pfd.revents); ++ return AVERROR_UNKNOWN; } - -@@ -443,8 +575,9 @@ static V4L2Buffer* v4l2_getfree_v4l2buf( ++} + +- return NULL; ++// Clear out flags and timestamps that should should be set by the user ++// Returns the passed avbuf ++static V4L2Buffer * ++clean_v4l2_buffer(V4L2Buffer * const avbuf) ++{ ++ struct v4l2_buffer *const buf = &avbuf->buf; ++ ++ buf->flags = 0; ++ buf->field = V4L2_FIELD_ANY; ++ buf->timestamp = (struct timeval){0}; ++ buf->timecode = (struct v4l2_timecode){0}; ++ buf->sequence = 0; ++ ++ return avbuf; + } + + static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) + { +- int timeout = 0; /* return when no more buffers to dequeue */ + int i; + + /* get back as many output buffers as possible */ + if (V4L2_TYPE_IS_OUTPUT(ctx->type)) { +- do { +- } while (v4l2_dequeue_v4l2buf(ctx, timeout)); ++ V4L2Buffer * avbuf; ++ do { ++ get_qbuf(ctx, &avbuf, 0); ++ } while (avbuf); } - + for (i = 0; i < ctx->num_buffers; i++) { - if (ctx->buffers[i].status == V4L2BUF_AVAILABLE) - return &ctx->buffers[i]; + V4L2Buffer * const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; + if (avbuf->status == V4L2BUF_AVAILABLE) -+ return avbuf; ++ return clean_v4l2_buffer(avbuf); } - + return NULL; -@@ -452,25 +585,45 @@ static V4L2Buffer* v4l2_getfree_v4l2buf( - +@@ -452,25 +730,45 @@ static V4L2Buffer* v4l2_getfree_v4l2buf( + static int v4l2_release_buffers(V4L2Context* ctx) { - struct v4l2_requestbuffers req = { @@ -47632,12 +51801,12 @@ Upstream-status: Pending + int i; + int ret = 0; + const int fd = ctx_to_m2mctx(ctx)->fd; - + - for (i = 0; i < ctx->num_buffers; i++) { - V4L2Buffer *buffer = &ctx->buffers[i]; + // Orphan any buffers in the wild + ff_weak_link_break(&ctx->wl_master); - + - for (j = 0; j < buffer->num_planes; j++) { - struct V4L2Plane_info *p = &buffer->plane_info[j]; - if (p->mm_addr && p->length) @@ -47672,15 +51841,15 @@ Upstream-status: Pending + " 2. drmIoctl(.., DRM_IOCTL_GEM_CLOSE,... 
)\n"); } } -+ ctx->q_count = 0; - ++ atomic_store(&ctx->q_count, 0); + - return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_REQBUFS, &req); + return ret; } - + static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfmt) -@@ -499,6 +652,8 @@ static inline int v4l2_try_raw_format(V4 - +@@ -499,6 +797,8 @@ static inline int v4l2_try_raw_format(V4 + static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) { + V4L2m2mContext* s = ctx_to_m2mctx(ctx); @@ -47688,10 +51857,10 @@ Upstream-status: Pending enum AVPixelFormat pixfmt = ctx->av_pix_fmt; struct v4l2_fmtdesc fdesc; int ret; -@@ -517,6 +672,13 @@ static int v4l2_get_raw_format(V4L2Conte +@@ -517,6 +817,13 @@ static int v4l2_get_raw_format(V4L2Conte if (ret) return AVERROR(EINVAL); - + + if (priv->pix_fmt != AV_PIX_FMT_NONE) { + if (fdesc.pixelformat != ff_v4l2_format_avfmt_to_v4l2(priv->pix_fmt)) { + fdesc.index++; @@ -47702,10 +51871,10 @@ Upstream-status: Pending pixfmt = ff_v4l2_format_v4l2_to_avfmt(fdesc.pixelformat, AV_CODEC_ID_RAWVIDEO); ret = v4l2_try_raw_format(ctx, pixfmt); if (ret){ -@@ -569,18 +731,77 @@ static int v4l2_get_coded_format(V4L2Con +@@ -569,30 +876,99 @@ static int v4l2_get_coded_format(V4L2Con * *****************************************************************************/ - + + +static void flush_all_buffers_status(V4L2Context* const ctx) +{ @@ -47717,9 +51886,9 @@ Upstream-status: Pending + for (i = 0; i < ctx->num_buffers; ++i) { + struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data; + if (buf->status == V4L2BUF_IN_DRIVER) -+ buf->status = V4L2BUF_AVAILABLE; ++ ff_v4l2_buffer_set_avail(buf); + } -+ ctx->q_count = 0; ++ atomic_store(&ctx->q_count, 0); +} + +static int stuff_all_buffers(AVCodecContext * avctx, V4L2Context* ctx) @@ -47749,18 +51918,25 @@ Upstream-status: Pending int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd) { int type = ctx->type; - int ret; +- int ret; ++ int ret = 0; + AVCodecContext * const avctx = logger(ctx); -+ -+ ff_mutex_lock(&ctx->lock); -+ -+ if (cmd == VIDIOC_STREAMON && !V4L2_TYPE_IS_OUTPUT(ctx->type)) -+ stuff_all_buffers(avctx, ctx); - - ret = ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type); + +- ret = ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type); - if (ret < 0) - return AVERROR(errno); -+ if (ret < 0) { ++ // Avoid doing anything if there is nothing we can do ++ if (cmd == VIDIOC_STREAMOFF && !ctx_buffers_alloced(ctx) && !ctx->streamon) ++ return 0; + +- ctx->streamon = (cmd == VIDIOC_STREAMON); ++ ff_mutex_lock(&ctx->lock); + +- return 0; ++ if (cmd == VIDIOC_STREAMON && !V4L2_TYPE_IS_OUTPUT(ctx->type)) ++ stuff_all_buffers(avctx, ctx); ++ ++ if (ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type) < 0) { + const int err = errno; + av_log(avctx, AV_LOG_ERROR, "%s set status %d (%s) failed: err=%d\n", ctx->name, + cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF", err); @@ -47770,73 +51946,153 @@ Upstream-status: Pending + { + if (cmd == VIDIOC_STREAMOFF) + flush_all_buffers_status(ctx); - -- ctx->streamon = (cmd == VIDIOC_STREAMON); ++ else ++ ctx->first_buf = 1; ++ + ctx->streamon = (cmd == VIDIOC_STREAMON); + av_log(avctx, AV_LOG_DEBUG, "%s set status %d (%s) OK\n", ctx->name, + cmd, (cmd == VIDIOC_STREAMON) ? 
"ON" : "OFF"); + } - -- return 0; ++ ++ // Both stream off & on effectively clear flag_last ++ ctx->flag_last = 0; ++ + ff_mutex_unlock(&ctx->lock); + + return ret; } - + int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) -@@ -608,7 +829,8 @@ int ff_v4l2_context_enqueue_frame(V4L2Co + { +- V4L2m2mContext *s = ctx_to_m2mctx(ctx); ++ V4L2m2mContext *const s = ctx_to_m2mctx(ctx); ++ AVCodecContext *const avctx = s->avctx; ++ int64_t track_ts; + V4L2Buffer* avbuf; + int ret; + + if (!frame) { + ret = v4l2_stop_encode(ctx); + if (ret) +- av_log(logger(ctx), AV_LOG_ERROR, "%s stop_encode\n", ctx->name); ++ av_log(avctx, AV_LOG_ERROR, "%s stop_encode\n", ctx->name); + s->draining= 1; + return 0; + } +@@ -601,23 +977,29 @@ int ff_v4l2_context_enqueue_frame(V4L2Co + if (!avbuf) + return AVERROR(ENOMEM); + +- ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf); ++ track_ts = xlat_pts_frame_in(avctx, &s->xlat, frame); ++ ++ ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf, track_ts); + if (ret) + return ret; + return ff_v4l2_buffer_enqueue(avbuf); } - + -int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) +int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, -+ const void * extdata, size_t extlen, int no_rescale_pts) ++ const void * extdata, size_t extlen) { V4L2m2mContext *s = ctx_to_m2mctx(ctx); ++ AVCodecContext *const avctx = s->avctx; V4L2Buffer* avbuf; -@@ -616,8 +838,9 @@ int ff_v4l2_context_enqueue_packet(V4L2C - + int ret; ++ int64_t track_ts; + if (!pkt->size) { ret = v4l2_stop_decode(ctx); + // Log but otherwise ignore stop failure if (ret) - av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode\n", ctx->name); -+ av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret); ++ av_log(avctx, AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret); s->draining = 1; return 0; } -@@ -626,14 +849,17 @@ int ff_v4l2_context_enqueue_packet(V4L2C +@@ -626,8 +1008,13 @@ int ff_v4l2_context_enqueue_packet(V4L2C if (!avbuf) return AVERROR(EAGAIN); - + - ret = ff_v4l2_buffer_avpkt_to_buf(pkt, avbuf); - if (ret) -+ ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, no_rescale_pts); ++ track_ts = xlat_pts_pkt_in(avctx, &s->xlat, pkt); ++ ++ ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, track_ts); + if (ret == AVERROR(ENOMEM)) + av_log(logger(ctx), AV_LOG_ERROR, "Buffer overflow in %s: pkt->size=%d > buf->length=%d\n", + __func__, pkt->size, avbuf->planes[0].length); + else if (ret) return ret; - + return ff_v4l2_buffer_enqueue(avbuf); - } - --int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) -+int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout, int no_rescale_pts) +@@ -635,42 +1022,36 @@ int ff_v4l2_context_enqueue_packet(V4L2C + + int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) { ++ V4L2m2mContext *s = ctx_to_m2mctx(ctx); ++ AVCodecContext *const avctx = s->avctx; V4L2Buffer *avbuf; - -@@ -650,7 +876,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Co - return AVERROR(EAGAIN); - } - ++ int rv; + +- /* +- * timeout=-1 blocks until: +- * 1. decoded frame available +- * 2. 
an input buffer is ready to be dequeued +- */ +- avbuf = v4l2_dequeue_v4l2buf(ctx, timeout); +- if (!avbuf) { +- if (ctx->done) +- return AVERROR_EOF; +- +- return AVERROR(EAGAIN); +- } ++ do { ++ if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0) ++ return rv; ++ if ((rv = ff_v4l2_buffer_buf_to_avframe(frame, avbuf)) != 0) ++ return rv; ++ } while (xlat_pts_frame_out(avctx, &s->xlat, frame) != 0); + - return ff_v4l2_buffer_buf_to_avframe(frame, avbuf); -+ return ff_v4l2_buffer_buf_to_avframe(frame, avbuf, no_rescale_pts); ++ return 0; } - + int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) -@@ -702,78 +928,155 @@ int ff_v4l2_context_get_format(V4L2Conte - + { ++ V4L2m2mContext *s = ctx_to_m2mctx(ctx); ++ AVCodecContext *const avctx = s->avctx; + V4L2Buffer *avbuf; ++ int rv; + +- /* +- * blocks until: +- * 1. encoded packet available +- * 2. an input buffer ready to be dequeued +- */ +- avbuf = v4l2_dequeue_v4l2buf(ctx, -1); +- if (!avbuf) { +- if (ctx->done) +- return AVERROR_EOF; ++ do { ++ if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0) ++ return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv; // Caller not currently expecting ENOSPC ++ if ((rv = ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf)) != 0) ++ return rv; ++ } while (xlat_pts_pkt_out(avctx, &s->xlat, pkt) != 0); + +- return AVERROR(EAGAIN); +- } +- +- return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf); ++ return 0; + } + + int ff_v4l2_context_get_format(V4L2Context* ctx, int probe) +@@ -702,78 +1083,160 @@ int ff_v4l2_context_get_format(V4L2Conte + int ff_v4l2_context_set_format(V4L2Context* ctx) { - return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format); @@ -47864,29 +52120,30 @@ Upstream-status: Pending + ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format); + return ret; } - + void ff_v4l2_context_release(V4L2Context* ctx) { int ret; - + - if (!ctx->buffers) + if (!ctx->bufrefs) return; - + ret = v4l2_release_buffers(ctx); if (ret) av_log(logger(ctx), AV_LOG_WARNING, "V4L2 failed to unmap the %s buffers\n", ctx->name); - + - av_freep(&ctx->buffers); + av_freep(&ctx->bufrefs); + av_buffer_unref(&ctx->frames_ref); + + ff_mutex_destroy(&ctx->lock); ++ pthread_cond_destroy(&ctx->cond); } - + -int ff_v4l2_context_init(V4L2Context* ctx) + -+static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers) ++static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers, const enum v4l2_memory mem) { - V4L2m2mContext *s = ctx_to_m2mctx(ctx); + V4L2m2mContext * const s = ctx_to_m2mctx(ctx); @@ -47897,17 +52154,19 @@ Upstream-status: Pending - av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type); - return AVERROR_PATCHWELCOME; - } -- ++ int ret; ++ int i; + - ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format); - if (ret) - av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", ctx->name); -+ int ret; -+ int i; - ++ av_assert0(ctx->bufrefs == NULL); + memset(&req, 0, sizeof(req)); - req.count = ctx->num_buffers; +- req.memory = V4L2_MEMORY_MMAP; + req.count = req_buffers; - req.memory = V4L2_MEMORY_MMAP; ++ req.memory = mem; req.type = ctx->type; - ret = ioctl(s->fd, VIDIOC_REQBUFS, &req); - if (ret < 0) { @@ -47920,7 +52179,7 @@ Upstream-status: Pending + return ret; + } } - + ctx->num_buffers = req.count; - ctx->buffers = av_mallocz(ctx->num_buffers * sizeof(V4L2Buffer)); - if (!ctx->buffers) { @@ -47930,7 +52189,7 @@ Upstream-status: Pending - return AVERROR(ENOMEM); + goto fail_release; } - + - for (i = 0; i < req.count; i++) { - ctx->buffers[i].context = 
ctx; - ret = ff_v4l2_buffer_initialize(&ctx->buffers[i], i); @@ -47942,14 +52201,14 @@ Upstream-status: Pending + } + + for (i = 0; i < ctx->num_buffers; i++) { -+ ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx); ++ ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx, mem); + if (ret) { av_log(logger(ctx), AV_LOG_ERROR, "%s buffer[%d] initialization (%s)\n", ctx->name, i, av_err2str(ret)); - goto error; + goto fail_release; } } - + av_log(logger(ctx), AV_LOG_DEBUG, "%s: %s %02d buffers initialized: %04ux%04u, sizeimage %08u, bytesperline %08u\n", ctx->name, V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? av_fourcc2str(ctx->format.fmt.pix_mp.pixelformat) : av_fourcc2str(ctx->format.fmt.pix.pixelformat), req.count, @@ -47959,9 +52218,9 @@ Upstream-status: Pending + ff_v4l2_get_format_height(&ctx->format), V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage : ctx->format.fmt.pix.sizeimage, V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].bytesperline : ctx->format.fmt.pix.bytesperline); - + return 0; - + -error: +fail_release: v4l2_release_buffers(ctx); @@ -47976,14 +52235,16 @@ Upstream-status: Pending + + // It is not valid to reinit a context without a previous release + av_assert0(ctx->bufrefs == NULL); -+ + +- av_freep(&ctx->buffers); + if (!v4l2_type_supported(ctx)) { + av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type); + return AVERROR_PATCHWELCOME; + } - -- av_freep(&ctx->buffers); ++ + ff_mutex_init(&ctx->lock, NULL); ++ pthread_cond_init(&ctx->cond, NULL); ++ atomic_init(&ctx->q_count, 0); + + if (s->output_drm) { + AVHWFramesContext *hwframes; @@ -47997,8 +52258,8 @@ Upstream-status: Pending + hwframes = (AVHWFramesContext*)ctx->frames_ref->data; + hwframes->format = AV_PIX_FMT_DRM_PRIME; + hwframes->sw_format = ctx->av_pix_fmt; -+ hwframes->width = ctx->width; -+ hwframes->height = ctx->height; ++ hwframes->width = ctx->width != 0 ? ctx->width : s->avctx->width; ++ hwframes->height = ctx->height != 0 ? ctx->height : s->avctx->height; + ret = av_hwframe_ctx_init(ctx->frames_ref); + if (ret < 0) + goto fail_unref_hwframes; @@ -48011,12 +52272,12 @@ Upstream-status: Pending + goto fail_unref_hwframes; + } + -+ ret = create_buffers(ctx, ctx->num_buffers); ++ ret = create_buffers(ctx, ctx->num_buffers, ctx->buf_mem); + if (ret < 0) + goto fail_unref_hwframes; + + return 0; - + +fail_unref_hwframes: + av_buffer_unref(&ctx->frames_ref); +fail_unlock: @@ -48031,14 +52292,14 @@ Upstream-status: Pending #include "libavutil/buffer.h" +#include "libavutil/thread.h" #include "v4l2_buffers.h" - + typedef struct V4L2Context { @@ -70,11 +71,18 @@ typedef struct V4L2Context { */ int width, height; AVRational sample_aspect_ratio; + struct v4l2_rect selection; - + /** - * Indexed array of V4L2Buffers + * If the default size of buffer is less than this then try to @@ -48051,50 +52312,98 @@ Upstream-status: Pending + * Indexed array of pointers to V4L2Buffers + */ + AVBufferRef **bufrefs; - + /** * Readonly after init. -@@ -92,6 +100,12 @@ typedef struct V4L2Context { +@@ -82,16 +90,38 @@ typedef struct V4L2Context { + int num_buffers; + + /** ++ * Buffer memory type V4L2_MEMORY_MMAP or V4L2_MEMORY_DMABUF ++ */ ++ enum v4l2_memory buf_mem; ++ ++ /** + * Whether the stream has been started (VIDIOC_STREAMON has been sent). 
+ */ + int streamon; + ++ /* 1st buffer after stream on */ ++ int first_buf; ++ + /** + * Either no more buffers available or an unrecoverable error was notified + * by the V4L2 kernel driver: once set the context has to be exited. */ int done; - + ++ int flag_last; ++ ++ /** ++ * If NZ then when Qing frame/pkt use this rather than the ++ * "real" PTS ++ */ ++ uint64_t track_ts; ++ + AVBufferRef *frames_ref; -+ int q_count; -+ int dq_count; ++ atomic_int q_count; + struct ff_weak_link_master *wl_master; + + AVMutex lock; ++ pthread_cond_t cond; } V4L2Context; - + /** -@@ -156,9 +170,12 @@ int ff_v4l2_context_dequeue_packet(V4L2C +@@ -156,7 +186,10 @@ int ff_v4l2_context_dequeue_packet(V4L2C * @param[in] ctx The V4L2Context to dequeue from. * @param[inout] f The AVFrame to dequeue to. * @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds) -+ * @param[in] no_rescale_pts (0 rescale pts, 1 use pts as -+ * timestamp directly) + * * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error. ++ * AVERROR(ENOSPC) if no buffer availible to put ++ * the frame in */ --int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); -+int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout, int no_rescale_pts); - - /** - * Enqueues a buffer to a V4L2Context from an AVPacket -@@ -170,7 +187,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Co + int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); + +@@ -170,7 +203,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Co * @param[in] pkt A pointer to an AVPacket. * @return 0 in case of success, a negative error otherwise. */ -int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt); -+int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size, int no_rescale_pts); - ++int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size); + /** * Enqueues a buffer to a V4L2Context from an AVFrame --- a/libavcodec/v4l2_m2m.c +++ b/libavcodec/v4l2_m2m.c -@@ -215,13 +215,7 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mCont +@@ -36,6 +36,14 @@ + #include "v4l2_fmt.h" + #include "v4l2_m2m.h" + ++static void ++xlat_init(xlat_track_t * const x) ++{ ++ memset(x, 0, sizeof(*x)); ++ x->last_pts = AV_NOPTS_VALUE; ++} ++ ++ + static inline int v4l2_splane_video(struct v4l2_capability *cap) + { + if (cap->capabilities & (V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VIDEO_OUTPUT) && +@@ -68,7 +76,9 @@ static int v4l2_prepare_contexts(V4L2m2m + + s->capture.done = s->output.done = 0; + s->capture.name = "capture"; ++ s->capture.buf_mem = V4L2_MEMORY_MMAP; + s->output.name = "output"; ++ s->output.buf_mem = s->input_drm ? V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP; + atomic_init(&s->refcount, 0); + sem_init(&s->refsync, 0, 0); + +@@ -215,13 +225,7 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mCont av_log(log_ctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF\n"); - + /* 2. unmap the capture buffers (v4l2 and ffmpeg): - * we must wait for all references to be released before being allowed - * to queue new buffers. @@ -48104,30 +52413,49 @@ Upstream-status: Pending - while(sem_wait(&s->refsync) == -1 && errno == EINTR); - ff_v4l2_context_release(&s->capture); - + /* 3. get the new capture format */ -@@ -328,7 +322,10 @@ static void v4l2_m2m_destroy_context(voi +@@ -240,7 +244,6 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mCont + + /* 5. 
complete reinit */ + s->draining = 0; +- s->reinit = 0; + + return 0; + } +@@ -274,7 +277,6 @@ int ff_v4l2_m2m_codec_full_reinit(V4L2m2 + + /* start again now that we know the stream dimensions */ + s->draining = 0; +- s->reinit = 0; + + ret = ff_v4l2_context_get_format(&s->output, 0); + if (ret) { +@@ -328,7 +330,13 @@ static void v4l2_m2m_destroy_context(voi ff_v4l2_context_release(&s->capture); sem_destroy(&s->refsync); - + - close(s->fd); + if (s->fd != -1) + close(s->fd); + ++ av_packet_unref(&s->buf_pkt); ++ av_freep(&s->extdata_data); ++ + av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Context destroyed\n"); - + av_free(s); } -@@ -338,17 +335,34 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *p +@@ -338,17 +346,34 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *p V4L2m2mContext *s = priv->context; int ret; - + - ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF); - if (ret) - av_log(s->avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->output.name); + if (!s) + return 0; - + - ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF); - if (ret) - av_log(s->avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->capture.name); @@ -48145,9 +52473,9 @@ Upstream-status: Pending + if (ret) + av_log(s->avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->capture.name); + } - + ff_v4l2_context_release(&s->output); - + + close(s->fd); + s->fd = -1; + @@ -48157,20 +52485,65 @@ Upstream-status: Pending + s->avctx = NULL; + priv->context = NULL; av_buffer_unref(&priv->context_ref); - + return 0; +@@ -392,28 +417,33 @@ int ff_v4l2_m2m_codec_init(V4L2m2mPriv * + return v4l2_configure_contexts(s); + } + +-int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **s) ++int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **pps) + { +- *s = av_mallocz(sizeof(V4L2m2mContext)); +- if (!*s) ++ V4L2m2mContext * const s = av_mallocz(sizeof(V4L2m2mContext)); ++ ++ *pps = NULL; ++ if (!s) + return AVERROR(ENOMEM); + +- priv->context_ref = av_buffer_create((uint8_t *) *s, sizeof(V4L2m2mContext), ++ priv->context_ref = av_buffer_create((uint8_t *)s, sizeof(*s), + &v4l2_m2m_destroy_context, NULL, 0); + if (!priv->context_ref) { +- av_freep(s); ++ av_free(s); + return AVERROR(ENOMEM); + } + + /* assign the context */ +- priv->context = *s; +- (*s)->priv = priv; ++ priv->context = s; ++ s->priv = priv; + + /* populate it */ +- priv->context->capture.num_buffers = priv->num_capture_buffers; +- priv->context->output.num_buffers = priv->num_output_buffers; +- priv->context->self_ref = priv->context_ref; +- priv->context->fd = -1; ++ s->capture.num_buffers = priv->num_capture_buffers; ++ s->output.num_buffers = priv->num_output_buffers; ++ s->self_ref = priv->context_ref; ++ s->fd = -1; ++ ++ xlat_init(&s->xlat); + ++ *pps = s; + return 0; + } --- a/libavcodec/v4l2_m2m.h +++ b/libavcodec/v4l2_m2m.h @@ -30,6 +30,7 @@ #include - + #include "libavcodec/avcodec.h" +#include "libavutil/pixfmt.h" #include "v4l2_context.h" - + #define container_of(ptr, type, member) ({ \ -@@ -38,7 +39,18 @@ - +@@ -38,7 +39,37 @@ + #define V4L_M2M_DEFAULT_OPTS \ { "num_output_buffers", "Number of buffers in the output context",\ - OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 6, INT_MAX, FLAGS } @@ -48179,26 +52552,45 @@ Upstream-status: Pending +#define FF_V4L2_M2M_TRACK_SIZE 128 +typedef struct V4L2m2mTrackEl { + int discard; // If we see this buffer its been flushed, so discard ++ int pending; + int pkt_size; + int64_t pts; ++ int64_t dts; + int64_t reordered_opaque; + int64_t pkt_pos; + int64_t pkt_duration; + 
int64_t track_pts; +} V4L2m2mTrackEl; - ++ ++typedef struct pts_stats_s ++{ ++ void * logctx; ++ const char * name; // For debug ++ unsigned int last_count; ++ unsigned int last_interval; ++ int64_t last_pts; ++ int64_t guess; ++} pts_stats_t; ++ ++typedef struct xlat_track_s { ++ unsigned int track_no; ++ int64_t last_pts; ++ int64_t last_opaque; ++ V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE]; ++} xlat_track_t; + typedef struct V4L2m2mContext { char devname[PATH_MAX]; -@@ -53,6 +65,7 @@ typedef struct V4L2m2mContext { +@@ -52,7 +83,6 @@ typedef struct V4L2m2mContext { + AVCodecContext *avctx; sem_t refsync; atomic_uint refcount; - int reinit; -+ int resize_pending; - +- int reinit; + /* null frame/packet received */ int draining; -@@ -63,6 +76,23 @@ typedef struct V4L2m2mContext { - +@@ -63,6 +93,36 @@ typedef struct V4L2m2mContext { + /* reference back to V4L2m2mPriv */ void *priv; + @@ -48207,49 +52599,72 @@ Upstream-status: Pending + /* generate DRM frames */ + int output_drm; + ++ /* input frames are drmprime */ ++ int input_drm; ++ + /* Frame tracking */ -+ int64_t last_pkt_dts; -+ int64_t last_opaque; -+ unsigned int track_no; -+ V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE]; ++ xlat_track_t xlat; ++ int pending_hw; ++ int pending_n; ++ ++ pts_stats_t pts_stat; + + /* req pkt */ + int req_pkt; + + /* Ext data sent */ + int extdata_sent; ++ /* Ext data sent in packet - overrides ctx */ ++ uint8_t * extdata_data; ++ size_t extdata_size; ++ ++#define FF_V4L2_QUIRK_REINIT_ALWAYS 1 ++#define FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN 2 ++ /* Quirks */ ++ unsigned int quirks; ++ } V4L2m2mContext; - + typedef struct V4L2m2mPriv { -@@ -73,6 +103,7 @@ typedef struct V4L2m2mPriv { - +@@ -73,6 +133,7 @@ typedef struct V4L2m2mPriv { + int num_output_buffers; int num_capture_buffers; + enum AVPixelFormat pix_fmt; } V4L2m2mPriv; - + /** -@@ -126,4 +157,16 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mCont +@@ -126,4 +187,26 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mCont */ int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *ctx); - + + -+static inline unsigned int ff_v4l2_get_format_width(struct v4l2_format *fmt) ++static inline unsigned int ff_v4l2_get_format_width(const struct v4l2_format * const fmt) +{ + return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width; +} + -+static inline unsigned int ff_v4l2_get_format_height(struct v4l2_format *fmt) ++static inline unsigned int ff_v4l2_get_format_height(const struct v4l2_format * const fmt) +{ + return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height; +} + ++static inline uint32_t ff_v4l2_get_format_pixelformat(const struct v4l2_format * const fmt) ++{ ++ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? 
fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat; ++} ++ ++static inline int ff_v4l2_ctx_eos(const V4L2Context * const ctx) ++{ ++ return ctx->flag_last; ++} ++ + #endif /* AVCODEC_V4L2_M2M_H */ --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -23,6 +23,10 @@ - + #include #include + @@ -48259,10 +52674,10 @@ Upstream-status: Pending #include "libavutil/pixfmt.h" #include "libavutil/pixdesc.h" #include "libavutil/opt.h" -@@ -30,26 +34,51 @@ +@@ -30,75 +34,111 @@ #include "libavcodec/decode.h" #include "libavcodec/internal.h" - + +#include "libavcodec/hwaccels.h" +#include "libavcodec/internal.h" +#include "libavcodec/hwconfig.h" @@ -48270,7 +52685,80 @@ Upstream-status: Pending #include "v4l2_context.h" #include "v4l2_m2m.h" #include "v4l2_fmt.h" - + +-static int v4l2_try_start(AVCodecContext *avctx) ++// Pick 64 for max last count - that is >1sec at 60fps ++#define STATS_LAST_COUNT_MAX 64 ++#define STATS_INTERVAL_MAX (1 << 30) ++ ++#ifndef FF_API_BUFFER_SIZE_T ++#define FF_API_BUFFER_SIZE_T 1 ++#endif ++ ++static int64_t pts_stats_guess(const pts_stats_t * const stats) + { +- V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; +- V4L2Context *const capture = &s->capture; +- V4L2Context *const output = &s->output; +- struct v4l2_selection selection = { 0 }; +- int ret; ++ if (stats->last_pts == AV_NOPTS_VALUE || ++ stats->last_interval == 0 || ++ stats->last_count >= STATS_LAST_COUNT_MAX) ++ return AV_NOPTS_VALUE; ++ return stats->last_pts + (int64_t)(stats->last_count - 1) * (int64_t)stats->last_interval; ++} + +- /* 1. start the output process */ +- if (!output->streamon) { +- ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMON); +- if (ret < 0) { +- av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON on output context\n"); +- return ret; ++static void pts_stats_add(pts_stats_t * const stats, int64_t pts) ++{ ++ if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) { ++ if (stats->last_count < STATS_LAST_COUNT_MAX) ++ ++stats->last_count; ++ return; ++ } ++ ++ if (stats->last_pts != AV_NOPTS_VALUE) { ++ const int64_t interval = pts - stats->last_pts; ++ ++ if (interval < 0 || interval >= STATS_INTERVAL_MAX || ++ stats->last_count >= STATS_LAST_COUNT_MAX) { ++ if (stats->last_interval != 0) ++ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n", ++ __func__, stats->name, interval, stats->last_count); ++ stats->last_interval = 0; ++ } ++ else { ++ const int64_t frame_time = interval / (int64_t)stats->last_count; ++ ++ if (frame_time != stats->last_interval) ++ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n", ++ __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time); ++ stats->last_interval = frame_time; + } + } + +- if (capture->streamon) ++ stats->last_pts = pts; ++ stats->last_count = 1; ++} ++ ++static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name) ++{ ++ *stats = (pts_stats_t){ ++ .logctx = logctx, ++ .name = name, ++ .last_count = 1, ++ .last_interval = 0, ++ .last_pts = AV_NOPTS_VALUE ++ }; ++} ++ +static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *const s) +{ + int ret; @@ -48280,81 +52768,43 @@ Upstream-status: Pending + }; + + if (s->output.streamon) -+ return 0; -+ -+ ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMON); -+ if (ret < 0) -+ av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context\n"); -+ -+ if (!s->capture.streamon || ret < 0) -+ return ret; -+ -+ ret = 
ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd); -+ if (ret < 0) -+ av_log(avctx, AV_LOG_ERROR, "VIDIOC_DECODER_CMD start error: %d\n", errno); -+ else -+ av_log(avctx, AV_LOG_DEBUG, "VIDIOC_DECODER_CMD start OK\n"); -+ -+ return ret; -+} -+ - static int v4l2_try_start(AVCodecContext *avctx) - { - V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; - V4L2Context *const capture = &s->capture; -- V4L2Context *const output = &s->output; - struct v4l2_selection selection = { 0 }; - int ret; - - /* 1. start the output process */ -- if (!output->streamon) { -- ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMON); -- if (ret < 0) { -- av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON on output context\n"); -- return ret; -- } -- } -+ if ((ret = check_output_streamon(avctx, s)) != 0) -+ return ret; - - if (capture->streamon) return 0; -@@ -63,15 +92,29 @@ static int v4l2_try_start(AVCodecContext + +- /* 2. get the capture format */ +- capture->format.type = capture->type; +- ret = ioctl(s->fd, VIDIOC_G_FMT, &capture->format); +- if (ret) { +- av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_FMT ioctl\n"); ++ ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMON); ++ if (ret != 0) { ++ av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context: %s\n", av_err2str(ret)); + return ret; } - - /* 2.1 update the AVCodecContext */ + +- /* 2.1 update the AVCodecContext */ - avctx->pix_fmt = ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO); - capture->av_pix_fmt = avctx->pix_fmt; -+ capture->av_pix_fmt = -+ ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO); -+ if (s->output_drm) { -+ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; -+ avctx->sw_pix_fmt = capture->av_pix_fmt; -+ } -+ else -+ avctx->pix_fmt = capture->av_pix_fmt; - - /* 3. set the crop parameters */ -+#if 1 -+ selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; -+ selection.target = V4L2_SEL_TGT_CROP_DEFAULT; -+ ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection); -+ av_log(avctx, AV_LOG_INFO, "Post G selection ret=%d, err=%d %dx%d\n", ret, errno, selection.r.width, selection.r.height); -+#else - selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; - selection.r.height = avctx->coded_height; - selection.r.width = avctx->coded_width; -+ av_log(avctx, AV_LOG_INFO, "Try selection %dx%d\n", avctx->coded_width, avctx->coded_height); - ret = ioctl(s->fd, VIDIOC_S_SELECTION, &selection); +- +- /* 3. set the crop parameters */ +- selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; +- selection.r.height = avctx->coded_height; +- selection.r.width = avctx->coded_width; +- ret = ioctl(s->fd, VIDIOC_S_SELECTION, &selection); - if (!ret) { -+ av_log(avctx, AV_LOG_INFO, "Post S selection ret=%d, err=%d %dx%d\n", ret, errno, selection.r.width, selection.r.height); -+ if (1) { - ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection); - if (ret) { - av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION ioctl\n"); -@@ -82,15 +125,7 @@ static int v4l2_try_start(AVCodecContext - capture->width = selection.r.width; - } +- ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection); +- if (ret) { +- av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION ioctl\n"); +- } else { +- av_log(avctx, AV_LOG_DEBUG, "crop output %dx%d\n", selection.r.width, selection.r.height); +- /* update the size of the resulting frame */ +- capture->height = selection.r.height; +- capture->width = selection.r.width; +- } ++ // STREAMON should do implicit START so this just for those that don't. 
++ // It is optional so don't worry if it fails ++ if (ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd) < 0) { ++ ret = AVERROR(errno); ++ av_log(avctx, AV_LOG_WARNING, "VIDIOC_DECODER_CMD start error: %s\n", av_err2str(ret)); } - - /* 4. init the capture context now that we have the capture format */ @@ -48364,131 +52814,133 @@ Upstream-status: Pending - av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n"); - return AVERROR(ENOMEM); - } ++ else { ++ av_log(avctx, AV_LOG_TRACE, "VIDIOC_DECODER_CMD start OK\n"); + } ++ return 0; ++} + +- /* 5. start the capture process */ +- ret = ff_v4l2_context_set_status(capture, VIDIOC_STREAMON); +- if (ret) { +- av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON, on capture context\n"); +- return ret; - } -+#endif - - /* 5. start the capture process */ - ret = ff_v4l2_context_set_status(capture, VIDIOC_STREAMON); -@@ -133,52 +168,312 @@ static int v4l2_prepare_decoder(V4L2m2mC ++static int v4l2_try_start(AVCodecContext *avctx) ++{ ++ V4L2m2mContext * const s = ((V4L2m2mPriv*)avctx->priv_data)->context; ++ int ret; + ++ /* 1. start the output process */ ++ if ((ret = check_output_streamon(avctx, s)) != 0) ++ return ret; return 0; } - + +@@ -133,52 +173,525 @@ static int v4l2_prepare_decoder(V4L2m2mC + return 0; + } + -static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) -+static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n) -+{ -+ return (int64_t)n; -+} -+ -+static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts) -+{ -+ return (unsigned int)pts; -+} -+ -+// FFmpeg requires us to propagate a number of vars from the coded pkt into -+// the decoded frame. The only thing that tracks like that in V4L2 stateful -+// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no -+// guarantees about PTS being unique or specified for every frame so replace -+// the supplied PTS with a simple incrementing number and keep a circular -+// buffer of all the things we want preserved (including the original PTS) -+// indexed by the tracking no. 
+static void -+xlat_pts_in(AVCodecContext *const avctx, V4L2m2mContext *const s, AVPacket *const avpkt) ++set_best_effort_pts(AVCodecContext *const avctx, ++ pts_stats_t * const ps, ++ AVFrame *const frame) +{ -+ int64_t track_pts; -+ -+ // Avoid 0 -+ if (++s->track_no == 0) -+ s->track_no = 1; -+ -+ track_pts = track_to_pts(avctx, s->track_no); -+ -+ av_log(avctx, AV_LOG_TRACE, "In PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, s->track_no); -+ s->last_pkt_dts = avpkt->dts; -+ s->track_els[s->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ -+ .discard = 0, -+ .pkt_size = avpkt->size, -+ .pts = avpkt->pts, -+ .reordered_opaque = avctx->reordered_opaque, -+ .pkt_pos = avpkt->pos, -+ .pkt_duration = avpkt->duration, -+ .track_pts = track_pts -+ }; -+ avpkt->pts = track_pts; -+} -+ -+// Returns -1 if we should discard the frame -+static int -+xlat_pts_out(AVCodecContext *const avctx, V4L2m2mContext *const s, AVFrame *const frame) -+{ -+ unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE; -+ const V4L2m2mTrackEl *const t = s->track_els + n; -+ if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts) -+ { -+ av_log(avctx, AV_LOG_INFO, "Tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); -+ frame->pts = AV_NOPTS_VALUE; -+ frame->pkt_dts = s->last_pkt_dts; -+ frame->reordered_opaque = s->last_opaque; -+ frame->pkt_pos = -1; -+ frame->pkt_duration = 0; -+ frame->pkt_size = -1; -+ } -+ else if (!t->discard) -+ { -+ frame->pts = t->pts; -+ frame->pkt_dts = s->last_pkt_dts; -+ frame->reordered_opaque = t->reordered_opaque; -+ frame->pkt_pos = t->pkt_pos; -+ frame->pkt_duration = t->pkt_duration; -+ frame->pkt_size = t->pkt_size; -+ -+ s->last_opaque = s->track_els[n].reordered_opaque; -+ s->track_els[n].pts = AV_NOPTS_VALUE; // If we hit this again deny accurate knowledge of PTS -+ } -+ else -+ { -+ av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); -+ return -1; -+ } ++ pts_stats_add(ps, frame->pts); + +#if FF_API_PKT_PTS +FF_DISABLE_DEPRECATION_WARNINGS + frame->pkt_pts = frame->pts; +FF_ENABLE_DEPRECATION_WARNINGS +#endif -+ frame->best_effort_timestamp = frame->pts; -+ frame->pkt_dts = frame->pts; // We can't emulate what s/w does in a useful manner? -+ av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 ", DTS=%" PRId64 "\n", frame->pts, frame->pkt_dts); -+ return 0; ++ frame->best_effort_timestamp = pts_stats_guess(ps); ++ // If we can't guess from just PTS - try DTS ++ if (frame->best_effort_timestamp == AV_NOPTS_VALUE) ++ frame->best_effort_timestamp = frame->pkt_dts; ++ ++ // We can't emulate what s/w does in a useful manner and using the ++ // "correct" answer seems to just confuse things. 
++ frame->pkt_dts = frame->pts; ++ av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n", ++ frame->pts, frame->best_effort_timestamp, frame->pkt_dts); ++} ++ ++static void ++xlat_flush(xlat_track_t * const x) ++{ ++ unsigned int i; ++ for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) { ++ x->track_els[i].pending = 0; ++ x->track_els[i].discard = 1; ++ } ++ x->last_pts = AV_NOPTS_VALUE; ++} ++ ++static int ++xlat_pending(const xlat_track_t * const x) ++{ ++ unsigned int n = x->track_no % FF_V4L2_M2M_TRACK_SIZE; ++ unsigned int i; ++ int r = 0; ++ int64_t now = AV_NOPTS_VALUE; ++ ++ for (i = 0; i < 32; ++i, n = (n - 1) % FF_V4L2_M2M_TRACK_SIZE) { ++ const V4L2m2mTrackEl * const t = x->track_els + n; ++ ++ if (!t->pending) ++ continue; ++ ++ if (now == AV_NOPTS_VALUE) ++ now = t->dts; ++ ++ if (t->pts == AV_NOPTS_VALUE || ++ ((now == AV_NOPTS_VALUE || t->pts <= now) && ++ (x->last_pts == AV_NOPTS_VALUE || t->pts > x->last_pts))) ++ ++r; ++ } ++ ++ // If we never get any ideas about PTS vs DTS allow a lot more buffer ++ if (now == AV_NOPTS_VALUE) ++ r -= 16; ++ ++ return r; +} + +static inline int stream_started(const V4L2m2mContext * const s) { -+ return s->capture.streamon && s->output.streamon; ++ return s->output.streamon; +} + +#define NQ_OK 0 +#define NQ_Q_FULL 1 +#define NQ_SRC_EMPTY 2 -+#define NQ_DRAINING 3 -+#define NQ_DEAD 4 ++#define NQ_NONE 3 ++#define NQ_DRAINING 4 ++#define NQ_DEAD 5 + +#define TRY_DQ(nq_status) ((nq_status) >= NQ_OK && (nq_status) <= NQ_DRAINING) ++#define RETRY_NQ(nq_status) ((nq_status) == NQ_Q_FULL || (nq_status) == NQ_NONE) ++ ++// do_not_get If true then no new packet will be got but status will ++// be set appropriately + +// AVERROR_EOF Flushing an already flushed stream +// -ve Error (all errors except EOF are unexpected) +// NQ_OK (0) OK +// NQ_Q_FULL Dst full (retry if we think V4L2 Q has space now) +// NQ_SRC_EMPTY Src empty (do not retry) ++// NQ_NONE Enqueue not attempted +// NQ_DRAINING At EOS, dQ dest until EOS there too +// NQ_DEAD Not running (do not retry, do not attempt capture dQ) + -+static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s) ++static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s, const int do_not_get) { - V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; - V4L2Context *const capture = &s->capture; - V4L2Context *const output = &s->output; - AVPacket avpkt = {0}; int ret; - + - if (s->buf_pkt.size) { - avpkt = s->buf_pkt; - memset(&s->buf_pkt, 0, sizeof(AVPacket)); @@ -48498,8 +52950,50 @@ Upstream-status: Pending + // If we don't already have a coded packet - get a new one + // We will already have a coded pkt if the output Q was full last time we + // tried to Q it -+ if (!s->buf_pkt.size) { -+ ret = ff_decode_get_packet(avctx, &s->buf_pkt); ++ if (!s->buf_pkt.size && !do_not_get) { ++ unsigned int i; ++ ++ for (i = 0; i < 256; ++i) { ++ uint8_t * side_data; ++#if FF_API_BUFFER_SIZE_T ++ int side_size; ++#else ++ size_t side_size; ++#endif ++ ret = ff_decode_get_packet(avctx, &s->buf_pkt); ++ if (ret != 0) ++ break; ++ ++ // New extradata is the only side-data we undertand ++ side_data = av_packet_get_side_data(&s->buf_pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size); ++ if (side_data) { ++ av_log(avctx, AV_LOG_DEBUG, "New extradata\n"); ++ av_freep(&s->extdata_data); ++ if ((s->extdata_data = av_malloc(side_size ? 
side_size : 1)) == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to alloc %d bytes of extra data\n", (int)side_size); ++ return AVERROR(ENOMEM); ++ } ++ memcpy(s->extdata_data, side_data, side_size); ++ s->extdata_size = side_size; ++ s->extdata_sent = 0; ++ } ++ ++ if (s->buf_pkt.size != 0) ++ break; ++ ++ if (s->buf_pkt.side_data_elems == 0) { ++ av_log(avctx, AV_LOG_WARNING, "Empty pkt from ff_decode_get_packet - treating as EOF\n"); ++ ret = AVERROR_EOF; ++ break; ++ } ++ ++ // Retry a side-data only pkt ++ } ++ // If i >= 256 something has gone wrong ++ if (i >= 256) { ++ av_log(avctx, AV_LOG_ERROR, "Too many side-data only packets\n"); ++ return AVERROR(EIO); ++ } + + if (ret == AVERROR(EAGAIN)) { + if (!stream_started(s)) { @@ -48523,7 +53017,7 @@ Upstream-status: Pending + if (!s->draining) { + // Calling enqueue with an empty pkt starts drain + av_assert0(s->buf_pkt.size == 0); -+ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0, 1); ++ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0); + if (ret) { + av_log(avctx, AV_LOG_ERROR, "Failed to start drain: ret=%d\n", ret); + return ret; @@ -48536,22 +53030,37 @@ Upstream-status: Pending + av_log(avctx, AV_LOG_ERROR, "Failed to get coded packet: err=%d\n", ret); return ret; + } -+ -+ xlat_pts_in(avctx, s, &s->buf_pkt); } - + - if (s->draining) - goto dequeue; -+ if ((ret = check_output_streamon(avctx, s)) != 0) -+ return ret; - ++ if (s->draining) { ++ if (s->buf_pkt.size) { ++ av_log(avctx, AV_LOG_WARNING, "Unexpected input whilst draining\n"); ++ av_packet_unref(&s->buf_pkt); ++ } ++ return NQ_DRAINING; ++ } + - ret = ff_v4l2_context_enqueue_packet(output, &avpkt); - if (ret < 0) { - if (ret != AVERROR(EAGAIN)) - return ret; -+ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, -+ avctx->extradata, s->extdata_sent ? 0 : avctx->extradata_size, -+ 1); ++ if (!s->buf_pkt.size) ++ return NQ_NONE; + +- s->buf_pkt = avpkt; +- /* no input buffers available, continue dequeing */ +- } ++ if ((ret = check_output_streamon(avctx, s)) != 0) ++ return ret; ++ ++ if (s->extdata_sent) ++ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0); ++ else if (s->extdata_data) ++ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, s->extdata_data, s->extdata_size); ++ else ++ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, avctx->extradata, avctx->extradata_size); + + if (ret == AVERROR(EAGAIN)) { + // Out of input buffers - keep packet @@ -48561,19 +53070,19 @@ Upstream-status: Pending + // In all other cases we are done with this packet + av_packet_unref(&s->buf_pkt); + s->extdata_sent = 1; - -- s->buf_pkt = avpkt; -- /* no input buffers available, continue dequeing */ -+ if (ret) { + +- if (avpkt.size) { +- ret = v4l2_try_start(avctx); + if (ret) { +- av_packet_unref(&avpkt); + av_log(avctx, AV_LOG_ERROR, "Packet enqueue failure: err=%d\n", ret); + return ret; + } - } - -- if (avpkt.size) { -- ret = v4l2_try_start(avctx); -- if (ret) { -- av_packet_unref(&avpkt); ++ } + +- /* cant recover */ +- if (ret == AVERROR(ENOMEM)) +- return ret; + // Start if we haven't + { + const int ret2 = v4l2_try_start(avctx); @@ -48582,62 +53091,139 @@ Upstream-status: Pending + ret = (ret2 == AVERROR(ENOMEM)) ? 
ret2 : NQ_DEAD; + } + } - -- /* cant recover */ -- if (ret == AVERROR(ENOMEM)) -- return ret; ++ + return ret; +} - ++ ++static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx) ++{ ++ int rv = 0; + - return 0; ++ ff_mutex_lock(&ctx->lock); ++ ++ while (atomic_load(&ctx->q_count) == 0 && ctx->streamon) { ++ if (pthread_cond_wait(&ctx->cond, &ctx->lock) != 0) { ++ rv = AVERROR(errno); ++ av_log(avctx, AV_LOG_ERROR, "Cond wait failure: %s\n", av_err2str(rv)); ++ break; + } + } + +-dequeue: +- if (!s->buf_pkt.size) +- av_packet_unref(&avpkt); +- return ff_v4l2_context_dequeue_frame(capture, frame, -1); ++ ff_mutex_unlock(&ctx->lock); ++ return rv; ++} ++ ++// Number of frames over what xlat_pending returns that we keep *16 ++// This is a min value - if it appears to be too small the threshold should ++// adjust dynamically. ++#define PENDING_HW_MIN (3 * 16) ++// Offset to use when setting dynamically ++// Set to %16 == 15 to avoid the threshold changing immediately as we relax ++#define PENDING_HW_OFFSET (PENDING_HW_MIN - 1) ++// Number of consecutive times we've failed to get a frame when we prefer it ++// before we increase the prefer threshold (5ms * N = max expected decode ++// time) ++#define PENDING_N_THRESHOLD 6 ++ +static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) +{ + V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context; -+ int src_rv; ++ int src_rv = NQ_OK; + int dst_rv = 1; // Non-zero (done), non-negative (error) number ++ unsigned int i = 0; + + do { -+ src_rv = try_enqueue_src(avctx, s); ++ const int pending = xlat_pending(&s->xlat); ++ const int prefer_dq = (pending > s->pending_hw / 16); ++ const int last_src_rv = src_rv; + -+ // If we got a frame last time and we have nothing to enqueue then -+ // return now. rv will be AVERROR(EAGAIN) indicating that we want more input ++ // Enqueue another pkt for decode if ++ // (a) We don't have a lot of stuff in the buffer already OR ++ // (b) ... we (think we) do but we've failed to get a frame already OR ++ // (c) We've dequeued a lot of frames without asking for input ++ src_rv = try_enqueue_src(avctx, s, !(!prefer_dq || i != 0 || s->req_pkt > 2)); ++ ++ // If we got a frame last time or we've already tried to get a frame and ++ // we have nothing to enqueue then return now. rv will be AVERROR(EAGAIN) ++ // indicating that we want more input. 
+ // This should mean that once decode starts we enter a stable state where + // we alternately ask for input and produce output -+ if (s->req_pkt && src_rv == NQ_SRC_EMPTY) ++ if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY) + break; + -+ if (src_rv == NQ_Q_FULL && dst_rv == AVERROR(EAGAIN)) { -+ av_log(avctx, AV_LOG_WARNING, "Poll says src Q has space but enqueue fail"); -+ src_rv = NQ_SRC_EMPTY; // If we can't enqueue pretend that there is nothing to enqueue ++ if (src_rv == NQ_Q_FULL && last_src_rv == NQ_Q_FULL) { ++ av_log(avctx, AV_LOG_WARNING, "Poll thinks src Q has space; none found\n"); ++ break; + } + + // Try to get a new frame if + // (a) we haven't already got one AND + // (b) enqueue returned a status indicating that decode should be attempted + if (dst_rv != 0 && TRY_DQ(src_rv)) { -+ do { -+ // Dequeue frame will unref any previous contents of frame -+ // if it returns success so we don't need an explicit unref -+ // when discarding -+ // This returns AVERROR(EAGAIN) if there isn't a frame ready yet -+ // but there is room in the input Q -+ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, -1, 1); ++ // Pick a timeout depending on state ++ const int t = ++ src_rv == NQ_DRAINING ? 300 : ++ prefer_dq ? 5 : ++ src_rv == NQ_Q_FULL ? -1 : 0; + -+ if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) -+ av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n", -+ s->draining, s->capture.done); -+ else if (dst_rv && dst_rv != AVERROR(EAGAIN)) -+ av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n", -+ s->draining, s->capture.done, dst_rv); ++ // Dequeue frame will unref any previous contents of frame ++ // if it returns success so we don't need an explicit unref ++ // when discarding ++ // This returns AVERROR(EAGAIN) on timeout or if ++ // there is room in the input Q and timeout == -1 ++ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); + -+ // Go again if we got a frame that we need to discard -+ } while (dst_rv == 0 && xlat_pts_out(avctx, s, frame)); ++ // Failure due to no buffer in Q? 
++ if (dst_rv == AVERROR(ENOSPC)) { ++ // Wait & retry ++ if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) { ++ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); ++ } ++ } ++ ++ // Adjust dynamic pending threshold ++ if (dst_rv == 0) { ++ if (--s->pending_hw < PENDING_HW_MIN) ++ s->pending_hw = PENDING_HW_MIN; ++ s->pending_n = 0; ++ ++ set_best_effort_pts(avctx, &s->pts_stat, frame); ++ } ++ else if (dst_rv == AVERROR(EAGAIN)) { ++ if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) { ++ s->pending_hw = pending * 16 + PENDING_HW_OFFSET; ++ s->pending_n = 0; ++ } ++ } ++ ++ if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) { ++ av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF"); ++ dst_rv = AVERROR_EOF; ++ s->capture.done = 1; ++ } ++ else if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) ++ av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n", ++ s->draining, s->capture.done); ++ else if (dst_rv && dst_rv != AVERROR(EAGAIN)) ++ av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n", ++ s->draining, s->capture.done, dst_rv); ++ } ++ ++ ++i; ++ if (i >= 256) { ++ av_log(avctx, AV_LOG_ERROR, "Unexpectedly large retry count: %d\n", i); ++ src_rv = AVERROR(EIO); + } + + // Continue trying to enqueue packets if either + // (a) we succeeded last time OR -+ // (b) enqueue failed due to input Q full AND there is now room -+ } while (src_rv == NQ_OK || (src_rv == NQ_Q_FULL && dst_rv == AVERROR(EAGAIN)) ); ++ // (b) we didn't ret a frame and we can retry the input ++ } while (src_rv == NQ_OK || (dst_rv == AVERROR(EAGAIN) && RETRY_NQ(src_rv))); + + // Ensure that the frame contains nothing if we aren't returning a frame + // (might happen when discarding) @@ -48645,7 +53231,7 @@ Upstream-status: Pending + av_frame_unref(frame); + + // If we got a frame this time ask for a pkt next time -+ s->req_pkt = (dst_rv == 0); ++ s->req_pkt = (dst_rv == 0) ? s->req_pkt + 1 : 0; + +#if 0 + if (dst_rv == 0) @@ -48655,8 +53241,8 @@ Upstream-status: Pending + av_log(avctx, AV_LOG_ERROR, "Streamoff and die?\n"); + ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF); + return -1; - } - } ++ } ++ } +#endif + + return dst_rv == 0 ? 
0 : @@ -48687,18 +53273,113 @@ Upstream-status: Pending +} +#endif + ++static int ++check_size(AVCodecContext * const avctx, V4L2m2mContext * const s) ++{ ++ unsigned int i; ++ const uint32_t fcc = ff_v4l2_get_format_pixelformat(&s->capture.format); ++ const uint32_t w = avctx->coded_width; ++ const uint32_t h = avctx->coded_height; ++ ++ if (w == 0 || h == 0 || fcc == 0) { ++ av_log(avctx, AV_LOG_TRACE, "%s: Size %dx%d or fcc %s empty\n", __func__, w, h, av_fourcc2str(fcc)); ++ return 0; ++ } ++ if ((s->quirks & FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN) != 0) { ++ av_log(avctx, AV_LOG_TRACE, "%s: Skipped (quirk): Size %dx%d, fcc %s\n", __func__, w, h, av_fourcc2str(fcc)); ++ return 0; ++ } ++ ++ for (i = 0;; ++i) { ++ struct v4l2_frmsizeenum fs = { ++ .index = i, ++ .pixel_format = fcc, ++ }; ++ ++ while (ioctl(s->fd, VIDIOC_ENUM_FRAMESIZES, &fs) != 0) { ++ const int err = AVERROR(errno); ++ if (err == AVERROR(EINTR)) ++ continue; ++ if (i == 0 && err == AVERROR(ENOTTY)) { ++ av_log(avctx, AV_LOG_DEBUG, "Framesize enum not supported\n"); ++ return 0; ++ } ++ if (err != AVERROR(EINVAL)) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to enum framesizes: %s", av_err2str(err)); ++ return err; ++ } ++ av_log(avctx, AV_LOG_WARNING, "Failed to find Size=%dx%d, fmt=%s in %u frame size enums\n", ++ w, h, av_fourcc2str(fcc), i); ++ return err; ++ } ++ ++ switch (fs.type) { ++ case V4L2_FRMSIZE_TYPE_DISCRETE: ++ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Discrete: %dx%d\n", __func__, i, ++ fs.discrete.width,fs.discrete.height); ++ if (w == fs.discrete.width && h == fs.discrete.height) ++ return 0; ++ break; ++ case V4L2_FRMSIZE_TYPE_STEPWISE: ++ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Stepwise: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i, ++ fs.stepwise.min_width, fs.stepwise.min_height, ++ fs.stepwise.max_width, fs.stepwise.max_height, ++ fs.stepwise.step_width,fs.stepwise.step_height); ++ if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width && ++ h >= fs.stepwise.min_height && h <= fs.stepwise.max_height && ++ (w - fs.stepwise.min_width) % fs.stepwise.step_width == 0 && ++ (h - fs.stepwise.min_height) % fs.stepwise.step_height == 0) ++ return 0; ++ break; ++ case V4L2_FRMSIZE_TYPE_CONTINUOUS: ++ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Continuous: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i, ++ fs.stepwise.min_width, fs.stepwise.min_height, ++ fs.stepwise.max_width, fs.stepwise.max_height, ++ fs.stepwise.step_width,fs.stepwise.step_height); ++ if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width && ++ h >= fs.stepwise.min_height && h <= fs.stepwise.max_height) ++ return 0; ++ break; ++ default: ++ av_log(avctx, AV_LOG_ERROR, "Unexpected framesize enum: %d", fs.type); ++ return AVERROR(EINVAL); ++ } ++ } ++} ++ ++static int ++get_quirks(AVCodecContext * const avctx, V4L2m2mContext * const s) ++{ ++ struct v4l2_capability cap; ++ ++ memset(&cap, 0, sizeof(cap)); ++ while (ioctl(s->fd, VIDIOC_QUERYCAP, &cap) != 0) { ++ int err = errno; ++ if (err == EINTR) ++ continue; ++ av_log(avctx, AV_LOG_ERROR, "V4L2: Failed to get capabilities: %s\n", strerror(err)); ++ return AVERROR(err); ++ } ++ ++ // Could be made table driven if we have a few more but right now there ++ // seems no point ++ ++ // Meson (amlogic) always gives a resolution changed event after output ++ // streamon and userspace must (re)allocate capture buffers and streamon ++ // capture to clear the event even if the capture buffers were the right ++ // size in the first place. 
++ if (strcmp(cap.driver, "meson-vdec") == 0) ++ s->quirks |= FF_V4L2_QUIRK_REINIT_ALWAYS | FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN; ++ ++ av_log(avctx, AV_LOG_DEBUG, "Driver '%s': Quirks=%#x\n", cap.driver, s->quirks); ++ return 0; ++} ++ ++// This heuristic is for H264 but use for everything +static uint32_t max_coded_size(const AVCodecContext * const avctx) +{ + uint32_t wxh = avctx->coded_width * avctx->coded_height; + uint32_t size; - --dequeue: -- if (!s->buf_pkt.size) -- av_packet_unref(&avpkt); -- return ff_v4l2_context_dequeue_frame(capture, frame, -1); -+ // Currently the only thing we try to set our own limits for is H264 -+ if (avctx->codec_id != AV_CODEC_ID_H264) -+ return 0; + + size = wxh * 3 / 2; + // H.264 Annex A table A-1 gives minCR which is either 2 or 4 @@ -48711,27 +53392,53 @@ Upstream-status: Pending + // with small WxH + return size + (1 << 16); } - + static av_cold int v4l2_decode_init(AVCodecContext *avctx) -@@ -186,8 +481,12 @@ static av_cold int v4l2_decode_init(AVCo +@@ -186,12 +699,29 @@ static av_cold int v4l2_decode_init(AVCo V4L2Context *capture, *output; V4L2m2mContext *s; V4L2m2mPriv *priv = avctx->priv_data; + int gf_pix_fmt; int ret; - + + av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); + ++ if (avctx->codec_id == AV_CODEC_ID_H264) { ++ if (avctx->ticks_per_frame == 1) { ++ if(avctx->time_base.den < INT_MAX/2) { ++ avctx->time_base.den *= 2; ++ } else ++ avctx->time_base.num /= 2; ++ } ++ avctx->ticks_per_frame = 2; ++ } ++ + av_log(avctx, AV_LOG_INFO, "level=%d\n", avctx->level); ret = ff_v4l2_m2m_create_context(priv, &s); if (ret < 0) return ret; -@@ -204,17 +503,43 @@ static av_cold int v4l2_decode_init(AVCo - + ++ pts_stats_init(&s->pts_stat, avctx, "decoder"); ++ s->pending_hw = PENDING_HW_MIN; ++ + capture = &s->capture; + output = &s->output; + +@@ -199,34 +729,127 @@ static av_cold int v4l2_decode_init(AVCo + * by the v4l2 driver; this event will trigger a full pipeline reconfig and + * the proper values will be retrieved from the kernel driver. + */ +- output->height = capture->height = avctx->coded_height; +- output->width = capture->width = avctx->coded_width; ++// output->height = capture->height = avctx->coded_height; ++// output->width = capture->width = avctx->coded_width; ++ output->height = capture->height = 0; ++ output->width = capture->width = 0; + output->av_codec_id = avctx->codec_id; output->av_pix_fmt = AV_PIX_FMT_NONE; + output->min_buf_size = max_coded_size(avctx); - + capture->av_codec_id = AV_CODEC_ID_RAWVIDEO; capture->av_pix_fmt = avctx->pix_fmt; + capture->min_buf_size = 0; @@ -48743,15 +53450,21 @@ Upstream-status: Pending + * check the v4l2_get_drm_frame function. 
+ */ + ++ avctx->sw_pix_fmt = avctx->pix_fmt; + gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts); -+ av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s); get_format requested=%d (%s)\n", -+ avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt)); ++ av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s) %dx%d; get_format requested=%d (%s)\n", ++ avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), ++ avctx->coded_width, avctx->coded_height, ++ gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt)); + -+ s->output_drm = 0; + if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) { + avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; + s->output_drm = 1; + } ++ else { ++ capture->av_pix_fmt = gf_pix_fmt; ++ s->output_drm = 0; ++ } + + s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM); + if (!s->device_ref) { @@ -48762,7 +53475,7 @@ Upstream-status: Pending + ret = av_hwdevice_ctx_init(s->device_ref); + if (ret < 0) + return ret; - + s->avctx = avctx; ret = ff_v4l2_m2m_codec_init(priv); if (ret) { @@ -48772,15 +53485,24 @@ Upstream-status: Pending - return ret; } - -@@ -223,10 +548,53 @@ static av_cold int v4l2_decode_init(AVCo - + +- return v4l2_prepare_decoder(s); ++ if ((ret = v4l2_prepare_decoder(s)) < 0) ++ return ret; ++ ++ if ((ret = get_quirks(avctx, s)) != 0) ++ return ret; ++ ++ if ((ret = check_size(avctx, s)) != 0) ++ return ret; ++ ++ return 0; + } + static av_cold int v4l2_decode_close(AVCodecContext *avctx) { - V4L2m2mPriv *priv = avctx->priv_data; - V4L2m2mContext *s = priv->context; -- av_packet_unref(&s->buf_pkt); -- return ff_v4l2_m2m_codec_end(priv); + int rv; + av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); + rv = ff_v4l2_m2m_codec_end(avctx->priv_data); @@ -48802,7 +53524,6 @@ Upstream-status: Pending + V4L2m2mContext * const s = priv->context; + V4L2Context * const output = &s->output; + V4L2Context * const capture = &s->capture; -+ int ret, i; + + av_log(avctx, AV_LOG_TRACE, "<<< %s: streamon=%d\n", __func__, output->streamon); + @@ -48810,14 +53531,23 @@ Upstream-status: Pending + // states like EOS processing so don't try to optimize out (having got it + // wrong once) + -+ ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF); -+ if (ret < 0) -+ av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s error: %d\n", output->name, ret); ++ ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF); ++ ++ // Clear any buffered input packet + av_packet_unref(&s->buf_pkt); +- return ff_v4l2_m2m_codec_end(priv); ++ ++ // Clear a pending EOS ++ if (ff_v4l2_ctx_eos(capture)) { ++ // Arguably we could delay this but this is easy and doesn't require ++ // thought or extra vars ++ ff_v4l2_context_set_status(capture, VIDIOC_STREAMOFF); ++ ff_v4l2_context_set_status(capture, VIDIOC_STREAMON); ++ } + + // V4L2 makes no guarantees about whether decoded frames are flushed or not + // so mark all frames we are tracking to be discarded if they appear -+ for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) -+ s->track_els[i].discard = 1; ++ xlat_flush(&s->xlat); + + // resend extradata + s->extdata_sent = 0; @@ -48829,9 +53559,9 @@ Upstream-status: Pending + // Stream on will occur when we actually submit a new frame + av_log(avctx, AV_LOG_TRACE, ">>> %s\n", __func__); } - + #define OFFSET(x) offsetof(V4L2m2mPriv, x) -@@ -235,10 +603,16 @@ static av_cold int v4l2_decode_close(AVC +@@ -235,10 +858,16 @@ static av_cold int v4l2_decode_close(AVC static const AVOption options[] = { V4L_M2M_DEFAULT_OPTS, { "num_capture_buffers", "Number 
of buffers in the capture context", @@ -48840,7 +53570,7 @@ Upstream-status: Pending + { "pixel_format", "Pixel format to be used by the decoder", OFFSET(pix_fmt), AV_OPT_TYPE_PIXEL_FMT, {.i64 = AV_PIX_FMT_NONE}, AV_PIX_FMT_NONE, AV_PIX_FMT_NB, FLAGS }, { NULL}, }; - + +static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = { + HW_CONFIG_INTERNAL(DRM_PRIME), + NULL @@ -48849,7 +53579,7 @@ Upstream-status: Pending #define M2MDEC_CLASS(NAME) \ static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \ .class_name = #NAME "_v4l2m2m_decoder", \ -@@ -259,9 +633,15 @@ static const AVOption options[] = { +@@ -259,9 +888,15 @@ static const AVOption options[] = { .init = v4l2_decode_init, \ .receive_frame = v4l2_receive_frame, \ .close = v4l2_decode_close, \ @@ -48865,7 +53595,366 @@ Upstream-status: Pending + .hw_configs = v4l2_m2m_hw_configs, \ .wrapper_name = "v4l2m2m", \ } - + +--- a/libavcodec/v4l2_m2m_enc.c ++++ b/libavcodec/v4l2_m2m_enc.c +@@ -24,6 +24,8 @@ + #include + #include + #include ++#include ++ + #include "libavcodec/avcodec.h" + #include "libavcodec/internal.h" + #include "libavutil/pixdesc.h" +@@ -37,6 +39,34 @@ + #define MPEG_CID(x) V4L2_CID_MPEG_VIDEO_##x + #define MPEG_VIDEO(x) V4L2_MPEG_VIDEO_##x + ++// P030 should be defined in drm_fourcc.h and hopefully will be sometime ++// in the future but until then... ++#ifndef DRM_FORMAT_P030 ++#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') ++#endif ++ ++#ifndef DRM_FORMAT_NV15 ++#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5') ++#endif ++ ++#ifndef DRM_FORMAT_NV20 ++#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0') ++#endif ++ ++#ifndef V4L2_CID_CODEC_BASE ++#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE ++#endif ++ ++// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined ++// in videodev2.h hopefully will be sometime in the future but until then... ++#ifndef V4L2_PIX_FMT_NV12_10_COL128 ++#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0') ++#endif ++ ++#ifndef V4L2_PIX_FMT_NV12_COL128 ++#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */ ++#endif ++ + static inline void v4l2_set_timeperframe(V4L2m2mContext *s, unsigned int num, unsigned int den) + { + struct v4l2_streamparm parm = { 0 }; +@@ -147,15 +177,14 @@ static inline int v4l2_mpeg4_profile_fro + static int v4l2_check_b_frame_support(V4L2m2mContext *s) + { + if (s->avctx->max_b_frames) +- av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support b-frames yet\n"); ++ av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support %d b-frames yet\n", s->avctx->max_b_frames); + +- v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), 0, "number of B-frames", 0); ++ v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), s->avctx->max_b_frames, "number of B-frames", 1); + v4l2_get_ext_ctrl(s, MPEG_CID(B_FRAMES), &s->avctx->max_b_frames, "number of B-frames", 0); + if (s->avctx->max_b_frames == 0) + return 0; + + avpriv_report_missing_feature(s->avctx, "DTS/PTS calculation for V4L2 encoding"); +- + return AVERROR_PATCHWELCOME; + } + +@@ -270,13 +299,184 @@ static int v4l2_prepare_encoder(V4L2m2mC + return 0; + } + ++static int avdrm_to_v4l2(struct v4l2_format * const format, const AVFrame * const frame) ++{ ++ const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0]; ++ ++ const uint32_t drm_fmt = src->layers[0].format; ++ // Treat INVALID as LINEAR ++ const uint64_t mod = src->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID ? 
++ DRM_FORMAT_MOD_LINEAR : src->objects[0].format_modifier; ++ uint32_t pix_fmt = 0; ++ uint32_t w = 0; ++ uint32_t h = 0; ++ uint32_t bpl = src->layers[0].planes[0].pitch; ++ ++ // We really don't expect multiple layers ++ // All formats that we currently cope with are single object ++ ++ if (src->nb_layers != 1 || src->nb_objects != 1) ++ return AVERROR(EINVAL); ++ ++ switch (drm_fmt) { ++ case DRM_FORMAT_YUV420: ++ if (mod == DRM_FORMAT_MOD_LINEAR) { ++ if (src->layers[0].nb_planes != 3) ++ break; ++ pix_fmt = V4L2_PIX_FMT_YUV420; ++ h = src->layers[0].planes[1].offset / bpl; ++ w = bpl; ++ } ++ break; ++ ++ case DRM_FORMAT_NV12: ++ if (mod == DRM_FORMAT_MOD_LINEAR) { ++ if (src->layers[0].nb_planes != 2) ++ break; ++ pix_fmt = V4L2_PIX_FMT_NV12; ++ h = src->layers[0].planes[1].offset / bpl; ++ w = bpl; ++ } ++ else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { ++ if (src->layers[0].nb_planes != 2) ++ break; ++ pix_fmt = V4L2_PIX_FMT_NV12_COL128; ++ w = bpl; ++ h = src->layers[0].planes[1].offset / 128; ++ bpl = fourcc_mod_broadcom_param(mod); ++ } ++ break; ++ ++ case DRM_FORMAT_P030: ++ if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { ++ if (src->layers[0].nb_planes != 2) ++ break; ++ pix_fmt = V4L2_PIX_FMT_NV12_10_COL128; ++ w = bpl / 2; // Matching lie to how we construct this ++ h = src->layers[0].planes[1].offset / 128; ++ bpl = fourcc_mod_broadcom_param(mod); ++ } ++ break; ++ ++ default: ++ break; ++ } ++ ++ if (!pix_fmt) ++ return AVERROR(EINVAL); ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) { ++ struct v4l2_pix_format_mplane *const pix = &format->fmt.pix_mp; ++ ++ pix->width = w; ++ pix->height = h; ++ pix->pixelformat = pix_fmt; ++ pix->plane_fmt[0].bytesperline = bpl; ++ pix->num_planes = 1; ++ } ++ else { ++ struct v4l2_pix_format *const pix = &format->fmt.pix; ++ ++ pix->width = w; ++ pix->height = h; ++ pix->pixelformat = pix_fmt; ++ pix->bytesperline = bpl; ++ } ++ ++ return 0; ++} ++ ++// Do we have similar enough formats to be usable? 
++static int fmt_eq(const struct v4l2_format * const a, const struct v4l2_format * const b) ++{ ++ if (a->type != b->type) ++ return 0; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(a->type)) { ++ const struct v4l2_pix_format_mplane *const pa = &a->fmt.pix_mp; ++ const struct v4l2_pix_format_mplane *const pb = &b->fmt.pix_mp; ++ unsigned int i; ++ if (pa->pixelformat != pb->pixelformat || ++ pa->num_planes != pb->num_planes) ++ return 0; ++ for (i = 0; i != pa->num_planes; ++i) { ++ if (pa->plane_fmt[i].bytesperline != pb->plane_fmt[i].bytesperline) ++ return 0; ++ } ++ } ++ else { ++ const struct v4l2_pix_format *const pa = &a->fmt.pix; ++ const struct v4l2_pix_format *const pb = &b->fmt.pix; ++ if (pa->pixelformat != pb->pixelformat || ++ pa->bytesperline != pb->bytesperline) ++ return 0; ++ } ++ return 1; ++} ++ ++ + static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame) + { + V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; + V4L2Context *const output = &s->output; + ++ // Signal EOF if needed ++ if (!frame) { ++ return ff_v4l2_context_enqueue_frame(output, frame); ++ } ++ ++ if (s->input_drm && !output->streamon) { ++ int rv; ++ struct v4l2_format req_format = {.type = output->format.type}; ++ ++ // Set format when we first get a buffer ++ if ((rv = avdrm_to_v4l2(&req_format, frame)) != 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to get V4L2 format from DRM_PRIME frame\n"); ++ return rv; ++ } ++ ++ ff_v4l2_context_release(output); ++ ++ output->format = req_format; ++ ++ if ((rv = ff_v4l2_context_set_format(output)) != 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to set V4L2 format\n"); ++ return rv; ++ } ++ ++ if (!fmt_eq(&req_format, &output->format)) { ++ av_log(avctx, AV_LOG_ERROR, "Format mismatch after setup\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ output->selection.top = frame->crop_top; ++ output->selection.left = frame->crop_left; ++ output->selection.width = av_frame_cropped_width(frame); ++ output->selection.height = av_frame_cropped_height(frame); ++ ++ if ((rv = ff_v4l2_context_init(output)) != 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to (re)init context\n"); ++ return rv; ++ } ++ ++ { ++ struct v4l2_selection selection = { ++ .type = V4L2_BUF_TYPE_VIDEO_OUTPUT, ++ .target = V4L2_SEL_TGT_CROP, ++ .r = output->selection ++ }; ++ if (ioctl(s->fd, VIDIOC_S_SELECTION, &selection) != 0) { ++ av_log(avctx, AV_LOG_WARNING, "S_SELECTION (CROP) %dx%d @ %d,%d failed: %s\n", ++ selection.r.width, selection.r.height, selection.r.left, selection.r.top, ++ av_err2str(AVERROR(errno))); ++ } ++ av_log(avctx, AV_LOG_TRACE, "S_SELECTION (CROP) %dx%d @ %d,%d OK\n", ++ selection.r.width, selection.r.height, selection.r.left, selection.r.top); ++ } ++ } ++ + #ifdef V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME +- if (frame && frame->pict_type == AV_PICTURE_TYPE_I) ++ if (frame->pict_type == AV_PICTURE_TYPE_I) + v4l2_set_ext_ctrl(s, MPEG_CID(FORCE_KEY_FRAME), 0, "force key frame", 1); + #endif + +@@ -310,7 +510,70 @@ static int v4l2_receive_packet(AVCodecCo + } + + dequeue: +- return ff_v4l2_context_dequeue_packet(capture, avpkt); ++ if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0) ++ return ret; ++ ++ if (capture->first_buf == 1) { ++ uint8_t * data; ++ const int len = avpkt->size; ++ ++ // 1st buffer after streamon should be SPS/PPS ++ capture->first_buf = 2; ++ ++ // Clear both possible stores so there is no chance of confusion ++ av_freep(&s->extdata_data); ++ s->extdata_size = 0; ++ av_freep(&avctx->extradata); ++ avctx->extradata_size = 0; ++ ++ if ((data = 
av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) != NULL) ++ memcpy(data, avpkt->data, len); ++ ++ av_packet_unref(avpkt); ++ ++ if (data == NULL) ++ return AVERROR(ENOMEM); ++ ++ // We need to copy the header, but keep local if not global ++ if ((avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) != 0) { ++ avctx->extradata = data; ++ avctx->extradata_size = len; ++ } ++ else { ++ s->extdata_data = data; ++ s->extdata_size = len; ++ } ++ ++ if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0) ++ return ret; ++ } ++ ++ // First frame must be key so mark as such even if encoder forgot ++ if (capture->first_buf == 2) ++ avpkt->flags |= AV_PKT_FLAG_KEY; ++ ++ // Add SPS/PPS to the start of every key frame if non-global headers ++ if ((avpkt->flags & AV_PKT_FLAG_KEY) != 0 && s->extdata_size != 0) { ++ const size_t newlen = s->extdata_size + avpkt->size; ++ AVBufferRef * const buf = av_buffer_alloc(newlen + AV_INPUT_BUFFER_PADDING_SIZE); ++ ++ if (buf == NULL) { ++ av_packet_unref(avpkt); ++ return AVERROR(ENOMEM); ++ } ++ ++ memcpy(buf->data, s->extdata_data, s->extdata_size); ++ memcpy(buf->data + s->extdata_size, avpkt->data, avpkt->size); ++ ++ av_buffer_unref(&avpkt->buf); ++ avpkt->buf = buf; ++ avpkt->data = buf->data; ++ avpkt->size = newlen; ++ } ++ ++// av_log(avctx, AV_LOG_INFO, "%s: PTS out=%"PRId64", size=%d, ret=%d\n", __func__, avpkt->pts, avpkt->size, ret); ++ capture->first_buf = 0; ++ return 0; + } + + static av_cold int v4l2_encode_init(AVCodecContext *avctx) +@@ -322,6 +585,8 @@ static av_cold int v4l2_encode_init(AVCo + uint32_t v4l2_fmt_output; + int ret; + ++ av_log(avctx, AV_LOG_INFO, " <<< %s: fmt=%d/%d\n", __func__, avctx->pix_fmt, avctx->sw_pix_fmt); ++ + ret = ff_v4l2_m2m_create_context(priv, &s); + if (ret < 0) + return ret; +@@ -329,13 +594,17 @@ static av_cold int v4l2_encode_init(AVCo + capture = &s->capture; + output = &s->output; + ++ s->input_drm = (avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME); ++ + /* common settings output/capture */ + output->height = capture->height = avctx->height; + output->width = capture->width = avctx->width; + + /* output context */ + output->av_codec_id = AV_CODEC_ID_RAWVIDEO; +- output->av_pix_fmt = avctx->pix_fmt; ++ output->av_pix_fmt = !s->input_drm ? avctx->pix_fmt : ++ avctx->sw_pix_fmt != AV_PIX_FMT_NONE ? 
avctx->sw_pix_fmt : ++ AV_PIX_FMT_YUV420P; + + /* capture context */ + capture->av_codec_id = avctx->codec_id; +@@ -354,7 +623,7 @@ static av_cold int v4l2_encode_init(AVCo + v4l2_fmt_output = output->format.fmt.pix.pixelformat; + + pix_fmt_output = ff_v4l2_format_v4l2_to_avfmt(v4l2_fmt_output, AV_CODEC_ID_RAWVIDEO); +- if (pix_fmt_output != avctx->pix_fmt) { ++ if (!s->input_drm && pix_fmt_output != avctx->pix_fmt) { + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt_output); + av_log(avctx, AV_LOG_ERROR, "Encoder requires %s pixel format.\n", desc->name); + return AVERROR(EINVAL); --- /dev/null +++ b/libavcodec/v4l2_req_decode_q.c @@ -0,0 +1,84 @@ @@ -49780,16 +54869,26 @@ Upstream-status: Pending +#include "v4l2_req_hevc_vx.c" + --- /dev/null ++++ b/libavcodec/v4l2_req_hevc_v3.c +@@ -0,0 +1,3 @@ ++#define HEVC_CTRLS_VERSION 3 ++#include "v4l2_req_hevc_vx.c" ++ +--- /dev/null ++++ b/libavcodec/v4l2_req_hevc_v4.c +@@ -0,0 +1,3 @@ ++#define HEVC_CTRLS_VERSION 4 ++#include "v4l2_req_hevc_vx.c" ++ +--- /dev/null +++ b/libavcodec/v4l2_req_hevc_vx.c -@@ -0,0 +1,1188 @@ +@@ -0,0 +1,1365 @@ +// File included by v4l2_req_hevc_v* - not compiled on its own + +#include "decode.h" +#include "hevcdec.h" +#include "hwconfig.h" + -+#include "v4l2_request_hevc.h" -+ +#if HEVC_CTRLS_VERSION == 1 +#include "hevc-ctrls-v1.h" + @@ -49798,10 +54897,39 @@ Upstream-status: Pending + +#elif HEVC_CTRLS_VERSION == 2 +#include "hevc-ctrls-v2.h" ++#elif HEVC_CTRLS_VERSION == 3 ++#include "hevc-ctrls-v3.h" ++#elif HEVC_CTRLS_VERSION == 4 ++#include ++#if !defined(V4L2_CID_STATELESS_HEVC_SPS) ++#include "hevc-ctrls-v4.h" ++#endif +#else +#error Unknown HEVC_CTRLS_VERSION +#endif + ++#ifndef V4L2_CID_STATELESS_HEVC_SPS ++#define V4L2_CID_STATELESS_HEVC_SPS V4L2_CID_MPEG_VIDEO_HEVC_SPS ++#define V4L2_CID_STATELESS_HEVC_PPS V4L2_CID_MPEG_VIDEO_HEVC_PPS ++#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS ++#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX ++#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS ++#define V4L2_CID_STATELESS_HEVC_DECODE_MODE V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE ++#define V4L2_CID_STATELESS_HEVC_START_CODE V4L2_CID_MPEG_VIDEO_HEVC_START_CODE ++ ++#define V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED ++#define V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED ++#define V4L2_STATELESS_HEVC_START_CODE_NONE V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE ++#define V4L2_STATELESS_HEVC_START_CODE_ANNEX_B V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B ++#endif ++ ++// Should be in videodev2 but we might not have a good enough one ++#ifndef V4L2_PIX_FMT_HEVC_SLICE ++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ ++#endif ++ ++#include "v4l2_request_hevc.h" ++ +#include "libavutil/hwcontext_drm.h" + +#include @@ -49837,11 +54965,16 @@ Upstream-status: Pending + struct v4l2_ctrl_hevc_slice_params * slice_params; + struct slice_info * slices; + ++ size_t num_offsets; ++ size_t alloced_offsets; ++ uint32_t *offsets; ++ +} V4L2MediaReqDescriptor; + +struct slice_info { + const uint8_t * ptr; + size_t len; // bytes ++ size_t n_offsets; +}; + +// Handy container for accumulating controls before setting @@ -49929,6 +55062,7 @@ Upstream-status: Pending + } +} + ++#if HEVC_CTRLS_VERSION <= 2 +static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp) +{ + const HEVCFrame *frame; @@ 
-49954,6 +55088,7 @@ Upstream-status: Pending + + return 0; +} ++#endif + +static unsigned int +get_ref_pic_index(const HEVCContext *h, const HEVCFrame *frame, @@ -49998,7 +55133,7 @@ Upstream-status: Pending + if (rd->num_slices >= rd->alloced_slices) { + struct v4l2_ctrl_hevc_slice_params * p2; + struct slice_info * s2; -+ size_t n2 = rd->num_slices == 0 ? 8 : rd->num_slices * 2; ++ size_t n2 = rd->alloced_slices == 0 ? 8 : rd->alloced_slices * 2; + + p2 = av_realloc_array(rd->slice_params, n2, sizeof(*p2)); + if (p2 == NULL) @@ -50016,6 +55151,23 @@ Upstream-status: Pending + return 0; +} + ++static int offsets_add(V4L2MediaReqDescriptor *const rd, const size_t n, const unsigned * const offsets) ++{ ++ if (rd->num_offsets + n > rd->alloced_offsets) { ++ size_t n2 = rd->alloced_slices == 0 ? 128 : rd->alloced_slices * 2; ++ void * p2; ++ while (rd->num_offsets + n > n2) ++ n2 *= 2; ++ if ((p2 = av_realloc_array(rd->offsets, n2, sizeof(*rd->offsets))) == NULL) ++ return AVERROR(ENOMEM); ++ rd->offsets = p2; ++ rd->alloced_offsets = n2; ++ } ++ for (size_t i = 0; i != n; ++i) ++ rd->offsets[rd->num_offsets++] = offsets[i] - 1; ++ return 0; ++} ++ +static unsigned int +fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const entries) +{ @@ -50029,12 +55181,21 @@ Upstream-status: Pending + struct v4l2_hevc_dpb_entry * const entry = entries + n++; + + entry->timestamp = frame_capture_dpb(frame->frame); ++#if HEVC_CTRLS_VERSION <= 2 + entry->rps = find_frame_rps_type(h, entry->timestamp); ++#else ++ entry->flags = (frame->flags & HEVC_FRAME_FLAG_LONG_REF) == 0 ? 0 : ++ V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE; ++#endif + entry->field_pic = frame->frame->interlaced_frame; + ++#if HEVC_CTRLS_VERSION <= 3 + /* TODO: Interleaved: Get the POC for each field. */ + entry->pic_order_cnt[0] = frame->poc; + entry->pic_order_cnt[1] = frame->poc; ++#else ++ entry->pic_order_cnt_val = frame->poc; ++#endif + } + } + return n; @@ -50060,8 +55221,11 @@ Upstream-status: Pending + + *slice_params = (struct v4l2_ctrl_hevc_slice_params) { + .bit_size = bit_size, ++#if HEVC_CTRLS_VERSION <= 3 + .data_bit_offset = bit_offset, -+ ++#else ++ .data_byte_offset = bit_offset / 8 + 1, ++#endif + /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: General slice segment header */ + .slice_segment_addr = sh->slice_segment_addr, + @@ -50144,6 +55308,7 @@ Upstream-status: Pending + fill_pred_table(h, &slice_params->pred_weight_table); + + slice_params->num_entry_point_offsets = sh->num_entry_point_offsets; ++#if HEVC_CTRLS_VERSION <= 3 + if (slice_params->num_entry_point_offsets > 256) { + slice_params->num_entry_point_offsets = 256; + av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets); @@ -50151,6 +55316,7 @@ Upstream-status: Pending + + for (i = 0; i < slice_params->num_entry_point_offsets; i++) + slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1; ++#endif +} + +#if HEVC_CTRLS_VERSION >= 2 @@ -50526,51 +55692,66 @@ Upstream-status: Pending +#if HEVC_CTRLS_VERSION >= 2 + struct v4l2_ctrl_hevc_decode_params * const dec, +#endif -+ struct v4l2_ctrl_hevc_slice_params * const slices, -+ const unsigned int slice_no, -+ const unsigned int slice_count) ++ struct v4l2_ctrl_hevc_slice_params * const slices, const unsigned int slice_count, ++ void * const offsets, const size_t offset_count) +{ + int rv; ++#if HEVC_CTRLS_VERSION >= 2 ++ unsigned int n = 3; ++#else ++ unsigned int n = 2; ++#endif + -+ struct v4l2_ext_control control[] = { ++ struct v4l2_ext_control control[6] = { + { -+ .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS, ++ .id = V4L2_CID_STATELESS_HEVC_SPS, + .ptr = &controls->sps, + .size = sizeof(controls->sps), + }, + { -+ .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS, ++ .id = V4L2_CID_STATELESS_HEVC_PPS, + .ptr = &controls->pps, + .size = sizeof(controls->pps), + }, +#if HEVC_CTRLS_VERSION >= 2 + { -+ .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS, ++ .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS, + .ptr = dec, + .size = sizeof(*dec), + }, +#endif -+ { -+ .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, -+ .ptr = slices + slice_no, -+ .size = sizeof(*slices) * slice_count, -+ }, -+ // Optional -+ { -+ .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX, -+ .ptr = &controls->scaling_matrix, -+ .size = sizeof(controls->scaling_matrix), -+ }, + }; + -+ rv = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, mreq, control, -+ controls->has_scaling ? 
-+ FF_ARRAY_ELEMS(control) : -+ FF_ARRAY_ELEMS(control) - 1); ++ if (slices) ++ control[n++] = (struct v4l2_ext_control) { ++ .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, ++ .ptr = slices, ++ .size = sizeof(*slices) * slice_count, ++ }; ++ ++ if (controls->has_scaling) ++ control[n++] = (struct v4l2_ext_control) { ++ .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX, ++ .ptr = &controls->scaling_matrix, ++ .size = sizeof(controls->scaling_matrix), ++ }; ++ ++#if HEVC_CTRLS_VERSION >= 4 ++ if (offsets) ++ control[n++] = (struct v4l2_ext_control) { ++ .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS, ++ .ptr = offsets, ++ .size = sizeof(((struct V4L2MediaReqDescriptor *)0)->offsets[0]) * offset_count, ++ }; ++#endif ++ ++ rv = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, mreq, control, n); + + return rv; +} + ++// This only works because we started out from a single coded frame buffer ++// that will remain intact until after end_frame +static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) +{ + const HEVCContext * const h = avctx->priv_data; @@ -50579,18 +55760,45 @@ Upstream-status: Pending + int bcount = get_bits_count(&h->HEVClc->gb); + uint32_t boff = (ptr_from_index(buffer, bcount/8 + 1) - (buffer + bcount/8 + 1)) * 8 + bcount; + ++ const unsigned int n = rd->num_slices; ++ const unsigned int block_start = (n / ctx->max_slices) * ctx->max_slices; ++ + int rv; + struct slice_info * si; + ++ // This looks dodgy but we know that FFmpeg has parsed this from a buffer ++ // that contains the entire frame including the start code ++ if (ctx->start_code == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) { ++ buffer -= 3; ++ size += 3; ++ boff += 24; ++ if (buffer[0] != 0 || buffer[1] != 0 || buffer[2] != 1) { ++ av_log(avctx, AV_LOG_ERROR, "Start code requested but missing %02x:%02x:%02x\n", ++ buffer[0], buffer[1], buffer[2]); ++ } ++ } ++ ++ if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) { ++ if (rd->slices == NULL) { ++ if ((rd->slices = av_mallocz(sizeof(*rd->slices))) == NULL) ++ return AVERROR(ENOMEM); ++ rd->slices->ptr = buffer; ++ rd->num_slices = 1; ++ } ++ rd->slices->len = buffer - rd->slices->ptr + size; ++ return 0; ++ } ++ + if ((rv = slice_add(rd)) != 0) + return rv; + -+ si = rd->slices + rd->num_slices - 1; ++ si = rd->slices + n; + si->ptr = buffer; + si->len = size; ++ si->n_offsets = rd->num_offsets; + -+ if (ctx->multi_slice && rd->num_slices > 1) { -+ struct slice_info *const si0 = rd->slices; ++ if (n != block_start) { ++ struct slice_info *const si0 = rd->slices + block_start; + const size_t offset = (buffer - si0->ptr); + boff += offset * 8; + size += offset; @@ -50598,12 +55806,15 @@ Upstream-status: Pending + } + +#if HEVC_CTRLS_VERSION >= 2 -+ if (rd->num_slices == 1) ++ if (n == 0) + fill_decode_params(h, &rd->dec); -+ fill_slice_params(h, &rd->dec, rd->slice_params + rd->num_slices - 1, size * 8, boff); ++ fill_slice_params(h, &rd->dec, rd->slice_params + n, size * 8, boff); +#else -+ fill_slice_params(h, rd->slice_params + rd->num_slices - 1, size * 8, boff); ++ fill_slice_params(h, rd->slice_params + n, size * 8, boff); +#endif ++ if (ctx->max_offsets != 0 && ++ (rv = offsets_add(rd, h->sh.num_entry_point_offsets, h->sh.entry_point_offset)) != 0) ++ return rv; + + return 0; +} @@ -50629,10 +55840,13 @@ Upstream-status: Pending +{ + V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; + ++ const int is_last = (j == rd->num_slices); + struct slice_info *const si = rd->slices + i; + struct 
media_request * req = NULL; + struct qent_src * src = NULL; + MediaBufsStatus stat; ++ void * offsets = rd->offsets + rd->slices[i].n_offsets; ++ size_t n_offsets = (is_last ? rd->num_offsets : rd->slices[j].n_offsets) - rd->slices[i].n_offsets; + + if ((req = media_request_get(ctx->mpool)) == NULL) { + av_log(avctx, AV_LOG_ERROR, "%s: Failed to alloc media request\n", __func__); @@ -50644,8 +55858,8 @@ Upstream-status: Pending +#if HEVC_CTRLS_VERSION >= 2 + &rd->dec, +#endif -+ rd->slice_params, -+ i, j - i)) { ++ rd->slice_params + i, j - i, ++ offsets, n_offsets)) { + av_log(avctx, AV_LOG_ERROR, "%s: Failed to set req ctls\n", __func__); + goto fail1; + } @@ -50665,13 +55879,9 @@ Upstream-status: Pending + goto fail2; + } + -+#warning ANNEX_B start code -+// if (ctx->start_code == V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) { -+// } -+ + stat = mediabufs_start_request(ctx->mbufs, &req, &src, + i == 0 ? rd->qe_dst : NULL, -+ j == rd->num_slices); ++ is_last); + + if (stat != MEDIABUFS_STATUS_SUCCESS) { + av_log(avctx, AV_LOG_ERROR, "%s: Failed to start request\n", __func__); @@ -50736,18 +55946,11 @@ Upstream-status: Pending + } + + // Send as slices -+ if (ctx->multi_slice) -+ { -+ if ((rv = send_slice(avctx, rd, &rc, 0, rd->num_slices)) != 0) ++ for (i = 0; i < rd->num_slices; i += ctx->max_slices) { ++ const unsigned int e = FFMIN(rd->num_slices, i + ctx->max_slices); ++ if ((rv = send_slice(avctx, rd, &rc, i, e)) != 0) + goto fail; + } -+ else -+ { -+ for (i = 0; i != rd->num_slices; ++i) { -+ if ((rv = send_slice(avctx, rd, &rc, i, i + 1)) != 0) -+ goto fail; -+ } -+ } + + // Set the drm_prime desriptor + drm_from_format(&rd->drm, mediabufs_dst_fmt(ctx->mbufs)); @@ -50762,6 +55965,12 @@ Upstream-status: Pending + return rv; +} + ++static inline int ++ctrl_valid(const struct v4l2_query_ext_ctrl * const c, const int64_t v) ++{ ++ return v >= c->minimum && v <= c->maximum; ++} ++ +// Initial check & init +static int +probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) @@ -50773,17 +55982,19 @@ Upstream-status: Pending + + // Check for var slice array + struct v4l2_query_ext_ctrl qc[] = { -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS }, -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS }, -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS }, -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX }, ++ { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS }, ++ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, }, ++ { .id = V4L2_CID_STATELESS_HEVC_SPS }, ++ { .id = V4L2_CID_STATELESS_HEVC_PPS }, ++ { .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX }, +#if HEVC_CTRLS_VERSION >= 2 -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS }, ++ { .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS }, +#endif + }; + // Order & size must match! 
+ static const size_t ctrl_sizes[] = { + sizeof(struct v4l2_ctrl_hevc_slice_params), ++ sizeof(int32_t), + sizeof(struct v4l2_ctrl_hevc_sps), + sizeof(struct v4l2_ctrl_hevc_pps), + sizeof(struct v4l2_ctrl_hevc_scaling_matrix), @@ -50793,26 +56004,44 @@ Upstream-status: Pending + }; + const unsigned int noof_ctrls = FF_ARRAY_ELEMS(qc); + -+ if (mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls)) { -+ av_log(avctx, AV_LOG_DEBUG, "Probed V%d control missing\n", HEVC_CTRLS_VERSION); ++#if HEVC_CTRLS_VERSION == 2 ++ if (mediabufs_ctl_driver_version(ctx->mbufs) >= MEDIABUFS_DRIVER_VERSION(5, 18, 0)) + return AVERROR(EINVAL); -+ } -+ for (i = 0; i != noof_ctrls; ++i) { -+ if (ctrl_sizes[i] != qc[i].elem_size) { -+ av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %d size mismatch %u != %u\n", -+ HEVC_CTRLS_VERSION, i, ctrl_sizes[i], qc[i].elem_size); ++#elif HEVC_CTRLS_VERSION == 3 ++ if (mediabufs_ctl_driver_version(ctx->mbufs) < MEDIABUFS_DRIVER_VERSION(5, 18, 0)) ++ return AVERROR(EINVAL); ++#endif ++ ++ mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls); ++ i = 0; ++#if HEVC_CTRLS_VERSION >= 4 ++ // Skip slice check if no slice mode ++ if (qc[1].type != 0 && !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED)) ++ i = 1; ++#else ++ // Fail frame mode silently for anything prior to V4 ++ if (qc[1].type == 0 || !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED)) ++ return AVERROR(EINVAL); ++#endif ++ for (; i != noof_ctrls; ++i) { ++ if (qc[i].type == 0) { ++ av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %#x missing\n", HEVC_CTRLS_VERSION, qc[i].id); ++ return AVERROR(EINVAL); ++ } ++ if (ctrl_sizes[i] != (size_t)qc[i].elem_size) { ++ av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %d size mismatch %zu != %zu\n", ++ HEVC_CTRLS_VERSION, i, ctrl_sizes[i], (size_t)qc[i].elem_size); + return AVERROR(EINVAL); + } + } + + fill_sps(&ctrl_sps, sps); + -+ if (mediabufs_set_ext_ctrl(ctx->mbufs, NULL, V4L2_CID_MPEG_VIDEO_HEVC_SPS, &ctrl_sps, sizeof(ctrl_sps))) { ++ if (mediabufs_set_ext_ctrl(ctx->mbufs, NULL, V4L2_CID_STATELESS_HEVC_SPS, &ctrl_sps, sizeof(ctrl_sps))) { + av_log(avctx, AV_LOG_ERROR, "Failed to set initial SPS\n"); + return AVERROR(EINVAL); + } + -+ ctx->multi_slice = (qc[0].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) != 0; + return 0; +} + @@ -50823,38 +56052,63 @@ Upstream-status: Pending + int ret; + + struct v4l2_query_ext_ctrl querys[] = { -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, }, -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, }, -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, }, ++ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, }, ++ { .id = V4L2_CID_STATELESS_HEVC_START_CODE, }, ++ { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, }, ++#if HEVC_CTRLS_VERSION >= 4 ++ { .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS, }, ++#endif + }; + + struct v4l2_ext_control ctrls[] = { -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, }, -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, }, ++ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, }, ++ { .id = V4L2_CID_STATELESS_HEVC_START_CODE, }, + }; + + mediabufs_ctl_query_ext_ctrls(ctx->mbufs, querys, FF_ARRAY_ELEMS(querys)); + -+ ctx->decode_mode = querys[0].default_value; ++ ctx->max_slices = (!(querys[2].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) || ++ querys[2].nr_of_dims != 1 || querys[2].dims[0] == 0) ? 
++ 1 : querys[2].dims[0]; ++ av_log(avctx, AV_LOG_DEBUG, "%s: Max slices %d\n", __func__, ctx->max_slices); + -+ if (ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED && -+ ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED) { -+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode, %d\n", __func__, ctx->decode_mode); ++#if HEVC_CTRLS_VERSION >= 4 ++ ctx->max_offsets = (querys[3].type == 0 || querys[3].nr_of_dims != 1) ? ++ 0 : querys[3].dims[0]; ++ av_log(avctx, AV_LOG_DEBUG, "%s: Entry point offsets %d\n", __func__, ctx->max_offsets); ++#else ++ ctx->max_offsets = 0; ++#endif ++ ++ if (querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED || ++ querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) ++ ctx->decode_mode = querys[0].default_value; ++ else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED)) ++ ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED; ++ else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED)) ++ ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED; ++ else { ++ av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode\n", __func__); + return AVERROR(EINVAL); + } + -+ ctx->start_code = querys[1].default_value; -+ if (ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE && -+ ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) { -+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code, %d\n", __func__, ctx->start_code); ++ if (querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_NONE || ++ querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) ++ ctx->start_code = querys[1].default_value; ++ else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_ANNEX_B)) ++ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B; ++ else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE)) ++ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE; ++ else { ++ av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code\n", __func__); + return AVERROR(EINVAL); + } + -+ ctx->max_slices = querys[2].elems; -+ if (ctx->max_slices > MAX_SLICES) { -+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported max slices, %d\n", __func__, ctx->max_slices); -+ return AVERROR(EINVAL); -+ } ++ // If we are in slice mode & START_CODE_NONE supported then pick that ++ // as it doesn't require the slightly dodgy look backwards in our raw buffer ++ if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED && ++ ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE)) ++ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE; + + ctrls[0].value = ctx->decode_mode; + ctrls[1].value = ctx->start_code; @@ -50878,6 +56132,7 @@ Upstream-status: Pending + + av_freep(&rd->slices); + av_freep(&rd->slice_params); ++ av_freep(&rd->offsets); + + av_free(rd); +} @@ -50904,6 +56159,7 @@ Upstream-status: Pending + return ref; +} + ++#if 0 +static void v4l2_req_pool_free(void *opaque) +{ + av_log(NULL, AV_LOG_DEBUG, "%s: opaque=%p\n", __func__, opaque); @@ -50915,6 +56171,7 @@ Upstream-status: Pending + + av_buffer_pool_uninit(&hwfc->pool); +} ++#endif + +static int frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx) +{ @@ -50931,7 +56188,7 @@ Upstream-status: Pending + hwfc->width = vfmt->fmt.pix.width; + hwfc->height = vfmt->fmt.pix.height; + } -+ ++#if 0 + hwfc->pool = av_buffer_pool_init2(sizeof(V4L2MediaReqDescriptor), avctx, v4l2_req_frame_alloc, v4l2_req_pool_free); + if (!hwfc->pool) + return AVERROR(ENOMEM); @@ -50950,12 
+56207,32 @@ Upstream-status: Pending + default: + hwfc->initial_pool_size += 2; + } -+ ++#endif + av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p hw_frames_ctx=%p hwfc=%p pool=%p width=%d height=%d initial_pool_size=%d\n", __func__, avctx, ctx, hw_frames_ctx, hwfc, hwfc->pool, hwfc->width, hwfc->height, hwfc->initial_pool_size); + + return 0; +} + ++static int alloc_frame(AVCodecContext * avctx, AVFrame *frame) ++{ ++ int rv; ++ ++ frame->buf[0] = v4l2_req_frame_alloc(avctx, sizeof(V4L2MediaReqDescriptor)); ++ if (!frame->buf[0]) ++ return AVERROR(ENOMEM); ++ ++ frame->data[0] = frame->buf[0]->data; ++ ++ frame->hw_frames_ctx = av_buffer_ref(avctx->hw_frames_ctx); ++ ++ if ((rv = ff_attach_decode_data(frame)) != 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to attach decode data to frame\n"); ++ av_frame_unref(frame); ++ return rv; ++ } ++ ++ return 0; ++} + +const v4l2_req_decode_fns V(ff_v4l2_req_hevc) = { + .src_pix_fmt_v4l2 = V4L2_PIX_FMT_HEVC_SLICE, @@ -50968,11 +56245,12 @@ Upstream-status: Pending + .end_frame = v4l2_request_hevc_end_frame, + .abort_frame = v4l2_request_hevc_abort_frame, + .frame_params = frame_params, ++ .alloc_frame = alloc_frame, +}; + --- /dev/null +++ b/libavcodec/v4l2_req_media.c -@@ -0,0 +1,1569 @@ +@@ -0,0 +1,1601 @@ +/* + * Copyright (C) 2018 Paul Kocialkowski + * @@ -51388,7 +56666,7 @@ Upstream-status: Pending + free(be_dst); +} + -+static struct qent_dst * qe_dst_new(void) ++static struct qent_dst* qe_dst_new(struct ff_weak_link_master * const wl) +{ + struct qent_dst *const be_dst = malloc(sizeof(*be_dst)); + if (!be_dst) @@ -51396,7 +56674,8 @@ Upstream-status: Pending + *be_dst = (struct qent_dst){ + .base = QENT_BASE_INITIALIZER, + .lock = PTHREAD_MUTEX_INITIALIZER, -+ .cond = PTHREAD_COND_INITIALIZER ++ .cond = PTHREAD_COND_INITIALIZER, ++ .mbc_wl = ff_weak_link_ref(wl) + }; + return be_dst; +} @@ -51568,6 +56847,7 @@ Upstream-status: Pending + int vfd; + bool stream_on; + bool polling; ++ bool dst_fixed; // Dst Q is fixed size + pthread_mutex_t lock; + struct buf_pool * src; + struct buf_pool * dst; @@ -51577,6 +56857,7 @@ Upstream-status: Pending + + struct v4l2_format src_fmt; + struct v4l2_format dst_fmt; ++ struct v4l2_capability capability; +}; + +static int qe_v4l2_queue(struct qent_base *const be, @@ -51747,13 +57028,13 @@ Upstream-status: Pending +{ + if (!be->dh[0] || len > dmabuf_size(be->dh[0])) { + size_t newsize = round_up_size(len); -+ request_log("%s: Overrun %d > %d; trying %d\n", __func__, len, dmabuf_size(be->dh[0]), newsize); ++ request_log("%s: Overrun %zd > %zd; trying %zd\n", __func__, len, dmabuf_size(be->dh[0]), newsize); + if (!dbsc) { + request_log("%s: No dmbabuf_ctrl for realloc\n", __func__); + return -ENOMEM; + } + if ((be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], newsize)) == NULL) { -+ request_log("%s: Realloc %d failed\n", __func__, newsize); ++ request_log("%s: Realloc %zd failed\n", __func__, newsize); + return -ENOMEM; + } + } @@ -52069,10 +57350,13 @@ Upstream-status: Pending + return MEDIABUFS_STATUS_SUCCESS; +} + -+static int create_dst_buf(struct mediabufs_ctl *const mbc) ++// Returns noof buffers created, -ve for error ++static int create_dst_bufs(struct mediabufs_ctl *const mbc, unsigned int n, struct qent_dst * const qes[]) +{ ++ unsigned int i; ++ + struct v4l2_create_buffers cbuf = { -+ .count = 1, ++ .count = n, + .memory = V4L2_MEMORY_DMABUF, + .format = mbc->dst_fmt, + }; @@ -52084,7 +57368,14 @@ Upstream-status: Pending + return -err; + } + } -+ return cbuf.index; ++ ++ if (cbuf.count != n) ++ 
request_warn(mbc->dc, "%s: Created %d of %d V4L2 buffers requested\n", __func__, cbuf.count, n); ++ ++ for (i = 0; i != cbuf.count; ++i) ++ qes[i]->base.index = cbuf.index + i; ++ ++ return cbuf.count; +} + +struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struct dmabufs_ctl *const dbsc) @@ -52092,27 +57383,29 @@ Upstream-status: Pending + struct qent_dst * be_dst; + + if (mbc == NULL) { -+ be_dst = qe_dst_new(); ++ be_dst = qe_dst_new(NULL); + if (be_dst) + be_dst->base.status = QENT_IMPORT; + return be_dst; + } + -+ be_dst = base_to_dst(queue_tryget_free(mbc->dst)); -+ if (!be_dst) { -+ int index; -+ -+ be_dst = qe_dst_new(); ++ if (mbc->dst_fixed) { ++ be_dst = base_to_dst(queue_get_free(mbc->dst)); + if (!be_dst) + return NULL; ++ } ++ else { ++ be_dst = base_to_dst(queue_tryget_free(mbc->dst)); ++ if (!be_dst) { ++ be_dst = qe_dst_new(mbc->this_wlm); ++ if (!be_dst) ++ return NULL; + -+ if ((be_dst->mbc_wl = ff_weak_link_ref(mbc->this_wlm)) == NULL || -+ (index = create_dst_buf(mbc)) < 0) { -+ qe_dst_free(be_dst); -+ return NULL; ++ if (create_dst_bufs(mbc, 1, &be_dst) != 1) { ++ qe_dst_free(be_dst); ++ return NULL; ++ } + } -+ -+ be_dst->base.index = (uint32_t)index; + } + + if (qe_alloc_from_fmt(&be_dst->base, dbsc, &mbc->dst_fmt)) { @@ -52166,29 +57459,42 @@ Upstream-status: Pending + return status; +} + -+MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, unsigned int n) ++// ** This is a mess if we get partial alloc but without any way to remove ++// individual V4L2 Q members we are somewhat stuffed ++MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed) +{ -+ // **** request buffers + unsigned int i; ++ int a = 0; ++ unsigned int qc; ++ struct qent_dst * qes[32]; + -+ for (i = 0; i != n; ++i) ++ if (n > 32) ++ return MEDIABUFS_ERROR_ALLOCATION_FAILED; ++ ++ // Create qents first as it is hard to get rid of the V4L2 buffers on error ++ for (qc = 0; qc != n; ++qc) + { -+ int index; -+ struct qent_dst * const be_dst = qe_dst_new(); -+ if (!be_dst) -+ return MEDIABUFS_ERROR_OPERATION_FAILED; -+ -+ index = create_dst_buf(mbc); -+ if (index < 0) { -+ qe_dst_free(be_dst); -+ return MEDIABUFS_ERROR_OPERATION_FAILED; -+ } -+ -+ // Add index to free chain -+ be_dst->base.index = (uint32_t)index; -+ queue_put_free(mbc->dst, &be_dst->base); ++ if ((qes[qc] = qe_dst_new(mbc->this_wlm)) == NULL) ++ goto fail; + } ++ ++ if ((a = create_dst_bufs(mbc, n, qes)) < 0) ++ goto fail; ++ ++ for (i = 0; i != a; ++i) ++ queue_put_free(mbc->dst, &qes[i]->base); ++ ++ if (a != n) ++ goto fail; ++ ++ mbc->dst_fixed = fixed; + return MEDIABUFS_STATUS_SUCCESS; ++ ++fail: ++ for (i = (a < 0 ? 0 : a); i != qc; ++i) ++ qe_dst_free(qes[i]); ++ ++ return MEDIABUFS_ERROR_ALLOCATION_FAILED; +} + +struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc) @@ -52446,20 +57752,24 @@ Upstream-status: Pending + mediabufs_ctl_delete(mbc); +} + ++unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc) ++{ ++ return mbc->capability.version; ++} ++ +static int set_capabilities(struct mediabufs_ctl *const mbc) +{ -+ struct v4l2_capability capability = { 0 }; + uint32_t caps; + -+ if (ioctl(mbc->vfd, VIDIOC_QUERYCAP, &capability)) { ++ if (ioctl(mbc->vfd, VIDIOC_QUERYCAP, &mbc->capability)) { + int err = errno; + request_err(mbc->dc, "Failed to get capabilities: %s\n", strerror(err)); + return -err; + } + -+ caps = (capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0 ? 
-+ capability.device_caps : -+ capability.capabilities; ++ caps = (mbc->capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0 ? ++ mbc->capability.device_caps : ++ mbc->capability.capabilities; + + if ((caps & V4L2_CAP_VIDEO_M2M_MPLANE) != 0) { + mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; @@ -52544,7 +57854,7 @@ Upstream-status: Pending + --- /dev/null +++ b/libavcodec/v4l2_req_media.h -@@ -0,0 +1,148 @@ +@@ -0,0 +1,154 @@ +/* +e.h +* @@ -52646,11 +57956,14 @@ Upstream-status: Pending + struct qent_dst *const dst_be, + const bool is_final); +// Get / alloc a dst buffer & associate with a slot -+// * BEWARE * Currently has no alloc limit ++// If the dst pool is empty then behaviour depends on the fixed flag passed to ++// dst_slots_create. Default is !fixed = unlimited alloc +struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, + struct dmabufs_ctl *const dbsc); +// Create dst slots without alloc -+MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, unsigned int n); ++// If fixed true then qent_alloc will only get slots from this pool and will ++// block until a qent has been unrefed ++MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed); + +MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc); +MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc); @@ -52686,6 +57999,9 @@ Upstream-status: Pending + struct dmabufs_ctl * const dbsc, + unsigned int n); + ++#define MEDIABUFS_DRIVER_VERSION(a, b, c) (((a) << 16) | ((b) << 8) | (c)) ++unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc); ++ +struct mediabufs_ctl * mediabufs_ctl_new(void * const dc, + const char *vpath, struct pollqueue *const pq); +void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc); @@ -52695,7 +58011,7 @@ Upstream-status: Pending +#endif --- /dev/null +++ b/libavcodec/v4l2_req_pollqueue.c -@@ -0,0 +1,363 @@ +@@ -0,0 +1,361 @@ +#include +#include +#include @@ -52885,19 +58201,19 @@ Upstream-status: Pending + unsigned int i; + unsigned int n = 0; + struct polltask *pt; ++ struct polltask *pt_next; + uint64_t now = pollqueue_now(0); + int timeout = -1; + int rv; + -+ for (pt = pq->head; pt; pt = pt->next) { ++ for (pt = pq->head; pt; pt = pt_next) { + int64_t t; + ++ pt_next = pt->next; ++ + if (pt->state == POLLTASK_Q_KILL) { -+ struct polltask * const prev = pt->prev; + pollqueue_rem_task(pq, pt); + sem_post(&pt->kill_sem); -+ if ((pt = prev) == NULL) -+ break; + continue; + } + @@ -52936,8 +58252,8 @@ Upstream-status: Pending + * infinite looping + */ + pq->no_prod = true; -+ for (i = 0, pt = pq->head; i < n; ++i) { -+ struct polltask *const pt_next = pt->next; ++ for (i = 0, pt = pq->head; i < n; ++i, pt = pt_next) { ++ pt_next = pt->next; + + /* Pending? */ + if (a[i].revents || @@ -52961,8 +58277,6 @@ Upstream-status: Pending + if (pt->state == POLLTASK_RUN_KILL) + sem_post(&pt->kill_sem); + } -+ -+ pt = pt_next; + } + pq->no_prod = false; + @@ -53082,12 +58396,13 @@ Upstream-status: Pending +#endif /* POLLQUEUE_H_ */ --- /dev/null +++ b/libavcodec/v4l2_req_utils.h -@@ -0,0 +1,21 @@ +@@ -0,0 +1,22 @@ +#include "libavutil/log.h" + +#define request_log(...) av_log(NULL, AV_LOG_INFO, __VA_ARGS__) + +#define request_err(_ctx, ...) av_log(_ctx, AV_LOG_ERROR, __VA_ARGS__) ++#define request_warn(_ctx, ...) av_log(_ctx, AV_LOG_WARNING, __VA_ARGS__) +#define request_info(_ctx, ...) av_log(_ctx, AV_LOG_INFO, __VA_ARGS__) +#define request_debug(_ctx, ...) 
av_log(_ctx, AV_LOG_DEBUG, __VA_ARGS__) + @@ -53106,7 +58421,7 @@ Upstream-status: Pending + --- /dev/null +++ b/libavcodec/v4l2_request_hevc.c -@@ -0,0 +1,280 @@ +@@ -0,0 +1,315 @@ +/* + * This file is part of FFmpeg. + * @@ -53194,6 +58509,13 @@ Upstream-status: Pending + return ctx->fns->frame_params(avctx, hw_frames_ctx); +} + ++static int v4l2_req_hevc_alloc_frame(AVCodecContext * avctx, AVFrame *frame) ++{ ++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; ++ return ctx->fns->alloc_frame(avctx, frame); ++} ++ ++ +static int v4l2_request_hevc_uninit(AVCodecContext *avctx) +{ + V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; @@ -53248,6 +58570,17 @@ Upstream-status: Pending + + av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__); + ++ // Give up immediately if this is something that we have no code to deal with ++ if (h->ps.sps->chroma_format_idc != 1) { ++ av_log(avctx, AV_LOG_WARNING, "chroma_format_idc(%d) != 1: Not implemented\n", h->ps.sps->chroma_format_idc); ++ return AVERROR_PATCHWELCOME; ++ } ++ if (!(h->ps.sps->bit_depth == 10 || h->ps.sps->bit_depth == 8) || ++ h->ps.sps->bit_depth != h->ps.sps->bit_depth_chroma) { ++ av_log(avctx, AV_LOG_WARNING, "Bit depth Y:%d C:%d: Not implemented\n", h->ps.sps->bit_depth, h->ps.sps->bit_depth_chroma); ++ return AVERROR_PATCHWELCOME; ++ } ++ + if ((ret = devscan_build(avctx, &ctx->devscan)) != 0) { + av_log(avctx, AV_LOG_WARNING, "Failed to find any V4L2 devices\n"); + return (AVERROR(-ret)); @@ -53300,7 +58633,15 @@ Upstream-status: Pending + goto fail4; + } + -+ if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) { ++ if (V2(ff_v4l2_req_hevc, 4).probe(avctx, ctx) == 0) { ++ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 4 probed successfully\n"); ++ ctx->fns = &V2(ff_v4l2_req_hevc, 4); ++ } ++ else if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) { ++ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 3 probed successfully\n"); ++ ctx->fns = &V2(ff_v4l2_req_hevc, 3); ++ } ++ else if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) { + av_log(avctx, AV_LOG_DEBUG, "HEVC API version 2 probed successfully\n"); + ctx->fns = &V2(ff_v4l2_req_hevc, 2); + } @@ -53325,9 +58666,18 @@ Upstream-status: Pending + goto fail4; + } + -+ if (mediabufs_dst_slots_create(ctx->mbufs, 1)) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to create destination slots\n"); -+ goto fail4; ++ { ++ unsigned int dst_slots = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering + ++ avctx->thread_count + (avctx->extra_hw_frames > 0 ? 
avctx->extra_hw_frames : 6); ++ av_log(avctx, AV_LOG_DEBUG, "Slots=%d: Reordering=%d, threads=%d, hw+=%d\n", dst_slots, ++ sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering, ++ avctx->thread_count, avctx->extra_hw_frames); ++ ++ // extra_hw_frames is -1 if unset ++ if (mediabufs_dst_slots_create(ctx->mbufs, dst_slots, (avctx->extra_hw_frames > 0))) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to create destination slots\n"); ++ goto fail4; ++ } + } + + if (mediabufs_stream_on(ctx->mbufs)) { @@ -53376,7 +58726,7 @@ Upstream-status: Pending + .type = AVMEDIA_TYPE_VIDEO, + .id = AV_CODEC_ID_HEVC, + .pix_fmt = AV_PIX_FMT_DRM_PRIME, -+// .alloc_frame = v4l2_request_hevc_alloc_frame, ++ .alloc_frame = v4l2_req_hevc_alloc_frame, + .start_frame = v4l2_req_hevc_start_frame, + .decode_slice = v4l2_req_hevc_decode_slice, + .end_frame = v4l2_req_hevc_end_frame, @@ -53389,7 +58739,7 @@ Upstream-status: Pending +}; --- /dev/null +++ b/libavcodec/v4l2_request_hevc.h -@@ -0,0 +1,100 @@ +@@ -0,0 +1,101 @@ +#ifndef AVCODEC_V4L2_REQUEST_HEVC_H +#define AVCODEC_V4L2_REQUEST_HEVC_H + @@ -53437,8 +58787,6 @@ Upstream-status: Pending +#define V4L2_CTRL_FLAG_DYNAMIC_ARRAY 0x0800 +#endif + -+#define MAX_SLICES 128 -+ +#define VCAT(name, version) name##_v##version +#define V2(n,v) VCAT(n, v) +#define V(n) V2(n, HEVC_CTRLS_VERSION) @@ -53455,10 +58803,10 @@ Upstream-status: Pending + + unsigned int timestamp; // ?? maybe uint64_t + -+ int multi_slice; + int decode_mode; + int start_code; -+ int max_slices; ++ unsigned int max_slices; // 0 => not wanted (frame mode) ++ unsigned int max_offsets; // 0 => not wanted + + req_decode_q decode_q; + @@ -53483,16 +58831,121 @@ Upstream-status: Pending + int (*end_frame)(AVCodecContext *avctx); + void (*abort_frame)(AVCodecContext *avctx); + int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx); ++ int (*alloc_frame)(AVCodecContext * avctx, AVFrame *frame); +} v4l2_req_decode_fns; + + +extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 1); +extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 2); ++extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 3); ++extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 4); + +#endif +--- a/libavcodec/vc1dec.c ++++ b/libavcodec/vc1dec.c +@@ -486,7 +486,7 @@ static av_cold int vc1_decode_init(AVCod + size = next - start - 4; + if (size <= 0) + continue; +- buf2_size = vc1_unescape_buffer(start + 4, size, buf2); ++ buf2_size = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2); + init_get_bits(&gb, buf2, buf2_size * 8); + switch (AV_RB32(start)) { + case VC1_CODE_SEQHDR: +@@ -689,7 +689,7 @@ static int vc1_decode_frame(AVCodecConte + case VC1_CODE_FRAME: + if (avctx->hwaccel) + buf_start = start; +- buf_size2 = vc1_unescape_buffer(start + 4, size, buf2); ++ buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2); + break; + case VC1_CODE_FIELD: { + int buf_size3; +@@ -706,8 +706,8 @@ static int vc1_decode_frame(AVCodecConte + ret = AVERROR(ENOMEM); + goto err; + } +- buf_size3 = vc1_unescape_buffer(start + 4, size, +- slices[n_slices].buf); ++ buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, ++ slices[n_slices].buf); + init_get_bits(&slices[n_slices].gb, slices[n_slices].buf, + buf_size3 << 3); + slices[n_slices].mby_start = avctx->coded_height + 31 >> 5; +@@ -718,7 +718,7 @@ static int vc1_decode_frame(AVCodecConte + break; + } + case VC1_CODE_ENTRYPOINT: /* it should be before frame data */ +- buf_size2 = vc1_unescape_buffer(start + 4, size, buf2); ++ buf_size2 = 
v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2); + init_get_bits(&s->gb, buf2, buf_size2 * 8); + ff_vc1_decode_entry_point(avctx, v, &s->gb); + break; +@@ -735,8 +735,8 @@ static int vc1_decode_frame(AVCodecConte + ret = AVERROR(ENOMEM); + goto err; + } +- buf_size3 = vc1_unescape_buffer(start + 4, size, +- slices[n_slices].buf); ++ buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, ++ slices[n_slices].buf); + init_get_bits(&slices[n_slices].gb, slices[n_slices].buf, + buf_size3 << 3); + slices[n_slices].mby_start = get_bits(&slices[n_slices].gb, 9); +@@ -770,7 +770,7 @@ static int vc1_decode_frame(AVCodecConte + ret = AVERROR(ENOMEM); + goto err; + } +- buf_size3 = vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf); ++ buf_size3 = v->vc1dsp.vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf); + init_get_bits(&slices[n_slices].gb, slices[n_slices].buf, + buf_size3 << 3); + slices[n_slices].mby_start = s->mb_height + 1 >> 1; +@@ -779,9 +779,9 @@ static int vc1_decode_frame(AVCodecConte + n_slices1 = n_slices - 1; + n_slices++; + } +- buf_size2 = vc1_unescape_buffer(buf, divider - buf, buf2); ++ buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, divider - buf, buf2); + } else { +- buf_size2 = vc1_unescape_buffer(buf, buf_size, buf2); ++ buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, buf_size, buf2); + } + init_get_bits(&s->gb, buf2, buf_size2*8); + } else +--- a/libavcodec/vc1dsp.c ++++ b/libavcodec/vc1dsp.c +@@ -32,6 +32,7 @@ + #include "rnd_avg.h" + #include "vc1dsp.h" + #include "startcode.h" ++#include "vc1_common.h" + + /* Apply overlap transform to horizontal edge */ + static void vc1_v_overlap_c(uint8_t *src, int stride) +@@ -1028,6 +1029,7 @@ av_cold void ff_vc1dsp_init(VC1DSPContex + #endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */ + + dsp->startcode_find_candidate = ff_startcode_find_candidate_c; ++ dsp->vc1_unescape_buffer = vc1_unescape_buffer; + + if (ARCH_AARCH64) + ff_vc1dsp_init_aarch64(dsp); +--- a/libavcodec/vc1dsp.h ++++ b/libavcodec/vc1dsp.h +@@ -80,6 +80,9 @@ typedef struct VC1DSPContext { + * one or more further zero bytes and a one byte. 
+ */ + int (*startcode_find_candidate)(const uint8_t *buf, int size); ++ ++ /* Copy a buffer, removing startcode emulation escape bytes as we go */ ++ int (*vc1_unescape_buffer)(const uint8_t *src, int size, uint8_t *dst); + } VC1DSPContext; + + void ff_vc1dsp_init(VC1DSPContext* c); --- /dev/null +++ b/libavcodec/weak_link.c -@@ -0,0 +1,100 @@ +@@ -0,0 +1,102 @@ +#include +#include +#include @@ -53549,6 +59002,8 @@ Upstream-status: Pending + +struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w) +{ ++ if (!w) ++ return NULL; + atomic_fetch_add(&w->ref_count, 1); + return (struct ff_weak_link_client*)w; +} @@ -53630,7 +59085,7 @@ Upstream-status: Pending +OBJS-$(CONFIG_VOUT_RPI_OUTDEV) += rpi_vout.o OBJS-$(CONFIG_XCBGRAB_INDEV) += xcbgrab.o OBJS-$(CONFIG_XV_OUTDEV) += xv.o - + --- a/libavdevice/alldevices.c +++ b/libavdevice/alldevices.c @@ -52,6 +52,9 @@ extern AVOutputFormat ff_sndio_muxer; @@ -53642,7 +59097,7 @@ Upstream-status: Pending +extern AVOutputFormat ff_vout_rpi_muxer; extern AVInputFormat ff_xcbgrab_demuxer; extern AVOutputFormat ff_xv_muxer; - + --- /dev/null +++ b/libavdevice/drm_vout.c @@ -0,0 +1,643 @@ @@ -53856,7 +59311,7 @@ Upstream-status: Pending + + while (drmWaitVBlank(de->drm_fd, &vbl)) { + if (errno != EINTR) { -+ av_log(s, AV_LOG_WARNING, "drmWaitVBlank failed: %s\n", ERRSTR); ++// av_log(s, AV_LOG_WARNING, "drmWaitVBlank failed: %s\n", ERRSTR); + break; + } + } @@ -54291,7 +59746,7 @@ Upstream-status: Pending + --- /dev/null +++ b/libavdevice/egl_vout.c -@@ -0,0 +1,825 @@ +@@ -0,0 +1,816 @@ +/* + * Copyright (c) 2020 John Cox for Raspberry Pi Trading + * @@ -54334,16 +59789,8 @@ Upstream-status: Pending +#include +#include + -+#include "drm_fourcc.h" -+#include -+#include -+#include -+#include +#include +#include -+#include -+#include -+#include + +#include "libavutil/rpi_sand_fns.h" + @@ -54555,8 +60002,7 @@ Upstream-status: Pending + XMapWindow(dpy, win); + + { -+ EGLSurface surf = eglCreateWindowSurface(egl_dpy, config, -+ (void *)(uintptr_t)win, NULL); ++ EGLSurface surf = eglCreateWindowSurface(egl_dpy, config, (EGLNativeWindowType)win, NULL); + if (!surf) { + av_log(s, AV_LOG_ERROR, "Error: eglCreateWindowSurface failed\n"); + return -1; @@ -55656,7 +61102,15 @@ Upstream-status: Pending +}; --- a/libavfilter/Makefile +++ b/libavfilter/Makefile -@@ -434,6 +434,7 @@ OBJS-$(CONFIG_TRANSPOSE_OPENCL_FILTER) +@@ -218,6 +218,7 @@ OBJS-$(CONFIG_DEFLATE_FILTER) + OBJS-$(CONFIG_DEFLICKER_FILTER) += vf_deflicker.o + OBJS-$(CONFIG_DEINTERLACE_QSV_FILTER) += vf_deinterlace_qsv.o + OBJS-$(CONFIG_DEINTERLACE_VAAPI_FILTER) += vf_deinterlace_vaapi.o vaapi_vpp.o ++OBJS-$(CONFIG_DEINTERLACE_V4L2M2M_FILTER) += vf_deinterlace_v4l2m2m.o + OBJS-$(CONFIG_DEJUDDER_FILTER) += vf_dejudder.o + OBJS-$(CONFIG_DELOGO_FILTER) += vf_delogo.o + OBJS-$(CONFIG_DENOISE_VAAPI_FILTER) += vf_misc_vaapi.o vaapi_vpp.o +@@ -434,6 +435,7 @@ OBJS-$(CONFIG_TRANSPOSE_OPENCL_FILTER) OBJS-$(CONFIG_TRANSPOSE_VAAPI_FILTER) += vf_transpose_vaapi.o vaapi_vpp.o OBJS-$(CONFIG_TRIM_FILTER) += trim.o OBJS-$(CONFIG_UNPREMULTIPLY_FILTER) += vf_premultiply.o framesync.o @@ -55666,7 +61120,15 @@ Upstream-status: Pending opencl/unsharp.o --- a/libavfilter/allfilters.c +++ b/libavfilter/allfilters.c -@@ -414,6 +414,7 @@ extern AVFilter ff_vf_transpose_opencl; +@@ -204,6 +204,7 @@ extern AVFilter ff_vf_dedot; + extern AVFilter ff_vf_deflate; + extern AVFilter ff_vf_deflicker; + extern AVFilter ff_vf_deinterlace_qsv; ++extern AVFilter ff_vf_deinterlace_v4l2m2m; + extern AVFilter 
ff_vf_deinterlace_vaapi; + extern AVFilter ff_vf_dejudder; + extern AVFilter ff_vf_delogo; +@@ -414,6 +415,7 @@ extern AVFilter ff_vf_transpose_opencl; extern AVFilter ff_vf_transpose_vaapi; extern AVFilter ff_vf_trim; extern AVFilter ff_vf_unpremultiply; @@ -55683,13 +61145,13 @@ Upstream-status: Pending +#if CONFIG_UNSAND_FILTER +#include "libavutil/rpi_sand_fns.h" +#endif - + #define FF_INTERNAL_FIELDS 1 #include "framequeue.h" @@ -427,6 +430,19 @@ static int can_merge_formats(AVFilterFor } } - + +#if CONFIG_UNSAND_FILTER +static int has_sand_format(const AVFilterFormats * const ff) +{ @@ -55711,13 +61173,13 @@ Upstream-status: Pending AVFilterLink *link = filter->inputs[j]; int convert_needed = 0; + unsigned int extra_convert_tried = 0; - + if (!link) continue; @@ -514,11 +531,14 @@ static int query_formats(AVFilterGraph * ) #undef MERGE_DISPATCH - + - if (convert_needed) { + while (convert_needed) { AVFilterContext *convert; @@ -55727,7 +61189,7 @@ Upstream-status: Pending + int can_retry = 0; + + convert_needed = 0; - + if (graph->disable_auto_convert) { av_log(log_ctx, AV_LOG_ERROR, @@ -531,19 +551,45 @@ static int query_formats(AVFilterGraph * @@ -55760,7 +61222,7 @@ Upstream-status: Pending + inst_name, "", NULL, + graph)) < 0) + return ret; - + - if ((ret = avfilter_graph_create_filter(&convert, filter, - inst_name, graph->scale_sws_opts, NULL, - graph)) < 0) @@ -55813,7 +61275,7 @@ Upstream-status: Pending --- a/libavfilter/buffersrc.c +++ b/libavfilter/buffersrc.c @@ -210,7 +210,7 @@ static int av_buffersrc_add_frame_intern - + switch (ctx->outputs[0]->type) { case AVMEDIA_TYPE_VIDEO: - CHECK_VIDEO_PARAM_CHANGE(ctx, s, frame->width, frame->height, @@ -55822,6 +61284,1345 @@ Upstream-status: Pending break; case AVMEDIA_TYPE_AUDIO: --- /dev/null ++++ b/libavfilter/vf_deinterlace_v4l2m2m.c +@@ -0,0 +1,1336 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/** ++ * @file ++ * deinterlace video filter - V4L2 M2M ++ */ ++ ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "libavutil/avassert.h" ++#include "libavutil/avstring.h" ++#include "libavutil/common.h" ++#include "libavutil/hwcontext.h" ++#include "libavutil/hwcontext_drm.h" ++#include "libavutil/internal.h" ++#include "libavutil/mathematics.h" ++#include "libavutil/opt.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/time.h" ++ ++#define FF_INTERNAL_FIELDS 1 ++#include "framequeue.h" ++#include "filters.h" ++#include "avfilter.h" ++#include "formats.h" ++#include "internal.h" ++#include "video.h" ++ ++typedef struct V4L2Queue V4L2Queue; ++typedef struct DeintV4L2M2MContextShared DeintV4L2M2MContextShared; ++ ++typedef struct V4L2PlaneInfo { ++ int bytesperline; ++ size_t length; ++} V4L2PlaneInfo; ++ ++typedef struct V4L2Buffer { ++ int enqueued; ++ int reenqueue; ++ int fd; ++ struct v4l2_buffer buffer; ++ AVFrame frame; ++ struct v4l2_plane planes[VIDEO_MAX_PLANES]; ++ int num_planes; ++ V4L2PlaneInfo plane_info[VIDEO_MAX_PLANES]; ++ AVDRMFrameDescriptor drm_frame; ++ V4L2Queue *q; ++} V4L2Buffer; ++ ++typedef struct V4L2Queue { ++ struct v4l2_format format; ++ int num_buffers; ++ V4L2Buffer *buffers; ++ DeintV4L2M2MContextShared *ctx; ++} V4L2Queue; ++ ++typedef struct pts_stats_s ++{ ++ void * logctx; ++ const char * name; // For debug ++ unsigned int last_count; ++ unsigned int last_interval; ++ int64_t last_pts; ++} pts_stats_t; ++ ++#define PTS_TRACK_SIZE 32 ++typedef struct pts_track_el_s ++{ ++ uint32_t n; ++ unsigned int interval; ++ AVFrame * props; ++} pts_track_el_t; ++ ++typedef struct pts_track_s ++{ ++ uint32_t n; ++ uint32_t last_n; ++ int got_2; ++ void * logctx; ++ pts_stats_t stats; ++ pts_track_el_t a[PTS_TRACK_SIZE]; ++} pts_track_t; ++ ++typedef struct DeintV4L2M2MContextShared { ++ void * logctx; // For logging - will be NULL when done ++ ++ int fd; ++ int done; ++ int width; ++ int height; ++ int orig_width; ++ int orig_height; ++ atomic_uint refcount; ++ ++ AVBufferRef *hw_frames_ctx; ++ ++ unsigned int field_order; ++ ++ pts_track_t track; ++ ++ V4L2Queue output; ++ V4L2Queue capture; ++} DeintV4L2M2MContextShared; ++ ++typedef struct DeintV4L2M2MContext { ++ const AVClass *class; ++ ++ DeintV4L2M2MContextShared *shared; ++} DeintV4L2M2MContext; ++ ++static unsigned int pts_stats_interval(const pts_stats_t * const stats) ++{ ++ return stats->last_interval; ++} ++ ++// Pick 64 for max last count - that is >1sec at 60fps ++#define STATS_LAST_COUNT_MAX 64 ++#define STATS_INTERVAL_MAX (1 << 30) ++static void pts_stats_add(pts_stats_t * const stats, int64_t pts) ++{ ++ if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) { ++ if (stats->last_count < STATS_LAST_COUNT_MAX) ++ ++stats->last_count; ++ return; ++ } ++ ++ if (stats->last_pts != AV_NOPTS_VALUE) { ++ const int64_t interval = pts - stats->last_pts; ++ ++ if (interval < 0 || interval >= STATS_INTERVAL_MAX || ++ stats->last_count >= STATS_LAST_COUNT_MAX) { ++ if (stats->last_interval != 0) ++ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n", ++ __func__, stats->name, interval, stats->last_count); ++ stats->last_interval = 0; ++ } ++ else { ++ const 
int64_t frame_time = interval / (int64_t)stats->last_count; ++ ++ if (frame_time != stats->last_interval) ++ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n", ++ __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time); ++ stats->last_interval = frame_time; ++ } ++ } ++ ++ stats->last_pts = pts; ++ stats->last_count = 1; ++} ++ ++static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name) ++{ ++ *stats = (pts_stats_t){ ++ .logctx = logctx, ++ .name = name, ++ .last_count = 1, ++ .last_interval = 0, ++ .last_pts = AV_NOPTS_VALUE ++ }; ++} ++ ++static inline uint32_t pts_track_next_n(pts_track_t * const trk) ++{ ++ if (++trk->n == 0) ++ trk->n = 1; ++ return trk->n; ++} ++ ++static int pts_track_get_frame(pts_track_t * const trk, const struct timeval tv, AVFrame * const dst) ++{ ++ uint32_t n = (uint32_t)(tv.tv_usec / 2 + tv.tv_sec * 500000); ++ pts_track_el_t * t; ++ ++ // As a first guess assume that n==0 means last frame ++ if (n == 0) { ++ n = trk->last_n; ++ if (n == 0) ++ goto fail; ++ } ++ ++ t = trk->a + (n & (PTS_TRACK_SIZE - 1)); ++ ++ if (t->n != n) { ++ av_log(trk->logctx, AV_LOG_ERROR, "%s: track failure: got %u, expected %u\n", __func__, n, trk->n); ++ goto fail; ++ } ++ ++ // 1st frame is simple - just believe it ++ if (n != trk->last_n) { ++ trk->last_n = n; ++ trk->got_2 = 0; ++ return av_frame_copy_props(dst, t->props); ++ } ++ ++ // Only believe in a single interpolated frame ++ if (trk->got_2) ++ goto fail; ++ trk->got_2 = 1; ++ ++ av_frame_copy_props(dst, t->props); ++ ++ ++ // If we can't guess - don't ++ if (t->interval == 0) { ++ dst->best_effort_timestamp = AV_NOPTS_VALUE; ++ dst->pts = AV_NOPTS_VALUE; ++ dst->pkt_dts = AV_NOPTS_VALUE; ++ } ++ else { ++ if (dst->best_effort_timestamp != AV_NOPTS_VALUE) ++ dst->best_effort_timestamp += t->interval / 2; ++ if (dst->pts != AV_NOPTS_VALUE) ++ dst->pts += t->interval / 2; ++ if (dst->pkt_dts != AV_NOPTS_VALUE) ++ dst->pkt_dts += t->interval / 2; ++ } ++ ++ return 0; ++ ++fail: ++ trk->last_n = 0; ++ trk->got_2 = 0; ++ dst->pts = AV_NOPTS_VALUE; ++ dst->pkt_dts = AV_NOPTS_VALUE; ++ return 0; ++} ++ ++static struct timeval pts_track_add_frame(pts_track_t * const trk, const AVFrame * const src) ++{ ++ const uint32_t n = pts_track_next_n(trk); ++ pts_track_el_t * const t = trk->a + (n & (PTS_TRACK_SIZE - 1)); ++ ++ pts_stats_add(&trk->stats, src->pts); ++ ++ t->n = n; ++ t->interval = pts_stats_interval(&trk->stats); // guess that next interval is the same as the last ++ av_frame_unref(t->props); ++ av_frame_copy_props(t->props, src); ++ ++ // We now know what the previous interval was, rather than having to guess, ++ // so set it. There is a better than decent chance that this is before ++ // we use it. 
++ if (t->interval != 0) { ++ pts_track_el_t * const prev_t = trk->a + ((n - 1) & (PTS_TRACK_SIZE - 1)); ++ prev_t->interval = t->interval; ++ } ++ ++ // In case deinterlace interpolates frames use every other usec ++ return (struct timeval){.tv_sec = n / 500000, .tv_usec = (n % 500000) * 2}; ++} ++ ++static void pts_track_uninit(pts_track_t * const trk) ++{ ++ unsigned int i; ++ for (i = 0; i != PTS_TRACK_SIZE; ++i) { ++ trk->a[i].n = 0; ++ av_frame_free(&trk->a[i].props); ++ } ++} ++ ++static int pts_track_init(pts_track_t * const trk, void *logctx) ++{ ++ unsigned int i; ++ trk->n = 1; ++ pts_stats_init(&trk->stats, logctx, "track"); ++ for (i = 0; i != PTS_TRACK_SIZE; ++i) { ++ trk->a[i].n = 0; ++ if ((trk->a[i].props = av_frame_alloc()) == NULL) { ++ pts_track_uninit(trk); ++ return AVERROR(ENOMEM); ++ } ++ } ++ return 0; ++} ++ ++static int deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx) ++{ ++ struct v4l2_capability cap; ++ int ret; ++ ++ memset(&cap, 0, sizeof(cap)); ++ ret = ioctl(ctx->fd, VIDIOC_QUERYCAP, &cap); ++ if (ret < 0) ++ return ret; ++ ++ if (!(cap.capabilities & V4L2_CAP_STREAMING)) ++ return AVERROR(EINVAL); ++ ++ if (cap.capabilities & V4L2_CAP_VIDEO_M2M) { ++ ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; ++ ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT; ++ ++ return 0; ++ } ++ ++ if (cap.capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) { ++ ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; ++ ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; ++ ++ return 0; ++ } ++ ++ return AVERROR(EINVAL); ++} ++ ++static int deint_v4l2m2m_try_format(V4L2Queue *queue) ++{ ++ struct v4l2_format *fmt = &queue->format; ++ DeintV4L2M2MContextShared *ctx = queue->ctx; ++ int ret, field; ++ ++ ret = ioctl(ctx->fd, VIDIOC_G_FMT, fmt); ++ if (ret) ++ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_FMT failed: %d\n", ret); ++ ++ if (V4L2_TYPE_IS_OUTPUT(fmt->type)) ++ field = V4L2_FIELD_INTERLACED_TB; ++ else ++ field = V4L2_FIELD_NONE; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { ++ fmt->fmt.pix_mp.pixelformat = V4L2_PIX_FMT_YUV420; ++ fmt->fmt.pix_mp.field = field; ++ fmt->fmt.pix_mp.width = ctx->width; ++ fmt->fmt.pix_mp.height = ctx->height; ++ } else { ++ fmt->fmt.pix.pixelformat = V4L2_PIX_FMT_YUV420; ++ fmt->fmt.pix.field = field; ++ fmt->fmt.pix.width = ctx->width; ++ fmt->fmt.pix.height = ctx->height; ++ } ++ ++ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u pre\n", __func__, ++ fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height, ++ fmt->fmt.pix_mp.pixelformat, ++ fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline); ++ ++ ret = ioctl(ctx->fd, VIDIOC_TRY_FMT, fmt); ++ if (ret) ++ return AVERROR(EINVAL); ++ ++ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u post\n", __func__, ++ fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height, ++ fmt->fmt.pix_mp.pixelformat, ++ fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline); ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { ++ if ((fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_YUV420 && ++ fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_NV12) || ++ fmt->fmt.pix_mp.field != field) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); ++ ++ return AVERROR(EINVAL); ++ } ++ } else { ++ if ((fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_YUV420 && ++ fmt->fmt.pix.pixelformat != 
V4L2_PIX_FMT_NV12) || ++ fmt->fmt.pix.field != field) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); ++ ++ return AVERROR(EINVAL); ++ } ++ } ++ ++ return 0; ++} ++ ++static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t pixelformat, uint32_t field, int width, int height, int pitch, int ysize) ++{ ++ struct v4l2_format *fmt = &queue->format; ++ DeintV4L2M2MContextShared *ctx = queue->ctx; ++ int ret; ++ ++ struct v4l2_selection sel = { ++ .type = fmt->type, ++ .target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? V4L2_SEL_TGT_CROP_BOUNDS : V4L2_SEL_TGT_COMPOSE_BOUNDS, ++ }; ++ ++ // This works for most single object 4:2:0 types ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { ++ fmt->fmt.pix_mp.pixelformat = pixelformat; ++ fmt->fmt.pix_mp.field = field; ++ fmt->fmt.pix_mp.width = width; ++ fmt->fmt.pix_mp.height = ysize / pitch; ++ fmt->fmt.pix_mp.plane_fmt[0].bytesperline = pitch; ++ fmt->fmt.pix_mp.plane_fmt[0].sizeimage = ysize + (ysize >> 1); ++ } else { ++ fmt->fmt.pix.pixelformat = pixelformat; ++ fmt->fmt.pix.field = field; ++ fmt->fmt.pix.width = width; ++ fmt->fmt.pix.height = height; ++ fmt->fmt.pix.sizeimage = 0; ++ fmt->fmt.pix.bytesperline = 0; ++ } ++ ++ ret = ioctl(ctx->fd, VIDIOC_S_FMT, fmt); ++ if (ret) { ++ ret = AVERROR(errno); ++ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %d\n", ret); ++ return ret; ++ } ++ ++ if (pixelformat != fmt->fmt.pix.pixelformat) { ++ av_log(ctx->logctx, AV_LOG_ERROR, "Format not supported: %s; S_FMT returned %s\n", av_fourcc2str(pixelformat), av_fourcc2str(fmt->fmt.pix.pixelformat)); ++ return AVERROR(EINVAL); ++ } ++ ++ ret = ioctl(ctx->fd, VIDIOC_G_SELECTION, &sel); ++ if (ret) { ++ ret = AVERROR(errno); ++ av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION failed: %d\n", ret); ++ } ++ ++ sel.r.width = width; ++ sel.r.height = height; ++ sel.r.left = 0; ++ sel.r.top = 0; ++ sel.target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? 
V4L2_SEL_TGT_CROP : V4L2_SEL_TGT_COMPOSE, ++ sel.flags = V4L2_SEL_FLAG_LE; ++ ++ ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &sel); ++ if (ret) { ++ ret = AVERROR(errno); ++ av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_S_SELECTION failed: %d\n", ret); ++ } ++ ++ return 0; ++} ++ ++static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node) ++{ ++ int ret; ++ ++ ctx->fd = open(node, O_RDWR | O_NONBLOCK, 0); ++ if (ctx->fd < 0) ++ return AVERROR(errno); ++ ++ ret = deint_v4l2m2m_prepare_context(ctx); ++ if (ret) ++ goto fail; ++ ++ ret = deint_v4l2m2m_try_format(&ctx->capture); ++ if (ret) ++ goto fail; ++ ++ ret = deint_v4l2m2m_try_format(&ctx->output); ++ if (ret) ++ goto fail; ++ ++ return 0; ++ ++fail: ++ close(ctx->fd); ++ ctx->fd = -1; ++ ++ return ret; ++} ++ ++static int deint_v4l2m2m_find_device(DeintV4L2M2MContextShared *ctx) ++{ ++ int ret = AVERROR(EINVAL); ++ struct dirent *entry; ++ char node[PATH_MAX]; ++ DIR *dirp; ++ ++ dirp = opendir("/dev"); ++ if (!dirp) ++ return AVERROR(errno); ++ ++ for (entry = readdir(dirp); entry; entry = readdir(dirp)) { ++ ++ if (strncmp(entry->d_name, "video", 5)) ++ continue; ++ ++ snprintf(node, sizeof(node), "/dev/%s", entry->d_name); ++ av_log(ctx->logctx, AV_LOG_DEBUG, "probing device %s\n", node); ++ ret = deint_v4l2m2m_probe_device(ctx, node); ++ if (!ret) ++ break; ++ } ++ ++ closedir(dirp); ++ ++ if (ret) { ++ av_log(ctx->logctx, AV_LOG_ERROR, "Could not find a valid device\n"); ++ ctx->fd = -1; ++ ++ return ret; ++ } ++ ++ av_log(ctx->logctx, AV_LOG_INFO, "Using device %s\n", node); ++ ++ return 0; ++} ++ ++static int deint_v4l2m2m_enqueue_buffer(V4L2Buffer *buf) ++{ ++ int ret; ++ ++ ret = ioctl(buf->q->ctx->fd, VIDIOC_QBUF, &buf->buffer); ++ if (ret < 0) ++ return AVERROR(errno); ++ ++ buf->enqueued = 1; ++ ++ return 0; ++} ++ ++static int v4l2_buffer_export_drm(V4L2Buffer* avbuf, const uint32_t pixelformat) ++{ ++ struct v4l2_exportbuffer expbuf; ++ int i, ret; ++ uint64_t mod = DRM_FORMAT_MOD_LINEAR; ++ uint32_t fmt = 0; ++ ++ switch (pixelformat) { ++ case V4L2_PIX_FMT_NV12: ++ fmt = DRM_FORMAT_NV12; ++ break; ++ case V4L2_PIX_FMT_YUV420: ++ fmt = DRM_FORMAT_YUV420; ++ break; ++ default: ++ return AVERROR(EINVAL); ++ } ++ ++ avbuf->drm_frame.layers[0].format = fmt; ++ ++ for (i = 0; i < avbuf->num_planes; i++) { ++ memset(&expbuf, 0, sizeof(expbuf)); ++ ++ expbuf.index = avbuf->buffer.index; ++ expbuf.type = avbuf->buffer.type; ++ expbuf.plane = i; ++ ++ ret = ioctl(avbuf->q->ctx->fd, VIDIOC_EXPBUF, &expbuf); ++ if (ret < 0) ++ return AVERROR(errno); ++ ++ avbuf->fd = expbuf.fd; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buffer.type)) { ++ /* drm frame */ ++ avbuf->drm_frame.objects[i].size = avbuf->buffer.m.planes[i].length; ++ avbuf->drm_frame.objects[i].fd = expbuf.fd; ++ avbuf->drm_frame.objects[i].format_modifier = mod; ++ } else { ++ /* drm frame */ ++ avbuf->drm_frame.objects[0].size = avbuf->buffer.length; ++ avbuf->drm_frame.objects[0].fd = expbuf.fd; ++ avbuf->drm_frame.objects[0].format_modifier = mod; ++ } ++ } ++ ++ return 0; ++} ++ ++static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue) ++{ ++ struct v4l2_format *fmt = &queue->format; ++ DeintV4L2M2MContextShared *ctx = queue->ctx; ++ struct v4l2_requestbuffers req; ++ int ret, i, j, multiplanar; ++ uint32_t memory; ++ ++ memory = V4L2_TYPE_IS_OUTPUT(fmt->type) ? 
++ V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP; ++ ++ multiplanar = V4L2_TYPE_IS_MULTIPLANAR(fmt->type); ++ ++ memset(&req, 0, sizeof(req)); ++ req.count = queue->num_buffers; ++ req.memory = memory; ++ req.type = fmt->type; ++ ++ ret = ioctl(ctx->fd, VIDIOC_REQBUFS, &req); ++ if (ret < 0) { ++ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_REQBUFS failed: %s\n", strerror(errno)); ++ ++ return AVERROR(errno); ++ } ++ ++ queue->num_buffers = req.count; ++ queue->buffers = av_mallocz(queue->num_buffers * sizeof(V4L2Buffer)); ++ if (!queue->buffers) { ++ av_log(ctx->logctx, AV_LOG_ERROR, "malloc enomem\n"); ++ ++ return AVERROR(ENOMEM); ++ } ++ ++ for (i = 0; i < queue->num_buffers; i++) { ++ V4L2Buffer *buf = &queue->buffers[i]; ++ ++ buf->enqueued = 0; ++ buf->fd = -1; ++ buf->q = queue; ++ ++ buf->buffer.type = fmt->type; ++ buf->buffer.memory = memory; ++ buf->buffer.index = i; ++ ++ if (multiplanar) { ++ buf->buffer.length = VIDEO_MAX_PLANES; ++ buf->buffer.m.planes = buf->planes; ++ } ++ ++ ret = ioctl(ctx->fd, VIDIOC_QUERYBUF, &buf->buffer); ++ if (ret < 0) { ++ ret = AVERROR(errno); ++ ++ goto fail; ++ } ++ ++ if (multiplanar) ++ buf->num_planes = buf->buffer.length; ++ else ++ buf->num_planes = 1; ++ ++ for (j = 0; j < buf->num_planes; j++) { ++ V4L2PlaneInfo *info = &buf->plane_info[j]; ++ ++ if (multiplanar) { ++ info->bytesperline = fmt->fmt.pix_mp.plane_fmt[j].bytesperline; ++ info->length = buf->buffer.m.planes[j].length; ++ } else { ++ info->bytesperline = fmt->fmt.pix.bytesperline; ++ info->length = buf->buffer.length; ++ } ++ } ++ ++ if (!V4L2_TYPE_IS_OUTPUT(fmt->type)) { ++ ret = deint_v4l2m2m_enqueue_buffer(buf); ++ if (ret) ++ goto fail; ++ ++ ret = v4l2_buffer_export_drm(buf, multiplanar ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat); ++ if (ret) ++ goto fail; ++ } ++ } ++ ++ return 0; ++ ++fail: ++ for (i = 0; i < queue->num_buffers; i++) ++ if (queue->buffers[i].fd >= 0) ++ close(queue->buffers[i].fd); ++ av_free(queue->buffers); ++ queue->buffers = NULL; ++ ++ return ret; ++} ++ ++static int deint_v4l2m2m_streamon(V4L2Queue *queue) ++{ ++ DeintV4L2M2MContextShared * const ctx = queue->ctx; ++ int type = queue->format.type; ++ int ret; ++ ++ ret = ioctl(ctx->fd, VIDIOC_STREAMON, &type); ++ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, AVERROR(errno)); ++ if (ret < 0) ++ return AVERROR(errno); ++ ++ return 0; ++} ++ ++static int deint_v4l2m2m_streamoff(V4L2Queue *queue) ++{ ++ DeintV4L2M2MContextShared * const ctx = queue->ctx; ++ int type = queue->format.type; ++ int ret; ++ ++ ret = ioctl(ctx->fd, VIDIOC_STREAMOFF, &type); ++ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, AVERROR(errno)); ++ if (ret < 0) ++ return AVERROR(errno); ++ ++ return 0; ++} ++ ++// timeout in ms ++static V4L2Buffer* deint_v4l2m2m_dequeue_buffer(V4L2Queue *queue, int timeout) ++{ ++ struct v4l2_plane planes[VIDEO_MAX_PLANES]; ++ DeintV4L2M2MContextShared *ctx = queue->ctx; ++ struct v4l2_buffer buf = { 0 }; ++ V4L2Buffer* avbuf = NULL; ++ struct pollfd pfd; ++ short events; ++ int ret; ++ ++ if (V4L2_TYPE_IS_OUTPUT(queue->format.type)) ++ events = POLLOUT | POLLWRNORM; ++ else ++ events = POLLIN | POLLRDNORM; ++ ++ pfd.events = events; ++ pfd.fd = ctx->fd; ++ ++ for (;;) { ++ ret = poll(&pfd, 1, timeout); ++ if (ret > 0) ++ break; ++ if (errno == EINTR) ++ continue; ++ return NULL; ++ } ++ ++ if (pfd.revents & POLLERR) ++ return NULL; ++ ++ if (pfd.revents & events) { ++ memset(&buf, 0, sizeof(buf)); ++ 
buf.memory = V4L2_MEMORY_MMAP; ++ buf.type = queue->format.type; ++ if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) { ++ memset(planes, 0, sizeof(planes)); ++ buf.length = VIDEO_MAX_PLANES; ++ buf.m.planes = planes; ++ } ++ ++ ret = ioctl(ctx->fd, VIDIOC_DQBUF, &buf); ++ if (ret) { ++ if (errno != EAGAIN) ++ av_log(ctx->logctx, AV_LOG_DEBUG, "VIDIOC_DQBUF, errno (%s)\n", ++ av_err2str(AVERROR(errno))); ++ return NULL; ++ } ++ ++ avbuf = &queue->buffers[buf.index]; ++ avbuf->enqueued = 0; ++ avbuf->buffer = buf; ++ if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) { ++ memcpy(avbuf->planes, planes, sizeof(planes)); ++ avbuf->buffer.m.planes = avbuf->planes; ++ } ++ return avbuf; ++ } ++ ++ return NULL; ++} ++ ++static V4L2Buffer *deint_v4l2m2m_find_free_buf(V4L2Queue *queue) ++{ ++ int i; ++ V4L2Buffer *buf = NULL; ++ ++ for (i = 0; i < queue->num_buffers; i++) ++ if (!queue->buffers[i].enqueued) { ++ buf = &queue->buffers[i]; ++ break; ++ } ++ return buf; ++} ++ ++static void deint_v4l2m2m_unref_queued(V4L2Queue *queue) ++{ ++ int i; ++ V4L2Buffer *buf = NULL; ++ ++ if (!queue || !queue->buffers) ++ return; ++ for (i = 0; i < queue->num_buffers; i++) { ++ buf = &queue->buffers[i]; ++ if (queue->buffers[i].enqueued) ++ av_frame_unref(&buf->frame); ++ } ++} ++ ++static void recycle_q(V4L2Queue * const queue) ++{ ++ V4L2Buffer* avbuf; ++ while (avbuf = deint_v4l2m2m_dequeue_buffer(queue, 0), avbuf) { ++ av_frame_unref(&avbuf->frame); ++ } ++} ++ ++static int count_enqueued(V4L2Queue *queue) ++{ ++ int i; ++ int n = 0; ++ ++ if (queue->buffers == NULL) ++ return 0; ++ ++ for (i = 0; i < queue->num_buffers; i++) ++ if (queue->buffers[i].enqueued) ++ ++n; ++ return n; ++} ++ ++static int deint_v4l2m2m_enqueue_frame(V4L2Queue * const queue, AVFrame * const frame) ++{ ++ DeintV4L2M2MContextShared *const ctx = queue->ctx; ++ AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)frame->data[0]; ++ V4L2Buffer *buf; ++ int i; ++ ++ if (V4L2_TYPE_IS_OUTPUT(queue->format.type)) ++ recycle_q(queue); ++ ++ buf = deint_v4l2m2m_find_free_buf(queue); ++ if (!buf) { ++ av_log(ctx->logctx, AV_LOG_ERROR, "%s: error %d finding free buf\n", __func__, 0); ++ return AVERROR(EAGAIN); ++ } ++ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type)) ++ for (i = 0; i < drm_desc->nb_objects; i++) ++ buf->buffer.m.planes[i].m.fd = drm_desc->objects[i].fd; ++ else ++ buf->buffer.m.fd = drm_desc->objects[0].fd; ++ ++ buf->buffer.field = !frame->interlaced_frame ? V4L2_FIELD_NONE : ++ frame->top_field_first ? 
V4L2_FIELD_INTERLACED_TB : ++ V4L2_FIELD_INTERLACED_BT; ++ ++ if (ctx->field_order != buf->buffer.field) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Field changed: %d->%d\n", __func__, ctx->field_order, buf->buffer.field); ++ ctx->field_order = buf->buffer.field; ++ } ++ ++ buf->buffer.timestamp = pts_track_add_frame(&ctx->track, frame); ++ ++ buf->drm_frame.objects[0].fd = drm_desc->objects[0].fd; ++ ++ av_frame_move_ref(&buf->frame, frame); ++ ++ return deint_v4l2m2m_enqueue_buffer(buf); ++} ++ ++static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx) ++{ ++ if (atomic_fetch_sub(&ctx->refcount, 1) == 1) { ++ V4L2Queue *capture = &ctx->capture; ++ V4L2Queue *output = &ctx->output; ++ int i; ++ ++ av_log(NULL, AV_LOG_DEBUG, "%s - destroying context\n", __func__); ++ ++ if (ctx->fd >= 0) { ++ deint_v4l2m2m_streamoff(capture); ++ deint_v4l2m2m_streamoff(output); ++ } ++ ++ if (capture->buffers) ++ for (i = 0; i < capture->num_buffers; i++) { ++ capture->buffers[i].q = NULL; ++ if (capture->buffers[i].fd >= 0) ++ close(capture->buffers[i].fd); ++ } ++ ++ deint_v4l2m2m_unref_queued(output); ++ ++ av_buffer_unref(&ctx->hw_frames_ctx); ++ ++ if (capture->buffers) ++ av_free(capture->buffers); ++ ++ if (output->buffers) ++ av_free(output->buffers); ++ ++ if (ctx->fd >= 0) { ++ close(ctx->fd); ++ ctx->fd = -1; ++ } ++ ++ av_free(ctx); ++ } ++} ++ ++static void v4l2_free_buffer(void *opaque, uint8_t *unused) ++{ ++ V4L2Buffer *buf = opaque; ++ DeintV4L2M2MContextShared *ctx = buf->q->ctx; ++ ++ if (!ctx->done) ++ deint_v4l2m2m_enqueue_buffer(buf); ++ ++ deint_v4l2m2m_destroy_context(ctx); ++} ++ ++static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height) ++{ ++ AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; ++ AVDRMLayerDescriptor *layer; ++ ++ /* fill the DRM frame descriptor */ ++ drm_desc->nb_objects = avbuf->num_planes; ++ drm_desc->nb_layers = 1; ++ ++ layer = &drm_desc->layers[0]; ++ layer->nb_planes = avbuf->num_planes; ++ ++ for (int i = 0; i < avbuf->num_planes; i++) { ++ layer->planes[i].object_index = i; ++ layer->planes[i].offset = 0; ++ layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; ++ } ++ ++ switch (layer->format) { ++ case DRM_FORMAT_YUYV: ++ layer->nb_planes = 1; ++ break; ++ ++ case DRM_FORMAT_NV12: ++ case DRM_FORMAT_NV21: ++ if (avbuf->num_planes > 1) ++ break; ++ ++ layer->nb_planes = 2; ++ ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = avbuf->plane_info[0].bytesperline * ++ height; ++ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline; ++ break; ++ ++ case DRM_FORMAT_YUV420: ++ if (avbuf->num_planes > 1) ++ break; ++ ++ layer->nb_planes = 3; ++ ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = avbuf->plane_info[0].bytesperline * ++ height; ++ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1; ++ ++ layer->planes[2].object_index = 0; ++ layer->planes[2].offset = layer->planes[1].offset + ++ ((avbuf->plane_info[0].bytesperline * ++ height) >> 2); ++ layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1; ++ break; ++ ++ default: ++ drm_desc->nb_layers = 0; ++ break; ++ } ++ ++ return (uint8_t *) drm_desc; ++} ++ ++// timeout in ms ++static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int timeout) ++{ ++ DeintV4L2M2MContextShared *ctx = queue->ctx; ++ V4L2Buffer* avbuf; ++ ++ av_log(ctx->logctx, AV_LOG_TRACE, "<<< %s\n", __func__); ++ ++ avbuf = deint_v4l2m2m_dequeue_buffer(queue, timeout); ++ if (!avbuf) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: 
No buffer to dequeue (timeout=%d)\n", __func__, timeout); ++ return AVERROR(EAGAIN); ++ } ++ ++ // Fill in PTS and anciliary info from src frame ++ // we will want to overwrite some fields as only the pts/dts ++ // fields are updated with new timing in this fn ++ pts_track_get_frame(&ctx->track, avbuf->buffer.timestamp, frame); ++ ++ frame->buf[0] = av_buffer_create((uint8_t *) &avbuf->drm_frame, ++ sizeof(avbuf->drm_frame), v4l2_free_buffer, ++ avbuf, AV_BUFFER_FLAG_READONLY); ++ if (!frame->buf[0]) { ++ av_log(ctx->logctx, AV_LOG_ERROR, "%s: error %d creating buffer\n", __func__, 0); ++ return AVERROR(ENOMEM); ++ } ++ ++ atomic_fetch_add(&ctx->refcount, 1); ++ ++ frame->data[0] = (uint8_t *)v4l2_get_drm_frame(avbuf, ctx->orig_height); ++ frame->format = AV_PIX_FMT_DRM_PRIME; ++ if (ctx->hw_frames_ctx) ++ frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx); ++ frame->height = ctx->height; ++ frame->width = ctx->width; ++ ++ // Not interlaced now ++ frame->interlaced_frame = 0; ++ frame->top_field_first = 0; ++ // Pkt duration halved ++ frame->pkt_duration /= 2; ++ ++ if (avbuf->buffer.flags & V4L2_BUF_FLAG_ERROR) { ++ av_log(ctx->logctx, AV_LOG_ERROR, "driver decode error\n"); ++ frame->decode_error_flags |= FF_DECODE_ERROR_INVALID_BITSTREAM; ++ } ++ ++ av_log(ctx->logctx, AV_LOG_TRACE, ">>> %s: PTS=%"PRId64"\n", __func__, frame->pts); ++ return 0; ++} ++ ++static int deint_v4l2m2m_config_props(AVFilterLink *outlink) ++{ ++ AVFilterLink *inlink = outlink->src->inputs[0]; ++ AVFilterContext *avctx = outlink->src; ++ DeintV4L2M2MContext *priv = avctx->priv; ++ DeintV4L2M2MContextShared *ctx = priv->shared; ++ int ret; ++ ++ ctx->height = avctx->inputs[0]->h; ++ ctx->width = avctx->inputs[0]->w; ++ ++ av_log(priv, AV_LOG_DEBUG, "%s: %dx%d\n", __func__, ctx->width, ctx->height); ++ ++ outlink->time_base = inlink->time_base; ++ outlink->w = inlink->w; ++ outlink->h = inlink->h; ++ outlink->sample_aspect_ratio = inlink->sample_aspect_ratio; ++ outlink->format = inlink->format; ++ outlink->frame_rate = (AVRational) {1, 0}; // Deny knowledge of frame rate ++ ++ ret = deint_v4l2m2m_find_device(ctx); ++ if (ret) ++ return ret; ++ ++ if (inlink->hw_frames_ctx) { ++ ctx->hw_frames_ctx = av_buffer_ref(inlink->hw_frames_ctx); ++ if (!ctx->hw_frames_ctx) ++ return AVERROR(ENOMEM); ++ } ++ return 0; ++} ++ ++static int deint_v4l2m2m_query_formats(AVFilterContext *avctx) ++{ ++ static const enum AVPixelFormat pixel_formats[] = { ++ AV_PIX_FMT_DRM_PRIME, ++ AV_PIX_FMT_YUV420P, ++ AV_PIX_FMT_NONE, ++ }; ++ ++ return ff_set_common_formats(avctx, ff_make_format_list(pixel_formats)); ++} ++ ++static uint32_t desc_pixelformat(const AVDRMFrameDescriptor * const drm_desc) ++{ ++ const int is_linear = (drm_desc->objects[0].format_modifier == DRM_FORMAT_MOD_LINEAR || ++ drm_desc->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID); ++ ++ switch (drm_desc->layers[0].format) { ++ case DRM_FORMAT_YUV420: ++ if (is_linear) ++ return drm_desc->nb_objects == 1 ? V4L2_PIX_FMT_YUV420 : 0; ++ break; ++ case DRM_FORMAT_NV12: ++ if (is_linear) ++ return drm_desc->nb_objects == 1 ? 
V4L2_PIX_FMT_NV12 : 0; ++ break; ++ default: ++ break; ++ } ++ return 0; ++} ++ ++static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) ++{ ++ AVFilterContext *avctx = link->dst; ++ DeintV4L2M2MContext *priv = avctx->priv; ++ DeintV4L2M2MContextShared *ctx = priv->shared; ++ V4L2Queue *capture = &ctx->capture; ++ V4L2Queue *output = &ctx->output; ++ int ret; ++ ++ av_log(priv, AV_LOG_DEBUG, "<<< %s: input pts: %"PRId64" (%"PRId64") field :%d interlaced: %d aspect:%d/%d\n", ++ __func__, in->pts, AV_NOPTS_VALUE, in->top_field_first, in->interlaced_frame, in->sample_aspect_ratio.num, in->sample_aspect_ratio.den); ++ av_log(priv, AV_LOG_DEBUG, "--- %s: in status in %d/ot %d; out status in %d/out %d\n", __func__, ++ avctx->inputs[0]->status_in, avctx->inputs[0]->status_out, avctx->outputs[0]->status_in, avctx->outputs[0]->status_out); ++ ++ if (ctx->field_order == V4L2_FIELD_ANY) { ++ const AVDRMFrameDescriptor * const drm_desc = (AVDRMFrameDescriptor *)in->data[0]; ++ const uint32_t pixelformat = desc_pixelformat(drm_desc); ++ ++ if (pixelformat == 0) { ++ av_log(avctx, AV_LOG_ERROR, "Unsupported DRM format %s in %d objects, modifier %#" PRIx64 "\n", ++ av_fourcc2str(drm_desc->layers[0].format), ++ drm_desc->nb_objects, drm_desc->objects[0].format_modifier); ++ return AVERROR(EINVAL); ++ } ++ ++ ctx->orig_width = drm_desc->layers[0].planes[0].pitch; ++ ctx->orig_height = drm_desc->layers[0].planes[1].offset / ctx->orig_width; ++ ++ av_log(priv, AV_LOG_DEBUG, "%s: %dx%d (%td,%td)\n", __func__, ctx->width, ctx->height, ++ drm_desc->layers[0].planes[0].pitch, drm_desc->layers[0].planes[1].offset); ++ ++ ret = deint_v4l2m2m_set_format(output, pixelformat, ctx->field_order, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset); ++ if (ret) ++ return ret; ++ ++ ret = deint_v4l2m2m_set_format(capture, pixelformat, V4L2_FIELD_NONE, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset); ++ if (ret) ++ return ret; ++ ++ ret = deint_v4l2m2m_allocate_buffers(capture); ++ if (ret) ++ return ret; ++ ++ ret = deint_v4l2m2m_streamon(capture); ++ if (ret) ++ return ret; ++ ++ ret = deint_v4l2m2m_allocate_buffers(output); ++ if (ret) ++ return ret; ++ ++ ret = deint_v4l2m2m_streamon(output); ++ if (ret) ++ return ret; ++ ++ if (in->top_field_first) ++ ctx->field_order = V4L2_FIELD_INTERLACED_TB; ++ else ++ ctx->field_order = V4L2_FIELD_INTERLACED_BT; ++ ++ } ++ ++ ret = deint_v4l2m2m_enqueue_frame(output, in); ++ ++ av_log(priv, AV_LOG_TRACE, ">>> %s: %s\n", __func__, av_err2str(ret)); ++ return ret; ++} ++ ++static int deint_v4l2m2m_activate(AVFilterContext *avctx) ++{ ++ DeintV4L2M2MContext * const priv = avctx->priv; ++ DeintV4L2M2MContextShared *const s = priv->shared; ++ AVFilterLink * const outlink = avctx->outputs[0]; ++ AVFilterLink * const inlink = avctx->inputs[0]; ++ int n = 0; ++ int cn = 99; ++ int instatus = 0; ++ int64_t inpts = 0; ++ int did_something = 0; ++ ++ av_log(priv, AV_LOG_TRACE, "<<< %s\n", __func__); ++ ++ FF_FILTER_FORWARD_STATUS_BACK_ALL(outlink, avctx); ++ ++ ff_inlink_acknowledge_status(inlink, &instatus, &inpts); ++ ++ if (!ff_outlink_frame_wanted(outlink)) { ++ av_log(priv, AV_LOG_TRACE, "%s: Not wanted out\n", __func__); ++ } ++ else if (s->field_order != V4L2_FIELD_ANY) // Can't DQ if no setup! 
++ { ++ AVFrame * frame = av_frame_alloc(); ++ int rv; ++ ++again: ++ recycle_q(&s->output); ++ n = count_enqueued(&s->output); ++ ++ if (frame == NULL) { ++ av_log(priv, AV_LOG_ERROR, "%s: error allocating frame\n", __func__); ++ return AVERROR(ENOMEM); ++ } ++ ++ rv = deint_v4l2m2m_dequeue_frame(&s->capture, frame, n > 4 ? 300 : 0); ++ if (rv != 0) { ++ av_frame_free(&frame); ++ if (rv != AVERROR(EAGAIN)) { ++ av_log(priv, AV_LOG_ERROR, ">>> %s: DQ fail: %s\n", __func__, av_err2str(rv)); ++ return rv; ++ } ++ } ++ else { ++ frame->interlaced_frame = 0; ++ // frame is always consumed by filter_frame - even on error despite ++ // a somewhat confusing comment in the header ++ rv = ff_filter_frame(outlink, frame); ++ ++ if (instatus != 0) { ++ av_log(priv, AV_LOG_TRACE, "%s: eof loop\n", __func__); ++ goto again; ++ } ++ ++ av_log(priv, AV_LOG_TRACE, "%s: Filtered: %s\n", __func__, av_err2str(rv)); ++ did_something = 1; ++ } ++ ++ cn = count_enqueued(&s->capture); ++ } ++ ++ if (instatus != 0) { ++ ff_outlink_set_status(outlink, instatus, inpts); ++ av_log(priv, AV_LOG_TRACE, ">>> %s: Status done: %s\n", __func__, av_err2str(instatus)); ++ return 0; ++ } ++ ++ recycle_q(&s->output); ++ n = count_enqueued(&s->output); ++ ++ while (n < 6) { ++ AVFrame * frame; ++ int rv; ++ ++ if ((rv = ff_inlink_consume_frame(inlink, &frame)) < 0) { ++ av_log(priv, AV_LOG_ERROR, "%s: consume in failed: %s\n", __func__, av_err2str(rv)); ++ return rv; ++ } ++ ++ if (frame == NULL) { ++ av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__); ++ break; ++ } ++ ++ rv = deint_v4l2m2m_filter_frame(inlink, frame); ++ av_frame_free(&frame); ++ ++ if (rv != 0) ++ return rv; ++ ++ av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", __func__); ++ ++n; ++ } ++ ++ if (n < 6) { ++ ff_inlink_request_frame(inlink); ++ did_something = 1; ++ av_log(priv, AV_LOG_TRACE, "%s: req frame\n", __func__); ++ } ++ ++ if (n > 4 && ff_outlink_frame_wanted(outlink)) { ++ ff_filter_set_ready(avctx, 1); ++ did_something = 1; ++ av_log(priv, AV_LOG_TRACE, "%s: ready\n", __func__); ++ } ++ ++ av_log(priv, AV_LOG_TRACE, ">>> %s: OK (n=%d, cn=%d)\n", __func__, n, cn); ++ return did_something ? 
0 : FFERROR_NOT_READY; ++} ++ ++static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx) ++{ ++ DeintV4L2M2MContext * const priv = avctx->priv; ++ DeintV4L2M2MContextShared * const ctx = av_mallocz(sizeof(DeintV4L2M2MContextShared)); ++ ++ if (!ctx) { ++ av_log(priv, AV_LOG_ERROR, "%s: error %d allocating context\n", __func__, 0); ++ return AVERROR(ENOMEM); ++ } ++ priv->shared = ctx; ++ ctx->logctx = priv; ++ ctx->fd = -1; ++ ctx->output.ctx = ctx; ++ ctx->output.num_buffers = 8; ++ ctx->capture.ctx = ctx; ++ ctx->capture.num_buffers = 12; ++ ctx->done = 0; ++ ctx->field_order = V4L2_FIELD_ANY; ++ ++ pts_track_init(&ctx->track, priv); ++ ++ atomic_init(&ctx->refcount, 1); ++ ++ return 0; ++} ++ ++static void deint_v4l2m2m_uninit(AVFilterContext *avctx) ++{ ++ DeintV4L2M2MContext *priv = avctx->priv; ++ DeintV4L2M2MContextShared *ctx = priv->shared; ++ ++ ctx->done = 1; ++ ctx->logctx = NULL; // Log to NULL works, log to missing crashes ++ pts_track_uninit(&ctx->track); ++ deint_v4l2m2m_destroy_context(ctx); ++} ++ ++static const AVOption deinterlace_v4l2m2m_options[] = { ++ { NULL }, ++}; ++ ++AVFILTER_DEFINE_CLASS(deinterlace_v4l2m2m); ++ ++static const AVFilterPad deint_v4l2m2m_inputs[] = { ++ { ++ .name = "default", ++ .type = AVMEDIA_TYPE_VIDEO, ++ }, ++ { NULL } ++}; ++ ++static const AVFilterPad deint_v4l2m2m_outputs[] = { ++ { ++ .name = "default", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .config_props = deint_v4l2m2m_config_props, ++ }, ++ { NULL } ++}; ++ ++AVFilter ff_vf_deinterlace_v4l2m2m = { ++ .name = "deinterlace_v4l2m2m", ++ .description = NULL_IF_CONFIG_SMALL("V4L2 M2M deinterlacer"), ++ .priv_size = sizeof(DeintV4L2M2MContext), ++ .init = &deint_v4l2m2m_init, ++ .uninit = &deint_v4l2m2m_uninit, ++ .query_formats = &deint_v4l2m2m_query_formats, ++ .inputs = deint_v4l2m2m_inputs, ++ .outputs = deint_v4l2m2m_outputs, ++ .priv_class = &deinterlace_v4l2m2m_class, ++ .activate = deint_v4l2m2m_activate, ++}; +--- /dev/null +++ b/libavfilter/vf_unsand.c @@ -0,0 +1,234 @@ +/* @@ -56063,7 +62864,7 @@ Upstream-status: Pending @@ -3051,6 +3051,40 @@ static int has_codec_parameters(AVStream return 1; } - + +#if CONFIG_HEVC_RPI_DECODER && CONFIG_HEVC_DECODER +// This should be quite general purpose but avoid possible conflicts +// by limiting usage to cases wehere we know it works. 
@@ -56165,10 +62966,10 @@ Upstream-status: Pending sha512.h \ @@ -86,6 +87,7 @@ HEADERS = adler32.h tx.h \ - + HEADERS-$(CONFIG_LZO) += lzo.h +HEADERS-$(CONFIG-RPI) += rpi_sand_fn_pw.h - + ARCH_HEADERS = bswap.h \ intmath.h \ @@ -180,6 +182,7 @@ OBJS-$(CONFIG_LZO) @@ -56184,14 +62985,14 @@ Upstream-status: Pending @@ -1,4 +1,6 @@ OBJS += aarch64/cpu.o \ aarch64/float_dsp_init.o \ - + -NEON-OBJS += aarch64/float_dsp_neon.o +NEON-OBJS += aarch64/float_dsp_neon.o \ + aarch64/rpi_sand_neon.o \ + --- /dev/null +++ b/libavutil/aarch64/rpi_sand_neon.S -@@ -0,0 +1,676 @@ +@@ -0,0 +1,781 @@ +/* +Copyright (c) 2021 Michael Eiler + @@ -56252,7 +63053,7 @@ Upstream-status: Pending + + // this is the value we have to add to the src pointer after reading a complete block + // it will move the address to the start of the next block -+ // w10 = stride2 * stride1 - stride1 ++ // w10 = stride2 * stride1 - stride1 + mov w10, w4 + lsl w10, w10, #7 + sub w10, w10, #128 @@ -56279,7 +63080,7 @@ Upstream-status: Pending + // copy 128 bytes (a full block) into the vector registers v0-v7 and increase the src address by 128 + // fortunately these aren't callee saved ones, meaning we don't need to backup them + ld1 { v0.16b, v1.16b, v2.16b, v3.16b}, [x13], #64 -+ ld1 { v4.16b, v5.16b, v6.16b, v7.16b}, [x13], #64 ++ ld1 { v4.16b, v5.16b, v6.16b, v7.16b}, [x13], #64 + + // write these registers back to the destination vector and increase the dst address by 128 + st1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0], #64 @@ -56310,13 +63111,13 @@ Upstream-status: Pending + add w5, w5, #1 + b incomplete_block_loop_y8 +incomplete_block_loop_end_y8: -+ -+ -+ // increase the row offset by 128 (stride1) ++ ++ ++ // increase the row offset by 128 (stride1) + add w11, w11, #128 + // increment the row counter + add w12, w12, #1 -+ ++ + // process the next row if we haven't finished yet + cmp w15, w12 + bgt row_loop @@ -56380,7 +63181,7 @@ Upstream-status: Pending + beq no_main_c8 + +block_loop_c8: -+ // load the full block -> 128 bytes, the block contains 64 interleaved U and V values ++ // load the full block -> 128 bytes, the block contains 64 interleaved U and V values + ld2 { v0.16b, v1.16b }, [x13], #32 + ld2 { v2.16b, v3.16b }, [x13], #32 + ld2 { v4.16b, v5.16b }, [x13], #32 @@ -56403,14 +63204,14 @@ Upstream-status: Pending + // increment row counter and move src to the beginning of the next block + add w14, w14, #1 + add x13, x13, x10 -+ ++ + // jump to block_loop_c8 iff the block count is smaller than the number of full blocks + cmp w8, w14 + bgt block_loop_c8 + +no_main_c8: + // handle incomplete block at the end of every row -+ eor w5, w5, w5 // point counter, this might be ++ eor w5, w5, w5 // point counter, this might be +incomplete_block_loop_c8: + cmp w5, w9 + bge incomplete_block_loop_end_c8 @@ -56442,228 +63243,6 @@ Upstream-status: Pending + ret +endfunc + -+//void ff_rpi_sand30_lines_to_planar_y16( -+// uint8_t * dest, // [x0] -+// unsigned int dst_stride, // [w1] -> assumed to be equal to _w -+// const uint8_t * src, // [x2] -+// unsigned int src_stride1, // [w3] -> 128 -+// unsigned int src_stride2, // [w4] -+// unsigned int _x, // [w5] -+// unsigned int y, // [w6] -+// unsigned int _w, // [w7] -+// unsigned int h); // [sp, #0] -+ -+function ff_rpi_sand30_lines_to_planar_y16, export=1 -+ stp x19, x20, [sp, #-48]! 
-+ stp x21, x22, [sp, #16] -+ stp x23, x24, [sp, #32] -+ -+ // w6 = argument h -+ ldr w6, [sp, #48] -+ -+ // slice_inc = ((stride2 - 1) * stride1) -+ mov w5, w4 -+ sub w5, w5, #1 -+ lsl w5, w5, #7 -+ -+ // total number of bytes per row = (width / 3) * 4 -+ mov w8, w7 -+ mov w9, #3 -+ udiv w8, w8, w9 -+ lsl w8, w8, #2 -+ -+ // number of full 128 byte blocks to be processed -+ mov w9, #96 -+ udiv w9, w7, w9 // = (width * 4) / (3*128) = width/96 -+ -+ // w10 = number of full integers to process (4 bytes) -+ // w11 = remaning zero to two 10bit values still to copy over -+ mov w12, #96 -+ mul w12, w9, w12 -+ sub w12, w7, w12 // width - blocks*96 = remaining points per row -+ mov w11, #3 -+ udiv w10, w12, w11 // full integers to process = w12 / 3 -+ mul w11, w10, w11 // #integers *3 -+ sub w11, w12, w11 // remaining 0-2 points = remaining points - integers*3 -+ -+ // increase w9 by one if w10+w11 is not zero, and decrease the row count by one -+ // this is to efficiently copy incomplete blocks at the end of the rows -+ // the last row is handled explicitly to avoid writing out of bounds -+ add w22, w10, w11 -+ cmp w22, #0 -+ cset w22, ne // 1 iff w10+w11 not zero, 0 otherwise -+ add w9, w9, w22 -+ sub w6, w6, #1 -+ -+ // store the number of bytes in w20 which we copy too much for every row -+ // when the width of the frame is not a multiple of 96 (128bytes storing 96 10bit values) -+ mov w20, #96*2 -+ mul w20, w20, w9 -+ sub w20, w1, w20 -+ -+ mov w23, #0 // flag to check whether the last line had already been processed -+ -+ // bitmask to clear the uppper 6bits of the result values -+ mov x19, #0x03ff03ff03ff03ff -+ dup v22.2d, x19 -+ -+ // row counter = 0 -+ eor w12, w12, w12 -+row_loop_y16: -+ cmp w12, w6 // jump to row_loop_y16_fin if we processed all rows -+ bge row_loop_y16_fin -+ -+ mov x13, x2 // row src -+ eor w14, w14, w14 // full block counter -+block_loop_y16: -+ cmp w14, w9 -+ bge block_loop_y16_fin -+ -+ // load 64 bytes -+ ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x13], #64 -+ -+ // process v0 and v1 -+ xtn v16.4h, v0.4s -+ ushr v0.4s, v0.4s, #10 -+ xtn v17.4h, v0.4s -+ ushr v0.4s, v0.4s, #10 -+ xtn v18.4h, v0.4s -+ -+ xtn2 v16.8h, v1.4s -+ and v16.16b, v16.16b, v22.16b -+ ushr v1.4s, v1.4s, #10 -+ xtn2 v17.8h, v1.4s -+ and v17.16b, v17.16b, v22.16b -+ ushr v1.4s, v1.4s, #10 -+ xtn2 v18.8h, v1.4s -+ and v18.16b, v18.16b, v22.16b -+ -+ st3 { v16.8h, v17.8h, v18.8h }, [x0], #48 -+ -+ // process v2 and v3 -+ xtn v23.4h, v2.4s -+ ushr v2.4s, v2.4s, #10 -+ xtn v24.4h, v2.4s -+ ushr v2.4s, v2.4s, #10 -+ xtn v25.4h, v2.4s -+ -+ xtn2 v23.8h, v3.4s -+ and v23.16b, v23.16b, v22.16b -+ ushr v3.4s, v3.4s, #10 -+ xtn2 v24.8h, v3.4s -+ and v24.16b, v24.16b, v22.16b -+ ushr v3.4s, v3.4s, #10 -+ xtn2 v25.8h, v3.4s -+ and v25.16b, v25.16b, v22.16b -+ -+ st3 { v23.8h, v24.8h, v25.8h }, [x0], #48 -+ -+ // load the second half of the block -> 64 bytes into registers v4-v7 -+ ld1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x13], #64 -+ -+ // process v4 and v5 -+ xtn v16.4h, v4.4s -+ ushr v4.4s, v4.4s, #10 -+ xtn v17.4h, v4.4s -+ ushr v4.4s, v4.4s, #10 -+ xtn v18.4h, v4.4s -+ -+ xtn2 v16.8h, v5.4s -+ and v16.16b, v16.16b, v22.16b -+ ushr v5.4s, v5.4s, #10 -+ xtn2 v17.8h, v5.4s -+ and v17.16b, v17.16b, v22.16b -+ ushr v5.4s, v5.4s, #10 -+ xtn2 v18.8h, v5.4s -+ and v18.16b, v18.16b, v22.16b -+ -+ st3 { v16.8h, v17.8h, v18.8h }, [x0], #48 -+ -+ // v6 and v7 -+ xtn v23.4h, v6.4s -+ ushr v6.4s, v6.4s, #10 -+ xtn v24.4h, v6.4s -+ ushr v6.4s, v6.4s, #10 -+ xtn v25.4h, v6.4s -+ -+ xtn2 v23.8h, v7.4s -+ and v23.16b, v23.16b, 
v22.16b -+ ushr v7.4s, v7.4s, #10 -+ xtn2 v24.8h, v7.4s -+ and v24.16b, v24.16b, v22.16b -+ ushr v7.4s, v7.4s, #10 -+ xtn2 v25.8h, v7.4s -+ and v25.16b, v25.16b, v22.16b -+ -+ st3 { v23.8h, v24.8h, v25.8h }, [x0], #48 -+ -+ add x13, x13, x5 // row src += slice_inc -+ add w14, w14, #1 -+ b block_loop_y16 -+block_loop_y16_fin: -+ -+ -+ -+ -+ add x2, x2, #128 // src += stride1 (start of the next row) -+ add x0, x0, w20, sxtw // subtract the bytes we copied too much from dst -+ add w12, w12, #1 -+ b row_loop_y16 -+row_loop_y16_fin: -+ -+ // check whether we have incomplete blocks at the end of every row -+ // in that case decrease row block count by one -+ // change height back to it's original value (meaning increase it by 1) -+ // and jump back to another iteration of row_loop_y16 -+ -+ cmp w23, #1 -+ beq row_loop_y16_fin2 // don't continue here if we already processed the last row -+ add w6, w6, #1 // increase height to the original value -+ sub w9, w9, w22 // block count - 1 or 0, depending on the remaining bytes count -+ mov w23, #1 -+ b row_loop_y16 -+row_loop_y16_fin2: -+ -+ sub x0, x0, w20, sxtw // with the last row we didn't actually move the dst ptr to far ahead, therefore readd the diference -+ -+ // now we've got to handle the last block in the last row -+ eor w12, w12, w12 // w12 = 0 = counter -+integer_loop_y16: -+ cmp w12, w10 -+ bge integer_loop_y16_fin -+ ldr w14, [x13], #4 -+ and w15, w14, #0x3ff -+ strh w15, [x0], #2 -+ lsr w14, w14, #10 -+ and w15, w14, #0x3ff -+ strh w15, [x0], #2 -+ lsr w14, w14, #10 -+ and w15, w14, #0x3ff -+ strh w15, [x0], #2 -+ add w12, w12, #1 -+ b integer_loop_y16 -+integer_loop_y16_fin: -+ -+final_values_y16: -+ // remaining point count = w11 -+ ldr w14, [x13], #4 -+ cmp w11, #0 -+ beq final_values_y16_fin -+ and w15, w14, #0x3ff -+ strh w15, [x0], #2 -+ cmp w11, #1 -+ beq final_values_y16_fin -+ lsr w14, w14, #10 -+ and w15, w14, #0x3ff -+ strh w15, [x0], #2 -+final_values_y16_fin: -+ -+ ldp x23, x24, [sp, #32] -+ ldp x21, x22, [sp, #16] -+ ldp x19, x20, [sp], #48 -+ ret -+endfunc -+ +//void ff_rpi_sand30_lines_to_planar_c16( +// uint8_t * dst_u, // [x0] +// unsigned int dst_stride_u, // [w1] == _w*2 @@ -56671,7 +63250,7 @@ Upstream-status: Pending +// unsigned int dst_stride_v, // [w3] == _w*2 +// const uint8_t * src, // [x4] +// unsigned int stride1, // [w5] == 128 -+// unsigned int stride2, // [w6] ++// unsigned int stride2, // [w6] +// unsigned int _x, // [w7] == 0 +// unsigned int y, // [sp, #0] == 0 +// unsigned int _w, // [sp, #8] -> w3 @@ -56694,7 +63273,7 @@ Upstream-status: Pending + and v5.16b, v5.16b, v16.16b + and v6.16b, v6.16b, v16.16b + st3 { v4.8h, v5.8h, v6.8h }, [sp], #48 -+ ++ + xtn v4.4h, v2.4s + ushr v2.4s, v2.4s, #10 + xtn v5.4h, v2.4s @@ -56841,7 +63420,7 @@ Upstream-status: Pending + ldr w22, [x4], #4 + str w22, [x0], #2 + lsr w22, w22, #16 -+ str w22, [x2], #2 ++ str w22, [x2], #2 + + add w20, w20, #1 + b rem_pix_c16_loop @@ -56868,9 +63447,336 @@ Upstream-status: Pending +// unsigned int _w, +// unsigned int h); + ++// void ff_rpi_sand30_lines_to_planar_y8( ++// uint8_t * dest, : x0 ++// unsigned int dst_stride, : w1 ++// const uint8_t * src, : x2 ++// unsigned int src_stride1, : w3, always 128 ++// unsigned int src_stride2, : w4 ++// unsigned int _x, : w5 ++// unsigned int y, : w6 ++// unsigned int _w, : w7 ++// unsigned int h); : [sp, #0] ++// ++// Assumes that we are starting on a stripe boundary and that overreading ++// within the stripe is OK. 
However it does respect the dest size for wri ++ ++function ff_rpi_sand30_lines_to_planar_y16, export=1 ++ lsl w4, w4, #7 ++ sub w4, w4, #64 ++ sub w1, w1, w7, lsl #1 ++ uxtw x6, w6 ++ add x8, x2, x6, lsl #7 ++ ldr w6, [sp, #0] ++ ++10: ++ mov x2, x8 ++ mov w5, w7 ++1: ++ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 ++ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4 ++ ++ subs w5, w5, #96 ++ ++ // v0, v1 ++ ++ shrn v18.4h, v0.4s, #14 ++ xtn v16.4h, v0.4s ++ shrn v17.4h, v0.4s, #10 ++ ++ shrn2 v18.8h, v1.4s, #14 ++ xtn2 v16.8h, v1.4s ++ shrn2 v17.8h, v1.4s, #10 ++ ++ ushr v18.8h, v18.8h, #6 ++ bic v16.8h, #0xfc, lsl #8 ++ bic v17.8h, #0xfc, lsl #8 ++ ++ // v2, v3 ++ ++ shrn v21.4h, v2.4s, #14 ++ xtn v19.4h, v2.4s ++ shrn v20.4h, v2.4s, #10 ++ ++ shrn2 v21.8h, v3.4s, #14 ++ xtn2 v19.8h, v3.4s ++ shrn2 v20.8h, v3.4s, #10 ++ ++ ushr v21.8h, v21.8h, #6 ++ bic v19.8h, #0xfc, lsl #8 ++ bic v20.8h, #0xfc, lsl #8 ++ ++ // v4, v5 ++ ++ shrn v24.4h, v4.4s, #14 ++ xtn v22.4h, v4.4s ++ shrn v23.4h, v4.4s, #10 ++ ++ shrn2 v24.8h, v5.4s, #14 ++ xtn2 v22.8h, v5.4s ++ shrn2 v23.8h, v5.4s, #10 ++ ++ ushr v24.8h, v24.8h, #6 ++ bic v22.8h, #0xfc, lsl #8 ++ bic v23.8h, #0xfc, lsl #8 ++ ++ // v6, v7 ++ ++ shrn v27.4h, v6.4s, #14 ++ xtn v25.4h, v6.4s ++ shrn v26.4h, v6.4s, #10 ++ ++ shrn2 v27.8h, v7.4s, #14 ++ xtn2 v25.8h, v7.4s ++ shrn2 v26.8h, v7.4s, #10 ++ ++ ushr v27.8h, v27.8h, #6 ++ bic v25.8h, #0xfc, lsl #8 ++ bic v26.8h, #0xfc, lsl #8 ++ ++ blt 2f ++ ++ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48 ++ st3 {v19.8h, v20.8h, v21.8h}, [x0], #48 ++ st3 {v22.8h, v23.8h, v24.8h}, [x0], #48 ++ st3 {v25.8h, v26.8h, v27.8h}, [x0], #48 ++ ++ bne 1b ++ ++11: ++ subs w6, w6, #1 ++ add x0, x0, w1, uxtw ++ add x8, x8, #128 ++ bne 10b ++ ++ ret ++ ++// Partial final write ++2: ++ cmp w5, #48-96 ++ blt 1f ++ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48 ++ st3 {v19.8h, v20.8h, v21.8h}, [x0], #48 ++ beq 11b ++ mov v16.16b, v22.16b ++ mov v17.16b, v23.16b ++ sub w5, w5, #48 ++ mov v18.16b, v24.16b ++ mov v19.16b, v25.16b ++ mov v20.16b, v26.16b ++ mov v21.16b, v27.16b ++1: ++ cmp w5, #24-96 ++ blt 1f ++ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48 ++ beq 11b ++ mov v16.16b, v19.16b ++ mov v17.16b, v20.16b ++ sub w5, w5, #24 ++ mov v18.16b, v21.16b ++1: ++ cmp w5, #12-96 ++ blt 1f ++ st3 {v16.4h, v17.4h, v18.4h}, [x0], #24 ++ beq 11b ++ mov v16.2d[0], v16.2d[1] ++ sub w5, w5, #12 ++ mov v17.2d[0], v17.2d[1] ++ mov v18.2d[0], v18.2d[1] ++1: ++ cmp w5, #6-96 ++ blt 1f ++ st3 {v16.h, v17.h, v18.h}[0], [x0], #6 ++ st3 {v16.h, v17.h, v18.h}[1], [x0], #6 ++ beq 11b ++ mov v16.2s[0], v16.2s[1] ++ sub w5, w5, #6 ++ mov v17.2s[0], v17.2s[1] ++ mov v18.2s[0], v18.2s[1] ++1: ++ cmp w5, #3-96 ++ blt 1f ++ st3 {v16.h, v17.h, v18.h}[0], [x0], #6 ++ beq 11b ++ mov v16.4h[0], v16.4h[1] ++ sub w5, w5, #3 ++ mov v17.4h[0], v17.4h[1] ++1: ++ cmp w5, #2-96 ++ blt 1f ++ st2 {v16.h, v17.h}[0], [x0], #4 ++ b 11b ++1: ++ st1 {v16.h}[0], [x0], #2 ++ b 11b ++ ++endfunc ++ ++// void ff_rpi_sand30_lines_to_planar_y8( ++// uint8_t * dest, : x0 ++// unsigned int dst_stride, : w1 ++// const uint8_t * src, : x2 ++// unsigned int src_stride1, : w3, always 128 ++// unsigned int src_stride2, : w4 ++// unsigned int _x, : w5 ++// unsigned int y, : w6 ++// unsigned int _w, : w7 ++// unsigned int h); : [sp, #0] ++// ++// Assumes that we are starting on a stripe boundary and that overreading ++// within the stripe is OK. 
However it does respect the dest size for wri ++ ++function ff_rpi_sand30_lines_to_planar_y8, export=1 ++ lsl w4, w4, #7 ++ sub w4, w4, #64 ++ sub w1, w1, w7 ++ uxtw x6, w6 ++ add x8, x2, x6, lsl #7 ++ ldr w6, [sp, #0] ++ ++10: ++ mov x2, x8 ++ mov w5, w7 ++1: ++ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 ++ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4 ++ ++ subs w5, w5, #96 ++ ++ // v0, v1 ++ ++ shrn v18.4h, v0.4s, #16 ++ xtn v16.4h, v0.4s ++ shrn v17.4h, v0.4s, #12 ++ ++ shrn2 v18.8h, v1.4s, #16 ++ xtn2 v16.8h, v1.4s ++ shrn2 v17.8h, v1.4s, #12 ++ ++ shrn v18.8b, v18.8h, #6 ++ shrn v16.8b, v16.8h, #2 ++ xtn v17.8b, v17.8h ++ ++ // v2, v3 ++ ++ shrn v21.4h, v2.4s, #16 ++ xtn v19.4h, v2.4s ++ shrn v20.4h, v2.4s, #12 ++ ++ shrn2 v21.8h, v3.4s, #16 ++ xtn2 v19.8h, v3.4s ++ shrn2 v20.8h, v3.4s, #12 ++ ++ shrn2 v18.16b, v21.8h, #6 ++ shrn2 v16.16b, v19.8h, #2 ++ xtn2 v17.16b, v20.8h ++ ++ // v4, v5 ++ ++ shrn v24.4h, v4.4s, #16 ++ xtn v22.4h, v4.4s ++ shrn v23.4h, v4.4s, #12 ++ ++ shrn2 v24.8h, v5.4s, #16 ++ xtn2 v22.8h, v5.4s ++ shrn2 v23.8h, v5.4s, #12 ++ ++ shrn v21.8b, v24.8h, #6 ++ shrn v19.8b, v22.8h, #2 ++ xtn v20.8b, v23.8h ++ ++ // v6, v7 ++ ++ shrn v27.4h, v6.4s, #16 ++ xtn v25.4h, v6.4s ++ shrn v26.4h, v6.4s, #12 ++ ++ shrn2 v27.8h, v7.4s, #16 ++ xtn2 v25.8h, v7.4s ++ shrn2 v26.8h, v7.4s, #12 ++ ++ shrn2 v21.16b, v27.8h, #6 ++ shrn2 v19.16b, v25.8h, #2 ++ xtn2 v20.16b, v26.8h ++ ++ blt 2f ++ ++ st3 {v16.16b, v17.16b, v18.16b}, [x0], #48 ++ st3 {v19.16b, v20.16b, v21.16b}, [x0], #48 ++ ++ bne 1b ++ ++11: ++ subs w6, w6, #1 ++ add x0, x0, w1, uxtw ++ add x8, x8, #128 ++ bne 10b ++ ++ ret ++ ++// Partial final write ++2: ++ cmp w5, #48-96 ++ blt 1f ++ st3 {v16.16b, v17.16b, v18.16b}, [x0], #48 ++ beq 11b ++ mov v16.16b, v22.16b ++ mov v17.16b, v23.16b ++ sub w5, w5, #48 ++ mov v18.16b, v24.16b ++1: ++ cmp w5, #24-96 ++ blt 1f ++ st3 {v16.8b, v17.8b, v18.8b}, [x0], #24 ++ beq 11b ++ mov v16.2d[0], v16.2d[1] ++ sub w5, w5, #24 ++ mov v17.2d[0], v17.2d[1] ++ mov v18.2d[0], v18.2d[1] ++1: ++ cmp w5, #12-96 ++ blt 1f ++ st3 {v16.b, v17.b, v18.b}[0], [x0], #3 ++ st3 {v16.b, v17.b, v18.b}[1], [x0], #3 ++ st3 {v16.b, v17.b, v18.b}[2], [x0], #3 ++ st3 {v16.b, v17.b, v18.b}[3], [x0], #3 ++ beq 11b ++ mov v16.2s[0], v16.2s[1] ++ sub w5, w5, #12 ++ mov v17.2s[0], v17.2s[1] ++ mov v18.2s[0], v18.2s[1] ++1: ++ cmp w5, #6-96 ++ blt 1f ++ st3 {v16.b, v17.b, v18.b}[0], [x0], #3 ++ st3 {v16.b, v17.b, v18.b}[1], [x0], #3 ++ beq 11b ++ mov v16.4h[0], v16.4h[1] ++ sub w5, w5, #6 ++ mov v17.4h[0], v17.4h[1] ++ mov v18.4h[0], v18.4h[1] ++1: ++ cmp w5, #3-96 ++ blt 1f ++ st3 {v16.b, v17.b, v18.b}[0], [x0], #3 ++ beq 11b ++ mov v16.8b[0], v16.8b[1] ++ sub w5, w5, #3 ++ mov v17.8b[0], v17.8b[1] ++1: ++ cmp w5, #2-96 ++ blt 1f ++ st2 {v16.b, v17.b}[0], [x0], #2 ++ b 11b ++1: ++ st1 {v16.b}[0], [x0], #1 ++ b 11b ++ ++endfunc ++ --- /dev/null +++ b/libavutil/aarch64/rpi_sand_neon.h -@@ -0,0 +1,55 @@ +@@ -0,0 +1,59 @@ +/* +Copyright (c) 2021 Michael Eiler + @@ -56922,6 +63828,10 @@ Upstream-status: Pending + uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src, unsigned int stride1, + unsigned int stride2, unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); + ++void ff_rpi_sand30_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride, ++ const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2, ++ unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); ++ +#ifdef __cplusplus +} +#endif @@ -56929,13 +63839,13 @@ Upstream-status: Pending --- 
a/libavutil/arm/Makefile +++ b/libavutil/arm/Makefile @@ -6,3 +6,4 @@ VFP-OBJS += arm/float_dsp_init_vfp.o - + NEON-OBJS += arm/float_dsp_init_neon.o \ arm/float_dsp_neon.o \ + arm/rpi_sand_neon.o \ --- /dev/null +++ b/libavutil/arm/rpi_sand_neon.S -@@ -0,0 +1,768 @@ +@@ -0,0 +1,925 @@ +/* +Copyright (c) 2018 Raspberry Pi (Trading) Ltd. +All rights reserved. @@ -57298,7 +64208,6 @@ Upstream-status: Pending + ldr r6, [sp, #36] + ldr r7, [sp, #32] @ y + mov r12, #48 -+ vmov.u16 q15, #0x3ff + sub r3, #1 + lsl r3, #7 + sub r1, r1, r6, lsl #1 @@ -57314,37 +64223,33 @@ Upstream-status: Pending + vldm r2!, {q10-q13} + add lr, #64 + -+ vshr.u32 q14, q10, #20 @ Cannot vshrn.u32 #20! ++ vshrn.u32 d4 , q10, #14 @ Cannot vshrn.u32 #20! + ands lr, #127 + vshrn.u32 d2, q10, #10 + vmovn.u32 d0, q10 -+ vmovn.u32 d4, q14 + -+ vshr.u32 q14, q11, #20 ++ vshrn.u32 d5, q11, #14 + it eq + addeq r2, r3 + vshrn.u32 d3, q11, #10 + vmovn.u32 d1, q11 -+ vmovn.u32 d5, q14 + + subs r5, #48 -+ vand q0, q15 -+ vand q1, q15 -+ vand q2, q15 ++ vshr.u16 q2, #6 ++ vbic.u16 q0, #0xfc00 ++ vbic.u16 q1, #0xfc00 + -+ vshr.u32 q14, q12, #20 ++ vshrn.u32 d20, q12, #14 + vshrn.u32 d18, q12, #10 + vmovn.u32 d16, q12 -+ vmovn.u32 d20, q14 + -+ vshr.u32 q14, q13, #20 ++ vshrn.u32 d21, q13, #14 + vshrn.u32 d19, q13, #10 + vmovn.u32 d17, q13 -+ vmovn.u32 d21, q14 + -+ vand q8, q15 -+ vand q9, q15 -+ vand q10, q15 ++ vshr.u16 q10, #6 ++ vbic.u16 q8, #0xfc00 ++ vbic.u16 q9 , #0xfc00 + blt 2f + + vst3.16 {d0, d2, d4}, [r0], r12 @@ -57437,7 +64342,6 @@ Upstream-status: Pending + ldr r7, [sp, #48] + ldr r9, [sp, #52] + mov r12, #48 -+ vmov.u16 q15, #0x3ff + sub r8, #1 + lsl r8, #7 + add r5, r5, r7, lsl #7 @@ -57453,48 +64357,44 @@ Upstream-status: Pending + add lr, #64 + + @ N.B. unpack [0,1,2] -> (reg order) 1, 0, 2 -+ vshr.u32 q14, q0, #20 -+ vshrn.u32 d16, q0, #10 ++ vshrn.u32 d20, q0, #14 + vmovn.u32 d18, q0 ++ vshrn.u32 d0, q0, #10 + ands lr, #127 -+ vmovn.u32 d20, q14 + -+ vshr.u32 q14, q1, #20 -+ vshrn.u32 d17, q1, #10 ++ vshrn.u32 d21, q1, #14 + vmovn.u32 d19, q1 -+ vmovn.u32 d21, q14 ++ vshrn.u32 d1, q1, #10 + -+ vshr.u32 q14, q2, #20 + vshrn.u32 d22, q2, #10 -+ vmovn.u32 d24, q2 -+ vmovn.u32 d26, q14 ++ vmovn.u32 d2, q2 ++ vshrn.u32 d4, q2, #14 + -+ vshr.u32 q14, q3, #20 -+ vshrn.u32 d23, q3, #10 -+ vmovn.u32 d25, q3 + add r10, r0, #24 -+ vmovn.u32 d27, q14 ++ vshrn.u32 d23, q3, #10 ++ vmovn.u32 d3, q3 ++ vshrn.u32 d5, q3, #14 + + it eq + addeq r4, r8 -+ vuzp.16 q8, q11 -+ vuzp.16 q9, q12 -+ vuzp.16 q10, q13 ++ vuzp.16 q0, q11 ++ vuzp.16 q9, q1 ++ vuzp.16 q10, q2 + -+ @ q8 V0, V3,.. -> q0 ++ @ q0 V0, V3,.. + @ q9 U0, U3... + @ q10 U1, U4... + @ q11 U2, U5,.. -+ @ q12 V1, V4,.. -> q1 -+ @ q13 V2, V5,.. -> q2 ++ @ q1 V1, V4, ++ @ q2 V2, V5,.. 
+ + subs r6, #24 -+ vand q11, q15 -+ vand q9, q15 -+ vand q10, q15 -+ vand q0, q8, q15 -+ vand q1, q12, q15 -+ vand q2, q13, q15 ++ vbic.u16 q11, #0xfc00 ++ vbic.u16 q9, #0xfc00 ++ vshr.u16 q10, #6 ++ vshr.u16 q2, #6 ++ vbic.u16 q0, #0xfc00 ++ vbic.u16 q1, #0xfc00 + + blt 2f + @@ -57703,10 +64603,177 @@ Upstream-status: Pending +endfunc + + ++@ void ff_rpi_sand30_lines_to_planar_y8( ++@ uint8_t * dest, // [r0] ++@ unsigned int dst_stride, // [r1] ++@ const uint8_t * src, // [r2] ++@ unsigned int src_stride1, // [r3] Ignored - assumed 128 ++@ unsigned int src_stride2, // [sp, #0] -> r3 ++@ unsigned int _x, // [sp, #4] Ignored - 0 ++@ unsigned int y, // [sp, #8] (r7 in prefix) ++@ unsigned int _w, // [sp, #12] -> r6 (cur r5) ++@ unsigned int h); // [sp, #16] -> r7 ++@ ++@ Assumes that we are starting on a stripe boundary and that overreading ++@ within the stripe is OK. However it does respect the dest size for wri ++ ++function ff_rpi_sand30_lines_to_planar_y8, export=1 ++ push {r4-r8, lr} @ +24 ++ ldr r3, [sp, #24] ++ ldr r6, [sp, #36] ++ ldr r7, [sp, #32] @ y ++ mov r12, #48 ++ lsl r3, #7 ++ sub r1, r1, r6 ++ add r8, r2, r7, lsl #7 ++ ldr r7, [sp, #40] ++ ++10: ++ mov r2, r8 ++ add r4, r0, #24 ++ mov r5, r6 ++1: ++ vldm r2, {q8-q15} ++ ++ subs r5, #96 ++ ++ vmovn.u32 d0, q8 ++ vshrn.u32 d2, q8, #12 ++ vshrn.u32 d4, q8, #16 @ Cannot vshrn.u32 #20! ++ ++ add r2, r3 ++ ++ vmovn.u32 d1, q9 ++ vshrn.u32 d3, q9, #12 ++ vshrn.u32 d5, q9, #16 ++ ++ pld [r2, #0] ++ ++ vshrn.u16 d0, q0, #2 ++ vmovn.u16 d1, q1 ++ vshrn.u16 d2, q2, #6 ++ ++ vmovn.u32 d16, q10 ++ vshrn.u32 d18, q10, #12 ++ vshrn.u32 d20, q10, #16 ++ ++ vmovn.u32 d17, q11 ++ vshrn.u32 d19, q11, #12 ++ vshrn.u32 d21, q11, #16 ++ ++ pld [r2, #64] ++ ++ vshrn.u16 d4, q8, #2 ++ vmovn.u16 d5, q9 ++ vshrn.u16 d6, q10, #6 ++ ++ vmovn.u32 d16, q12 ++ vshrn.u32 d18, q12, #12 ++ vshrn.u32 d20, q12, #16 ++ ++ vmovn.u32 d17, q13 ++ vshrn.u32 d19, q13, #12 ++ vshrn.u32 d21, q13, #16 ++ ++ vshrn.u16 d16, q8, #2 ++ vmovn.u16 d17, q9 ++ vshrn.u16 d18, q10, #6 ++ ++ vmovn.u32 d20, q14 ++ vshrn.u32 d22, q14, #12 ++ vshrn.u32 d24, q14, #16 ++ ++ vmovn.u32 d21, q15 ++ vshrn.u32 d23, q15, #12 ++ vshrn.u32 d25, q15, #16 ++ ++ vshrn.u16 d20, q10, #2 ++ vmovn.u16 d21, q11 ++ vshrn.u16 d22, q12, #6 ++ ++ blt 2f ++ ++ vst3.8 {d0, d1, d2}, [r0], r12 ++ vst3.8 {d4, d5, d6}, [r4], r12 ++ vst3.8 {d16, d17, d18}, [r0], r12 ++ vst3.8 {d20, d21, d22}, [r4], r12 ++ ++ bne 1b ++ ++11: ++ subs r7, #1 ++ add r0, r1 ++ add r8, #128 ++ bne 10b ++ ++ pop {r4-r8, pc} ++ ++@ Partial final write ++2: ++ cmp r5, #48-96 ++ blt 1f ++ vst3.8 {d0, d1, d2}, [r0], r12 ++ vst3.8 {d4, d5, d6}, [r4], r12 ++ beq 11b ++ vmov q0, q8 ++ vmov q2, q10 ++ sub r5, #48 ++ vmov d2, d18 ++ vmov d6, d22 ++1: ++ cmp r5, #24-96 ++ blt 1f ++ vst3.8 {d0, d1, d2}, [r0]! ++ beq 11b ++ vmov q0, q2 ++ sub r5, #24 ++ vmov d2, d6 ++1: ++ cmp r5, #12-96 ++ blt 1f ++ vst3.8 {d0[0], d1[0], d2[0]}, [r0]! ++ vst3.8 {d0[1], d1[1], d2[1]}, [r0]! ++ vst3.8 {d0[2], d1[2], d2[2]}, [r0]! ++ vst3.8 {d0[3], d1[3], d2[3]}, [r0]! ++ beq 11b ++ vmov s0, s1 ++ sub r5, #12 ++ vmov s2, s3 ++ vmov s4, s5 ++1: ++ cmp r5, #6-96 ++ blt 1f ++ vst3.8 {d0[0], d1[0], d2[0]}, [r0]! ++ vst3.8 {d0[1], d1[1], d2[1]}, [r0]! ++ add r0, #12 ++ beq 11b ++ vshr.u32 d0, #16 ++ sub r5, #6 ++ vshr.u32 d1, #16 ++ vshr.u32 d2, #16 ++1: ++ cmp r5, #3-96 ++ blt 1f ++ vst3.8 {d0[0], d1[0], d2[0]}, [r0]! ++ beq 11b ++ sub r5, #3 ++ vshr.u32 d0, #8 ++ vshr.u32 d1, #8 ++1: ++ cmp r5, #2-96 ++ blt 1f ++ vst2.8 {d0[0], d1[0]}, [r0]! 
++ b 11b ++1: ++ vst1.8 {d0[0]}, [r0]! ++ b 11b ++ ++endfunc ++ + --- /dev/null +++ b/libavutil/arm/rpi_sand_neon.h -@@ -0,0 +1,99 @@ +@@ -0,0 +1,110 @@ +/* +Copyright (c) 2020 Raspberry Pi (Trading) Ltd. +All rights reserved. @@ -57804,6 +64871,17 @@ Upstream-status: Pending + unsigned int _w, // [sp, #12] -> r6 (cur r5) + unsigned int h); // [sp, #16] -> r7 + ++void ff_rpi_sand30_lines_to_planar_y8( ++ uint8_t * dest, // [r0] ++ unsigned int dst_stride, // [r1] ++ const uint8_t * src, // [r2] ++ unsigned int src_stride1, // [r3] Ignored - assumed 128 ++ unsigned int src_stride2, // [sp, #0] -> r3 ++ unsigned int _x, // [sp, #4] Ignored - 0 ++ unsigned int y, // [sp, #8] (r7 in prefix) ++ unsigned int _w, // [sp, #12] -> r6 (cur r5) ++ unsigned int h); // [sp, #16] -> r7 ++ +#endif // AVUTIL_ARM_SAND_NEON_H + --- a/libavutil/frame.c @@ -57811,7 +64889,7 @@ Upstream-status: Pending @@ -16,6 +16,8 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ - + +#include "config.h" + #include "channel_layout.h" @@ -57824,13 +64902,13 @@ Upstream-status: Pending +#if CONFIG_SAND +#include "rpi_sand_fns.h" +#endif - + #if FF_API_FRAME_GET_SET MAKE_ACCESSORS(AVFrame, frame, int64_t, best_effort_timestamp) @@ -902,6 +907,12 @@ int av_frame_apply_cropping(AVFrame *fra (frame->crop_top + frame->crop_bottom) >= frame->height) return AVERROR(ERANGE); - + +#if CONFIG_SAND + // Sand cannot be cropped - do not try + if (av_rpi_is_sand_format(frame->format)) @@ -57845,7 +64923,7 @@ Upstream-status: Pending @@ -968,6 +968,16 @@ int av_frame_apply_cropping(AVFrame *fra */ const char *av_frame_side_data_name(enum AVFrameSideDataType type); - + + +static inline int av_frame_cropped_width(const AVFrame * const frame) +{ @@ -57866,11 +64944,11 @@ Upstream-status: Pending #include #include +#include - + #include +#include #include - + #include "avassert.h" @@ -28,6 +30,11 @@ #include "hwcontext_drm.h" @@ -57881,13 +64959,13 @@ Upstream-status: Pending +#include +#include +#include - - + + static void drm_device_free(AVHWDeviceContext *hwdev) @@ -43,6 +50,11 @@ static int drm_device_create(AVHWDeviceC AVDRMDeviceContext *hwctx = hwdev->hwctx; drmVersionPtr version; - + + if (device == NULL) { + hwctx->fd = -1; + return 0; @@ -57905,7 +64983,7 @@ Upstream-status: Pending size_t length[AV_DRM_MAX_PLANES]; + int fds[AV_DRM_MAX_PLANES]; } DRMMapping; - + +static int dmasync(const int fd, const unsigned int flags) +{ + struct dma_buf_sync sync = { @@ -57926,19 +65004,19 @@ Upstream-status: Pending { DRMMapping *map = hwmap->priv; int i; - + - for (i = 0; i < map->nb_regions; i++) + for (i = 0; i < map->nb_regions; i++) { munmap(map->address[i], map->length[i]); + dmasync(map->fds[i], DMA_BUF_SYNC_END | map->dmaflags); + } - + av_free(map); } @@ -114,15 +145,28 @@ static int drm_map_frame(AVHWFramesConte if (!map) return AVERROR(ENOMEM); - + + for (i = 0; i < AV_DRM_MAX_PLANES; i++) + map->fds[i] = -1; + @@ -57956,7 +65034,7 @@ Upstream-status: Pending + + if (dst->format == AV_PIX_FMT_NONE) + dst->format = hwfc->sw_format; - + av_assert0(desc->nb_objects <= AV_DRM_MAX_PLANES); for (i = 0; i < desc->nb_objects; i++) { - addr = mmap(NULL, desc->objects[i].size, mmap_prot, MAP_SHARED, @@ -57968,7 +65046,7 @@ Upstream-status: Pending if (addr == MAP_FAILED) { err = AVERROR(errno); @@ -151,6 +195,23 @@ static int drm_map_frame(AVHWFramesConte - + dst->width = src->width; dst->height = src->height; + dst->crop_top = src->crop_top; @@ -57988,12 +65066,12 @@ Upstream-status: Pending + // *** Are we 
sure src->height is actually what we want ??? + } +#endif - + err = ff_hwframe_map_create(src->hw_frames_ctx, dst, src, &drm_unmap_frame, map); @@ -160,7 +221,9 @@ static int drm_map_frame(AVHWFramesConte return 0; - + fail: - for (i = 0; i < desc->nb_objects; i++) { + for (i = 0; i < AV_DRM_MAX_PLANES; i++) { @@ -58002,13 +65080,23 @@ Upstream-status: Pending if (map->address[i]) munmap(map->address[i], map->length[i]); } -@@ -178,7 +241,15 @@ static int drm_transfer_get_formats(AVHW - if (!pix_fmts) +@@ -172,16 +235,29 @@ static int drm_transfer_get_formats(AVHW + enum AVHWFrameTransferDirection dir, + enum AVPixelFormat **formats) + { +- enum AVPixelFormat *pix_fmts; ++ enum AVPixelFormat *p; + +- pix_fmts = av_malloc_array(2, sizeof(*pix_fmts)); +- if (!pix_fmts) ++ p = *formats = av_malloc_array(3, sizeof(*p)); ++ if (!p) return AVERROR(ENOMEM); - + - pix_fmts[0] = ctx->sw_format; +- pix_fmts[1] = AV_PIX_FMT_NONE; + // **** Offer native sand too ???? -+ pix_fmts[0] = ++ *p++ = +#if CONFIG_SAND + ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128 ? + AV_PIX_FMT_YUV420P : @@ -58016,21 +65104,30 @@ Upstream-status: Pending + AV_PIX_FMT_YUV420P10LE : +#endif + ctx->sw_format; - pix_fmts[1] = AV_PIX_FMT_NONE; - - *formats = pix_fmts; -@@ -197,18 +268,80 @@ static int drm_transfer_data_from(AVHWFr ++ ++#if CONFIG_SAND ++ if (ctx->sw_format == AV_PIX_FMT_RPI4_10 || ++ ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128) ++ *p++ = AV_PIX_FMT_NV12; ++#endif + +- *formats = pix_fmts; ++ *p = AV_PIX_FMT_NONE; + return 0; + } + +@@ -197,18 +273,63 @@ static int drm_transfer_data_from(AVHWFr map = av_frame_alloc(); if (!map) return AVERROR(ENOMEM); - map->format = dst->format; - + + // Map to default + map->format = AV_PIX_FMT_NONE; err = drm_map_frame(hwfc, map, src, AV_HWFRAME_MAP_READ); if (err) goto fail; - + - map->width = dst->width; - map->height = dst->height; +#if 0 @@ -58054,29 +65151,12 @@ Upstream-status: Pending + const unsigned int w = FFMIN(dst->width, map->width); + const unsigned int h = FFMIN(dst->height, map->height); + -+ if (map->format == AV_PIX_FMT_RPI4_8 && dst->format == AV_PIX_FMT_YUV420P) { -+ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], -+ map->data[0], -+ 128, stride2, -+ 0, 0, w, h); -+ av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1], -+ dst->data[2], dst->linesize[2], -+ map->data[1], -+ 128, stride2, -+ 0, 0, w / 2, h / 2); -+ } -+ else if (map->format == AV_PIX_FMT_RPI4_10 && dst->format == AV_PIX_FMT_YUV420P10LE) { -+ av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0], -+ map->data[0], -+ 128, stride2, -+ 0, 0, w, h); -+ av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1], -+ dst->data[2], dst->linesize[2], -+ map->data[1], -+ 128, stride2, -+ 0, 0, w / 2, h / 2); -+ } -+ else ++ map->crop_top = 0; ++ map->crop_bottom = 0; ++ map->crop_left = 0; ++ map->crop_right = 0; ++ ++ if (av_rpi_sand_to_planar_frame(dst, map) != 0) + { + av_log(hwfc, AV_LOG_ERROR, "%s: Incompatible output pixfmt for sand\n", __func__); + err = AVERROR(EINVAL); @@ -58094,30 +65174,30 @@ Upstream-status: Pending + map->height = dst->height; + err = av_frame_copy(dst, map); + } - + - err = av_frame_copy(dst, map); if (err) + { + av_log(hwfc, AV_LOG_ERROR, "%s: Copy fail\n", __func__); goto fail; + } - + err = 0; fail: -@@ -223,7 +356,10 @@ static int drm_transfer_data_to(AVHWFram +@@ -223,7 +344,10 @@ static int drm_transfer_data_to(AVHWFram int err; - + if (src->width > hwfc->width || src->height > 
hwfc->height) + { + av_log(hwfc, AV_LOG_ERROR, "%s: H/w mismatch: %d/%d, %d/%d\n", __func__, dst->width, hwfc->width, dst->height, hwfc->height); return AVERROR(EINVAL); + } - + map = av_frame_alloc(); if (!map) --- a/libavutil/pixdesc.c +++ b/libavutil/pixdesc.c -@@ -2371,6 +2371,38 @@ static const AVPixFmtDescriptor av_pix_f +@@ -2371,6 +2371,50 @@ static const AVPixFmtDescriptor av_pix_f .name = "vulkan", .flags = AV_PIX_FMT_FLAG_HWACCEL, }, @@ -58140,17 +65220,29 @@ Upstream-status: Pending + .log2_chroma_h = 1, + .comp = { + { 0, 2, 0, 0, 10, 0, 9, 1 }, /* Y */ -+ { 1, 4, 0, 0, 10, 1, 9, 1 }, /* U */ -+ { 1, 4, 1, 0, 10, 1, 9, 2 }, /* V */ ++ { 1, 4, 0, 0, 10, 3, 9, 1 }, /* U */ ++ { 1, 4, 2, 0, 10, 3, 9, 3 }, /* V */ ++ }, ++ .flags = 0, ++ }, ++ [AV_PIX_FMT_SAND64_16] = { ++ .name = "sand64_16", ++ .nb_components = 3, ++ .log2_chroma_w = 1, ++ .log2_chroma_h = 1, ++ .comp = { ++ { 0, 2, 0, 0, 16, 0, 15, 1 }, /* Y */ ++ { 1, 4, 0, 0, 16, 3, 15, 1 }, /* U */ ++ { 1, 4, 2, 0, 16, 3, 15, 3 }, /* V */ + }, + .flags = 0, + }, + [AV_PIX_FMT_RPI4_8] = { -+ .name = "rpi", ++ .name = "rpi4_8", + .flags = AV_PIX_FMT_FLAG_HWACCEL, + }, + [AV_PIX_FMT_RPI4_10] = { -+ .name = "rpi", ++ .name = "rpi4_10", + .flags = AV_PIX_FMT_FLAG_HWACCEL, + }, }; @@ -58159,7 +65251,7 @@ Upstream-status: Pending --- a/libavutil/pixfmt.h +++ b/libavutil/pixfmt.h @@ -357,6 +357,12 @@ enum AVPixelFormat { - + AV_PIX_FMT_Y210BE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, big-endian AV_PIX_FMT_Y210LE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, little-endian +// RPI - not on ifdef so can be got at by calling progs @@ -58168,7 +65260,7 @@ Upstream-status: Pending + AV_PIX_FMT_SAND64_16, ///< 4:2:0 16-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding + AV_PIX_FMT_RPI4_8, + AV_PIX_FMT_RPI4_10, - + AV_PIX_FMT_NB ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions }; --- /dev/null @@ -58403,7 +65495,7 @@ Upstream-status: Pending + --- /dev/null +++ b/libavutil/rpi_sand_fns.c -@@ -0,0 +1,356 @@ +@@ -0,0 +1,445 @@ +/* +Copyright (c) 2018 Raspberry Pi (Trading) Ltd. +All rights reserved. 
@@ -58635,6 +65727,75 @@ Upstream-status: Pending + } +} + ++// Fetches a single patch - offscreen fixup not done here ++// w <= stride1 ++// single lose bottom 2 bits truncation ++// _x & _w in pixels, strides in bytes ++void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word ++ const unsigned int xskip0 = _x - (x0 >> 2) * 3; ++ const unsigned int x1 = ((_x + _w) / 3) * 4; ++ const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3; ++ const unsigned int mask = stride1 - 1; ++ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2; ++ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words ++ ++#if HAVE_SAND_ASM ++ if (_x == 0) { ++ ff_rpi_sand30_lines_to_planar_y8(dst, dst_stride, src, stride1, stride2, _x, y, _w, h); ++ return; ++ } ++#endif ++ ++ if (x0 == x1) { ++ // ******************* ++ // Partial single word xfer ++ return; ++ } ++ ++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1) ++ { ++ unsigned int x = x0; ++ const uint32_t * p = (const uint32_t *)p0; ++ uint8_t * d = dst; ++ ++ if (xskip0 != 0) { ++ const uint32_t p3 = *p++; ++ ++ if (xskip0 == 1) ++ *d++ = (p3 >> 12) & 0xff; ++ *d++ = (p3 >> 22) & 0xff; ++ ++ if (((x += 4) & mask) == 0) ++ p += slice_inc; ++ } ++ ++ while (x != x1) { ++ const uint32_t p3 = *p++; ++ *d++ = (p3 >> 2) & 0xff; ++ *d++ = (p3 >> 12) & 0xff; ++ *d++ = (p3 >> 22) & 0xff; ++ ++ if (((x += 4) & mask) == 0) ++ p += slice_inc; ++ } ++ ++ if (xrem1 != 0) { ++ const uint32_t p3 = *p; ++ ++ *d++ = (p3 >> 2) & 0xff; ++ if (xrem1 == 2) ++ *d++ = (p3 >> 12) & 0xff; ++ } ++ } ++} ++ ++ + +// w/h in pixels +void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, @@ -58716,6 +65877,16 @@ Upstream-status: Pending + av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), + x/2, y/2, w/2, h/2); + break; ++ case AV_PIX_FMT_NV12: ++ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], ++ src->data[0], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x, y, w, h); ++ av_rpi_sand_to_planar_y8(dst->data[1], dst->linesize[1], ++ src->data[1], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x/2, y/2, w, h/2); ++ break; + default: + return -1; + } @@ -58750,6 +65921,16 @@ Upstream-status: Pending + av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), + x/2, y/2, w/2, h/2); + break; ++ case AV_PIX_FMT_NV12: ++ av_rpi_sand30_to_planar_y8(dst->data[0], dst->linesize[0], ++ src->data[0], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x, y, w, h); ++ av_rpi_sand30_to_planar_y8(dst->data[1], dst->linesize[1], ++ src->data[1], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x/2, y/2, w, h/2); ++ break; + default: + return -1; + } @@ -58762,7 +65943,7 @@ Upstream-status: Pending +} --- /dev/null +++ b/libavutil/rpi_sand_fns.h -@@ -0,0 +1,183 @@ +@@ -0,0 +1,188 @@ +/* +Copyright (c) 2018 Raspberry Pi (Trading) Ltd. +All rights reserved. 
@@ -58850,6 +66031,11 @@ Upstream-status: Pending + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h); + ++void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); + +// w/h in pixels +void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, @@ -58948,34 +66134,64 @@ Upstream-status: Pending + --- /dev/null +++ b/pi-util/BUILD.txt -@@ -0,0 +1,29 @@ +@@ -0,0 +1,59 @@ +Building Pi FFmpeg +================== + -+Configuration: -+============= ++Current only building on a Pi is supported. ++This builds ffmpeg the way I've tested it + -+These instructions work for cross compiles from Ubuntu 16.04 & Ubuntu -+18.04. I would expect most other linux environments to work but I haven't -+tried them. ++Get all dependencies - the current package dependencies are good enough + -+pi-util/conf_pi2.sh ++$ sudo apt-get build-dep ffmpeg + -+contains suitable options to build the code for Pi2/3. It expects to find -+git clones of ++Configure using the pi-util/conf_native.sh script ++------------------------------------------------- + -+https://github.com/raspberrypi/tools -+https://github.com/raspberrypi/firmware ++This sets the normal release options and creates an ouutput dir to build into ++The directory name will depend on system and options but will be under out/ + -+in the parent of the FFmpeg directory. I recommend using --depth 1 to avoid a -+lot of history you don't want. ++There are a few choices here ++ --mmal build including the legacy mmal-based decoders and zero-copy code ++ this requires appropriate libraries which currently will exist for ++ armv7 but not arm64 ++ --noshared ++ Build a static image rather than a shared library one. Static is ++ easier for testing as there is no need to worry about library ++ paths being confused and therefore running the wrong code, Shared ++ is what is needed, in most cases, when building for use by other ++ programs. + -+If you have a copy of qasm.py in ../local/bin then the .qasm sources will be -+rebuilt. Otherwise the prebuilt .c & .h files will be used. -+Likewise ../local/bin/vasmvidcore_std will enable VPU code rebuild ++So for a static build ++--------------------- + -+pi-util/conf_p1.sh should configure for Pi1. Beware that as of this time -+H265 QPU acceleration is broken on Pi1 and so it is disabled. ++$ pi-util/conf_native.sh --noshared ++ ++$ make -j8 -C out/ ++ ++You can now run ffmpeg directly from where it was built ++ ++For a shared build ++------------------ ++ ++$ pi-util/conf_native.sh ++ ++You will normally want an install target if shared. Note that the script has ++set this up to be generated in out//install, you don't have to worry ++about overwriting your system libs. ++ ++$ make -j8 -C out/ install ++ ++You can now set LD_LIBRARY_PATH appropriately and run ffmpeg from where it was ++built or install the image on the system - you have to be careful to get rid ++of all other ffmpeg libs or confusion may result. There is a little script ++that wipes all other versions - obviously use with care! 
++ ++$ sudo pi-util/clean_usr_libs.sh ++ ++Then simply copying from the install to /usr works ++ ++$ sudo cp -r out//install/* /usr + + --- /dev/null @@ -59137,29 +66353,32 @@ Upstream-status: Pending + --- /dev/null +++ b/pi-util/clean_usr_libs.sh -@@ -0,0 +1,23 @@ +@@ -0,0 +1,26 @@ +set -e +U=/usr/lib/arm-linux-gnueabihf +rm -f $U/libavcodec.* +rm -f $U/libavdevice.* +rm -f $U/libavfilter.* +rm -f $U/libavformat.* -+rm -f $U/libavresample.* +rm -f $U/libavutil.* ++rm -f $U/libswresample.* ++rm -f $U/libswscale.* +U=/usr/lib/arm-linux-gnueabihf/neon/vfp +rm -f $U/libavcodec.* +rm -f $U/libavdevice.* +rm -f $U/libavfilter.* +rm -f $U/libavformat.* -+rm -f $U/libavresample.* +rm -f $U/libavutil.* ++rm -f $U/libswresample.* ++rm -f $U/libswscale.* +U=/usr/lib/aarch64-linux-gnu +rm -f $U/libavcodec.* +rm -f $U/libavdevice.* +rm -f $U/libavfilter.* +rm -f $U/libavformat.* -+rm -f $U/libavresample.* +rm -f $U/libavutil.* ++rm -f $U/libswresample.* ++rm -f $U/libswscale.* + --- /dev/null +++ b/pi-util/conf_arm64_native.sh @@ -59706,57 +66925,90 @@ Upstream-status: Pending +1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 --- /dev/null +++ b/pi-util/conf_native.sh -@@ -0,0 +1,78 @@ +@@ -0,0 +1,106 @@ +echo "Configure for native build" + +FFSRC=`pwd` -+MC=`uname -m` ++MC=`dpkg --print-architecture` ++BUILDBASE=$FFSRC/out + +#RPI_KEEPS="-save-temps=obj" +RPI_KEEPS="" + -+if [ "$MC" == "aarch64" ]; then ++NOSHARED= ++MMAL= ++ ++while [ "$1" != "" ] ; do ++ case $1 in ++ --noshared) ++ NOSHARED=1 ++ ;; ++ --mmal) ++ MMAL=1 ++ ;; ++ *) ++ echo "Usage $0: [--noshared] [--mmal]" ++ exit 1 ++ ;; ++ esac ++ shift ++done ++ ++ ++MCOPTS= ++RPI_INCLUDES= ++RPI_LIBDIRS= ++RPI_DEFINES= ++RPI_EXTRALIBS= ++ ++if [ "$MC" == "arm64" ]; then + echo "M/C aarch64" + A=aarch64-linux-gnu + B=arm64 -+ MCOPTS= -+ RPI_INCLUDES= -+ RPI_LIBDIRS= -+ RPI_DEFINES= -+ RPI_EXTRALIBS= -+ RPIOPTS="--disable-mmal --enable-sand" -+else ++elif [ "$MC" == "armhf" ]; then + echo "M/C armv7" + A=arm-linux-gnueabihf + B=armv7 + MCOPTS="--arch=armv6t2 --cpu=cortex-a7" ++ RPI_DEFINES=-mfpu=neon-vfpv4 ++else ++ echo Unexpected architecture $MC ++ exit 1 ++fi ++ ++if [ $MMAL ]; then + RPI_OPT_VC=/opt/vc + RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" + RPI_LIBDIRS="-L$RPI_OPT_VC/lib" -+ RPI_DEFINES="-D__VCCOREVER__=0x4000000 -mfpu=neon-vfpv4" ++ RPI_DEFINES="$RPI_DEFINES -D__VCCOREVER__=0x4000000" + RPI_EXTRALIBS="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm -Wl,--end-group" + RPIOPTS="--enable-mmal --enable-rpi" ++else ++ RPIOPTS="--disable-mmal --enable-sand" +fi ++ +C=`lsb_release -sc` +V=`cat RELEASE` + +SHARED_LIBS="--enable-shared" -+if [ "$1" == "--noshared" ]; then ++if [ $NOSHARED ]; then + SHARED_LIBS="--disable-shared" -+ OUT=out/$B-$C-$V-static-rel ++ OUT=$BUILDBASE/$B-$C-$V-static-rel + echo Static libs +else + echo Shared libs -+ OUT=out/$B-$C-$V-shared-rel ++ OUT=$BUILDBASE/$B-$C-$V-shared-rel +fi + -+USR_PREFIX=$FFSRC/$OUT/install ++USR_PREFIX=$OUT/install +LIB_PREFIX=$USR_PREFIX/lib/$A +INC_PREFIX=$USR_PREFIX/include/$A + +echo Destination directory: $OUT -+mkdir -p $FFSRC/$OUT -+cd $FFSRC/$OUT ++mkdir -p $OUT ++# Nothing under here need worry git - including this .gitignore! 
++echo "**" > $BUILDBASE/.gitignore ++cd $OUT + +$FFSRC/configure \ + --prefix=$USR_PREFIX\ @@ -59767,10 +67019,8 @@ Upstream-status: Pending + --disable-thumb\ + --enable-v4l2-request\ + --enable-libdrm\ -+ --enable-epoxy\ -+ --enable-libudev\ -+ --enable-vout-drm\ + --enable-vout-egl\ ++ --enable-vout-drm\ + $SHARED_LIBS\ + $RPIOPTS\ + --extra-cflags="-ggdb $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\ @@ -59779,118 +67029,13 @@ Upstream-status: Pending + --extra-libs="$RPI_EXTRALIBS"\ + --extra-version="rpi" + -+# --enable-decoder=hevc_rpi\ -+# --enable-extra-warnings\ -+# --arch=armv71\ -+ -+# gcc option for getting asm listing -+# -Wa,-ahls ---- /dev/null -+++ b/pi-util/conf_pi1.sh -@@ -0,0 +1,39 @@ -+echo "Configure for Pi1" -+ -+RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf -+RPI_OPT_VC=`pwd`/../firmware/hardfp/opt/vc -+ -+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" -+RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib" -+#RPI_KEEPS="-save-temps=obj" -+RPI_KEEPS="" -+ -+SHARED_LIBS="--enable-shared" -+if [ "$1" == "--noshared" ]; then -+ SHARED_LIBS="--disable-shared" -+ echo Static libs -+else -+ echo Shared libs -+fi -+ -+./configure --enable-cross-compile\ -+ --cpu=arm1176jzf-s\ -+ --arch=arm\ -+ --disable-neon\ -+ --target-os=linux\ -+ --disable-stripping\ -+ --enable-mmal\ -+ $SHARED_LIBS\ -+ --extra-cflags="-g $RPI_KEEPS $RPI_INCLUDES"\ -+ --extra-cxxflags="$RPI_INCLUDES"\ -+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\ -+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ -+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- -+ -+ -+# --enable-extra-warnings\ -+# --arch=armv71\ -+# --enable-shared\ -+ -+# gcc option for getting asm listing -+# -Wa,-ahls ---- /dev/null -+++ b/pi-util/conf_pi2.sh -@@ -0,0 +1,57 @@ -+echo "Configure for Pi2/3" -+ -+FFSRC=`pwd` -+ -+RPI_TOOLROOT=$FFSRC/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf -+RPI_OPT_VC=$FFSRC/../firmware/hardfp/opt/vc -+ -+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" -+RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib" -+RPI_DEFINES="-D__VCCOREVER__=0x4000000 -mfpu=neon-vfpv4" -+#RPI_KEEPS="-save-temps=obj" -+RPI_KEEPS="" -+ -+SHARED_LIBS="--enable-shared" -+if [ "$1" == "--noshared" ]; then -+ SHARED_LIBS="--disable-shared" -+ OUT=out/x-armv7-static-rel -+ echo Static libs -+else -+ echo Shared libs -+ OUT=out/x-armv7-shared-rel -+fi -+ -+USR_PREFIX=$FFSRC/$OUT/install -+LIB_PREFIX=$USR_PREFIX/lib/arm-linux-gnueabihf -+INC_PREFIX=$USR_PREFIX/include/arm-linux-gnueabihf -+ -+mkdir -p $FFSRC/$OUT -+cd $FFSRC/$OUT -+ -+$FFSRC/configure --enable-cross-compile\ -+ --prefix=$USR_PREFIX\ -+ --libdir=$LIB_PREFIX\ -+ --incdir=$INC_PREFIX\ -+ --arch=armv6t2\ -+ --cpu=cortex-a7\ -+ --target-os=linux\ -+ --disable-stripping\ -+ --disable-thumb\ -+ --enable-mmal\ -+ --enable-rpi\ -+ $SHARED_LIBS\ -+ --extra-cflags="-ggdb $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\ -+ --extra-cxxflags="$RPI_DEFINES $RPI_INCLUDES"\ -+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\ -+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util 
-lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ -+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- -+ -+# --enable-shared\ -+ -+# --enable-decoder=hevc_rpi\ -+# --enable-extra-warnings\ -+# --arch=armv71\ -+# --enable-shared\ + +# gcc option for getting asm listing +# -Wa,-ahls --- /dev/null +++ b/pi-util/ffconf.py @@ -0,0 +1,215 @@ -+#!/usr/bin/env python ++#!/usr/bin/env python3 + +import string +import os @@ -59967,16 +67112,16 @@ Upstream-status: Pending + pass + + if m1 and m2 and m1.group() == m2.group(): -+ print >> flog, "Match: " + m1.group() ++ print("Match: " + m1.group(), file=flog) + rv = 0 + elif not m1: -+ print >> flog, "****** Cannot find m1" ++ print("****** Cannot find m1", file=flog) + rv = 3 + elif not m2: -+ print >> flog, "****** Cannot find m2" ++ print("****** Cannot find m2", file=flog) + rv = 2 + else: -+ print >> flog, "****** Mismatch: " + m1.group() + " != " + m2.group() ++ print("****** Mismatch: " + m1.group() + " != " + m2.group(), file=flog) + rv = 1 + flog.close() + return rv @@ -60022,7 +67167,7 @@ Upstream-status: Pending + exp_test = int(a[0]) + if (exp_test and runtest(a[1], tests)): + name = a[1] -+ print "==== ", name, ++ print ("==== ", name, end="") + sys.stdout.flush() + + rv = testone(os.path.join(test_root, name), name, a[2], a[3], a[4], dectype=dectype, vcodec=vcodec, ffmpeg_exec=ffmpeg_exec) @@ -60033,31 +67178,31 @@ Upstream-status: Pending + + if (rv == 0): + if exp_test == 2: -+ print ": * OK *" ++ print(": * OK *") + unx_success.append(name) + else: -+ print ": ok" ++ print(": ok") + elif exp_test == 2 and rv == 1: -+ print ": fail" ++ print(": fail") + elif exp_test == 3 and rv == 2: + # Call an expected "crash" an abort -+ print ": abort" ++ print(": abort") + else: + unx_failures.append(name) + if rv == 1: -+ print ": * FAIL *" ++ print(": * FAIL *") + elif (rv == 2) : -+ print ": * CRASH *" ++ print(": * CRASH *") + elif (rv == 3) : -+ print ": * MD5 MISSING *" ++ print(": * MD5 MISSING *") + else : -+ print ": * BANG *" ++ print(": * BANG *") + + if unx_failures or unx_success: -+ print "Unexpected Failures:", unx_failures -+ print "Unexpected Success: ", unx_success ++ print("Unexpected Failures:", unx_failures) ++ print("Unexpected Success: ", unx_success) + else: -+ print "All tests normal:", successes, "ok,", failures, "failed" ++ print("All tests normal:", successes, "ok,", failures, "failed") + + +class ConfCSVDialect(csv.Dialect): @@ -60567,3 +67712,630 @@ Upstream-status: Pending + + do_logparse(args.logfile) + +--- a/tests/checkasm/Makefile ++++ b/tests/checkasm/Makefile +@@ -9,8 +9,10 @@ AVCODECOBJS-$(CONFIG_G722DSP) + AVCODECOBJS-$(CONFIG_H264DSP) += h264dsp.o + AVCODECOBJS-$(CONFIG_H264PRED) += h264pred.o + AVCODECOBJS-$(CONFIG_H264QPEL) += h264qpel.o ++AVCODECOBJS-$(CONFIG_IDCTDSP) += idctdsp.o + AVCODECOBJS-$(CONFIG_LLVIDDSP) += llviddsp.o + AVCODECOBJS-$(CONFIG_LLVIDENCDSP) += llviddspenc.o ++AVCODECOBJS-$(CONFIG_VC1DSP) += vc1dsp.o + AVCODECOBJS-$(CONFIG_VP8DSP) += vp8dsp.o + AVCODECOBJS-$(CONFIG_VIDEODSP) += videodsp.o + +--- a/tests/checkasm/checkasm.c ++++ b/tests/checkasm/checkasm.c +@@ -121,6 +121,9 @@ static const struct { + #if CONFIG_HUFFYUV_DECODER + { "huffyuvdsp", checkasm_check_huffyuvdsp }, + #endif ++ #if CONFIG_IDCTDSP ++ { "idctdsp", checkasm_check_idctdsp }, ++ #endif + #if CONFIG_JPEG2000_DECODER + { "jpeg2000dsp", checkasm_check_jpeg2000dsp }, + #endif +@@ -145,6 +148,9 @@ static const struct { + #if CONFIG_V210_ENCODER + { "v210enc", checkasm_check_v210enc }, + #endif ++ #if CONFIG_VC1DSP 
++ { "vc1dsp", checkasm_check_vc1dsp }, ++ #endif + #if CONFIG_VP8DSP + { "vp8dsp", checkasm_check_vp8dsp }, + #endif +--- a/tests/checkasm/checkasm.h ++++ b/tests/checkasm/checkasm.h +@@ -60,6 +60,7 @@ void checkasm_check_hevc_add_res(void); + void checkasm_check_hevc_idct(void); + void checkasm_check_hevc_sao(void); + void checkasm_check_huffyuvdsp(void); ++void checkasm_check_idctdsp(void); + void checkasm_check_jpeg2000dsp(void); + void checkasm_check_llviddsp(void); + void checkasm_check_llviddspenc(void); +@@ -73,6 +74,7 @@ void checkasm_check_sw_scale(void); + void checkasm_check_utvideodsp(void); + void checkasm_check_v210dec(void); + void checkasm_check_v210enc(void); ++void checkasm_check_vc1dsp(void); + void checkasm_check_vf_eq(void); + void checkasm_check_vf_gblur(void); + void checkasm_check_vf_hflip(void); +--- /dev/null ++++ b/tests/checkasm/idctdsp.c +@@ -0,0 +1,98 @@ ++/* ++ * Copyright (c) 2022 Ben Avison ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with FFmpeg; if not, write to the Free Software Foundation, Inc., ++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ++ */ ++ ++#include ++ ++#include "checkasm.h" ++ ++#include "libavcodec/idctdsp.h" ++ ++#include "libavutil/common.h" ++#include "libavutil/internal.h" ++#include "libavutil/intreadwrite.h" ++#include "libavutil/mem_internal.h" ++ ++#define IDCTDSP_TEST(func) { #func, offsetof(IDCTDSPContext, func) }, ++ ++typedef struct { ++ const char *name; ++ size_t offset; ++} test; ++ ++#define RANDOMIZE_BUFFER16(name, size) \ ++ do { \ ++ int i; \ ++ for (i = 0; i < size; ++i) { \ ++ uint16_t r = rnd() % 0x201 - 0x100; \ ++ AV_WN16A(name##0 + i, r); \ ++ AV_WN16A(name##1 + i, r); \ ++ } \ ++ } while (0) ++ ++#define RANDOMIZE_BUFFER8(name, size) \ ++ do { \ ++ int i; \ ++ for (i = 0; i < size; ++i) { \ ++ uint8_t r = rnd(); \ ++ name##0[i] = r; \ ++ name##1[i] = r; \ ++ } \ ++ } while (0) ++ ++static void check_add_put_clamped(void) ++{ ++ /* Source buffers are only as big as needed, since any over-read won't affect results */ ++ LOCAL_ALIGNED_16(int16_t, src0, [64]); ++ LOCAL_ALIGNED_16(int16_t, src1, [64]); ++ /* Destination buffers have borders of one row above/below and 8 columns left/right to catch overflows */ ++ LOCAL_ALIGNED_8(uint8_t, dst0, [10 * 24]); ++ LOCAL_ALIGNED_8(uint8_t, dst1, [10 * 24]); ++ ++ AVCodecContext avctx = { 0 }; ++ IDCTDSPContext h; ++ ++ const test tests[] = { ++ IDCTDSP_TEST(add_pixels_clamped) ++ IDCTDSP_TEST(put_pixels_clamped) ++ IDCTDSP_TEST(put_signed_pixels_clamped) ++ }; ++ ++ ff_idctdsp_init(&h, &avctx); ++ ++ for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) { ++ void (*func)(const int16_t *, uint8_t * ptrdiff_t) = *(void **)((intptr_t) &h + tests[t].offset); ++ if (check_func(func, "idctdsp.%s", tests[t].name)) { ++ declare_func_emms(AV_CPU_FLAG_MMX, void, const int16_t *, uint8_t *, ptrdiff_t); ++ RANDOMIZE_BUFFER16(src, 64); ++ RANDOMIZE_BUFFER8(dst, 10 * 24); ++ call_ref(src0, 
dst0 + 24 + 8, 24); ++ call_new(src1, dst1 + 24 + 8, 24); ++ if (memcmp(dst0, dst1, 10 * 24)) ++ fail(); ++ bench_new(src1, dst1 + 24 + 8, 24); ++ } ++ } ++} ++ ++void checkasm_check_idctdsp(void) ++{ ++ check_add_put_clamped(); ++ report("idctdsp"); ++} +--- /dev/null ++++ b/tests/checkasm/vc1dsp.c +@@ -0,0 +1,452 @@ ++/* ++ * Copyright (c) 2022 Ben Avison ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with FFmpeg; if not, write to the Free Software Foundation, Inc., ++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ++ */ ++ ++#include ++ ++#include "checkasm.h" ++ ++#include "libavcodec/vc1dsp.h" ++ ++#include "libavutil/common.h" ++#include "libavutil/internal.h" ++#include "libavutil/intreadwrite.h" ++#include "libavutil/mem_internal.h" ++ ++#define VC1DSP_TEST(func) { #func, offsetof(VC1DSPContext, func) }, ++#define VC1DSP_SIZED_TEST(func, width, height) { #func, offsetof(VC1DSPContext, func), width, height }, ++ ++typedef struct { ++ const char *name; ++ size_t offset; ++ int width; ++ int height; ++} test; ++ ++typedef struct matrix { ++ size_t width; ++ size_t height; ++ float d[]; ++} matrix; ++ ++static const matrix T8 = { 8, 8, { ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 16, 15, 9, 4, -4, -9, -15, -16, ++ 16, 6, -6, -16, -16, -6, 6, 16, ++ 15, -4, -16, -9, 9, 16, 4, -15, ++ 12, -12, -12, 12, 12, -12, -12, 12, ++ 9, -16, 4, 15, -15, -4, 16, -9, ++ 6, -16, 16, -6, -6, 16, -16, 6, ++ 4, -9, 15, -16, 16, -15, 9, -4 ++} }; ++ ++static const matrix T4 = { 4, 4, { ++ 17, 17, 17, 17, ++ 22, 10, -10, -22, ++ 17, -17, -17, 17, ++ 10, -22, 22, -10 ++} }; ++ ++static const matrix T8t = { 8, 8, { ++ 12, 16, 16, 15, 12, 9, 6, 4, ++ 12, 15, 6, -4, -12, -16, -16, -9, ++ 12, 9, -6, -16, -12, 4, 16, 15, ++ 12, 4, -16, -9, 12, 15, -6, -16, ++ 12, -4, -16, 9, 12, -15, -6, 16, ++ 12, -9, -6, 16, -12, -4, 16, -15, ++ 12, -15, 6, 4, -12, 16, -16, 9, ++ 12, -16, 16, -15, 12, -9, 6, -4 ++} }; ++ ++static const matrix T4t = { 4, 4, { ++ 17, 22, 17, 10, ++ 17, 10, -17, -22, ++ 17, -10, -17, 22, ++ 17, -22, 17, -10 ++} }; ++ ++static matrix *new_matrix(size_t width, size_t height) ++{ ++ matrix *out = av_mallocz(sizeof (matrix) + height * width * sizeof (float)); ++ if (out == NULL) { ++ fprintf(stderr, "Memory allocation failure\n"); ++ exit(EXIT_FAILURE); ++ } ++ out->width = width; ++ out->height = height; ++ return out; ++} ++ ++static matrix *multiply(const matrix *a, const matrix *b) ++{ ++ matrix *out; ++ if (a->width != b->height) { ++ fprintf(stderr, "Incompatible multiplication\n"); ++ exit(EXIT_FAILURE); ++ } ++ out = new_matrix(b->width, a->height); ++ for (int j = 0; j < out->height; ++j) ++ for (int i = 0; i < out->width; ++i) { ++ float sum = 0; ++ for (int k = 0; k < a->width; ++k) ++ sum += a->d[j * a->width + k] * b->d[k * b->width + i]; ++ out->d[j * out->width + i] = sum; ++ } ++ return out; ++} ++ ++static void normalise(matrix *a) ++{ ++ for (int j = 0; j < a->height; ++j) ++ for (int i = 0; i 
< a->width; ++i) { ++ float *p = a->d + j * a->width + i; ++ *p *= 64; ++ if (a->height == 4) ++ *p /= (const unsigned[]) { 289, 292, 289, 292 } [j]; ++ else ++ *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [j]; ++ if (a->width == 4) ++ *p /= (const unsigned[]) { 289, 292, 289, 292 } [i]; ++ else ++ *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [i]; ++ } ++} ++ ++static void divide_and_round_nearest(matrix *a, float by) ++{ ++ for (int j = 0; j < a->height; ++j) ++ for (int i = 0; i < a->width; ++i) { ++ float *p = a->d + j * a->width + i; ++ *p = rintf(*p / by); ++ } ++} ++ ++static void tweak(matrix *a) ++{ ++ for (int j = 4; j < a->height; ++j) ++ for (int i = 0; i < a->width; ++i) { ++ float *p = a->d + j * a->width + i; ++ *p += 1; ++ } ++} ++ ++/* The VC-1 spec places restrictions on the values permitted at three ++ * different stages: ++ * - D: the input coefficients in frequency domain ++ * - E: the intermediate coefficients, inverse-transformed only horizontally ++ * - R: the fully inverse-transformed coefficients ++ * ++ * To fully cater for the ranges specified requires various intermediate ++ * values to be held to 17-bit precision; yet these conditions do not appear ++ * to be utilised in real-world streams. At least some assembly ++ * implementations have chosen to restrict these values to 16-bit precision, ++ * to accelerate the decoding of real-world streams at the cost of strict ++ * adherence to the spec. To avoid our test marking these as failures, ++ * reduce our random inputs. ++ */ ++#define ATTENUATION 4 ++ ++static matrix *generate_inverse_quantized_transform_coefficients(size_t width, size_t height) ++{ ++ matrix *raw, *tmp, *D, *E, *R; ++ raw = new_matrix(width, height); ++ for (int i = 0; i < width * height; ++i) ++ raw->d[i] = (int) (rnd() % (1024/ATTENUATION)) - 512/ATTENUATION; ++ tmp = multiply(height == 8 ? &T8 : &T4, raw); ++ D = multiply(tmp, width == 8 ? &T8t : &T4t); ++ normalise(D); ++ divide_and_round_nearest(D, 1); ++ for (int i = 0; i < width * height; ++i) { ++ if (D->d[i] < -2048/ATTENUATION || D->d[i] > 2048/ATTENUATION-1) { ++ /* Rare, so simply try again */ ++ av_free(raw); ++ av_free(tmp); ++ av_free(D); ++ return generate_inverse_quantized_transform_coefficients(width, height); ++ } ++ } ++ E = multiply(D, width == 8 ? &T8 : &T4); ++ divide_and_round_nearest(E, 8); ++ for (int i = 0; i < width * height; ++i) ++ if (E->d[i] < -4096/ATTENUATION || E->d[i] > 4096/ATTENUATION-1) { ++ /* Rare, so simply try again */ ++ av_free(raw); ++ av_free(tmp); ++ av_free(D); ++ av_free(E); ++ return generate_inverse_quantized_transform_coefficients(width, height); ++ } ++ R = multiply(height == 8 ? 
&T8t : &T4t, E); ++ tweak(R); ++ divide_and_round_nearest(R, 128); ++ for (int i = 0; i < width * height; ++i) ++ if (R->d[i] < -512/ATTENUATION || R->d[i] > 512/ATTENUATION-1) { ++ /* Rare, so simply try again */ ++ av_free(raw); ++ av_free(tmp); ++ av_free(D); ++ av_free(E); ++ av_free(R); ++ return generate_inverse_quantized_transform_coefficients(width, height); ++ } ++ av_free(raw); ++ av_free(tmp); ++ av_free(E); ++ av_free(R); ++ return D; ++} ++ ++#define RANDOMIZE_BUFFER16(name, size) \ ++ do { \ ++ int i; \ ++ for (i = 0; i < size; ++i) { \ ++ uint16_t r = rnd(); \ ++ AV_WN16A(name##0 + i, r); \ ++ AV_WN16A(name##1 + i, r); \ ++ } \ ++ } while (0) ++ ++#define RANDOMIZE_BUFFER8(name, size) \ ++ do { \ ++ int i; \ ++ for (i = 0; i < size; ++i) { \ ++ uint8_t r = rnd(); \ ++ name##0[i] = r; \ ++ name##1[i] = r; \ ++ } \ ++ } while (0) ++ ++#define RANDOMIZE_BUFFER8_MID_WEIGHTED(name, size) \ ++ do { \ ++ uint8_t *p##0 = name##0, *p##1 = name##1; \ ++ int i = (size); \ ++ while (i-- > 0) { \ ++ int x = 0x80 | (rnd() & 0x7F); \ ++ x >>= rnd() % 9; \ ++ if (rnd() & 1) \ ++ x = -x; \ ++ *p##1++ = *p##0++ = 0x80 + x; \ ++ } \ ++ } while (0) ++ ++static void check_inv_trans_inplace(void) ++{ ++ /* Inverse transform input coefficients are stored in a 16-bit buffer ++ * with row stride of 8 coefficients irrespective of transform size. ++ * vc1_inv_trans_8x8 differs from the others in two ways: coefficients ++ * are stored in column-major order, and the outputs are written back ++ * to the input buffer, so we oversize it slightly to catch overruns. */ ++ LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [10 * 8]); ++ LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [10 * 8]); ++ ++ VC1DSPContext h; ++ ++ ff_vc1dsp_init(&h); ++ ++ if (check_func(h.vc1_inv_trans_8x8, "vc1dsp.vc1_inv_trans_8x8")) { ++ matrix *coeffs; ++ declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *); ++ RANDOMIZE_BUFFER16(inv_trans_in, 10 * 8); ++ coeffs = generate_inverse_quantized_transform_coefficients(8, 8); ++ for (int j = 0; j < 8; ++j) ++ for (int i = 0; i < 8; ++i) { ++ int idx = 8 + i * 8 + j; ++ inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * 8 + i]; ++ } ++ call_ref(inv_trans_in0 + 8); ++ call_new(inv_trans_in1 + 8); ++ if (memcmp(inv_trans_in0, inv_trans_in1, 10 * 8 * sizeof (int16_t))) ++ fail(); ++ bench_new(inv_trans_in1 + 8); ++ av_free(coeffs); ++ } ++} ++ ++static void check_inv_trans_adding(void) ++{ ++ /* Inverse transform input coefficients are stored in a 16-bit buffer ++ * with row stride of 8 coefficients irrespective of transform size. */ ++ LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [8 * 8]); ++ LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [8 * 8]); ++ ++ /* For all but vc1_inv_trans_8x8, the inverse transform is narrowed and ++ * added with saturation to an array of unsigned 8-bit values. Oversize ++ * this by 8 samples left and right and one row above and below. 
*/ ++ LOCAL_ALIGNED_8(uint8_t, inv_trans_out0, [10 * 24]); ++ LOCAL_ALIGNED_8(uint8_t, inv_trans_out1, [10 * 24]); ++ ++ VC1DSPContext h; ++ ++ const test tests[] = { ++ VC1DSP_SIZED_TEST(vc1_inv_trans_8x4, 8, 4) ++ VC1DSP_SIZED_TEST(vc1_inv_trans_4x8, 4, 8) ++ VC1DSP_SIZED_TEST(vc1_inv_trans_4x4, 4, 4) ++ VC1DSP_SIZED_TEST(vc1_inv_trans_8x8_dc, 8, 8) ++ VC1DSP_SIZED_TEST(vc1_inv_trans_8x4_dc, 8, 4) ++ VC1DSP_SIZED_TEST(vc1_inv_trans_4x8_dc, 4, 8) ++ VC1DSP_SIZED_TEST(vc1_inv_trans_4x4_dc, 4, 4) ++ }; ++ ++ ff_vc1dsp_init(&h); ++ ++ for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) { ++ void (*func)(uint8_t *, ptrdiff_t, int16_t *) = *(void **)((intptr_t) &h + tests[t].offset); ++ if (check_func(func, "vc1dsp.%s", tests[t].name)) { ++ matrix *coeffs; ++ declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int16_t *); ++ RANDOMIZE_BUFFER16(inv_trans_in, 8 * 8); ++ RANDOMIZE_BUFFER8(inv_trans_out, 10 * 24); ++ coeffs = generate_inverse_quantized_transform_coefficients(tests[t].width, tests[t].height); ++ for (int j = 0; j < tests[t].height; ++j) ++ for (int i = 0; i < tests[t].width; ++i) { ++ int idx = j * 8 + i; ++ inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * tests[t].width + i]; ++ } ++ call_ref(inv_trans_out0 + 24 + 8, 24, inv_trans_in0); ++ call_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1); ++ if (memcmp(inv_trans_out0, inv_trans_out1, 10 * 24)) ++ fail(); ++ bench_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1 + 8); ++ av_free(coeffs); ++ } ++ } ++} ++ ++static void check_loop_filter(void) ++{ ++ /* Deblocking filter buffers are big enough to hold a 16x16 block, ++ * plus 16 columns left and 4 rows above to hold filter inputs ++ * (depending on whether v or h neighbouring block edge, oversized ++ * horizontally to maintain 16-byte alignment) plus 16 columns and ++ * 4 rows below to catch write overflows */ ++ LOCAL_ALIGNED_16(uint8_t, filter_buf0, [24 * 48]); ++ LOCAL_ALIGNED_16(uint8_t, filter_buf1, [24 * 48]); ++ ++ VC1DSPContext h; ++ ++ const test tests[] = { ++ VC1DSP_TEST(vc1_v_loop_filter4) ++ VC1DSP_TEST(vc1_h_loop_filter4) ++ VC1DSP_TEST(vc1_v_loop_filter8) ++ VC1DSP_TEST(vc1_h_loop_filter8) ++ VC1DSP_TEST(vc1_v_loop_filter16) ++ VC1DSP_TEST(vc1_h_loop_filter16) ++ }; ++ ++ ff_vc1dsp_init(&h); ++ ++ for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) { ++ void (*func)(uint8_t *, ptrdiff_t, int) = *(void **)((intptr_t) &h + tests[t].offset); ++ declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int); ++ if (check_func(func, "vc1dsp.%s", tests[t].name)) { ++ for (int count = 1000; count > 0; --count) { ++ int pq = rnd() % 31 + 1; ++ RANDOMIZE_BUFFER8_MID_WEIGHTED(filter_buf, 24 * 48); ++ call_ref(filter_buf0 + 4 * 48 + 16, 48, pq); ++ call_new(filter_buf1 + 4 * 48 + 16, 48, pq); ++ if (memcmp(filter_buf0, filter_buf1, 24 * 48)) ++ fail(); ++ } ++ } ++ for (int j = 0; j < 24; ++j) ++ for (int i = 0; i < 48; ++i) ++ filter_buf1[j * 48 + i] = 0x60 + 0x40 * (i >= 16 && j >= 4); ++ if (check_func(func, "vc1dsp.%s_bestcase", tests[t].name)) ++ bench_new(filter_buf1 + 4 * 48 + 16, 48, 1); ++ if (check_func(func, "vc1dsp.%s_worstcase", tests[t].name)) ++ bench_new(filter_buf1 + 4 * 48 + 16, 48, 31); ++ } ++} ++ ++#define TEST_UNESCAPE \ ++ do { \ ++ for (int count = 100; count > 0; --count) { \ ++ escaped_offset = rnd() & 7; \ ++ unescaped_offset = rnd() & 7; \ ++ escaped_len = (1u << (rnd() % 8) + 3) - (rnd() & 7); \ ++ RANDOMIZE_BUFFER8(unescaped, UNESCAPE_BUF_SIZE); \ ++ len0 = call_ref(escaped0 + escaped_offset, escaped_len, unescaped0 + 
unescaped_offset); \ ++ len1 = call_new(escaped1 + escaped_offset, escaped_len, unescaped1 + unescaped_offset); \ ++ if (len0 != len1 || memcmp(unescaped0, unescaped1, UNESCAPE_BUF_SIZE)) \ ++ fail(); \ ++ } \ ++ } while (0) ++ ++static void check_unescape(void) ++{ ++ /* This appears to be a typical length of buffer in use */ ++#define LOG2_UNESCAPE_BUF_SIZE 17 ++#define UNESCAPE_BUF_SIZE (1u< +Date: Thu, 8 Dec 2022 10:34:20 -0600 +Subject: [PATCH] configure: setup for OE-core usage + +Upstream-Status: Inappropriate + +RPI-Distro repo clones original ffmpeg and applies patches to enable +raspiberry pi support. + +Add global CFLAGS and LDFLAGS. So, that when +./configure runs test it's able to locate proper +headers and libs in a cross-compile environment. + +Add new check to opengl. None of the above headers +exists and we also should be using GLESv2. + +Update where compiler finds OMX_Core.h + +Only check that sdl2 version greater than 2.0.1 + +Signed-off-by: Vincent Davis Jr +--- + configure | 16 +++++++++------- + 1 file changed, 9 insertions(+), 7 deletions(-) + +diff --git a/configure b/configure +index 723b81f1..0c7f2654 100755 +--- a/configure ++++ b/configure +@@ -5746,6 +5746,9 @@ enable_weak_pic() { + } + + enabled pic && enable_weak_pic ++# Set CFLAGS and LDFLAGS globally ++add_cflags -I${sysroot}/usr/include/ -I${sysroot}/usr/include/IL -I${sysroot}/usr/include/drm ++add_ldflags -L${sysroot}/usr/lib/ + + test_cc <= 2.0.1 sdl2 < 2.1.0" SDL_events.h SDL_PollEvent ++ test_pkg_config sdl2 "sdl2 >= 2.0.1" SDL_events.h SDL_PollEvent + if disabled sdl2 && "${SDL2_CONFIG}" --version > /dev/null 2>&1; then + sdl2_cflags=$("${SDL2_CONFIG}" --cflags) + sdl2_extralibs=$("${SDL2_CONFIG}" --libs) +-- +2.38.1 + diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/2002-libavdevice-opengl_enc-update-dynamic-function-loader.patch b/recipes-multimedia/rpidistro-ffmpeg/files/2002-libavdevice-opengl_enc-update-dynamic-function-loader.patch new file mode 100644 index 0000000..43a9191 --- /dev/null +++ b/recipes-multimedia/rpidistro-ffmpeg/files/2002-libavdevice-opengl_enc-update-dynamic-function-loader.patch @@ -0,0 +1,111 @@ +From be426ad76c3e486f1364dd292cf8e1c633c80e91 Mon Sep 17 00:00:00 2001 +From: Vincent Davis Jr +Date: Thu, 8 Dec 2022 10:39:47 -0600 +Subject: [PATCH] libavdevice: opengl_enc.c update dynamic function loader + +Upstream-Status: Inappropriate + +RPI-Distro repo clones original ffmpeg and applies patches to enable +raspiberry pi support. + +For meta-raspberrypi ffmpeg builds, when opengl +is enabled do_compile will fail. Reasion is that +glGetProcAddress is undefined in either GLES2/gl2.h +or GLES2/gl2ext.h. + +define SelectedGetProcAddress to SDL_GL_GetProcAddress +if sdl2 is included. If not included, define function +pointers at compile time versus runtime. 
+
+Signed-off-by: Vincent Davis Jr
+---
+ libavdevice/opengl_enc.c | 44 ++++++++++++++++++++++++++++++++++++----
+ 1 file changed, 40 insertions(+), 4 deletions(-)
+
+diff --git a/libavdevice/opengl_enc.c b/libavdevice/opengl_enc.c
+index 2bdb8da7..eabc1bf8 100644
+--- a/libavdevice/opengl_enc.c
++++ b/libavdevice/opengl_enc.c
+@@ -37,12 +37,13 @@
+ #include
+ #elif HAVE_ES2_GL_H
+ #include
+-#else
+-#include
+-#include
+ #endif
+ #if HAVE_GLXGETPROCADDRESS
+ #include
++#else
++#define GL_GLEXT_PROTOTYPES
++#include
++#include
+ #endif
+ 
+ #if CONFIG_SDL2
+@@ -493,8 +494,14 @@ static int av_cold opengl_load_procedures(OpenGLContext *opengl)
+ 
+ #if HAVE_GLXGETPROCADDRESS
+ #define SelectedGetProcAddress glXGetProcAddress
++#define CAN_DYNAMIC_LOAD 1
+ #elif HAVE_WGLGETPROCADDRESS
+ #define SelectedGetProcAddress wglGetProcAddress
++#elif CONFIG_SDL2
++#define SelectedGetProcAddress SDL_GL_GetProcAddress
++#define CAN_DYNAMIC_LOAD 1
++#else
++#define CAN_DYNAMIC_LOAD 0
+ #endif
+ 
+ #define LOAD_OPENGL_FUN(name, type) \
+@@ -504,7 +511,8 @@ static int av_cold opengl_load_procedures(OpenGLContext *opengl)
+         return AVERROR(ENOSYS); \
+     }
+ 
+-#if CONFIG_SDL2
++#if CAN_DYNAMIC_LOAD
++#if CONFIG_SDL2
+     if (!opengl->no_window)
+         return opengl_sdl_load_procedures(opengl);
+ #endif
+@@ -534,9 +542,37 @@ static int av_cold opengl_load_procedures(OpenGLContext *opengl)
+     LOAD_OPENGL_FUN(glGetShaderInfoLog, FF_PFNGLGETSHADERINFOLOGPROC)
+     LOAD_OPENGL_FUN(glEnableVertexAttribArray, FF_PFNGLENABLEVERTEXATTRIBARRAYPROC)
+     LOAD_OPENGL_FUN(glVertexAttribPointer, FF_PFNGLVERTEXATTRIBPOINTERPROC)
++#else
++    procs->glActiveTexture = glActiveTexture;
++    procs->glGenBuffers = glGenBuffers;
++    procs->glDeleteBuffers = glDeleteBuffers;
++    procs->glBufferData = glBufferData;
++    procs->glBindBuffer = glBindBuffer;
++    procs->glGetAttribLocation = glGetAttribLocation;
++    procs->glGetUniformLocation = glGetUniformLocation;
++    procs->glUniform1f = glUniform1f;
++    procs->glUniform1i = glUniform1i;
++    procs->glUniformMatrix4fv = glUniformMatrix4fv;
++    procs->glCreateProgram = glCreateProgram;
++    procs->glDeleteProgram = glDeleteProgram;
++    procs->glUseProgram = glUseProgram;
++    procs->glLinkProgram = glLinkProgram;
++    procs->glGetProgramiv = glGetProgramiv;
++    procs->glGetProgramInfoLog = glGetProgramInfoLog;
++    procs->glAttachShader = glAttachShader;
++    procs->glCreateShader = glCreateShader;
++    procs->glDeleteShader = glDeleteShader;
++    procs->glCompileShader = glCompileShader;
++    procs->glShaderSource = glShaderSource;
++    procs->glGetShaderiv = glGetShaderiv;
++    procs->glGetShaderInfoLog = glGetShaderInfoLog;
++    procs->glEnableVertexAttribArray = glEnableVertexAttribArray;
++    procs->glVertexAttribPointer = (FF_PFNGLVERTEXATTRIBPOINTERPROC) glVertexAttribPointer;
++#endif
+ 
+     return 0;
+ 
++#undef CAN_DYNAMIC_LOAD
+ #undef SelectedGetProcAddress
+ #undef LOAD_OPENGL_FUN
+ }
+--
+2.38.1
+
diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/2003-libavcodec-fix-v4l2_req_devscan.patch b/recipes-multimedia/rpidistro-ffmpeg/files/2003-libavcodec-fix-v4l2_req_devscan.patch
new file mode 100644
index 0000000..2232c48
--- /dev/null
+++ b/recipes-multimedia/rpidistro-ffmpeg/files/2003-libavcodec-fix-v4l2_req_devscan.patch
@@ -0,0 +1,45 @@
+From 62c2f041890a6e20770350721a0a2138d0b38634 Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr
+Date: Sat, 3 Dec 2022 23:35:51 -0600
+Subject: [PATCH] libavcodec: fix v4l2_req_devscan.h
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo clones original ffmpeg and applies patches to enable
+raspberry pi support.
+
+Fixes minor differences between v4l2_req_devscan.c
+and v4l2_req_devscan.h after all patches have been
+applied.
+
+Signed-off-by: Vincent Davis Jr
+---
+ libavcodec/v4l2_req_devscan.h | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/libavcodec/v4l2_req_devscan.h b/libavcodec/v4l2_req_devscan.h
+index 0baef365..cd9c49ac 100644
+--- a/libavcodec/v4l2_req_devscan.h
++++ b/libavcodec/v4l2_req_devscan.h
+@@ -1,6 +1,8 @@
+ #ifndef _DEVSCAN_H_
+ #define _DEVSCAN_H_
+ 
++#include
++
+ struct devscan;
+ struct decdev;
+ enum v4l2_buf_type;
+@@ -13,7 +15,8 @@ const char *decdev_video_path(const struct decdev *const dev);
+ enum v4l2_buf_type decdev_src_type(const struct decdev *const dev);
+ uint32_t decdev_src_pixelformat(const struct decdev *const dev);
+ 
+-const struct decdev *devscan_find(struct devscan *const scan, const uint32_t src_fmt_v4l2);
++const struct decdev *devscan_find(struct devscan *const scan,
++                                  const uint32_t src_fmt_v4l2);
+ 
+ int devscan_build(void * const dc, struct devscan **pscan);
+ void devscan_delete(struct devscan **const pScan);
+--
+2.38.1
+
diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/2004-libavcodec-omx-replace-opt-vc-path-with-usr-lib.patch b/recipes-multimedia/rpidistro-ffmpeg/files/2004-libavcodec-omx-replace-opt-vc-path-with-usr-lib.patch
new file mode 100644
index 0000000..02c07de
--- /dev/null
+++ b/recipes-multimedia/rpidistro-ffmpeg/files/2004-libavcodec-omx-replace-opt-vc-path-with-usr-lib.patch
@@ -0,0 +1,35 @@
+From 0dfb56e12fa709794525cda1471091f6699905d5 Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr
+Date: Thu, 8 Dec 2022 10:49:03 -0600
+Subject: [PATCH] libavcodec: omx replace /opt/vc path with /usr/lib
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo clones original ffmpeg and applies patches to enable
+raspberry pi support.
+
+Configures omx.c for OE usage, as libbcm_host.so
+and libopenmaxil.so are located in a different
+location.
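
As a side note on the runtime effect of this path change, here is a small sketch of the candidate-path probing that omx.c performs with dlopen(). Only the two /usr/lib paths come from the patch; the helper, its output and the RTLD flags shown are invented for illustration (build with -ldl on glibc).

/* Probe a list of candidate shared objects and return the first that loads. */
#include <dlfcn.h>
#include <stdio.h>

static void *try_load_first(const char * const names[], int count)
{
    for (int i = 0; i < count; i++) {
        void *handle = dlopen(names[i], RTLD_NOW | RTLD_GLOBAL);
        if (handle) {
            printf("loaded %s\n", names[i]);
            return handle;
        }
        printf("could not load %s: %s\n", names[i], dlerror());
    }
    return NULL;
}

int main(void)
{
    /* after the patch these are looked up in /usr/lib instead of /opt/vc/lib */
    static const char * const libnames[] = {
        "/usr/lib/libopenmaxil.so",
        "/usr/lib/libbcm_host.so",
    };
    void *handle = try_load_first(libnames, 2);
    if (handle)
        dlclose(handle);
    return 0;
}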
+
+Signed-off-by: Vincent Davis Jr
+---
+ libavcodec/omx.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/libavcodec/omx.c b/libavcodec/omx.c
+index 0a6a3083..8c6e9193 100644
+--- a/libavcodec/omx.c
++++ b/libavcodec/omx.c
+@@ -141,7 +141,7 @@ static av_cold OMXContext *omx_init(void *logctx, const char *libname, const cha
+ {
+     static const char * const libnames[] = {
+ #if CONFIG_OMX_RPI
+-        "/opt/vc/lib/libopenmaxil.so", "/opt/vc/lib/libbcm_host.so",
++        "/usr/lib/libopenmaxil.so", "/usr/lib/libbcm_host.so",
+ #else
+         "libOMX_Core.so", NULL,
+         "libOmxCore.so", NULL,
+--
+2.38.1
+
diff --git a/recipes-multimedia/rpidistro-ffmpeg/rpidistro-ffmpeg_4.3.2.bb b/recipes-multimedia/rpidistro-ffmpeg/rpidistro-ffmpeg_4.3.4.bb
similarity index 89%
rename from recipes-multimedia/rpidistro-ffmpeg/rpidistro-ffmpeg_4.3.2.bb
rename to recipes-multimedia/rpidistro-ffmpeg/rpidistro-ffmpeg_4.3.4.bb
index de0d445..1720d57 100644
--- a/recipes-multimedia/rpidistro-ffmpeg/rpidistro-ffmpeg_4.3.2.bb
+++ b/recipes-multimedia/rpidistro-ffmpeg/rpidistro-ffmpeg_4.3.4.bb
@@ -33,23 +33,27 @@ RPROVIDES:${PN} = "${PROVIDES}"
 DEPENDS = "nasm-native"
 
 inherit autotools pkgconfig
 
-PACKAGECONFIG ??= "avdevice avfilter avcodec avformat swresample swscale postproc avresample \
-    opengl udev sdl2 ffplay alsa bzlib lzma pic pthreads shared theora zlib \
-    libvorbis x264 gpl sand rpi vout-drm vout-egl \
-    ${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', 'mmal', d)} \
+PACKAGECONFIG ??= "avdevice avfilter avcodec avformat swresample swscale postproc avresample ffplay \
+    v4l2 drm udev alsa bzlib lzma pic pthreads shared theora zlib libvorbis x264 gpl \
+    ${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', 'mmal rpi sand vout-drm', d)} \
     ${@bb.utils.contains('AVAILTUNES', 'mips32r2', 'mips32r2', '', d)} \
-    ${@bb.utils.contains('DISTRO_FEATURES', 'x11', 'xv xcb', '', d)}"
+    ${@bb.utils.contains('DISTRO_FEATURES', 'opengl', 'opengl', '', d)} \
+    ${@bb.utils.contains('DISTRO_FEATURES', 'x11', 'xv xcb vout-egl epoxy', '', d)}"
 
 SRC_URI = "\
     git://git@github.com/RPi-Distro/ffmpeg;protocol=https;branch=pios/bullseye \
     file://0001-avcodec-arm-sbcenc-avoid-callee-preserved-vfp-regist.patch \
     file://0002-Fix-build-on-powerpc-and-ppc64.patch \
     file://0003-avcodec-pngenc-remove-monowhite-from-apng-formats.patch \
-    file://0004-ffmpeg-4.3.2-rpi_10.patch \
-    file://0005-fix_flags.diff \
-"
+    file://0004-ffmpeg-4.3.4-rpi_14.patch \
+    file://0005-fix-flags.diff \
+    file://2001-configure-setup-for-OE-core-usage.patch \
+    file://2002-libavdevice-opengl_enc-update-dynamic-function-loader.patch \
+    file://2003-libavcodec-fix-v4l2_req_devscan.patch \
+    file://2004-libavcodec-omx-replace-opt-vc-path-with-usr-lib.patch \
+    "
 
-SRCREV = "ea72093f350f38edcd39c480b331c3219c377642"
+SRCREV = "246e1a55a0eca931537d8706acd8b133c07beb05"
 
 S = "${WORKDIR}/git"
@@ -70,7 +74,7 @@ PACKAGECONFIG[altivec] = "--enable-altivec,--disable-altivec,"
 PACKAGECONFIG[bzlib] = "--enable-bzlib,--disable-bzlib,bzip2"
 PACKAGECONFIG[fdk-aac] = "--enable-libfdk-aac --enable-nonfree,--disable-libfdk-aac,fdk-aac"
 PACKAGECONFIG[gpl] = "--enable-gpl,--disable-gpl"
-PACKAGECONFIG[opengl] = "--enable-opengl,--disable-opengl,virtual/libgl"
+PACKAGECONFIG[opengl] = "--enable-opengl,--disable-opengl,virtual/libgles2"
 PACKAGECONFIG[gsm] = "--enable-libgsm,--disable-libgsm,libgsm"
 PACKAGECONFIG[jack] = "--enable-indev=jack,--disable-indev=jack,jack"
 PACKAGECONFIG[libvorbis] = "--enable-libvorbis,--disable-libvorbis,libvorbis"
@@ -90,9 +94,11 @@ PACKAGECONFIG[x264] = "--enable-libx264,--disable-libx264,x264"
 PACKAGECONFIG[xcb] = "--enable-libxcb,--disable-libxcb,libxcb"
 PACKAGECONFIG[xv] = "--enable-outdev=xv,--disable-outdev=xv,libxv"
 PACKAGECONFIG[zlib] = "--enable-zlib,--disable-zlib,zlib"
-#PACKAGECONFIG[snappy] = "--enable-libsnappy,--enable-libsnappy,snappy"
+PACKAGECONFIG[snappy] = "--enable-libsnappy,--disable-libsnappy,snappy"
 PACKAGECONFIG[udev] = "--enable-libudev,--disable-libudev,udev"
-PACKAGECONFIG[v4l2] = "--enable-libv4l2 --enable-v4l2-request --enable-libdrm,,v4l-utils"
+PACKAGECONFIG[drm] = "--enable-libdrm,--disable-libdrm,libdrm"
+PACKAGECONFIG[epoxy] = "--enable-epoxy,--disable-epoxy,libepoxy"
+PACKAGECONFIG[v4l2] = "--enable-libv4l2 --enable-v4l2-m2m --enable-v4l2-request,,v4l-utils"
 PACKAGECONFIG[mmal] = "--enable-omx --enable-omx-rpi --enable-mmal,,userland"
 PACKAGECONFIG[sand] = "--enable-sand,,"
 PACKAGECONFIG[rpi] = "--enable-rpi,,"
@@ -138,11 +144,6 @@ EXTRA_OECONF = " \
 "
 EXTRA_OECONF:append:linux-gnux32 = " --disable-asm"
 
-# Directly specify the include directories the contain headers for
-# libdrm
-# openmaxil
-TARGET_CFLAGS:append = " -I${STAGING_INCDIR}/IL -I${STAGING_INCDIR}/drm"
-
 # gold crashes on x86, another solution is to --disable-asm but thats more hacky
 # ld.gold: internal error in relocate_section, at ../../gold/i386.cc:3684
 LDFLAGS:append:x86 = "${@bb.utils.contains('DISTRO_FEATURES', 'ld-is-gold', ' -fuse-ld=bfd ', '', d)}"
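
Closing note: the vc1dsp checkasm tests brought in by 0004-ffmpeg-4.3.4-rpi_14.patch earlier in this series follow the usual checkasm recipe of randomising inputs, running the reference and the optimised implementation, and comparing the outputs byte for byte. The sketch below shows that pattern in a self-contained form; every name in it is invented for the illustration and it is not part of the recipe or of the patches above.

/* Self-contained sketch of the checkasm-style ref-vs-new comparison. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void add_ref(uint8_t *dst, const uint8_t *src, int n)
{
    for (int i = 0; i < n; i++)
        dst[i] = (uint8_t)(dst[i] + src[i]);
}

/* stand-in for a SIMD version; deliberately identical in behaviour */
static void add_new(uint8_t *dst, const uint8_t *src, int n)
{
    for (int i = 0; i < n; i++)
        dst[i] = (uint8_t)(dst[i] + src[i]);
}

int main(void)
{
    uint8_t src[64], out0[64], out1[64];
    for (int iter = 0; iter < 1000; iter++) {
        for (int i = 0; i < 64; i++) {
            src[i]  = (uint8_t)rand();
            out0[i] = out1[i] = (uint8_t)rand();
        }
        add_ref(out0, src, 64);
        add_new(out1, src, 64);
        if (memcmp(out0, out1, sizeof(out0))) {
            fprintf(stderr, "mismatch at iteration %d\n", iter);
            return 1;
        }
    }
    printf("ok\n");
    return 0;
}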