mirror of
https://github.com/LibreELEC/LibreELEC.tv.git
synced 2025-07-28 13:16:41 +00:00
Merge pull request #8175 from HiassofT/le12-rpi5
This commit is contained in:
commit
dd26ba0f77
@ -11,7 +11,7 @@
|
||||
fi
|
||||
|
||||
# TARGET_CPU:
|
||||
# generic cortex-a35 cortex-a53 cortex-a57 cortex-a72
|
||||
# generic cortex-a35 cortex-a53 cortex-a57 cortex-a72 cortex-a76
|
||||
# exynos-m1 qdf24xx thunderx xgene1 cortex-a57.cortex-a53
|
||||
# cortex-a72.cortex-a53
|
||||
|
||||
@ -21,6 +21,10 @@
|
||||
TARGET_SUBARCH=aarch64
|
||||
TARGET_VARIANT=armv8-a
|
||||
;;
|
||||
cortex-a76)
|
||||
TARGET_SUBARCH=aarch64
|
||||
TARGET_VARIANT=armv8.2-a
|
||||
;;
|
||||
esac
|
||||
|
||||
TARGET_GCC_ARCH=${TARGET_SUBARCH/-}
|
||||
|
@ -23,8 +23,8 @@ case "${LINUX}" in
|
||||
PKG_PATCH_DIRS="default"
|
||||
;;
|
||||
raspberrypi)
|
||||
PKG_VERSION="cc08810f89e52337a99cc6ae5f53f08588357c5f" # 6.1.54
|
||||
PKG_SHA256="9f05e721292dc9de12a22f6c005f79da1421c76dcf9b2f99eac6754e978472de"
|
||||
PKG_VERSION="2d3d2030c0564790a76cf94fa0ca1913873381af" # 6.1.54
|
||||
PKG_SHA256="2fcea07b5500813923ba6e7621ff28863dcc6eef5cab88ce75e7054c49f9bf34"
|
||||
PKG_URL="https://github.com/raspberrypi/linux/archive/${PKG_VERSION}.tar.gz"
|
||||
PKG_SOURCE_NAME="linux-${LINUX}-${PKG_VERSION}.tar.gz"
|
||||
;;
|
||||
|
@ -51,7 +51,7 @@ if [ "${V4L2_SUPPORT}" = "yes" ]; then
|
||||
PKG_NEED_UNPACK+=" $(get_pkg_directory libdrm)"
|
||||
PKG_FFMPEG_V4L2="--enable-v4l2_m2m --enable-libdrm"
|
||||
|
||||
if [ "${PROJECT}" = "Allwinner" -o "${PROJECT}" = "Rockchip" -o "${DEVICE}" = "iMX8" -o "${DEVICE}" = "RPi4" ]; then
|
||||
if [ "${PROJECT}" = "Allwinner" -o "${PROJECT}" = "Rockchip" -o "${DEVICE}" = "iMX8" -o "${DEVICE}" = "RPi4" -o "${DEVICE}" = "RPi5" ]; then
|
||||
PKG_V4L2_REQUEST="yes"
|
||||
else
|
||||
PKG_V4L2_REQUEST="no"
|
||||
|
@ -17,9 +17,11 @@ mount -o remount,rw $BOOT_ROOT
|
||||
|
||||
# update bootloader files
|
||||
cp -p $SYSTEM_ROOT/usr/share/bootloader/LICENCE* $BOOT_ROOT
|
||||
cp -p $SYSTEM_ROOT/usr/share/bootloader/bootcode.bin $BOOT_ROOT
|
||||
cp -p $SYSTEM_ROOT/usr/share/bootloader/fixup.dat $BOOT_ROOT
|
||||
cp -p $SYSTEM_ROOT/usr/share/bootloader/start.elf $BOOT_ROOT
|
||||
for f in bootcode.bin fixup.dat start.elf ; do
|
||||
if [ -f "${SYSTEM_ROOT}/usr/share/bootloader/$f" ]; then
|
||||
cp -p "${SYSTEM_ROOT}/usr/share/bootloader/$f" "${BOOT_ROOT}"
|
||||
fi
|
||||
done
|
||||
|
||||
rm -f $BOOT_ROOT/bcm283*.dtb # cleanup excess dtb's used by upstream kernels (ie. not LE)
|
||||
cp -p $SYSTEM_ROOT/usr/share/bootloader/*.dtb $BOOT_ROOT
|
||||
|
@ -16,14 +16,19 @@ PKG_TOOLCHAIN="manual"
|
||||
makeinstall_target() {
|
||||
mkdir -p ${INSTALL}/usr/share/bootloader
|
||||
cp -PRv LICENCE* ${INSTALL}/usr/share/bootloader
|
||||
cp -PRv bootcode.bin ${INSTALL}/usr/share/bootloader
|
||||
if [ "${DEVICE}" = "RPi4" ]; then
|
||||
cp -PRv fixup4x.dat ${INSTALL}/usr/share/bootloader/fixup.dat
|
||||
cp -PRv start4x.elf ${INSTALL}/usr/share/bootloader/start.elf
|
||||
else
|
||||
cp -PRv fixup_x.dat ${INSTALL}/usr/share/bootloader/fixup.dat
|
||||
cp -PRv start_x.elf ${INSTALL}/usr/share/bootloader/start.elf
|
||||
fi
|
||||
case "${DEVICE}" in
|
||||
RPi4)
|
||||
cp -PRv fixup4x.dat ${INSTALL}/usr/share/bootloader/fixup.dat
|
||||
cp -PRv start4x.elf ${INSTALL}/usr/share/bootloader/start.elf
|
||||
;;
|
||||
RPi5)
|
||||
;;
|
||||
*)
|
||||
cp -PRv bootcode.bin ${INSTALL}/usr/share/bootloader
|
||||
cp -PRv fixup_x.dat ${INSTALL}/usr/share/bootloader/fixup.dat
|
||||
cp -PRv start_x.elf ${INSTALL}/usr/share/bootloader/start.elf
|
||||
;;
|
||||
esac
|
||||
|
||||
find_file_path bootloader/update.sh ${PKG_DIR}/files/update.sh && cp -PRv ${FOUND_PATH} ${INSTALL}/usr/share/bootloader
|
||||
|
||||
|
@ -6,9 +6,11 @@
|
||||
|
||||
mkdir -p $RELEASE_DIR/3rdparty/bootloader
|
||||
cp -PR $INSTALL/usr/share/bootloader/LICENCE* $RELEASE_DIR/3rdparty/bootloader/
|
||||
cp -PR $INSTALL/usr/share/bootloader/bootcode.bin $RELEASE_DIR/3rdparty/bootloader/
|
||||
cp -PR $INSTALL/usr/share/bootloader/fixup.dat $RELEASE_DIR/3rdparty/bootloader/
|
||||
cp -PR $INSTALL/usr/share/bootloader/start.elf $RELEASE_DIR/3rdparty/bootloader/
|
||||
for f in bootcode.bin fixup.dat start.elf ; do
|
||||
if [ -f "${INSTALL}/usr/share/bootloader/$f" ]; then
|
||||
cp -PR "${INSTALL}/usr/share/bootloader/$f" "${RELEASE_DIR}/3rdparty/bootloader/"
|
||||
fi
|
||||
done
|
||||
cp -PR $INSTALL/usr/share/bootloader/*.dtb $RELEASE_DIR/3rdparty/bootloader/
|
||||
cp -PR $INSTALL/usr/share/bootloader/overlays $RELEASE_DIR/3rdparty/bootloader/
|
||||
|
||||
|
@ -2,8 +2,8 @@
|
||||
# Copyright (C) 2019-present Team LibreELEC (https://libreelec.tv)
|
||||
|
||||
PKG_NAME="rpi-eeprom"
|
||||
PKG_VERSION="d774d5794cc63248e559bba201630d00a6e35762"
|
||||
PKG_SHA256="cd66d1fff5f45d5141e1b6fd6abf17c6977ef81339efad5979f1bfd2347f71a1"
|
||||
PKG_VERSION="4f2d676b4e2a9c2d9ee1ab42015ce711fde97afa"
|
||||
PKG_SHA256="189c5d37f3102247cec72619e3cb357d027ec526fa3c7373d3107bd6c9e30e29"
|
||||
PKG_LICENSE="BSD-3/custom"
|
||||
PKG_SITE="https://github.com/raspberrypi/rpi-eeprom"
|
||||
PKG_URL="https://github.com/raspberrypi/rpi-eeprom/archive/${PKG_VERSION}.tar.gz"
|
||||
@ -12,13 +12,20 @@ PKG_LONGDESC="rpi-eeprom: firmware, config and scripts to update RPi4 SPI bootlo
|
||||
PKG_TOOLCHAIN="manual"
|
||||
|
||||
makeinstall_target() {
|
||||
DESTDIR=${INSTALL}/$(get_kernel_overlay_dir)/lib/firmware/raspberrypi/bootloader
|
||||
|
||||
if [ "${DEVICE}" = "RPi4" ]; then
|
||||
_variant="2711"
|
||||
else
|
||||
_variant="2712"
|
||||
fi
|
||||
|
||||
DESTDIR=${INSTALL}/$(get_kernel_overlay_dir)/lib/firmware/raspberrypi/bootloader-${_variant}
|
||||
|
||||
mkdir -p ${DESTDIR}
|
||||
_dirs="critical stable"
|
||||
_dirs="default latest"
|
||||
|
||||
for _maindir in ${_dirs}; do
|
||||
for _dir in ${PKG_BUILD}/firmware/${_maindir} ${PKG_BUILD}/firmware/{_maindir}-*; do
|
||||
for _dir in ${PKG_BUILD}/firmware-${_variant}/${_maindir} ${PKG_BUILD}/firmware-${_variant}/${_maindir}-*; do
|
||||
[ -d "${_dir}" ] || continue
|
||||
|
||||
_basedir="$(basename "${_dir}")"
|
||||
@ -30,14 +37,17 @@ makeinstall_target() {
|
||||
PKG_FW_FILE="$(ls -1 /${_dir}/pieeprom-* 2>/dev/null | tail -1)"
|
||||
[ -n "${PKG_FW_FILE}" ] && cp -PRv "${PKG_FW_FILE}" ${DESTDIR}/${_basedir}
|
||||
|
||||
# VIA USB3
|
||||
PKG_FW_FILE="$(ls -1 ${_dir}/vl805-*.bin 2>/dev/null | tail -1)"
|
||||
[ -n "${PKG_FW_FILE}" ] && cp -PRv "${PKG_FW_FILE}" ${DESTDIR}/${_basedir}
|
||||
if [ "${DEVICE}" = "RPi4" ]; then
|
||||
# VIA USB3
|
||||
PKG_FW_FILE="$(ls -1 ${_dir}/vl805-*.bin 2>/dev/null | tail -1)"
|
||||
[ -n "${PKG_FW_FILE}" ] && cp -PRv "${PKG_FW_FILE}" ${DESTDIR}/${_basedir}
|
||||
fi
|
||||
done
|
||||
done
|
||||
|
||||
# also copy default and latest symlinks
|
||||
cp -Prv ${PKG_BUILD}/firmware/{default,latest} ${DESTDIR}
|
||||
# also create legacy naming symlinks
|
||||
ln -s default ${DESTDIR}/critical
|
||||
ln -s latest ${DESTDIR}/stable
|
||||
|
||||
mkdir -p ${INSTALL}/usr/bin
|
||||
cp -PRv ${PKG_DIR}/source/rpi-eeprom-update ${INSTALL}/usr/bin
|
||||
|
@ -1,6 +1,6 @@
|
||||
#
|
||||
# Automatically generated file; DO NOT EDIT.
|
||||
# Linux/arm 6.1.53 Kernel Configuration
|
||||
# Linux/arm 6.1.54 Kernel Configuration
|
||||
#
|
||||
CONFIG_CC_VERSION_TEXT="arm-linux-gnueabihf-gcc (GCC) 13.0.0 20220604 (experimental) [master revision aec868578d8515763d75693c1fdfbc30ff0a1e68]"
|
||||
CONFIG_CC_IS_GCC=y
|
||||
@ -1944,6 +1944,7 @@ CONFIG_INPUT_GPIO_ROTARY_ENCODER=m
|
||||
# CONFIG_INPUT_DRV260X_HAPTICS is not set
|
||||
# CONFIG_INPUT_DRV2665_HAPTICS is not set
|
||||
# CONFIG_INPUT_DRV2667_HAPTICS is not set
|
||||
# CONFIG_INPUT_RASPBERRYPI_BUTTON is not set
|
||||
CONFIG_RMI4_CORE=y
|
||||
# CONFIG_RMI4_I2C is not set
|
||||
# CONFIG_RMI4_SPI is not set
|
||||
@ -1982,7 +1983,6 @@ CONFIG_SERIO_SERPORT=y
|
||||
CONFIG_BRCM_CHAR_DRIVERS=y
|
||||
CONFIG_BCM2708_VCMEM=y
|
||||
CONFIG_BCM_VCIO=y
|
||||
CONFIG_BCM2835_DEVGPIOMEM=m
|
||||
CONFIG_BCM2835_SMI_DEV=m
|
||||
# CONFIG_RPIVID_MEM is not set
|
||||
CONFIG_TTY=y
|
||||
@ -2067,6 +2067,7 @@ CONFIG_DEVMEM=y
|
||||
# CONFIG_XILLYUSB is not set
|
||||
CONFIG_RANDOM_TRUST_CPU=y
|
||||
CONFIG_RANDOM_TRUST_BOOTLOADER=y
|
||||
CONFIG_RASPBERRYPI_GPIOMEM=y
|
||||
# end of Character devices
|
||||
|
||||
#
|
||||
@ -2215,6 +2216,8 @@ CONFIG_GENERIC_PINCONF=y
|
||||
# CONFIG_PINCTRL_SINGLE is not set
|
||||
# CONFIG_PINCTRL_STMFX is not set
|
||||
# CONFIG_PINCTRL_SX150X is not set
|
||||
# CONFIG_PINCTRL_RP1 is not set
|
||||
# CONFIG_PINCTRL_BCM2712 is not set
|
||||
CONFIG_PINCTRL_BCM2835=y
|
||||
|
||||
#
|
||||
@ -2239,6 +2242,7 @@ CONFIG_GPIO_CDEV_V1=y
|
||||
# CONFIG_GPIO_ALTERA is not set
|
||||
CONFIG_GPIO_RASPBERRYPI_EXP=y
|
||||
CONFIG_GPIO_BCM_VIRT=y
|
||||
# CONFIG_GPIO_BRCMSTB is not set
|
||||
# CONFIG_GPIO_CADENCE is not set
|
||||
# CONFIG_GPIO_DWAPB is not set
|
||||
# CONFIG_GPIO_FTGPIO010 is not set
|
||||
@ -3078,6 +3082,12 @@ CONFIG_MEDIA_PLATFORM_DRIVERS=y
|
||||
# Qualcomm media platform drivers
|
||||
#
|
||||
|
||||
#
|
||||
# Raspberry Pi media platform drivers
|
||||
#
|
||||
# CONFIG_VIDEO_RASPBERRYPI_PISP_BE is not set
|
||||
# CONFIG_VIDEO_RP1_CFE is not set
|
||||
|
||||
#
|
||||
# Renesas media platform drivers
|
||||
#
|
||||
@ -3749,6 +3759,9 @@ CONFIG_DRM_TOSHIBA_TC358762=y
|
||||
# CONFIG_DRM_V3D is not set
|
||||
CONFIG_DRM_VC4=y
|
||||
CONFIG_DRM_VC4_HDMI_CEC=y
|
||||
# CONFIG_DRM_RP1_DSI is not set
|
||||
# CONFIG_DRM_RP1_DPI is not set
|
||||
# CONFIG_DRM_RP1_VEC is not set
|
||||
# CONFIG_DRM_ETNAVIV is not set
|
||||
# CONFIG_DRM_LOGICVC is not set
|
||||
# CONFIG_DRM_ARCPGU is not set
|
||||
@ -5024,6 +5037,7 @@ CONFIG_PWM_BCM2835=m
|
||||
# CONFIG_PWM_FSL_FTM is not set
|
||||
# CONFIG_PWM_PCA9685 is not set
|
||||
CONFIG_PWM_RASPBERRYPI_POE=m
|
||||
# CONFIG_PWM_RP1 is not set
|
||||
# CONFIG_PWM_XILINX is not set
|
||||
|
||||
#
|
||||
@ -5037,6 +5051,7 @@ CONFIG_BRCMSTB_L2_IRQ=y
|
||||
|
||||
# CONFIG_IPACK_BUS is not set
|
||||
CONFIG_RESET_CONTROLLER=y
|
||||
# CONFIG_RESET_BRCMSTB is not set
|
||||
# CONFIG_RESET_RASPBERRYPI is not set
|
||||
# CONFIG_RESET_SIMPLE is not set
|
||||
# CONFIG_RESET_TI_SYSCON is not set
|
||||
@ -5052,6 +5067,7 @@ CONFIG_RESET_CONTROLLER=y
|
||||
# PHY drivers for Broadcom platforms
|
||||
#
|
||||
# CONFIG_BCM_KONA_USB2_PHY is not set
|
||||
# CONFIG_PHY_BRCM_USB is not set
|
||||
# end of PHY drivers for Broadcom platforms
|
||||
|
||||
# CONFIG_PHY_CADENCE_TORRENT is not set
|
||||
|
@ -1,6 +1,6 @@
|
||||
#
|
||||
# Automatically generated file; DO NOT EDIT.
|
||||
# Linux/arm 6.1.53 Kernel Configuration
|
||||
# Linux/arm 6.1.54 Kernel Configuration
|
||||
#
|
||||
CONFIG_CC_VERSION_TEXT="arm-linux-gnueabihf-gcc (GCC) 13.0.0 20220604 (experimental) [master revision aec868578d8515763d75693c1fdfbc30ff0a1e68]"
|
||||
CONFIG_CC_IS_GCC=y
|
||||
@ -2169,6 +2169,7 @@ CONFIG_INPUT_GPIO_ROTARY_ENCODER=m
|
||||
# CONFIG_INPUT_DRV260X_HAPTICS is not set
|
||||
# CONFIG_INPUT_DRV2665_HAPTICS is not set
|
||||
# CONFIG_INPUT_DRV2667_HAPTICS is not set
|
||||
# CONFIG_INPUT_RASPBERRYPI_BUTTON is not set
|
||||
CONFIG_RMI4_CORE=y
|
||||
# CONFIG_RMI4_I2C is not set
|
||||
# CONFIG_RMI4_SPI is not set
|
||||
@ -2207,7 +2208,6 @@ CONFIG_SERIO_SERPORT=y
|
||||
CONFIG_BRCM_CHAR_DRIVERS=y
|
||||
CONFIG_BCM2708_VCMEM=y
|
||||
CONFIG_BCM_VCIO=y
|
||||
CONFIG_BCM2835_DEVGPIOMEM=m
|
||||
CONFIG_BCM2835_SMI_DEV=m
|
||||
# CONFIG_RPIVID_MEM is not set
|
||||
CONFIG_TTY=y
|
||||
@ -2292,6 +2292,7 @@ CONFIG_DEVMEM=y
|
||||
# CONFIG_XILLYUSB is not set
|
||||
CONFIG_RANDOM_TRUST_CPU=y
|
||||
CONFIG_RANDOM_TRUST_BOOTLOADER=y
|
||||
CONFIG_RASPBERRYPI_GPIOMEM=y
|
||||
# end of Character devices
|
||||
|
||||
#
|
||||
@ -2440,6 +2441,8 @@ CONFIG_GENERIC_PINCONF=y
|
||||
# CONFIG_PINCTRL_SINGLE is not set
|
||||
# CONFIG_PINCTRL_STMFX is not set
|
||||
# CONFIG_PINCTRL_SX150X is not set
|
||||
# CONFIG_PINCTRL_RP1 is not set
|
||||
# CONFIG_PINCTRL_BCM2712 is not set
|
||||
CONFIG_PINCTRL_BCM2835=y
|
||||
|
||||
#
|
||||
@ -2464,6 +2467,7 @@ CONFIG_GPIO_CDEV_V1=y
|
||||
# CONFIG_GPIO_ALTERA is not set
|
||||
CONFIG_GPIO_RASPBERRYPI_EXP=y
|
||||
CONFIG_GPIO_BCM_VIRT=y
|
||||
# CONFIG_GPIO_BRCMSTB is not set
|
||||
# CONFIG_GPIO_CADENCE is not set
|
||||
# CONFIG_GPIO_DWAPB is not set
|
||||
# CONFIG_GPIO_FTGPIO010 is not set
|
||||
@ -3310,6 +3314,12 @@ CONFIG_MEDIA_PLATFORM_DRIVERS=y
|
||||
# Qualcomm media platform drivers
|
||||
#
|
||||
|
||||
#
|
||||
# Raspberry Pi media platform drivers
|
||||
#
|
||||
# CONFIG_VIDEO_RASPBERRYPI_PISP_BE is not set
|
||||
# CONFIG_VIDEO_RP1_CFE is not set
|
||||
|
||||
#
|
||||
# Renesas media platform drivers
|
||||
#
|
||||
@ -3982,6 +3992,9 @@ CONFIG_DRM_TOSHIBA_TC358762=y
|
||||
# CONFIG_DRM_V3D is not set
|
||||
CONFIG_DRM_VC4=y
|
||||
CONFIG_DRM_VC4_HDMI_CEC=y
|
||||
# CONFIG_DRM_RP1_DSI is not set
|
||||
# CONFIG_DRM_RP1_DPI is not set
|
||||
# CONFIG_DRM_RP1_VEC is not set
|
||||
# CONFIG_DRM_ETNAVIV is not set
|
||||
# CONFIG_DRM_LOGICVC is not set
|
||||
# CONFIG_DRM_ARCPGU is not set
|
||||
@ -5260,6 +5273,7 @@ CONFIG_PWM_BCM2835=m
|
||||
# CONFIG_PWM_FSL_FTM is not set
|
||||
# CONFIG_PWM_PCA9685 is not set
|
||||
CONFIG_PWM_RASPBERRYPI_POE=m
|
||||
# CONFIG_PWM_RP1 is not set
|
||||
# CONFIG_PWM_XILINX is not set
|
||||
|
||||
#
|
||||
@ -5269,12 +5283,14 @@ CONFIG_IRQCHIP=y
|
||||
CONFIG_ARM_GIC=y
|
||||
CONFIG_ARM_GIC_MAX_NR=1
|
||||
# CONFIG_AL_FIC is not set
|
||||
# CONFIG_BCM2712_MIP is not set
|
||||
CONFIG_BRCMSTB_L2_IRQ=y
|
||||
# CONFIG_XILINX_INTC is not set
|
||||
# end of IRQ chip support
|
||||
|
||||
# CONFIG_IPACK_BUS is not set
|
||||
CONFIG_RESET_CONTROLLER=y
|
||||
# CONFIG_RESET_BRCMSTB is not set
|
||||
# CONFIG_RESET_RASPBERRYPI is not set
|
||||
# CONFIG_RESET_SIMPLE is not set
|
||||
# CONFIG_RESET_TI_SYSCON is not set
|
||||
@ -5290,6 +5306,7 @@ CONFIG_RESET_CONTROLLER=y
|
||||
# PHY drivers for Broadcom platforms
|
||||
#
|
||||
# CONFIG_BCM_KONA_USB2_PHY is not set
|
||||
# CONFIG_PHY_BRCM_USB is not set
|
||||
# end of PHY drivers for Broadcom platforms
|
||||
|
||||
# CONFIG_PHY_CADENCE_TORRENT is not set
|
||||
|
@ -1,6 +1,6 @@
|
||||
#
|
||||
# Automatically generated file; DO NOT EDIT.
|
||||
# Linux/arm64 6.1.53 Kernel Configuration
|
||||
# Linux/arm64 6.1.54 Kernel Configuration
|
||||
#
|
||||
CONFIG_CC_VERSION_TEXT="aarch64-linux-gnu-gcc (GCC) 13.0.0 20220604 (experimental) [master revision aec868578d8515763d75693c1fdfbc30ff0a1e68]"
|
||||
CONFIG_CC_IS_GCC=y
|
||||
@ -2694,6 +2694,7 @@ CONFIG_INPUT_GPIO_ROTARY_ENCODER=m
|
||||
# CONFIG_INPUT_DRV260X_HAPTICS is not set
|
||||
# CONFIG_INPUT_DRV2665_HAPTICS is not set
|
||||
# CONFIG_INPUT_DRV2667_HAPTICS is not set
|
||||
# CONFIG_INPUT_RASPBERRYPI_BUTTON is not set
|
||||
CONFIG_RMI4_CORE=y
|
||||
# CONFIG_RMI4_I2C is not set
|
||||
# CONFIG_RMI4_SPI is not set
|
||||
@ -2733,7 +2734,6 @@ CONFIG_SERIO_SERPORT=y
|
||||
CONFIG_BRCM_CHAR_DRIVERS=y
|
||||
CONFIG_BCM2708_VCMEM=y
|
||||
CONFIG_BCM_VCIO=y
|
||||
CONFIG_BCM2835_DEVGPIOMEM=m
|
||||
CONFIG_BCM2835_SMI_DEV=m
|
||||
# CONFIG_RPIVID_MEM is not set
|
||||
CONFIG_TTY=y
|
||||
@ -2827,6 +2827,7 @@ CONFIG_DEVPORT=y
|
||||
# CONFIG_XILLYUSB is not set
|
||||
CONFIG_RANDOM_TRUST_CPU=y
|
||||
CONFIG_RANDOM_TRUST_BOOTLOADER=y
|
||||
CONFIG_RASPBERRYPI_GPIOMEM=y
|
||||
# end of Character devices
|
||||
|
||||
#
|
||||
@ -3000,6 +3001,8 @@ CONFIG_GENERIC_PINCONF=y
|
||||
# CONFIG_PINCTRL_SINGLE is not set
|
||||
# CONFIG_PINCTRL_STMFX is not set
|
||||
# CONFIG_PINCTRL_SX150X is not set
|
||||
# CONFIG_PINCTRL_RP1 is not set
|
||||
# CONFIG_PINCTRL_BCM2712 is not set
|
||||
CONFIG_PINCTRL_BCM2835=y
|
||||
|
||||
#
|
||||
@ -3023,6 +3026,7 @@ CONFIG_GPIO_CDEV_V1=y
|
||||
# CONFIG_GPIO_ALTERA is not set
|
||||
CONFIG_GPIO_RASPBERRYPI_EXP=y
|
||||
CONFIG_GPIO_BCM_VIRT=y
|
||||
# CONFIG_GPIO_BRCMSTB is not set
|
||||
# CONFIG_GPIO_CADENCE is not set
|
||||
# CONFIG_GPIO_DWAPB is not set
|
||||
# CONFIG_GPIO_EXAR is not set
|
||||
@ -3562,6 +3566,7 @@ CONFIG_MFD_WM5102=y
|
||||
# CONFIG_MFD_QCOM_PM8008 is not set
|
||||
# CONFIG_RAVE_SP_CORE is not set
|
||||
# CONFIG_MFD_INTEL_M10_BMC is not set
|
||||
# CONFIG_MFD_RP1 is not set
|
||||
# CONFIG_MFD_RSMU_I2C is not set
|
||||
# CONFIG_MFD_RSMU_SPI is not set
|
||||
# end of Multifunction device drivers
|
||||
@ -3904,6 +3909,12 @@ CONFIG_MEDIA_PLATFORM_DRIVERS=y
|
||||
# Qualcomm media platform drivers
|
||||
#
|
||||
|
||||
#
|
||||
# Raspberry Pi media platform drivers
|
||||
#
|
||||
# CONFIG_VIDEO_RASPBERRYPI_PISP_BE is not set
|
||||
# CONFIG_VIDEO_RP1_CFE is not set
|
||||
|
||||
#
|
||||
# Renesas media platform drivers
|
||||
#
|
||||
@ -4577,6 +4588,9 @@ CONFIG_DRM_TOSHIBA_TC358762=y
|
||||
CONFIG_DRM_V3D=y
|
||||
CONFIG_DRM_VC4=y
|
||||
CONFIG_DRM_VC4_HDMI_CEC=y
|
||||
# CONFIG_DRM_RP1_DSI is not set
|
||||
# CONFIG_DRM_RP1_DPI is not set
|
||||
# CONFIG_DRM_RP1_VEC is not set
|
||||
# CONFIG_DRM_ETNAVIV is not set
|
||||
# CONFIG_DRM_HISI_HIBMC is not set
|
||||
# CONFIG_DRM_HISI_KIRIN is not set
|
||||
@ -5837,6 +5851,7 @@ CONFIG_COMMON_CLK=y
|
||||
|
||||
# CONFIG_LMK04832 is not set
|
||||
# CONFIG_COMMON_CLK_MAX9485 is not set
|
||||
# CONFIG_COMMON_CLK_RP1 is not set
|
||||
CONFIG_COMMON_CLK_HIFIBERRY_DACPLUSHD=m
|
||||
CONFIG_COMMON_CLK_HIFIBERRY_DACPRO=m
|
||||
# CONFIG_COMMON_CLK_SI5341 is not set
|
||||
@ -5980,6 +5995,7 @@ CONFIG_PWM_BCM2835=m
|
||||
# CONFIG_PWM_FSL_FTM is not set
|
||||
# CONFIG_PWM_PCA9685 is not set
|
||||
CONFIG_PWM_RASPBERRYPI_POE=m
|
||||
# CONFIG_PWM_RP1 is not set
|
||||
# CONFIG_PWM_XILINX is not set
|
||||
|
||||
#
|
||||
@ -5993,6 +6009,7 @@ CONFIG_ARM_GIC_V3=y
|
||||
CONFIG_ARM_GIC_V3_ITS=y
|
||||
CONFIG_ARM_GIC_V3_ITS_PCI=y
|
||||
# CONFIG_AL_FIC is not set
|
||||
# CONFIG_BCM2712_MIP is not set
|
||||
CONFIG_BRCMSTB_L2_IRQ=y
|
||||
# CONFIG_XILINX_INTC is not set
|
||||
CONFIG_PARTITION_PERCPU=y
|
||||
@ -6000,6 +6017,7 @@ CONFIG_PARTITION_PERCPU=y
|
||||
|
||||
# CONFIG_IPACK_BUS is not set
|
||||
CONFIG_RESET_CONTROLLER=y
|
||||
# CONFIG_RESET_BRCMSTB is not set
|
||||
CONFIG_RESET_RASPBERRYPI=y
|
||||
CONFIG_RESET_SIMPLE=y
|
||||
# CONFIG_RESET_TI_SYSCON is not set
|
||||
@ -6016,6 +6034,7 @@ CONFIG_RESET_SIMPLE=y
|
||||
# PHY drivers for Broadcom platforms
|
||||
#
|
||||
# CONFIG_BCM_KONA_USB2_PHY is not set
|
||||
# CONFIG_PHY_BRCM_USB is not set
|
||||
# end of PHY drivers for Broadcom platforms
|
||||
|
||||
# CONFIG_PHY_CADENCE_TORRENT is not set
|
||||
|
30
projects/RPi/devices/RPi5/config/config.txt
Normal file
30
projects/RPi/devices/RPi5/config/config.txt
Normal file
@ -0,0 +1,30 @@
|
||||
# SPDX-License-Identifier: GPL-2.0-or-later
|
||||
# Copyright (C) 2009-2014 Stephan Raue (stephan@openelec.tv)
|
||||
# Copyright (C) 2016-present Team LibreELEC (https://libreelec.tv)
|
||||
################################################################################
|
||||
# Bootloader configuration
|
||||
# config.txt version v1 (do not remove or change this line!)
|
||||
################################################################################
|
||||
# For more options and information see
|
||||
# http://rpf.io/configtxt
|
||||
################################################################################
|
||||
|
||||
# Don't send initial active source message.
|
||||
# Avoids bringing a CEC-enabled TV out of standby and switching channels when
|
||||
# rebooting.
|
||||
hdmi_ignore_cec_init=1
|
||||
|
||||
[all]
|
||||
################################################################################
|
||||
# Use distroconfig-composite.txt instead of distroconfig.txt to enable
|
||||
# composite video output.
|
||||
# The composite video mode needs to be configured in cmdline.txt:
|
||||
# For PAL add: video=Composite-1:720x576@50ie
|
||||
# For NTSC add: video=Composite-1:720x480@60ie
|
||||
################################################################################
|
||||
include distroconfig.txt
|
||||
#include distroconfig-composite.txt
|
||||
|
||||
# uncomment to enable infrared remote receiver connected to GPIO 18
|
||||
#dtoverlay=gpio-ir,gpio_pin=18
|
||||
|
14
projects/RPi/devices/RPi5/config/distroconfig-composite.txt
Normal file
14
projects/RPi/devices/RPi5/config/distroconfig-composite.txt
Normal file
@ -0,0 +1,14 @@
|
||||
# SPDX-License-Identifier: GPL-2.0-or-later
|
||||
# Copyright (C) 2023-present Team LibreELEC (https://libreelec.tv)
|
||||
|
||||
# WARNING: DO NOT EDIT THIS FILE - IT WILL BE OVERWRITTEN WHEN UPGRADING!
|
||||
arm_boost=1
|
||||
arm_64bit=1
|
||||
kernel=kernel.img
|
||||
display_auto_detect=1
|
||||
enable_tvout=1
|
||||
dtoverlay=vc4-kms-v3d,cma-512,composite=1
|
||||
dtoverlay=
|
||||
disable_overscan=1
|
||||
disable_fw_kms_setup=1
|
||||
max_framebuffers=0
|
13
projects/RPi/devices/RPi5/config/distroconfig.txt
Normal file
13
projects/RPi/devices/RPi5/config/distroconfig.txt
Normal file
@ -0,0 +1,13 @@
|
||||
# SPDX-License-Identifier: GPL-2.0-or-later
|
||||
# Copyright (C) 2019-present Team LibreELEC (https://libreelec.tv)
|
||||
|
||||
# WARNING: DO NOT EDIT THIS FILE - IT WILL BE OVERWRITTEN WHEN UPGRADING!
|
||||
arm_boost=1
|
||||
arm_64bit=1
|
||||
kernel=kernel.img
|
||||
display_auto_detect=1
|
||||
dtoverlay=vc4-kms-v3d,cma-512
|
||||
dtoverlay=
|
||||
disable_overscan=1
|
||||
disable_fw_kms_setup=1
|
||||
max_framebuffers=0
|
14
projects/RPi/devices/RPi5/kodi/appliance.xml
Normal file
14
projects/RPi/devices/RPi5/kodi/appliance.xml
Normal file
@ -0,0 +1,14 @@
|
||||
<?xml version="1.0" encoding="utf-8" ?>
|
||||
<settings version="1">
|
||||
|
||||
<section id="system">
|
||||
<category id="audio">
|
||||
<group id="1">
|
||||
<setting id="audiooutput.audiodevice">
|
||||
<default>ALSA:hdmi:CARD=vc4hdmi0,DEV=0</default>
|
||||
</setting>
|
||||
</group>
|
||||
</category>
|
||||
</section>
|
||||
|
||||
</settings>
|
7145
projects/RPi/devices/RPi5/linux/linux.aarch64.conf
Normal file
7145
projects/RPi/devices/RPi5/linux/linux.aarch64.conf
Normal file
File diff suppressed because it is too large
Load Diff
43
projects/RPi/devices/RPi5/options
Normal file
43
projects/RPi/devices/RPi5/options
Normal file
@ -0,0 +1,43 @@
|
||||
################################################################################
|
||||
# Device defaults
|
||||
################################################################################
|
||||
|
||||
# NOOBS supported hex versions (legacy) are not relevant for RPi5
|
||||
unset NOOBS_HEX
|
||||
|
||||
# NOOBS supported model versions
|
||||
NOOBS_SUPPORTED_MODELS='"Pi 5"'
|
||||
|
||||
# additional Firmware to use (dvb-firmware, misc-firmware, wlan-firmware)
|
||||
FIRMWARE="${FIRMWARE} rpi-eeprom"
|
||||
|
||||
# set the addon project
|
||||
ADDON_PROJECT="ARMv8"
|
||||
|
||||
# The TARGET_CPU variable controls which processor should be targeted for
|
||||
# generated code.
|
||||
case $TARGET_ARCH in
|
||||
aarch64)
|
||||
TARGET_CPU="cortex-a76"
|
||||
TARGET_CPU_FLAGS="+crc+crypto"
|
||||
;;
|
||||
arm)
|
||||
TARGET_KERNEL_ARCH="arm64"
|
||||
TARGET_KERNEL_PATCH_ARCH="aarch64"
|
||||
TARGET_FLOAT="hard"
|
||||
# cortex-a72 caused issues in the past, so use a53
|
||||
TARGET_CPU="cortex-a53"
|
||||
TARGET_CPU_FLAGS="+crc+crypto"
|
||||
TARGET_FPU="neon-fp-armv8"
|
||||
;;
|
||||
esac
|
||||
|
||||
# Kernel target
|
||||
KERNEL_TARGET="Image"
|
||||
|
||||
# debug tty path
|
||||
DEBUG_TTY="/dev/ttyAMA10"
|
||||
|
||||
# serial console
|
||||
EXTRA_CMDLINE="console=ttyAMA10,115200 console=tty0"
|
||||
|
@ -0,0 +1,332 @@
|
||||
From f62aa2640f92796ff5216da0a5d3c8f46a2855b4 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Mon, 26 Apr 2021 00:02:21 +0200
|
||||
Subject: [PATCH 001/139] broadcom(cle,clif,common,simulator): add 7.1 version
|
||||
on the list of versions to build
|
||||
|
||||
This adds 7.1 to the list of available V3D_VERSION, and first changes
|
||||
on the simulator needed to get it working.
|
||||
|
||||
Note that we needed to touch all those 4 codebases because it is
|
||||
needed if we want to use V3D_DEBUG=clif with the simulator, which is
|
||||
the easiest way to see which packets a Vulkan program is using.
|
||||
|
||||
About the simulator, this commit only handles the renaming of some
|
||||
registers. Any additional changes needed to get a proper support for
|
||||
v71 will be handled in following commits.
|
||||
---
|
||||
src/broadcom/cle/meson.build | 3 +-
|
||||
src/broadcom/cle/v3dx_pack.h | 2 +
|
||||
src/broadcom/clif/clif_private.h | 2 +
|
||||
src/broadcom/common/v3d_device_info.c | 1 +
|
||||
src/broadcom/common/v3d_macros.h | 3 +
|
||||
src/broadcom/meson.build | 2 +-
|
||||
src/broadcom/simulator/v3d_simulator.c | 81 +++++++++++++++++++------
|
||||
src/broadcom/simulator/v3d_simulator.h | 5 ++
|
||||
src/broadcom/simulator/v3dx_simulator.c | 31 ++++++++--
|
||||
9 files changed, 106 insertions(+), 24 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/cle/meson.build b/src/broadcom/cle/meson.build
|
||||
index 31a0d5bfa94..8ac32b313e4 100644
|
||||
--- a/src/broadcom/cle/meson.build
|
||||
+++ b/src/broadcom/cle/meson.build
|
||||
@@ -23,7 +23,8 @@ v3d_versions = [
|
||||
[21, 21],
|
||||
[33, 33],
|
||||
[41, 33],
|
||||
- [42, 33]
|
||||
+ [42, 33],
|
||||
+ [71, 33]
|
||||
]
|
||||
|
||||
v3d_xml_files = []
|
||||
diff --git a/src/broadcom/cle/v3dx_pack.h b/src/broadcom/cle/v3dx_pack.h
|
||||
index 5762e5aaa70..e5a1eb26698 100644
|
||||
--- a/src/broadcom/cle/v3dx_pack.h
|
||||
+++ b/src/broadcom/cle/v3dx_pack.h
|
||||
@@ -37,6 +37,8 @@
|
||||
# include "cle/v3d_packet_v41_pack.h"
|
||||
#elif (V3D_VERSION == 42)
|
||||
# include "cle/v3d_packet_v42_pack.h"
|
||||
+#elif (V3D_VERSION == 71)
|
||||
+# include "cle/v3d_packet_v71_pack.h"
|
||||
#else
|
||||
# error "Need to add a pack header include for this v3d version"
|
||||
#endif
|
||||
diff --git a/src/broadcom/clif/clif_private.h b/src/broadcom/clif/clif_private.h
|
||||
index 6ace62b0310..cda407a00bf 100644
|
||||
--- a/src/broadcom/clif/clif_private.h
|
||||
+++ b/src/broadcom/clif/clif_private.h
|
||||
@@ -101,6 +101,8 @@ bool v3d41_clif_dump_packet(struct clif_dump *clif, uint32_t offset,
|
||||
const uint8_t *cl, uint32_t *size, bool reloc_mode);
|
||||
bool v3d42_clif_dump_packet(struct clif_dump *clif, uint32_t offset,
|
||||
const uint8_t *cl, uint32_t *size, bool reloc_mode);
|
||||
+bool v3d71_clif_dump_packet(struct clif_dump *clif, uint32_t offset,
|
||||
+ const uint8_t *cl, uint32_t *size, bool reloc_mode);
|
||||
|
||||
static inline void
|
||||
out(struct clif_dump *clif, const char *fmt, ...)
|
||||
diff --git a/src/broadcom/common/v3d_device_info.c b/src/broadcom/common/v3d_device_info.c
|
||||
index 272190eb2e5..7e0862f1f02 100644
|
||||
--- a/src/broadcom/common/v3d_device_info.c
|
||||
+++ b/src/broadcom/common/v3d_device_info.c
|
||||
@@ -66,6 +66,7 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i
|
||||
case 33:
|
||||
case 41:
|
||||
case 42:
|
||||
+ case 71:
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr,
|
||||
diff --git a/src/broadcom/common/v3d_macros.h b/src/broadcom/common/v3d_macros.h
|
||||
index fe89398208a..b4291fb5350 100644
|
||||
--- a/src/broadcom/common/v3d_macros.h
|
||||
+++ b/src/broadcom/common/v3d_macros.h
|
||||
@@ -41,6 +41,9 @@
|
||||
#elif (V3D_VERSION == 42)
|
||||
# define V3DX(x) V3D42_##x
|
||||
# define v3dX(x) v3d42_##x
|
||||
+#elif (V3D_VERSION == 71)
|
||||
+# define V3DX(x) V3D71_##x
|
||||
+# define v3dX(x) v3d71_##x
|
||||
#else
|
||||
# error "Need to add prefixing macros for this v3d version"
|
||||
#endif
|
||||
diff --git a/src/broadcom/meson.build b/src/broadcom/meson.build
|
||||
index 2c10e46b188..73cb7aa0575 100644
|
||||
--- a/src/broadcom/meson.build
|
||||
+++ b/src/broadcom/meson.build
|
||||
@@ -22,7 +22,7 @@ inc_broadcom = include_directories('.', 'cle')
|
||||
|
||||
subdir('cle')
|
||||
|
||||
-v3d_versions = ['33', '41', '42']
|
||||
+v3d_versions = ['33', '41', '42', '71']
|
||||
v3d_libs = []
|
||||
|
||||
if with_gallium_v3d or with_broadcom_vk
|
||||
diff --git a/src/broadcom/simulator/v3d_simulator.c b/src/broadcom/simulator/v3d_simulator.c
|
||||
index eea5d3f050e..5cceb1a82cc 100644
|
||||
--- a/src/broadcom/simulator/v3d_simulator.c
|
||||
+++ b/src/broadcom/simulator/v3d_simulator.c
|
||||
@@ -490,10 +490,20 @@ v3d_simulator_submit_cl_ioctl(int fd, struct drm_v3d_submit_cl *submit)
|
||||
|
||||
v3d_simulator_perfmon_switch(fd, submit->perfmon_id);
|
||||
|
||||
- if (sim_state.ver >= 41)
|
||||
- v3d41_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs);
|
||||
- else
|
||||
- v3d33_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs);
|
||||
+ switch(sim_state.ver) {
|
||||
+ case 33:
|
||||
+ v3d33_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs);
|
||||
+ break;
|
||||
+ case 41:
|
||||
+ case 42:
|
||||
+ v3d41_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs);
|
||||
+ break;
|
||||
+ case 71:
|
||||
+ v3d71_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs);
|
||||
+ break;
|
||||
+ default:
|
||||
+ unreachable("Unsupported V3D version\n");
|
||||
+ }
|
||||
|
||||
util_dynarray_foreach(&sim_state.bin_oom, struct v3d_simulator_bo *,
|
||||
sim_bo) {
|
||||
@@ -635,10 +645,17 @@ v3d_simulator_gem_close_ioctl(int fd, struct drm_gem_close *args)
|
||||
static int
|
||||
v3d_simulator_get_param_ioctl(int fd, struct drm_v3d_get_param *args)
|
||||
{
|
||||
- if (sim_state.ver >= 41)
|
||||
- return v3d41_simulator_get_param_ioctl(sim_state.v3d, args);
|
||||
- else
|
||||
+ switch(sim_state.ver) {
|
||||
+ case 33:
|
||||
return v3d33_simulator_get_param_ioctl(sim_state.v3d, args);
|
||||
+ case 41:
|
||||
+ case 42:
|
||||
+ return v3d41_simulator_get_param_ioctl(sim_state.v3d, args);
|
||||
+ case 71:
|
||||
+ return v3d71_simulator_get_param_ioctl(sim_state.v3d, args);
|
||||
+ default:
|
||||
+ unreachable("Unsupported V3D version\n");
|
||||
+ }
|
||||
}
|
||||
|
||||
static int
|
||||
@@ -652,10 +669,20 @@ v3d_simulator_submit_tfu_ioctl(int fd, struct drm_v3d_submit_tfu *args)
|
||||
v3d_simulator_copy_in_handle(file, args->bo_handles[2]);
|
||||
v3d_simulator_copy_in_handle(file, args->bo_handles[3]);
|
||||
|
||||
- if (sim_state.ver >= 41)
|
||||
- ret = v3d41_simulator_submit_tfu_ioctl(sim_state.v3d, args);
|
||||
- else
|
||||
+ switch(sim_state.ver) {
|
||||
+ case 33:
|
||||
ret = v3d33_simulator_submit_tfu_ioctl(sim_state.v3d, args);
|
||||
+ break;
|
||||
+ case 41:
|
||||
+ case 42:
|
||||
+ ret = v3d41_simulator_submit_tfu_ioctl(sim_state.v3d, args);
|
||||
+ break;
|
||||
+ case 71:
|
||||
+ ret = v3d71_simulator_submit_tfu_ioctl(sim_state.v3d, args);
|
||||
+ break;
|
||||
+ default:
|
||||
+ unreachable("Unsupported V3D version\n");
|
||||
+ }
|
||||
|
||||
v3d_simulator_copy_out_handle(file, args->bo_handles[0]);
|
||||
|
||||
@@ -682,11 +709,19 @@ v3d_simulator_submit_csd_ioctl(int fd, struct drm_v3d_submit_csd *args)
|
||||
|
||||
v3d_simulator_perfmon_switch(fd, args->perfmon_id);
|
||||
|
||||
- if (sim_state.ver >= 41)
|
||||
- ret = v3d41_simulator_submit_csd_ioctl(sim_state.v3d, args,
|
||||
- file->gmp->ofs);
|
||||
- else
|
||||
- ret = -1;
|
||||
+ switch(sim_state.ver) {
|
||||
+ case 41:
|
||||
+ case 42:
|
||||
+ ret = v3d41_simulator_submit_csd_ioctl(sim_state.v3d, args,
|
||||
+ file->gmp->ofs);
|
||||
+ break;
|
||||
+ case 71:
|
||||
+ ret = v3d71_simulator_submit_csd_ioctl(sim_state.v3d, args,
|
||||
+ file->gmp->ofs);
|
||||
+ break;
|
||||
+ default:
|
||||
+ ret = -1;
|
||||
+ }
|
||||
|
||||
for (int i = 0; i < args->bo_handle_count; i++)
|
||||
v3d_simulator_copy_out_handle(file, bo_handles[i]);
|
||||
@@ -880,10 +915,20 @@ v3d_simulator_init_global()
|
||||
|
||||
util_dynarray_init(&sim_state.bin_oom, NULL);
|
||||
|
||||
- if (sim_state.ver >= 41)
|
||||
- v3d41_simulator_init_regs(sim_state.v3d);
|
||||
- else
|
||||
+ switch(sim_state.ver) {
|
||||
+ case 33:
|
||||
v3d33_simulator_init_regs(sim_state.v3d);
|
||||
+ break;
|
||||
+ case 41:
|
||||
+ case 42:
|
||||
+ v3d41_simulator_init_regs(sim_state.v3d);
|
||||
+ break;
|
||||
+ case 71:
|
||||
+ v3d71_simulator_init_regs(sim_state.v3d);
|
||||
+ break;
|
||||
+ default:
|
||||
+ unreachable("Not supported V3D version\n");
|
||||
+ }
|
||||
}
|
||||
|
||||
struct v3d_simulator_file *
|
||||
diff --git a/src/broadcom/simulator/v3d_simulator.h b/src/broadcom/simulator/v3d_simulator.h
|
||||
index ddb079c1455..1472c313a03 100644
|
||||
--- a/src/broadcom/simulator/v3d_simulator.h
|
||||
+++ b/src/broadcom/simulator/v3d_simulator.h
|
||||
@@ -52,6 +52,11 @@ uint32_t v3d_simulator_get_mem_free(void);
|
||||
# define v3dX(x) v3d41_##x
|
||||
# include "v3dx_simulator.h"
|
||||
# undef v3dX
|
||||
+
|
||||
+# define v3dX(x) v3d71_##x
|
||||
+# include "v3dx_simulator.h"
|
||||
+# undef v3dX
|
||||
+
|
||||
#endif
|
||||
|
||||
#endif
|
||||
diff --git a/src/broadcom/simulator/v3dx_simulator.c b/src/broadcom/simulator/v3dx_simulator.c
|
||||
index c9322f0397b..723796b16c9 100644
|
||||
--- a/src/broadcom/simulator/v3dx_simulator.c
|
||||
+++ b/src/broadcom/simulator/v3dx_simulator.c
|
||||
@@ -46,11 +46,15 @@
|
||||
|
||||
#define HW_REGISTER_RO(x) (x)
|
||||
#define HW_REGISTER_RW(x) (x)
|
||||
-#if V3D_VERSION >= 41
|
||||
+#if V3D_VERSION == 71
|
||||
+#include "libs/core/v3d/registers/7.1.5.1/v3d.h"
|
||||
+#else
|
||||
+#if V3D_VERSION == 41 || V3D_VERSION == 42
|
||||
#include "libs/core/v3d/registers/4.1.35.0/v3d.h"
|
||||
#else
|
||||
#include "libs/core/v3d/registers/3.3.0.0/v3d.h"
|
||||
#endif
|
||||
+#endif
|
||||
|
||||
#define V3D_WRITE(reg, val) v3d_hw_write_reg(v3d, reg, val)
|
||||
#define V3D_READ(reg) v3d_hw_read_reg(v3d, reg)
|
||||
@@ -310,16 +314,17 @@ v3d_isr_core(struct v3d_hw *v3d,
|
||||
return;
|
||||
}
|
||||
|
||||
+#if V3D_VERSION <= 42
|
||||
if (core_status & V3D_CTL_0_INT_STS_INT_GMPV_SET) {
|
||||
fprintf(stderr, "GMP violation at 0x%08x\n",
|
||||
V3D_READ(V3D_GMP_VIO_ADDR));
|
||||
- abort();
|
||||
} else {
|
||||
fprintf(stderr,
|
||||
"Unexpected ISR with core status 0x%08x\n",
|
||||
core_status);
|
||||
}
|
||||
abort();
|
||||
+#endif
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -396,6 +401,18 @@ v3d_isr_hub(struct v3d_hw *v3d)
|
||||
}
|
||||
|
||||
handle_mmu_interruptions(v3d, hub_status);
|
||||
+
|
||||
+#if V3D_VERSION == 71
|
||||
+ if (hub_status & V3D_HUB_CTL_INT_STS_INT_GMPV_SET) {
|
||||
+ fprintf(stderr, "GMP violation at 0x%08x\n",
|
||||
+ V3D_READ(V3D_GMP_VIO_ADDR));
|
||||
+ } else {
|
||||
+ fprintf(stderr,
|
||||
+ "Unexpected ISR with status 0x%08x\n",
|
||||
+ hub_status);
|
||||
+ }
|
||||
+ abort();
|
||||
+#endif
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -436,8 +453,11 @@ v3dX(simulator_init_regs)(struct v3d_hw *v3d)
|
||||
* for tracing. Perhaps we should evaluate to do the same here and add
|
||||
* some debug options.
|
||||
*/
|
||||
- uint32_t core_interrupts = (V3D_CTL_0_INT_STS_INT_GMPV_SET |
|
||||
- V3D_CTL_0_INT_STS_INT_OUTOMEM_SET);
|
||||
+ uint32_t core_interrupts = V3D_CTL_0_INT_STS_INT_OUTOMEM_SET;
|
||||
+#if V3D_VERSION <= 42
|
||||
+ core_interrupts |= V3D_CTL_0_INT_STS_INT_GMPV_SET;
|
||||
+#endif
|
||||
+
|
||||
V3D_WRITE(V3D_CTL_0_INT_MSK_SET, ~core_interrupts);
|
||||
V3D_WRITE(V3D_CTL_0_INT_MSK_CLR, core_interrupts);
|
||||
|
||||
@@ -447,6 +467,9 @@ v3dX(simulator_init_regs)(struct v3d_hw *v3d)
|
||||
V3D_HUB_CTL_INT_STS_INT_MMU_CAP_SET | /* CAP exceeded */
|
||||
V3D_HUB_CTL_INT_STS_INT_TFUC_SET); /* TFU conversion */
|
||||
|
||||
+#if V3D_VERSION == 71
|
||||
+ hub_interrupts |= V3D_HUB_CTL_INT_STS_INT_GMPV_SET;
|
||||
+#endif
|
||||
V3D_WRITE(V3D_HUB_CTL_INT_MSK_SET, ~hub_interrupts);
|
||||
V3D_WRITE(V3D_HUB_CTL_INT_MSK_CLR, hub_interrupts);
|
||||
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,30 @@
|
||||
From 9e85edd1b347b0e779b393f463f42044a720bcff Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Tue, 28 Sep 2021 13:16:49 +0200
|
||||
Subject: [PATCH 002/139] broadcom/simulator: reset CFG7 for compute dispatch
|
||||
in v71
|
||||
|
||||
This register is new in 7.x, it doesn't seem that we need to
|
||||
do anything specific for now, but let's make sure it is reset
|
||||
every time.
|
||||
---
|
||||
src/broadcom/simulator/v3dx_simulator.c | 3 +++
|
||||
1 file changed, 3 insertions(+)
|
||||
|
||||
diff --git a/src/broadcom/simulator/v3dx_simulator.c b/src/broadcom/simulator/v3dx_simulator.c
|
||||
index 723796b16c9..f23b0538de3 100644
|
||||
--- a/src/broadcom/simulator/v3dx_simulator.c
|
||||
+++ b/src/broadcom/simulator/v3dx_simulator.c
|
||||
@@ -227,6 +227,9 @@ v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d,
|
||||
V3D_WRITE(V3D_CSD_0_QUEUED_CFG4, args->cfg[4]);
|
||||
V3D_WRITE(V3D_CSD_0_QUEUED_CFG5, args->cfg[5]);
|
||||
V3D_WRITE(V3D_CSD_0_QUEUED_CFG6, args->cfg[6]);
|
||||
+#if V3D_VERSION >= 71
|
||||
+ V3D_WRITE(V3D_CSD_0_QUEUED_CFG7, 0);
|
||||
+#endif
|
||||
/* CFG0 kicks off the job */
|
||||
V3D_WRITE(V3D_CSD_0_QUEUED_CFG0, args->cfg[0]);
|
||||
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,712 @@
|
||||
From 6f744bc4bec98f9769486d427e8e2d4e314ae056 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Tue, 29 Jun 2021 12:03:24 +0200
|
||||
Subject: [PATCH 003/139] broadcom/cle: update the packet definitions for new
|
||||
generation v71
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Using as reference the spec for 7.1.5. This includes totally new
|
||||
packets, and redefine some that already existed on v42.
|
||||
|
||||
Full list:
|
||||
* Add Depth Bounds Test Limits
|
||||
* Redefine Tile Binning Mode Cfg
|
||||
* Redefine Cfg Bits. There are some changes on the fields:
|
||||
* Line Rasterization is now 1 bit size
|
||||
* Depth Bounds Enable (that takes one of the bits of Line Rasterization)
|
||||
* Early-Z/Early-Z updates enable bits (16-17) are now reserved.
|
||||
* New Z-Clipping mode field
|
||||
* Redefine Tile Rendering Mode Cfg (Common). Changes with respect to v42:
|
||||
* New log2 tile height/width fields starting at bit 52/55
|
||||
* Due to those two new fields, the end pad is smaller
|
||||
* sub-id has now a size of 3. Bit 4 is reserved.
|
||||
* Number of render targets: this field max value is now 7 (not
|
||||
reflected on the xml).
|
||||
* Maximum BPP is removed on v71 (now bits 40-41 are reserved)
|
||||
* Depth Buffer disable: on bit 44
|
||||
* Update Store Tile Buffer General
|
||||
* Adding Cfg Render Target Part1/2/3 packets: they replace v4X "Tile
|
||||
Rendering Mode Cfg (Color)" (real name "Rendering Configuration
|
||||
(Render Targets Config)"), "Tile Rendering Mode Cfg (Clear Colors
|
||||
Part1)", "Tile Rendering Mode Cfg (Clear Colors Part2)", and "Tile
|
||||
Rendering Mode Cfg (Clear Colors Part3)". On those old versions,
|
||||
the first packet is used to configure 4 render targets. Now that 8
|
||||
are supported, individual per-render-target packets are used.
|
||||
* Update ZS clear values packet.
|
||||
* Add new v71 output formats
|
||||
* Define Clear Render Targets (Replaces Clear Tile Buffers from v42)
|
||||
* Redefine GL Shader State Record. Changes compared with v42:
|
||||
* Fields removed:
|
||||
* "Coordinate shader has separate input and output VPM blocks"
|
||||
(reserved bit now)
|
||||
* "Vertex shader has separate input and output VPM blocks"
|
||||
(reserved bit now)
|
||||
* "Address of table of default attribute Values." (we needed to
|
||||
change the start position for all the following fields)
|
||||
* New field:
|
||||
* "Never defer FEP depth writes to fragment shader auto Z writes
|
||||
on scoreboard conflict"
|
||||
* Redefine clipper xy scaling: Now it uses 1/64ths of pixels, instead
|
||||
of 1/256ths
|
||||
* Update texture shader state.
|
||||
* Notice we don't use an address type for these fields in the XML
|
||||
description. This is because the addresses are 64-bit aligned
|
||||
(even though the PRM doesn't say it) which means the 6 LSB bits
|
||||
are implicitly 0, but the fields are encoded before the 6th bit
|
||||
of their starting byte, so we can't use the usual trick we do
|
||||
with address types where the first 6 bits in the byte are
|
||||
implicitly overwritten by other fields and we have to encode this
|
||||
manually as a uint field. This would mean that if we had an
|
||||
actual BO we would also need to add it manually to the job's
|
||||
list, but since we don't have one, we don't have to do anything
|
||||
about it.
|
||||
* Add new RB_Swap field for texture shader state
|
||||
* Document Cb/Cr addresses as uint fields in texture shader state
|
||||
* Fixup Blend Config description: we now support 8 RTs.
|
||||
* TMU config parameter 2 has new fields
|
||||
* Add new clipper Z without guardband packet in v71
|
||||
* Add enums for the Z clip modes accepted in v71
|
||||
* Fix texture state array stride packing for V3D 7.1.5
|
||||
|
||||
Signed-off-by: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Signed-off-by: Alejandro Piñeiro <apinheiro@igalia.com>
|
||||
|
||||
broadcom/cle: rb_swap
|
||||
---
|
||||
src/broadcom/cle/v3d_packet_v33.xml | 386 ++++++++++++++++++++++++++--
|
||||
1 file changed, 368 insertions(+), 18 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/cle/v3d_packet_v33.xml b/src/broadcom/cle/v3d_packet_v33.xml
|
||||
index a0242b5f1c2..624353ca2bf 100644
|
||||
--- a/src/broadcom/cle/v3d_packet_v33.xml
|
||||
+++ b/src/broadcom/cle/v3d_packet_v33.xml
|
||||
@@ -1,4 +1,4 @@
|
||||
-<vcxml gen="3.3" min_ver="33" max_ver="42">
|
||||
+<vcxml gen="3.3" min_ver="33" max_ver="71">
|
||||
|
||||
<enum name="Compare Function" prefix="V3D_COMPARE_FUNC">
|
||||
<value name="NEVER" value="0"/>
|
||||
@@ -167,13 +167,36 @@
|
||||
<value name="depth_16" value="2"/>
|
||||
</enum>
|
||||
|
||||
- <enum name="Render Target Clamp" prefix="V3D_RENDER_TARGET_CLAMP" min_ver="41">
|
||||
+ <enum name="Render Target Clamp" prefix="V3D_RENDER_TARGET_CLAMP" min_ver="41" max_ver="42">
|
||||
<value name="none" value="0"/> <!-- no clamping -->
|
||||
<value name="norm" value="1"/> <!-- [0,1] for f16 -->
|
||||
<value name="pos" value="2"/> <!-- [0, for f16 -->
|
||||
<value name="int" value="3" min_ver="42"/> <!-- clamp to integer RT's range -->
|
||||
</enum>
|
||||
|
||||
+ <enum name="Render Target Type Clamp" prefix="V3D_RENDER_TARGET_TYPE_CLAMP" min_ver="71">
|
||||
+ <value name="8i" value="0"/> <!-- no clamping -->
|
||||
+ <value name="16i" value="1"/> <!-- no clamping -->
|
||||
+ <value name="32i" value="2"/> <!-- no clamping -->
|
||||
+ <value name="8ui" value="4"/> <!-- no clamping -->
|
||||
+ <value name="16ui" value="5"/> <!-- no clamping -->
|
||||
+ <value name="32ui" value="6"/> <!-- no clamping -->
|
||||
+ <value name="8" value="8"/> <!-- no clamping -->
|
||||
+ <value name="16f" value="9"/> <!-- no clamping -->
|
||||
+ <value name="32f" value="10"/> <!-- no clamping -->
|
||||
+ <value name="8i_clamped" value="16"/> <!-- clamp to integer RT's range -->
|
||||
+ <value name="16i_clamped" value="17"/> <!-- clamp to integer RT's range -->
|
||||
+ <value name="32i_clamped" value="18"/> <!-- clamp to integer RT's range -->
|
||||
+ <value name="8ui_clamped" value="20"/> <!-- clamp to integer RT's range -->
|
||||
+ <value name="16ui_clamped" value="21"/> <!-- clamp to integer RT's range -->
|
||||
+ <value name="32ui_clamped" value="22"/> <!-- clamp to integer RT's range -->
|
||||
+ <value name="16f_clamp_norm" value="24"/> <!-- [0,1] for f16 -->
|
||||
+ <value name="16f_clamp_pos" value="25"/> <!-- [0, for f16 -->
|
||||
+ <value name="16f_clamp_pq" value="26"/> <!-- PQ lin range, colour to [0, 125], alpha to [0, 1] for f16 -->
|
||||
+ <value name="16f_clamp_hlg" value="27"/> <!-- HLG lin range, colour to [0, 12], alpha to [0, 1] for f16 -->
|
||||
+ <value name="invalid" value="32"/>
|
||||
+ </enum>
|
||||
+
|
||||
<!---
|
||||
CL cache flush commands are not fully documented and subject to a
|
||||
number of hardware issues that make them unreliable. Specifically:
|
||||
@@ -263,13 +286,27 @@
|
||||
<value name="r8ui" value="36"/>
|
||||
<value name="srgbx8" value="37" max_ver="33"/>
|
||||
<value name="rgbx8" value="38" max_ver="33"/>
|
||||
- <value name="bstc" value="39" min_ver="41"/>
|
||||
+ <value name="bstc8" value="39" min_ver="41"/>
|
||||
<value name="d32f" value="40" min_ver="41"/>
|
||||
<value name="d24" value="41" min_ver="41"/>
|
||||
<value name="d16" value="42" min_ver="41"/>
|
||||
<value name="d24s8" value="43" min_ver="41"/>
|
||||
<value name="s8" value="44" min_ver="41"/>
|
||||
<value name="rgba5551" value="45" min_ver="41"/>
|
||||
+ <value name="bstc8_srgb" value="46" min_ver="71"/>
|
||||
+ <value name="bstc10" value="47" min_ver="71"/>
|
||||
+ <value name="bstc10_srgb" value="48" min_ver="71"/>
|
||||
+ <value name="bstc10_pq" value="49" min_ver="71"/>
|
||||
+ <value name="rgba10x6" value="50" min_ver="71"/>
|
||||
+ <value name="bstc10_hlg" value="55" min_ver="71"/>
|
||||
+ <value name="rgba10x6_hlg" value="56" min_ver="71"/>
|
||||
+ <value name="rgb10_a2_hlg" value="57" min_ver="71"/>
|
||||
+ <value name="bstc10_pq_bt1886" value="58" min_ver="71"/>
|
||||
+ <value name="rgba10x6_pq_bt1886" value="59" min_ver="71"/>
|
||||
+ <value name="rgb10_a2_pq_bt1886" value="60" min_ver="71"/>
|
||||
+ <value name="bstc10_hlg_bt1886" value="61" min_ver="71"/>
|
||||
+ <value name="rgba10x6_hlg_bt1886" value="62" min_ver="71"/>
|
||||
+ <value name="rgb10_a2_hlg_bt1886" value="63" min_ver="71"/>
|
||||
</enum>
|
||||
|
||||
<enum name="Z/S Output Image Format" prefix="V3D_OUTPUT_IMAGE_FORMAT_ZS" max_ver="33">
|
||||
@@ -314,6 +351,12 @@
|
||||
<value name="perp end caps" value="1"/>
|
||||
</enum>
|
||||
|
||||
+ <enum name="Z Clip Mode" prefix="V3D_Z_CLIP_MODE">
|
||||
+ <value name="NONE" value="0"/>
|
||||
+ <value name="MIN_ONE_TO_ONE" value="1"/>
|
||||
+ <value name="ZERO_TO_ONE" value="2"/>
|
||||
+ </enum>
|
||||
+
|
||||
<packet code="0" name="Halt"/>
|
||||
<packet code="1" name="NOP"/>
|
||||
<packet code="4" name="Flush"/>
|
||||
@@ -381,11 +424,13 @@
|
||||
<field name="Last Tile of Frame" size="1" start="0" type="bool"/>
|
||||
</packet>
|
||||
|
||||
- <packet code="25" shortname="clear" name="Clear Tile Buffers" cl="R" min_ver="41">
|
||||
+ <packet code="25" shortname="clear" name="Clear Tile Buffers" cl="R" min_ver="41" max_ver="42">
|
||||
<field name="Clear Z/Stencil Buffer" size="1" start="1" type="bool"/>
|
||||
<field name="Clear all Render Targets" size="1" start="0" type="bool"/>
|
||||
</packet>
|
||||
|
||||
+ <packet code="25" shortname="clear_rt" name="Clear Render Targets" cl="R" min_ver="71"/>
|
||||
+
|
||||
<packet code="26" shortname="load" name="Reload Tile Color Buffer" cl="R" max_ver="33">
|
||||
<field name="Disable Color Buffer load" size="8" start="8" type="uint"/>
|
||||
<field name="Enable Z load" size="1" start="7" type="bool"/>
|
||||
@@ -443,6 +488,10 @@
|
||||
<value name="Render target 1" value="1"/>
|
||||
<value name="Render target 2" value="2"/>
|
||||
<value name="Render target 3" value="3"/>
|
||||
+ <value name="Render target 4" value="4" min_ver="71"/>
|
||||
+ <value name="Render target 5" value="5" min_ver="71"/>
|
||||
+ <value name="Render target 6" value="6" min_ver="71"/>
|
||||
+ <value name="Render target 7" value="7" min_ver="71"/>
|
||||
<value name="None" value="8"/>
|
||||
<value name="Z" value="9"/>
|
||||
<value name="Stencil" value="10"/>
|
||||
@@ -789,7 +838,7 @@
|
||||
<field name="Alpha blend mode" size="4" start="0" type="Blend Mode"/>
|
||||
</packet>
|
||||
|
||||
- <packet code="84" name="Blend Cfg" min_ver="41">
|
||||
+ <packet code="84" name="Blend Cfg" min_ver="41" max_ver="42">
|
||||
<field name="Render Target Mask" size="4" start="24" type="uint"/>
|
||||
<field name="Color blend dst factor" size="4" start="20" type="Blend Factor"/>
|
||||
<field name="Color blend src factor" size="4" start="16" type="Blend Factor"/>
|
||||
@@ -799,6 +848,16 @@
|
||||
<field name="Alpha blend mode" size="4" start="0" type="Blend Mode"/>
|
||||
</packet>
|
||||
|
||||
+ <packet code="84" name="Blend Cfg" min_ver="71">
|
||||
+ <field name="Render Target Mask" size="8" start="24" type="uint"/>
|
||||
+ <field name="Color blend dst factor" size="4" start="20" type="Blend Factor"/>
|
||||
+ <field name="Color blend src factor" size="4" start="16" type="Blend Factor"/>
|
||||
+ <field name="Color blend mode" size="4" start="12" type="Blend Mode"/>
|
||||
+ <field name="Alpha blend dst factor" size="4" start="8" type="Blend Factor"/>
|
||||
+ <field name="Alpha blend src factor" size="4" start="4" type="Blend Factor"/>
|
||||
+ <field name="Alpha blend mode" size="4" start="0" type="Blend Mode"/>
|
||||
+ </packet>
|
||||
+
|
||||
<packet code="86" shortname="blend_ccolor" name="Blend Constant Color">
|
||||
<field name="Alpha (F16)" size="16" start="48" type="uint"/>
|
||||
<field name="Blue (F16)" size="16" start="32" type="uint"/>
|
||||
@@ -828,7 +887,12 @@
|
||||
<field name="address" size="32" start="0" type="address"/>
|
||||
</packet>
|
||||
|
||||
- <packet code="96" name="Cfg Bits">
|
||||
+ <packet code="93" name="Depth Bounds Test Limits" min_ver="71">
|
||||
+ <field name="Lower Test Limit" size="32" start="0" type="float"/>
|
||||
+ <field name="Upper Test Limit" size="32" start="32" type="float"/>
|
||||
+ </packet>
|
||||
+
|
||||
+ <packet code="96" name="Cfg Bits" max_ver="42">
|
||||
<field name="Direct3D Provoking Vertex" size="1" start="21" type="bool"/>
|
||||
<field name="Direct3D 'Point-fill' mode" size="1" start="20" type="bool"/>
|
||||
<field name="Blend enable" size="1" start="19" type="bool"/>
|
||||
@@ -846,6 +910,25 @@
|
||||
<field name="Enable Forward Facing Primitive" size="1" start="0" type="bool"/>
|
||||
</packet>
|
||||
|
||||
+ <packet code="96" name="Cfg Bits" min_ver="71">
|
||||
+ <field name="Z Clipping mode" size="2" start="22" type="Z Clip Mode"/>
|
||||
+ <field name="Direct3D Provoking Vertex" size="1" start="21" type="bool"/>
|
||||
+ <field name="Direct3D 'Point-fill' mode" size="1" start="20" type="bool"/>
|
||||
+ <field name="Blend enable" size="1" start="19" type="bool"/>
|
||||
+ <field name="Stencil enable" size="1" start="18" type="bool"/>
|
||||
+ <field name="Z updates enable" size="1" start="15" type="bool"/>
|
||||
+ <field name="Depth-Test Function" size="3" start="12" type="Compare Function"/>
|
||||
+ <field name="Direct3D Wireframe triangles mode" size="1" start="11" type="bool"/>
|
||||
+ <field name="Z Clamp Mode" size="1" start="10" type="bool"/>
|
||||
+ <field name="Rasterizer Oversample Mode" size="2" start="6" type="uint"/>
|
||||
+ <field name="Depth Bounds Test Enable" size="1" start="5" type="bool"/>
|
||||
+ <field name="Line Rasterization" size="1" start="4" type="uint"/>
|
||||
+ <field name="Enable Depth Offset" size="1" start="3" type="bool"/>
|
||||
+ <field name="Clockwise Primitives" size="1" start="2" type="bool"/>
|
||||
+ <field name="Enable Reverse Facing Primitive" size="1" start="1" type="bool"/>
|
||||
+ <field name="Enable Forward Facing Primitive" size="1" start="0" type="bool"/>
|
||||
+ </packet>
|
||||
+
|
||||
<packet code="97" shortname="zero_all_flatshade_flags" name="Zero All Flat Shade Flags"/>
|
||||
|
||||
<packet code="98" shortname="flatshade_flags" name="Flat Shade Flags">
|
||||
@@ -907,16 +990,26 @@
|
||||
<field name="Minimum Zw" size="32" start="0" type="float"/>
|
||||
</packet>
|
||||
|
||||
- <packet shortname="clipper_xy" name="Clipper XY Scaling" code="110" cl="B">
|
||||
+ <packet shortname="clipper_xy" name="Clipper XY Scaling" code="110" cl="B" max_ver="42">
|
||||
<field name="Viewport Half-Height in 1/256th of pixel" size="32" start="32" type="float"/>
|
||||
<field name="Viewport Half-Width in 1/256th of pixel" size="32" start="0" type="float"/>
|
||||
</packet>
|
||||
|
||||
+ <packet shortname="clipper_xy" name="Clipper XY Scaling" code="110" cl="B" min_ver="71">
|
||||
+ <field name="Viewport Half-Height in 1/64th of pixel" size="32" start="32" type="float"/>
|
||||
+ <field name="Viewport Half-Width in 1/64th of pixel" size="32" start="0" type="float"/>
|
||||
+ </packet>
|
||||
+
|
||||
<packet shortname="clipper_z" name="Clipper Z Scale and Offset" code="111" cl="B">
|
||||
<field name="Viewport Z Offset (Zc to Zs)" size="32" start="32" type="float"/>
|
||||
<field name="Viewport Z Scale (Zc to Zs)" size="32" start="0" type="float"/>
|
||||
</packet>
|
||||
|
||||
+ <packet shortname="clipper_z_no_guardband" name="Clipper Z Scale and Offset no guardband" code="112" cl="B" min_ver="71">
|
||||
+ <field name="Viewport Z Offset (Zc to Zs)" size="32" start="32" type="float"/>
|
||||
+ <field name="Viewport Z Scale (Zc to Zs)" size="32" start="0" type="float"/>
|
||||
+ </packet>
|
||||
+
|
||||
<packet name="Number of Layers" code="119" min_ver="41">
|
||||
<field name="Number of Layers" size="8" start="0" type="uint" minus_one="true"/>
|
||||
</packet>
|
||||
@@ -947,7 +1040,7 @@
|
||||
<field name="sub-id" size="1" start="0" type="uint" default="0"/>
|
||||
</packet>
|
||||
|
||||
- <packet code="120" name="Tile Binning Mode Cfg" min_ver="41">
|
||||
+ <packet code="120" name="Tile Binning Mode Cfg" min_ver="41" max_ver="42">
|
||||
|
||||
<field name="Height (in pixels)" size="16" start="48" type="uint" minus_one="true"/>
|
||||
<field name="Width (in pixels)" size="16" start="32" type="uint" minus_one="true"/>
|
||||
@@ -971,6 +1064,35 @@
|
||||
</field>
|
||||
</packet>
|
||||
|
||||
+ <packet code="120" name="Tile Binning Mode Cfg" min_ver="71">
|
||||
+ <field name="Height (in pixels)" size="16" start="48" type="uint" minus_one="true"/>
|
||||
+ <field name="Width (in pixels)" size="16" start="32" type="uint" minus_one="true"/>
|
||||
+
|
||||
+ <field name="Log2 Tile Height" size="3" start="11" type="uint">
|
||||
+ <value name="tile height 8 pixels" value="0"/>
|
||||
+ <value name="tile height 16 pixels" value="1"/>
|
||||
+ <value name="tile height 32 pixels" value="2"/>
|
||||
+ <value name="tile height 64 pixels" value="3"/>
|
||||
+ </field>
|
||||
+ <field name="Log2 Tile Width" size="3" start="8" type="uint">
|
||||
+ <value name="tile width 8 pixels" value="0"/>
|
||||
+ <value name="tile width 16 pixels" value="1"/>
|
||||
+ <value name="tile width 32 pixels" value="2"/>
|
||||
+ <value name="tile width 64 pixels" value="3"/>
|
||||
+ </field>
|
||||
+
|
||||
+ <field name="tile allocation block size" size="2" start="4" type="uint">
|
||||
+ <value name="tile allocation block size 64b" value="0"/>
|
||||
+ <value name="tile allocation block size 128b" value="1"/>
|
||||
+ <value name="tile allocation block size 256b" value="2"/>
|
||||
+ </field>
|
||||
+ <field name="tile allocation initial block size" size="2" start="2" type="uint">
|
||||
+ <value name="tile allocation initial block size 64b" value="0"/>
|
||||
+ <value name="tile allocation initial block size 128b" value="1"/>
|
||||
+ <value name="tile allocation initial block size 256b" value="2"/>
|
||||
+ </field>
|
||||
+ </packet>
|
||||
+
|
||||
<packet code="120" name="Tile Binning Mode Cfg (Part2)" cl="B" max_ver="33">
|
||||
<field name="Tile Allocation Memory Address" size="32" start="32" type="address"/>
|
||||
<field name="Tile Allocation Memory Size" size="32" start="0" type="uint"/>
|
||||
@@ -1002,7 +1124,7 @@
|
||||
<field name="sub-id" size="4" start="0" type="uint" default="0"/>
|
||||
</packet>
|
||||
|
||||
- <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" min_ver="41">
|
||||
+ <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" min_ver="41" max_ver="42">
|
||||
<field name="Pad" size="12" start="52" type="uint"/>
|
||||
|
||||
<field name="Early Depth/Stencil Clear" size="1" start="51" type="bool"/>
|
||||
@@ -1018,7 +1140,11 @@
|
||||
<field name="Double-buffer in non-ms mode" size="1" start="43" type="bool"/>
|
||||
<field name="Multisample Mode (4x)" size="1" start="42" type="bool"/>
|
||||
|
||||
- <field name="Maximum BPP of all render targets" size="2" start="40" type="Internal BPP"/>
|
||||
+ <field name="Maximum BPP of all render targets" size="2" start="40" type="Internal BPP">
|
||||
+ <value name="Render target maximum 32bpp" value="0"/>
|
||||
+ <value name="Render target maximum 64bpp" value="1"/>
|
||||
+ <value name="Render target maximum 128bpp" value="2"/>
|
||||
+ </field>
|
||||
|
||||
<field name="Image Height (pixels)" size="16" start="24" type="uint"/>
|
||||
<field name="Image Width (pixels)" size="16" start="8" type="uint"/>
|
||||
@@ -1027,6 +1153,43 @@
|
||||
<field name="sub-id" size="4" start="0" type="uint" default="0"/>
|
||||
</packet>
|
||||
|
||||
+ <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" min_ver="71">
|
||||
+ <field name="Pad" size="6" start="58" type="uint"/>
|
||||
+
|
||||
+ <field name="Log2 Tile Height" size="3" start="55" type="uint">
|
||||
+ <value name="tile height 8 pixels" value="0"/>
|
||||
+ <value name="tile height 16 pixels" value="1"/>
|
||||
+ <value name="tile height 32 pixels" value="2"/>
|
||||
+ <value name="tile height 64 pixels" value="3"/>
|
||||
+ </field>
|
||||
+ <field name="Log2 Tile Width" size="3" start="52" type="uint">
|
||||
+ <value name="tile width 8 pixels" value="0"/>
|
||||
+ <value name="tile width 16 pixels" value="1"/>
|
||||
+ <value name="tile width 32 pixels" value="2"/>
|
||||
+ <value name="tile width 64 pixels" value="3"/>
|
||||
+ </field>
|
||||
+
|
||||
+ <field name="Early Depth/Stencil Clear" size="1" start="51" type="bool"/>
|
||||
+ <field name="Internal Depth Type" size="4" start="47" type="Internal Depth Type"/>
|
||||
+
|
||||
+ <field name="Early-Z disable" size="1" start="46" type="bool"/>
|
||||
+
|
||||
+ <field name="Early-Z Test and Update Direction" size="1" start="45" type="uint">
|
||||
+ <value name="Early-Z direction LT/LE" value="0"/>
|
||||
+ <value name="Early-Z direction GT/GE" value="1"/>
|
||||
+ </field>
|
||||
+
|
||||
+ <field name="Depth-buffer disable" size="1" start="44" type="bool"/>
|
||||
+ <field name="Double-buffer in non-ms mode" size="1" start="43" type="bool"/>
|
||||
+ <field name="Multisample Mode (4x)" size="1" start="42" type="bool"/>
|
||||
+
|
||||
+ <field name="Image Height (pixels)" size="16" start="24" type="uint"/>
|
||||
+ <field name="Image Width (pixels)" size="16" start="8" type="uint"/>
|
||||
+ <field name="Number of Render Targets" size="4" start="4" type="uint" minus_one="true"/>
|
||||
+
|
||||
+ <field name="sub-id" size="3" start="0" type="uint" default="0"/>
|
||||
+ </packet>
|
||||
+
|
||||
<packet code="121" name="Tile Rendering Mode Cfg (Color)" cl="R" max_ver="33">
|
||||
<field name="Address" size="32" start="32" type="address"/>
|
||||
|
||||
@@ -1048,7 +1211,8 @@
|
||||
<field name="sub-id" size="4" start="0" type="uint" default="2"/>
|
||||
</packet>
|
||||
|
||||
- <packet code="121" name="Tile Rendering Mode Cfg (Color)" cl="R" min_ver="41">
|
||||
+ <!-- On 4.1 the real name would be "Tile Rendering Mode Cfg (Render Target Configs)" -->
|
||||
+ <packet code="121" name="Tile Rendering Mode Cfg (Color)" cl="R" min_ver="41" max_ver="42">
|
||||
|
||||
<field name="Pad" size="28" start="36" type="uint"/>
|
||||
|
||||
@@ -1099,7 +1263,7 @@
|
||||
<field name="sub-id" size="4" start="0" type="uint" default="3"/>
|
||||
</packet>
|
||||
|
||||
- <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" min_ver="41">
|
||||
+ <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" min_ver="41" max_ver="42">
|
||||
<field name="unused" size="16" start="48" type="uint"/>
|
||||
|
||||
<field name="Z Clear Value" size="32" start="16" type="float"/>
|
||||
@@ -1108,6 +1272,15 @@
|
||||
<field name="sub-id" size="4" start="0" type="uint" default="2"/>
|
||||
</packet>
|
||||
|
||||
+ <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" min_ver="71">
|
||||
+ <field name="unused" size="16" start="48" type="uint"/>
|
||||
+
|
||||
+ <field name="Z Clear Value" size="32" start="16" type="float"/>
|
||||
+
|
||||
+ <field name="Stencil Clear Value" size="8" start="8" type="uint"/>
|
||||
+ <field name="sub-id" size="4" start="0" type="uint" default="1"/>
|
||||
+ </packet>
|
||||
+
|
||||
<packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part1)" cl="R" max_ver="33">
|
||||
<!-- Express this as a 56-bit field? -->
|
||||
<field name="Clear Color next 24 bits" size="24" start="40" type="uint"/>
|
||||
@@ -1117,7 +1290,7 @@
|
||||
<field name="sub-id" size="4" start="0" type="uint" default="4"/>
|
||||
</packet>
|
||||
|
||||
- <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part1)" cl="R" min_ver="41">
|
||||
+ <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part1)" cl="R" min_ver="41" max_ver="42">
|
||||
<!-- Express this as a 56-bit field? -->
|
||||
<field name="Clear Color next 24 bits" size="24" start="40" type="uint"/>
|
||||
<field name="Clear Color low 32 bits" size="32" start="8" type="uint"/>
|
||||
@@ -1126,6 +1299,19 @@
|
||||
<field name="sub-id" size="4" start="0" type="uint" default="3"/>
|
||||
</packet>
|
||||
|
||||
+ <packet code="121" name="Tile Rendering Mode Cfg (Render Target Part1)" cl="R" min_ver="71">
|
||||
+
|
||||
+ <field name="Clear Color low bits" size="32" start="32" type="uint"/>
|
||||
+ <field name="Internal Type and Clamping" size="5" start="27" type="Render Target Type Clamp"/>
|
||||
+ <field name="Internal BPP" size="2" start="25" type="Internal BPP"/>
|
||||
+
|
||||
+ <field name="Stride" size="7" start="18" type="uint" minus_one="true"/>
|
||||
+ <!-- In multiples of 512 bits -->
|
||||
+ <field name="Base Address" size="11" start="7" type="uint"/>
|
||||
+ <field name="Render Target number" size="3" start="3" type="uint"/>
|
||||
+ <field name="sub-id" size="3" start="0" type="uint" default="2"/>
|
||||
+ </packet>
|
||||
+
|
||||
<packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part2)" cl="R" max_ver="33">
|
||||
<!-- Express this as a 56-bit field? -->
|
||||
<field name="Clear Color mid-high 24 bits" size="24" start="40" type="uint"/>
|
||||
@@ -1135,7 +1321,7 @@
|
||||
<field name="sub-id" size="4" start="0" type="uint" default="5"/>
|
||||
</packet>
|
||||
|
||||
- <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part2)" cl="R" min_ver="41">
|
||||
+ <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part2)" cl="R" min_ver="41" max_ver="42">
|
||||
<!-- Express this as a 56-bit field? -->
|
||||
<field name="Clear Color mid-high 24 bits" size="24" start="40" type="uint"/>
|
||||
<field name="Clear Color mid-low 32 bits" size="32" start="8" type="uint"/>
|
||||
@@ -1144,6 +1330,13 @@
|
||||
<field name="sub-id" size="4" start="0" type="uint" default="4"/>
|
||||
</packet>
|
||||
|
||||
+ <packet code="121" name="Tile Rendering Mode Cfg (Render Target Part2)" cl="R" min_ver="71">
|
||||
+ <field name="Clear Color mid bits" size="40" start="24" type="uint"/>
|
||||
+
|
||||
+ <field name="Render Target number" size="3" start="3" type="uint"/>
|
||||
+ <field name="sub-id" size="3" start="0" type="uint" default="3"/>
|
||||
+ </packet>
|
||||
+
|
||||
<packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part3)" cl="R" max_ver="33">
|
||||
<field name="pad" size="11" start="53" type="uint"/>
|
||||
<field name="UIF padded height in UIF blocks" size="13" start="40" type="uint"/>
|
||||
@@ -1155,7 +1348,7 @@
|
||||
<field name="sub-id" size="4" start="0" type="uint" default="6"/>
|
||||
</packet>
|
||||
|
||||
- <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part3)" cl="R" min_ver="41">
|
||||
+ <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part3)" cl="R" min_ver="41" max_ver="42">
|
||||
<field name="pad" size="11" start="53" type="uint"/>
|
||||
<field name="UIF padded height in UIF blocks" size="13" start="40" type="uint"/>
|
||||
<!-- image height is for Y flipping -->
|
||||
@@ -1166,6 +1359,13 @@
|
||||
<field name="sub-id" size="4" start="0" type="uint" default="5"/>
|
||||
</packet>
|
||||
|
||||
+ <packet code="121" name="Tile Rendering Mode Cfg (Render Target Part3)" cl="R" min_ver="71">
|
||||
+ <field name="Clear Color top bits" size="56" start="8" type="uint"/>
|
||||
+
|
||||
+ <field name="Render Target number" size="3" start="3" type="uint"/>
|
||||
+ <field name="sub-id" size="3" start="0" type="uint" default="4"/>
|
||||
+ </packet>
|
||||
+
|
||||
<packet code="124" shortname="tile_coords" name="Tile Coordinates">
|
||||
<field name="tile row number" size="12" start="12" type="uint"/>
|
||||
<field name="tile column number" size="12" start="0" type="uint"/>
|
||||
@@ -1240,7 +1440,7 @@
|
||||
<field name="Coordinate Shader Uniforms Address" size="32" start="32b" type="address"/>
|
||||
</struct>
|
||||
|
||||
- <struct name="GL Shader State Record" min_ver="41">
|
||||
+ <struct name="GL Shader State Record" min_ver="41" max_ver="42">
|
||||
<field name="Point size in shaded vertex data" size="1" start="0" type="bool"/>
|
||||
<field name="Enable clipping" size="1" start="1" type="bool"/>
|
||||
|
||||
@@ -1299,6 +1499,63 @@
|
||||
<field name="Coordinate Shader Uniforms Address" size="32" start="32b" type="address"/>
|
||||
</struct>
|
||||
|
||||
+ <struct name="GL Shader State Record" min_ver="71">
|
||||
+ <field name="Point size in shaded vertex data" size="1" start="0" type="bool"/>
|
||||
+ <field name="Enable clipping" size="1" start="1" type="bool"/>
|
||||
+
|
||||
+ <field name="Vertex ID read by coordinate shader" size="1" start="2" type="bool"/>
|
||||
+ <field name="Instance ID read by coordinate shader" size="1" start="3" type="bool"/>
|
||||
+ <field name="Base Instance ID read by coordinate shader" size="1" start="4" type="bool"/>
|
||||
+ <field name="Vertex ID read by vertex shader" size="1" start="5" type="bool"/>
|
||||
+ <field name="Instance ID read by vertex shader" size="1" start="6" type="bool"/>
|
||||
+ <field name="Base Instance ID read by vertex shader" size="1" start="7" type="bool"/>
|
||||
+
|
||||
+ <field name="Fragment shader does Z writes" size="1" start="8" type="bool"/>
|
||||
+ <field name="Turn off early-z test" size="1" start="9" type="bool"/>
|
||||
+
|
||||
+ <field name="Fragment shader uses real pixel centre W in addition to centroid W2" size="1" start="12" type="bool"/>
|
||||
+ <field name="Enable Sample Rate Shading" size="1" start="13" type="bool"/>
|
||||
+ <field name="Any shader reads hardware-written Primitive ID" size="1" start="14" type="bool"/>
|
||||
+ <field name="Insert Primitive ID as first varying to fragment shader" size="1" start="15" type="bool"/>
|
||||
+ <field name="Turn off scoreboard" size="1" start="16" type="bool"/>
|
||||
+ <field name="Do scoreboard wait on first thread switch" size="1" start="17" type="bool"/>
|
||||
+ <field name="Disable implicit point/line varyings" size="1" start="18" type="bool"/>
|
||||
+ <field name="No prim pack" size="1" start="19" type="bool"/>
|
||||
+ <field name="Never defer FEP depth writes" size="1" start="20" type="bool"/>
|
||||
+
|
||||
+ <field name="Number of varyings in Fragment Shader" size="8" start="3b" type="uint"/>
|
||||
+
|
||||
+ <field name="Coordinate Shader output VPM segment size" size="4" start="4b" type="uint"/>
|
||||
+ <field name="Min Coord Shader output segments required in play in addition to VCM cache size" size="4" start="36" type="uint"/>
|
||||
+
|
||||
+ <field name="Coordinate Shader input VPM segment size" size="4" start="5b" type="uint"/>
|
||||
+ <field name="Min Coord Shader input segments required in play" size="4" start="44" type="uint" minus_one="true"/>
|
||||
+
|
||||
+ <field name="Vertex Shader output VPM segment size" size="4" start="6b" type="uint"/>
|
||||
+ <field name="Min Vertex Shader output segments required in play in addition to VCM cache size" size="4" start="52" type="uint"/>
|
||||
+
|
||||
+ <field name="Vertex Shader input VPM segment size" size="4" start="7b" type="uint"/>
|
||||
+ <field name="Min Vertex Shader input segments required in play" size="4" start="60" type="uint" minus_one="true"/>
|
||||
+
|
||||
+ <field name="Fragment Shader Code Address" size="29" start="67" type="address"/>
|
||||
+ <field name="Fragment Shader 4-way threadable" size="1" start="64" type="bool"/>
|
||||
+ <field name="Fragment Shader start in final thread section" size="1" start="65" type="bool"/>
|
||||
+ <field name="Fragment Shader Propagate NaNs" size="1" start="66" type="bool"/>
|
||||
+ <field name="Fragment Shader Uniforms Address" size="32" start="12b" type="address"/>
|
||||
+
|
||||
+ <field name="Vertex Shader Code Address" size="29" start="131" type="address"/>
|
||||
+ <field name="Vertex Shader 4-way threadable" size="1" start="128" type="bool"/>
|
||||
+ <field name="Vertex Shader start in final thread section" size="1" start="129" type="bool"/>
|
||||
+ <field name="Vertex Shader Propagate NaNs" size="1" start="130" type="bool"/>
|
||||
+ <field name="Vertex Shader Uniforms Address" size="32" start="20b" type="address"/>
|
||||
+
|
||||
+ <field name="Coordinate Shader Code Address" size="29" start="195" type="address"/>
|
||||
+ <field name="Coordinate Shader 4-way threadable" size="1" start="192" type="bool"/>
|
||||
+ <field name="Coordinate Shader start in final thread section" size="1" start="193" type="bool"/>
|
||||
+ <field name="Coordinate Shader Propagate NaNs" size="1" start="194" type="bool"/>
|
||||
+ <field name="Coordinate Shader Uniforms Address" size="32" start="28b" type="address"/>
|
||||
+ </struct>
|
||||
+
|
||||
<struct name="Geometry Shader State Record" min_ver="41">
|
||||
<field name="Geometry Bin Mode Shader Code Address" size="29" start="3" type="address"/>
|
||||
<field name="Geometry Bin Mode Shader 4-way threadable" size="1" start="0" type="bool"/>
|
||||
@@ -1543,7 +1800,7 @@
|
||||
<field name="Offset Format 8" size="1" start="0" type="bool"/>
|
||||
</struct>
|
||||
|
||||
- <struct name="TMU Config Parameter 2" min_ver="42">
|
||||
+ <struct name="TMU Config Parameter 2" min_ver="42" max_ver="42">
|
||||
<field name="Pad" size="7" start="25" type="uint"/>
|
||||
<field name="LOD Query" size="1" start="24" type="bool"/>
|
||||
<field name="Op" size="4" start="20" type="TMU Op"/>
|
||||
@@ -1558,6 +1815,23 @@
|
||||
<field name="Offset Format 8" size="1" start="0" type="bool"/>
|
||||
</struct>
|
||||
|
||||
+ <struct name="TMU Config Parameter 2" min_ver="71">
|
||||
+ <field name="Pad" size="5" start="27" type="uint"/>
|
||||
+ <field name="Write conversion" size="1" start="26" type="bool"/>
|
||||
+ <field name="DIM query" size="1" start="25" type="bool"/>
|
||||
+ <field name="LOD Query" size="1" start="24" type="bool"/>
|
||||
+ <field name="Op" size="4" start="20" type="TMU Op"/>
|
||||
+ <field name="Offset R" size="4" start="16" type="int"/>
|
||||
+ <field name="Offset T" size="4" start="12" type="int"/>
|
||||
+ <field name="Offset S" size="4" start="8" type="int"/>
|
||||
+ <field name="Gather Mode" size="1" start="7" type="bool"/>
|
||||
+ <field name="Gather Component" size="2" start="5" type="uint"/>
|
||||
+ <field name="Coefficient Mode" size="1" start="4" type="bool"/>
|
||||
+ <field name="Sample Number" size="2" start="2" type="uint"/>
|
||||
+ <field name="Disable AutoLOD" size="1" start="1" type="bool"/>
|
||||
+ <field name="Offset Format 8" size="1" start="0" type="bool"/>
|
||||
+ </struct>
|
||||
+
|
||||
<struct name="Texture Shader State" max_ver="33">
|
||||
<field name="UIF XOR disable" size="1" start="255" type="bool"/>
|
||||
<field name="Level 0 is strictly UIF" size="1" start="254" type="bool"/>
|
||||
@@ -1611,7 +1885,7 @@
|
||||
<field name="Filter" size="4" start="0" type="TMU Filter"/>
|
||||
</struct>
|
||||
|
||||
- <struct name="Texture Shader State" min_ver="41">
|
||||
+ <struct name="Texture Shader State" min_ver="41" max_ver="42">
|
||||
<field name="Pad" size="56" start="136" type="uint"/>
|
||||
<field name="UIF XOR disable" size="1" start="135" type="bool"/>
|
||||
<field name="Level 0 is strictly UIF" size="1" start="134" type="bool"/>
|
||||
@@ -1652,6 +1926,82 @@
|
||||
<field name="Flip texture X Axis" size="1" start="0" type="bool"/>
|
||||
</struct>
|
||||
|
||||
+ <struct name="Texture Shader State" min_ver="71">
|
||||
+ <field name="Pad" size="2" start="190" type="uint"/>
|
||||
+ <!-- When we use an address type, there is an implicit requirement
|
||||
+ that the address is a 32-bit that is encoded starting at a 32-bit
|
||||
+ aligned bit offset into the packet. If the address field has less than
|
||||
+ 32 bits, it is assumed that the address is aligned. For example, a
|
||||
+ 26-bit address field is expected to be 64-byte aligned (6 lsb bits
|
||||
+ are 0) and that this will be encoded into a packet starting at bit
|
||||
+ offset 6 into a 32-bit dword (since bits 0..5 of the address are
|
||||
+ implicitly 0 and don't need to be explicitly encoded).
|
||||
+
|
||||
+ Unfortunately, the CB address below doesn't match this requirement:
|
||||
+ it starts at bit 138, which is 10 bits into a 32-bit dword, but it
|
||||
+ represents a 64-byte aligned address (6 lsb bits are 0), so we cannot
|
||||
+ encode it as an address type. To fix this we encode these addresses
|
||||
+ as uint types which has two implications:
|
||||
+ 1. the driver is responsible for manually adding the buffer objects
|
||||
+ for these addresses to the job BO list.
|
||||
+ 2. the driver needs to pass an actual 26-bit address value by manually
|
||||
+ shifting the 6 lsb bits (that are implicitly 0).
|
||||
+ -->
|
||||
+ <field name="texture_base pointer_Cr" size="26" start="164" type="uint"/>
|
||||
+ <field name="texture base pointer Cb" size="26" start="138" type="uint"/>
|
||||
+ <field name="Chroma offset y" size="1" start="137" type="uint"/>
|
||||
+ <field name="Chroma offset x" size="1" start="136" type="uint"/>
|
||||
+
|
||||
+ <field name="UIF XOR disable" size="1" start="135" type="bool"/>
|
||||
+ <field name="Level 0 is strictly UIF" size="1" start="134" type="bool"/>
|
||||
+ <field name="Level 0 XOR enable" size="1" start="132" type="bool"/>
|
||||
+ <field name="Level 0 UB_PAD" size="4" start="128" type="uint"/>
|
||||
+
|
||||
+ <field name="Base Level" size="4" start="124" type="uint"/>
|
||||
+ <field name="Max Level" size="4" start="120" type="uint"/>
|
||||
+
|
||||
+ <field name="Swizzle A" size="3" start="117" type="uint">
|
||||
+ <value name="Swizzle Zero" value="0"/>
|
||||
+ <value name="Swizzle One" value="1"/>
|
||||
+ <value name="Swizzle Red" value="2"/>
|
||||
+ <value name="Swizzle Green" value="3"/>
|
||||
+ <value name="Swizzle Blue" value="4"/>
|
||||
+ <value name="Swizzle Alpha" value="5"/>
|
||||
+ </field>
|
||||
+
|
||||
+ <field name="Swizzle B" size="3" start="114" type="uint"/>
|
||||
+ <field name="Swizzle G" size="3" start="111" type="uint"/>
|
||||
+ <field name="Swizzle R" size="3" start="108" type="uint"/>
|
||||
+ <field name="Extended" size="1" start="107" type="bool"/>
|
||||
+
|
||||
+ <field name="Texture type" size="7" start="100" type="uint"/>
|
||||
+ <field name="Image Depth" size="14" start="86" type="uint"/>
|
||||
+ <field name="Image Height" size="14" start="72" type="uint"/>
|
||||
+ <field name="Image Width" size="14" start="58" type="uint"/>
|
||||
+
|
||||
+ <!-- V3D 7.1.2 doesn't have the RB swap bit and has Array Stride starting
|
||||
+ at bit 32. However, 7.1.5 included the RB swap bit at bit 32 and has
|
||||
+ Array Stride starting at 33, which is backwards incompatible.
|
||||
+ We use the definition from 7.1.5.
|
||||
+ -->
|
||||
+ <field name="Array Stride (64-byte aligned)" size="24" start="33" type="uint"/>
|
||||
+ <field name="R/B swap" size="1" start="32" type="bool"/>
|
||||
+
|
||||
+ <field name="Texture base pointer" size="32" start="0" type="address"/>
|
||||
+
|
||||
+ <field name="Reverse" size="1" start="5" type="bool"/>
|
||||
+ <field name="Transfer func" size="3" start="2" type="uint">
|
||||
+ <value name="Transfer Func None" value="0"/>
|
||||
+ <value name="Transfer Func sRGB" value="1"/>
|
||||
+ <value name="Transfer Func PQ" value="2"/>
|
||||
+ <value name="Transfer Func HLG" value="3"/>
|
||||
+ <value name="Transfer Func PQ BT1886" value="4"/>
|
||||
+ <value name="Transfer Func HLG BT1886" value="5"/>
|
||||
+ </field>
|
||||
+ <field name="Flip texture Y Axis" size="1" start="1" type="bool"/>
|
||||
+ <field name="Flip texture X Axis" size="1" start="0" type="bool"/>
|
||||
+ </struct>
|
||||
+
|
||||
<struct name="Sampler State" min_ver="41">
|
||||
<field name="Border color word 3" size="32" start="160" type="uint"/>
|
||||
<field name="Border color word 2" size="32" start="128" type="uint"/>
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,65 @@
|
||||
From 569cbe4229df737ce5915c4be2cad534707fb4f7 Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Tue, 9 Nov 2021 08:50:51 +0100
|
||||
Subject: [PATCH 004/139] broadcom/common: retrieve V3D revision number
|
||||
|
||||
The subrev field from the hub ident3 register is bumped with every
|
||||
hardware revision doing backwards incompatible changes so we want to
|
||||
keep track of this.
|
||||
|
||||
Instead of modifying the 'ver' field info to accommodate subrev info,
|
||||
which would require a lot of changes, simply add a new 'rev' field in
|
||||
devinfo that we can use when we need to make changes based on the
|
||||
revision number of a hardware release.
|
||||
---
|
||||
src/broadcom/common/v3d_device_info.c | 14 +++++++++++++-
|
||||
src/broadcom/common/v3d_device_info.h | 3 +++
|
||||
2 files changed, 16 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/src/broadcom/common/v3d_device_info.c b/src/broadcom/common/v3d_device_info.c
|
||||
index 7e0862f1f02..7512fe3a06b 100644
|
||||
--- a/src/broadcom/common/v3d_device_info.c
|
||||
+++ b/src/broadcom/common/v3d_device_info.c
|
||||
@@ -36,6 +36,9 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i
|
||||
struct drm_v3d_get_param ident1 = {
|
||||
.param = DRM_V3D_PARAM_V3D_CORE0_IDENT1,
|
||||
};
|
||||
+ struct drm_v3d_get_param hub_ident3 = {
|
||||
+ .param = DRM_V3D_PARAM_V3D_HUB_IDENT3,
|
||||
+ };
|
||||
int ret;
|
||||
|
||||
ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &ident0);
|
||||
@@ -76,5 +79,14 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i
|
||||
return false;
|
||||
}
|
||||
|
||||
- return true;
|
||||
+ ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &hub_ident3);
|
||||
+ if (ret != 0) {
|
||||
+ fprintf(stderr, "Couldn't get V3D core HUB IDENT3: %s\n",
|
||||
+ strerror(errno));
|
||||
+ return false;
|
||||
+ }
|
||||
+
|
||||
+ devinfo->rev = (hub_ident3.value >> 8) & 0xff;
|
||||
+
|
||||
+ return true;
|
||||
}
|
||||
diff --git a/src/broadcom/common/v3d_device_info.h b/src/broadcom/common/v3d_device_info.h
|
||||
index 97abd9b8d9f..32cb65cf81f 100644
|
||||
--- a/src/broadcom/common/v3d_device_info.h
|
||||
+++ b/src/broadcom/common/v3d_device_info.h
|
||||
@@ -34,6 +34,9 @@ struct v3d_device_info {
|
||||
/** Simple V3D version: major * 10 + minor */
|
||||
uint8_t ver;
|
||||
|
||||
+ /** V3D revision number */
|
||||
+ uint8_t rev;
|
||||
+
|
||||
/** Size of the VPM, in bytes. */
|
||||
int vpm_size;
|
||||
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,91 @@
|
||||
From c260843c882d25bd31e308566b45d4517fda0fa2 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Wed, 17 Nov 2021 14:40:47 +0100
|
||||
Subject: [PATCH 005/139] broadcom/common: add some common v71 helpers
|
||||
|
||||
---
|
||||
src/broadcom/common/v3d_util.c | 27 +++++++++++++++++++++++++++
|
||||
src/broadcom/common/v3d_util.h | 27 +++++++++++++++++++++++++++
|
||||
2 files changed, 54 insertions(+)
|
||||
|
||||
diff --git a/src/broadcom/common/v3d_util.c b/src/broadcom/common/v3d_util.c
|
||||
index 57872a923d3..26f5c6b336f 100644
|
||||
--- a/src/broadcom/common/v3d_util.c
|
||||
+++ b/src/broadcom/common/v3d_util.c
|
||||
@@ -170,3 +170,30 @@ v3d_hw_prim_type(enum mesa_prim prim_type)
|
||||
unreachable("Unsupported primitive type");
|
||||
}
|
||||
}
|
||||
+
|
||||
+uint32_t
|
||||
+v3d_internal_bpp_words(uint32_t internal_bpp)
|
||||
+{
|
||||
+ switch (internal_bpp) {
|
||||
+ case 0 /* V3D_INTERNAL_BPP_32 */:
|
||||
+ return 1;
|
||||
+ case 1 /* V3D_INTERNAL_BPP_64 */:
|
||||
+ return 2;
|
||||
+ case 2 /* V3D_INTERNAL_BPP_128 */:
|
||||
+ return 4;
|
||||
+ default:
|
||||
+ unreachable("Unsupported internal BPP");
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+uint32_t
|
||||
+v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width,
|
||||
+ uint32_t bpp)
|
||||
+{
|
||||
+ /* stride in multiples of 128 bits, and covers 2 rows. This is the
|
||||
+ * reason we divide by 2 instead of 4, as we divide number of 32-bit
|
||||
+ * words per row by 2.
|
||||
+ */
|
||||
+
|
||||
+ return (tile_width * bpp) / 2;
|
||||
+}
|
||||
diff --git a/src/broadcom/common/v3d_util.h b/src/broadcom/common/v3d_util.h
|
||||
index eb802b77f67..864fc949ffa 100644
|
||||
--- a/src/broadcom/common/v3d_util.h
|
||||
+++ b/src/broadcom/common/v3d_util.h
|
||||
@@ -24,6 +24,7 @@
|
||||
#ifndef V3D_UTIL_H
|
||||
#define V3D_UTIL_H
|
||||
|
||||
+#include "util/macros.h"
|
||||
#include "common/v3d_device_info.h"
|
||||
#include "pipe/p_defines.h"
|
||||
|
||||
@@ -46,4 +47,30 @@ v3d_translate_pipe_swizzle(enum pipe_swizzle swizzle);
|
||||
uint32_t
|
||||
v3d_hw_prim_type(enum mesa_prim prim_type);
|
||||
|
||||
+uint32_t
|
||||
+v3d_internal_bpp_words(uint32_t internal_bpp);
|
||||
+
|
||||
+/* Some configuration packets want the size on log2, but starting at 0 for
|
||||
+ * size 8.
|
||||
+ */
|
||||
+static inline uint8_t
|
||||
+log2_tile_size(uint32_t size)
|
||||
+{
|
||||
+ switch(size) {
|
||||
+ case 8:
|
||||
+ return 0;
|
||||
+ case 16:
|
||||
+ return 1;
|
||||
+ case 32:
|
||||
+ return 2;
|
||||
+ case 64:
|
||||
+ return 3;
|
||||
+ default:
|
||||
+ unreachable("Unsupported tile width/height");
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+uint32_t
|
||||
+v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width,
|
||||
+ uint32_t bpp);
|
||||
#endif
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,53 @@
|
||||
From a5211a4d71acc53183d2a90eb1694d8cce6eb44f Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Thu, 5 Aug 2021 01:03:11 +0200
|
||||
Subject: [PATCH 006/139] broadcom/qpu: add comments on waddr not used on V3D
|
||||
7.x
|
||||
|
||||
---
|
||||
src/broadcom/qpu/qpu_instr.h | 22 +++++++++++-----------
|
||||
1 file changed, 11 insertions(+), 11 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
|
||||
index 2e133472698..45a0cad9760 100644
|
||||
--- a/src/broadcom/qpu/qpu_instr.h
|
||||
+++ b/src/broadcom/qpu/qpu_instr.h
|
||||
@@ -88,11 +88,11 @@ enum v3d_qpu_uf {
|
||||
};
|
||||
|
||||
enum v3d_qpu_waddr {
|
||||
- V3D_QPU_WADDR_R0 = 0,
|
||||
- V3D_QPU_WADDR_R1 = 1,
|
||||
- V3D_QPU_WADDR_R2 = 2,
|
||||
- V3D_QPU_WADDR_R3 = 3,
|
||||
- V3D_QPU_WADDR_R4 = 4,
|
||||
+ V3D_QPU_WADDR_R0 = 0, /* Reserved on V3D 7.x */
|
||||
+ V3D_QPU_WADDR_R1 = 1, /* Reserved on V3D 7.x */
|
||||
+ V3D_QPU_WADDR_R2 = 2, /* Reserved on V3D 7.x */
|
||||
+ V3D_QPU_WADDR_R3 = 3, /* Reserved on V3D 7.x */
|
||||
+ V3D_QPU_WADDR_R4 = 4, /* Reserved on V3D 7.x */
|
||||
V3D_QPU_WADDR_R5 = 5,
|
||||
V3D_QPU_WADDR_NOP = 6,
|
||||
V3D_QPU_WADDR_TLB = 7,
|
||||
@@ -108,12 +108,12 @@ enum v3d_qpu_waddr {
|
||||
V3D_QPU_WADDR_SYNC = 16,
|
||||
V3D_QPU_WADDR_SYNCU = 17,
|
||||
V3D_QPU_WADDR_SYNCB = 18,
|
||||
- V3D_QPU_WADDR_RECIP = 19,
|
||||
- V3D_QPU_WADDR_RSQRT = 20,
|
||||
- V3D_QPU_WADDR_EXP = 21,
|
||||
- V3D_QPU_WADDR_LOG = 22,
|
||||
- V3D_QPU_WADDR_SIN = 23,
|
||||
- V3D_QPU_WADDR_RSQRT2 = 24,
|
||||
+ V3D_QPU_WADDR_RECIP = 19, /* Reserved on V3D 7.x */
|
||||
+ V3D_QPU_WADDR_RSQRT = 20, /* Reserved on V3D 7.x */
|
||||
+ V3D_QPU_WADDR_EXP = 21, /* Reserved on V3D 7.x */
|
||||
+ V3D_QPU_WADDR_LOG = 22, /* Reserved on V3D 7.x */
|
||||
+ V3D_QPU_WADDR_SIN = 23, /* Reserved on V3D 7.x */
|
||||
+ V3D_QPU_WADDR_RSQRT2 = 24, /* Reserved on V3D 7.x */
|
||||
V3D_QPU_WADDR_TMUC = 32,
|
||||
V3D_QPU_WADDR_TMUS = 33,
|
||||
V3D_QPU_WADDR_TMUT = 34,
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,60 @@
|
||||
From 0ccf3043e4a584e5592bb7fad737d5d98ed23db0 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Thu, 5 Aug 2021 01:00:47 +0200
|
||||
Subject: [PATCH 007/139] broadcom/qpu: set V3D 7.x names for some waddr
|
||||
aliasing
|
||||
|
||||
V3D 7.x got rid of the accumulator, but still uses the values for
|
||||
WADDR_R5 and WADDR_R5REP, so let's return a proper name and add some
|
||||
aliases.
|
||||
---
|
||||
src/broadcom/qpu/qpu_instr.c | 8 ++++++++
|
||||
src/broadcom/qpu/qpu_instr.h | 6 ++++--
|
||||
2 files changed, 12 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
|
||||
index 60dabf74e8e..7759fb0efdf 100644
|
||||
--- a/src/broadcom/qpu/qpu_instr.c
|
||||
+++ b/src/broadcom/qpu/qpu_instr.c
|
||||
@@ -35,6 +35,14 @@ v3d_qpu_magic_waddr_name(const struct v3d_device_info *devinfo,
|
||||
if (devinfo->ver < 40 && waddr == V3D_QPU_WADDR_TMU)
|
||||
return "tmu";
|
||||
|
||||
+ /* V3D 7.x QUAD and REP alias R5 and R5REP in the table below
|
||||
+ */
|
||||
+ if (devinfo->ver >= 71 && waddr == V3D_QPU_WADDR_QUAD)
|
||||
+ return "quad";
|
||||
+
|
||||
+ if (devinfo->ver >= 71 && waddr == V3D_QPU_WADDR_REP)
|
||||
+ return "rep";
|
||||
+
|
||||
static const char *waddr_magic[] = {
|
||||
[V3D_QPU_WADDR_R0] = "r0",
|
||||
[V3D_QPU_WADDR_R1] = "r1",
|
||||
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
|
||||
index 45a0cad9760..19bf721dbe1 100644
|
||||
--- a/src/broadcom/qpu/qpu_instr.h
|
||||
+++ b/src/broadcom/qpu/qpu_instr.h
|
||||
@@ -93,7 +93,8 @@ enum v3d_qpu_waddr {
|
||||
V3D_QPU_WADDR_R2 = 2, /* Reserved on V3D 7.x */
|
||||
V3D_QPU_WADDR_R3 = 3, /* Reserved on V3D 7.x */
|
||||
V3D_QPU_WADDR_R4 = 4, /* Reserved on V3D 7.x */
|
||||
- V3D_QPU_WADDR_R5 = 5,
|
||||
+ V3D_QPU_WADDR_R5 = 5, /* V3D 4.x */
|
||||
+ V3D_QPU_WADDR_QUAD = 5, /* V3D 7.x */
|
||||
V3D_QPU_WADDR_NOP = 6,
|
||||
V3D_QPU_WADDR_TLB = 7,
|
||||
V3D_QPU_WADDR_TLBU = 8,
|
||||
@@ -129,7 +130,8 @@ enum v3d_qpu_waddr {
|
||||
V3D_QPU_WADDR_TMUHSCM = 44,
|
||||
V3D_QPU_WADDR_TMUHSF = 45,
|
||||
V3D_QPU_WADDR_TMUHSLOD = 46,
|
||||
- V3D_QPU_WADDR_R5REP = 55,
|
||||
+ V3D_QPU_WADDR_R5REP = 55, /* V3D 4.x */
|
||||
+ V3D_QPU_WADDR_REP = 55, /* V3D 7.x */
|
||||
};
|
||||
|
||||
struct v3d_qpu_flags {
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,241 @@
|
||||
From 18de3cc85cf8bbe294e044f7a12abe14e554de0a Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Sun, 19 Sep 2021 03:20:18 +0200
|
||||
Subject: [PATCH 008/139] broadcom/compiler: rename small_imm to small_imm_b
|
||||
|
||||
Current small_imm is associated with the "B" read address.
|
||||
|
||||
We do this change in advance for v71 support, where we will have 4
|
||||
different small_imm (a/b/c/d), so we start with a renaming.
|
||||
---
|
||||
src/broadcom/compiler/qpu_schedule.c | 22 +++++++++----------
|
||||
.../compiler/vir_opt_small_immediates.c | 4 ++--
|
||||
src/broadcom/compiler/vir_to_qpu.c | 2 +-
|
||||
src/broadcom/qpu/qpu_disasm.c | 2 +-
|
||||
src/broadcom/qpu/qpu_instr.h | 2 +-
|
||||
src/broadcom/qpu/qpu_pack.c | 22 +++++++++----------
|
||||
6 files changed, 27 insertions(+), 27 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
|
||||
index 3b32b48f86f..a10fa03ed10 100644
|
||||
--- a/src/broadcom/compiler/qpu_schedule.c
|
||||
+++ b/src/broadcom/compiler/qpu_schedule.c
|
||||
@@ -160,7 +160,7 @@ process_mux_deps(struct schedule_state *state, struct schedule_node *n,
|
||||
add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
|
||||
break;
|
||||
case V3D_QPU_MUX_B:
|
||||
- if (!n->inst->qpu.sig.small_imm) {
|
||||
+ if (!n->inst->qpu.sig.small_imm_b) {
|
||||
add_read_dep(state,
|
||||
state->last_rf[n->inst->qpu.raddr_b], n);
|
||||
}
|
||||
@@ -615,7 +615,7 @@ qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst,
|
||||
return true;
|
||||
|
||||
if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
|
||||
- !inst->sig.small_imm && (inst->raddr_b == waddr))
|
||||
+ !inst->sig.small_imm_b && (inst->raddr_b == waddr))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
@@ -790,11 +790,11 @@ qpu_raddrs_used(const struct v3d_qpu_instr *a,
|
||||
uint64_t raddrs_used = 0;
|
||||
if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A))
|
||||
raddrs_used |= (1ll << a->raddr_a);
|
||||
- if (!a->sig.small_imm && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B))
|
||||
+ if (!a->sig.small_imm_b && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B))
|
||||
raddrs_used |= (1ll << a->raddr_b);
|
||||
if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A))
|
||||
raddrs_used |= (1ll << b->raddr_a);
|
||||
- if (!b->sig.small_imm && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B))
|
||||
+ if (!b->sig.small_imm_b && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B))
|
||||
raddrs_used |= (1ll << b->raddr_b);
|
||||
|
||||
return raddrs_used;
|
||||
@@ -816,16 +816,16 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result,
|
||||
if (naddrs > 2)
|
||||
return false;
|
||||
|
||||
- if ((add_instr->sig.small_imm || mul_instr->sig.small_imm)) {
|
||||
+ if ((add_instr->sig.small_imm_b || mul_instr->sig.small_imm_b)) {
|
||||
if (naddrs > 1)
|
||||
return false;
|
||||
|
||||
- if (add_instr->sig.small_imm && mul_instr->sig.small_imm)
|
||||
+ if (add_instr->sig.small_imm_b && mul_instr->sig.small_imm_b)
|
||||
if (add_instr->raddr_b != mul_instr->raddr_b)
|
||||
return false;
|
||||
|
||||
- result->sig.small_imm = true;
|
||||
- result->raddr_b = add_instr->sig.small_imm ?
|
||||
+ result->sig.small_imm_b = true;
|
||||
+ result->raddr_b = add_instr->sig.small_imm_b ?
|
||||
add_instr->raddr_b : mul_instr->raddr_b;
|
||||
}
|
||||
|
||||
@@ -836,7 +836,7 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result,
|
||||
raddrs_used &= ~(1ll << raddr_a);
|
||||
result->raddr_a = raddr_a;
|
||||
|
||||
- if (!result->sig.small_imm) {
|
||||
+ if (!result->sig.small_imm_b) {
|
||||
if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) &&
|
||||
raddr_a == add_instr->raddr_b) {
|
||||
if (add_instr->alu.add.a == V3D_QPU_MUX_B)
|
||||
@@ -1025,7 +1025,7 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
|
||||
merge.sig.ldtmu |= b->sig.ldtmu;
|
||||
merge.sig.ldvary |= b->sig.ldvary;
|
||||
merge.sig.ldvpm |= b->sig.ldvpm;
|
||||
- merge.sig.small_imm |= b->sig.small_imm;
|
||||
+ merge.sig.small_imm_b |= b->sig.small_imm_b;
|
||||
merge.sig.ldtlb |= b->sig.ldtlb;
|
||||
merge.sig.ldtlbu |= b->sig.ldtlbu;
|
||||
merge.sig.ucb |= b->sig.ucb;
|
||||
@@ -1614,7 +1614,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
|
||||
return false;
|
||||
|
||||
if (inst->raddr_b < 3 &&
|
||||
- !inst->sig.small_imm &&
|
||||
+ !inst->sig.small_imm_b &&
|
||||
v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) {
|
||||
return false;
|
||||
}
|
||||
diff --git a/src/broadcom/compiler/vir_opt_small_immediates.c b/src/broadcom/compiler/vir_opt_small_immediates.c
|
||||
index 47d7722968d..df0d6c36c9b 100644
|
||||
--- a/src/broadcom/compiler/vir_opt_small_immediates.c
|
||||
+++ b/src/broadcom/compiler/vir_opt_small_immediates.c
|
||||
@@ -80,7 +80,7 @@ vir_opt_small_immediates(struct v3d_compile *c)
|
||||
*/
|
||||
struct v3d_qpu_sig new_sig = inst->qpu.sig;
|
||||
uint32_t sig_packed;
|
||||
- new_sig.small_imm = true;
|
||||
+ new_sig.small_imm_b = true;
|
||||
if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig_packed))
|
||||
continue;
|
||||
|
||||
@@ -89,7 +89,7 @@ vir_opt_small_immediates(struct v3d_compile *c)
|
||||
vir_dump_inst(c, inst);
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
- inst->qpu.sig.small_imm = true;
|
||||
+ inst->qpu.sig.small_imm_b = true;
|
||||
inst->qpu.raddr_b = packed;
|
||||
|
||||
inst->src[i].file = QFILE_SMALL_IMM;
|
||||
diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c
|
||||
index 45e6bfa1470..15c2e3674c2 100644
|
||||
--- a/src/broadcom/compiler/vir_to_qpu.c
|
||||
+++ b/src/broadcom/compiler/vir_to_qpu.c
|
||||
@@ -94,7 +94,7 @@ static void
|
||||
set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
|
||||
{
|
||||
if (src.smimm) {
|
||||
- assert(instr->sig.small_imm);
|
||||
+ assert(instr->sig.small_imm_b);
|
||||
*mux = V3D_QPU_MUX_B;
|
||||
return;
|
||||
}
|
||||
diff --git a/src/broadcom/qpu/qpu_disasm.c b/src/broadcom/qpu/qpu_disasm.c
|
||||
index 28fb2357b97..6aca3c28e78 100644
|
||||
--- a/src/broadcom/qpu/qpu_disasm.c
|
||||
+++ b/src/broadcom/qpu/qpu_disasm.c
|
||||
@@ -62,7 +62,7 @@ v3d_qpu_disasm_raddr(struct disasm_state *disasm,
|
||||
if (mux == V3D_QPU_MUX_A) {
|
||||
append(disasm, "rf%d", instr->raddr_a);
|
||||
} else if (mux == V3D_QPU_MUX_B) {
|
||||
- if (instr->sig.small_imm) {
|
||||
+ if (instr->sig.small_imm_b) {
|
||||
uint32_t val;
|
||||
ASSERTED bool ok =
|
||||
v3d_qpu_small_imm_unpack(disasm->devinfo,
|
||||
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
|
||||
index 19bf721dbe1..9cd831863b4 100644
|
||||
--- a/src/broadcom/qpu/qpu_instr.h
|
||||
+++ b/src/broadcom/qpu/qpu_instr.h
|
||||
@@ -50,7 +50,7 @@ struct v3d_qpu_sig {
|
||||
bool ldvpm:1;
|
||||
bool ldtlb:1;
|
||||
bool ldtlbu:1;
|
||||
- bool small_imm:1;
|
||||
+ bool small_imm_b:1;
|
||||
bool ucb:1;
|
||||
bool rotate:1;
|
||||
bool wrtmuc:1;
|
||||
diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
|
||||
index a875683c6f8..beac591d3c1 100644
|
||||
--- a/src/broadcom/qpu/qpu_pack.c
|
||||
+++ b/src/broadcom/qpu/qpu_pack.c
|
||||
@@ -112,7 +112,7 @@
|
||||
#define LDTMU .ldtmu = true
|
||||
#define LDVARY .ldvary = true
|
||||
#define LDVPM .ldvpm = true
|
||||
-#define SMIMM .small_imm = true
|
||||
+#define SMIMM_B .small_imm_b = true
|
||||
#define LDTLB .ldtlb = true
|
||||
#define LDTLBU .ldtlbu = true
|
||||
#define UCB .ucb = true
|
||||
@@ -135,8 +135,8 @@ static const struct v3d_qpu_sig v33_sig_map[] = {
|
||||
[11] = { THRSW, LDVARY, LDUNIF },
|
||||
[12] = { LDVARY, LDTMU, },
|
||||
[13] = { THRSW, LDVARY, LDTMU, },
|
||||
- [14] = { SMIMM, LDVARY, },
|
||||
- [15] = { SMIMM, },
|
||||
+ [14] = { SMIMM_B, LDVARY, },
|
||||
+ [15] = { SMIMM_B, },
|
||||
[16] = { LDTLB, },
|
||||
[17] = { LDTLBU, },
|
||||
/* 18-21 reserved */
|
||||
@@ -148,8 +148,8 @@ static const struct v3d_qpu_sig v33_sig_map[] = {
|
||||
[27] = { THRSW, LDVPM, LDUNIF },
|
||||
[28] = { LDVPM, LDTMU, },
|
||||
[29] = { THRSW, LDVPM, LDTMU, },
|
||||
- [30] = { SMIMM, LDVPM, },
|
||||
- [31] = { SMIMM, },
|
||||
+ [30] = { SMIMM_B, LDVPM, },
|
||||
+ [31] = { SMIMM_B, },
|
||||
};
|
||||
|
||||
static const struct v3d_qpu_sig v40_sig_map[] = {
|
||||
@@ -167,8 +167,8 @@ static const struct v3d_qpu_sig v40_sig_map[] = {
|
||||
[10] = { LDVARY, LDUNIF },
|
||||
[11] = { THRSW, LDVARY, LDUNIF },
|
||||
/* 12-13 reserved */
|
||||
- [14] = { SMIMM, LDVARY, },
|
||||
- [15] = { SMIMM, },
|
||||
+ [14] = { SMIMM_B, LDVARY, },
|
||||
+ [15] = { SMIMM_B, },
|
||||
[16] = { LDTLB, },
|
||||
[17] = { LDTLBU, },
|
||||
[18] = { WRTMUC },
|
||||
@@ -178,7 +178,7 @@ static const struct v3d_qpu_sig v40_sig_map[] = {
|
||||
[22] = { UCB, },
|
||||
[23] = { ROT, },
|
||||
/* 24-30 reserved */
|
||||
- [31] = { SMIMM, LDTMU, },
|
||||
+ [31] = { SMIMM_B, LDTMU, },
|
||||
};
|
||||
|
||||
static const struct v3d_qpu_sig v41_sig_map[] = {
|
||||
@@ -197,8 +197,8 @@ static const struct v3d_qpu_sig v41_sig_map[] = {
|
||||
[11] = { THRSW, LDVARY, LDUNIF },
|
||||
[12] = { LDUNIFRF },
|
||||
[13] = { THRSW, LDUNIFRF },
|
||||
- [14] = { SMIMM, LDVARY, },
|
||||
- [15] = { SMIMM, },
|
||||
+ [14] = { SMIMM_B, LDVARY },
|
||||
+ [15] = { SMIMM_B, },
|
||||
[16] = { LDTLB, },
|
||||
[17] = { LDTLBU, },
|
||||
[18] = { WRTMUC },
|
||||
@@ -210,7 +210,7 @@ static const struct v3d_qpu_sig v41_sig_map[] = {
|
||||
[24] = { LDUNIFA},
|
||||
[25] = { LDUNIFARF },
|
||||
/* 26-30 reserved */
|
||||
- [31] = { SMIMM, LDTMU, },
|
||||
+ [31] = { SMIMM_B, LDTMU, },
|
||||
};
|
||||
|
||||
bool
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,53 @@
|
||||
From 0e87405fe73694c173b7ce14c3d60611f241922c Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Thu, 5 Aug 2021 00:50:12 +0200
|
||||
Subject: [PATCH 009/139] broadcom/compiler: add small_imm a/c/d on v3d_qpu_sig
|
||||
|
||||
small_imm_a, small_imm_c and small_imm_d added on top of the already
|
||||
existing small_imm_b, as V3D 7.1 defines 4 small immediates, tied to
|
||||
the 4 raddr. Note that this is only the definition, and just an inst
|
||||
validation rule to check that they are not used before v71. Any real use is
|
||||
still pending.
|
||||
---
|
||||
src/broadcom/compiler/qpu_validate.c | 5 +++++
|
||||
src/broadcom/qpu/qpu_instr.h | 5 ++++-
|
||||
2 files changed, 9 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c
|
||||
index 2cc7a0eb0ae..12788692432 100644
|
||||
--- a/src/broadcom/compiler/qpu_validate.c
|
||||
+++ b/src/broadcom/compiler/qpu_validate.c
|
||||
@@ -115,6 +115,11 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
|
||||
if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
|
||||
return;
|
||||
|
||||
+ if (devinfo->ver < 71) {
|
||||
+ if (inst->sig.small_imm_a || inst->sig.small_imm_c || inst->sig.small_imm_d)
|
||||
+ fail_instr(state, "small imm a/c/d added after V3D 7.1");
|
||||
+ }
|
||||
+
|
||||
/* LDVARY writes r5 two instructions later and LDUNIF writes
|
||||
* r5 one instruction later, which is illegal to have
|
||||
* together.
|
||||
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
|
||||
index 9cd831863b4..13b3f37d43f 100644
|
||||
--- a/src/broadcom/qpu/qpu_instr.h
|
||||
+++ b/src/broadcom/qpu/qpu_instr.h
|
||||
@@ -50,10 +50,13 @@ struct v3d_qpu_sig {
|
||||
bool ldvpm:1;
|
||||
bool ldtlb:1;
|
||||
bool ldtlbu:1;
|
||||
- bool small_imm_b:1;
|
||||
bool ucb:1;
|
||||
bool rotate:1;
|
||||
bool wrtmuc:1;
|
||||
+ bool small_imm_a:1; /* raddr_a (add a), since V3D 7.x */
|
||||
+ bool small_imm_b:1; /* raddr_b (add b) */
|
||||
+ bool small_imm_c:1; /* raddr_c (mul a), since V3D 7.x */
|
||||
+ bool small_imm_d:1; /* raddr_d (mul b), since V3D 7.x */
|
||||
};
|
||||
|
||||
enum v3d_qpu_cond {
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,106 @@
|
||||
From eca19c911d9af3b0ab3b563ea65dc455e3d27987 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Wed, 4 Aug 2021 01:11:16 +0200
|
||||
Subject: [PATCH 010/139] broadcom/qpu: add v71 signal map
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Compared with v41, the differences are:
|
||||
* 14, 15, 29 and 30 are now about immediate a, b, c, d respectively
|
||||
* 23 is now reserved. On v42 this was for rotate signals, that are
|
||||
gone on v71.
|
||||
|
||||
Signed-off-by: Alejandro Piñeiro <apinheiro@igalia.com>
|
||||
Signed-off-by: Iago Toral Quiroga <itoral@igalia.com>
|
||||
---
|
||||
src/broadcom/qpu/qpu_pack.c | 47 ++++++++++++++++++++++++++++++++++---
|
||||
1 file changed, 44 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
|
||||
index beac591d3c1..2820d9d4c56 100644
|
||||
--- a/src/broadcom/qpu/qpu_pack.c
|
||||
+++ b/src/broadcom/qpu/qpu_pack.c
|
||||
@@ -112,12 +112,15 @@
|
||||
#define LDTMU .ldtmu = true
|
||||
#define LDVARY .ldvary = true
|
||||
#define LDVPM .ldvpm = true
|
||||
-#define SMIMM_B .small_imm_b = true
|
||||
#define LDTLB .ldtlb = true
|
||||
#define LDTLBU .ldtlbu = true
|
||||
#define UCB .ucb = true
|
||||
#define ROT .rotate = true
|
||||
#define WRTMUC .wrtmuc = true
|
||||
+#define SMIMM_A .small_imm_a = true
|
||||
+#define SMIMM_B .small_imm_b = true
|
||||
+#define SMIMM_C .small_imm_c = true
|
||||
+#define SMIMM_D .small_imm_d = true
|
||||
|
||||
static const struct v3d_qpu_sig v33_sig_map[] = {
|
||||
/* MISC R3 R4 R5 */
|
||||
@@ -213,6 +216,40 @@ static const struct v3d_qpu_sig v41_sig_map[] = {
|
||||
[31] = { SMIMM_B, LDTMU, },
|
||||
};
|
||||
|
||||
+
|
||||
+static const struct v3d_qpu_sig v71_sig_map[] = {
|
||||
+ /* MISC phys RF0 */
|
||||
+ [0] = { },
|
||||
+ [1] = { THRSW, },
|
||||
+ [2] = { LDUNIF },
|
||||
+ [3] = { THRSW, LDUNIF },
|
||||
+ [4] = { LDTMU, },
|
||||
+ [5] = { THRSW, LDTMU, },
|
||||
+ [6] = { LDTMU, LDUNIF },
|
||||
+ [7] = { THRSW, LDTMU, LDUNIF },
|
||||
+ [8] = { LDVARY, },
|
||||
+ [9] = { THRSW, LDVARY, },
|
||||
+ [10] = { LDVARY, LDUNIF },
|
||||
+ [11] = { THRSW, LDVARY, LDUNIF },
|
||||
+ [12] = { LDUNIFRF },
|
||||
+ [13] = { THRSW, LDUNIFRF },
|
||||
+ [14] = { SMIMM_A, },
|
||||
+ [15] = { SMIMM_B, },
|
||||
+ [16] = { LDTLB, },
|
||||
+ [17] = { LDTLBU, },
|
||||
+ [18] = { WRTMUC },
|
||||
+ [19] = { THRSW, WRTMUC },
|
||||
+ [20] = { LDVARY, WRTMUC },
|
||||
+ [21] = { THRSW, LDVARY, WRTMUC },
|
||||
+ [22] = { UCB, },
|
||||
+ /* 23 reserved */
|
||||
+ [24] = { LDUNIFA},
|
||||
+ [25] = { LDUNIFARF },
|
||||
+ /* 26-29 reserved */
|
||||
+ [30] = { SMIMM_C, },
|
||||
+ [31] = { SMIMM_D, },
|
||||
+};
|
||||
+
|
||||
bool
|
||||
v3d_qpu_sig_unpack(const struct v3d_device_info *devinfo,
|
||||
uint32_t packed_sig,
|
||||
@@ -221,7 +258,9 @@ v3d_qpu_sig_unpack(const struct v3d_device_info *devinfo,
|
||||
if (packed_sig >= ARRAY_SIZE(v33_sig_map))
|
||||
return false;
|
||||
|
||||
- if (devinfo->ver >= 41)
|
||||
+ if (devinfo->ver >= 71)
|
||||
+ *sig = v71_sig_map[packed_sig];
|
||||
+ else if (devinfo->ver >= 41)
|
||||
*sig = v41_sig_map[packed_sig];
|
||||
else if (devinfo->ver == 40)
|
||||
*sig = v40_sig_map[packed_sig];
|
||||
@@ -240,7 +279,9 @@ v3d_qpu_sig_pack(const struct v3d_device_info *devinfo,
|
||||
{
|
||||
static const struct v3d_qpu_sig *map;
|
||||
|
||||
- if (devinfo->ver >= 41)
|
||||
+ if (devinfo->ver >= 71)
|
||||
+ map = v71_sig_map;
|
||||
+ else if (devinfo->ver >= 41)
|
||||
map = v41_sig_map;
|
||||
else if (devinfo->ver == 40)
|
||||
map = v40_sig_map;
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,778 @@
|
||||
From d10e67a396d713ec81fb133f3516e09fe1e067b6 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Fri, 6 Aug 2021 01:22:31 +0200
|
||||
Subject: [PATCH 011/139] broadcom/qpu: define v3d_qpu_input, use on
|
||||
v3d_qpu_alu_instr
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
At this point it just tidies up the alu_instr structure a little.
|
||||
|
||||
But it also serves to prepare the structure for new changes, as 7.x uses
|
||||
raddr instead of mux, and it is just easier to add the raddr to the
|
||||
new input structure.
|
||||
|
||||
Signed-off-by: Alejandro Piñeiro <apinheiro@igalia.com>
|
||||
Signed-off-by: Iago Toral Quiroga <itoral@igalia.com>
|
||||
---
|
||||
src/broadcom/compiler/qpu_schedule.c | 65 +++++++--------
|
||||
src/broadcom/compiler/vir.c | 16 ++--
|
||||
src/broadcom/compiler/vir_dump.c | 8 +-
|
||||
.../compiler/vir_opt_copy_propagate.c | 12 +--
|
||||
.../compiler/vir_opt_redundant_flags.c | 8 +-
|
||||
src/broadcom/compiler/vir_to_qpu.c | 30 +++----
|
||||
src/broadcom/qpu/qpu_disasm.c | 16 ++--
|
||||
src/broadcom/qpu/qpu_instr.c | 8 +-
|
||||
src/broadcom/qpu/qpu_instr.h | 13 +--
|
||||
src/broadcom/qpu/qpu_pack.c | 82 +++++++++----------
|
||||
src/broadcom/qpu/tests/qpu_disasm.c | 8 +-
|
||||
11 files changed, 134 insertions(+), 132 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
|
||||
index a10fa03ed10..455fa3867be 100644
|
||||
--- a/src/broadcom/compiler/qpu_schedule.c
|
||||
+++ b/src/broadcom/compiler/qpu_schedule.c
|
||||
@@ -306,14 +306,14 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
|
||||
/* XXX: LOAD_IMM */
|
||||
|
||||
if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0)
|
||||
- process_mux_deps(state, n, inst->alu.add.a);
|
||||
+ process_mux_deps(state, n, inst->alu.add.a.mux);
|
||||
if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1)
|
||||
- process_mux_deps(state, n, inst->alu.add.b);
|
||||
+ process_mux_deps(state, n, inst->alu.add.b.mux);
|
||||
|
||||
if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0)
|
||||
- process_mux_deps(state, n, inst->alu.mul.a);
|
||||
+ process_mux_deps(state, n, inst->alu.mul.a.mux);
|
||||
if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1)
|
||||
- process_mux_deps(state, n, inst->alu.mul.b);
|
||||
+ process_mux_deps(state, n, inst->alu.mul.b.mux);
|
||||
|
||||
switch (inst->alu.add.op) {
|
||||
case V3D_QPU_A_VPMSETUP:
|
||||
@@ -537,22 +537,22 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
|
||||
|
||||
if (inst->alu.add.op != V3D_QPU_A_NOP) {
|
||||
if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 &&
|
||||
- mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) {
|
||||
+ mux_reads_too_soon(scoreboard, inst, inst->alu.add.a.mux)) {
|
||||
return true;
|
||||
}
|
||||
if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 &&
|
||||
- mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) {
|
||||
+ mux_reads_too_soon(scoreboard, inst, inst->alu.add.b.mux)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (inst->alu.mul.op != V3D_QPU_M_NOP) {
|
||||
if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 &&
|
||||
- mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) {
|
||||
+ mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a.mux)) {
|
||||
return true;
|
||||
}
|
||||
if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 &&
|
||||
- mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) {
|
||||
+ mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b.mux)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@@ -839,20 +839,20 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result,
|
||||
if (!result->sig.small_imm_b) {
|
||||
if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) &&
|
||||
raddr_a == add_instr->raddr_b) {
|
||||
- if (add_instr->alu.add.a == V3D_QPU_MUX_B)
|
||||
- result->alu.add.a = V3D_QPU_MUX_A;
|
||||
- if (add_instr->alu.add.b == V3D_QPU_MUX_B &&
|
||||
+ if (add_instr->alu.add.a.mux == V3D_QPU_MUX_B)
|
||||
+ result->alu.add.a.mux = V3D_QPU_MUX_A;
|
||||
+ if (add_instr->alu.add.b.mux == V3D_QPU_MUX_B &&
|
||||
v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
|
||||
- result->alu.add.b = V3D_QPU_MUX_A;
|
||||
+ result->alu.add.b.mux = V3D_QPU_MUX_A;
|
||||
}
|
||||
}
|
||||
if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) &&
|
||||
raddr_a == mul_instr->raddr_b) {
|
||||
- if (mul_instr->alu.mul.a == V3D_QPU_MUX_B)
|
||||
- result->alu.mul.a = V3D_QPU_MUX_A;
|
||||
- if (mul_instr->alu.mul.b == V3D_QPU_MUX_B &&
|
||||
+ if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_B)
|
||||
+ result->alu.mul.a.mux = V3D_QPU_MUX_A;
|
||||
+ if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_B &&
|
||||
v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
|
||||
- result->alu.mul.b = V3D_QPU_MUX_A;
|
||||
+ result->alu.mul.b.mux = V3D_QPU_MUX_A;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -863,20 +863,20 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result,
|
||||
result->raddr_b = raddr_b;
|
||||
if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) &&
|
||||
raddr_b == add_instr->raddr_a) {
|
||||
- if (add_instr->alu.add.a == V3D_QPU_MUX_A)
|
||||
- result->alu.add.a = V3D_QPU_MUX_B;
|
||||
- if (add_instr->alu.add.b == V3D_QPU_MUX_A &&
|
||||
+ if (add_instr->alu.add.a.mux == V3D_QPU_MUX_A)
|
||||
+ result->alu.add.a.mux = V3D_QPU_MUX_B;
|
||||
+ if (add_instr->alu.add.b.mux == V3D_QPU_MUX_A &&
|
||||
v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
|
||||
- result->alu.add.b = V3D_QPU_MUX_B;
|
||||
+ result->alu.add.b.mux = V3D_QPU_MUX_B;
|
||||
}
|
||||
}
|
||||
if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) &&
|
||||
raddr_b == mul_instr->raddr_a) {
|
||||
- if (mul_instr->alu.mul.a == V3D_QPU_MUX_A)
|
||||
- result->alu.mul.a = V3D_QPU_MUX_B;
|
||||
- if (mul_instr->alu.mul.b == V3D_QPU_MUX_A &&
|
||||
+ if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_A)
|
||||
+ result->alu.mul.a.mux = V3D_QPU_MUX_B;
|
||||
+ if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_A &&
|
||||
v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
|
||||
- result->alu.mul.b = V3D_QPU_MUX_B;
|
||||
+ result->alu.mul.b.mux = V3D_QPU_MUX_B;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -927,11 +927,12 @@ qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
|
||||
inst->flags.auf = V3D_QPU_UF_NONE;
|
||||
|
||||
inst->alu.mul.output_pack = inst->alu.add.output_pack;
|
||||
- inst->alu.mul.a_unpack = inst->alu.add.a_unpack;
|
||||
- inst->alu.mul.b_unpack = inst->alu.add.b_unpack;
|
||||
+
|
||||
+ inst->alu.mul.a.unpack = inst->alu.add.a.unpack;
|
||||
+ inst->alu.mul.b.unpack = inst->alu.add.b.unpack;
|
||||
inst->alu.add.output_pack = V3D_QPU_PACK_NONE;
|
||||
- inst->alu.add.a_unpack = V3D_QPU_UNPACK_NONE;
|
||||
- inst->alu.add.b_unpack = V3D_QPU_UNPACK_NONE;
|
||||
+ inst->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
|
||||
+ inst->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
|
||||
}
|
||||
|
||||
static bool
|
||||
@@ -2064,12 +2065,12 @@ alu_reads_register(struct v3d_qpu_instr *inst,
|
||||
|
||||
if (add) {
|
||||
num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
|
||||
- mux_a = inst->alu.add.a;
|
||||
- mux_b = inst->alu.add.b;
|
||||
+ mux_a = inst->alu.add.a.mux;
|
||||
+ mux_b = inst->alu.add.b.mux;
|
||||
} else {
|
||||
num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
|
||||
- mux_a = inst->alu.mul.a;
|
||||
- mux_b = inst->alu.mul.b;
|
||||
+ mux_a = inst->alu.mul.a.mux;
|
||||
+ mux_b = inst->alu.mul.b.mux;
|
||||
}
|
||||
|
||||
for (int i = 0; i < num_src; i++) {
|
||||
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
|
||||
index 660b11b0577..007cb0a941b 100644
|
||||
--- a/src/broadcom/compiler/vir.c
|
||||
+++ b/src/broadcom/compiler/vir.c
|
||||
@@ -113,10 +113,10 @@ vir_is_raw_mov(struct qinst *inst)
|
||||
return false;
|
||||
}
|
||||
|
||||
- if (inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE ||
|
||||
- inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE ||
|
||||
- inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE ||
|
||||
- inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) {
|
||||
+ if (inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE ||
|
||||
+ inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE ||
|
||||
+ inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
|
||||
+ inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -209,15 +209,15 @@ vir_set_unpack(struct qinst *inst, int src,
|
||||
|
||||
if (vir_is_add(inst)) {
|
||||
if (src == 0)
|
||||
- inst->qpu.alu.add.a_unpack = unpack;
|
||||
+ inst->qpu.alu.add.a.unpack = unpack;
|
||||
else
|
||||
- inst->qpu.alu.add.b_unpack = unpack;
|
||||
+ inst->qpu.alu.add.b.unpack = unpack;
|
||||
} else {
|
||||
assert(vir_is_mul(inst));
|
||||
if (src == 0)
|
||||
- inst->qpu.alu.mul.a_unpack = unpack;
|
||||
+ inst->qpu.alu.mul.a.unpack = unpack;
|
||||
else
|
||||
- inst->qpu.alu.mul.b_unpack = unpack;
|
||||
+ inst->qpu.alu.mul.b.unpack = unpack;
|
||||
}
|
||||
}
|
||||
|
||||
diff --git a/src/broadcom/compiler/vir_dump.c b/src/broadcom/compiler/vir_dump.c
|
||||
index 5c47bbdc1b0..ab5d4043039 100644
|
||||
--- a/src/broadcom/compiler/vir_dump.c
|
||||
+++ b/src/broadcom/compiler/vir_dump.c
|
||||
@@ -270,8 +270,8 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst)
|
||||
vir_print_reg(c, inst, inst->dst);
|
||||
fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.add.output_pack));
|
||||
|
||||
- unpack[0] = instr->alu.add.a_unpack;
|
||||
- unpack[1] = instr->alu.add.b_unpack;
|
||||
+ unpack[0] = instr->alu.add.a.unpack;
|
||||
+ unpack[1] = instr->alu.add.b.unpack;
|
||||
} else {
|
||||
fprintf(stderr, "%s", v3d_qpu_mul_op_name(instr->alu.mul.op));
|
||||
fprintf(stderr, "%s", v3d_qpu_cond_name(instr->flags.mc));
|
||||
@@ -282,8 +282,8 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst)
|
||||
vir_print_reg(c, inst, inst->dst);
|
||||
fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.mul.output_pack));
|
||||
|
||||
- unpack[0] = instr->alu.mul.a_unpack;
|
||||
- unpack[1] = instr->alu.mul.b_unpack;
|
||||
+ unpack[0] = instr->alu.mul.a.unpack;
|
||||
+ unpack[1] = instr->alu.mul.b.unpack;
|
||||
}
|
||||
|
||||
for (int i = 0; i < nsrc; i++) {
|
||||
diff --git a/src/broadcom/compiler/vir_opt_copy_propagate.c b/src/broadcom/compiler/vir_opt_copy_propagate.c
|
||||
index da121c2a5bd..c4aa7255a17 100644
|
||||
--- a/src/broadcom/compiler/vir_opt_copy_propagate.c
|
||||
+++ b/src/broadcom/compiler/vir_opt_copy_propagate.c
|
||||
@@ -104,14 +104,14 @@ vir_has_unpack(struct qinst *inst, int chan)
|
||||
|
||||
if (vir_is_add(inst)) {
|
||||
if (chan == 0)
|
||||
- return inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE;
|
||||
+ return inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE;
|
||||
else
|
||||
- return inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE;
|
||||
+ return inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE;
|
||||
} else {
|
||||
if (chan == 0)
|
||||
- return inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE;
|
||||
+ return inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE;
|
||||
else
|
||||
- return inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE;
|
||||
+ return inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -161,7 +161,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs)
|
||||
continue;
|
||||
|
||||
/* these ops can't represent abs. */
|
||||
- if (mov->qpu.alu.mul.a_unpack == V3D_QPU_UNPACK_ABS) {
|
||||
+ if (mov->qpu.alu.mul.a.unpack == V3D_QPU_UNPACK_ABS) {
|
||||
switch (inst->qpu.alu.add.op) {
|
||||
case V3D_QPU_A_VFPACK:
|
||||
case V3D_QPU_A_FROUND:
|
||||
@@ -189,7 +189,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs)
|
||||
|
||||
inst->src[i] = mov->src[0];
|
||||
if (vir_has_unpack(mov, 0)) {
|
||||
- enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a_unpack;
|
||||
+ enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a.unpack;
|
||||
|
||||
vir_set_unpack(inst, i, unpack);
|
||||
}
|
||||
diff --git a/src/broadcom/compiler/vir_opt_redundant_flags.c b/src/broadcom/compiler/vir_opt_redundant_flags.c
|
||||
index c7896d57f2b..6b61ed6a39a 100644
|
||||
--- a/src/broadcom/compiler/vir_opt_redundant_flags.c
|
||||
+++ b/src/broadcom/compiler/vir_opt_redundant_flags.c
|
||||
@@ -81,11 +81,11 @@ vir_instr_flags_op_equal(struct qinst *a, struct qinst *b)
|
||||
a->qpu.flags.mpf != b->qpu.flags.mpf ||
|
||||
a->qpu.alu.add.op != b->qpu.alu.add.op ||
|
||||
a->qpu.alu.mul.op != b->qpu.alu.mul.op ||
|
||||
- a->qpu.alu.add.a_unpack != b->qpu.alu.add.a_unpack ||
|
||||
- a->qpu.alu.add.b_unpack != b->qpu.alu.add.b_unpack ||
|
||||
+ a->qpu.alu.add.a.unpack != b->qpu.alu.add.a.unpack ||
|
||||
+ a->qpu.alu.add.b.unpack != b->qpu.alu.add.b.unpack ||
|
||||
a->qpu.alu.add.output_pack != b->qpu.alu.add.output_pack ||
|
||||
- a->qpu.alu.mul.a_unpack != b->qpu.alu.mul.a_unpack ||
|
||||
- a->qpu.alu.mul.b_unpack != b->qpu.alu.mul.b_unpack ||
|
||||
+ a->qpu.alu.mul.a.unpack != b->qpu.alu.mul.a.unpack ||
|
||||
+ a->qpu.alu.mul.b.unpack != b->qpu.alu.mul.b.unpack ||
|
||||
a->qpu.alu.mul.output_pack != b->qpu.alu.mul.output_pack) {
|
||||
return false;
|
||||
}
|
||||
diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c
|
||||
index 15c2e3674c2..c8b6e0a91a0 100644
|
||||
--- a/src/broadcom/compiler/vir_to_qpu.c
|
||||
+++ b/src/broadcom/compiler/vir_to_qpu.c
|
||||
@@ -106,20 +106,20 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
|
||||
return;
|
||||
}
|
||||
|
||||
- if (instr->alu.add.a != V3D_QPU_MUX_A &&
|
||||
- instr->alu.add.b != V3D_QPU_MUX_A &&
|
||||
- instr->alu.mul.a != V3D_QPU_MUX_A &&
|
||||
- instr->alu.mul.b != V3D_QPU_MUX_A) {
|
||||
+ if (instr->alu.add.a.mux != V3D_QPU_MUX_A &&
|
||||
+ instr->alu.add.b.mux != V3D_QPU_MUX_A &&
|
||||
+ instr->alu.mul.a.mux != V3D_QPU_MUX_A &&
|
||||
+ instr->alu.mul.b.mux != V3D_QPU_MUX_A) {
|
||||
instr->raddr_a = src.index;
|
||||
*mux = V3D_QPU_MUX_A;
|
||||
} else {
|
||||
if (instr->raddr_a == src.index) {
|
||||
*mux = V3D_QPU_MUX_A;
|
||||
} else {
|
||||
- assert(!(instr->alu.add.a == V3D_QPU_MUX_B &&
|
||||
- instr->alu.add.b == V3D_QPU_MUX_B &&
|
||||
- instr->alu.mul.a == V3D_QPU_MUX_B &&
|
||||
- instr->alu.mul.b == V3D_QPU_MUX_B) ||
|
||||
+ assert(!(instr->alu.add.a.mux == V3D_QPU_MUX_B &&
|
||||
+ instr->alu.add.b.mux == V3D_QPU_MUX_B &&
|
||||
+ instr->alu.mul.a.mux == V3D_QPU_MUX_B &&
|
||||
+ instr->alu.mul.b.mux == V3D_QPU_MUX_B) ||
|
||||
src.index == instr->raddr_b);
|
||||
|
||||
instr->raddr_b = src.index;
|
||||
@@ -147,14 +147,14 @@ is_no_op_mov(struct qinst *qinst)
|
||||
if (waddr < V3D_QPU_WADDR_R0 || waddr > V3D_QPU_WADDR_R4)
|
||||
return false;
|
||||
|
||||
- if (qinst->qpu.alu.mul.a !=
|
||||
+ if (qinst->qpu.alu.mul.a.mux !=
|
||||
V3D_QPU_MUX_R0 + (waddr - V3D_QPU_WADDR_R0)) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
int raddr;
|
||||
|
||||
- switch (qinst->qpu.alu.mul.a) {
|
||||
+ switch (qinst->qpu.alu.mul.a.mux) {
|
||||
case V3D_QPU_MUX_A:
|
||||
raddr = qinst->qpu.raddr_a;
|
||||
break;
|
||||
@@ -171,7 +171,7 @@ is_no_op_mov(struct qinst *qinst)
|
||||
/* No packing or flags updates, or we need to execute the
|
||||
* instruction.
|
||||
*/
|
||||
- if (qinst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE ||
|
||||
+ if (qinst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
|
||||
qinst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE ||
|
||||
qinst->qpu.flags.mc != V3D_QPU_COND_NONE ||
|
||||
qinst->qpu.flags.mpf != V3D_QPU_PF_NONE ||
|
||||
@@ -302,11 +302,11 @@ v3d_generate_code_block(struct v3d_compile *c,
|
||||
assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);
|
||||
if (nsrc >= 1) {
|
||||
set_src(&qinst->qpu,
|
||||
- &qinst->qpu.alu.add.a, src[0]);
|
||||
+ &qinst->qpu.alu.add.a.mux, src[0]);
|
||||
}
|
||||
if (nsrc >= 2) {
|
||||
set_src(&qinst->qpu,
|
||||
- &qinst->qpu.alu.add.b, src[1]);
|
||||
+ &qinst->qpu.alu.add.b.mux, src[1]);
|
||||
}
|
||||
|
||||
qinst->qpu.alu.add.waddr = dst.index;
|
||||
@@ -314,11 +314,11 @@ v3d_generate_code_block(struct v3d_compile *c,
|
||||
} else {
|
||||
if (nsrc >= 1) {
|
||||
set_src(&qinst->qpu,
|
||||
- &qinst->qpu.alu.mul.a, src[0]);
|
||||
+ &qinst->qpu.alu.mul.a.mux, src[0]);
|
||||
}
|
||||
if (nsrc >= 2) {
|
||||
set_src(&qinst->qpu,
|
||||
- &qinst->qpu.alu.mul.b, src[1]);
|
||||
+ &qinst->qpu.alu.mul.b.mux, src[1]);
|
||||
}
|
||||
|
||||
qinst->qpu.alu.mul.waddr = dst.index;
|
||||
diff --git a/src/broadcom/qpu/qpu_disasm.c b/src/broadcom/qpu/qpu_disasm.c
|
||||
index 6aca3c28e78..588a665f770 100644
|
||||
--- a/src/broadcom/qpu/qpu_disasm.c
|
||||
+++ b/src/broadcom/qpu/qpu_disasm.c
|
||||
@@ -121,16 +121,16 @@ v3d_qpu_disasm_add(struct disasm_state *disasm,
|
||||
if (num_src >= 1) {
|
||||
if (has_dst)
|
||||
append(disasm, ", ");
|
||||
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.a);
|
||||
+ v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.a.mux);
|
||||
append(disasm, "%s",
|
||||
- v3d_qpu_unpack_name(instr->alu.add.a_unpack));
|
||||
+ v3d_qpu_unpack_name(instr->alu.add.a.unpack));
|
||||
}
|
||||
|
||||
if (num_src >= 2) {
|
||||
append(disasm, ", ");
|
||||
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.b);
|
||||
+ v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.b.mux);
|
||||
append(disasm, "%s",
|
||||
- v3d_qpu_unpack_name(instr->alu.add.b_unpack));
|
||||
+ v3d_qpu_unpack_name(instr->alu.add.b.unpack));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -164,16 +164,16 @@ v3d_qpu_disasm_mul(struct disasm_state *disasm,
|
||||
if (num_src >= 1) {
|
||||
if (has_dst)
|
||||
append(disasm, ", ");
|
||||
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.a);
|
||||
+ v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.a.mux);
|
||||
append(disasm, "%s",
|
||||
- v3d_qpu_unpack_name(instr->alu.mul.a_unpack));
|
||||
+ v3d_qpu_unpack_name(instr->alu.mul.a.unpack));
|
||||
}
|
||||
|
||||
if (num_src >= 2) {
|
||||
append(disasm, ", ");
|
||||
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.b);
|
||||
+ v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.b.mux);
|
||||
append(disasm, "%s",
|
||||
- v3d_qpu_unpack_name(instr->alu.mul.b_unpack));
|
||||
+ v3d_qpu_unpack_name(instr->alu.mul.b.unpack));
|
||||
}
|
||||
}
|
||||
|
||||
diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
|
||||
index 7759fb0efdf..7ece8b5e570 100644
|
||||
--- a/src/broadcom/qpu/qpu_instr.c
|
||||
+++ b/src/broadcom/qpu/qpu_instr.c
|
||||
@@ -926,10 +926,10 @@ v3d_qpu_uses_mux(const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
|
||||
int add_nsrc = v3d_qpu_add_op_num_src(inst->alu.add.op);
|
||||
int mul_nsrc = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
|
||||
|
||||
- return ((add_nsrc > 0 && inst->alu.add.a == mux) ||
|
||||
- (add_nsrc > 1 && inst->alu.add.b == mux) ||
|
||||
- (mul_nsrc > 0 && inst->alu.mul.a == mux) ||
|
||||
- (mul_nsrc > 1 && inst->alu.mul.b == mux));
|
||||
+ return ((add_nsrc > 0 && inst->alu.add.a.mux == mux) ||
|
||||
+ (add_nsrc > 1 && inst->alu.add.b.mux == mux) ||
|
||||
+ (mul_nsrc > 0 && inst->alu.mul.a.mux == mux) ||
|
||||
+ (mul_nsrc > 1 && inst->alu.mul.b.mux == mux));
|
||||
}
|
||||
|
||||
bool
|
||||
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
|
||||
index 13b3f37d43f..53a51bfb3e1 100644
|
||||
--- a/src/broadcom/qpu/qpu_instr.h
|
||||
+++ b/src/broadcom/qpu/qpu_instr.h
|
||||
@@ -294,25 +294,26 @@ enum v3d_qpu_mux {
|
||||
V3D_QPU_MUX_B,
|
||||
};
|
||||
|
||||
+struct v3d_qpu_input {
|
||||
+ enum v3d_qpu_mux mux;
|
||||
+ enum v3d_qpu_input_unpack unpack;
|
||||
+};
|
||||
+
|
||||
struct v3d_qpu_alu_instr {
|
||||
struct {
|
||||
enum v3d_qpu_add_op op;
|
||||
- enum v3d_qpu_mux a, b;
|
||||
+ struct v3d_qpu_input a, b;
|
||||
uint8_t waddr;
|
||||
bool magic_write;
|
||||
enum v3d_qpu_output_pack output_pack;
|
||||
- enum v3d_qpu_input_unpack a_unpack;
|
||||
- enum v3d_qpu_input_unpack b_unpack;
|
||||
} add;
|
||||
|
||||
struct {
|
||||
enum v3d_qpu_mul_op op;
|
||||
- enum v3d_qpu_mux a, b;
|
||||
+ struct v3d_qpu_input a, b;
|
||||
uint8_t waddr;
|
||||
bool magic_write;
|
||||
enum v3d_qpu_output_pack output_pack;
|
||||
- enum v3d_qpu_input_unpack a_unpack;
|
||||
- enum v3d_qpu_input_unpack b_unpack;
|
||||
} mul;
|
||||
};
|
||||
|
||||
diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
|
||||
index 2820d9d4c56..6e975793fc0 100644
|
||||
--- a/src/broadcom/qpu/qpu_pack.c
|
||||
+++ b/src/broadcom/qpu/qpu_pack.c
|
||||
@@ -853,12 +853,12 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
|
||||
instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
|
||||
|
||||
if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
|
||||
- &instr->alu.add.a_unpack)) {
|
||||
+ &instr->alu.add.a.unpack)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3,
|
||||
- &instr->alu.add.b_unpack)) {
|
||||
+ &instr->alu.add.b.unpack)) {
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
@@ -872,7 +872,7 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
|
||||
instr->alu.add.output_pack = mux_b & 0x3;
|
||||
|
||||
if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
|
||||
- &instr->alu.add.a_unpack)) {
|
||||
+ &instr->alu.add.a.unpack)) {
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
@@ -884,7 +884,7 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
|
||||
instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
|
||||
|
||||
if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
|
||||
- &instr->alu.add.a_unpack)) {
|
||||
+ &instr->alu.add.a.unpack)) {
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
@@ -892,23 +892,23 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
|
||||
case V3D_QPU_A_VFMIN:
|
||||
case V3D_QPU_A_VFMAX:
|
||||
if (!v3d_qpu_float16_unpack_unpack(op & 0x7,
|
||||
- &instr->alu.add.a_unpack)) {
|
||||
+ &instr->alu.add.a.unpack)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
|
||||
- instr->alu.add.b_unpack = V3D_QPU_UNPACK_NONE;
|
||||
+ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
|
||||
break;
|
||||
|
||||
default:
|
||||
instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
|
||||
- instr->alu.add.a_unpack = V3D_QPU_UNPACK_NONE;
|
||||
- instr->alu.add.b_unpack = V3D_QPU_UNPACK_NONE;
|
||||
+ instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
|
||||
+ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
|
||||
break;
|
||||
}
|
||||
|
||||
- instr->alu.add.a = mux_a;
|
||||
- instr->alu.add.b = mux_b;
|
||||
+ instr->alu.add.a.mux = mux_a;
|
||||
+ instr->alu.add.b.mux = mux_b;
|
||||
instr->alu.add.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A);
|
||||
|
||||
instr->alu.add.magic_write = false;
|
||||
@@ -956,12 +956,12 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
|
||||
instr->alu.mul.output_pack = ((op >> 4) & 0x3) - 1;
|
||||
|
||||
if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
|
||||
- &instr->alu.mul.a_unpack)) {
|
||||
+ &instr->alu.mul.a.unpack)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3,
|
||||
- &instr->alu.mul.b_unpack)) {
|
||||
+ &instr->alu.mul.b.unpack)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -972,7 +972,7 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
|
||||
((mux_b >> 2) & 1));
|
||||
|
||||
if (!v3d_qpu_float32_unpack_unpack(mux_b & 0x3,
|
||||
- &instr->alu.mul.a_unpack)) {
|
||||
+ &instr->alu.mul.a.unpack)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -982,23 +982,23 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
|
||||
instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
|
||||
|
||||
if (!v3d_qpu_float16_unpack_unpack(((op & 0x7) - 4) & 7,
|
||||
- &instr->alu.mul.a_unpack)) {
|
||||
+ &instr->alu.mul.a.unpack)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
- instr->alu.mul.b_unpack = V3D_QPU_UNPACK_NONE;
|
||||
+ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
|
||||
|
||||
break;
|
||||
|
||||
default:
|
||||
instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
|
||||
- instr->alu.mul.a_unpack = V3D_QPU_UNPACK_NONE;
|
||||
- instr->alu.mul.b_unpack = V3D_QPU_UNPACK_NONE;
|
||||
+ instr->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
|
||||
+ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
|
||||
break;
|
||||
}
|
||||
|
||||
- instr->alu.mul.a = mux_a;
|
||||
- instr->alu.mul.b = mux_b;
|
||||
+ instr->alu.mul.a.mux = mux_a;
|
||||
+ instr->alu.mul.b.mux = mux_b;
|
||||
instr->alu.mul.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_M);
|
||||
instr->alu.mul.magic_write = packed_inst & V3D_QPU_MM;
|
||||
|
||||
@@ -1030,8 +1030,8 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
|
||||
const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
|
||||
{
|
||||
uint32_t waddr = instr->alu.add.waddr;
|
||||
- uint32_t mux_a = instr->alu.add.a;
|
||||
- uint32_t mux_b = instr->alu.add.b;
|
||||
+ uint32_t mux_a = instr->alu.add.a.mux;
|
||||
+ uint32_t mux_b = instr->alu.add.b.mux;
|
||||
int nsrc = v3d_qpu_add_op_num_src(instr->alu.add.op);
|
||||
const struct opcode_desc *desc =
|
||||
lookup_opcode_from_instr(devinfo, add_ops, ARRAY_SIZE(add_ops),
|
||||
@@ -1102,12 +1102,12 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
|
||||
}
|
||||
opcode |= output_pack << 4;
|
||||
|
||||
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack,
|
||||
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
|
||||
&a_unpack)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b_unpack,
|
||||
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack,
|
||||
&b_unpack)) {
|
||||
return false;
|
||||
}
|
||||
@@ -1141,17 +1141,17 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
|
||||
uint32_t a_unpack;
|
||||
uint32_t b_unpack;
|
||||
|
||||
- if (instr->alu.add.a_unpack == V3D_QPU_UNPACK_ABS ||
|
||||
- instr->alu.add.b_unpack == V3D_QPU_UNPACK_ABS) {
|
||||
+ if (instr->alu.add.a.unpack == V3D_QPU_UNPACK_ABS ||
|
||||
+ instr->alu.add.b.unpack == V3D_QPU_UNPACK_ABS) {
|
||||
return false;
|
||||
}
|
||||
|
||||
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack,
|
||||
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
|
||||
&a_unpack)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b_unpack,
|
||||
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack,
|
||||
&b_unpack)) {
|
||||
return false;
|
||||
}
|
||||
@@ -1176,7 +1176,7 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
|
||||
}
|
||||
mux_b |= packed;
|
||||
|
||||
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack,
|
||||
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
|
||||
&packed)) {
|
||||
return false;
|
||||
}
|
||||
@@ -1194,7 +1194,7 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
|
||||
return false;
|
||||
|
||||
uint32_t packed;
|
||||
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack,
|
||||
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
|
||||
&packed)) {
|
||||
return false;
|
||||
}
|
||||
@@ -1207,11 +1207,11 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
|
||||
case V3D_QPU_A_VFMIN:
|
||||
case V3D_QPU_A_VFMAX:
|
||||
if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE ||
|
||||
- instr->alu.add.b_unpack != V3D_QPU_UNPACK_NONE) {
|
||||
+ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE) {
|
||||
return false;
|
||||
}
|
||||
|
||||
- if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a_unpack,
|
||||
+ if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a.unpack,
|
||||
&packed)) {
|
||||
return false;
|
||||
}
|
||||
@@ -1221,8 +1221,8 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
|
||||
default:
|
||||
if (instr->alu.add.op != V3D_QPU_A_NOP &&
|
||||
(instr->alu.add.output_pack != V3D_QPU_PACK_NONE ||
|
||||
- instr->alu.add.a_unpack != V3D_QPU_UNPACK_NONE ||
|
||||
- instr->alu.add.b_unpack != V3D_QPU_UNPACK_NONE)) {
|
||||
+ instr->alu.add.a.unpack != V3D_QPU_UNPACK_NONE ||
|
||||
+ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE)) {
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
@@ -1242,8 +1242,8 @@ static bool
|
||||
v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
|
||||
const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
|
||||
{
|
||||
- uint32_t mux_a = instr->alu.mul.a;
|
||||
- uint32_t mux_b = instr->alu.mul.b;
|
||||
+ uint32_t mux_a = instr->alu.mul.a.mux;
|
||||
+ uint32_t mux_b = instr->alu.mul.b.mux;
|
||||
int nsrc = v3d_qpu_mul_op_num_src(instr->alu.mul.op);
|
||||
|
||||
const struct opcode_desc *desc =
|
||||
@@ -1277,13 +1277,13 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
|
||||
*/
|
||||
opcode += packed << 4;
|
||||
|
||||
- if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a_unpack,
|
||||
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack,
|
||||
&packed)) {
|
||||
return false;
|
||||
}
|
||||
opcode |= packed << 2;
|
||||
|
||||
- if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b_unpack,
|
||||
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b.unpack,
|
||||
&packed)) {
|
||||
return false;
|
||||
}
|
||||
@@ -1301,7 +1301,7 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
|
||||
opcode |= (packed >> 1) & 1;
|
||||
mux_b = (packed & 1) << 2;
|
||||
|
||||
- if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a_unpack,
|
||||
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack,
|
||||
&packed)) {
|
||||
return false;
|
||||
}
|
||||
@@ -1315,16 +1315,16 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
|
||||
if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE)
|
||||
return false;
|
||||
|
||||
- if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a_unpack,
|
||||
+ if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a.unpack,
|
||||
&packed)) {
|
||||
return false;
|
||||
}
|
||||
- if (instr->alu.mul.a_unpack == V3D_QPU_UNPACK_SWAP_16)
|
||||
+ if (instr->alu.mul.a.unpack == V3D_QPU_UNPACK_SWAP_16)
|
||||
opcode = 8;
|
||||
else
|
||||
opcode |= (packed + 4) & 7;
|
||||
|
||||
- if (instr->alu.mul.b_unpack != V3D_QPU_UNPACK_NONE)
|
||||
+ if (instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)
|
||||
return false;
|
||||
|
||||
break;
|
||||
diff --git a/src/broadcom/qpu/tests/qpu_disasm.c b/src/broadcom/qpu/tests/qpu_disasm.c
|
||||
index 2f8e19c73fe..be7b78d5ef0 100644
|
||||
--- a/src/broadcom/qpu/tests/qpu_disasm.c
|
||||
+++ b/src/broadcom/qpu/tests/qpu_disasm.c
|
||||
@@ -160,10 +160,10 @@ main(int argc, char **argv)
|
||||
/* Swap the operands to be sure that we test
|
||||
* how the QPUs distinguish between these ops.
|
||||
*/
|
||||
- swap_mux(&instr.alu.add.a,
|
||||
- &instr.alu.add.b);
|
||||
- swap_pack(&instr.alu.add.a_unpack,
|
||||
- &instr.alu.add.b_unpack);
|
||||
+ swap_mux(&instr.alu.add.a.mux,
|
||||
+ &instr.alu.add.b.mux);
|
||||
+ swap_pack(&instr.alu.add.a.unpack,
|
||||
+ &instr.alu.add.b.unpack);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,45 @@
|
||||
From 52ea09792ff8a438ccdecac47b8415657be90098 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Fri, 6 Aug 2021 01:33:32 +0200
|
||||
Subject: [PATCH 012/139] broadcom/qpu: add raddr on v3d_qpu_input
|
||||
|
||||
On V3D 7.x mux are not used, and raddr_a/b/c/d are used instead
|
||||
|
||||
This is not perfect, as for v71, the raddr_a/b defined at qpu_instr
|
||||
became superfluous. But the alternative would be to define two
|
||||
different structs, or even having them defined based on version
|
||||
ifdefs, so this is a reasonable compromise.
|
||||
---
|
||||
src/broadcom/qpu/qpu_instr.h | 9 ++++++---
|
||||
1 file changed, 6 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
|
||||
index 53a51bfb3e1..9e56e2d6a99 100644
|
||||
--- a/src/broadcom/qpu/qpu_instr.h
|
||||
+++ b/src/broadcom/qpu/qpu_instr.h
|
||||
@@ -295,7 +295,10 @@ enum v3d_qpu_mux {
|
||||
};
|
||||
|
||||
struct v3d_qpu_input {
|
||||
- enum v3d_qpu_mux mux;
|
||||
+ union {
|
||||
+ enum v3d_qpu_mux mux; /* V3D 4.x */
|
||||
+ uint8_t raddr; /* V3D 7.x */
|
||||
+ };
|
||||
enum v3d_qpu_input_unpack unpack;
|
||||
};
|
||||
|
||||
@@ -385,8 +388,8 @@ struct v3d_qpu_instr {
|
||||
struct v3d_qpu_sig sig;
|
||||
uint8_t sig_addr;
|
||||
bool sig_magic; /* If the signal writes to a magic address */
|
||||
- uint8_t raddr_a;
|
||||
- uint8_t raddr_b;
|
||||
+ uint8_t raddr_a; /* V3D 4.x */
|
||||
+ uint8_t raddr_b; /* V3D 4.x*/
|
||||
struct v3d_qpu_flags flags;
|
||||
|
||||
union {
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,37 @@
|
||||
From 3e5ad0881c2789619cdf65f40a44d5481e28e800 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Thu, 12 Aug 2021 02:24:02 +0200
|
||||
Subject: [PATCH 013/139] broadcom/qpu: defining shift/mask for raddr_c/d
|
||||
|
||||
On V3D 7.x it replaces mul_a/b and add_a/b
|
||||
---
|
||||
src/broadcom/qpu/qpu_pack.c | 6 ++++++
|
||||
1 file changed, 6 insertions(+)
|
||||
|
||||
diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
|
||||
index 6e975793fc0..4f106909729 100644
|
||||
--- a/src/broadcom/qpu/qpu_pack.c
|
||||
+++ b/src/broadcom/qpu/qpu_pack.c
|
||||
@@ -84,6 +84,9 @@
|
||||
#define V3D_QPU_MUL_A_SHIFT 18
|
||||
#define V3D_QPU_MUL_A_MASK QPU_MASK(20, 18)
|
||||
|
||||
+#define V3D_QPU_RADDR_C_SHIFT 18
|
||||
+#define V3D_QPU_RADDR_C_MASK QPU_MASK(23, 18)
|
||||
+
|
||||
#define V3D_QPU_ADD_B_SHIFT 15
|
||||
#define V3D_QPU_ADD_B_MASK QPU_MASK(17, 15)
|
||||
|
||||
@@ -98,6 +101,9 @@
|
||||
#define V3D_QPU_BRANCH_BDI_SHIFT 12
|
||||
#define V3D_QPU_BRANCH_BDI_MASK QPU_MASK(13, 12)
|
||||
|
||||
+#define V3D_QPU_RADDR_D_SHIFT 12
|
||||
+#define V3D_QPU_RADDR_D_MASK QPU_MASK(17, 12)
|
||||
+
|
||||
#define V3D_QPU_RADDR_A_SHIFT 6
|
||||
#define V3D_QPU_RADDR_A_MASK QPU_MASK(11, 6)
|
||||
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,46 @@
|
||||
From 81febf14fe05ad26e992275b911e8bc1e1416ebc Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Fri, 17 Sep 2021 01:04:31 +0200
|
||||
Subject: [PATCH 014/139] broadcom/commmon: add has_accumulators field on
|
||||
v3d_device_info
|
||||
|
||||
Even if we can just check for the version on the code, checking for
|
||||
this field makes several places more readable. So for example, on the
|
||||
register allocation code we don't assign an accumulator because we
|
||||
don't have accumulators on that hw, instead of because hw version is a
|
||||
given one.
|
||||
---
|
||||
src/broadcom/common/v3d_device_info.c | 2 ++
|
||||
src/broadcom/common/v3d_device_info.h | 3 +++
|
||||
2 files changed, 5 insertions(+)
|
||||
|
||||
diff --git a/src/broadcom/common/v3d_device_info.c b/src/broadcom/common/v3d_device_info.c
|
||||
index 7512fe3a06b..7bc2b662cfc 100644
|
||||
--- a/src/broadcom/common/v3d_device_info.c
|
||||
+++ b/src/broadcom/common/v3d_device_info.c
|
||||
@@ -65,6 +65,8 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i
|
||||
int qups = (ident1.value >> 8) & 0xf;
|
||||
devinfo->qpu_count = nslc * qups;
|
||||
|
||||
+ devinfo->has_accumulators = devinfo->ver < 71;
|
||||
+
|
||||
switch (devinfo->ver) {
|
||||
case 33:
|
||||
case 41:
|
||||
diff --git a/src/broadcom/common/v3d_device_info.h b/src/broadcom/common/v3d_device_info.h
|
||||
index 32cb65cf81f..8dfc7858727 100644
|
||||
--- a/src/broadcom/common/v3d_device_info.h
|
||||
+++ b/src/broadcom/common/v3d_device_info.h
|
||||
@@ -42,6 +42,9 @@ struct v3d_device_info {
|
||||
|
||||
/* NSLC * QUPS from the core's IDENT registers. */
|
||||
int qpu_count;
|
||||
+
|
||||
+ /* If the hw has accumulator registers */
|
||||
+ bool has_accumulators;
|
||||
};
|
||||
|
||||
typedef int (*v3d_ioctl_fun)(int fd, unsigned long request, void *arg);
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,52 @@
|
||||
From 7d42eca87b6e144697810405308d99d200dca62a Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Wed, 15 Sep 2021 10:56:43 +0200
|
||||
Subject: [PATCH 015/139] broadcom/qpu: add qpu_writes_rf0_implicitly helper
|
||||
|
||||
On v71 rf0 replaces r5 as the register that gets updated implicitly
|
||||
with uniform loads, and gets the C coefficient with ldvary. This
|
||||
helper returns whether rf0 gets implicitly updated.
|
||||
---
|
||||
src/broadcom/qpu/qpu_instr.c | 12 ++++++++++++
|
||||
src/broadcom/qpu/qpu_instr.h | 2 ++
|
||||
2 files changed, 14 insertions(+)
|
||||
|
||||
diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
|
||||
index 7ece8b5e570..8de99c611d5 100644
|
||||
--- a/src/broadcom/qpu/qpu_instr.c
|
||||
+++ b/src/broadcom/qpu/qpu_instr.c
|
||||
@@ -920,6 +920,18 @@ v3d_qpu_writes_accum(const struct v3d_device_info *devinfo,
|
||||
return false;
|
||||
}
|
||||
|
||||
+bool
|
||||
+v3d_qpu_writes_rf0_implicitly(const struct v3d_device_info *devinfo,
|
||||
+ const struct v3d_qpu_instr *inst)
|
||||
+{
|
||||
+ if (devinfo->ver >= 71 &&
|
||||
+ (inst->sig.ldvary || inst->sig.ldunif || inst->sig.ldunifa)) {
|
||||
+ return true;
|
||||
+ }
|
||||
+
|
||||
+ return false;
|
||||
+}
|
||||
+
|
||||
bool
|
||||
v3d_qpu_uses_mux(const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
|
||||
{
|
||||
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
|
||||
index 9e56e2d6a99..a25be8e0ee6 100644
|
||||
--- a/src/broadcom/qpu/qpu_instr.h
|
||||
+++ b/src/broadcom/qpu/qpu_instr.h
|
||||
@@ -473,6 +473,8 @@ bool v3d_qpu_writes_r4(const struct v3d_device_info *devinfo,
|
||||
const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST;
|
||||
bool v3d_qpu_writes_r5(const struct v3d_device_info *devinfo,
|
||||
const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST;
|
||||
+bool v3d_qpu_writes_rf0_implicitly(const struct v3d_device_info *devinfo,
|
||||
+ const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST;
|
||||
bool v3d_qpu_writes_accum(const struct v3d_device_info *devinfo,
|
||||
const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
|
||||
bool v3d_qpu_waits_on_tmu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
|
||||
--
|
||||
2.39.2
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,261 @@
|
||||
From ebba9019461083687f6afd23ff0d4646c1a667cb Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Sun, 29 Jan 2023 00:27:11 +0100
|
||||
Subject: [PATCH 017/139] broadcom/compiler: update node/temp translation for
|
||||
v71
|
||||
|
||||
As the offset applied needs to take into account if we have
|
||||
accumulators or not.
|
||||
---
|
||||
src/broadcom/compiler/vir_register_allocate.c | 68 +++++++++----------
|
||||
1 file changed, 34 insertions(+), 34 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
|
||||
index b22f915d1df..aa9473d124b 100644
|
||||
--- a/src/broadcom/compiler/vir_register_allocate.c
|
||||
+++ b/src/broadcom/compiler/vir_register_allocate.c
|
||||
@@ -39,30 +39,31 @@
|
||||
CLASS_BITS_R5)
|
||||
|
||||
static inline uint32_t
|
||||
-temp_to_node(uint32_t temp)
|
||||
+temp_to_node(struct v3d_compile *c, uint32_t temp)
|
||||
{
|
||||
- return temp + ACC_COUNT;
|
||||
+ return temp + (c->devinfo->has_accumulators ? ACC_COUNT : 0);
|
||||
}
|
||||
|
||||
static inline uint32_t
|
||||
-node_to_temp(uint32_t node)
|
||||
+node_to_temp(struct v3d_compile *c, uint32_t node)
|
||||
{
|
||||
- assert(node >= ACC_COUNT);
|
||||
- return node - ACC_COUNT;
|
||||
+ assert((c->devinfo->has_accumulators && node >= ACC_COUNT) ||
|
||||
+ (!c->devinfo->has_accumulators && node >= 0));
|
||||
+ return node - (c->devinfo->has_accumulators ? ACC_COUNT : 0);
|
||||
}
|
||||
|
||||
static inline uint8_t
|
||||
-get_temp_class_bits(struct v3d_ra_node_info *nodes,
|
||||
+get_temp_class_bits(struct v3d_compile *c,
|
||||
uint32_t temp)
|
||||
{
|
||||
- return nodes->info[temp_to_node(temp)].class_bits;
|
||||
+ return c->nodes.info[temp_to_node(c, temp)].class_bits;
|
||||
}
|
||||
|
||||
static inline void
|
||||
-set_temp_class_bits(struct v3d_ra_node_info *nodes,
|
||||
+set_temp_class_bits(struct v3d_compile *c,
|
||||
uint32_t temp, uint8_t class_bits)
|
||||
{
|
||||
- nodes->info[temp_to_node(temp)].class_bits = class_bits;
|
||||
+ c->nodes.info[temp_to_node(c, temp)].class_bits = class_bits;
|
||||
}
|
||||
|
||||
static struct ra_class *
|
||||
@@ -84,7 +85,7 @@ static inline struct ra_class *
|
||||
choose_reg_class_for_temp(struct v3d_compile *c, uint32_t temp)
|
||||
{
|
||||
assert(temp < c->num_temps && temp < c->nodes.alloc_count);
|
||||
- return choose_reg_class(c, get_temp_class_bits(&c->nodes, temp));
|
||||
+ return choose_reg_class(c, get_temp_class_bits(c, temp));
|
||||
}
|
||||
|
||||
static inline bool
|
||||
@@ -313,7 +314,7 @@ v3d_choose_spill_node(struct v3d_compile *c)
|
||||
|
||||
for (unsigned i = 0; i < c->num_temps; i++) {
|
||||
if (BITSET_TEST(c->spillable, i)) {
|
||||
- ra_set_node_spill_cost(c->g, temp_to_node(i),
|
||||
+ ra_set_node_spill_cost(c->g, temp_to_node(c, i),
|
||||
spill_costs[i]);
|
||||
}
|
||||
}
|
||||
@@ -482,7 +483,7 @@ v3d_emit_spill_tmua(struct v3d_compile *c,
|
||||
c->temp_start[i] < ip && c->temp_end[i] >= ip :
|
||||
c->temp_start[i] <= ip && c->temp_end[i] > ip;
|
||||
if (thrsw_cross) {
|
||||
- ra_set_node_class(c->g, temp_to_node(i),
|
||||
+ ra_set_node_class(c->g, temp_to_node(c, i),
|
||||
choose_reg_class(c, CLASS_BITS_PHYS));
|
||||
}
|
||||
}
|
||||
@@ -509,8 +510,7 @@ v3d_emit_tmu_spill(struct v3d_compile *c,
|
||||
* same register class bits as the original.
|
||||
*/
|
||||
if (inst == position) {
|
||||
- uint8_t class_bits = get_temp_class_bits(&c->nodes,
|
||||
- inst->dst.index);
|
||||
+ uint8_t class_bits = get_temp_class_bits(c, inst->dst.index);
|
||||
inst->dst = vir_get_temp(c);
|
||||
add_node(c, inst->dst.index, class_bits);
|
||||
} else {
|
||||
@@ -574,7 +574,7 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
|
||||
reconstruct_op = orig_def->qpu.alu.add.op;
|
||||
}
|
||||
|
||||
- uint32_t spill_node = temp_to_node(spill_temp);
|
||||
+ uint32_t spill_node = temp_to_node(c, spill_temp);
|
||||
|
||||
/* We must disable the ldunif optimization if we are spilling uniforms */
|
||||
bool had_disable_ldunif_opt = c->disable_ldunif_opt;
|
||||
@@ -739,12 +739,12 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
|
||||
* update node priorities based one new liveness data.
|
||||
*/
|
||||
uint32_t sb_temp =c->spill_base.index;
|
||||
- uint32_t sb_node = temp_to_node(sb_temp);
|
||||
+ uint32_t sb_node = temp_to_node(c, sb_temp);
|
||||
for (uint32_t i = 0; i < c->num_temps; i++) {
|
||||
if (c->temp_end[i] == -1)
|
||||
continue;
|
||||
|
||||
- uint32_t node_i = temp_to_node(i);
|
||||
+ uint32_t node_i = temp_to_node(c, i);
|
||||
c->nodes.info[node_i].priority =
|
||||
c->temp_end[i] - c->temp_start[i];
|
||||
|
||||
@@ -752,7 +752,7 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
|
||||
j < c->num_temps; j++) {
|
||||
if (interferes(c->temp_start[i], c->temp_end[i],
|
||||
c->temp_start[j], c->temp_end[j])) {
|
||||
- uint32_t node_j = temp_to_node(j);
|
||||
+ uint32_t node_j = temp_to_node(c, j);
|
||||
ra_add_node_interference(c->g, node_i, node_j);
|
||||
}
|
||||
}
|
||||
@@ -958,7 +958,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
|
||||
for (int i = 0; i < c->num_temps; i++) {
|
||||
if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
|
||||
ra_add_node_interference(c->g,
|
||||
- temp_to_node(i),
|
||||
+ temp_to_node(c, i),
|
||||
acc_nodes[3]);
|
||||
}
|
||||
}
|
||||
@@ -968,7 +968,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
|
||||
for (int i = 0; i < c->num_temps; i++) {
|
||||
if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
|
||||
ra_add_node_interference(c->g,
|
||||
- temp_to_node(i),
|
||||
+ temp_to_node(c, i),
|
||||
acc_nodes[4]);
|
||||
}
|
||||
}
|
||||
@@ -987,7 +987,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
|
||||
* decides whether the LDVPM is in or out)
|
||||
*/
|
||||
assert(inst->dst.file == QFILE_TEMP);
|
||||
- set_temp_class_bits(&c->nodes, inst->dst.index,
|
||||
+ set_temp_class_bits(c, inst->dst.index,
|
||||
CLASS_BITS_PHYS);
|
||||
break;
|
||||
}
|
||||
@@ -1002,7 +1002,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
|
||||
* phys regfile.
|
||||
*/
|
||||
assert(inst->dst.file == QFILE_TEMP);
|
||||
- set_temp_class_bits(&c->nodes, inst->dst.index,
|
||||
+ set_temp_class_bits(c, inst->dst.index,
|
||||
CLASS_BITS_PHYS);
|
||||
break;
|
||||
}
|
||||
@@ -1024,7 +1024,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
|
||||
*/
|
||||
assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV);
|
||||
assert(inst->dst.file == QFILE_TEMP);
|
||||
- uint32_t node = temp_to_node(inst->dst.index);
|
||||
+ uint32_t node = temp_to_node(c, inst->dst.index);
|
||||
ra_set_node_reg(c->g, node,
|
||||
PHYS_INDEX + inst->src[0].index);
|
||||
break;
|
||||
@@ -1043,9 +1043,9 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
|
||||
*/
|
||||
if (!inst->qpu.sig.ldunif) {
|
||||
uint8_t class_bits =
|
||||
- get_temp_class_bits(&c->nodes, inst->dst.index) &
|
||||
+ get_temp_class_bits(c, inst->dst.index) &
|
||||
~CLASS_BITS_R5;
|
||||
- set_temp_class_bits(&c->nodes, inst->dst.index,
|
||||
+ set_temp_class_bits(c, inst->dst.index,
|
||||
class_bits);
|
||||
|
||||
} else {
|
||||
@@ -1054,7 +1054,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
|
||||
* loads interfere with each other.
|
||||
*/
|
||||
if (c->devinfo->ver < 40) {
|
||||
- set_temp_class_bits(&c->nodes, inst->dst.index,
|
||||
+ set_temp_class_bits(c, inst->dst.index,
|
||||
CLASS_BITS_R5);
|
||||
}
|
||||
}
|
||||
@@ -1064,7 +1064,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
|
||||
if (inst->qpu.sig.thrsw) {
|
||||
for (int i = 0; i < c->num_temps; i++) {
|
||||
if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
|
||||
- set_temp_class_bits(&c->nodes, i,
|
||||
+ set_temp_class_bits(c, i,
|
||||
CLASS_BITS_PHYS);
|
||||
}
|
||||
}
|
||||
@@ -1125,7 +1125,7 @@ v3d_register_allocate(struct v3d_compile *c)
|
||||
c->nodes.info[i].priority = 0;
|
||||
c->nodes.info[i].class_bits = 0;
|
||||
} else {
|
||||
- uint32_t t = node_to_temp(i);
|
||||
+ uint32_t t = node_to_temp(c, i);
|
||||
c->nodes.info[i].priority =
|
||||
c->temp_end[t] - c->temp_start[t];
|
||||
c->nodes.info[i].class_bits = CLASS_BITS_ANY;
|
||||
@@ -1143,7 +1143,7 @@ v3d_register_allocate(struct v3d_compile *c)
|
||||
|
||||
/* Set the register classes for all our temporaries in the graph */
|
||||
for (uint32_t i = 0; i < c->num_temps; i++) {
|
||||
- ra_set_node_class(c->g, temp_to_node(i),
|
||||
+ ra_set_node_class(c->g, temp_to_node(c, i),
|
||||
choose_reg_class_for_temp(c, i));
|
||||
}
|
||||
|
||||
@@ -1153,8 +1153,8 @@ v3d_register_allocate(struct v3d_compile *c)
|
||||
if (interferes(c->temp_start[i], c->temp_end[i],
|
||||
c->temp_start[j], c->temp_end[j])) {
|
||||
ra_add_node_interference(c->g,
|
||||
- temp_to_node(i),
|
||||
- temp_to_node(j));
|
||||
+ temp_to_node(c, i),
|
||||
+ temp_to_node(c, j));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1171,7 +1171,7 @@ v3d_register_allocate(struct v3d_compile *c)
|
||||
if (c->spill_size <
|
||||
V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) {
|
||||
int node = v3d_choose_spill_node(c);
|
||||
- uint32_t temp = node_to_temp(node);
|
||||
+ uint32_t temp = node_to_temp(c, node);
|
||||
if (node != -1) {
|
||||
v3d_spill_reg(c, acc_nodes, temp);
|
||||
continue;
|
||||
@@ -1186,7 +1186,7 @@ v3d_register_allocate(struct v3d_compile *c)
|
||||
if (node == -1)
|
||||
goto spill_fail;
|
||||
|
||||
- uint32_t temp = node_to_temp(node);
|
||||
+ uint32_t temp = node_to_temp(c, node);
|
||||
enum temp_spill_type spill_type =
|
||||
get_spill_type_for_temp(c, temp);
|
||||
if (spill_type != SPILL_TYPE_TMU || tmu_spilling_allowed(c)) {
|
||||
@@ -1201,7 +1201,7 @@ v3d_register_allocate(struct v3d_compile *c)
|
||||
/* Allocation was successful, build the 'temp -> reg' map */
|
||||
temp_registers = calloc(c->num_temps, sizeof(*temp_registers));
|
||||
for (uint32_t i = 0; i < c->num_temps; i++) {
|
||||
- int ra_reg = ra_get_node_reg(c->g, temp_to_node(i));
|
||||
+ int ra_reg = ra_get_node_reg(c->g, temp_to_node(c, i));
|
||||
if (ra_reg < PHYS_INDEX) {
|
||||
temp_registers[i].magic = true;
|
||||
temp_registers[i].index = (V3D_QPU_WADDR_R0 +
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,144 @@
|
||||
From 9b2dfe0286212aba3687a06023cc5b4ce9944ee0 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Mon, 23 Aug 2021 02:18:43 +0200
|
||||
Subject: [PATCH 018/139] broadcom/compiler: phys index depends on hw version
|
||||
|
||||
For 7.1 there are no accumulators, so we replace the macro with a
|
||||
function call.
|
||||
---
|
||||
src/broadcom/compiler/vir_register_allocate.c | 39 ++++++++++++++-----
|
||||
1 file changed, 29 insertions(+), 10 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
|
||||
index aa9473d124b..a358b616e13 100644
|
||||
--- a/src/broadcom/compiler/vir_register_allocate.c
|
||||
+++ b/src/broadcom/compiler/vir_register_allocate.c
|
||||
@@ -28,9 +28,19 @@
|
||||
|
||||
#define ACC_INDEX 0
|
||||
#define ACC_COUNT 6
|
||||
-#define PHYS_INDEX (ACC_INDEX + ACC_COUNT)
|
||||
-#define PHYS_COUNT 64
|
||||
|
||||
+#define PHYS_COUNT 64
|
||||
+
|
||||
+static uint8_t
|
||||
+get_phys_index(const struct v3d_device_info *devinfo)
|
||||
+{
|
||||
+ if (devinfo->has_accumulators)
|
||||
+ return ACC_INDEX + ACC_COUNT;
|
||||
+ else
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+/* ACC as accumulator */
|
||||
#define CLASS_BITS_PHYS (1 << 0)
|
||||
#define CLASS_BITS_ACC (1 << 1)
|
||||
#define CLASS_BITS_R5 (1 << 4)
|
||||
@@ -771,9 +781,11 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
|
||||
}
|
||||
|
||||
struct v3d_ra_select_callback_data {
|
||||
+ uint32_t phys_index;
|
||||
uint32_t next_acc;
|
||||
uint32_t next_phys;
|
||||
struct v3d_ra_node_info *nodes;
|
||||
+ const struct v3d_device_info *devinfo;
|
||||
};
|
||||
|
||||
/* Choosing accumulators improves chances of merging QPU instructions
|
||||
@@ -794,7 +806,7 @@ v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra,
|
||||
static const int available_rf_threshold = 5;
|
||||
int available_rf = 0 ;
|
||||
for (int i = 0; i < PHYS_COUNT; i++) {
|
||||
- if (BITSET_TEST(regs, PHYS_INDEX + i))
|
||||
+ if (BITSET_TEST(regs, v3d_ra->phys_index + i))
|
||||
available_rf++;
|
||||
if (available_rf >= available_rf_threshold)
|
||||
break;
|
||||
@@ -854,7 +866,7 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
|
||||
{
|
||||
for (int i = 0; i < PHYS_COUNT; i++) {
|
||||
int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
|
||||
- int phys = PHYS_INDEX + phys_off;
|
||||
+ int phys = v3d_ra->phys_index + phys_off;
|
||||
|
||||
if (BITSET_TEST(regs, phys)) {
|
||||
v3d_ra->next_phys = phys_off + 1;
|
||||
@@ -896,8 +908,9 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
|
||||
* register file can be divided up for fragment shader threading.
|
||||
*/
|
||||
int max_thread_index = (compiler->devinfo->ver >= 40 ? 2 : 3);
|
||||
+ uint8_t phys_index = get_phys_index(compiler->devinfo);
|
||||
|
||||
- compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT,
|
||||
+ compiler->regs = ra_alloc_reg_set(compiler, phys_index + PHYS_COUNT,
|
||||
false);
|
||||
if (!compiler->regs)
|
||||
return false;
|
||||
@@ -912,8 +925,8 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
|
||||
compiler->reg_class_phys[threads] =
|
||||
ra_alloc_contig_reg_class(compiler->regs, 1);
|
||||
|
||||
- for (int i = PHYS_INDEX;
|
||||
- i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) {
|
||||
+ for (int i = phys_index;
|
||||
+ i < phys_index + (PHYS_COUNT >> threads); i++) {
|
||||
ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
|
||||
ra_class_add_reg(compiler->reg_class_phys[threads], i);
|
||||
ra_class_add_reg(compiler->reg_class_any[threads], i);
|
||||
@@ -1026,7 +1039,8 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
|
||||
assert(inst->dst.file == QFILE_TEMP);
|
||||
uint32_t node = temp_to_node(c, inst->dst.index);
|
||||
ra_set_node_reg(c->g, node,
|
||||
- PHYS_INDEX + inst->src[0].index);
|
||||
+ get_phys_index(c->devinfo) +
|
||||
+ inst->src[0].index);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -1086,13 +1100,17 @@ v3d_register_allocate(struct v3d_compile *c)
|
||||
c->num_temps + ACC_COUNT),
|
||||
};
|
||||
|
||||
+ uint32_t phys_index = get_phys_index(c->devinfo);
|
||||
+
|
||||
struct v3d_ra_select_callback_data callback_data = {
|
||||
+ .phys_index = phys_index,
|
||||
.next_acc = 0,
|
||||
/* Start at RF3, to try to keep the TLB writes from using
|
||||
* RF0-2.
|
||||
*/
|
||||
.next_phys = 3,
|
||||
.nodes = &c->nodes,
|
||||
+ .devinfo = c->devinfo,
|
||||
};
|
||||
|
||||
vir_calculate_live_intervals(c);
|
||||
@@ -1139,6 +1157,7 @@ v3d_register_allocate(struct v3d_compile *c)
|
||||
vir_for_each_inst_inorder(inst, c) {
|
||||
inst->ip = ip++;
|
||||
update_graph_and_reg_classes_for_inst(c, acc_nodes, inst);
|
||||
+
|
||||
}
|
||||
|
||||
/* Set the register classes for all our temporaries in the graph */
|
||||
@@ -1202,13 +1221,13 @@ v3d_register_allocate(struct v3d_compile *c)
|
||||
temp_registers = calloc(c->num_temps, sizeof(*temp_registers));
|
||||
for (uint32_t i = 0; i < c->num_temps; i++) {
|
||||
int ra_reg = ra_get_node_reg(c->g, temp_to_node(c, i));
|
||||
- if (ra_reg < PHYS_INDEX) {
|
||||
+ if (ra_reg < phys_index) {
|
||||
temp_registers[i].magic = true;
|
||||
temp_registers[i].index = (V3D_QPU_WADDR_R0 +
|
||||
ra_reg - ACC_INDEX);
|
||||
} else {
|
||||
temp_registers[i].magic = false;
|
||||
- temp_registers[i].index = ra_reg - PHYS_INDEX;
|
||||
+ temp_registers[i].index = ra_reg - phys_index;
|
||||
}
|
||||
}
|
||||
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,40 @@
|
||||
From da0a3deadf86a46c8323267d3f6a49e442835608 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Fri, 17 Sep 2021 01:07:06 +0200
|
||||
Subject: [PATCH 019/139] broadcom/compiler: don't favor/select accum registers
|
||||
for hw not supporting it
|
||||
|
||||
Note that what we do is to just return false on the favor/select accum
|
||||
methods. We could just avoid calling them, but as the select is called
|
||||
more than once, it is just easier this way.
|
||||
---
|
||||
src/broadcom/compiler/vir_register_allocate.c | 6 ++++++
|
||||
1 file changed, 6 insertions(+)
|
||||
|
||||
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
|
||||
index a358b616e13..1f495180784 100644
|
||||
--- a/src/broadcom/compiler/vir_register_allocate.c
|
||||
+++ b/src/broadcom/compiler/vir_register_allocate.c
|
||||
@@ -797,6 +797,9 @@ v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra,
|
||||
BITSET_WORD *regs,
|
||||
int priority)
|
||||
{
|
||||
+ if (!v3d_ra->devinfo->has_accumulators)
|
||||
+ return false;
|
||||
+
|
||||
/* Favor accumulators if we have less that this number of physical
|
||||
* registers. Accumulators have more restrictions (like being
|
||||
* invalidated through thrsw), so running out of physical registers
|
||||
@@ -832,6 +835,9 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra,
|
||||
BITSET_WORD *regs,
|
||||
unsigned int *out)
|
||||
{
|
||||
+ if (!v3d_ra->devinfo->has_accumulators)
|
||||
+ return false;
|
||||
+
|
||||
/* Choose r5 for our ldunifs if possible (nobody else can load to that
|
||||
* reg, and it keeps the QPU cond field free from being occupied by
|
||||
* ldunifrf).
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,105 @@
|
||||
From 6c04d7c917da6b38f8b2b4306ab03ed2ab7e6ce0 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Thu, 9 Sep 2021 00:28:53 +0200
|
||||
Subject: [PATCH 020/139] broadcom/vir: implement is_no_op_mov for v71
|
||||
|
||||
Did some refactoring/splitting.
|
||||
---
|
||||
src/broadcom/compiler/vir_to_qpu.c | 66 ++++++++++++++++++++++++------
|
||||
1 file changed, 53 insertions(+), 13 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c
|
||||
index c8b6e0a91a0..08970d52954 100644
|
||||
--- a/src/broadcom/compiler/vir_to_qpu.c
|
||||
+++ b/src/broadcom/compiler/vir_to_qpu.c
|
||||
@@ -129,19 +129,8 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
|
||||
}
|
||||
|
||||
static bool
|
||||
-is_no_op_mov(struct qinst *qinst)
|
||||
+v3d33_mov_src_and_dst_equal(struct qinst *qinst)
|
||||
{
|
||||
- static const struct v3d_qpu_sig no_sig = {0};
|
||||
-
|
||||
- /* Make sure it's just a lone MOV. */
|
||||
- if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
|
||||
- qinst->qpu.alu.mul.op != V3D_QPU_M_MOV ||
|
||||
- qinst->qpu.alu.add.op != V3D_QPU_A_NOP ||
|
||||
- memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) {
|
||||
- return false;
|
||||
- }
|
||||
-
|
||||
- /* Check if it's a MOV from a register to itself. */
|
||||
enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr;
|
||||
if (qinst->qpu.alu.mul.magic_write) {
|
||||
if (waddr < V3D_QPU_WADDR_R0 || waddr > V3D_QPU_WADDR_R4)
|
||||
@@ -168,6 +157,57 @@ is_no_op_mov(struct qinst *qinst)
|
||||
return false;
|
||||
}
|
||||
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
+static bool
|
||||
+v3d71_mov_src_and_dst_equal(struct qinst *qinst)
|
||||
+{
|
||||
+ if (qinst->qpu.alu.mul.magic_write)
|
||||
+ return false;
|
||||
+
|
||||
+ enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr;
|
||||
+ int raddr;
|
||||
+
|
||||
+ raddr = qinst->qpu.alu.mul.a.raddr;
|
||||
+ if (raddr != waddr)
|
||||
+ return false;
|
||||
+
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
+static bool
|
||||
+mov_src_and_dst_equal(struct qinst *qinst,
|
||||
+ const struct v3d_device_info *devinfo)
|
||||
+{
|
||||
+ if (devinfo->ver < 71)
|
||||
+ return v3d33_mov_src_and_dst_equal(qinst);
|
||||
+ else
|
||||
+ return v3d71_mov_src_and_dst_equal(qinst);
|
||||
+}
|
||||
+
|
||||
+
|
||||
+static bool
|
||||
+is_no_op_mov(struct qinst *qinst,
|
||||
+ const struct v3d_device_info *devinfo)
|
||||
+{
|
||||
+ static const struct v3d_qpu_sig no_sig = {0};
|
||||
+
|
||||
+ /* Make sure it's just a lone MOV. We only check for M_MOV. Although
|
||||
+ * for V3D 7.x there is also A_MOV, we don't need to check for it as
|
||||
+ * we always emit using M_MOV. We could use A_MOV later on the
|
||||
+ * squedule to improve performance
|
||||
+ */
|
||||
+ if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
|
||||
+ qinst->qpu.alu.mul.op != V3D_QPU_M_MOV ||
|
||||
+ qinst->qpu.alu.add.op != V3D_QPU_A_NOP ||
|
||||
+ memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) {
|
||||
+ return false;
|
||||
+ }
|
||||
+
|
||||
+ if (!mov_src_and_dst_equal(qinst, devinfo))
|
||||
+ return false;
|
||||
+
|
||||
/* No packing or flags updates, or we need to execute the
|
||||
* instruction.
|
||||
*/
|
||||
@@ -324,7 +364,7 @@ v3d_generate_code_block(struct v3d_compile *c,
|
||||
qinst->qpu.alu.mul.waddr = dst.index;
|
||||
qinst->qpu.alu.mul.magic_write = dst.magic;
|
||||
|
||||
- if (is_no_op_mov(qinst)) {
|
||||
+ if (is_no_op_mov(qinst, c->devinfo)) {
|
||||
vir_remove_instruction(c, qinst);
|
||||
continue;
|
||||
}
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,104 @@
|
||||
From 7b5be2d9b178a45c34c22db2744639a6a8a216d1 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Thu, 9 Sep 2021 01:18:54 +0200
|
||||
Subject: [PATCH 021/139] broadcom/compiler: update vir_to_qpu::set_src for v71
|
||||
|
||||
---
|
||||
src/broadcom/compiler/vir_to_qpu.c | 47 ++++++++++++++++++++++++++----
|
||||
1 file changed, 42 insertions(+), 5 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c
|
||||
index 08970d52954..afc4941fdb1 100644
|
||||
--- a/src/broadcom/compiler/vir_to_qpu.c
|
||||
+++ b/src/broadcom/compiler/vir_to_qpu.c
|
||||
@@ -86,12 +86,22 @@ new_qpu_nop_before(struct qinst *inst)
|
||||
return q;
|
||||
}
|
||||
|
||||
+static void
|
||||
+v3d71_set_src(struct v3d_qpu_instr *instr, uint8_t *raddr, struct qpu_reg src)
|
||||
+{
|
||||
+ if (src.smimm)
|
||||
+ unreachable("v3d71_set_src: pending handling small immediates");
|
||||
+
|
||||
+ assert(!src.magic);
|
||||
+ *raddr = src.index;
|
||||
+}
|
||||
+
|
||||
/**
|
||||
* Allocates the src register (accumulator or register file) into the RADDR
|
||||
* fields of the instruction.
|
||||
*/
|
||||
static void
|
||||
-set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
|
||||
+v3d33_set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
|
||||
{
|
||||
if (src.smimm) {
|
||||
assert(instr->sig.small_imm_b);
|
||||
@@ -128,6 +138,24 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
|
||||
}
|
||||
}
|
||||
|
||||
+/*
|
||||
+ * The main purpose of the following wrapper is to make calling set_src
|
||||
+ * cleaner. This is the reason it receives both mux and raddr pointers. Those
|
||||
+ * will be filled or not based on the device version.
|
||||
+ */
|
||||
+static void
|
||||
+set_src(struct v3d_qpu_instr *instr,
|
||||
+ enum v3d_qpu_mux *mux,
|
||||
+ uint8_t *raddr,
|
||||
+ struct qpu_reg src,
|
||||
+ const struct v3d_device_info *devinfo)
|
||||
+{
|
||||
+ if (devinfo->ver < 71)
|
||||
+ return v3d33_set_src(instr, mux, src);
|
||||
+ else
|
||||
+ return v3d71_set_src(instr, raddr, src);
|
||||
+}
|
||||
+
|
||||
static bool
|
||||
v3d33_mov_src_and_dst_equal(struct qinst *qinst)
|
||||
{
|
||||
@@ -340,13 +368,18 @@ v3d_generate_code_block(struct v3d_compile *c,
|
||||
qinst->qpu.sig_magic = dst.magic;
|
||||
} else if (qinst->qpu.alu.add.op != V3D_QPU_A_NOP) {
|
||||
assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);
|
||||
+
|
||||
if (nsrc >= 1) {
|
||||
set_src(&qinst->qpu,
|
||||
- &qinst->qpu.alu.add.a.mux, src[0]);
|
||||
+ &qinst->qpu.alu.add.a.mux,
|
||||
+ &qinst->qpu.alu.add.a.raddr,
|
||||
+ src[0], c->devinfo);
|
||||
}
|
||||
if (nsrc >= 2) {
|
||||
set_src(&qinst->qpu,
|
||||
- &qinst->qpu.alu.add.b.mux, src[1]);
|
||||
+ &qinst->qpu.alu.add.b.mux,
|
||||
+ &qinst->qpu.alu.add.b.raddr,
|
||||
+ src[1], c->devinfo);
|
||||
}
|
||||
|
||||
qinst->qpu.alu.add.waddr = dst.index;
|
||||
@@ -354,11 +387,15 @@ v3d_generate_code_block(struct v3d_compile *c,
|
||||
} else {
|
||||
if (nsrc >= 1) {
|
||||
set_src(&qinst->qpu,
|
||||
- &qinst->qpu.alu.mul.a.mux, src[0]);
|
||||
+ &qinst->qpu.alu.mul.a.mux,
|
||||
+ &qinst->qpu.alu.mul.a.raddr,
|
||||
+ src[0], c->devinfo);
|
||||
}
|
||||
if (nsrc >= 2) {
|
||||
set_src(&qinst->qpu,
|
||||
- &qinst->qpu.alu.mul.b.mux, src[1]);
|
||||
+ &qinst->qpu.alu.mul.b.mux,
|
||||
+ &qinst->qpu.alu.mul.b.raddr,
|
||||
+ src[1], c->devinfo);
|
||||
}
|
||||
|
||||
qinst->qpu.alu.mul.waddr = dst.index;
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,92 @@
|
||||
From fe89703008f2a3d6bfe6e260791f712013be5e48 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Thu, 9 Sep 2021 23:59:28 +0200
|
||||
Subject: [PATCH 022/139] broadcom/qpu_schedule: add process_raddr_deps
|
||||
|
||||
On v71 we don't have muxes, but more raddr. Adding an equivalent add
|
||||
deps function.
|
||||
---
|
||||
src/broadcom/compiler/qpu_schedule.c | 52 +++++++++++++++++++++++-----
|
||||
1 file changed, 44 insertions(+), 8 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
|
||||
index 455fa3867be..89254643c90 100644
|
||||
--- a/src/broadcom/compiler/qpu_schedule.c
|
||||
+++ b/src/broadcom/compiler/qpu_schedule.c
|
||||
@@ -155,6 +155,7 @@ static void
|
||||
process_mux_deps(struct schedule_state *state, struct schedule_node *n,
|
||||
enum v3d_qpu_mux mux)
|
||||
{
|
||||
+ assert(state->devinfo->ver < 71);
|
||||
switch (mux) {
|
||||
case V3D_QPU_MUX_A:
|
||||
add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
|
||||
@@ -171,6 +172,17 @@ process_mux_deps(struct schedule_state *state, struct schedule_node *n,
|
||||
}
|
||||
}
|
||||
|
||||
+
|
||||
+static void
|
||||
+process_raddr_deps(struct schedule_state *state, struct schedule_node *n,
|
||||
+ uint8_t raddr, bool is_small_imm)
|
||||
+{
|
||||
+ assert(state->devinfo->ver >= 71);
|
||||
+
|
||||
+ if (!is_small_imm)
|
||||
+ add_read_dep(state, state->last_rf[raddr], n);
|
||||
+}
|
||||
+
|
||||
static bool
|
||||
tmu_write_is_sequence_terminator(uint32_t waddr)
|
||||
{
|
||||
@@ -305,15 +317,39 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
|
||||
|
||||
/* XXX: LOAD_IMM */
|
||||
|
||||
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0)
|
||||
- process_mux_deps(state, n, inst->alu.add.a.mux);
|
||||
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1)
|
||||
- process_mux_deps(state, n, inst->alu.add.b.mux);
|
||||
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) {
|
||||
+ if (devinfo->ver < 71) {
|
||||
+ process_mux_deps(state, n, inst->alu.add.a.mux);
|
||||
+ } else {
|
||||
+ process_raddr_deps(state, n, inst->alu.add.a.raddr,
|
||||
+ inst->sig.small_imm_a);
|
||||
+ }
|
||||
+ }
|
||||
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) {
|
||||
+ if (devinfo->ver < 71) {
|
||||
+ process_mux_deps(state, n, inst->alu.add.b.mux);
|
||||
+ } else {
|
||||
+ process_raddr_deps(state, n, inst->alu.add.b.raddr,
|
||||
+ inst->sig.small_imm_b);
|
||||
+ }
|
||||
+ }
|
||||
|
||||
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0)
|
||||
- process_mux_deps(state, n, inst->alu.mul.a.mux);
|
||||
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1)
|
||||
- process_mux_deps(state, n, inst->alu.mul.b.mux);
|
||||
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) {
|
||||
+ if (devinfo->ver < 71) {
|
||||
+ process_mux_deps(state, n, inst->alu.mul.a.mux);
|
||||
+ } else {
|
||||
+ process_raddr_deps(state, n, inst->alu.mul.a.raddr,
|
||||
+ inst->sig.small_imm_c);
|
||||
+ }
|
||||
+ }
|
||||
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) {
|
||||
+ if (devinfo->ver < 71) {
|
||||
+ process_mux_deps(state, n, inst->alu.mul.b.mux);
|
||||
+ } else {
|
||||
+ process_raddr_deps(state, n, inst->alu.mul.b.raddr,
|
||||
+ inst->sig.small_imm_d);
|
||||
+ }
|
||||
+ }
|
||||
|
||||
switch (inst->alu.add.op) {
|
||||
case V3D_QPU_A_VPMSETUP:
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,128 @@
|
||||
From 20ce426df1ab2546332141f4bc4531ada754cdea Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Fri, 10 Sep 2021 01:20:44 +0200
|
||||
Subject: [PATCH 023/139] broadcom/qpu: update disasm_raddr for v71
|
||||
|
||||
---
|
||||
src/broadcom/qpu/qpu_disasm.c | 72 ++++++++++++++++++++++++++++++++---
|
||||
1 file changed, 66 insertions(+), 6 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/qpu/qpu_disasm.c b/src/broadcom/qpu/qpu_disasm.c
|
||||
index 588a665f770..b613de781dc 100644
|
||||
--- a/src/broadcom/qpu/qpu_disasm.c
|
||||
+++ b/src/broadcom/qpu/qpu_disasm.c
|
||||
@@ -56,8 +56,9 @@ pad_to(struct disasm_state *disasm, int n)
|
||||
|
||||
|
||||
static void
|
||||
-v3d_qpu_disasm_raddr(struct disasm_state *disasm,
|
||||
- const struct v3d_qpu_instr *instr, uint8_t mux)
|
||||
+v3d33_qpu_disasm_raddr(struct disasm_state *disasm,
|
||||
+ const struct v3d_qpu_instr *instr,
|
||||
+ enum v3d_qpu_mux mux)
|
||||
{
|
||||
if (mux == V3D_QPU_MUX_A) {
|
||||
append(disasm, "rf%d", instr->raddr_a);
|
||||
@@ -82,6 +83,65 @@ v3d_qpu_disasm_raddr(struct disasm_state *disasm,
|
||||
}
|
||||
}
|
||||
|
||||
+enum v3d_qpu_input_class {
|
||||
+ V3D_QPU_ADD_A,
|
||||
+ V3D_QPU_ADD_B,
|
||||
+ V3D_QPU_MUL_A,
|
||||
+ V3D_QPU_MUL_B
|
||||
+};
|
||||
+
|
||||
+static void
|
||||
+v3d71_qpu_disasm_raddr(struct disasm_state *disasm,
|
||||
+ const struct v3d_qpu_instr *instr,
|
||||
+ uint8_t raddr,
|
||||
+ enum v3d_qpu_input_class input_class)
|
||||
+{
|
||||
+ bool is_small_imm = false;
|
||||
+ switch(input_class) {
|
||||
+ case V3D_QPU_ADD_A:
|
||||
+ is_small_imm = instr->sig.small_imm_a;
|
||||
+ break;
|
||||
+ case V3D_QPU_ADD_B:
|
||||
+ is_small_imm = instr->sig.small_imm_b;
|
||||
+ break;
|
||||
+ case V3D_QPU_MUL_A:
|
||||
+ is_small_imm = instr->sig.small_imm_c;
|
||||
+ break;
|
||||
+ case V3D_QPU_MUL_B:
|
||||
+ is_small_imm = instr->sig.small_imm_d;
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
+ if (is_small_imm) {
|
||||
+ unreachable("Pending handling small immediates");
|
||||
+ uint32_t val;
|
||||
+ ASSERTED bool ok =
|
||||
+ v3d_qpu_small_imm_unpack(disasm->devinfo,
|
||||
+ raddr,
|
||||
+ &val);
|
||||
+
|
||||
+ if ((int)val >= -16 && (int)val <= 15)
|
||||
+ append(disasm, "%d", val);
|
||||
+ else
|
||||
+ append(disasm, "0x%08x", val);
|
||||
+ assert(ok);
|
||||
+ } else {
|
||||
+ append(disasm, "rf%d", raddr);
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+static void
|
||||
+v3d_qpu_disasm_raddr(struct disasm_state *disasm,
|
||||
+ const struct v3d_qpu_instr *instr,
|
||||
+ const struct v3d_qpu_input *input,
|
||||
+ enum v3d_qpu_input_class input_class)
|
||||
+{
|
||||
+ if (disasm->devinfo->ver < 71)
|
||||
+ v3d33_qpu_disasm_raddr(disasm, instr, input->mux);
|
||||
+ else
|
||||
+ v3d71_qpu_disasm_raddr(disasm, instr, input->raddr, input_class);
|
||||
+}
|
||||
+
|
||||
static void
|
||||
v3d_qpu_disasm_waddr(struct disasm_state *disasm, uint32_t waddr, bool magic)
|
||||
{
|
||||
@@ -121,14 +181,14 @@ v3d_qpu_disasm_add(struct disasm_state *disasm,
|
||||
if (num_src >= 1) {
|
||||
if (has_dst)
|
||||
append(disasm, ", ");
|
||||
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.a.mux);
|
||||
+ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.add.a, V3D_QPU_ADD_A);
|
||||
append(disasm, "%s",
|
||||
v3d_qpu_unpack_name(instr->alu.add.a.unpack));
|
||||
}
|
||||
|
||||
if (num_src >= 2) {
|
||||
append(disasm, ", ");
|
||||
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.b.mux);
|
||||
+ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.add.b, V3D_QPU_ADD_B);
|
||||
append(disasm, "%s",
|
||||
v3d_qpu_unpack_name(instr->alu.add.b.unpack));
|
||||
}
|
||||
@@ -164,14 +224,14 @@ v3d_qpu_disasm_mul(struct disasm_state *disasm,
|
||||
if (num_src >= 1) {
|
||||
if (has_dst)
|
||||
append(disasm, ", ");
|
||||
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.a.mux);
|
||||
+ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.mul.a, V3D_QPU_MUL_A);
|
||||
append(disasm, "%s",
|
||||
v3d_qpu_unpack_name(instr->alu.mul.a.unpack));
|
||||
}
|
||||
|
||||
if (num_src >= 2) {
|
||||
append(disasm, ", ");
|
||||
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.b.mux);
|
||||
+ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.mul.b, V3D_QPU_MUL_B);
|
||||
append(disasm, "%s",
|
||||
v3d_qpu_unpack_name(instr->alu.mul.b.unpack));
|
||||
}
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,59 @@
|
||||
From 7263fa24a3c57b1dcd4d870670cda86ae89aa28c Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Wed, 15 Sep 2021 10:55:49 +0200
|
||||
Subject: [PATCH 024/139] broadcom/qpu: return false on
|
||||
qpu_writes_accumulatorXX helpers for v71
|
||||
|
||||
As v71 doesn't have accumulators (devinfo->has_accumulators set to
|
||||
false), those methods would always return false.
|
||||
---
|
||||
src/broadcom/qpu/qpu_instr.c | 12 ++++++++++++
|
||||
1 file changed, 12 insertions(+)
|
||||
|
||||
diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
|
||||
index 8de99c611d5..7ec3c867260 100644
|
||||
--- a/src/broadcom/qpu/qpu_instr.c
|
||||
+++ b/src/broadcom/qpu/qpu_instr.c
|
||||
@@ -854,6 +854,9 @@ bool
|
||||
v3d_qpu_writes_r3(const struct v3d_device_info *devinfo,
|
||||
const struct v3d_qpu_instr *inst)
|
||||
{
|
||||
+ if(!devinfo->has_accumulators)
|
||||
+ return false;
|
||||
+
|
||||
if (qpu_writes_magic_waddr_explicitly(devinfo, inst, V3D_QPU_WADDR_R3))
|
||||
return true;
|
||||
|
||||
@@ -864,6 +867,9 @@ bool
|
||||
v3d_qpu_writes_r4(const struct v3d_device_info *devinfo,
|
||||
const struct v3d_qpu_instr *inst)
|
||||
{
|
||||
+ if (!devinfo->has_accumulators)
|
||||
+ return false;
|
||||
+
|
||||
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
|
||||
if (inst->alu.add.op != V3D_QPU_A_NOP &&
|
||||
inst->alu.add.magic_write &&
|
||||
@@ -894,6 +900,9 @@ bool
|
||||
v3d_qpu_writes_r5(const struct v3d_device_info *devinfo,
|
||||
const struct v3d_qpu_instr *inst)
|
||||
{
|
||||
+ if (!devinfo->has_accumulators)
|
||||
+ return false;
|
||||
+
|
||||
if (qpu_writes_magic_waddr_explicitly(devinfo, inst, V3D_QPU_WADDR_R5))
|
||||
return true;
|
||||
|
||||
@@ -904,6 +913,9 @@ bool
|
||||
v3d_qpu_writes_accum(const struct v3d_device_info *devinfo,
|
||||
const struct v3d_qpu_instr *inst)
|
||||
{
|
||||
+ if (!devinfo->has_accumulators)
|
||||
+ return false;
|
||||
+
|
||||
if (v3d_qpu_writes_r5(devinfo, inst))
|
||||
return true;
|
||||
if (v3d_qpu_writes_r4(devinfo, inst))
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,116 @@
|
||||
From 6a9611c5a22218388bba419174d3343e0cdf773b Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Tue, 14 Sep 2021 10:42:55 +0200
|
||||
Subject: [PATCH 025/139] broadcom/compiler: add support for varyings on nir to
|
||||
vir generation for v71
|
||||
|
||||
Needs an update as v71 doesn't have accumulators anymore, and ldvary now
|
||||
uses rf0 to return the value.
|
||||
---
|
||||
src/broadcom/compiler/nir_to_vir.c | 34 +++++++++++++++++-------------
|
||||
1 file changed, 19 insertions(+), 15 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
|
||||
index ca072971f01..79a22c3bd08 100644
|
||||
--- a/src/broadcom/compiler/nir_to_vir.c
|
||||
+++ b/src/broadcom/compiler/nir_to_vir.c
|
||||
@@ -1005,32 +1005,36 @@ emit_fragcoord_input(struct v3d_compile *c, int attr)
|
||||
|
||||
static struct qreg
|
||||
emit_smooth_varying(struct v3d_compile *c,
|
||||
- struct qreg vary, struct qreg w, struct qreg r5)
|
||||
+ struct qreg vary, struct qreg w, struct qreg c_reg)
|
||||
{
|
||||
- return vir_FADD(c, vir_FMUL(c, vary, w), r5);
|
||||
+ return vir_FADD(c, vir_FMUL(c, vary, w), c_reg);
|
||||
}
|
||||
|
||||
static struct qreg
|
||||
emit_noperspective_varying(struct v3d_compile *c,
|
||||
- struct qreg vary, struct qreg r5)
|
||||
+ struct qreg vary, struct qreg c_reg)
|
||||
{
|
||||
- return vir_FADD(c, vir_MOV(c, vary), r5);
|
||||
+ return vir_FADD(c, vir_MOV(c, vary), c_reg);
|
||||
}
|
||||
|
||||
static struct qreg
|
||||
emit_flat_varying(struct v3d_compile *c,
|
||||
- struct qreg vary, struct qreg r5)
|
||||
+ struct qreg vary, struct qreg c_reg)
|
||||
{
|
||||
vir_MOV_dest(c, c->undef, vary);
|
||||
- return vir_MOV(c, r5);
|
||||
+ return vir_MOV(c, c_reg);
|
||||
}
|
||||
|
||||
static struct qreg
|
||||
emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
|
||||
int8_t input_idx, uint8_t swizzle, int array_index)
|
||||
{
|
||||
- struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3);
|
||||
- struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
|
||||
+ struct qreg c_reg; /* C coefficient */
|
||||
+
|
||||
+ if (c->devinfo->has_accumulators)
|
||||
+ c_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
|
||||
+ else
|
||||
+ c_reg = vir_reg(QFILE_REG, 0);
|
||||
|
||||
struct qinst *ldvary = NULL;
|
||||
struct qreg vary;
|
||||
@@ -1041,7 +1045,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
|
||||
vary = vir_emit_def(c, ldvary);
|
||||
} else {
|
||||
vir_NOP(c)->qpu.sig.ldvary = true;
|
||||
- vary = r3;
|
||||
+ vary = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3);
|
||||
}
|
||||
|
||||
/* Store the input value before interpolation so we can implement
|
||||
@@ -1050,7 +1054,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
|
||||
if (input_idx >= 0) {
|
||||
assert(var);
|
||||
c->interp[input_idx].vp = vary;
|
||||
- c->interp[input_idx].C = vir_MOV(c, r5);
|
||||
+ c->interp[input_idx].C = vir_MOV(c, c_reg);
|
||||
c->interp[input_idx].mode = var->data.interpolation;
|
||||
}
|
||||
|
||||
@@ -1060,7 +1064,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
|
||||
*/
|
||||
if (!var) {
|
||||
assert(input_idx < 0);
|
||||
- return emit_smooth_varying(c, vary, c->payload_w, r5);
|
||||
+ return emit_smooth_varying(c, vary, c->payload_w, c_reg);
|
||||
}
|
||||
|
||||
int i = c->num_inputs++;
|
||||
@@ -1075,20 +1079,20 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
|
||||
if (var->data.centroid) {
|
||||
BITSET_SET(c->centroid_flags, i);
|
||||
result = emit_smooth_varying(c, vary,
|
||||
- c->payload_w_centroid, r5);
|
||||
+ c->payload_w_centroid, c_reg);
|
||||
} else {
|
||||
- result = emit_smooth_varying(c, vary, c->payload_w, r5);
|
||||
+ result = emit_smooth_varying(c, vary, c->payload_w, c_reg);
|
||||
}
|
||||
break;
|
||||
|
||||
case INTERP_MODE_NOPERSPECTIVE:
|
||||
BITSET_SET(c->noperspective_flags, i);
|
||||
- result = emit_noperspective_varying(c, vary, r5);
|
||||
+ result = emit_noperspective_varying(c, vary, c_reg);
|
||||
break;
|
||||
|
||||
case INTERP_MODE_FLAT:
|
||||
BITSET_SET(c->flat_shade_flags, i);
|
||||
- result = emit_flat_varying(c, vary, r5);
|
||||
+ result = emit_flat_varying(c, vary, c_reg);
|
||||
break;
|
||||
|
||||
default:
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,55 @@
|
||||
From 06af15a60f7a9c135893e5f8934b8030c1da95f9 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Wed, 15 Sep 2021 01:14:15 +0200
|
||||
Subject: [PATCH 026/139] broadcom/compiler: payload_w is loaded on rf3 for v71
|
||||
|
||||
And in general rf0 is now used for other needs.
|
||||
---
|
||||
src/broadcom/compiler/nir_to_vir.c | 6 +++++-
|
||||
src/broadcom/compiler/vir_register_allocate.c | 6 +++++-
|
||||
2 files changed, 10 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
|
||||
index 79a22c3bd08..1a05b279a2d 100644
|
||||
--- a/src/broadcom/compiler/nir_to_vir.c
|
||||
+++ b/src/broadcom/compiler/nir_to_vir.c
|
||||
@@ -4325,7 +4325,11 @@ nir_to_vir(struct v3d_compile *c)
|
||||
{
|
||||
switch (c->s->info.stage) {
|
||||
case MESA_SHADER_FRAGMENT:
|
||||
- c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
|
||||
+ if (c->devinfo->ver < 71)
|
||||
+ c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
|
||||
+ else
|
||||
+ c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 3));
|
||||
+
|
||||
c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
|
||||
c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
|
||||
|
||||
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
|
||||
index 1f495180784..eca9a6751a6 100644
|
||||
--- a/src/broadcom/compiler/vir_register_allocate.c
|
||||
+++ b/src/broadcom/compiler/vir_register_allocate.c
|
||||
@@ -1034,6 +1034,11 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
|
||||
if (inst->src[0].file == QFILE_REG) {
|
||||
switch (inst->src[0].index) {
|
||||
case 0:
|
||||
+ /* V3D 7.x doesn't use rf0 for thread payload */
|
||||
+ if (c->devinfo->ver >= 71)
|
||||
+ break;
|
||||
+ else
|
||||
+ FALLTHROUGH;
|
||||
case 1:
|
||||
case 2:
|
||||
case 3: {
|
||||
@@ -1163,7 +1168,6 @@ v3d_register_allocate(struct v3d_compile *c)
|
||||
vir_for_each_inst_inorder(inst, c) {
|
||||
inst->ip = ip++;
|
||||
update_graph_and_reg_classes_for_inst(c, acc_nodes, inst);
|
||||
-
|
||||
}
|
||||
|
||||
/* Set the register classes for all our temporaries in the graph */
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,30 @@
|
||||
From d38d8056903b9a4f96ab56261ac3b3c3be0af4fb Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Wed, 15 Sep 2021 11:12:59 +0200
|
||||
Subject: [PATCH 027/139] broadcom/qpu_schedule: update write deps for v71
|
||||
|
||||
We just need to add a write dep if rf0 is written implicitly.
|
||||
|
||||
Note that we don't need to check if we have accumulators when checking
|
||||
for r3/r4/r5, as v3d_qpu_writes_rX would return false for hw versions
|
||||
that don't have accumulators.
|
||||
---
|
||||
src/broadcom/compiler/qpu_schedule.c | 2 ++
|
||||
1 file changed, 2 insertions(+)
|
||||
|
||||
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
|
||||
index 89254643c90..2fa9031d7b6 100644
|
||||
--- a/src/broadcom/compiler/qpu_schedule.c
|
||||
+++ b/src/broadcom/compiler/qpu_schedule.c
|
||||
@@ -422,6 +422,8 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
|
||||
add_write_dep(state, &state->last_r[4], n);
|
||||
if (v3d_qpu_writes_r5(devinfo, inst))
|
||||
add_write_dep(state, &state->last_r[5], n);
|
||||
+ if (v3d_qpu_writes_rf0_implicitly(devinfo, inst))
|
||||
+ add_write_dep(state, &state->last_rf[0], n);
|
||||
|
||||
/* If we add any more dependencies here we should consider whether we
|
||||
* also need to update qpu_inst_after_thrsw_valid_in_delay_slot.
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,140 @@
|
||||
From 7e2a2be830b1672ab846389a46b5d09bad0f7a98 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Thu, 16 Sep 2021 00:49:25 +0200
|
||||
Subject: [PATCH 028/139] broadcom/compiler: update register classes to not
|
||||
include accumulators on v71
|
||||
|
||||
---
|
||||
src/broadcom/compiler/vir_register_allocate.c | 56 ++++++++++++-------
|
||||
1 file changed, 36 insertions(+), 20 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
|
||||
index eca9a6751a6..7b3f6c41934 100644
|
||||
--- a/src/broadcom/compiler/vir_register_allocate.c
|
||||
+++ b/src/broadcom/compiler/vir_register_allocate.c
|
||||
@@ -44,10 +44,15 @@ get_phys_index(const struct v3d_device_info *devinfo)
|
||||
#define CLASS_BITS_PHYS (1 << 0)
|
||||
#define CLASS_BITS_ACC (1 << 1)
|
||||
#define CLASS_BITS_R5 (1 << 4)
|
||||
-#define CLASS_BITS_ANY (CLASS_BITS_PHYS | \
|
||||
- CLASS_BITS_ACC | \
|
||||
- CLASS_BITS_R5)
|
||||
|
||||
+static uint8_t
|
||||
+get_class_bit_any(const struct v3d_device_info *devinfo)
|
||||
+{
|
||||
+ if (devinfo->has_accumulators)
|
||||
+ return (CLASS_BITS_PHYS | CLASS_BITS_ACC | CLASS_BITS_R5);
|
||||
+ else
|
||||
+ return CLASS_BITS_PHYS;
|
||||
+}
|
||||
static inline uint32_t
|
||||
temp_to_node(struct v3d_compile *c, uint32_t temp)
|
||||
{
|
||||
@@ -82,11 +87,13 @@ choose_reg_class(struct v3d_compile *c, uint8_t class_bits)
|
||||
if (class_bits == CLASS_BITS_PHYS) {
|
||||
return c->compiler->reg_class_phys[c->thread_index];
|
||||
} else if (class_bits == (CLASS_BITS_R5)) {
|
||||
+ assert(c->devinfo->has_accumulators);
|
||||
return c->compiler->reg_class_r5[c->thread_index];
|
||||
} else if (class_bits == (CLASS_BITS_PHYS | CLASS_BITS_ACC)) {
|
||||
+ assert(c->devinfo->has_accumulators);
|
||||
return c->compiler->reg_class_phys_or_acc[c->thread_index];
|
||||
} else {
|
||||
- assert(class_bits == CLASS_BITS_ANY);
|
||||
+ assert(class_bits == get_class_bit_any(c->devinfo));
|
||||
return c->compiler->reg_class_any[c->thread_index];
|
||||
}
|
||||
}
|
||||
@@ -447,7 +454,7 @@ v3d_emit_spill_tmua(struct v3d_compile *c,
|
||||
*/
|
||||
assert(c->disable_ldunif_opt);
|
||||
struct qreg offset = vir_uniform_ui(c, spill_offset);
|
||||
- add_node(c, offset.index, CLASS_BITS_ANY);
|
||||
+ add_node(c, offset.index, get_class_bit_any(c->devinfo));
|
||||
|
||||
/* We always enable per-quad on spills/fills to ensure we spill
|
||||
* any channels involved with helper invocations.
|
||||
@@ -645,7 +652,8 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
|
||||
* instruction immediately after, so
|
||||
* we can use any register class for it.
|
||||
*/
|
||||
- add_node(c, unif.index, CLASS_BITS_ANY);
|
||||
+ add_node(c, unif.index,
|
||||
+ get_class_bit_any(c->devinfo));
|
||||
} else if (spill_type == SPILL_TYPE_RECONSTRUCT) {
|
||||
struct qreg temp =
|
||||
reconstruct_temp(c, reconstruct_op);
|
||||
@@ -924,31 +932,38 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
|
||||
for (int threads = 0; threads < max_thread_index; threads++) {
|
||||
compiler->reg_class_any[threads] =
|
||||
ra_alloc_contig_reg_class(compiler->regs, 1);
|
||||
- compiler->reg_class_r5[threads] =
|
||||
- ra_alloc_contig_reg_class(compiler->regs, 1);
|
||||
- compiler->reg_class_phys_or_acc[threads] =
|
||||
- ra_alloc_contig_reg_class(compiler->regs, 1);
|
||||
+ if (compiler->devinfo->has_accumulators) {
|
||||
+ compiler->reg_class_r5[threads] =
|
||||
+ ra_alloc_contig_reg_class(compiler->regs, 1);
|
||||
+ compiler->reg_class_phys_or_acc[threads] =
|
||||
+ ra_alloc_contig_reg_class(compiler->regs, 1);
|
||||
+ }
|
||||
compiler->reg_class_phys[threads] =
|
||||
ra_alloc_contig_reg_class(compiler->regs, 1);
|
||||
|
||||
for (int i = phys_index;
|
||||
i < phys_index + (PHYS_COUNT >> threads); i++) {
|
||||
- ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
|
||||
+ if (compiler->devinfo->has_accumulators)
|
||||
+ ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
|
||||
ra_class_add_reg(compiler->reg_class_phys[threads], i);
|
||||
ra_class_add_reg(compiler->reg_class_any[threads], i);
|
||||
}
|
||||
|
||||
- for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
|
||||
- ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
|
||||
- ra_class_add_reg(compiler->reg_class_any[threads], i);
|
||||
+ if (compiler->devinfo->has_accumulators) {
|
||||
+ for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
|
||||
+ ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
|
||||
+ ra_class_add_reg(compiler->reg_class_any[threads], i);
|
||||
+ }
|
||||
}
|
||||
/* r5 can only store a single 32-bit value, so not much can
|
||||
* use it.
|
||||
*/
|
||||
- ra_class_add_reg(compiler->reg_class_r5[threads],
|
||||
- ACC_INDEX + 5);
|
||||
- ra_class_add_reg(compiler->reg_class_any[threads],
|
||||
- ACC_INDEX + 5);
|
||||
+ if (compiler->devinfo->has_accumulators) {
|
||||
+ ra_class_add_reg(compiler->reg_class_r5[threads],
|
||||
+ ACC_INDEX + 5);
|
||||
+ ra_class_add_reg(compiler->reg_class_any[threads],
|
||||
+ ACC_INDEX + 5);
|
||||
+ }
|
||||
}
|
||||
|
||||
ra_set_finalize(compiler->regs, NULL);
|
||||
@@ -1086,7 +1101,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
|
||||
}
|
||||
|
||||
/* All accumulators are invalidated across a thread switch. */
|
||||
- if (inst->qpu.sig.thrsw) {
|
||||
+ if (inst->qpu.sig.thrsw && c->devinfo->has_accumulators) {
|
||||
for (int i = 0; i < c->num_temps; i++) {
|
||||
if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
|
||||
set_temp_class_bits(c, i,
|
||||
@@ -1157,7 +1172,8 @@ v3d_register_allocate(struct v3d_compile *c)
|
||||
uint32_t t = node_to_temp(c, i);
|
||||
c->nodes.info[i].priority =
|
||||
c->temp_end[t] - c->temp_start[t];
|
||||
- c->nodes.info[i].class_bits = CLASS_BITS_ANY;
|
||||
+ c->nodes.info[i].class_bits =
|
||||
+ get_class_bit_any(c->devinfo);
|
||||
}
|
||||
}
|
||||
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,109 @@
|
||||
From 0157228c729b8812dc4900fa24db63b7d27aa342 Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Thu, 23 Sep 2021 11:19:58 +0200
|
||||
Subject: [PATCH 029/139] broadcom/compiler: implement "reads/writes too soon"
|
||||
checks for v71
|
||||
|
||||
---
|
||||
src/broadcom/compiler/qpu_schedule.c | 65 ++++++++++++++++++++++------
|
||||
1 file changed, 51 insertions(+), 14 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
|
||||
index 2fa9031d7b6..4db0c2e72da 100644
|
||||
--- a/src/broadcom/compiler/qpu_schedule.c
|
||||
+++ b/src/broadcom/compiler/qpu_schedule.c
|
||||
@@ -562,7 +562,24 @@ mux_reads_too_soon(struct choose_scoreboard *scoreboard,
|
||||
}
|
||||
|
||||
static bool
|
||||
-reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
|
||||
+reads_too_soon(struct choose_scoreboard *scoreboard,
|
||||
+ const struct v3d_qpu_instr *inst, uint8_t raddr)
|
||||
+{
|
||||
+ switch (raddr) {
|
||||
+ case 0: /* ldvary delayed write of C coefficient to rf0 */
|
||||
+ if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
|
||||
+ return true;
|
||||
+ break;
|
||||
+ default:
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
+ return false;
|
||||
+}
|
||||
+
|
||||
+static bool
|
||||
+reads_too_soon_after_write(const struct v3d_device_info *devinfo,
|
||||
+ struct choose_scoreboard *scoreboard,
|
||||
struct qinst *qinst)
|
||||
{
|
||||
const struct v3d_qpu_instr *inst = &qinst->qpu;
|
||||
@@ -574,24 +591,44 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
|
||||
assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
|
||||
|
||||
if (inst->alu.add.op != V3D_QPU_A_NOP) {
|
||||
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 &&
|
||||
- mux_reads_too_soon(scoreboard, inst, inst->alu.add.a.mux)) {
|
||||
- return true;
|
||||
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) {
|
||||
+ if (devinfo->ver < 71) {
|
||||
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.a.mux))
|
||||
+ return true;
|
||||
+ } else {
|
||||
+ if (reads_too_soon(scoreboard, inst, inst->alu.add.a.raddr))
|
||||
+ return true;
|
||||
+ }
|
||||
}
|
||||
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 &&
|
||||
- mux_reads_too_soon(scoreboard, inst, inst->alu.add.b.mux)) {
|
||||
- return true;
|
||||
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) {
|
||||
+ if (devinfo->ver < 71) {
|
||||
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.b.mux))
|
||||
+ return true;
|
||||
+ } else {
|
||||
+ if (reads_too_soon(scoreboard, inst, inst->alu.add.b.raddr))
|
||||
+ return true;
|
||||
+ }
|
||||
}
|
||||
}
|
||||
|
||||
if (inst->alu.mul.op != V3D_QPU_M_NOP) {
|
||||
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 &&
|
||||
- mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a.mux)) {
|
||||
- return true;
|
||||
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) {
|
||||
+ if (devinfo->ver < 71) {
|
||||
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a.mux))
|
||||
+ return true;
|
||||
+ } else {
|
||||
+ if (reads_too_soon(scoreboard, inst, inst->alu.mul.b.raddr))
|
||||
+ return true;
|
||||
+ }
|
||||
}
|
||||
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 &&
|
||||
- mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b.mux)) {
|
||||
- return true;
|
||||
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) {
|
||||
+ if (devinfo->ver < 71) {
|
||||
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b.mux))
|
||||
+ return true;
|
||||
+ } else {
|
||||
+ if (reads_too_soon(scoreboard, inst, inst->alu.mul.b.raddr))
|
||||
+ return true;
|
||||
+ }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1147,7 +1184,7 @@ retry:
|
||||
* regfile A or B that was written to by the previous
|
||||
* instruction."
|
||||
*/
|
||||
- if (reads_too_soon_after_write(scoreboard, n->inst))
|
||||
+ if (reads_too_soon_after_write(c->devinfo, scoreboard, n->inst))
|
||||
continue;
|
||||
|
||||
if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst))
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,118 @@
|
||||
From 3fb3333bdf9699157cf0a2bd46ba4c25058bc5c1 Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Thu, 23 Sep 2021 11:44:59 +0200
|
||||
Subject: [PATCH 030/139] broadcom/compiler: implement read stall check for v71
|
||||
|
||||
---
|
||||
src/broadcom/compiler/qpu_schedule.c | 32 +++++++++++++++++-----------
|
||||
src/broadcom/qpu/qpu_instr.c | 12 +++++++++++
|
||||
src/broadcom/qpu/qpu_instr.h | 2 ++
|
||||
3 files changed, 34 insertions(+), 12 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
|
||||
index 4db0c2e72da..b78abe003e9 100644
|
||||
--- a/src/broadcom/compiler/qpu_schedule.c
|
||||
+++ b/src/broadcom/compiler/qpu_schedule.c
|
||||
@@ -679,29 +679,37 @@ pixel_scoreboard_too_soon(struct v3d_compile *c,
|
||||
}
|
||||
|
||||
static bool
|
||||
-qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst,
|
||||
+qpu_instruction_uses_rf(const struct v3d_device_info *devinfo,
|
||||
+ const struct v3d_qpu_instr *inst,
|
||||
uint32_t waddr) {
|
||||
|
||||
if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
|
||||
return false;
|
||||
|
||||
- if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
|
||||
- inst->raddr_a == waddr)
|
||||
- return true;
|
||||
+ if (devinfo->ver < 71) {
|
||||
+ if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
|
||||
+ inst->raddr_a == waddr)
|
||||
+ return true;
|
||||
|
||||
- if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
|
||||
- !inst->sig.small_imm_b && (inst->raddr_b == waddr))
|
||||
- return true;
|
||||
+ if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
|
||||
+ !inst->sig.small_imm_b && (inst->raddr_b == waddr))
|
||||
+ return true;
|
||||
+ } else {
|
||||
+ /* FIXME: skip if small immediate */
|
||||
+ if (v3d71_qpu_reads_raddr(inst, waddr))
|
||||
+ return true;
|
||||
+ }
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool
|
||||
-mux_read_stalls(struct choose_scoreboard *scoreboard,
|
||||
- const struct v3d_qpu_instr *inst)
|
||||
+read_stalls(const struct v3d_device_info *devinfo,
|
||||
+ struct choose_scoreboard *scoreboard,
|
||||
+ const struct v3d_qpu_instr *inst)
|
||||
{
|
||||
return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 &&
|
||||
- qpu_instruction_uses_rf(inst,
|
||||
+ qpu_instruction_uses_rf(devinfo, inst,
|
||||
scoreboard->last_stallable_sfu_reg);
|
||||
}
|
||||
|
||||
@@ -1319,7 +1327,7 @@ retry:
|
||||
|
||||
int prio = get_instruction_priority(c->devinfo, inst);
|
||||
|
||||
- if (mux_read_stalls(scoreboard, inst)) {
|
||||
+ if (read_stalls(c->devinfo, scoreboard, inst)) {
|
||||
/* Don't merge an instruction that stalls */
|
||||
if (prev_inst)
|
||||
continue;
|
||||
@@ -2389,7 +2397,7 @@ schedule_instructions(struct v3d_compile *c,
|
||||
}
|
||||
}
|
||||
}
|
||||
- if (mux_read_stalls(scoreboard, inst))
|
||||
+ if (read_stalls(c->devinfo, scoreboard, inst))
|
||||
c->qpu_inst_stalled_count++;
|
||||
}
|
||||
|
||||
diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
|
||||
index 7ec3c867260..e8bbb2141b0 100644
|
||||
--- a/src/broadcom/qpu/qpu_instr.c
|
||||
+++ b/src/broadcom/qpu/qpu_instr.c
|
||||
@@ -956,6 +956,18 @@ v3d_qpu_uses_mux(const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
|
||||
(mul_nsrc > 1 && inst->alu.mul.b.mux == mux));
|
||||
}
|
||||
|
||||
+bool
|
||||
+v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr)
|
||||
+{
|
||||
+ int add_nsrc = v3d_qpu_add_op_num_src(inst->alu.add.op);
|
||||
+ int mul_nsrc = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
|
||||
+
|
||||
+ return (add_nsrc > 0 && inst->alu.add.a.raddr == raddr) ||
|
||||
+ (add_nsrc > 1 && inst->alu.add.b.raddr == raddr) ||
|
||||
+ (mul_nsrc > 0 && inst->alu.mul.a.raddr == raddr) ||
|
||||
+ (mul_nsrc > 1 && inst->alu.mul.b.raddr == raddr);
|
||||
+}
|
||||
+
|
||||
bool
|
||||
v3d_qpu_sig_writes_address(const struct v3d_device_info *devinfo,
|
||||
const struct v3d_qpu_sig *sig)
|
||||
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
|
||||
index a25be8e0ee6..9f7582ab06d 100644
|
||||
--- a/src/broadcom/qpu/qpu_instr.h
|
||||
+++ b/src/broadcom/qpu/qpu_instr.h
|
||||
@@ -494,4 +494,6 @@ bool v3d_qpu_unpacks_f32(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
|
||||
bool v3d_qpu_unpacks_f16(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
|
||||
|
||||
bool v3d_qpu_is_nop(struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
|
||||
+
|
||||
+bool v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr);
|
||||
#endif
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,65 @@
|
||||
From cbe0a7a06a5fb9b3f28acba8c9cac362a6bc5324 Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Wed, 6 Oct 2021 13:58:00 +0200
|
||||
Subject: [PATCH 031/139] broadcom/compiler: add a
|
||||
v3d71_qpu_writes_waddr_explicitly helper
|
||||
|
||||
---
|
||||
src/broadcom/qpu/qpu_instr.c | 28 ++++++++++++++++++++++++++++
|
||||
src/broadcom/qpu/qpu_instr.h | 3 +++
|
||||
2 files changed, 31 insertions(+)
|
||||
|
||||
diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
|
||||
index e8bbb2141b0..feb6b343c1c 100644
|
||||
--- a/src/broadcom/qpu/qpu_instr.c
|
||||
+++ b/src/broadcom/qpu/qpu_instr.c
|
||||
@@ -968,6 +968,34 @@ v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr)
|
||||
(mul_nsrc > 1 && inst->alu.mul.b.raddr == raddr);
|
||||
}
|
||||
|
||||
+bool
|
||||
+v3d71_qpu_writes_waddr_explicitly(const struct v3d_device_info *devinfo,
|
||||
+ const struct v3d_qpu_instr *inst,
|
||||
+ uint8_t waddr)
|
||||
+{
|
||||
+ if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
|
||||
+ return false;
|
||||
+
|
||||
+ if (v3d_qpu_add_op_has_dst(inst->alu.add.op) &&
|
||||
+ !inst->alu.add.magic_write &&
|
||||
+ inst->alu.add.waddr == waddr) {
|
||||
+ return true;
|
||||
+ }
|
||||
+
|
||||
+ if (v3d_qpu_mul_op_has_dst(inst->alu.mul.op) &&
|
||||
+ !inst->alu.mul.magic_write &&
|
||||
+ inst->alu.mul.waddr == waddr) {
|
||||
+ return true;
|
||||
+ }
|
||||
+
|
||||
+ if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
|
||||
+ !inst->sig_magic && inst->sig_addr == waddr) {
|
||||
+ return true;
|
||||
+ }
|
||||
+
|
||||
+ return false;
|
||||
+}
|
||||
+
|
||||
bool
|
||||
v3d_qpu_sig_writes_address(const struct v3d_device_info *devinfo,
|
||||
const struct v3d_qpu_sig *sig)
|
||||
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
|
||||
index 9f7582ab06d..50a69ce8c3a 100644
|
||||
--- a/src/broadcom/qpu/qpu_instr.h
|
||||
+++ b/src/broadcom/qpu/qpu_instr.h
|
||||
@@ -496,4 +496,7 @@ bool v3d_qpu_unpacks_f16(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
|
||||
bool v3d_qpu_is_nop(struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
|
||||
|
||||
bool v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr);
|
||||
+bool v3d71_qpu_writes_waddr_explicitly(const struct v3d_device_info *devinfo,
|
||||
+ const struct v3d_qpu_instr *inst,
|
||||
+ uint8_t waddr);
|
||||
#endif
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,67 @@
|
||||
From 92e91a9b22ae61dc9f39880e8fdaa7714789efdb Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Mon, 27 Sep 2021 11:49:24 +0200
|
||||
Subject: [PATCH 032/139] broadcom/compiler: prevent rf2-3 usage in thread end
|
||||
delay slots for v71
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Signed-off-by: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Signed-off-by: Alejandro Piñeiro <apinheiro@igalia.com>
|
||||
---
|
||||
src/broadcom/compiler/qpu_schedule.c | 37 +++++++++++++++++++++-------
|
||||
1 file changed, 28 insertions(+), 9 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
|
||||
index b78abe003e9..839c0c62315 100644
|
||||
--- a/src/broadcom/compiler/qpu_schedule.c
|
||||
+++ b/src/broadcom/compiler/qpu_schedule.c
|
||||
@@ -1691,16 +1691,35 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
|
||||
if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF)
|
||||
return false;
|
||||
|
||||
- /* RF0-2 might be overwritten during the delay slots by
|
||||
- * fragment shader setup.
|
||||
- */
|
||||
- if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A))
|
||||
- return false;
|
||||
+ if (c->devinfo->ver <= 42) {
|
||||
+ /* RF0-2 might be overwritten during the delay slots by
|
||||
+ * fragment shader setup.
|
||||
+ */
|
||||
+ if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A))
|
||||
+ return false;
|
||||
|
||||
- if (inst->raddr_b < 3 &&
|
||||
- !inst->sig.small_imm_b &&
|
||||
- v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) {
|
||||
- return false;
|
||||
+ if (inst->raddr_b < 3 &&
|
||||
+ !inst->sig.small_imm_b &&
|
||||
+ v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ if (c->devinfo->ver >= 71) {
|
||||
+ /* RF2-3 might be overwritten during the delay slots by
|
||||
+ * fragment shader setup.
|
||||
+ *
|
||||
+ * FIXME: handle small immediate cases
|
||||
+ */
|
||||
+ if (v3d71_qpu_reads_raddr(inst, 2) ||
|
||||
+ v3d71_qpu_reads_raddr(inst, 3)) {
|
||||
+ return false;
|
||||
+ }
|
||||
+
|
||||
+ if (v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 2) ||
|
||||
+ v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 3)) {
|
||||
+ return false;
|
||||
+ }
|
||||
}
|
||||
}
|
||||
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,78 @@
|
||||
From 68a1545eb973e41608534ff05a9e84a86c046453 Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Mon, 27 Sep 2021 13:26:04 +0200
|
||||
Subject: [PATCH 033/139] broadcom/qpu: add new ADD opcodes for FMOV/MOV in v71
|
||||
|
||||
---
|
||||
src/broadcom/qpu/qpu_instr.c | 5 +++++
|
||||
src/broadcom/qpu/qpu_instr.h | 4 ++++
|
||||
src/broadcom/qpu/qpu_pack.c | 15 +++++++++++++++
|
||||
3 files changed, 24 insertions(+)
|
||||
|
||||
diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
|
||||
index feb6b343c1c..195a0dcd232 100644
|
||||
--- a/src/broadcom/qpu/qpu_instr.c
|
||||
+++ b/src/broadcom/qpu/qpu_instr.c
|
||||
@@ -177,6 +177,8 @@ v3d_qpu_add_op_name(enum v3d_qpu_add_op op)
|
||||
[V3D_QPU_A_ITOF] = "itof",
|
||||
[V3D_QPU_A_CLZ] = "clz",
|
||||
[V3D_QPU_A_UTOF] = "utof",
|
||||
+ [V3D_QPU_A_MOV] = "mov",
|
||||
+ [V3D_QPU_A_FMOV] = "fmov",
|
||||
};
|
||||
|
||||
if (op >= ARRAY_SIZE(op_names))
|
||||
@@ -458,6 +460,9 @@ static const uint8_t add_op_args[] = {
|
||||
[V3D_QPU_A_ITOF] = D | A,
|
||||
[V3D_QPU_A_CLZ] = D | A,
|
||||
[V3D_QPU_A_UTOF] = D | A,
|
||||
+
|
||||
+ [V3D_QPU_A_MOV] = D | A,
|
||||
+ [V3D_QPU_A_FMOV] = D | A,
|
||||
};
|
||||
|
||||
static const uint8_t mul_op_args[] = {
|
||||
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
|
||||
index 50a69ce8c3a..c86a4119c54 100644
|
||||
--- a/src/broadcom/qpu/qpu_instr.h
|
||||
+++ b/src/broadcom/qpu/qpu_instr.h
|
||||
@@ -227,6 +227,10 @@ enum v3d_qpu_add_op {
|
||||
V3D_QPU_A_ITOF,
|
||||
V3D_QPU_A_CLZ,
|
||||
V3D_QPU_A_UTOF,
|
||||
+
|
||||
+ /* V3D 7.x */
|
||||
+ V3D_QPU_A_FMOV,
|
||||
+ V3D_QPU_A_MOV,
|
||||
};
|
||||
|
||||
enum v3d_qpu_mul_op {
|
||||
diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
|
||||
index 4045275cb9a..0e504e65fbf 100644
|
||||
--- a/src/broadcom/qpu/qpu_pack.c
|
||||
+++ b/src/broadcom/qpu/qpu_pack.c
|
||||
@@ -776,6 +776,21 @@ static const struct opcode_desc add_ops_v71[] = {
|
||||
|
||||
{ 246, 246, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_ITOF, 71 },
|
||||
{ 246, 246, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_UTOF, 71 },
|
||||
+
|
||||
+ { 249, 249, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FMOV, 71 },
|
||||
+ { 249, 249, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FMOV, 71 },
|
||||
+ { 249, 249, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FMOV, 71 },
|
||||
+ { 249, 249, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FMOV, 71 },
|
||||
+ { 249, 249, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FMOV, 71 },
|
||||
+ { 249, 249, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FMOV, 71 },
|
||||
+ { 249, 249, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FMOV, 71 },
|
||||
+
|
||||
+ { 249, 249, .raddr_mask = OP_MASK(3), V3D_QPU_A_MOV, 71 },
|
||||
+ { 249, 249, .raddr_mask = OP_MASK(7), V3D_QPU_A_MOV, 71 },
|
||||
+ { 249, 249, .raddr_mask = OP_MASK(11), V3D_QPU_A_MOV, 71 },
|
||||
+ { 249, 249, .raddr_mask = OP_MASK(15), V3D_QPU_A_MOV, 71 },
|
||||
+ { 249, 249, .raddr_mask = OP_MASK(19), V3D_QPU_A_MOV, 71 },
|
||||
+
|
||||
};
|
||||
|
||||
static const struct opcode_desc mul_ops_v71[] = {
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,46 @@
|
||||
From 8dbbb7e22b694fdc62376d112b3dc6105d556c63 Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Mon, 4 Oct 2021 13:07:35 +0200
|
||||
Subject: [PATCH 034/139] broadcom/qpu: fix packing/unpacking of fmov variants
|
||||
for v71
|
||||
|
||||
---
|
||||
src/broadcom/qpu/qpu_pack.c | 9 ++++-----
|
||||
1 file changed, 4 insertions(+), 5 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
|
||||
index 0e504e65fbf..0eb820b3f10 100644
|
||||
--- a/src/broadcom/qpu/qpu_pack.c
|
||||
+++ b/src/broadcom/qpu/qpu_pack.c
|
||||
@@ -1405,9 +1405,9 @@ v3d71_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst
|
||||
break;
|
||||
|
||||
case V3D_QPU_M_FMOV:
|
||||
- instr->alu.mul.output_pack = (raddr_d >> 2) & 1;
|
||||
+ instr->alu.mul.output_pack = raddr_d & 0x3;
|
||||
|
||||
- if (!v3d_qpu_float32_unpack_unpack(raddr_d & 0x3,
|
||||
+ if (!v3d_qpu_float32_unpack_unpack((raddr_d >> 2) & 0x7,
|
||||
&instr->alu.mul.a.unpack)) {
|
||||
return false;
|
||||
}
|
||||
@@ -2046,14 +2046,13 @@ v3d71_qpu_mul_pack(const struct v3d_device_info *devinfo,
|
||||
&packed)) {
|
||||
return false;
|
||||
}
|
||||
- opcode |= (packed >> 1) & 1;
|
||||
- raddr_d = (packed & 1) << 2;
|
||||
+ raddr_d |= packed;
|
||||
|
||||
if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack,
|
||||
&packed)) {
|
||||
return false;
|
||||
}
|
||||
- raddr_d |= packed;
|
||||
+ raddr_d |= packed << 2;
|
||||
break;
|
||||
}
|
||||
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,107 @@
|
||||
From 63d0059ebef288afb0e2e746dadda8c2238bdfcb Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Tue, 28 Sep 2021 01:17:08 +0200
|
||||
Subject: [PATCH 035/139] broadcom/qpu: implement switch rules for fmin/fmax
|
||||
fadd/faddnf for v71
|
||||
|
||||
They use the same opcodes, and switch between one and the other based
|
||||
on raddr.
|
||||
|
||||
Note that the rule rule includes also if small_imm_a/b are used. That
|
||||
is still not in place so that part is hardcode. Would be updated later
|
||||
when small immediates support for v71 gets implemented.
|
||||
---
|
||||
src/broadcom/qpu/qpu_pack.c | 48 +++++++++++++++++++++++++++++++++++++
|
||||
1 file changed, 48 insertions(+)
|
||||
|
||||
diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
|
||||
index 0eb820b3f10..7a262f18ac3 100644
|
||||
--- a/src/broadcom/qpu/qpu_pack.c
|
||||
+++ b/src/broadcom/qpu/qpu_pack.c
|
||||
@@ -651,7 +651,9 @@ static const struct opcode_desc mul_ops_v33[] = {
|
||||
* opcodes that changed on v71
|
||||
*/
|
||||
static const struct opcode_desc add_ops_v71[] = {
|
||||
+ /* FADD is FADDNF depending on the order of the raddr_a/raddr_b. */
|
||||
{ 0, 47, .raddr_mask = ANYOPMASK, V3D_QPU_A_FADD },
|
||||
+ { 0, 47, .raddr_mask = ANYOPMASK, V3D_QPU_A_FADDNF },
|
||||
{ 53, 55, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK },
|
||||
{ 56, 56, .raddr_mask = ANYOPMASK, V3D_QPU_A_ADD },
|
||||
{ 57, 59, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK },
|
||||
@@ -666,6 +668,10 @@ static const struct opcode_desc add_ops_v71[] = {
|
||||
{ 125, 125, .raddr_mask = ANYOPMASK, V3D_QPU_A_SHR },
|
||||
{ 126, 126, .raddr_mask = ANYOPMASK, V3D_QPU_A_ASR },
|
||||
{ 127, 127, .raddr_mask = ANYOPMASK, V3D_QPU_A_ROR },
|
||||
+ /* FMIN is instead FMAX depending on the raddr_a/b order. */
|
||||
+ { 128, 175, .raddr_mask = ANYOPMASK, V3D_QPU_A_FMIN },
|
||||
+ { 128, 175, .raddr_mask = ANYOPMASK, V3D_QPU_A_FMAX },
|
||||
+ { 176, 180, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFMIN },
|
||||
|
||||
{ 181, 181, .raddr_mask = ANYOPMASK, V3D_QPU_A_AND },
|
||||
{ 182, 182, .raddr_mask = ANYOPMASK, V3D_QPU_A_OR },
|
||||
@@ -1162,6 +1168,22 @@ v3d71_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst
|
||||
|
||||
instr->alu.add.op = desc->op;
|
||||
|
||||
+ /* FADD/FADDNF and FMIN/FMAX are determined by the orders of the
|
||||
+ * operands.
|
||||
+ */
|
||||
+ /* FIXME: for now hardcoded values, until we got the small_imm support
|
||||
+ * in place
|
||||
+ */
|
||||
+ uint32_t small_imm_a = 0;
|
||||
+ uint32_t small_imm_b = 0;
|
||||
+ if (small_imm_a * 256 + ((op >> 2) & 3) * 64 + raddr_a >
|
||||
+ small_imm_b *256 + (op & 3) * 64 + raddr_b) {
|
||||
+ if (instr->alu.add.op == V3D_QPU_A_FMIN)
|
||||
+ instr->alu.add.op = V3D_QPU_A_FMAX;
|
||||
+ if (instr->alu.add.op == V3D_QPU_A_FADD)
|
||||
+ instr->alu.add.op = V3D_QPU_A_FADDNF;
|
||||
+ }
|
||||
+
|
||||
/* Some QPU ops require a bit more than just basic opcode and mux a/b
|
||||
* comparisons to distinguish them.
|
||||
*/
|
||||
@@ -1754,6 +1776,11 @@ v3d71_qpu_add_pack(const struct v3d_device_info *devinfo,
|
||||
uint32_t output_pack;
|
||||
uint32_t a_unpack;
|
||||
uint32_t b_unpack;
|
||||
+ /* FIXME: for now hardcoded values, until we got the small_imm
|
||||
+ * support in place
|
||||
+ */
|
||||
+ uint32_t small_imm_a = 0;
|
||||
+ uint32_t small_imm_b = 0;
|
||||
|
||||
if (instr->alu.add.op != V3D_QPU_A_FCMP) {
|
||||
if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack,
|
||||
@@ -1773,6 +1800,27 @@ v3d71_qpu_add_pack(const struct v3d_device_info *devinfo,
|
||||
return false;
|
||||
}
|
||||
|
||||
+ /* These operations with commutative operands are
|
||||
+ * distinguished by which order their operands come in.
|
||||
+ */
|
||||
+ bool ordering =
|
||||
+ small_imm_a * 256 + a_unpack * 64 + raddr_a >
|
||||
+ small_imm_b * 256 + b_unpack * 64 + raddr_b;
|
||||
+ if (((instr->alu.add.op == V3D_QPU_A_FMIN ||
|
||||
+ instr->alu.add.op == V3D_QPU_A_FADD) && ordering) ||
|
||||
+ ((instr->alu.add.op == V3D_QPU_A_FMAX ||
|
||||
+ instr->alu.add.op == V3D_QPU_A_FADDNF) && !ordering)) {
|
||||
+ uint32_t temp;
|
||||
+
|
||||
+ temp = a_unpack;
|
||||
+ a_unpack = b_unpack;
|
||||
+ b_unpack = temp;
|
||||
+
|
||||
+ temp = raddr_a;
|
||||
+ raddr_a = raddr_b;
|
||||
+ raddr_b = temp;
|
||||
+ }
|
||||
+
|
||||
opcode |= a_unpack << 2;
|
||||
opcode |= b_unpack << 0;
|
||||
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,37 @@
|
||||
From c9f6faa3ddc91024b3d9dc67ce2221187daac128 Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Wed, 29 Sep 2021 11:54:18 +0200
|
||||
Subject: [PATCH 036/139] broadcom/compiler: make vir_write_rX return false on
|
||||
platforms without accums
|
||||
|
||||
---
|
||||
src/broadcom/compiler/vir.c | 6 ++++++
|
||||
1 file changed, 6 insertions(+)
|
||||
|
||||
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
|
||||
index 007cb0a941b..d75cd777b6d 100644
|
||||
--- a/src/broadcom/compiler/vir.c
|
||||
+++ b/src/broadcom/compiler/vir.c
|
||||
@@ -158,6 +158,9 @@ vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst)
|
||||
bool
|
||||
vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst)
|
||||
{
|
||||
+ if (!devinfo->has_accumulators)
|
||||
+ return false;
|
||||
+
|
||||
for (int i = 0; i < vir_get_nsrc(inst); i++) {
|
||||
switch (inst->src[i].file) {
|
||||
case QFILE_VPM:
|
||||
@@ -180,6 +183,9 @@ vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst)
|
||||
bool
|
||||
vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst)
|
||||
{
|
||||
+ if (!devinfo->has_accumulators)
|
||||
+ return false;
|
||||
+
|
||||
switch (inst->dst.file) {
|
||||
case QFILE_MAGIC:
|
||||
switch (inst->dst.index) {
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,77 @@
|
||||
From 3d16229743e26b58735ed049ee982073f6034342 Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Wed, 29 Sep 2021 12:03:50 +0200
|
||||
Subject: [PATCH 037/139] broadcom/compiler: rename vir_writes_rX to
|
||||
vir_writes_rX_implicitly
|
||||
|
||||
Since that more accurately represents what they check.
|
||||
---
|
||||
src/broadcom/compiler/v3d_compiler.h | 4 ++--
|
||||
src/broadcom/compiler/vir.c | 6 ++++--
|
||||
src/broadcom/compiler/vir_register_allocate.c | 4 ++--
|
||||
3 files changed, 8 insertions(+), 6 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
|
||||
index eb4e692464b..7e8f3bfc1a7 100644
|
||||
--- a/src/broadcom/compiler/v3d_compiler.h
|
||||
+++ b/src/broadcom/compiler/v3d_compiler.h
|
||||
@@ -1149,8 +1149,8 @@ bool vir_is_raw_mov(struct qinst *inst);
|
||||
bool vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst);
|
||||
bool vir_is_add(struct qinst *inst);
|
||||
bool vir_is_mul(struct qinst *inst);
|
||||
-bool vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst);
|
||||
-bool vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst);
|
||||
+bool vir_writes_r3_implicitly(const struct v3d_device_info *devinfo, struct qinst *inst);
|
||||
+bool vir_writes_r4_implicitly(const struct v3d_device_info *devinfo, struct qinst *inst);
|
||||
struct qreg vir_follow_movs(struct v3d_compile *c, struct qreg reg);
|
||||
uint8_t vir_channels_written(struct qinst *inst);
|
||||
struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i);
|
||||
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
|
||||
index d75cd777b6d..aea113f050e 100644
|
||||
--- a/src/broadcom/compiler/vir.c
|
||||
+++ b/src/broadcom/compiler/vir.c
|
||||
@@ -156,7 +156,8 @@ vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst)
|
||||
}
|
||||
|
||||
bool
|
||||
-vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst)
|
||||
+vir_writes_r3_implicitly(const struct v3d_device_info *devinfo,
|
||||
+ struct qinst *inst)
|
||||
{
|
||||
if (!devinfo->has_accumulators)
|
||||
return false;
|
||||
@@ -181,7 +182,8 @@ vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst)
|
||||
}
|
||||
|
||||
bool
|
||||
-vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst)
|
||||
+vir_writes_r4_implicitly(const struct v3d_device_info *devinfo,
|
||||
+ struct qinst *inst)
|
||||
{
|
||||
if (!devinfo->has_accumulators)
|
||||
return false;
|
||||
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
|
||||
index 7b3f6c41934..f2df35cd458 100644
|
||||
--- a/src/broadcom/compiler/vir_register_allocate.c
|
||||
+++ b/src/broadcom/compiler/vir_register_allocate.c
|
||||
@@ -988,7 +988,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
|
||||
* result to a temp), nothing else can be stored in r3/r4 across
|
||||
* it.
|
||||
*/
|
||||
- if (vir_writes_r3(c->devinfo, inst)) {
|
||||
+ if (vir_writes_r3_implicitly(c->devinfo, inst)) {
|
||||
for (int i = 0; i < c->num_temps; i++) {
|
||||
if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
|
||||
ra_add_node_interference(c->g,
|
||||
@@ -998,7 +998,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
|
||||
}
|
||||
}
|
||||
|
||||
- if (vir_writes_r4(c->devinfo, inst)) {
|
||||
+ if (vir_writes_r4_implicitly(c->devinfo, inst)) {
|
||||
for (int i = 0; i < c->num_temps; i++) {
|
||||
if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
|
||||
ra_add_node_interference(c->g,
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,170 @@
|
||||
From 83fae160491737e8568b8fb5eaa5be4d2c8bf3c8 Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Wed, 29 Sep 2021 12:10:31 +0200
|
||||
Subject: [PATCH 038/139] broadcom/compiler: only handle accumulator classes if
|
||||
present
|
||||
|
||||
---
|
||||
src/broadcom/compiler/vir_register_allocate.c | 77 ++++++++++++-------
|
||||
1 file changed, 49 insertions(+), 28 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
|
||||
index f2df35cd458..e78ccb7c6aa 100644
|
||||
--- a/src/broadcom/compiler/vir_register_allocate.c
|
||||
+++ b/src/broadcom/compiler/vir_register_allocate.c
|
||||
@@ -53,6 +53,17 @@ get_class_bit_any(const struct v3d_device_info *devinfo)
|
||||
else
|
||||
return CLASS_BITS_PHYS;
|
||||
}
|
||||
+
|
||||
+static uint8_t
|
||||
+filter_class_bits(const struct v3d_device_info *devinfo, uint8_t class_bits)
|
||||
+{
|
||||
+ if (!devinfo->has_accumulators) {
|
||||
+ assert(class_bits & CLASS_BITS_PHYS);
|
||||
+ class_bits = CLASS_BITS_PHYS;
|
||||
+ }
|
||||
+ return class_bits;
|
||||
+}
|
||||
+
|
||||
static inline uint32_t
|
||||
temp_to_node(struct v3d_compile *c, uint32_t temp)
|
||||
{
|
||||
@@ -413,8 +424,10 @@ v3d_setup_spill_base(struct v3d_compile *c)
|
||||
*/
|
||||
if (c->spilling) {
|
||||
int temp_class = CLASS_BITS_PHYS;
|
||||
- if (i != c->spill_base.index)
|
||||
+ if (c->devinfo->has_accumulators &&
|
||||
+ i != c->spill_base.index) {
|
||||
temp_class |= CLASS_BITS_ACC;
|
||||
+ }
|
||||
add_node(c, i, temp_class);
|
||||
}
|
||||
}
|
||||
@@ -473,14 +486,16 @@ v3d_emit_spill_tmua(struct v3d_compile *c,
|
||||
* temp will be used immediately so just like the uniform above we
|
||||
* can allow accumulators.
|
||||
*/
|
||||
+ int temp_class =
|
||||
+ filter_class_bits(c->devinfo, CLASS_BITS_PHYS | CLASS_BITS_ACC);
|
||||
if (!fill_dst) {
|
||||
struct qreg dst = vir_TMUWT(c);
|
||||
assert(dst.file == QFILE_TEMP);
|
||||
- add_node(c, dst.index, CLASS_BITS_PHYS | CLASS_BITS_ACC);
|
||||
+ add_node(c, dst.index, temp_class);
|
||||
} else {
|
||||
*fill_dst = vir_LDTMU(c);
|
||||
assert(fill_dst->file == QFILE_TEMP);
|
||||
- add_node(c, fill_dst->index, CLASS_BITS_PHYS | CLASS_BITS_ACC);
|
||||
+ add_node(c, fill_dst->index, temp_class);
|
||||
}
|
||||
|
||||
/* Temps across the thread switch we injected can't be assigned to
|
||||
@@ -662,8 +677,10 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
|
||||
* instruction immediately after so we
|
||||
* can use ACC.
|
||||
*/
|
||||
- add_node(c, temp.index, CLASS_BITS_PHYS |
|
||||
- CLASS_BITS_ACC);
|
||||
+ int temp_class =
|
||||
+ filter_class_bits(c->devinfo, CLASS_BITS_PHYS |
|
||||
+ CLASS_BITS_ACC);
|
||||
+ add_node(c, temp.index, temp_class);
|
||||
} else {
|
||||
/* If we have a postponed spill, we
|
||||
* don't need a fill as the temp would
|
||||
@@ -941,6 +958,7 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
|
||||
compiler->reg_class_phys[threads] =
|
||||
ra_alloc_contig_reg_class(compiler->regs, 1);
|
||||
|
||||
+ /* Init physical regs */
|
||||
for (int i = phys_index;
|
||||
i < phys_index + (PHYS_COUNT >> threads); i++) {
|
||||
if (compiler->devinfo->has_accumulators)
|
||||
@@ -949,16 +967,15 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
|
||||
ra_class_add_reg(compiler->reg_class_any[threads], i);
|
||||
}
|
||||
|
||||
+ /* Init accumulator regs */
|
||||
if (compiler->devinfo->has_accumulators) {
|
||||
for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
|
||||
ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
|
||||
ra_class_add_reg(compiler->reg_class_any[threads], i);
|
||||
}
|
||||
- }
|
||||
- /* r5 can only store a single 32-bit value, so not much can
|
||||
- * use it.
|
||||
- */
|
||||
- if (compiler->devinfo->has_accumulators) {
|
||||
+ /* r5 can only store a single 32-bit value, so not much can
|
||||
+ * use it.
|
||||
+ */
|
||||
ra_class_add_reg(compiler->reg_class_r5[threads],
|
||||
ACC_INDEX + 5);
|
||||
ra_class_add_reg(compiler->reg_class_any[threads],
|
||||
@@ -1081,21 +1098,23 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
|
||||
* because ldunif has usually a shorter lifespan, allowing for
|
||||
* more accumulator reuse and QPU merges.
|
||||
*/
|
||||
- if (!inst->qpu.sig.ldunif) {
|
||||
- uint8_t class_bits =
|
||||
- get_temp_class_bits(c, inst->dst.index) &
|
||||
- ~CLASS_BITS_R5;
|
||||
- set_temp_class_bits(c, inst->dst.index,
|
||||
- class_bits);
|
||||
-
|
||||
- } else {
|
||||
- /* Until V3D 4.x, we could only load a uniform
|
||||
- * to r5, so we'll need to spill if uniform
|
||||
- * loads interfere with each other.
|
||||
- */
|
||||
- if (c->devinfo->ver < 40) {
|
||||
+ if (c->devinfo->has_accumulators) {
|
||||
+ if (!inst->qpu.sig.ldunif) {
|
||||
+ uint8_t class_bits =
|
||||
+ get_temp_class_bits(c, inst->dst.index) &
|
||||
+ ~CLASS_BITS_R5;
|
||||
set_temp_class_bits(c, inst->dst.index,
|
||||
- CLASS_BITS_R5);
|
||||
+ class_bits);
|
||||
+
|
||||
+ } else {
|
||||
+ /* Until V3D 4.x, we could only load a uniform
|
||||
+ * to r5, so we'll need to spill if uniform
|
||||
+ * loads interfere with each other.
|
||||
+ */
|
||||
+ if (c->devinfo->ver < 40) {
|
||||
+ set_temp_class_bits(c, inst->dst.index,
|
||||
+ CLASS_BITS_R5);
|
||||
+ }
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1152,8 +1171,10 @@ v3d_register_allocate(struct v3d_compile *c)
|
||||
c->thread_index--;
|
||||
}
|
||||
|
||||
- c->g = ra_alloc_interference_graph(c->compiler->regs,
|
||||
- c->num_temps + ARRAY_SIZE(acc_nodes));
|
||||
+ unsigned num_ra_nodes = c->num_temps;
|
||||
+ if (c->devinfo->has_accumulators)
|
||||
+ num_ra_nodes += ARRAY_SIZE(acc_nodes);
|
||||
+ c->g = ra_alloc_interference_graph(c->compiler->regs, num_ra_nodes);
|
||||
ra_set_select_reg_callback(c->g, v3d_ra_select_callback, &callback_data);
|
||||
|
||||
/* Make some fixed nodes for the accumulators, which we will need to
|
||||
@@ -1162,8 +1183,8 @@ v3d_register_allocate(struct v3d_compile *c)
|
||||
* live in, but the classes take up a lot of memory to set up, so we
|
||||
* don't want to make too many.
|
||||
*/
|
||||
- for (uint32_t i = 0; i < ACC_COUNT + c->num_temps; i++) {
|
||||
- if (i < ACC_COUNT) {
|
||||
+ for (uint32_t i = 0; i < num_ra_nodes; i++) {
|
||||
+ if (c->devinfo->has_accumulators && i < ACC_COUNT) {
|
||||
acc_nodes[i] = i;
|
||||
ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i);
|
||||
c->nodes.info[i].priority = 0;
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,187 @@
|
||||
From fd77cc3204e7c69927f97ce2a1d55d2a47d77a27 Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Wed, 29 Sep 2021 12:14:04 +0200
|
||||
Subject: [PATCH 039/139] broadcom/compiler: don't assign rf0 to temps across
|
||||
implicit rf0 writes
|
||||
|
||||
In platforms that don't have accumulators and have implicit writes to
|
||||
the register file we need to be careful and avoid assigning a physical
|
||||
register to a temp that lives across an implicit write to that same
|
||||
physical register.
|
||||
|
||||
For now, we have the case of implicit writes to rf0 from various
|
||||
signals, but it should be easy to extend this to include additional
|
||||
registers if needed.
|
||||
---
|
||||
src/broadcom/compiler/vir_register_allocate.c | 69 +++++++++++++++----
|
||||
1 file changed, 57 insertions(+), 12 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
|
||||
index e78ccb7c6aa..e0adc1de7a4 100644
|
||||
--- a/src/broadcom/compiler/vir_register_allocate.c
|
||||
+++ b/src/broadcom/compiler/vir_register_allocate.c
|
||||
@@ -29,6 +29,9 @@
|
||||
#define ACC_INDEX 0
|
||||
#define ACC_COUNT 6
|
||||
|
||||
+/* RA nodes used to track RF registers with implicit writes */
|
||||
+#define IMPLICIT_RF_COUNT 1
|
||||
+
|
||||
#define PHYS_COUNT 64
|
||||
|
||||
static uint8_t
|
||||
@@ -67,15 +70,17 @@ filter_class_bits(const struct v3d_device_info *devinfo, uint8_t class_bits)
|
||||
static inline uint32_t
|
||||
temp_to_node(struct v3d_compile *c, uint32_t temp)
|
||||
{
|
||||
- return temp + (c->devinfo->has_accumulators ? ACC_COUNT : 0);
|
||||
+ return temp + (c->devinfo->has_accumulators ? ACC_COUNT :
|
||||
+ IMPLICIT_RF_COUNT);
|
||||
}
|
||||
|
||||
static inline uint32_t
|
||||
node_to_temp(struct v3d_compile *c, uint32_t node)
|
||||
{
|
||||
assert((c->devinfo->has_accumulators && node >= ACC_COUNT) ||
|
||||
- (!c->devinfo->has_accumulators && node >= 0));
|
||||
- return node - (c->devinfo->has_accumulators ? ACC_COUNT : 0);
|
||||
+ (!c->devinfo->has_accumulators && node >= IMPLICIT_RF_COUNT));
|
||||
+ return node - (c->devinfo->has_accumulators ? ACC_COUNT :
|
||||
+ IMPLICIT_RF_COUNT);
|
||||
}
|
||||
|
||||
static inline uint8_t
|
||||
@@ -360,7 +365,8 @@ ensure_nodes(struct v3d_compile *c)
|
||||
c->nodes.info = reralloc_array_size(c,
|
||||
c->nodes.info,
|
||||
sizeof(c->nodes.info[0]),
|
||||
- c->nodes.alloc_count + ACC_COUNT);
|
||||
+ c->nodes.alloc_count +
|
||||
+ MAX2(ACC_COUNT, IMPLICIT_RF_COUNT));
|
||||
}
|
||||
|
||||
/* Creates the interference node for a new temp. We use this to keep the node
|
||||
@@ -372,7 +378,8 @@ add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits)
|
||||
ensure_nodes(c);
|
||||
|
||||
int node = ra_add_node(c->g, choose_reg_class(c, class_bits));
|
||||
- assert(node == temp + ACC_COUNT);
|
||||
+ assert(c->devinfo->has_accumulators ? node == temp + ACC_COUNT :
|
||||
+ node == temp + IMPLICIT_RF_COUNT);
|
||||
|
||||
/* We fill the node priority after we are done inserting spills */
|
||||
c->nodes.info[node].class_bits = class_bits;
|
||||
@@ -995,7 +1002,9 @@ tmu_spilling_allowed(struct v3d_compile *c)
|
||||
}
|
||||
|
||||
static void
|
||||
-update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
|
||||
+update_graph_and_reg_classes_for_inst(struct v3d_compile *c,
|
||||
+ int *acc_nodes,
|
||||
+ int *implicit_rf_nodes,
|
||||
struct qinst *inst)
|
||||
{
|
||||
int32_t ip = inst->ip;
|
||||
@@ -1025,6 +1034,19 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
|
||||
}
|
||||
}
|
||||
|
||||
+ /* If any instruction writes to a physical register implicitly
|
||||
+ * nothing else can write the same register across it.
|
||||
+ */
|
||||
+ if (v3d_qpu_writes_rf0_implicitly(c->devinfo, &inst->qpu)) {
|
||||
+ for (int i = 0; i < c->num_temps; i++) {
|
||||
+ if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
|
||||
+ ra_add_node_interference(c->g,
|
||||
+ temp_to_node(c, i),
|
||||
+ implicit_rf_nodes[0]);
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
|
||||
switch (inst->qpu.alu.add.op) {
|
||||
case V3D_QPU_A_LDVPMV_IN:
|
||||
@@ -1116,6 +1138,16 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
|
||||
CLASS_BITS_R5);
|
||||
}
|
||||
}
|
||||
+ } else {
|
||||
+ /* If the instruction has an implicit write
|
||||
+ * we can't allocate its dest to the same
|
||||
+ * register.
|
||||
+ */
|
||||
+ if (v3d_qpu_writes_rf0_implicitly(c->devinfo, &inst->qpu)) {
|
||||
+ ra_add_node_interference(c->g,
|
||||
+ temp_to_node(c, inst->dst.index),
|
||||
+ implicit_rf_nodes[0]);
|
||||
+ }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1139,10 +1171,18 @@ struct qpu_reg *
|
||||
v3d_register_allocate(struct v3d_compile *c)
|
||||
{
|
||||
int acc_nodes[ACC_COUNT];
|
||||
+ int implicit_rf_nodes[IMPLICIT_RF_COUNT];
|
||||
+
|
||||
+ unsigned num_ra_nodes = c->num_temps;
|
||||
+ if (c->devinfo->has_accumulators)
|
||||
+ num_ra_nodes += ARRAY_SIZE(acc_nodes);
|
||||
+ else
|
||||
+ num_ra_nodes += ARRAY_SIZE(implicit_rf_nodes);
|
||||
+
|
||||
c->nodes = (struct v3d_ra_node_info) {
|
||||
.alloc_count = c->num_temps,
|
||||
.info = ralloc_array_size(c, sizeof(c->nodes.info[0]),
|
||||
- c->num_temps + ACC_COUNT),
|
||||
+ num_ra_nodes),
|
||||
};
|
||||
|
||||
uint32_t phys_index = get_phys_index(c->devinfo);
|
||||
@@ -1171,9 +1211,6 @@ v3d_register_allocate(struct v3d_compile *c)
|
||||
c->thread_index--;
|
||||
}
|
||||
|
||||
- unsigned num_ra_nodes = c->num_temps;
|
||||
- if (c->devinfo->has_accumulators)
|
||||
- num_ra_nodes += ARRAY_SIZE(acc_nodes);
|
||||
c->g = ra_alloc_interference_graph(c->compiler->regs, num_ra_nodes);
|
||||
ra_set_select_reg_callback(c->g, v3d_ra_select_callback, &callback_data);
|
||||
|
||||
@@ -1181,7 +1218,8 @@ v3d_register_allocate(struct v3d_compile *c)
|
||||
* interfere with when ops have implied r3/r4 writes or for the thread
|
||||
* switches. We could represent these as classes for the nodes to
|
||||
* live in, but the classes take up a lot of memory to set up, so we
|
||||
- * don't want to make too many.
|
||||
+ * don't want to make too many. We use the same mechanism on platforms
|
||||
+ * without accumulators that can have implicit writes to phys regs.
|
||||
*/
|
||||
for (uint32_t i = 0; i < num_ra_nodes; i++) {
|
||||
if (c->devinfo->has_accumulators && i < ACC_COUNT) {
|
||||
@@ -1189,6 +1227,12 @@ v3d_register_allocate(struct v3d_compile *c)
|
||||
ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i);
|
||||
c->nodes.info[i].priority = 0;
|
||||
c->nodes.info[i].class_bits = 0;
|
||||
+ } else if (!c->devinfo->has_accumulators &&
|
||||
+ i < ARRAY_SIZE(implicit_rf_nodes)) {
|
||||
+ implicit_rf_nodes[i] = i;
|
||||
+ ra_set_node_reg(c->g, implicit_rf_nodes[i], phys_index + i);
|
||||
+ c->nodes.info[i].priority = 0;
|
||||
+ c->nodes.info[i].class_bits = 0;
|
||||
} else {
|
||||
uint32_t t = node_to_temp(c, i);
|
||||
c->nodes.info[i].priority =
|
||||
@@ -1204,7 +1248,8 @@ v3d_register_allocate(struct v3d_compile *c)
|
||||
int ip = 0;
|
||||
vir_for_each_inst_inorder(inst, c) {
|
||||
inst->ip = ip++;
|
||||
- update_graph_and_reg_classes_for_inst(c, acc_nodes, inst);
|
||||
+ update_graph_and_reg_classes_for_inst(c, acc_nodes,
|
||||
+ implicit_rf_nodes, inst);
|
||||
}
|
||||
|
||||
/* Set the register classes for all our temporaries in the graph */
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,33 @@
|
||||
From 9a08ae9f354a6da6d9d71b87800aca8b3df49e29 Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Tue, 28 Sep 2021 13:37:28 +0200
|
||||
Subject: [PATCH 040/139] broadcom/compiler: CS payload registers have changed
|
||||
in v71
|
||||
|
||||
---
|
||||
src/broadcom/compiler/nir_to_vir.c | 9 +++++++--
|
||||
1 file changed, 7 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
|
||||
index 1a05b279a2d..220ff6bcd49 100644
|
||||
--- a/src/broadcom/compiler/nir_to_vir.c
|
||||
+++ b/src/broadcom/compiler/nir_to_vir.c
|
||||
@@ -4362,8 +4362,13 @@ nir_to_vir(struct v3d_compile *c)
|
||||
V3D_QPU_WADDR_SYNC));
|
||||
}
|
||||
|
||||
- c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
|
||||
- c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
|
||||
+ if (c->devinfo->ver <= 42) {
|
||||
+ c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
|
||||
+ c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
|
||||
+ } else if (c->devinfo->ver >= 71) {
|
||||
+ c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 3));
|
||||
+ c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
|
||||
+ }
|
||||
|
||||
/* Set up the division between gl_LocalInvocationIndex and
|
||||
* wg_in_mem in the payload reg.
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,46 @@
|
||||
From 5477884196cb54a71f54fa6cad42c6d3326bde88 Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Fri, 22 Oct 2021 13:39:48 +0200
|
||||
Subject: [PATCH 041/139] broadcom/compiler: don't schedule rf0 writes right
|
||||
after ldvary
|
||||
|
||||
ldvary writes rf0 implicitly on the next cycle so they would clash.
|
||||
This case is not handled correctly by our normal dependency tracking,
|
||||
which doesn't know anything about delayed writes from instructions
|
||||
and thinks the rf0 write happens on the same cycle ldvary is emitted.
|
||||
|
||||
Fixes (v71):
|
||||
dEQP-VK.glsl.conversions.matrix_to_matrix.mat2x3_to_mat4x2_fragment
|
||||
---
|
||||
src/broadcom/compiler/qpu_schedule.c | 15 +++++++++++++++
|
||||
1 file changed, 15 insertions(+)
|
||||
|
||||
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
|
||||
index 839c0c62315..870823fd2b1 100644
|
||||
--- a/src/broadcom/compiler/qpu_schedule.c
|
||||
+++ b/src/broadcom/compiler/qpu_schedule.c
|
||||
@@ -652,6 +652,21 @@ writes_too_soon_after_write(const struct v3d_device_info *devinfo,
|
||||
v3d_qpu_writes_r4(devinfo, inst))
|
||||
return true;
|
||||
|
||||
+ if (devinfo->ver <= 42)
|
||||
+ return false;
|
||||
+
|
||||
+ /* Don't schedule anything that writes rf0 right after ldvary, since
|
||||
+ * that would clash with the ldvary's delayed rf0 write (the exception
|
||||
+ * is another ldvary, since its implicit rf0 write would also have
|
||||
+ * one cycle of delay and would not clash).
|
||||
+ */
|
||||
+ if (scoreboard->last_ldvary_tick + 1 == scoreboard->tick &&
|
||||
+ (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) ||
|
||||
+ (v3d_qpu_writes_rf0_implicitly(devinfo, inst) &&
|
||||
+ !inst->sig.ldvary))) {
|
||||
+ return true;
|
||||
+ }
|
||||
+
|
||||
return false;
|
||||
}
|
||||
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,60 @@
|
||||
From 31623712c2f741d393767641f32d56c35150eda5 Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Thu, 30 Sep 2021 13:22:48 +0200
|
||||
Subject: [PATCH 042/139] broadcom/compiler: allow instruction merges in v71
|
||||
|
||||
In v3d 4.x there were restrictions based on the number of raddrs used
|
||||
by the combined instructions, but we don't have these restrictions in
|
||||
v3d 7.x.
|
||||
|
||||
It should be noted that while there are no restrictions on the number
|
||||
of raddrs addressed, a QPU instruction can only address a single small
|
||||
immediate, so we should be careful about that when we add support for
|
||||
small immediates.
|
||||
---
|
||||
src/broadcom/compiler/qpu_schedule.c | 21 +++++++++++++++++----
|
||||
1 file changed, 17 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
|
||||
index 870823fd2b1..ff544fb3c1c 100644
|
||||
--- a/src/broadcom/compiler/qpu_schedule.c
|
||||
+++ b/src/broadcom/compiler/qpu_schedule.c
|
||||
@@ -906,8 +906,11 @@ qpu_raddrs_used(const struct v3d_qpu_instr *a,
|
||||
static bool
|
||||
qpu_merge_raddrs(struct v3d_qpu_instr *result,
|
||||
const struct v3d_qpu_instr *add_instr,
|
||||
- const struct v3d_qpu_instr *mul_instr)
|
||||
+ const struct v3d_qpu_instr *mul_instr,
|
||||
+ const struct v3d_device_info *devinfo)
|
||||
{
|
||||
+ assert(devinfo->ver <= 42);
|
||||
+
|
||||
uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr);
|
||||
int naddrs = util_bitcount64(raddrs_used);
|
||||
|
||||
@@ -1111,9 +1114,19 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
|
||||
add_instr = a;
|
||||
}
|
||||
|
||||
- if (add_instr && mul_instr &&
|
||||
- !qpu_merge_raddrs(&merge, add_instr, mul_instr)) {
|
||||
- return false;
|
||||
+ /* V3D 4.x and earlier use muxes to select the inputs for the ALUs and
|
||||
+ * they have restrictions on the number of raddrs that can be adressed
|
||||
+ * in a single instruction.
|
||||
+ *
|
||||
+ * FIXME: for V3D 7.x we can't merge instructions if they address more
|
||||
+ * than one small immediate. For now, we don't support small immediates,
|
||||
+ * so it is not a problem.
|
||||
+ */
|
||||
+ if (devinfo->ver <= 42) {
|
||||
+ if (add_instr && mul_instr &&
|
||||
+ !qpu_merge_raddrs(&merge, add_instr, mul_instr, devinfo)) {
|
||||
+ return false;
|
||||
+ }
|
||||
}
|
||||
|
||||
merge.sig.thrsw |= b->sig.thrsw;
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,172 @@
|
||||
From 959a0128654c94d84fda53ffc108971d3b3a817a Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Wed, 6 Oct 2021 09:27:43 +0200
|
||||
Subject: [PATCH 043/139] broadcom/qpu: add MOV integer packing/unpacking
|
||||
variants
|
||||
|
||||
These are new in v71 and cover MOV on both the ADD and the MUL alus.
|
||||
---
|
||||
src/broadcom/qpu/qpu_instr.h | 9 ++++
|
||||
src/broadcom/qpu/qpu_pack.c | 98 ++++++++++++++++++++++++++++++++++++
|
||||
2 files changed, 107 insertions(+)
|
||||
|
||||
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
|
||||
index c86a4119c54..4b34d17bd4c 100644
|
||||
--- a/src/broadcom/qpu/qpu_instr.h
|
||||
+++ b/src/broadcom/qpu/qpu_instr.h
|
||||
@@ -285,6 +285,15 @@ enum v3d_qpu_input_unpack {
|
||||
|
||||
/** Swap high and low 16 bits */
|
||||
V3D_QPU_UNPACK_SWAP_16,
|
||||
+
|
||||
+ /** Convert low 16 bits from 16-bit integer to unsigned 32-bit int */
|
||||
+ V3D_QPU_UNPACK_UL,
|
||||
+ /** Convert high 16 bits from 16-bit integer to unsigned 32-bit int */
|
||||
+ V3D_QPU_UNPACK_UH,
|
||||
+ /** Convert low 16 bits from 16-bit integer to signed 32-bit int */
|
||||
+ V3D_QPU_UNPACK_IL,
|
||||
+ /** Convert high 16 bits from 16-bit integer to signed 32-bit int */
|
||||
+ V3D_QPU_UNPACK_IH,
|
||||
};
|
||||
|
||||
enum v3d_qpu_mux {
|
||||
diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
|
||||
index 7a262f18ac3..4d677894755 100644
|
||||
--- a/src/broadcom/qpu/qpu_pack.c
|
||||
+++ b/src/broadcom/qpu/qpu_pack.c
|
||||
@@ -922,6 +922,56 @@ v3d_qpu_float32_unpack_pack(enum v3d_qpu_input_unpack unpacked,
|
||||
}
|
||||
}
|
||||
|
||||
+static bool
|
||||
+v3d_qpu_int32_unpack_unpack(uint32_t packed,
|
||||
+ enum v3d_qpu_input_unpack *unpacked)
|
||||
+{
|
||||
+ switch (packed) {
|
||||
+ case 0:
|
||||
+ *unpacked = V3D_QPU_UNPACK_NONE;
|
||||
+ return true;
|
||||
+ case 1:
|
||||
+ *unpacked = V3D_QPU_UNPACK_UL;
|
||||
+ return true;
|
||||
+ case 2:
|
||||
+ *unpacked = V3D_QPU_UNPACK_UH;
|
||||
+ return true;
|
||||
+ case 3:
|
||||
+ *unpacked = V3D_QPU_UNPACK_IL;
|
||||
+ return true;
|
||||
+ case 4:
|
||||
+ *unpacked = V3D_QPU_UNPACK_IH;
|
||||
+ return true;
|
||||
+ default:
|
||||
+ return false;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+static bool
|
||||
+v3d_qpu_int32_unpack_pack(enum v3d_qpu_input_unpack unpacked,
|
||||
+ uint32_t *packed)
|
||||
+{
|
||||
+ switch (unpacked) {
|
||||
+ case V3D_QPU_UNPACK_NONE:
|
||||
+ *packed = 0;
|
||||
+ return true;
|
||||
+ case V3D_QPU_UNPACK_UL:
|
||||
+ *packed = 1;
|
||||
+ return true;
|
||||
+ case V3D_QPU_UNPACK_UH:
|
||||
+ *packed = 2;
|
||||
+ return true;
|
||||
+ case V3D_QPU_UNPACK_IL:
|
||||
+ *packed = 3;
|
||||
+ return true;
|
||||
+ case V3D_QPU_UNPACK_IH:
|
||||
+ *packed = 4;
|
||||
+ return true;
|
||||
+ default:
|
||||
+ return false;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
static bool
|
||||
v3d_qpu_float16_unpack_unpack(uint32_t packed,
|
||||
enum v3d_qpu_input_unpack *unpacked)
|
||||
@@ -1273,6 +1323,15 @@ v3d71_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst
|
||||
instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
|
||||
break;
|
||||
|
||||
+ case V3D_QPU_A_MOV:
|
||||
+ instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
|
||||
+
|
||||
+ if (!v3d_qpu_int32_unpack_unpack((raddr_b >> 2) & 0x7,
|
||||
+ &instr->alu.add.a.unpack)) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ break;
|
||||
+
|
||||
default:
|
||||
instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
|
||||
instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
|
||||
@@ -1449,6 +1508,15 @@ v3d71_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst
|
||||
|
||||
break;
|
||||
|
||||
+ case V3D_QPU_M_MOV:
|
||||
+ instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
|
||||
+
|
||||
+ if (!v3d_qpu_int32_unpack_unpack((raddr_d >> 2) & 0x7,
|
||||
+ &instr->alu.mul.a.unpack)) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ break;
|
||||
+
|
||||
default:
|
||||
instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
|
||||
instr->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
|
||||
@@ -1909,6 +1977,21 @@ v3d71_qpu_add_pack(const struct v3d_device_info *devinfo,
|
||||
opcode |= packed;
|
||||
break;
|
||||
|
||||
+ case V3D_QPU_A_MOV: {
|
||||
+ uint32_t packed;
|
||||
+
|
||||
+ if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE)
|
||||
+ return false;
|
||||
+
|
||||
+ if (!v3d_qpu_int32_unpack_pack(instr->alu.add.a.unpack,
|
||||
+ &packed)) {
|
||||
+ return false;
|
||||
+ }
|
||||
+
|
||||
+ raddr_b |= packed << 2;
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
default:
|
||||
if (instr->alu.add.op != V3D_QPU_A_NOP &&
|
||||
(instr->alu.add.output_pack != V3D_QPU_PACK_NONE ||
|
||||
@@ -2126,6 +2209,21 @@ v3d71_qpu_mul_pack(const struct v3d_device_info *devinfo,
|
||||
break;
|
||||
}
|
||||
|
||||
+ case V3D_QPU_M_MOV: {
|
||||
+ uint32_t packed;
|
||||
+
|
||||
+ if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE)
|
||||
+ return false;
|
||||
+
|
||||
+ if (!v3d_qpu_int32_unpack_pack(instr->alu.mul.a.unpack,
|
||||
+ &packed)) {
|
||||
+ return false;
|
||||
+ }
|
||||
+
|
||||
+ raddr_d |= packed << 2;
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
default:
|
||||
break;
|
||||
}
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,47 @@
|
||||
From 2e86dd0c357d7b432ce6794ae22fbfae89ad186b Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Wed, 6 Oct 2021 12:01:10 +0200
|
||||
Subject: [PATCH 044/139] broadcom/qpu: fail packing on unhandled mul
|
||||
pack/unpack
|
||||
|
||||
We are doing this for the ADD alu already and it may be helpful to
|
||||
identify cases where we have QPU code with pack/unpack modifiers on
|
||||
MUL opcodes that we then are not packing into the actual QPU
|
||||
instructions.
|
||||
---
|
||||
src/broadcom/qpu/qpu_pack.c | 12 ++++++++++++
|
||||
1 file changed, 12 insertions(+)
|
||||
|
||||
diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
|
||||
index 4d677894755..180d7ab08a3 100644
|
||||
--- a/src/broadcom/qpu/qpu_pack.c
|
||||
+++ b/src/broadcom/qpu/qpu_pack.c
|
||||
@@ -2106,6 +2106,12 @@ v3d33_qpu_mul_pack(const struct v3d_device_info *devinfo,
|
||||
}
|
||||
|
||||
default:
|
||||
+ if (instr->alu.mul.op != V3D_QPU_M_NOP &&
|
||||
+ (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE ||
|
||||
+ instr->alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
|
||||
+ instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)) {
|
||||
+ return false;
|
||||
+ }
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -2225,6 +2231,12 @@ v3d71_qpu_mul_pack(const struct v3d_device_info *devinfo,
|
||||
}
|
||||
|
||||
default:
|
||||
+ if (instr->alu.mul.op != V3D_QPU_M_NOP &&
|
||||
+ (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE ||
|
||||
+ instr->alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
|
||||
+ instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)) {
|
||||
+ return false;
|
||||
+ }
|
||||
break;
|
||||
}
|
||||
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,30 @@
|
||||
From ed6bfa29d43b5a89ff070961454f1e82e23b4f45 Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Fri, 8 Oct 2021 15:10:24 +0200
|
||||
Subject: [PATCH 045/139] broadcom/compiler: generalize check for shaders using
|
||||
pixel center W
|
||||
|
||||
V3D 4.x has pixel center W in rf0 and V3D 7.x has it in rf3. We already
|
||||
account for this when we setup the c->payload_w, so use that.
|
||||
---
|
||||
src/broadcom/compiler/nir_to_vir.c | 4 ++--
|
||||
1 file changed, 2 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
|
||||
index 220ff6bcd49..90fe1d1e7f0 100644
|
||||
--- a/src/broadcom/compiler/nir_to_vir.c
|
||||
+++ b/src/broadcom/compiler/nir_to_vir.c
|
||||
@@ -4547,8 +4547,8 @@ vir_check_payload_w(struct v3d_compile *c)
|
||||
|
||||
vir_for_each_inst_inorder(inst, c) {
|
||||
for (int i = 0; i < vir_get_nsrc(inst); i++) {
|
||||
- if (inst->src[i].file == QFILE_REG &&
|
||||
- inst->src[i].index == 0) {
|
||||
+ if (inst->src[i].file == c->payload_w.file &&
|
||||
+ inst->src[i].index == c->payload_w.index) {
|
||||
c->uses_center_w = true;
|
||||
return;
|
||||
}
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,34 @@
|
||||
From e1a0fa2c2010ef29b8cec798cd0fc99cf44f3a2d Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Thu, 14 Oct 2021 14:16:40 +0200
|
||||
Subject: [PATCH 046/139] broadcom/compiler: v71 isn't affected by
|
||||
double-rounding of viewport X,Y coords
|
||||
|
||||
---
|
||||
src/broadcom/compiler/v3d_nir_lower_io.c | 10 +++++++---
|
||||
1 file changed, 7 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/v3d_nir_lower_io.c b/src/broadcom/compiler/v3d_nir_lower_io.c
|
||||
index 3ef0e398228..4cdba3748a1 100644
|
||||
--- a/src/broadcom/compiler/v3d_nir_lower_io.c
|
||||
+++ b/src/broadcom/compiler/v3d_nir_lower_io.c
|
||||
@@ -600,9 +600,13 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
|
||||
* The correct fix for this as recommended by Broadcom
|
||||
* is to convert to .8 fixed-point with ffloor().
|
||||
*/
|
||||
- pos = nir_f2i32(b, nir_ffloor(b, pos));
|
||||
- v3d_nir_store_output(b, state->vp_vpm_offset + i,
|
||||
- offset_reg, pos);
|
||||
+ if (c->devinfo->ver <= 42)
|
||||
+ pos = nir_f2i32(b, nir_ffloor(b, pos));
|
||||
+ else
|
||||
+ pos = nir_f2i32(b, nir_fround_even(b, pos));
|
||||
+
|
||||
+ v3d_nir_store_output(b, state->vp_vpm_offset + i,
|
||||
+ offset_reg, pos);
|
||||
}
|
||||
}
|
||||
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,31 @@
|
||||
From 697e6cf01b781b244404872f331a778b6d4e67da Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Tue, 19 Oct 2021 11:16:43 +0200
|
||||
Subject: [PATCH 047/139] broadcom/compiler: update one TMUWT restriction for
|
||||
v71
|
||||
|
||||
The "TMUWT not allowed in the final instruction" restriction doesn't apply
|
||||
for v71.
|
||||
---
|
||||
src/broadcom/compiler/qpu_schedule.c | 4 +++-
|
||||
1 file changed, 3 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
|
||||
index ff544fb3c1c..25f79aa6f46 100644
|
||||
--- a/src/broadcom/compiler/qpu_schedule.c
|
||||
+++ b/src/broadcom/compiler/qpu_schedule.c
|
||||
@@ -1700,8 +1700,10 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
|
||||
|
||||
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
|
||||
/* GFXH-1625: TMUWT not allowed in the final instruction. */
|
||||
- if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT)
|
||||
+ if (c->devinfo->ver <= 42 && slot == 2 &&
|
||||
+ inst->alu.add.op == V3D_QPU_A_TMUWT) {
|
||||
return false;
|
||||
+ }
|
||||
|
||||
/* No writing physical registers at the end. */
|
||||
bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP;
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,37 @@
|
||||
From 26fea727a9f34b75a3fe3f6a806accaddcc317f6 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Tue, 19 Oct 2021 11:51:32 +0200
|
||||
Subject: [PATCH 048/139] broadcom/compiler: update ldunif/ldvary comment for
|
||||
v71
|
||||
|
||||
For v42 and below ldunif/ldvary both write to r5, but with a different
|
||||
delay, so we need to take that into account when scheduling both.
|
||||
|
||||
For v71 the register used is rf0, but the behaviour is the same. So
|
||||
the scheduling code can be the same, but the comment needs update.
|
||||
---
|
||||
src/broadcom/compiler/qpu_schedule.c | 7 ++++---
|
||||
1 file changed, 4 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
|
||||
index 25f79aa6f46..e8197661f89 100644
|
||||
--- a/src/broadcom/compiler/qpu_schedule.c
|
||||
+++ b/src/broadcom/compiler/qpu_schedule.c
|
||||
@@ -1234,10 +1234,11 @@ retry:
|
||||
if (pixel_scoreboard_too_soon(c, scoreboard, inst))
|
||||
continue;
|
||||
|
||||
- /* ldunif and ldvary both write r5, but ldunif does so a tick
|
||||
- * sooner. If the ldvary's r5 wasn't used, then ldunif might
|
||||
+ /* ldunif and ldvary both write the same register (r5 for v42
|
||||
+ * and below, rf0 for v71), but ldunif does so a tick sooner.
|
||||
+ * If the ldvary's register wasn't used, then ldunif might
|
||||
* otherwise get scheduled so ldunif and ldvary try to update
|
||||
- * r5 in the same tick.
|
||||
+ * the register in the same tick.
|
||||
*/
|
||||
if ((inst->sig.ldunif || inst->sig.ldunifa) &&
|
||||
scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,52 @@
|
||||
From 70456e27b039174f767010f96d9b649e5e42d84f Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Tue, 19 Oct 2021 23:52:30 +0200
|
||||
Subject: [PATCH 049/139] broadcom/compiler: update payload registers handling
|
||||
when computing live intervals
|
||||
|
||||
As for v71 the payload registers are not the same. Specifically now
|
||||
rf3 is used as payload register, so this is needed to avoid rf3 being
|
||||
selected as an instruction dst by the register allocator, overwriting
|
||||
the payload value that could be still used.
|
||||
---
|
||||
src/broadcom/compiler/vir_live_variables.c | 21 +++++++++++++--------
|
||||
1 file changed, 13 insertions(+), 8 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/vir_live_variables.c b/src/broadcom/compiler/vir_live_variables.c
|
||||
index 575b0481dc8..87a7e2b5b81 100644
|
||||
--- a/src/broadcom/compiler/vir_live_variables.c
|
||||
+++ b/src/broadcom/compiler/vir_live_variables.c
|
||||
@@ -179,17 +179,22 @@ vir_setup_def_use(struct v3d_compile *c)
|
||||
flags_inst = NULL;
|
||||
}
|
||||
|
||||
- /* Payload registers: r0/1/2 contain W, centroid W,
|
||||
- * and Z at program start. Register allocation will
|
||||
- * force their nodes to R0/1/2.
|
||||
+ /* Payload registers: for fragment shaders, W,
|
||||
+ * centroid W, and Z will be initialized at r0/1/2
|
||||
+ * until v42, or r1/r2/r3 from v71.
|
||||
+ *
|
||||
+ * For compute shaders, payload would be r0/r2 until
|
||||
+ * v42, r3/r2 from v71
|
||||
+ *
|
||||
+ * Register allocation will force their nodes to those
|
||||
+ * registers.
|
||||
*/
|
||||
if (inst->src[0].file == QFILE_REG) {
|
||||
- switch (inst->src[0].index) {
|
||||
- case 0:
|
||||
- case 1:
|
||||
- case 2:
|
||||
+ uint32_t min_payload_r = c->devinfo->ver >= 71 ? 1 : 0;
|
||||
+ uint32_t max_payload_r = c->devinfo->ver >= 71 ? 3 : 2;
|
||||
+ if (inst->src[0].index >= min_payload_r ||
|
||||
+ inst->src[0].index <= max_payload_r) {
|
||||
c->temp_start[inst->dst.index] = 0;
|
||||
- break;
|
||||
}
|
||||
}
|
||||
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,235 @@
|
||||
From f9a76b3a1e316e5ed6387819b87eaaf60f989a2b Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Tue, 26 Oct 2021 11:43:02 +0200
|
||||
Subject: [PATCH 050/139] broadcom/compiler: update peripheral access
|
||||
restrictions for v71
|
||||
|
||||
In V3D 4.x only a couple of simultaneous accesses were allowed, but
|
||||
V3D 7.x is a bit more flexible, so rather than trying to check for all
|
||||
the allowed combinations it is easier to check if we are one of the
|
||||
disallowed combinations.
|
||||
|
||||
Shader-db (pi5):
|
||||
|
||||
total instructions in shared programs: 11338883 -> 11307386 (-0.28%)
|
||||
instructions in affected programs: 2727201 -> 2695704 (-1.15%)
|
||||
helped: 12555
|
||||
HURT: 289
|
||||
Instructions are helped.
|
||||
|
||||
total max-temps in shared programs: 2230199 -> 2229260 (-0.04%)
|
||||
max-temps in affected programs: 20508 -> 19569 (-4.58%)
|
||||
helped: 608
|
||||
HURT: 4
|
||||
Max-temps are helped.
|
||||
|
||||
total sfu-stalls in shared programs: 15236 -> 15293 (0.37%)
|
||||
sfu-stalls in affected programs: 148 -> 205 (38.51%)
|
||||
helped: 38
|
||||
HURT: 64
|
||||
Inconclusive result (%-change mean confidence interval includes 0).
|
||||
|
||||
total inst-and-stalls in shared programs: 11354119 -> 11322679 (-0.28%)
|
||||
inst-and-stalls in affected programs: 2732262 -> 2700822 (-1.15%)
|
||||
helped: 12550
|
||||
HURT: 304
|
||||
Inst-and-stalls are helped.
|
||||
|
||||
total nops in shared programs: 273711 -> 274095 (0.14%)
|
||||
nops in affected programs: 9626 -> 10010 (3.99%)
|
||||
helped: 186
|
||||
HURT: 397
|
||||
Nops are HURT.
|
||||
---
|
||||
src/broadcom/compiler/qpu_schedule.c | 88 +++++++++++++++++++++-------
|
||||
src/broadcom/compiler/qpu_validate.c | 2 +-
|
||||
src/broadcom/qpu/qpu_instr.c | 16 +++--
|
||||
src/broadcom/qpu/qpu_instr.h | 2 +
|
||||
4 files changed, 82 insertions(+), 26 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
|
||||
index e8197661f89..adb501e85ce 100644
|
||||
--- a/src/broadcom/compiler/qpu_schedule.c
|
||||
+++ b/src/broadcom/compiler/qpu_schedule.c
|
||||
@@ -790,7 +790,8 @@ enum {
|
||||
V3D_PERIPHERAL_TMU_WAIT = (1 << 6),
|
||||
V3D_PERIPHERAL_TMU_WRTMUC_SIG = (1 << 7),
|
||||
V3D_PERIPHERAL_TSY = (1 << 8),
|
||||
- V3D_PERIPHERAL_TLB = (1 << 9),
|
||||
+ V3D_PERIPHERAL_TLB_READ = (1 << 9),
|
||||
+ V3D_PERIPHERAL_TLB_WRITE = (1 << 10),
|
||||
};
|
||||
|
||||
static uint32_t
|
||||
@@ -815,8 +816,10 @@ qpu_peripherals(const struct v3d_device_info *devinfo,
|
||||
if (v3d_qpu_uses_sfu(inst))
|
||||
result |= V3D_PERIPHERAL_SFU;
|
||||
|
||||
- if (v3d_qpu_uses_tlb(inst))
|
||||
- result |= V3D_PERIPHERAL_TLB;
|
||||
+ if (v3d_qpu_reads_tlb(inst))
|
||||
+ result |= V3D_PERIPHERAL_TLB_READ;
|
||||
+ if (v3d_qpu_writes_tlb(inst))
|
||||
+ result |= V3D_PERIPHERAL_TLB_WRITE;
|
||||
|
||||
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
|
||||
if (inst->alu.add.op != V3D_QPU_A_NOP &&
|
||||
@@ -847,32 +850,75 @@ qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo,
|
||||
if (devinfo->ver < 41)
|
||||
return false;
|
||||
|
||||
- /* V3D 4.1+ allow WRTMUC signal with TMU register write (other than
|
||||
- * tmuc).
|
||||
+ /* V3D 4.x can't do more than one peripheral access except in a
|
||||
+ * few cases:
|
||||
*/
|
||||
- if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
|
||||
- b_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
|
||||
- return v3d_qpu_writes_tmu_not_tmuc(devinfo, b);
|
||||
+ if (devinfo->ver <= 42) {
|
||||
+ /* WRTMUC signal with TMU register write (other than tmuc). */
|
||||
+ if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
|
||||
+ b_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
|
||||
+ return v3d_qpu_writes_tmu_not_tmuc(devinfo, b);
|
||||
+ }
|
||||
+ if (b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
|
||||
+ a_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
|
||||
+ return v3d_qpu_writes_tmu_not_tmuc(devinfo, a);
|
||||
+ }
|
||||
+
|
||||
+ /* TMU read with VPM read/write. */
|
||||
+ if (a_peripherals == V3D_PERIPHERAL_TMU_READ &&
|
||||
+ (b_peripherals == V3D_PERIPHERAL_VPM_READ ||
|
||||
+ b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
|
||||
+ return true;
|
||||
+ }
|
||||
+ if (b_peripherals == V3D_PERIPHERAL_TMU_READ &&
|
||||
+ (a_peripherals == V3D_PERIPHERAL_VPM_READ ||
|
||||
+ a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
|
||||
+ return true;
|
||||
+ }
|
||||
+
|
||||
+ return false;
|
||||
}
|
||||
|
||||
- if (a_peripherals == V3D_PERIPHERAL_TMU_WRITE &&
|
||||
- b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG) {
|
||||
- return v3d_qpu_writes_tmu_not_tmuc(devinfo, a);
|
||||
+ /* V3D 7.x can't have more than one of these restricted peripherals */
|
||||
+ const uint32_t restricted = V3D_PERIPHERAL_TMU_WRITE |
|
||||
+ V3D_PERIPHERAL_TMU_WRTMUC_SIG |
|
||||
+ V3D_PERIPHERAL_TSY |
|
||||
+ V3D_PERIPHERAL_TLB_READ |
|
||||
+ V3D_PERIPHERAL_SFU |
|
||||
+ V3D_PERIPHERAL_VPM_READ |
|
||||
+ V3D_PERIPHERAL_VPM_WRITE;
|
||||
+
|
||||
+ const uint32_t a_restricted = a_peripherals & restricted;
|
||||
+ const uint32_t b_restricted = b_peripherals & restricted;
|
||||
+ if (a_restricted && b_restricted) {
|
||||
+ /* WRTMUC signal with TMU register write (other than tmuc) is
|
||||
+ * allowed though.
|
||||
+ */
|
||||
+ if (!((a_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
|
||||
+ b_restricted == V3D_PERIPHERAL_TMU_WRITE &&
|
||||
+ v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) ||
|
||||
+ (b_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
|
||||
+ a_restricted == V3D_PERIPHERAL_TMU_WRITE &&
|
||||
+ v3d_qpu_writes_tmu_not_tmuc(devinfo, a)))) {
|
||||
+ return false;
|
||||
+ }
|
||||
}
|
||||
|
||||
- /* V3D 4.1+ allows TMU read with VPM read/write. */
|
||||
- if (a_peripherals == V3D_PERIPHERAL_TMU_READ &&
|
||||
- (b_peripherals == V3D_PERIPHERAL_VPM_READ ||
|
||||
- b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
|
||||
- return true;
|
||||
+ /* Only one TMU read per instruction */
|
||||
+ if ((a_peripherals & V3D_PERIPHERAL_TMU_READ) &&
|
||||
+ (b_peripherals & V3D_PERIPHERAL_TMU_READ)) {
|
||||
+ return false;
|
||||
}
|
||||
- if (b_peripherals == V3D_PERIPHERAL_TMU_READ &&
|
||||
- (a_peripherals == V3D_PERIPHERAL_VPM_READ ||
|
||||
- a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
|
||||
- return true;
|
||||
+
|
||||
+ /* Only one TLB access per instruction */
|
||||
+ if ((a_peripherals & (V3D_PERIPHERAL_TLB_WRITE |
|
||||
+ V3D_PERIPHERAL_TLB_READ)) &&
|
||||
+ (b_peripherals & (V3D_PERIPHERAL_TLB_WRITE |
|
||||
+ V3D_PERIPHERAL_TLB_READ))) {
|
||||
+ return false;
|
||||
}
|
||||
|
||||
- return false;
|
||||
+ return true;
|
||||
}
|
||||
|
||||
/* Compute a bitmask of which rf registers are used between
|
||||
diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c
|
||||
index 12788692432..fde6695d59b 100644
|
||||
--- a/src/broadcom/compiler/qpu_validate.c
|
||||
+++ b/src/broadcom/compiler/qpu_validate.c
|
||||
@@ -227,7 +227,7 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
|
||||
vpm_writes +
|
||||
tlb_writes +
|
||||
tsy_writes +
|
||||
- inst->sig.ldtmu +
|
||||
+ (devinfo->ver <= 42 ? inst->sig.ldtmu : 0) +
|
||||
inst->sig.ldtlb +
|
||||
inst->sig.ldvpm +
|
||||
inst->sig.ldtlbu > 1) {
|
||||
diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
|
||||
index 195a0dcd232..f54ce7210fb 100644
|
||||
--- a/src/broadcom/qpu/qpu_instr.c
|
||||
+++ b/src/broadcom/qpu/qpu_instr.c
|
||||
@@ -649,12 +649,14 @@ v3d_qpu_add_op_writes_vpm(enum v3d_qpu_add_op op)
|
||||
}
|
||||
|
||||
bool
|
||||
-v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst)
|
||||
+v3d_qpu_reads_tlb(const struct v3d_qpu_instr *inst)
|
||||
{
|
||||
- if (inst->sig.ldtlb ||
|
||||
- inst->sig.ldtlbu)
|
||||
- return true;
|
||||
+ return inst->sig.ldtlb || inst->sig.ldtlbu;
|
||||
+}
|
||||
|
||||
+bool
|
||||
+v3d_qpu_writes_tlb(const struct v3d_qpu_instr *inst)
|
||||
+{
|
||||
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
|
||||
if (inst->alu.add.op != V3D_QPU_A_NOP &&
|
||||
inst->alu.add.magic_write &&
|
||||
@@ -672,6 +674,12 @@ v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst)
|
||||
return false;
|
||||
}
|
||||
|
||||
+bool
|
||||
+v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst)
|
||||
+{
|
||||
+ return v3d_qpu_writes_tlb(inst) || v3d_qpu_reads_tlb(inst);
|
||||
+}
|
||||
+
|
||||
bool
|
||||
v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst)
|
||||
{
|
||||
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
|
||||
index 4b34d17bd4c..dece45c5c54 100644
|
||||
--- a/src/broadcom/qpu/qpu_instr.h
|
||||
+++ b/src/broadcom/qpu/qpu_instr.h
|
||||
@@ -472,6 +472,8 @@ bool v3d_qpu_magic_waddr_is_tlb(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
|
||||
bool v3d_qpu_magic_waddr_is_vpm(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
|
||||
bool v3d_qpu_magic_waddr_is_tsy(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
|
||||
bool v3d_qpu_magic_waddr_loads_unif(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
|
||||
+bool v3d_qpu_reads_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
|
||||
+bool v3d_qpu_writes_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
|
||||
bool v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
|
||||
bool v3d_qpu_instr_is_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
|
||||
bool v3d_qpu_instr_is_legacy_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,61 @@
|
||||
From 3520cceb87fb2f9765ba7dbe2771fbd0cadca78d Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Tue, 26 Oct 2021 08:37:54 +0200
|
||||
Subject: [PATCH 051/139] broadcom/qpu: add packing for fmov on ADD alu
|
||||
|
||||
---
|
||||
src/broadcom/qpu/qpu_pack.c | 31 +++++++++++++++++++++++++++++++
|
||||
1 file changed, 31 insertions(+)
|
||||
|
||||
diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
|
||||
index 180d7ab08a3..ed5a8bc667d 100644
|
||||
--- a/src/broadcom/qpu/qpu_pack.c
|
||||
+++ b/src/broadcom/qpu/qpu_pack.c
|
||||
@@ -1332,6 +1332,20 @@ v3d71_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst
|
||||
}
|
||||
break;
|
||||
|
||||
+ case V3D_QPU_A_FMOV:
|
||||
+ instr->alu.add.output_pack = raddr_b & 0x3;
|
||||
+
|
||||
+ /* Mul alu FMOV has one additional variant */
|
||||
+ int32_t unpack = (raddr_b >> 2) & 0x7;
|
||||
+ if (unpack == 7)
|
||||
+ return false;
|
||||
+
|
||||
+ if (!v3d_qpu_float32_unpack_unpack(unpack,
|
||||
+ &instr->alu.add.a.unpack)) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ break;
|
||||
+
|
||||
default:
|
||||
instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
|
||||
instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
|
||||
@@ -1992,6 +2006,23 @@ v3d71_qpu_add_pack(const struct v3d_device_info *devinfo,
|
||||
break;
|
||||
}
|
||||
|
||||
+ case V3D_QPU_A_FMOV: {
|
||||
+ uint32_t packed;
|
||||
+
|
||||
+ if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack,
|
||||
+ &packed)) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ raddr_b = packed;
|
||||
+
|
||||
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
|
||||
+ &packed)) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ raddr_b |= packed << 2;
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
default:
|
||||
if (instr->alu.add.op != V3D_QPU_A_NOP &&
|
||||
(instr->alu.add.output_pack != V3D_QPU_PACK_NONE ||
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,155 @@
|
||||
From 7c7ab15b3c9def4bc3bb5be492228a933c325f8a Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Wed, 6 Oct 2021 13:58:27 +0200
|
||||
Subject: [PATCH 052/139] broadcom/compiler: handle rf0 flops storage
|
||||
restriction in v71
|
||||
|
||||
---
|
||||
src/broadcom/compiler/qpu_schedule.c | 81 +++++++++++++++++++++++++++-
|
||||
1 file changed, 79 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
|
||||
index adb501e85ce..7048d9257b6 100644
|
||||
--- a/src/broadcom/compiler/qpu_schedule.c
|
||||
+++ b/src/broadcom/compiler/qpu_schedule.c
|
||||
@@ -538,6 +538,10 @@ struct choose_scoreboard {
|
||||
int ldvary_count;
|
||||
int pending_ldtmu_count;
|
||||
bool first_ldtmu_after_thrsw;
|
||||
+
|
||||
+ /* V3D 7.x */
|
||||
+ int last_implicit_rf0_write_tick;
|
||||
+ bool has_rf0_flops_conflict;
|
||||
};
|
||||
|
||||
static bool
|
||||
@@ -1499,6 +1503,62 @@ update_scoreboard_tmu_tracking(struct choose_scoreboard *scoreboard,
|
||||
}
|
||||
}
|
||||
|
||||
+static void
|
||||
+set_has_rf0_flops_conflict(struct choose_scoreboard *scoreboard,
|
||||
+ const struct v3d_qpu_instr *inst,
|
||||
+ const struct v3d_device_info *devinfo)
|
||||
+{
|
||||
+ if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick &&
|
||||
+ v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
|
||||
+ !inst->sig_magic) {
|
||||
+ scoreboard->has_rf0_flops_conflict = true;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+static void
|
||||
+update_scoreboard_for_rf0_flops(struct choose_scoreboard *scoreboard,
|
||||
+ const struct v3d_qpu_instr *inst,
|
||||
+ const struct v3d_device_info *devinfo)
|
||||
+{
|
||||
+ if (devinfo->ver < 71)
|
||||
+ return;
|
||||
+
|
||||
+ /* Thread switch restrictions:
|
||||
+ *
|
||||
+ * At the point of a thread switch or thread end (when the actual
|
||||
+ * thread switch or thread end happens, not when the signalling
|
||||
+ * instruction is processed):
|
||||
+ *
|
||||
+ * - If the most recent write to rf0 was from a ldunif, ldunifa, or
|
||||
+ * ldvary instruction in which another signal also wrote to the
|
||||
+ * register file, and the final instruction of the thread section
|
||||
+ * contained a signal which wrote to the register file, then the
|
||||
+ * value of rf0 is undefined at the start of the new section
|
||||
+ *
|
||||
+ * Here we use the scoreboard to track if our last rf0 implicit write
|
||||
+ * happens at the same time that another signal writes the register
|
||||
+ * file (has_rf0_flops_conflict). We will use that information when
|
||||
+ * scheduling thrsw instructions to avoid putting anything in their
|
||||
+ * last delay slot which has a signal that writes to the register file.
|
||||
+ */
|
||||
+
|
||||
+ /* Reset tracking if we have an explicit rf0 write or we are starting
|
||||
+ * a new thread section.
|
||||
+ */
|
||||
+ if (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) ||
|
||||
+ scoreboard->tick - scoreboard->last_thrsw_tick == 3) {
|
||||
+ scoreboard->last_implicit_rf0_write_tick = -10;
|
||||
+ scoreboard->has_rf0_flops_conflict = false;
|
||||
+ }
|
||||
+
|
||||
+ if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) {
|
||||
+ scoreboard->last_implicit_rf0_write_tick = inst->sig.ldvary ?
|
||||
+ scoreboard->tick + 1 : scoreboard->tick;
|
||||
+ }
|
||||
+
|
||||
+ set_has_rf0_flops_conflict(scoreboard, inst, devinfo);
|
||||
+}
|
||||
+
|
||||
static void
|
||||
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
|
||||
const struct qinst *qinst,
|
||||
@@ -1542,6 +1602,8 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
|
||||
if (inst->sig.ldvary)
|
||||
scoreboard->last_ldvary_tick = scoreboard->tick;
|
||||
|
||||
+ update_scoreboard_for_rf0_flops(scoreboard, inst, devinfo);
|
||||
+
|
||||
update_scoreboard_tmu_tracking(scoreboard, qinst);
|
||||
}
|
||||
|
||||
@@ -1812,6 +1874,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
|
||||
*/
|
||||
static bool
|
||||
qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
|
||||
+ struct choose_scoreboard *scoreboard,
|
||||
const struct qinst *qinst,
|
||||
uint32_t slot)
|
||||
{
|
||||
@@ -1842,6 +1905,17 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
|
||||
if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu))
|
||||
return false;
|
||||
|
||||
+ /* See comment when we set has_rf0_flops_conflict for details */
|
||||
+ if (c->devinfo->ver >= 71 &&
|
||||
+ slot == 2 &&
|
||||
+ v3d_qpu_sig_writes_address(c->devinfo, &qinst->qpu.sig) &&
|
||||
+ !qinst->qpu.sig_magic) {
|
||||
+ if (scoreboard->has_rf0_flops_conflict)
|
||||
+ return false;
|
||||
+ if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick)
|
||||
+ return false;
|
||||
+ }
|
||||
+
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1874,7 +1948,7 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
|
||||
* also apply to instructions scheduled after the thrsw that we want
|
||||
* to place in its delay slots.
|
||||
*/
|
||||
- if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
|
||||
+ if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, qinst, slot))
|
||||
return false;
|
||||
|
||||
/* TLB access is disallowed until scoreboard wait is executed, which
|
||||
@@ -1947,8 +2021,10 @@ valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard
|
||||
bool is_thrend)
|
||||
{
|
||||
for (int slot = 0; slot < instructions_in_sequence; slot++) {
|
||||
- if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
|
||||
+ if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard,
|
||||
+ qinst, slot)) {
|
||||
return false;
|
||||
+ }
|
||||
|
||||
if (is_thrend &&
|
||||
!qpu_inst_valid_in_thrend_slot(c, qinst, slot)) {
|
||||
@@ -2718,6 +2794,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c)
|
||||
scoreboard.last_setmsf_tick = -10;
|
||||
scoreboard.last_stallable_sfu_tick = -10;
|
||||
scoreboard.first_ldtmu_after_thrsw = true;
|
||||
+ scoreboard.last_implicit_rf0_write_tick = - 10;
|
||||
|
||||
if (debug) {
|
||||
fprintf(stderr, "Pre-schedule instructions\n");
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,189 @@
|
||||
From 0c6910721eb50b38b3388c2d2344b6ecfe0fee58 Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Wed, 27 Oct 2021 11:35:12 +0200
|
||||
Subject: [PATCH 053/139] broadcom/compiler: enable ldvary pipelining on v71
|
||||
|
||||
---
|
||||
src/broadcom/compiler/qpu_schedule.c | 121 ++++++++++++++++++---------
|
||||
1 file changed, 80 insertions(+), 41 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
|
||||
index 7048d9257b6..334ffdc6d58 100644
|
||||
--- a/src/broadcom/compiler/qpu_schedule.c
|
||||
+++ b/src/broadcom/compiler/qpu_schedule.c
|
||||
@@ -2312,46 +2312,72 @@ emit_branch(struct v3d_compile *c,
|
||||
}
|
||||
|
||||
static bool
|
||||
-alu_reads_register(struct v3d_qpu_instr *inst,
|
||||
+alu_reads_register(const struct v3d_device_info *devinfo,
|
||||
+ struct v3d_qpu_instr *inst,
|
||||
bool add, bool magic, uint32_t index)
|
||||
{
|
||||
uint32_t num_src;
|
||||
- enum v3d_qpu_mux mux_a, mux_b;
|
||||
-
|
||||
- if (add) {
|
||||
+ if (add)
|
||||
num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
|
||||
- mux_a = inst->alu.add.a.mux;
|
||||
- mux_b = inst->alu.add.b.mux;
|
||||
- } else {
|
||||
+ else
|
||||
num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
|
||||
- mux_a = inst->alu.mul.a.mux;
|
||||
- mux_b = inst->alu.mul.b.mux;
|
||||
- }
|
||||
|
||||
- for (int i = 0; i < num_src; i++) {
|
||||
- if (magic) {
|
||||
- if (i == 0 && mux_a == index)
|
||||
- return true;
|
||||
- if (i == 1 && mux_b == index)
|
||||
- return true;
|
||||
+ if (devinfo->ver <= 42) {
|
||||
+ enum v3d_qpu_mux mux_a, mux_b;
|
||||
+ if (add) {
|
||||
+ mux_a = inst->alu.add.a.mux;
|
||||
+ mux_b = inst->alu.add.b.mux;
|
||||
} else {
|
||||
- if (i == 0 && mux_a == V3D_QPU_MUX_A &&
|
||||
- inst->raddr_a == index) {
|
||||
- return true;
|
||||
- }
|
||||
- if (i == 0 && mux_a == V3D_QPU_MUX_B &&
|
||||
- inst->raddr_b == index) {
|
||||
- return true;
|
||||
- }
|
||||
- if (i == 1 && mux_b == V3D_QPU_MUX_A &&
|
||||
- inst->raddr_a == index) {
|
||||
- return true;
|
||||
- }
|
||||
- if (i == 1 && mux_b == V3D_QPU_MUX_B &&
|
||||
- inst->raddr_b == index) {
|
||||
- return true;
|
||||
+ mux_a = inst->alu.mul.a.mux;
|
||||
+ mux_b = inst->alu.mul.b.mux;
|
||||
+ }
|
||||
+
|
||||
+ for (int i = 0; i < num_src; i++) {
|
||||
+ if (magic) {
|
||||
+ if (i == 0 && mux_a == index)
|
||||
+ return true;
|
||||
+ if (i == 1 && mux_b == index)
|
||||
+ return true;
|
||||
+ } else {
|
||||
+ if (i == 0 && mux_a == V3D_QPU_MUX_A &&
|
||||
+ inst->raddr_a == index) {
|
||||
+ return true;
|
||||
+ }
|
||||
+ if (i == 0 && mux_a == V3D_QPU_MUX_B &&
|
||||
+ inst->raddr_b == index) {
|
||||
+ return true;
|
||||
+ }
|
||||
+ if (i == 1 && mux_b == V3D_QPU_MUX_A &&
|
||||
+ inst->raddr_a == index) {
|
||||
+ return true;
|
||||
+ }
|
||||
+ if (i == 1 && mux_b == V3D_QPU_MUX_B &&
|
||||
+ inst->raddr_b == index) {
|
||||
+ return true;
|
||||
+ }
|
||||
}
|
||||
}
|
||||
+
|
||||
+ return false;
|
||||
+ }
|
||||
+
|
||||
+ assert(devinfo->ver >= 71);
|
||||
+ assert(!magic);
|
||||
+
|
||||
+ uint32_t raddr_a, raddr_b;
|
||||
+ if (add) {
|
||||
+ raddr_a = inst->alu.add.a.raddr;
|
||||
+ raddr_b = inst->alu.add.b.raddr;
|
||||
+ } else {
|
||||
+ raddr_a = inst->alu.mul.a.raddr;
|
||||
+ raddr_b = inst->alu.mul.b.raddr;
|
||||
+ }
|
||||
+
|
||||
+ for (int i = 0; i < num_src; i++) {
|
||||
+ if (i == 0 && raddr_a == index)
|
||||
+ return true;
|
||||
+ if (i == 1 && raddr_b == index)
|
||||
+ return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
@@ -2386,6 +2412,8 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
|
||||
struct qblock *block,
|
||||
struct v3d_qpu_instr *inst)
|
||||
{
|
||||
+ const struct v3d_device_info *devinfo = c->devinfo;
|
||||
+
|
||||
/* We only call this if we have successfully merged an ldvary into a
|
||||
* previous instruction.
|
||||
*/
|
||||
@@ -2398,9 +2426,9 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
|
||||
* the ldvary destination, if it does, then moving the ldvary before
|
||||
* it would overwrite it.
|
||||
*/
|
||||
- if (alu_reads_register(inst, true, ldvary_magic, ldvary_index))
|
||||
+ if (alu_reads_register(devinfo, inst, true, ldvary_magic, ldvary_index))
|
||||
return false;
|
||||
- if (alu_reads_register(inst, false, ldvary_magic, ldvary_index))
|
||||
+ if (alu_reads_register(devinfo, inst, false, ldvary_magic, ldvary_index))
|
||||
return false;
|
||||
|
||||
/* The implicit ldvary destination may not be written to by a signal
|
||||
@@ -2436,13 +2464,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
|
||||
}
|
||||
|
||||
/* The previous instruction cannot have a conflicting signal */
|
||||
- if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig))
|
||||
+ if (v3d_qpu_sig_writes_address(devinfo, &prev->qpu.sig))
|
||||
return false;
|
||||
|
||||
uint32_t sig;
|
||||
struct v3d_qpu_sig new_sig = prev->qpu.sig;
|
||||
new_sig.ldvary = true;
|
||||
- if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig))
|
||||
+ if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig))
|
||||
return false;
|
||||
|
||||
/* The previous instruction cannot use flags since ldvary uses the
|
||||
@@ -2471,14 +2499,25 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
|
||||
inst->sig_magic = false;
|
||||
inst->sig_addr = 0;
|
||||
|
||||
- /* By moving ldvary to the previous instruction we make it update
|
||||
- * r5 in the current one, so nothing else in it should write r5.
|
||||
- * This should've been prevented by our dependency tracking, which
|
||||
+ /* Update rf0 flops tracking for new ldvary delayed rf0 write tick */
|
||||
+ if (devinfo->ver >= 71) {
|
||||
+ scoreboard->last_implicit_rf0_write_tick = scoreboard->tick;
|
||||
+ set_has_rf0_flops_conflict(scoreboard, inst, devinfo);
|
||||
+ }
|
||||
+
|
||||
+ /* By moving ldvary to the previous instruction we make it update r5
|
||||
+ * (rf0 for ver >= 71) in the current one, so nothing else in it
|
||||
+ * should write this register.
|
||||
+ *
|
||||
+ * This should've been prevented by our depedency tracking, which
|
||||
* would not allow ldvary to be paired up with an instruction that
|
||||
- * writes r5 (since our dependency tracking doesn't know that the
|
||||
- * ldvary write r5 happens in the next instruction).
|
||||
+ * writes r5/rf0 (since our dependency tracking doesn't know that the
|
||||
+ * ldvary write to r5/rf0 happens in the next instruction).
|
||||
*/
|
||||
- assert(!v3d_qpu_writes_r5(c->devinfo, inst));
|
||||
+ assert(!v3d_qpu_writes_r5(devinfo, inst));
|
||||
+ assert(devinfo->ver <= 42 ||
|
||||
+ (!v3d_qpu_writes_rf0_implicitly(devinfo, inst) &&
|
||||
+ !v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0)));
|
||||
|
||||
return true;
|
||||
}
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,144 @@
|
||||
From 0670d642bb91fc68ce73f2d9fb88c482295a446d Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Thu, 28 Oct 2021 14:13:29 +0200
|
||||
Subject: [PATCH 054/139] broadcom/compiler: try to use ldunif(a) instead of
|
||||
ldunif(a)rf in v71
|
||||
|
||||
The rf variants need to encode the destination in the cond bits, which
|
||||
prevents these from being merged with any other instruction that needs them.
|
||||
|
||||
In 4.x, ldunif(a) write to r5 which is a special register that only
|
||||
ldunif(a) and ldvary can write so we have a special register class for
|
||||
it and only allow it for them. Then when we need to choose a register
|
||||
for a node, if this register is available we always use it.
|
||||
|
||||
In 7.x these instructions write to rf0, which can be used by any
|
||||
instruction, so instead of restricting rf0, we track the temps that
|
||||
are used as ldunif(a) destinations and use that information to favor
|
||||
rf0 for them.
|
||||
---
|
||||
src/broadcom/compiler/v3d_compiler.h | 3 ++
|
||||
src/broadcom/compiler/vir_register_allocate.c | 34 ++++++++++++++++---
|
||||
src/broadcom/compiler/vir_to_qpu.c | 11 ++++--
|
||||
3 files changed, 41 insertions(+), 7 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
|
||||
index 7e8f3bfc1a7..36adf8830b5 100644
|
||||
--- a/src/broadcom/compiler/v3d_compiler.h
|
||||
+++ b/src/broadcom/compiler/v3d_compiler.h
|
||||
@@ -613,6 +613,9 @@ struct v3d_ra_node_info {
|
||||
struct {
|
||||
uint32_t priority;
|
||||
uint8_t class_bits;
|
||||
+
|
||||
+ /* V3D 7.x */
|
||||
+ bool is_ldunif_dst;
|
||||
} *info;
|
||||
uint32_t alloc_count;
|
||||
};
|
||||
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
|
||||
index e0adc1de7a4..1be091f8518 100644
|
||||
--- a/src/broadcom/compiler/vir_register_allocate.c
|
||||
+++ b/src/broadcom/compiler/vir_register_allocate.c
|
||||
@@ -384,6 +384,7 @@ add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits)
|
||||
/* We fill the node priority after we are done inserting spills */
|
||||
c->nodes.info[node].class_bits = class_bits;
|
||||
c->nodes.info[node].priority = 0;
|
||||
+ c->nodes.info[node].is_ldunif_dst = false;
|
||||
}
|
||||
|
||||
/* The spill offset for this thread takes a bit of setup, so do it once at
|
||||
@@ -899,9 +900,22 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra,
|
||||
|
||||
static bool
|
||||
v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
|
||||
+ unsigned int node,
|
||||
BITSET_WORD *regs,
|
||||
unsigned int *out)
|
||||
{
|
||||
+ /* In V3D 7.x, try to assign rf0 to temps used as ldunif's dst
|
||||
+ * so we can avoid turning them into ldunifrf (which uses the
|
||||
+ * cond field to encode the dst and would prevent merge with
|
||||
+ * instructions that use cond flags).
|
||||
+ */
|
||||
+ if (v3d_ra->nodes->info[node].is_ldunif_dst &&
|
||||
+ BITSET_TEST(regs, v3d_ra->phys_index)) {
|
||||
+ assert(v3d_ra->devinfo->ver >= 71);
|
||||
+ *out = v3d_ra->phys_index;
|
||||
+ return true;
|
||||
+ }
|
||||
+
|
||||
for (int i = 0; i < PHYS_COUNT; i++) {
|
||||
int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
|
||||
int phys = v3d_ra->phys_index + phys_off;
|
||||
@@ -927,7 +941,7 @@ v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data)
|
||||
return reg;
|
||||
}
|
||||
|
||||
- if (v3d_ra_select_rf(v3d_ra, regs, ®))
|
||||
+ if (v3d_ra_select_rf(v3d_ra, n, regs, ®))
|
||||
return reg;
|
||||
|
||||
/* If we ran out of physical registers try to assign an accumulator
|
||||
@@ -1139,15 +1153,24 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c,
|
||||
}
|
||||
}
|
||||
} else {
|
||||
- /* If the instruction has an implicit write
|
||||
- * we can't allocate its dest to the same
|
||||
- * register.
|
||||
+ /* Make sure we don't allocate the ldvary's
|
||||
+ * destination to rf0, since it would clash
|
||||
+ * with its implicit write to that register.
|
||||
*/
|
||||
- if (v3d_qpu_writes_rf0_implicitly(c->devinfo, &inst->qpu)) {
|
||||
+ if (inst->qpu.sig.ldvary) {
|
||||
ra_add_node_interference(c->g,
|
||||
temp_to_node(c, inst->dst.index),
|
||||
implicit_rf_nodes[0]);
|
||||
}
|
||||
+ /* Flag dst temps from ldunif(a) instructions
|
||||
+ * so we can try to assign rf0 to them and avoid
|
||||
+ * converting these to ldunif(a)rf.
|
||||
+ */
|
||||
+ if (inst->qpu.sig.ldunif || inst->qpu.sig.ldunifa) {
|
||||
+ const uint32_t dst_n =
|
||||
+ temp_to_node(c, inst->dst.index);
|
||||
+ c->nodes.info[dst_n].is_ldunif_dst = true;
|
||||
+ }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1222,6 +1245,7 @@ v3d_register_allocate(struct v3d_compile *c)
|
||||
* without accumulators that can have implicit writes to phys regs.
|
||||
*/
|
||||
for (uint32_t i = 0; i < num_ra_nodes; i++) {
|
||||
+ c->nodes.info[i].is_ldunif_dst = false;
|
||||
if (c->devinfo->has_accumulators && i < ACC_COUNT) {
|
||||
acc_nodes[i] = i;
|
||||
ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i);
|
||||
diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c
|
||||
index afc4941fdb1..cbbb495592b 100644
|
||||
--- a/src/broadcom/compiler/vir_to_qpu.c
|
||||
+++ b/src/broadcom/compiler/vir_to_qpu.c
|
||||
@@ -345,8 +345,15 @@ v3d_generate_code_block(struct v3d_compile *c,
|
||||
assert(qinst->qpu.alu.add.op == V3D_QPU_A_NOP);
|
||||
assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);
|
||||
|
||||
- if (!dst.magic ||
|
||||
- dst.index != V3D_QPU_WADDR_R5) {
|
||||
+ bool use_rf;
|
||||
+ if (c->devinfo->has_accumulators) {
|
||||
+ use_rf = !dst.magic ||
|
||||
+ dst.index != V3D_QPU_WADDR_R5;
|
||||
+ } else {
|
||||
+ use_rf = dst.magic || dst.index != 0;
|
||||
+ }
|
||||
+
|
||||
+ if (use_rf) {
|
||||
assert(c->devinfo->ver >= 40);
|
||||
|
||||
if (qinst->qpu.sig.ldunif) {
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,82 @@
|
||||
From cbed3b97394da09c9ae644c79e098e3ba8b5c3e8 Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Fri, 29 Oct 2021 13:00:56 +0200
|
||||
Subject: [PATCH 055/139] broadcom/compiler: don't assign rf0 to temps that
|
||||
conflict with ldvary
|
||||
|
||||
ldvary writes to rf0 implicitly, so we don't want to allocate rf0 to
|
||||
any temps that are live across ldvary's rf0 live ranges.
|
||||
---
|
||||
src/broadcom/compiler/vir_register_allocate.c | 39 ++++++++++++++++++-
|
||||
1 file changed, 38 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
|
||||
index 1be091f8518..6f7b1ca0589 100644
|
||||
--- a/src/broadcom/compiler/vir_register_allocate.c
|
||||
+++ b/src/broadcom/compiler/vir_register_allocate.c
|
||||
@@ -1019,6 +1019,7 @@ static void
|
||||
update_graph_and_reg_classes_for_inst(struct v3d_compile *c,
|
||||
int *acc_nodes,
|
||||
int *implicit_rf_nodes,
|
||||
+ int last_ldvary_ip,
|
||||
struct qinst *inst)
|
||||
{
|
||||
int32_t ip = inst->ip;
|
||||
@@ -1125,6 +1126,25 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c,
|
||||
}
|
||||
}
|
||||
|
||||
+ /* Don't allocate rf0 to temps that cross ranges where we have
|
||||
+ * live implicit rf0 writes from ldvary. We can identify these
|
||||
+ * by tracking the last ldvary instruction and explicit reads
|
||||
+ * of rf0.
|
||||
+ */
|
||||
+ if (c->devinfo->ver >= 71 &&
|
||||
+ ((inst->src[0].file == QFILE_REG && inst->src[0].index == 0) ||
|
||||
+ (vir_get_nsrc(inst) > 1 &&
|
||||
+ inst->src[1].file == QFILE_REG && inst->src[1].index == 0))) {
|
||||
+ for (int i = 0; i < c->num_temps; i++) {
|
||||
+ if (c->temp_start[i] < ip &&
|
||||
+ c->temp_end[i] > last_ldvary_ip) {
|
||||
+ ra_add_node_interference(c->g,
|
||||
+ temp_to_node(c, i),
|
||||
+ implicit_rf_nodes[0]);
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
if (inst->dst.file == QFILE_TEMP) {
|
||||
/* Only a ldunif gets to write to R5, which only has a
|
||||
* single 32-bit channel of storage.
|
||||
@@ -1270,10 +1290,27 @@ v3d_register_allocate(struct v3d_compile *c)
|
||||
* interferences.
|
||||
*/
|
||||
int ip = 0;
|
||||
+ int last_ldvary_ip = -1;
|
||||
vir_for_each_inst_inorder(inst, c) {
|
||||
inst->ip = ip++;
|
||||
+
|
||||
+ /* ldunif(a) always write to a temporary, so we have
|
||||
+ * liveness info available to decide if rf0 is
|
||||
+ * available for them, however, ldvary is different:
|
||||
+ * it always writes to rf0 directly so we don't have
|
||||
+ * liveness information for its implicit rf0 write.
|
||||
+ *
|
||||
+ * That means the allocator may assign rf0 to a temp
|
||||
+ * that is defined while an implicit rf0 write from
|
||||
+ * ldvary is still live. We fix that by manually
|
||||
+ * tracking rf0 live ranges from ldvary instructions.
|
||||
+ */
|
||||
+ if (inst->qpu.sig.ldvary)
|
||||
+ last_ldvary_ip = ip;
|
||||
+
|
||||
update_graph_and_reg_classes_for_inst(c, acc_nodes,
|
||||
- implicit_rf_nodes, inst);
|
||||
+ implicit_rf_nodes,
|
||||
+ last_ldvary_ip, inst);
|
||||
}
|
||||
|
||||
/* Set the register classes for all our temporaries in the graph */
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,139 @@
|
||||
From cbaa469c09974c1574b16f559173694904fe1bb0 Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Mon, 25 Oct 2021 09:38:57 +0200
|
||||
Subject: [PATCH 056/139] broadcom/compiler: convert mul to add when needed to
|
||||
allow merge
|
||||
|
||||
V3D 7.x added 'mov' opcodes to the ADD alu, so now it is possible to
|
||||
move these to the ADD alu to facilitate merging them with other MUL
|
||||
instructions.
|
||||
---
|
||||
src/broadcom/compiler/qpu_schedule.c | 102 ++++++++++++++++++++++++---
|
||||
1 file changed, 94 insertions(+), 8 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
|
||||
index 334ffdc6d58..caa84254998 100644
|
||||
--- a/src/broadcom/compiler/qpu_schedule.c
|
||||
+++ b/src/broadcom/compiler/qpu_schedule.c
|
||||
@@ -1086,6 +1086,57 @@ qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
|
||||
inst->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
|
||||
}
|
||||
|
||||
+static bool
|
||||
+can_do_mul_as_add(const struct v3d_device_info *devinfo, enum v3d_qpu_mul_op op)
|
||||
+{
|
||||
+ switch (op) {
|
||||
+ case V3D_QPU_M_MOV:
|
||||
+ case V3D_QPU_M_FMOV:
|
||||
+ return devinfo->ver >= 71;
|
||||
+ default:
|
||||
+ return false;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+static enum v3d_qpu_mul_op
|
||||
+mul_op_as_add_op(enum v3d_qpu_mul_op op)
|
||||
+{
|
||||
+ switch (op) {
|
||||
+ case V3D_QPU_M_MOV:
|
||||
+ return V3D_QPU_A_MOV;
|
||||
+ case V3D_QPU_M_FMOV:
|
||||
+ return V3D_QPU_A_FMOV;
|
||||
+ default:
|
||||
+ unreachable("unexpected mov opcode");
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+static void
|
||||
+qpu_convert_mul_to_add(struct v3d_qpu_instr *inst)
|
||||
+{
|
||||
+ STATIC_ASSERT(sizeof(inst->alu.add) == sizeof(inst->alu.mul));
|
||||
+ assert(inst->alu.mul.op != V3D_QPU_M_NOP);
|
||||
+ assert(inst->alu.add.op == V3D_QPU_A_NOP);
|
||||
+
|
||||
+ memcpy(&inst->alu.add, &inst->alu.mul, sizeof(inst->alu.add));
|
||||
+ inst->alu.add.op = mul_op_as_add_op(inst->alu.mul.op);
|
||||
+ inst->alu.mul.op = V3D_QPU_M_NOP;
|
||||
+
|
||||
+ inst->flags.ac = inst->flags.mc;
|
||||
+ inst->flags.apf = inst->flags.mpf;
|
||||
+ inst->flags.auf = inst->flags.muf;
|
||||
+ inst->flags.mc = V3D_QPU_COND_NONE;
|
||||
+ inst->flags.mpf = V3D_QPU_PF_NONE;
|
||||
+ inst->flags.muf = V3D_QPU_UF_NONE;
|
||||
+
|
||||
+ inst->alu.add.output_pack = inst->alu.mul.output_pack;
|
||||
+ inst->alu.add.a.unpack = inst->alu.mul.a.unpack;
|
||||
+ inst->alu.add.b.unpack = inst->alu.mul.b.unpack;
|
||||
+ inst->alu.mul.output_pack = V3D_QPU_PACK_NONE;
|
||||
+ inst->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
|
||||
+ inst->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
|
||||
+}
|
||||
+
|
||||
static bool
|
||||
qpu_merge_inst(const struct v3d_device_info *devinfo,
|
||||
struct v3d_qpu_instr *result,
|
||||
@@ -1151,17 +1202,52 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
|
||||
}
|
||||
}
|
||||
|
||||
+ struct v3d_qpu_instr add_inst;
|
||||
if (b->alu.mul.op != V3D_QPU_M_NOP) {
|
||||
- if (a->alu.mul.op != V3D_QPU_M_NOP)
|
||||
- return false;
|
||||
- merge.alu.mul = b->alu.mul;
|
||||
+ if (a->alu.mul.op == V3D_QPU_M_NOP) {
|
||||
+ merge.alu.mul = b->alu.mul;
|
||||
+
|
||||
+ merge.flags.mc = b->flags.mc;
|
||||
+ merge.flags.mpf = b->flags.mpf;
|
||||
+ merge.flags.muf = b->flags.muf;
|
||||
+
|
||||
+ mul_instr = b;
|
||||
+ add_instr = a;
|
||||
+ }
|
||||
+ /* If a's mul op is used but its add op is not, then see if we
|
||||
+ * can convert either a's mul op or b's mul op to an add op
|
||||
+ * so we can merge.
|
||||
+ */
|
||||
+ else if (a->alu.add.op == V3D_QPU_A_NOP &&
|
||||
+ can_do_mul_as_add(devinfo, b->alu.mul.op)) {
|
||||
+ add_inst = *b;
|
||||
+ qpu_convert_mul_to_add(&add_inst);
|
||||
|
||||
- merge.flags.mc = b->flags.mc;
|
||||
- merge.flags.mpf = b->flags.mpf;
|
||||
- merge.flags.muf = b->flags.muf;
|
||||
+ merge.alu.add = add_inst.alu.add;
|
||||
|
||||
- mul_instr = b;
|
||||
- add_instr = a;
|
||||
+ merge.flags.ac = b->flags.mc;
|
||||
+ merge.flags.apf = b->flags.mpf;
|
||||
+ merge.flags.auf = b->flags.muf;
|
||||
+
|
||||
+ mul_instr = a;
|
||||
+ add_instr = &add_inst;
|
||||
+ } else if (a->alu.add.op == V3D_QPU_A_NOP &&
|
||||
+ can_do_mul_as_add(devinfo, a->alu.mul.op)) {
|
||||
+ add_inst = *a;
|
||||
+ qpu_convert_mul_to_add(&add_inst);
|
||||
+
|
||||
+ merge = add_inst;
|
||||
+ merge.alu.mul = b->alu.mul;
|
||||
+
|
||||
+ merge.flags.mc = b->flags.mc;
|
||||
+ merge.flags.mpf = b->flags.mpf;
|
||||
+ merge.flags.muf = b->flags.muf;
|
||||
+
|
||||
+ mul_instr = b;
|
||||
+ add_instr = &add_inst;
|
||||
+ } else {
|
||||
+ return false;
|
||||
+ }
|
||||
}
|
||||
|
||||
/* V3D 4.x and earlier use muxes to select the inputs for the ALUs and
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,418 @@
|
||||
From b59b3725fb16f4ab1ac0db86a5452a4ed6176074 Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Wed, 3 Nov 2021 10:34:19 +0100
|
||||
Subject: [PATCH 057/139] broadcom/compiler: implement small immediates for v71
|
||||
|
||||
---
|
||||
src/broadcom/compiler/qpu_schedule.c | 90 +++++++++++++------
|
||||
src/broadcom/compiler/qpu_validate.c | 20 ++++-
|
||||
.../compiler/vir_opt_small_immediates.c | 26 +++++-
|
||||
src/broadcom/compiler/vir_to_qpu.c | 11 ++-
|
||||
src/broadcom/qpu/qpu_disasm.c | 1 -
|
||||
src/broadcom/qpu/qpu_instr.c | 8 +-
|
||||
src/broadcom/qpu/qpu_instr.h | 2 +-
|
||||
src/broadcom/qpu/qpu_pack.c | 36 ++++----
|
||||
8 files changed, 139 insertions(+), 55 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
|
||||
index caa84254998..bd1c920848a 100644
|
||||
--- a/src/broadcom/compiler/qpu_schedule.c
|
||||
+++ b/src/broadcom/compiler/qpu_schedule.c
|
||||
@@ -714,7 +714,6 @@ qpu_instruction_uses_rf(const struct v3d_device_info *devinfo,
|
||||
!inst->sig.small_imm_b && (inst->raddr_b == waddr))
|
||||
return true;
|
||||
} else {
|
||||
- /* FIXME: skip if small immediate */
|
||||
if (v3d71_qpu_reads_raddr(inst, waddr))
|
||||
return true;
|
||||
}
|
||||
@@ -948,10 +947,11 @@ qpu_raddrs_used(const struct v3d_qpu_instr *a,
|
||||
return raddrs_used;
|
||||
}
|
||||
|
||||
-/* Take two instructions and attempt to merge their raddr fields
|
||||
- * into one merged instruction. Returns false if the two instructions
|
||||
- * access more than two different rf registers between them, or more
|
||||
- * than one rf register and one small immediate.
|
||||
+/* Takes two instructions and attempts to merge their raddr fields (including
|
||||
+ * small immediates) into one merged instruction. For V3D 4.x, returns false
|
||||
+ * if the two instructions access more than two different rf registers between
|
||||
+ * them, or more than one rf register and one small immediate. For 7.x returns
|
||||
+ * false if both instructions use small immediates.
|
||||
*/
|
||||
static bool
|
||||
qpu_merge_raddrs(struct v3d_qpu_instr *result,
|
||||
@@ -959,6 +959,27 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result,
|
||||
const struct v3d_qpu_instr *mul_instr,
|
||||
const struct v3d_device_info *devinfo)
|
||||
{
|
||||
+ if (devinfo->ver >= 71) {
|
||||
+ assert(add_instr->sig.small_imm_a +
|
||||
+ add_instr->sig.small_imm_b <= 1);
|
||||
+ assert(add_instr->sig.small_imm_c +
|
||||
+ add_instr->sig.small_imm_d == 0);
|
||||
+ assert(mul_instr->sig.small_imm_a +
|
||||
+ mul_instr->sig.small_imm_b == 0);
|
||||
+ assert(mul_instr->sig.small_imm_c +
|
||||
+ mul_instr->sig.small_imm_d <= 1);
|
||||
+
|
||||
+ result->sig.small_imm_a = add_instr->sig.small_imm_a;
|
||||
+ result->sig.small_imm_b = add_instr->sig.small_imm_b;
|
||||
+ result->sig.small_imm_c = mul_instr->sig.small_imm_c;
|
||||
+ result->sig.small_imm_d = mul_instr->sig.small_imm_d;
|
||||
+
|
||||
+ return (result->sig.small_imm_a +
|
||||
+ result->sig.small_imm_b +
|
||||
+ result->sig.small_imm_c +
|
||||
+ result->sig.small_imm_d) <= 1;
|
||||
+ }
|
||||
+
|
||||
assert(devinfo->ver <= 42);
|
||||
|
||||
uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr);
|
||||
@@ -1060,7 +1081,8 @@ add_op_as_mul_op(enum v3d_qpu_add_op op)
|
||||
}
|
||||
|
||||
static void
|
||||
-qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
|
||||
+qpu_convert_add_to_mul(const struct v3d_device_info *devinfo,
|
||||
+ struct v3d_qpu_instr *inst)
|
||||
{
|
||||
STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add));
|
||||
assert(inst->alu.add.op != V3D_QPU_A_NOP);
|
||||
@@ -1084,6 +1106,18 @@ qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
|
||||
inst->alu.add.output_pack = V3D_QPU_PACK_NONE;
|
||||
inst->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
|
||||
inst->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
|
||||
+
|
||||
+ if (devinfo->ver >= 71) {
|
||||
+ assert(!inst->sig.small_imm_c && !inst->sig.small_imm_d);
|
||||
+ assert(inst->sig.small_imm_a + inst->sig.small_imm_b <= 1);
|
||||
+ if (inst->sig.small_imm_a) {
|
||||
+ inst->sig.small_imm_c = true;
|
||||
+ inst->sig.small_imm_a = false;
|
||||
+ } else if (inst->sig.small_imm_b) {
|
||||
+ inst->sig.small_imm_d = true;
|
||||
+ inst->sig.small_imm_b = false;
|
||||
+ }
|
||||
+ }
|
||||
}
|
||||
|
||||
static bool
|
||||
@@ -1135,6 +1169,16 @@ qpu_convert_mul_to_add(struct v3d_qpu_instr *inst)
|
||||
inst->alu.mul.output_pack = V3D_QPU_PACK_NONE;
|
||||
inst->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
|
||||
inst->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
|
||||
+
|
||||
+ assert(!inst->sig.small_imm_a && !inst->sig.small_imm_b);
|
||||
+ assert(inst->sig.small_imm_c + inst->sig.small_imm_d <= 1);
|
||||
+ if (inst->sig.small_imm_c) {
|
||||
+ inst->sig.small_imm_a = true;
|
||||
+ inst->sig.small_imm_c = false;
|
||||
+ } else if (inst->sig.small_imm_d) {
|
||||
+ inst->sig.small_imm_b = true;
|
||||
+ inst->sig.small_imm_d = false;
|
||||
+ }
|
||||
}
|
||||
|
||||
static bool
|
||||
@@ -1173,20 +1217,20 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
|
||||
else if (a->alu.mul.op == V3D_QPU_M_NOP &&
|
||||
can_do_add_as_mul(b->alu.add.op)) {
|
||||
mul_inst = *b;
|
||||
- qpu_convert_add_to_mul(&mul_inst);
|
||||
+ qpu_convert_add_to_mul(devinfo, &mul_inst);
|
||||
|
||||
merge.alu.mul = mul_inst.alu.mul;
|
||||
|
||||
- merge.flags.mc = b->flags.ac;
|
||||
- merge.flags.mpf = b->flags.apf;
|
||||
- merge.flags.muf = b->flags.auf;
|
||||
+ merge.flags.mc = mul_inst.flags.mc;
|
||||
+ merge.flags.mpf = mul_inst.flags.mpf;
|
||||
+ merge.flags.muf = mul_inst.flags.muf;
|
||||
|
||||
add_instr = a;
|
||||
mul_instr = &mul_inst;
|
||||
} else if (a->alu.mul.op == V3D_QPU_M_NOP &&
|
||||
can_do_add_as_mul(a->alu.add.op)) {
|
||||
mul_inst = *a;
|
||||
- qpu_convert_add_to_mul(&mul_inst);
|
||||
+ qpu_convert_add_to_mul(devinfo, &mul_inst);
|
||||
|
||||
merge = mul_inst;
|
||||
merge.alu.add = b->alu.add;
|
||||
@@ -1225,9 +1269,9 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
|
||||
|
||||
merge.alu.add = add_inst.alu.add;
|
||||
|
||||
- merge.flags.ac = b->flags.mc;
|
||||
- merge.flags.apf = b->flags.mpf;
|
||||
- merge.flags.auf = b->flags.muf;
|
||||
+ merge.flags.ac = add_inst.flags.ac;
|
||||
+ merge.flags.apf = add_inst.flags.apf;
|
||||
+ merge.flags.auf = add_inst.flags.auf;
|
||||
|
||||
mul_instr = a;
|
||||
add_instr = &add_inst;
|
||||
@@ -1252,17 +1296,12 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
|
||||
|
||||
/* V3D 4.x and earlier use muxes to select the inputs for the ALUs and
|
||||
* they have restrictions on the number of raddrs that can be adressed
|
||||
- * in a single instruction.
|
||||
- *
|
||||
- * FIXME: for V3D 7.x we can't merge instructions if they address more
|
||||
- * than one small immediate. For now, we don't support small immediates,
|
||||
- * so it is not a problem.
|
||||
+ * in a single instruction. In V3D 7.x, we don't have that restriction,
|
||||
+ * but we are still limited to a single small immediate per instruction.
|
||||
*/
|
||||
- if (devinfo->ver <= 42) {
|
||||
- if (add_instr && mul_instr &&
|
||||
- !qpu_merge_raddrs(&merge, add_instr, mul_instr, devinfo)) {
|
||||
- return false;
|
||||
- }
|
||||
+ if (add_instr && mul_instr &&
|
||||
+ !qpu_merge_raddrs(&merge, add_instr, mul_instr, devinfo)) {
|
||||
+ return false;
|
||||
}
|
||||
|
||||
merge.sig.thrsw |= b->sig.thrsw;
|
||||
@@ -1273,7 +1312,6 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
|
||||
merge.sig.ldtmu |= b->sig.ldtmu;
|
||||
merge.sig.ldvary |= b->sig.ldvary;
|
||||
merge.sig.ldvpm |= b->sig.ldvpm;
|
||||
- merge.sig.small_imm_b |= b->sig.small_imm_b;
|
||||
merge.sig.ldtlb |= b->sig.ldtlb;
|
||||
merge.sig.ldtlbu |= b->sig.ldtlbu;
|
||||
merge.sig.ucb |= b->sig.ucb;
|
||||
@@ -1933,8 +1971,6 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
|
||||
if (c->devinfo->ver >= 71) {
|
||||
/* RF2-3 might be overwritten during the delay slots by
|
||||
* fragment shader setup.
|
||||
- *
|
||||
- * FIXME: handle small immediate cases
|
||||
*/
|
||||
if (v3d71_qpu_reads_raddr(inst, 2) ||
|
||||
v3d71_qpu_reads_raddr(inst, 3)) {
|
||||
diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c
|
||||
index fde6695d59b..41070484286 100644
|
||||
--- a/src/broadcom/compiler/qpu_validate.c
|
||||
+++ b/src/broadcom/compiler/qpu_validate.c
|
||||
@@ -116,8 +116,24 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
|
||||
return;
|
||||
|
||||
if (devinfo->ver < 71) {
|
||||
- if (inst->sig.small_imm_a || inst->sig.small_imm_c || inst->sig.small_imm_d)
|
||||
- fail_instr(state, "small imm a/c/d added after V3D 7.1");
|
||||
+ if (inst->sig.small_imm_a || inst->sig.small_imm_c ||
|
||||
+ inst->sig.small_imm_d) {
|
||||
+ fail_instr(state, "small imm a/c/d added after V3D 7.1");
|
||||
+ }
|
||||
+ } else {
|
||||
+ if ((inst->sig.small_imm_a || inst->sig.small_imm_b) &&
|
||||
+ !vir_is_add(qinst)) {
|
||||
+ fail_instr(state, "small imm a/b used but no ADD inst");
|
||||
+ }
|
||||
+ if ((inst->sig.small_imm_c || inst->sig.small_imm_d) &&
|
||||
+ !vir_is_mul(qinst)) {
|
||||
+ fail_instr(state, "small imm c/d used but no MUL inst");
|
||||
+ }
|
||||
+ if (inst->sig.small_imm_a + inst->sig.small_imm_b +
|
||||
+ inst->sig.small_imm_c + inst->sig.small_imm_d > 1) {
|
||||
+ fail_instr(state, "only one small immediate can be "
|
||||
+ "enabled per instruction");
|
||||
+ }
|
||||
}
|
||||
|
||||
/* LDVARY writes r5 two instructions later and LDUNIF writes
|
||||
diff --git a/src/broadcom/compiler/vir_opt_small_immediates.c b/src/broadcom/compiler/vir_opt_small_immediates.c
|
||||
index df0d6c36c9b..ed5bc011964 100644
|
||||
--- a/src/broadcom/compiler/vir_opt_small_immediates.c
|
||||
+++ b/src/broadcom/compiler/vir_opt_small_immediates.c
|
||||
@@ -44,7 +44,9 @@ vir_opt_small_immediates(struct v3d_compile *c)
|
||||
/* The small immediate value sits in the raddr B field, so we
|
||||
* can't have 2 small immediates in one instruction (unless
|
||||
* they're the same value, but that should be optimized away
|
||||
- * elsewhere).
|
||||
+ * elsewhere). Since 7.x we can encode small immediates in
|
||||
+ * any raddr field, but each instruction can still only use
|
||||
+ * one.
|
||||
*/
|
||||
bool uses_small_imm = false;
|
||||
for (int i = 0; i < vir_get_nsrc(inst); i++) {
|
||||
@@ -80,7 +82,22 @@ vir_opt_small_immediates(struct v3d_compile *c)
|
||||
*/
|
||||
struct v3d_qpu_sig new_sig = inst->qpu.sig;
|
||||
uint32_t sig_packed;
|
||||
- new_sig.small_imm_b = true;
|
||||
+ if (c->devinfo->ver <= 42) {
|
||||
+ new_sig.small_imm_b = true;
|
||||
+ } else {
|
||||
+ if (vir_is_add(inst)) {
|
||||
+ if (i == 0)
|
||||
+ new_sig.small_imm_a = true;
|
||||
+ else
|
||||
+ new_sig.small_imm_b = true;
|
||||
+ } else {
|
||||
+ if (i == 0)
|
||||
+ new_sig.small_imm_c = true;
|
||||
+ else
|
||||
+ new_sig.small_imm_d = true;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig_packed))
|
||||
continue;
|
||||
|
||||
@@ -89,7 +106,10 @@ vir_opt_small_immediates(struct v3d_compile *c)
|
||||
vir_dump_inst(c, inst);
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
- inst->qpu.sig.small_imm_b = true;
|
||||
+ inst->qpu.sig.small_imm_a = new_sig.small_imm_a;
|
||||
+ inst->qpu.sig.small_imm_b = new_sig.small_imm_b;
|
||||
+ inst->qpu.sig.small_imm_c = new_sig.small_imm_c;
|
||||
+ inst->qpu.sig.small_imm_d = new_sig.small_imm_d;
|
||||
inst->qpu.raddr_b = packed;
|
||||
|
||||
inst->src[i].file = QFILE_SMALL_IMM;
|
||||
diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c
|
||||
index cbbb495592b..4ed184cbbcb 100644
|
||||
--- a/src/broadcom/compiler/vir_to_qpu.c
|
||||
+++ b/src/broadcom/compiler/vir_to_qpu.c
|
||||
@@ -89,8 +89,15 @@ new_qpu_nop_before(struct qinst *inst)
|
||||
static void
|
||||
v3d71_set_src(struct v3d_qpu_instr *instr, uint8_t *raddr, struct qpu_reg src)
|
||||
{
|
||||
- if (src.smimm)
|
||||
- unreachable("v3d71_set_src: pending handling small immediates");
|
||||
+ /* If we have a small immediate move it from inst->raddr_b to the
|
||||
+ * corresponding raddr.
|
||||
+ */
|
||||
+ if (src.smimm) {
|
||||
+ assert(instr->sig.small_imm_a || instr->sig.small_imm_b ||
|
||||
+ instr->sig.small_imm_c || instr->sig.small_imm_d);
|
||||
+ *raddr = instr->raddr_b;
|
||||
+ return;
|
||||
+ }
|
||||
|
||||
assert(!src.magic);
|
||||
*raddr = src.index;
|
||||
diff --git a/src/broadcom/qpu/qpu_disasm.c b/src/broadcom/qpu/qpu_disasm.c
|
||||
index b613de781dc..c1590a760de 100644
|
||||
--- a/src/broadcom/qpu/qpu_disasm.c
|
||||
+++ b/src/broadcom/qpu/qpu_disasm.c
|
||||
@@ -113,7 +113,6 @@ v3d71_qpu_disasm_raddr(struct disasm_state *disasm,
|
||||
}
|
||||
|
||||
if (is_small_imm) {
|
||||
- unreachable("Pending handling small immediates");
|
||||
uint32_t val;
|
||||
ASSERTED bool ok =
|
||||
v3d_qpu_small_imm_unpack(disasm->devinfo,
|
||||
diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
|
||||
index f54ce7210fb..c30f4bbbccf 100644
|
||||
--- a/src/broadcom/qpu/qpu_instr.c
|
||||
+++ b/src/broadcom/qpu/qpu_instr.c
|
||||
@@ -975,10 +975,10 @@ v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr)
|
||||
int add_nsrc = v3d_qpu_add_op_num_src(inst->alu.add.op);
|
||||
int mul_nsrc = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
|
||||
|
||||
- return (add_nsrc > 0 && inst->alu.add.a.raddr == raddr) ||
|
||||
- (add_nsrc > 1 && inst->alu.add.b.raddr == raddr) ||
|
||||
- (mul_nsrc > 0 && inst->alu.mul.a.raddr == raddr) ||
|
||||
- (mul_nsrc > 1 && inst->alu.mul.b.raddr == raddr);
|
||||
+ return (add_nsrc > 0 && !inst->sig.small_imm_a && inst->alu.add.a.raddr == raddr) ||
|
||||
+ (add_nsrc > 1 && !inst->sig.small_imm_b && inst->alu.add.b.raddr == raddr) ||
|
||||
+ (mul_nsrc > 0 && !inst->sig.small_imm_c && inst->alu.mul.a.raddr == raddr) ||
|
||||
+ (mul_nsrc > 1 && !inst->sig.small_imm_d && inst->alu.mul.b.raddr == raddr);
|
||||
}
|
||||
|
||||
bool
|
||||
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
|
||||
index dece45c5c54..d408fb426fa 100644
|
||||
--- a/src/broadcom/qpu/qpu_instr.h
|
||||
+++ b/src/broadcom/qpu/qpu_instr.h
|
||||
@@ -402,7 +402,7 @@ struct v3d_qpu_instr {
|
||||
uint8_t sig_addr;
|
||||
bool sig_magic; /* If the signal writes to a magic address */
|
||||
uint8_t raddr_a; /* V3D 4.x */
|
||||
- uint8_t raddr_b; /* V3D 4.x*/
|
||||
+ uint8_t raddr_b; /* V3D 4.x (holds packed small immediate in 7.x too) */
|
||||
struct v3d_qpu_flags flags;
|
||||
|
||||
union {
|
||||
diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
|
||||
index ed5a8bc667d..7984712d527 100644
|
||||
--- a/src/broadcom/qpu/qpu_pack.c
|
||||
+++ b/src/broadcom/qpu/qpu_pack.c
|
||||
@@ -1218,16 +1218,11 @@ v3d71_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst
|
||||
|
||||
instr->alu.add.op = desc->op;
|
||||
|
||||
- /* FADD/FADDNF and FMIN/FMAX are determined by the orders of the
|
||||
+ /* FADD/FADDNF and FMIN/FMAX are determined by the order of the
|
||||
* operands.
|
||||
*/
|
||||
- /* FIXME: for now hardcoded values, until we got the small_imm support
|
||||
- * in place
|
||||
- */
|
||||
- uint32_t small_imm_a = 0;
|
||||
- uint32_t small_imm_b = 0;
|
||||
- if (small_imm_a * 256 + ((op >> 2) & 3) * 64 + raddr_a >
|
||||
- small_imm_b *256 + (op & 3) * 64 + raddr_b) {
|
||||
+ if (instr->sig.small_imm_a * 256 + ((op >> 2) & 3) * 64 + raddr_a >
|
||||
+ instr->sig.small_imm_b * 256 + (op & 3) * 64 + raddr_b) {
|
||||
if (instr->alu.add.op == V3D_QPU_A_FMIN)
|
||||
instr->alu.add.op = V3D_QPU_A_FMAX;
|
||||
if (instr->alu.add.op == V3D_QPU_A_FADD)
|
||||
@@ -1858,11 +1853,6 @@ v3d71_qpu_add_pack(const struct v3d_device_info *devinfo,
|
||||
uint32_t output_pack;
|
||||
uint32_t a_unpack;
|
||||
uint32_t b_unpack;
|
||||
- /* FIXME: for now hardcoded values, until we got the small_imm
|
||||
- * support in place
|
||||
- */
|
||||
- uint32_t small_imm_a = 0;
|
||||
- uint32_t small_imm_b = 0;
|
||||
|
||||
if (instr->alu.add.op != V3D_QPU_A_FCMP) {
|
||||
if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack,
|
||||
@@ -1886,8 +1876,8 @@ v3d71_qpu_add_pack(const struct v3d_device_info *devinfo,
|
||||
* distinguished by which order their operands come in.
|
||||
*/
|
||||
bool ordering =
|
||||
- small_imm_a * 256 + a_unpack * 64 + raddr_a >
|
||||
- small_imm_b * 256 + b_unpack * 64 + raddr_b;
|
||||
+ instr->sig.small_imm_a * 256 + a_unpack * 64 + raddr_a >
|
||||
+ instr->sig.small_imm_b * 256 + b_unpack * 64 + raddr_b;
|
||||
if (((instr->alu.add.op == V3D_QPU_A_FMIN ||
|
||||
instr->alu.add.op == V3D_QPU_A_FADD) && ordering) ||
|
||||
((instr->alu.add.op == V3D_QPU_A_FMAX ||
|
||||
@@ -1901,6 +1891,22 @@ v3d71_qpu_add_pack(const struct v3d_device_info *devinfo,
|
||||
temp = raddr_a;
|
||||
raddr_a = raddr_b;
|
||||
raddr_b = temp;
|
||||
+
|
||||
+ /* If we are swapping raddr_a/b we also need to swap
|
||||
+ * small_imm_a/b.
|
||||
+ */
|
||||
+ if (instr->sig.small_imm_a || instr->sig.small_imm_b) {
|
||||
+ assert(instr->sig.small_imm_a !=
|
||||
+ instr->sig.small_imm_b);
|
||||
+ struct v3d_qpu_sig new_sig = instr->sig;
|
||||
+ new_sig.small_imm_a = !instr->sig.small_imm_a;
|
||||
+ new_sig.small_imm_b = !instr->sig.small_imm_b;
|
||||
+ uint32_t sig;
|
||||
+ if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig))
|
||||
+ return false;
|
||||
+ *packed_instr &= ~V3D_QPU_SIG_MASK;
|
||||
+ *packed_instr |= QPU_SET_FIELD(sig, V3D_QPU_SIG);
|
||||
+ }
|
||||
}
|
||||
|
||||
opcode |= a_unpack << 2;
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,61 @@
|
||||
From 3af87d2672da7c928ecf8a0a1cd1bef8a6729364 Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Mon, 22 Nov 2021 12:56:03 +0100
|
||||
Subject: [PATCH 058/139] broadcom/compiler: update thread end restrictions for
|
||||
v7.x
|
||||
|
||||
In 4.x it is not allowed to write to the register file in the last
|
||||
3 instructions, but in 7.x we only have this restriction in the
|
||||
thread end instruction itself, and only if the write comes from
|
||||
the ALU ports.
|
||||
---
|
||||
src/broadcom/compiler/qpu_schedule.c | 31 ++++++++++++++++++++--------
|
||||
1 file changed, 22 insertions(+), 9 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
|
||||
index bd1c920848a..cba16c77d67 100644
|
||||
--- a/src/broadcom/compiler/qpu_schedule.c
|
||||
+++ b/src/broadcom/compiler/qpu_schedule.c
|
||||
@@ -1938,17 +1938,30 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
|
||||
return false;
|
||||
}
|
||||
|
||||
- /* No writing physical registers at the end. */
|
||||
- bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP;
|
||||
- bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP;
|
||||
- if ((!add_is_nop && !inst->alu.add.magic_write) ||
|
||||
- (!mul_is_nop && !inst->alu.mul.magic_write)) {
|
||||
- return false;
|
||||
+ if (c->devinfo->ver <= 42) {
|
||||
+ /* No writing physical registers at the end. */
|
||||
+ bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP;
|
||||
+ bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP;
|
||||
+ if ((!add_is_nop && !inst->alu.add.magic_write) ||
|
||||
+ (!mul_is_nop && !inst->alu.mul.magic_write)) {
|
||||
+ return false;
|
||||
+ }
|
||||
+
|
||||
+ if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) &&
|
||||
+ !inst->sig_magic) {
|
||||
+ return false;
|
||||
+ }
|
||||
}
|
||||
|
||||
- if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) &&
|
||||
- !inst->sig_magic) {
|
||||
- return false;
|
||||
+ if (c->devinfo->ver >= 71) {
|
||||
+ /* The thread end instruction must not write to the
|
||||
+ * register file via the add/mul ALUs.
|
||||
+ */
|
||||
+ if (slot == 0 &&
|
||||
+ (!inst->alu.add.magic_write ||
|
||||
+ !inst->alu.mul.magic_write)) {
|
||||
+ return false;
|
||||
+ }
|
||||
}
|
||||
|
||||
if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF)
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,112 @@
|
||||
From 7cfd5b808bb2f1cb17f57435cb5d411c4ac3aa6c Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Tue, 23 Nov 2021 10:04:49 +0100
|
||||
Subject: [PATCH 059/139] broadcom/compiler: update ldvary thread switch delay
|
||||
slot restriction for v7.x
|
||||
|
||||
In V3D 7.x we don't have accumulators which would not survive a thread
|
||||
switch, so the only restriction is that ldvary can't be placed in the second
|
||||
delay slot of a thread switch.
|
||||
|
||||
shader-db results for UnrealEngine4 shaders:
|
||||
|
||||
total instructions in shared programs: 446458 -> 446401 (-0.01%)
|
||||
instructions in affected programs: 13492 -> 13435 (-0.42%)
|
||||
helped: 58
|
||||
HURT: 3
|
||||
Instructions are helped.
|
||||
|
||||
total nops in shared programs: 19571 -> 19541 (-0.15%)
|
||||
nops in affected programs: 161 -> 131 (-18.63%)
|
||||
helped: 30
|
||||
HURT: 0
|
||||
Nops are helped.
|
||||
---
|
||||
src/broadcom/compiler/qpu_schedule.c | 33 +++++++++++++++++++++-------
|
||||
src/broadcom/compiler/qpu_validate.c | 10 +++++++--
|
||||
2 files changed, 33 insertions(+), 10 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
|
||||
index cba16c77d67..32f651851cf 100644
|
||||
--- a/src/broadcom/compiler/qpu_schedule.c
|
||||
+++ b/src/broadcom/compiler/qpu_schedule.c
|
||||
@@ -1491,11 +1491,20 @@ retry:
|
||||
* ldvary now if the follow-up fixup would place
|
||||
* it in the delay slots of a thrsw, which is not
|
||||
* allowed and would prevent the fixup from being
|
||||
- * successful.
|
||||
+ * successful. In V3D 7.x we can allow this to happen
|
||||
+ * as long as it is not the last delay slot.
|
||||
*/
|
||||
- if (inst->sig.ldvary &&
|
||||
- scoreboard->last_thrsw_tick + 2 >= scoreboard->tick - 1) {
|
||||
- continue;
|
||||
+ if (inst->sig.ldvary) {
|
||||
+ if (c->devinfo->ver <= 42 &&
|
||||
+ scoreboard->last_thrsw_tick + 2 >=
|
||||
+ scoreboard->tick - 1) {
|
||||
+ continue;
|
||||
+ }
|
||||
+ if (c->devinfo->ver >= 71 &&
|
||||
+ scoreboard->last_thrsw_tick + 2 ==
|
||||
+ scoreboard->tick - 1) {
|
||||
+ continue;
|
||||
+ }
|
||||
}
|
||||
|
||||
/* We can emit a new tmu lookup with a previous ldtmu
|
||||
@@ -2020,8 +2029,12 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
|
||||
if (slot > 0 && v3d_qpu_instr_is_legacy_sfu(&qinst->qpu))
|
||||
return false;
|
||||
|
||||
- if (slot > 0 && qinst->qpu.sig.ldvary)
|
||||
- return false;
|
||||
+ if (qinst->qpu.sig.ldvary) {
|
||||
+ if (c->devinfo->ver <= 42 && slot > 0)
|
||||
+ return false;
|
||||
+ if (c->devinfo->ver >= 71 && slot == 2)
|
||||
+ return false;
|
||||
+ }
|
||||
|
||||
/* unifa and the following 3 instructions can't overlap a
|
||||
* thread switch/end. The docs further clarify that this means
|
||||
@@ -2618,9 +2631,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
|
||||
|
||||
/* We can't put an ldvary in the delay slots of a thrsw. We should've
|
||||
* prevented this when pairing up the ldvary with another instruction
|
||||
- * and flagging it for a fixup.
|
||||
+ * and flagging it for a fixup. In V3D 7.x this is limited only to the
|
||||
+ * second delay slot.
|
||||
*/
|
||||
- assert(scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1);
|
||||
+ assert((devinfo->ver <= 42 &&
|
||||
+ scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1) ||
|
||||
+ (devinfo->ver >= 71 &&
|
||||
+ scoreboard->last_thrsw_tick + 2 != scoreboard->tick - 1));
|
||||
|
||||
/* Move the ldvary to the previous instruction and remove it from the
|
||||
* current one.
|
||||
diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c
|
||||
index 41070484286..4f09aa8aef4 100644
|
||||
--- a/src/broadcom/compiler/qpu_validate.c
|
||||
+++ b/src/broadcom/compiler/qpu_validate.c
|
||||
@@ -215,8 +215,14 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
|
||||
"SFU write started during THRSW delay slots ");
|
||||
}
|
||||
|
||||
- if (inst->sig.ldvary)
|
||||
- fail_instr(state, "LDVARY during THRSW delay slots");
|
||||
+ if (inst->sig.ldvary) {
|
||||
+ if (devinfo->ver <= 42)
|
||||
+ fail_instr(state, "LDVARY during THRSW delay slots");
|
||||
+ if (devinfo->ver >= 71 &&
|
||||
+ state->ip - state->last_thrsw_ip == 2) {
|
||||
+ fail_instr(state, "LDVARY in 2nd THRSW delay slot");
|
||||
+ }
|
||||
+ }
|
||||
}
|
||||
|
||||
(void)qpu_magic_waddr_matches; /* XXX */
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,30 @@
|
||||
From ca4063d627cd31c589a8e8688f2876dd8211d1bc Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Thu, 25 Nov 2021 08:31:02 +0100
|
||||
Subject: [PATCH 060/139] broadcom/compiler: lift restriction for branch +
|
||||
msfign after setmsf for v7.x
|
||||
|
||||
---
|
||||
src/broadcom/compiler/qpu_schedule.c | 3 ++-
|
||||
1 file changed, 2 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
|
||||
index 32f651851cf..476eae691ab 100644
|
||||
--- a/src/broadcom/compiler/qpu_schedule.c
|
||||
+++ b/src/broadcom/compiler/qpu_schedule.c
|
||||
@@ -2373,10 +2373,11 @@ emit_branch(struct v3d_compile *c,
|
||||
assert(scoreboard->last_branch_tick + 3 < branch_tick);
|
||||
assert(scoreboard->last_unifa_write_tick + 3 < branch_tick);
|
||||
|
||||
- /* Can't place a branch with msfign != 0 and cond != 0,2,3 after
|
||||
+ /* V3D 4.x can't place a branch with msfign != 0 and cond != 0,2,3 after
|
||||
* setmsf.
|
||||
*/
|
||||
bool is_safe_msf_branch =
|
||||
+ c->devinfo->ver >= 71 ||
|
||||
inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE ||
|
||||
inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS ||
|
||||
inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 ||
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,38 @@
|
||||
From 167510aa43bbcf06e57a64495cee40e8cdaf5f8b Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Fri, 26 Nov 2021 10:37:05 +0100
|
||||
Subject: [PATCH 061/139] broadcom/compiler: start allocating from RF 4 in V7.x
|
||||
|
||||
In V3D 4.x we start at RF3 so that we allocate RF0-2 only if there
|
||||
aren't any other RFs available. This is useful with small shaders
|
||||
to ensure that our TLB writes don't use these registers because
|
||||
these are the last instructions we emit in fragment shaders and
|
||||
the last instructions in a program can't write to these registers,
|
||||
so if we do, we need to emit NOPs.
|
||||
|
||||
In V3D 7.x the registers affected by this restriction are RF2-3,
|
||||
so we choose to start at RF4.
|
||||
---
|
||||
src/broadcom/compiler/vir_register_allocate.c | 5 +++--
|
||||
1 file changed, 3 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
|
||||
index 6f7b1ca0589..440b093a636 100644
|
||||
--- a/src/broadcom/compiler/vir_register_allocate.c
|
||||
+++ b/src/broadcom/compiler/vir_register_allocate.c
|
||||
@@ -1234,9 +1234,10 @@ v3d_register_allocate(struct v3d_compile *c)
|
||||
.phys_index = phys_index,
|
||||
.next_acc = 0,
|
||||
/* Start at RF3, to try to keep the TLB writes from using
|
||||
- * RF0-2.
|
||||
+ * RF0-2. Start at RF4 in 7.x to prevent TLB writes from
|
||||
+ * using RF2-3.
|
||||
*/
|
||||
- .next_phys = 3,
|
||||
+ .next_phys = c->devinfo->ver <= 42 ? 3 : 4,
|
||||
.nodes = &c->nodes,
|
||||
.devinfo = c->devinfo,
|
||||
};
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,71 @@
|
||||
From d47ea903b96e43b07bdef21f8026da818e30fcd1 Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Thu, 25 Nov 2021 13:00:34 +0100
|
||||
Subject: [PATCH 062/139] broadcom/compiler: validate restrictions after TLB Z
|
||||
write
|
||||
|
||||
---
|
||||
src/broadcom/compiler/qpu_validate.c | 28 ++++++++++++++++++++++++++++
|
||||
1 file changed, 28 insertions(+)
|
||||
|
||||
diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c
|
||||
index 4f09aa8aef4..1082fb7d50a 100644
|
||||
--- a/src/broadcom/compiler/qpu_validate.c
|
||||
+++ b/src/broadcom/compiler/qpu_validate.c
|
||||
@@ -41,6 +41,7 @@ struct v3d_qpu_validate_state {
|
||||
int last_sfu_write;
|
||||
int last_branch_ip;
|
||||
int last_thrsw_ip;
|
||||
+ int first_tlb_z_write;
|
||||
|
||||
/* Set when we've found the last-THRSW signal, or if we were started
|
||||
* in single-segment mode.
|
||||
@@ -110,11 +111,37 @@ static void
|
||||
qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
|
||||
{
|
||||
const struct v3d_device_info *devinfo = state->c->devinfo;
|
||||
+
|
||||
+ if (qinst->is_tlb_z_write && state->ip < state->first_tlb_z_write)
|
||||
+ state->first_tlb_z_write = state->ip;
|
||||
+
|
||||
const struct v3d_qpu_instr *inst = &qinst->qpu;
|
||||
|
||||
+ if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
|
||||
+ state->first_tlb_z_write >= 0 &&
|
||||
+ state->ip > state->first_tlb_z_write &&
|
||||
+ inst->branch.msfign != V3D_QPU_MSFIGN_NONE &&
|
||||
+ inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS &&
|
||||
+ inst->branch.cond != V3D_QPU_BRANCH_COND_A0 &&
|
||||
+ inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) {
|
||||
+ fail_instr(state, "Implicit branch MSF read after TLB Z write");
|
||||
+ }
|
||||
+
|
||||
if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
|
||||
return;
|
||||
|
||||
+ if (inst->alu.add.op == V3D_QPU_A_SETMSF &&
|
||||
+ state->first_tlb_z_write >= 0 &&
|
||||
+ state->ip > state->first_tlb_z_write) {
|
||||
+ fail_instr(state, "SETMSF after TLB Z write");
|
||||
+ }
|
||||
+
|
||||
+ if (state->first_tlb_z_write >= 0 &&
|
||||
+ state->ip > state->first_tlb_z_write &&
|
||||
+ inst->alu.add.op == V3D_QPU_A_MSF) {
|
||||
+ fail_instr(state, "MSF read after TLB Z write");
|
||||
+ }
|
||||
+
|
||||
if (devinfo->ver < 71) {
|
||||
if (inst->sig.small_imm_a || inst->sig.small_imm_c ||
|
||||
inst->sig.small_imm_d) {
|
||||
@@ -348,6 +375,7 @@ qpu_validate(struct v3d_compile *c)
|
||||
.last_sfu_write = -10,
|
||||
.last_thrsw_ip = -10,
|
||||
.last_branch_ip = -10,
|
||||
+ .first_tlb_z_write = INT_MAX,
|
||||
.ip = 0,
|
||||
|
||||
.last_thrsw_found = !c->last_thrsw,
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,26 @@
|
||||
From 6cdf01fad49489b5fc66d231b527de5245d5de32 Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Mon, 29 Nov 2021 13:23:11 +0100
|
||||
Subject: [PATCH 063/139] broadcom/compiler: lift restriction on vpmwt in last
|
||||
instruction for V3D 7.x
|
||||
|
||||
---
|
||||
src/broadcom/compiler/qpu_schedule.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
|
||||
index 476eae691ab..77fb6a794e6 100644
|
||||
--- a/src/broadcom/compiler/qpu_schedule.c
|
||||
+++ b/src/broadcom/compiler/qpu_schedule.c
|
||||
@@ -1934,7 +1934,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
|
||||
if (slot > 0 && qinst->uniform != ~0)
|
||||
return false;
|
||||
|
||||
- if (v3d_qpu_waits_vpm(inst))
|
||||
+ if (c->devinfo->ver <= 42 && v3d_qpu_waits_vpm(inst))
|
||||
return false;
|
||||
|
||||
if (inst->sig.ldvary)
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,134 @@
|
||||
From acc54637f0787ba4dc887130c25c628ccdaf4e38 Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Tue, 9 Nov 2021 11:34:59 +0100
|
||||
Subject: [PATCH 064/139] broadcom/compiler: fix up copy propagation for v71
|
||||
|
||||
Update rules for unsafe copy propagations to match v7.x.
|
||||
---
|
||||
.../compiler/vir_opt_copy_propagate.c | 83 +++++++++++++------
|
||||
1 file changed, 56 insertions(+), 27 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/vir_opt_copy_propagate.c b/src/broadcom/compiler/vir_opt_copy_propagate.c
|
||||
index c4aa7255a17..1260838ca05 100644
|
||||
--- a/src/broadcom/compiler/vir_opt_copy_propagate.c
|
||||
+++ b/src/broadcom/compiler/vir_opt_copy_propagate.c
|
||||
@@ -35,7 +35,7 @@
|
||||
#include "v3d_compiler.h"
|
||||
|
||||
static bool
|
||||
-is_copy_mov(struct qinst *inst)
|
||||
+is_copy_mov(const struct v3d_device_info *devinfo, struct qinst *inst)
|
||||
{
|
||||
if (!inst)
|
||||
return false;
|
||||
@@ -62,36 +62,65 @@ is_copy_mov(struct qinst *inst)
|
||||
return false;
|
||||
}
|
||||
|
||||
- switch (inst->src[0].file) {
|
||||
- case QFILE_MAGIC:
|
||||
- /* No copy propagating from R3/R4/R5 -- the MOVs from those
|
||||
- * are there to register allocate values produced into R3/4/5
|
||||
- * to other regs (though hopefully r3/4/5).
|
||||
- */
|
||||
- switch (inst->src[0].index) {
|
||||
- case V3D_QPU_WADDR_R3:
|
||||
- case V3D_QPU_WADDR_R4:
|
||||
- case V3D_QPU_WADDR_R5:
|
||||
- return false;
|
||||
+ if (devinfo->ver <= 42) {
|
||||
+ switch (inst->src[0].file) {
|
||||
+ case QFILE_MAGIC:
|
||||
+ /* No copy propagating from R3/R4/R5 -- the MOVs from
|
||||
+ * those are there to register allocate values produced
|
||||
+ * into R3/4/5 to other regs (though hopefully r3/4/5).
|
||||
+ */
|
||||
+ switch (inst->src[0].index) {
|
||||
+ case V3D_QPU_WADDR_R3:
|
||||
+ case V3D_QPU_WADDR_R4:
|
||||
+ case V3D_QPU_WADDR_R5:
|
||||
+ return false;
|
||||
+ default:
|
||||
+ break;
|
||||
+ }
|
||||
+ break;
|
||||
+
|
||||
+ case QFILE_REG:
|
||||
+ switch (inst->src[0].index) {
|
||||
+ case 0:
|
||||
+ case 1:
|
||||
+ case 2:
|
||||
+ /* MOVs from rf0/1/2 are only to track the live
|
||||
+ * intervals for W/centroid W/Z.
|
||||
+ */
|
||||
+ return false;
|
||||
+ }
|
||||
+ break;
|
||||
+
|
||||
default:
|
||||
break;
|
||||
}
|
||||
- break;
|
||||
-
|
||||
- case QFILE_REG:
|
||||
- switch (inst->src[0].index) {
|
||||
- case 0:
|
||||
- case 1:
|
||||
- case 2:
|
||||
- /* MOVs from rf0/1/2 are only to track the live
|
||||
+ } else {
|
||||
+ assert(devinfo->ver >= 71);
|
||||
+ switch (inst->src[0].file) {
|
||||
+ case QFILE_REG:
|
||||
+ switch (inst->src[0].index) {
|
||||
+ /* MOVs from rf1/2/3 are only to track the live
|
||||
* intervals for W/centroid W/Z.
|
||||
+ *
|
||||
+ * Note: rf0 can be implicitly written by ldvary
|
||||
+ * (no temp involved), so it is not an SSA value and
|
||||
+ * could clash with writes to other temps that are
|
||||
+ * also allocated to rf0. In theory, that would mean
|
||||
+ * that we can't copy propagate from it, but we handle
|
||||
+ * this at register allocation time, preventing temps
|
||||
+ * from being allocated to rf0 while the rf0 value from
|
||||
+ * ldvary is still live.
|
||||
*/
|
||||
- return false;
|
||||
- }
|
||||
- break;
|
||||
+ case 1:
|
||||
+ case 2:
|
||||
+ case 3:
|
||||
+ return false;
|
||||
+ }
|
||||
+ break;
|
||||
|
||||
- default:
|
||||
- break;
|
||||
+ default:
|
||||
+ break;
|
||||
+ }
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -135,7 +164,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs)
|
||||
*/
|
||||
struct qinst *mov = movs[inst->src[i].index];
|
||||
if (!mov) {
|
||||
- if (!is_copy_mov(c->defs[inst->src[i].index]))
|
||||
+ if (!is_copy_mov(c->devinfo, c->defs[inst->src[i].index]))
|
||||
continue;
|
||||
mov = c->defs[inst->src[i].index];
|
||||
|
||||
@@ -245,7 +274,7 @@ vir_opt_copy_propagate(struct v3d_compile *c)
|
||||
|
||||
apply_kills(c, movs, inst);
|
||||
|
||||
- if (is_copy_mov(inst))
|
||||
+ if (is_copy_mov(c->devinfo, inst))
|
||||
movs[inst->dst.index] = inst;
|
||||
}
|
||||
}
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,150 @@
|
||||
From c340f7f1eb4a1e5c0fafe1ea2f801f2ebaf82d8d Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Fri, 26 Nov 2021 01:24:12 +0100
|
||||
Subject: [PATCH 065/139] broadcom/qpu: new packing/conversion v71 instructions
|
||||
|
||||
This commit adds the qpu definitions for several new v71
|
||||
instructions.
|
||||
|
||||
Packing:
|
||||
* vpack does a 2x32 to 2x16 bit integer pack
|
||||
* v8pack: Pack 2 x 2x16 bit integers into 4x8 bits
|
||||
* v10pack packs parts of 2 2x16 bit integer into r10g10b10a2.
|
||||
* v11fpack packs parts of 2 2x16 bit float into r11g11b10 rounding
|
||||
to nearest
|
||||
|
||||
Conversion to unorm/snorm:
|
||||
* vftounorm8/vftosnorm8: converts from 2x16-bit floating point
|
||||
to 2x8 bit unorm/snorm.
|
||||
* ftounorm16/ftosnorm16: converts floating point to 16-bit
|
||||
unorm/snorm
|
||||
* vftounorm10lo: Convert 2x16-bit floating point to 2x10-bit unorm
|
||||
* vftounorm10hi: Convert 2x16-bit floating point to one 2-bit and one 10-bit unorm
|
||||
---
|
||||
src/broadcom/qpu/qpu_instr.c | 20 ++++++++++++++++++++
|
||||
src/broadcom/qpu/qpu_instr.h | 12 ++++++++++++
|
||||
src/broadcom/qpu/qpu_pack.c | 12 ++++++++++++
|
||||
3 files changed, 44 insertions(+)
|
||||
|
||||
diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
|
||||
index c30f4bbbccf..44f20618a5a 100644
|
||||
--- a/src/broadcom/qpu/qpu_instr.c
|
||||
+++ b/src/broadcom/qpu/qpu_instr.c
|
||||
@@ -179,6 +179,10 @@ v3d_qpu_add_op_name(enum v3d_qpu_add_op op)
|
||||
[V3D_QPU_A_UTOF] = "utof",
|
||||
[V3D_QPU_A_MOV] = "mov",
|
||||
[V3D_QPU_A_FMOV] = "fmov",
|
||||
+ [V3D_QPU_A_VPACK] = "vpack",
|
||||
+ [V3D_QPU_A_V8PACK] = "v8pack",
|
||||
+ [V3D_QPU_A_V10PACK] = "v10pack",
|
||||
+ [V3D_QPU_A_V11FPACK] = "v11fpack",
|
||||
};
|
||||
|
||||
if (op >= ARRAY_SIZE(op_names))
|
||||
@@ -201,6 +205,12 @@ v3d_qpu_mul_op_name(enum v3d_qpu_mul_op op)
|
||||
[V3D_QPU_M_MOV] = "mov",
|
||||
[V3D_QPU_M_NOP] = "nop",
|
||||
[V3D_QPU_M_FMUL] = "fmul",
|
||||
+ [V3D_QPU_M_FTOUNORM16] = "ftounorm16",
|
||||
+ [V3D_QPU_M_FTOSNORM16] = "ftosnorm16",
|
||||
+ [V3D_QPU_M_VFTOUNORM8] = "vftounorm8",
|
||||
+ [V3D_QPU_M_VFTOSNORM8] = "vftosnorm8",
|
||||
+ [V3D_QPU_M_VFTOUNORM10LO] = "vftounorm10lo",
|
||||
+ [V3D_QPU_M_VFTOUNORM10HI] = "vftounorm10hi",
|
||||
};
|
||||
|
||||
if (op >= ARRAY_SIZE(op_names))
|
||||
@@ -463,6 +473,10 @@ static const uint8_t add_op_args[] = {
|
||||
|
||||
[V3D_QPU_A_MOV] = D | A,
|
||||
[V3D_QPU_A_FMOV] = D | A,
|
||||
+ [V3D_QPU_A_VPACK] = D | A | B,
|
||||
+ [V3D_QPU_A_V8PACK] = D | A | B,
|
||||
+ [V3D_QPU_A_V10PACK] = D | A | B,
|
||||
+ [V3D_QPU_A_V11FPACK] = D | A | B,
|
||||
};
|
||||
|
||||
static const uint8_t mul_op_args[] = {
|
||||
@@ -476,6 +490,12 @@ static const uint8_t mul_op_args[] = {
|
||||
[V3D_QPU_M_NOP] = 0,
|
||||
[V3D_QPU_M_MOV] = D | A,
|
||||
[V3D_QPU_M_FMUL] = D | A | B,
|
||||
+ [V3D_QPU_M_FTOUNORM16] = D | A,
|
||||
+ [V3D_QPU_M_FTOSNORM16] = D | A,
|
||||
+ [V3D_QPU_M_VFTOUNORM8] = D | A,
|
||||
+ [V3D_QPU_M_VFTOSNORM8] = D | A,
|
||||
+ [V3D_QPU_M_VFTOUNORM10LO] = D | A,
|
||||
+ [V3D_QPU_M_VFTOUNORM10HI] = D | A,
|
||||
};
|
||||
|
||||
bool
|
||||
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
|
||||
index d408fb426fa..56eee9f9cac 100644
|
||||
--- a/src/broadcom/qpu/qpu_instr.h
|
||||
+++ b/src/broadcom/qpu/qpu_instr.h
|
||||
@@ -231,6 +231,10 @@ enum v3d_qpu_add_op {
|
||||
/* V3D 7.x */
|
||||
V3D_QPU_A_FMOV,
|
||||
V3D_QPU_A_MOV,
|
||||
+ V3D_QPU_A_VPACK,
|
||||
+ V3D_QPU_A_V8PACK,
|
||||
+ V3D_QPU_A_V10PACK,
|
||||
+ V3D_QPU_A_V11FPACK,
|
||||
};
|
||||
|
||||
enum v3d_qpu_mul_op {
|
||||
@@ -244,6 +248,14 @@ enum v3d_qpu_mul_op {
|
||||
V3D_QPU_M_MOV,
|
||||
V3D_QPU_M_NOP,
|
||||
V3D_QPU_M_FMUL,
|
||||
+
|
||||
+ /* V3D 7.x */
|
||||
+ V3D_QPU_M_FTOUNORM16,
|
||||
+ V3D_QPU_M_FTOSNORM16,
|
||||
+ V3D_QPU_M_VFTOUNORM8,
|
||||
+ V3D_QPU_M_VFTOSNORM8,
|
||||
+ V3D_QPU_M_VFTOUNORM10LO,
|
||||
+ V3D_QPU_M_VFTOUNORM10HI,
|
||||
};
|
||||
|
||||
enum v3d_qpu_output_pack {
|
||||
diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
|
||||
index 7984712d527..6cd75adac6d 100644
|
||||
--- a/src/broadcom/qpu/qpu_pack.c
|
||||
+++ b/src/broadcom/qpu/qpu_pack.c
|
||||
@@ -783,6 +783,9 @@ static const struct opcode_desc add_ops_v71[] = {
|
||||
{ 246, 246, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_ITOF, 71 },
|
||||
{ 246, 246, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_UTOF, 71 },
|
||||
|
||||
+ { 247, 247, .raddr_mask = ANYOPMASK, V3D_QPU_A_VPACK, 71 },
|
||||
+ { 248, 248, .raddr_mask = ANYOPMASK, V3D_QPU_A_V8PACK, 71 },
|
||||
+
|
||||
{ 249, 249, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FMOV, 71 },
|
||||
{ 249, 249, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FMOV, 71 },
|
||||
{ 249, 249, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FMOV, 71 },
|
||||
@@ -797,6 +800,8 @@ static const struct opcode_desc add_ops_v71[] = {
|
||||
{ 249, 249, .raddr_mask = OP_MASK(15), V3D_QPU_A_MOV, 71 },
|
||||
{ 249, 249, .raddr_mask = OP_MASK(19), V3D_QPU_A_MOV, 71 },
|
||||
|
||||
+ { 250, 250, .raddr_mask = ANYOPMASK, V3D_QPU_A_V10PACK, 71 },
|
||||
+ { 251, 251, .raddr_mask = ANYOPMASK, V3D_QPU_A_V11FPACK, 71 },
|
||||
};
|
||||
|
||||
static const struct opcode_desc mul_ops_v71[] = {
|
||||
@@ -822,6 +827,13 @@ static const struct opcode_desc mul_ops_v71[] = {
|
||||
{ 14, 14, .raddr_mask = OP_MASK(15), V3D_QPU_M_MOV, 71 },
|
||||
{ 14, 14, .raddr_mask = OP_MASK(19), V3D_QPU_M_MOV, 71 },
|
||||
|
||||
+ { 14, 14, .raddr_mask = OP_MASK(32), V3D_QPU_M_FTOUNORM16, 71 },
|
||||
+ { 14, 14, .raddr_mask = OP_MASK(33), V3D_QPU_M_FTOSNORM16, 71 },
|
||||
+ { 14, 14, .raddr_mask = OP_MASK(34), V3D_QPU_M_VFTOUNORM8, 71 },
|
||||
+ { 14, 14, .raddr_mask = OP_MASK(35), V3D_QPU_M_VFTOSNORM8, 71 },
|
||||
+ { 14, 14, .raddr_mask = OP_MASK(48), V3D_QPU_M_VFTOUNORM10LO, 71 },
|
||||
+ { 14, 14, .raddr_mask = OP_MASK(49), V3D_QPU_M_VFTOUNORM10HI, 71 },
|
||||
+
|
||||
{ 14, 14, .raddr_mask = OP_MASK(63), V3D_QPU_M_NOP, 71 },
|
||||
|
||||
{ 16, 63, .raddr_mask = ANYOPMASK, V3D_QPU_M_FMUL },
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,221 @@
|
||||
From 4f33de7771621e15aae3e3c60c09fd5a2f29bdac Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Tue, 30 Nov 2021 02:39:20 +0100
|
||||
Subject: [PATCH 066/139] nir: add new opcodes to map new v71
|
||||
packing/conversion instructions
|
||||
|
||||
Since v71, broadcom hw includes specific packing/conversion
|
||||
instructions, so this commit adds opcodes to be able to make use of
|
||||
them, especially for image stores:
|
||||
|
||||
* vftounorm8/vftosnorm8: 2x16-bit floating point to 2x8-bit
|
||||
unorm/snorm
|
||||
* ftounorm16/ftosnorm16: floating point to 16-bit unorm/snorm
|
||||
* vftounorm10lo/vftounorm10hi: used to convert a floating point to
|
||||
a r10g10b10a2 unorm
|
||||
|
||||
* v11fpack: packs 2 2x16 FP into R11G11B10.
|
||||
* v10pack: pack 2 2x16 integer into R10G10B10A2
|
||||
* v8pack: packs 2 2x16 bit integer into 4x8 bits.
|
||||
* vpack: 2x32 bit to 2x16 integer pack
|
||||
|
||||
For the latter, it can be easily confused with the existing and general
|
||||
pack_32_2x16_split. But note that this one receives two 16-bit integers,
|
||||
and packs them on a 32bit integer. But broadcom opcode takes two 32bit
|
||||
integer, takes the lower halfword, and packs them as 2x16 on a 32bit
|
||||
integer.
|
||||
|
||||
Interestingly broadcom also defines a similar one that packs the
|
||||
higher halfword. Not used yet.
|
||||
|
||||
FIXME: vftounorm10lo/hi constant expression implementation is somewhat
|
||||
convoluted. It is likely that it could be implemented in a simpler
|
||||
way. But it works (passing the tests added with CTS issue #3372,
|
||||
created with this change in mind).
|
||||
---
|
||||
src/compiler/nir/nir_constant_expressions.py | 106 +++++++++++++++++++
|
||||
src/compiler/nir/nir_opcodes.py | 44 ++++++++
|
||||
2 files changed, 150 insertions(+)
|
||||
|
||||
diff --git a/src/compiler/nir/nir_constant_expressions.py b/src/compiler/nir/nir_constant_expressions.py
|
||||
index e6383b67737..46395d79a89 100644
|
||||
--- a/src/compiler/nir/nir_constant_expressions.py
|
||||
+++ b/src/compiler/nir/nir_constant_expressions.py
|
||||
@@ -62,6 +62,8 @@ template = """\
|
||||
#include "util/softfloat.h"
|
||||
#include "util/bigmath.h"
|
||||
#include "util/format/format_utils.h"
|
||||
+#include "util/format_r11g11b10f.h"
|
||||
+#include "util/u_math.h"
|
||||
#include "nir_constant_expressions.h"
|
||||
|
||||
/**
|
||||
@@ -277,6 +279,110 @@ unpack_half_1x16(uint16_t u)
|
||||
return _mesa_half_to_float(u);
|
||||
}
|
||||
|
||||
+/* Broadcom v3d specific instructions */
|
||||
+/**
|
||||
+ * Packs 2 2x16 floating split into a r11g11b10f
|
||||
+ */
|
||||
+static uint32_t v11fpack_v3d(const uint32_t src0,
|
||||
+ const uint32_t src1)
|
||||
+{
|
||||
+ float rgb[3];
|
||||
+
|
||||
+ rgb[0] = unpack_half_1x16((src0 & 0xffff));
|
||||
+ rgb[1] = unpack_half_1x16((src0 >> 16));
|
||||
+ rgb[2] = unpack_half_1x16((src1 & 0xffff));
|
||||
+
|
||||
+ return float3_to_r11g11b10f(rgb);
|
||||
+}
|
||||
+
|
||||
+/**
|
||||
+ * The three methods below are basically wrappers over pack_s/unorm_1x8/1x16,
|
||||
+ * as it receives a uint16_t val instead of a float
|
||||
+ */
|
||||
+static uint8_t _mesa_half_to_snorm8(uint16_t val)
|
||||
+{
|
||||
+ float x = _mesa_half_to_float(val);
|
||||
+
|
||||
+ return pack_snorm_1x8(x);
|
||||
+}
|
||||
+
|
||||
+static uint16_t _mesa_float_to_snorm16(uint32_t val)
|
||||
+{
|
||||
+ union fi aux;
|
||||
+ aux.ui = val;
|
||||
+ return pack_snorm_1x16(aux.f);
|
||||
+}
|
||||
+
|
||||
+static uint16_t _mesa_float_to_unorm16(uint32_t val)
|
||||
+{
|
||||
+ union fi aux;
|
||||
+ aux.ui = val;
|
||||
+ return pack_unorm_1x16(aux.f);
|
||||
+}
|
||||
+
|
||||
+/* FIXME: the implementation below of vftounorm10hi/lo is somewhat too
|
||||
+ * verbose. It is likely that there would be a simpler way to implement
|
||||
+ * it.
|
||||
+ */
|
||||
+static uint32_t float_pack16_v3d(uint32_t f32)
|
||||
+{
|
||||
+ float f = uif(f32);
|
||||
+ return _mesa_float_to_half(f);
|
||||
+}
|
||||
+
|
||||
+static uint32_t float_unpack16_v3d(uint32_t f16)
|
||||
+{
|
||||
+ float f = _mesa_half_to_float(f16);
|
||||
+ return fui(f);
|
||||
+}
|
||||
+
|
||||
+static uint32_t vfpack_v3d(uint32_t a, uint32_t b)
|
||||
+{
|
||||
+ return float_pack16_v3d(b) << 16 | float_pack16_v3d(a);
|
||||
+}
|
||||
+
|
||||
+static uint32_t vfsat_v3d(uint32_t a)
|
||||
+{
|
||||
+ return vfpack_v3d(
|
||||
+ fui(SATURATE(_mesa_half_to_float(a & 0xffff))),
|
||||
+ fui(SATURATE(_mesa_half_to_float(a >> 16))));
|
||||
+}
|
||||
+
|
||||
+static uint32_t fmul_v3d(uint32_t a, uint32_t b)
|
||||
+{
|
||||
+ float f = uif(a);
|
||||
+ float g = uif(b);
|
||||
+
|
||||
+ float x = f * g;
|
||||
+
|
||||
+ return fui(x);
|
||||
+}
|
||||
+
|
||||
+#define L(x) float_unpack16_v3d((x) & 0xffff)
|
||||
+#define H(x) float_unpack16_v3d((x) >> 16)
|
||||
+#define V(f,a,b) vfpack_v3d(f(L(a), L(b)), f(H(a), H(b)))
|
||||
+
|
||||
+static uint32_t vfmul_v3d(uint32_t a, uint32_t b)
|
||||
+{
|
||||
+ return V(fmul_v3d, a, b);
|
||||
+}
|
||||
+
|
||||
+/* Convert 2x16-bit floating point to 2x10-bit unorm */
|
||||
+static uint32_t vftounorm10lo(uint32_t src0)
|
||||
+{
|
||||
+ return vfmul_v3d(vfsat_v3d(src0), 0x03ff03ff);
|
||||
+}
|
||||
+
|
||||
+/*
|
||||
+ * Convert 2x16-bit floating point to one 2-bit and one
|
||||
+ * 10-bit unorm
|
||||
+ */
|
||||
+static uint32_t vftounorm10hi(uint32_t src0)
|
||||
+{
|
||||
+ return vfmul_v3d(vfsat_v3d(src0), 0x000303ff);
|
||||
+}
|
||||
+
|
||||
+
|
||||
/* Some typed vector structures to make things like src0.y work */
|
||||
typedef int8_t int1_t;
|
||||
typedef uint8_t uint1_t;
|
||||
diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py
|
||||
index e4d87aa6126..63aa7cfa315 100644
|
||||
--- a/src/compiler/nir/nir_opcodes.py
|
||||
+++ b/src/compiler/nir/nir_opcodes.py
|
||||
@@ -1393,6 +1393,50 @@ for (int i = 0; i < 32; i += 8) {
|
||||
}
|
||||
""")
|
||||
|
||||
+# v3d-specific opcodes
|
||||
+
|
||||
+# v3d-specific (v71) instruction that packs parts of 2 2x16 floating point into
|
||||
+# r11g11b10 bits, rounding to nearest even
|
||||
+binop_convert("v11fpack_v3d", tuint32, tuint32, "",
|
||||
+ "v11fpack_v3d(src0, src1)")
|
||||
+
|
||||
+# v3d-specific (v71) instruction that packs 2x32 bit to 2x16 bit integer. The
|
||||
+# difference with pack_32_2x16_split is that the sources are 32bit too. So it
|
||||
+# receives two 32-bit integers and packs their lower halfwords as 2x16 into a 32-bit
|
||||
+# pack.
|
||||
+binop_horiz("vpack_v3d", 1, tuint32, 1, tuint32, 1, tuint32,
|
||||
+ "(src0.x & 0xffff) | (src1.x << 16)")
|
||||
+
|
||||
+# v3d-specific (v71) instruction that packs parts of 2 2x16 integers into r10g10b10a2
|
||||
+binop_convert("v10pack_v3d", tuint32, tuint32, "",
|
||||
+ "(src0 & 0x3ff) | ((src0 >> 16) & 0x3ff) << 10 | (src1 & 0x3ff) << 20 | ((src1 >> 16) & 0x3ff) << 30")
|
||||
+
|
||||
+# v3d-specific (v71) instruction that packs 2 2x16 bit integers into 4x8 bits:
|
||||
+# dst[7:0] = src0[7:0]
|
||||
+# dst[15:8] = src0[23:16]
|
||||
+# dst[23:16] = src1[7:0]
|
||||
+# dst[31:24] = src1[23:16]
|
||||
+opcode("v8pack_v3d", 0, tuint32, [0, 0], [tuint32, tuint32],
|
||||
+ False, "",
|
||||
+ "(src0 & 0x000000ff) | (src0 & 0x00ff0000) >> 8 | (src1 & 0x000000ff) << 16 | (src1 & 0x00ff0000) << 8")
|
||||
+
|
||||
+# v3d-specific (v71) instructions to convert 2x16 floating point to 2x8 bit unorm/snorm
|
||||
+unop("vftounorm8_v3d", tuint32,
|
||||
+ "_mesa_half_to_unorm(src0 & 0xffff, 8) | (_mesa_half_to_unorm(src0 >> 16, 8) << 16)")
|
||||
+unop("vftosnorm8_v3d", tuint32,
|
||||
+ "_mesa_half_to_snorm(src0 & 0xffff, 8) | (_mesa_half_to_snorm(src0 >> 16, 8) << 16)")
|
||||
+
|
||||
+# v3d-specific (v71) instructions to convert 32-bit floating point to 16 bit unorm/snorm
|
||||
+unop("ftounorm16_v3d", tuint32, "_mesa_float_to_unorm16(src0)")
|
||||
+unop("ftosnorm16_v3d", tuint32, "_mesa_float_to_snorm16(src0)")
|
||||
+
|
||||
+# v3d-specific (v71) instructions to convert 2x16 bit floating points to 2x10 bit unorm
|
||||
+unop("vftounorm10lo_v3d", tuint32, "vftounorm10lo(src0)")
|
||||
+
|
||||
+# v3d-specific (v71) instructions to convert 2x16 bit floating points to one 2-bit
|
||||
+# and one 10 bit unorm
|
||||
+unop("vftounorm10hi_v3d", tuint32, "vftounorm10hi(src0)")
|
||||
+
|
||||
# Mali-specific opcodes
|
||||
unop("fsat_signed_mali", tfloat, ("fmin(fmax(src0, -1.0), 1.0)"))
|
||||
unop("fclamp_pos_mali", tfloat, ("fmax(src0, 0.0)"))
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,452 @@
|
||||
From 381c29e3ff5237c89380cc53eb2271d1985f4e34 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Thu, 2 Dec 2021 13:26:43 +0100
|
||||
Subject: [PATCH 067/139] broadcom/compiler: update image store lowering to use
|
||||
v71 new packing/conversion instructions
|
||||
|
||||
Vulkan shaderdb stats with pattern dEQP-VK.image.*.with_format.*.*:
|
||||
total instructions in shared programs: 35993 -> 33245 (-7.63%)
|
||||
instructions in affected programs: 21153 -> 18405 (-12.99%)
|
||||
helped: 394
|
||||
HURT: 1
|
||||
Instructions are helped.
|
||||
|
||||
total uniforms in shared programs: 8550 -> 7418 (-13.24%)
|
||||
uniforms in affected programs: 5136 -> 4004 (-22.04%)
|
||||
helped: 399
|
||||
HURT: 0
|
||||
Uniforms are helped.
|
||||
|
||||
total max-temps in shared programs: 6014 -> 5905 (-1.81%)
|
||||
max-temps in affected programs: 473 -> 364 (-23.04%)
|
||||
helped: 58
|
||||
HURT: 0
|
||||
Max-temps are helped.
|
||||
|
||||
total nops in shared programs: 1515 -> 1504 (-0.73%)
|
||||
nops in affected programs: 46 -> 35 (-23.91%)
|
||||
helped: 14
|
||||
HURT: 2
|
||||
Inconclusive result (%-change mean confidence interval includes 0).
|
||||
|
||||
FWIW, that one HURT on the instructions count is for just one
|
||||
instruction.
|
||||
---
|
||||
src/broadcom/compiler/nir_to_vir.c | 39 +++
|
||||
src/broadcom/compiler/v3d_compiler.h | 16 +-
|
||||
.../compiler/v3d_nir_lower_image_load_store.c | 246 +++++++++++++++++-
|
||||
src/broadcom/compiler/vir.c | 2 +-
|
||||
4 files changed, 294 insertions(+), 9 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
|
||||
index 90fe1d1e7f0..a8cf02dd386 100644
|
||||
--- a/src/broadcom/compiler/nir_to_vir.c
|
||||
+++ b/src/broadcom/compiler/nir_to_vir.c
|
||||
@@ -1689,6 +1689,22 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
|
||||
result = vir_VFPACK(c, src[0], src[1]);
|
||||
break;
|
||||
|
||||
+ case nir_op_vpack_v3d:
|
||||
+ result = vir_VPACK(c, src[0], src[1]);
|
||||
+ break;
|
||||
+
|
||||
+ case nir_op_v11fpack_v3d:
|
||||
+ result = vir_V11FPACK(c, src[0], src[1]);
|
||||
+ break;
|
||||
+
|
||||
+ case nir_op_v10pack_v3d:
|
||||
+ result = vir_V10PACK(c, src[0], src[1]);
|
||||
+ break;
|
||||
+
|
||||
+ case nir_op_v8pack_v3d:
|
||||
+ result = vir_V8PACK(c, src[0], src[1]);
|
||||
+ break;
|
||||
+
|
||||
case nir_op_unpack_half_2x16_split_x:
|
||||
result = vir_FMOV(c, src[0]);
|
||||
vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
|
||||
@@ -1719,6 +1735,29 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
|
||||
result = vir_FMOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, tmp, zero));
|
||||
break;
|
||||
}
|
||||
+ case nir_op_vftounorm8_v3d:
|
||||
+ result = vir_VFTOUNORM8(c, src[0]);
|
||||
+ break;
|
||||
+
|
||||
+ case nir_op_vftosnorm8_v3d:
|
||||
+ result = vir_VFTOSNORM8(c, src[0]);
|
||||
+ break;
|
||||
+
|
||||
+ case nir_op_vftounorm10lo_v3d:
|
||||
+ result = vir_VFTOUNORM10LO(c, src[0]);
|
||||
+ break;
|
||||
+
|
||||
+ case nir_op_vftounorm10hi_v3d:
|
||||
+ result = vir_VFTOUNORM10HI(c, src[0]);
|
||||
+ break;
|
||||
+
|
||||
+ case nir_op_ftounorm16_v3d:
|
||||
+ result = vir_FTOUNORM16(c, src[0]);
|
||||
+ break;
|
||||
+
|
||||
+ case nir_op_ftosnorm16_v3d:
|
||||
+ result = vir_FTOSNORM16(c, src[0]);
|
||||
+ break;
|
||||
|
||||
default:
|
||||
fprintf(stderr, "unknown NIR ALU inst: ");
|
||||
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
|
||||
index 36adf8830b5..425ab0cdf9d 100644
|
||||
--- a/src/broadcom/compiler/v3d_compiler.h
|
||||
+++ b/src/broadcom/compiler/v3d_compiler.h
|
||||
@@ -1186,7 +1186,7 @@ bool v3d_nir_lower_line_smooth(nir_shader *shader);
|
||||
bool v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c);
|
||||
bool v3d_nir_lower_scratch(nir_shader *s);
|
||||
bool v3d_nir_lower_txf_ms(nir_shader *s);
|
||||
-bool v3d_nir_lower_image_load_store(nir_shader *s);
|
||||
+bool v3d_nir_lower_image_load_store(nir_shader *s, struct v3d_compile *c);
|
||||
bool v3d_nir_lower_load_store_bitsize(nir_shader *s);
|
||||
|
||||
void v3d33_vir_vpm_read_setup(struct v3d_compile *c, int num_components);
|
||||
@@ -1427,6 +1427,20 @@ VIR_SFU(LOG)
|
||||
VIR_SFU(SIN)
|
||||
VIR_SFU(RSQRT2)
|
||||
|
||||
+VIR_A_ALU2(VPACK)
|
||||
+VIR_A_ALU2(V8PACK)
|
||||
+VIR_A_ALU2(V10PACK)
|
||||
+VIR_A_ALU2(V11FPACK)
|
||||
+
|
||||
+VIR_M_ALU1(FTOUNORM16)
|
||||
+VIR_M_ALU1(FTOSNORM16)
|
||||
+
|
||||
+VIR_M_ALU1(VFTOUNORM8)
|
||||
+VIR_M_ALU1(VFTOSNORM8)
|
||||
+
|
||||
+VIR_M_ALU1(VFTOUNORM10LO)
|
||||
+VIR_M_ALU1(VFTOUNORM10HI)
|
||||
+
|
||||
static inline struct qinst *
|
||||
vir_MOV_cond(struct v3d_compile *c, enum v3d_qpu_cond cond,
|
||||
struct qreg dest, struct qreg src)
|
||||
diff --git a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
|
||||
index 2900a29817f..bbb55be4a14 100644
|
||||
--- a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
|
||||
+++ b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
|
||||
@@ -40,6 +40,10 @@
|
||||
* calculations and load/store using the TMU general memory access path.
|
||||
*/
|
||||
|
||||
+static const unsigned bits_8[4] = {8, 8, 8, 8};
|
||||
+static const unsigned bits_16[4] = {16, 16, 16, 16};
|
||||
+static const unsigned bits_1010102[4] = {10, 10, 10, 2};
|
||||
+
|
||||
bool
|
||||
v3d_gl_format_is_return_32(enum pipe_format format)
|
||||
{
|
||||
@@ -59,6 +63,8 @@ v3d_gl_format_is_return_32(enum pipe_format format)
|
||||
|
||||
/* Packs a 32-bit vector of colors in the range [0, (1 << bits[i]) - 1] to a
|
||||
* 32-bit SSA value, with as many channels as necessary to store all the bits
|
||||
+ *
|
||||
+ * This is the generic helper, using all common nir operations.
|
||||
*/
|
||||
static nir_ssa_def *
|
||||
pack_bits(nir_builder *b, nir_ssa_def *color, const unsigned *bits,
|
||||
@@ -91,8 +97,185 @@ pack_bits(nir_builder *b, nir_ssa_def *color, const unsigned *bits,
|
||||
return nir_vec(b, results, DIV_ROUND_UP(offset, 32));
|
||||
}
|
||||
|
||||
+/* Utility wrapper as half_2x16_split is mapped to vfpack, and sometimes it is
|
||||
+ * just easier to read vfpack on the code, specially while using the PRM as
|
||||
+ * reference
|
||||
+ */
|
||||
+static nir_ssa_def *
|
||||
+nir_vfpack(nir_builder *b, nir_ssa_def *p1, nir_ssa_def *p2)
|
||||
+{
|
||||
+ return nir_pack_half_2x16_split(b, p1, p2);
|
||||
+}
|
||||
+
|
||||
+static inline nir_ssa_def *
|
||||
+pack_11f11f10f(nir_builder *b, nir_ssa_def *color)
|
||||
+{
|
||||
+ nir_ssa_def *p1 = nir_vfpack(b, nir_channel(b, color, 0),
|
||||
+ nir_channel(b, color, 1));
|
||||
+ /* FIXME: we noted that we could just use p2 again as the second
|
||||
+ * element to pack, and CTS tests still works. Just using undef as is
|
||||
+ * slightly more correct
|
||||
+ */
|
||||
+ nir_ssa_def *undef = nir_ssa_undef(b, 1, color->bit_size);
|
||||
+ nir_ssa_def *p2 = nir_vfpack(b, nir_channel(b, color, 2), undef);
|
||||
+
|
||||
+ return nir_v11fpack_v3d(b, p1, p2);
|
||||
+}
|
||||
+
|
||||
+static inline nir_ssa_def *
|
||||
+pack_r10g10b10a2_uint(nir_builder *b, nir_ssa_def *color)
|
||||
+{
|
||||
+ nir_ssa_def *p1 = nir_vpack_v3d(b, nir_channel(b, color, 0),
|
||||
+ nir_channel(b, color, 1));
|
||||
+ nir_ssa_def *p2 = nir_vpack_v3d(b, nir_channel(b, color, 2),
|
||||
+ nir_channel(b, color, 3));
|
||||
+
|
||||
+ return nir_v10pack_v3d(b, p1, p2);
|
||||
+}
|
||||
+
|
||||
+static inline nir_ssa_def *
|
||||
+pack_r10g10b10a2_unorm(nir_builder *b, nir_ssa_def *color)
|
||||
+{
|
||||
+ nir_ssa_def *p1 = nir_vfpack(b, nir_channel(b, color, 0),
|
||||
+ nir_channel(b, color, 1));
|
||||
+ p1 = nir_vftounorm10lo_v3d(b, p1);
|
||||
+
|
||||
+ nir_ssa_def *p2 = nir_vfpack(b, nir_channel(b, color, 2),
|
||||
+ nir_channel(b, color, 3));
|
||||
+ p2 = nir_vftounorm10hi_v3d(b, p2);
|
||||
+
|
||||
+ return nir_v10pack_v3d(b, p1, p2);
|
||||
+}
|
||||
+
|
||||
+enum hw_conversion {
|
||||
+ NONE,
|
||||
+ TO_SNORM,
|
||||
+ TO_UNORM
|
||||
+};
|
||||
+
|
||||
+static inline nir_ssa_def *
|
||||
+pack_8bit(nir_builder *b, nir_ssa_def *color,
|
||||
+ unsigned num_components,
|
||||
+ enum hw_conversion conversion)
|
||||
+{
|
||||
+ /* Note that usually you should not use this method (that relies on
|
||||
+ * custom packing) for 1 component if we are not doing any
|
||||
+ * conversion. But we support also that case, and let the caller
|
||||
+ * decide which method to use.
|
||||
+ */
|
||||
+ nir_ssa_def *p1;
|
||||
+ nir_ssa_def *p2;
|
||||
+
|
||||
+ if (conversion == NONE) {
|
||||
+ p1 = nir_vpack_v3d(b, nir_channel(b, color, 0),
|
||||
+ nir_channel(b, color, num_components == 1 ? 0 : 1));
|
||||
+ } else {
|
||||
+ p1 = nir_vfpack(b, nir_channel(b, color, 0),
|
||||
+ nir_channel(b, color, num_components == 1 ? 0 : 1));
|
||||
+ p1 = (conversion == TO_UNORM) ?
|
||||
+ nir_vftounorm8_v3d(b, p1) : nir_vftosnorm8_v3d(b, p1);
|
||||
+ }
|
||||
+ if (num_components == 4) {
|
||||
+ if (conversion == NONE) {
|
||||
+ p2 = nir_vpack_v3d(b, nir_channel(b, color, 2),
|
||||
+ nir_channel(b, color, 3));
|
||||
+ } else {
|
||||
+ p2 = nir_vfpack(b, nir_channel(b, color, 2),
|
||||
+ nir_channel(b, color, 3));
|
||||
+ p2 = (conversion == TO_UNORM) ?
|
||||
+ nir_vftounorm8_v3d(b, p2) : nir_vftosnorm8_v3d(b, p2);
|
||||
+ }
|
||||
+ } else {
|
||||
+ /* As mentioned on the comment before, using an undef here
|
||||
+ * would be more correct. But for this case we are getting
|
||||
+ * worse values, and in fact even some worse instruction count
|
||||
+ * with some CTS tests, so we just reuse the first packing
|
||||
+ */
|
||||
+ p2 = p1;
|
||||
+ }
|
||||
+
|
||||
+ return nir_v8pack_v3d(b, p1, p2);
|
||||
+}
|
||||
+
|
||||
+static inline nir_ssa_def *
|
||||
+pack_16bit(nir_builder *b, nir_ssa_def *color,
|
||||
+ unsigned num_components,
|
||||
+ enum hw_conversion conversion)
|
||||
+{
|
||||
+ nir_ssa_def *results[2];
|
||||
+ nir_ssa_def *channels[4];
|
||||
+
|
||||
+ /* Note that usually you should not use this method (that relies on
|
||||
+ * custom packing) if we are not doing any conversion. But we support
|
||||
+ * also that case, and let the caller decide which method to use.
|
||||
+ */
|
||||
+
|
||||
+ for (unsigned i = 0; i < num_components; i++) {
|
||||
+ channels[i] = nir_channel(b, color, i);
|
||||
+ switch (conversion) {
|
||||
+ case TO_SNORM:
|
||||
+ channels[i] = nir_ftosnorm16_v3d(b, channels[i]);
|
||||
+ break;
|
||||
+ case TO_UNORM:
|
||||
+ channels[i] = nir_ftounorm16_v3d(b, channels[i]);
|
||||
+ break;
|
||||
+ default:
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ switch (num_components) {
|
||||
+ case 1:
|
||||
+ results[0] = channels[0];
|
||||
+ break;
|
||||
+ case 4:
|
||||
+ results[1] = nir_vpack_v3d(b, channels[2], channels[3]);
|
||||
+ FALLTHROUGH;
|
||||
+ case 2:
|
||||
+ results[0] = nir_vpack_v3d(b, channels[0], channels[1]);
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
+ return nir_vec(b, results, DIV_ROUND_UP(num_components, 2));
|
||||
+}
|
||||
+
|
||||
+static inline nir_ssa_def *
|
||||
+pack_xbit(nir_builder *b, nir_ssa_def *color,
|
||||
+ unsigned num_components,
|
||||
+ const struct util_format_channel_description *r_chan)
|
||||
+{
|
||||
+ bool pack_mask = (r_chan->type == UTIL_FORMAT_TYPE_SIGNED);
|
||||
+ enum hw_conversion conversion = NONE;
|
||||
+ if (r_chan->normalized) {
|
||||
+ conversion =
|
||||
+ (r_chan->type == UTIL_FORMAT_TYPE_UNSIGNED) ? TO_UNORM : TO_SNORM;
|
||||
+ }
|
||||
+
|
||||
+ switch (r_chan->size) {
|
||||
+ case 8:
|
||||
+ if (conversion == NONE && num_components < 2)
|
||||
+ return pack_bits(b, color, bits_8, num_components, pack_mask);
|
||||
+ else
|
||||
+ return pack_8bit(b, color, num_components, conversion);
|
||||
+ break;
|
||||
+ case 16:
|
||||
+ /* pack_mask implies that the generic packing method would
|
||||
+ * need to include extra operations to handle negative values,
|
||||
+ * so in that case, even without a conversion, it is better to
|
||||
+ * use the packing using custom hw operations.
|
||||
+ */
|
||||
+ if (conversion == NONE && !pack_mask)
|
||||
+ return pack_bits(b, color, bits_16, num_components, pack_mask);
|
||||
+ else
|
||||
+ return pack_16bit(b, color, num_components, conversion);
|
||||
+ break;
|
||||
+ default:
|
||||
+ unreachable("unrecognized bits");
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
static bool
|
||||
-v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
|
||||
+v3d_nir_lower_image_store_v42(nir_builder *b, nir_intrinsic_instr *instr)
|
||||
{
|
||||
enum pipe_format format = nir_intrinsic_format(instr);
|
||||
assert(format != PIPE_FORMAT_NONE);
|
||||
@@ -118,9 +301,6 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
|
||||
*/
|
||||
formatted = color;
|
||||
} else {
|
||||
- static const unsigned bits_8[4] = {8, 8, 8, 8};
|
||||
- static const unsigned bits_16[4] = {16, 16, 16, 16};
|
||||
- static const unsigned bits_1010102[4] = {10, 10, 10, 2};
|
||||
const unsigned *bits;
|
||||
|
||||
switch (r_chan->size) {
|
||||
@@ -171,6 +351,52 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
|
||||
return true;
|
||||
}
|
||||
|
||||
+
|
||||
+static bool
|
||||
+v3d_nir_lower_image_store_v71(nir_builder *b, nir_intrinsic_instr *instr)
|
||||
+{
|
||||
+ enum pipe_format format = nir_intrinsic_format(instr);
|
||||
+ assert(format != PIPE_FORMAT_NONE);
|
||||
+ const struct util_format_description *desc =
|
||||
+ util_format_description(format);
|
||||
+ const struct util_format_channel_description *r_chan = &desc->channel[0];
|
||||
+ unsigned num_components = util_format_get_nr_components(format);
|
||||
+ b->cursor = nir_before_instr(&instr->instr);
|
||||
+
|
||||
+ nir_ssa_def *color = nir_channels(b,
|
||||
+ nir_ssa_for_src(b, instr->src[3], 4),
|
||||
+ (1 << num_components) - 1);
|
||||
+ nir_ssa_def *formatted = NULL;
|
||||
+ if (format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
|
||||
+ formatted = nir_format_pack_r9g9b9e5(b, color);
|
||||
+ } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
|
||||
+ formatted = pack_11f11f10f(b, color);
|
||||
+ } else if (format == PIPE_FORMAT_R10G10B10A2_UINT) {
|
||||
+ formatted = pack_r10g10b10a2_uint(b, color);
|
||||
+ } else if (format == PIPE_FORMAT_R10G10B10A2_UNORM) {
|
||||
+ formatted = pack_r10g10b10a2_unorm(b, color);
|
||||
+ } else if (r_chan->size == 32) {
|
||||
+ /* For 32-bit formats, we just have to move the vector
|
||||
+ * across (possibly reducing the number of channels).
|
||||
+ */
|
||||
+ formatted = color;
|
||||
+ } else if (r_chan->type == UTIL_FORMAT_TYPE_FLOAT) {
|
||||
+ assert(r_chan->size == 16);
|
||||
+ formatted = nir_format_float_to_half(b, color);
|
||||
+ formatted = pack_bits(b, formatted, bits_16, num_components,
|
||||
+ false);
|
||||
+ } else {
|
||||
+ assert(r_chan->size == 8 || r_chan->size == 16);
|
||||
+ formatted = pack_xbit(b, color, num_components, r_chan);
|
||||
+ }
|
||||
+
|
||||
+ nir_instr_rewrite_src(&instr->instr, &instr->src[3],
|
||||
+ nir_src_for_ssa(formatted));
|
||||
+ instr->num_components = formatted->num_components;
|
||||
+
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
static bool
|
||||
v3d_nir_lower_image_load(nir_builder *b, nir_intrinsic_instr *instr)
|
||||
{
|
||||
@@ -215,11 +441,17 @@ v3d_nir_lower_image_load_store_cb(nir_builder *b,
|
||||
nir_intrinsic_instr *intr =
|
||||
nir_instr_as_intrinsic(instr);
|
||||
|
||||
+ struct v3d_compile *c = (struct v3d_compile *) _state;
|
||||
+
|
||||
switch (intr->intrinsic) {
|
||||
case nir_intrinsic_image_load:
|
||||
return v3d_nir_lower_image_load(b, intr);
|
||||
case nir_intrinsic_image_store:
|
||||
- return v3d_nir_lower_image_store(b, intr);
|
||||
+ if (c->devinfo->ver >= 71)
|
||||
+ return v3d_nir_lower_image_store_v71(b, intr);
|
||||
+ else
|
||||
+ return v3d_nir_lower_image_store_v42(b, intr);
|
||||
+ break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -228,9 +460,9 @@ v3d_nir_lower_image_load_store_cb(nir_builder *b,
|
||||
}
|
||||
|
||||
bool
|
||||
-v3d_nir_lower_image_load_store(nir_shader *s)
|
||||
+v3d_nir_lower_image_load_store(nir_shader *s, struct v3d_compile *c)
|
||||
{
|
||||
return nir_shader_instructions_pass(s, v3d_nir_lower_image_load_store_cb,
|
||||
nir_metadata_block_index |
|
||||
- nir_metadata_dominance, NULL);
|
||||
+ nir_metadata_dominance, c);
|
||||
}
|
||||
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
|
||||
index aea113f050e..7612eed7130 100644
|
||||
--- a/src/broadcom/compiler/vir.c
|
||||
+++ b/src/broadcom/compiler/vir.c
|
||||
@@ -1576,7 +1576,7 @@ v3d_attempt_compile(struct v3d_compile *c)
|
||||
|
||||
NIR_PASS(_, c->s, v3d_nir_lower_io, c);
|
||||
NIR_PASS(_, c->s, v3d_nir_lower_txf_ms);
|
||||
- NIR_PASS(_, c->s, v3d_nir_lower_image_load_store);
|
||||
+ NIR_PASS(_, c->s, v3d_nir_lower_image_load_store, c);
|
||||
|
||||
NIR_PASS(_, c->s, nir_opt_idiv_const, 8);
|
||||
nir_lower_idiv_options idiv_options = {
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,68 @@
|
||||
From f6082e941a3454c8735df2ff2713ae49b3daa74f Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Tue, 18 Apr 2023 08:50:13 +0200
|
||||
Subject: [PATCH 068/139] broadcom/compiler: don't allocate spill base to rf0
|
||||
in V3D 7.x
|
||||
|
||||
Otherwise it can be stomped by instructions doing implicit rf0 writes.
|
||||
---
|
||||
src/broadcom/compiler/vir_register_allocate.c | 21 +++++++++++++++----
|
||||
1 file changed, 17 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
|
||||
index 440b093a636..121c9b2794f 100644
|
||||
--- a/src/broadcom/compiler/vir_register_allocate.c
|
||||
+++ b/src/broadcom/compiler/vir_register_allocate.c
|
||||
@@ -582,7 +582,8 @@ interferes(int32_t t0_start, int32_t t0_end, int32_t t1_start, int32_t t1_end)
|
||||
}
|
||||
|
||||
static void
|
||||
-v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
|
||||
+v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int *implicit_rf_nodes,
|
||||
+ int spill_temp)
|
||||
{
|
||||
c->spill_start_num_temps = c->num_temps;
|
||||
c->spilling = true;
|
||||
@@ -594,8 +595,20 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
|
||||
spill_offset = c->spill_size;
|
||||
c->spill_size += V3D_CHANNELS * sizeof(uint32_t);
|
||||
|
||||
- if (spill_offset == 0)
|
||||
+ if (spill_offset == 0) {
|
||||
v3d_setup_spill_base(c);
|
||||
+
|
||||
+ /* Don't allocate our spill base to rf0 to avoid
|
||||
+ * conflicts with instructions doing implicit writes
|
||||
+ * to that register.
|
||||
+ */
|
||||
+ if (!c->devinfo->has_accumulators) {
|
||||
+ ra_add_node_interference(
|
||||
+ c->g,
|
||||
+ temp_to_node(c, c->spill_base.index),
|
||||
+ implicit_rf_nodes[0]);
|
||||
+ }
|
||||
+ }
|
||||
}
|
||||
|
||||
struct qinst *last_thrsw = c->last_thrsw;
|
||||
@@ -1346,7 +1359,7 @@ v3d_register_allocate(struct v3d_compile *c)
|
||||
int node = v3d_choose_spill_node(c);
|
||||
uint32_t temp = node_to_temp(c, node);
|
||||
if (node != -1) {
|
||||
- v3d_spill_reg(c, acc_nodes, temp);
|
||||
+ v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@@ -1363,7 +1376,7 @@ v3d_register_allocate(struct v3d_compile *c)
|
||||
enum temp_spill_type spill_type =
|
||||
get_spill_type_for_temp(c, temp);
|
||||
if (spill_type != SPILL_TYPE_TMU || tmu_spilling_allowed(c)) {
|
||||
- v3d_spill_reg(c, acc_nodes, temp);
|
||||
+ v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp);
|
||||
if (c->spills + c->fills > c->max_tmu_spills)
|
||||
goto spill_fail;
|
||||
} else {
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,186 @@
|
||||
From 0e9577fbb18a026390f653ca22f5a98a69a5fe59 Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Tue, 2 May 2023 10:12:37 +0200
|
||||
Subject: [PATCH 069/139] broadcom/compiler: improve allocation for final
|
||||
program instructions
|
||||
|
||||
The last 3 instructions can't use specific registers so flag all the
|
||||
nodes for temps used in the last program instructions and try to
|
||||
avoid assigning any of these. This may help us avoid injecting nops
|
||||
for the last thread switch instruction.
|
||||
|
||||
Because register allocation needs to happen before QPU scheduling
|
||||
and instruction merging we can't tell exactly what the last 3
|
||||
instructions will be, so we do this for a few more instructions than
|
||||
just 3.
|
||||
|
||||
We only do this for fragment shaders because other shader stages
|
||||
always end with VPM store instructions that take a small immediate
|
||||
and therefore will never allow us to merge the final thread switch
|
||||
earlier, so limiting allocation for these shaders will never improve
|
||||
anything and might instead be detrimental.
|
||||
|
||||
total instructions in shared programs: 11471389 -> 11464335 (-0.06%)
|
||||
instructions in affected programs: 582908 -> 575854 (-1.21%)
|
||||
helped: 4669
|
||||
HURT: 578
|
||||
Instructions are helped.
|
||||
|
||||
total max-temps in shared programs: 2230497 -> 2230150 (-0.02%)
|
||||
max-temps in affected programs: 5662 -> 5315 (-6.13%)
|
||||
helped: 344
|
||||
HURT: 44
|
||||
Max-temps are helped.
|
||||
|
||||
total sfu-stalls in shared programs: 18068 -> 18077 (0.05%)
|
||||
sfu-stalls in affected programs: 264 -> 273 (3.41%)
|
||||
helped: 37
|
||||
HURT: 48
|
||||
Inconclusive result (value mean confidence interval includes 0).
|
||||
|
||||
total inst-and-stalls in shared programs: 11489457 -> 11482412 (-0.06%)
|
||||
inst-and-stalls in affected programs: 585180 -> 578135 (-1.20%)
|
||||
helped: 4659
|
||||
HURT: 588
|
||||
Inst-and-stalls are helped.
|
||||
|
||||
total nops in shared programs: 301738 -> 298140 (-1.19%)
|
||||
nops in affected programs: 14680 -> 11082 (-24.51%)
|
||||
helped: 3252
|
||||
HURT: 108
|
||||
Nops are helped.
|
||||
---
|
||||
src/broadcom/compiler/v3d_compiler.h | 1 +
|
||||
src/broadcom/compiler/vir_register_allocate.c | 69 +++++++++++++++++--
|
||||
2 files changed, 66 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
|
||||
index 425ab0cdf9d..2642d23b629 100644
|
||||
--- a/src/broadcom/compiler/v3d_compiler.h
|
||||
+++ b/src/broadcom/compiler/v3d_compiler.h
|
||||
@@ -613,6 +613,7 @@ struct v3d_ra_node_info {
|
||||
struct {
|
||||
uint32_t priority;
|
||||
uint8_t class_bits;
|
||||
+ bool is_program_end;
|
||||
|
||||
/* V3D 7.x */
|
||||
bool is_ldunif_dst;
|
||||
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
|
||||
index 121c9b2794f..495644bb557 100644
|
||||
--- a/src/broadcom/compiler/vir_register_allocate.c
|
||||
+++ b/src/broadcom/compiler/vir_register_allocate.c
|
||||
@@ -385,6 +385,7 @@ add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits)
|
||||
c->nodes.info[node].class_bits = class_bits;
|
||||
c->nodes.info[node].priority = 0;
|
||||
c->nodes.info[node].is_ldunif_dst = false;
|
||||
+ c->nodes.info[node].is_program_end = false;
|
||||
}
|
||||
|
||||
/* The spill offset for this thread takes a bit of setup, so do it once at
|
||||
@@ -929,6 +930,17 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
|
||||
return true;
|
||||
}
|
||||
|
||||
+ /* The last 3 instructions in a shader can't use some specific registers
|
||||
+ * (usually early rf registers, depends on v3d version) so try to
|
||||
+ * avoid allocating these to registers used by the last instructions
|
||||
+ * in the shader.
|
||||
+ */
|
||||
+ const uint32_t safe_rf_start = v3d_ra->devinfo->ver <= 42 ? 3 : 4;
|
||||
+ if (v3d_ra->nodes->info[node].is_program_end &&
|
||||
+ v3d_ra->next_phys < safe_rf_start) {
|
||||
+ v3d_ra->next_phys = safe_rf_start;
|
||||
+ }
|
||||
+
|
||||
for (int i = 0; i < PHYS_COUNT; i++) {
|
||||
int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
|
||||
int phys = v3d_ra->phys_index + phys_off;
|
||||
@@ -1218,6 +1230,44 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c,
|
||||
}
|
||||
}
|
||||
|
||||
+static void
|
||||
+flag_program_end_nodes(struct v3d_compile *c)
|
||||
+{
|
||||
+ /* Only look for registers used in this many instructions */
|
||||
+ uint32_t last_set_count = 6;
|
||||
+
|
||||
+ struct qblock *last_block = vir_exit_block(c);
|
||||
+ list_for_each_entry_rev(struct qinst, inst, &last_block->instructions, link) {
|
||||
+ if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
|
||||
+ continue;
|
||||
+
|
||||
+ int num_src = v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
|
||||
+ for (int i = 0; i < num_src; i++) {
|
||||
+ if (inst->src[i].file == QFILE_TEMP) {
|
||||
+ int node = temp_to_node(c, inst->src[i].index);
|
||||
+ c->nodes.info[node].is_program_end = true;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ num_src = v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
|
||||
+ for (int i = 0; i < num_src; i++) {
|
||||
+ if (inst->src[i].file == QFILE_TEMP) {
|
||||
+ int node = temp_to_node(c, inst->src[i].index);
|
||||
+ c->nodes.info[node].is_program_end = true;
|
||||
+
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ if (inst->dst.file == QFILE_TEMP) {
|
||||
+ int node = temp_to_node(c, inst->dst.index);
|
||||
+ c->nodes.info[node].is_program_end = true;
|
||||
+ }
|
||||
+
|
||||
+ if (--last_set_count == 0)
|
||||
+ break;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
/**
|
||||
* Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
|
||||
*
|
||||
@@ -1280,17 +1330,16 @@ v3d_register_allocate(struct v3d_compile *c)
|
||||
*/
|
||||
for (uint32_t i = 0; i < num_ra_nodes; i++) {
|
||||
c->nodes.info[i].is_ldunif_dst = false;
|
||||
+ c->nodes.info[i].is_program_end = false;
|
||||
+ c->nodes.info[i].priority = 0;
|
||||
+ c->nodes.info[i].class_bits = 0;
|
||||
if (c->devinfo->has_accumulators && i < ACC_COUNT) {
|
||||
acc_nodes[i] = i;
|
||||
ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i);
|
||||
- c->nodes.info[i].priority = 0;
|
||||
- c->nodes.info[i].class_bits = 0;
|
||||
} else if (!c->devinfo->has_accumulators &&
|
||||
i < ARRAY_SIZE(implicit_rf_nodes)) {
|
||||
implicit_rf_nodes[i] = i;
|
||||
ra_set_node_reg(c->g, implicit_rf_nodes[i], phys_index + i);
|
||||
- c->nodes.info[i].priority = 0;
|
||||
- c->nodes.info[i].class_bits = 0;
|
||||
} else {
|
||||
uint32_t t = node_to_temp(c, i);
|
||||
c->nodes.info[i].priority =
|
||||
@@ -1327,6 +1376,18 @@ v3d_register_allocate(struct v3d_compile *c)
|
||||
last_ldvary_ip, inst);
|
||||
}
|
||||
|
||||
+ /* Flag the nodes that are used in the last instructions of the program
|
||||
+ * (there are some registers that cannot be used in the last 3
|
||||
+ * instructions). We only do this for fragment shaders, because the idea
|
||||
+ * is that by avoiding this conflict we may be able to emit the last
|
||||
+ * thread switch earlier in some cases, however, in non-fragment shaders
|
||||
+ * this won't happen because the last instructions are always VPM stores
|
||||
+ * with a small immediate, which conflicts with other signals,
|
||||
+ * preventing us from ever moving the thrsw earlier.
|
||||
+ */
|
||||
+ if (c->s->info.stage == MESA_SHADER_FRAGMENT)
|
||||
+ flag_program_end_nodes(c);
|
||||
+
|
||||
/* Set the register classes for all our temporaries in the graph */
|
||||
for (uint32_t i = 0; i < c->num_temps; i++) {
|
||||
ra_set_node_class(c->g, temp_to_node(c, i),
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,105 @@
|
||||
From 645fe451bcecbe3345a144222306d06fb39f6b9f Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Tue, 2 May 2023 10:17:47 +0200
|
||||
Subject: [PATCH 070/139] broadcom/compiler: don't assign registers to unused
|
||||
nodes/temps
|
||||
|
||||
In programs with a lot of unused temps, if we don't do this, we may
|
||||
end up recycling previously used rfs more often, which can be
|
||||
detrimental to instruction pairing.
|
||||
|
||||
total instructions in shared programs: 11464335 -> 11444136 (-0.18%)
|
||||
instructions in affected programs: 8976743 -> 8956544 (-0.23%)
|
||||
helped: 33196
|
||||
HURT: 33778
|
||||
Inconclusive result
|
||||
|
||||
total max-temps in shared programs: 2230150 -> 2229445 (-0.03%)
|
||||
max-temps in affected programs: 86413 -> 85708 (-0.82%)
|
||||
helped: 2217
|
||||
HURT: 1523
|
||||
Max-temps are helped.
|
||||
|
||||
total sfu-stalls in shared programs: 18077 -> 17104 (-5.38%)
|
||||
sfu-stalls in affected programs: 8669 -> 7696 (-11.22%)
|
||||
helped: 2657
|
||||
HURT: 2182
|
||||
Sfu-stalls are helped.
|
||||
|
||||
total inst-and-stalls in shared programs: 11482412 -> 11461240 (-0.18%)
|
||||
inst-and-stalls in affected programs: 8995697 -> 8974525 (-0.24%)
|
||||
helped: 33319
|
||||
HURT: 33708
|
||||
Inconclusive result
|
||||
|
||||
total nops in shared programs: 298140 -> 296185 (-0.66%)
|
||||
nops in affected programs: 52805 -> 50850 (-3.70%)
|
||||
helped: 3797
|
||||
HURT: 2662
|
||||
Inconclusive result
|
||||
---
|
||||
src/broadcom/compiler/v3d_compiler.h | 1 +
|
||||
src/broadcom/compiler/vir_register_allocate.c | 14 ++++++++++++++
|
||||
2 files changed, 15 insertions(+)
|
||||
|
||||
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
|
||||
index 2642d23b629..f1a807e38fd 100644
|
||||
--- a/src/broadcom/compiler/v3d_compiler.h
|
||||
+++ b/src/broadcom/compiler/v3d_compiler.h
|
||||
@@ -614,6 +614,7 @@ struct v3d_ra_node_info {
|
||||
uint32_t priority;
|
||||
uint8_t class_bits;
|
||||
bool is_program_end;
|
||||
+ bool unused;
|
||||
|
||||
/* V3D 7.x */
|
||||
bool is_ldunif_dst;
|
||||
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
|
||||
index 495644bb557..0ab0474424f 100644
|
||||
--- a/src/broadcom/compiler/vir_register_allocate.c
|
||||
+++ b/src/broadcom/compiler/vir_register_allocate.c
|
||||
@@ -386,6 +386,7 @@ add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits)
|
||||
c->nodes.info[node].priority = 0;
|
||||
c->nodes.info[node].is_ldunif_dst = false;
|
||||
c->nodes.info[node].is_program_end = false;
|
||||
+ c->nodes.info[node].unused = false;
|
||||
}
|
||||
|
||||
/* The spill offset for this thread takes a bit of setup, so do it once at
|
||||
@@ -918,6 +919,12 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
|
||||
BITSET_WORD *regs,
|
||||
unsigned int *out)
|
||||
{
|
||||
+ /* If this node is for an unused temp, ignore. */
|
||||
+ if (v3d_ra->nodes->info[node].unused) {
|
||||
+ *out = 0;
|
||||
+ return true;
|
||||
+ }
|
||||
+
|
||||
/* In V3D 7.x, try to assign rf0 to temps used as ldunif's dst
|
||||
* so we can avoid turning them into ldunifrf (which uses the
|
||||
* cond field to encode the dst and would prevent merge with
|
||||
@@ -1331,6 +1338,7 @@ v3d_register_allocate(struct v3d_compile *c)
|
||||
for (uint32_t i = 0; i < num_ra_nodes; i++) {
|
||||
c->nodes.info[i].is_ldunif_dst = false;
|
||||
c->nodes.info[i].is_program_end = false;
|
||||
+ c->nodes.info[i].unused = false;
|
||||
c->nodes.info[i].priority = 0;
|
||||
c->nodes.info[i].class_bits = 0;
|
||||
if (c->devinfo->has_accumulators && i < ACC_COUNT) {
|
||||
@@ -1396,6 +1404,12 @@ v3d_register_allocate(struct v3d_compile *c)
|
||||
|
||||
/* Add register interferences based on liveness data */
|
||||
for (uint32_t i = 0; i < c->num_temps; i++) {
|
||||
+ /* And while we are here, let's also flag nodes for
|
||||
+ * unused temps.
|
||||
+ */
|
||||
+ if (c->temp_start[i] > c->temp_end[i])
|
||||
+ c->nodes.info[temp_to_node(c, i)].unused = true;
|
||||
+
|
||||
for (uint32_t j = i + 1; j < c->num_temps; j++) {
|
||||
if (interferes(c->temp_start[i], c->temp_end[i],
|
||||
c->temp_start[j], c->temp_end[j])) {
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,83 @@
|
||||
From 851704169d59e28c5429b06d05e5ef952be893a2 Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Mon, 15 May 2023 10:02:10 +0200
|
||||
Subject: [PATCH 071/139] broadcom/compiler: only assign rf0 as last resort in
|
||||
V3D 7.x
|
||||
|
||||
So we can use it for ldunif(a) and avoid generating ldunif(a)rf which
|
||||
can't be paired with conditional instructions.
|
||||
|
||||
shader-db (pi5):
|
||||
|
||||
total instructions in shared programs: 11357802 -> 11338883 (-0.17%)
|
||||
instructions in affected programs: 7117889 -> 7098970 (-0.27%)
|
||||
helped: 24264
|
||||
HURT: 17574
|
||||
Instructions are helped.
|
||||
|
||||
total uniforms in shared programs: 3857808 -> 3857815 (<.01%)
|
||||
uniforms in affected programs: 92 -> 99 (7.61%)
|
||||
helped: 0
|
||||
HURT: 1
|
||||
|
||||
total max-temps in shared programs: 2230904 -> 2230199 (-0.03%)
|
||||
max-temps in affected programs: 52309 -> 51604 (-1.35%)
|
||||
helped: 1219
|
||||
HURT: 725
|
||||
Max-temps are helped.
|
||||
|
||||
total sfu-stalls in shared programs: 15021 -> 15236 (1.43%)
|
||||
sfu-stalls in affected programs: 6848 -> 7063 (3.14%)
|
||||
helped: 1866
|
||||
HURT: 1704
|
||||
Inconclusive result
|
||||
|
||||
total inst-and-stalls in shared programs: 11372823 -> 11354119 (-0.16%)
|
||||
inst-and-stalls in affected programs: 7149177 -> 7130473 (-0.26%)
|
||||
helped: 24315
|
||||
HURT: 17561
|
||||
Inst-and-stalls are helped.
|
||||
|
||||
total nops in shared programs: 273624 -> 273711 (0.03%)
|
||||
nops in affected programs: 31562 -> 31649 (0.28%)
|
||||
helped: 1619
|
||||
HURT: 1854
|
||||
Inconclusive result (value mean confidence interval includes 0).
|
||||
---
|
||||
src/broadcom/compiler/vir_register_allocate.c | 13 +++++++++++++
|
||||
1 file changed, 13 insertions(+)
|
||||
|
||||
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
|
||||
index 0ab0474424f..8eac2b75bd7 100644
|
||||
--- a/src/broadcom/compiler/vir_register_allocate.c
|
||||
+++ b/src/broadcom/compiler/vir_register_allocate.c
|
||||
@@ -950,6 +950,11 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
|
||||
|
||||
for (int i = 0; i < PHYS_COUNT; i++) {
|
||||
int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
|
||||
+
|
||||
+ /* Try to keep rf0 available for ldunif in 7.x (see above). */
|
||||
+ if (v3d_ra->devinfo->ver >= 71 && phys_off == 0)
|
||||
+ continue;
|
||||
+
|
||||
int phys = v3d_ra->phys_index + phys_off;
|
||||
|
||||
if (BITSET_TEST(regs, phys)) {
|
||||
@@ -959,6 +964,14 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
|
||||
}
|
||||
}
|
||||
|
||||
+ /* If we couldn't allocate, do try to assign rf0 if it is available. */
|
||||
+ if (v3d_ra->devinfo->ver >= 71 &&
|
||||
+ BITSET_TEST(regs, v3d_ra->phys_index)) {
|
||||
+ v3d_ra->next_phys = 1;
|
||||
+ *out = v3d_ra->phys_index;
|
||||
+ return true;
|
||||
+ }
|
||||
+
|
||||
return false;
|
||||
}
|
||||
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,30 @@
|
||||
From 0d3fd30d67ffc0195b0783e30ab6afbbe403310a Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Wed, 28 Apr 2021 14:31:38 +0200
|
||||
Subject: [PATCH 072/139] v3dv: recover non-conformant warning for not fully
|
||||
supported hw
|
||||
|
||||
---
|
||||
src/broadcom/vulkan/v3dv_device.c | 6 ++++++
|
||||
1 file changed, 6 insertions(+)
|
||||
|
||||
diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
|
||||
index d5de3517670..d29ffad3531 100644
|
||||
--- a/src/broadcom/vulkan/v3dv_device.c
|
||||
+++ b/src/broadcom/vulkan/v3dv_device.c
|
||||
@@ -1212,6 +1212,12 @@ create_physical_device(struct v3dv_instance *instance,
|
||||
|
||||
list_addtail(&device->vk.link, &instance->vk.physical_devices.list);
|
||||
|
||||
+ if (device->devinfo.ver != 42) {
|
||||
+ fprintf(stderr, "WARNING: v3dv support for hw version %i is neither "
|
||||
+ "a complete nor a conformant Vulkan implementation. Testing "
|
||||
+ "use only.\n", device->devinfo.ver);
|
||||
+ }
|
||||
+
|
||||
return VK_SUCCESS;
|
||||
|
||||
fail:
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,504 @@
|
||||
From 52b5ac62b367ae89574c8031fdcf7c1dae05c942 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Tue, 29 Jun 2021 11:59:53 +0200
|
||||
Subject: [PATCH 073/139] v3dv/meson: add v71 hw generation
|
||||
|
||||
Starting point for v71 version inclusion.
|
||||
|
||||
This just adds it as one of the versions to be compiled (on meson),
|
||||
updates the v3dX/v3dv_X macros, and update the code enough to get it
|
||||
compiling when building using the two versions. For any packet not
|
||||
available on v71 we just provide a generic asserted placeholder of
|
||||
generation not supported.
|
||||
|
||||
Any real v71 support will be implemented on following commits.
|
||||
---
|
||||
src/broadcom/vulkan/meson.build | 6 +-
|
||||
src/broadcom/vulkan/v3dv_private.h | 7 +++
|
||||
src/broadcom/vulkan/v3dvx_cmd_buffer.c | 75 +++++++++++++++++++++++--
|
||||
src/broadcom/vulkan/v3dvx_image.c | 16 +++++-
|
||||
src/broadcom/vulkan/v3dvx_meta_common.c | 32 +++++++++++
|
||||
src/broadcom/vulkan/v3dvx_pipeline.c | 5 ++
|
||||
src/broadcom/vulkan/v3dvx_queue.c | 11 ++++
|
||||
7 files changed, 142 insertions(+), 10 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/vulkan/meson.build b/src/broadcom/vulkan/meson.build
|
||||
index ad032d832ad..3da7364686f 100644
|
||||
--- a/src/broadcom/vulkan/meson.build
|
||||
+++ b/src/broadcom/vulkan/meson.build
|
||||
@@ -27,6 +27,7 @@ v3dv_entrypoints = custom_target(
|
||||
'--out-h', '@OUTPUT0@', '--out-c', '@OUTPUT1@', '--prefix', 'v3dv',
|
||||
'--beta', with_vulkan_beta.to_string(),
|
||||
'--device-prefix', 'ver42',
|
||||
+ '--device-prefix', 'ver71',
|
||||
],
|
||||
depend_files : vk_entrypoints_gen_depend_files,
|
||||
)
|
||||
@@ -67,10 +68,7 @@ files_per_version = files(
|
||||
'v3dvx_queue.c',
|
||||
)
|
||||
|
||||
-# The vulkan driver only supports version >= 42, which is the version present in
|
||||
-# Rpi4. We need to explicitly set it as we are reusing pieces from the GL v3d
|
||||
-# driver.
|
||||
-v3d_versions = ['42']
|
||||
+v3d_versions = ['42', '71']
|
||||
|
||||
v3dv_flags = []
|
||||
|
||||
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
|
||||
index c6707211529..6bdf338c67b 100644
|
||||
--- a/src/broadcom/vulkan/v3dv_private.h
|
||||
+++ b/src/broadcom/vulkan/v3dv_private.h
|
||||
@@ -2608,6 +2608,9 @@ u64_compare(const void *key1, const void *key2)
|
||||
case 42: \
|
||||
v3d_X_thing = &v3d42_##thing; \
|
||||
break; \
|
||||
+ case 71: \
|
||||
+ v3d_X_thing = &v3d71_##thing; \
|
||||
+ break; \
|
||||
default: \
|
||||
unreachable("Unsupported hardware generation"); \
|
||||
} \
|
||||
@@ -2626,6 +2629,10 @@ u64_compare(const void *key1, const void *key2)
|
||||
# define v3dX(x) v3d42_##x
|
||||
# include "v3dvx_private.h"
|
||||
# undef v3dX
|
||||
+
|
||||
+# define v3dX(x) v3d71_##x
|
||||
+# include "v3dvx_private.h"
|
||||
+# undef v3dX
|
||||
#endif
|
||||
|
||||
#ifdef ANDROID
|
||||
diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
|
||||
index f182b790d36..b958e634c82 100644
|
||||
--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
|
||||
+++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
|
||||
@@ -56,10 +56,15 @@ v3dX(job_emit_enable_double_buffer)(struct v3dv_job *job)
|
||||
};
|
||||
config.width_in_pixels = tiling->width;
|
||||
config.height_in_pixels = tiling->height;
|
||||
+#if V3D_VERSION == 42
|
||||
config.number_of_render_targets = MAX2(tiling->render_target_count, 1);
|
||||
config.multisample_mode_4x = tiling->msaa;
|
||||
config.double_buffer_in_non_ms_mode = tiling->double_buffer;
|
||||
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
|
||||
+#endif
|
||||
+#if V3D_VERSION >= 71
|
||||
+ unreachable("HW generation 71 not supported yet.");
|
||||
+#endif
|
||||
|
||||
uint8_t *rewrite_addr = (uint8_t *)job->bcl_tile_binning_mode_ptr;
|
||||
cl_packet_pack(TILE_BINNING_MODE_CFG)(NULL, rewrite_addr, &config);
|
||||
@@ -82,10 +87,15 @@ v3dX(job_emit_binning_prolog)(struct v3dv_job *job,
|
||||
cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
|
||||
config.width_in_pixels = tiling->width;
|
||||
config.height_in_pixels = tiling->height;
|
||||
+#if V3D_VERSION == 42
|
||||
config.number_of_render_targets = MAX2(tiling->render_target_count, 1);
|
||||
config.multisample_mode_4x = tiling->msaa;
|
||||
config.double_buffer_in_non_ms_mode = tiling->double_buffer;
|
||||
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
|
||||
+#endif
|
||||
+#if V3D_VERSION >= 71
|
||||
+ unreachable("HW generation 71 not supported yet.");
|
||||
+#endif
|
||||
}
|
||||
|
||||
/* There's definitely nothing in the VCD cache we want. */
|
||||
@@ -649,10 +659,15 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
|
||||
* bit and instead we have to emit a single clear of all tile buffers.
|
||||
*/
|
||||
if (use_global_zs_clear || use_global_rt_clear) {
|
||||
+#if V3D_VERSION == 42
|
||||
cl_emit(cl, CLEAR_TILE_BUFFERS, clear) {
|
||||
clear.clear_z_stencil_buffer = use_global_zs_clear;
|
||||
clear.clear_all_render_targets = use_global_rt_clear;
|
||||
}
|
||||
+#endif
|
||||
+#if V3D_VERSION >= 71
|
||||
+ unreachable("Hardware generation 71 not supported yet.");
|
||||
+#endif
|
||||
}
|
||||
}
|
||||
|
||||
@@ -824,7 +839,12 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
|
||||
config.number_of_render_targets = MAX2(subpass->color_count, 1);
|
||||
config.multisample_mode_4x = tiling->msaa;
|
||||
config.double_buffer_in_non_ms_mode = tiling->double_buffer;
|
||||
+#if V3D_VERSION == 42
|
||||
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
|
||||
+#endif
|
||||
+#if V3D_VERSION >= 71
|
||||
+ unreachable("HW generation 71 not supported yet.");
|
||||
+#endif
|
||||
|
||||
if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
|
||||
const struct v3dv_image_view *iview =
|
||||
@@ -920,7 +940,7 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
|
||||
const struct v3d_resource_slice *slice =
|
||||
&image->planes[plane].slices[iview->vk.base_mip_level];
|
||||
|
||||
- const uint32_t *clear_color =
|
||||
+ UNUSED const uint32_t *clear_color =
|
||||
&state->attachments[attachment_idx].clear_value.color[0];
|
||||
|
||||
uint32_t clear_pad = 0;
|
||||
@@ -937,13 +957,19 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
|
||||
}
|
||||
}
|
||||
|
||||
+#if V3D_VERSION == 42
|
||||
cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) {
|
||||
clear.clear_color_low_32_bits = clear_color[0];
|
||||
clear.clear_color_next_24_bits = clear_color[1] & 0xffffff;
|
||||
clear.render_target_number = i;
|
||||
};
|
||||
+#endif
|
||||
+#if V3D_VERSION >= 71
|
||||
+ unreachable("HW generation 71 not supported yet.");
|
||||
+#endif
|
||||
|
||||
if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) {
|
||||
+#if V3D_VERSION == 42
|
||||
cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) {
|
||||
clear.clear_color_mid_low_32_bits =
|
||||
((clear_color[1] >> 24) | (clear_color[2] << 8));
|
||||
@@ -951,17 +977,28 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
|
||||
((clear_color[2] >> 24) | ((clear_color[3] & 0xffff) << 8));
|
||||
clear.render_target_number = i;
|
||||
};
|
||||
+#endif
|
||||
+#if V3D_VERSION >= 71
|
||||
+ unreachable("HW generation 71 not supported yet.");
|
||||
+#endif
|
||||
+
|
||||
}
|
||||
|
||||
if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) {
|
||||
+#if V3D_VERSION == 42
|
||||
cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) {
|
||||
clear.uif_padded_height_in_uif_blocks = clear_pad;
|
||||
clear.clear_color_high_16_bits = clear_color[3] >> 16;
|
||||
clear.render_target_number = i;
|
||||
};
|
||||
+#endif
|
||||
+#if V3D_VERSION >= 71
|
||||
+ unreachable("HW generation 71 not supported yet.");
|
||||
+#endif
|
||||
}
|
||||
}
|
||||
|
||||
+#if V3D_VERSION == 42
|
||||
cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
|
||||
v3dX(cmd_buffer_render_pass_setup_render_target)
|
||||
(cmd_buffer, 0, &rt.render_target_0_internal_bpp,
|
||||
@@ -976,6 +1013,10 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
|
||||
(cmd_buffer, 3, &rt.render_target_3_internal_bpp,
|
||||
&rt.render_target_3_internal_type, &rt.render_target_3_clamp);
|
||||
}
|
||||
+#endif
|
||||
+#if V3D_VERSION >= 71
|
||||
+ unreachable("Hardware generation 71 not supported yet.");
|
||||
+#endif
|
||||
|
||||
/* Ends rendering mode config. */
|
||||
if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
|
||||
@@ -1036,10 +1077,15 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
|
||||
}
|
||||
if (cmd_buffer->state.tile_aligned_render_area &&
|
||||
(i == 0 || v3dv_do_double_initial_tile_clear(tiling))) {
|
||||
+#if V3D_VERSION == 42
|
||||
cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) {
|
||||
clear.clear_z_stencil_buffer = !job->early_zs_clear;
|
||||
clear.clear_all_render_targets = true;
|
||||
}
|
||||
+#endif
|
||||
+#if V3D_VERSION >= 71
|
||||
+ unreachable("HW generation 71 not supported yet.");
|
||||
+#endif
|
||||
}
|
||||
cl_emit(rcl, END_OF_TILE_MARKER, end);
|
||||
}
|
||||
@@ -1065,7 +1111,9 @@ v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
|
||||
* now, would need to change if we allow multiple viewports
|
||||
*/
|
||||
float *vptranslate = dynamic->viewport.translate[0];
|
||||
+#if V3D_VERSION == 42
|
||||
float *vpscale = dynamic->viewport.scale[0];
|
||||
+#endif
|
||||
|
||||
struct v3dv_job *job = cmd_buffer->state.job;
|
||||
assert(job);
|
||||
@@ -1078,10 +1126,15 @@ v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
|
||||
v3dv_cl_ensure_space_with_branch(&job->bcl, required_cl_size);
|
||||
v3dv_return_if_oom(cmd_buffer, NULL);
|
||||
|
||||
+#if V3D_VERSION == 42
|
||||
cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
|
||||
clip.viewport_half_width_in_1_256th_of_pixel = vpscale[0] * 256.0f;
|
||||
clip.viewport_half_height_in_1_256th_of_pixel = vpscale[1] * 256.0f;
|
||||
}
|
||||
+#endif
|
||||
+#if V3D_VERSION >= 71
|
||||
+ unreachable("HW generation 71 not supported yet.");
|
||||
+#endif
|
||||
|
||||
float translate_z, scale_z;
|
||||
v3dv_cmd_buffer_state_get_viewport_z_xform(&cmd_buffer->state, 0,
|
||||
@@ -1591,16 +1644,20 @@ v3dX(cmd_buffer_emit_configuration_bits)(struct v3dv_cmd_buffer *cmd_buffer)
|
||||
struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
|
||||
assert(pipeline);
|
||||
|
||||
- bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer);
|
||||
-
|
||||
v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CFG_BITS));
|
||||
v3dv_return_if_oom(cmd_buffer, NULL);
|
||||
|
||||
+#if V3D_VERSION == 42
|
||||
+ bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer);
|
||||
cl_emit_with_prepacked(&job->bcl, CFG_BITS, pipeline->cfg_bits, config) {
|
||||
config.early_z_enable = enable_ez;
|
||||
config.early_z_updates_enable = config.early_z_enable &&
|
||||
pipeline->z_updates_enable;
|
||||
}
|
||||
+#endif
|
||||
+#if V3D_VERSION >= 71
|
||||
+ unreachable("HW generation 71 not supported yet.");
|
||||
+#endif
|
||||
}
|
||||
|
||||
void
|
||||
@@ -2031,10 +2088,12 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer)
|
||||
pipeline->vpm_cfg.Gv);
|
||||
}
|
||||
|
||||
+#if V3D_VERSION == 42
|
||||
struct v3dv_bo *default_attribute_values =
|
||||
pipeline->default_attribute_values != NULL ?
|
||||
pipeline->default_attribute_values :
|
||||
pipeline->device->default_attribute_float;
|
||||
+#endif
|
||||
|
||||
cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_RECORD,
|
||||
pipeline->shader_state_record, shader) {
|
||||
@@ -2060,8 +2119,10 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer)
|
||||
shader.vertex_shader_uniforms_address = cmd_buffer->state.uniforms.vs;
|
||||
shader.fragment_shader_uniforms_address = cmd_buffer->state.uniforms.fs;
|
||||
|
||||
+#if V3D_VERSION == 42
|
||||
shader.address_of_default_attribute_values =
|
||||
v3dv_cl_address(default_attribute_values, 0);
|
||||
+#endif
|
||||
|
||||
shader.any_shader_reads_hardware_written_primitive_id =
|
||||
(pipeline->has_gs && prog_data_gs->uses_pid) || prog_data_fs->uses_pid;
|
||||
@@ -2399,11 +2460,17 @@ v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buf
|
||||
|
||||
assert(iview->plane_count == 1);
|
||||
*rt_bpp = iview->planes[0].internal_bpp;
|
||||
- *rt_type = iview->planes[0].internal_type;
|
||||
if (vk_format_is_int(iview->vk.view_format))
|
||||
+#if V3D_VERSION == 42
|
||||
+ *rt_type = iview->planes[0].internal_type;
|
||||
+ if (vk_format_is_int(iview->vk.format))
|
||||
*rt_clamp = V3D_RENDER_TARGET_CLAMP_INT;
|
||||
else if (vk_format_is_srgb(iview->vk.view_format))
|
||||
*rt_clamp = V3D_RENDER_TARGET_CLAMP_NORM;
|
||||
else
|
||||
*rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
|
||||
+#endif
|
||||
+#if V3D_VERSION >= 71
|
||||
+ unreachable("HW generation 71 not supported yet.");
|
||||
+#endif
|
||||
}
|
||||
diff --git a/src/broadcom/vulkan/v3dvx_image.c b/src/broadcom/vulkan/v3dvx_image.c
|
||||
index 80a3e5bfde8..dac6ff2741f 100644
|
||||
--- a/src/broadcom/vulkan/v3dvx_image.c
|
||||
+++ b/src/broadcom/vulkan/v3dvx_image.c
|
||||
@@ -76,8 +76,6 @@ pack_texture_shader_state_helper(struct v3dv_device *device,
|
||||
tex.swizzle_b = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[2]);
|
||||
tex.swizzle_a = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[3]);
|
||||
|
||||
- tex.reverse_standard_border_color = image_view->planes[plane].channel_reverse;
|
||||
-
|
||||
tex.texture_type = image_view->format->planes[plane].tex_type;
|
||||
|
||||
if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
|
||||
@@ -110,7 +108,16 @@ pack_texture_shader_state_helper(struct v3dv_device *device,
|
||||
|
||||
tex.array_stride_64_byte_aligned = image->planes[iplane].cube_map_stride / 64;
|
||||
|
||||
+#if V3D_VERSION == 42
|
||||
+ tex.reverse_standard_border_color = image_view->planes[plane].channel_reverse;
|
||||
+#endif
|
||||
+
|
||||
+#if V3D_VERSION == 42
|
||||
tex.srgb = vk_format_is_srgb(image_view->vk.view_format);
|
||||
+#endif
|
||||
+#if V3D_VERSION >= 71
|
||||
+ unreachable("Hardware generation 71 not supported yet.");
|
||||
+#endif
|
||||
|
||||
/* At this point we don't have the job. That's the reason the first
|
||||
* parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to
|
||||
@@ -166,7 +173,12 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device,
|
||||
|
||||
assert(buffer_view->format->plane_count == 1);
|
||||
tex.texture_type = buffer_view->format->planes[0].tex_type;
|
||||
+#if V3D_VERSION == 42
|
||||
tex.srgb = vk_format_is_srgb(buffer_view->vk_format);
|
||||
+#endif
|
||||
+#if V3D_VERSION >= 71
|
||||
+ unreachable("Hardware generation 71 not supported yet.");
|
||||
+#endif
|
||||
|
||||
/* At this point we don't have the job. That's the reason the first
|
||||
* parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to
|
||||
diff --git a/src/broadcom/vulkan/v3dvx_meta_common.c b/src/broadcom/vulkan/v3dvx_meta_common.c
|
||||
index 04147b82cbd..2db07ea7427 100644
|
||||
--- a/src/broadcom/vulkan/v3dvx_meta_common.c
|
||||
+++ b/src/broadcom/vulkan/v3dvx_meta_common.c
|
||||
@@ -58,7 +58,12 @@ emit_rcl_prologue(struct v3dv_job *job,
|
||||
config.number_of_render_targets = 1;
|
||||
config.multisample_mode_4x = tiling->msaa;
|
||||
config.double_buffer_in_non_ms_mode = tiling->double_buffer;
|
||||
+#if V3D_VERSION == 42
|
||||
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
|
||||
+#endif
|
||||
+#if V3D_VERSION >= 71
|
||||
+ unreachable("Hardware generation 71 not supported yet.");
|
||||
+#endif
|
||||
config.internal_depth_type = fb->internal_depth_type;
|
||||
}
|
||||
|
||||
@@ -88,14 +93,20 @@ emit_rcl_prologue(struct v3dv_job *job,
|
||||
}
|
||||
}
|
||||
|
||||
+#if V3D_VERSION == 42
|
||||
const uint32_t *color = &clear_info->clear_value->color[0];
|
||||
cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) {
|
||||
clear.clear_color_low_32_bits = color[0];
|
||||
clear.clear_color_next_24_bits = color[1] & 0x00ffffff;
|
||||
clear.render_target_number = 0;
|
||||
};
|
||||
+#endif
|
||||
+#if V3D_VERSION >= 71
|
||||
+ unreachable("Hardware generation 71 not supported yet.");
|
||||
+#endif
|
||||
|
||||
if (tiling->internal_bpp >= V3D_INTERNAL_BPP_64) {
|
||||
+#if V3D_VERSION == 42
|
||||
cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) {
|
||||
clear.clear_color_mid_low_32_bits =
|
||||
((color[1] >> 24) | (color[2] << 8));
|
||||
@@ -103,22 +114,37 @@ emit_rcl_prologue(struct v3dv_job *job,
|
||||
((color[2] >> 24) | ((color[3] & 0xffff) << 8));
|
||||
clear.render_target_number = 0;
|
||||
};
|
||||
+#endif
|
||||
+#if V3D_VERSION >= 71
|
||||
+ unreachable("Hardware generation 71 not supported yet.");
|
||||
+#endif
|
||||
+
|
||||
}
|
||||
|
||||
if (tiling->internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) {
|
||||
+#if V3D_VERSION == 42
|
||||
cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) {
|
||||
clear.uif_padded_height_in_uif_blocks = clear_pad;
|
||||
clear.clear_color_high_16_bits = color[3] >> 16;
|
||||
clear.render_target_number = 0;
|
||||
};
|
||||
+#endif
|
||||
+#if V3D_VERSION >= 71
|
||||
+ unreachable("Hardware generation 71 not supported yet.");
|
||||
+#endif
|
||||
}
|
||||
}
|
||||
|
||||
+#if V3D_VERSION == 42
|
||||
cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
|
||||
rt.render_target_0_internal_bpp = tiling->internal_bpp;
|
||||
rt.render_target_0_internal_type = fb->internal_type;
|
||||
rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
|
||||
}
|
||||
+#endif
|
||||
+#if V3D_VERSION >= 71
|
||||
+ unreachable("Hardware generation 71 not supported yet.");
|
||||
+#endif
|
||||
|
||||
cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
|
||||
clear.z_clear_value = clear_info ? clear_info->clear_value->z : 1.0f;
|
||||
@@ -179,10 +205,16 @@ emit_frame_setup(struct v3dv_job *job,
|
||||
*/
|
||||
if (clear_value &&
|
||||
(i == 0 || v3dv_do_double_initial_tile_clear(tiling))) {
|
||||
+#if V3D_VERSION == 42
|
||||
cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) {
|
||||
clear.clear_z_stencil_buffer = true;
|
||||
clear.clear_all_render_targets = true;
|
||||
}
|
||||
+#endif
|
||||
+#if V3D_VERSION >= 71
|
||||
+ unreachable("Hardware generation 71 not supported yet.");
|
||||
+#endif
|
||||
+
|
||||
}
|
||||
cl_emit(rcl, END_OF_TILE_MARKER, end);
|
||||
}
|
||||
diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c
|
||||
index 5d32d414ed8..922698b08a2 100644
|
||||
--- a/src/broadcom/vulkan/v3dvx_pipeline.c
|
||||
+++ b/src/broadcom/vulkan/v3dvx_pipeline.c
|
||||
@@ -447,10 +447,15 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline)
|
||||
/* FIXME: Use combined input/output size flag in the common case (also
|
||||
* on v3d, see v3dx_draw).
|
||||
*/
|
||||
+#if V3D_VERSION == 42
|
||||
shader.coordinate_shader_has_separate_input_and_output_vpm_blocks =
|
||||
prog_data_vs_bin->separate_segments;
|
||||
shader.vertex_shader_has_separate_input_and_output_vpm_blocks =
|
||||
prog_data_vs->separate_segments;
|
||||
+#endif
|
||||
+#if V3D_VERSION >= 71
|
||||
+ unreachable("HW generation 71 not supported yet.");
|
||||
+#endif
|
||||
|
||||
shader.coordinate_shader_input_vpm_segment_size =
|
||||
prog_data_vs_bin->separate_segments ?
|
||||
diff --git a/src/broadcom/vulkan/v3dvx_queue.c b/src/broadcom/vulkan/v3dvx_queue.c
|
||||
index efe63de425c..1a26d04aef7 100644
|
||||
--- a/src/broadcom/vulkan/v3dvx_queue.c
|
||||
+++ b/src/broadcom/vulkan/v3dvx_queue.c
|
||||
@@ -42,14 +42,25 @@ v3dX(job_emit_noop)(struct v3dv_job *job)
|
||||
config.image_height_pixels = 1;
|
||||
config.number_of_render_targets = 1;
|
||||
config.multisample_mode_4x = false;
|
||||
+#if V3D_VERSION == 42
|
||||
config.maximum_bpp_of_all_render_targets = V3D_INTERNAL_BPP_32;
|
||||
+#endif
|
||||
+#if V3D_VERSION >= 71
|
||||
+ unreachable("HW generation 71 not supported yet.");
|
||||
+#endif
|
||||
}
|
||||
|
||||
+#if V3D_VERSION == 42
|
||||
cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
|
||||
rt.render_target_0_internal_bpp = V3D_INTERNAL_BPP_32;
|
||||
rt.render_target_0_internal_type = V3D_INTERNAL_TYPE_8;
|
||||
rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
|
||||
}
|
||||
+#endif
|
||||
+#if V3D_VERSION >= 71
|
||||
+ unreachable("Hardware generation 71 not supported yet.");
|
||||
+#endif
|
||||
+
|
||||
|
||||
cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
|
||||
clear.z_clear_value = 1.0f;
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,29 @@
|
||||
From 7aa016bca8bb1bf449ea79505692353c0bd174b8 Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Wed, 10 Nov 2021 10:06:50 +0100
|
||||
Subject: [PATCH 074/139] v3dv: expose V3D revision number in device name
|
||||
|
||||
---
|
||||
src/broadcom/vulkan/v3dv_device.c | 6 ++++--
|
||||
1 file changed, 4 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
|
||||
index d29ffad3531..3034b561480 100644
|
||||
--- a/src/broadcom/vulkan/v3dv_device.c
|
||||
+++ b/src/broadcom/vulkan/v3dv_device.c
|
||||
@@ -1123,8 +1123,10 @@ create_physical_device(struct v3dv_instance *instance,
|
||||
device->next_program_id = 0;
|
||||
|
||||
ASSERTED int len =
|
||||
- asprintf(&device->name, "V3D %d.%d",
|
||||
- device->devinfo.ver / 10, device->devinfo.ver % 10);
|
||||
+ asprintf(&device->name, "V3D %d.%d.%d",
|
||||
+ device->devinfo.ver / 10,
|
||||
+ device->devinfo.ver % 10,
|
||||
+ device->devinfo.rev);
|
||||
assert(len != -1);
|
||||
|
||||
v3dv_physical_device_init_disk_cache(device);
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,54 @@
|
||||
From fb9e95b7e1d5987fd25e914635c4e09d81ea9561 Mon Sep 17 00:00:00 2001
|
||||
From: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Date: Wed, 10 Nov 2021 07:54:35 +0100
|
||||
Subject: [PATCH 075/139] v3dv/device: handle new rpi5 device (bcm2712)
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
This includes both master and primary devices.
|
||||
|
||||
Signed-off-by: Iago Toral Quiroga <itoral@igalia.com>
|
||||
Signed-off-by: Alejandro Piñeiro <apinheiro@igalia.com>
|
||||
---
|
||||
src/broadcom/vulkan/v3dv_device.c | 10 +++++++---
|
||||
1 file changed, 7 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
|
||||
index 3034b561480..c8719d33f15 100644
|
||||
--- a/src/broadcom/vulkan/v3dv_device.c
|
||||
+++ b/src/broadcom/vulkan/v3dv_device.c
|
||||
@@ -1287,7 +1287,8 @@ enumerate_devices(struct vk_instance *vk_instance)
|
||||
if (devices[i]->available_nodes & 1 << DRM_NODE_RENDER) {
|
||||
char **compat = devices[i]->deviceinfo.platform->compatible;
|
||||
while (*compat) {
|
||||
- if (strncmp(*compat, "brcm,2711-v3d", 13) == 0) {
|
||||
+ if (strncmp(*compat, "brcm,2711-v3d", 13) == 0 ||
|
||||
+ strncmp(*compat, "brcm,2712-v3d", 13) == 0) {
|
||||
v3d_idx = i;
|
||||
break;
|
||||
}
|
||||
@@ -1296,8 +1297,9 @@ enumerate_devices(struct vk_instance *vk_instance)
|
||||
} else if (devices[i]->available_nodes & 1 << DRM_NODE_PRIMARY) {
|
||||
char **compat = devices[i]->deviceinfo.platform->compatible;
|
||||
while (*compat) {
|
||||
- if (strncmp(*compat, "brcm,bcm2711-vc5", 16) == 0 ||
|
||||
- strncmp(*compat, "brcm,bcm2835-vc4", 16) == 0 ) {
|
||||
+ if (strncmp(*compat, "brcm,bcm2712-vc6", 16) == 0 ||
|
||||
+ strncmp(*compat, "brcm,bcm2711-vc5", 16) == 0 ||
|
||||
+ strncmp(*compat, "brcm,bcm2835-vc4", 16) == 0) {
|
||||
vc4_idx = i;
|
||||
break;
|
||||
}
|
||||
@@ -1334,6 +1336,8 @@ v3dv_physical_device_device_id(struct v3dv_physical_device *dev)
|
||||
switch (dev->devinfo.ver) {
|
||||
case 42:
|
||||
return 0xBE485FD3; /* Broadcom deviceID for 2711 */
|
||||
+ case 71:
|
||||
+ return 0x55701C33; /* Broadcom deviceID for 2712 */
|
||||
default:
|
||||
unreachable("Unsupported V3D version");
|
||||
}
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,32 @@
|
||||
From c4f957af4fb0e10abf0a7ffad4f7a468633b7d99 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Tue, 20 Jul 2021 14:00:44 +0200
|
||||
Subject: [PATCH 076/139] v3dv/cmd_buffer: emit TILE_BINNING_MODE_CFG for v71
|
||||
|
||||
---
|
||||
src/broadcom/vulkan/v3dvx_cmd_buffer.c | 9 ++++++++-
|
||||
1 file changed, 8 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
|
||||
index b958e634c82..17b2f46850d 100644
|
||||
--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
|
||||
+++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
|
||||
@@ -94,7 +94,14 @@ v3dX(job_emit_binning_prolog)(struct v3dv_job *job,
|
||||
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
|
||||
#endif
|
||||
#if V3D_VERSION >= 71
|
||||
- unreachable("HW generation 71 not supported yet.");
|
||||
+ config.log2_tile_width = log2_tile_size(tiling->tile_width);
|
||||
+ config.log2_tile_height = log2_tile_size(tiling->tile_height);
|
||||
+ /* FIXME: ideally we would like next assert on the packet header (as is
|
||||
+ * general, so also applies to GL). We would need to expand
|
||||
+ * gen_pack_header for that.
|
||||
+ */
|
||||
+ assert(config.log2_tile_width == config.log2_tile_height ||
|
||||
+ config.log2_tile_width == config.log2_tile_height + 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,53 @@
|
||||
From 1934ac07df73cb685f6550b8b0f5b4f2ead11396 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Tue, 20 Jul 2021 14:33:00 +0200
|
||||
Subject: [PATCH 077/139] v3dv: emit TILE_RENDERING_MODE_CFG_COMMON for v71
|
||||
|
||||
---
|
||||
src/broadcom/vulkan/v3dvx_cmd_buffer.c | 9 ++++++++-
|
||||
src/broadcom/vulkan/v3dvx_meta_common.c | 9 ++++++++-
|
||||
2 files changed, 16 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
|
||||
index 17b2f46850d..7837b460051 100644
|
||||
--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
|
||||
+++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
|
||||
@@ -850,7 +850,14 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
|
||||
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
|
||||
#endif
|
||||
#if V3D_VERSION >= 71
|
||||
- unreachable("HW generation 71 not supported yet.");
|
||||
+ config.log2_tile_width = log2_tile_size(tiling->tile_width);
|
||||
+ config.log2_tile_height = log2_tile_size(tiling->tile_height);
|
||||
+ /* FIXME: ideallly we would like next assert on the packet header (as is
|
||||
+ * general, so also applies to GL). We would need to expand
|
||||
+ * gen_pack_header for that.
|
||||
+ */
|
||||
+ assert(config.log2_tile_width == config.log2_tile_height ||
|
||||
+ config.log2_tile_width == config.log2_tile_height + 1);
|
||||
#endif
|
||||
|
||||
if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
|
||||
diff --git a/src/broadcom/vulkan/v3dvx_meta_common.c b/src/broadcom/vulkan/v3dvx_meta_common.c
|
||||
index 2db07ea7427..e4084d851fc 100644
|
||||
--- a/src/broadcom/vulkan/v3dvx_meta_common.c
|
||||
+++ b/src/broadcom/vulkan/v3dvx_meta_common.c
|
||||
@@ -62,7 +62,14 @@ emit_rcl_prologue(struct v3dv_job *job,
|
||||
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
|
||||
#endif
|
||||
#if V3D_VERSION >= 71
|
||||
- unreachable("Hardware generation 71 not supported yet.");
|
||||
+ config.log2_tile_width = log2_tile_size(tiling->tile_width);
|
||||
+ config.log2_tile_height = log2_tile_size(tiling->tile_height);
|
||||
+ /* FIXME: ideallly we would like next assert on the packet header (as is
|
||||
+ * general, so also applies to GL). We would need to expand
|
||||
+ * gen_pack_header for that.
|
||||
+ */
|
||||
+ assert(config.log2_tile_width == config.log2_tile_height ||
|
||||
+ config.log2_tile_width == config.log2_tile_height + 1);
|
||||
#endif
|
||||
config.internal_depth_type = fb->internal_depth_type;
|
||||
}
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,315 @@
|
||||
From f0f9eea3cad83ed8824c6a7686150327407a5286 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Thu, 22 Jul 2021 14:26:13 +0200
|
||||
Subject: [PATCH 078/139] v3dv/cmd_buffer: emit
|
||||
TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1 for v71
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Signed-off-by: Alejandro Piñeiro <apinheiro@igalia.com>
|
||||
Signed-off-by: Iago Toral Quiroga <itoral@igalia.com>
|
||||
---
|
||||
src/broadcom/vulkan/v3dvx_cmd_buffer.c | 186 +++++++++++++++++-------
|
||||
src/broadcom/vulkan/v3dvx_meta_common.c | 12 +-
|
||||
src/broadcom/vulkan/v3dvx_private.h | 11 +-
|
||||
3 files changed, 147 insertions(+), 62 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
|
||||
index 7837b460051..c6307890da5 100644
|
||||
--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
|
||||
+++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
|
||||
@@ -800,6 +800,103 @@ set_rcl_early_z_config(struct v3dv_job *job,
|
||||
}
|
||||
}
|
||||
|
||||
+/* Note that for v71, render target cfg packets has just one field that
|
||||
+ * combined the internal type and clamp mode. For simplicity we keep just one
|
||||
+ * helper.
|
||||
+ *
|
||||
+ * Note: rt_type is in fact a "enum V3DX(Internal_Type)".
|
||||
+ *
|
||||
+ * FIXME: for v71 we are not returning all the possible combinations for
|
||||
+ * render target internal type and clamp. For example for int types we are
|
||||
+ * always using clamp int, and for 16f we are using clamp none or pos (that
|
||||
+ * seem the equivalent for no-clamp on 4.2), but not pq or hlg. In summary
|
||||
+ * right now we are just porting what we were doing on 4.2
|
||||
+ */
|
||||
+uint32_t
|
||||
+v3dX(clamp_for_format_and_type)(uint32_t rt_type,
|
||||
+ VkFormat vk_format)
|
||||
+{
|
||||
+#if V3D_VERSION == 42
|
||||
+ if (vk_format_is_int(vk_format))
|
||||
+ return V3D_RENDER_TARGET_CLAMP_INT;
|
||||
+ else if (vk_format_is_srgb(vk_format))
|
||||
+ return V3D_RENDER_TARGET_CLAMP_NORM;
|
||||
+ else
|
||||
+ return V3D_RENDER_TARGET_CLAMP_NONE;
|
||||
+#endif
|
||||
+#if V3D_VERSION >= 71
|
||||
+ switch (rt_type) {
|
||||
+ case V3D_INTERNAL_TYPE_8I:
|
||||
+ return V3D_RENDER_TARGET_TYPE_CLAMP_8I_CLAMPED;
|
||||
+ case V3D_INTERNAL_TYPE_8UI:
|
||||
+ return V3D_RENDER_TARGET_TYPE_CLAMP_8UI_CLAMPED;
|
||||
+ case V3D_INTERNAL_TYPE_8:
|
||||
+ return V3D_RENDER_TARGET_TYPE_CLAMP_8;
|
||||
+ case V3D_INTERNAL_TYPE_16I:
|
||||
+ return V3D_RENDER_TARGET_TYPE_CLAMP_16I_CLAMPED;
|
||||
+ case V3D_INTERNAL_TYPE_16UI:
|
||||
+ return V3D_RENDER_TARGET_TYPE_CLAMP_16UI_CLAMPED;
|
||||
+ case V3D_INTERNAL_TYPE_16F:
|
||||
+ return vk_format_is_srgb(vk_format) ?
|
||||
+ V3D_RENDER_TARGET_TYPE_CLAMP_16F_CLAMP_NORM :
|
||||
+ V3D_RENDER_TARGET_TYPE_CLAMP_16F;
|
||||
+ case V3D_INTERNAL_TYPE_32I:
|
||||
+ return V3D_RENDER_TARGET_TYPE_CLAMP_32I_CLAMPED;
|
||||
+ case V3D_INTERNAL_TYPE_32UI:
|
||||
+ return V3D_RENDER_TARGET_TYPE_CLAMP_32UI_CLAMPED;
|
||||
+ case V3D_INTERNAL_TYPE_32F:
|
||||
+ return V3D_RENDER_TARGET_TYPE_CLAMP_32F;
|
||||
+ default:
|
||||
+ unreachable("Unknown internal render target type");
|
||||
+ }
|
||||
+
|
||||
+ return V3D_RENDER_TARGET_TYPE_CLAMP_INVALID;
|
||||
+#endif
|
||||
+}
|
||||
+
|
||||
+static void
|
||||
+cmd_buffer_render_pass_setup_render_target(struct v3dv_cmd_buffer *cmd_buffer,
|
||||
+ int rt,
|
||||
+ uint32_t *rt_bpp,
|
||||
+#if V3D_VERSION == 42
|
||||
+ uint32_t *rt_type,
|
||||
+ uint32_t *rt_clamp)
|
||||
+#else
|
||||
+ uint32_t *rt_type_clamp)
|
||||
+#endif
|
||||
+{
|
||||
+ const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
|
||||
+
|
||||
+ assert(state->subpass_idx < state->pass->subpass_count);
|
||||
+ const struct v3dv_subpass *subpass =
|
||||
+ &state->pass->subpasses[state->subpass_idx];
|
||||
+
|
||||
+ if (rt >= subpass->color_count)
|
||||
+ return;
|
||||
+
|
||||
+ struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt];
|
||||
+ const uint32_t attachment_idx = attachment->attachment;
|
||||
+ if (attachment_idx == VK_ATTACHMENT_UNUSED)
|
||||
+ return;
|
||||
+
|
||||
+ assert(attachment_idx < state->framebuffer->attachment_count &&
|
||||
+ attachment_idx < state->attachment_alloc_count);
|
||||
+ struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view;
|
||||
+ assert(vk_format_is_color(iview->vk.format));
|
||||
+
|
||||
+ assert(iview->plane_count == 1);
|
||||
+ *rt_bpp = iview->planes[0].internal_bpp;
|
||||
+#if V3D_VERSION == 42
|
||||
+ *rt_type = iview->planes[0].internal_type;
|
||||
+ *rt_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type,
|
||||
+ iview->vk.format);
|
||||
+#endif
|
||||
+#if V3D_VERSION >= 71
|
||||
+ *rt_type_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type,
|
||||
+ iview->vk.format);
|
||||
+#endif
|
||||
+}
|
||||
+
|
||||
void
|
||||
v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
|
||||
{
|
||||
@@ -939,10 +1036,20 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
|
||||
*/
|
||||
job->early_zs_clear = do_early_zs_clear;
|
||||
|
||||
+#if V3D_VERSION >= 71
|
||||
+ uint32_t base_addr = 0;
|
||||
+#endif
|
||||
for (uint32_t i = 0; i < subpass->color_count; i++) {
|
||||
uint32_t attachment_idx = subpass->color_attachments[i].attachment;
|
||||
- if (attachment_idx == VK_ATTACHMENT_UNUSED)
|
||||
+ if (attachment_idx == VK_ATTACHMENT_UNUSED) {
|
||||
+#if V3D_VERSION >= 71
|
||||
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
|
||||
+ rt.render_target_number = i;
|
||||
+ rt.stride = 1; /* Unused */
|
||||
+ }
|
||||
+#endif
|
||||
continue;
|
||||
+ }
|
||||
|
||||
struct v3dv_image_view *iview =
|
||||
state->attachments[attachment_idx].image_view;
|
||||
@@ -978,9 +1085,6 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
|
||||
clear.render_target_number = i;
|
||||
};
|
||||
#endif
|
||||
-#if V3D_VERSION >= 71
|
||||
- unreachable("HW generation 71 not supported yet.");
|
||||
-#endif
|
||||
|
||||
if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) {
|
||||
#if V3D_VERSION == 42
|
||||
@@ -1010,27 +1114,44 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
|
||||
unreachable("HW generation 71 not supported yet.");
|
||||
#endif
|
||||
}
|
||||
+
|
||||
+#if V3D_VERSION >= 71
|
||||
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
|
||||
+ rt.clear_color_low_bits = clear_color[0];
|
||||
+ cmd_buffer_render_pass_setup_render_target(cmd_buffer, i, &rt.internal_bpp,
|
||||
+ &rt.internal_type_and_clamping);
|
||||
+ rt.stride =
|
||||
+ v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width,
|
||||
+ v3d_internal_bpp_words(rt.internal_bpp));
|
||||
+ rt.base_address = base_addr;
|
||||
+ rt.render_target_number = i;
|
||||
+
|
||||
+ /* base_addr in multiples of 512 bits. We divide by 8 because stride
|
||||
+ * is in 128-bit units, but it is packing 2 rows worth of data, so we
|
||||
+ * need to divide it by 2 so it is only 1 row, and then again by 4 so
|
||||
+ * it is in 512-bit units.
|
||||
+ */
|
||||
+ base_addr += (tiling->tile_height * rt.stride) / 8;
|
||||
+ }
|
||||
+#endif
|
||||
}
|
||||
|
||||
#if V3D_VERSION == 42
|
||||
cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
|
||||
- v3dX(cmd_buffer_render_pass_setup_render_target)
|
||||
+ cmd_buffer_render_pass_setup_render_target
|
||||
(cmd_buffer, 0, &rt.render_target_0_internal_bpp,
|
||||
&rt.render_target_0_internal_type, &rt.render_target_0_clamp);
|
||||
- v3dX(cmd_buffer_render_pass_setup_render_target)
|
||||
+ cmd_buffer_render_pass_setup_render_target
|
||||
(cmd_buffer, 1, &rt.render_target_1_internal_bpp,
|
||||
&rt.render_target_1_internal_type, &rt.render_target_1_clamp);
|
||||
- v3dX(cmd_buffer_render_pass_setup_render_target)
|
||||
+ cmd_buffer_render_pass_setup_render_target
|
||||
(cmd_buffer, 2, &rt.render_target_2_internal_bpp,
|
||||
&rt.render_target_2_internal_type, &rt.render_target_2_clamp);
|
||||
- v3dX(cmd_buffer_render_pass_setup_render_target)
|
||||
+ cmd_buffer_render_pass_setup_render_target
|
||||
(cmd_buffer, 3, &rt.render_target_3_internal_bpp,
|
||||
&rt.render_target_3_internal_type, &rt.render_target_3_clamp);
|
||||
}
|
||||
#endif
|
||||
-#if V3D_VERSION >= 71
|
||||
- unreachable("Hardware generation 71 not supported yet.");
|
||||
-#endif
|
||||
|
||||
/* Ends rendering mode config. */
|
||||
if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
|
||||
@@ -2445,46 +2566,3 @@ v3dX(cmd_buffer_emit_indexed_indirect)(struct v3dv_cmd_buffer *cmd_buffer,
|
||||
buffer->mem_offset + offset);
|
||||
}
|
||||
}
|
||||
-
|
||||
-void
|
||||
-v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer,
|
||||
- int rt,
|
||||
- uint32_t *rt_bpp,
|
||||
- uint32_t *rt_type,
|
||||
- uint32_t *rt_clamp)
|
||||
-{
|
||||
- const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
|
||||
-
|
||||
- assert(state->subpass_idx < state->pass->subpass_count);
|
||||
- const struct v3dv_subpass *subpass =
|
||||
- &state->pass->subpasses[state->subpass_idx];
|
||||
-
|
||||
- if (rt >= subpass->color_count)
|
||||
- return;
|
||||
-
|
||||
- struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt];
|
||||
- const uint32_t attachment_idx = attachment->attachment;
|
||||
- if (attachment_idx == VK_ATTACHMENT_UNUSED)
|
||||
- return;
|
||||
-
|
||||
- assert(attachment_idx < state->framebuffer->attachment_count &&
|
||||
- attachment_idx < state->attachment_alloc_count);
|
||||
- struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view;
|
||||
- assert(vk_format_is_color(iview->vk.format));
|
||||
-
|
||||
- assert(iview->plane_count == 1);
|
||||
- *rt_bpp = iview->planes[0].internal_bpp;
|
||||
- if (vk_format_is_int(iview->vk.view_format))
|
||||
-#if V3D_VERSION == 42
|
||||
- *rt_type = iview->planes[0].internal_type;
|
||||
- if (vk_format_is_int(iview->vk.format))
|
||||
- *rt_clamp = V3D_RENDER_TARGET_CLAMP_INT;
|
||||
- else if (vk_format_is_srgb(iview->vk.view_format))
|
||||
- *rt_clamp = V3D_RENDER_TARGET_CLAMP_NORM;
|
||||
- else
|
||||
- *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
|
||||
-#endif
|
||||
-#if V3D_VERSION >= 71
|
||||
- unreachable("HW generation 71 not supported yet.");
|
||||
-#endif
|
||||
-}
|
||||
diff --git a/src/broadcom/vulkan/v3dvx_meta_common.c b/src/broadcom/vulkan/v3dvx_meta_common.c
|
||||
index e4084d851fc..c6391bc6d83 100644
|
||||
--- a/src/broadcom/vulkan/v3dvx_meta_common.c
|
||||
+++ b/src/broadcom/vulkan/v3dvx_meta_common.c
|
||||
@@ -26,6 +26,7 @@
|
||||
|
||||
#include "broadcom/common/v3d_macros.h"
|
||||
#include "broadcom/common/v3d_tfu.h"
|
||||
+#include "broadcom/common/v3d_util.h"
|
||||
#include "broadcom/cle/v3dx_pack.h"
|
||||
#include "broadcom/compiler/v3d_compiler.h"
|
||||
|
||||
@@ -150,7 +151,16 @@ emit_rcl_prologue(struct v3dv_job *job,
|
||||
}
|
||||
#endif
|
||||
#if V3D_VERSION >= 71
|
||||
- unreachable("Hardware generation 71 not supported yet.");
|
||||
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
|
||||
+ rt.internal_bpp = tiling->internal_bpp;
|
||||
+ rt.internal_type_and_clamping = v3dX(clamp_for_format_and_type)(fb->internal_type,
|
||||
+ fb->vk_format);
|
||||
+ rt.stride =
|
||||
+ v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width,
|
||||
+ v3d_internal_bpp_words(rt.internal_bpp));
|
||||
+ rt.base_address = 0;
|
||||
+ rt.render_target_number = 0;
|
||||
+ }
|
||||
#endif
|
||||
|
||||
cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
|
||||
diff --git a/src/broadcom/vulkan/v3dvx_private.h b/src/broadcom/vulkan/v3dvx_private.h
|
||||
index ad8ddfa5731..a4157d11c7c 100644
|
||||
--- a/src/broadcom/vulkan/v3dvx_private.h
|
||||
+++ b/src/broadcom/vulkan/v3dvx_private.h
|
||||
@@ -125,13 +125,6 @@ v3dX(get_hw_clear_color)(const VkClearColorValue *color,
|
||||
uint32_t internal_size,
|
||||
uint32_t *hw_color);
|
||||
|
||||
-void
|
||||
-v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer,
|
||||
- int rt,
|
||||
- uint32_t *rt_bpp,
|
||||
- uint32_t *rt_type,
|
||||
- uint32_t *rt_clamp);
|
||||
-
|
||||
/* Used at v3dv_device */
|
||||
|
||||
void
|
||||
@@ -325,3 +318,7 @@ uint32_t v3dX(max_descriptor_bo_size)(void);
|
||||
uint32_t v3dX(combined_image_sampler_texture_state_offset)(uint8_t plane);
|
||||
|
||||
uint32_t v3dX(combined_image_sampler_sampler_state_offset)(uint8_t plane);
|
||||
+
|
||||
+uint32_t
|
||||
+v3dX(clamp_for_format_and_type)(uint32_t rt_type,
|
||||
+ VkFormat vk_format);
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,25 @@
|
||||
From 7c89d8026fd550282d54933f37ffc2773869326f Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Mon, 26 Jul 2021 15:08:11 +0200
|
||||
Subject: [PATCH 079/139] v3dvx/cmd_buffer: emit CLEAR_RENDER_TARGETS for v71
|
||||
|
||||
---
|
||||
src/broadcom/vulkan/v3dvx_cmd_buffer.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
|
||||
index c6307890da5..ae1c21ae00b 100644
|
||||
--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
|
||||
+++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
|
||||
@@ -1219,7 +1219,7 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
|
||||
}
|
||||
#endif
|
||||
#if V3D_VERSION >= 71
|
||||
- unreachable("HW generation 71 not supported yet.");
|
||||
+ cl_emit(rcl, CLEAR_RENDER_TARGETS, clear_rt);
|
||||
#endif
|
||||
}
|
||||
cl_emit(rcl, END_OF_TILE_MARKER, end);
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,38 @@
|
||||
From 2eb29b57fde2acda76e12953b3a1050f3056b39d Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Sun, 19 Sep 2021 23:37:32 +0200
|
||||
Subject: [PATCH 080/139] v3dv/cmd_buffer: emit CLIPPER_XY_SCALING for v71
|
||||
|
||||
---
|
||||
src/broadcom/vulkan/v3dvx_cmd_buffer.c | 7 ++++---
|
||||
1 file changed, 4 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
|
||||
index ae1c21ae00b..2e525a11619 100644
|
||||
--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
|
||||
+++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
|
||||
@@ -1246,9 +1246,7 @@ v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
|
||||
* now, would need to change if we allow multiple viewports
|
||||
*/
|
||||
float *vptranslate = dynamic->viewport.translate[0];
|
||||
-#if V3D_VERSION == 42
|
||||
float *vpscale = dynamic->viewport.scale[0];
|
||||
-#endif
|
||||
|
||||
struct v3dv_job *job = cmd_buffer->state.job;
|
||||
assert(job);
|
||||
@@ -1268,7 +1266,10 @@ v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
|
||||
}
|
||||
#endif
|
||||
#if V3D_VERSION >= 71
|
||||
- unreachable("HW generation 71 not supported yet.");
|
||||
+ cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
|
||||
+ clip.viewport_half_width_in_1_64th_of_pixel = vpscale[0] * 64.0f;
|
||||
+ clip.viewport_half_height_in_1_64th_of_pixel = vpscale[1] * 64.0f;
|
||||
+ }
|
||||
#endif
|
||||
|
||||
float translate_z, scale_z;
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,97 @@
|
||||
From 611bf6a7445837c7e20416ff9f11a6dad9c543d7 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Tue, 14 Sep 2021 10:08:19 +0200
|
||||
Subject: [PATCH 081/139] v3dv/uniforms: update VIEWPORT_X/Y_SCALE uniforms for
|
||||
v71
|
||||
|
||||
As with the CLIPPER_XY_SCALING packet, this needs to be computed in 1/64ths
|
||||
of a pixel, instead of 1/256ths of a pixel.
|
||||
|
||||
As these are the kind of values we usually get from macros, we manually add
|
||||
a v42 and a v71 macro, and define a new helper (V3DV_X) to get the value
|
||||
for the current hw version.
|
||||
---
|
||||
src/broadcom/vulkan/v3dv_private.h | 17 +++++++++++++++++
|
||||
src/broadcom/vulkan/v3dv_uniforms.c | 7 ++++---
|
||||
src/broadcom/vulkan/v3dvx_private.h | 9 +++++++++
|
||||
3 files changed, 30 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
|
||||
index 6bdf338c67b..cd6811b19c2 100644
|
||||
--- a/src/broadcom/vulkan/v3dv_private.h
|
||||
+++ b/src/broadcom/vulkan/v3dv_private.h
|
||||
@@ -2617,6 +2617,23 @@ u64_compare(const void *key1, const void *key2)
|
||||
v3d_X_thing; \
|
||||
})
|
||||
|
||||
+/* Helper to get hw-specific macro values */
|
||||
+#define V3DV_X(device, thing) ({ \
|
||||
+ __typeof(V3D42_##thing) V3D_X_THING; \
|
||||
+ switch (device->devinfo.ver) { \
|
||||
+ case 42: \
|
||||
+ V3D_X_THING = V3D42_##thing; \
|
||||
+ break; \
|
||||
+ case 71: \
|
||||
+ V3D_X_THING = V3D71_##thing; \
|
||||
+ break; \
|
||||
+ default: \
|
||||
+ unreachable("Unsupported hardware generation"); \
|
||||
+ } \
|
||||
+ V3D_X_THING; \
|
||||
+})
|
||||
+
|
||||
+
|
||||
|
||||
/* v3d_macros from common requires v3dX and V3DX definitions. Below we need to
|
||||
* define v3dX for each version supported, because when we compile code that
|
||||
diff --git a/src/broadcom/vulkan/v3dv_uniforms.c b/src/broadcom/vulkan/v3dv_uniforms.c
|
||||
index 72fa9a1b39c..0e681cc4ee2 100644
|
||||
--- a/src/broadcom/vulkan/v3dv_uniforms.c
|
||||
+++ b/src/broadcom/vulkan/v3dv_uniforms.c
|
||||
@@ -497,7 +497,8 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer,
|
||||
struct v3dv_cl_reloc uniform_stream = v3dv_cl_get_address(&job->indirect);
|
||||
|
||||
struct v3dv_cl_out *uniforms = cl_start(&job->indirect);
|
||||
-
|
||||
+ float clipper_xy_granularity =
|
||||
+ V3DV_X(cmd_buffer->device, CLIPPER_XY_GRANULARITY);
|
||||
for (int i = 0; i < uinfo->count; i++) {
|
||||
uint32_t data = uinfo->data[i];
|
||||
|
||||
@@ -520,11 +521,11 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer,
|
||||
break;
|
||||
|
||||
case QUNIFORM_VIEWPORT_X_SCALE:
|
||||
- cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * 256.0f);
|
||||
+ cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * clipper_xy_granularity);
|
||||
break;
|
||||
|
||||
case QUNIFORM_VIEWPORT_Y_SCALE:
|
||||
- cl_aligned_f(&uniforms, dynamic->viewport.scale[0][1] * 256.0f);
|
||||
+ cl_aligned_f(&uniforms, dynamic->viewport.scale[0][1] * clipper_xy_granularity);
|
||||
break;
|
||||
|
||||
case QUNIFORM_VIEWPORT_Z_OFFSET: {
|
||||
diff --git a/src/broadcom/vulkan/v3dvx_private.h b/src/broadcom/vulkan/v3dvx_private.h
|
||||
index a4157d11c7c..ff9ba75cf93 100644
|
||||
--- a/src/broadcom/vulkan/v3dvx_private.h
|
||||
+++ b/src/broadcom/vulkan/v3dvx_private.h
|
||||
@@ -319,6 +319,15 @@ uint32_t v3dX(combined_image_sampler_texture_state_offset)(uint8_t plane);
|
||||
|
||||
uint32_t v3dX(combined_image_sampler_sampler_state_offset)(uint8_t plane);
|
||||
|
||||
+/* General utils */
|
||||
+
|
||||
+uint32_t
|
||||
+v3dX(clamp_for_format_and_type)(uint32_t rt_type,
|
||||
+ VkFormat vk_format);
|
||||
+
|
||||
+#define V3D42_CLIPPER_XY_GRANULARITY 256.0f
|
||||
+#define V3D71_CLIPPER_XY_GRANULARITY 64.0f
|
||||
+
|
||||
uint32_t
|
||||
v3dX(clamp_for_format_and_type)(uint32_t rt_type,
|
||||
VkFormat vk_format);
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,40 @@
|
||||
From 3819efaf2bb6fd8bd9cd45d54fb7254377b2296a Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Tue, 27 Jul 2021 14:02:30 +0200
|
||||
Subject: [PATCH 082/139] v3dv/cmd_buffer: just don't fill up early-z fields
|
||||
for CFG_BITS for v71
|
||||
|
||||
For v71 early_z_enable/early_z_updates_enable is configured with
|
||||
packet 121.
|
||||
---
|
||||
src/broadcom/vulkan/v3dvx_cmd_buffer.c | 9 +++------
|
||||
1 file changed, 3 insertions(+), 6 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
|
||||
index 2e525a11619..fe9f7e43596 100644
|
||||
--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
|
||||
+++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
|
||||
@@ -1783,17 +1783,14 @@ v3dX(cmd_buffer_emit_configuration_bits)(struct v3dv_cmd_buffer *cmd_buffer)
|
||||
v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CFG_BITS));
|
||||
v3dv_return_if_oom(cmd_buffer, NULL);
|
||||
|
||||
-#if V3D_VERSION == 42
|
||||
- bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer);
|
||||
cl_emit_with_prepacked(&job->bcl, CFG_BITS, pipeline->cfg_bits, config) {
|
||||
+#if V3D_VERSION == 42
|
||||
+ bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer);
|
||||
config.early_z_enable = enable_ez;
|
||||
config.early_z_updates_enable = config.early_z_enable &&
|
||||
pipeline->z_updates_enable;
|
||||
- }
|
||||
-#endif
|
||||
-#if V3D_VERSION >= 71
|
||||
- unreachable("HW generation 71 not supported yet.");
|
||||
#endif
|
||||
+ }
|
||||
}
|
||||
|
||||
void
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,219 @@
|
||||
From e3b1a578f45ea830d790970115b6de978d56edb8 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Wed, 28 Jul 2021 12:01:38 +0200
|
||||
Subject: [PATCH 083/139] v3dv: default vertex attribute values are gen
|
||||
dependent
|
||||
|
||||
Content, structure and size depend on the generation, as does whether it
|
||||
is needed at all.
|
||||
|
||||
So let's move it to the v3dvx files.
|
||||
---
|
||||
src/broadcom/vulkan/v3dv_device.c | 2 +-
|
||||
src/broadcom/vulkan/v3dv_pipeline.c | 61 ++-------------------------
|
||||
src/broadcom/vulkan/v3dv_private.h | 4 --
|
||||
src/broadcom/vulkan/v3dvx_pipeline.c | 63 ++++++++++++++++++++++++++++
|
||||
src/broadcom/vulkan/v3dvx_private.h | 8 ++++
|
||||
5 files changed, 75 insertions(+), 63 deletions(-)
|
||||
|
||||
diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
|
||||
index c8719d33f15..01e2dd7ac2d 100644
|
||||
--- a/src/broadcom/vulkan/v3dv_device.c
|
||||
+++ b/src/broadcom/vulkan/v3dv_device.c
|
||||
@@ -2043,7 +2043,7 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice,
|
||||
v3dv_pipeline_cache_init(&device->default_pipeline_cache, device, 0,
|
||||
device->instance->default_pipeline_cache_enabled);
|
||||
device->default_attribute_float =
|
||||
- v3dv_pipeline_create_default_attribute_values(device, NULL);
|
||||
+ v3dv_X(device, create_default_attribute_values)(device, NULL);
|
||||
|
||||
device->device_address_mem_ctx = ralloc_context(NULL);
|
||||
util_dynarray_init(&device->device_address_bo_list,
|
||||
diff --git a/src/broadcom/vulkan/v3dv_pipeline.c b/src/broadcom/vulkan/v3dv_pipeline.c
|
||||
index 22f01bdf64b..d012ff8f948 100644
|
||||
--- a/src/broadcom/vulkan/v3dv_pipeline.c
|
||||
+++ b/src/broadcom/vulkan/v3dv_pipeline.c
|
||||
@@ -2802,62 +2802,6 @@ pipeline_set_ez_state(struct v3dv_pipeline *pipeline,
|
||||
}
|
||||
}
|
||||
|
||||
-static bool
|
||||
-pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline)
|
||||
-{
|
||||
- for (uint8_t i = 0; i < pipeline->va_count; i++) {
|
||||
- if (vk_format_is_int(pipeline->va[i].vk_format))
|
||||
- return true;
|
||||
- }
|
||||
- return false;
|
||||
-}
|
||||
-
|
||||
-/* @pipeline can be NULL. We assume in that case that all the attributes have
|
||||
- * a float format (we only create an all-float BO once and we reuse it with
|
||||
- * all float pipelines), otherwise we look at the actual type of each
|
||||
- * attribute used with the specific pipeline passed in.
|
||||
- */
|
||||
-struct v3dv_bo *
|
||||
-v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device,
|
||||
- struct v3dv_pipeline *pipeline)
|
||||
-{
|
||||
- uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4;
|
||||
- struct v3dv_bo *bo;
|
||||
-
|
||||
- bo = v3dv_bo_alloc(device, size, "default_vi_attributes", true);
|
||||
-
|
||||
- if (!bo) {
|
||||
- fprintf(stderr, "failed to allocate memory for the default "
|
||||
- "attribute values\n");
|
||||
- return NULL;
|
||||
- }
|
||||
-
|
||||
- bool ok = v3dv_bo_map(device, bo, size);
|
||||
- if (!ok) {
|
||||
- fprintf(stderr, "failed to map default attribute values buffer\n");
|
||||
- return false;
|
||||
- }
|
||||
-
|
||||
- uint32_t *attrs = bo->map;
|
||||
- uint8_t va_count = pipeline != NULL ? pipeline->va_count : 0;
|
||||
- for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) {
|
||||
- attrs[i * 4 + 0] = 0;
|
||||
- attrs[i * 4 + 1] = 0;
|
||||
- attrs[i * 4 + 2] = 0;
|
||||
- VkFormat attr_format =
|
||||
- pipeline != NULL ? pipeline->va[i].vk_format : VK_FORMAT_UNDEFINED;
|
||||
- if (i < va_count && vk_format_is_int(attr_format)) {
|
||||
- attrs[i * 4 + 3] = 1;
|
||||
- } else {
|
||||
- attrs[i * 4 + 3] = fui(1.0);
|
||||
- }
|
||||
- }
|
||||
-
|
||||
- v3dv_bo_unmap(device, bo);
|
||||
-
|
||||
- return bo;
|
||||
-}
|
||||
-
|
||||
static void
|
||||
pipeline_set_sample_mask(struct v3dv_pipeline *pipeline,
|
||||
const VkPipelineMultisampleStateCreateInfo *ms_info)
|
||||
@@ -2992,9 +2936,10 @@ pipeline_init(struct v3dv_pipeline *pipeline,
|
||||
|
||||
v3dv_X(device, pipeline_pack_compile_state)(pipeline, vi_info, vd_info);
|
||||
|
||||
- if (pipeline_has_integer_vertex_attrib(pipeline)) {
|
||||
+ if (v3dv_X(device, pipeline_needs_default_attribute_values)(pipeline)) {
|
||||
pipeline->default_attribute_values =
|
||||
- v3dv_pipeline_create_default_attribute_values(pipeline->device, pipeline);
|
||||
+ v3dv_X(pipeline->device, create_default_attribute_values)(pipeline->device, pipeline);
|
||||
+
|
||||
if (!pipeline->default_attribute_values)
|
||||
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
|
||||
} else {
|
||||
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
|
||||
index cd6811b19c2..a9fab24d19e 100644
|
||||
--- a/src/broadcom/vulkan/v3dv_private.h
|
||||
+++ b/src/broadcom/vulkan/v3dv_private.h
|
||||
@@ -2500,10 +2500,6 @@ void
|
||||
v3dv_pipeline_cache_upload_pipeline(struct v3dv_pipeline *pipeline,
|
||||
struct v3dv_pipeline_cache *cache);
|
||||
|
||||
-struct v3dv_bo *
|
||||
-v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device,
|
||||
- struct v3dv_pipeline *pipeline);
|
||||
-
|
||||
VkResult
|
||||
v3dv_create_compute_pipeline_from_nir(struct v3dv_device *device,
|
||||
nir_shader *nir,
|
||||
diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c
|
||||
index 922698b08a2..e235220cb14 100644
|
||||
--- a/src/broadcom/vulkan/v3dvx_pipeline.c
|
||||
+++ b/src/broadcom/vulkan/v3dvx_pipeline.c
|
||||
@@ -664,3 +664,66 @@ v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline,
|
||||
}
|
||||
}
|
||||
}
|
||||
+
|
||||
+static bool
|
||||
+pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline)
|
||||
+{
|
||||
+ for (uint8_t i = 0; i < pipeline->va_count; i++) {
|
||||
+ if (vk_format_is_int(pipeline->va[i].vk_format))
|
||||
+ return true;
|
||||
+ }
|
||||
+ return false;
|
||||
+}
|
||||
+
|
||||
+bool
|
||||
+v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline)
|
||||
+{
|
||||
+ return pipeline_has_integer_vertex_attrib(pipeline);
|
||||
+}
|
||||
+
|
||||
+/* @pipeline can be NULL. In that case we assume the most common case. For
|
||||
+ * example, for v42 we assume in that case that all the attributes have a
|
||||
+ * float format (we only create an all-float BO once and we reuse it with all
|
||||
+ * float pipelines), otherwise we look at the actual type of each attribute
|
||||
+ * used with the specific pipeline passed in.
|
||||
+ */
|
||||
+struct v3dv_bo *
|
||||
+v3dX(create_default_attribute_values)(struct v3dv_device *device,
|
||||
+ struct v3dv_pipeline *pipeline)
|
||||
+{
|
||||
+ uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4;
|
||||
+ struct v3dv_bo *bo;
|
||||
+
|
||||
+ bo = v3dv_bo_alloc(device, size, "default_vi_attributes", true);
|
||||
+
|
||||
+ if (!bo) {
|
||||
+ fprintf(stderr, "failed to allocate memory for the default "
|
||||
+ "attribute values\n");
|
||||
+ return NULL;
|
||||
+ }
|
||||
+
|
||||
+ bool ok = v3dv_bo_map(device, bo, size);
|
||||
+ if (!ok) {
|
||||
+ fprintf(stderr, "failed to map default attribute values buffer\n");
|
||||
+ return NULL;
|
||||
+ }
|
||||
+
|
||||
+ uint32_t *attrs = bo->map;
|
||||
+ uint8_t va_count = pipeline != NULL ? pipeline->va_count : 0;
|
||||
+ for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) {
|
||||
+ attrs[i * 4 + 0] = 0;
|
||||
+ attrs[i * 4 + 1] = 0;
|
||||
+ attrs[i * 4 + 2] = 0;
|
||||
+ VkFormat attr_format =
|
||||
+ pipeline != NULL ? pipeline->va[i].vk_format : VK_FORMAT_UNDEFINED;
|
||||
+ if (i < va_count && vk_format_is_int(attr_format)) {
|
||||
+ attrs[i * 4 + 3] = 1;
|
||||
+ } else {
|
||||
+ attrs[i * 4 + 3] = fui(1.0);
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ v3dv_bo_unmap(device, bo);
|
||||
+
|
||||
+ return bo;
|
||||
+}
|
||||
diff --git a/src/broadcom/vulkan/v3dvx_private.h b/src/broadcom/vulkan/v3dvx_private.h
|
||||
index ff9ba75cf93..036ce11b455 100644
|
||||
--- a/src/broadcom/vulkan/v3dvx_private.h
|
||||
+++ b/src/broadcom/vulkan/v3dvx_private.h
|
||||
@@ -306,6 +306,14 @@ void
|
||||
v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline,
|
||||
const VkPipelineVertexInputStateCreateInfo *vi_info,
|
||||
const VkPipelineVertexInputDivisorStateCreateInfoEXT *vd_info);
|
||||
+
|
||||
+bool
|
||||
+v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline);
|
||||
+
|
||||
+struct v3dv_bo *
|
||||
+v3dX(create_default_attribute_values)(struct v3dv_device *device,
|
||||
+ struct v3dv_pipeline *pipeline);
|
||||
+
|
||||
/* Used at v3dv_queue */
|
||||
void
|
||||
v3dX(job_emit_noop)(struct v3dv_job *job);
|
||||
--
|
||||
2.39.2
|
||||
|
@ -0,0 +1,87 @@
|
||||
From 8464dc8869f3d2eccfecac7b4358cc0ffe05f081 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
|
||||
Date: Wed, 28 Jul 2021 12:05:26 +0200
|
||||
Subject: [PATCH 084/139] v3dv/pipeline: default vertex attribute values are
|
||||
not needed for v71
|
||||
|
||||
They are not part of the shader state record.
|
||||
---
|
||||
src/broadcom/vulkan/v3dv_private.h | 10 +++++++++-
|
||||
src/broadcom/vulkan/v3dvx_pipeline.c | 10 ++++++++++
|
||||
2 files changed, 19 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
|
||||
index a9fab24d19e..300a1ec8ae1 100644
|
||||
--- a/src/broadcom/vulkan/v3dv_private.h
|
||||
+++ b/src/broadcom/vulkan/v3dv_private.h
|
||||
@@ -581,6 +581,10 @@ struct v3dv_device {
|
||||
* being float being float, allowing us to reuse the same BO for all
|
||||
* pipelines matching this requirement. Pipelines that need integer
|
||||
* attributes will create their own BO.
|
||||
+ *
|
||||
+ * Note that since v71 the default attribute values are not needed, so this
|
||||
+ * can be NULL.
|
||||
+ *
|
||||
*/
|
||||
struct v3dv_bo *default_attribute_float;
|
||||
|
||||
@@ -2289,11 +2293,15 @@ struct v3dv_pipeline {
|
||||
unsigned char sha1[20];
|
||||
|
||||
/* In general we can reuse v3dv_device->default_attribute_float, so note
|
||||
- * that the following can be NULL.
|
||||
+ * that the following can be NULL. In 7.x this is not used, so it will be
|
||||
+ * NULL.
|
||||
*
|
||||
* FIXME: the content of this BO will be small, so it could be improved to
|
||||
* be uploaded to a common BO. But as in most cases it will be NULL, it is
|
||||
* not a priority.
|
||||
+ *
|
||||
+ * Note that since v71 the default attribute values are not needed, so this
|
||||
+ * can be NULL.
|
||||
*/
|
||||
struct v3dv_bo *default_attribute_values;
|
||||
|
||||
diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c
|
||||
index e235220cb14..4dc6d70efe1 100644
|
||||
--- a/src/broadcom/vulkan/v3dvx_pipeline.c
|
||||
+++ b/src/broadcom/vulkan/v3dvx_pipeline.c
|
||||
@@ -665,6 +665,7 @@ v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline,
|
||||
}
|
||||
}
|
||||
|
||||
+#if V3D_VERSION == 42
|
||||
static bool
|
||||
pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline)
|
||||
{
|
||||
@@ -674,11 +675,16 @@ pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline)
|
||||
}
|
||||
return false;
|
||||
}
|
||||
+#endif
|
||||
|
||||
bool
|
||||
v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline)
|
||||
{
|
||||
+#if V3D_VERSION == 42
|
||||
return pipeline_has_integer_vertex_attrib(pipeline);
|
||||
+#endif
|
||||
+
|
||||
+ return false;
|
||||
}
|
||||
|
||||
/* @pipeline can be NULL. In that case we assume the most common case. For
|
||||
@@ -691,6 +697,10 @@ struct v3dv_bo *
|
||||
v3dX(create_default_attribute_values)(struct v3dv_device *device,
|
||||
struct v3dv_pipeline *pipeline)
|
||||
{
|
||||
+#if V3D_VERSION >= 71
|
||||
+ return NULL;
|
||||
+#endif
|
||||
+
|
||||
uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4;
|
||||
struct v3dv_bo *bo;
|
||||
|
||||
--
|
||||
2.39.2
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user