Merge pull request #8256 from heitbaum/mesa2330

Mesa update to 23.3.0
This commit is contained in:
Christian Hewitt 2023-11-30 08:16:48 +04:00 committed by GitHub
commit 80f59f03b9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
143 changed files with 127 additions and 17779 deletions

View File

@ -3,8 +3,8 @@
# Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv)
PKG_NAME="mesa"
PKG_VERSION="23.2.1"
PKG_SHA256="64de0616fc2d801f929ab1ac2a4f16b3e2783c4309a724c8a259b20df8bbc1cc"
PKG_VERSION="23.3.0"
PKG_SHA256="50f729dd60ed6335b989095baad81ef5edf7cfdd4b4b48b9b955917cb07d69c5"
PKG_LICENSE="OSS"
PKG_SITE="http://www.mesa3d.org/"
PKG_URL="https://mesa.freedesktop.org/archive/mesa-${PKG_VERSION}.tar.xz"
@ -13,6 +13,10 @@ PKG_LONGDESC="Mesa is a 3-D graphics library with an API."
get_graphicdrivers
if [ "${DEVICE}" = "Dragonboard" ]; then
PKG_DEPENDS_TARGET+=" libarchive libxml2 lua54"
fi
PKG_MESON_OPTS_TARGET="-Dgallium-drivers=${GALLIUM_DRIVERS// /,} \
-Dgallium-extra-hud=false \
-Dgallium-omx=disabled \

View File

@ -1,332 +0,0 @@
From f62aa2640f92796ff5216da0a5d3c8f46a2855b4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Mon, 26 Apr 2021 00:02:21 +0200
Subject: [PATCH 001/142] broadcom(cle,clif,common,simulator): add 7.1 version
on the list of versions to build
This adds 7.1 to the list of available V3D_VERSION, and first changes
on the simulator needed to get it working.
Note that we needed to touch all four of those codebases because that is
required in order to use V3D_DEBUG=clif with the simulator, which is
the easiest way to see which packets a Vulkan program is using.
About the simulator, this commit only handles the renaming of some
registers. Any additional changes needed to get proper support for
v71 will be handled in following commits.
---
src/broadcom/cle/meson.build | 3 +-
src/broadcom/cle/v3dx_pack.h | 2 +
src/broadcom/clif/clif_private.h | 2 +
src/broadcom/common/v3d_device_info.c | 1 +
src/broadcom/common/v3d_macros.h | 3 +
src/broadcom/meson.build | 2 +-
src/broadcom/simulator/v3d_simulator.c | 81 +++++++++++++++++++------
src/broadcom/simulator/v3d_simulator.h | 5 ++
src/broadcom/simulator/v3dx_simulator.c | 31 ++++++++--
9 files changed, 106 insertions(+), 24 deletions(-)
diff --git a/src/broadcom/cle/meson.build b/src/broadcom/cle/meson.build
index 31a0d5bfa94..8ac32b313e4 100644
--- a/src/broadcom/cle/meson.build
+++ b/src/broadcom/cle/meson.build
@@ -23,7 +23,8 @@ v3d_versions = [
[21, 21],
[33, 33],
[41, 33],
- [42, 33]
+ [42, 33],
+ [71, 33]
]
v3d_xml_files = []
diff --git a/src/broadcom/cle/v3dx_pack.h b/src/broadcom/cle/v3dx_pack.h
index 5762e5aaa70..e5a1eb26698 100644
--- a/src/broadcom/cle/v3dx_pack.h
+++ b/src/broadcom/cle/v3dx_pack.h
@@ -37,6 +37,8 @@
# include "cle/v3d_packet_v41_pack.h"
#elif (V3D_VERSION == 42)
# include "cle/v3d_packet_v42_pack.h"
+#elif (V3D_VERSION == 71)
+# include "cle/v3d_packet_v71_pack.h"
#else
# error "Need to add a pack header include for this v3d version"
#endif
diff --git a/src/broadcom/clif/clif_private.h b/src/broadcom/clif/clif_private.h
index 6ace62b0310..cda407a00bf 100644
--- a/src/broadcom/clif/clif_private.h
+++ b/src/broadcom/clif/clif_private.h
@@ -101,6 +101,8 @@ bool v3d41_clif_dump_packet(struct clif_dump *clif, uint32_t offset,
const uint8_t *cl, uint32_t *size, bool reloc_mode);
bool v3d42_clif_dump_packet(struct clif_dump *clif, uint32_t offset,
const uint8_t *cl, uint32_t *size, bool reloc_mode);
+bool v3d71_clif_dump_packet(struct clif_dump *clif, uint32_t offset,
+ const uint8_t *cl, uint32_t *size, bool reloc_mode);
static inline void
out(struct clif_dump *clif, const char *fmt, ...)
diff --git a/src/broadcom/common/v3d_device_info.c b/src/broadcom/common/v3d_device_info.c
index 272190eb2e5..7e0862f1f02 100644
--- a/src/broadcom/common/v3d_device_info.c
+++ b/src/broadcom/common/v3d_device_info.c
@@ -66,6 +66,7 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i
case 33:
case 41:
case 42:
+ case 71:
break;
default:
fprintf(stderr,
diff --git a/src/broadcom/common/v3d_macros.h b/src/broadcom/common/v3d_macros.h
index fe89398208a..b4291fb5350 100644
--- a/src/broadcom/common/v3d_macros.h
+++ b/src/broadcom/common/v3d_macros.h
@@ -41,6 +41,9 @@
#elif (V3D_VERSION == 42)
# define V3DX(x) V3D42_##x
# define v3dX(x) v3d42_##x
+#elif (V3D_VERSION == 71)
+# define V3DX(x) V3D71_##x
+# define v3dX(x) v3d71_##x
#else
# error "Need to add prefixing macros for this v3d version"
#endif
diff --git a/src/broadcom/meson.build b/src/broadcom/meson.build
index 2c10e46b188..73cb7aa0575 100644
--- a/src/broadcom/meson.build
+++ b/src/broadcom/meson.build
@@ -22,7 +22,7 @@ inc_broadcom = include_directories('.', 'cle')
subdir('cle')
-v3d_versions = ['33', '41', '42']
+v3d_versions = ['33', '41', '42', '71']
v3d_libs = []
if with_gallium_v3d or with_broadcom_vk
diff --git a/src/broadcom/simulator/v3d_simulator.c b/src/broadcom/simulator/v3d_simulator.c
index eea5d3f050e..5cceb1a82cc 100644
--- a/src/broadcom/simulator/v3d_simulator.c
+++ b/src/broadcom/simulator/v3d_simulator.c
@@ -490,10 +490,20 @@ v3d_simulator_submit_cl_ioctl(int fd, struct drm_v3d_submit_cl *submit)
v3d_simulator_perfmon_switch(fd, submit->perfmon_id);
- if (sim_state.ver >= 41)
- v3d41_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs);
- else
- v3d33_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs);
+ switch(sim_state.ver) {
+ case 33:
+ v3d33_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs);
+ break;
+ case 41:
+ case 42:
+ v3d41_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs);
+ break;
+ case 71:
+ v3d71_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs);
+ break;
+ default:
+ unreachable("Unsupported V3D version\n");
+ }
util_dynarray_foreach(&sim_state.bin_oom, struct v3d_simulator_bo *,
sim_bo) {
@@ -635,10 +645,17 @@ v3d_simulator_gem_close_ioctl(int fd, struct drm_gem_close *args)
static int
v3d_simulator_get_param_ioctl(int fd, struct drm_v3d_get_param *args)
{
- if (sim_state.ver >= 41)
- return v3d41_simulator_get_param_ioctl(sim_state.v3d, args);
- else
+ switch(sim_state.ver) {
+ case 33:
return v3d33_simulator_get_param_ioctl(sim_state.v3d, args);
+ case 41:
+ case 42:
+ return v3d41_simulator_get_param_ioctl(sim_state.v3d, args);
+ case 71:
+ return v3d71_simulator_get_param_ioctl(sim_state.v3d, args);
+ default:
+ unreachable("Unsupported V3D version\n");
+ }
}
static int
@@ -652,10 +669,20 @@ v3d_simulator_submit_tfu_ioctl(int fd, struct drm_v3d_submit_tfu *args)
v3d_simulator_copy_in_handle(file, args->bo_handles[2]);
v3d_simulator_copy_in_handle(file, args->bo_handles[3]);
- if (sim_state.ver >= 41)
- ret = v3d41_simulator_submit_tfu_ioctl(sim_state.v3d, args);
- else
+ switch(sim_state.ver) {
+ case 33:
ret = v3d33_simulator_submit_tfu_ioctl(sim_state.v3d, args);
+ break;
+ case 41:
+ case 42:
+ ret = v3d41_simulator_submit_tfu_ioctl(sim_state.v3d, args);
+ break;
+ case 71:
+ ret = v3d71_simulator_submit_tfu_ioctl(sim_state.v3d, args);
+ break;
+ default:
+ unreachable("Unsupported V3D version\n");
+ }
v3d_simulator_copy_out_handle(file, args->bo_handles[0]);
@@ -682,11 +709,19 @@ v3d_simulator_submit_csd_ioctl(int fd, struct drm_v3d_submit_csd *args)
v3d_simulator_perfmon_switch(fd, args->perfmon_id);
- if (sim_state.ver >= 41)
- ret = v3d41_simulator_submit_csd_ioctl(sim_state.v3d, args,
- file->gmp->ofs);
- else
- ret = -1;
+ switch(sim_state.ver) {
+ case 41:
+ case 42:
+ ret = v3d41_simulator_submit_csd_ioctl(sim_state.v3d, args,
+ file->gmp->ofs);
+ break;
+ case 71:
+ ret = v3d71_simulator_submit_csd_ioctl(sim_state.v3d, args,
+ file->gmp->ofs);
+ break;
+ default:
+ ret = -1;
+ }
for (int i = 0; i < args->bo_handle_count; i++)
v3d_simulator_copy_out_handle(file, bo_handles[i]);
@@ -880,10 +915,20 @@ v3d_simulator_init_global()
util_dynarray_init(&sim_state.bin_oom, NULL);
- if (sim_state.ver >= 41)
- v3d41_simulator_init_regs(sim_state.v3d);
- else
+ switch(sim_state.ver) {
+ case 33:
v3d33_simulator_init_regs(sim_state.v3d);
+ break;
+ case 41:
+ case 42:
+ v3d41_simulator_init_regs(sim_state.v3d);
+ break;
+ case 71:
+ v3d71_simulator_init_regs(sim_state.v3d);
+ break;
+ default:
+ unreachable("Not supported V3D version\n");
+ }
}
struct v3d_simulator_file *
diff --git a/src/broadcom/simulator/v3d_simulator.h b/src/broadcom/simulator/v3d_simulator.h
index ddb079c1455..1472c313a03 100644
--- a/src/broadcom/simulator/v3d_simulator.h
+++ b/src/broadcom/simulator/v3d_simulator.h
@@ -52,6 +52,11 @@ uint32_t v3d_simulator_get_mem_free(void);
# define v3dX(x) v3d41_##x
# include "v3dx_simulator.h"
# undef v3dX
+
+# define v3dX(x) v3d71_##x
+# include "v3dx_simulator.h"
+# undef v3dX
+
#endif
#endif
diff --git a/src/broadcom/simulator/v3dx_simulator.c b/src/broadcom/simulator/v3dx_simulator.c
index c9322f0397b..723796b16c9 100644
--- a/src/broadcom/simulator/v3dx_simulator.c
+++ b/src/broadcom/simulator/v3dx_simulator.c
@@ -46,11 +46,15 @@
#define HW_REGISTER_RO(x) (x)
#define HW_REGISTER_RW(x) (x)
-#if V3D_VERSION >= 41
+#if V3D_VERSION == 71
+#include "libs/core/v3d/registers/7.1.5.1/v3d.h"
+#else
+#if V3D_VERSION == 41 || V3D_VERSION == 42
#include "libs/core/v3d/registers/4.1.35.0/v3d.h"
#else
#include "libs/core/v3d/registers/3.3.0.0/v3d.h"
#endif
+#endif
#define V3D_WRITE(reg, val) v3d_hw_write_reg(v3d, reg, val)
#define V3D_READ(reg) v3d_hw_read_reg(v3d, reg)
@@ -310,16 +314,17 @@ v3d_isr_core(struct v3d_hw *v3d,
return;
}
+#if V3D_VERSION <= 42
if (core_status & V3D_CTL_0_INT_STS_INT_GMPV_SET) {
fprintf(stderr, "GMP violation at 0x%08x\n",
V3D_READ(V3D_GMP_VIO_ADDR));
- abort();
} else {
fprintf(stderr,
"Unexpected ISR with core status 0x%08x\n",
core_status);
}
abort();
+#endif
}
static void
@@ -396,6 +401,18 @@ v3d_isr_hub(struct v3d_hw *v3d)
}
handle_mmu_interruptions(v3d, hub_status);
+
+#if V3D_VERSION == 71
+ if (hub_status & V3D_HUB_CTL_INT_STS_INT_GMPV_SET) {
+ fprintf(stderr, "GMP violation at 0x%08x\n",
+ V3D_READ(V3D_GMP_VIO_ADDR));
+ } else {
+ fprintf(stderr,
+ "Unexpected ISR with status 0x%08x\n",
+ hub_status);
+ }
+ abort();
+#endif
}
static void
@@ -436,8 +453,11 @@ v3dX(simulator_init_regs)(struct v3d_hw *v3d)
* for tracing. Perhaps we should evaluate to do the same here and add
* some debug options.
*/
- uint32_t core_interrupts = (V3D_CTL_0_INT_STS_INT_GMPV_SET |
- V3D_CTL_0_INT_STS_INT_OUTOMEM_SET);
+ uint32_t core_interrupts = V3D_CTL_0_INT_STS_INT_OUTOMEM_SET;
+#if V3D_VERSION <= 42
+ core_interrupts |= V3D_CTL_0_INT_STS_INT_GMPV_SET;
+#endif
+
V3D_WRITE(V3D_CTL_0_INT_MSK_SET, ~core_interrupts);
V3D_WRITE(V3D_CTL_0_INT_MSK_CLR, core_interrupts);
@@ -447,6 +467,9 @@ v3dX(simulator_init_regs)(struct v3d_hw *v3d)
V3D_HUB_CTL_INT_STS_INT_MMU_CAP_SET | /* CAP exceeded */
V3D_HUB_CTL_INT_STS_INT_TFUC_SET); /* TFU conversion */
+#if V3D_VERSION == 71
+ hub_interrupts |= V3D_HUB_CTL_INT_STS_INT_GMPV_SET;
+#endif
V3D_WRITE(V3D_HUB_CTL_INT_MSK_SET, ~hub_interrupts);
V3D_WRITE(V3D_HUB_CTL_INT_MSK_CLR, hub_interrupts);
--
2.39.2

View File

@ -1,7 +1,7 @@
From 3322c102282cf726ae575b122358060abd5b24db Mon Sep 17 00:00:00 2001
From 54cc206be2d48916862d7e264e886f58b27dd653 Mon Sep 17 00:00:00 2001
From: Dave Stevenson <dave.stevenson@raspberrypi.com>
Date: Thu, 5 Oct 2023 19:32:10 +0100
Subject: [PATCH 142/142] gallium: Add kmsro drivers for RP1 DSI, DPI, and VEC
Subject: [PATCH 1/3] gallium: Add kmsro drivers for RP1 DSI, DPI, and VEC
devices
Signed-off-by: Dave Stevenson <dave.stevenson@raspberrypi.com>
@ -11,7 +11,7 @@ Signed-off-by: Dave Stevenson <dave.stevenson@raspberrypi.com>
2 files changed, 6 insertions(+)
diff --git a/src/gallium/targets/dri/meson.build b/src/gallium/targets/dri/meson.build
index fbec1da957b..59daf3b6fb6 100644
index 66619bba0db..443923772e8 100644
--- a/src/gallium/targets/dri/meson.build
+++ b/src/gallium/targets/dri/meson.build
@@ -68,6 +68,9 @@ libgallium_dri = shared_library(
@ -22,10 +22,10 @@ index fbec1da957b..59daf3b6fb6 100644
+ 'drm-rp1-dsi_dri.so',
+ 'drm-rp1-vec_dri.so',
'exynos_dri.so',
'hdlcd_dri.so',
'hx8357d_dri.so',
'ili9225_dri.so',
diff --git a/src/gallium/targets/dri/target.c b/src/gallium/targets/dri/target.c
index d506869cbb4..ecb25edd03b 100644
index 9d3069eb004..79f60a7224a 100644
--- a/src/gallium/targets/dri/target.c
+++ b/src/gallium/targets/dri/target.c
@@ -98,6 +98,9 @@ DEFINE_LOADER_DRM_ENTRYPOINT(tegra);
@ -36,8 +36,8 @@ index d506869cbb4..ecb25edd03b 100644
+DEFINE_LOADER_DRM_ENTRYPOINT(drm_rp1_dsi)
+DEFINE_LOADER_DRM_ENTRYPOINT(drm_rp1_vec)
DEFINE_LOADER_DRM_ENTRYPOINT(exynos)
DEFINE_LOADER_DRM_ENTRYPOINT(hdlcd)
DEFINE_LOADER_DRM_ENTRYPOINT(hx8357d)
DEFINE_LOADER_DRM_ENTRYPOINT(ili9225)
--
2.39.2

View File

@ -1,30 +0,0 @@
From 9e85edd1b347b0e779b393f463f42044a720bcff Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Tue, 28 Sep 2021 13:16:49 +0200
Subject: [PATCH 002/142] broadcom/simulator: reset CFG7 for compute dispatch
in v71
This register is new in 7.x, it doesn't seem that we need to
do anything specific for now, but let's make sure it is reset
every time.
---
src/broadcom/simulator/v3dx_simulator.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/src/broadcom/simulator/v3dx_simulator.c b/src/broadcom/simulator/v3dx_simulator.c
index 723796b16c9..f23b0538de3 100644
--- a/src/broadcom/simulator/v3dx_simulator.c
+++ b/src/broadcom/simulator/v3dx_simulator.c
@@ -227,6 +227,9 @@ v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d,
V3D_WRITE(V3D_CSD_0_QUEUED_CFG4, args->cfg[4]);
V3D_WRITE(V3D_CSD_0_QUEUED_CFG5, args->cfg[5]);
V3D_WRITE(V3D_CSD_0_QUEUED_CFG6, args->cfg[6]);
+#if V3D_VERSION >= 71
+ V3D_WRITE(V3D_CSD_0_QUEUED_CFG7, 0);
+#endif
/* CFG0 kicks off the job */
V3D_WRITE(V3D_CSD_0_QUEUED_CFG0, args->cfg[0]);
--
2.39.2

View File

@ -1,8 +1,8 @@
From 4f33de7771621e15aae3e3c60c09fd5a2f29bdac Mon Sep 17 00:00:00 2001
From 80050d6960a688d061eac9798c6f5f1b0eb3e960 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Tue, 30 Nov 2021 02:39:20 +0100
Subject: [PATCH 066/142] nir: add new opcodes to map new v71
packing/conversion instructions
Subject: [PATCH 2/3] nir: add new opcodes to map new v71 packing/conversion
instructions
Since v71, broadcom hw include specific packing/conversion
instructions, so this commit adds opcodes to be able to make use of
@ -28,17 +28,14 @@ integer.
Interestingly broadcom also defines a similar one that packs the
higher halfword. Not used yet.
FIXME: vftounorm10lo/hi constant expression implementation is somewhat
convoluted. It is likely that it could be implemented in a more easy
way. But it works (passing the tests added with CTS issue #3372,
created with this change in mind).
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
src/compiler/nir/nir_constant_expressions.py | 106 +++++++++++++++++++
src/compiler/nir/nir_opcodes.py | 44 ++++++++
2 files changed, 150 insertions(+)
src/compiler/nir/nir_constant_expressions.py | 94 ++++++++++++++++++++
src/compiler/nir/nir_opcodes.py | 52 +++++++++++
2 files changed, 146 insertions(+)
diff --git a/src/compiler/nir/nir_constant_expressions.py b/src/compiler/nir/nir_constant_expressions.py
index e6383b67737..46395d79a89 100644
index e6383b67737..0d0797526a9 100644
--- a/src/compiler/nir/nir_constant_expressions.py
+++ b/src/compiler/nir/nir_constant_expressions.py
@@ -62,6 +62,8 @@ template = """\
@ -50,7 +47,7 @@ index e6383b67737..46395d79a89 100644
#include "nir_constant_expressions.h"
/**
@@ -277,6 +279,110 @@ unpack_half_1x16(uint16_t u)
@@ -277,6 +279,98 @@ unpack_half_1x16(uint16_t u)
return _mesa_half_to_float(u);
}
@ -61,24 +58,22 @@ index e6383b67737..46395d79a89 100644
+static uint32_t v11fpack_v3d(const uint32_t src0,
+ const uint32_t src1)
+{
+ float rgb[3];
+
+ rgb[0] = unpack_half_1x16((src0 & 0xffff));
+ rgb[1] = unpack_half_1x16((src0 >> 16));
+ rgb[2] = unpack_half_1x16((src1 & 0xffff));
+ float rgb[3] = {
+ unpack_half_1x16((src0 & 0xffff)),
+ unpack_half_1x16((src0 >> 16)),
+ unpack_half_1x16((src1 & 0xffff)),
+ };
+
+ return float3_to_r11g11b10f(rgb);
+}
+
+/**
+ * The three methods below are basically wrappers over pack_s/unorm_1x8/1x16,
+ * as it receives a uint16_t val instead of a float
+ * as they receives a uint16_t val instead of a float
+ */
+static uint8_t _mesa_half_to_snorm8(uint16_t val)
+static inline uint8_t _mesa_half_to_snorm8(uint16_t val)
+{
+ float x = _mesa_half_to_float(val);
+
+ return pack_snorm_1x8(x);
+ return pack_snorm_1x8(_mesa_half_to_float(val));
+}
+
+static uint16_t _mesa_float_to_snorm16(uint32_t val)
@ -95,51 +90,42 @@ index e6383b67737..46395d79a89 100644
+ return pack_unorm_1x16(aux.f);
+}
+
+/* FIXME: the implementation below of vftounorm10hi/lo is somewhat too
+ * verbose. It is likely that there would be a simpler way to implement
+ * it.
+ */
+static uint32_t float_pack16_v3d(uint32_t f32)
+static inline uint32_t float_pack16_v3d(uint32_t f32)
+{
+ float f = uif(f32);
+ return _mesa_float_to_half(f);
+ return _mesa_float_to_half(uif(f32));
+}
+
+static uint32_t float_unpack16_v3d(uint32_t f16)
+static inline uint32_t float_unpack16_v3d(uint32_t f16)
+{
+ float f = _mesa_half_to_float(f16);
+ return fui(f);
+ return fui(_mesa_half_to_float(f16));
+}
+
+static uint32_t vfpack_v3d(uint32_t a, uint32_t b)
+static inline uint32_t vfpack_v3d(uint32_t a, uint32_t b)
+{
+ return float_pack16_v3d(b) << 16 | float_pack16_v3d(a);
+}
+
+static uint32_t vfsat_v3d(uint32_t a)
+static inline uint32_t vfsat_v3d(uint32_t a)
+{
+ return vfpack_v3d(
+ fui(SATURATE(_mesa_half_to_float(a & 0xffff))),
+ fui(SATURATE(_mesa_half_to_float(a >> 16))));
+ const uint32_t low = fui(SATURATE(_mesa_half_to_float(a & 0xffff)));
+ const uint32_t high = fui(SATURATE(_mesa_half_to_float(a >> 16)));
+
+ return vfpack_v3d(low, high);
+}
+
+static uint32_t fmul_v3d(uint32_t a, uint32_t b)
+static inline uint32_t fmul_v3d(uint32_t a, uint32_t b)
+{
+ float f = uif(a);
+ float g = uif(b);
+
+ float x = f * g;
+
+ return fui(x);
+ return fui(uif(a) * uif(b));
+}
+
+#define L(x) float_unpack16_v3d((x) & 0xffff)
+#define H(x) float_unpack16_v3d((x) >> 16)
+#define V(f,a,b) vfpack_v3d(f(L(a), L(b)), f(H(a), H(b)))
+
+static uint32_t vfmul_v3d(uint32_t a, uint32_t b)
+{
+ return V(fmul_v3d, a, b);
+ const uint32_t low = fmul_v3d(float_unpack16_v3d(a & 0xffff),
+ float_unpack16_v3d(b & 0xffff));
+ const uint32_t high = fmul_v3d(float_unpack16_v3d(a >> 16),
+ float_unpack16_v3d(b >> 16));
+
+ return vfpack_v3d(low, high);
+}
+
+/* Convert 2x16-bit floating point to 2x10-bit unorm */
@ -156,34 +142,41 @@ index e6383b67737..46395d79a89 100644
+{
+ return vfmul_v3d(vfsat_v3d(src0), 0x000303ff);
+}
+
+
/* Some typed vector structures to make things like src0.y work */
typedef int8_t int1_t;
typedef uint8_t uint1_t;
diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py
index e4d87aa6126..63aa7cfa315 100644
index 0f81328f441..b70d9567cd6 100644
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@@ -1393,6 +1393,50 @@ for (int i = 0; i < 32; i += 8) {
@@ -1413,6 +1413,58 @@ for (int i = 0; i < 32; i += 8) {
}
""")
+# v3d-specific opcodes
+
+# v3d-specific (v71) instruction that packs parts of 2 2x16 floating point into
+# r11g11b10 bits, rounding to nearest even
+# v3d-specific (v71) instruction that packs bits of 2 2x16 floating point into
+# r11g11b10 bits, rounding to nearest even, so
+# dst[10:0] = float16_to_float11 (src0[15:0])
+# dst[21:11] = float16_to_float11 (src0[31:16])
+# dst[31:22] = float16_to_float10 (src1[15:0])
+binop_convert("v11fpack_v3d", tuint32, tuint32, "",
+ "v11fpack_v3d(src0, src1)")
+
+# v3d-specific (v71) instruction that packs 2x32 bit to 2x16 bit integer. The
+# difference with pack_32_2x16_split is that the sources are 32bit too. So it
+# receives 2 32-bit integer, and pack the lower halfword as 2x16 on a 32-bit
+# pack.
+# receives 2 32-bit integer, and packs the lower halfword as 2x16 on a 32-bit
+# integer.
+binop_horiz("vpack_v3d", 1, tuint32, 1, tuint32, 1, tuint32,
+ "(src0.x & 0xffff) | (src1.x << 16)")
+
+# v3d-specific (v71) instruction that packs parts of 2 2x16 integers into r10g10b10a2
+# v3d-specific (v71) instruction that packs bits of 2 2x16 integers into
+# r10g10b10a2:
+# dst[9:0] = src0[9:0]
+# dst[19:10] = src0[25:16]
+# dst[29:20] = src1[9:0]
+# dst[31:30] = src1[17:16]
+binop_convert("v10pack_v3d", tuint32, tuint32, "",
+ "(src0 & 0x3ff) | ((src0 >> 16) & 0x3ff) << 10 | (src1 & 0x3ff) << 20 | ((src1 >> 16) & 0x3ff) << 30")
+

View File

@ -1,712 +0,0 @@
From 6f744bc4bec98f9769486d427e8e2d4e314ae056 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Tue, 29 Jun 2021 12:03:24 +0200
Subject: [PATCH 003/142] broadcom/cle: update the packet definitions for new
generation v71
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Using the spec for 7.1.5 as reference. This includes totally new
packets, and redefines some that already existed on v42.
Full list:
* Add Depth Bounds Test Limits
* Redefine Tile Binning Mode Cfg
* Redefine Cfg Bits. There are some changes on the fields:
* Line Rasterization is now 1 bit size
* Depth Bounds Enable (that takes one of the bits of Line Rasterization)
* Early-Z/Early-Z updates enable bits (16-17) figure now as reserved.
* New Z-Clipping mode field
* Redefine Tile Rendering Mode Cfg (Common). Changes with respect to v42:
* New log2 tile height/width fields starting at bit 52/55
* Due to those two new fields, the end pad is smaller
* sub-id has now a size of 3. Bit 4 is reserved.
* Number of render targets: this field max value is now 7 (not
reflected on the xml).
* Maximum BPP is removed on v71 (now bits 40-41 are reserved)
* Depth Buffer disable: on bit 44
* Update Store Tile Buffer General
* Adding Cfg Render Target Part1/2/3 packets: they replace v4X "Tile
Rendering Mode Cfg (Color)" (real name "Rendering Configuration
(Render Targets Config)"), "Tile Rendering Mode Cfg (Clear Colors
Part1)", "Tile Rendering Mode Cfg (Clear Colors Part2)", and "Tile
Rendering Mode Cfg (Clear Colors Part3)". On those old versions,
the first packet is used to configure 4 render targets. Now that 8
are supported, individual per-render-target packets are used.
* Update ZS clear values packet.
* Add new v71 output formats
* Define Clear Render Targets (Replaces Clear Tile Buffers from v42)
* Redefine GL Shader State Record. Changes compared with v42:
* Fields removed:
* "Coordinate shader has separate input and output VPM blocks"
(reserved bit now)
* "Vertex shader has separate input and output VPM blocks"
(reserved bit now)
* "Address of table of default attribute Values." (we needed to
change the start position for all the following fields)
* New field:
* "Never defer FEP depth writes to fragment shader auto Z writes
on scoreboard conflict"
* Redefine clipper xy scaling: Now it uses 1/64ths of pixels, instead
of 1/256ths
* Update texture shader state.
* Notice we don't use an address type for these fields in the XML
description. This is because the addresses are 64-bit aligned
(even though the PRM doesn't say it) which means the 6 LSB bits
are implicitly 0, but the fields are encoded before the 6th bit
of their starting byte, so we can't use the usual trick we do
with address types where the first 6 bits in the byte are
implicitly overwritten by other fields and we have to encode this
manually as a uint field. This would mean that if we had an
actual BO we would also need to add it manually to the job's
list, but since we don't have one, we don't have to do anything
about it.
* Add new RB_Swap field for texture shader state
* Document Cb/Cr addresses as uint fields in texture shader state
* Fixup Blend Config description: we now support 8 RTs.
* TMU config parameter 2 has new fields
* Add new clipper Z without guardband packet in v71
* Add enums for the Z clip modes accepted in v71
* Fix texture state array stride packing for V3D 7.1.5
Signed-off-by: Iago Toral Quiroga <itoral@igalia.com>
Signed-off-by: Alejandro Piñeiro <apinheiro@igalia.com>
broadcom/cle: rb_swap
---
src/broadcom/cle/v3d_packet_v33.xml | 386 ++++++++++++++++++++++++++--
1 file changed, 368 insertions(+), 18 deletions(-)
diff --git a/src/broadcom/cle/v3d_packet_v33.xml b/src/broadcom/cle/v3d_packet_v33.xml
index a0242b5f1c2..624353ca2bf 100644
--- a/src/broadcom/cle/v3d_packet_v33.xml
+++ b/src/broadcom/cle/v3d_packet_v33.xml
@@ -1,4 +1,4 @@
-<vcxml gen="3.3" min_ver="33" max_ver="42">
+<vcxml gen="3.3" min_ver="33" max_ver="71">
<enum name="Compare Function" prefix="V3D_COMPARE_FUNC">
<value name="NEVER" value="0"/>
@@ -167,13 +167,36 @@
<value name="depth_16" value="2"/>
</enum>
- <enum name="Render Target Clamp" prefix="V3D_RENDER_TARGET_CLAMP" min_ver="41">
+ <enum name="Render Target Clamp" prefix="V3D_RENDER_TARGET_CLAMP" min_ver="41" max_ver="42">
<value name="none" value="0"/> <!-- no clamping -->
<value name="norm" value="1"/> <!-- [0,1] for f16 -->
<value name="pos" value="2"/> <!-- [0, for f16 -->
<value name="int" value="3" min_ver="42"/> <!-- clamp to integer RT's range -->
</enum>
+ <enum name="Render Target Type Clamp" prefix="V3D_RENDER_TARGET_TYPE_CLAMP" min_ver="71">
+ <value name="8i" value="0"/> <!-- no clamping -->
+ <value name="16i" value="1"/> <!-- no clamping -->
+ <value name="32i" value="2"/> <!-- no clamping -->
+ <value name="8ui" value="4"/> <!-- no clamping -->
+ <value name="16ui" value="5"/> <!-- no clamping -->
+ <value name="32ui" value="6"/> <!-- no clamping -->
+ <value name="8" value="8"/> <!-- no clamping -->
+ <value name="16f" value="9"/> <!-- no clamping -->
+ <value name="32f" value="10"/> <!-- no clamping -->
+ <value name="8i_clamped" value="16"/> <!-- clamp to integer RT's range -->
+ <value name="16i_clamped" value="17"/> <!-- clamp to integer RT's range -->
+ <value name="32i_clamped" value="18"/> <!-- clamp to integer RT's range -->
+ <value name="8ui_clamped" value="20"/> <!-- clamp to integer RT's range -->
+ <value name="16ui_clamped" value="21"/> <!-- clamp to integer RT's range -->
+ <value name="32ui_clamped" value="22"/> <!-- clamp to integer RT's range -->
+ <value name="16f_clamp_norm" value="24"/> <!-- [0,1] for f16 -->
+ <value name="16f_clamp_pos" value="25"/> <!-- [0, for f16 -->
+ <value name="16f_clamp_pq" value="26"/> <!-- PQ lin range, colour to [0, 125], alpha to [0, 1] for f16 -->
+ <value name="16f_clamp_hlg" value="27"/> <!-- HLG lin range, colour to [0, 12], alpha to [0, 1] for f16 -->
+ <value name="invalid" value="32"/>
+ </enum>
+
<!---
CL cache flush commands are not fully documented and subject to a
number of hardware issues that make them unreliable. Specifically:
@@ -263,13 +286,27 @@
<value name="r8ui" value="36"/>
<value name="srgbx8" value="37" max_ver="33"/>
<value name="rgbx8" value="38" max_ver="33"/>
- <value name="bstc" value="39" min_ver="41"/>
+ <value name="bstc8" value="39" min_ver="41"/>
<value name="d32f" value="40" min_ver="41"/>
<value name="d24" value="41" min_ver="41"/>
<value name="d16" value="42" min_ver="41"/>
<value name="d24s8" value="43" min_ver="41"/>
<value name="s8" value="44" min_ver="41"/>
<value name="rgba5551" value="45" min_ver="41"/>
+ <value name="bstc8_srgb" value="46" min_ver="71"/>
+ <value name="bstc10" value="47" min_ver="71"/>
+ <value name="bstc10_srgb" value="48" min_ver="71"/>
+ <value name="bstc10_pq" value="49" min_ver="71"/>
+ <value name="rgba10x6" value="50" min_ver="71"/>
+ <value name="bstc10_hlg" value="55" min_ver="71"/>
+ <value name="rgba10x6_hlg" value="56" min_ver="71"/>
+ <value name="rgb10_a2_hlg" value="57" min_ver="71"/>
+ <value name="bstc10_pq_bt1886" value="58" min_ver="71"/>
+ <value name="rgba10x6_pq_bt1886" value="59" min_ver="71"/>
+ <value name="rgb10_a2_pq_bt1886" value="60" min_ver="71"/>
+ <value name="bstc10_hlg_bt1886" value="61" min_ver="71"/>
+ <value name="rgba10x6_hlg_bt1886" value="62" min_ver="71"/>
+ <value name="rgb10_a2_hlg_bt1886" value="63" min_ver="71"/>
</enum>
<enum name="Z/S Output Image Format" prefix="V3D_OUTPUT_IMAGE_FORMAT_ZS" max_ver="33">
@@ -314,6 +351,12 @@
<value name="perp end caps" value="1"/>
</enum>
+ <enum name="Z Clip Mode" prefix="V3D_Z_CLIP_MODE">
+ <value name="NONE" value="0"/>
+ <value name="MIN_ONE_TO_ONE" value="1"/>
+ <value name="ZERO_TO_ONE" value="2"/>
+ </enum>
+
<packet code="0" name="Halt"/>
<packet code="1" name="NOP"/>
<packet code="4" name="Flush"/>
@@ -381,11 +424,13 @@
<field name="Last Tile of Frame" size="1" start="0" type="bool"/>
</packet>
- <packet code="25" shortname="clear" name="Clear Tile Buffers" cl="R" min_ver="41">
+ <packet code="25" shortname="clear" name="Clear Tile Buffers" cl="R" min_ver="41" max_ver="42">
<field name="Clear Z/Stencil Buffer" size="1" start="1" type="bool"/>
<field name="Clear all Render Targets" size="1" start="0" type="bool"/>
</packet>
+ <packet code="25" shortname="clear_rt" name="Clear Render Targets" cl="R" min_ver="71"/>
+
<packet code="26" shortname="load" name="Reload Tile Color Buffer" cl="R" max_ver="33">
<field name="Disable Color Buffer load" size="8" start="8" type="uint"/>
<field name="Enable Z load" size="1" start="7" type="bool"/>
@@ -443,6 +488,10 @@
<value name="Render target 1" value="1"/>
<value name="Render target 2" value="2"/>
<value name="Render target 3" value="3"/>
+ <value name="Render target 4" value="4" min_ver="71"/>
+ <value name="Render target 5" value="5" min_ver="71"/>
+ <value name="Render target 6" value="6" min_ver="71"/>
+ <value name="Render target 7" value="7" min_ver="71"/>
<value name="None" value="8"/>
<value name="Z" value="9"/>
<value name="Stencil" value="10"/>
@@ -789,7 +838,7 @@
<field name="Alpha blend mode" size="4" start="0" type="Blend Mode"/>
</packet>
- <packet code="84" name="Blend Cfg" min_ver="41">
+ <packet code="84" name="Blend Cfg" min_ver="41" max_ver="42">
<field name="Render Target Mask" size="4" start="24" type="uint"/>
<field name="Color blend dst factor" size="4" start="20" type="Blend Factor"/>
<field name="Color blend src factor" size="4" start="16" type="Blend Factor"/>
@@ -799,6 +848,16 @@
<field name="Alpha blend mode" size="4" start="0" type="Blend Mode"/>
</packet>
+ <packet code="84" name="Blend Cfg" min_ver="71">
+ <field name="Render Target Mask" size="8" start="24" type="uint"/>
+ <field name="Color blend dst factor" size="4" start="20" type="Blend Factor"/>
+ <field name="Color blend src factor" size="4" start="16" type="Blend Factor"/>
+ <field name="Color blend mode" size="4" start="12" type="Blend Mode"/>
+ <field name="Alpha blend dst factor" size="4" start="8" type="Blend Factor"/>
+ <field name="Alpha blend src factor" size="4" start="4" type="Blend Factor"/>
+ <field name="Alpha blend mode" size="4" start="0" type="Blend Mode"/>
+ </packet>
+
<packet code="86" shortname="blend_ccolor" name="Blend Constant Color">
<field name="Alpha (F16)" size="16" start="48" type="uint"/>
<field name="Blue (F16)" size="16" start="32" type="uint"/>
@@ -828,7 +887,12 @@
<field name="address" size="32" start="0" type="address"/>
</packet>
- <packet code="96" name="Cfg Bits">
+ <packet code="93" name="Depth Bounds Test Limits" min_ver="71">
+ <field name="Lower Test Limit" size="32" start="0" type="float"/>
+ <field name="Upper Test Limit" size="32" start="32" type="float"/>
+ </packet>
+
+ <packet code="96" name="Cfg Bits" max_ver="42">
<field name="Direct3D Provoking Vertex" size="1" start="21" type="bool"/>
<field name="Direct3D 'Point-fill' mode" size="1" start="20" type="bool"/>
<field name="Blend enable" size="1" start="19" type="bool"/>
@@ -846,6 +910,25 @@
<field name="Enable Forward Facing Primitive" size="1" start="0" type="bool"/>
</packet>
+ <packet code="96" name="Cfg Bits" min_ver="71">
+ <field name="Z Clipping mode" size="2" start="22" type="Z Clip Mode"/>
+ <field name="Direct3D Provoking Vertex" size="1" start="21" type="bool"/>
+ <field name="Direct3D 'Point-fill' mode" size="1" start="20" type="bool"/>
+ <field name="Blend enable" size="1" start="19" type="bool"/>
+ <field name="Stencil enable" size="1" start="18" type="bool"/>
+ <field name="Z updates enable" size="1" start="15" type="bool"/>
+ <field name="Depth-Test Function" size="3" start="12" type="Compare Function"/>
+ <field name="Direct3D Wireframe triangles mode" size="1" start="11" type="bool"/>
+ <field name="Z Clamp Mode" size="1" start="10" type="bool"/>
+ <field name="Rasterizer Oversample Mode" size="2" start="6" type="uint"/>
+ <field name="Depth Bounds Test Enable" size="1" start="5" type="bool"/>
+ <field name="Line Rasterization" size="1" start="4" type="uint"/>
+ <field name="Enable Depth Offset" size="1" start="3" type="bool"/>
+ <field name="Clockwise Primitives" size="1" start="2" type="bool"/>
+ <field name="Enable Reverse Facing Primitive" size="1" start="1" type="bool"/>
+ <field name="Enable Forward Facing Primitive" size="1" start="0" type="bool"/>
+ </packet>
+
<packet code="97" shortname="zero_all_flatshade_flags" name="Zero All Flat Shade Flags"/>
<packet code="98" shortname="flatshade_flags" name="Flat Shade Flags">
@@ -907,16 +990,26 @@
<field name="Minimum Zw" size="32" start="0" type="float"/>
</packet>
- <packet shortname="clipper_xy" name="Clipper XY Scaling" code="110" cl="B">
+ <packet shortname="clipper_xy" name="Clipper XY Scaling" code="110" cl="B" max_ver="42">
<field name="Viewport Half-Height in 1/256th of pixel" size="32" start="32" type="float"/>
<field name="Viewport Half-Width in 1/256th of pixel" size="32" start="0" type="float"/>
</packet>
+ <packet shortname="clipper_xy" name="Clipper XY Scaling" code="110" cl="B" min_ver="71">
+ <field name="Viewport Half-Height in 1/64th of pixel" size="32" start="32" type="float"/>
+ <field name="Viewport Half-Width in 1/64th of pixel" size="32" start="0" type="float"/>
+ </packet>
+
<packet shortname="clipper_z" name="Clipper Z Scale and Offset" code="111" cl="B">
<field name="Viewport Z Offset (Zc to Zs)" size="32" start="32" type="float"/>
<field name="Viewport Z Scale (Zc to Zs)" size="32" start="0" type="float"/>
</packet>
+ <packet shortname="clipper_z_no_guardband" name="Clipper Z Scale and Offset no guardband" code="112" cl="B" min_ver="71">
+ <field name="Viewport Z Offset (Zc to Zs)" size="32" start="32" type="float"/>
+ <field name="Viewport Z Scale (Zc to Zs)" size="32" start="0" type="float"/>
+ </packet>
+
<packet name="Number of Layers" code="119" min_ver="41">
<field name="Number of Layers" size="8" start="0" type="uint" minus_one="true"/>
</packet>
@@ -947,7 +1040,7 @@
<field name="sub-id" size="1" start="0" type="uint" default="0"/>
</packet>
- <packet code="120" name="Tile Binning Mode Cfg" min_ver="41">
+ <packet code="120" name="Tile Binning Mode Cfg" min_ver="41" max_ver="42">
<field name="Height (in pixels)" size="16" start="48" type="uint" minus_one="true"/>
<field name="Width (in pixels)" size="16" start="32" type="uint" minus_one="true"/>
@@ -971,6 +1064,35 @@
</field>
</packet>
+ <packet code="120" name="Tile Binning Mode Cfg" min_ver="71">
+ <field name="Height (in pixels)" size="16" start="48" type="uint" minus_one="true"/>
+ <field name="Width (in pixels)" size="16" start="32" type="uint" minus_one="true"/>
+
+ <field name="Log2 Tile Height" size="3" start="11" type="uint">
+ <value name="tile height 8 pixels" value="0"/>
+ <value name="tile height 16 pixels" value="1"/>
+ <value name="tile height 32 pixels" value="2"/>
+ <value name="tile height 64 pixels" value="3"/>
+ </field>
+ <field name="Log2 Tile Width" size="3" start="8" type="uint">
+ <value name="tile width 8 pixels" value="0"/>
+ <value name="tile width 16 pixels" value="1"/>
+ <value name="tile width 32 pixels" value="2"/>
+ <value name="tile width 64 pixels" value="3"/>
+ </field>
+
+ <field name="tile allocation block size" size="2" start="4" type="uint">
+ <value name="tile allocation block size 64b" value="0"/>
+ <value name="tile allocation block size 128b" value="1"/>
+ <value name="tile allocation block size 256b" value="2"/>
+ </field>
+ <field name="tile allocation initial block size" size="2" start="2" type="uint">
+ <value name="tile allocation initial block size 64b" value="0"/>
+ <value name="tile allocation initial block size 128b" value="1"/>
+ <value name="tile allocation initial block size 256b" value="2"/>
+ </field>
+ </packet>
+
<packet code="120" name="Tile Binning Mode Cfg (Part2)" cl="B" max_ver="33">
<field name="Tile Allocation Memory Address" size="32" start="32" type="address"/>
<field name="Tile Allocation Memory Size" size="32" start="0" type="uint"/>
@@ -1002,7 +1124,7 @@
<field name="sub-id" size="4" start="0" type="uint" default="0"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" min_ver="41">
+ <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" min_ver="41" max_ver="42">
<field name="Pad" size="12" start="52" type="uint"/>
<field name="Early Depth/Stencil Clear" size="1" start="51" type="bool"/>
@@ -1018,7 +1140,11 @@
<field name="Double-buffer in non-ms mode" size="1" start="43" type="bool"/>
<field name="Multisample Mode (4x)" size="1" start="42" type="bool"/>
- <field name="Maximum BPP of all render targets" size="2" start="40" type="Internal BPP"/>
+ <field name="Maximum BPP of all render targets" size="2" start="40" type="Internal BPP">
+ <value name="Render target maximum 32bpp" value="0"/>
+ <value name="Render target maximum 64bpp" value="1"/>
+ <value name="Render target maximum 128bpp" value="2"/>
+ </field>
<field name="Image Height (pixels)" size="16" start="24" type="uint"/>
<field name="Image Width (pixels)" size="16" start="8" type="uint"/>
@@ -1027,6 +1153,43 @@
<field name="sub-id" size="4" start="0" type="uint" default="0"/>
</packet>
+ <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" min_ver="71">
+ <field name="Pad" size="6" start="58" type="uint"/>
+
+ <field name="Log2 Tile Height" size="3" start="55" type="uint">
+ <value name="tile height 8 pixels" value="0"/>
+ <value name="tile height 16 pixels" value="1"/>
+ <value name="tile height 32 pixels" value="2"/>
+ <value name="tile height 64 pixels" value="3"/>
+ </field>
+ <field name="Log2 Tile Width" size="3" start="52" type="uint">
+ <value name="tile width 8 pixels" value="0"/>
+ <value name="tile width 16 pixels" value="1"/>
+ <value name="tile width 32 pixels" value="2"/>
+ <value name="tile width 64 pixels" value="3"/>
+ </field>
+
+ <field name="Early Depth/Stencil Clear" size="1" start="51" type="bool"/>
+ <field name="Internal Depth Type" size="4" start="47" type="Internal Depth Type"/>
+
+ <field name="Early-Z disable" size="1" start="46" type="bool"/>
+
+ <field name="Early-Z Test and Update Direction" size="1" start="45" type="uint">
+ <value name="Early-Z direction LT/LE" value="0"/>
+ <value name="Early-Z direction GT/GE" value="1"/>
+ </field>
+
+ <field name="Depth-buffer disable" size="1" start="44" type="bool"/>
+ <field name="Double-buffer in non-ms mode" size="1" start="43" type="bool"/>
+ <field name="Multisample Mode (4x)" size="1" start="42" type="bool"/>
+
+ <field name="Image Height (pixels)" size="16" start="24" type="uint"/>
+ <field name="Image Width (pixels)" size="16" start="8" type="uint"/>
+ <field name="Number of Render Targets" size="4" start="4" type="uint" minus_one="true"/>
+
+ <field name="sub-id" size="3" start="0" type="uint" default="0"/>
+ </packet>
+
<packet code="121" name="Tile Rendering Mode Cfg (Color)" cl="R" max_ver="33">
<field name="Address" size="32" start="32" type="address"/>
@@ -1048,7 +1211,8 @@
<field name="sub-id" size="4" start="0" type="uint" default="2"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (Color)" cl="R" min_ver="41">
+ <!-- On 4.1 the real name would be "Tile Rendering Mode Cfg (Render Target Configs)" -->
+ <packet code="121" name="Tile Rendering Mode Cfg (Color)" cl="R" min_ver="41" max_ver="42">
<field name="Pad" size="28" start="36" type="uint"/>
@@ -1099,7 +1263,7 @@
<field name="sub-id" size="4" start="0" type="uint" default="3"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" min_ver="41">
+ <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" min_ver="41" max_ver="42">
<field name="unused" size="16" start="48" type="uint"/>
<field name="Z Clear Value" size="32" start="16" type="float"/>
@@ -1108,6 +1272,15 @@
<field name="sub-id" size="4" start="0" type="uint" default="2"/>
</packet>
+ <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" min_ver="71">
+ <field name="unused" size="16" start="48" type="uint"/>
+
+ <field name="Z Clear Value" size="32" start="16" type="float"/>
+
+ <field name="Stencil Clear Value" size="8" start="8" type="uint"/>
+ <field name="sub-id" size="4" start="0" type="uint" default="1"/>
+ </packet>
+
<packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part1)" cl="R" max_ver="33">
<!-- Express this as a 56-bit field? -->
<field name="Clear Color next 24 bits" size="24" start="40" type="uint"/>
@@ -1117,7 +1290,7 @@
<field name="sub-id" size="4" start="0" type="uint" default="4"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part1)" cl="R" min_ver="41">
+ <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part1)" cl="R" min_ver="41" max_ver="42">
<!-- Express this as a 56-bit field? -->
<field name="Clear Color next 24 bits" size="24" start="40" type="uint"/>
<field name="Clear Color low 32 bits" size="32" start="8" type="uint"/>
@@ -1126,6 +1299,19 @@
<field name="sub-id" size="4" start="0" type="uint" default="3"/>
</packet>
+ <packet code="121" name="Tile Rendering Mode Cfg (Render Target Part1)" cl="R" min_ver="71">
+
+ <field name="Clear Color low bits" size="32" start="32" type="uint"/>
+ <field name="Internal Type and Clamping" size="5" start="27" type="Render Target Type Clamp"/>
+ <field name="Internal BPP" size="2" start="25" type="Internal BPP"/>
+
+ <field name="Stride" size="7" start="18" type="uint" minus_one="true"/>
+ <!-- In multiples of 512 bits -->
+ <field name="Base Address" size="11" start="7" type="uint"/>
+ <field name="Render Target number" size="3" start="3" type="uint"/>
+ <field name="sub-id" size="3" start="0" type="uint" default="2"/>
+ </packet>
+
<packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part2)" cl="R" max_ver="33">
<!-- Express this as a 56-bit field? -->
<field name="Clear Color mid-high 24 bits" size="24" start="40" type="uint"/>
@@ -1135,7 +1321,7 @@
<field name="sub-id" size="4" start="0" type="uint" default="5"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part2)" cl="R" min_ver="41">
+ <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part2)" cl="R" min_ver="41" max_ver="42">
<!-- Express this as a 56-bit field? -->
<field name="Clear Color mid-high 24 bits" size="24" start="40" type="uint"/>
<field name="Clear Color mid-low 32 bits" size="32" start="8" type="uint"/>
@@ -1144,6 +1330,13 @@
<field name="sub-id" size="4" start="0" type="uint" default="4"/>
</packet>
+ <packet code="121" name="Tile Rendering Mode Cfg (Render Target Part2)" cl="R" min_ver="71">
+ <field name="Clear Color mid bits" size="40" start="24" type="uint"/>
+
+ <field name="Render Target number" size="3" start="3" type="uint"/>
+ <field name="sub-id" size="3" start="0" type="uint" default="3"/>
+ </packet>
+
<packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part3)" cl="R" max_ver="33">
<field name="pad" size="11" start="53" type="uint"/>
<field name="UIF padded height in UIF blocks" size="13" start="40" type="uint"/>
@@ -1155,7 +1348,7 @@
<field name="sub-id" size="4" start="0" type="uint" default="6"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part3)" cl="R" min_ver="41">
+ <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part3)" cl="R" min_ver="41" max_ver="42">
<field name="pad" size="11" start="53" type="uint"/>
<field name="UIF padded height in UIF blocks" size="13" start="40" type="uint"/>
<!-- image height is for Y flipping -->
@@ -1166,6 +1359,13 @@
<field name="sub-id" size="4" start="0" type="uint" default="5"/>
</packet>
+ <packet code="121" name="Tile Rendering Mode Cfg (Render Target Part3)" cl="R" min_ver="71">
+ <field name="Clear Color top bits" size="56" start="8" type="uint"/>
+
+ <field name="Render Target number" size="3" start="3" type="uint"/>
+ <field name="sub-id" size="3" start="0" type="uint" default="4"/>
+ </packet>
+
<packet code="124" shortname="tile_coords" name="Tile Coordinates">
<field name="tile row number" size="12" start="12" type="uint"/>
<field name="tile column number" size="12" start="0" type="uint"/>
@@ -1240,7 +1440,7 @@
<field name="Coordinate Shader Uniforms Address" size="32" start="32b" type="address"/>
</struct>
- <struct name="GL Shader State Record" min_ver="41">
+ <struct name="GL Shader State Record" min_ver="41" max_ver="42">
<field name="Point size in shaded vertex data" size="1" start="0" type="bool"/>
<field name="Enable clipping" size="1" start="1" type="bool"/>
@@ -1299,6 +1499,63 @@
<field name="Coordinate Shader Uniforms Address" size="32" start="32b" type="address"/>
</struct>
+ <struct name="GL Shader State Record" min_ver="71">
+ <field name="Point size in shaded vertex data" size="1" start="0" type="bool"/>
+ <field name="Enable clipping" size="1" start="1" type="bool"/>
+
+ <field name="Vertex ID read by coordinate shader" size="1" start="2" type="bool"/>
+ <field name="Instance ID read by coordinate shader" size="1" start="3" type="bool"/>
+ <field name="Base Instance ID read by coordinate shader" size="1" start="4" type="bool"/>
+ <field name="Vertex ID read by vertex shader" size="1" start="5" type="bool"/>
+ <field name="Instance ID read by vertex shader" size="1" start="6" type="bool"/>
+ <field name="Base Instance ID read by vertex shader" size="1" start="7" type="bool"/>
+
+ <field name="Fragment shader does Z writes" size="1" start="8" type="bool"/>
+ <field name="Turn off early-z test" size="1" start="9" type="bool"/>
+
+ <field name="Fragment shader uses real pixel centre W in addition to centroid W2" size="1" start="12" type="bool"/>
+ <field name="Enable Sample Rate Shading" size="1" start="13" type="bool"/>
+ <field name="Any shader reads hardware-written Primitive ID" size="1" start="14" type="bool"/>
+ <field name="Insert Primitive ID as first varying to fragment shader" size="1" start="15" type="bool"/>
+ <field name="Turn off scoreboard" size="1" start="16" type="bool"/>
+ <field name="Do scoreboard wait on first thread switch" size="1" start="17" type="bool"/>
+ <field name="Disable implicit point/line varyings" size="1" start="18" type="bool"/>
+ <field name="No prim pack" size="1" start="19" type="bool"/>
+ <field name="Never defer FEP depth writes" size="1" start="20" type="bool"/>
+
+ <field name="Number of varyings in Fragment Shader" size="8" start="3b" type="uint"/>
+
+ <field name="Coordinate Shader output VPM segment size" size="4" start="4b" type="uint"/>
+ <field name="Min Coord Shader output segments required in play in addition to VCM cache size" size="4" start="36" type="uint"/>
+
+ <field name="Coordinate Shader input VPM segment size" size="4" start="5b" type="uint"/>
+ <field name="Min Coord Shader input segments required in play" size="4" start="44" type="uint" minus_one="true"/>
+
+ <field name="Vertex Shader output VPM segment size" size="4" start="6b" type="uint"/>
+ <field name="Min Vertex Shader output segments required in play in addition to VCM cache size" size="4" start="52" type="uint"/>
+
+ <field name="Vertex Shader input VPM segment size" size="4" start="7b" type="uint"/>
+ <field name="Min Vertex Shader input segments required in play" size="4" start="60" type="uint" minus_one="true"/>
+
+ <field name="Fragment Shader Code Address" size="29" start="67" type="address"/>
+ <field name="Fragment Shader 4-way threadable" size="1" start="64" type="bool"/>
+ <field name="Fragment Shader start in final thread section" size="1" start="65" type="bool"/>
+ <field name="Fragment Shader Propagate NaNs" size="1" start="66" type="bool"/>
+ <field name="Fragment Shader Uniforms Address" size="32" start="12b" type="address"/>
+
+ <field name="Vertex Shader Code Address" size="29" start="131" type="address"/>
+ <field name="Vertex Shader 4-way threadable" size="1" start="128" type="bool"/>
+ <field name="Vertex Shader start in final thread section" size="1" start="129" type="bool"/>
+ <field name="Vertex Shader Propagate NaNs" size="1" start="130" type="bool"/>
+ <field name="Vertex Shader Uniforms Address" size="32" start="20b" type="address"/>
+
+ <field name="Coordinate Shader Code Address" size="29" start="195" type="address"/>
+ <field name="Coordinate Shader 4-way threadable" size="1" start="192" type="bool"/>
+ <field name="Coordinate Shader start in final thread section" size="1" start="193" type="bool"/>
+ <field name="Coordinate Shader Propagate NaNs" size="1" start="194" type="bool"/>
+ <field name="Coordinate Shader Uniforms Address" size="32" start="28b" type="address"/>
+ </struct>
+
<struct name="Geometry Shader State Record" min_ver="41">
<field name="Geometry Bin Mode Shader Code Address" size="29" start="3" type="address"/>
<field name="Geometry Bin Mode Shader 4-way threadable" size="1" start="0" type="bool"/>
@@ -1543,7 +1800,7 @@
<field name="Offset Format 8" size="1" start="0" type="bool"/>
</struct>
- <struct name="TMU Config Parameter 2" min_ver="42">
+ <struct name="TMU Config Parameter 2" min_ver="42" max_ver="42">
<field name="Pad" size="7" start="25" type="uint"/>
<field name="LOD Query" size="1" start="24" type="bool"/>
<field name="Op" size="4" start="20" type="TMU Op"/>
@@ -1558,6 +1815,23 @@
<field name="Offset Format 8" size="1" start="0" type="bool"/>
</struct>
+ <struct name="TMU Config Parameter 2" min_ver="71">
+ <field name="Pad" size="5" start="27" type="uint"/>
+ <field name="Write conversion" size="1" start="26" type="bool"/>
+ <field name="DIM query" size="1" start="25" type="bool"/>
+ <field name="LOD Query" size="1" start="24" type="bool"/>
+ <field name="Op" size="4" start="20" type="TMU Op"/>
+ <field name="Offset R" size="4" start="16" type="int"/>
+ <field name="Offset T" size="4" start="12" type="int"/>
+ <field name="Offset S" size="4" start="8" type="int"/>
+ <field name="Gather Mode" size="1" start="7" type="bool"/>
+ <field name="Gather Component" size="2" start="5" type="uint"/>
+ <field name="Coefficient Mode" size="1" start="4" type="bool"/>
+ <field name="Sample Number" size="2" start="2" type="uint"/>
+ <field name="Disable AutoLOD" size="1" start="1" type="bool"/>
+ <field name="Offset Format 8" size="1" start="0" type="bool"/>
+ </struct>
+
<struct name="Texture Shader State" max_ver="33">
<field name="UIF XOR disable" size="1" start="255" type="bool"/>
<field name="Level 0 is strictly UIF" size="1" start="254" type="bool"/>
@@ -1611,7 +1885,7 @@
<field name="Filter" size="4" start="0" type="TMU Filter"/>
</struct>
- <struct name="Texture Shader State" min_ver="41">
+ <struct name="Texture Shader State" min_ver="41" max_ver="42">
<field name="Pad" size="56" start="136" type="uint"/>
<field name="UIF XOR disable" size="1" start="135" type="bool"/>
<field name="Level 0 is strictly UIF" size="1" start="134" type="bool"/>
@@ -1652,6 +1926,82 @@
<field name="Flip texture X Axis" size="1" start="0" type="bool"/>
</struct>
+ <struct name="Texture Shader State" min_ver="71">
+ <field name="Pad" size="2" start="190" type="uint"/>
+ <!-- When we use an address type, there is an implicit requirement
+ that the address is a 32-bit value that is encoded starting at a 32-bit
+ aligned bit offset into the packet. If the address field has less than
+ 32 bits, it is assumed that the address is aligned. For example, a
+ 26-bit address field is expected to be 64-byte aligned (6 lsb bits
+ are 0) and that this will be encoded into a packet starting at bit
+ offset 6 into a 32-bit dword (since bits 0..5 of the address are
+ implicitly 0 and don't need to be explicitly encoded).
+
+ Unfortunately, the CB address below doesn't match this requirement:
+ it starts at bit 138, which is 10 bits into a 32-bit dword, but it
+ represents a 64-byte aligned address (6 lsb bits are 0), so we cannot
+ encode it as an address type. To fix this we encode these addresses
+ as uint types which has two implications:
+ 1. the driver is responsible for manually adding the buffer objects
+ for these addresses to the job BO list.
+ 2. the driver needs to pass an actual 26-bit address value by manually
+ shifting the 6 lsb bits (that are implicitly 0).
+ -->
+ <field name="texture_base pointer_Cr" size="26" start="164" type="uint"/>
+ <field name="texture base pointer Cb" size="26" start="138" type="uint"/>
+ <field name="Chroma offset y" size="1" start="137" type="uint"/>
+ <field name="Chroma offset x" size="1" start="136" type="uint"/>
+
+ <field name="UIF XOR disable" size="1" start="135" type="bool"/>
+ <field name="Level 0 is strictly UIF" size="1" start="134" type="bool"/>
+ <field name="Level 0 XOR enable" size="1" start="132" type="bool"/>
+ <field name="Level 0 UB_PAD" size="4" start="128" type="uint"/>
+
+ <field name="Base Level" size="4" start="124" type="uint"/>
+ <field name="Max Level" size="4" start="120" type="uint"/>
+
+ <field name="Swizzle A" size="3" start="117" type="uint">
+ <value name="Swizzle Zero" value="0"/>
+ <value name="Swizzle One" value="1"/>
+ <value name="Swizzle Red" value="2"/>
+ <value name="Swizzle Green" value="3"/>
+ <value name="Swizzle Blue" value="4"/>
+ <value name="Swizzle Alpha" value="5"/>
+ </field>
+
+ <field name="Swizzle B" size="3" start="114" type="uint"/>
+ <field name="Swizzle G" size="3" start="111" type="uint"/>
+ <field name="Swizzle R" size="3" start="108" type="uint"/>
+ <field name="Extended" size="1" start="107" type="bool"/>
+
+ <field name="Texture type" size="7" start="100" type="uint"/>
+ <field name="Image Depth" size="14" start="86" type="uint"/>
+ <field name="Image Height" size="14" start="72" type="uint"/>
+ <field name="Image Width" size="14" start="58" type="uint"/>
+
+ <!-- V3D 7.1.2 doesn't have the RB swap bit and has Array Stride starting
+ at bit 32. However, 7.1.5 included the RB swap bit at bit 32 and has
+ Array Stride starting at 33, which is backwards incompatible.
+ We use the definition from 7.1.5.
+ -->
+ <field name="Array Stride (64-byte aligned)" size="24" start="33" type="uint"/>
+ <field name="R/B swap" size="1" start="32" type="bool"/>
+
+ <field name="Texture base pointer" size="32" start="0" type="address"/>
+
+ <field name="Reverse" size="1" start="5" type="bool"/>
+ <field name="Transfer func" size="3" start="2" type="uint">
+ <value name="Transfer Func None" value="0"/>
+ <value name="Transfer Func sRGB" value="1"/>
+ <value name="Transfer Func PQ" value="2"/>
+ <value name="Transfer Func HLG" value="3"/>
+ <value name="Transfer Func PQ BT1886" value="4"/>
+ <value name="Transfer Func HLG BT1886" value="5"/>
+ </field>
+ <field name="Flip texture Y Axis" size="1" start="1" type="bool"/>
+ <field name="Flip texture X Axis" size="1" start="0" type="bool"/>
+ </struct>
+
<struct name="Sampler State" min_ver="41">
<field name="Border color word 3" size="32" start="160" type="uint"/>
<field name="Border color word 2" size="32" start="128" type="uint"/>
--
2.39.2

View File

@ -1,8 +1,8 @@
From 381c29e3ff5237c89380cc53eb2271d1985f4e34 Mon Sep 17 00:00:00 2001
From 7e151fd3a213848c8022c9f48e10f2aec76c3e4d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Thu, 2 Dec 2021 13:26:43 +0100
Subject: [PATCH 067/142] broadcom/compiler: update image store lowering to use
v71 new packing/conversion instructions
Subject: [PATCH 3/3] broadcom/compiler: update image store lowering to use v71
new packing/conversion instructions
Vulkan shaderdb stats with pattern dEQP-VK.image.*.with_format.*.*:
total instructions in shared programs: 35993 -> 33245 (-7.63%)
@ -31,18 +31,20 @@ Vulkan shaderdb stats with pattern dEQP-VK.image.*.with_format.*.*:
FWIW, that one HURT on the instructions count is for just one
instruction.
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
src/broadcom/compiler/nir_to_vir.c | 39 +++
src/broadcom/compiler/nir_to_vir.c | 40 +++
src/broadcom/compiler/v3d_compiler.h | 16 +-
.../compiler/v3d_nir_lower_image_load_store.c | 246 +++++++++++++++++-
.../compiler/v3d_nir_lower_image_load_store.c | 239 +++++++++++++++++-
src/broadcom/compiler/vir.c | 2 +-
4 files changed, 294 insertions(+), 9 deletions(-)
4 files changed, 288 insertions(+), 9 deletions(-)
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 90fe1d1e7f0..a8cf02dd386 100644
index 220c864a056..4329d4c85f6 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -1689,6 +1689,22 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
@@ -1688,6 +1688,22 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
result = vir_VFPACK(c, src[0], src[1]);
break;
@ -65,10 +67,10 @@ index 90fe1d1e7f0..a8cf02dd386 100644
case nir_op_unpack_half_2x16_split_x:
result = vir_FMOV(c, src[0]);
vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
@@ -1719,6 +1735,29 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
result = vir_FMOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, tmp, zero));
@@ -1698,6 +1714,30 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_H);
break;
}
+ case nir_op_vftounorm8_v3d:
+ result = vir_VFTOUNORM8(c, src[0]);
+ break;
@ -92,14 +94,15 @@ index 90fe1d1e7f0..a8cf02dd386 100644
+ case nir_op_ftosnorm16_v3d:
+ result = vir_FTOSNORM16(c, src[0]);
+ break;
+
default:
fprintf(stderr, "unknown NIR ALU inst: ");
nir_print_instr(&instr->instr, stderr);
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index 36adf8830b5..425ab0cdf9d 100644
index 095b33c03b8..5714e85d2b8 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -1186,7 +1186,7 @@ bool v3d_nir_lower_line_smooth(nir_shader *shader);
@@ -1180,7 +1180,7 @@ bool v3d_nir_lower_line_smooth(nir_shader *shader);
bool v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c);
bool v3d_nir_lower_scratch(nir_shader *s);
bool v3d_nir_lower_txf_ms(nir_shader *s);
@ -108,7 +111,7 @@ index 36adf8830b5..425ab0cdf9d 100644
bool v3d_nir_lower_load_store_bitsize(nir_shader *s);
void v3d33_vir_vpm_read_setup(struct v3d_compile *c, int num_components);
@@ -1427,6 +1427,20 @@ VIR_SFU(LOG)
@@ -1421,6 +1421,20 @@ VIR_SFU(LOG)
VIR_SFU(SIN)
VIR_SFU(RSQRT2)
@ -130,7 +133,7 @@ index 36adf8830b5..425ab0cdf9d 100644
vir_MOV_cond(struct v3d_compile *c, enum v3d_qpu_cond cond,
struct qreg dest, struct qreg src)
diff --git a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
index 2900a29817f..bbb55be4a14 100644
index 5f8363377cb..ec43f834897 100644
--- a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
+++ b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
@@ -40,6 +40,10 @@
@ -151,9 +154,9 @@ index 2900a29817f..bbb55be4a14 100644
+ *
+ * This is the generic helper, using all common nir operations.
*/
static nir_ssa_def *
pack_bits(nir_builder *b, nir_ssa_def *color, const unsigned *bits,
@@ -91,8 +97,185 @@ pack_bits(nir_builder *b, nir_ssa_def *color, const unsigned *bits,
static nir_def *
pack_bits(nir_builder *b, nir_def *color, const unsigned *bits,
@@ -91,8 +97,180 @@ pack_bits(nir_builder *b, nir_def *color, const unsigned *bits,
return nir_vec(b, results, DIV_ROUND_UP(offset, 32));
}
@ -161,46 +164,42 @@ index 2900a29817f..bbb55be4a14 100644
+ * just easier to read vfpack in the code, especially while using the PRM as
+ * reference
+ */
+static nir_ssa_def *
+nir_vfpack(nir_builder *b, nir_ssa_def *p1, nir_ssa_def *p2)
+static inline nir_def *
+nir_vfpack(nir_builder *b, nir_def *p1, nir_def *p2)
+{
+ return nir_pack_half_2x16_split(b, p1, p2);
+}
+
+static inline nir_ssa_def *
+pack_11f11f10f(nir_builder *b, nir_ssa_def *color)
+static inline nir_def *
+pack_11f11f10f(nir_builder *b, nir_def *color)
+{
+ nir_ssa_def *p1 = nir_vfpack(b, nir_channel(b, color, 0),
+ nir_def *p1 = nir_vfpack(b, nir_channel(b, color, 0),
+ nir_channel(b, color, 1));
+ /* FIXME: we noted that we could just use p2 again as the second
+ * element to pack, and CTS tests still work. Just using undef is
+ * slightly more correct
+ */
+ nir_ssa_def *undef = nir_ssa_undef(b, 1, color->bit_size);
+ nir_ssa_def *p2 = nir_vfpack(b, nir_channel(b, color, 2), undef);
+ nir_def *undef = nir_undef(b, 1, color->bit_size);
+ nir_def *p2 = nir_vfpack(b, nir_channel(b, color, 2), undef);
+
+ return nir_v11fpack_v3d(b, p1, p2);
+}
+
+static inline nir_ssa_def *
+pack_r10g10b10a2_uint(nir_builder *b, nir_ssa_def *color)
+static inline nir_def *
+pack_r10g10b10a2_uint(nir_builder *b, nir_def *color)
+{
+ nir_ssa_def *p1 = nir_vpack_v3d(b, nir_channel(b, color, 0),
+ nir_def *p1 = nir_vpack_v3d(b, nir_channel(b, color, 0),
+ nir_channel(b, color, 1));
+ nir_ssa_def *p2 = nir_vpack_v3d(b, nir_channel(b, color, 2),
+ nir_def *p2 = nir_vpack_v3d(b, nir_channel(b, color, 2),
+ nir_channel(b, color, 3));
+
+ return nir_v10pack_v3d(b, p1, p2);
+}
+
+static inline nir_ssa_def *
+pack_r10g10b10a2_unorm(nir_builder *b, nir_ssa_def *color)
+static inline nir_def *
+pack_r10g10b10a2_unorm(nir_builder *b, nir_def *color)
+{
+ nir_ssa_def *p1 = nir_vfpack(b, nir_channel(b, color, 0),
+ nir_def *p1 = nir_vfpack(b, nir_channel(b, color, 0),
+ nir_channel(b, color, 1));
+ p1 = nir_vftounorm10lo_v3d(b, p1);
+
+ nir_ssa_def *p2 = nir_vfpack(b, nir_channel(b, color, 2),
+ nir_def *p2 = nir_vfpack(b, nir_channel(b, color, 2),
+ nir_channel(b, color, 3));
+ p2 = nir_vftounorm10hi_v3d(b, p2);
+
@ -213,8 +212,8 @@ index 2900a29817f..bbb55be4a14 100644
+ TO_UNORM
+};
+
+static inline nir_ssa_def *
+pack_8bit(nir_builder *b, nir_ssa_def *color,
+static inline nir_def *
+pack_8bit(nir_builder *b, nir_def *color,
+ unsigned num_components,
+ enum hw_conversion conversion)
+{
@ -223,8 +222,8 @@ index 2900a29817f..bbb55be4a14 100644
+ * conversion. But we support also that case, and let the caller
+ * decide which method to use.
+ */
+ nir_ssa_def *p1;
+ nir_ssa_def *p2;
+ nir_def *p1;
+ nir_def *p2;
+
+ if (conversion == NONE) {
+ p1 = nir_vpack_v3d(b, nir_channel(b, color, 0),
@ -246,10 +245,9 @@ index 2900a29817f..bbb55be4a14 100644
+ nir_vftounorm8_v3d(b, p2) : nir_vftosnorm8_v3d(b, p2);
+ }
+ } else {
+ /* As mentioned on the comment before, using an undef here
+ * would be more correct. But for this case we are getting
+ * worse values, and in fact even some worse instruction count
+ * with some CTS tests, so we just reuse the first packing
+ /* Using an undef here would be more correct. But for this
+ * case we are getting worse shader-db values with some CTS
+ * tests, so we just reuse the first packing.
+ */
+ p2 = p1;
+ }
@ -257,13 +255,13 @@ index 2900a29817f..bbb55be4a14 100644
+ return nir_v8pack_v3d(b, p1, p2);
+}
+
+static inline nir_ssa_def *
+pack_16bit(nir_builder *b, nir_ssa_def *color,
+static inline nir_def *
+pack_16bit(nir_builder *b, nir_def *color,
+ unsigned num_components,
+ enum hw_conversion conversion)
+{
+ nir_ssa_def *results[2];
+ nir_ssa_def *channels[4];
+ nir_def *results[2];
+ nir_def *channels[4];
+
+ /* Note that usually you should not use this method (that relies on
+ * custom packing) if we are not doing any conversion. But we support
@ -299,8 +297,8 @@ index 2900a29817f..bbb55be4a14 100644
+ return nir_vec(b, results, DIV_ROUND_UP(num_components, 2));
+}
+
+static inline nir_ssa_def *
+pack_xbit(nir_builder *b, nir_ssa_def *color,
+static inline nir_def *
+pack_xbit(nir_builder *b, nir_def *color,
+ unsigned num_components,
+ const struct util_format_channel_description *r_chan)
+{
@ -340,7 +338,7 @@ index 2900a29817f..bbb55be4a14 100644
{
enum pipe_format format = nir_intrinsic_format(instr);
assert(format != PIPE_FORMAT_NONE);
@@ -118,9 +301,6 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
@@ -118,9 +296,6 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
*/
formatted = color;
} else {
@ -350,7 +348,7 @@ index 2900a29817f..bbb55be4a14 100644
const unsigned *bits;
switch (r_chan->size) {
@@ -171,6 +351,52 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
@@ -170,6 +345,50 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
return true;
}
@ -366,10 +364,9 @@ index 2900a29817f..bbb55be4a14 100644
+ unsigned num_components = util_format_get_nr_components(format);
+ b->cursor = nir_before_instr(&instr->instr);
+
+ nir_ssa_def *color = nir_channels(b,
+ nir_ssa_for_src(b, instr->src[3], 4),
+ (1 << num_components) - 1);
+ nir_ssa_def *formatted = NULL;
+ nir_def *color =
+ nir_trim_vector(b, instr->src[3].ssa, num_components);
+ nir_def *formatted = NULL;
+ if (format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
+ formatted = nir_format_pack_r9g9b9e5(b, color);
+ } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
@ -393,8 +390,7 @@ index 2900a29817f..bbb55be4a14 100644
+ formatted = pack_xbit(b, color, num_components, r_chan);
+ }
+
+ nir_instr_rewrite_src(&instr->instr, &instr->src[3],
+ nir_src_for_ssa(formatted));
+ nir_src_rewrite(&instr->src[3], formatted);
+ instr->num_components = formatted->num_components;
+
+ return true;
@ -403,10 +399,10 @@ index 2900a29817f..bbb55be4a14 100644
static bool
v3d_nir_lower_image_load(nir_builder *b, nir_intrinsic_instr *instr)
{
@@ -215,11 +441,17 @@ v3d_nir_lower_image_load_store_cb(nir_builder *b,
nir_intrinsic_instr *intr =
nir_instr_as_intrinsic(instr);
@@ -207,11 +426,17 @@ v3d_nir_lower_image_load_store_cb(nir_builder *b,
nir_intrinsic_instr *intr,
void *_state)
{
+ struct v3d_compile *c = (struct v3d_compile *) _state;
+
switch (intr->intrinsic) {
@ -422,23 +418,24 @@ index 2900a29817f..bbb55be4a14 100644
default:
return false;
}
@@ -228,9 +460,9 @@ v3d_nir_lower_image_load_store_cb(nir_builder *b,
@@ -220,10 +445,10 @@ v3d_nir_lower_image_load_store_cb(nir_builder *b,
}
bool
-v3d_nir_lower_image_load_store(nir_shader *s)
+v3d_nir_lower_image_load_store(nir_shader *s, struct v3d_compile *c)
{
return nir_shader_instructions_pass(s, v3d_nir_lower_image_load_store_cb,
return nir_shader_intrinsics_pass(s,
v3d_nir_lower_image_load_store_cb,
nir_metadata_block_index |
- nir_metadata_dominance, NULL);
+ nir_metadata_dominance, c);
}
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
index aea113f050e..7612eed7130 100644
index 8c536b8fbcc..acb13a6cbf9 100644
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -1576,7 +1576,7 @@ v3d_attempt_compile(struct v3d_compile *c)
@@ -1599,7 +1599,7 @@ v3d_attempt_compile(struct v3d_compile *c)
NIR_PASS(_, c->s, v3d_nir_lower_io, c);
NIR_PASS(_, c->s, v3d_nir_lower_txf_ms);

View File

@ -1,65 +0,0 @@
From 569cbe4229df737ce5915c4be2cad534707fb4f7 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Tue, 9 Nov 2021 08:50:51 +0100
Subject: [PATCH 004/142] broadcom/common: retrieve V3D revision number
The subrev field from the hub ident3 register is bumped with every
hardware revision doing backwards incompatible changes so we want to
keep track of this.
Instead of modifying the 'ver' field info to accommodate subrev info,
which would require a lot of changes, simply add a new 'rev' field in
devinfo that we can use when we need to make changes based on the
revision number of a hardware release.
---
src/broadcom/common/v3d_device_info.c | 14 +++++++++++++-
src/broadcom/common/v3d_device_info.h | 3 +++
2 files changed, 16 insertions(+), 1 deletion(-)
diff --git a/src/broadcom/common/v3d_device_info.c b/src/broadcom/common/v3d_device_info.c
index 7e0862f1f02..7512fe3a06b 100644
--- a/src/broadcom/common/v3d_device_info.c
+++ b/src/broadcom/common/v3d_device_info.c
@@ -36,6 +36,9 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i
struct drm_v3d_get_param ident1 = {
.param = DRM_V3D_PARAM_V3D_CORE0_IDENT1,
};
+ struct drm_v3d_get_param hub_ident3 = {
+ .param = DRM_V3D_PARAM_V3D_HUB_IDENT3,
+ };
int ret;
ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &ident0);
@@ -76,5 +79,14 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i
return false;
}
- return true;
+ ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &hub_ident3);
+ if (ret != 0) {
+ fprintf(stderr, "Couldn't get V3D core HUB IDENT3: %s\n",
+ strerror(errno));
+ return false;
+ }
+
+ devinfo->rev = (hub_ident3.value >> 8) & 0xff;
+
+ return true;
}
diff --git a/src/broadcom/common/v3d_device_info.h b/src/broadcom/common/v3d_device_info.h
index 97abd9b8d9f..32cb65cf81f 100644
--- a/src/broadcom/common/v3d_device_info.h
+++ b/src/broadcom/common/v3d_device_info.h
@@ -34,6 +34,9 @@ struct v3d_device_info {
/** Simple V3D version: major * 10 + minor */
uint8_t ver;
+ /** V3D revision number */
+ uint8_t rev;
+
/** Size of the VPM, in bytes. */
int vpm_size;
--
2.39.2

View File

@ -1,91 +0,0 @@
From c260843c882d25bd31e308566b45d4517fda0fa2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Wed, 17 Nov 2021 14:40:47 +0100
Subject: [PATCH 005/142] broadcom/common: add some common v71 helpers
---
src/broadcom/common/v3d_util.c | 27 +++++++++++++++++++++++++++
src/broadcom/common/v3d_util.h | 27 +++++++++++++++++++++++++++
2 files changed, 54 insertions(+)
diff --git a/src/broadcom/common/v3d_util.c b/src/broadcom/common/v3d_util.c
index 57872a923d3..26f5c6b336f 100644
--- a/src/broadcom/common/v3d_util.c
+++ b/src/broadcom/common/v3d_util.c
@@ -170,3 +170,30 @@ v3d_hw_prim_type(enum mesa_prim prim_type)
unreachable("Unsupported primitive type");
}
}
+
+uint32_t
+v3d_internal_bpp_words(uint32_t internal_bpp)
+{
+ switch (internal_bpp) {
+ case 0 /* V3D_INTERNAL_BPP_32 */:
+ return 1;
+ case 1 /* V3D_INTERNAL_BPP_64 */:
+ return 2;
+ case 2 /* V3D_INTERNAL_BPP_128 */:
+ return 4;
+ default:
+ unreachable("Unsupported internal BPP");
+ }
+}
+
+uint32_t
+v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width,
+ uint32_t bpp)
+{
+ /* stride in multiples of 128 bits, and covers 2 rows. This is the
+ * reason we divide by 2 instead of 4, as we divide number of 32-bit
+ * words per row by 2.
+ */
+
+ return (tile_width * bpp) / 2;
+}
diff --git a/src/broadcom/common/v3d_util.h b/src/broadcom/common/v3d_util.h
index eb802b77f67..864fc949ffa 100644
--- a/src/broadcom/common/v3d_util.h
+++ b/src/broadcom/common/v3d_util.h
@@ -24,6 +24,7 @@
#ifndef V3D_UTIL_H
#define V3D_UTIL_H
+#include "util/macros.h"
#include "common/v3d_device_info.h"
#include "pipe/p_defines.h"
@@ -46,4 +47,30 @@ v3d_translate_pipe_swizzle(enum pipe_swizzle swizzle);
uint32_t
v3d_hw_prim_type(enum mesa_prim prim_type);
+uint32_t
+v3d_internal_bpp_words(uint32_t internal_bpp);
+
+/* Some configuration packets want the size on log2, but starting at 0 for
+ * size 8.
+ */
+static inline uint8_t
+log2_tile_size(uint32_t size)
+{
+ switch(size) {
+ case 8:
+ return 0;
+ case 16:
+ return 1;
+ case 32:
+ return 2;
+ case 64:
+ return 3;
+ default:
+ unreachable("Unsupported tile width/height");
+ }
+}
+
+uint32_t
+v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width,
+ uint32_t bpp);
#endif
--
2.39.2

View File

@ -1,53 +0,0 @@
From a5211a4d71acc53183d2a90eb1694d8cce6eb44f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Thu, 5 Aug 2021 01:03:11 +0200
Subject: [PATCH 006/142] broadcom/qpu: add comments on waddr not used on V3D
7.x
---
src/broadcom/qpu/qpu_instr.h | 22 +++++++++++-----------
1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
index 2e133472698..45a0cad9760 100644
--- a/src/broadcom/qpu/qpu_instr.h
+++ b/src/broadcom/qpu/qpu_instr.h
@@ -88,11 +88,11 @@ enum v3d_qpu_uf {
};
enum v3d_qpu_waddr {
- V3D_QPU_WADDR_R0 = 0,
- V3D_QPU_WADDR_R1 = 1,
- V3D_QPU_WADDR_R2 = 2,
- V3D_QPU_WADDR_R3 = 3,
- V3D_QPU_WADDR_R4 = 4,
+ V3D_QPU_WADDR_R0 = 0, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_R1 = 1, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_R2 = 2, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_R3 = 3, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_R4 = 4, /* Reserved on V3D 7.x */
V3D_QPU_WADDR_R5 = 5,
V3D_QPU_WADDR_NOP = 6,
V3D_QPU_WADDR_TLB = 7,
@@ -108,12 +108,12 @@ enum v3d_qpu_waddr {
V3D_QPU_WADDR_SYNC = 16,
V3D_QPU_WADDR_SYNCU = 17,
V3D_QPU_WADDR_SYNCB = 18,
- V3D_QPU_WADDR_RECIP = 19,
- V3D_QPU_WADDR_RSQRT = 20,
- V3D_QPU_WADDR_EXP = 21,
- V3D_QPU_WADDR_LOG = 22,
- V3D_QPU_WADDR_SIN = 23,
- V3D_QPU_WADDR_RSQRT2 = 24,
+ V3D_QPU_WADDR_RECIP = 19, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_RSQRT = 20, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_EXP = 21, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_LOG = 22, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_SIN = 23, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_RSQRT2 = 24, /* Reserved on V3D 7.x */
V3D_QPU_WADDR_TMUC = 32,
V3D_QPU_WADDR_TMUS = 33,
V3D_QPU_WADDR_TMUT = 34,
--
2.39.2

View File

@ -1,60 +0,0 @@
From 0ccf3043e4a584e5592bb7fad737d5d98ed23db0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Thu, 5 Aug 2021 01:00:47 +0200
Subject: [PATCH 007/142] broadcom/qpu: set V3D 7.x names for some waddr
aliasing
V3D 7.x got rid of the accumulator, but still uses the values for
WADDR_R5 and WADDR_R5REP, so let's return a proper name and add some
aliases.
---
src/broadcom/qpu/qpu_instr.c | 8 ++++++++
src/broadcom/qpu/qpu_instr.h | 6 ++++--
2 files changed, 12 insertions(+), 2 deletions(-)
diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
index 60dabf74e8e..7759fb0efdf 100644
--- a/src/broadcom/qpu/qpu_instr.c
+++ b/src/broadcom/qpu/qpu_instr.c
@@ -35,6 +35,14 @@ v3d_qpu_magic_waddr_name(const struct v3d_device_info *devinfo,
if (devinfo->ver < 40 && waddr == V3D_QPU_WADDR_TMU)
return "tmu";
+ /* V3D 7.x QUAD and REP alias R5 and R5REP in the table below
+ */
+ if (devinfo->ver >= 71 && waddr == V3D_QPU_WADDR_QUAD)
+ return "quad";
+
+ if (devinfo->ver >= 71 && waddr == V3D_QPU_WADDR_REP)
+ return "rep";
+
static const char *waddr_magic[] = {
[V3D_QPU_WADDR_R0] = "r0",
[V3D_QPU_WADDR_R1] = "r1",
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
index 45a0cad9760..19bf721dbe1 100644
--- a/src/broadcom/qpu/qpu_instr.h
+++ b/src/broadcom/qpu/qpu_instr.h
@@ -93,7 +93,8 @@ enum v3d_qpu_waddr {
V3D_QPU_WADDR_R2 = 2, /* Reserved on V3D 7.x */
V3D_QPU_WADDR_R3 = 3, /* Reserved on V3D 7.x */
V3D_QPU_WADDR_R4 = 4, /* Reserved on V3D 7.x */
- V3D_QPU_WADDR_R5 = 5,
+ V3D_QPU_WADDR_R5 = 5, /* V3D 4.x */
+ V3D_QPU_WADDR_QUAD = 5, /* V3D 7.x */
V3D_QPU_WADDR_NOP = 6,
V3D_QPU_WADDR_TLB = 7,
V3D_QPU_WADDR_TLBU = 8,
@@ -129,7 +130,8 @@ enum v3d_qpu_waddr {
V3D_QPU_WADDR_TMUHSCM = 44,
V3D_QPU_WADDR_TMUHSF = 45,
V3D_QPU_WADDR_TMUHSLOD = 46,
- V3D_QPU_WADDR_R5REP = 55,
+ V3D_QPU_WADDR_R5REP = 55, /* V3D 4.x */
+ V3D_QPU_WADDR_REP = 55, /* V3D 7.x */
};
struct v3d_qpu_flags {
--
2.39.2

View File

@ -1,241 +0,0 @@
From 18de3cc85cf8bbe294e044f7a12abe14e554de0a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Sun, 19 Sep 2021 03:20:18 +0200
Subject: [PATCH 008/142] broadcom/compiler: rename small_imm to small_imm_b
Current small_imm is associated with the "B" read address.
We do this change in advance for v71 support, where we will have 4
different small_imm (a/b/c/d), so we start with a renaming.
---
src/broadcom/compiler/qpu_schedule.c | 22 +++++++++----------
.../compiler/vir_opt_small_immediates.c | 4 ++--
src/broadcom/compiler/vir_to_qpu.c | 2 +-
src/broadcom/qpu/qpu_disasm.c | 2 +-
src/broadcom/qpu/qpu_instr.h | 2 +-
src/broadcom/qpu/qpu_pack.c | 22 +++++++++----------
6 files changed, 27 insertions(+), 27 deletions(-)
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index 3b32b48f86f..a10fa03ed10 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -160,7 +160,7 @@ process_mux_deps(struct schedule_state *state, struct schedule_node *n,
add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
break;
case V3D_QPU_MUX_B:
- if (!n->inst->qpu.sig.small_imm) {
+ if (!n->inst->qpu.sig.small_imm_b) {
add_read_dep(state,
state->last_rf[n->inst->qpu.raddr_b], n);
}
@@ -615,7 +615,7 @@ qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst,
return true;
if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
- !inst->sig.small_imm && (inst->raddr_b == waddr))
+ !inst->sig.small_imm_b && (inst->raddr_b == waddr))
return true;
return false;
@@ -790,11 +790,11 @@ qpu_raddrs_used(const struct v3d_qpu_instr *a,
uint64_t raddrs_used = 0;
if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A))
raddrs_used |= (1ll << a->raddr_a);
- if (!a->sig.small_imm && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B))
+ if (!a->sig.small_imm_b && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B))
raddrs_used |= (1ll << a->raddr_b);
if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A))
raddrs_used |= (1ll << b->raddr_a);
- if (!b->sig.small_imm && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B))
+ if (!b->sig.small_imm_b && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B))
raddrs_used |= (1ll << b->raddr_b);
return raddrs_used;
@@ -816,16 +816,16 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result,
if (naddrs > 2)
return false;
- if ((add_instr->sig.small_imm || mul_instr->sig.small_imm)) {
+ if ((add_instr->sig.small_imm_b || mul_instr->sig.small_imm_b)) {
if (naddrs > 1)
return false;
- if (add_instr->sig.small_imm && mul_instr->sig.small_imm)
+ if (add_instr->sig.small_imm_b && mul_instr->sig.small_imm_b)
if (add_instr->raddr_b != mul_instr->raddr_b)
return false;
- result->sig.small_imm = true;
- result->raddr_b = add_instr->sig.small_imm ?
+ result->sig.small_imm_b = true;
+ result->raddr_b = add_instr->sig.small_imm_b ?
add_instr->raddr_b : mul_instr->raddr_b;
}
@@ -836,7 +836,7 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result,
raddrs_used &= ~(1ll << raddr_a);
result->raddr_a = raddr_a;
- if (!result->sig.small_imm) {
+ if (!result->sig.small_imm_b) {
if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) &&
raddr_a == add_instr->raddr_b) {
if (add_instr->alu.add.a == V3D_QPU_MUX_B)
@@ -1025,7 +1025,7 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
merge.sig.ldtmu |= b->sig.ldtmu;
merge.sig.ldvary |= b->sig.ldvary;
merge.sig.ldvpm |= b->sig.ldvpm;
- merge.sig.small_imm |= b->sig.small_imm;
+ merge.sig.small_imm_b |= b->sig.small_imm_b;
merge.sig.ldtlb |= b->sig.ldtlb;
merge.sig.ldtlbu |= b->sig.ldtlbu;
merge.sig.ucb |= b->sig.ucb;
@@ -1614,7 +1614,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
return false;
if (inst->raddr_b < 3 &&
- !inst->sig.small_imm &&
+ !inst->sig.small_imm_b &&
v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) {
return false;
}
diff --git a/src/broadcom/compiler/vir_opt_small_immediates.c b/src/broadcom/compiler/vir_opt_small_immediates.c
index 47d7722968d..df0d6c36c9b 100644
--- a/src/broadcom/compiler/vir_opt_small_immediates.c
+++ b/src/broadcom/compiler/vir_opt_small_immediates.c
@@ -80,7 +80,7 @@ vir_opt_small_immediates(struct v3d_compile *c)
*/
struct v3d_qpu_sig new_sig = inst->qpu.sig;
uint32_t sig_packed;
- new_sig.small_imm = true;
+ new_sig.small_imm_b = true;
if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig_packed))
continue;
@@ -89,7 +89,7 @@ vir_opt_small_immediates(struct v3d_compile *c)
vir_dump_inst(c, inst);
fprintf(stderr, "\n");
}
- inst->qpu.sig.small_imm = true;
+ inst->qpu.sig.small_imm_b = true;
inst->qpu.raddr_b = packed;
inst->src[i].file = QFILE_SMALL_IMM;
diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c
index 45e6bfa1470..15c2e3674c2 100644
--- a/src/broadcom/compiler/vir_to_qpu.c
+++ b/src/broadcom/compiler/vir_to_qpu.c
@@ -94,7 +94,7 @@ static void
set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
{
if (src.smimm) {
- assert(instr->sig.small_imm);
+ assert(instr->sig.small_imm_b);
*mux = V3D_QPU_MUX_B;
return;
}
diff --git a/src/broadcom/qpu/qpu_disasm.c b/src/broadcom/qpu/qpu_disasm.c
index 28fb2357b97..6aca3c28e78 100644
--- a/src/broadcom/qpu/qpu_disasm.c
+++ b/src/broadcom/qpu/qpu_disasm.c
@@ -62,7 +62,7 @@ v3d_qpu_disasm_raddr(struct disasm_state *disasm,
if (mux == V3D_QPU_MUX_A) {
append(disasm, "rf%d", instr->raddr_a);
} else if (mux == V3D_QPU_MUX_B) {
- if (instr->sig.small_imm) {
+ if (instr->sig.small_imm_b) {
uint32_t val;
ASSERTED bool ok =
v3d_qpu_small_imm_unpack(disasm->devinfo,
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
index 19bf721dbe1..9cd831863b4 100644
--- a/src/broadcom/qpu/qpu_instr.h
+++ b/src/broadcom/qpu/qpu_instr.h
@@ -50,7 +50,7 @@ struct v3d_qpu_sig {
bool ldvpm:1;
bool ldtlb:1;
bool ldtlbu:1;
- bool small_imm:1;
+ bool small_imm_b:1;
bool ucb:1;
bool rotate:1;
bool wrtmuc:1;
diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
index a875683c6f8..beac591d3c1 100644
--- a/src/broadcom/qpu/qpu_pack.c
+++ b/src/broadcom/qpu/qpu_pack.c
@@ -112,7 +112,7 @@
#define LDTMU .ldtmu = true
#define LDVARY .ldvary = true
#define LDVPM .ldvpm = true
-#define SMIMM .small_imm = true
+#define SMIMM_B .small_imm_b = true
#define LDTLB .ldtlb = true
#define LDTLBU .ldtlbu = true
#define UCB .ucb = true
@@ -135,8 +135,8 @@ static const struct v3d_qpu_sig v33_sig_map[] = {
[11] = { THRSW, LDVARY, LDUNIF },
[12] = { LDVARY, LDTMU, },
[13] = { THRSW, LDVARY, LDTMU, },
- [14] = { SMIMM, LDVARY, },
- [15] = { SMIMM, },
+ [14] = { SMIMM_B, LDVARY, },
+ [15] = { SMIMM_B, },
[16] = { LDTLB, },
[17] = { LDTLBU, },
/* 18-21 reserved */
@@ -148,8 +148,8 @@ static const struct v3d_qpu_sig v33_sig_map[] = {
[27] = { THRSW, LDVPM, LDUNIF },
[28] = { LDVPM, LDTMU, },
[29] = { THRSW, LDVPM, LDTMU, },
- [30] = { SMIMM, LDVPM, },
- [31] = { SMIMM, },
+ [30] = { SMIMM_B, LDVPM, },
+ [31] = { SMIMM_B, },
};
static const struct v3d_qpu_sig v40_sig_map[] = {
@@ -167,8 +167,8 @@ static const struct v3d_qpu_sig v40_sig_map[] = {
[10] = { LDVARY, LDUNIF },
[11] = { THRSW, LDVARY, LDUNIF },
/* 12-13 reserved */
- [14] = { SMIMM, LDVARY, },
- [15] = { SMIMM, },
+ [14] = { SMIMM_B, LDVARY, },
+ [15] = { SMIMM_B, },
[16] = { LDTLB, },
[17] = { LDTLBU, },
[18] = { WRTMUC },
@@ -178,7 +178,7 @@ static const struct v3d_qpu_sig v40_sig_map[] = {
[22] = { UCB, },
[23] = { ROT, },
/* 24-30 reserved */
- [31] = { SMIMM, LDTMU, },
+ [31] = { SMIMM_B, LDTMU, },
};
static const struct v3d_qpu_sig v41_sig_map[] = {
@@ -197,8 +197,8 @@ static const struct v3d_qpu_sig v41_sig_map[] = {
[11] = { THRSW, LDVARY, LDUNIF },
[12] = { LDUNIFRF },
[13] = { THRSW, LDUNIFRF },
- [14] = { SMIMM, LDVARY, },
- [15] = { SMIMM, },
+ [14] = { SMIMM_B, LDVARY },
+ [15] = { SMIMM_B, },
[16] = { LDTLB, },
[17] = { LDTLBU, },
[18] = { WRTMUC },
@@ -210,7 +210,7 @@ static const struct v3d_qpu_sig v41_sig_map[] = {
[24] = { LDUNIFA},
[25] = { LDUNIFARF },
/* 26-30 reserved */
- [31] = { SMIMM, LDTMU, },
+ [31] = { SMIMM_B, LDTMU, },
};
bool
--
2.39.2

View File

@ -1,53 +0,0 @@
From 0e87405fe73694c173b7ce14c3d60611f241922c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Thu, 5 Aug 2021 00:50:12 +0200
Subject: [PATCH 009/142] broadcom/compiler: add small_imm a/c/d on v3d_qpu_sig
small_imm_a, small_imm_c and small_imm_d added on top of the already
existing small_imm_b, as V3D 7.1 defines 4 small immediates, tied to
the 4 raddr. Note that this only adds the definitions, plus an inst
validation rule to check that they are not used before v71. Any real use is
still pending.
---
src/broadcom/compiler/qpu_validate.c | 5 +++++
src/broadcom/qpu/qpu_instr.h | 5 ++++-
2 files changed, 9 insertions(+), 1 deletion(-)
diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c
index 2cc7a0eb0ae..12788692432 100644
--- a/src/broadcom/compiler/qpu_validate.c
+++ b/src/broadcom/compiler/qpu_validate.c
@@ -115,6 +115,11 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
return;
+ if (devinfo->ver < 71) {
+ if (inst->sig.small_imm_a || inst->sig.small_imm_c || inst->sig.small_imm_d)
+ fail_instr(state, "small imm a/c/d added after V3D 7.1");
+ }
+
/* LDVARY writes r5 two instructions later and LDUNIF writes
* r5 one instruction later, which is illegal to have
* together.
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
index 9cd831863b4..13b3f37d43f 100644
--- a/src/broadcom/qpu/qpu_instr.h
+++ b/src/broadcom/qpu/qpu_instr.h
@@ -50,10 +50,13 @@ struct v3d_qpu_sig {
bool ldvpm:1;
bool ldtlb:1;
bool ldtlbu:1;
- bool small_imm_b:1;
bool ucb:1;
bool rotate:1;
bool wrtmuc:1;
+ bool small_imm_a:1; /* raddr_a (add a), since V3D 7.x */
+ bool small_imm_b:1; /* raddr_b (add b) */
+ bool small_imm_c:1; /* raddr_c (mul a), since V3D 7.x */
+ bool small_imm_d:1; /* raddr_d (mul b), since V3D 7.x */
};
enum v3d_qpu_cond {
--
2.39.2

View File

@ -1,106 +0,0 @@
From eca19c911d9af3b0ab3b563ea65dc455e3d27987 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Wed, 4 Aug 2021 01:11:16 +0200
Subject: [PATCH 010/142] broadcom/qpu: add v71 signal map
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Compared with v41, the differences are:
* 14, 15, 30 and 31 are now about immediate a, b, c, d respectively
* 23 is now reserved. On v42 this was for rotate signals, that are
gone on v71.
Signed-off-by: Alejandro Piñeiro <apinheiro@igalia.com>
Signed-off-by: Iago Toral Quiroga <itoral@igalia.com>
---
src/broadcom/qpu/qpu_pack.c | 47 ++++++++++++++++++++++++++++++++++---
1 file changed, 44 insertions(+), 3 deletions(-)
diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
index beac591d3c1..2820d9d4c56 100644
--- a/src/broadcom/qpu/qpu_pack.c
+++ b/src/broadcom/qpu/qpu_pack.c
@@ -112,12 +112,15 @@
#define LDTMU .ldtmu = true
#define LDVARY .ldvary = true
#define LDVPM .ldvpm = true
-#define SMIMM_B .small_imm_b = true
#define LDTLB .ldtlb = true
#define LDTLBU .ldtlbu = true
#define UCB .ucb = true
#define ROT .rotate = true
#define WRTMUC .wrtmuc = true
+#define SMIMM_A .small_imm_a = true
+#define SMIMM_B .small_imm_b = true
+#define SMIMM_C .small_imm_c = true
+#define SMIMM_D .small_imm_d = true
static const struct v3d_qpu_sig v33_sig_map[] = {
/* MISC R3 R4 R5 */
@@ -213,6 +216,40 @@ static const struct v3d_qpu_sig v41_sig_map[] = {
[31] = { SMIMM_B, LDTMU, },
};
+
+static const struct v3d_qpu_sig v71_sig_map[] = {
+ /* MISC phys RF0 */
+ [0] = { },
+ [1] = { THRSW, },
+ [2] = { LDUNIF },
+ [3] = { THRSW, LDUNIF },
+ [4] = { LDTMU, },
+ [5] = { THRSW, LDTMU, },
+ [6] = { LDTMU, LDUNIF },
+ [7] = { THRSW, LDTMU, LDUNIF },
+ [8] = { LDVARY, },
+ [9] = { THRSW, LDVARY, },
+ [10] = { LDVARY, LDUNIF },
+ [11] = { THRSW, LDVARY, LDUNIF },
+ [12] = { LDUNIFRF },
+ [13] = { THRSW, LDUNIFRF },
+ [14] = { SMIMM_A, },
+ [15] = { SMIMM_B, },
+ [16] = { LDTLB, },
+ [17] = { LDTLBU, },
+ [18] = { WRTMUC },
+ [19] = { THRSW, WRTMUC },
+ [20] = { LDVARY, WRTMUC },
+ [21] = { THRSW, LDVARY, WRTMUC },
+ [22] = { UCB, },
+ /* 23 reserved */
+ [24] = { LDUNIFA},
+ [25] = { LDUNIFARF },
+ /* 26-29 reserved */
+ [30] = { SMIMM_C, },
+ [31] = { SMIMM_D, },
+};
+
bool
v3d_qpu_sig_unpack(const struct v3d_device_info *devinfo,
uint32_t packed_sig,
@@ -221,7 +258,9 @@ v3d_qpu_sig_unpack(const struct v3d_device_info *devinfo,
if (packed_sig >= ARRAY_SIZE(v33_sig_map))
return false;
- if (devinfo->ver >= 41)
+ if (devinfo->ver >= 71)
+ *sig = v71_sig_map[packed_sig];
+ else if (devinfo->ver >= 41)
*sig = v41_sig_map[packed_sig];
else if (devinfo->ver == 40)
*sig = v40_sig_map[packed_sig];
@@ -240,7 +279,9 @@ v3d_qpu_sig_pack(const struct v3d_device_info *devinfo,
{
static const struct v3d_qpu_sig *map;
- if (devinfo->ver >= 41)
+ if (devinfo->ver >= 71)
+ map = v71_sig_map;
+ else if (devinfo->ver >= 41)
map = v41_sig_map;
else if (devinfo->ver == 40)
map = v40_sig_map;
--
2.39.2

View File

@ -1,778 +0,0 @@
From d10e67a396d713ec81fb133f3516e09fe1e067b6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Fri, 6 Aug 2021 01:22:31 +0200
Subject: [PATCH 011/142] broadcom/qpu: define v3d_qpu_input, use on
v3d_qpu_alu_instr
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
At this point it just tidies up the alu_instr structure a little.
But also serves to prepare the structure for new changes, as 7.x uses
raddr instead of mux, and it is just easier to add the raddr to the
new input structure.
Signed-off-by: Alejandro Piñeiro <apinheiro@igalia.com>
Signed-off-by: Iago Toral Quiroga <itoral@igalia.com>
---
src/broadcom/compiler/qpu_schedule.c | 65 +++++++--------
src/broadcom/compiler/vir.c | 16 ++--
src/broadcom/compiler/vir_dump.c | 8 +-
.../compiler/vir_opt_copy_propagate.c | 12 +--
.../compiler/vir_opt_redundant_flags.c | 8 +-
src/broadcom/compiler/vir_to_qpu.c | 30 +++----
src/broadcom/qpu/qpu_disasm.c | 16 ++--
src/broadcom/qpu/qpu_instr.c | 8 +-
src/broadcom/qpu/qpu_instr.h | 13 +--
src/broadcom/qpu/qpu_pack.c | 82 +++++++++----------
src/broadcom/qpu/tests/qpu_disasm.c | 8 +-
11 files changed, 134 insertions(+), 132 deletions(-)
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index a10fa03ed10..455fa3867be 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -306,14 +306,14 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
/* XXX: LOAD_IMM */
if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0)
- process_mux_deps(state, n, inst->alu.add.a);
+ process_mux_deps(state, n, inst->alu.add.a.mux);
if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1)
- process_mux_deps(state, n, inst->alu.add.b);
+ process_mux_deps(state, n, inst->alu.add.b.mux);
if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0)
- process_mux_deps(state, n, inst->alu.mul.a);
+ process_mux_deps(state, n, inst->alu.mul.a.mux);
if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1)
- process_mux_deps(state, n, inst->alu.mul.b);
+ process_mux_deps(state, n, inst->alu.mul.b.mux);
switch (inst->alu.add.op) {
case V3D_QPU_A_VPMSETUP:
@@ -537,22 +537,22 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
if (inst->alu.add.op != V3D_QPU_A_NOP) {
if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) {
+ mux_reads_too_soon(scoreboard, inst, inst->alu.add.a.mux)) {
return true;
}
if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) {
+ mux_reads_too_soon(scoreboard, inst, inst->alu.add.b.mux)) {
return true;
}
}
if (inst->alu.mul.op != V3D_QPU_M_NOP) {
if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) {
+ mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a.mux)) {
return true;
}
if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) {
+ mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b.mux)) {
return true;
}
}
@@ -839,20 +839,20 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result,
if (!result->sig.small_imm_b) {
if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) &&
raddr_a == add_instr->raddr_b) {
- if (add_instr->alu.add.a == V3D_QPU_MUX_B)
- result->alu.add.a = V3D_QPU_MUX_A;
- if (add_instr->alu.add.b == V3D_QPU_MUX_B &&
+ if (add_instr->alu.add.a.mux == V3D_QPU_MUX_B)
+ result->alu.add.a.mux = V3D_QPU_MUX_A;
+ if (add_instr->alu.add.b.mux == V3D_QPU_MUX_B &&
v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
- result->alu.add.b = V3D_QPU_MUX_A;
+ result->alu.add.b.mux = V3D_QPU_MUX_A;
}
}
if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) &&
raddr_a == mul_instr->raddr_b) {
- if (mul_instr->alu.mul.a == V3D_QPU_MUX_B)
- result->alu.mul.a = V3D_QPU_MUX_A;
- if (mul_instr->alu.mul.b == V3D_QPU_MUX_B &&
+ if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_B)
+ result->alu.mul.a.mux = V3D_QPU_MUX_A;
+ if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_B &&
v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
- result->alu.mul.b = V3D_QPU_MUX_A;
+ result->alu.mul.b.mux = V3D_QPU_MUX_A;
}
}
}
@@ -863,20 +863,20 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result,
result->raddr_b = raddr_b;
if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) &&
raddr_b == add_instr->raddr_a) {
- if (add_instr->alu.add.a == V3D_QPU_MUX_A)
- result->alu.add.a = V3D_QPU_MUX_B;
- if (add_instr->alu.add.b == V3D_QPU_MUX_A &&
+ if (add_instr->alu.add.a.mux == V3D_QPU_MUX_A)
+ result->alu.add.a.mux = V3D_QPU_MUX_B;
+ if (add_instr->alu.add.b.mux == V3D_QPU_MUX_A &&
v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
- result->alu.add.b = V3D_QPU_MUX_B;
+ result->alu.add.b.mux = V3D_QPU_MUX_B;
}
}
if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) &&
raddr_b == mul_instr->raddr_a) {
- if (mul_instr->alu.mul.a == V3D_QPU_MUX_A)
- result->alu.mul.a = V3D_QPU_MUX_B;
- if (mul_instr->alu.mul.b == V3D_QPU_MUX_A &&
+ if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_A)
+ result->alu.mul.a.mux = V3D_QPU_MUX_B;
+ if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_A &&
v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
- result->alu.mul.b = V3D_QPU_MUX_B;
+ result->alu.mul.b.mux = V3D_QPU_MUX_B;
}
}
@@ -927,11 +927,12 @@ qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
inst->flags.auf = V3D_QPU_UF_NONE;
inst->alu.mul.output_pack = inst->alu.add.output_pack;
- inst->alu.mul.a_unpack = inst->alu.add.a_unpack;
- inst->alu.mul.b_unpack = inst->alu.add.b_unpack;
+
+ inst->alu.mul.a.unpack = inst->alu.add.a.unpack;
+ inst->alu.mul.b.unpack = inst->alu.add.b.unpack;
inst->alu.add.output_pack = V3D_QPU_PACK_NONE;
- inst->alu.add.a_unpack = V3D_QPU_UNPACK_NONE;
- inst->alu.add.b_unpack = V3D_QPU_UNPACK_NONE;
+ inst->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
+ inst->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
}
static bool
@@ -2064,12 +2065,12 @@ alu_reads_register(struct v3d_qpu_instr *inst,
if (add) {
num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
- mux_a = inst->alu.add.a;
- mux_b = inst->alu.add.b;
+ mux_a = inst->alu.add.a.mux;
+ mux_b = inst->alu.add.b.mux;
} else {
num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
- mux_a = inst->alu.mul.a;
- mux_b = inst->alu.mul.b;
+ mux_a = inst->alu.mul.a.mux;
+ mux_b = inst->alu.mul.b.mux;
}
for (int i = 0; i < num_src; i++) {
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
index 660b11b0577..007cb0a941b 100644
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -113,10 +113,10 @@ vir_is_raw_mov(struct qinst *inst)
return false;
}
- if (inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE ||
- inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE ||
- inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE ||
- inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) {
+ if (inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE ||
+ inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE ||
+ inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
+ inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) {
return false;
}
@@ -209,15 +209,15 @@ vir_set_unpack(struct qinst *inst, int src,
if (vir_is_add(inst)) {
if (src == 0)
- inst->qpu.alu.add.a_unpack = unpack;
+ inst->qpu.alu.add.a.unpack = unpack;
else
- inst->qpu.alu.add.b_unpack = unpack;
+ inst->qpu.alu.add.b.unpack = unpack;
} else {
assert(vir_is_mul(inst));
if (src == 0)
- inst->qpu.alu.mul.a_unpack = unpack;
+ inst->qpu.alu.mul.a.unpack = unpack;
else
- inst->qpu.alu.mul.b_unpack = unpack;
+ inst->qpu.alu.mul.b.unpack = unpack;
}
}
diff --git a/src/broadcom/compiler/vir_dump.c b/src/broadcom/compiler/vir_dump.c
index 5c47bbdc1b0..ab5d4043039 100644
--- a/src/broadcom/compiler/vir_dump.c
+++ b/src/broadcom/compiler/vir_dump.c
@@ -270,8 +270,8 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst)
vir_print_reg(c, inst, inst->dst);
fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.add.output_pack));
- unpack[0] = instr->alu.add.a_unpack;
- unpack[1] = instr->alu.add.b_unpack;
+ unpack[0] = instr->alu.add.a.unpack;
+ unpack[1] = instr->alu.add.b.unpack;
} else {
fprintf(stderr, "%s", v3d_qpu_mul_op_name(instr->alu.mul.op));
fprintf(stderr, "%s", v3d_qpu_cond_name(instr->flags.mc));
@@ -282,8 +282,8 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst)
vir_print_reg(c, inst, inst->dst);
fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.mul.output_pack));
- unpack[0] = instr->alu.mul.a_unpack;
- unpack[1] = instr->alu.mul.b_unpack;
+ unpack[0] = instr->alu.mul.a.unpack;
+ unpack[1] = instr->alu.mul.b.unpack;
}
for (int i = 0; i < nsrc; i++) {
diff --git a/src/broadcom/compiler/vir_opt_copy_propagate.c b/src/broadcom/compiler/vir_opt_copy_propagate.c
index da121c2a5bd..c4aa7255a17 100644
--- a/src/broadcom/compiler/vir_opt_copy_propagate.c
+++ b/src/broadcom/compiler/vir_opt_copy_propagate.c
@@ -104,14 +104,14 @@ vir_has_unpack(struct qinst *inst, int chan)
if (vir_is_add(inst)) {
if (chan == 0)
- return inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE;
+ return inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE;
else
- return inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE;
+ return inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE;
} else {
if (chan == 0)
- return inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE;
+ return inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE;
else
- return inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE;
+ return inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE;
}
}
@@ -161,7 +161,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs)
continue;
/* these ops can't represent abs. */
- if (mov->qpu.alu.mul.a_unpack == V3D_QPU_UNPACK_ABS) {
+ if (mov->qpu.alu.mul.a.unpack == V3D_QPU_UNPACK_ABS) {
switch (inst->qpu.alu.add.op) {
case V3D_QPU_A_VFPACK:
case V3D_QPU_A_FROUND:
@@ -189,7 +189,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs)
inst->src[i] = mov->src[0];
if (vir_has_unpack(mov, 0)) {
- enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a_unpack;
+ enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a.unpack;
vir_set_unpack(inst, i, unpack);
}
diff --git a/src/broadcom/compiler/vir_opt_redundant_flags.c b/src/broadcom/compiler/vir_opt_redundant_flags.c
index c7896d57f2b..6b61ed6a39a 100644
--- a/src/broadcom/compiler/vir_opt_redundant_flags.c
+++ b/src/broadcom/compiler/vir_opt_redundant_flags.c
@@ -81,11 +81,11 @@ vir_instr_flags_op_equal(struct qinst *a, struct qinst *b)
a->qpu.flags.mpf != b->qpu.flags.mpf ||
a->qpu.alu.add.op != b->qpu.alu.add.op ||
a->qpu.alu.mul.op != b->qpu.alu.mul.op ||
- a->qpu.alu.add.a_unpack != b->qpu.alu.add.a_unpack ||
- a->qpu.alu.add.b_unpack != b->qpu.alu.add.b_unpack ||
+ a->qpu.alu.add.a.unpack != b->qpu.alu.add.a.unpack ||
+ a->qpu.alu.add.b.unpack != b->qpu.alu.add.b.unpack ||
a->qpu.alu.add.output_pack != b->qpu.alu.add.output_pack ||
- a->qpu.alu.mul.a_unpack != b->qpu.alu.mul.a_unpack ||
- a->qpu.alu.mul.b_unpack != b->qpu.alu.mul.b_unpack ||
+ a->qpu.alu.mul.a.unpack != b->qpu.alu.mul.a.unpack ||
+ a->qpu.alu.mul.b.unpack != b->qpu.alu.mul.b.unpack ||
a->qpu.alu.mul.output_pack != b->qpu.alu.mul.output_pack) {
return false;
}
diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c
index 15c2e3674c2..c8b6e0a91a0 100644
--- a/src/broadcom/compiler/vir_to_qpu.c
+++ b/src/broadcom/compiler/vir_to_qpu.c
@@ -106,20 +106,20 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
return;
}
- if (instr->alu.add.a != V3D_QPU_MUX_A &&
- instr->alu.add.b != V3D_QPU_MUX_A &&
- instr->alu.mul.a != V3D_QPU_MUX_A &&
- instr->alu.mul.b != V3D_QPU_MUX_A) {
+ if (instr->alu.add.a.mux != V3D_QPU_MUX_A &&
+ instr->alu.add.b.mux != V3D_QPU_MUX_A &&
+ instr->alu.mul.a.mux != V3D_QPU_MUX_A &&
+ instr->alu.mul.b.mux != V3D_QPU_MUX_A) {
instr->raddr_a = src.index;
*mux = V3D_QPU_MUX_A;
} else {
if (instr->raddr_a == src.index) {
*mux = V3D_QPU_MUX_A;
} else {
- assert(!(instr->alu.add.a == V3D_QPU_MUX_B &&
- instr->alu.add.b == V3D_QPU_MUX_B &&
- instr->alu.mul.a == V3D_QPU_MUX_B &&
- instr->alu.mul.b == V3D_QPU_MUX_B) ||
+ assert(!(instr->alu.add.a.mux == V3D_QPU_MUX_B &&
+ instr->alu.add.b.mux == V3D_QPU_MUX_B &&
+ instr->alu.mul.a.mux == V3D_QPU_MUX_B &&
+ instr->alu.mul.b.mux == V3D_QPU_MUX_B) ||
src.index == instr->raddr_b);
instr->raddr_b = src.index;
@@ -147,14 +147,14 @@ is_no_op_mov(struct qinst *qinst)
if (waddr < V3D_QPU_WADDR_R0 || waddr > V3D_QPU_WADDR_R4)
return false;
- if (qinst->qpu.alu.mul.a !=
+ if (qinst->qpu.alu.mul.a.mux !=
V3D_QPU_MUX_R0 + (waddr - V3D_QPU_WADDR_R0)) {
return false;
}
} else {
int raddr;
- switch (qinst->qpu.alu.mul.a) {
+ switch (qinst->qpu.alu.mul.a.mux) {
case V3D_QPU_MUX_A:
raddr = qinst->qpu.raddr_a;
break;
@@ -171,7 +171,7 @@ is_no_op_mov(struct qinst *qinst)
/* No packing or flags updates, or we need to execute the
* instruction.
*/
- if (qinst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE ||
+ if (qinst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
qinst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE ||
qinst->qpu.flags.mc != V3D_QPU_COND_NONE ||
qinst->qpu.flags.mpf != V3D_QPU_PF_NONE ||
@@ -302,11 +302,11 @@ v3d_generate_code_block(struct v3d_compile *c,
assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);
if (nsrc >= 1) {
set_src(&qinst->qpu,
- &qinst->qpu.alu.add.a, src[0]);
+ &qinst->qpu.alu.add.a.mux, src[0]);
}
if (nsrc >= 2) {
set_src(&qinst->qpu,
- &qinst->qpu.alu.add.b, src[1]);
+ &qinst->qpu.alu.add.b.mux, src[1]);
}
qinst->qpu.alu.add.waddr = dst.index;
@@ -314,11 +314,11 @@ v3d_generate_code_block(struct v3d_compile *c,
} else {
if (nsrc >= 1) {
set_src(&qinst->qpu,
- &qinst->qpu.alu.mul.a, src[0]);
+ &qinst->qpu.alu.mul.a.mux, src[0]);
}
if (nsrc >= 2) {
set_src(&qinst->qpu,
- &qinst->qpu.alu.mul.b, src[1]);
+ &qinst->qpu.alu.mul.b.mux, src[1]);
}
qinst->qpu.alu.mul.waddr = dst.index;
diff --git a/src/broadcom/qpu/qpu_disasm.c b/src/broadcom/qpu/qpu_disasm.c
index 6aca3c28e78..588a665f770 100644
--- a/src/broadcom/qpu/qpu_disasm.c
+++ b/src/broadcom/qpu/qpu_disasm.c
@@ -121,16 +121,16 @@ v3d_qpu_disasm_add(struct disasm_state *disasm,
if (num_src >= 1) {
if (has_dst)
append(disasm, ", ");
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.a);
+ v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.a.mux);
append(disasm, "%s",
- v3d_qpu_unpack_name(instr->alu.add.a_unpack));
+ v3d_qpu_unpack_name(instr->alu.add.a.unpack));
}
if (num_src >= 2) {
append(disasm, ", ");
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.b);
+ v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.b.mux);
append(disasm, "%s",
- v3d_qpu_unpack_name(instr->alu.add.b_unpack));
+ v3d_qpu_unpack_name(instr->alu.add.b.unpack));
}
}
@@ -164,16 +164,16 @@ v3d_qpu_disasm_mul(struct disasm_state *disasm,
if (num_src >= 1) {
if (has_dst)
append(disasm, ", ");
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.a);
+ v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.a.mux);
append(disasm, "%s",
- v3d_qpu_unpack_name(instr->alu.mul.a_unpack));
+ v3d_qpu_unpack_name(instr->alu.mul.a.unpack));
}
if (num_src >= 2) {
append(disasm, ", ");
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.b);
+ v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.b.mux);
append(disasm, "%s",
- v3d_qpu_unpack_name(instr->alu.mul.b_unpack));
+ v3d_qpu_unpack_name(instr->alu.mul.b.unpack));
}
}
diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
index 7759fb0efdf..7ece8b5e570 100644
--- a/src/broadcom/qpu/qpu_instr.c
+++ b/src/broadcom/qpu/qpu_instr.c
@@ -926,10 +926,10 @@ v3d_qpu_uses_mux(const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
int add_nsrc = v3d_qpu_add_op_num_src(inst->alu.add.op);
int mul_nsrc = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
- return ((add_nsrc > 0 && inst->alu.add.a == mux) ||
- (add_nsrc > 1 && inst->alu.add.b == mux) ||
- (mul_nsrc > 0 && inst->alu.mul.a == mux) ||
- (mul_nsrc > 1 && inst->alu.mul.b == mux));
+ return ((add_nsrc > 0 && inst->alu.add.a.mux == mux) ||
+ (add_nsrc > 1 && inst->alu.add.b.mux == mux) ||
+ (mul_nsrc > 0 && inst->alu.mul.a.mux == mux) ||
+ (mul_nsrc > 1 && inst->alu.mul.b.mux == mux));
}
bool
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
index 13b3f37d43f..53a51bfb3e1 100644
--- a/src/broadcom/qpu/qpu_instr.h
+++ b/src/broadcom/qpu/qpu_instr.h
@@ -294,25 +294,26 @@ enum v3d_qpu_mux {
V3D_QPU_MUX_B,
};
+struct v3d_qpu_input {
+ enum v3d_qpu_mux mux;
+ enum v3d_qpu_input_unpack unpack;
+};
+
struct v3d_qpu_alu_instr {
struct {
enum v3d_qpu_add_op op;
- enum v3d_qpu_mux a, b;
+ struct v3d_qpu_input a, b;
uint8_t waddr;
bool magic_write;
enum v3d_qpu_output_pack output_pack;
- enum v3d_qpu_input_unpack a_unpack;
- enum v3d_qpu_input_unpack b_unpack;
} add;
struct {
enum v3d_qpu_mul_op op;
- enum v3d_qpu_mux a, b;
+ struct v3d_qpu_input a, b;
uint8_t waddr;
bool magic_write;
enum v3d_qpu_output_pack output_pack;
- enum v3d_qpu_input_unpack a_unpack;
- enum v3d_qpu_input_unpack b_unpack;
} mul;
};
diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
index 2820d9d4c56..6e975793fc0 100644
--- a/src/broadcom/qpu/qpu_pack.c
+++ b/src/broadcom/qpu/qpu_pack.c
@@ -853,12 +853,12 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
- &instr->alu.add.a_unpack)) {
+ &instr->alu.add.a.unpack)) {
return false;
}
if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3,
- &instr->alu.add.b_unpack)) {
+ &instr->alu.add.b.unpack)) {
return false;
}
break;
@@ -872,7 +872,7 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
instr->alu.add.output_pack = mux_b & 0x3;
if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
- &instr->alu.add.a_unpack)) {
+ &instr->alu.add.a.unpack)) {
return false;
}
break;
@@ -884,7 +884,7 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
- &instr->alu.add.a_unpack)) {
+ &instr->alu.add.a.unpack)) {
return false;
}
break;
@@ -892,23 +892,23 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
case V3D_QPU_A_VFMIN:
case V3D_QPU_A_VFMAX:
if (!v3d_qpu_float16_unpack_unpack(op & 0x7,
- &instr->alu.add.a_unpack)) {
+ &instr->alu.add.a.unpack)) {
return false;
}
instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
- instr->alu.add.b_unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
break;
default:
instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
- instr->alu.add.a_unpack = V3D_QPU_UNPACK_NONE;
- instr->alu.add.b_unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
break;
}
- instr->alu.add.a = mux_a;
- instr->alu.add.b = mux_b;
+ instr->alu.add.a.mux = mux_a;
+ instr->alu.add.b.mux = mux_b;
instr->alu.add.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A);
instr->alu.add.magic_write = false;
@@ -956,12 +956,12 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
instr->alu.mul.output_pack = ((op >> 4) & 0x3) - 1;
if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
- &instr->alu.mul.a_unpack)) {
+ &instr->alu.mul.a.unpack)) {
return false;
}
if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3,
- &instr->alu.mul.b_unpack)) {
+ &instr->alu.mul.b.unpack)) {
return false;
}
@@ -972,7 +972,7 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
((mux_b >> 2) & 1));
if (!v3d_qpu_float32_unpack_unpack(mux_b & 0x3,
- &instr->alu.mul.a_unpack)) {
+ &instr->alu.mul.a.unpack)) {
return false;
}
@@ -982,23 +982,23 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
if (!v3d_qpu_float16_unpack_unpack(((op & 0x7) - 4) & 7,
- &instr->alu.mul.a_unpack)) {
+ &instr->alu.mul.a.unpack)) {
return false;
}
- instr->alu.mul.b_unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
break;
default:
instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
- instr->alu.mul.a_unpack = V3D_QPU_UNPACK_NONE;
- instr->alu.mul.b_unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
break;
}
- instr->alu.mul.a = mux_a;
- instr->alu.mul.b = mux_b;
+ instr->alu.mul.a.mux = mux_a;
+ instr->alu.mul.b.mux = mux_b;
instr->alu.mul.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_M);
instr->alu.mul.magic_write = packed_inst & V3D_QPU_MM;
@@ -1030,8 +1030,8 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
{
uint32_t waddr = instr->alu.add.waddr;
- uint32_t mux_a = instr->alu.add.a;
- uint32_t mux_b = instr->alu.add.b;
+ uint32_t mux_a = instr->alu.add.a.mux;
+ uint32_t mux_b = instr->alu.add.b.mux;
int nsrc = v3d_qpu_add_op_num_src(instr->alu.add.op);
const struct opcode_desc *desc =
lookup_opcode_from_instr(devinfo, add_ops, ARRAY_SIZE(add_ops),
@@ -1102,12 +1102,12 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
}
opcode |= output_pack << 4;
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
&a_unpack)) {
return false;
}
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack,
&b_unpack)) {
return false;
}
@@ -1141,17 +1141,17 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
uint32_t a_unpack;
uint32_t b_unpack;
- if (instr->alu.add.a_unpack == V3D_QPU_UNPACK_ABS ||
- instr->alu.add.b_unpack == V3D_QPU_UNPACK_ABS) {
+ if (instr->alu.add.a.unpack == V3D_QPU_UNPACK_ABS ||
+ instr->alu.add.b.unpack == V3D_QPU_UNPACK_ABS) {
return false;
}
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
&a_unpack)) {
return false;
}
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack,
&b_unpack)) {
return false;
}
@@ -1176,7 +1176,7 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
}
mux_b |= packed;
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
&packed)) {
return false;
}
@@ -1194,7 +1194,7 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
return false;
uint32_t packed;
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
&packed)) {
return false;
}
@@ -1207,11 +1207,11 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
case V3D_QPU_A_VFMIN:
case V3D_QPU_A_VFMAX:
if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE ||
- instr->alu.add.b_unpack != V3D_QPU_UNPACK_NONE) {
+ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE) {
return false;
}
- if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a_unpack,
+ if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a.unpack,
&packed)) {
return false;
}
@@ -1221,8 +1221,8 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
default:
if (instr->alu.add.op != V3D_QPU_A_NOP &&
(instr->alu.add.output_pack != V3D_QPU_PACK_NONE ||
- instr->alu.add.a_unpack != V3D_QPU_UNPACK_NONE ||
- instr->alu.add.b_unpack != V3D_QPU_UNPACK_NONE)) {
+ instr->alu.add.a.unpack != V3D_QPU_UNPACK_NONE ||
+ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE)) {
return false;
}
break;
@@ -1242,8 +1242,8 @@ static bool
v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
{
- uint32_t mux_a = instr->alu.mul.a;
- uint32_t mux_b = instr->alu.mul.b;
+ uint32_t mux_a = instr->alu.mul.a.mux;
+ uint32_t mux_b = instr->alu.mul.b.mux;
int nsrc = v3d_qpu_mul_op_num_src(instr->alu.mul.op);
const struct opcode_desc *desc =
@@ -1277,13 +1277,13 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
*/
opcode += packed << 4;
- if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack,
&packed)) {
return false;
}
opcode |= packed << 2;
- if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b.unpack,
&packed)) {
return false;
}
@@ -1301,7 +1301,7 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
opcode |= (packed >> 1) & 1;
mux_b = (packed & 1) << 2;
- if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack,
&packed)) {
return false;
}
@@ -1315,16 +1315,16 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE)
return false;
- if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a_unpack,
+ if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a.unpack,
&packed)) {
return false;
}
- if (instr->alu.mul.a_unpack == V3D_QPU_UNPACK_SWAP_16)
+ if (instr->alu.mul.a.unpack == V3D_QPU_UNPACK_SWAP_16)
opcode = 8;
else
opcode |= (packed + 4) & 7;
- if (instr->alu.mul.b_unpack != V3D_QPU_UNPACK_NONE)
+ if (instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)
return false;
break;
diff --git a/src/broadcom/qpu/tests/qpu_disasm.c b/src/broadcom/qpu/tests/qpu_disasm.c
index 2f8e19c73fe..be7b78d5ef0 100644
--- a/src/broadcom/qpu/tests/qpu_disasm.c
+++ b/src/broadcom/qpu/tests/qpu_disasm.c
@@ -160,10 +160,10 @@ main(int argc, char **argv)
/* Swap the operands to be sure that we test
* how the QPUs distinguish between these ops.
*/
- swap_mux(&instr.alu.add.a,
- &instr.alu.add.b);
- swap_pack(&instr.alu.add.a_unpack,
- &instr.alu.add.b_unpack);
+ swap_mux(&instr.alu.add.a.mux,
+ &instr.alu.add.b.mux);
+ swap_pack(&instr.alu.add.a.unpack,
+ &instr.alu.add.b.unpack);
break;
default:
break;
--
2.39.2

View File

@ -1,45 +0,0 @@
From 52ea09792ff8a438ccdecac47b8415657be90098 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Fri, 6 Aug 2021 01:33:32 +0200
Subject: [PATCH 012/142] broadcom/qpu: add raddr on v3d_qpu_input
On V3D 7.x muxes are not used; raddr_a/b/c/d are used instead.
This is not perfect, as for v71 the raddr_a/b fields defined in
qpu_instr become superfluous. But the alternative would be to define
two different structs, or even to have them defined based on version
ifdefs, so this is a reasonable compromise.
---
src/broadcom/qpu/qpu_instr.h | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
index 53a51bfb3e1..9e56e2d6a99 100644
--- a/src/broadcom/qpu/qpu_instr.h
+++ b/src/broadcom/qpu/qpu_instr.h
@@ -295,7 +295,10 @@ enum v3d_qpu_mux {
};
struct v3d_qpu_input {
- enum v3d_qpu_mux mux;
+ union {
+ enum v3d_qpu_mux mux; /* V3D 4.x */
+ uint8_t raddr; /* V3D 7.x */
+ };
enum v3d_qpu_input_unpack unpack;
};
@@ -385,8 +388,8 @@ struct v3d_qpu_instr {
struct v3d_qpu_sig sig;
uint8_t sig_addr;
bool sig_magic; /* If the signal writes to a magic address */
- uint8_t raddr_a;
- uint8_t raddr_b;
+ uint8_t raddr_a; /* V3D 4.x */
+ uint8_t raddr_b; /* V3D 4.x*/
struct v3d_qpu_flags flags;
union {
--
2.39.2

View File

@ -1,37 +0,0 @@
From 3e5ad0881c2789619cdf65f40a44d5481e28e800 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Thu, 12 Aug 2021 02:24:02 +0200
Subject: [PATCH 013/142] broadcom/qpu: defining shift/mask for raddr_c/d
On V3D 7.x raddr_c/d replace mul_a/b and add_a/b.
---
src/broadcom/qpu/qpu_pack.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
index 6e975793fc0..4f106909729 100644
--- a/src/broadcom/qpu/qpu_pack.c
+++ b/src/broadcom/qpu/qpu_pack.c
@@ -84,6 +84,9 @@
#define V3D_QPU_MUL_A_SHIFT 18
#define V3D_QPU_MUL_A_MASK QPU_MASK(20, 18)
+#define V3D_QPU_RADDR_C_SHIFT 18
+#define V3D_QPU_RADDR_C_MASK QPU_MASK(23, 18)
+
#define V3D_QPU_ADD_B_SHIFT 15
#define V3D_QPU_ADD_B_MASK QPU_MASK(17, 15)
@@ -98,6 +101,9 @@
#define V3D_QPU_BRANCH_BDI_SHIFT 12
#define V3D_QPU_BRANCH_BDI_MASK QPU_MASK(13, 12)
+#define V3D_QPU_RADDR_D_SHIFT 12
+#define V3D_QPU_RADDR_D_MASK QPU_MASK(17, 12)
+
#define V3D_QPU_RADDR_A_SHIFT 6
#define V3D_QPU_RADDR_A_MASK QPU_MASK(11, 6)
--
2.39.2

View File

@ -1,46 +0,0 @@
From 81febf14fe05ad26e992275b911e8bc1e1416ebc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Fri, 17 Sep 2021 01:04:31 +0200
Subject: [PATCH 014/142] broadcom/common: add has_accumulators field on
v3d_device_info
Even though we could just check the version in the code, checking
this field makes several places more readable. For example, in the
register allocation code we don't assign an accumulator because we
don't have accumulators on that hw, rather than because the hw
version is a given one.
---
src/broadcom/common/v3d_device_info.c | 2 ++
src/broadcom/common/v3d_device_info.h | 3 +++
2 files changed, 5 insertions(+)
diff --git a/src/broadcom/common/v3d_device_info.c b/src/broadcom/common/v3d_device_info.c
index 7512fe3a06b..7bc2b662cfc 100644
--- a/src/broadcom/common/v3d_device_info.c
+++ b/src/broadcom/common/v3d_device_info.c
@@ -65,6 +65,8 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i
int qups = (ident1.value >> 8) & 0xf;
devinfo->qpu_count = nslc * qups;
+ devinfo->has_accumulators = devinfo->ver < 71;
+
switch (devinfo->ver) {
case 33:
case 41:
diff --git a/src/broadcom/common/v3d_device_info.h b/src/broadcom/common/v3d_device_info.h
index 32cb65cf81f..8dfc7858727 100644
--- a/src/broadcom/common/v3d_device_info.h
+++ b/src/broadcom/common/v3d_device_info.h
@@ -42,6 +42,9 @@ struct v3d_device_info {
/* NSLC * QUPS from the core's IDENT registers. */
int qpu_count;
+
+ /* If the hw has accumulator registers */
+ bool has_accumulators;
};
typedef int (*v3d_ioctl_fun)(int fd, unsigned long request, void *arg);
--
2.39.2

View File

@ -1,52 +0,0 @@
From 7d42eca87b6e144697810405308d99d200dca62a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Wed, 15 Sep 2021 10:56:43 +0200
Subject: [PATCH 015/142] broadcom/qpu: add qpu_writes_rf0_implicitly helper
On v71 rf0 replaces r5 as the register that gets updated implicitly
with uniform loads, and gets the C coefficient with ldvary. This
helper returns whether rf0 gets implicitly updated.
---
src/broadcom/qpu/qpu_instr.c | 12 ++++++++++++
src/broadcom/qpu/qpu_instr.h | 2 ++
2 files changed, 14 insertions(+)
diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
index 7ece8b5e570..8de99c611d5 100644
--- a/src/broadcom/qpu/qpu_instr.c
+++ b/src/broadcom/qpu/qpu_instr.c
@@ -920,6 +920,18 @@ v3d_qpu_writes_accum(const struct v3d_device_info *devinfo,
return false;
}
+bool
+v3d_qpu_writes_rf0_implicitly(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *inst)
+{
+ if (devinfo->ver >= 71 &&
+ (inst->sig.ldvary || inst->sig.ldunif || inst->sig.ldunifa)) {
+ return true;
+ }
+
+ return false;
+}
+
bool
v3d_qpu_uses_mux(const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
{
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
index 9e56e2d6a99..a25be8e0ee6 100644
--- a/src/broadcom/qpu/qpu_instr.h
+++ b/src/broadcom/qpu/qpu_instr.h
@@ -473,6 +473,8 @@ bool v3d_qpu_writes_r4(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST;
bool v3d_qpu_writes_r5(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST;
+bool v3d_qpu_writes_rf0_implicitly(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST;
bool v3d_qpu_writes_accum(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_waits_on_tmu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
--
2.39.2

View File

@ -1,261 +0,0 @@
From ebba9019461083687f6afd23ff0d4646c1a667cb Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Sun, 29 Jan 2023 00:27:11 +0100
Subject: [PATCH 017/142] broadcom/compiler: update node/temp translation for
v71
The offset applied needs to take into account whether we have
accumulators or not.
---
src/broadcom/compiler/vir_register_allocate.c | 68 +++++++++----------
1 file changed, 34 insertions(+), 34 deletions(-)
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
index b22f915d1df..aa9473d124b 100644
--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c
@@ -39,30 +39,31 @@
CLASS_BITS_R5)
static inline uint32_t
-temp_to_node(uint32_t temp)
+temp_to_node(struct v3d_compile *c, uint32_t temp)
{
- return temp + ACC_COUNT;
+ return temp + (c->devinfo->has_accumulators ? ACC_COUNT : 0);
}
static inline uint32_t
-node_to_temp(uint32_t node)
+node_to_temp(struct v3d_compile *c, uint32_t node)
{
- assert(node >= ACC_COUNT);
- return node - ACC_COUNT;
+ assert((c->devinfo->has_accumulators && node >= ACC_COUNT) ||
+ (!c->devinfo->has_accumulators && node >= 0));
+ return node - (c->devinfo->has_accumulators ? ACC_COUNT : 0);
}
static inline uint8_t
-get_temp_class_bits(struct v3d_ra_node_info *nodes,
+get_temp_class_bits(struct v3d_compile *c,
uint32_t temp)
{
- return nodes->info[temp_to_node(temp)].class_bits;
+ return c->nodes.info[temp_to_node(c, temp)].class_bits;
}
static inline void
-set_temp_class_bits(struct v3d_ra_node_info *nodes,
+set_temp_class_bits(struct v3d_compile *c,
uint32_t temp, uint8_t class_bits)
{
- nodes->info[temp_to_node(temp)].class_bits = class_bits;
+ c->nodes.info[temp_to_node(c, temp)].class_bits = class_bits;
}
static struct ra_class *
@@ -84,7 +85,7 @@ static inline struct ra_class *
choose_reg_class_for_temp(struct v3d_compile *c, uint32_t temp)
{
assert(temp < c->num_temps && temp < c->nodes.alloc_count);
- return choose_reg_class(c, get_temp_class_bits(&c->nodes, temp));
+ return choose_reg_class(c, get_temp_class_bits(c, temp));
}
static inline bool
@@ -313,7 +314,7 @@ v3d_choose_spill_node(struct v3d_compile *c)
for (unsigned i = 0; i < c->num_temps; i++) {
if (BITSET_TEST(c->spillable, i)) {
- ra_set_node_spill_cost(c->g, temp_to_node(i),
+ ra_set_node_spill_cost(c->g, temp_to_node(c, i),
spill_costs[i]);
}
}
@@ -482,7 +483,7 @@ v3d_emit_spill_tmua(struct v3d_compile *c,
c->temp_start[i] < ip && c->temp_end[i] >= ip :
c->temp_start[i] <= ip && c->temp_end[i] > ip;
if (thrsw_cross) {
- ra_set_node_class(c->g, temp_to_node(i),
+ ra_set_node_class(c->g, temp_to_node(c, i),
choose_reg_class(c, CLASS_BITS_PHYS));
}
}
@@ -509,8 +510,7 @@ v3d_emit_tmu_spill(struct v3d_compile *c,
* same register class bits as the original.
*/
if (inst == position) {
- uint8_t class_bits = get_temp_class_bits(&c->nodes,
- inst->dst.index);
+ uint8_t class_bits = get_temp_class_bits(c, inst->dst.index);
inst->dst = vir_get_temp(c);
add_node(c, inst->dst.index, class_bits);
} else {
@@ -574,7 +574,7 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
reconstruct_op = orig_def->qpu.alu.add.op;
}
- uint32_t spill_node = temp_to_node(spill_temp);
+ uint32_t spill_node = temp_to_node(c, spill_temp);
/* We must disable the ldunif optimization if we are spilling uniforms */
bool had_disable_ldunif_opt = c->disable_ldunif_opt;
@@ -739,12 +739,12 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
* update node priorities based one new liveness data.
*/
uint32_t sb_temp =c->spill_base.index;
- uint32_t sb_node = temp_to_node(sb_temp);
+ uint32_t sb_node = temp_to_node(c, sb_temp);
for (uint32_t i = 0; i < c->num_temps; i++) {
if (c->temp_end[i] == -1)
continue;
- uint32_t node_i = temp_to_node(i);
+ uint32_t node_i = temp_to_node(c, i);
c->nodes.info[node_i].priority =
c->temp_end[i] - c->temp_start[i];
@@ -752,7 +752,7 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
j < c->num_temps; j++) {
if (interferes(c->temp_start[i], c->temp_end[i],
c->temp_start[j], c->temp_end[j])) {
- uint32_t node_j = temp_to_node(j);
+ uint32_t node_j = temp_to_node(c, j);
ra_add_node_interference(c->g, node_i, node_j);
}
}
@@ -958,7 +958,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
for (int i = 0; i < c->num_temps; i++) {
if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
ra_add_node_interference(c->g,
- temp_to_node(i),
+ temp_to_node(c, i),
acc_nodes[3]);
}
}
@@ -968,7 +968,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
for (int i = 0; i < c->num_temps; i++) {
if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
ra_add_node_interference(c->g,
- temp_to_node(i),
+ temp_to_node(c, i),
acc_nodes[4]);
}
}
@@ -987,7 +987,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
* decides whether the LDVPM is in or out)
*/
assert(inst->dst.file == QFILE_TEMP);
- set_temp_class_bits(&c->nodes, inst->dst.index,
+ set_temp_class_bits(c, inst->dst.index,
CLASS_BITS_PHYS);
break;
}
@@ -1002,7 +1002,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
* phys regfile.
*/
assert(inst->dst.file == QFILE_TEMP);
- set_temp_class_bits(&c->nodes, inst->dst.index,
+ set_temp_class_bits(c, inst->dst.index,
CLASS_BITS_PHYS);
break;
}
@@ -1024,7 +1024,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
*/
assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV);
assert(inst->dst.file == QFILE_TEMP);
- uint32_t node = temp_to_node(inst->dst.index);
+ uint32_t node = temp_to_node(c, inst->dst.index);
ra_set_node_reg(c->g, node,
PHYS_INDEX + inst->src[0].index);
break;
@@ -1043,9 +1043,9 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
*/
if (!inst->qpu.sig.ldunif) {
uint8_t class_bits =
- get_temp_class_bits(&c->nodes, inst->dst.index) &
+ get_temp_class_bits(c, inst->dst.index) &
~CLASS_BITS_R5;
- set_temp_class_bits(&c->nodes, inst->dst.index,
+ set_temp_class_bits(c, inst->dst.index,
class_bits);
} else {
@@ -1054,7 +1054,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
* loads interfere with each other.
*/
if (c->devinfo->ver < 40) {
- set_temp_class_bits(&c->nodes, inst->dst.index,
+ set_temp_class_bits(c, inst->dst.index,
CLASS_BITS_R5);
}
}
@@ -1064,7 +1064,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
if (inst->qpu.sig.thrsw) {
for (int i = 0; i < c->num_temps; i++) {
if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
- set_temp_class_bits(&c->nodes, i,
+ set_temp_class_bits(c, i,
CLASS_BITS_PHYS);
}
}
@@ -1125,7 +1125,7 @@ v3d_register_allocate(struct v3d_compile *c)
c->nodes.info[i].priority = 0;
c->nodes.info[i].class_bits = 0;
} else {
- uint32_t t = node_to_temp(i);
+ uint32_t t = node_to_temp(c, i);
c->nodes.info[i].priority =
c->temp_end[t] - c->temp_start[t];
c->nodes.info[i].class_bits = CLASS_BITS_ANY;
@@ -1143,7 +1143,7 @@ v3d_register_allocate(struct v3d_compile *c)
/* Set the register classes for all our temporaries in the graph */
for (uint32_t i = 0; i < c->num_temps; i++) {
- ra_set_node_class(c->g, temp_to_node(i),
+ ra_set_node_class(c->g, temp_to_node(c, i),
choose_reg_class_for_temp(c, i));
}
@@ -1153,8 +1153,8 @@ v3d_register_allocate(struct v3d_compile *c)
if (interferes(c->temp_start[i], c->temp_end[i],
c->temp_start[j], c->temp_end[j])) {
ra_add_node_interference(c->g,
- temp_to_node(i),
- temp_to_node(j));
+ temp_to_node(c, i),
+ temp_to_node(c, j));
}
}
}
@@ -1171,7 +1171,7 @@ v3d_register_allocate(struct v3d_compile *c)
if (c->spill_size <
V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) {
int node = v3d_choose_spill_node(c);
- uint32_t temp = node_to_temp(node);
+ uint32_t temp = node_to_temp(c, node);
if (node != -1) {
v3d_spill_reg(c, acc_nodes, temp);
continue;
@@ -1186,7 +1186,7 @@ v3d_register_allocate(struct v3d_compile *c)
if (node == -1)
goto spill_fail;
- uint32_t temp = node_to_temp(node);
+ uint32_t temp = node_to_temp(c, node);
enum temp_spill_type spill_type =
get_spill_type_for_temp(c, temp);
if (spill_type != SPILL_TYPE_TMU || tmu_spilling_allowed(c)) {
@@ -1201,7 +1201,7 @@ v3d_register_allocate(struct v3d_compile *c)
/* Allocation was successful, build the 'temp -> reg' map */
temp_registers = calloc(c->num_temps, sizeof(*temp_registers));
for (uint32_t i = 0; i < c->num_temps; i++) {
- int ra_reg = ra_get_node_reg(c->g, temp_to_node(i));
+ int ra_reg = ra_get_node_reg(c->g, temp_to_node(c, i));
if (ra_reg < PHYS_INDEX) {
temp_registers[i].magic = true;
temp_registers[i].index = (V3D_QPU_WADDR_R0 +
--
2.39.2

View File

@ -1,144 +0,0 @@
From 9b2dfe0286212aba3687a06023cc5b4ce9944ee0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Mon, 23 Aug 2021 02:18:43 +0200
Subject: [PATCH 018/142] broadcom/compiler: phys index depends on hw version
V3D 7.1 has no accumulators, so we replace the macro with a
function call.
---
src/broadcom/compiler/vir_register_allocate.c | 39 ++++++++++++++-----
1 file changed, 29 insertions(+), 10 deletions(-)
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
index aa9473d124b..a358b616e13 100644
--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c
@@ -28,9 +28,19 @@
#define ACC_INDEX 0
#define ACC_COUNT 6
-#define PHYS_INDEX (ACC_INDEX + ACC_COUNT)
-#define PHYS_COUNT 64
+#define PHYS_COUNT 64
+
+static uint8_t
+get_phys_index(const struct v3d_device_info *devinfo)
+{
+ if (devinfo->has_accumulators)
+ return ACC_INDEX + ACC_COUNT;
+ else
+ return 0;
+}
+
+/* ACC as accumulator */
#define CLASS_BITS_PHYS (1 << 0)
#define CLASS_BITS_ACC (1 << 1)
#define CLASS_BITS_R5 (1 << 4)
@@ -771,9 +781,11 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
}
struct v3d_ra_select_callback_data {
+ uint32_t phys_index;
uint32_t next_acc;
uint32_t next_phys;
struct v3d_ra_node_info *nodes;
+ const struct v3d_device_info *devinfo;
};
/* Choosing accumulators improves chances of merging QPU instructions
@@ -794,7 +806,7 @@ v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra,
static const int available_rf_threshold = 5;
int available_rf = 0 ;
for (int i = 0; i < PHYS_COUNT; i++) {
- if (BITSET_TEST(regs, PHYS_INDEX + i))
+ if (BITSET_TEST(regs, v3d_ra->phys_index + i))
available_rf++;
if (available_rf >= available_rf_threshold)
break;
@@ -854,7 +866,7 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
{
for (int i = 0; i < PHYS_COUNT; i++) {
int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
- int phys = PHYS_INDEX + phys_off;
+ int phys = v3d_ra->phys_index + phys_off;
if (BITSET_TEST(regs, phys)) {
v3d_ra->next_phys = phys_off + 1;
@@ -896,8 +908,9 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
* register file can be divided up for fragment shader threading.
*/
int max_thread_index = (compiler->devinfo->ver >= 40 ? 2 : 3);
+ uint8_t phys_index = get_phys_index(compiler->devinfo);
- compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT,
+ compiler->regs = ra_alloc_reg_set(compiler, phys_index + PHYS_COUNT,
false);
if (!compiler->regs)
return false;
@@ -912,8 +925,8 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
compiler->reg_class_phys[threads] =
ra_alloc_contig_reg_class(compiler->regs, 1);
- for (int i = PHYS_INDEX;
- i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) {
+ for (int i = phys_index;
+ i < phys_index + (PHYS_COUNT >> threads); i++) {
ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
ra_class_add_reg(compiler->reg_class_phys[threads], i);
ra_class_add_reg(compiler->reg_class_any[threads], i);
@@ -1026,7 +1039,8 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
assert(inst->dst.file == QFILE_TEMP);
uint32_t node = temp_to_node(c, inst->dst.index);
ra_set_node_reg(c->g, node,
- PHYS_INDEX + inst->src[0].index);
+ get_phys_index(c->devinfo) +
+ inst->src[0].index);
break;
}
}
@@ -1086,13 +1100,17 @@ v3d_register_allocate(struct v3d_compile *c)
c->num_temps + ACC_COUNT),
};
+ uint32_t phys_index = get_phys_index(c->devinfo);
+
struct v3d_ra_select_callback_data callback_data = {
+ .phys_index = phys_index,
.next_acc = 0,
/* Start at RF3, to try to keep the TLB writes from using
* RF0-2.
*/
.next_phys = 3,
.nodes = &c->nodes,
+ .devinfo = c->devinfo,
};
vir_calculate_live_intervals(c);
@@ -1139,6 +1157,7 @@ v3d_register_allocate(struct v3d_compile *c)
vir_for_each_inst_inorder(inst, c) {
inst->ip = ip++;
update_graph_and_reg_classes_for_inst(c, acc_nodes, inst);
+
}
/* Set the register classes for all our temporaries in the graph */
@@ -1202,13 +1221,13 @@ v3d_register_allocate(struct v3d_compile *c)
temp_registers = calloc(c->num_temps, sizeof(*temp_registers));
for (uint32_t i = 0; i < c->num_temps; i++) {
int ra_reg = ra_get_node_reg(c->g, temp_to_node(c, i));
- if (ra_reg < PHYS_INDEX) {
+ if (ra_reg < phys_index) {
temp_registers[i].magic = true;
temp_registers[i].index = (V3D_QPU_WADDR_R0 +
ra_reg - ACC_INDEX);
} else {
temp_registers[i].magic = false;
- temp_registers[i].index = ra_reg - PHYS_INDEX;
+ temp_registers[i].index = ra_reg - phys_index;
}
}
--
2.39.2

View File

@ -1,40 +0,0 @@
From da0a3deadf86a46c8323267d3f6a49e442835608 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Fri, 17 Sep 2021 01:07:06 +0200
Subject: [PATCH 019/142] broadcom/compiler: don't favor/select accum registers
for hw not supporting it
Note that what we do is just return false from the favor/select accum
methods. We could avoid calling them instead, but as the select is
called more than once, it is just easier this way.
---
src/broadcom/compiler/vir_register_allocate.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
index a358b616e13..1f495180784 100644
--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c
@@ -797,6 +797,9 @@ v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra,
BITSET_WORD *regs,
int priority)
{
+ if (!v3d_ra->devinfo->has_accumulators)
+ return false;
+
/* Favor accumulators if we have less that this number of physical
* registers. Accumulators have more restrictions (like being
* invalidated through thrsw), so running out of physical registers
@@ -832,6 +835,9 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra,
BITSET_WORD *regs,
unsigned int *out)
{
+ if (!v3d_ra->devinfo->has_accumulators)
+ return false;
+
/* Choose r5 for our ldunifs if possible (nobody else can load to that
* reg, and it keeps the QPU cond field free from being occupied by
* ldunifrf).
--
2.39.2

View File

@ -1,105 +0,0 @@
From 6c04d7c917da6b38f8b2b4306ab03ed2ab7e6ce0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Thu, 9 Sep 2021 00:28:53 +0200
Subject: [PATCH 020/142] broadcom/vir: implement is_no_op_mov for v71
Did some refactoring/splitting.
---
src/broadcom/compiler/vir_to_qpu.c | 66 ++++++++++++++++++++++++------
1 file changed, 53 insertions(+), 13 deletions(-)
diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c
index c8b6e0a91a0..08970d52954 100644
--- a/src/broadcom/compiler/vir_to_qpu.c
+++ b/src/broadcom/compiler/vir_to_qpu.c
@@ -129,19 +129,8 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
}
static bool
-is_no_op_mov(struct qinst *qinst)
+v3d33_mov_src_and_dst_equal(struct qinst *qinst)
{
- static const struct v3d_qpu_sig no_sig = {0};
-
- /* Make sure it's just a lone MOV. */
- if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
- qinst->qpu.alu.mul.op != V3D_QPU_M_MOV ||
- qinst->qpu.alu.add.op != V3D_QPU_A_NOP ||
- memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) {
- return false;
- }
-
- /* Check if it's a MOV from a register to itself. */
enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr;
if (qinst->qpu.alu.mul.magic_write) {
if (waddr < V3D_QPU_WADDR_R0 || waddr > V3D_QPU_WADDR_R4)
@@ -168,6 +157,57 @@ is_no_op_mov(struct qinst *qinst)
return false;
}
+ return true;
+}
+
+static bool
+v3d71_mov_src_and_dst_equal(struct qinst *qinst)
+{
+ if (qinst->qpu.alu.mul.magic_write)
+ return false;
+
+ enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr;
+ int raddr;
+
+ raddr = qinst->qpu.alu.mul.a.raddr;
+ if (raddr != waddr)
+ return false;
+
+ return true;
+}
+
+static bool
+mov_src_and_dst_equal(struct qinst *qinst,
+ const struct v3d_device_info *devinfo)
+{
+ if (devinfo->ver < 71)
+ return v3d33_mov_src_and_dst_equal(qinst);
+ else
+ return v3d71_mov_src_and_dst_equal(qinst);
+}
+
+
+static bool
+is_no_op_mov(struct qinst *qinst,
+ const struct v3d_device_info *devinfo)
+{
+ static const struct v3d_qpu_sig no_sig = {0};
+
+ /* Make sure it's just a lone MOV. We only check for M_MOV. Although
+ * for V3D 7.x there is also A_MOV, we don't need to check for it as
+ * we always emit using M_MOV. We could use A_MOV later in the
+ * scheduler to improve performance.
+ */
+ if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
+ qinst->qpu.alu.mul.op != V3D_QPU_M_MOV ||
+ qinst->qpu.alu.add.op != V3D_QPU_A_NOP ||
+ memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) {
+ return false;
+ }
+
+ if (!mov_src_and_dst_equal(qinst, devinfo))
+ return false;
+
/* No packing or flags updates, or we need to execute the
* instruction.
*/
@@ -324,7 +364,7 @@ v3d_generate_code_block(struct v3d_compile *c,
qinst->qpu.alu.mul.waddr = dst.index;
qinst->qpu.alu.mul.magic_write = dst.magic;
- if (is_no_op_mov(qinst)) {
+ if (is_no_op_mov(qinst, c->devinfo)) {
vir_remove_instruction(c, qinst);
continue;
}
--
2.39.2

View File

@ -1,104 +0,0 @@
From 7b5be2d9b178a45c34c22db2744639a6a8a216d1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Thu, 9 Sep 2021 01:18:54 +0200
Subject: [PATCH 021/142] broadcom/compiler: update vir_to_qpu::set_src for v71
---
src/broadcom/compiler/vir_to_qpu.c | 47 ++++++++++++++++++++++++++----
1 file changed, 42 insertions(+), 5 deletions(-)
diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c
index 08970d52954..afc4941fdb1 100644
--- a/src/broadcom/compiler/vir_to_qpu.c
+++ b/src/broadcom/compiler/vir_to_qpu.c
@@ -86,12 +86,22 @@ new_qpu_nop_before(struct qinst *inst)
return q;
}
+static void
+v3d71_set_src(struct v3d_qpu_instr *instr, uint8_t *raddr, struct qpu_reg src)
+{
+ if (src.smimm)
+ unreachable("v3d71_set_src: pending handling small immediates");
+
+ assert(!src.magic);
+ *raddr = src.index;
+}
+
/**
* Allocates the src register (accumulator or register file) into the RADDR
* fields of the instruction.
*/
static void
-set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
+v3d33_set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
{
if (src.smimm) {
assert(instr->sig.small_imm_b);
@@ -128,6 +138,24 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
}
}
+/*
+ * The main purpose of the following wrapper is to make calling set_src
+ * cleaner. This is the reason it receives both mux and raddr pointers. Those
+ * will be filled or not based on the device version.
+ */
+static void
+set_src(struct v3d_qpu_instr *instr,
+ enum v3d_qpu_mux *mux,
+ uint8_t *raddr,
+ struct qpu_reg src,
+ const struct v3d_device_info *devinfo)
+{
+ if (devinfo->ver < 71)
+ return v3d33_set_src(instr, mux, src);
+ else
+ return v3d71_set_src(instr, raddr, src);
+}
+
static bool
v3d33_mov_src_and_dst_equal(struct qinst *qinst)
{
@@ -340,13 +368,18 @@ v3d_generate_code_block(struct v3d_compile *c,
qinst->qpu.sig_magic = dst.magic;
} else if (qinst->qpu.alu.add.op != V3D_QPU_A_NOP) {
assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);
+
if (nsrc >= 1) {
set_src(&qinst->qpu,
- &qinst->qpu.alu.add.a.mux, src[0]);
+ &qinst->qpu.alu.add.a.mux,
+ &qinst->qpu.alu.add.a.raddr,
+ src[0], c->devinfo);
}
if (nsrc >= 2) {
set_src(&qinst->qpu,
- &qinst->qpu.alu.add.b.mux, src[1]);
+ &qinst->qpu.alu.add.b.mux,
+ &qinst->qpu.alu.add.b.raddr,
+ src[1], c->devinfo);
}
qinst->qpu.alu.add.waddr = dst.index;
@@ -354,11 +387,15 @@ v3d_generate_code_block(struct v3d_compile *c,
} else {
if (nsrc >= 1) {
set_src(&qinst->qpu,
- &qinst->qpu.alu.mul.a.mux, src[0]);
+ &qinst->qpu.alu.mul.a.mux,
+ &qinst->qpu.alu.mul.a.raddr,
+ src[0], c->devinfo);
}
if (nsrc >= 2) {
set_src(&qinst->qpu,
- &qinst->qpu.alu.mul.b.mux, src[1]);
+ &qinst->qpu.alu.mul.b.mux,
+ &qinst->qpu.alu.mul.b.raddr,
+ src[1], c->devinfo);
}
qinst->qpu.alu.mul.waddr = dst.index;
--
2.39.2

View File

@ -1,92 +0,0 @@
From fe89703008f2a3d6bfe6e260791f712013be5e48 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Thu, 9 Sep 2021 23:59:28 +0200
Subject: [PATCH 022/142] broadcom/qpu_schedule: add process_raddr_deps
On v71 we don't have muxes, but more raddrs. Add an equivalent
function to add the read dependencies.
---
src/broadcom/compiler/qpu_schedule.c | 52 +++++++++++++++++++++++-----
1 file changed, 44 insertions(+), 8 deletions(-)
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index 455fa3867be..89254643c90 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -155,6 +155,7 @@ static void
process_mux_deps(struct schedule_state *state, struct schedule_node *n,
enum v3d_qpu_mux mux)
{
+ assert(state->devinfo->ver < 71);
switch (mux) {
case V3D_QPU_MUX_A:
add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
@@ -171,6 +172,17 @@ process_mux_deps(struct schedule_state *state, struct schedule_node *n,
}
}
+
+static void
+process_raddr_deps(struct schedule_state *state, struct schedule_node *n,
+ uint8_t raddr, bool is_small_imm)
+{
+ assert(state->devinfo->ver >= 71);
+
+ if (!is_small_imm)
+ add_read_dep(state, state->last_rf[raddr], n);
+}
+
static bool
tmu_write_is_sequence_terminator(uint32_t waddr)
{
@@ -305,15 +317,39 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
/* XXX: LOAD_IMM */
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0)
- process_mux_deps(state, n, inst->alu.add.a.mux);
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1)
- process_mux_deps(state, n, inst->alu.add.b.mux);
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) {
+ if (devinfo->ver < 71) {
+ process_mux_deps(state, n, inst->alu.add.a.mux);
+ } else {
+ process_raddr_deps(state, n, inst->alu.add.a.raddr,
+ inst->sig.small_imm_a);
+ }
+ }
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) {
+ if (devinfo->ver < 71) {
+ process_mux_deps(state, n, inst->alu.add.b.mux);
+ } else {
+ process_raddr_deps(state, n, inst->alu.add.b.raddr,
+ inst->sig.small_imm_b);
+ }
+ }
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0)
- process_mux_deps(state, n, inst->alu.mul.a.mux);
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1)
- process_mux_deps(state, n, inst->alu.mul.b.mux);
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) {
+ if (devinfo->ver < 71) {
+ process_mux_deps(state, n, inst->alu.mul.a.mux);
+ } else {
+ process_raddr_deps(state, n, inst->alu.mul.a.raddr,
+ inst->sig.small_imm_c);
+ }
+ }
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) {
+ if (devinfo->ver < 71) {
+ process_mux_deps(state, n, inst->alu.mul.b.mux);
+ } else {
+ process_raddr_deps(state, n, inst->alu.mul.b.raddr,
+ inst->sig.small_imm_d);
+ }
+ }
switch (inst->alu.add.op) {
case V3D_QPU_A_VPMSETUP:
--
2.39.2

View File

@ -1,128 +0,0 @@
From 20ce426df1ab2546332141f4bc4531ada754cdea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Fri, 10 Sep 2021 01:20:44 +0200
Subject: [PATCH 023/142] broadcom/qpu: update disasm_raddr for v71
---
src/broadcom/qpu/qpu_disasm.c | 72 ++++++++++++++++++++++++++++++++---
1 file changed, 66 insertions(+), 6 deletions(-)
diff --git a/src/broadcom/qpu/qpu_disasm.c b/src/broadcom/qpu/qpu_disasm.c
index 588a665f770..b613de781dc 100644
--- a/src/broadcom/qpu/qpu_disasm.c
+++ b/src/broadcom/qpu/qpu_disasm.c
@@ -56,8 +56,9 @@ pad_to(struct disasm_state *disasm, int n)
static void
-v3d_qpu_disasm_raddr(struct disasm_state *disasm,
- const struct v3d_qpu_instr *instr, uint8_t mux)
+v3d33_qpu_disasm_raddr(struct disasm_state *disasm,
+ const struct v3d_qpu_instr *instr,
+ enum v3d_qpu_mux mux)
{
if (mux == V3D_QPU_MUX_A) {
append(disasm, "rf%d", instr->raddr_a);
@@ -82,6 +83,65 @@ v3d_qpu_disasm_raddr(struct disasm_state *disasm,
}
}
+enum v3d_qpu_input_class {
+ V3D_QPU_ADD_A,
+ V3D_QPU_ADD_B,
+ V3D_QPU_MUL_A,
+ V3D_QPU_MUL_B
+};
+
+static void
+v3d71_qpu_disasm_raddr(struct disasm_state *disasm,
+ const struct v3d_qpu_instr *instr,
+ uint8_t raddr,
+ enum v3d_qpu_input_class input_class)
+{
+ bool is_small_imm = false;
+ switch(input_class) {
+ case V3D_QPU_ADD_A:
+ is_small_imm = instr->sig.small_imm_a;
+ break;
+ case V3D_QPU_ADD_B:
+ is_small_imm = instr->sig.small_imm_b;
+ break;
+ case V3D_QPU_MUL_A:
+ is_small_imm = instr->sig.small_imm_c;
+ break;
+ case V3D_QPU_MUL_B:
+ is_small_imm = instr->sig.small_imm_d;
+ break;
+ }
+
+ if (is_small_imm) {
+ unreachable("Pending handling small immediates");
+ uint32_t val;
+ ASSERTED bool ok =
+ v3d_qpu_small_imm_unpack(disasm->devinfo,
+ raddr,
+ &val);
+
+ if ((int)val >= -16 && (int)val <= 15)
+ append(disasm, "%d", val);
+ else
+ append(disasm, "0x%08x", val);
+ assert(ok);
+ } else {
+ append(disasm, "rf%d", raddr);
+ }
+}
+
+static void
+v3d_qpu_disasm_raddr(struct disasm_state *disasm,
+ const struct v3d_qpu_instr *instr,
+ const struct v3d_qpu_input *input,
+ enum v3d_qpu_input_class input_class)
+{
+ if (disasm->devinfo->ver < 71)
+ v3d33_qpu_disasm_raddr(disasm, instr, input->mux);
+ else
+ v3d71_qpu_disasm_raddr(disasm, instr, input->raddr, input_class);
+}
+
static void
v3d_qpu_disasm_waddr(struct disasm_state *disasm, uint32_t waddr, bool magic)
{
@@ -121,14 +181,14 @@ v3d_qpu_disasm_add(struct disasm_state *disasm,
if (num_src >= 1) {
if (has_dst)
append(disasm, ", ");
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.a.mux);
+ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.add.a, V3D_QPU_ADD_A);
append(disasm, "%s",
v3d_qpu_unpack_name(instr->alu.add.a.unpack));
}
if (num_src >= 2) {
append(disasm, ", ");
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.b.mux);
+ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.add.b, V3D_QPU_ADD_B);
append(disasm, "%s",
v3d_qpu_unpack_name(instr->alu.add.b.unpack));
}
@@ -164,14 +224,14 @@ v3d_qpu_disasm_mul(struct disasm_state *disasm,
if (num_src >= 1) {
if (has_dst)
append(disasm, ", ");
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.a.mux);
+ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.mul.a, V3D_QPU_MUL_A);
append(disasm, "%s",
v3d_qpu_unpack_name(instr->alu.mul.a.unpack));
}
if (num_src >= 2) {
append(disasm, ", ");
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.b.mux);
+ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.mul.b, V3D_QPU_MUL_B);
append(disasm, "%s",
v3d_qpu_unpack_name(instr->alu.mul.b.unpack));
}
--
2.39.2

View File

@ -1,59 +0,0 @@
From 7263fa24a3c57b1dcd4d870670cda86ae89aa28c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Wed, 15 Sep 2021 10:55:49 +0200
Subject: [PATCH 024/142] broadcom/qpu: return false on
qpu_writes_accumulatorXX helpers for v71
As v71 doesn't have accumulators (devinfo->has_accumulators is set to
false), those helpers always return false.
---
src/broadcom/qpu/qpu_instr.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
index 8de99c611d5..7ec3c867260 100644
--- a/src/broadcom/qpu/qpu_instr.c
+++ b/src/broadcom/qpu/qpu_instr.c
@@ -854,6 +854,9 @@ bool
v3d_qpu_writes_r3(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *inst)
{
+ if(!devinfo->has_accumulators)
+ return false;
+
if (qpu_writes_magic_waddr_explicitly(devinfo, inst, V3D_QPU_WADDR_R3))
return true;
@@ -864,6 +867,9 @@ bool
v3d_qpu_writes_r4(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *inst)
{
+ if (!devinfo->has_accumulators)
+ return false;
+
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
if (inst->alu.add.op != V3D_QPU_A_NOP &&
inst->alu.add.magic_write &&
@@ -894,6 +900,9 @@ bool
v3d_qpu_writes_r5(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *inst)
{
+ if (!devinfo->has_accumulators)
+ return false;
+
if (qpu_writes_magic_waddr_explicitly(devinfo, inst, V3D_QPU_WADDR_R5))
return true;
@@ -904,6 +913,9 @@ bool
v3d_qpu_writes_accum(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *inst)
{
+ if (!devinfo->has_accumulators)
+ return false;
+
if (v3d_qpu_writes_r5(devinfo, inst))
return true;
if (v3d_qpu_writes_r4(devinfo, inst))
--
2.39.2

View File

@ -1,116 +0,0 @@
From 6a9611c5a22218388bba419174d3343e0cdf773b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Tue, 14 Sep 2021 10:42:55 +0200
Subject: [PATCH 025/142] broadcom/compiler: add support for varyings on nir to
vir generation for v71
This needs an update as v71 doesn't have accumulators anymore, and
ldvary now uses rf0 to return the value.
---
src/broadcom/compiler/nir_to_vir.c | 34 +++++++++++++++++-------------
1 file changed, 19 insertions(+), 15 deletions(-)
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index ca072971f01..79a22c3bd08 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -1005,32 +1005,36 @@ emit_fragcoord_input(struct v3d_compile *c, int attr)
static struct qreg
emit_smooth_varying(struct v3d_compile *c,
- struct qreg vary, struct qreg w, struct qreg r5)
+ struct qreg vary, struct qreg w, struct qreg c_reg)
{
- return vir_FADD(c, vir_FMUL(c, vary, w), r5);
+ return vir_FADD(c, vir_FMUL(c, vary, w), c_reg);
}
static struct qreg
emit_noperspective_varying(struct v3d_compile *c,
- struct qreg vary, struct qreg r5)
+ struct qreg vary, struct qreg c_reg)
{
- return vir_FADD(c, vir_MOV(c, vary), r5);
+ return vir_FADD(c, vir_MOV(c, vary), c_reg);
}
static struct qreg
emit_flat_varying(struct v3d_compile *c,
- struct qreg vary, struct qreg r5)
+ struct qreg vary, struct qreg c_reg)
{
vir_MOV_dest(c, c->undef, vary);
- return vir_MOV(c, r5);
+ return vir_MOV(c, c_reg);
}
static struct qreg
emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
int8_t input_idx, uint8_t swizzle, int array_index)
{
- struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3);
- struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
+ struct qreg c_reg; /* C coefficient */
+
+ if (c->devinfo->has_accumulators)
+ c_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
+ else
+ c_reg = vir_reg(QFILE_REG, 0);
struct qinst *ldvary = NULL;
struct qreg vary;
@@ -1041,7 +1045,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
vary = vir_emit_def(c, ldvary);
} else {
vir_NOP(c)->qpu.sig.ldvary = true;
- vary = r3;
+ vary = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3);
}
/* Store the input value before interpolation so we can implement
@@ -1050,7 +1054,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
if (input_idx >= 0) {
assert(var);
c->interp[input_idx].vp = vary;
- c->interp[input_idx].C = vir_MOV(c, r5);
+ c->interp[input_idx].C = vir_MOV(c, c_reg);
c->interp[input_idx].mode = var->data.interpolation;
}
@@ -1060,7 +1064,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
*/
if (!var) {
assert(input_idx < 0);
- return emit_smooth_varying(c, vary, c->payload_w, r5);
+ return emit_smooth_varying(c, vary, c->payload_w, c_reg);
}
int i = c->num_inputs++;
@@ -1075,20 +1079,20 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
if (var->data.centroid) {
BITSET_SET(c->centroid_flags, i);
result = emit_smooth_varying(c, vary,
- c->payload_w_centroid, r5);
+ c->payload_w_centroid, c_reg);
} else {
- result = emit_smooth_varying(c, vary, c->payload_w, r5);
+ result = emit_smooth_varying(c, vary, c->payload_w, c_reg);
}
break;
case INTERP_MODE_NOPERSPECTIVE:
BITSET_SET(c->noperspective_flags, i);
- result = emit_noperspective_varying(c, vary, r5);
+ result = emit_noperspective_varying(c, vary, c_reg);
break;
case INTERP_MODE_FLAT:
BITSET_SET(c->flat_shade_flags, i);
- result = emit_flat_varying(c, vary, r5);
+ result = emit_flat_varying(c, vary, c_reg);
break;
default:
--
2.39.2

View File

@ -1,55 +0,0 @@
From 06af15a60f7a9c135893e5f8934b8030c1da95f9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Wed, 15 Sep 2021 01:14:15 +0200
Subject: [PATCH 026/142] broadcom/compiler: payload_w is loaded on rf3 for v71
And in general rf0 is now used for other needs.
---
src/broadcom/compiler/nir_to_vir.c | 6 +++++-
src/broadcom/compiler/vir_register_allocate.c | 6 +++++-
2 files changed, 10 insertions(+), 2 deletions(-)
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 79a22c3bd08..1a05b279a2d 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -4325,7 +4325,11 @@ nir_to_vir(struct v3d_compile *c)
{
switch (c->s->info.stage) {
case MESA_SHADER_FRAGMENT:
- c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
+ if (c->devinfo->ver < 71)
+ c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
+ else
+ c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 3));
+
c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
index 1f495180784..eca9a6751a6 100644
--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c
@@ -1034,6 +1034,11 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
if (inst->src[0].file == QFILE_REG) {
switch (inst->src[0].index) {
case 0:
+ /* V3D 7.x doesn't use rf0 for thread payload */
+ if (c->devinfo->ver >= 71)
+ break;
+ else
+ FALLTHROUGH;
case 1:
case 2:
case 3: {
@@ -1163,7 +1168,6 @@ v3d_register_allocate(struct v3d_compile *c)
vir_for_each_inst_inorder(inst, c) {
inst->ip = ip++;
update_graph_and_reg_classes_for_inst(c, acc_nodes, inst);
-
}
/* Set the register classes for all our temporaries in the graph */
--
2.39.2

View File

@ -1,30 +0,0 @@
From d38d8056903b9a4f96ab56261ac3b3c3be0af4fb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Wed, 15 Sep 2021 11:12:59 +0200
Subject: [PATCH 027/142] broadcom/qpu_schedule: update write deps for v71
We just need to add a write dep if rf0 is written implicitly.
Note that we don't need to check if we have accumulators when checking
for r3/r4/r5, as v3d_qpu_writes_rX returns false for hw versions
that don't have accumulators.
---
src/broadcom/compiler/qpu_schedule.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index 89254643c90..2fa9031d7b6 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -422,6 +422,8 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
add_write_dep(state, &state->last_r[4], n);
if (v3d_qpu_writes_r5(devinfo, inst))
add_write_dep(state, &state->last_r[5], n);
+ if (v3d_qpu_writes_rf0_implicitly(devinfo, inst))
+ add_write_dep(state, &state->last_rf[0], n);
/* If we add any more dependencies here we should consider whether we
* also need to update qpu_inst_after_thrsw_valid_in_delay_slot.
--
2.39.2

View File

@ -1,140 +0,0 @@
From 7e2a2be830b1672ab846389a46b5d09bad0f7a98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Thu, 16 Sep 2021 00:49:25 +0200
Subject: [PATCH 028/142] broadcom/compiler: update register classes to not
include accumulators on v71
---
src/broadcom/compiler/vir_register_allocate.c | 56 ++++++++++++-------
1 file changed, 36 insertions(+), 20 deletions(-)
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
index eca9a6751a6..7b3f6c41934 100644
--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c
@@ -44,10 +44,15 @@ get_phys_index(const struct v3d_device_info *devinfo)
#define CLASS_BITS_PHYS (1 << 0)
#define CLASS_BITS_ACC (1 << 1)
#define CLASS_BITS_R5 (1 << 4)
-#define CLASS_BITS_ANY (CLASS_BITS_PHYS | \
- CLASS_BITS_ACC | \
- CLASS_BITS_R5)
+static uint8_t
+get_class_bit_any(const struct v3d_device_info *devinfo)
+{
+ if (devinfo->has_accumulators)
+ return (CLASS_BITS_PHYS | CLASS_BITS_ACC | CLASS_BITS_R5);
+ else
+ return CLASS_BITS_PHYS;
+}
static inline uint32_t
temp_to_node(struct v3d_compile *c, uint32_t temp)
{
@@ -82,11 +87,13 @@ choose_reg_class(struct v3d_compile *c, uint8_t class_bits)
if (class_bits == CLASS_BITS_PHYS) {
return c->compiler->reg_class_phys[c->thread_index];
} else if (class_bits == (CLASS_BITS_R5)) {
+ assert(c->devinfo->has_accumulators);
return c->compiler->reg_class_r5[c->thread_index];
} else if (class_bits == (CLASS_BITS_PHYS | CLASS_BITS_ACC)) {
+ assert(c->devinfo->has_accumulators);
return c->compiler->reg_class_phys_or_acc[c->thread_index];
} else {
- assert(class_bits == CLASS_BITS_ANY);
+ assert(class_bits == get_class_bit_any(c->devinfo));
return c->compiler->reg_class_any[c->thread_index];
}
}
@@ -447,7 +454,7 @@ v3d_emit_spill_tmua(struct v3d_compile *c,
*/
assert(c->disable_ldunif_opt);
struct qreg offset = vir_uniform_ui(c, spill_offset);
- add_node(c, offset.index, CLASS_BITS_ANY);
+ add_node(c, offset.index, get_class_bit_any(c->devinfo));
/* We always enable per-quad on spills/fills to ensure we spill
* any channels involved with helper invocations.
@@ -645,7 +652,8 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
* instruction immediately after, so
* we can use any register class for it.
*/
- add_node(c, unif.index, CLASS_BITS_ANY);
+ add_node(c, unif.index,
+ get_class_bit_any(c->devinfo));
} else if (spill_type == SPILL_TYPE_RECONSTRUCT) {
struct qreg temp =
reconstruct_temp(c, reconstruct_op);
@@ -924,31 +932,38 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
for (int threads = 0; threads < max_thread_index; threads++) {
compiler->reg_class_any[threads] =
ra_alloc_contig_reg_class(compiler->regs, 1);
- compiler->reg_class_r5[threads] =
- ra_alloc_contig_reg_class(compiler->regs, 1);
- compiler->reg_class_phys_or_acc[threads] =
- ra_alloc_contig_reg_class(compiler->regs, 1);
+ if (compiler->devinfo->has_accumulators) {
+ compiler->reg_class_r5[threads] =
+ ra_alloc_contig_reg_class(compiler->regs, 1);
+ compiler->reg_class_phys_or_acc[threads] =
+ ra_alloc_contig_reg_class(compiler->regs, 1);
+ }
compiler->reg_class_phys[threads] =
ra_alloc_contig_reg_class(compiler->regs, 1);
for (int i = phys_index;
i < phys_index + (PHYS_COUNT >> threads); i++) {
- ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
+ if (compiler->devinfo->has_accumulators)
+ ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
ra_class_add_reg(compiler->reg_class_phys[threads], i);
ra_class_add_reg(compiler->reg_class_any[threads], i);
}
- for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
- ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
- ra_class_add_reg(compiler->reg_class_any[threads], i);
+ if (compiler->devinfo->has_accumulators) {
+ for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
+ ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
+ ra_class_add_reg(compiler->reg_class_any[threads], i);
+ }
}
/* r5 can only store a single 32-bit value, so not much can
* use it.
*/
- ra_class_add_reg(compiler->reg_class_r5[threads],
- ACC_INDEX + 5);
- ra_class_add_reg(compiler->reg_class_any[threads],
- ACC_INDEX + 5);
+ if (compiler->devinfo->has_accumulators) {
+ ra_class_add_reg(compiler->reg_class_r5[threads],
+ ACC_INDEX + 5);
+ ra_class_add_reg(compiler->reg_class_any[threads],
+ ACC_INDEX + 5);
+ }
}
ra_set_finalize(compiler->regs, NULL);
@@ -1086,7 +1101,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
}
/* All accumulators are invalidated across a thread switch. */
- if (inst->qpu.sig.thrsw) {
+ if (inst->qpu.sig.thrsw && c->devinfo->has_accumulators) {
for (int i = 0; i < c->num_temps; i++) {
if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
set_temp_class_bits(c, i,
@@ -1157,7 +1172,8 @@ v3d_register_allocate(struct v3d_compile *c)
uint32_t t = node_to_temp(c, i);
c->nodes.info[i].priority =
c->temp_end[t] - c->temp_start[t];
- c->nodes.info[i].class_bits = CLASS_BITS_ANY;
+ c->nodes.info[i].class_bits =
+ get_class_bit_any(c->devinfo);
}
}
--
2.39.2

View File

@ -1,109 +0,0 @@
From 0157228c729b8812dc4900fa24db63b7d27aa342 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Thu, 23 Sep 2021 11:19:58 +0200
Subject: [PATCH 029/142] broadcom/compiler: implement "reads/writes too soon"
checks for v71
---
src/broadcom/compiler/qpu_schedule.c | 65 ++++++++++++++++++++++------
1 file changed, 51 insertions(+), 14 deletions(-)
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index 2fa9031d7b6..4db0c2e72da 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -562,7 +562,24 @@ mux_reads_too_soon(struct choose_scoreboard *scoreboard,
}
static bool
-reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
+reads_too_soon(struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst, uint8_t raddr)
+{
+ switch (raddr) {
+ case 0: /* ldvary delayed write of C coefficient to rf0 */
+ if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
+ return true;
+ break;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+static bool
+reads_too_soon_after_write(const struct v3d_device_info *devinfo,
+ struct choose_scoreboard *scoreboard,
struct qinst *qinst)
{
const struct v3d_qpu_instr *inst = &qinst->qpu;
@@ -574,24 +591,44 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
if (inst->alu.add.op != V3D_QPU_A_NOP) {
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.add.a.mux)) {
- return true;
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) {
+ if (devinfo->ver < 71) {
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.a.mux))
+ return true;
+ } else {
+ if (reads_too_soon(scoreboard, inst, inst->alu.add.a.raddr))
+ return true;
+ }
}
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.add.b.mux)) {
- return true;
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) {
+ if (devinfo->ver < 71) {
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.b.mux))
+ return true;
+ } else {
+ if (reads_too_soon(scoreboard, inst, inst->alu.add.b.raddr))
+ return true;
+ }
}
}
if (inst->alu.mul.op != V3D_QPU_M_NOP) {
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a.mux)) {
- return true;
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) {
+ if (devinfo->ver < 71) {
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a.mux))
+ return true;
+ } else {
+ if (reads_too_soon(scoreboard, inst, inst->alu.mul.a.raddr))
+ return true;
+ }
}
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b.mux)) {
- return true;
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) {
+ if (devinfo->ver < 71) {
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b.mux))
+ return true;
+ } else {
+ if (reads_too_soon(scoreboard, inst, inst->alu.mul.b.raddr))
+ return true;
+ }
}
}
@@ -1147,7 +1184,7 @@ retry:
* regfile A or B that was written to by the previous
* instruction."
*/
- if (reads_too_soon_after_write(scoreboard, n->inst))
+ if (reads_too_soon_after_write(c->devinfo, scoreboard, n->inst))
continue;
if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst))
--
2.39.2

View File

@ -1,118 +0,0 @@
From 3fb3333bdf9699157cf0a2bd46ba4c25058bc5c1 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Thu, 23 Sep 2021 11:44:59 +0200
Subject: [PATCH 030/142] broadcom/compiler: implement read stall check for v71
---
src/broadcom/compiler/qpu_schedule.c | 32 +++++++++++++++++-----------
src/broadcom/qpu/qpu_instr.c | 12 +++++++++++
src/broadcom/qpu/qpu_instr.h | 2 ++
3 files changed, 34 insertions(+), 12 deletions(-)
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index 4db0c2e72da..b78abe003e9 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -679,29 +679,37 @@ pixel_scoreboard_too_soon(struct v3d_compile *c,
}
static bool
-qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst,
+qpu_instruction_uses_rf(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *inst,
uint32_t waddr) {
if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
return false;
- if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
- inst->raddr_a == waddr)
- return true;
+ if (devinfo->ver < 71) {
+ if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
+ inst->raddr_a == waddr)
+ return true;
- if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
- !inst->sig.small_imm_b && (inst->raddr_b == waddr))
- return true;
+ if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
+ !inst->sig.small_imm_b && (inst->raddr_b == waddr))
+ return true;
+ } else {
+ /* FIXME: skip if small immediate */
+ if (v3d71_qpu_reads_raddr(inst, waddr))
+ return true;
+ }
return false;
}
static bool
-mux_read_stalls(struct choose_scoreboard *scoreboard,
- const struct v3d_qpu_instr *inst)
+read_stalls(const struct v3d_device_info *devinfo,
+ struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst)
{
return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 &&
- qpu_instruction_uses_rf(inst,
+ qpu_instruction_uses_rf(devinfo, inst,
scoreboard->last_stallable_sfu_reg);
}
@@ -1319,7 +1327,7 @@ retry:
int prio = get_instruction_priority(c->devinfo, inst);
- if (mux_read_stalls(scoreboard, inst)) {
+ if (read_stalls(c->devinfo, scoreboard, inst)) {
/* Don't merge an instruction that stalls */
if (prev_inst)
continue;
@@ -2389,7 +2397,7 @@ schedule_instructions(struct v3d_compile *c,
}
}
}
- if (mux_read_stalls(scoreboard, inst))
+ if (read_stalls(c->devinfo, scoreboard, inst))
c->qpu_inst_stalled_count++;
}
diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
index 7ec3c867260..e8bbb2141b0 100644
--- a/src/broadcom/qpu/qpu_instr.c
+++ b/src/broadcom/qpu/qpu_instr.c
@@ -956,6 +956,18 @@ v3d_qpu_uses_mux(const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
(mul_nsrc > 1 && inst->alu.mul.b.mux == mux));
}
+bool
+v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr)
+{
+ int add_nsrc = v3d_qpu_add_op_num_src(inst->alu.add.op);
+ int mul_nsrc = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
+
+ return (add_nsrc > 0 && inst->alu.add.a.raddr == raddr) ||
+ (add_nsrc > 1 && inst->alu.add.b.raddr == raddr) ||
+ (mul_nsrc > 0 && inst->alu.mul.a.raddr == raddr) ||
+ (mul_nsrc > 1 && inst->alu.mul.b.raddr == raddr);
+}
+
bool
v3d_qpu_sig_writes_address(const struct v3d_device_info *devinfo,
const struct v3d_qpu_sig *sig)
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
index a25be8e0ee6..9f7582ab06d 100644
--- a/src/broadcom/qpu/qpu_instr.h
+++ b/src/broadcom/qpu/qpu_instr.h
@@ -494,4 +494,6 @@ bool v3d_qpu_unpacks_f32(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_unpacks_f16(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_is_nop(struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
+
+bool v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr);
#endif
--
2.39.2

View File

@ -1,65 +0,0 @@
From cbe0a7a06a5fb9b3f28acba8c9cac362a6bc5324 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Wed, 6 Oct 2021 13:58:00 +0200
Subject: [PATCH 031/142] broadcom/compiler: add a
v3d71_qpu_writes_waddr_explicitly helper
---
src/broadcom/qpu/qpu_instr.c | 28 ++++++++++++++++++++++++++++
src/broadcom/qpu/qpu_instr.h | 3 +++
2 files changed, 31 insertions(+)
diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
index e8bbb2141b0..feb6b343c1c 100644
--- a/src/broadcom/qpu/qpu_instr.c
+++ b/src/broadcom/qpu/qpu_instr.c
@@ -968,6 +968,34 @@ v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr)
(mul_nsrc > 1 && inst->alu.mul.b.raddr == raddr);
}
+bool
+v3d71_qpu_writes_waddr_explicitly(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *inst,
+ uint8_t waddr)
+{
+ if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
+ return false;
+
+ if (v3d_qpu_add_op_has_dst(inst->alu.add.op) &&
+ !inst->alu.add.magic_write &&
+ inst->alu.add.waddr == waddr) {
+ return true;
+ }
+
+ if (v3d_qpu_mul_op_has_dst(inst->alu.mul.op) &&
+ !inst->alu.mul.magic_write &&
+ inst->alu.mul.waddr == waddr) {
+ return true;
+ }
+
+ if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
+ !inst->sig_magic && inst->sig_addr == waddr) {
+ return true;
+ }
+
+ return false;
+}
+
bool
v3d_qpu_sig_writes_address(const struct v3d_device_info *devinfo,
const struct v3d_qpu_sig *sig)
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
index 9f7582ab06d..50a69ce8c3a 100644
--- a/src/broadcom/qpu/qpu_instr.h
+++ b/src/broadcom/qpu/qpu_instr.h
@@ -496,4 +496,7 @@ bool v3d_qpu_unpacks_f16(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_is_nop(struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr);
+bool v3d71_qpu_writes_waddr_explicitly(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *inst,
+ uint8_t waddr);
#endif
--
2.39.2

View File

@ -1,67 +0,0 @@
From 92e91a9b22ae61dc9f39880e8fdaa7714789efdb Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Mon, 27 Sep 2021 11:49:24 +0200
Subject: [PATCH 032/142] broadcom/compiler: prevent rf2-3 usage in thread end
delay slots for v71
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: Iago Toral Quiroga <itoral@igalia.com>
Signed-off-by: Alejandro Piñeiro <apinheiro@igalia.com>
---
src/broadcom/compiler/qpu_schedule.c | 37 +++++++++++++++++++++-------
1 file changed, 28 insertions(+), 9 deletions(-)
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index b78abe003e9..839c0c62315 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -1691,16 +1691,35 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF)
return false;
- /* RF0-2 might be overwritten during the delay slots by
- * fragment shader setup.
- */
- if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A))
- return false;
+ if (c->devinfo->ver <= 42) {
+ /* RF0-2 might be overwritten during the delay slots by
+ * fragment shader setup.
+ */
+ if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A))
+ return false;
- if (inst->raddr_b < 3 &&
- !inst->sig.small_imm_b &&
- v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) {
- return false;
+ if (inst->raddr_b < 3 &&
+ !inst->sig.small_imm_b &&
+ v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) {
+ return false;
+ }
+ }
+
+ if (c->devinfo->ver >= 71) {
+ /* RF2-3 might be overwritten during the delay slots by
+ * fragment shader setup.
+ *
+ * FIXME: handle small immediate cases
+ */
+ if (v3d71_qpu_reads_raddr(inst, 2) ||
+ v3d71_qpu_reads_raddr(inst, 3)) {
+ return false;
+ }
+
+ if (v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 2) ||
+ v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 3)) {
+ return false;
+ }
}
}
--
2.39.2

View File

@ -1,78 +0,0 @@
From 68a1545eb973e41608534ff05a9e84a86c046453 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Mon, 27 Sep 2021 13:26:04 +0200
Subject: [PATCH 033/142] broadcom/qpu: add new ADD opcodes for FMOV/MOV in v71
---
src/broadcom/qpu/qpu_instr.c | 5 +++++
src/broadcom/qpu/qpu_instr.h | 4 ++++
src/broadcom/qpu/qpu_pack.c | 15 +++++++++++++++
3 files changed, 24 insertions(+)
diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
index feb6b343c1c..195a0dcd232 100644
--- a/src/broadcom/qpu/qpu_instr.c
+++ b/src/broadcom/qpu/qpu_instr.c
@@ -177,6 +177,8 @@ v3d_qpu_add_op_name(enum v3d_qpu_add_op op)
[V3D_QPU_A_ITOF] = "itof",
[V3D_QPU_A_CLZ] = "clz",
[V3D_QPU_A_UTOF] = "utof",
+ [V3D_QPU_A_MOV] = "mov",
+ [V3D_QPU_A_FMOV] = "fmov",
};
if (op >= ARRAY_SIZE(op_names))
@@ -458,6 +460,9 @@ static const uint8_t add_op_args[] = {
[V3D_QPU_A_ITOF] = D | A,
[V3D_QPU_A_CLZ] = D | A,
[V3D_QPU_A_UTOF] = D | A,
+
+ [V3D_QPU_A_MOV] = D | A,
+ [V3D_QPU_A_FMOV] = D | A,
};
static const uint8_t mul_op_args[] = {
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
index 50a69ce8c3a..c86a4119c54 100644
--- a/src/broadcom/qpu/qpu_instr.h
+++ b/src/broadcom/qpu/qpu_instr.h
@@ -227,6 +227,10 @@ enum v3d_qpu_add_op {
V3D_QPU_A_ITOF,
V3D_QPU_A_CLZ,
V3D_QPU_A_UTOF,
+
+ /* V3D 7.x */
+ V3D_QPU_A_FMOV,
+ V3D_QPU_A_MOV,
};
enum v3d_qpu_mul_op {
diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
index 4045275cb9a..0e504e65fbf 100644
--- a/src/broadcom/qpu/qpu_pack.c
+++ b/src/broadcom/qpu/qpu_pack.c
@@ -776,6 +776,21 @@ static const struct opcode_desc add_ops_v71[] = {
{ 246, 246, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_ITOF, 71 },
{ 246, 246, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_UTOF, 71 },
+
+ { 249, 249, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FMOV, 71 },
+ { 249, 249, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FMOV, 71 },
+ { 249, 249, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FMOV, 71 },
+ { 249, 249, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FMOV, 71 },
+ { 249, 249, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FMOV, 71 },
+ { 249, 249, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FMOV, 71 },
+ { 249, 249, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FMOV, 71 },
+
+ { 249, 249, .raddr_mask = OP_MASK(3), V3D_QPU_A_MOV, 71 },
+ { 249, 249, .raddr_mask = OP_MASK(7), V3D_QPU_A_MOV, 71 },
+ { 249, 249, .raddr_mask = OP_MASK(11), V3D_QPU_A_MOV, 71 },
+ { 249, 249, .raddr_mask = OP_MASK(15), V3D_QPU_A_MOV, 71 },
+ { 249, 249, .raddr_mask = OP_MASK(19), V3D_QPU_A_MOV, 71 },
+
};
static const struct opcode_desc mul_ops_v71[] = {
--
2.39.2

View File

@ -1,46 +0,0 @@
From 8dbbb7e22b694fdc62376d112b3dc6105d556c63 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Mon, 4 Oct 2021 13:07:35 +0200
Subject: [PATCH 034/142] broadcom/qpu: fix packing/unpacking of fmov variants
for v71
---
src/broadcom/qpu/qpu_pack.c | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
index 0e504e65fbf..0eb820b3f10 100644
--- a/src/broadcom/qpu/qpu_pack.c
+++ b/src/broadcom/qpu/qpu_pack.c
@@ -1405,9 +1405,9 @@ v3d71_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst
break;
case V3D_QPU_M_FMOV:
- instr->alu.mul.output_pack = (raddr_d >> 2) & 1;
+ instr->alu.mul.output_pack = raddr_d & 0x3;
- if (!v3d_qpu_float32_unpack_unpack(raddr_d & 0x3,
+ if (!v3d_qpu_float32_unpack_unpack((raddr_d >> 2) & 0x7,
&instr->alu.mul.a.unpack)) {
return false;
}
@@ -2046,14 +2046,13 @@ v3d71_qpu_mul_pack(const struct v3d_device_info *devinfo,
&packed)) {
return false;
}
- opcode |= (packed >> 1) & 1;
- raddr_d = (packed & 1) << 2;
+ raddr_d |= packed;
if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack,
&packed)) {
return false;
}
- raddr_d |= packed;
+ raddr_d |= packed << 2;
break;
}
--
2.39.2

View File

@ -1,107 +0,0 @@
From 63d0059ebef288afb0e2e746dadda8c2238bdfcb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Tue, 28 Sep 2021 01:17:08 +0200
Subject: [PATCH 035/142] broadcom/qpu: implement switch rules for fmin/fmax
fadd/faddnf for v71
They use the same opcodes, and switch between one and the other based
on raddr.
Note that the rule also takes into account whether small_imm_a/b are used. That
is not in place yet, so that part is hardcoded for now. It will be updated later
when small immediates support for v71 gets implemented.
---
src/broadcom/qpu/qpu_pack.c | 48 +++++++++++++++++++++++++++++++++++++
1 file changed, 48 insertions(+)
diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
index 0eb820b3f10..7a262f18ac3 100644
--- a/src/broadcom/qpu/qpu_pack.c
+++ b/src/broadcom/qpu/qpu_pack.c
@@ -651,7 +651,9 @@ static const struct opcode_desc mul_ops_v33[] = {
* opcodes that changed on v71
*/
static const struct opcode_desc add_ops_v71[] = {
+ /* FADD is FADDNF depending on the order of the raddr_a/raddr_b. */
{ 0, 47, .raddr_mask = ANYOPMASK, V3D_QPU_A_FADD },
+ { 0, 47, .raddr_mask = ANYOPMASK, V3D_QPU_A_FADDNF },
{ 53, 55, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK },
{ 56, 56, .raddr_mask = ANYOPMASK, V3D_QPU_A_ADD },
{ 57, 59, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK },
@@ -666,6 +668,10 @@ static const struct opcode_desc add_ops_v71[] = {
{ 125, 125, .raddr_mask = ANYOPMASK, V3D_QPU_A_SHR },
{ 126, 126, .raddr_mask = ANYOPMASK, V3D_QPU_A_ASR },
{ 127, 127, .raddr_mask = ANYOPMASK, V3D_QPU_A_ROR },
+ /* FMIN is instead FMAX depending on the raddr_a/b order. */
+ { 128, 175, .raddr_mask = ANYOPMASK, V3D_QPU_A_FMIN },
+ { 128, 175, .raddr_mask = ANYOPMASK, V3D_QPU_A_FMAX },
+ { 176, 180, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFMIN },
{ 181, 181, .raddr_mask = ANYOPMASK, V3D_QPU_A_AND },
{ 182, 182, .raddr_mask = ANYOPMASK, V3D_QPU_A_OR },
@@ -1162,6 +1168,22 @@ v3d71_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst
instr->alu.add.op = desc->op;
+ /* FADD/FADDNF and FMIN/FMAX are determined by the orders of the
+ * operands.
+ */
+ /* FIXME: for now hardcoded values, until we got the small_imm support
+ * in place
+ */
+ uint32_t small_imm_a = 0;
+ uint32_t small_imm_b = 0;
+ if (small_imm_a * 256 + ((op >> 2) & 3) * 64 + raddr_a >
+ small_imm_b *256 + (op & 3) * 64 + raddr_b) {
+ if (instr->alu.add.op == V3D_QPU_A_FMIN)
+ instr->alu.add.op = V3D_QPU_A_FMAX;
+ if (instr->alu.add.op == V3D_QPU_A_FADD)
+ instr->alu.add.op = V3D_QPU_A_FADDNF;
+ }
+
/* Some QPU ops require a bit more than just basic opcode and mux a/b
* comparisons to distinguish them.
*/
@@ -1754,6 +1776,11 @@ v3d71_qpu_add_pack(const struct v3d_device_info *devinfo,
uint32_t output_pack;
uint32_t a_unpack;
uint32_t b_unpack;
+ /* FIXME: for now hardcoded values, until we got the small_imm
+ * support in place
+ */
+ uint32_t small_imm_a = 0;
+ uint32_t small_imm_b = 0;
if (instr->alu.add.op != V3D_QPU_A_FCMP) {
if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack,
@@ -1773,6 +1800,27 @@ v3d71_qpu_add_pack(const struct v3d_device_info *devinfo,
return false;
}
+ /* These operations with commutative operands are
+ * distinguished by which order their operands come in.
+ */
+ bool ordering =
+ small_imm_a * 256 + a_unpack * 64 + raddr_a >
+ small_imm_b * 256 + b_unpack * 64 + raddr_b;
+ if (((instr->alu.add.op == V3D_QPU_A_FMIN ||
+ instr->alu.add.op == V3D_QPU_A_FADD) && ordering) ||
+ ((instr->alu.add.op == V3D_QPU_A_FMAX ||
+ instr->alu.add.op == V3D_QPU_A_FADDNF) && !ordering)) {
+ uint32_t temp;
+
+ temp = a_unpack;
+ a_unpack = b_unpack;
+ b_unpack = temp;
+
+ temp = raddr_a;
+ raddr_a = raddr_b;
+ raddr_b = temp;
+ }
+
opcode |= a_unpack << 2;
opcode |= b_unpack << 0;
--
2.39.2

View File

@ -1,37 +0,0 @@
From c9f6faa3ddc91024b3d9dc67ce2221187daac128 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Wed, 29 Sep 2021 11:54:18 +0200
Subject: [PATCH 036/142] broadcom/compiler: make vir_write_rX return false on
platforms without accums
---
src/broadcom/compiler/vir.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
index 007cb0a941b..d75cd777b6d 100644
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -158,6 +158,9 @@ vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst)
bool
vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst)
{
+ if (!devinfo->has_accumulators)
+ return false;
+
for (int i = 0; i < vir_get_nsrc(inst); i++) {
switch (inst->src[i].file) {
case QFILE_VPM:
@@ -180,6 +183,9 @@ vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst)
bool
vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst)
{
+ if (!devinfo->has_accumulators)
+ return false;
+
switch (inst->dst.file) {
case QFILE_MAGIC:
switch (inst->dst.index) {
--
2.39.2

View File

@ -1,77 +0,0 @@
From 3d16229743e26b58735ed049ee982073f6034342 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Wed, 29 Sep 2021 12:03:50 +0200
Subject: [PATCH 037/142] broadcom/compiler: rename vir_writes_rX to
vir_writes_rX_implicitly
Since that represents more accurately what they check.
---
src/broadcom/compiler/v3d_compiler.h | 4 ++--
src/broadcom/compiler/vir.c | 6 ++++--
src/broadcom/compiler/vir_register_allocate.c | 4 ++--
3 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index eb4e692464b..7e8f3bfc1a7 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -1149,8 +1149,8 @@ bool vir_is_raw_mov(struct qinst *inst);
bool vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst);
bool vir_is_add(struct qinst *inst);
bool vir_is_mul(struct qinst *inst);
-bool vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst);
-bool vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst);
+bool vir_writes_r3_implicitly(const struct v3d_device_info *devinfo, struct qinst *inst);
+bool vir_writes_r4_implicitly(const struct v3d_device_info *devinfo, struct qinst *inst);
struct qreg vir_follow_movs(struct v3d_compile *c, struct qreg reg);
uint8_t vir_channels_written(struct qinst *inst);
struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i);
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
index d75cd777b6d..aea113f050e 100644
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -156,7 +156,8 @@ vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst)
}
bool
-vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst)
+vir_writes_r3_implicitly(const struct v3d_device_info *devinfo,
+ struct qinst *inst)
{
if (!devinfo->has_accumulators)
return false;
@@ -181,7 +182,8 @@ vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst)
}
bool
-vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst)
+vir_writes_r4_implicitly(const struct v3d_device_info *devinfo,
+ struct qinst *inst)
{
if (!devinfo->has_accumulators)
return false;
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
index 7b3f6c41934..f2df35cd458 100644
--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c
@@ -988,7 +988,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
* result to a temp), nothing else can be stored in r3/r4 across
* it.
*/
- if (vir_writes_r3(c->devinfo, inst)) {
+ if (vir_writes_r3_implicitly(c->devinfo, inst)) {
for (int i = 0; i < c->num_temps; i++) {
if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
ra_add_node_interference(c->g,
@@ -998,7 +998,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
}
}
- if (vir_writes_r4(c->devinfo, inst)) {
+ if (vir_writes_r4_implicitly(c->devinfo, inst)) {
for (int i = 0; i < c->num_temps; i++) {
if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
ra_add_node_interference(c->g,
--
2.39.2

View File

@ -1,170 +0,0 @@
From 83fae160491737e8568b8fb5eaa5be4d2c8bf3c8 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Wed, 29 Sep 2021 12:10:31 +0200
Subject: [PATCH 038/142] broadcom/compiler: only handle accumulator classes if
present
---
src/broadcom/compiler/vir_register_allocate.c | 77 ++++++++++++-------
1 file changed, 49 insertions(+), 28 deletions(-)
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
index f2df35cd458..e78ccb7c6aa 100644
--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c
@@ -53,6 +53,17 @@ get_class_bit_any(const struct v3d_device_info *devinfo)
else
return CLASS_BITS_PHYS;
}
+
+static uint8_t
+filter_class_bits(const struct v3d_device_info *devinfo, uint8_t class_bits)
+{
+ if (!devinfo->has_accumulators) {
+ assert(class_bits & CLASS_BITS_PHYS);
+ class_bits = CLASS_BITS_PHYS;
+ }
+ return class_bits;
+}
+
static inline uint32_t
temp_to_node(struct v3d_compile *c, uint32_t temp)
{
@@ -413,8 +424,10 @@ v3d_setup_spill_base(struct v3d_compile *c)
*/
if (c->spilling) {
int temp_class = CLASS_BITS_PHYS;
- if (i != c->spill_base.index)
+ if (c->devinfo->has_accumulators &&
+ i != c->spill_base.index) {
temp_class |= CLASS_BITS_ACC;
+ }
add_node(c, i, temp_class);
}
}
@@ -473,14 +486,16 @@ v3d_emit_spill_tmua(struct v3d_compile *c,
* temp will be used immediately so just like the uniform above we
* can allow accumulators.
*/
+ int temp_class =
+ filter_class_bits(c->devinfo, CLASS_BITS_PHYS | CLASS_BITS_ACC);
if (!fill_dst) {
struct qreg dst = vir_TMUWT(c);
assert(dst.file == QFILE_TEMP);
- add_node(c, dst.index, CLASS_BITS_PHYS | CLASS_BITS_ACC);
+ add_node(c, dst.index, temp_class);
} else {
*fill_dst = vir_LDTMU(c);
assert(fill_dst->file == QFILE_TEMP);
- add_node(c, fill_dst->index, CLASS_BITS_PHYS | CLASS_BITS_ACC);
+ add_node(c, fill_dst->index, temp_class);
}
/* Temps across the thread switch we injected can't be assigned to
@@ -662,8 +677,10 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
* instruction immediately after so we
* can use ACC.
*/
- add_node(c, temp.index, CLASS_BITS_PHYS |
- CLASS_BITS_ACC);
+ int temp_class =
+ filter_class_bits(c->devinfo, CLASS_BITS_PHYS |
+ CLASS_BITS_ACC);
+ add_node(c, temp.index, temp_class);
} else {
/* If we have a postponed spill, we
* don't need a fill as the temp would
@@ -941,6 +958,7 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
compiler->reg_class_phys[threads] =
ra_alloc_contig_reg_class(compiler->regs, 1);
+ /* Init physical regs */
for (int i = phys_index;
i < phys_index + (PHYS_COUNT >> threads); i++) {
if (compiler->devinfo->has_accumulators)
@@ -949,16 +967,15 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
ra_class_add_reg(compiler->reg_class_any[threads], i);
}
+ /* Init accumulator regs */
if (compiler->devinfo->has_accumulators) {
for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
ra_class_add_reg(compiler->reg_class_any[threads], i);
}
- }
- /* r5 can only store a single 32-bit value, so not much can
- * use it.
- */
- if (compiler->devinfo->has_accumulators) {
+ /* r5 can only store a single 32-bit value, so not much can
+ * use it.
+ */
ra_class_add_reg(compiler->reg_class_r5[threads],
ACC_INDEX + 5);
ra_class_add_reg(compiler->reg_class_any[threads],
@@ -1081,21 +1098,23 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
* because ldunif has usually a shorter lifespan, allowing for
* more accumulator reuse and QPU merges.
*/
- if (!inst->qpu.sig.ldunif) {
- uint8_t class_bits =
- get_temp_class_bits(c, inst->dst.index) &
- ~CLASS_BITS_R5;
- set_temp_class_bits(c, inst->dst.index,
- class_bits);
-
- } else {
- /* Until V3D 4.x, we could only load a uniform
- * to r5, so we'll need to spill if uniform
- * loads interfere with each other.
- */
- if (c->devinfo->ver < 40) {
+ if (c->devinfo->has_accumulators) {
+ if (!inst->qpu.sig.ldunif) {
+ uint8_t class_bits =
+ get_temp_class_bits(c, inst->dst.index) &
+ ~CLASS_BITS_R5;
set_temp_class_bits(c, inst->dst.index,
- CLASS_BITS_R5);
+ class_bits);
+
+ } else {
+ /* Until V3D 4.x, we could only load a uniform
+ * to r5, so we'll need to spill if uniform
+ * loads interfere with each other.
+ */
+ if (c->devinfo->ver < 40) {
+ set_temp_class_bits(c, inst->dst.index,
+ CLASS_BITS_R5);
+ }
}
}
}
@@ -1152,8 +1171,10 @@ v3d_register_allocate(struct v3d_compile *c)
c->thread_index--;
}
- c->g = ra_alloc_interference_graph(c->compiler->regs,
- c->num_temps + ARRAY_SIZE(acc_nodes));
+ unsigned num_ra_nodes = c->num_temps;
+ if (c->devinfo->has_accumulators)
+ num_ra_nodes += ARRAY_SIZE(acc_nodes);
+ c->g = ra_alloc_interference_graph(c->compiler->regs, num_ra_nodes);
ra_set_select_reg_callback(c->g, v3d_ra_select_callback, &callback_data);
/* Make some fixed nodes for the accumulators, which we will need to
@@ -1162,8 +1183,8 @@ v3d_register_allocate(struct v3d_compile *c)
* live in, but the classes take up a lot of memory to set up, so we
* don't want to make too many.
*/
- for (uint32_t i = 0; i < ACC_COUNT + c->num_temps; i++) {
- if (i < ACC_COUNT) {
+ for (uint32_t i = 0; i < num_ra_nodes; i++) {
+ if (c->devinfo->has_accumulators && i < ACC_COUNT) {
acc_nodes[i] = i;
ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i);
c->nodes.info[i].priority = 0;
--
2.39.2

View File

@ -1,187 +0,0 @@
From fd77cc3204e7c69927f97ce2a1d55d2a47d77a27 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Wed, 29 Sep 2021 12:14:04 +0200
Subject: [PATCH 039/142] broadcom/compiler: don't assign rf0 to temps across
implicit rf0 writes
In platforms that don't have accumulators and have implicit writes to
the register file we need to be careful and avoid assigning a physical
register to a temp that lives across an implicit write to that same
physical register.
For now, we have the case of implicit writes to rf0 from various
signals, but it should be easy to extend this to include additional
registers if needed.
---
src/broadcom/compiler/vir_register_allocate.c | 69 +++++++++++++++----
1 file changed, 57 insertions(+), 12 deletions(-)
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
index e78ccb7c6aa..e0adc1de7a4 100644
--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c
@@ -29,6 +29,9 @@
#define ACC_INDEX 0
#define ACC_COUNT 6
+/* RA nodes used to track RF registers with implicit writes */
+#define IMPLICIT_RF_COUNT 1
+
#define PHYS_COUNT 64
static uint8_t
@@ -67,15 +70,17 @@ filter_class_bits(const struct v3d_device_info *devinfo, uint8_t class_bits)
static inline uint32_t
temp_to_node(struct v3d_compile *c, uint32_t temp)
{
- return temp + (c->devinfo->has_accumulators ? ACC_COUNT : 0);
+ return temp + (c->devinfo->has_accumulators ? ACC_COUNT :
+ IMPLICIT_RF_COUNT);
}
static inline uint32_t
node_to_temp(struct v3d_compile *c, uint32_t node)
{
assert((c->devinfo->has_accumulators && node >= ACC_COUNT) ||
- (!c->devinfo->has_accumulators && node >= 0));
- return node - (c->devinfo->has_accumulators ? ACC_COUNT : 0);
+ (!c->devinfo->has_accumulators && node >= IMPLICIT_RF_COUNT));
+ return node - (c->devinfo->has_accumulators ? ACC_COUNT :
+ IMPLICIT_RF_COUNT);
}
static inline uint8_t
@@ -360,7 +365,8 @@ ensure_nodes(struct v3d_compile *c)
c->nodes.info = reralloc_array_size(c,
c->nodes.info,
sizeof(c->nodes.info[0]),
- c->nodes.alloc_count + ACC_COUNT);
+ c->nodes.alloc_count +
+ MAX2(ACC_COUNT, IMPLICIT_RF_COUNT));
}
/* Creates the interference node for a new temp. We use this to keep the node
@@ -372,7 +378,8 @@ add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits)
ensure_nodes(c);
int node = ra_add_node(c->g, choose_reg_class(c, class_bits));
- assert(node == temp + ACC_COUNT);
+ assert(c->devinfo->has_accumulators ? node == temp + ACC_COUNT :
+ node == temp + IMPLICIT_RF_COUNT);
/* We fill the node priority after we are done inserting spills */
c->nodes.info[node].class_bits = class_bits;
@@ -995,7 +1002,9 @@ tmu_spilling_allowed(struct v3d_compile *c)
}
static void
-update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
+update_graph_and_reg_classes_for_inst(struct v3d_compile *c,
+ int *acc_nodes,
+ int *implicit_rf_nodes,
struct qinst *inst)
{
int32_t ip = inst->ip;
@@ -1025,6 +1034,19 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
}
}
+ /* If any instruction writes to a physical register implicitly
+ * nothing else can write the same register across it.
+ */
+ if (v3d_qpu_writes_rf0_implicitly(c->devinfo, &inst->qpu)) {
+ for (int i = 0; i < c->num_temps; i++) {
+ if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
+ ra_add_node_interference(c->g,
+ temp_to_node(c, i),
+ implicit_rf_nodes[0]);
+ }
+ }
+ }
+
if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
switch (inst->qpu.alu.add.op) {
case V3D_QPU_A_LDVPMV_IN:
@@ -1116,6 +1138,16 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
CLASS_BITS_R5);
}
}
+ } else {
+ /* If the instruction has an implicit write
+ * we can't allocate its dest to the same
+ * register.
+ */
+ if (v3d_qpu_writes_rf0_implicitly(c->devinfo, &inst->qpu)) {
+ ra_add_node_interference(c->g,
+ temp_to_node(c, inst->dst.index),
+ implicit_rf_nodes[0]);
+ }
}
}
@@ -1139,10 +1171,18 @@ struct qpu_reg *
v3d_register_allocate(struct v3d_compile *c)
{
int acc_nodes[ACC_COUNT];
+ int implicit_rf_nodes[IMPLICIT_RF_COUNT];
+
+ unsigned num_ra_nodes = c->num_temps;
+ if (c->devinfo->has_accumulators)
+ num_ra_nodes += ARRAY_SIZE(acc_nodes);
+ else
+ num_ra_nodes += ARRAY_SIZE(implicit_rf_nodes);
+
c->nodes = (struct v3d_ra_node_info) {
.alloc_count = c->num_temps,
.info = ralloc_array_size(c, sizeof(c->nodes.info[0]),
- c->num_temps + ACC_COUNT),
+ num_ra_nodes),
};
uint32_t phys_index = get_phys_index(c->devinfo);
@@ -1171,9 +1211,6 @@ v3d_register_allocate(struct v3d_compile *c)
c->thread_index--;
}
- unsigned num_ra_nodes = c->num_temps;
- if (c->devinfo->has_accumulators)
- num_ra_nodes += ARRAY_SIZE(acc_nodes);
c->g = ra_alloc_interference_graph(c->compiler->regs, num_ra_nodes);
ra_set_select_reg_callback(c->g, v3d_ra_select_callback, &callback_data);
@@ -1181,7 +1218,8 @@ v3d_register_allocate(struct v3d_compile *c)
* interfere with when ops have implied r3/r4 writes or for the thread
* switches. We could represent these as classes for the nodes to
* live in, but the classes take up a lot of memory to set up, so we
- * don't want to make too many.
+ * don't want to make too many. We use the same mechanism on platforms
+ * without accumulators that can have implicit writes to phys regs.
*/
for (uint32_t i = 0; i < num_ra_nodes; i++) {
if (c->devinfo->has_accumulators && i < ACC_COUNT) {
@@ -1189,6 +1227,12 @@ v3d_register_allocate(struct v3d_compile *c)
ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i);
c->nodes.info[i].priority = 0;
c->nodes.info[i].class_bits = 0;
+ } else if (!c->devinfo->has_accumulators &&
+ i < ARRAY_SIZE(implicit_rf_nodes)) {
+ implicit_rf_nodes[i] = i;
+ ra_set_node_reg(c->g, implicit_rf_nodes[i], phys_index + i);
+ c->nodes.info[i].priority = 0;
+ c->nodes.info[i].class_bits = 0;
} else {
uint32_t t = node_to_temp(c, i);
c->nodes.info[i].priority =
@@ -1204,7 +1248,8 @@ v3d_register_allocate(struct v3d_compile *c)
int ip = 0;
vir_for_each_inst_inorder(inst, c) {
inst->ip = ip++;
- update_graph_and_reg_classes_for_inst(c, acc_nodes, inst);
+ update_graph_and_reg_classes_for_inst(c, acc_nodes,
+ implicit_rf_nodes, inst);
}
/* Set the register classes for all our temporaries in the graph */
--
2.39.2

View File

@ -1,33 +0,0 @@
From 9a08ae9f354a6da6d9d71b87800aca8b3df49e29 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Tue, 28 Sep 2021 13:37:28 +0200
Subject: [PATCH 040/142] broadcom/compiler: CS payload registers have changed
in v71
---
src/broadcom/compiler/nir_to_vir.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 1a05b279a2d..220ff6bcd49 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -4362,8 +4362,13 @@ nir_to_vir(struct v3d_compile *c)
V3D_QPU_WADDR_SYNC));
}
- c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
- c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
+ if (c->devinfo->ver <= 42) {
+ c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
+ c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
+ } else if (c->devinfo->ver >= 71) {
+ c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 3));
+ c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
+ }
/* Set up the division between gl_LocalInvocationIndex and
* wg_in_mem in the payload reg.
--
2.39.2

View File

@ -1,46 +0,0 @@
From 5477884196cb54a71f54fa6cad42c6d3326bde88 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Fri, 22 Oct 2021 13:39:48 +0200
Subject: [PATCH 041/142] broadcom/compiler: don't schedule rf0 writes right
after ldvary
ldvary writes rf0 implicitly on the next cycle so they would clash.
This case is not handled correctly by our normal dependency tracking,
which doesn't know anything about delayed writes from instructions
and thinks the rf0 write happens on the same cycle ldvary is emitted.
Fixes (v71):
dEQP-VK.glsl.conversions.matrix_to_matrix.mat2x3_to_mat4x2_fragment
---
src/broadcom/compiler/qpu_schedule.c | 15 +++++++++++++++
1 file changed, 15 insertions(+)
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index 839c0c62315..870823fd2b1 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -652,6 +652,21 @@ writes_too_soon_after_write(const struct v3d_device_info *devinfo,
v3d_qpu_writes_r4(devinfo, inst))
return true;
+ if (devinfo->ver <= 42)
+ return false;
+
+ /* Don't schedule anything that writes rf0 right after ldvary, since
+ * that would clash with the ldvary's delayed rf0 write (the exception
+ * is another ldvary, since its implicit rf0 write would also have
+ * one cycle of delay and would not clash).
+ */
+ if (scoreboard->last_ldvary_tick + 1 == scoreboard->tick &&
+ (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) ||
+ (v3d_qpu_writes_rf0_implicitly(devinfo, inst) &&
+ !inst->sig.ldvary))) {
+ return true;
+ }
+
return false;
}
--
2.39.2

View File

@ -1,60 +0,0 @@
From 31623712c2f741d393767641f32d56c35150eda5 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Thu, 30 Sep 2021 13:22:48 +0200
Subject: [PATCH 042/142] broadcom/compiler: allow instruction merges in v71
In v3d 4.x there were restrictions based on the number of raddrs used
by the combined instructions, but we don't have these restrictions in
v3d 7.x.
It should be noted that while there are no restrictions on the number
of raddrs addressed, a QPU instruction can only address a single small
immediate, so we should be careful about that when we add support for
small immediates.
---
src/broadcom/compiler/qpu_schedule.c | 21 +++++++++++++++++----
1 file changed, 17 insertions(+), 4 deletions(-)
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index 870823fd2b1..ff544fb3c1c 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -906,8 +906,11 @@ qpu_raddrs_used(const struct v3d_qpu_instr *a,
static bool
qpu_merge_raddrs(struct v3d_qpu_instr *result,
const struct v3d_qpu_instr *add_instr,
- const struct v3d_qpu_instr *mul_instr)
+ const struct v3d_qpu_instr *mul_instr,
+ const struct v3d_device_info *devinfo)
{
+ assert(devinfo->ver <= 42);
+
uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr);
int naddrs = util_bitcount64(raddrs_used);
@@ -1111,9 +1114,19 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
add_instr = a;
}
- if (add_instr && mul_instr &&
- !qpu_merge_raddrs(&merge, add_instr, mul_instr)) {
- return false;
+ /* V3D 4.x and earlier use muxes to select the inputs for the ALUs and
+ * they have restrictions on the number of raddrs that can be addressed
+ * in a single instruction.
+ *
+ * FIXME: for V3D 7.x we can't merge instructions if they address more
+ * than one small immediate. For now, we don't support small immediates,
+ * so it is not a problem.
+ */
+ if (devinfo->ver <= 42) {
+ if (add_instr && mul_instr &&
+ !qpu_merge_raddrs(&merge, add_instr, mul_instr, devinfo)) {
+ return false;
+ }
}
merge.sig.thrsw |= b->sig.thrsw;
--
2.39.2

View File

@ -1,172 +0,0 @@
From 959a0128654c94d84fda53ffc108971d3b3a817a Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Wed, 6 Oct 2021 09:27:43 +0200
Subject: [PATCH 043/142] broadcom/qpu: add MOV integer packing/unpacking
variants
These are new in v71 and cover MOV on both the ADD and the MUL alus.
---
src/broadcom/qpu/qpu_instr.h | 9 ++++
src/broadcom/qpu/qpu_pack.c | 98 ++++++++++++++++++++++++++++++++++++
2 files changed, 107 insertions(+)
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
index c86a4119c54..4b34d17bd4c 100644
--- a/src/broadcom/qpu/qpu_instr.h
+++ b/src/broadcom/qpu/qpu_instr.h
@@ -285,6 +285,15 @@ enum v3d_qpu_input_unpack {
/** Swap high and low 16 bits */
V3D_QPU_UNPACK_SWAP_16,
+
+ /** Convert low 16 bits from 16-bit integer to unsigned 32-bit int */
+ V3D_QPU_UNPACK_UL,
+ /** Convert high 16 bits from 16-bit integer to unsigned 32-bit int */
+ V3D_QPU_UNPACK_UH,
+ /** Convert low 16 bits from 16-bit integer to signed 32-bit int */
+ V3D_QPU_UNPACK_IL,
+ /** Convert high 16 bits from 16-bit integer to signed 32-bit int */
+ V3D_QPU_UNPACK_IH,
};
enum v3d_qpu_mux {
diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
index 7a262f18ac3..4d677894755 100644
--- a/src/broadcom/qpu/qpu_pack.c
+++ b/src/broadcom/qpu/qpu_pack.c
@@ -922,6 +922,56 @@ v3d_qpu_float32_unpack_pack(enum v3d_qpu_input_unpack unpacked,
}
}
+static bool
+v3d_qpu_int32_unpack_unpack(uint32_t packed,
+ enum v3d_qpu_input_unpack *unpacked)
+{
+ switch (packed) {
+ case 0:
+ *unpacked = V3D_QPU_UNPACK_NONE;
+ return true;
+ case 1:
+ *unpacked = V3D_QPU_UNPACK_UL;
+ return true;
+ case 2:
+ *unpacked = V3D_QPU_UNPACK_UH;
+ return true;
+ case 3:
+ *unpacked = V3D_QPU_UNPACK_IL;
+ return true;
+ case 4:
+ *unpacked = V3D_QPU_UNPACK_IH;
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool
+v3d_qpu_int32_unpack_pack(enum v3d_qpu_input_unpack unpacked,
+ uint32_t *packed)
+{
+ switch (unpacked) {
+ case V3D_QPU_UNPACK_NONE:
+ *packed = 0;
+ return true;
+ case V3D_QPU_UNPACK_UL:
+ *packed = 1;
+ return true;
+ case V3D_QPU_UNPACK_UH:
+ *packed = 2;
+ return true;
+ case V3D_QPU_UNPACK_IL:
+ *packed = 3;
+ return true;
+ case V3D_QPU_UNPACK_IH:
+ *packed = 4;
+ return true;
+ default:
+ return false;
+ }
+}
+
static bool
v3d_qpu_float16_unpack_unpack(uint32_t packed,
enum v3d_qpu_input_unpack *unpacked)
@@ -1273,6 +1323,15 @@ v3d71_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst
instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
break;
+ case V3D_QPU_A_MOV:
+ instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
+
+ if (!v3d_qpu_int32_unpack_unpack((raddr_b >> 2) & 0x7,
+ &instr->alu.add.a.unpack)) {
+ return false;
+ }
+ break;
+
default:
instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
@@ -1449,6 +1508,15 @@ v3d71_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst
break;
+ case V3D_QPU_M_MOV:
+ instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
+
+ if (!v3d_qpu_int32_unpack_unpack((raddr_d >> 2) & 0x7,
+ &instr->alu.mul.a.unpack)) {
+ return false;
+ }
+ break;
+
default:
instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
instr->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
@@ -1909,6 +1977,21 @@ v3d71_qpu_add_pack(const struct v3d_device_info *devinfo,
opcode |= packed;
break;
+ case V3D_QPU_A_MOV: {
+ uint32_t packed;
+
+ if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE)
+ return false;
+
+ if (!v3d_qpu_int32_unpack_pack(instr->alu.add.a.unpack,
+ &packed)) {
+ return false;
+ }
+
+ raddr_b |= packed << 2;
+ break;
+ }
+
default:
if (instr->alu.add.op != V3D_QPU_A_NOP &&
(instr->alu.add.output_pack != V3D_QPU_PACK_NONE ||
@@ -2126,6 +2209,21 @@ v3d71_qpu_mul_pack(const struct v3d_device_info *devinfo,
break;
}
+ case V3D_QPU_M_MOV: {
+ uint32_t packed;
+
+ if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE)
+ return false;
+
+ if (!v3d_qpu_int32_unpack_pack(instr->alu.mul.a.unpack,
+ &packed)) {
+ return false;
+ }
+
+ raddr_d |= packed << 2;
+ break;
+ }
+
default:
break;
}
--
2.39.2

View File

@ -1,47 +0,0 @@
From 2e86dd0c357d7b432ce6794ae22fbfae89ad186b Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Wed, 6 Oct 2021 12:01:10 +0200
Subject: [PATCH 044/142] broadcom/qpu: fail packing on unhandled mul
pack/unpack
We are doing this for the ADD alu already and it may be helpful to
identify cases where we have QPU code with pack/unpack modifiers on
MUL opcodes that we then are not packing into the actual QPU
instructions.
---
src/broadcom/qpu/qpu_pack.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
index 4d677894755..180d7ab08a3 100644
--- a/src/broadcom/qpu/qpu_pack.c
+++ b/src/broadcom/qpu/qpu_pack.c
@@ -2106,6 +2106,12 @@ v3d33_qpu_mul_pack(const struct v3d_device_info *devinfo,
}
default:
+ if (instr->alu.mul.op != V3D_QPU_M_NOP &&
+ (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE ||
+ instr->alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
+ instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)) {
+ return false;
+ }
break;
}
@@ -2225,6 +2231,12 @@ v3d71_qpu_mul_pack(const struct v3d_device_info *devinfo,
}
default:
+ if (instr->alu.mul.op != V3D_QPU_M_NOP &&
+ (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE ||
+ instr->alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
+ instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)) {
+ return false;
+ }
break;
}
--
2.39.2

View File

@ -1,30 +0,0 @@
From ed6bfa29d43b5a89ff070961454f1e82e23b4f45 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Fri, 8 Oct 2021 15:10:24 +0200
Subject: [PATCH 045/142] broadcom/compiler: generalize check for shaders using
pixel center W
V3D 4.x has pixel center W in rf0 and V3D 7.x has it in rf3. We already
account for this when we set up c->payload_w, so use that.
---
src/broadcom/compiler/nir_to_vir.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 220ff6bcd49..90fe1d1e7f0 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -4547,8 +4547,8 @@ vir_check_payload_w(struct v3d_compile *c)
vir_for_each_inst_inorder(inst, c) {
for (int i = 0; i < vir_get_nsrc(inst); i++) {
- if (inst->src[i].file == QFILE_REG &&
- inst->src[i].index == 0) {
+ if (inst->src[i].file == c->payload_w.file &&
+ inst->src[i].index == c->payload_w.index) {
c->uses_center_w = true;
return;
}
--
2.39.2

View File

@ -1,34 +0,0 @@
From e1a0fa2c2010ef29b8cec798cd0fc99cf44f3a2d Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Thu, 14 Oct 2021 14:16:40 +0200
Subject: [PATCH 046/142] broadcom/compiler: v71 isn't affected by
double-rounding of viewport X,Y coords
---
src/broadcom/compiler/v3d_nir_lower_io.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/src/broadcom/compiler/v3d_nir_lower_io.c b/src/broadcom/compiler/v3d_nir_lower_io.c
index 3ef0e398228..4cdba3748a1 100644
--- a/src/broadcom/compiler/v3d_nir_lower_io.c
+++ b/src/broadcom/compiler/v3d_nir_lower_io.c
@@ -600,9 +600,13 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
* The correct fix for this as recommended by Broadcom
* is to convert to .8 fixed-point with ffloor().
*/
- pos = nir_f2i32(b, nir_ffloor(b, pos));
- v3d_nir_store_output(b, state->vp_vpm_offset + i,
- offset_reg, pos);
+ if (c->devinfo->ver <= 42)
+ pos = nir_f2i32(b, nir_ffloor(b, pos));
+ else
+ pos = nir_f2i32(b, nir_fround_even(b, pos));
+
+ v3d_nir_store_output(b, state->vp_vpm_offset + i,
+ offset_reg, pos);
}
}
--
2.39.2

View File

@ -1,31 +0,0 @@
From 697e6cf01b781b244404872f331a778b6d4e67da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Tue, 19 Oct 2021 11:16:43 +0200
Subject: [PATCH 047/142] broadcom/compiler: update one TMUWT restriction for
v71
The restriction that TMUWT is not allowed in the final instruction
doesn't apply to v71.
---
src/broadcom/compiler/qpu_schedule.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index ff544fb3c1c..25f79aa6f46 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -1700,8 +1700,10 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
/* GFXH-1625: TMUWT not allowed in the final instruction. */
- if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT)
+ if (c->devinfo->ver <= 42 && slot == 2 &&
+ inst->alu.add.op == V3D_QPU_A_TMUWT) {
return false;
+ }
/* No writing physical registers at the end. */
bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP;
--
2.39.2

View File

@ -1,37 +0,0 @@
From 26fea727a9f34b75a3fe3f6a806accaddcc317f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Tue, 19 Oct 2021 11:51:32 +0200
Subject: [PATCH 048/142] broadcom/compiler: update ldunif/ldvary comment for
v71
For v42 and below, ldunif and ldvary both write to r5, but with a
different delay, so we need to take that into account when scheduling both.
For v71 the register used is rf0, but the behaviour is the same, so the
scheduling code can stay the same; only the comment needs updating.
---
src/broadcom/compiler/qpu_schedule.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index 25f79aa6f46..e8197661f89 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -1234,10 +1234,11 @@ retry:
if (pixel_scoreboard_too_soon(c, scoreboard, inst))
continue;
- /* ldunif and ldvary both write r5, but ldunif does so a tick
- * sooner. If the ldvary's r5 wasn't used, then ldunif might
+ /* ldunif and ldvary both write the same register (r5 for v42
+ * and below, rf0 for v71), but ldunif does so a tick sooner.
+ * If the ldvary's register wasn't used, then ldunif might
* otherwise get scheduled so ldunif and ldvary try to update
- * r5 in the same tick.
+ * the register in the same tick.
*/
if ((inst->sig.ldunif || inst->sig.ldunifa) &&
scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
--
2.39.2

View File

@ -1,52 +0,0 @@
From 70456e27b039174f767010f96d9b649e5e42d84f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Tue, 19 Oct 2021 23:52:30 +0200
Subject: [PATCH 049/142] broadcom/compiler: update payload registers handling
when computing live intervals
For v71 the payload registers are not the same. Specifically, rf3 is
now used as a payload register, so this is needed to avoid rf3 being
selected as an instruction dst by the register allocator, overwriting
a payload value that could still be in use.
---
src/broadcom/compiler/vir_live_variables.c | 21 +++++++++++++--------
1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/src/broadcom/compiler/vir_live_variables.c b/src/broadcom/compiler/vir_live_variables.c
index 575b0481dc8..87a7e2b5b81 100644
--- a/src/broadcom/compiler/vir_live_variables.c
+++ b/src/broadcom/compiler/vir_live_variables.c
@@ -179,17 +179,22 @@ vir_setup_def_use(struct v3d_compile *c)
flags_inst = NULL;
}
- /* Payload registers: r0/1/2 contain W, centroid W,
- * and Z at program start. Register allocation will
- * force their nodes to R0/1/2.
+ /* Payload registers: for fragment shaders, W,
+ * centroid W, and Z will be initialized at r0/1/2
+ * until v42, or r1/r2/r3 from v71.
+ *
+ * For compute shaders, payload would be r0/r2 until
+ * v42, r3/r2 from v71
+ *
+ * Register allocation will force their nodes to those
+ * registers.
*/
if (inst->src[0].file == QFILE_REG) {
- switch (inst->src[0].index) {
- case 0:
- case 1:
- case 2:
+ uint32_t min_payload_r = c->devinfo->ver >= 71 ? 1 : 0;
+ uint32_t max_payload_r = c->devinfo->ver >= 71 ? 3 : 2;
+ if (inst->src[0].index >= min_payload_r ||
+ inst->src[0].index <= max_payload_r) {
c->temp_start[inst->dst.index] = 0;
- break;
}
}
--
2.39.2

View File

@ -1,235 +0,0 @@
From f9a76b3a1e316e5ed6387819b87eaaf60f989a2b Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Tue, 26 Oct 2021 11:43:02 +0200
Subject: [PATCH 050/142] broadcom/compiler: update peripheral access
restrictions for v71
In V3D 4.x only a couple of simultaneous accesses were allowed, but
V3D 7.x is a bit more flexible, so rather than trying to check for all
the allowed combinations it is easier to check whether we are in one of
the disallowed ones.
Shader-db (pi5):
total instructions in shared programs: 11338883 -> 11307386 (-0.28%)
instructions in affected programs: 2727201 -> 2695704 (-1.15%)
helped: 12555
HURT: 289
Instructions are helped.
total max-temps in shared programs: 2230199 -> 2229260 (-0.04%)
max-temps in affected programs: 20508 -> 19569 (-4.58%)
helped: 608
HURT: 4
Max-temps are helped.
total sfu-stalls in shared programs: 15236 -> 15293 (0.37%)
sfu-stalls in affected programs: 148 -> 205 (38.51%)
helped: 38
HURT: 64
Inconclusive result (%-change mean confidence interval includes 0).
total inst-and-stalls in shared programs: 11354119 -> 11322679 (-0.28%)
inst-and-stalls in affected programs: 2732262 -> 2700822 (-1.15%)
helped: 12550
HURT: 304
Inst-and-stalls are helped.
total nops in shared programs: 273711 -> 274095 (0.14%)
nops in affected programs: 9626 -> 10010 (3.99%)
helped: 186
HURT: 397
Nops are HURT.
---
src/broadcom/compiler/qpu_schedule.c | 88 +++++++++++++++++++++-------
src/broadcom/compiler/qpu_validate.c | 2 +-
src/broadcom/qpu/qpu_instr.c | 16 +++--
src/broadcom/qpu/qpu_instr.h | 2 +
4 files changed, 82 insertions(+), 26 deletions(-)
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index e8197661f89..adb501e85ce 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -790,7 +790,8 @@ enum {
V3D_PERIPHERAL_TMU_WAIT = (1 << 6),
V3D_PERIPHERAL_TMU_WRTMUC_SIG = (1 << 7),
V3D_PERIPHERAL_TSY = (1 << 8),
- V3D_PERIPHERAL_TLB = (1 << 9),
+ V3D_PERIPHERAL_TLB_READ = (1 << 9),
+ V3D_PERIPHERAL_TLB_WRITE = (1 << 10),
};
static uint32_t
@@ -815,8 +816,10 @@ qpu_peripherals(const struct v3d_device_info *devinfo,
if (v3d_qpu_uses_sfu(inst))
result |= V3D_PERIPHERAL_SFU;
- if (v3d_qpu_uses_tlb(inst))
- result |= V3D_PERIPHERAL_TLB;
+ if (v3d_qpu_reads_tlb(inst))
+ result |= V3D_PERIPHERAL_TLB_READ;
+ if (v3d_qpu_writes_tlb(inst))
+ result |= V3D_PERIPHERAL_TLB_WRITE;
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
if (inst->alu.add.op != V3D_QPU_A_NOP &&
@@ -847,32 +850,75 @@ qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo,
if (devinfo->ver < 41)
return false;
- /* V3D 4.1+ allow WRTMUC signal with TMU register write (other than
- * tmuc).
+ /* V3D 4.x can't do more than one peripheral access except in a
+ * few cases:
*/
- if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
- b_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
- return v3d_qpu_writes_tmu_not_tmuc(devinfo, b);
+ if (devinfo->ver <= 42) {
+ /* WRTMUC signal with TMU register write (other than tmuc). */
+ if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
+ b_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
+ return v3d_qpu_writes_tmu_not_tmuc(devinfo, b);
+ }
+ if (b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
+ a_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
+ return v3d_qpu_writes_tmu_not_tmuc(devinfo, a);
+ }
+
+ /* TMU read with VPM read/write. */
+ if (a_peripherals == V3D_PERIPHERAL_TMU_READ &&
+ (b_peripherals == V3D_PERIPHERAL_VPM_READ ||
+ b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
+ return true;
+ }
+ if (b_peripherals == V3D_PERIPHERAL_TMU_READ &&
+ (a_peripherals == V3D_PERIPHERAL_VPM_READ ||
+ a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
+ return true;
+ }
+
+ return false;
}
- if (a_peripherals == V3D_PERIPHERAL_TMU_WRITE &&
- b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG) {
- return v3d_qpu_writes_tmu_not_tmuc(devinfo, a);
+ /* V3D 7.x can't have more than one of these restricted peripherals */
+ const uint32_t restricted = V3D_PERIPHERAL_TMU_WRITE |
+ V3D_PERIPHERAL_TMU_WRTMUC_SIG |
+ V3D_PERIPHERAL_TSY |
+ V3D_PERIPHERAL_TLB_READ |
+ V3D_PERIPHERAL_SFU |
+ V3D_PERIPHERAL_VPM_READ |
+ V3D_PERIPHERAL_VPM_WRITE;
+
+ const uint32_t a_restricted = a_peripherals & restricted;
+ const uint32_t b_restricted = b_peripherals & restricted;
+ if (a_restricted && b_restricted) {
+ /* WRTMUC signal with TMU register write (other than tmuc) is
+ * allowed though.
+ */
+ if (!((a_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
+ b_restricted == V3D_PERIPHERAL_TMU_WRITE &&
+ v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) ||
+ (b_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
+ a_restricted == V3D_PERIPHERAL_TMU_WRITE &&
+ v3d_qpu_writes_tmu_not_tmuc(devinfo, a)))) {
+ return false;
+ }
}
- /* V3D 4.1+ allows TMU read with VPM read/write. */
- if (a_peripherals == V3D_PERIPHERAL_TMU_READ &&
- (b_peripherals == V3D_PERIPHERAL_VPM_READ ||
- b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
- return true;
+ /* Only one TMU read per instruction */
+ if ((a_peripherals & V3D_PERIPHERAL_TMU_READ) &&
+ (b_peripherals & V3D_PERIPHERAL_TMU_READ)) {
+ return false;
}
- if (b_peripherals == V3D_PERIPHERAL_TMU_READ &&
- (a_peripherals == V3D_PERIPHERAL_VPM_READ ||
- a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
- return true;
+
+ /* Only one TLB access per instruction */
+ if ((a_peripherals & (V3D_PERIPHERAL_TLB_WRITE |
+ V3D_PERIPHERAL_TLB_READ)) &&
+ (b_peripherals & (V3D_PERIPHERAL_TLB_WRITE |
+ V3D_PERIPHERAL_TLB_READ))) {
+ return false;
}
- return false;
+ return true;
}
/* Compute a bitmask of which rf registers are used between
diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c
index 12788692432..fde6695d59b 100644
--- a/src/broadcom/compiler/qpu_validate.c
+++ b/src/broadcom/compiler/qpu_validate.c
@@ -227,7 +227,7 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
vpm_writes +
tlb_writes +
tsy_writes +
- inst->sig.ldtmu +
+ (devinfo->ver <= 42 ? inst->sig.ldtmu : 0) +
inst->sig.ldtlb +
inst->sig.ldvpm +
inst->sig.ldtlbu > 1) {
diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
index 195a0dcd232..f54ce7210fb 100644
--- a/src/broadcom/qpu/qpu_instr.c
+++ b/src/broadcom/qpu/qpu_instr.c
@@ -649,12 +649,14 @@ v3d_qpu_add_op_writes_vpm(enum v3d_qpu_add_op op)
}
bool
-v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst)
+v3d_qpu_reads_tlb(const struct v3d_qpu_instr *inst)
{
- if (inst->sig.ldtlb ||
- inst->sig.ldtlbu)
- return true;
+ return inst->sig.ldtlb || inst->sig.ldtlbu;
+}
+bool
+v3d_qpu_writes_tlb(const struct v3d_qpu_instr *inst)
+{
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
if (inst->alu.add.op != V3D_QPU_A_NOP &&
inst->alu.add.magic_write &&
@@ -672,6 +674,12 @@ v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst)
return false;
}
+bool
+v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst)
+{
+ return v3d_qpu_writes_tlb(inst) || v3d_qpu_reads_tlb(inst);
+}
+
bool
v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst)
{
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
index 4b34d17bd4c..dece45c5c54 100644
--- a/src/broadcom/qpu/qpu_instr.h
+++ b/src/broadcom/qpu/qpu_instr.h
@@ -472,6 +472,8 @@ bool v3d_qpu_magic_waddr_is_tlb(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
bool v3d_qpu_magic_waddr_is_vpm(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
bool v3d_qpu_magic_waddr_is_tsy(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
bool v3d_qpu_magic_waddr_loads_unif(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
+bool v3d_qpu_reads_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
+bool v3d_qpu_writes_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_instr_is_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_instr_is_legacy_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
--
2.39.2

View File

@ -1,61 +0,0 @@
From 3520cceb87fb2f9765ba7dbe2771fbd0cadca78d Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Tue, 26 Oct 2021 08:37:54 +0200
Subject: [PATCH 051/142] broadcom/qpu: add packing for fmov on ADD alu
---
src/broadcom/qpu/qpu_pack.c | 31 +++++++++++++++++++++++++++++++
1 file changed, 31 insertions(+)
diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
index 180d7ab08a3..ed5a8bc667d 100644
--- a/src/broadcom/qpu/qpu_pack.c
+++ b/src/broadcom/qpu/qpu_pack.c
@@ -1332,6 +1332,20 @@ v3d71_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst
}
break;
+ case V3D_QPU_A_FMOV:
+ instr->alu.add.output_pack = raddr_b & 0x3;
+
+ /* Mul alu FMOV has one additional variant */
+ int32_t unpack = (raddr_b >> 2) & 0x7;
+ if (unpack == 7)
+ return false;
+
+ if (!v3d_qpu_float32_unpack_unpack(unpack,
+ &instr->alu.add.a.unpack)) {
+ return false;
+ }
+ break;
+
default:
instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
@@ -1992,6 +2006,23 @@ v3d71_qpu_add_pack(const struct v3d_device_info *devinfo,
break;
}
+ case V3D_QPU_A_FMOV: {
+ uint32_t packed;
+
+ if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack,
+ &packed)) {
+ return false;
+ }
+ raddr_b = packed;
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
+ &packed)) {
+ return false;
+ }
+ raddr_b |= packed << 2;
+ break;
+ }
+
default:
if (instr->alu.add.op != V3D_QPU_A_NOP &&
(instr->alu.add.output_pack != V3D_QPU_PACK_NONE ||
--
2.39.2

View File

@ -1,155 +0,0 @@
From 7c7ab15b3c9def4bc3bb5be492228a933c325f8a Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Wed, 6 Oct 2021 13:58:27 +0200
Subject: [PATCH 052/142] broadcom/compiler: handle rf0 flops storage
restriction in v71
---
src/broadcom/compiler/qpu_schedule.c | 81 +++++++++++++++++++++++++++-
1 file changed, 79 insertions(+), 2 deletions(-)
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index adb501e85ce..7048d9257b6 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -538,6 +538,10 @@ struct choose_scoreboard {
int ldvary_count;
int pending_ldtmu_count;
bool first_ldtmu_after_thrsw;
+
+ /* V3D 7.x */
+ int last_implicit_rf0_write_tick;
+ bool has_rf0_flops_conflict;
};
static bool
@@ -1499,6 +1503,62 @@ update_scoreboard_tmu_tracking(struct choose_scoreboard *scoreboard,
}
}
+static void
+set_has_rf0_flops_conflict(struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst,
+ const struct v3d_device_info *devinfo)
+{
+ if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick &&
+ v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
+ !inst->sig_magic) {
+ scoreboard->has_rf0_flops_conflict = true;
+ }
+}
+
+static void
+update_scoreboard_for_rf0_flops(struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst,
+ const struct v3d_device_info *devinfo)
+{
+ if (devinfo->ver < 71)
+ return;
+
+ /* Thread switch restrictions:
+ *
+ * At the point of a thread switch or thread end (when the actual
+ * thread switch or thread end happens, not when the signalling
+ * instruction is processed):
+ *
+ * - If the most recent write to rf0 was from a ldunif, ldunifa, or
+ * ldvary instruction in which another signal also wrote to the
+ * register file, and the final instruction of the thread section
+ * contained a signal which wrote to the register file, then the
+ * value of rf0 is undefined at the start of the new section
+ *
+ * Here we use the scoreboard to track if our last rf0 implicit write
+ * happens at the same time that another signal writes the register
+ * file (has_rf0_flops_conflict). We will use that information when
+ * scheduling thrsw instructions to avoid putting anything in their
+ * last delay slot which has a signal that writes to the register file.
+ */
+
+ /* Reset tracking if we have an explicit rf0 write or we are starting
+ * a new thread section.
+ */
+ if (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) ||
+ scoreboard->tick - scoreboard->last_thrsw_tick == 3) {
+ scoreboard->last_implicit_rf0_write_tick = -10;
+ scoreboard->has_rf0_flops_conflict = false;
+ }
+
+ if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) {
+ scoreboard->last_implicit_rf0_write_tick = inst->sig.ldvary ?
+ scoreboard->tick + 1 : scoreboard->tick;
+ }
+
+ set_has_rf0_flops_conflict(scoreboard, inst, devinfo);
+}
+
static void
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
const struct qinst *qinst,
@@ -1542,6 +1602,8 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
if (inst->sig.ldvary)
scoreboard->last_ldvary_tick = scoreboard->tick;
+ update_scoreboard_for_rf0_flops(scoreboard, inst, devinfo);
+
update_scoreboard_tmu_tracking(scoreboard, qinst);
}
@@ -1812,6 +1874,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
*/
static bool
qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
+ struct choose_scoreboard *scoreboard,
const struct qinst *qinst,
uint32_t slot)
{
@@ -1842,6 +1905,17 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu))
return false;
+ /* See comment when we set has_rf0_flops_conflict for details */
+ if (c->devinfo->ver >= 71 &&
+ slot == 2 &&
+ v3d_qpu_sig_writes_address(c->devinfo, &qinst->qpu.sig) &&
+ !qinst->qpu.sig_magic) {
+ if (scoreboard->has_rf0_flops_conflict)
+ return false;
+ if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick)
+ return false;
+ }
+
return true;
}
@@ -1874,7 +1948,7 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
* also apply to instructions scheduled after the thrsw that we want
* to place in its delay slots.
*/
- if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
+ if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, qinst, slot))
return false;
/* TLB access is disallowed until scoreboard wait is executed, which
@@ -1947,8 +2021,10 @@ valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard
bool is_thrend)
{
for (int slot = 0; slot < instructions_in_sequence; slot++) {
- if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
+ if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard,
+ qinst, slot)) {
return false;
+ }
if (is_thrend &&
!qpu_inst_valid_in_thrend_slot(c, qinst, slot)) {
@@ -2718,6 +2794,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c)
scoreboard.last_setmsf_tick = -10;
scoreboard.last_stallable_sfu_tick = -10;
scoreboard.first_ldtmu_after_thrsw = true;
+ scoreboard.last_implicit_rf0_write_tick = - 10;
if (debug) {
fprintf(stderr, "Pre-schedule instructions\n");
--
2.39.2

View File

@ -1,189 +0,0 @@
From 0c6910721eb50b38b3388c2d2344b6ecfe0fee58 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Wed, 27 Oct 2021 11:35:12 +0200
Subject: [PATCH 053/142] broadcom/compiler: enable ldvary pipelining on v71
---
src/broadcom/compiler/qpu_schedule.c | 121 ++++++++++++++++++---------
1 file changed, 80 insertions(+), 41 deletions(-)
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index 7048d9257b6..334ffdc6d58 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -2312,46 +2312,72 @@ emit_branch(struct v3d_compile *c,
}
static bool
-alu_reads_register(struct v3d_qpu_instr *inst,
+alu_reads_register(const struct v3d_device_info *devinfo,
+ struct v3d_qpu_instr *inst,
bool add, bool magic, uint32_t index)
{
uint32_t num_src;
- enum v3d_qpu_mux mux_a, mux_b;
-
- if (add) {
+ if (add)
num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
- mux_a = inst->alu.add.a.mux;
- mux_b = inst->alu.add.b.mux;
- } else {
+ else
num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
- mux_a = inst->alu.mul.a.mux;
- mux_b = inst->alu.mul.b.mux;
- }
- for (int i = 0; i < num_src; i++) {
- if (magic) {
- if (i == 0 && mux_a == index)
- return true;
- if (i == 1 && mux_b == index)
- return true;
+ if (devinfo->ver <= 42) {
+ enum v3d_qpu_mux mux_a, mux_b;
+ if (add) {
+ mux_a = inst->alu.add.a.mux;
+ mux_b = inst->alu.add.b.mux;
} else {
- if (i == 0 && mux_a == V3D_QPU_MUX_A &&
- inst->raddr_a == index) {
- return true;
- }
- if (i == 0 && mux_a == V3D_QPU_MUX_B &&
- inst->raddr_b == index) {
- return true;
- }
- if (i == 1 && mux_b == V3D_QPU_MUX_A &&
- inst->raddr_a == index) {
- return true;
- }
- if (i == 1 && mux_b == V3D_QPU_MUX_B &&
- inst->raddr_b == index) {
- return true;
+ mux_a = inst->alu.mul.a.mux;
+ mux_b = inst->alu.mul.b.mux;
+ }
+
+ for (int i = 0; i < num_src; i++) {
+ if (magic) {
+ if (i == 0 && mux_a == index)
+ return true;
+ if (i == 1 && mux_b == index)
+ return true;
+ } else {
+ if (i == 0 && mux_a == V3D_QPU_MUX_A &&
+ inst->raddr_a == index) {
+ return true;
+ }
+ if (i == 0 && mux_a == V3D_QPU_MUX_B &&
+ inst->raddr_b == index) {
+ return true;
+ }
+ if (i == 1 && mux_b == V3D_QPU_MUX_A &&
+ inst->raddr_a == index) {
+ return true;
+ }
+ if (i == 1 && mux_b == V3D_QPU_MUX_B &&
+ inst->raddr_b == index) {
+ return true;
+ }
}
}
+
+ return false;
+ }
+
+ assert(devinfo->ver >= 71);
+ assert(!magic);
+
+ uint32_t raddr_a, raddr_b;
+ if (add) {
+ raddr_a = inst->alu.add.a.raddr;
+ raddr_b = inst->alu.add.b.raddr;
+ } else {
+ raddr_a = inst->alu.mul.a.raddr;
+ raddr_b = inst->alu.mul.b.raddr;
+ }
+
+ for (int i = 0; i < num_src; i++) {
+ if (i == 0 && raddr_a == index)
+ return true;
+ if (i == 1 && raddr_b == index)
+ return true;
}
return false;
@@ -2386,6 +2412,8 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
struct qblock *block,
struct v3d_qpu_instr *inst)
{
+ const struct v3d_device_info *devinfo = c->devinfo;
+
/* We only call this if we have successfully merged an ldvary into a
* previous instruction.
*/
@@ -2398,9 +2426,9 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
* the ldvary destination, if it does, then moving the ldvary before
* it would overwrite it.
*/
- if (alu_reads_register(inst, true, ldvary_magic, ldvary_index))
+ if (alu_reads_register(devinfo, inst, true, ldvary_magic, ldvary_index))
return false;
- if (alu_reads_register(inst, false, ldvary_magic, ldvary_index))
+ if (alu_reads_register(devinfo, inst, false, ldvary_magic, ldvary_index))
return false;
/* The implicit ldvary destination may not be written to by a signal
@@ -2436,13 +2464,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
}
/* The previous instruction cannot have a conflicting signal */
- if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig))
+ if (v3d_qpu_sig_writes_address(devinfo, &prev->qpu.sig))
return false;
uint32_t sig;
struct v3d_qpu_sig new_sig = prev->qpu.sig;
new_sig.ldvary = true;
- if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig))
+ if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig))
return false;
/* The previous instruction cannot use flags since ldvary uses the
@@ -2471,14 +2499,25 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
inst->sig_magic = false;
inst->sig_addr = 0;
- /* By moving ldvary to the previous instruction we make it update
- * r5 in the current one, so nothing else in it should write r5.
- * This should've been prevented by our dependency tracking, which
+ /* Update rf0 flops tracking for new ldvary delayed rf0 write tick */
+ if (devinfo->ver >= 71) {
+ scoreboard->last_implicit_rf0_write_tick = scoreboard->tick;
+ set_has_rf0_flops_conflict(scoreboard, inst, devinfo);
+ }
+
+ /* By moving ldvary to the previous instruction we make it update r5
+ * (rf0 for ver >= 71) in the current one, so nothing else in it
+ * should write this register.
+ *
+ * This should've been prevented by our dependency tracking, which
* would not allow ldvary to be paired up with an instruction that
- * writes r5 (since our dependency tracking doesn't know that the
- * ldvary write r5 happens in the next instruction).
+ * writes r5/rf0 (since our dependency tracking doesn't know that the
+ * ldvary write to r5/rf0 happens in the next instruction).
*/
- assert(!v3d_qpu_writes_r5(c->devinfo, inst));
+ assert(!v3d_qpu_writes_r5(devinfo, inst));
+ assert(devinfo->ver <= 42 ||
+ (!v3d_qpu_writes_rf0_implicitly(devinfo, inst) &&
+ !v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0)));
return true;
}
--
2.39.2

View File

@ -1,144 +0,0 @@
From 0670d642bb91fc68ce73f2d9fb88c482295a446d Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Thu, 28 Oct 2021 14:13:29 +0200
Subject: [PATCH 054/142] broadcom/compiler: try to use ldunif(a) instead of
ldunif(a)rf in v71
The rf variants need to encode the destination in the cond bits, which
prevents them from being merged with any other instruction that needs them.
In 4.x, ldunif(a) write to r5 which is a special register that only
ldunif(a) and ldvary can write so we have a special register class for
it and only allow it for them. Then when we need to choose a register
for a node, if this register is available we always use it.
In 7.x these instructions write to rf0, which can be used by any
instruction, so instead of restricting rf0, we track the temps that
are used as ldunif(a) destinations and use that information to favor
rf0 for them.
---
src/broadcom/compiler/v3d_compiler.h | 3 ++
src/broadcom/compiler/vir_register_allocate.c | 34 ++++++++++++++++---
src/broadcom/compiler/vir_to_qpu.c | 11 ++++--
3 files changed, 41 insertions(+), 7 deletions(-)
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index 7e8f3bfc1a7..36adf8830b5 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -613,6 +613,9 @@ struct v3d_ra_node_info {
struct {
uint32_t priority;
uint8_t class_bits;
+
+ /* V3D 7.x */
+ bool is_ldunif_dst;
} *info;
uint32_t alloc_count;
};
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
index e0adc1de7a4..1be091f8518 100644
--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c
@@ -384,6 +384,7 @@ add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits)
/* We fill the node priority after we are done inserting spills */
c->nodes.info[node].class_bits = class_bits;
c->nodes.info[node].priority = 0;
+ c->nodes.info[node].is_ldunif_dst = false;
}
/* The spill offset for this thread takes a bit of setup, so do it once at
@@ -899,9 +900,22 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra,
static bool
v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
+ unsigned int node,
BITSET_WORD *regs,
unsigned int *out)
{
+ /* In V3D 7.x, try to assign rf0 to temps used as ldunif's dst
+ * so we can avoid turning them into ldunifrf (which uses the
+ * cond field to encode the dst and would prevent merge with
+ * instructions that use cond flags).
+ */
+ if (v3d_ra->nodes->info[node].is_ldunif_dst &&
+ BITSET_TEST(regs, v3d_ra->phys_index)) {
+ assert(v3d_ra->devinfo->ver >= 71);
+ *out = v3d_ra->phys_index;
+ return true;
+ }
+
for (int i = 0; i < PHYS_COUNT; i++) {
int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
int phys = v3d_ra->phys_index + phys_off;
@@ -927,7 +941,7 @@ v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data)
return reg;
}
- if (v3d_ra_select_rf(v3d_ra, regs, &reg))
+ if (v3d_ra_select_rf(v3d_ra, n, regs, &reg))
return reg;
/* If we ran out of physical registers try to assign an accumulator
@@ -1139,15 +1153,24 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c,
}
}
} else {
- /* If the instruction has an implicit write
- * we can't allocate its dest to the same
- * register.
+ /* Make sure we don't allocate the ldvary's
+ * destination to rf0, since it would clash
+ * with its implicit write to that register.
*/
- if (v3d_qpu_writes_rf0_implicitly(c->devinfo, &inst->qpu)) {
+ if (inst->qpu.sig.ldvary) {
ra_add_node_interference(c->g,
temp_to_node(c, inst->dst.index),
implicit_rf_nodes[0]);
}
+ /* Flag dst temps from ldunif(a) instructions
+ * so we can try to assign rf0 to them and avoid
+ * converting these to ldunif(a)rf.
+ */
+ if (inst->qpu.sig.ldunif || inst->qpu.sig.ldunifa) {
+ const uint32_t dst_n =
+ temp_to_node(c, inst->dst.index);
+ c->nodes.info[dst_n].is_ldunif_dst = true;
+ }
}
}
@@ -1222,6 +1245,7 @@ v3d_register_allocate(struct v3d_compile *c)
* without accumulators that can have implicit writes to phys regs.
*/
for (uint32_t i = 0; i < num_ra_nodes; i++) {
+ c->nodes.info[i].is_ldunif_dst = false;
if (c->devinfo->has_accumulators && i < ACC_COUNT) {
acc_nodes[i] = i;
ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i);
diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c
index afc4941fdb1..cbbb495592b 100644
--- a/src/broadcom/compiler/vir_to_qpu.c
+++ b/src/broadcom/compiler/vir_to_qpu.c
@@ -345,8 +345,15 @@ v3d_generate_code_block(struct v3d_compile *c,
assert(qinst->qpu.alu.add.op == V3D_QPU_A_NOP);
assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);
- if (!dst.magic ||
- dst.index != V3D_QPU_WADDR_R5) {
+ bool use_rf;
+ if (c->devinfo->has_accumulators) {
+ use_rf = !dst.magic ||
+ dst.index != V3D_QPU_WADDR_R5;
+ } else {
+ use_rf = dst.magic || dst.index != 0;
+ }
+
+ if (use_rf) {
assert(c->devinfo->ver >= 40);
if (qinst->qpu.sig.ldunif) {
--
2.39.2

View File

@ -1,82 +0,0 @@
From cbed3b97394da09c9ae644c79e098e3ba8b5c3e8 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Fri, 29 Oct 2021 13:00:56 +0200
Subject: [PATCH 055/142] broadcom/compiler: don't assign rf0 to temps that
conflict with ldvary
ldvary writes to rf0 implicitly, so we don't want to allocate rf0 to
any temps that are live across ldvary's rf0 live ranges.
---
src/broadcom/compiler/vir_register_allocate.c | 39 ++++++++++++++++++-
1 file changed, 38 insertions(+), 1 deletion(-)
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
index 1be091f8518..6f7b1ca0589 100644
--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c
@@ -1019,6 +1019,7 @@ static void
update_graph_and_reg_classes_for_inst(struct v3d_compile *c,
int *acc_nodes,
int *implicit_rf_nodes,
+ int last_ldvary_ip,
struct qinst *inst)
{
int32_t ip = inst->ip;
@@ -1125,6 +1126,25 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c,
}
}
+ /* Don't allocate rf0 to temps that cross ranges where we have
+ * live implicit rf0 writes from ldvary. We can identify these
+ * by tracking the last ldvary instruction and explicit reads
+ * of rf0.
+ */
+ if (c->devinfo->ver >= 71 &&
+ ((inst->src[0].file == QFILE_REG && inst->src[0].index == 0) ||
+ (vir_get_nsrc(inst) > 1 &&
+ inst->src[1].file == QFILE_REG && inst->src[1].index == 0))) {
+ for (int i = 0; i < c->num_temps; i++) {
+ if (c->temp_start[i] < ip &&
+ c->temp_end[i] > last_ldvary_ip) {
+ ra_add_node_interference(c->g,
+ temp_to_node(c, i),
+ implicit_rf_nodes[0]);
+ }
+ }
+ }
+
if (inst->dst.file == QFILE_TEMP) {
/* Only a ldunif gets to write to R5, which only has a
* single 32-bit channel of storage.
@@ -1270,10 +1290,27 @@ v3d_register_allocate(struct v3d_compile *c)
* interferences.
*/
int ip = 0;
+ int last_ldvary_ip = -1;
vir_for_each_inst_inorder(inst, c) {
inst->ip = ip++;
+
+ /* ldunif(a) always write to a temporary, so we have
+ * liveness info available to decide if rf0 is
+ * available for them, however, ldvary is different:
+ * it always writes to rf0 directly so we don't have
+ * liveness information for its implicit rf0 write.
+ *
+ * That means the allocator may assign rf0 to a temp
+ * that is defined while an implicit rf0 write from
+ * ldvary is still live. We fix that by manually
+ * tracking rf0 live ranges from ldvary instructions.
+ */
+ if (inst->qpu.sig.ldvary)
+ last_ldvary_ip = ip;
+
update_graph_and_reg_classes_for_inst(c, acc_nodes,
- implicit_rf_nodes, inst);
+ implicit_rf_nodes,
+ last_ldvary_ip, inst);
}
/* Set the register classes for all our temporaries in the graph */
--
2.39.2

View File

@ -1,139 +0,0 @@
From cbaa469c09974c1574b16f559173694904fe1bb0 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Mon, 25 Oct 2021 09:38:57 +0200
Subject: [PATCH 056/142] broadcom/compiler: convert mul to add when needed to
allow merge
V3D 7.x added 'mov' opcodes to the ADD alu, so now it is possible to
move these to the ADD alu to facilitate merging them with other MUL
instructions.
---
src/broadcom/compiler/qpu_schedule.c | 102 ++++++++++++++++++++++++---
1 file changed, 94 insertions(+), 8 deletions(-)
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index 334ffdc6d58..caa84254998 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -1086,6 +1086,57 @@ qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
inst->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
}
+static bool
+can_do_mul_as_add(const struct v3d_device_info *devinfo, enum v3d_qpu_mul_op op)
+{
+ switch (op) {
+ case V3D_QPU_M_MOV:
+ case V3D_QPU_M_FMOV:
+ return devinfo->ver >= 71;
+ default:
+ return false;
+ }
+}
+
+static enum v3d_qpu_mul_op
+mul_op_as_add_op(enum v3d_qpu_mul_op op)
+{
+ switch (op) {
+ case V3D_QPU_M_MOV:
+ return V3D_QPU_A_MOV;
+ case V3D_QPU_M_FMOV:
+ return V3D_QPU_A_FMOV;
+ default:
+ unreachable("unexpected mov opcode");
+ }
+}
+
+static void
+qpu_convert_mul_to_add(struct v3d_qpu_instr *inst)
+{
+ STATIC_ASSERT(sizeof(inst->alu.add) == sizeof(inst->alu.mul));
+ assert(inst->alu.mul.op != V3D_QPU_M_NOP);
+ assert(inst->alu.add.op == V3D_QPU_A_NOP);
+
+ memcpy(&inst->alu.add, &inst->alu.mul, sizeof(inst->alu.add));
+ inst->alu.add.op = mul_op_as_add_op(inst->alu.mul.op);
+ inst->alu.mul.op = V3D_QPU_M_NOP;
+
+ inst->flags.ac = inst->flags.mc;
+ inst->flags.apf = inst->flags.mpf;
+ inst->flags.auf = inst->flags.muf;
+ inst->flags.mc = V3D_QPU_COND_NONE;
+ inst->flags.mpf = V3D_QPU_PF_NONE;
+ inst->flags.muf = V3D_QPU_UF_NONE;
+
+ inst->alu.add.output_pack = inst->alu.mul.output_pack;
+ inst->alu.add.a.unpack = inst->alu.mul.a.unpack;
+ inst->alu.add.b.unpack = inst->alu.mul.b.unpack;
+ inst->alu.mul.output_pack = V3D_QPU_PACK_NONE;
+ inst->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
+ inst->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
+}
+
static bool
qpu_merge_inst(const struct v3d_device_info *devinfo,
struct v3d_qpu_instr *result,
@@ -1151,17 +1202,52 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
}
}
+ struct v3d_qpu_instr add_inst;
if (b->alu.mul.op != V3D_QPU_M_NOP) {
- if (a->alu.mul.op != V3D_QPU_M_NOP)
- return false;
- merge.alu.mul = b->alu.mul;
+ if (a->alu.mul.op == V3D_QPU_M_NOP) {
+ merge.alu.mul = b->alu.mul;
+
+ merge.flags.mc = b->flags.mc;
+ merge.flags.mpf = b->flags.mpf;
+ merge.flags.muf = b->flags.muf;
+
+ mul_instr = b;
+ add_instr = a;
+ }
+ /* If a's mul op is used but its add op is not, then see if we
+ * can convert either a's mul op or b's mul op to an add op
+ * so we can merge.
+ */
+ else if (a->alu.add.op == V3D_QPU_A_NOP &&
+ can_do_mul_as_add(devinfo, b->alu.mul.op)) {
+ add_inst = *b;
+ qpu_convert_mul_to_add(&add_inst);
- merge.flags.mc = b->flags.mc;
- merge.flags.mpf = b->flags.mpf;
- merge.flags.muf = b->flags.muf;
+ merge.alu.add = add_inst.alu.add;
- mul_instr = b;
- add_instr = a;
+ merge.flags.ac = b->flags.mc;
+ merge.flags.apf = b->flags.mpf;
+ merge.flags.auf = b->flags.muf;
+
+ mul_instr = a;
+ add_instr = &add_inst;
+ } else if (a->alu.add.op == V3D_QPU_A_NOP &&
+ can_do_mul_as_add(devinfo, a->alu.mul.op)) {
+ add_inst = *a;
+ qpu_convert_mul_to_add(&add_inst);
+
+ merge = add_inst;
+ merge.alu.mul = b->alu.mul;
+
+ merge.flags.mc = b->flags.mc;
+ merge.flags.mpf = b->flags.mpf;
+ merge.flags.muf = b->flags.muf;
+
+ mul_instr = b;
+ add_instr = &add_inst;
+ } else {
+ return false;
+ }
}
/* V3D 4.x and earlier use muxes to select the inputs for the ALUs and
--
2.39.2

View File

@ -1,418 +0,0 @@
From b59b3725fb16f4ab1ac0db86a5452a4ed6176074 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Wed, 3 Nov 2021 10:34:19 +0100
Subject: [PATCH 057/142] broadcom/compiler: implement small immediates for v71
---
src/broadcom/compiler/qpu_schedule.c | 90 +++++++++++++------
src/broadcom/compiler/qpu_validate.c | 20 ++++-
.../compiler/vir_opt_small_immediates.c | 26 +++++-
src/broadcom/compiler/vir_to_qpu.c | 11 ++-
src/broadcom/qpu/qpu_disasm.c | 1 -
src/broadcom/qpu/qpu_instr.c | 8 +-
src/broadcom/qpu/qpu_instr.h | 2 +-
src/broadcom/qpu/qpu_pack.c | 36 ++++----
8 files changed, 139 insertions(+), 55 deletions(-)
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index caa84254998..bd1c920848a 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -714,7 +714,6 @@ qpu_instruction_uses_rf(const struct v3d_device_info *devinfo,
!inst->sig.small_imm_b && (inst->raddr_b == waddr))
return true;
} else {
- /* FIXME: skip if small immediate */
if (v3d71_qpu_reads_raddr(inst, waddr))
return true;
}
@@ -948,10 +947,11 @@ qpu_raddrs_used(const struct v3d_qpu_instr *a,
return raddrs_used;
}
-/* Take two instructions and attempt to merge their raddr fields
- * into one merged instruction. Returns false if the two instructions
- * access more than two different rf registers between them, or more
- * than one rf register and one small immediate.
+/* Takes two instructions and attempts to merge their raddr fields (including
+ * small immediates) into one merged instruction. For V3D 4.x, returns false
+ * if the two instructions access more than two different rf registers between
+ * them, or more than one rf register and one small immediate. For 7.x returns
+ * false if both instructions use small immediates.
*/
static bool
qpu_merge_raddrs(struct v3d_qpu_instr *result,
@@ -959,6 +959,27 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result,
const struct v3d_qpu_instr *mul_instr,
const struct v3d_device_info *devinfo)
{
+ if (devinfo->ver >= 71) {
+ assert(add_instr->sig.small_imm_a +
+ add_instr->sig.small_imm_b <= 1);
+ assert(add_instr->sig.small_imm_c +
+ add_instr->sig.small_imm_d == 0);
+ assert(mul_instr->sig.small_imm_a +
+ mul_instr->sig.small_imm_b == 0);
+ assert(mul_instr->sig.small_imm_c +
+ mul_instr->sig.small_imm_d <= 1);
+
+ result->sig.small_imm_a = add_instr->sig.small_imm_a;
+ result->sig.small_imm_b = add_instr->sig.small_imm_b;
+ result->sig.small_imm_c = mul_instr->sig.small_imm_c;
+ result->sig.small_imm_d = mul_instr->sig.small_imm_d;
+
+ return (result->sig.small_imm_a +
+ result->sig.small_imm_b +
+ result->sig.small_imm_c +
+ result->sig.small_imm_d) <= 1;
+ }
+
assert(devinfo->ver <= 42);
uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr);
@@ -1060,7 +1081,8 @@ add_op_as_mul_op(enum v3d_qpu_add_op op)
}
static void
-qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
+qpu_convert_add_to_mul(const struct v3d_device_info *devinfo,
+ struct v3d_qpu_instr *inst)
{
STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add));
assert(inst->alu.add.op != V3D_QPU_A_NOP);
@@ -1084,6 +1106,18 @@ qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
inst->alu.add.output_pack = V3D_QPU_PACK_NONE;
inst->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
inst->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
+
+ if (devinfo->ver >= 71) {
+ assert(!inst->sig.small_imm_c && !inst->sig.small_imm_d);
+ assert(inst->sig.small_imm_a + inst->sig.small_imm_b <= 1);
+ if (inst->sig.small_imm_a) {
+ inst->sig.small_imm_c = true;
+ inst->sig.small_imm_a = false;
+ } else if (inst->sig.small_imm_b) {
+ inst->sig.small_imm_d = true;
+ inst->sig.small_imm_b = false;
+ }
+ }
}
static bool
@@ -1135,6 +1169,16 @@ qpu_convert_mul_to_add(struct v3d_qpu_instr *inst)
inst->alu.mul.output_pack = V3D_QPU_PACK_NONE;
inst->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
inst->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
+
+ assert(!inst->sig.small_imm_a && !inst->sig.small_imm_b);
+ assert(inst->sig.small_imm_c + inst->sig.small_imm_d <= 1);
+ if (inst->sig.small_imm_c) {
+ inst->sig.small_imm_a = true;
+ inst->sig.small_imm_c = false;
+ } else if (inst->sig.small_imm_d) {
+ inst->sig.small_imm_b = true;
+ inst->sig.small_imm_d = false;
+ }
}
static bool
@@ -1173,20 +1217,20 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
else if (a->alu.mul.op == V3D_QPU_M_NOP &&
can_do_add_as_mul(b->alu.add.op)) {
mul_inst = *b;
- qpu_convert_add_to_mul(&mul_inst);
+ qpu_convert_add_to_mul(devinfo, &mul_inst);
merge.alu.mul = mul_inst.alu.mul;
- merge.flags.mc = b->flags.ac;
- merge.flags.mpf = b->flags.apf;
- merge.flags.muf = b->flags.auf;
+ merge.flags.mc = mul_inst.flags.mc;
+ merge.flags.mpf = mul_inst.flags.mpf;
+ merge.flags.muf = mul_inst.flags.muf;
add_instr = a;
mul_instr = &mul_inst;
} else if (a->alu.mul.op == V3D_QPU_M_NOP &&
can_do_add_as_mul(a->alu.add.op)) {
mul_inst = *a;
- qpu_convert_add_to_mul(&mul_inst);
+ qpu_convert_add_to_mul(devinfo, &mul_inst);
merge = mul_inst;
merge.alu.add = b->alu.add;
@@ -1225,9 +1269,9 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
merge.alu.add = add_inst.alu.add;
- merge.flags.ac = b->flags.mc;
- merge.flags.apf = b->flags.mpf;
- merge.flags.auf = b->flags.muf;
+ merge.flags.ac = add_inst.flags.ac;
+ merge.flags.apf = add_inst.flags.apf;
+ merge.flags.auf = add_inst.flags.auf;
mul_instr = a;
add_instr = &add_inst;
@@ -1252,17 +1296,12 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
/* V3D 4.x and earlier use muxes to select the inputs for the ALUs and
* they have restrictions on the number of raddrs that can be adressed
- * in a single instruction.
- *
- * FIXME: for V3D 7.x we can't merge instructions if they address more
- * than one small immediate. For now, we don't support small immediates,
- * so it is not a problem.
+ * in a single instruction. In V3D 7.x, we don't have that restriction,
+ * but we are still limited to a single small immediate per instruction.
*/
- if (devinfo->ver <= 42) {
- if (add_instr && mul_instr &&
- !qpu_merge_raddrs(&merge, add_instr, mul_instr, devinfo)) {
- return false;
- }
+ if (add_instr && mul_instr &&
+ !qpu_merge_raddrs(&merge, add_instr, mul_instr, devinfo)) {
+ return false;
}
merge.sig.thrsw |= b->sig.thrsw;
@@ -1273,7 +1312,6 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
merge.sig.ldtmu |= b->sig.ldtmu;
merge.sig.ldvary |= b->sig.ldvary;
merge.sig.ldvpm |= b->sig.ldvpm;
- merge.sig.small_imm_b |= b->sig.small_imm_b;
merge.sig.ldtlb |= b->sig.ldtlb;
merge.sig.ldtlbu |= b->sig.ldtlbu;
merge.sig.ucb |= b->sig.ucb;
@@ -1933,8 +1971,6 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
if (c->devinfo->ver >= 71) {
/* RF2-3 might be overwritten during the delay slots by
* fragment shader setup.
- *
- * FIXME: handle small immediate cases
*/
if (v3d71_qpu_reads_raddr(inst, 2) ||
v3d71_qpu_reads_raddr(inst, 3)) {
diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c
index fde6695d59b..41070484286 100644
--- a/src/broadcom/compiler/qpu_validate.c
+++ b/src/broadcom/compiler/qpu_validate.c
@@ -116,8 +116,24 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
return;
if (devinfo->ver < 71) {
- if (inst->sig.small_imm_a || inst->sig.small_imm_c || inst->sig.small_imm_d)
- fail_instr(state, "small imm a/c/d added after V3D 7.1");
+ if (inst->sig.small_imm_a || inst->sig.small_imm_c ||
+ inst->sig.small_imm_d) {
+ fail_instr(state, "small imm a/c/d added after V3D 7.1");
+ }
+ } else {
+ if ((inst->sig.small_imm_a || inst->sig.small_imm_b) &&
+ !vir_is_add(qinst)) {
+ fail_instr(state, "small imm a/b used but no ADD inst");
+ }
+ if ((inst->sig.small_imm_c || inst->sig.small_imm_d) &&
+ !vir_is_mul(qinst)) {
+ fail_instr(state, "small imm c/d used but no MUL inst");
+ }
+ if (inst->sig.small_imm_a + inst->sig.small_imm_b +
+ inst->sig.small_imm_c + inst->sig.small_imm_d > 1) {
+ fail_instr(state, "only one small immediate can be "
+ "enabled per instruction");
+ }
}
/* LDVARY writes r5 two instructions later and LDUNIF writes
diff --git a/src/broadcom/compiler/vir_opt_small_immediates.c b/src/broadcom/compiler/vir_opt_small_immediates.c
index df0d6c36c9b..ed5bc011964 100644
--- a/src/broadcom/compiler/vir_opt_small_immediates.c
+++ b/src/broadcom/compiler/vir_opt_small_immediates.c
@@ -44,7 +44,9 @@ vir_opt_small_immediates(struct v3d_compile *c)
/* The small immediate value sits in the raddr B field, so we
* can't have 2 small immediates in one instruction (unless
* they're the same value, but that should be optimized away
- * elsewhere).
+ * elsewhere). Since 7.x we can encode small immediates in
+ * any raddr field, but each instruction can still only use
+ * one.
*/
bool uses_small_imm = false;
for (int i = 0; i < vir_get_nsrc(inst); i++) {
@@ -80,7 +82,22 @@ vir_opt_small_immediates(struct v3d_compile *c)
*/
struct v3d_qpu_sig new_sig = inst->qpu.sig;
uint32_t sig_packed;
- new_sig.small_imm_b = true;
+ if (c->devinfo->ver <= 42) {
+ new_sig.small_imm_b = true;
+ } else {
+ if (vir_is_add(inst)) {
+ if (i == 0)
+ new_sig.small_imm_a = true;
+ else
+ new_sig.small_imm_b = true;
+ } else {
+ if (i == 0)
+ new_sig.small_imm_c = true;
+ else
+ new_sig.small_imm_d = true;
+ }
+ }
+
if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig_packed))
continue;
@@ -89,7 +106,10 @@ vir_opt_small_immediates(struct v3d_compile *c)
vir_dump_inst(c, inst);
fprintf(stderr, "\n");
}
- inst->qpu.sig.small_imm_b = true;
+ inst->qpu.sig.small_imm_a = new_sig.small_imm_a;
+ inst->qpu.sig.small_imm_b = new_sig.small_imm_b;
+ inst->qpu.sig.small_imm_c = new_sig.small_imm_c;
+ inst->qpu.sig.small_imm_d = new_sig.small_imm_d;
inst->qpu.raddr_b = packed;
inst->src[i].file = QFILE_SMALL_IMM;
diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c
index cbbb495592b..4ed184cbbcb 100644
--- a/src/broadcom/compiler/vir_to_qpu.c
+++ b/src/broadcom/compiler/vir_to_qpu.c
@@ -89,8 +89,15 @@ new_qpu_nop_before(struct qinst *inst)
static void
v3d71_set_src(struct v3d_qpu_instr *instr, uint8_t *raddr, struct qpu_reg src)
{
- if (src.smimm)
- unreachable("v3d71_set_src: pending handling small immediates");
+ /* If we have a small immediate move it from inst->raddr_b to the
+ * corresponding raddr.
+ */
+ if (src.smimm) {
+ assert(instr->sig.small_imm_a || instr->sig.small_imm_b ||
+ instr->sig.small_imm_c || instr->sig.small_imm_d);
+ *raddr = instr->raddr_b;
+ return;
+ }
assert(!src.magic);
*raddr = src.index;
diff --git a/src/broadcom/qpu/qpu_disasm.c b/src/broadcom/qpu/qpu_disasm.c
index b613de781dc..c1590a760de 100644
--- a/src/broadcom/qpu/qpu_disasm.c
+++ b/src/broadcom/qpu/qpu_disasm.c
@@ -113,7 +113,6 @@ v3d71_qpu_disasm_raddr(struct disasm_state *disasm,
}
if (is_small_imm) {
- unreachable("Pending handling small immediates");
uint32_t val;
ASSERTED bool ok =
v3d_qpu_small_imm_unpack(disasm->devinfo,
diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
index f54ce7210fb..c30f4bbbccf 100644
--- a/src/broadcom/qpu/qpu_instr.c
+++ b/src/broadcom/qpu/qpu_instr.c
@@ -975,10 +975,10 @@ v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr)
int add_nsrc = v3d_qpu_add_op_num_src(inst->alu.add.op);
int mul_nsrc = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
- return (add_nsrc > 0 && inst->alu.add.a.raddr == raddr) ||
- (add_nsrc > 1 && inst->alu.add.b.raddr == raddr) ||
- (mul_nsrc > 0 && inst->alu.mul.a.raddr == raddr) ||
- (mul_nsrc > 1 && inst->alu.mul.b.raddr == raddr);
+ return (add_nsrc > 0 && !inst->sig.small_imm_a && inst->alu.add.a.raddr == raddr) ||
+ (add_nsrc > 1 && !inst->sig.small_imm_b && inst->alu.add.b.raddr == raddr) ||
+ (mul_nsrc > 0 && !inst->sig.small_imm_c && inst->alu.mul.a.raddr == raddr) ||
+ (mul_nsrc > 1 && !inst->sig.small_imm_d && inst->alu.mul.b.raddr == raddr);
}
bool
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
index dece45c5c54..d408fb426fa 100644
--- a/src/broadcom/qpu/qpu_instr.h
+++ b/src/broadcom/qpu/qpu_instr.h
@@ -402,7 +402,7 @@ struct v3d_qpu_instr {
uint8_t sig_addr;
bool sig_magic; /* If the signal writes to a magic address */
uint8_t raddr_a; /* V3D 4.x */
- uint8_t raddr_b; /* V3D 4.x*/
+ uint8_t raddr_b; /* V3D 4.x (holds packed small immediate in 7.x too) */
struct v3d_qpu_flags flags;
union {
diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
index ed5a8bc667d..7984712d527 100644
--- a/src/broadcom/qpu/qpu_pack.c
+++ b/src/broadcom/qpu/qpu_pack.c
@@ -1218,16 +1218,11 @@ v3d71_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst
instr->alu.add.op = desc->op;
- /* FADD/FADDNF and FMIN/FMAX are determined by the orders of the
+ /* FADD/FADDNF and FMIN/FMAX are determined by the order of the
* operands.
*/
- /* FIXME: for now hardcoded values, until we got the small_imm support
- * in place
- */
- uint32_t small_imm_a = 0;
- uint32_t small_imm_b = 0;
- if (small_imm_a * 256 + ((op >> 2) & 3) * 64 + raddr_a >
- small_imm_b *256 + (op & 3) * 64 + raddr_b) {
+ if (instr->sig.small_imm_a * 256 + ((op >> 2) & 3) * 64 + raddr_a >
+ instr->sig.small_imm_b * 256 + (op & 3) * 64 + raddr_b) {
if (instr->alu.add.op == V3D_QPU_A_FMIN)
instr->alu.add.op = V3D_QPU_A_FMAX;
if (instr->alu.add.op == V3D_QPU_A_FADD)
@@ -1858,11 +1853,6 @@ v3d71_qpu_add_pack(const struct v3d_device_info *devinfo,
uint32_t output_pack;
uint32_t a_unpack;
uint32_t b_unpack;
- /* FIXME: for now hardcoded values, until we got the small_imm
- * support in place
- */
- uint32_t small_imm_a = 0;
- uint32_t small_imm_b = 0;
if (instr->alu.add.op != V3D_QPU_A_FCMP) {
if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack,
@@ -1886,8 +1876,8 @@ v3d71_qpu_add_pack(const struct v3d_device_info *devinfo,
* distinguished by which order their operands come in.
*/
bool ordering =
- small_imm_a * 256 + a_unpack * 64 + raddr_a >
- small_imm_b * 256 + b_unpack * 64 + raddr_b;
+ instr->sig.small_imm_a * 256 + a_unpack * 64 + raddr_a >
+ instr->sig.small_imm_b * 256 + b_unpack * 64 + raddr_b;
if (((instr->alu.add.op == V3D_QPU_A_FMIN ||
instr->alu.add.op == V3D_QPU_A_FADD) && ordering) ||
((instr->alu.add.op == V3D_QPU_A_FMAX ||
@@ -1901,6 +1891,22 @@ v3d71_qpu_add_pack(const struct v3d_device_info *devinfo,
temp = raddr_a;
raddr_a = raddr_b;
raddr_b = temp;
+
+ /* If we are swapping raddr_a/b we also need to swap
+ * small_imm_a/b.
+ */
+ if (instr->sig.small_imm_a || instr->sig.small_imm_b) {
+ assert(instr->sig.small_imm_a !=
+ instr->sig.small_imm_b);
+ struct v3d_qpu_sig new_sig = instr->sig;
+ new_sig.small_imm_a = !instr->sig.small_imm_a;
+ new_sig.small_imm_b = !instr->sig.small_imm_b;
+ uint32_t sig;
+ if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig))
+ return false;
+ *packed_instr &= ~V3D_QPU_SIG_MASK;
+ *packed_instr |= QPU_SET_FIELD(sig, V3D_QPU_SIG);
+ }
}
opcode |= a_unpack << 2;
--
2.39.2

View File

@ -1,61 +0,0 @@
From 3af87d2672da7c928ecf8a0a1cd1bef8a6729364 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Mon, 22 Nov 2021 12:56:03 +0100
Subject: [PATCH 058/142] broadcom/compiler: update thread end restrictions for
v7.x
In 4.x it is not allowed to write to the register file in the last
3 instructions, but in 7.x we only have this restriction in the
thread end instruction itself, and only if the write comes from
the ALU ports.
---
src/broadcom/compiler/qpu_schedule.c | 31 ++++++++++++++++++++--------
1 file changed, 22 insertions(+), 9 deletions(-)
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index bd1c920848a..cba16c77d67 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -1938,17 +1938,30 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
return false;
}
- /* No writing physical registers at the end. */
- bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP;
- bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP;
- if ((!add_is_nop && !inst->alu.add.magic_write) ||
- (!mul_is_nop && !inst->alu.mul.magic_write)) {
- return false;
+ if (c->devinfo->ver <= 42) {
+ /* No writing physical registers at the end. */
+ bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP;
+ bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP;
+ if ((!add_is_nop && !inst->alu.add.magic_write) ||
+ (!mul_is_nop && !inst->alu.mul.magic_write)) {
+ return false;
+ }
+
+ if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) &&
+ !inst->sig_magic) {
+ return false;
+ }
}
- if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) &&
- !inst->sig_magic) {
- return false;
+ if (c->devinfo->ver >= 71) {
+ /* The thread end instruction must not write to the
+ * register file via the add/mul ALUs.
+ */
+ if (slot == 0 &&
+ (!inst->alu.add.magic_write ||
+ !inst->alu.mul.magic_write)) {
+ return false;
+ }
}
if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF)
--
2.39.2

View File

@ -1,112 +0,0 @@
From 7cfd5b808bb2f1cb17f57435cb5d411c4ac3aa6c Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Tue, 23 Nov 2021 10:04:49 +0100
Subject: [PATCH 059/142] broadcom/compiler: update ldvary thread switch delay
slot restriction for v7.x
In V3D 7.x we don't have accumulators which would not survive a thread
switch, so the only restriction is that ldvary can't be placed in the second
delay slot of a thread switch.
shader-db results for UnrealEngine4 shaders:
total instructions in shared programs: 446458 -> 446401 (-0.01%)
instructions in affected programs: 13492 -> 13435 (-0.42%)
helped: 58
HURT: 3
Instructions are helped.
total nops in shared programs: 19571 -> 19541 (-0.15%)
nops in affected programs: 161 -> 131 (-18.63%)
helped: 30
HURT: 0
Nops are helped.
---
src/broadcom/compiler/qpu_schedule.c | 33 +++++++++++++++++++++-------
src/broadcom/compiler/qpu_validate.c | 10 +++++++--
2 files changed, 33 insertions(+), 10 deletions(-)
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index cba16c77d67..32f651851cf 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -1491,11 +1491,20 @@ retry:
* ldvary now if the follow-up fixup would place
* it in the delay slots of a thrsw, which is not
* allowed and would prevent the fixup from being
- * successful.
+ * successful. In V3D 7.x we can allow this to happen
+ * as long as it is not the last delay slot.
*/
- if (inst->sig.ldvary &&
- scoreboard->last_thrsw_tick + 2 >= scoreboard->tick - 1) {
- continue;
+ if (inst->sig.ldvary) {
+ if (c->devinfo->ver <= 42 &&
+ scoreboard->last_thrsw_tick + 2 >=
+ scoreboard->tick - 1) {
+ continue;
+ }
+ if (c->devinfo->ver >= 71 &&
+ scoreboard->last_thrsw_tick + 2 ==
+ scoreboard->tick - 1) {
+ continue;
+ }
}
/* We can emit a new tmu lookup with a previous ldtmu
@@ -2020,8 +2029,12 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
if (slot > 0 && v3d_qpu_instr_is_legacy_sfu(&qinst->qpu))
return false;
- if (slot > 0 && qinst->qpu.sig.ldvary)
- return false;
+ if (qinst->qpu.sig.ldvary) {
+ if (c->devinfo->ver <= 42 && slot > 0)
+ return false;
+ if (c->devinfo->ver >= 71 && slot == 2)
+ return false;
+ }
/* unifa and the following 3 instructions can't overlap a
* thread switch/end. The docs further clarify that this means
@@ -2618,9 +2631,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
/* We can't put an ldvary in the delay slots of a thrsw. We should've
* prevented this when pairing up the ldvary with another instruction
- * and flagging it for a fixup.
+ * and flagging it for a fixup. In V3D 7.x this is limited only to the
+ * second delay slot.
*/
- assert(scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1);
+ assert((devinfo->ver <= 42 &&
+ scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1) ||
+ (devinfo->ver >= 71 &&
+ scoreboard->last_thrsw_tick + 2 != scoreboard->tick - 1));
/* Move the ldvary to the previous instruction and remove it from the
* current one.
diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c
index 41070484286..4f09aa8aef4 100644
--- a/src/broadcom/compiler/qpu_validate.c
+++ b/src/broadcom/compiler/qpu_validate.c
@@ -215,8 +215,14 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
"SFU write started during THRSW delay slots ");
}
- if (inst->sig.ldvary)
- fail_instr(state, "LDVARY during THRSW delay slots");
+ if (inst->sig.ldvary) {
+ if (devinfo->ver <= 42)
+ fail_instr(state, "LDVARY during THRSW delay slots");
+ if (devinfo->ver >= 71 &&
+ state->ip - state->last_thrsw_ip == 2) {
+ fail_instr(state, "LDVARY in 2nd THRSW delay slot");
+ }
+ }
}
(void)qpu_magic_waddr_matches; /* XXX */
--
2.39.2

View File

@ -1,30 +0,0 @@
From ca4063d627cd31c589a8e8688f2876dd8211d1bc Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Thu, 25 Nov 2021 08:31:02 +0100
Subject: [PATCH 060/142] broadcom/compiler: lift restriction for branch +
msfign after setmsf for v7.x
---
src/broadcom/compiler/qpu_schedule.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index 32f651851cf..476eae691ab 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -2373,10 +2373,11 @@ emit_branch(struct v3d_compile *c,
assert(scoreboard->last_branch_tick + 3 < branch_tick);
assert(scoreboard->last_unifa_write_tick + 3 < branch_tick);
- /* Can't place a branch with msfign != 0 and cond != 0,2,3 after
+ /* V3D 4.x can't place a branch with msfign != 0 and cond != 0,2,3 after
* setmsf.
*/
bool is_safe_msf_branch =
+ c->devinfo->ver >= 71 ||
inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE ||
inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS ||
inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 ||
--
2.39.2

View File

@ -1,38 +0,0 @@
From 167510aa43bbcf06e57a64495cee40e8cdaf5f8b Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Fri, 26 Nov 2021 10:37:05 +0100
Subject: [PATCH 061/142] broadcom/compiler: start allocating from RF 4 in V7.x
In V3D 4.x we start at RF3 so that we allocate RF0-2 only if there
aren't any other RFs available. This is useful with small shaders
to ensure that our TLB writes don't use these registers because
these are the last instructions we emit in fragment shaders and
the last instructions in a program can't write to these registers,
so if we do, we need to emit NOPs.
In V3D 7.x the registers affected by this restriction are RF2-3,
so we choose to start at RF4.
---
src/broadcom/compiler/vir_register_allocate.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
index 6f7b1ca0589..440b093a636 100644
--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c
@@ -1234,9 +1234,10 @@ v3d_register_allocate(struct v3d_compile *c)
.phys_index = phys_index,
.next_acc = 0,
/* Start at RF3, to try to keep the TLB writes from using
- * RF0-2.
+ * RF0-2. Start at RF4 in 7.x to prevent TLB writes from
+ * using RF2-3.
*/
- .next_phys = 3,
+ .next_phys = c->devinfo->ver <= 42 ? 3 : 4,
.nodes = &c->nodes,
.devinfo = c->devinfo,
};
--
2.39.2

View File

@ -1,71 +0,0 @@
From d47ea903b96e43b07bdef21f8026da818e30fcd1 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Thu, 25 Nov 2021 13:00:34 +0100
Subject: [PATCH 062/142] broadcom/compiler: validate restrictions after TLB Z
write
---
src/broadcom/compiler/qpu_validate.c | 28 ++++++++++++++++++++++++++++
1 file changed, 28 insertions(+)
diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c
index 4f09aa8aef4..1082fb7d50a 100644
--- a/src/broadcom/compiler/qpu_validate.c
+++ b/src/broadcom/compiler/qpu_validate.c
@@ -41,6 +41,7 @@ struct v3d_qpu_validate_state {
int last_sfu_write;
int last_branch_ip;
int last_thrsw_ip;
+ int first_tlb_z_write;
/* Set when we've found the last-THRSW signal, or if we were started
* in single-segment mode.
@@ -110,11 +111,37 @@ static void
qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
{
const struct v3d_device_info *devinfo = state->c->devinfo;
+
+ if (qinst->is_tlb_z_write && state->ip < state->first_tlb_z_write)
+ state->first_tlb_z_write = state->ip;
+
const struct v3d_qpu_instr *inst = &qinst->qpu;
+ if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
+ state->first_tlb_z_write >= 0 &&
+ state->ip > state->first_tlb_z_write &&
+ inst->branch.msfign != V3D_QPU_MSFIGN_NONE &&
+ inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS &&
+ inst->branch.cond != V3D_QPU_BRANCH_COND_A0 &&
+ inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) {
+ fail_instr(state, "Implicit branch MSF read after TLB Z write");
+ }
+
if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
return;
+ if (inst->alu.add.op == V3D_QPU_A_SETMSF &&
+ state->first_tlb_z_write >= 0 &&
+ state->ip > state->first_tlb_z_write) {
+ fail_instr(state, "SETMSF after TLB Z write");
+ }
+
+ if (state->first_tlb_z_write >= 0 &&
+ state->ip > state->first_tlb_z_write &&
+ inst->alu.add.op == V3D_QPU_A_MSF) {
+ fail_instr(state, "MSF read after TLB Z write");
+ }
+
if (devinfo->ver < 71) {
if (inst->sig.small_imm_a || inst->sig.small_imm_c ||
inst->sig.small_imm_d) {
@@ -348,6 +375,7 @@ qpu_validate(struct v3d_compile *c)
.last_sfu_write = -10,
.last_thrsw_ip = -10,
.last_branch_ip = -10,
+ .first_tlb_z_write = INT_MAX,
.ip = 0,
.last_thrsw_found = !c->last_thrsw,
--
2.39.2

View File

@ -1,26 +0,0 @@
From 6cdf01fad49489b5fc66d231b527de5245d5de32 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Mon, 29 Nov 2021 13:23:11 +0100
Subject: [PATCH 063/142] broadcom/compiler: lift restriction on vpmwt in last
instruction for V3D 7.x
---
src/broadcom/compiler/qpu_schedule.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index 476eae691ab..77fb6a794e6 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -1934,7 +1934,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
if (slot > 0 && qinst->uniform != ~0)
return false;
- if (v3d_qpu_waits_vpm(inst))
+ if (c->devinfo->ver <= 42 && v3d_qpu_waits_vpm(inst))
return false;
if (inst->sig.ldvary)
--
2.39.2

View File

@ -1,134 +0,0 @@
From acc54637f0787ba4dc887130c25c628ccdaf4e38 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Tue, 9 Nov 2021 11:34:59 +0100
Subject: [PATCH 064/142] broadcom/compiler: fix up copy propagation for v71
Update rules for unsafe copy propagations to match v7.x.
---
.../compiler/vir_opt_copy_propagate.c | 83 +++++++++++++------
1 file changed, 56 insertions(+), 27 deletions(-)
diff --git a/src/broadcom/compiler/vir_opt_copy_propagate.c b/src/broadcom/compiler/vir_opt_copy_propagate.c
index c4aa7255a17..1260838ca05 100644
--- a/src/broadcom/compiler/vir_opt_copy_propagate.c
+++ b/src/broadcom/compiler/vir_opt_copy_propagate.c
@@ -35,7 +35,7 @@
#include "v3d_compiler.h"
static bool
-is_copy_mov(struct qinst *inst)
+is_copy_mov(const struct v3d_device_info *devinfo, struct qinst *inst)
{
if (!inst)
return false;
@@ -62,36 +62,65 @@ is_copy_mov(struct qinst *inst)
return false;
}
- switch (inst->src[0].file) {
- case QFILE_MAGIC:
- /* No copy propagating from R3/R4/R5 -- the MOVs from those
- * are there to register allocate values produced into R3/4/5
- * to other regs (though hopefully r3/4/5).
- */
- switch (inst->src[0].index) {
- case V3D_QPU_WADDR_R3:
- case V3D_QPU_WADDR_R4:
- case V3D_QPU_WADDR_R5:
- return false;
+ if (devinfo->ver <= 42) {
+ switch (inst->src[0].file) {
+ case QFILE_MAGIC:
+ /* No copy propagating from R3/R4/R5 -- the MOVs from
+ * those are there to register allocate values produced
+ * into R3/4/5 to other regs (though hopefully r3/4/5).
+ */
+ switch (inst->src[0].index) {
+ case V3D_QPU_WADDR_R3:
+ case V3D_QPU_WADDR_R4:
+ case V3D_QPU_WADDR_R5:
+ return false;
+ default:
+ break;
+ }
+ break;
+
+ case QFILE_REG:
+ switch (inst->src[0].index) {
+ case 0:
+ case 1:
+ case 2:
+ /* MOVs from rf0/1/2 are only to track the live
+ * intervals for W/centroid W/Z.
+ */
+ return false;
+ }
+ break;
+
default:
break;
}
- break;
-
- case QFILE_REG:
- switch (inst->src[0].index) {
- case 0:
- case 1:
- case 2:
- /* MOVs from rf0/1/2 are only to track the live
+ } else {
+ assert(devinfo->ver >= 71);
+ switch (inst->src[0].file) {
+ case QFILE_REG:
+ switch (inst->src[0].index) {
+ /* MOVs from rf1/2/3 are only to track the live
* intervals for W/centroid W/Z.
+ *
+ * Note: rf0 can be implicitly written by ldvary
+ * (no temp involved), so it is not an SSA value and
+ * could clash with writes to other temps that are
+ * also allocated to rf0. In theory, that would mean
+ * that we can't copy propagate from it, but we handle
+ * this at register allocation time, preventing temps
+ * from being allocated to rf0 while the rf0 value from
+ * ldvary is still live.
*/
- return false;
- }
- break;
+ case 1:
+ case 2:
+ case 3:
+ return false;
+ }
+ break;
- default:
- break;
+ default:
+ break;
+ }
}
return true;
@@ -135,7 +164,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs)
*/
struct qinst *mov = movs[inst->src[i].index];
if (!mov) {
- if (!is_copy_mov(c->defs[inst->src[i].index]))
+ if (!is_copy_mov(c->devinfo, c->defs[inst->src[i].index]))
continue;
mov = c->defs[inst->src[i].index];
@@ -245,7 +274,7 @@ vir_opt_copy_propagate(struct v3d_compile *c)
apply_kills(c, movs, inst);
- if (is_copy_mov(inst))
+ if (is_copy_mov(c->devinfo, inst))
movs[inst->dst.index] = inst;
}
}
--
2.39.2

View File

@ -1,150 +0,0 @@
From c340f7f1eb4a1e5c0fafe1ea2f801f2ebaf82d8d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Fri, 26 Nov 2021 01:24:12 +0100
Subject: [PATCH 065/142] broadcom/qpu: new packing/conversion v71 instructions
This commit adds the qpu definitions for several new v71
instructions.
Packing:
* vpack does a 2x32 to 2x16 bit integer pack
* v8pack: Pack 2 x 2x16 bit integers into 4x8 bits
* v10pack packs parts of 2 2x16 bit integer into r10g10b10a2.
* v11fpack packs parts of 2 2x16 bit float into r11g11b10 rounding
to nearest
Conversion to unorm/snorm:
* vftounorm8/vftosnorm8: converts from 2x16-bit floating point
to 2x8 bit unorm/snorm.
* ftounorm16/ftosnorm16: converts floating point to 16-bit
unorm/snorm
* vftounorm10lo: Convert 2x16-bit floating point to 2x10-bit unorm
* vftounorm10hi: Convert 2x16-bit floating point to one 2-bit and one 10-bit unorm
---
src/broadcom/qpu/qpu_instr.c | 20 ++++++++++++++++++++
src/broadcom/qpu/qpu_instr.h | 12 ++++++++++++
src/broadcom/qpu/qpu_pack.c | 12 ++++++++++++
3 files changed, 44 insertions(+)
diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
index c30f4bbbccf..44f20618a5a 100644
--- a/src/broadcom/qpu/qpu_instr.c
+++ b/src/broadcom/qpu/qpu_instr.c
@@ -179,6 +179,10 @@ v3d_qpu_add_op_name(enum v3d_qpu_add_op op)
[V3D_QPU_A_UTOF] = "utof",
[V3D_QPU_A_MOV] = "mov",
[V3D_QPU_A_FMOV] = "fmov",
+ [V3D_QPU_A_VPACK] = "vpack",
+ [V3D_QPU_A_V8PACK] = "v8pack",
+ [V3D_QPU_A_V10PACK] = "v10pack",
+ [V3D_QPU_A_V11FPACK] = "v11fpack",
};
if (op >= ARRAY_SIZE(op_names))
@@ -201,6 +205,12 @@ v3d_qpu_mul_op_name(enum v3d_qpu_mul_op op)
[V3D_QPU_M_MOV] = "mov",
[V3D_QPU_M_NOP] = "nop",
[V3D_QPU_M_FMUL] = "fmul",
+ [V3D_QPU_M_FTOUNORM16] = "ftounorm16",
+ [V3D_QPU_M_FTOSNORM16] = "ftosnorm16",
+ [V3D_QPU_M_VFTOUNORM8] = "vftounorm8",
+ [V3D_QPU_M_VFTOSNORM8] = "vftosnorm8",
+ [V3D_QPU_M_VFTOUNORM10LO] = "vftounorm10lo",
+ [V3D_QPU_M_VFTOUNORM10HI] = "vftounorm10hi",
};
if (op >= ARRAY_SIZE(op_names))
@@ -463,6 +473,10 @@ static const uint8_t add_op_args[] = {
[V3D_QPU_A_MOV] = D | A,
[V3D_QPU_A_FMOV] = D | A,
+ [V3D_QPU_A_VPACK] = D | A | B,
+ [V3D_QPU_A_V8PACK] = D | A | B,
+ [V3D_QPU_A_V10PACK] = D | A | B,
+ [V3D_QPU_A_V11FPACK] = D | A | B,
};
static const uint8_t mul_op_args[] = {
@@ -476,6 +490,12 @@ static const uint8_t mul_op_args[] = {
[V3D_QPU_M_NOP] = 0,
[V3D_QPU_M_MOV] = D | A,
[V3D_QPU_M_FMUL] = D | A | B,
+ [V3D_QPU_M_FTOUNORM16] = D | A,
+ [V3D_QPU_M_FTOSNORM16] = D | A,
+ [V3D_QPU_M_VFTOUNORM8] = D | A,
+ [V3D_QPU_M_VFTOSNORM8] = D | A,
+ [V3D_QPU_M_VFTOUNORM10LO] = D | A,
+ [V3D_QPU_M_VFTOUNORM10HI] = D | A,
};
bool
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
index d408fb426fa..56eee9f9cac 100644
--- a/src/broadcom/qpu/qpu_instr.h
+++ b/src/broadcom/qpu/qpu_instr.h
@@ -231,6 +231,10 @@ enum v3d_qpu_add_op {
/* V3D 7.x */
V3D_QPU_A_FMOV,
V3D_QPU_A_MOV,
+ V3D_QPU_A_VPACK,
+ V3D_QPU_A_V8PACK,
+ V3D_QPU_A_V10PACK,
+ V3D_QPU_A_V11FPACK,
};
enum v3d_qpu_mul_op {
@@ -244,6 +248,14 @@ enum v3d_qpu_mul_op {
V3D_QPU_M_MOV,
V3D_QPU_M_NOP,
V3D_QPU_M_FMUL,
+
+ /* V3D 7.x */
+ V3D_QPU_M_FTOUNORM16,
+ V3D_QPU_M_FTOSNORM16,
+ V3D_QPU_M_VFTOUNORM8,
+ V3D_QPU_M_VFTOSNORM8,
+ V3D_QPU_M_VFTOUNORM10LO,
+ V3D_QPU_M_VFTOUNORM10HI,
};
enum v3d_qpu_output_pack {
diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
index 7984712d527..6cd75adac6d 100644
--- a/src/broadcom/qpu/qpu_pack.c
+++ b/src/broadcom/qpu/qpu_pack.c
@@ -783,6 +783,9 @@ static const struct opcode_desc add_ops_v71[] = {
{ 246, 246, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_ITOF, 71 },
{ 246, 246, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_UTOF, 71 },
+ { 247, 247, .raddr_mask = ANYOPMASK, V3D_QPU_A_VPACK, 71 },
+ { 248, 248, .raddr_mask = ANYOPMASK, V3D_QPU_A_V8PACK, 71 },
+
{ 249, 249, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FMOV, 71 },
{ 249, 249, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FMOV, 71 },
{ 249, 249, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FMOV, 71 },
@@ -797,6 +800,8 @@ static const struct opcode_desc add_ops_v71[] = {
{ 249, 249, .raddr_mask = OP_MASK(15), V3D_QPU_A_MOV, 71 },
{ 249, 249, .raddr_mask = OP_MASK(19), V3D_QPU_A_MOV, 71 },
+ { 250, 250, .raddr_mask = ANYOPMASK, V3D_QPU_A_V10PACK, 71 },
+ { 251, 251, .raddr_mask = ANYOPMASK, V3D_QPU_A_V11FPACK, 71 },
};
static const struct opcode_desc mul_ops_v71[] = {
@@ -822,6 +827,13 @@ static const struct opcode_desc mul_ops_v71[] = {
{ 14, 14, .raddr_mask = OP_MASK(15), V3D_QPU_M_MOV, 71 },
{ 14, 14, .raddr_mask = OP_MASK(19), V3D_QPU_M_MOV, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(32), V3D_QPU_M_FTOUNORM16, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(33), V3D_QPU_M_FTOSNORM16, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(34), V3D_QPU_M_VFTOUNORM8, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(35), V3D_QPU_M_VFTOSNORM8, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(48), V3D_QPU_M_VFTOUNORM10LO, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(49), V3D_QPU_M_VFTOUNORM10HI, 71 },
+
{ 14, 14, .raddr_mask = OP_MASK(63), V3D_QPU_M_NOP, 71 },
{ 16, 63, .raddr_mask = ANYOPMASK, V3D_QPU_M_FMUL },
--
2.39.2

View File

@ -1,68 +0,0 @@
From f6082e941a3454c8735df2ff2713ae49b3daa74f Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Tue, 18 Apr 2023 08:50:13 +0200
Subject: [PATCH 068/142] broadcom/compiler: don't allocate spill base to rf0
in V3D 7.x
Otherwise it can be stomped by instructions doing implicit rf0 writes.
---
src/broadcom/compiler/vir_register_allocate.c | 21 +++++++++++++++----
1 file changed, 17 insertions(+), 4 deletions(-)
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
index 440b093a636..121c9b2794f 100644
--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c
@@ -582,7 +582,8 @@ interferes(int32_t t0_start, int32_t t0_end, int32_t t1_start, int32_t t1_end)
}
static void
-v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
+v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int *implicit_rf_nodes,
+ int spill_temp)
{
c->spill_start_num_temps = c->num_temps;
c->spilling = true;
@@ -594,8 +595,20 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
spill_offset = c->spill_size;
c->spill_size += V3D_CHANNELS * sizeof(uint32_t);
- if (spill_offset == 0)
+ if (spill_offset == 0) {
v3d_setup_spill_base(c);
+
+ /* Don't allocate our spill base to rf0 to avoid
+ * conflicts with instructions doing implicit writes
+ * to that register.
+ */
+ if (!c->devinfo->has_accumulators) {
+ ra_add_node_interference(
+ c->g,
+ temp_to_node(c, c->spill_base.index),
+ implicit_rf_nodes[0]);
+ }
+ }
}
struct qinst *last_thrsw = c->last_thrsw;
@@ -1346,7 +1359,7 @@ v3d_register_allocate(struct v3d_compile *c)
int node = v3d_choose_spill_node(c);
uint32_t temp = node_to_temp(c, node);
if (node != -1) {
- v3d_spill_reg(c, acc_nodes, temp);
+ v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp);
continue;
}
}
@@ -1363,7 +1376,7 @@ v3d_register_allocate(struct v3d_compile *c)
enum temp_spill_type spill_type =
get_spill_type_for_temp(c, temp);
if (spill_type != SPILL_TYPE_TMU || tmu_spilling_allowed(c)) {
- v3d_spill_reg(c, acc_nodes, temp);
+ v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp);
if (c->spills + c->fills > c->max_tmu_spills)
goto spill_fail;
} else {
--
2.39.2

View File

@ -1,186 +0,0 @@
From 0e9577fbb18a026390f653ca22f5a98a69a5fe59 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Tue, 2 May 2023 10:12:37 +0200
Subject: [PATCH 069/142] broadcom/compiler: improve allocation for final
program instructions
The last 3 instructions can't use specific registers so flag all the
nodes for temps used in the last program instructions and try to
avoid assigning any of these. This may help us avoid injecting nops
for the last thread switch instruction.
Because register allocation needs to happen before QPU scheduling
and instruction merging we can't tell exactly what the last 3
instructions will be, so we do this for a few more instructions than
just 3.
We only do this for fragment shaders because other shader stages
always end with VPM store instructions that take a small immediate
and therefore will never allow us to merge the final thread switch
earlier, so limiting allocation for these shaders will never improve
anything and might instead be detrimental.
total instructions in shared programs: 11471389 -> 11464335 (-0.06%)
instructions in affected programs: 582908 -> 575854 (-1.21%)
helped: 4669
HURT: 578
Instructions are helped.
total max-temps in shared programs: 2230497 -> 2230150 (-0.02%)
max-temps in affected programs: 5662 -> 5315 (-6.13%)
helped: 344
HURT: 44
Max-temps are helped.
total sfu-stalls in shared programs: 18068 -> 18077 (0.05%)
sfu-stalls in affected programs: 264 -> 273 (3.41%)
helped: 37
HURT: 48
Inconclusive result (value mean confidence interval includes 0).
total inst-and-stalls in shared programs: 11489457 -> 11482412 (-0.06%)
inst-and-stalls in affected programs: 585180 -> 578135 (-1.20%)
helped: 4659
HURT: 588
Inst-and-stalls are helped.
total nops in shared programs: 301738 -> 298140 (-1.19%)
nops in affected programs: 14680 -> 11082 (-24.51%)
helped: 3252
HURT: 108
Nops are helped.
---
src/broadcom/compiler/v3d_compiler.h | 1 +
src/broadcom/compiler/vir_register_allocate.c | 69 +++++++++++++++++--
2 files changed, 66 insertions(+), 4 deletions(-)
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index 425ab0cdf9d..2642d23b629 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -613,6 +613,7 @@ struct v3d_ra_node_info {
struct {
uint32_t priority;
uint8_t class_bits;
+ bool is_program_end;
/* V3D 7.x */
bool is_ldunif_dst;
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
index 121c9b2794f..495644bb557 100644
--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c
@@ -385,6 +385,7 @@ add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits)
c->nodes.info[node].class_bits = class_bits;
c->nodes.info[node].priority = 0;
c->nodes.info[node].is_ldunif_dst = false;
+ c->nodes.info[node].is_program_end = false;
}
/* The spill offset for this thread takes a bit of setup, so do it once at
@@ -929,6 +930,17 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
return true;
}
+ /* The last 3 instructions in a shader can't use some specific registers
+ * (usually early rf registers, depends on v3d version) so try to
+ * avoid allocating these to registers used by the last instructions
+ * in the shader.
+ */
+ const uint32_t safe_rf_start = v3d_ra->devinfo->ver <= 42 ? 3 : 4;
+ if (v3d_ra->nodes->info[node].is_program_end &&
+ v3d_ra->next_phys < safe_rf_start) {
+ v3d_ra->next_phys = safe_rf_start;
+ }
+
for (int i = 0; i < PHYS_COUNT; i++) {
int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
int phys = v3d_ra->phys_index + phys_off;
@@ -1218,6 +1230,44 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c,
}
}
+/* Flags as program-end the nodes of the temps read or written by up to the
+ * last 6 ALU instructions of the exit block; non-ALU ones are skipped. */
+static void
+flag_program_end_nodes(struct v3d_compile *c)
+{
+        /* Only look for registers used in this many instructions */
+        uint32_t last_set_count = 6;
+        struct qblock *last_block = vir_exit_block(c);
+        list_for_each_entry_rev(struct qinst, inst, &last_block->instructions, link) {
+                if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
+                        continue;
+
+                int num_src = v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
+                for (int i = 0; i < num_src; i++) {
+                        if (inst->src[i].file == QFILE_TEMP) {
+                                int node = temp_to_node(c, inst->src[i].index);
+                                c->nodes.info[node].is_program_end = true;
+                        }
+                }
+
+                num_src = v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
+                for (int i = 0; i < num_src; i++) {
+                        if (inst->src[i].file == QFILE_TEMP) {
+                                int node = temp_to_node(c, inst->src[i].index);
+                                c->nodes.info[node].is_program_end = true;
+                        }
+                }
+
+                if (inst->dst.file == QFILE_TEMP) {
+                        int node = temp_to_node(c, inst->dst.index);
+                        c->nodes.info[node].is_program_end = true;
+                }
+
+                if (--last_set_count == 0)
+                        break;
+        }
+}
+
/**
* Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
*
@@ -1280,17 +1330,16 @@ v3d_register_allocate(struct v3d_compile *c)
*/
for (uint32_t i = 0; i < num_ra_nodes; i++) {
c->nodes.info[i].is_ldunif_dst = false;
+ c->nodes.info[i].is_program_end = false;
+ c->nodes.info[i].priority = 0;
+ c->nodes.info[i].class_bits = 0;
if (c->devinfo->has_accumulators && i < ACC_COUNT) {
acc_nodes[i] = i;
ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i);
- c->nodes.info[i].priority = 0;
- c->nodes.info[i].class_bits = 0;
} else if (!c->devinfo->has_accumulators &&
i < ARRAY_SIZE(implicit_rf_nodes)) {
implicit_rf_nodes[i] = i;
ra_set_node_reg(c->g, implicit_rf_nodes[i], phys_index + i);
- c->nodes.info[i].priority = 0;
- c->nodes.info[i].class_bits = 0;
} else {
uint32_t t = node_to_temp(c, i);
c->nodes.info[i].priority =
@@ -1327,6 +1376,18 @@ v3d_register_allocate(struct v3d_compile *c)
last_ldvary_ip, inst);
}
+ /* Flag the nodes that are used in the last instructions of the program
+ * (there are some registers that cannot be used in the last 3
+ * instructions). We only do this for fragment shaders, because the idea
+ * is that by avoiding this conflict we may be able to emit the last
+ * thread switch earlier in some cases, however, in non-fragment shaders
+ * this won't happen because the last instructions are always VPM stores
+ * with a small immediate, which conflicts with other signals,
+ * preventing us from ever moving the thrsw earlier.
+ */
+ if (c->s->info.stage == MESA_SHADER_FRAGMENT)
+ flag_program_end_nodes(c);
+
/* Set the register classes for all our temporaries in the graph */
for (uint32_t i = 0; i < c->num_temps; i++) {
ra_set_node_class(c->g, temp_to_node(c, i),
--
2.39.2

View File

@ -1,105 +0,0 @@
From 645fe451bcecbe3345a144222306d06fb39f6b9f Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Tue, 2 May 2023 10:17:47 +0200
Subject: [PATCH 070/142] broadcom/compiler: don't assign registers to unused
nodes/temps
In programs with a lot of unused temps, if we don't do this, we may
end up recycling previously used rfs more often, which can be
detrimental to instruction pairing.
total instructions in shared programs: 11464335 -> 11444136 (-0.18%)
instructions in affected programs: 8976743 -> 8956544 (-0.23%)
helped: 33196
HURT: 33778
Inconclusive result
total max-temps in shared programs: 2230150 -> 2229445 (-0.03%)
max-temps in affected programs: 86413 -> 85708 (-0.82%)
helped: 2217
HURT: 1523
Max-temps are helped.
total sfu-stalls in shared programs: 18077 -> 17104 (-5.38%)
sfu-stalls in affected programs: 8669 -> 7696 (-11.22%)
helped: 2657
HURT: 2182
Sfu-stalls are helped.
total inst-and-stalls in shared programs: 11482412 -> 11461240 (-0.18%)
inst-and-stalls in affected programs: 8995697 -> 8974525 (-0.24%)
helped: 33319
HURT: 33708
Inconclusive result
total nops in shared programs: 298140 -> 296185 (-0.66%)
nops in affected programs: 52805 -> 50850 (-3.70%)
helped: 3797
HURT: 2662
Inconclusive result
---
src/broadcom/compiler/v3d_compiler.h | 1 +
src/broadcom/compiler/vir_register_allocate.c | 14 ++++++++++++++
2 files changed, 15 insertions(+)
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index 2642d23b629..f1a807e38fd 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -614,6 +614,7 @@ struct v3d_ra_node_info {
uint32_t priority;
uint8_t class_bits;
bool is_program_end;
+ bool unused;
/* V3D 7.x */
bool is_ldunif_dst;
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
index 495644bb557..0ab0474424f 100644
--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c
@@ -386,6 +386,7 @@ add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits)
c->nodes.info[node].priority = 0;
c->nodes.info[node].is_ldunif_dst = false;
c->nodes.info[node].is_program_end = false;
+ c->nodes.info[node].unused = false;
}
/* The spill offset for this thread takes a bit of setup, so do it once at
@@ -918,6 +919,12 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
BITSET_WORD *regs,
unsigned int *out)
{
+ /* If this node is for an unused temp, ignore. */
+ if (v3d_ra->nodes->info[node].unused) {
+ *out = 0;
+ return true;
+ }
+
/* In V3D 7.x, try to assign rf0 to temps used as ldunif's dst
* so we can avoid turning them into ldunifrf (which uses the
* cond field to encode the dst and would prevent merge with
@@ -1331,6 +1338,7 @@ v3d_register_allocate(struct v3d_compile *c)
for (uint32_t i = 0; i < num_ra_nodes; i++) {
c->nodes.info[i].is_ldunif_dst = false;
c->nodes.info[i].is_program_end = false;
+ c->nodes.info[i].unused = false;
c->nodes.info[i].priority = 0;
c->nodes.info[i].class_bits = 0;
if (c->devinfo->has_accumulators && i < ACC_COUNT) {
@@ -1396,6 +1404,12 @@ v3d_register_allocate(struct v3d_compile *c)
/* Add register interferences based on liveness data */
for (uint32_t i = 0; i < c->num_temps; i++) {
+ /* And while we are here, let's also flag nodes for
+ * unused temps.
+ */
+ if (c->temp_start[i] > c->temp_end[i])
+ c->nodes.info[temp_to_node(c, i)].unused = true;
+
for (uint32_t j = i + 1; j < c->num_temps; j++) {
if (interferes(c->temp_start[i], c->temp_end[i],
c->temp_start[j], c->temp_end[j])) {
--
2.39.2

View File

@ -1,83 +0,0 @@
From 851704169d59e28c5429b06d05e5ef952be893a2 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Mon, 15 May 2023 10:02:10 +0200
Subject: [PATCH 071/142] broadcom/compiler: only assign rf0 as last resort in
V3D 7.x
So we can use it for ldunif(a) and avoid generating ldunif(a)rf which
can't be paired with conditional instructions.
shader-db (pi5):
total instructions in shared programs: 11357802 -> 11338883 (-0.17%)
instructions in affected programs: 7117889 -> 7098970 (-0.27%)
helped: 24264
HURT: 17574
Instructions are helped.
total uniforms in shared programs: 3857808 -> 3857815 (<.01%)
uniforms in affected programs: 92 -> 99 (7.61%)
helped: 0
HURT: 1
total max-temps in shared programs: 2230904 -> 2230199 (-0.03%)
max-temps in affected programs: 52309 -> 51604 (-1.35%)
helped: 1219
HURT: 725
Max-temps are helped.
total sfu-stalls in shared programs: 15021 -> 15236 (1.43%)
sfu-stalls in affected programs: 6848 -> 7063 (3.14%)
helped: 1866
HURT: 1704
Inconclusive result
total inst-and-stalls in shared programs: 11372823 -> 11354119 (-0.16%)
inst-and-stalls in affected programs: 7149177 -> 7130473 (-0.26%)
helped: 24315
HURT: 17561
Inst-and-stalls are helped.
total nops in shared programs: 273624 -> 273711 (0.03%)
nops in affected programs: 31562 -> 31649 (0.28%)
helped: 1619
HURT: 1854
Inconclusive result (value mean confidence interval includes 0).
---
src/broadcom/compiler/vir_register_allocate.c | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
index 0ab0474424f..8eac2b75bd7 100644
--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c
@@ -950,6 +950,11 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
for (int i = 0; i < PHYS_COUNT; i++) {
int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
+
+ /* Try to keep rf0 available for ldunif in 7.x (see above). */
+ if (v3d_ra->devinfo->ver >= 71 && phys_off == 0)
+ continue;
+
int phys = v3d_ra->phys_index + phys_off;
if (BITSET_TEST(regs, phys)) {
@@ -959,6 +964,14 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
}
}
+ /* If we couldn't allocate, do try to assign rf0 if it is available. */
+ if (v3d_ra->devinfo->ver >= 71 &&
+ BITSET_TEST(regs, v3d_ra->phys_index)) {
+ v3d_ra->next_phys = 1;
+ *out = v3d_ra->phys_index;
+ return true;
+ }
+
return false;
}
--
2.39.2

View File

@ -1,30 +0,0 @@
From 0d3fd30d67ffc0195b0783e30ab6afbbe403310a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Wed, 28 Apr 2021 14:31:38 +0200
Subject: [PATCH 072/142] v3dv: recover non-conformant warning for not fully
supported hw
---
src/broadcom/vulkan/v3dv_device.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
index d5de3517670..d29ffad3531 100644
--- a/src/broadcom/vulkan/v3dv_device.c
+++ b/src/broadcom/vulkan/v3dv_device.c
@@ -1212,6 +1212,12 @@ create_physical_device(struct v3dv_instance *instance,
list_addtail(&device->vk.link, &instance->vk.physical_devices.list);
+ if (device->devinfo.ver != 42) {
+ fprintf(stderr, "WARNING: v3dv support for hw version %i is neither "
+ "a complete nor a conformant Vulkan implementation. Testing "
+ "use only.\n", device->devinfo.ver);
+ }
+
return VK_SUCCESS;
fail:
--
2.39.2

View File

@ -1,504 +0,0 @@
From 52b5ac62b367ae89574c8031fdcf7c1dae05c942 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Tue, 29 Jun 2021 11:59:53 +0200
Subject: [PATCH 073/142] v3dv/meson: add v71 hw generation
Starting point for v71 version inclusion.
This just adds it as one of the versions to be compiled (in meson),
updates the v3dX/v3dv_X macros, and updates the code enough to get it
compiling when building for both versions. For any packet not
available on v71 we just provide a generic placeholder that flags the
generation as not supported.
Any real v71 support will be implemented in following commits.
---
src/broadcom/vulkan/meson.build | 6 +-
src/broadcom/vulkan/v3dv_private.h | 7 +++
src/broadcom/vulkan/v3dvx_cmd_buffer.c | 75 +++++++++++++++++++++++--
src/broadcom/vulkan/v3dvx_image.c | 16 +++++-
src/broadcom/vulkan/v3dvx_meta_common.c | 32 +++++++++++
src/broadcom/vulkan/v3dvx_pipeline.c | 5 ++
src/broadcom/vulkan/v3dvx_queue.c | 11 ++++
7 files changed, 142 insertions(+), 10 deletions(-)
diff --git a/src/broadcom/vulkan/meson.build b/src/broadcom/vulkan/meson.build
index ad032d832ad..3da7364686f 100644
--- a/src/broadcom/vulkan/meson.build
+++ b/src/broadcom/vulkan/meson.build
@@ -27,6 +27,7 @@ v3dv_entrypoints = custom_target(
'--out-h', '@OUTPUT0@', '--out-c', '@OUTPUT1@', '--prefix', 'v3dv',
'--beta', with_vulkan_beta.to_string(),
'--device-prefix', 'ver42',
+ '--device-prefix', 'ver71',
],
depend_files : vk_entrypoints_gen_depend_files,
)
@@ -67,10 +68,7 @@ files_per_version = files(
'v3dvx_queue.c',
)
-# The vulkan driver only supports version >= 42, which is the version present in
-# Rpi4. We need to explicitly set it as we are reusing pieces from the GL v3d
-# driver.
-v3d_versions = ['42']
+v3d_versions = ['42', '71']
v3dv_flags = []
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
index c6707211529..6bdf338c67b 100644
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -2608,6 +2608,9 @@ u64_compare(const void *key1, const void *key2)
case 42: \
v3d_X_thing = &v3d42_##thing; \
break; \
+ case 71: \
+ v3d_X_thing = &v3d71_##thing; \
+ break; \
default: \
unreachable("Unsupported hardware generation"); \
} \
@@ -2626,6 +2629,10 @@ u64_compare(const void *key1, const void *key2)
# define v3dX(x) v3d42_##x
# include "v3dvx_private.h"
# undef v3dX
+
+# define v3dX(x) v3d71_##x
+# include "v3dvx_private.h"
+# undef v3dX
#endif
#ifdef ANDROID
diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
index f182b790d36..b958e634c82 100644
--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
@@ -56,10 +56,15 @@ v3dX(job_emit_enable_double_buffer)(struct v3dv_job *job)
};
config.width_in_pixels = tiling->width;
config.height_in_pixels = tiling->height;
+#if V3D_VERSION == 42
config.number_of_render_targets = MAX2(tiling->render_target_count, 1);
config.multisample_mode_4x = tiling->msaa;
config.double_buffer_in_non_ms_mode = tiling->double_buffer;
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
+#endif
+#if V3D_VERSION >= 71
+ unreachable("HW generation 71 not supported yet.");
+#endif
uint8_t *rewrite_addr = (uint8_t *)job->bcl_tile_binning_mode_ptr;
cl_packet_pack(TILE_BINNING_MODE_CFG)(NULL, rewrite_addr, &config);
@@ -82,10 +87,15 @@ v3dX(job_emit_binning_prolog)(struct v3dv_job *job,
cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
config.width_in_pixels = tiling->width;
config.height_in_pixels = tiling->height;
+#if V3D_VERSION == 42
config.number_of_render_targets = MAX2(tiling->render_target_count, 1);
config.multisample_mode_4x = tiling->msaa;
config.double_buffer_in_non_ms_mode = tiling->double_buffer;
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
+#endif
+#if V3D_VERSION >= 71
+ unreachable("HW generation 71 not supported yet.");
+#endif
}
/* There's definitely nothing in the VCD cache we want. */
@@ -649,10 +659,15 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
* bit and instead we have to emit a single clear of all tile buffers.
*/
if (use_global_zs_clear || use_global_rt_clear) {
+#if V3D_VERSION == 42
cl_emit(cl, CLEAR_TILE_BUFFERS, clear) {
clear.clear_z_stencil_buffer = use_global_zs_clear;
clear.clear_all_render_targets = use_global_rt_clear;
}
+#endif
+#if V3D_VERSION >= 71
+ unreachable("Hardware generation 71 not supported yet.");
+#endif
}
}
@@ -824,7 +839,12 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
config.number_of_render_targets = MAX2(subpass->color_count, 1);
config.multisample_mode_4x = tiling->msaa;
config.double_buffer_in_non_ms_mode = tiling->double_buffer;
+#if V3D_VERSION == 42
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
+#endif
+#if V3D_VERSION >= 71
+ unreachable("HW generation 71 not supported yet.");
+#endif
if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
const struct v3dv_image_view *iview =
@@ -920,7 +940,7 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
const struct v3d_resource_slice *slice =
&image->planes[plane].slices[iview->vk.base_mip_level];
- const uint32_t *clear_color =
+ UNUSED const uint32_t *clear_color =
&state->attachments[attachment_idx].clear_value.color[0];
uint32_t clear_pad = 0;
@@ -937,13 +957,19 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
}
}
+#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) {
clear.clear_color_low_32_bits = clear_color[0];
clear.clear_color_next_24_bits = clear_color[1] & 0xffffff;
clear.render_target_number = i;
};
+#endif
+#if V3D_VERSION >= 71
+ unreachable("HW generation 71 not supported yet.");
+#endif
if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) {
+#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) {
clear.clear_color_mid_low_32_bits =
((clear_color[1] >> 24) | (clear_color[2] << 8));
@@ -951,17 +977,28 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
((clear_color[2] >> 24) | ((clear_color[3] & 0xffff) << 8));
clear.render_target_number = i;
};
+#endif
+#if V3D_VERSION >= 71
+ unreachable("HW generation 71 not supported yet.");
+#endif
+
}
if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) {
+#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) {
clear.uif_padded_height_in_uif_blocks = clear_pad;
clear.clear_color_high_16_bits = clear_color[3] >> 16;
clear.render_target_number = i;
};
+#endif
+#if V3D_VERSION >= 71
+ unreachable("HW generation 71 not supported yet.");
+#endif
}
}
+#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
v3dX(cmd_buffer_render_pass_setup_render_target)
(cmd_buffer, 0, &rt.render_target_0_internal_bpp,
@@ -976,6 +1013,10 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
(cmd_buffer, 3, &rt.render_target_3_internal_bpp,
&rt.render_target_3_internal_type, &rt.render_target_3_clamp);
}
+#endif
+#if V3D_VERSION >= 71
+ unreachable("Hardware generation 71 not supported yet.");
+#endif
/* Ends rendering mode config. */
if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
@@ -1036,10 +1077,15 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
}
if (cmd_buffer->state.tile_aligned_render_area &&
(i == 0 || v3dv_do_double_initial_tile_clear(tiling))) {
+#if V3D_VERSION == 42
cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) {
clear.clear_z_stencil_buffer = !job->early_zs_clear;
clear.clear_all_render_targets = true;
}
+#endif
+#if V3D_VERSION >= 71
+ unreachable("HW generation 71 not supported yet.");
+#endif
}
cl_emit(rcl, END_OF_TILE_MARKER, end);
}
@@ -1065,7 +1111,9 @@ v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
* now, would need to change if we allow multiple viewports
*/
float *vptranslate = dynamic->viewport.translate[0];
+#if V3D_VERSION == 42
float *vpscale = dynamic->viewport.scale[0];
+#endif
struct v3dv_job *job = cmd_buffer->state.job;
assert(job);
@@ -1078,10 +1126,15 @@ v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
v3dv_cl_ensure_space_with_branch(&job->bcl, required_cl_size);
v3dv_return_if_oom(cmd_buffer, NULL);
+#if V3D_VERSION == 42
cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
clip.viewport_half_width_in_1_256th_of_pixel = vpscale[0] * 256.0f;
clip.viewport_half_height_in_1_256th_of_pixel = vpscale[1] * 256.0f;
}
+#endif
+#if V3D_VERSION >= 71
+ unreachable("HW generation 71 not supported yet.");
+#endif
float translate_z, scale_z;
v3dv_cmd_buffer_state_get_viewport_z_xform(&cmd_buffer->state, 0,
@@ -1591,16 +1644,20 @@ v3dX(cmd_buffer_emit_configuration_bits)(struct v3dv_cmd_buffer *cmd_buffer)
struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
assert(pipeline);
- bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer);
-
v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CFG_BITS));
v3dv_return_if_oom(cmd_buffer, NULL);
+#if V3D_VERSION == 42
+ bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer);
cl_emit_with_prepacked(&job->bcl, CFG_BITS, pipeline->cfg_bits, config) {
config.early_z_enable = enable_ez;
config.early_z_updates_enable = config.early_z_enable &&
pipeline->z_updates_enable;
}
+#endif
+#if V3D_VERSION >= 71
+ unreachable("HW generation 71 not supported yet.");
+#endif
}
void
@@ -2031,10 +2088,12 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer)
pipeline->vpm_cfg.Gv);
}
+#if V3D_VERSION == 42
struct v3dv_bo *default_attribute_values =
pipeline->default_attribute_values != NULL ?
pipeline->default_attribute_values :
pipeline->device->default_attribute_float;
+#endif
cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_RECORD,
pipeline->shader_state_record, shader) {
@@ -2060,8 +2119,10 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer)
shader.vertex_shader_uniforms_address = cmd_buffer->state.uniforms.vs;
shader.fragment_shader_uniforms_address = cmd_buffer->state.uniforms.fs;
+#if V3D_VERSION == 42
shader.address_of_default_attribute_values =
v3dv_cl_address(default_attribute_values, 0);
+#endif
shader.any_shader_reads_hardware_written_primitive_id =
(pipeline->has_gs && prog_data_gs->uses_pid) || prog_data_fs->uses_pid;
@@ -2399,11 +2460,17 @@ v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buf
assert(iview->plane_count == 1);
*rt_bpp = iview->planes[0].internal_bpp;
- *rt_type = iview->planes[0].internal_type;
if (vk_format_is_int(iview->vk.view_format))
+#if V3D_VERSION == 42
+ *rt_type = iview->planes[0].internal_type;
+ if (vk_format_is_int(iview->vk.format))
*rt_clamp = V3D_RENDER_TARGET_CLAMP_INT;
else if (vk_format_is_srgb(iview->vk.view_format))
*rt_clamp = V3D_RENDER_TARGET_CLAMP_NORM;
else
*rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
+#endif
+#if V3D_VERSION >= 71
+ unreachable("HW generation 71 not supported yet.");
+#endif
}
diff --git a/src/broadcom/vulkan/v3dvx_image.c b/src/broadcom/vulkan/v3dvx_image.c
index 80a3e5bfde8..dac6ff2741f 100644
--- a/src/broadcom/vulkan/v3dvx_image.c
+++ b/src/broadcom/vulkan/v3dvx_image.c
@@ -76,8 +76,6 @@ pack_texture_shader_state_helper(struct v3dv_device *device,
tex.swizzle_b = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[2]);
tex.swizzle_a = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[3]);
- tex.reverse_standard_border_color = image_view->planes[plane].channel_reverse;
-
tex.texture_type = image_view->format->planes[plane].tex_type;
if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
@@ -110,7 +108,16 @@ pack_texture_shader_state_helper(struct v3dv_device *device,
tex.array_stride_64_byte_aligned = image->planes[iplane].cube_map_stride / 64;
+#if V3D_VERSION == 42
+ tex.reverse_standard_border_color = image_view->planes[plane].channel_reverse;
+#endif
+
+#if V3D_VERSION == 42
tex.srgb = vk_format_is_srgb(image_view->vk.view_format);
+#endif
+#if V3D_VERSION >= 71
+ unreachable("Hardware generation 71 not supported yet.");
+#endif
/* At this point we don't have the job. That's the reason the first
* parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to
@@ -166,7 +173,12 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device,
assert(buffer_view->format->plane_count == 1);
tex.texture_type = buffer_view->format->planes[0].tex_type;
+#if V3D_VERSION == 42
tex.srgb = vk_format_is_srgb(buffer_view->vk_format);
+#endif
+#if V3D_VERSION >= 71
+ unreachable("Hardware generation 71 not supported yet.");
+#endif
/* At this point we don't have the job. That's the reason the first
* parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to
diff --git a/src/broadcom/vulkan/v3dvx_meta_common.c b/src/broadcom/vulkan/v3dvx_meta_common.c
index 04147b82cbd..2db07ea7427 100644
--- a/src/broadcom/vulkan/v3dvx_meta_common.c
+++ b/src/broadcom/vulkan/v3dvx_meta_common.c
@@ -58,7 +58,12 @@ emit_rcl_prologue(struct v3dv_job *job,
config.number_of_render_targets = 1;
config.multisample_mode_4x = tiling->msaa;
config.double_buffer_in_non_ms_mode = tiling->double_buffer;
+#if V3D_VERSION == 42
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
+#endif
+#if V3D_VERSION >= 71
+ unreachable("Hardware generation 71 not supported yet.");
+#endif
config.internal_depth_type = fb->internal_depth_type;
}
@@ -88,14 +93,20 @@ emit_rcl_prologue(struct v3dv_job *job,
}
}
+#if V3D_VERSION == 42
const uint32_t *color = &clear_info->clear_value->color[0];
cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) {
clear.clear_color_low_32_bits = color[0];
clear.clear_color_next_24_bits = color[1] & 0x00ffffff;
clear.render_target_number = 0;
};
+#endif
+#if V3D_VERSION >= 71
+ unreachable("Hardware generation 71 not supported yet.");
+#endif
if (tiling->internal_bpp >= V3D_INTERNAL_BPP_64) {
+#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) {
clear.clear_color_mid_low_32_bits =
((color[1] >> 24) | (color[2] << 8));
@@ -103,22 +114,37 @@ emit_rcl_prologue(struct v3dv_job *job,
((color[2] >> 24) | ((color[3] & 0xffff) << 8));
clear.render_target_number = 0;
};
+#endif
+#if V3D_VERSION >= 71
+ unreachable("Hardware generation 71 not supported yet.");
+#endif
+
}
if (tiling->internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) {
+#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) {
clear.uif_padded_height_in_uif_blocks = clear_pad;
clear.clear_color_high_16_bits = color[3] >> 16;
clear.render_target_number = 0;
};
+#endif
+#if V3D_VERSION >= 71
+ unreachable("Hardware generation 71 not supported yet.");
+#endif
}
}
+#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
rt.render_target_0_internal_bpp = tiling->internal_bpp;
rt.render_target_0_internal_type = fb->internal_type;
rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
}
+#endif
+#if V3D_VERSION >= 71
+ unreachable("Hardware generation 71 not supported yet.");
+#endif
cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
clear.z_clear_value = clear_info ? clear_info->clear_value->z : 1.0f;
@@ -179,10 +205,16 @@ emit_frame_setup(struct v3dv_job *job,
*/
if (clear_value &&
(i == 0 || v3dv_do_double_initial_tile_clear(tiling))) {
+#if V3D_VERSION == 42
cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) {
clear.clear_z_stencil_buffer = true;
clear.clear_all_render_targets = true;
}
+#endif
+#if V3D_VERSION >= 71
+ unreachable("Hardware generation 71 not supported yet.");
+#endif
+
}
cl_emit(rcl, END_OF_TILE_MARKER, end);
}
diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c
index 5d32d414ed8..922698b08a2 100644
--- a/src/broadcom/vulkan/v3dvx_pipeline.c
+++ b/src/broadcom/vulkan/v3dvx_pipeline.c
@@ -447,10 +447,15 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline)
/* FIXME: Use combined input/output size flag in the common case (also
* on v3d, see v3dx_draw).
*/
+#if V3D_VERSION == 42
shader.coordinate_shader_has_separate_input_and_output_vpm_blocks =
prog_data_vs_bin->separate_segments;
shader.vertex_shader_has_separate_input_and_output_vpm_blocks =
prog_data_vs->separate_segments;
+#endif
+#if V3D_VERSION >= 71
+ unreachable("HW generation 71 not supported yet.");
+#endif
shader.coordinate_shader_input_vpm_segment_size =
prog_data_vs_bin->separate_segments ?
diff --git a/src/broadcom/vulkan/v3dvx_queue.c b/src/broadcom/vulkan/v3dvx_queue.c
index efe63de425c..1a26d04aef7 100644
--- a/src/broadcom/vulkan/v3dvx_queue.c
+++ b/src/broadcom/vulkan/v3dvx_queue.c
@@ -42,14 +42,25 @@ v3dX(job_emit_noop)(struct v3dv_job *job)
config.image_height_pixels = 1;
config.number_of_render_targets = 1;
config.multisample_mode_4x = false;
+#if V3D_VERSION == 42
config.maximum_bpp_of_all_render_targets = V3D_INTERNAL_BPP_32;
+#endif
+#if V3D_VERSION >= 71
+ unreachable("HW generation 71 not supported yet.");
+#endif
}
+#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
rt.render_target_0_internal_bpp = V3D_INTERNAL_BPP_32;
rt.render_target_0_internal_type = V3D_INTERNAL_TYPE_8;
rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
}
+#endif
+#if V3D_VERSION >= 71
+ unreachable("Hardware generation 71 not supported yet.");
+#endif
+
cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
clear.z_clear_value = 1.0f;
--
2.39.2

View File

@ -1,29 +0,0 @@
From 7aa016bca8bb1bf449ea79505692353c0bd174b8 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Wed, 10 Nov 2021 10:06:50 +0100
Subject: [PATCH 074/142] v3dv: expose V3D revision number in device name
---
src/broadcom/vulkan/v3dv_device.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
index d29ffad3531..3034b561480 100644
--- a/src/broadcom/vulkan/v3dv_device.c
+++ b/src/broadcom/vulkan/v3dv_device.c
@@ -1123,8 +1123,10 @@ create_physical_device(struct v3dv_instance *instance,
device->next_program_id = 0;
ASSERTED int len =
- asprintf(&device->name, "V3D %d.%d",
- device->devinfo.ver / 10, device->devinfo.ver % 10);
+ asprintf(&device->name, "V3D %d.%d.%d",
+ device->devinfo.ver / 10,
+ device->devinfo.ver % 10,
+ device->devinfo.rev);
assert(len != -1);
v3dv_physical_device_init_disk_cache(device);
--
2.39.2

View File

@ -1,54 +0,0 @@
From fb9e95b7e1d5987fd25e914635c4e09d81ea9561 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Wed, 10 Nov 2021 07:54:35 +0100
Subject: [PATCH 075/142] v3dv/device: handle new rpi5 device (bcm2712)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This includes both master and primary devices.
Signed-off-by: Iago Toral Quiroga <itoral@igalia.com>
Signed-off-by: Alejandro Piñeiro <apinheiro@igalia.com>
---
src/broadcom/vulkan/v3dv_device.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
index 3034b561480..c8719d33f15 100644
--- a/src/broadcom/vulkan/v3dv_device.c
+++ b/src/broadcom/vulkan/v3dv_device.c
@@ -1287,7 +1287,8 @@ enumerate_devices(struct vk_instance *vk_instance)
if (devices[i]->available_nodes & 1 << DRM_NODE_RENDER) {
char **compat = devices[i]->deviceinfo.platform->compatible;
while (*compat) {
- if (strncmp(*compat, "brcm,2711-v3d", 13) == 0) {
+ if (strncmp(*compat, "brcm,2711-v3d", 13) == 0 ||
+ strncmp(*compat, "brcm,2712-v3d", 13) == 0) {
v3d_idx = i;
break;
}
@@ -1296,8 +1297,9 @@ enumerate_devices(struct vk_instance *vk_instance)
} else if (devices[i]->available_nodes & 1 << DRM_NODE_PRIMARY) {
char **compat = devices[i]->deviceinfo.platform->compatible;
while (*compat) {
- if (strncmp(*compat, "brcm,bcm2711-vc5", 16) == 0 ||
- strncmp(*compat, "brcm,bcm2835-vc4", 16) == 0 ) {
+ if (strncmp(*compat, "brcm,bcm2712-vc6", 16) == 0 ||
+ strncmp(*compat, "brcm,bcm2711-vc5", 16) == 0 ||
+ strncmp(*compat, "brcm,bcm2835-vc4", 16) == 0) {
vc4_idx = i;
break;
}
@@ -1334,6 +1336,8 @@ v3dv_physical_device_device_id(struct v3dv_physical_device *dev)
switch (dev->devinfo.ver) {
case 42:
return 0xBE485FD3; /* Broadcom deviceID for 2711 */
+ case 71:
+ return 0x55701C33; /* Broadcom deviceID for 2712 */
default:
unreachable("Unsupported V3D version");
}
--
2.39.2

View File

@ -1,32 +0,0 @@
From c4f957af4fb0e10abf0a7ffad4f7a468633b7d99 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Tue, 20 Jul 2021 14:00:44 +0200
Subject: [PATCH 076/142] v3dv/cmd_buffer: emit TILE_BINNING_MODE_CFG for v71
---
src/broadcom/vulkan/v3dvx_cmd_buffer.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
index b958e634c82..17b2f46850d 100644
--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
@@ -94,7 +94,14 @@ v3dX(job_emit_binning_prolog)(struct v3dv_job *job,
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
#endif
#if V3D_VERSION >= 71
- unreachable("HW generation 71 not supported yet.");
+ config.log2_tile_width = log2_tile_size(tiling->tile_width);
+ config.log2_tile_height = log2_tile_size(tiling->tile_height);
+ /* FIXME: ideally we would like next assert on the packet header (as is
+ * general, so also applies to GL). We would need to expand
+ * gen_pack_header for that.
+ */
+ assert(config.log2_tile_width == config.log2_tile_height ||
+ config.log2_tile_width == config.log2_tile_height + 1);
#endif
}
--
2.39.2

View File

@ -1,53 +0,0 @@
From 1934ac07df73cb685f6550b8b0f5b4f2ead11396 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Tue, 20 Jul 2021 14:33:00 +0200
Subject: [PATCH 077/142] v3dv: emit TILE_RENDERING_MODE_CFG_COMMON for v71
---
src/broadcom/vulkan/v3dvx_cmd_buffer.c | 9 ++++++++-
src/broadcom/vulkan/v3dvx_meta_common.c | 9 ++++++++-
2 files changed, 16 insertions(+), 2 deletions(-)
diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
index 17b2f46850d..7837b460051 100644
--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
@@ -850,7 +850,14 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
#endif
#if V3D_VERSION >= 71
- unreachable("HW generation 71 not supported yet.");
+ config.log2_tile_width = log2_tile_size(tiling->tile_width);
+ config.log2_tile_height = log2_tile_size(tiling->tile_height);
+ /* FIXME: ideally we would like next assert on the packet header (as is
+ * general, so also applies to GL). We would need to expand
+ * gen_pack_header for that.
+ */
+ assert(config.log2_tile_width == config.log2_tile_height ||
+ config.log2_tile_width == config.log2_tile_height + 1);
#endif
if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
diff --git a/src/broadcom/vulkan/v3dvx_meta_common.c b/src/broadcom/vulkan/v3dvx_meta_common.c
index 2db07ea7427..e4084d851fc 100644
--- a/src/broadcom/vulkan/v3dvx_meta_common.c
+++ b/src/broadcom/vulkan/v3dvx_meta_common.c
@@ -62,7 +62,14 @@ emit_rcl_prologue(struct v3dv_job *job,
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
#endif
#if V3D_VERSION >= 71
- unreachable("Hardware generation 71 not supported yet.");
+ config.log2_tile_width = log2_tile_size(tiling->tile_width);
+ config.log2_tile_height = log2_tile_size(tiling->tile_height);
+ /* FIXME: ideally we would like next assert on the packet header (as is
+ * general, so also applies to GL). We would need to expand
+ * gen_pack_header for that.
+ */
+ assert(config.log2_tile_width == config.log2_tile_height ||
+ config.log2_tile_width == config.log2_tile_height + 1);
#endif
config.internal_depth_type = fb->internal_depth_type;
}
--
2.39.2

View File

@ -1,315 +0,0 @@
From f0f9eea3cad83ed8824c6a7686150327407a5286 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Thu, 22 Jul 2021 14:26:13 +0200
Subject: [PATCH 078/142] v3dv/cmd_buffer: emit
TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1 for v71
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: Alejandro Piñeiro <apinheiro@igalia.com>
Signed-off-by: Iago Toral Quiroga <itoral@igalia.com>
---
src/broadcom/vulkan/v3dvx_cmd_buffer.c | 186 +++++++++++++++++-------
src/broadcom/vulkan/v3dvx_meta_common.c | 12 +-
src/broadcom/vulkan/v3dvx_private.h | 11 +-
3 files changed, 147 insertions(+), 62 deletions(-)
diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
index 7837b460051..c6307890da5 100644
--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
@@ -800,6 +800,103 @@ set_rcl_early_z_config(struct v3dv_job *job,
}
}
+/* Note that for v71, render target cfg packets have just one field that
+ * combines the internal type and clamp mode. For simplicity we keep just one
+ * helper.
+ *
+ * Note: rt_type is in fact an "enum V3DX(Internal_Type)".
+ *
+ * FIXME: for v71 we are not returning all the possible combinations for
+ * render target internal type and clamp. For example for int types we are
+ * always using clamp int, and for 16f we are using clamp none or pos (that
+ * seems to be the equivalent of no-clamp on 4.2), but not pq or hlg. In summary
+ * right now we are just porting what we were doing on 4.2.
+ */
+uint32_t
+v3dX(clamp_for_format_and_type)(uint32_t rt_type,
+ VkFormat vk_format)
+{
+#if V3D_VERSION == 42
+ if (vk_format_is_int(vk_format))
+ return V3D_RENDER_TARGET_CLAMP_INT;
+ else if (vk_format_is_srgb(vk_format))
+ return V3D_RENDER_TARGET_CLAMP_NORM;
+ else
+ return V3D_RENDER_TARGET_CLAMP_NONE;
+#endif
+#if V3D_VERSION >= 71
+ switch (rt_type) {
+ case V3D_INTERNAL_TYPE_8I:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_8I_CLAMPED;
+ case V3D_INTERNAL_TYPE_8UI:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_8UI_CLAMPED;
+ case V3D_INTERNAL_TYPE_8:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_8;
+ case V3D_INTERNAL_TYPE_16I:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_16I_CLAMPED;
+ case V3D_INTERNAL_TYPE_16UI:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_16UI_CLAMPED;
+ case V3D_INTERNAL_TYPE_16F:
+ return vk_format_is_srgb(vk_format) ?
+ V3D_RENDER_TARGET_TYPE_CLAMP_16F_CLAMP_NORM :
+ V3D_RENDER_TARGET_TYPE_CLAMP_16F;
+ case V3D_INTERNAL_TYPE_32I:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_32I_CLAMPED;
+ case V3D_INTERNAL_TYPE_32UI:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_32UI_CLAMPED;
+ case V3D_INTERNAL_TYPE_32F:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_32F;
+ default:
+ unreachable("Unknown internal render target type");
+ }
+
+ return V3D_RENDER_TARGET_TYPE_CLAMP_INVALID;
+#endif
+}
+
+static void
+cmd_buffer_render_pass_setup_render_target(struct v3dv_cmd_buffer *cmd_buffer,
+ int rt,
+ uint32_t *rt_bpp,
+#if V3D_VERSION == 42
+ uint32_t *rt_type,
+ uint32_t *rt_clamp)
+#else
+ uint32_t *rt_type_clamp)
+#endif
+{
+ const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
+
+ assert(state->subpass_idx < state->pass->subpass_count);
+ const struct v3dv_subpass *subpass =
+ &state->pass->subpasses[state->subpass_idx];
+
+ if (rt >= subpass->color_count)
+ return;
+
+ struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt];
+ const uint32_t attachment_idx = attachment->attachment;
+ if (attachment_idx == VK_ATTACHMENT_UNUSED)
+ return;
+
+ assert(attachment_idx < state->framebuffer->attachment_count &&
+ attachment_idx < state->attachment_alloc_count);
+ struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view;
+ assert(vk_format_is_color(iview->vk.format));
+
+ assert(iview->plane_count == 1);
+ *rt_bpp = iview->planes[0].internal_bpp;
+#if V3D_VERSION == 42
+ *rt_type = iview->planes[0].internal_type;
+ *rt_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type,
+ iview->vk.format);
+#endif
+#if V3D_VERSION >= 71
+ *rt_type_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type,
+ iview->vk.format);
+#endif
+}
+
void
v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
{
@@ -939,10 +1036,20 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
*/
job->early_zs_clear = do_early_zs_clear;
+#if V3D_VERSION >= 71
+ uint32_t base_addr = 0;
+#endif
for (uint32_t i = 0; i < subpass->color_count; i++) {
uint32_t attachment_idx = subpass->color_attachments[i].attachment;
- if (attachment_idx == VK_ATTACHMENT_UNUSED)
+ if (attachment_idx == VK_ATTACHMENT_UNUSED) {
+#if V3D_VERSION >= 71
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ rt.render_target_number = i;
+ rt.stride = 1; /* Unused */
+ }
+#endif
continue;
+ }
struct v3dv_image_view *iview =
state->attachments[attachment_idx].image_view;
@@ -978,9 +1085,6 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
clear.render_target_number = i;
};
#endif
-#if V3D_VERSION >= 71
- unreachable("HW generation 71 not supported yet.");
-#endif
if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) {
#if V3D_VERSION == 42
@@ -1010,27 +1114,44 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
unreachable("HW generation 71 not supported yet.");
#endif
}
+
+#if V3D_VERSION >= 71
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ rt.clear_color_low_bits = clear_color[0];
+ cmd_buffer_render_pass_setup_render_target(cmd_buffer, i, &rt.internal_bpp,
+ &rt.internal_type_and_clamping);
+ rt.stride =
+ v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width,
+ v3d_internal_bpp_words(rt.internal_bpp));
+ rt.base_address = base_addr;
+ rt.render_target_number = i;
+
+ /* base_addr in multiples of 512 bits. We divide by 8 because stride
+ * is in 128-bit units, but it is packing 2 rows worth of data, so we
+ * need to divide it by 2 so it is only 1 row, and then again by 4 so
+ * it is in 512-bit units.
+ */
+ base_addr += (tiling->tile_height * rt.stride) / 8;
+ }
+#endif
}
#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
- v3dX(cmd_buffer_render_pass_setup_render_target)
+ cmd_buffer_render_pass_setup_render_target
(cmd_buffer, 0, &rt.render_target_0_internal_bpp,
&rt.render_target_0_internal_type, &rt.render_target_0_clamp);
- v3dX(cmd_buffer_render_pass_setup_render_target)
+ cmd_buffer_render_pass_setup_render_target
(cmd_buffer, 1, &rt.render_target_1_internal_bpp,
&rt.render_target_1_internal_type, &rt.render_target_1_clamp);
- v3dX(cmd_buffer_render_pass_setup_render_target)
+ cmd_buffer_render_pass_setup_render_target
(cmd_buffer, 2, &rt.render_target_2_internal_bpp,
&rt.render_target_2_internal_type, &rt.render_target_2_clamp);
- v3dX(cmd_buffer_render_pass_setup_render_target)
+ cmd_buffer_render_pass_setup_render_target
(cmd_buffer, 3, &rt.render_target_3_internal_bpp,
&rt.render_target_3_internal_type, &rt.render_target_3_clamp);
}
#endif
-#if V3D_VERSION >= 71
- unreachable("Hardware generation 71 not supported yet.");
-#endif
/* Ends rendering mode config. */
if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
@@ -2445,46 +2566,3 @@ v3dX(cmd_buffer_emit_indexed_indirect)(struct v3dv_cmd_buffer *cmd_buffer,
buffer->mem_offset + offset);
}
}
-
-void
-v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer,
- int rt,
- uint32_t *rt_bpp,
- uint32_t *rt_type,
- uint32_t *rt_clamp)
-{
- const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
-
- assert(state->subpass_idx < state->pass->subpass_count);
- const struct v3dv_subpass *subpass =
- &state->pass->subpasses[state->subpass_idx];
-
- if (rt >= subpass->color_count)
- return;
-
- struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt];
- const uint32_t attachment_idx = attachment->attachment;
- if (attachment_idx == VK_ATTACHMENT_UNUSED)
- return;
-
- assert(attachment_idx < state->framebuffer->attachment_count &&
- attachment_idx < state->attachment_alloc_count);
- struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view;
- assert(vk_format_is_color(iview->vk.format));
-
- assert(iview->plane_count == 1);
- *rt_bpp = iview->planes[0].internal_bpp;
- if (vk_format_is_int(iview->vk.view_format))
-#if V3D_VERSION == 42
- *rt_type = iview->planes[0].internal_type;
- if (vk_format_is_int(iview->vk.format))
- *rt_clamp = V3D_RENDER_TARGET_CLAMP_INT;
- else if (vk_format_is_srgb(iview->vk.view_format))
- *rt_clamp = V3D_RENDER_TARGET_CLAMP_NORM;
- else
- *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
-#endif
-#if V3D_VERSION >= 71
- unreachable("HW generation 71 not supported yet.");
-#endif
-}
diff --git a/src/broadcom/vulkan/v3dvx_meta_common.c b/src/broadcom/vulkan/v3dvx_meta_common.c
index e4084d851fc..c6391bc6d83 100644
--- a/src/broadcom/vulkan/v3dvx_meta_common.c
+++ b/src/broadcom/vulkan/v3dvx_meta_common.c
@@ -26,6 +26,7 @@
#include "broadcom/common/v3d_macros.h"
#include "broadcom/common/v3d_tfu.h"
+#include "broadcom/common/v3d_util.h"
#include "broadcom/cle/v3dx_pack.h"
#include "broadcom/compiler/v3d_compiler.h"
@@ -150,7 +151,16 @@ emit_rcl_prologue(struct v3dv_job *job,
}
#endif
#if V3D_VERSION >= 71
- unreachable("Hardware generation 71 not supported yet.");
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ rt.internal_bpp = tiling->internal_bpp;
+ rt.internal_type_and_clamping = v3dX(clamp_for_format_and_type)(fb->internal_type,
+ fb->vk_format);
+ rt.stride =
+ v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width,
+ v3d_internal_bpp_words(rt.internal_bpp));
+ rt.base_address = 0;
+ rt.render_target_number = 0;
+ }
#endif
cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
diff --git a/src/broadcom/vulkan/v3dvx_private.h b/src/broadcom/vulkan/v3dvx_private.h
index ad8ddfa5731..a4157d11c7c 100644
--- a/src/broadcom/vulkan/v3dvx_private.h
+++ b/src/broadcom/vulkan/v3dvx_private.h
@@ -125,13 +125,6 @@ v3dX(get_hw_clear_color)(const VkClearColorValue *color,
uint32_t internal_size,
uint32_t *hw_color);
-void
-v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer,
- int rt,
- uint32_t *rt_bpp,
- uint32_t *rt_type,
- uint32_t *rt_clamp);
-
/* Used at v3dv_device */
void
@@ -325,3 +318,7 @@ uint32_t v3dX(max_descriptor_bo_size)(void);
uint32_t v3dX(combined_image_sampler_texture_state_offset)(uint8_t plane);
uint32_t v3dX(combined_image_sampler_sampler_state_offset)(uint8_t plane);
+
+uint32_t
+v3dX(clamp_for_format_and_type)(uint32_t rt_type,
+ VkFormat vk_format);
--
2.39.2

View File

@ -1,25 +0,0 @@
From 7c89d8026fd550282d54933f37ffc2773869326f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Mon, 26 Jul 2021 15:08:11 +0200
Subject: [PATCH 079/142] v3dvx/cmd_buffer: emit CLEAR_RENDER_TARGETS for v71
---
src/broadcom/vulkan/v3dvx_cmd_buffer.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
index c6307890da5..ae1c21ae00b 100644
--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
@@ -1219,7 +1219,7 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
}
#endif
#if V3D_VERSION >= 71
- unreachable("HW generation 71 not supported yet.");
+ cl_emit(rcl, CLEAR_RENDER_TARGETS, clear_rt);
#endif
}
cl_emit(rcl, END_OF_TILE_MARKER, end);
--
2.39.2

View File

@ -1,38 +0,0 @@
From 2eb29b57fde2acda76e12953b3a1050f3056b39d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Sun, 19 Sep 2021 23:37:32 +0200
Subject: [PATCH 080/142] v3dv/cmd_buffer: emit CLIPPER_XY_SCALING for v71
---
src/broadcom/vulkan/v3dvx_cmd_buffer.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
index ae1c21ae00b..2e525a11619 100644
--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
@@ -1246,9 +1246,7 @@ v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
* now, would need to change if we allow multiple viewports
*/
float *vptranslate = dynamic->viewport.translate[0];
-#if V3D_VERSION == 42
float *vpscale = dynamic->viewport.scale[0];
-#endif
struct v3dv_job *job = cmd_buffer->state.job;
assert(job);
@@ -1268,7 +1266,10 @@ v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
}
#endif
#if V3D_VERSION >= 71
- unreachable("HW generation 71 not supported yet.");
+ cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
+ clip.viewport_half_width_in_1_64th_of_pixel = vpscale[0] * 64.0f;
+ clip.viewport_half_height_in_1_64th_of_pixel = vpscale[1] * 64.0f;
+ }
#endif
float translate_z, scale_z;
--
2.39.2

View File

@ -1,97 +0,0 @@
From 611bf6a7445837c7e20416ff9f11a6dad9c543d7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Tue, 14 Sep 2021 10:08:19 +0200
Subject: [PATCH 081/142] v3dv/uniforms: update VIEWPORT_X/Y_SCALE uniforms for
v71
As with the CLIPPER_XY_SCALING packet, this needs to be computed in 1/64ths
of a pixel, instead of 1/256ths of a pixel.
As these are the kind of values that we usually get from macros, we manually
add a v42 and a v71 macro, and define a new helper (V3DV_X) to get the value
for the current hw version.
---
src/broadcom/vulkan/v3dv_private.h | 17 +++++++++++++++++
src/broadcom/vulkan/v3dv_uniforms.c | 7 ++++---
src/broadcom/vulkan/v3dvx_private.h | 9 +++++++++
3 files changed, 30 insertions(+), 3 deletions(-)
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
index 6bdf338c67b..cd6811b19c2 100644
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -2617,6 +2617,23 @@ u64_compare(const void *key1, const void *key2)
v3d_X_thing; \
})
+/* Helper to get hw-specific macro values */
+#define V3DV_X(device, thing) ({ \
+ __typeof(V3D42_##thing) V3D_X_THING; \
+ switch (device->devinfo.ver) { \
+ case 42: \
+ V3D_X_THING = V3D42_##thing; \
+ break; \
+ case 71: \
+ V3D_X_THING = V3D71_##thing; \
+ break; \
+ default: \
+ unreachable("Unsupported hardware generation"); \
+ } \
+ V3D_X_THING; \
+})
+
+
/* v3d_macros from common requires v3dX and V3DX definitions. Below we need to
* define v3dX for each version supported, because when we compile code that
diff --git a/src/broadcom/vulkan/v3dv_uniforms.c b/src/broadcom/vulkan/v3dv_uniforms.c
index 72fa9a1b39c..0e681cc4ee2 100644
--- a/src/broadcom/vulkan/v3dv_uniforms.c
+++ b/src/broadcom/vulkan/v3dv_uniforms.c
@@ -497,7 +497,8 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_cl_reloc uniform_stream = v3dv_cl_get_address(&job->indirect);
struct v3dv_cl_out *uniforms = cl_start(&job->indirect);
-
+ float clipper_xy_granularity =
+ V3DV_X(cmd_buffer->device, CLIPPER_XY_GRANULARITY);
for (int i = 0; i < uinfo->count; i++) {
uint32_t data = uinfo->data[i];
@@ -520,11 +521,11 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer,
break;
case QUNIFORM_VIEWPORT_X_SCALE:
- cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * 256.0f);
+ cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * clipper_xy_granularity);
break;
case QUNIFORM_VIEWPORT_Y_SCALE:
- cl_aligned_f(&uniforms, dynamic->viewport.scale[0][1] * 256.0f);
+ cl_aligned_f(&uniforms, dynamic->viewport.scale[0][1] * clipper_xy_granularity);
break;
case QUNIFORM_VIEWPORT_Z_OFFSET: {
diff --git a/src/broadcom/vulkan/v3dvx_private.h b/src/broadcom/vulkan/v3dvx_private.h
index a4157d11c7c..ff9ba75cf93 100644
--- a/src/broadcom/vulkan/v3dvx_private.h
+++ b/src/broadcom/vulkan/v3dvx_private.h
@@ -319,6 +319,15 @@ uint32_t v3dX(combined_image_sampler_texture_state_offset)(uint8_t plane);
uint32_t v3dX(combined_image_sampler_sampler_state_offset)(uint8_t plane);
+/* General utils */
+
+uint32_t
+v3dX(clamp_for_format_and_type)(uint32_t rt_type,
+ VkFormat vk_format);
+
+#define V3D42_CLIPPER_XY_GRANULARITY 256.0f
+#define V3D71_CLIPPER_XY_GRANULARITY 64.0f
+
uint32_t
v3dX(clamp_for_format_and_type)(uint32_t rt_type,
VkFormat vk_format);
--
2.39.2

View File

@ -1,40 +0,0 @@
From 3819efaf2bb6fd8bd9cd45d54fb7254377b2296a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Tue, 27 Jul 2021 14:02:30 +0200
Subject: [PATCH 082/142] v3dv/cmd_buffer: just don't fill up early-z fields
for CFG_BITS for v71
For v71 early_z_enable/early_z_updates_enable is configured with
packet 121.
---
src/broadcom/vulkan/v3dvx_cmd_buffer.c | 9 +++------
1 file changed, 3 insertions(+), 6 deletions(-)
diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
index 2e525a11619..fe9f7e43596 100644
--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
@@ -1783,17 +1783,14 @@ v3dX(cmd_buffer_emit_configuration_bits)(struct v3dv_cmd_buffer *cmd_buffer)
v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CFG_BITS));
v3dv_return_if_oom(cmd_buffer, NULL);
-#if V3D_VERSION == 42
- bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer);
cl_emit_with_prepacked(&job->bcl, CFG_BITS, pipeline->cfg_bits, config) {
+#if V3D_VERSION == 42
+ bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer);
config.early_z_enable = enable_ez;
config.early_z_updates_enable = config.early_z_enable &&
pipeline->z_updates_enable;
- }
-#endif
-#if V3D_VERSION >= 71
- unreachable("HW generation 71 not supported yet.");
#endif
+ }
}
void
--
2.39.2

View File

@ -1,219 +0,0 @@
From e3b1a578f45ea830d790970115b6de978d56edb8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Wed, 28 Jul 2021 12:01:38 +0200
Subject: [PATCH 083/142] v3dv: default vertex attribute values are gen
dependant
Content, structure and size depend on the generation, as does whether it
is needed at all.
So let's move it to the v3dvx files.
---
src/broadcom/vulkan/v3dv_device.c | 2 +-
src/broadcom/vulkan/v3dv_pipeline.c | 61 ++-------------------------
src/broadcom/vulkan/v3dv_private.h | 4 --
src/broadcom/vulkan/v3dvx_pipeline.c | 63 ++++++++++++++++++++++++++++
src/broadcom/vulkan/v3dvx_private.h | 8 ++++
5 files changed, 75 insertions(+), 63 deletions(-)
diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
index c8719d33f15..01e2dd7ac2d 100644
--- a/src/broadcom/vulkan/v3dv_device.c
+++ b/src/broadcom/vulkan/v3dv_device.c
@@ -2043,7 +2043,7 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice,
v3dv_pipeline_cache_init(&device->default_pipeline_cache, device, 0,
device->instance->default_pipeline_cache_enabled);
device->default_attribute_float =
- v3dv_pipeline_create_default_attribute_values(device, NULL);
+ v3dv_X(device, create_default_attribute_values)(device, NULL);
device->device_address_mem_ctx = ralloc_context(NULL);
util_dynarray_init(&device->device_address_bo_list,
diff --git a/src/broadcom/vulkan/v3dv_pipeline.c b/src/broadcom/vulkan/v3dv_pipeline.c
index 22f01bdf64b..d012ff8f948 100644
--- a/src/broadcom/vulkan/v3dv_pipeline.c
+++ b/src/broadcom/vulkan/v3dv_pipeline.c
@@ -2802,62 +2802,6 @@ pipeline_set_ez_state(struct v3dv_pipeline *pipeline,
}
}
-static bool
-pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline)
-{
- for (uint8_t i = 0; i < pipeline->va_count; i++) {
- if (vk_format_is_int(pipeline->va[i].vk_format))
- return true;
- }
- return false;
-}
-
-/* @pipeline can be NULL. We assume in that case that all the attributes have
- * a float format (we only create an all-float BO once and we reuse it with
- * all float pipelines), otherwise we look at the actual type of each
- * attribute used with the specific pipeline passed in.
- */
-struct v3dv_bo *
-v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device,
- struct v3dv_pipeline *pipeline)
-{
- uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4;
- struct v3dv_bo *bo;
-
- bo = v3dv_bo_alloc(device, size, "default_vi_attributes", true);
-
- if (!bo) {
- fprintf(stderr, "failed to allocate memory for the default "
- "attribute values\n");
- return NULL;
- }
-
- bool ok = v3dv_bo_map(device, bo, size);
- if (!ok) {
- fprintf(stderr, "failed to map default attribute values buffer\n");
- return false;
- }
-
- uint32_t *attrs = bo->map;
- uint8_t va_count = pipeline != NULL ? pipeline->va_count : 0;
- for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) {
- attrs[i * 4 + 0] = 0;
- attrs[i * 4 + 1] = 0;
- attrs[i * 4 + 2] = 0;
- VkFormat attr_format =
- pipeline != NULL ? pipeline->va[i].vk_format : VK_FORMAT_UNDEFINED;
- if (i < va_count && vk_format_is_int(attr_format)) {
- attrs[i * 4 + 3] = 1;
- } else {
- attrs[i * 4 + 3] = fui(1.0);
- }
- }
-
- v3dv_bo_unmap(device, bo);
-
- return bo;
-}
-
static void
pipeline_set_sample_mask(struct v3dv_pipeline *pipeline,
const VkPipelineMultisampleStateCreateInfo *ms_info)
@@ -2992,9 +2936,10 @@ pipeline_init(struct v3dv_pipeline *pipeline,
v3dv_X(device, pipeline_pack_compile_state)(pipeline, vi_info, vd_info);
- if (pipeline_has_integer_vertex_attrib(pipeline)) {
+ if (v3dv_X(device, pipeline_needs_default_attribute_values)(pipeline)) {
pipeline->default_attribute_values =
- v3dv_pipeline_create_default_attribute_values(pipeline->device, pipeline);
+ v3dv_X(pipeline->device, create_default_attribute_values)(pipeline->device, pipeline);
+
if (!pipeline->default_attribute_values)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
} else {
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
index cd6811b19c2..a9fab24d19e 100644
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -2500,10 +2500,6 @@ void
v3dv_pipeline_cache_upload_pipeline(struct v3dv_pipeline *pipeline,
struct v3dv_pipeline_cache *cache);
-struct v3dv_bo *
-v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device,
- struct v3dv_pipeline *pipeline);
-
VkResult
v3dv_create_compute_pipeline_from_nir(struct v3dv_device *device,
nir_shader *nir,
diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c
index 922698b08a2..e235220cb14 100644
--- a/src/broadcom/vulkan/v3dvx_pipeline.c
+++ b/src/broadcom/vulkan/v3dvx_pipeline.c
@@ -664,3 +664,66 @@ v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline,
}
}
}
+
+static bool
+pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline)
+{
+ for (uint8_t i = 0; i < pipeline->va_count; i++) {
+ if (vk_format_is_int(pipeline->va[i].vk_format))
+ return true;
+ }
+ return false;
+}
+
+bool
+v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline)
+{
+ return pipeline_has_integer_vertex_attrib(pipeline);
+}
+
+/* @pipeline can be NULL. In that case we assume the most common case. For
+ * example, for v42 we assume in that case that all the attributes have a
+ * float format (we only create an all-float BO once and we reuse it with all
+ * float pipelines), otherwise we look at the actual type of each attribute
+ * used with the specific pipeline passed in.
+ */
+struct v3dv_bo *
+v3dX(create_default_attribute_values)(struct v3dv_device *device,
+ struct v3dv_pipeline *pipeline)
+{
+ uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4;
+ struct v3dv_bo *bo;
+
+ bo = v3dv_bo_alloc(device, size, "default_vi_attributes", true);
+
+ if (!bo) {
+ fprintf(stderr, "failed to allocate memory for the default "
+ "attribute values\n");
+ return NULL;
+ }
+
+ bool ok = v3dv_bo_map(device, bo, size);
+ if (!ok) {
+ fprintf(stderr, "failed to map default attribute values buffer\n");
+ return NULL;
+ }
+
+ uint32_t *attrs = bo->map;
+ uint8_t va_count = pipeline != NULL ? pipeline->va_count : 0;
+ for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) {
+ attrs[i * 4 + 0] = 0;
+ attrs[i * 4 + 1] = 0;
+ attrs[i * 4 + 2] = 0;
+ VkFormat attr_format =
+ pipeline != NULL ? pipeline->va[i].vk_format : VK_FORMAT_UNDEFINED;
+ if (i < va_count && vk_format_is_int(attr_format)) {
+ attrs[i * 4 + 3] = 1;
+ } else {
+ attrs[i * 4 + 3] = fui(1.0);
+ }
+ }
+
+ v3dv_bo_unmap(device, bo);
+
+ return bo;
+}
diff --git a/src/broadcom/vulkan/v3dvx_private.h b/src/broadcom/vulkan/v3dvx_private.h
index ff9ba75cf93..036ce11b455 100644
--- a/src/broadcom/vulkan/v3dvx_private.h
+++ b/src/broadcom/vulkan/v3dvx_private.h
@@ -306,6 +306,14 @@ void
v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline,
const VkPipelineVertexInputStateCreateInfo *vi_info,
const VkPipelineVertexInputDivisorStateCreateInfoEXT *vd_info);
+
+bool
+v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline);
+
+struct v3dv_bo *
+v3dX(create_default_attribute_values)(struct v3dv_device *device,
+ struct v3dv_pipeline *pipeline);
+
/* Used at v3dv_queue */
void
v3dX(job_emit_noop)(struct v3dv_job *job);
--
2.39.2

View File

@ -1,87 +0,0 @@
From 8464dc8869f3d2eccfecac7b4358cc0ffe05f081 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Wed, 28 Jul 2021 12:05:26 +0200
Subject: [PATCH 084/142] v3dv/pipeline: default vertex attributes values are
not needed for v71
They are not part of the shader state record.
---
src/broadcom/vulkan/v3dv_private.h | 10 +++++++++-
src/broadcom/vulkan/v3dvx_pipeline.c | 10 ++++++++++
2 files changed, 19 insertions(+), 1 deletion(-)
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
index a9fab24d19e..300a1ec8ae1 100644
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -581,6 +581,10 @@ struct v3dv_device {
* being float being float, allowing us to reuse the same BO for all
* pipelines matching this requirement. Pipelines that need integer
* attributes will create their own BO.
+ *
+ * Note that since v71 the default attribute values are not needed, so this
+ * can be NULL.
+ *
*/
struct v3dv_bo *default_attribute_float;
@@ -2289,11 +2293,15 @@ struct v3dv_pipeline {
unsigned char sha1[20];
/* In general we can reuse v3dv_device->default_attribute_float, so note
- * that the following can be NULL.
+ * that the following can be NULL. In 7.x this is not used, so it will be
+ * NULL.
*
* FIXME: the content of this BO will be small, so it could be improved to
* be uploaded to a common BO. But as in most cases it will be NULL, it is
* not a priority.
+ *
+ * Note that since v71 the default attribute values are not needed, so this
+ * can be NULL.
*/
struct v3dv_bo *default_attribute_values;
diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c
index e235220cb14..4dc6d70efe1 100644
--- a/src/broadcom/vulkan/v3dvx_pipeline.c
+++ b/src/broadcom/vulkan/v3dvx_pipeline.c
@@ -665,6 +665,7 @@ v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline,
}
}
+#if V3D_VERSION == 42
static bool
pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline)
{
@@ -674,11 +675,16 @@ pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline)
}
return false;
}
+#endif
bool
v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline)
{
+#if V3D_VERSION == 42
return pipeline_has_integer_vertex_attrib(pipeline);
+#endif
+
+ return false;
}
/* @pipeline can be NULL. In that case we assume the most common case. For
@@ -691,6 +697,10 @@ struct v3dv_bo *
v3dX(create_default_attribute_values)(struct v3dv_device *device,
struct v3dv_pipeline *pipeline)
{
+#if V3D_VERSION >= 71
+ return NULL;
+#endif
+
uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4;
struct v3dv_bo *bo;
--
2.39.2

View File

@ -1,39 +0,0 @@
From 339096598660ec34be8087007dd4d66581de1c4e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Wed, 28 Jul 2021 13:45:52 +0200
Subject: [PATCH 085/142] v3dv/pipeline: handle GL_SHADER_STATE_RECORD changed
size on v71
It is likely that we would need more changes, as this packet changed,
but this is enough to get basic tests running. Any additional support
will be handled with new commits.
---
src/broadcom/vulkan/v3dvx_pipeline.c | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c
index 4dc6d70efe1..a640c1d084a 100644
--- a/src/broadcom/vulkan/v3dvx_pipeline.c
+++ b/src/broadcom/vulkan/v3dvx_pipeline.c
@@ -360,7 +360,7 @@ v3dX(pipeline_pack_state)(struct v3dv_pipeline *pipeline,
static void
pack_shader_state_record(struct v3dv_pipeline *pipeline)
{
- assert(sizeof(pipeline->shader_state_record) ==
+ assert(sizeof(pipeline->shader_state_record) >=
cl_packet_length(GL_SHADER_STATE_RECORD));
struct v3d_fs_prog_data *prog_data_fs =
@@ -453,9 +453,6 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline)
shader.vertex_shader_has_separate_input_and_output_vpm_blocks =
prog_data_vs->separate_segments;
#endif
-#if V3D_VERSION >= 71
- unreachable("HW generation 71 not supported yet.");
-#endif
shader.coordinate_shader_input_vpm_segment_size =
prog_data_vs_bin->separate_segments ?
--
2.39.2

View File

@ -1,89 +0,0 @@
From 5b1342eb1e255d17619b1a7b33eaf7b31f5e50a5 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Wed, 22 Sep 2021 12:03:58 +0200
Subject: [PATCH 086/142] v3dv: setup render pass color clears for any format
bpp in v71
---
src/broadcom/vulkan/v3dvx_cmd_buffer.c | 33 ++++++++++++++++----------
1 file changed, 20 insertions(+), 13 deletions(-)
diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
index fe9f7e43596..1b39e230580 100644
--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
@@ -1064,7 +1064,7 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
UNUSED const uint32_t *clear_color =
&state->attachments[attachment_idx].clear_value.color[0];
- uint32_t clear_pad = 0;
+ UNUSED uint32_t clear_pad = 0;
if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
slice->tiling == V3D_TILING_UIF_XOR) {
int uif_block_height = v3d_utile_height(image->planes[plane].cpp) * 2;
@@ -1084,10 +1084,8 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
clear.clear_color_next_24_bits = clear_color[1] & 0xffffff;
clear.render_target_number = i;
};
-#endif
if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) {
-#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) {
clear.clear_color_mid_low_32_bits =
((clear_color[1] >> 24) | (clear_color[2] << 8));
@@ -1095,25 +1093,16 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
((clear_color[2] >> 24) | ((clear_color[3] & 0xffff) << 8));
clear.render_target_number = i;
};
-#endif
-#if V3D_VERSION >= 71
- unreachable("HW generation 71 not supported yet.");
-#endif
-
}
if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) {
-#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) {
clear.uif_padded_height_in_uif_blocks = clear_pad;
clear.clear_color_high_16_bits = clear_color[3] >> 16;
clear.render_target_number = i;
};
-#endif
-#if V3D_VERSION >= 71
- unreachable("HW generation 71 not supported yet.");
-#endif
}
+#endif
#if V3D_VERSION >= 71
cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
@@ -1133,6 +1122,24 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
*/
base_addr += (tiling->tile_height * rt.stride) / 8;
}
+
+ if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) {
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) {
+ rt.clear_color_mid_bits = /* 40 bits (32 + 8) */
+ ((uint64_t) clear_color[1]) |
+ (((uint64_t) (clear_color[2] & 0xff)) << 32);
+ rt.render_target_number = i;
+ }
+ }
+
+ if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128) {
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) {
+ rt.clear_color_top_bits = /* 56 bits (24 + 32) */
+ (((uint64_t) (clear_color[2] & 0xffffff00)) >> 8) |
+ (((uint64_t) (clear_color[3])) << 24);
+ rt.render_target_number = i;
+ }
+ }
#endif
}
--
2.39.2

View File

@ -1,126 +0,0 @@
From ff5b5d4405b1d5600d7f1c4355202fd303f56700 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Wed, 22 Sep 2021 12:04:21 +0200
Subject: [PATCH 087/142] v3dv: setup TLB clear color for meta operations in
v71
---
src/broadcom/vulkan/v3dvx_meta_common.c | 46 +++++++++++++++----------
1 file changed, 27 insertions(+), 19 deletions(-)
diff --git a/src/broadcom/vulkan/v3dvx_meta_common.c b/src/broadcom/vulkan/v3dvx_meta_common.c
index c6391bc6d83..09ebcfa97c1 100644
--- a/src/broadcom/vulkan/v3dvx_meta_common.c
+++ b/src/broadcom/vulkan/v3dvx_meta_common.c
@@ -75,8 +75,9 @@ emit_rcl_prologue(struct v3dv_job *job,
config.internal_depth_type = fb->internal_depth_type;
}
+ const uint32_t *color = NULL;
if (clear_info && (clear_info->aspects & VK_IMAGE_ASPECT_COLOR_BIT)) {
- uint32_t clear_pad = 0;
+ UNUSED uint32_t clear_pad = 0;
if (clear_info->image) {
const struct v3dv_image *image = clear_info->image;
@@ -101,20 +102,16 @@ emit_rcl_prologue(struct v3dv_job *job,
}
}
+ color = &clear_info->clear_value->color[0];
+
#if V3D_VERSION == 42
- const uint32_t *color = &clear_info->clear_value->color[0];
cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) {
clear.clear_color_low_32_bits = color[0];
clear.clear_color_next_24_bits = color[1] & 0x00ffffff;
clear.render_target_number = 0;
};
-#endif
-#if V3D_VERSION >= 71
- unreachable("Hardware generation 71 not supported yet.");
-#endif
if (tiling->internal_bpp >= V3D_INTERNAL_BPP_64) {
-#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) {
clear.clear_color_mid_low_32_bits =
((color[1] >> 24) | (color[2] << 8));
@@ -122,25 +119,16 @@ emit_rcl_prologue(struct v3dv_job *job,
((color[2] >> 24) | ((color[3] & 0xffff) << 8));
clear.render_target_number = 0;
};
-#endif
-#if V3D_VERSION >= 71
- unreachable("Hardware generation 71 not supported yet.");
-#endif
-
}
if (tiling->internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) {
-#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) {
clear.uif_padded_height_in_uif_blocks = clear_pad;
clear.clear_color_high_16_bits = color[3] >> 16;
clear.render_target_number = 0;
};
-#endif
-#if V3D_VERSION >= 71
- unreachable("Hardware generation 71 not supported yet.");
-#endif
}
+#endif
}
#if V3D_VERSION == 42
@@ -150,8 +138,11 @@ emit_rcl_prologue(struct v3dv_job *job,
rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
}
#endif
+
#if V3D_VERSION >= 71
cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ if (color)
+ rt.clear_color_low_bits = color[0];
rt.internal_bpp = tiling->internal_bpp;
rt.internal_type_and_clamping = v3dX(clamp_for_format_and_type)(fb->internal_type,
fb->vk_format);
@@ -161,6 +152,24 @@ emit_rcl_prologue(struct v3dv_job *job,
rt.base_address = 0;
rt.render_target_number = 0;
}
+
+ if (color && tiling->internal_bpp >= V3D_INTERNAL_BPP_64) {
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) {
+ rt.clear_color_mid_bits = /* 40 bits (32 + 8) */
+ ((uint64_t) color[1]) |
+ (((uint64_t) (color[2] & 0xff)) << 32);
+ rt.render_target_number = 0;
+ }
+ }
+
+ if (color && tiling->internal_bpp >= V3D_INTERNAL_BPP_128) {
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) {
+ rt.clear_color_top_bits = /* 56 bits (24 + 32) */
+ (((uint64_t) (color[2] & 0xffffff00)) >> 8) |
+ (((uint64_t) (color[3])) << 24);
+ rt.render_target_number = 0;
+ }
+ }
#endif
cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
@@ -229,9 +238,8 @@ emit_frame_setup(struct v3dv_job *job,
}
#endif
#if V3D_VERSION >= 71
- unreachable("Hardware generation 71 not supported yet.");
+ cl_emit(rcl, CLEAR_RENDER_TARGETS, clear);
#endif
-
}
cl_emit(rcl, END_OF_TILE_MARKER, end);
}
--
2.39.2

View File

@ -1,49 +0,0 @@
From 1e9d7d69849fa646b331f7661c74ee138badc4bb Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Mon, 25 Oct 2021 01:37:12 +0200
Subject: [PATCH 088/142] v3dv: fix up texture shader state for v71
There are some new fields for YCbCr with pointers for the various
planes in multi-planar formats. These need to match the base address
pointer in the texture state, or the hardware will assume this is a
multi-planar texture.
---
src/broadcom/vulkan/v3dvx_image.c | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
diff --git a/src/broadcom/vulkan/v3dvx_image.c b/src/broadcom/vulkan/v3dvx_image.c
index dac6ff2741f..848290c2a47 100644
--- a/src/broadcom/vulkan/v3dvx_image.c
+++ b/src/broadcom/vulkan/v3dvx_image.c
@@ -129,6 +129,14 @@ pack_texture_shader_state_helper(struct v3dv_device *device,
v3dv_layer_offset(image, 0, image_view->vk.base_array_layer,
iplane);
tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset);
+
+#if V3D_VERSION >= 71
+ tex.chroma_offset_x = 1;
+ tex.chroma_offset_y = 1;
+ /* See comment in XML field definition for rationale of the shifts */
+ tex.texture_base_pointer_cb = base_offset >> 6;
+ tex.texture_base_pointer_cr = base_offset >> 6;
+#endif
}
}
}
@@ -191,5 +199,13 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device,
buffer_view->offset;
tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset);
+
+#if V3D_VERSION >= 71
+ tex.chroma_offset_x = 1;
+ tex.chroma_offset_y = 1;
+ /* See comment in XML field definition for rationale of the shifts */
+ tex.texture_base_pointer_cb = base_offset >> 6;
+ tex.texture_base_pointer_cr = base_offset >> 6;
+#endif
}
}
--
2.39.2

View File

@ -1,52 +0,0 @@
From 1f150a3a92741f7654a13626bd5b27b5575f2b76 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Mon, 25 Oct 2021 01:38:31 +0200
Subject: [PATCH 089/142] v3dv: handle new texture state transfer functions in
v71
---
src/broadcom/vulkan/v3dvx_image.c | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/src/broadcom/vulkan/v3dvx_image.c b/src/broadcom/vulkan/v3dvx_image.c
index 848290c2a47..437d4588c7e 100644
--- a/src/broadcom/vulkan/v3dvx_image.c
+++ b/src/broadcom/vulkan/v3dvx_image.c
@@ -108,15 +108,16 @@ pack_texture_shader_state_helper(struct v3dv_device *device,
tex.array_stride_64_byte_aligned = image->planes[iplane].cube_map_stride / 64;
+ bool is_srgb = vk_format_is_srgb(image_view->vk.format);
#if V3D_VERSION == 42
tex.reverse_standard_border_color = image_view->planes[plane].channel_reverse;
#endif
#if V3D_VERSION == 42
- tex.srgb = vk_format_is_srgb(image_view->vk.view_format);
+ tex.srgb = is_srgb;
#endif
#if V3D_VERSION >= 71
- unreachable("Hardware generation 71 not supported yet.");
+ tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE;
#endif
/* At this point we don't have the job. That's the reason the first
@@ -181,11 +182,13 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device,
assert(buffer_view->format->plane_count == 1);
tex.texture_type = buffer_view->format->planes[0].tex_type;
+
+ bool is_srgb = vk_format_is_srgb(buffer_view->vk_format);
#if V3D_VERSION == 42
- tex.srgb = vk_format_is_srgb(buffer_view->vk_format);
+ tex.srgb = is_srgb;
#endif
#if V3D_VERSION >= 71
- unreachable("Hardware generation 71 not supported yet.");
+ tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE;
#endif
/* At this point we don't have the job. That's the reason the first
--
2.39.2

View File

@ -1,42 +0,0 @@
From 45de9f019ee92635de9a505db58439f0f4561281 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Tue, 28 Sep 2021 08:14:11 +0200
Subject: [PATCH 090/142] v3dv: implement noop job for v71
---
src/broadcom/vulkan/v3dvx_queue.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/src/broadcom/vulkan/v3dvx_queue.c b/src/broadcom/vulkan/v3dvx_queue.c
index 1a26d04aef7..f8cee36e3bf 100644
--- a/src/broadcom/vulkan/v3dvx_queue.c
+++ b/src/broadcom/vulkan/v3dvx_queue.c
@@ -46,7 +46,8 @@ v3dX(job_emit_noop)(struct v3dv_job *job)
config.maximum_bpp_of_all_render_targets = V3D_INTERNAL_BPP_32;
#endif
#if V3D_VERSION >= 71
- unreachable("HW generation 71 not supported yet.");
+ config.log2_tile_width = 3; /* Tile size 64 */
+ config.log2_tile_height = 3; /* Tile size 64 */
#endif
}
@@ -58,10 +59,13 @@ v3dX(job_emit_noop)(struct v3dv_job *job)
}
#endif
#if V3D_VERSION >= 71
- unreachable("Hardware generation 71 not supported yet.");
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ rt.internal_bpp = V3D_INTERNAL_BPP_32;
+ rt.internal_type_and_clamping = V3D_RENDER_TARGET_TYPE_CLAMP_8;
+ rt.stride = 1; /* Unused RT */
+ }
#endif
-
cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
clear.z_clear_value = 1.0f;
clear.stencil_clear_value = 0;
--
2.39.2

View File

@ -1,117 +0,0 @@
From 3e607bb28056bb52242be6878281efae84026813 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Tue, 28 Sep 2021 08:23:48 +0200
Subject: [PATCH 091/142] v3dv: handle render pass global clear for v71
---
src/broadcom/vulkan/v3dvx_cmd_buffer.c | 66 ++++++++++++++++----------
1 file changed, 41 insertions(+), 25 deletions(-)
diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
index 1b39e230580..48b2e319e51 100644
--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
@@ -362,6 +362,11 @@ cmd_buffer_render_pass_emit_store(struct v3dv_cmd_buffer *cmd_buffer,
iview->vk.base_array_layer + layer,
image_plane);
+ /* The Clear Buffer bit is not supported for Z/Stencil stores in 7.x and it
+ * is broken in earlier V3D versions.
+ */
+ assert((buffer != Z && buffer != STENCIL && buffer != ZSTENCIL) || !clear);
+
cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
store.buffer_to_store = buffer;
store.address = v3dv_cl_address(image->planes[image_plane].mem->bo, layer_offset);
@@ -484,6 +489,30 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
const VkImageAspectFlags aspects =
vk_format_aspects(ds_attachment->desc.format);
+#if V3D_VERSION <= 42
+ /* GFXH-1689: The per-buffer store command's clear buffer bit is broken
+ * for depth/stencil.
+ *
+ * There used to be some confusion regarding the Clear Tile Buffers
+ * Z/S bit also being broken, but we confirmed with Broadcom that this
+ * is not the case, it was just that some other hardware bugs (that we
+ * need to work around, such as GFXH-1461) could cause this bit to behave
+ * incorrectly.
+ *
+ * There used to be another issue where the RTs bit in the Clear Tile
+ * Buffers packet also cleared Z/S, but Broadcom confirmed this is
+ * fixed since V3D 4.1.
+ *
+ * So if we have to emit a clear of depth or stencil we don't use
+ * the per-buffer store clear bit, even if we need to store the buffers,
+ * instead we always have to use the Clear Tile Buffers Z/S bit.
+ * If we have configured the job to do early Z/S clearing, then we
+ * don't want to emit any Clear Tile Buffers command at all here.
+ *
+ * Note that GFXH-1689 is not reproduced in the simulator, where
+ * using the clear buffer bit in depth/stencil stores works fine.
+ */
+
/* Only clear once on the first subpass that uses the attachment */
uint32_t ds_first_subpass = !state->pass->multiview_enabled ?
ds_attachment->first_subpass :
@@ -503,6 +532,17 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
ds_attachment->desc.stencilLoadOp,
subpass->do_stencil_clear_with_draw);
+ use_global_zs_clear = !state->job->early_zs_clear &&
+ (needs_depth_clear || needs_stencil_clear);
+#endif
+#if V3D_VERSION >= 71
+ /* The store command's clear buffer bit cannot be used for Z/S:
+ * since V3D 4.5.6 Z/S buffers are automatically cleared between tiles,
+ * so we don't want to emit redundant clears here.
+ */
+ use_global_zs_clear = false;
+#endif
+
/* Skip the last store if it is not required */
uint32_t ds_last_subpass = !pass->multiview_enabled ?
ds_attachment->last_subpass :
@@ -545,30 +585,6 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
needs_stencil_store = subpass->resolve_stencil;
}
- /* GFXH-1689: The per-buffer store command's clear buffer bit is broken
- * for depth/stencil.
- *
- * There used to be some confusion regarding the Clear Tile Buffers
- * Z/S bit also being broken, but we confirmed with Broadcom that this
- * is not the case, it was just that some other hardware bugs (that we
- * need to work around, such as GFXH-1461) could cause this bit to behave
- * incorrectly.
- *
- * There used to be another issue where the RTs bit in the Clear Tile
- * Buffers packet also cleared Z/S, but Broadcom confirmed this is
- * fixed since V3D 4.1.
- *
- * So if we have to emit a clear of depth or stencil we don't use
- * the per-buffer store clear bit, even if we need to store the buffers,
- * instead we always have to use the Clear Tile Buffers Z/S bit.
- * If we have configured the job to do early Z/S clearing, then we
- * don't want to emit any Clear Tile Buffers command at all here.
- *
- * Note that GFXH-1689 is not reproduced in the simulator, where
- * using the clear buffer bit in depth/stencil stores works fine.
- */
- use_global_zs_clear = !state->job->early_zs_clear &&
- (needs_depth_clear || needs_stencil_clear);
if (needs_depth_store || needs_stencil_store) {
const uint32_t zs_buffer =
v3dv_zs_buffer(needs_depth_store, needs_stencil_store);
@@ -673,7 +689,7 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
}
#endif
#if V3D_VERSION >= 71
- unreachable("Hardware generation 71 not supported yet.");
+ cl_emit(cl, CLEAR_RENDER_TARGETS, clear);
#endif
}
}
--
2.39.2

View File

@ -1,32 +0,0 @@
From 3794f6f08c559c4e442b57e992d501fb7d515b9b Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Tue, 28 Sep 2021 08:31:04 +0200
Subject: [PATCH 092/142] v3dv: GFXH-1461 does not affect V3D 7.x
---
src/broadcom/vulkan/v3dv_pass.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/src/broadcom/vulkan/v3dv_pass.c b/src/broadcom/vulkan/v3dv_pass.c
index 20f5014268d..3e82c15df88 100644
--- a/src/broadcom/vulkan/v3dv_pass.c
+++ b/src/broadcom/vulkan/v3dv_pass.c
@@ -236,11 +236,13 @@ v3dv_CreateRenderPass2(VkDevice _device,
/* GFXH-1461: if depth is cleared but stencil is loaded (or vice versa),
* the clear might get lost. If a subpass has this then we can't emit
- * the clear using the TLB and we have to do it as a draw call.
+ * the clear using the TLB and we have to do it as a draw call. This
+ * issue is fixed since V3D 4.3.18.
*
* FIXME: separate stencil.
*/
- if (subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) {
+ if (device->devinfo.ver == 42 &&
+ subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) {
struct v3dv_render_pass_attachment *att =
&pass->attachments[subpass->ds_attachment.attachment];
if (att->desc.format == VK_FORMAT_D24_UNORM_S8_UINT) {
--
2.39.2

View File

@ -1,69 +0,0 @@
From 5be7f484210103e40b77fa3135042da4a8406659 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Tue, 28 Sep 2021 08:59:08 +0200
Subject: [PATCH 093/142] v3dv: update thread end restrictions validation for
v71
---
src/broadcom/compiler/qpu_validate.c | 37 +++++++++++++++++++++++++---
1 file changed, 34 insertions(+), 3 deletions(-)
diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c
index 1082fb7d50a..0466ee5d0b6 100644
--- a/src/broadcom/compiler/qpu_validate.c
+++ b/src/broadcom/compiler/qpu_validate.c
@@ -316,17 +316,48 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
inst->type == V3D_QPU_INSTR_TYPE_ALU) {
if ((inst->alu.add.op != V3D_QPU_A_NOP &&
!inst->alu.add.magic_write)) {
- fail_instr(state, "RF write after THREND");
+ if (devinfo->ver <= 42) {
+ fail_instr(state, "RF write after THREND");
+ } else if (devinfo->ver >= 71) {
+ if (state->last_thrsw_ip - state->ip == 0) {
+ fail_instr(state,
+ "ADD RF write at THREND");
+ }
+ if (inst->alu.add.waddr == 2 ||
+ inst->alu.add.waddr == 3) {
+ fail_instr(state,
+ "RF2-3 write after THREND");
+ }
+ }
}
if ((inst->alu.mul.op != V3D_QPU_M_NOP &&
!inst->alu.mul.magic_write)) {
- fail_instr(state, "RF write after THREND");
+ if (devinfo->ver <= 42) {
+ fail_instr(state, "RF write after THREND");
+ } else if (devinfo->ver >= 71) {
+ if (state->last_thrsw_ip - state->ip == 0) {
+ fail_instr(state,
+ "MUL RF write at THREND");
+ }
+
+ if (inst->alu.mul.waddr == 2 ||
+ inst->alu.mul.waddr == 3) {
+ fail_instr(state,
+ "RF2-3 write after THREND");
+ }
+ }
}
if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
!inst->sig_magic) {
- fail_instr(state, "RF write after THREND");
+ if (devinfo->ver <= 42) {
+ fail_instr(state, "RF write after THREND");
+ } else if (devinfo->ver >= 71 &&
+ (inst->sig_addr == 2 ||
+ inst->sig_addr == 3)) {
+ fail_instr(state, "RF2-3 write after THREND");
+ }
}
/* GFXH-1625: No TMUWT in the last instruction */
--
2.39.2

View File

@ -1,68 +0,0 @@
From a751dff57b6d769f5b031054cc65415cc3b44c08 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Wed, 29 Sep 2021 08:22:59 +0200
Subject: [PATCH 094/142] v3dv: handle early Z/S clears for v71
---
src/broadcom/vulkan/v3dvx_cmd_buffer.c | 30 ++++++++++++++++++++------
1 file changed, 23 insertions(+), 7 deletions(-)
diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
index 48b2e319e51..4580e2a4650 100644
--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
@@ -998,6 +998,10 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
* Early-Z/S clearing is independent of Early Z/S testing, so it is
* possible to enable one but not the other so long as their
* respective requirements are met.
+ *
+ * From V3D 4.5.6, Z/S buffers are always cleared automatically
+ * between tiles, but we still want to enable early ZS clears
+ * when Z/S are not loaded or stored.
*/
struct v3dv_render_pass_attachment *ds_attachment =
&pass->attachments[ds_attachment_idx];
@@ -1005,21 +1009,33 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
const VkImageAspectFlags ds_aspects =
vk_format_aspects(ds_attachment->desc.format);
- bool needs_depth_clear =
- check_needs_clear(state,
- ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
- ds_attachment->first_subpass,
- ds_attachment->desc.loadOp,
- subpass->do_depth_clear_with_draw);
-
bool needs_depth_store =
v3dv_cmd_buffer_check_needs_store(state,
ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
ds_attachment->last_subpass,
ds_attachment->desc.storeOp) ||
subpass->resolve_depth;
+#if V3D_VERSION <= 42
+ bool needs_depth_clear =
+ check_needs_clear(state,
+ ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
+ ds_attachment->first_subpass,
+ ds_attachment->desc.loadOp,
+ subpass->do_depth_clear_with_draw);
do_early_zs_clear = needs_depth_clear && !needs_depth_store;
+#endif
+#if V3D_VERSION >= 71
+ bool needs_depth_load =
+ v3dv_cmd_buffer_check_needs_load(state,
+ ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
+ ds_attachment->first_subpass,
+ ds_attachment->desc.loadOp,
+ ds_attachment->last_subpass,
+ ds_attachment->desc.storeOp);
+ do_early_zs_clear = !needs_depth_load && !needs_depth_store;
+#endif
+
if (do_early_zs_clear &&
vk_format_has_stencil(ds_attachment->desc.format)) {
bool needs_stencil_load =
--
2.39.2

View File

@ -1,34 +0,0 @@
From 2add46ebce4760bf8349606201324ee0e6b1f9da Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Wed, 29 Sep 2021 09:07:28 +0200
Subject: [PATCH 095/142] v3dv: handle RTs with no color targets in v71
---
src/broadcom/vulkan/v3dvx_cmd_buffer.c | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
index 4580e2a4650..750486a6ccf 100644
--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
@@ -1175,6 +1175,17 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
#endif
}
+#if V3D_VERSION >= 71
+ /* If we don't have any color RTs, we still need to emit one and flag
+ * it as not used using stride = 1.
+ */
+ if (subpass->color_count == 0) {
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ rt.stride = 1;
+ }
+ }
+#endif
+
#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
cmd_buffer_render_pass_setup_render_target
--
2.39.2

View File

@ -1,85 +0,0 @@
From 019abbd34d2d904d6bb33f9fa4433cb53ca7899c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Fri, 1 Oct 2021 15:18:38 +0200
Subject: [PATCH 096/142] v3dv: no specific separate_segments flag for V3D 7.1
On V3D 7.1 there is not a flag on the Shader State Record to specify
if we are using shared or separate segments. This is done by setting
the vpm input size to 0 (so we need to ensure that the output would be
the max needed for input/output).
We were already doing the latter on the prog_data_vs, so we just need
to use those values, instead of assigning default values.
As we are here, we also add some comments on the compiler part.
---
src/broadcom/compiler/qpu_schedule.c | 4 ++++
src/broadcom/compiler/vir.c | 4 ++++
src/broadcom/vulkan/v3dvx_pipeline.c | 15 +++++++++++++--
3 files changed, 21 insertions(+), 2 deletions(-)
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index 77fb6a794e6..4f767296860 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -297,6 +297,10 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
/* If the input and output segments are shared, then all VPM reads to
* a location need to happen before all writes. We handle this by
* serializing all VPM operations for now.
+ *
+ * FIXME: we are assuming that the segments are shared. That is
+ * correct right now as we are only using shared, but technically you
+ * can choose.
*/
bool separate_vpm_segment = false;
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
index 7612eed7130..dd0aa761c43 100644
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -745,6 +745,10 @@ v3d_vs_set_prog_data(struct v3d_compile *c,
/* Set us up for shared input/output segments. This is apparently
* necessary for our VCM setup to avoid varying corruption.
+ *
+ * FIXME: initial testing on V3D 7.1 seems to work fine when using
+ * separate segments. So we could try to reevaluate in the future, if
+ * there is any advantage of using separate segments.
*/
prog_data->separate_segments = false;
prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size,
diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c
index a640c1d084a..a72ca3c241b 100644
--- a/src/broadcom/vulkan/v3dvx_pipeline.c
+++ b/src/broadcom/vulkan/v3dvx_pipeline.c
@@ -452,14 +452,25 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline)
prog_data_vs_bin->separate_segments;
shader.vertex_shader_has_separate_input_and_output_vpm_blocks =
prog_data_vs->separate_segments;
-#endif
-
shader.coordinate_shader_input_vpm_segment_size =
prog_data_vs_bin->separate_segments ?
prog_data_vs_bin->vpm_input_size : 1;
shader.vertex_shader_input_vpm_segment_size =
prog_data_vs->separate_segments ?
prog_data_vs->vpm_input_size : 1;
+#endif
+
+ /* On V3D 7.1 there isn't a specific flag to set if we are using
+ * shared/separate segments or not. We just set the value of
+ * vpm_input_size to 0, and set output to the max needed. That should be
+ * already properly set on prog_data_vs_bin
+ */
+#if V3D_VERSION == 71
+ shader.coordinate_shader_input_vpm_segment_size =
+ prog_data_vs_bin->vpm_input_size;
+ shader.vertex_shader_input_vpm_segment_size =
+ prog_data_vs->vpm_input_size;
+#endif
shader.coordinate_shader_output_vpm_segment_size =
prog_data_vs_bin->vpm_output_size;
--
2.39.2

View File

@ -1,39 +0,0 @@
From 4f6b4f91577ec04aab907d59d836d0c17731a9d0 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Thu, 7 Oct 2021 12:43:49 +0200
Subject: [PATCH 097/142] v3dv: don't convert floating point border colors in
v71
The TMU does this for us now.
---
src/broadcom/vulkan/v3dvx_device.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/src/broadcom/vulkan/v3dvx_device.c b/src/broadcom/vulkan/v3dvx_device.c
index e235983864c..72daefadb08 100644
--- a/src/broadcom/vulkan/v3dvx_device.c
+++ b/src/broadcom/vulkan/v3dvx_device.c
@@ -118,7 +118,11 @@ static union pipe_color_union encode_border_color(
(1 << (desc->channel[i].size - 1)) - 1);
}
- /* convert from float to expected format */
+#if V3D_VERSION <= 42
+ /* The TMU in V3D 7.x always takes 32-bit floats and handles conversions
+ * for us. In V3D 4.x we need to manually convert floating point color
+ * values to the expected format.
+ */
if (vk_format_is_srgb(bc_info->format) ||
vk_format_is_compressed(bc_info->format)) {
for (int i = 0; i < 4; i++)
@@ -170,6 +174,7 @@ static union pipe_color_union encode_border_color(
}
}
}
+#endif
return border;
}
--
2.39.2

View File

@ -1,60 +0,0 @@
From d8083cb8f104e0f035f5b812e000a500fa52d66f Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Fri, 15 Oct 2021 13:06:31 +0200
Subject: [PATCH 098/142] v3dv: handle Z clipping in v71
Fixes the following tests:
dEQP-VK.clipping.clip_volume.*
dEQP-VK.draw.inverted_depth_ranges.nodepthclamp_* (except deltazero)
---
src/broadcom/vulkan/v3dvx_pipeline.c | 33 ++++++++++++++++++++++++++++
1 file changed, 33 insertions(+)
diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c
index a72ca3c241b..7b1133f8173 100644
--- a/src/broadcom/vulkan/v3dvx_pipeline.c
+++ b/src/broadcom/vulkan/v3dvx_pipeline.c
@@ -227,6 +227,39 @@ pack_cfg_bits(struct v3dv_pipeline *pipeline,
ds_info ? ds_info->stencilTestEnable && has_ds_attachment: false;
pipeline->z_updates_enable = config.z_updates_enable;
+
+#if V3D_VERSION >= 71
+ /* From the Vulkan spec:
+ *
+ * "depthClampEnable controls whether to clamp the fragments depth
+ * values as described in Depth Test. If the pipeline is not created
+ * with VkPipelineRasterizationDepthClipStateCreateInfoEXT present
+ * then enabling depth clamp will also disable clipping primitives to
+ * the z planes of the frustum as described in Primitive Clipping.
+ * Otherwise depth clipping is controlled by the state set in
+ * VkPipelineRasterizationDepthClipStateCreateInfoEXT."
+ *
+ * Note: neither depth clamping nor VK_EXT_depth_clip_enable are actually
+ * supported in the driver yet, so in practice we are always enabling Z
+ * clipping for now.
+ */
+ bool z_clip_enable = false;
+ const VkPipelineRasterizationDepthClipStateCreateInfoEXT *clip_info =
+ ds_info ? vk_find_struct_const(ds_info->pNext,
+ PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT) :
+ NULL;
+ if (clip_info)
+ z_clip_enable = clip_info->depthClipEnable;
+ else if (!(rs_info && rs_info->depthClampEnable))
+ z_clip_enable = true;
+
+ if (z_clip_enable) {
+ config.z_clipping_mode = pipeline->negative_one_to_one ?
+ V3D_Z_CLIP_MODE_MIN_ONE_TO_ONE : V3D_Z_CLIP_MODE_ZERO_TO_ONE;
+ } else {
+ config.z_clipping_mode = V3D_Z_CLIP_MODE_NONE;
+ }
+#endif
};
}
--
2.39.2

Some files were not shown because too many files have changed in this diff Show More