From 7228a420fd3fc1cff6c183a4d162dfac75018d0f Mon Sep 17 00:00:00 2001 From: MilhouseVH Date: Fri, 21 Sep 2018 19:22:52 +0100 Subject: [PATCH] ffmpeg: sync with newclock5 --- ...mpeg-99.1003-pfcd_hevc_optimisations.patch | 5974 +++++++++-------- 1 file changed, 3049 insertions(+), 2925 deletions(-) diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch index 1d65823f3d..0c51586d54 100644 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch +++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch @@ -12349,10 +12349,10 @@ index 0000000000..75a1789c25 + diff --git a/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S new file mode 100644 -index 0000000000..7ea82b38fe +index 0000000000..6ce3d3ca8d --- /dev/null +++ b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S -@@ -0,0 +1,902 @@ +@@ -0,0 +1,872 @@ +/* + * Copyright (c) 2018 John Cox (for Raspberry Pi) + * @@ -12423,6 +12423,8 @@ index 0000000000..7ea82b38fe +.equ AVAIL_S_UL_N_L_C, 32 - 3 +.equ AVAIL_S_L_N_DL_C, 32 - 4 + ++.equ AVAIL_S_U_DL_CPSR, 31 - 4 @ Shift for u..dl to go into flags via cpsr ++ +@ On entry +@ r2 req +@ r3 avail @@ -12442,78 +12444,77 @@ index 0000000000..7ea82b38fe +@ If UR avail then d_ur == a_ur so U-filter good too +@ +@ Data load pointers (only load if req & avail): -+@ r8 DL + stride -+@ r6 L -+@ r7 U -+@ r4 UR ++@ r4 DL + stride ++@ r10 L ++@ r6 U ++@ r5 UR +@ +@ Others: -+@ r2 req (if preserve_req) -+@ r3 req & avail (if preserve_req) -+@ r2 req & avail (if !preserve_req) -+@ r10 L + stride -+@ r5 DL + stride * 2 -+@ r12 stride * 2 ++@ r2 req ++@ r7 req & avail ++@ r3 L + stride ++@ r8 DL + stride * 2 ++@ r9 stride * 2 +@ cs Load U +@ mi Load UR +@ +@ Clobbered: -+@ r9, lr ++@ r12 + -+.macro load_pointers pw_s, log2_s, sp_offset, d_type, d_l, d_ul, d_u, d_ur, 
preserve_req, I1, I2 ++.macro load_pointers pw_s, log2_s, sp_offset, d_type, d_l, d_ul, d_u, d_ur + -+.equ src_l, \sp_offset + 0 -+.equ src_u, \sp_offset + 4 -+.equ src_ur, \sp_offset + 8 -+.equ stride, \sp_offset + 12 -+.equ pw, (1 << \pw_s) @ pel width in bytes -+.equ b_size, (1 << (\pw_s + \log2_s)) @ size in bytes ++.equ src_l\@, \sp_offset + 0 ++.equ src_u\@, \sp_offset + 4 ++.equ src_ur\@, \sp_offset + 8 ++.equ stride\@, \sp_offset + 12 ++.equ pw\@, (1 << \pw_s) @ pel width in bytes ++.equ b_size\@, (1 << (\pw_s + \log2_s)) @ size in bytes + -+ ldrd r4, r5, [sp, #src_ur] @ and stride -+ ldrd r6, r7, [sp, #src_l] @ and src_u -+ lsls lr, r3, #AVAIL_S_U_N_UL_C -+ mov r8, r4 -+ sub r9, r6, r5 -+ it mi -+ movmi r8, r7 -+ it cs -+ movcs r8, r9 -+ lsls lr, r3, #AVAIL_S_L_N_DL_C -+ ite pl -+ movpl r6, r8 -+ addmi r8, r9, r5, lsl #\log2_s -+ it cs -+ addcs r8, r6, r5, lsl #\log2_s -+ .if !\preserve_req -+ and r2, r2, r3 -+ .endif -+ add r10, r6, r5 -+ lsl r12, r5, #1 -+ lsls lr, r3, #AVAIL_S_U_N_UL_C -+ it cc -+ movcc r9, r6 -+ vld1.\d_type {\d_l}, [r8], r5 -+ add lr, r7, #b_size - pw -+ add r5, r8, r5 -+ itt pl -+ movpl lr, r9 -+ movpl r7, r9 -+ tst r3, #AVAIL_UR -+ vld1.\d_type {\d_ul}, [r9] -+ it eq -+ moveq r4, lr -+ \I1 -+ .if \preserve_req -+ and r3, r2, r3 -+ .else -+ lsls lr, r2, #AVAIL_S_UR_N_U_C -+ .endif -+ vld1.\d_type {\d_u}, [r7] -+ \I2 -+ vld1.\d_type {\d_ur}, [r4] -+ .if \preserve_req -+ lsls lr, r3, #AVAIL_S_UR_N_U_C -+ .endif ++@ r9 stride ++@ r7 = ab_ul, r6 = a_u, r5 = a_ur ++@ r4 = b_dl, r10 = b_l, r8 = b_u ++ ++ ldr r5, [sp, #src_ur\@] ++ lsl r12, r3, #AVAIL_S_U_DL_CPSR ++ ldr r10, [sp, #src_l\@] ++ ldr r9, [sp, #stride\@] ++ ldr r6, [sp, #src_u\@] ++ ++ @ This is quite a slow instruction but it replaces ++ @ a decent number of tests that yield a max of 2 flags/op ++ @ It is annoying we can't branch on Q! 
++ @ If L navail (ne) then DL must be navail (pl) ++ msr APSR_nzcvq, r12 @ n=dl, z=l, c=ul, v=u, q=ur ++ ++ mov r4, r5 ++ sub r7, r10, r9 ++ it vs ++ movvs r4, r6 ++ add r8, r6, #b_size\@ - pw\@ ++ it cs ++ movcs r4, r7 ++ ite ne ++ movne r10, r4 ++ addeq r4, r7, r9, lsl #\log2_s ++ it cc ++ movcc r7, r10 ++ it mi ++ addmi r4, r10, r9, lsl #\log2_s ++ vld1.\d_type {\d_ul}, [r7] ++ itt vc ++ movvc r8, r7 ++ movvc r6, r7 ++ vld1.\d_type {\d_l }, [r4], r9 ++ tst r3, #AVAIL_UR ++ vld1.\d_type {\d_u }, [r6] ++ it eq ++ moveq r5, r8 ++ and r7, r2, r3 ++ add r8, r4, r9 ++ vld1.\d_type {\d_ur}, [r5] ++ lsls r12, r7, #AVAIL_S_UR_N_U_C ++ add r3, r10, r9 ++ lsl r9, #1 +.endm + + @@ -12536,33 +12537,33 @@ index 0000000000..7ea82b38fe +.set log2_s, 2 + +function ff_hevc_rpi_intra_filter_4_neon_8, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 8, "d0[],d1[]", d2[0], d3[], d4[], 0 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 8, d0[], d31[7], d1[], d2[] + -+ sub r3, r0, #pw -+ it mi -+ vldrmi s7, [r4] -+ it cs -+ vldrcs s6, [r7] -+ it pl -+ vmovpl.f32 s7, s8 -+ lsls lr, r2, #AVAIL_S_L_N_DL_C -+ bpl 1f -+ vld1.8 {d0[0]}, [r6], r12 -+ vld1.8 {d1[0]}, [r10], r12 -+ vld1.8 {d0[1]}, [r6] -+ vld1.8 {d1[1]}, [r10] ++ it cs ++ vldrcs s2, [r6] ++ ite pl ++ vmovpl s3, s4 ++ vldrmi s3, [r5] ++ ++ lsls r7, #AVAIL_S_L_N_DL_C ++ add r12, r0, #-pw ++ bpl 1f ++ ++ vld1.8 {d0[0]}, [r10], r9 ++ vld1.8 {d0[1]}, [r3], r9 ++ vld1.8 {d0[2]}, [r10] ++ vld1.8 {d0[3]}, [r3] +1: -+ bcc 1f -+ vld1.8 {d1[2]}, [r8], r12 -+ vld1.8 {d0[3]}, [r5] -+ vld1.8 {d1[3]}, [r8] ++ bcc 1f ++ vld1.8 {d0[5]}, [r4], r9 ++ vld1.8 {d0[6]}, [r8] ++ vld1.8 {d0[7]}, [r4] +1: -+ vst1.8 {d2[0]}, [r3] -+ vst1.8 {d3}, [r1] -+ vzip.8 d0, d1 -+ vst1.8 {d0}, [r0] -+ pop {r4-r10, pc} ++ vstr d1, [r1] @ Up ++ vst1.8 {d31[7]}, [r12] ++ vstr d0, [r0] @ Left ++ pop {r4-r10, pc} +endfunc + + @@ -12584,31 +12585,30 @@ index 0000000000..7ea82b38fe +.set log2_s, 2 + +function 
ff_hevc_rpi_intra_filter_4_neon_16, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d2[0], d3[], d4[], 0 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], d2[], d3[] + -+ sub r3, r0, #pw -+ it mi -+ vldrmi d4, [r4] -+ it cs -+ vldrcs d3, [r7] -+ lsls lr, r2, #AVAIL_S_L_N_DL_C -+ bpl 1f -+ vld1.16 {d0[0]}, [r6], r12 -+ vld1.16 {d1[0]}, [r10], r12 -+ vld1.16 {d0[1]}, [r6] -+ vld1.16 {d1[1]}, [r10] ++ it cs ++ vldrcs d2, [r6] ++ it mi ++ vldrmi d3, [r5] ++ lsls r7, #AVAIL_S_L_N_DL_C ++ add r12, r0, #-pw ++ bpl 1f ++ vld1.16 {d0[0]}, [r10], r9 ++ vld1.16 {d0[1]}, [r3], r9 ++ vld1.16 {d0[2]}, [r10] ++ vld1.16 {d0[3]}, [r3] +1: -+ bcc 1f -+ vld1.16 {d1[2]}, [r8], r12 -+ vld1.16 {d0[3]}, [r5] -+ vld1.16 {d1[3]}, [r8] ++ bcc 1f ++ vld1.16 {d1[1]}, [r4], r9 ++ vld1.16 {d1[2]}, [r8] ++ vld1.16 {d1[3]}, [r4] +1: -+ vst1.16 {d2[0]}, [r3] -+ vst1.16 {d3, d4}, [r1] -+ vzip.16 d0, d1 -+ vst1.16 {q0}, [r0] -+ pop {r4-r10, pc} ++ vst1.16 {q1}, [r1] @ Up ++ vst1.16 {d31[3]}, [r12] ++ vst1.16 {q0}, [r0] @ Left ++ pop {r4-r10, pc} +endfunc + + @@ -12630,69 +12630,72 @@ index 0000000000..7ea82b38fe +.set log2_s, 3 + +function ff_hevc_rpi_intra_filter_8_neon_8, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 8, "d0[],d1[]", d3[7], d4[], d5[], 1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 8, "d0[],d1[]", d31[7], d4[], d5[] + -+ it mi -+ vldrmi d5, [r4] -+ sub r0, #pw -+ it cs -+ vldrcs d4, [r7] -+ lsls lr, r3, #AVAIL_S_L_N_DL_C -+ bpl 1f -+ vld1.8 {d0[0]}, [r6], r12 -+ vld1.8 {d1[0]}, [r10], r12 -+ vld1.8 {d0[1]}, [r6], r12 -+ vld1.8 {d1[1]}, [r10], r12 -+ vld1.8 {d0[2]}, [r6], r12 -+ vld1.8 {d1[2]}, [r10], r12 -+ vld1.8 {d0[3]}, [r6] -+ vld1.8 {d1[3]}, [r10] ++ it cs ++ vldrcs d4, [r6] ++ it mi ++ vldrmi d5, [r5] ++ ++ lsls r7, #AVAIL_S_L_N_DL_C ++ bpl 1f ++ vld1.8 {d0[0]}, [r10], r9 ++ vld1.8 {d0[1]}, [r3], r9 ++ vld1.8 {d0[2]}, [r10], r9 ++ vld1.8 {d0[3]}, [r3], r9 ++ 
vld1.8 {d0[4]}, [r10], r9 ++ vld1.8 {d0[5]}, [r3], r9 ++ vld1.8 {d0[6]}, [r10] ++ vld1.8 {d0[7]}, [r3] +1: -+ bcc 1f -+ vld1.8 {d1[4]}, [r8], r12 -+ vld1.8 {d0[5]}, [r5], r12 -+ vld1.8 {d1[5]}, [r8], r12 -+ vld1.8 {d0[6]}, [r5], r12 -+ vld1.8 {d1[6]}, [r8], r12 -+ vld1.8 {d0[7]}, [r5], r12 -+ vld1.8 {d1[7]}, [r8], r12 ++ bcc 1f ++ vld1.8 {d1[1]}, [r4], r9 ++ vld1.8 {d1[2]}, [r8], r9 ++ vld1.8 {d1[3]}, [r4], r9 ++ vld1.8 {d1[4]}, [r8], r9 ++ vld1.8 {d1[5]}, [r4], r9 ++ vld1.8 {d1[6]}, [r8] ++ vld1.8 {d1[7]}, [r4] +1: -+ vext.8 q3, q1, q2, #15 -+ vmov.u8 r4, d5[7] @ Save final pel -+ tst r2, #FILTER_LIGHT -+ vzip.8 d0, d1 -+ beq 1f ++ tst r2, #FILTER_LIGHT ++ add r12, r0, #-pw ++ beq 10f + + @ Luma light filter -+ vaddl.u8 q8, d7, d5 -+ vext.8 q1, q1, q0, #15 -+ vaddl.u8 q2, d6, d4 -+ vaddl.u8 q3, d3, d1 -+ vaddl.u8 q9, d2, d0 -+ vext.16 q10, q8, q8, #1 -+ vext.16 q11, q3, q3, #1 -+ vadd.u16 q10, q8 -+ vadd.u16 q11, q3 -+ vadd.u16 d2, d4, d18 @ d2[0] = l[0] + 2ul + u[0] -+ vmov.u8 r5, d1[7] @ Save final pel -+ vext.16 q0, q2, q8, #1 -+ vext.16 q3, q9, q3, #1 -+ vadd.u16 q8, q0, q2 -+ vadd.u16 q3, q9 -+ vrshrn.u16 d5, q10, #2 -+ vrshrn.u16 d1, q11, #2 -+ vrshr.u16 d2, #2 -+ vrshrn.u16 d4, q8, #2 -+ vrshrn.u16 d0, q3, #2 -+ vmov.8 d5[7], r4 @ Restore final pel -+ vmov.8 d1[7], r5 @ Restore final pel -+ vdup.8 d3, d2[0] -+1: -+ vst1.8 {d3[7]}, [r0]! 
-+ vst1.8 {q2}, [r1] -+ vst1.8 {q0}, [r0] -+ pop {r4-r10, pc} ++ vext.8 q8, q15, q2, #15 ++ vext.8 q12, q15, q0, #15 ++ vaddl.u8 q9, d17, d5 ++ vaddl.u8 q8, d16, d4 ++ vaddl.u8 q13, d25, d1 ++ vaddl.u8 q12, d24, d0 ++ vmov.u8 r3, d5[7] @ Save final pel ++ vmov.u8 r2, d1[7] @ Save final pel ++ ++ vext.16 q2, q8, q9, #1 ++ vext.16 q3, q9, q9, #1 ++ vext.16 q0, q12, q13, #1 ++ vext.16 q1, q13, q13, #1 ++ vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0] ++ vadd.u16 q2, q8 ++ vadd.u16 q3, q9 ++ vadd.u16 q0, q12 ++ vadd.u16 q1, q13 ++ ++ vrshrn.u16 d4, q2, #2 ++ vrshrn.u16 d5, q3, #2 ++ vrshrn.u16 d0, q0, #2 ++ vrshrn.u16 d1, q1, #2 ++ vrshr.u16 d30, #2 ++ vmov.u8 d5[7], r3 @ Restore final pel ++ vmov.u8 d1[7], r2 @ Restore final pel ++ vdup.u8 d31, d30[0] @ d31[3] = d30[0] ++ ++10: ++ vst1.8 {q2 }, [r1] @ Up ++ vst1.8 {d31[7]}, [r12] @ Up-left ++ vst1.8 {q0 }, [r0] @ Left ++ pop {r4-r10, pc} +endfunc + + @@ -12717,89 +12720,85 @@ index 0000000000..7ea82b38fe +.set p_size, (1 << log2_s) @ size in pels + +function ff_hevc_rpi_intra_filter_8_neon_16, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d5[3], "d16[],d17[]", "d18[],d19[]", 1, \ -+ "ldr r9, [sp, #ur_size]", \ -+ "sub r0, #pw" ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d4[],d5[]", "d6[],d7[]" + -+ vmov q1, q0 -+ ldrh lr, [r4, #3*2] -+ it mi -+ vldmmi r4, {d18, d19} -+ it cs -+ vldmcs r7, {d16, d17} -+ itt mi -+ cmpmi r9, #p_size -+ vdupmi.16 d19, lr -+ lsls lr, r3, #AVAIL_S_L_N_DL_C -+ bpl 1f -+ vld1.16 {d0[0]}, [r6], r12 -+ vld1.16 {d2[0]}, [r10], r12 -+ vld1.16 {d0[1]}, [r6], r12 -+ vld1.16 {d2[1]}, [r10], r12 -+ vld1.16 {d0[2]}, [r6], r12 -+ vld1.16 {d2[2]}, [r10], r12 -+ vld1.16 {d0[3]}, [r6] -+ vld1.16 {d2[3]}, [r10] ++ it cs ++ vldmcs r6, {d4, d5} ++ ldr r12, [sp, #ur_size] ++ bpl 1f ++ cmp r12, #4 ++ vldm r5, {d6, d7} ++ bgt 1f ++ vdup.16 d7, d6[3] +1: -+ ldr lr, [sp, #dl_size] -+ bcc 2f -+ vld1.16 {d3[0]}, [r8], r12 -+ 
vld1.16 {d1[1]}, [r5], r12 -+ cmp lr, #p_size -+ vld1.16 {d3[1]}, [r8], r12 -+ bcc 10f -+ vld1.16 {d1[2]}, [r5], r12 -+ vld1.16 {d3[2]}, [r8], r12 -+ vld1.16 {d1[3]}, [r5] -+ vld1.16 {d3[3]}, [r8] ++ lsls r12, r7, #AVAIL_S_L_N_DL_C ++ vdup.16 q1, d0[0] ++ bpl 1f ++ vld1.16 {d0[0]}, [r10], r9 ++ vld1.16 {d0[1]}, [r3], r9 ++ vld1.16 {d0[2]}, [r10], r9 ++ vld1.16 {d0[3]}, [r3], r9 ++ vld1.16 {d1[0]}, [r10], r9 ++ vld1.16 {d1[1]}, [r3], r9 ++ vld1.16 {d1[2]}, [r10] ++ vld1.16 {d1[3]}, [r3] ++1: ++ bcc 1f ++ ldr r12, [sp, #dl_size] ++ vld1.16 {d2[1]}, [r4], r9 ++ cmp r12, #p_size ++ vld1.16 {d2[2]}, [r8], r9 ++ vld1.16 {d2[3]}, [r4], r9 ++ blt 2f ++ vld1.16 {d3[0]}, [r8], r9 ++ vld1.16 {d3[1]}, [r4], r9 ++ vld1.16 {d3[2]}, [r8] ++ vld1.16 {d3[3]}, [r4] ++ b 1f +2: -+ vext.16 q3, q8, q9, #7 -+ vext.16 q10, q2, q8, #7 -+ tst r2, #FILTER_LIGHT -+ vzip.16 q0, q1 -+ beq 3f ++ vdup.16 d3, d2[3] ++1: ++ tst r2, #FILTER_LIGHT ++ add r12, r0, #-pw ++ beq 10f + + @ Luma light filter -+ vadd.i16 q3, q9 -+ vext.16 q11, q0, q1, #7 -+ vext.16 q2, q2, q0, #7 -+ vadd.i16 q8, q10 -+ vadd.i16 q10, q11, q1 -+ vadd.i16 q0, q2 -+ vext.16 q11, q3, q3, #1 -+ vadd.i16 d4, d16, d0 @ d4[0] = l[0] + 2ul + u[0] -+ vmov.u16 r4, d19[3] @ Save final pel -+ vext.16 q9, q10, q10, #1 -+ vext.16 q12, q8, q3, #1 -+ vext.16 q13, q0, q10, #1 -+ vadd.i16 q3, q11 -+ vadd.i16 q10, q9 -+ vadd.i16 q8, q12 -+ vadd.i16 q0, q13 -+ vmov.u16 r5, d3[3] @ Save final pel -+ vrshr.u16 d4, d4, #2 -+ vrshr.u16 q9, q3, #2 -+ vrshr.u16 q1, q10, #2 -+ vrshr.u16 q8, #2 -+ vrshr.u16 q0, #2 -+ vmov.16 d19[3], r4 @ Restore final pel -+ vmov.16 d3[3], r5 @ Restore final pel -+ vdup.16 d5, d4[0] -+3: -+ vst1.16 {d5[3]}, [r0]! 
-+ vst1.16 {q8-q9}, [r1] -+ vst1.16 {q0-q1}, [r0] -+ pop {r4-r10, pc} ++ vext.16 q9, q2, q3, #7 ++ vext.16 q8, q15, q2, #7 ++ vext.16 q13, q0, q1, #7 ++ vext.16 q12, q15, q0, #7 ++ vadd.u16 q9, q3 ++ vadd.u16 q8, q2 ++ vadd.u16 q13, q1 ++ vadd.u16 q12, q0 ++ vmov.u16 r3, d7[3] @ Save final pel ++ vmov.u16 r2, d3[3] @ Save final pel ++ ++ vext.16 q2, q8, q9, #1 ++ vext.16 q3, q9, q9, #1 ++ vext.16 q0, q12, q13, #1 ++ vext.16 q1, q13, q13, #1 ++ vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0] ++ vadd.u16 q2, q8 ++ vadd.u16 q3, q9 ++ vadd.u16 q0, q12 ++ vadd.u16 q1, q13 ++ ++ vrshr.u16 q2, #2 ++ vrshr.u16 q3, #2 ++ vrshr.u16 q0, #2 ++ vrshr.u16 q1, #2 ++ vrshr.u16 d30, #2 ++ vmov.u16 d7[3], r3 @ Restore final pel ++ vmov.u16 d3[3], r2 @ Restore final pel ++ vdup.u16 d31, d30[0] @ d31[3] = d30[0] + +10: -+A ldrh r9, [r8, -r12] -+T sub r9, r8, r12 -+T ldrh r9, [r9] -+ orr r9, r9, r9, lsl #16 -+ vmov.32 d1[1], r9 -+ vmov.32 d3[1], r9 -+ b 2b ++ vst1.16 {q2, q3}, [r1] @ Up ++ vst1.16 {d31[3]}, [r12] @ Up-left ++ vst1.16 {q0, q1}, [r0] @ Left ++ pop {r4-r10, pc} +endfunc + +@ int ff_hevc_rpi_intra_filter_16_neon_16( @@ -12823,163 +12822,152 @@ index 0000000000..7ea82b38fe +.set p_size, (1 << log2_s) @ size in pels + +function ff_hevc_rpi_intra_filter_16_neon_16, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 16, "d4[],d5[]", d17[3], "d18[],d19[]", "d22[],d23[]", 1, \ -+ "ldr r9, [sp, #ur_size]", \ -+ "sub r0, #pw" ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d16[],d17[]", "d20[],d21[]" + -+ vmov q10, q9 -+ ldr lr, [sp, #dl_size] -+ vmov q12, q11 -+ it cs -+ vldmcs r7, {q9-q10} ++ vdup.16 q9, d16[0] ++ vdup.16 q11, d20[0] ++ ++ it cs ++ vldmcs r6, {d16-d19} ++ ldr r12, [sp, #ur_size] ++ bpl 1f ++ cmp r12, #12 + @ Given chroma frame layout, if UR exists then it is always legit to + @ load all of it even if most of it is outside the frame. 
-+ itt mi -+ vldmmi r4, {q11-q12} -+ cmpmi r9, #p_size -+ bmi 10f -+1: -+ lsls r3, #AVAIL_S_L_N_DL_C -+ bpl 20f -+ vld1.16 {d0[0]}, [r6], r12 -+ vld1.16 {d2[0]}, [r10], r12 -+ vld1.16 {d0[1]}, [r6], r12 -+ vld1.16 {d2[1]}, [r10], r12 -+ vld1.16 {d0[2]}, [r6], r12 -+ vld1.16 {d2[2]}, [r10], r12 -+ vld1.16 {d0[3]}, [r6], r12 -+ vld1.16 {d2[3]}, [r10], r12 -+ vld1.16 {d1[0]}, [r6], r12 -+ vld1.16 {d3[0]}, [r10], r12 -+ vld1.16 {d1[1]}, [r6], r12 -+ vld1.16 {d3[1]}, [r10], r12 -+ vld1.16 {d1[2]}, [r6], r12 -+ vld1.16 {d3[2]}, [r10], r12 -+ vld1.16 {d1[3]}, [r6] -+ vld1.16 {d3[3]}, [r10] -+2: bcc 30f -+ vld1.16 {d6[0]}, [r8], r12 -+ vld1.16 {d4[1]}, [r5], r12 -+ cmp lr, #p_size -+ vld1.16 {d6[1]}, [r8], r12 -+ bcc 40f -+ vld1.16 {d4[2]}, [r5], r12 -+ vld1.16 {d6[2]}, [r8], r12 -+ vld1.16 {d4[3]}, [r5], r12 -+ vld1.16 {d6[3]}, [r8], r12 -+ vld1.16 {d5[0]}, [r5], r12 -+ vld1.16 {d7[0]}, [r8], r12 -+ vld1.16 {d5[1]}, [r5], r12 -+ vld1.16 {d7[1]}, [r8], r12 -+ vld1.16 {d5[2]}, [r5], r12 -+ vld1.16 {d7[2]}, [r8], r12 -+ vld1.16 {d5[3]}, [r5] -+ vld1.16 {d7[3]}, [r8] -+3: -+ vzip.16 q0, q1 -+ tst r2, #FILTER_LIGHT -+ vzip.16 q2, q3 -+ beq 4f ++ vldm r5, {d20-d23} ++ bgt 1f ++ bge 4f ++ cmp r5, #8 ++ bge 3f ++ vdup.16 d21, d20[3] ++3: vdup.16 d22, d21[3] ++4: vdup.16 d23, d22[3] + -+ vext.16 q13, q8, q0, #7 -+ vadd.i16 q13, q0 -+ vext.16 q0, q0, q1, #7 -+ vadd.i16 q0, q1 -+ vext.16 q1, q1, q2, #7 -+ vadd.i16 q1, q2 -+ vext.16 q2, q2, q3, #7 -+ vadd.i16 q2, q3 -+ vext.16 q14, q8, q9, #7 -+ vadd.i16 q14, q9 -+ vext.16 q9, q9, q10, #7 -+ vadd.i16 q9, q10 -+ vext.16 q10, q10, q11, #7 -+ vadd.i16 q10, q11 -+ vext.16 q11, q11, q12, #7 -+ vadd.i16 q11, q12 -+ vadd.i16 d17, d26, d28 @ d17[0] = l[0] + 2ul + u[0] -+ vmov.u16 r4, d7[3] @ Save final pel -+ vext.16 q3, q2, q2, #1 -+ vadd.i16 q3, q2 -+ vext.16 q2, q1, q2, #1 -+ vadd.i16 q2, q1 -+ vext.16 q1, q0, q1, #1 -+ vadd.i16 q1, q0 -+ vext.16 q0, q13, q0, #1 -+ vadd.i16 q0, q13 -+ vext.16 q13, q11, q11, #1 -+ vadd.i16 q13, q11 -+ 
vext.16 q11, q10, q11, #1 -+ vadd.i16 q11, q10 -+ vext.16 q10, q9, q10, #1 -+ vadd.i16 q10, q9 -+ vext.16 q9, q14, q9, #1 -+ vadd.i16 q9, q14 -+ vrshr.u16 d17, #2 -+ vmov.u16 r5, d25[3] @ Save final pel -+ vrshr.u16 q3, #2 -+ vrshr.u16 q12, q13, #2 -+ vrshr.u16 q0, #2 -+ vrshr.u16 q1, #2 -+ vrshr.u16 q2, #2 -+ vrshr.u16 q9, #2 ++1: ++ lsls r7, #AVAIL_S_L_N_DL_C ++ ldr r12, [sp, #dl_size] ++ vdup.16 q1, d0[0] ++ vdup.16 q2, d0[0] ++ vdup.16 q3, d0[0] ++ bpl 1f ++ vld1.16 {d0[0]}, [r10], r9 ++ vld1.16 {d0[1]}, [r3], r9 ++ vld1.16 {d0[2]}, [r10], r9 ++ vld1.16 {d0[3]}, [r3], r9 ++ vld1.16 {d1[0]}, [r10], r9 ++ vld1.16 {d1[1]}, [r3], r9 ++ vld1.16 {d1[2]}, [r10], r9 ++ vld1.16 {d1[3]}, [r3], r9 ++ vld1.16 {d2[0]}, [r10], r9 ++ vld1.16 {d2[1]}, [r3], r9 ++ vld1.16 {d2[2]}, [r10], r9 ++ vld1.16 {d2[3]}, [r3], r9 ++ vld1.16 {d3[0]}, [r10], r9 ++ vld1.16 {d3[1]}, [r3], r9 ++ vld1.16 {d3[2]}, [r10] ++ vld1.16 {d3[3]}, [r3] ++1: ++ bcc 1f ++ vld1.16 {d4[1]}, [r4], r9 ++ cmp r12, #4 ++ vld1.16 {d4[2]}, [r8], r9 ++ vld1.16 {d4[3]}, [r4], r9 ++ ble 2f ++ vld1.16 {d5[0]}, [r8], r9 ++ vld1.16 {d5[1]}, [r4], r9 ++ cmp r12, #12 ++ vld1.16 {d5[2]}, [r8], r9 ++ vld1.16 {d5[3]}, [r4], r9 ++ blt 3f ++ vld1.16 {d6[0]}, [r8], r9 ++ vld1.16 {d6[1]}, [r4], r9 ++ vld1.16 {d6[2]}, [r8], r9 ++ vld1.16 {d6[3]}, [r4], r9 ++ ble 4f ++ vld1.16 {d7[0]}, [r8], r9 ++ vld1.16 {d7[1]}, [r4], r9 ++ vld1.16 {d7[2]}, [r8] ++ vld1.16 {d7[3]}, [r4] ++ b 1f ++2: vdup.16 d5, d4[3] ++3: vdup.16 d6, d5[3] ++4: vdup.16 d7, d6[3] ++1: ++ tst r2, #FILTER_LIGHT ++ add r12, r0, #-pw ++ beq 10f ++ ++ vpush {q5} ++ @ Luma light filter ++ @ Left ++ vext.16 q5, q2, q3, #7 ++ vext.16 q14, q1, q2, #7 ++ vext.16 q13, q0, q1, #7 ++ vext.16 q12, q15, q0, #7 ++ ++ vadd.u16 q5, q3 ++ vadd.u16 q14, q2 ++ vadd.u16 q13, q1 ++ vadd.u16 q12, q0 ++ vmov.u16 r2, d7[3] @ Save final pel ++ ++ vext.16 q0, q12, q13, #1 ++ vext.16 q1, q13, q14, #1 ++ vext.16 q2, q14, q5, #1 ++ vext.16 q3, q5, q5, #1 ++ ++ vmov d30, d24 @ d30[0] = l[0] + 
ul ++ vadd.u16 q0, q12 ++ vadd.u16 q1, q13 ++ vadd.u16 q2, q14 ++ vadd.u16 q3, q5 ++ ++ vrshr.u16 q0, #2 ++ vrshr.u16 q1, #2 ++ vrshr.u16 q2, #2 ++ vrshr.u16 q3, #2 ++ ++ @ Up ++ vext.16 q5, q10, q11, #7 ++ vext.16 q14, q9, q10, #7 ++ vext.16 q13, q8, q9, #7 ++ vext.16 q12, q15, q8, #7 ++ ++ vadd.u16 q5, q11 ++ vadd.u16 q14, q10 ++ vadd.u16 q13, q9 ++ vadd.u16 q12, q8 ++ vmov.u16 r3, d23[3] @ Save final pel ++ ++ vext.16 q8, q12, q13, #1 ++ vext.16 q9, q13, q14, #1 ++ vext.16 q10, q14, q5, #1 ++ vext.16 q11, q5, q5, #1 ++ ++ vadd.u16 d30, d24 @ d30[0] = l[0] + 2ul + u[0] ++ vadd.u16 q8, q12 ++ vadd.u16 q9, q13 ++ vadd.u16 q10, q14 ++ vadd.u16 q11, q5 ++ ++ vrshr.u16 q8, #2 ++ vrshr.u16 q9, #2 + vrshr.u16 q10, #2 + vrshr.u16 q11, #2 -+ vdup.16 d17, d17[0] -+ vmov.16 d7[3], r4 @ Restore final pel -+ vmov.16 d25[3], r5 @ Restore final pel -+4: -+ vst1.16 {d17[3]}, [r0]! -+ vst1.16 {q9-q10}, [r1]! -+ vst1.16 {q0-q1}, [r0]! -+ vst1.16 {q11-q12}, [r1] -+ vst1.16 {q2-q3}, [r0] -+ pop {r4-r10, pc} + -+10: cmp r9, #8 -+ bhi 12f -+ beq 11f -+ vdup.16 d21, d20[3] -+11: vdup.16 d22, d21[3] -+12: vdup.16 d23, d22[3] -+ b 1b ++ @ Misc ++ vrshr.u16 d30, #2 ++ vmov.u16 d7[3], r2 @ Restore final pel ++ vmov.u16 d23[3], r3 @ Restore final pel ++ vdup.u16 d31, d30[0] @ d31[3] = d30[0] ++ vpop {q5} + -+20: vmov q0, q2 -+ vmov q1, q2 -+ b 2b -+ -+30: vmov q3, q2 -+ b 3b -+ -+40: cmp lr, #8 -+ bhi 42f -+ beq 41f -+ vdup.16 d5, d6[1] -+ vdup.16 d7, d6[1] -+ vmov.f32 s9, s10 -+ vmov.f32 s13, s10 -+ b 3b -+41: vld1.16 {d4[2]}, [r5], r12 -+ vld1.16 {d6[2]}, [r8], r12 -+ vld1.16 {d4[3]}, [r5] -+ vld1.16 {d6[3]}, [r8] -+ vdup.16 d5, d6[3] -+ vdup.16 d7, d6[3] -+ b 3b -+42: vld1.16 {d4[2]}, [r5], r12 -+ vld1.16 {d6[2]}, [r8], r12 -+ vld1.16 {d4[3]}, [r5], r12 -+ vld1.16 {d6[3]}, [r8], r12 -+ vld1.16 {d5[0]}, [r5], r12 -+ ldrh lr, [r8, r12] -+ vld1.16 {d7[0]}, [r8], r12 -+ vld1.16 {d5[1]}, [r5] -+ vld1.16 {d7[1]}, [r8] -+ orr lr, lr, lr, lsl #16 -+ vmov s11, lr -+ vmov s15, lr -+ b 3b ++10: ++ 
vstm r1, {d16-d23} @ Up ++ vst1.16 {d31[3]}, [r12] @ Up-left ++ vstm r0, { d0-d7 } @ Left ++ pop {r4-r10, pc} +endfunc + +@ int ff_hevc_rpi_intra_filter_4_neon_32( @@ -13000,31 +12988,31 @@ index 0000000000..7ea82b38fe +.set log2_s, 2 + +function ff_hevc_rpi_intra_filter_4_neon_32, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d16[0], "d4[],d5[]", "d6[],d7[]", 0, \ -+ "vmov q1, q0" ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d4[],d5[]", "d6[],d7[]" + -+ sub r3, r0, #pw -+ it mi -+ vldmmi r4, {d6, d7} -+ it cs -+ vldmcs r7, {d4, d5} -+ lsls lr, r2, #AVAIL_S_L_N_DL_C -+ bpl 1f -+ vld1.32 {d0[0]}, [r6], r12 -+ vld1.32 {d0[1]}, [r10], r12 -+ vld1.32 {d1[0]}, [r6] -+ vld1.32 {d1[1]}, [r10] ++ it cs ++ vldmcs r6, {d4, d5} ++ it mi ++ vldmmi r5, {d6, d7} ++ lsls r7, #AVAIL_S_L_N_DL_C ++ vdup.32 q1, d0[0] ++ add r12, r0, #-pw ++ bpl 1f ++ vld1.32 {d0[0]}, [r10], r9 ++ vld1.32 {d0[1]}, [r3], r9 ++ vld1.32 {d1[0]}, [r10] ++ vld1.32 {d1[1]}, [r3] +1: -+ bcc 1f -+ vld1.32 {d2[1]}, [r8], r12 -+ vld1.32 {d3[0]}, [r5] -+ vld1.32 {d3[1]}, [r8] ++ bcc 1f ++ vld1.32 {d2[1]}, [r4], r9 ++ vld1.32 {d3[0]}, [r8] ++ vld1.32 {d3[1]}, [r4] +1: -+ vst1.32 {d16[0]}, [r3] -+ vst1.32 {q2, q3}, [r1] -+ vst1.32 {q0, q1}, [r0] -+ pop {r4-r10, pc} ++ vst1.32 {q2, q3 }, [r1] @ Up ++ vst1.32 {d31[1]}, [r12] ++ vst1.32 {q0, q1 }, [r0] @ Left ++ pop {r4-r10, pc} +endfunc + + @@ -13050,57 +13038,54 @@ index 0000000000..7ea82b38fe + +function ff_hevc_rpi_intra_filter_8_neon_32, export=1 + push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d16[],d17[]", "d20[],d21[]", 0, \ -+ "vmov r3, s0" ++ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d16[],d17[]", "d20[],d21[]" + -+ vmov q9, q8 -+ ldr r9, [r4, #3*4] -+ vmov q11, q10 -+ ldr lr, [sp, #ur_size] -+ it cs -+ vldmcs r7, {q8, q9} -+ ittt mi -+ vldmmi r4, {q10, q11} -+ cmpmi lr, #p_size -+ vdupmi.32 q11, r9 -+ lsls lr, r2, 
#AVAIL_S_L_N_DL_C -+ vdup.32 q1, r3 -+ vdup.32 q2, r3 -+ vdup.32 q3, r3 -+ it cs -+ ldrcs r9, [r8, r12] ++ vdup.32 q9, d16[0] ++ vdup.32 q11, d20[0] ++ ++ it cs ++ vldmcs r6, {q8, q9 } ++ ldr r12, [sp, #ur_size] + bpl 1f -+ vld1.32 {d0[0]}, [r6], r12 -+ vld1.32 {d0[1]}, [r10], r12 -+ vld1.32 {d1[0]}, [r6], r12 -+ vld1.32 {d1[1]}, [r10], r12 -+ vld1.32 {d2[0]}, [r6], r12 -+ vld1.32 {d2[1]}, [r10], r12 -+ vld1.32 {d3[0]}, [r6] -+ vld1.32 {d3[1]}, [r10] ++ cmp r12, #p_size ++ vldm r5, {q10, q11} ++ bge 1f ++ vdup.32 q11, d21[1] ++1: ++ lsls r7, #AVAIL_S_L_N_DL_C ++ vdup.32 q1, d0[0] ++ vdup.32 q2, d0[0] ++ vdup.32 q3, d0[0] ++ bpl 1f ++ vld1.32 {d0[0]}, [r10], r9 ++ vld1.32 {d0[1]}, [r3], r9 ++ vld1.32 {d1[0]}, [r10], r9 ++ vld1.32 {d1[1]}, [r3], r9 ++ vld1.32 {d2[0]}, [r10], r9 ++ vld1.32 {d2[1]}, [r3], r9 ++ vld1.32 {d3[0]}, [r10] ++ vld1.32 {d3[1]}, [r3] +1: -+ ldr lr, [sp, #dl_size] -+ bcc 2f -+ vld1.32 {d4[1]}, [r8], r12 -+ vld1.32 {d5[0]}, [r5], r12 -+ cmp lr, #p_size -+ vld1.32 {d5[1]}, [r8], r12 + bcc 1f -+ vld1.32 {d6[0]}, [r5], r12 -+ vld1.32 {d6[1]}, [r8], r12 -+ vld1.32 {d7[0]}, [r5] -+ vld1.32 {d7[1]}, [r8] -+1: -+ it cc -+ vdupcc.32 q3, r9 ++ ldr r12, [sp, #dl_size] ++ vld1.32 {d4[1]}, [r4], r9 ++ cmp r12, #p_size ++ vld1.32 {d5[0]}, [r8], r9 ++ vld1.32 {d5[1]}, [r4], r9 ++ blt 2f ++ vld1.32 {d6[0]}, [r8], r9 ++ vld1.32 {d6[1]}, [r4], r9 ++ vld1.32 {d7[0]}, [r8] ++ vld1.32 {d7[1]}, [r4] ++ b 1f +2: -+ vst1.32 {q8-q9}, [r1]! -+ sub r3, r0, #pw -+ vst1.32 {q0-q1}, [r0]! 
-+ vst1.32 {q10-q11}, [r1] -+ vst1.32 {q2-q3}, [r0] -+ vst1.32 {d31[1]}, [r3] -+ pop {r4-r10, pc} ++ vdup.32 q3, d5[1] ++1: ++ add r12, r0, #-pw ++ vstm r1, { q8-q11} @ Up ++ vst1.32 {d31[1]}, [r12] ++ vstm r0, { q0-q3 } @ Left ++ pop {r4-r10, pc} +endfunc + + @@ -13125,131 +13110,116 @@ index 0000000000..7ea82b38fe +.set p_size, (1 << log2_s) @ size in pels + +function ff_hevc_rpi_intra_filter_16_neon_32, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 32, d30[0], d30[1], d31[0], d31[1], 1, \ -+ "ldr r9, [sp, #ur_size]", \ -+ "sub r0, #pw" ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 32, d30[0], d30[1], d31[0], d31[1] + + @ Once we get this big we have run out of neon regs to store + @ everything at once so do in pieces + -+ @ Up and/or up-right (have) -+ add lr, r1, #(pw << log2_s) -+ bcc 1f -+ vldm r7, {q0-q3} -+ vstm r1, {q0-q3} ++ @ Up (have) ++ it cs ++ vldmcs r6, { q0-q3 } ++ ldr r12, [sp, #ur_size] ++ it mi ++ vldmmi r5, { q8-q11} ++ it cs ++ vstmcs r1, { q0-q3 } ++ bpl 1f ++ cmp r12, #12 ++ add lr, r1, #(pw << log2_s) ++ bgt 2f ++ cmp r12, #8 ++ bge 3f ++ vdup.16 q9, d17[1] ++4: vdup.16 d10, d19[1] ++3: vdup.16 q11, d21[1] ++2: vstm lr, { q8-q11} +1: -+ bpl 3f -+ vldm r4, {q8-q11} -+ cmp r9, #16 -+ blo 10f -+2: vstm lr, {q8-q11} -+3: -+ @ Up-left -+ vst1.32 {d30[1]}, [r0]! 
+ -+ @ Left and/or down-left (have) -+ lsls lr, r3, #AVAIL_S_L_N_DL_C -+ ldr r9, [sp, #dl_size] -+ bpl 4f -+ vld1.32 {d0[0]}, [r6], r12 -+ vld1.32 {d0[1]}, [r10], r12 -+ vld1.32 {d1[0]}, [r6], r12 -+ vld1.32 {d1[1]}, [r10], r12 -+ vld1.32 {d2[0]}, [r6], r12 -+ vld1.32 {d2[1]}, [r10], r12 -+ vld1.32 {d3[0]}, [r6], r12 -+ vld1.32 {d3[1]}, [r10], r12 -+ vld1.32 {d4[0]}, [r6], r12 -+ vld1.32 {d4[1]}, [r10], r12 -+ vld1.32 {d5[0]}, [r6], r12 -+ vld1.32 {d5[1]}, [r10], r12 -+ vld1.32 {d6[0]}, [r6], r12 -+ vld1.32 {d6[1]}, [r10], r12 -+ vld1.32 {d7[0]}, [r6] -+ vld1.32 {d7[1]}, [r10] -+ vstm r0, {q0-q3} -+4: add lr, r0, #(pw << log2_s) -+ bcc 6f -+ vdup.32 d16, d30[0] -+ vld1.32 {d16[1]}, [r8], r12 -+ vld1.32 {d17[0]}, [r5], r12 -+ cmp r9, #16 -+ vld1.32 {d17[1]}, [r8], r12 -+ blo 20f -+ vld1.32 {d18[0]}, [r5], r12 -+ vld1.32 {d18[1]}, [r8], r12 -+ vld1.32 {d19[0]}, [r5], r12 -+ vld1.32 {d19[1]}, [r8], r12 -+ vld1.32 {d20[0]}, [r5], r12 -+ vld1.32 {d20[1]}, [r8], r12 -+ vld1.32 {d21[0]}, [r5], r12 -+ vld1.32 {d21[1]}, [r8], r12 -+ vld1.32 {d22[0]}, [r5], r12 -+ vld1.32 {d22[1]}, [r8], r12 -+ vld1.32 {d23[0]}, [r5] -+ vld1.32 {d23[1]}, [r8] -+5: vstm lr, {q8-q11} -+6: -+ eors r3, r2 @ (req & avail) ^ req = (req & ~avail) -+ bne 7f -+ pop {r4-r10, pc} -+7: -+ @ Up and/or up-right (don't have) -+ vdup.32 q0, d31[0] -+ lsls lr, r3, #AVAIL_S_UR_N_U_C -+ vdup.32 q1, d31[0] -+ add lr, r1, #(pw << log2_s) -+ vdup.32 q8, d31[1] -+ vdup.32 q9, d31[1] -+ it cs -+ vstmcs r1!, {q0-q1} -+ it mi -+ vstmmi lr!, {q8-q9} -+ it cs -+ vstmcs r1, {q0-q1} -+ it mi -+ vstmmi lr, {q8-q9} ++ @ Left (have) ++ add lr, r0, #-pw ++ lsls r12, r7, #AVAIL_S_L_N_DL_C ++ vst1.32 {d30[1]}, [lr] @ UL ++ bpl 1f ++ vld1.32 { d0[0]}, [r10], r9 ++ vld1.32 { d0[1]}, [r3], r9 ++ vld1.32 { d1[0]}, [r10], r9 ++ vld1.32 { d1[1]}, [r3], r9 ++ vld1.32 { d2[0]}, [r10], r9 ++ vld1.32 { d2[1]}, [r3], r9 ++ vld1.32 { d3[0]}, [r10], r9 ++ vld1.32 { d3[1]}, [r3], r9 ++ vld1.32 { d4[0]}, [r10], r9 ++ vld1.32 { d4[1]}, [r3], 
r9 ++ vld1.32 { d5[0]}, [r10], r9 ++ vld1.32 { d5[1]}, [r3], r9 ++ vld1.32 { d6[0]}, [r10], r9 ++ vld1.32 { d6[1]}, [r3], r9 ++ vld1.32 { d7[0]}, [r10] ++ vld1.32 { d7[1]}, [r3] ++ vstm r0, { q0-q3 } ++1: ++ bcc 1f ++ ldr r12, [sp, #dl_size] ++ vdup.32 d16, d30[0] @ d16[0] = d30[0] ++ add lr, r0, #(pw << log2_s) ++ vld1.32 {d16[1]}, [r4], r9 ++ cmp r12, #4 ++ vld1.32 {d17[0]}, [r8], r9 ++ vld1.32 {d17[1]}, [r4], r9 ++ ble 2f ++ vld1.32 {d18[0]}, [r8], r9 ++ vld1.32 {d18[1]}, [r4], r9 ++ cmp r12, #12 ++ vld1.32 {d19[0]}, [r8], r9 ++ vld1.32 {d19[1]}, [r4], r9 ++ blt 3f ++ vld1.32 {d20[0]}, [r8], r9 ++ vld1.32 {d20[1]}, [r4], r9 ++ vld1.32 {d21[0]}, [r8], r9 ++ vld1.32 {d21[1]}, [r4], r9 ++ ble 4f ++ vld1.32 {d22[0]}, [r8], r9 ++ vld1.32 {d22[1]}, [r4], r9 ++ vld1.32 {d23[0]}, [r8] ++ vld1.32 {d23[1]}, [r4] ++ b 5f ++2: vdup.32 q9, d17[1] ++3: vdup.32 q10, d19[1] ++4: vdup.32 q11, d21[1] ++5: vstm lr, { q8-q11} ++1: ++ eors r7, r2 ++ beq 99f + -+ @ Left and/or down-left (don't have) -+ vdup.32 q0, d30[0] -+ lsls lr, r3, #AVAIL_S_L_N_DL_C -+ vdup.32 q1, d30[0] -+ add lr, r0, #(pw << log2_s) -+ it mi -+ vstmmi r0!, {q0-q1} -+ it cs -+ vstmcs lr!, {q0-q1} -+ it mi -+ vstmmi r0, {q0-q1} -+ it cs -+ vstmcs lr, {q0-q1} -+ pop {r4-r10, pc} ++ lsls r12, r7, #AVAIL_S_UR_N_U_C ++ vdup.32 q0, d31[0] ++ vdup.32 q1, d31[0] ++ vdup.32 q2, d31[0] ++ vdup.32 q3, d31[0] ++ add lr, r1, #(pw << log2_s) ++ vdup.32 q8, d31[1] ++ vdup.32 q9, d31[1] ++ vdup.32 q10, d31[1] ++ vdup.32 q11, d31[1] ++ it cs ++ vstmcs r1, { q0-q3 } ++ it mi ++ vstmmi lr, { q8-q11} + -+10: cmp r9, #8 -+ bhi 12f -+ beq 11f -+ vdup.32 q9, d17[1] -+11: vdup.32 q10, d19[1] -+12: vdup.32 q11, d21[1] -+ b 2b ++ lsls r7, #AVAIL_S_L_N_DL_C ++ vdup.32 q0, d30[0] ++ vdup.32 q1, d30[0] ++ vdup.32 q2, d30[0] ++ vdup.32 q3, d30[0] ++ add lr, r0, #(pw << log2_s) ++ it mi ++ vstmmi r0, { q0-q3 } ++ it cs ++ vstmcs lr, { q0-q3 } + -+20: cmp r9, #8 -+ blo 21f -+ vld1.32 {d18[0]}, [r5], r12 -+ vld1.32 {d18[1]}, [r8], r12 -+ 
vld1.32 {d19[0]}, [r5], r12 -+ vld1.32 {d19[1]}, [r8], r12 -+ beq 22f -+ vld1.32 {d20[0]}, [r5], r12 -+ vld1.32 {d20[1]}, [r8], r12 -+ vld1.32 {d21[0]}, [r5] -+ vld1.32 {d21[1]}, [r8] -+ b 23f -+21: vdup.32 q9, d17[1] -+22: vdup.32 q10, d19[1] -+23: vdup.32 q11, d21[1] -+ b 5b ++99: ++ pop {r4-r10, pc} +endfunc + + @@ -19949,10 +19919,10 @@ index 0000000000..4b4d032a16 +#endif /* AVCODEC_RPI_HEVC_PARSE_H */ diff --git a/libavcodec/rpi_hevc_ps.c b/libavcodec/rpi_hevc_ps.c new file mode 100644 -index 0000000000..0866a26702 +index 0000000000..891e3a900c --- /dev/null +++ b/libavcodec/rpi_hevc_ps.c -@@ -0,0 +1,1940 @@ +@@ -0,0 +1,1936 @@ +/* + * HEVC Parameter Set decoding + * @@ -20890,7 +20860,7 @@ index 0000000000..0866a26702 +static int ff_hevc_rpi_parse_sps(HEVCRpiSPS * const sps, GetBitContext * const gb, unsigned int * const sps_id, + const int apply_defdispwin, AVBufferRef * const * const vps_list, AVCodecContext * const avctx) +{ -+ HEVCWindow *ow; ++ HEVCRpiWindow *ow; + int ret = 0; + int log2_diff_max_min_transform_block_size; + int bit_depth_chroma, start, vui_present, sublayer_ordering_info; @@ -20980,7 +20950,6 @@ index 0000000000..0866a26702 + sps->bit_depth, bit_depth_chroma); + return AVERROR_INVALIDDATA; + } -+ sps->bit_depth_chroma = bit_depth_chroma; + + ret = map_pixel_format(sps); + if (ret < 0) @@ -21173,11 +21142,7 @@ index 0000000000..0866a26702 + "extended_precision_processing_flag not yet implemented\n"); + + sps->intra_smoothing_disabled_flag = get_bits1(gb); -+ sps->high_precision_offsets_enabled_flag = get_bits1(gb); -+ if (sps->high_precision_offsets_enabled_flag) -+ av_log(avctx, AV_LOG_WARNING, -+ "high_precision_offsets_enabled_flag not fully implemented\n"); -+ ++ sps->high_precision_offsets_enabled_flag = get_bits1(gb); + sps->persistent_rice_adaptation_enabled_flag = get_bits1(gb); + + cabac_bypass_alignment_enabled_flag = get_bits1(gb); @@ -21224,6 +21189,7 @@ index 0000000000..0866a26702 + sps->tb_mask = (1 << (sps->log2_ctb_size 
- sps->log2_min_tb_size)) - 1; + + sps->qp_bd_offset = 6 * (sps->bit_depth - 8); ++ sps->wp_offset_half_range = (1U << (sps->high_precision_offsets_enabled_flag ? sps->bit_depth - 1 : 7)); + + if (av_mod_uintp2(sps->width, sps->log2_min_cb_size) || + av_mod_uintp2(sps->height, sps->log2_min_cb_size)) { @@ -21895,10 +21861,10 @@ index 0000000000..0866a26702 +} diff --git a/libavcodec/rpi_hevc_ps.h b/libavcodec/rpi_hevc_ps.h new file mode 100644 -index 0000000000..11d9e26853 +index 0000000000..712464a075 --- /dev/null +++ b/libavcodec/rpi_hevc_ps.h -@@ -0,0 +1,444 @@ +@@ -0,0 +1,447 @@ +/* + * HEVC parameter set parsing + * @@ -22013,26 +21979,26 @@ index 0000000000..11d9e26853 + int8_t slice_qp; + + uint8_t luma_log2_weight_denom; -+ int16_t chroma_log2_weight_denom; -+ -+ int16_t luma_weight_l0[16]; -+ int16_t chroma_weight_l0[16][2]; -+ int16_t chroma_weight_l1[16][2]; -+ int16_t luma_weight_l1[16]; ++ uint8_t chroma_log2_weight_denom; + ++ int16_t luma_weight_l0[16]; // -128, +255 + int16_t luma_offset_l0[16]; ++ int16_t chroma_weight_l0[16][2]; + int16_t chroma_offset_l0[16][2]; + ++ int16_t luma_weight_l1[16]; + int16_t luma_offset_l1[16]; ++ int16_t chroma_weight_l1[16][2]; + int16_t chroma_offset_l1[16][2]; ++ +} RpiSliceHeader; + -+typedef struct HEVCWindow { -+ unsigned int left_offset; -+ unsigned int right_offset; -+ unsigned int top_offset; -+ unsigned int bottom_offset; -+} HEVCWindow; ++typedef struct HEVCRpiWindow { ++ uint16_t left_offset; ++ uint16_t right_offset; ++ uint16_t top_offset; ++ uint16_t bottom_offset; ++} HEVCRpiWindow; + +typedef struct VUI { + AVRational sar; @@ -22057,7 +22023,7 @@ index 0000000000..11d9e26853 + int frame_field_info_present_flag; + + int default_display_window_flag; -+ HEVCWindow def_disp_win; ++ HEVCRpiWindow def_disp_win; + + int vui_timing_info_present_flag; + uint32_t vui_num_units_in_tick; @@ -22129,16 +22095,19 @@ index 0000000000..11d9e26853 + +typedef struct HEVCRpiSPS { + unsigned vps_id; -+ int 
chroma_format_idc; ++ uint8_t chroma_format_idc; + uint8_t separate_colour_plane_flag; + -+ HEVCWindow output_window; ++ HEVCRpiWindow output_window; + -+ HEVCWindow pic_conf_win; ++ HEVCRpiWindow pic_conf_win; + -+ int bit_depth; -+ int bit_depth_chroma; -+ int pixel_shift; ++ uint16_t wp_offset_half_range; // WpOffsetHalfRange ++ ++ uint8_t bit_depth; ++ ++// int bit_depth_chroma; // We only support lum_bit_depth = chroma_bit_depth ++ uint8_t pixel_shift; + enum AVPixelFormat pix_fmt; + + unsigned int log2_max_poc_lsb; @@ -23351,10 +23320,10 @@ index 0000000000..d4ac348df9 +#endif /* AVCODEC_RPI_HEVC_SEI_H */ diff --git a/libavcodec/rpi_hevc_shader.c b/libavcodec/rpi_hevc_shader.c new file mode 100644 -index 0000000000..fe506c8ad0 +index 0000000000..23b49a99ae --- /dev/null +++ b/libavcodec/rpi_hevc_shader.c -@@ -0,0 +1,1570 @@ +@@ -0,0 +1,1537 @@ +#include "rpi_hevc_shader.h" + +#ifdef _MSC_VER @@ -23382,1544 +23351,1511 @@ index 0000000000..fe506c8ad0 +// ::mc_start +/* [0x00000000] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) +// ::mc_setup_c_qn -+/* [0x00000008] */ 0x00000001, 0xe0020927, // mov tmurs, 1 -+/* [0x00000010] */ 0x15827d80, 0x10020027, // mov ra0, unif -+/* [0x00000018] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] -+/* [0x00000020] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30 -+/* [0x00000028] */ 0x15827d80, 0x10020627, // mov ra_base, unif -+/* [0x00000030] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1 -+/* [0x00000038] */ 0x119c11c0, 0xd0021667, // shl rb_max_x, r0, v_x_shift -+/* [0x00000040] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 -+/* [0x00000048] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100 -+/* [0x00000050] */ 0x000000ff, 0xe00215a7, // mov rb_pmask, v_pmask -+/* [0x00000058] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) -+/* [0x00000060] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif ++/* [0x00000008] */ 0x95801ff6, 
0xd0025900, // mov tmurs, 1 ; mov ra0, unif ++/* [0x00000010] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++/* [0x00000018] */ 0x9181e1f6, 0xd00250d8, // shl rb_ef, r0, i_shift30 ; mov ra_base, unif ++/* [0x00000020] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1 ++/* [0x00000028] */ 0x119c11c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift ++/* [0x00000030] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 ++/* [0x00000038] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100 ++/* [0x00000040] */ 0x000000ff, 0xe0021627, // mov rb_pmask, v_pmask ++/* [0x00000048] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++/* [0x00000050] */ 0x00004000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) ++/* [0x00000058] */ 0x4000000e, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) ++/* [0x00000060] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif +/* [0x00000068] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif +/* [0x00000070] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) -+/* [0x00000078] */ 0x0c9d03c0, 0x10021627, // add rb_dma1_base, r1, rb_pitch ++/* [0x00000078] */ 0x0c9d03c0, 0x10021667, // add rb_dma1_base, r1, rb_pitch +/* [0x00000080] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num -+/* [0x00000088] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5 ++/* [0x00000088] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5 +/* [0x00000090] */ 0x0c9a7180, 0x100210a7, // add rb_elem_x, r0, elem_num +/* [0x00000098] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift +/* [0x000000a0] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x -+/* [0x000000a8] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a -+/* [0x000000b0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x000000a8] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a ++/* [0x000000b0] */ 
0x129da1c0, 0x10020827, // min r0, r0, rb_max_x +/* [0x000000b8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 +/* [0x000000c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 +/* [0x000000c8] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch +/* [0x000000d0] */ 0x149e7040, 0x10020867, // and r1, r0, r1 -+/* [0x000000d8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x000000e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x000000d8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000000e0] */ 0x8c827076, 0x10025800, // add r0, r0, r1 ; mov ra0, unif +/* [0x000000e8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0 -+/* [0x000000f0] */ 0x0c80ff80, 0xd0021367, // add rb_wt_den_p15, 23 - v_bit_depth, unif -+/* [0x000000f8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num -+/* [0x00000100] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 -+/* [0x00000108] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 -+/* [0x00000110] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 -+/* [0x00000118] */ 0x159e7040, 0x10020827, // or r0, r0, r1 -+/* [0x00000120] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) -+/* [0x00000128] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 -+/* [0x00000130] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) -+/* [0x00000138] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 -+/* [0x00000140] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 -+/* [0x00000148] */ 0x15827d80, 0x10020027, // mov ra0, unif -+/* [0x00000150] */ 0x15827d80, 0x10020667, // mov ra_base2, unif -+/* [0x00000158] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift -+/* [0x00000160] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a -+/* [0x00000168] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000170] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00000178] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 -+/* [0x00000180] */ 
0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00000188] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch -+/* [0x00000190] */ 0x149e7040, 0x10020867, // and r1, r0, r1 -+/* [0x00000198] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x000001a0] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2 -+/* [0x000001a8] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0 -+/* [0x000001b0] */ 0x95444ff6, 0xd40248e0, // mov r3, PREREAD ; mov r0, ra_y ++/* [0x000000f0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x000000f8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 ++/* [0x00000100] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 ++/* [0x00000108] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 ++/* [0x00000110] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x00000118] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) ++/* [0x00000120] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x00000128] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) ++/* [0x00000130] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 ++/* [0x00000138] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00000140] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift ++/* [0x00000148] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a ++/* [0x00000150] */ 0x938001f6, 0xd002480f, // max r0, r0, 0 ; mov rb_base2, unif ++/* [0x00000158] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000160] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00000168] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000170] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch ++/* [0x00000178] */ 0x949c307f, 0xd0024863, // and r1, r0, r1 ; mov r3, PREREAD ++/* [0x00000180] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000188] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2 ++/* [0x00000190] */ 0x8c44fe36, 0x140253e0, 
// add rb_base2, rb_base2, r0 ; mov r0, ra_y +// :1 -+/* [0x000001b8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 -+/* [0x000001c0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 -+/* [0x000001c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x000001d0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch -+/* [0x000001d8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 -+/* [0x000001e0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 -+/* [0x000001e8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b -+/* [0x000001f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x000001f8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch -+/* [0x00000200] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2 -+/* [0x00000208] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000210] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 -+/* [0x00000218] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000220] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 -+/* [0x00000228] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 -+/* [0x00000230] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 ++/* [0x00000198] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x000001a0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 ++/* [0x000001a8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x000001b0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x000001b8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 ++/* [0x000001c0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 ++/* [0x000001c8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x000001d0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x000001d8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x000001e0] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2 ++/* [0x000001e8] */ 0x15827d80, 
0x100207a7, // mov ra_link, unif ++/* [0x000001f0] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 ++/* [0x000001f8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000200] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 ++/* [0x00000208] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 ++/* [0x00000210] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 +// ::mc_filter_c_p -+/* [0x00000238] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif -+/* [0x00000240] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif -+/* [0x00000248] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 -+/* [0x00000250] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif -+/* [0x00000258] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif -+/* [0x00000260] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next -+/* [0x00000268] */ 0x920991f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a -+/* [0x00000270] */ 0x119c31c0, 0xd0220567, // shl vrx_xshift_next, r0, 3 -+/* [0x00000278] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00000280] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul -+/* [0x00000288] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000290] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif -+/* [0x00000298] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height -+/* [0x000002a0] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif -+/* [0x000002a8] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height -+/* [0x000002b0] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif -+/* [0x000002b8] */ 0x910c73f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a -+/* [0x000002c0] */ 0x8c0e70b6, 
0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b -+/* [0x000002c8] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c -+/* [0x000002d0] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 -+/* [0x000002d8] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y -+/* [0x000002e0] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d -+/* [0x000002e8] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif -+/* [0x000002f0] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1 ++/* [0x00000218] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00000220] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00000228] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 ++/* [0x00000230] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif ++/* [0x00000238] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif ++/* [0x00000240] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next ++/* [0x00000248] */ 0x9209a1f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a ++/* [0x00000250] */ 0x119c31c0, 0xd0220567, // shl vrx_xshift_next, r0, 3 ++/* [0x00000258] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000260] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00000268] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000270] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif ++/* [0x00000278] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height ++/* [0x00000280] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif ++/* [0x00000288] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x00000290] */ 0x8c81f3f6, 0xd0039496, // 
add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif ++/* [0x00000298] */ 0x918073f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif ++/* [0x000002a0] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val ++/* [0x000002a8] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c ++/* [0x000002b0] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 ++/* [0x000002b8] */ 0x5158c3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 ++/* [0x000002c0] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add ++/* [0x000002c8] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 ++/* [0x000002d0] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif +// :1 -+/* [0x000002f8] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 -+/* [0x00000300] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next -+/* [0x00000308] */ 0x8e4485f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y -+/* [0x00000310] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next -+/* [0x00000318] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 -+/* [0x00000320] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 -+/* [0x00000328] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2 -+/* [0x00000330] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch -+/* [0x00000338] */ 0x8c616c87, 0x10024e20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask -+/* [0x00000340] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 -+/* [0x00000348] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000350] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00000358] */ 0x4d03c4f0, 
0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00000360] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00000368] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1 -+/* [0x00000370] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00000378] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6 -+/* [0x00000380] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10 -+/* [0x00000388] */ 0x4d108437, 0x100241e0, // sub ra7, r2, r0 ; mul24 r0, ra4, rb8 -+/* [0x00000390] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9 -+/* [0x00000398] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 -+/* [0x000003a0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 -+/* [0x000003a8] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 -+/* [0x000003b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x000003b8] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 -+/* [0x000003c0] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height -+/* [0x000003c8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 -+/* [0x000003d0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x000003d8] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3 -+/* [0x000003e0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x000003e8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x000003f0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 -+/* [0x000003f8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x00000400] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x00000408] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest -+/* [0x00000410] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x00000418] */ 0x0c9d2e00, 0x100214a7, 
// add rb_lcount, rb_lcount, r0 -+/* [0x00000420] */ 0xfffffeb8, 0xf0f809e7, // brr -, r:1b -+/* [0x00000428] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 -+/* [0x00000430] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 -+/* [0x00000438] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++/* [0x000002d8] */ 0x8d151bf6, 0xa00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0 ++/* [0x000002e0] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next ++/* [0x000002e8] */ 0x8e4485f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++/* [0x000002f0] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next ++/* [0x000002f8] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++/* [0x00000300] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++/* [0x00000308] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2 ++/* [0x00000310] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch ++/* [0x00000318] */ 0x8c618cc7, 0x10024e20, // add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask ++/* [0x00000320] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 ++/* [0x00000328] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 ++/* [0x00000330] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000338] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00000340] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00000348] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00000350] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00000358] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10 ++/* [0x00000360] */ 0x550c6ffe, 0x1a024161, // mov 
ra5, rb6 ; mul24 r1, rb6, ra3.8b ++/* [0x00000368] */ 0x8f1c05f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 ++/* [0x00000370] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a ++/* [0x00000378] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x00000380] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x00000388] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height ++/* [0x00000390] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00000398] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add ++/* [0x000003a0] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 ++/* [0x000003a8] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000003b0] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 ++/* [0x000003b8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000003c0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000003c8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x000003d0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000003d8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000003e0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x000003e8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x000003f0] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b ++/* [0x000003f8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00000400] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00000408] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init +// ::mc_filter_c_p_l1 -+/* [0x00000440] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif -+/* [0x00000448] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif -+/* [0x00000450] */ 
0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 -+/* [0x00000458] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif -+/* [0x00000460] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif -+/* [0x00000468] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next -+/* [0x00000470] */ 0x920991f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a -+/* [0x00000478] */ 0x119c31c0, 0xd0021067, // shl vrx_xshift_next, r0, 3 -+/* [0x00000480] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00000488] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul -+/* [0x00000490] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000498] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif -+/* [0x000004a0] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height -+/* [0x000004a8] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif -+/* [0x000004b0] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height -+/* [0x000004b8] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif -+/* [0x000004c0] */ 0x910c73f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a -+/* [0x000004c8] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b -+/* [0x000004d0] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c -+/* [0x000004d8] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 -+/* [0x000004e0] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y -+/* [0x000004e8] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d -+/* [0x000004f0] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif -+/* [0x000004f8] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1 ++/* [0x00000410] */ 
0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00000418] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00000420] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 ++/* [0x00000428] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif ++/* [0x00000430] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif ++/* [0x00000438] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next ++/* [0x00000440] */ 0x9209a1f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a ++/* [0x00000448] */ 0x119c31c0, 0xd0021067, // shl vrx_xshift_next, r0, 3 ++/* [0x00000450] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000458] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00000460] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000468] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif ++/* [0x00000470] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height ++/* [0x00000478] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif ++/* [0x00000480] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x00000488] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif ++/* [0x00000490] */ 0x918073f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif ++/* [0x00000498] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val ++/* [0x000004a0] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c ++/* [0x000004a8] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 ++/* [0x000004b0] */ 0x5158c3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 ++/* [0x000004b8] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; 
mov r0, ra_kmul_add ++/* [0x000004c0] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 ++/* [0x000004c8] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif +// :1 -+/* [0x00000500] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 -+/* [0x00000508] */ 0x8e5539bf, 0x12029899, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next -+/* [0x00000510] */ 0x8e4485f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y -+/* [0x00000518] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next -+/* [0x00000520] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 -+/* [0x00000528] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 -+/* [0x00000530] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2 -+/* [0x00000538] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch -+/* [0x00000540] */ 0x8c656c87, 0x10024f20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask -+/* [0x00000548] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 -+/* [0x00000550] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000558] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00000560] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00000568] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00000570] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1 -+/* [0x00000578] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00000580] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6 -+/* [0x00000588] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10 -+/* [0x00000590] */ 0x4d108437, 0x100241e0, // sub ra7, r2, r0 ; mul24 r0, ra4, rb8 -+/* [0x00000598] */ 
0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9 -+/* [0x000005a0] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 -+/* [0x000005a8] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 -+/* [0x000005b0] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 -+/* [0x000005b8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x000005c0] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 -+/* [0x000005c8] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height -+/* [0x000005d0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 -+/* [0x000005d8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x000005e0] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3 -+/* [0x000005e8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x000005f0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x000005f8] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 -+/* [0x00000600] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x00000608] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x00000610] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest -+/* [0x00000618] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x00000620] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00000628] */ 0xfffffeb8, 0xf0f809e7, // brr -, r:1b -+/* [0x00000630] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 -+/* [0x00000638] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 -+/* [0x00000640] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++/* [0x000004d0] */ 0x8d151bf6, 0xb00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu1 ++/* [0x000004d8] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next ++/* [0x000004e0] */ 0x8e4485f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y 
++/* [0x000004e8] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next ++/* [0x000004f0] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++/* [0x000004f8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++/* [0x00000500] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2 ++/* [0x00000508] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch ++/* [0x00000510] */ 0x8c5cfec6, 0x12024f20, // add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax ++/* [0x00000518] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 ++/* [0x00000520] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 ++/* [0x00000528] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000530] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00000538] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00000540] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00000548] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00000550] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10 ++/* [0x00000558] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b ++/* [0x00000560] */ 0x8f1c05f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 ++/* [0x00000568] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a ++/* [0x00000570] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x00000578] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x00000580] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height ++/* [0x00000588] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00000590] */ 0x4c64c1ce, 0x14024821, // add r0, r0, 
rb_wt_off ; mul24 r1, r1, ra_kmul_add ++/* [0x00000598] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 ++/* [0x000005a0] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000005a8] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 ++/* [0x000005b0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000005b8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000005c0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x000005c8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000005d0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000005d8] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x000005e0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x000005e8] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b ++/* [0x000005f0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x000005f8] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00000600] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init +// ::mc_filter_c_b -+/* [0x00000648] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif -+/* [0x00000650] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif -+/* [0x00000658] */ 0xf1081dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 -+/* [0x00000660] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a -+/* [0x00000668] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif -+/* [0x00000670] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next -+/* [0x00000678] */ 0x928191f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif -+/* [0x00000680] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x00000688] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif -+/* [0x00000690] 
*/ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul -+/* [0x00000698] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x000006a0] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height -+/* [0x000006a8] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next -+/* [0x000006b0] */ 0x8d818eb6, 0x10125756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif -+/* [0x000006b8] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height -+/* [0x000006c0] */ 0x8c8033f6, 0xd0139496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif -+/* [0x000006c8] */ 0x918073f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif -+/* [0x000006d0] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif -+/* [0x000006d8] */ 0x910d01f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a -+/* [0x000006e0] */ 0x8c81b1f6, 0x10025681, // add rb_dma0, r0, rb_dma0_base ; mov ra1, unif -+/* [0x000006e8] */ 0x110c1dc0, 0xd4020827, // shl r0, ra3.16b, v_x_shift -+/* [0x000006f0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif -+/* [0x000006f8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif -+/* [0x00000700] */ 0x930e7176, 0x18024808, // max r0, r0, r5 ; mov rb8, ra3.8a -+/* [0x00000708] */ 0x920d91f6, 0x1a024809, // min r0, r0, rb_max_x ; mov rb9, ra3.8b -+/* [0x00000710] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 -+/* [0x00000718] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif -+/* [0x00000720] */ 0x940e7076, 0x1c02484a, // and r1, r0, r1 ; mov rb10, ra3.8c -+/* [0x00000728] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000730] */ 0x8c827076, 0x10024817, // add r0, r0, r1 ; mov rb_dest, unif -+/* [0x00000738] */ 0x0c9e7600, 0x100214e7, // add rb_base2_next, r3, r0 -+/* [0x00000740] */ 0x950deff6, 0x1e02424b, // 
mov ra9, rb_max_y ; mov rb11, ra3.8d -+/* [0x00000748] */ 0x1148ddc0, 0x14020867, // shl r1, ra_wt_off_l1, rb_wt_den_p15 -+/* [0x00000750] */ 0x8f8093f6, 0xd002531e, // asr rb_wt_off, r1, 9 ; mov ra_link, unif ++/* [0x00000608] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00000610] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00000618] */ 0xf1081dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 ++/* [0x00000620] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a ++/* [0x00000628] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif ++/* [0x00000630] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++/* [0x00000638] */ 0x9281a1f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif ++/* [0x00000640] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00000648] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif ++/* [0x00000650] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00000658] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000660] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height ++/* [0x00000668] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00000670] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif ++/* [0x00000678] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x00000680] */ 0x8c59f3f6, 0xd4139496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0 ++/* [0x00000688] */ 0x918073f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif ++/* [0x00000690] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif ++/* [0x00000698] */ 0x910d01f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a ++/* [0x000006a0] 
*/ 0x8c0db1f6, 0x140246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b ++/* [0x000006a8] */ 0x918011f6, 0xd0025801, // shl r0, r0, v_x_shift ; mov ra1, unif ++/* [0x000006b0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif ++/* [0x000006b8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif ++/* [0x000006c0] */ 0x939de17f, 0x10025809, // max r0, r0, r5 ; mov ra9, rb_max_y ++/* [0x000006c8] */ 0x9265a1f6, 0x14024822, // min r0, r0, rb_max_x ; mov r2, ra_kmul_add ++/* [0x000006d0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x000006d8] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif ++/* [0x000006e0] */ 0x949dc07f, 0xd0024865, // and r1, r0, r1 ; mov r5rep, -4 ++/* [0x000006e8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000006f0] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif ++/* [0x000006f8] */ 0x8c667636, 0x140254e0, // add rb_base2_next, r3, r0 ; mov r0, ra_fir_off_val ++/* [0x00000700] */ 0x4c5a7c86, 0x121245a1, // add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0 ++/* [0x00000708] */ 0x4c4a7c86, 0x121244a0, // add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1 ++/* [0x00000710] */ 0x8c4a7076, 0x14024821, // add r0, r0, r1 ; mov r1, ra_wt_off_l1 ++/* [0x00000718] */ 0x910cd3f6, 0xde02484b, // shl r1, r1, i_wt_den_p6 ; mov rb11, ra3.8d ++/* [0x00000720] */ 0x8d827236, 0x1002531e, // sub rb_wt_off, r1, r0 ; mov ra_link, unif ++/* [0x00000728] */ 0x95080ff6, 0x1e024287, // mov ra10, rb_xshift2 ; mov rb7, ra2.8d +// :1 -+/* [0x00000758] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 -+/* [0x00000760] */ 0x8e5539bf, 0x12029899, // shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next -+/* [0x00000768] */ 0x8e4c85f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next -+/* [0x00000770] */ 0x8c683ff6, 0x1002b9d8, // add.setf 
-, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next -+/* [0x00000778] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y -+/* [0x00000780] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 -+/* [0x00000788] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 -+/* [0x00000790] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch -+/* [0x00000798] */ 0x8c616cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask -+/* [0x000007a0] */ 0x95145ff6, 0x10025104, // mov rb4, rb5 ; mov ra4, ra5 -+/* [0x000007a8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 -+/* [0x000007b0] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 -+/* [0x000007b8] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 -+/* [0x000007c0] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 -+/* [0x000007c8] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x000007d0] */ 0x4c0274f1, 0x1e0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d, r1 -+/* [0x000007d8] */ 0x8d9c64ff, 0xb00240c5, // sub ra3, r2, r3 ; mov rb5, rb6 ; ldtmu1 -+/* [0x000007e0] */ 0x8e1809f6, 0x10025885, // shr r2, r4, rb_xshift2 ; mov ra5, ra6 -+/* [0x000007e8] */ 0x8e4485f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2 -+/* [0x000007f0] */ 0x8c5077bf, 0x1a124446, // add ra_y2, r3, ra_k1 ; mov rb6, rb7 -+/* [0x000007f8] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 -+/* [0x00000800] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 -+/* [0x00000808] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch -+/* [0x00000810] */ 0x8c656cc7, 0x10024f20, // add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask -+/* [0x00000818] */ 0x540563f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0 -+/* [0x00000820] */ 0x4007e030, 
0xda0049e2, // nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000828] */ 0x40074031, 0xda0109e2, // nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00000830] */ 0x4d07c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00000838] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00000840] */ 0x4d044bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1 -+/* [0x00000848] */ 0x4c0854fe, 0x1a0248a1, // add r2, r2, r3 ; mul24 r1, rb5, ra2.8b -+/* [0x00000850] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00000858] */ 0x551cadb7, 0x100241a3, // mov ra6, ra7 ; mul24 r3, ra7, rb10 -+/* [0x00000860] */ 0x4d08443e, 0x180248a0, // sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a -+/* [0x00000868] */ 0x8f0c05f6, 0xd00241c7, // asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3 -+/* [0x00000870] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c -+/* [0x00000878] */ 0x4c08723e, 0x1e024860, // add r1, r1, r0 ; mul24 r0, rb7, ra2.8d -+/* [0x00000880] */ 0x4d108237, 0x100248a0, // sub r2, r1, r0 ; mul24 r0, ra4, rb8 -+/* [0x00000888] */ 0x4d149637, 0x10024860, // sub r1, r3, r0 ; mul24 r0, ra5, rb9 -+/* [0x00000890] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 -+/* [0x00000898] */ 0x4d527216, 0x12024862, // sub r1, r1, r0 ; mul24 r2, r2, ra_k256 -+/* [0x000008a0] */ 0x4f50e5ce, 0xd20248a1, // asr r2, r2, 14 ; mul24 r1, r1, ra_k256 -+/* [0x000008a8] */ 0x4f58e3d6, 0xd2024862, // asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0 -+/* [0x000008b0] */ 0x4c48c5ce, 0x120248a1, // add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1 -+/* [0x000008b8] */ 0x8c5e72b6, 0x1c024863, // add r1, r1, r2 ; mov r3, ra_blk_height -+/* [0x000008c0] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 -+/* [0x000008c8] */ 0xfffffe70, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x000008d0] */ 0xef40d3f3, 0x12024860, // asr r1, 
r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3 -+/* [0x000008d8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x000008e0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x000008e8] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 -+/* [0x000008f0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x000008f8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x00000900] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest -+/* [0x00000908] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x00000910] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00000918] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b -+/* [0x00000920] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 -+/* [0x00000928] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 -+/* [0x00000930] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++/* [0x00000730] */ 0x0d9d1bc0, 0xa00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu0 ++/* [0x00000738] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, ra_xshift ; mov.ifz rb_base2, rb_base2_next ++/* [0x00000740] */ 0x8e4c85f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next ++/* [0x00000748] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next ++/* [0x00000750] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y ++/* [0x00000758] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 ++/* [0x00000760] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 ++/* [0x00000768] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++/* [0x00000770] */ 0x8c618cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask ++/* [0x00000778] */ 0x540183f0, 0x18024862, // and r1, r1, rb_pmask ; mul24 r2, ra0.8a, r0 ++/* [0x00000780] */ 0x4d01feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, 
ra0.8d, r1 ++/* [0x00000788] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000790] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00000798] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x000007a0] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x000007a8] */ 0x4c0854fe, 0xb8025804, // add r0, r2, r3 ; mul24 ra4, rb5, ra2.8a ; ldtmu1 ++/* [0x000007b0] */ 0x8e2869bf, 0x10024885, // shr r2, r4, ra10 ; mov rb5, rb6 ++/* [0x000007b8] */ 0x8e4485f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2 ++/* [0x000007c0] */ 0x8e1c01f6, 0xd00241c6, // shr ra7, r0, v_bit_depth - 8 ; mov rb6, ra7 ++/* [0x000007c8] */ 0x8c531789, 0xda124460, // add ra_y2, r3, ra_k1 ; mov r0, r1 << 15 ++/* [0x000007d0] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++/* [0x000007d8] */ 0x925de7ce, 0x120248e1, // min r3, r3, rb_max_y ; v8min r1, r1, ra_pmax ++/* [0x000007e0] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++/* [0x000007e8] */ 0x8c5cfec6, 0x12024f20, // add t1s, rb_base2, r3 ; v8min r0, r0, ra_pmax ++/* [0x000007f0] */ 0x4c041bf0, 0xd8025962, // add r5rep, r5, 1 ; mul24 r2, ra1.8a, r0 ++/* [0x000007f8] */ 0x4d05feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra1.8d, r1 ++/* [0x00000800] */ 0x4d07e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000808] */ 0x40074031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00000810] */ 0x4c07c6b0, 0xdc0248a3, // add r2, r3, r2 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00000818] */ 0x4c072b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00000820] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00000828] */ 0x4c0c94fe, 
0x180248a0, // add r2, r2, r3 ; mul24 r0, rb9, ra3.8a ++/* [0x00000830] */ 0x550caffe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra3.8b ++/* [0x00000838] */ 0x8e2c05f6, 0xd00242ca, // shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 ++/* [0x00000840] */ 0x4d08523e, 0x1a0248a1, // sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b ++/* [0x00000848] */ 0x8d112bf6, 0x100269e0, // sub.setf -, r5, rb_lcount ; mov r0, ra4 ++/* [0x00000850] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++/* [0x00000858] */ 0x4c1c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb7 ++/* [0x00000860] */ 0x4d0ca23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c ++/* [0x00000868] */ 0x4c2cb437, 0x100248a0, // add r2, r2, r0 ; mul24 r0, ra11, rb11 ++/* [0x00000870] */ 0x0d9e7400, 0x100208a7, // sub r2, r2, r0 ++/* [0x00000878] */ 0x0e9c63c0, 0xd0020867, // shr r1, r1, 6 ++/* [0x00000880] */ 0x4e5865ce, 0xd20248a0, // shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00000888] */ 0x4c4a7456, 0x120248a1, // add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1 ++/* [0x00000890] */ 0x4c667216, 0x14024862, // add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add ++/* [0x00000898] */ 0x8d5e72b6, 0x1c024863, // sub r1, r1, r2 ; mov r3, ra_blk_height ++/* [0x000008a0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x000008a8] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000008b0] */ 0x0f667380, 0x18020867, // asr r1, r1, ra_wt_den_p7 ++/* [0x000008b8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000008c0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000008c8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x000008d0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000008d8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000008e0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* 
[0x000008e8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x000008f0] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b ++/* [0x000008f8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00000900] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00000908] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init +// ::mc_sync_q0 -+/* [0x00000938] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000940] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000948] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000950] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000958] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000960] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000968] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000970] */ 0x00000001, 0xe80009e7, // mov dst, srel(i) -+/* [0x00000978] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000910] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000918] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000920] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000928] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000930] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000938] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000940] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000948] */ 0x00000001, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000950] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync_q1 -+/* [0x00000980] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000988] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000990] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000998] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) -+/* [0x000009a0] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i) -+/* [0x000009a8] */ 0x00000002, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000958] */ 0x15827d80, 
0x100207a7, // mov ra_link, unif ++/* [0x00000960] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000968] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000970] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000978] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000980] */ 0x00000002, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync_q2 -+/* [0x000009b0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x000009b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x000009c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x000009c8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) -+/* [0x000009d0] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i) -+/* [0x000009d8] */ 0x00000003, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000988] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000990] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000998] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000009a0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x000009a8] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000009b0] */ 0x00000003, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync_q3 -+/* [0x000009e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x000009e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x000009f0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x000009f8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) -+/* [0x00000a00] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000a08] */ 0x009e7000, 0x100009e7, // nop ++/* [0x000009b8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000009c0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000009c8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000009d0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x000009d8] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000009e0] */ 0x009e7000, 0x100009e7, // nop +// ::mc_sync_q4 -+/* [0x00000a10] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* 
[0x00000a18] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000a20] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000a28] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000a30] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000a38] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000a40] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000a48] */ 0x00000005, 0xe80009e7, // mov dst, srel(i) -+/* [0x00000a50] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i) ++/* [0x000009e8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000009f0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000009f8] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a00] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a08] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a10] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000a18] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a20] */ 0x00000005, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000a28] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync_q5 -+/* [0x00000a58] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000a60] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000a68] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000a70] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) -+/* [0x00000a78] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000a80] */ 0x00000006, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000a30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000a38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000a40] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000a48] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000a50] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a58] */ 0x00000006, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync_q6 -+/* [0x00000a88] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000a90] */ 0x159f2fc0, 
0x100009e7, // mov -, vw_wait -+/* [0x00000a98] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000aa0] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) -+/* [0x00000aa8] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000ab0] */ 0x00000007, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000a60] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000a68] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000a70] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000a78] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000a80] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a88] */ 0x00000007, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync_q7 -+/* [0x00000ab8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000ac0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000ac8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000ad0] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) -+/* [0x00000ad8] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000ae0] */ 0x009e7000, 0x100009e7, // nop ++/* [0x00000a90] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000a98] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000aa0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000aa8] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000ab0] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop +// ::mc_sync_q8 -+/* [0x00000ae8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000af0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000af8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000b00] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000b08] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000b10] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000b18] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000b20] */ 0x00000009, 0xe80009e7, // mov dst, srel(i) -+/* [0x00000b28] */ 
0x0000000c, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000ac0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000ac8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000ad0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000ad8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000ae0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000ae8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000af0] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000af8] */ 0x00000009, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000b00] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync_q9 -+/* [0x00000b30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000b38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000b40] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000b48] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) -+/* [0x00000b50] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000b58] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000b08] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000b10] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000b18] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000b20] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000b28] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000b30] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync_q10 -+/* [0x00000b60] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000b68] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000b70] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000b78] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) -+/* [0x00000b80] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000b88] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000b38] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000b40] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000b48] */ 0x00000000, 0xf0f7c9e7, // 
bra -, ra_link ++/* [0x00000b50] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000b58] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000b60] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync_q11 -+/* [0x00000b90] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000b98] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000ba0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000ba8] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) -+/* [0x00000bb0] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000bb8] */ 0x009e7000, 0x100009e7, // nop ++/* [0x00000b68] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000b70] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000b78] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000b80] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000b88] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000b90] */ 0x009e7000, 0x100009e7, // nop +// ::mc_exit_c_qn +// ::mc_exit_y_qn -+/* [0x00000bc0] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1 ++/* [0x00000b98] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1 +// :1 -+/* [0x00000bc8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b -+/* [0x00000bd0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 -+/* [0x00000bd8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 -+/* [0x00000be0] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 -+/* [0x00000be8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000bf0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x00000bf8] */ 0x009e7000, 0x100009e7, // nop -+/* [0x00000c00] */ 0x009e7000, 0x100009e7, // nop ++/* [0x00000ba0] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x00000ba8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x00000bb0] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 ++/* [0x00000bb8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00000bc0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait 
++/* [0x00000bc8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x00000bd0] */ 0x009e7000, 0x100009e7, // nop ++/* [0x00000bd8] */ 0x009e7000, 0x100009e7, // nop +// ::mc_exit_c_q0 +// ::mc_exit_y_q0 -+/* [0x00000c08] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1 ++/* [0x00000be0] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1 +// :1 -+/* [0x00000c10] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b -+/* [0x00000c18] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 -+/* [0x00000c20] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 -+/* [0x00000c28] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 -+/* [0x00000c30] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000c38] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000c40] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x00000c48] */ 0x00000001, 0xe00209a7, // mov interrupt, 1 -+/* [0x00000c50] */ 0x009e7000, 0x100009e7, // nop ++/* [0x00000be8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x00000bf0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x00000bf8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 ++/* [0x00000c00] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00000c08] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000c10] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000c18] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x00000c20] */ 0x00000001, 0xe00209a7, // mov interrupt, 1 ++/* [0x00000c28] */ 0x009e7000, 0x100009e7, // nop +// ::mc_setup_y_q0 -+/* [0x00000c58] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000c30] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) +// ::mc_setup_y_qn -+/* [0x00000c60] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif -+/* [0x00000c68] */ 0x15827d80, 0x10020267, // mov ra9, unif -+/* [0x00000c70] */ 0x15827d80, 0x10020067, // mov ra1, unif -+/* [0x00000c78] */ 0x15827d80, 0x100202e7, // mov ra11, unif -+/* [0x00000c80] */ 0xaaaaff00, 0xe6020827, 
// mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] -+/* [0x00000c88] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30 -+/* [0x00000c90] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100 -+/* [0x00000c98] */ 0x000000ff, 0xe00215a7, // mov rb_pmask, v_pmask -+/* [0x00000ca0] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) -+/* [0x00000ca8] */ 0x15827d80, 0x100200e7, // mov ra3, unif -+/* [0x00000cb0] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif -+/* [0x00000cb8] */ 0x0d0c1dc0, 0xd4021667, // sub rb_max_x, ra3.16b, 1 -+/* [0x00000cc0] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 -+/* [0x00000cc8] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif -+/* [0x00000cd0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) -+/* [0x00000cd8] */ 0x159d03c0, 0x10021627, // or rb_dma1_base, r1, rb_pitch -+/* [0x00000ce0] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num -+/* [0x00000ce8] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 -+/* [0x00000cf0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000cf8] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00000d00] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x00000d08] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 -+/* [0x00000d10] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch -+/* [0x00000d18] */ 0x149e7080, 0x10020867, // and r1, r0, r2 -+/* [0x00000d20] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000d28] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00000d30] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 -+/* [0x00000d38] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 -+/* [0x00000d40] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000d48] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00000d50] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 -+/* [0x00000d58] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* 
[0x00000d60] */ 0x149e7080, 0x10020867, // and r1, r0, r2 -+/* [0x00000d68] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000d70] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00000d78] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0 -+/* [0x00000d80] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a -+/* [0x00000d88] */ 0x95044ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a ++/* [0x00000c38] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif ++/* [0x00000c40] */ 0x15827d80, 0x10020267, // mov ra9, unif ++/* [0x00000c48] */ 0x15827d80, 0x10020067, // mov ra1, unif ++/* [0x00000c50] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++/* [0x00000c58] */ 0x9181e1f6, 0xd00250cb, // shl rb_ef, r0, i_shift30 ; mov ra11, unif ++/* [0x00000c60] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100 ++/* [0x00000c68] */ 0x000000ff, 0xe0021627, // mov rb_pmask, v_pmask ++/* [0x00000c70] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++/* [0x00000c78] */ 0x00004000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) ++/* [0x00000c80] */ 0x4000000e, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) ++/* [0x00000c88] */ 0x050b0a00, 0xe0021567, // mov rb_y_coeffs_2, 0x050b0a00 ++/* [0x00000c90] */ 0x11283a40, 0xe00215a7, // mov rb_y_coeffs_3, 0x11283a40 ++/* [0x00000c98] */ 0x0a0b0500, 0xe00215e7, // mov rb_y_coeffs_5, 0x0a0b0500 ++/* [0x00000ca0] */ 0x15827d80, 0x100200e7, // mov ra3, unif ++/* [0x00000ca8] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif ++/* [0x00000cb0] */ 0x0d0c1dc0, 0xd40216a7, // sub rb_max_x, ra3.16b, 1 ++/* [0x00000cb8] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 ++/* [0x00000cc0] */ 0x959a0dbf, 0x100248d0, // mov r3, elem_num ; mov rb_pitch, unif ++/* [0x00000cc8] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* 
[0x00000cd0] */ 0x159d03c0, 0x10021667, // or rb_dma1_base, r1, rb_pitch ++/* [0x00000cd8] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 ++/* [0x00000ce0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000ce8] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000cf0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00000cf8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 ++/* [0x00000d00] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch ++/* [0x00000d08] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00000d10] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000d18] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00000d20] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 ++/* [0x00000d28] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x00000d30] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000d38] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000d40] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00000d48] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000d50] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00000d58] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000d60] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00000d68] */ 0x0c2e7c00, 0x100213e7, // add rb_base2, ra11, r0 ++/* [0x00000d70] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a ++/* [0x00000d78] */ 0x95043ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a +// :1 -+/* [0x00000d90] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 -+/* [0x00000d98] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 -+/* [0x00000da0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x00000da8] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch -+/* [0x00000db0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 -+/* [0x00000db8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 
-+/* [0x00000dc0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b -+/* [0x00000dc8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x00000dd0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch -+/* [0x00000dd8] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2 -+/* [0x00000de0] */ 0x0c80fdc0, 0xd0021367, // add rb_wt_den_p15, unif, 23 - v_bit_depth -+/* [0x00000de8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num -+/* [0x00000df0] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 -+/* [0x00000df8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 -+/* [0x00000e00] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 -+/* [0x00000e08] */ 0x159e7040, 0x10020827, // or r0, r0, r1 -+/* [0x00000e10] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) -+/* [0x00000e18] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 -+/* [0x00000e20] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) -+/* [0x00000e28] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 -+/* [0x00000e30] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 -+/* [0x00000e38] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000e40] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 -+/* [0x00000e48] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000e50] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 -+/* [0x00000e58] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 -+/* [0x00000e60] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 ++/* [0x00000d80] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00000d88] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 ++/* [0x00000d90] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00000d98] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00000da0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 ++/* [0x00000da8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 ++/* [0x00000db0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, 
r:1b ++/* [0x00000db8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00000dc0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00000dc8] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2 ++/* [0x00000dd0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x00000dd8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 ++/* [0x00000de0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 ++/* [0x00000de8] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 ++/* [0x00000df0] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x00000df8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) ++/* [0x00000e00] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x00000e08] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) ++/* [0x00000e10] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 ++/* [0x00000e18] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00000e20] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000e28] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 ++/* [0x00000e30] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000e38] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 ++/* [0x00000e40] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 ++/* [0x00000e48] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 +// :per_block_setup_8 -+/* [0x00000e68] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next -+/* [0x00000e70] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00000e78] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x00000e80] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00000e88] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif -+/* [0x00000e90] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a -+/* [0x00000e98] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000ea0] */ 0x8c827076, 0x10025801, // add 
r0, r0, r1 ; mov ra1, unif -+/* [0x00000ea8] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0 -+/* [0x00000eb0] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 -+/* [0x00000eb8] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a -+/* [0x00000ec0] */ 0x928191f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif -+/* [0x00000ec8] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 -+/* [0x00000ed0] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif -+/* [0x00000ed8] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init -+/* [0x00000ee0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000ee8] */ 0x4c401077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul -+/* [0x00000ef0] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0 -+/* [0x00000ef8] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height -+/* [0x00000f00] */ 0x8c5c31c6, 0xdc025460, // add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height -+/* [0x00000f08] */ 0x0c9c71c0, 0xd00214a7, // add rb_lcount, r0, 7 -+/* [0x00000f10] */ 0x119c71c0, 0xd0020827, // shl r0, r0, v_dma_h_shift -+/* [0x00000f18] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00000f20] */ 0x119d01c0, 0xd0020827, // shl r0, r0, v_dma_wh_shift -+/* [0x00000f28] */ 0x8c81b1f6, 0x100256a0, // add rb_dma0, r0, rb_dma0_base ; mov r0, unif -+/* [0x00000f30] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif -+/* [0x00000f38] */ 0x915031f6, 0xde024223, // shl ra8, r0, 3 ; mov r3, ra_k255 -+/* [0x00000f40] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 -+/* [0x00000f48] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d -+/* [0x00000f50] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c -+/* [0x00000f58] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400 -+/* [0x00000f60] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d 
-+/* [0x00000f68] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c -+/* [0x00000f70] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00 -+/* [0x00000f78] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d -+/* [0x00000f80] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c -+/* [0x00000f88] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40 -+/* [0x00000f90] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d -+/* [0x00000f98] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c -+/* [0x00000fa0] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 -+/* [0x00000fa8] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif -+/* [0x00000fb0] */ 0x90227383, 0x1c424044, // ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3 -+/* [0x00000fb8] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500 -+/* [0x00000fc0] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d -+/* [0x00000fc8] */ 0x90227383, 0x1c524045, // ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3 -+/* [0x00000fd0] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 -+/* [0x00000fd8] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d -+/* [0x00000fe0] */ 0x90227383, 0x1c624046, // ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3 -+/* [0x00000fe8] */ 0x954a0dbf, 0x10084597, // mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif -+/* [0x00000ff0] */ 0x01010000, 0xe0020867, // mov r1,0x01010000 -+/* [0x00000ff8] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d -+/* [0x00001000] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00001008] */ 0x90227383, 0x1c724047, // ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3 -+/* [0x00001010] */ 0x1158ddc0, 0x14020827, // shl r0, ra_wt_off_l0, rb_wt_den_p15 -+/* [0x00001018] */ 0x8f8091f6, 0xd002531e, // asr rb_wt_off, r0, 9 ; mov ra_link, unif ++/* [0x00000e50] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++/* [0x00000e58] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000e60] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 
++/* [0x00000e68] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000e70] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif ++/* [0x00000e78] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a ++/* [0x00000e80] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000e88] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif ++/* [0x00000e90] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0 ++/* [0x00000e98] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x00000ea0] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a ++/* [0x00000ea8] */ 0x9281a1f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif ++/* [0x00000eb0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00000eb8] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif ++/* [0x00000ec0] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init ++/* [0x00000ec8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000ed0] */ 0x4c401077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul ++/* [0x00000ed8] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0 ++/* [0x00000ee0] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++/* [0x00000ee8] */ 0x8c5dc1c6, 0xdc025460, // add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height ++/* [0x00000ef0] */ 0x0c9df1c0, 0xd00214a7, // add rb_lcount, r0, (7-8) ++/* [0x00000ef8] */ 0x916471f6, 0xd4024823, // shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add ++/* [0x00000f00] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00000f08] */ 0x916501f6, 0xd4024822, // shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val ++/* [0x00000f10] */ 0x8c81b1f6, 0x100246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, unif ++/* [0x00000f18] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov 
ra_wt_off_mul_l0, unif ++/* [0x00000f20] */ 0x915031f6, 0xde024205, // shl ra8, r0, 3 ; mov rb5, ra_k255 ++/* [0x00000f28] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400 ++/* [0x00000f30] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d ++/* [0x00000f38] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c ++/* [0x00000f40] */ 0x10215f80, 0x1e6200a7, // ror ra2.8c, rb_y_coeffs_2, ra8.8d ++/* [0x00000f48] */ 0x10215f80, 0x1c620027, // ror ra0.8c, rb_y_coeffs_2, ra8.8c ++/* [0x00000f50] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 ++/* [0x00000f58] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif ++/* [0x00000f60] */ 0x90205387, 0x1c424004, // ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5 ++/* [0x00000f68] */ 0x914883f6, 0xd0031856, // shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ++/* [0x00000f70] */ 0x902203bf, 0x1e02581c, // ror r0, r1, ra8.8d ; mov ra_dest, unif ++/* [0x00000f78] */ 0x90205387, 0x1c72404b, // ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5 ++/* [0x00000f80] */ 0x10216f80, 0x1e7200a7, // ror ra2.8d, rb_y_coeffs_3, ra8.8d ++/* [0x00000f88] */ 0x10216f80, 0x1c720027, // ror ra0.8d, rb_y_coeffs_3, ra8.8c ++/* [0x00000f90] */ 0x10217f80, 0x1e5200e7, // ror ra3.8b, rb_y_coeffs_5, ra8.8d ++/* [0x00000f98] */ 0x10217f80, 0x1c520067, // ror ra1.8b, rb_y_coeffs_5, ra8.8c ++/* [0x00000fa0] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 ++/* [0x00000fa8] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d ++/* [0x00000fb0] */ 0x902183bf, 0xdc624065, // ror ra1.8c, r1, ra8.8c ; mov r5rep, -8 ++/* [0x00000fb8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000fc0] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 ++/* [0x00000fc8] */ 0x902203bf, 0x1e02581e, // ror r0, r1, ra8.8d ; mov ra_link, unif ++/* [0x00000fd0] */ 0x90205387, 0x1c424048, // ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5 +// ::mc_filter_y_pxx -+/* [0x00001020] */ 0xfffffe28, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 -+/* [0x00001028] 
*/ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num -+/* [0x00001030] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 -+/* [0x00001038] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next -+/* [0x00001040] */ 0x11581dc0, 0xd21205a7, // shl ra_wt_mul_l0, ra_wt_mul_l0, 1 ++/* [0x00000fd8] */ 0xfffffe58, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 ++/* [0x00000fe0] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00000fe8] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 ++/* [0x00000ff0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00000ff8] */ 0x1158cdc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p5 ++/* [0x00001000] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 ++/* [0x00001008] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 +// :1 -+/* [0x00001048] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 -+/* [0x00001050] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 -+/* [0x00001058] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch -+/* [0x00001060] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 -+/* [0x00001068] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next -+/* [0x00001070] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00001078] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next -+/* [0x00001080] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 -+/* [0x00001088] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8 -+/* [0x00001090] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+/* [0x00001098] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask -+/* 
[0x000010a0] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9 -+/* [0x000010a8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 -+/* [0x000010b0] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x000010b8] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x000010c0] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x000010c8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x000010d0] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x000010d8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x000010e0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x000010e8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+/* [0x000010f0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+/* [0x000010f8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+/* [0x00001100] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+/* [0x00001108] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+/* [0x00001110] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00001118] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+/* [0x00001120] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+/* [0x00001128] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10 -+/* [0x00001130] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a -+/* [0x00001138] */ 
0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00001140] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b -+/* [0x00001148] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 -+/* [0x00001150] */ 0x8f1c05f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 -+/* [0x00001158] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+/* [0x00001160] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d -+/* [0x00001168] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 -+/* [0x00001170] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 -+/* [0x00001178] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+/* [0x00001180] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 -+/* [0x00001188] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 -+/* [0x00001190] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 -+/* [0x00001198] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x000011a0] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 -+/* [0x000011a8] */ 0x8c5cc3f6, 0x1c024863, // add r1, r1, rb_wt_off ; mov r3, ra_blk_height -+/* [0x000011b0] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3 -+/* [0x000011b8] */ 0xfffffe70, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x000011c0] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 -+/* [0x000011c8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x000011d0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x000011d8] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 -+/* [0x000011e0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x000011e8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x000011f0] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest -+/* [0x000011f8] */ 0x119d73c0, 
0xd0020867, // shl r1, r1, i_shift23 -+/* [0x00001200] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00001208] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b -+/* [0x00001210] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 -+/* [0x00001218] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 -+/* [0x00001220] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++/* [0x00001010] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef ++/* [0x00001018] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0 ++/* [0x00001020] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1 ++/* [0x00001028] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 ++/* [0x00001030] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6 ++/* [0x00001038] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7 ++/* [0x00001040] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 ++/* [0x00001048] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8 ++/* [0x00001050] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax ++/* [0x00001058] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch ++/* [0x00001060] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9 ++/* [0x00001068] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x00001070] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 ++/* [0x00001078] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00001080] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x00001088] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001090] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 
10, r1 << 10 @ "mul_used", 0 ++/* [0x00001098] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x000010a0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x000010a8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x000010b0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x000010b8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x000010c0] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x000010c8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x000010d0] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x000010d8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x000010e0] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x000010e8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000010f0] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++/* [0x000010f8] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b ++/* [0x00001100] */ 0x8f2c05f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 ++/* [0x00001108] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++/* [0x00001110] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++/* [0x00001118] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8 ++/* [0x00001120] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c ++/* [0x00001128] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11 ++/* [0x00001130] */ 0x8d5d1bf6, 0x1c0269e3, // sub.setf -, r5, rb_i_tmu ; mov r3, 
ra_blk_height ++/* [0x00001138] */ 0x8d1133bf, 0x1002884f, // sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next ++/* [0x00001140] */ 0x8d6a7236, 0x10029858, // sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next ++/* [0x00001148] */ 0x8f4c63f6, 0xd0029851, // asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next ++/* [0x00001150] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00001158] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add ++/* [0x00001160] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 ++/* [0x00001168] */ 0xfffffe88, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001170] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 ++/* [0x00001178] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001180] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001188] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x00001190] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001198] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000011a0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x000011a8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x000011b0] */ 0xfffffe40, 0xf0f809e7, // brr -, r:1b ++/* [0x000011b8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x000011c0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x000011c8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init +// ::mc_filter_y_bxx -+/* [0x00001228] */ 0xfffffc20, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 -+/* [0x00001230] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num -+/* [0x00001238] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 -+/* [0x00001240] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, 
rb_xshift2_next ++/* [0x000011d0] */ 0xfffffc60, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 ++/* [0x000011d8] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x000011e0] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 ++/* [0x000011e8] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x000011f0] */ 0x1158ddc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p6 ++/* [0x000011f8] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 ++/* [0x00001200] */ 0x4d4a7216, 0x12024860, // sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1 ++/* [0x00001208] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 +// :1 -+/* [0x00001248] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 -+/* [0x00001250] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 -+/* [0x00001258] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch -+/* [0x00001260] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 -+/* [0x00001268] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next -+/* [0x00001270] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00001278] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next -+/* [0x00001280] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 -+/* [0x00001288] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8 -+/* [0x00001290] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+/* [0x00001298] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask -+/* [0x000012a0] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9 -+/* [0x000012a8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 -+/* [0x000012b0] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, 
ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x000012b8] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x000012c0] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x000012c8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x000012d0] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x000012d8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x000012e0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x000012e8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+/* [0x000012f0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+/* [0x000012f8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+/* [0x00001300] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+/* [0x00001308] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+/* [0x00001310] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00001318] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+/* [0x00001320] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+/* [0x00001328] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10 -+/* [0x00001330] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a -+/* [0x00001338] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00001340] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b -+/* [0x00001348] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 -+/* [0x00001350] */ 0x8f1c05f6, 
0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 -+/* [0x00001358] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+/* [0x00001360] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d -+/* [0x00001368] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 -+/* [0x00001370] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 -+/* [0x00001378] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+/* [0x00001380] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 -+/* [0x00001388] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off -+/* [0x00001390] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 -+/* [0x00001398] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x000013a0] */ 0x405a700e, 0x120049e0, // nop ; mul24 r0, r1, ra_wt_mul_l0 -+/* [0x000013a8] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0 -+/* [0x000013b0] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height -+/* [0x000013b8] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3 -+/* [0x000013c0] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x000013c8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 -+/* [0x000013d0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x000013d8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x000013e0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 -+/* [0x000013e8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x000013f0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x000013f8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest -+/* [0x00001400] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x00001408] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* 
[0x00001410] */ 0xfffffe18, 0xf0f809e7, // brr -, r:1b -+/* [0x00001418] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 -+/* [0x00001420] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 -+/* [0x00001428] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++/* [0x00001210] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef ++/* [0x00001218] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0 ++/* [0x00001220] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1 ++/* [0x00001228] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 ++/* [0x00001230] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6 ++/* [0x00001238] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7 ++/* [0x00001240] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 ++/* [0x00001248] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8 ++/* [0x00001250] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax ++/* [0x00001258] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch ++/* [0x00001260] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9 ++/* [0x00001268] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x00001270] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 ++/* [0x00001278] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00001280] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x00001288] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001290] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x00001298] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 
<< 3 @ "mul_used", 0 ++/* [0x000012a0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x000012a8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x000012b0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x000012b8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x000012c0] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x000012c8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x000012d0] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x000012d8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x000012e0] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x000012e8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000012f0] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++/* [0x000012f8] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b ++/* [0x00001300] */ 0x8f2c05f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 ++/* [0x00001308] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++/* [0x00001310] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++/* [0x00001318] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8 ++/* [0x00001320] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c ++/* [0x00001328] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11 ++/* [0x00001330] */ 0x0d127380, 0x10020867, // sub r1, r1, ra4 ++/* [0x00001338] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off ++/* [0x00001340] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6 ++/* 
[0x00001348] */ 0x4d591bce, 0x120269e0, // sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00001350] */ 0x55653fce, 0x140453e1, // mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add ++/* [0x00001358] */ 0x8d4e7076, 0x10029851, // sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next ++/* [0x00001360] */ 0x8d692bf6, 0x1002b9d8, // sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next ++/* [0x00001368] */ 0x8c9f8289, 0xd0024860, // add r1, r1, r2 ; mov r0, r1 << 8 ++/* [0x00001370] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height ++/* [0x00001378] */ 0xfffffe78, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001380] */ 0x4f65039f, 0x18024862, // asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch ++/* [0x00001388] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001390] */ 0xf34003f3, 0xd2024c20, // max vpm, r1, 0 ; v8subs r0, ra_height, r3 ++/* [0x00001398] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x000013a0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000013a8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000013b0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x000013b8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x000013c0] */ 0xfffffe30, 0xf0f809e7, // brr -, r:1b ++/* [0x000013c8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x000013d0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x000013d8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init +// ::mc_filter_y_p00 -+/* [0x00001430] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num -+/* [0x00001438] */ 0x15567d80, 0x14120567, // mov ra_xshift, ra_xshift_next -+/* [0x00001440] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 -+/* [0x00001448] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00001450] */ 0x129d91c0, 0x10020827, // min r0, r0, 
rb_max_x -+/* [0x00001458] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x00001460] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 -+/* [0x00001468] */ 0x8d8105f6, 0x1002589a, // sub r2, r2, rb_pitch ; mov ra_base_next, unif -+/* [0x00001470] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a -+/* [0x00001478] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00001480] */ 0x8c827076, 0x10025810, // add r0, r0, r1 ; mov ra_width_height, unif -+/* [0x00001488] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init -+/* [0x00001490] */ 0x11400dc0, 0xd4020867, // shl r1, ra_width, v_x_shift -+/* [0x00001498] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height -+/* [0x000014a0] */ 0x8d5c41c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height -+/* [0x000014a8] */ 0x919c71c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 -+/* [0x000014b0] */ 0x8c827076, 0x10025816, // add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif -+/* [0x000014b8] */ 0x918101f6, 0xd0024817, // shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif -+/* [0x000014c0] */ 0x0c9db1c0, 0x100216a7, // add rb_dma0, r0, rb_dma0_base -+/* [0x000014c8] */ 0xf158dddb, 0x14024825, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 -+/* [0x000014d0] */ 0x8f8011f6, 0xd002531e, // asr rb_wt_off, r0, 1 ; mov ra_link, unif ++/* [0x000013e0] */ 0x959a0ff6, 0x10024020, // mov ra0, unif ; mov r0, elem_num ++/* [0x000013e8] */ 0xf5567dad, 0x14124565, // mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5 ++/* [0x000013f0] */ 0x8c020c3f, 0x1402581a, // add r0, ra0.16b, r0 ; mov ra_base_next, unif ++/* [0x000013f8] */ 0x93027176, 0x12225813, // max r0, r0, r5 ; mov ra_y_next, ra0.16a ++/* [0x00001400] */ 0x9281a1f6, 0x10025810, // min r0, r0, rb_max_x ; mov ra_width_height, unif ++/* [0x00001408] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, 
r0, 3 ++/* [0x00001410] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00001418] */ 0x8d810bf6, 0x10025896, // sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif ++/* [0x00001420] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00001428] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001430] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif ++/* [0x00001438] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init ++/* [0x00001440] */ 0x11400dc0, 0xd4020867, // shl r1, ra_width, v_x_shift ++/* [0x00001448] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++/* [0x00001450] */ 0x8d5c31c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height ++/* [0x00001458] */ 0x919c71c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 ++/* [0x00001460] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00001468] */ 0x1158edc0, 0xd4021327, // shl rb_wt_off, ra_wt_off_l0, DENOM + 7 ++/* [0x00001470] */ 0x918101f6, 0xd002581e, // shl r0, r0, v_dma_wh_shift ; mov ra_link, unif ++/* [0x00001478] */ 0x0c9db1c0, 0x100206e7, // add ra_dma0, r0, rb_dma0_base +// :1 -+/* [0x000014d8] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 -+/* [0x000014e0] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 -+/* [0x000014e8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch -+/* [0x000014f0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 -+/* [0x000014f8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next -+/* [0x00001500] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00001508] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask -+/* [0x00001510] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 -+/* [0x00001518] */ 0x915cf3f6, 0xdc024863, // shl r1, r1, 23 
- v_bit_depth ; mov r3, ra_blk_height -+/* [0x00001520] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 -+/* [0x00001528] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00001530] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 -+/* [0x00001538] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x00001540] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x00001548] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 -+/* [0x00001550] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x00001558] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x00001560] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest -+/* [0x00001568] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x00001570] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00001578] */ 0xffffff40, 0xf0f809e7, // brr -, r:1b -+/* [0x00001580] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 -+/* [0x00001588] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 -+/* [0x00001590] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++/* [0x00001480] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ++/* [0x00001488] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 ++/* [0x00001490] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00001498] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x000014a0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x000014a8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x000014b0] */ 0x8c618c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask ++/* [0x000014b8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 ++/* [0x000014c0] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; 
mov r3, ra_blk_height ++/* [0x000014c8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x000014d0] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000014d8] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, DENOM + 8 ++/* [0x000014e0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000014e8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000014f0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x000014f8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001500] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00001508] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x00001510] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001518] */ 0xffffff48, 0xf0f809e7, // brr -, r:1b ++/* [0x00001520] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001528] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00001530] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init +// ::mc_filter_y_b00 -+/* [0x00001598] */ 0xfffff8b0, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 -+/* [0x000015a0] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num -+/* [0x000015a8] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 -+/* [0x000015b0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next -+/* [0x000015b8] */ 0x00000007, 0xe0020827, // mov r0, 7 -+/* [0x000015c0] */ 0x0d9d1e00, 0x10021467, // sub rb_i_tmu, rb_i_tmu, r0 -+/* [0x000015c8] */ 0x0d9d2e00, 0x100214a7, // sub rb_lcount, rb_lcount, r0 -+/* [0x000015d0] */ 0x95588ff6, 0xd0024821, // mov r0, 8 ; mov r1, ra_wt_off_mul_l0 -+/* [0x000015d8] */ 0x119cce00, 0x10021327, // shl rb_wt_off, rb_wt_off, r0 -+/* [0x000015e0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 ++/* 
[0x00001538] */ 0xfffff8f8, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 ++/* [0x00001540] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00001548] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 ++/* [0x00001550] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00001558] */ 0x00000001, 0xe00208a7, // mov r2, 1 ++/* [0x00001560] */ 0x8c591eb6, 0x10025461, // add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0 ++/* [0x00001568] */ 0xf158fded, 0xd4025325, // shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5 ++/* [0x00001570] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 +// :1 -+/* [0x000015e8] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 -+/* [0x000015f0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 -+/* [0x000015f8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch -+/* [0x00001600] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 -+/* [0x00001608] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next -+/* [0x00001610] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00001618] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next -+/* [0x00001620] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 -+/* [0x00001628] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y -+/* [0x00001630] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+/* [0x00001638] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask -+/* [0x00001640] */ 0x545963c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 -+/* [0x00001648] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 -+/* [0x00001650] */ 0x0c9e7040, 0x10020867, // add r1, r0, r1 -+/* [0x00001658] */ 0x915ce3f6, 
0xdc024863, // shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height -+/* [0x00001660] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 -+/* [0x00001668] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00001670] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 -+/* [0x00001678] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x00001680] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x00001688] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 -+/* [0x00001690] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x00001698] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x000016a0] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest -+/* [0x000016a8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x000016b0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x000016b8] */ 0xffffff10, 0xf0f809e7, // brr -, r:1b -+/* [0x000016c0] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 -+/* [0x000016c8] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 -+/* [0x000016d0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++/* [0x00001578] */ 0x0d9d1bc0, 0xb00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1 ++/* [0x00001580] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00001588] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00001590] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00001598] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x000015a0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x000015a8] */ 0x8c613cbf, 0x10028e0f, // add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next ++/* [0x000015b0] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x000015b8] */ 0x129de5c0, 0x100208a7, // 
min r2, r2, rb_max_y ++/* [0x000015c0] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x000015c8] */ 0x8c5cfe86, 0x12024f20, // add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax ++/* [0x000015d0] */ 0x545983c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 ++/* [0x000015d8] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 ++/* [0x000015e0] */ 0xcc52706e, 0x1a024865, // add r1, r0, r1 ; v8adds r5rep, r5, ra_k1 ++/* [0x000015e8] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height ++/* [0x000015f0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x000015f8] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001600] */ 0x0f9d03c0, 0xd0020867, // asr r1, r1, (DENOM + 9) - 32 ++/* [0x00001608] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001610] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001618] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x00001620] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001628] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00001630] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x00001638] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001640] */ 0xffffff18, 0xf0f809e7, // brr -, r:1b ++/* [0x00001648] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001650] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00001658] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init +// ::mc_setup_c10_q0 -+/* [0x000016d8] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++/* [0x00001660] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) +// ::mc_setup_c10_qn -+/* [0x000016e0] */ 0x00000001, 0xe0020927, // mov tmurs, 1 -+/* [0x000016e8] */ 0x15827d80, 0x10020027, // mov 
ra0, unif -+/* [0x000016f0] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] -+/* [0x000016f8] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30 -+/* [0x00001700] */ 0x15827d80, 0x10020627, // mov ra_base, unif -+/* [0x00001708] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1 -+/* [0x00001710] */ 0x119c21c0, 0xd0021667, // shl rb_max_x, r0, v_x_shift -+/* [0x00001718] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 -+/* [0x00001720] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100 -+/* [0x00001728] */ 0x0000ffff, 0xe00215a7, // mov rb_pmask, v_pmask -+/* [0x00001730] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) -+/* [0x00001738] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif -+/* [0x00001740] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif -+/* [0x00001748] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) -+/* [0x00001750] */ 0x0c9d03c0, 0x10021627, // add rb_dma1_base, r1, rb_pitch -+/* [0x00001758] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num -+/* [0x00001760] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5 -+/* [0x00001768] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num -+/* [0x00001770] */ 0x0c9e7000, 0x100210a7, // add rb_elem_x, r0, r0 -+/* [0x00001778] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift -+/* [0x00001780] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x -+/* [0x00001788] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a -+/* [0x00001790] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00001798] */ 0x00000000, 0xe0224541, // mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0 -+/* [0x000017a0] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch -+/* [0x000017a8] */ 0x149e7040, 0x10020867, // and r1, r0, r1 -+/* [0x000017b0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x000017b8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x000017c0] */ 0x0c627c00, 
0x10020627, // add ra_base, ra_base, r0 -+/* [0x000017c8] */ 0x0c80df80, 0xd0021367, // add rb_wt_den_p15, 23 - v_bit_depth, unif -+/* [0x000017d0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num -+/* [0x000017d8] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 -+/* [0x000017e0] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4 -+/* [0x000017e8] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 -+/* [0x000017f0] */ 0x159e7040, 0x10020827, // or r0, r0, r1 -+/* [0x000017f8] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) -+/* [0x00001800] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 -+/* [0x00001808] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) -+/* [0x00001810] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 -+/* [0x00001818] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 -+/* [0x00001820] */ 0x15827d80, 0x10020027, // mov ra0, unif -+/* [0x00001828] */ 0x15827d80, 0x10020667, // mov ra_base2, unif -+/* [0x00001830] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift -+/* [0x00001838] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a -+/* [0x00001840] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00001848] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00001850] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch -+/* [0x00001858] */ 0x149e7040, 0x10020867, // and r1, r0, r1 -+/* [0x00001860] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00001868] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2 -+/* [0x00001870] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0 -+/* [0x00001878] */ 0x95444ff6, 0xd40248e0, // mov r3, PREREAD ; mov r0, ra_y ++/* [0x00001668] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif ++/* [0x00001670] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++/* [0x00001678] */ 0x9181e1f6, 0xd00250d8, // shl rb_ef, r0, i_shift30 ; mov ra_base, unif ++/* [0x00001680] */ 0x0d801dc0, 0xd0020827, 
// sub r0, unif, 1 ++/* [0x00001688] */ 0x119c21c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift ++/* [0x00001690] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 ++/* [0x00001698] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100 ++/* [0x000016a0] */ 0x0000ffff, 0xe0021627, // mov rb_pmask, v_pmask ++/* [0x000016a8] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++/* [0x000016b0] */ 0x00010000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) ++/* [0x000016b8] */ 0x4000000c, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) ++/* [0x000016c0] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif ++/* [0x000016c8] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif ++/* [0x000016d0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x000016d8] */ 0x0c9d03c0, 0x10021667, // add rb_dma1_base, r1, rb_pitch ++/* [0x000016e0] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num ++/* [0x000016e8] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5 ++/* [0x000016f0] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num ++/* [0x000016f8] */ 0x0c9e7000, 0x100210a7, // add rb_elem_x, r0, r0 ++/* [0x00001700] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift ++/* [0x00001708] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x ++/* [0x00001710] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a ++/* [0x00001718] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00001720] */ 0x00000000, 0xe0224541, // mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0 ++/* [0x00001728] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch ++/* [0x00001730] */ 0x149e7040, 0x10020867, // and r1, r0, r1 ++/* [0x00001738] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001740] */ 0x8c827076, 0x10025800, // add r0, r0, r1 ; mov ra0, unif ++/* [0x00001748] */ 0x0c627c00, 0x10020627, // add ra_base, 
ra_base, r0 ++/* [0x00001750] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x00001758] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 ++/* [0x00001760] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4 ++/* [0x00001768] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 ++/* [0x00001770] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x00001778] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) ++/* [0x00001780] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x00001788] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) ++/* [0x00001790] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 ++/* [0x00001798] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x000017a0] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift ++/* [0x000017a8] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a ++/* [0x000017b0] */ 0x938001f6, 0xd002480f, // max r0, r0, 0 ; mov rb_base2, unif ++/* [0x000017b8] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x000017c0] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch ++/* [0x000017c8] */ 0x949c307f, 0xd0024863, // and r1, r0, r1 ; mov r3, PREREAD ++/* [0x000017d0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000017d8] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2 ++/* [0x000017e0] */ 0x8c44fe36, 0x140253e0, // add rb_base2, rb_base2, r0 ; mov r0, ra_y +// :1 -+/* [0x00001880] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 -+/* [0x00001888] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 -+/* [0x00001890] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x00001898] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch -+/* [0x000018a0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 -+/* [0x000018a8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 -+/* [0x000018b0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b -+/* [0x000018b8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y 
-+/* [0x000018c0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch -+/* [0x000018c8] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2 -+/* [0x000018d0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x000018d8] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 -+/* [0x000018e0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x000018e8] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 -+/* [0x000018f0] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 -+/* [0x000018f8] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 ++/* [0x000017e8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x000017f0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 ++/* [0x000017f8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00001800] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00001808] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 ++/* [0x00001810] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 ++/* [0x00001818] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x00001820] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00001828] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00001830] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2 ++/* [0x00001838] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00001840] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 ++/* [0x00001848] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00001850] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 ++/* [0x00001858] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 ++/* [0x00001860] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 +// ::mc_filter_c10_p -+/* [0x00001900] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif -+/* [0x00001908] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif -+/* [0x00001910] */ 0xf1082dc0, 
0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 -+/* [0x00001918] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif -+/* [0x00001920] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif -+/* [0x00001928] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next -+/* [0x00001930] */ 0x920991f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a -+/* [0x00001938] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul -+/* [0x00001940] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00001948] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif -+/* [0x00001950] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height -+/* [0x00001958] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif -+/* [0x00001960] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height -+/* [0x00001968] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif -+/* [0x00001970] */ 0x910c83f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a -+/* [0x00001978] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b -+/* [0x00001980] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c -+/* [0x00001988] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 -+/* [0x00001990] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y -+/* [0x00001998] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d -+/* [0x000019a0] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif -+/* [0x000019a8] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1 ++/* [0x00001868] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00001870] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, 
unif ++/* [0x00001878] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 ++/* [0x00001880] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif ++/* [0x00001888] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif ++/* [0x00001890] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next ++/* [0x00001898] */ 0x9209a1f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a ++/* [0x000018a0] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x000018a8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000018b0] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif ++/* [0x000018b8] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height ++/* [0x000018c0] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif ++/* [0x000018c8] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x000018d0] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif ++/* [0x000018d8] */ 0x918083f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif ++/* [0x000018e0] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val ++/* [0x000018e8] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c ++/* [0x000018f0] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 ++/* [0x000018f8] */ 0x5158a3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 ++/* [0x00001900] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add ++/* [0x00001908] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 ++/* [0x00001910] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif +// :1 -+/* [0x000019b0] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; 
v8adds r5rep, r5, ra_k1 ; ldtmu0 -+/* [0x000019b8] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next -+/* [0x000019c0] */ 0x8e4505f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y -+/* [0x000019c8] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next -+/* [0x000019d0] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 -+/* [0x000019d8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 -+/* [0x000019e0] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2 -+/* [0x000019e8] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch -+/* [0x000019f0] */ 0x8c616c87, 0x10024e20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask -+/* [0x000019f8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 -+/* [0x00001a00] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00001a08] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00001a10] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00001a18] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00001a20] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1 -+/* [0x00001a28] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6 -+/* [0x00001a30] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00001a38] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10 -+/* [0x00001a40] */ 0x4d108437, 0x100248a0, // sub r2, r2, r0 ; mul24 r0, ra4, rb8 -+/* [0x00001a48] */ 0x0f9c25c0, 0xd00201e7, // asr ra7, r2, v_bit_depth - 8 -+/* [0x00001a50] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9 -+/* [0x00001a58] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 -+/* [0x00001a60] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 
-+/* [0x00001a68] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 -+/* [0x00001a70] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x00001a78] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 -+/* [0x00001a80] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height -+/* [0x00001a88] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 -+/* [0x00001a90] */ 0xffffff00, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00001a98] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3 -+/* [0x00001aa0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x00001aa8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x00001ab0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 -+/* [0x00001ab8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x00001ac0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x00001ac8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest -+/* [0x00001ad0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x00001ad8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00001ae0] */ 0xfffffeb0, 0xf0f809e7, // brr -, r:1b -+/* [0x00001ae8] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 -+/* [0x00001af0] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 -+/* [0x00001af8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++/* [0x00001918] */ 0x8d151bf6, 0xa00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0 ++/* [0x00001920] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next ++/* [0x00001928] */ 0x8e4505f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++/* [0x00001930] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next ++/* [0x00001938] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++/* [0x00001940] */ 
0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++/* [0x00001948] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2 ++/* [0x00001950] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch ++/* [0x00001958] */ 0x8c618cc7, 0x10024e20, // add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask ++/* [0x00001960] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 ++/* [0x00001968] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 ++/* [0x00001970] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001978] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00001980] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00001988] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001990] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001998] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10 ++/* [0x000019a0] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b ++/* [0x000019a8] */ 0x8f1c25f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 ++/* [0x000019b0] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a ++/* [0x000019b8] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x000019c0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x000019c8] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height ++/* [0x000019d0] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x000019d8] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add ++/* [0x000019e0] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 ++/* [0x000019e8] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b ++/* 
[0x000019f0] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 ++/* [0x000019f8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001a00] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001a08] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x00001a10] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001a18] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00001a20] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x00001a28] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001a30] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b ++/* [0x00001a38] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001a40] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00001a48] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init +// ::mc_filter_c10_p_l1 -+/* [0x00001b00] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif -+/* [0x00001b08] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif -+/* [0x00001b10] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 -+/* [0x00001b18] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif -+/* [0x00001b20] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif -+/* [0x00001b28] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next -+/* [0x00001b30] */ 0x920991f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a -+/* [0x00001b38] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul -+/* [0x00001b40] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00001b48] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif -+/* [0x00001b50] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height -+/* 
[0x00001b58] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif -+/* [0x00001b60] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height -+/* [0x00001b68] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif -+/* [0x00001b70] */ 0x910c83f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a -+/* [0x00001b78] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b -+/* [0x00001b80] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c -+/* [0x00001b88] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 -+/* [0x00001b90] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y -+/* [0x00001b98] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d -+/* [0x00001ba0] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif -+/* [0x00001ba8] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1 ++/* [0x00001a50] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00001a58] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00001a60] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 ++/* [0x00001a68] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif ++/* [0x00001a70] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif ++/* [0x00001a78] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next ++/* [0x00001a80] */ 0x9209a1f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a ++/* [0x00001a88] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00001a90] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001a98] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif ++/* [0x00001aa0] */ 0x8c427636, 0x120254e1, // add 
vrx_base_next, r3, r0 ; mov r1, ra_height ++/* [0x00001aa8] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif ++/* [0x00001ab0] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x00001ab8] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif ++/* [0x00001ac0] */ 0x918083f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif ++/* [0x00001ac8] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val ++/* [0x00001ad0] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c ++/* [0x00001ad8] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 ++/* [0x00001ae0] */ 0x5158a3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 ++/* [0x00001ae8] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add ++/* [0x00001af0] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 ++/* [0x00001af8] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif +// :1 -+/* [0x00001bb0] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 -+/* [0x00001bb8] */ 0x8e5539bf, 0x12029899, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next -+/* [0x00001bc0] */ 0x8e4505f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y -+/* [0x00001bc8] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next -+/* [0x00001bd0] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 -+/* [0x00001bd8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 -+/* [0x00001be0] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2 -+/* [0x00001be8] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch -+/* [0x00001bf0] */ 0x8c656c87, 0x10024f20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask -+/* [0x00001bf8] */ 
0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 -+/* [0x00001c00] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00001c08] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00001c10] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00001c18] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00001c20] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1 -+/* [0x00001c28] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6 -+/* [0x00001c30] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00001c38] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10 -+/* [0x00001c40] */ 0x4d108437, 0x100248a0, // sub r2, r2, r0 ; mul24 r0, ra4, rb8 -+/* [0x00001c48] */ 0x0f9c25c0, 0xd00201e7, // asr ra7, r2, v_bit_depth - 8 -+/* [0x00001c50] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9 -+/* [0x00001c58] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 -+/* [0x00001c60] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 -+/* [0x00001c68] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 -+/* [0x00001c70] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x00001c78] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 -+/* [0x00001c80] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height -+/* [0x00001c88] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 -+/* [0x00001c90] */ 0xffffff00, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00001c98] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3 -+/* [0x00001ca0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x00001ca8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x00001cb0] */ 0x959da03f, 0x10126431, // mov.setf 
ra_height, r0 ; mov vw_setup, rb_dma0 -+/* [0x00001cb8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x00001cc0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x00001cc8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest -+/* [0x00001cd0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x00001cd8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00001ce0] */ 0xfffffeb0, 0xf0f809e7, // brr -, r:1b -+/* [0x00001ce8] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 -+/* [0x00001cf0] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 -+/* [0x00001cf8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++/* [0x00001b00] */ 0x8d151bf6, 0xb00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu1 ++/* [0x00001b08] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next ++/* [0x00001b10] */ 0x8e4505f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++/* [0x00001b18] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next ++/* [0x00001b20] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++/* [0x00001b28] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++/* [0x00001b30] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2 ++/* [0x00001b38] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch ++/* [0x00001b40] */ 0x8c5cfec6, 0x12024f20, // add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax ++/* [0x00001b48] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 ++/* [0x00001b50] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 ++/* [0x00001b58] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001b60] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00001b68] */ 0x4c03c4f0, 0xdc0248a3, 
// add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00001b70] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001b78] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001b80] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10 ++/* [0x00001b88] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b ++/* [0x00001b90] */ 0x8f1c25f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 ++/* [0x00001b98] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a ++/* [0x00001ba0] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x00001ba8] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x00001bb0] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height ++/* [0x00001bb8] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00001bc0] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add ++/* [0x00001bc8] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 ++/* [0x00001bd0] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001bd8] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 ++/* [0x00001be0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001be8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001bf0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x00001bf8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001c00] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00001c08] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x00001c10] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001c18] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b ++/* [0x00001c20] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* 
[0x00001c28] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00001c30] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init +// ::mc_filter_c10_b -+/* [0x00001d00] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif -+/* [0x00001d08] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif -+/* [0x00001d10] */ 0xf1082dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 -+/* [0x00001d18] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a -+/* [0x00001d20] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif -+/* [0x00001d28] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next -+/* [0x00001d30] */ 0x928191f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif -+/* [0x00001d38] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif -+/* [0x00001d40] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul -+/* [0x00001d48] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00001d50] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height -+/* [0x00001d58] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next -+/* [0x00001d60] */ 0x8d818eb6, 0x10125756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif -+/* [0x00001d68] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height -+/* [0x00001d70] */ 0x8c8033f6, 0xd0139496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif -+/* [0x00001d78] */ 0x918083f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif -+/* [0x00001d80] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif -+/* [0x00001d88] */ 0x910cf1f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a -+/* [0x00001d90] */ 0x8c81b1f6, 0x10025681, // add rb_dma0, r0, rb_dma0_base ; mov ra1, unif -+/* [0x00001d98] */ 0x110c2dc0, 
0xd4020827, // shl r0, ra3.16b, v_x_shift -+/* [0x00001da0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif -+/* [0x00001da8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif -+/* [0x00001db0] */ 0x930e7176, 0x18024808, // max r0, r0, r5 ; mov rb8, ra3.8a -+/* [0x00001db8] */ 0x920d91f6, 0x1a024809, // min r0, r0, rb_max_x ; mov rb9, ra3.8b -+/* [0x00001dc0] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif -+/* [0x00001dc8] */ 0x940e7076, 0x1c02484a, // and r1, r0, r1 ; mov rb10, ra3.8c -+/* [0x00001dd0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00001dd8] */ 0x8c827076, 0x10024817, // add r0, r0, r1 ; mov rb_dest, unif -+/* [0x00001de0] */ 0x0c9e7600, 0x100214e7, // add rb_base2_next, r3, r0 -+/* [0x00001de8] */ 0x950deff6, 0x1e02424b, // mov ra9, rb_max_y ; mov rb11, ra3.8d -+/* [0x00001df0] */ 0x1148ddc0, 0x14020867, // shl r1, ra_wt_off_l1, rb_wt_den_p15 -+/* [0x00001df8] */ 0x8f8093f6, 0xd002531e, // asr rb_wt_off, r1, 9 ; mov ra_link, unif ++/* [0x00001c38] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00001c40] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00001c48] */ 0xf1082dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 ++/* [0x00001c50] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a ++/* [0x00001c58] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif ++/* [0x00001c60] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++/* [0x00001c68] */ 0x9281a1f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif ++/* [0x00001c70] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif ++/* [0x00001c78] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00001c80] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* 
[0x00001c88] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height ++/* [0x00001c90] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00001c98] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif ++/* [0x00001ca0] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x00001ca8] */ 0x8c59f3f6, 0xd4139496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0 ++/* [0x00001cb0] */ 0x918083f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif ++/* [0x00001cb8] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif ++/* [0x00001cc0] */ 0x910cf1f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a ++/* [0x00001cc8] */ 0x8c0db1f6, 0x140246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b ++/* [0x00001cd0] */ 0x918021f6, 0xd0025801, // shl r0, r0, v_x_shift ; mov ra1, unif ++/* [0x00001cd8] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif ++/* [0x00001ce0] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif ++/* [0x00001ce8] */ 0x939de17f, 0x10025809, // max r0, r0, r5 ; mov ra9, rb_max_y ++/* [0x00001cf0] */ 0x9265a1f6, 0x14024822, // min r0, r0, rb_max_x ; mov r2, ra_kmul_add ++/* [0x00001cf8] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif ++/* [0x00001d00] */ 0x949dc07f, 0xd0024865, // and r1, r0, r1 ; mov r5rep, -4 ++/* [0x00001d08] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001d10] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif ++/* [0x00001d18] */ 0x8c667636, 0x140254e0, // add rb_base2_next, r3, r0 ; mov r0, ra_fir_off_val ++/* [0x00001d20] */ 0x4c5a7c86, 0x121245a1, // add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0 ++/* [0x00001d28] */ 0x4c4a7c86, 0x121244a0, // add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1 ++/* 
[0x00001d30] */ 0x8c4a7076, 0x14024821, // add r0, r0, r1 ; mov r1, ra_wt_off_l1 ++/* [0x00001d38] */ 0x910cb3f6, 0xde02484b, // shl r1, r1, i_wt_den_p6 ; mov rb11, ra3.8d ++/* [0x00001d40] */ 0x8d827236, 0x1002531e, // sub rb_wt_off, r1, r0 ; mov ra_link, unif ++/* [0x00001d48] */ 0x95080ff6, 0x1e024287, // mov ra10, rb_xshift2 ; mov rb7, ra2.8d +// :1 -+/* [0x00001e00] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 -+/* [0x00001e08] */ 0x8e5539bf, 0x12029899, // shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next -+/* [0x00001e10] */ 0x8e4d05f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next -+/* [0x00001e18] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next -+/* [0x00001e20] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y -+/* [0x00001e28] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 -+/* [0x00001e30] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 -+/* [0x00001e38] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch -+/* [0x00001e40] */ 0x8c616cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask -+/* [0x00001e48] */ 0x95145ff6, 0x10025104, // mov rb4, rb5 ; mov ra4, ra5 -+/* [0x00001e50] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 -+/* [0x00001e58] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00001e60] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00001e68] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00001e70] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00001e78] */ 0x4c0274f1, 0x1e0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d, r1 -+/* [0x00001e80] */ 0x8d9c64ff, 0xb0024885, // sub r2, r2, r3 ; mov rb5, rb6 ; ldtmu1 -+/* [0x00001e88] */ 
0x0f9c25c0, 0xd00200e7, // asr ra3, r2, (v_bit_depth - 8) -+/* [0x00001e90] */ 0x8e1809f6, 0x10025885, // shr r2, r4, rb_xshift2 ; mov ra5, ra6 -+/* [0x00001e98] */ 0x8e4505f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2 -+/* [0x00001ea0] */ 0x8c5077bf, 0x1a124446, // add ra_y2, r3, ra_k1 ; mov rb6, rb7 -+/* [0x00001ea8] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 -+/* [0x00001eb0] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 -+/* [0x00001eb8] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch -+/* [0x00001ec0] */ 0x8c656cc7, 0x10024f20, // add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask -+/* [0x00001ec8] */ 0x540563f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0 -+/* [0x00001ed0] */ 0x4007e030, 0xda0049e2, // nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00001ed8] */ 0x40074031, 0xda0109e2, // nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00001ee0] */ 0x4d07c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00001ee8] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00001ef0] */ 0x4d044bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1 -+/* [0x00001ef8] */ 0x4c0854fe, 0x1a0248a1, // add r2, r2, r3 ; mul24 r1, rb5, ra2.8b -+/* [0x00001f00] */ 0xfffffee0, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00001f08] */ 0x551cadb7, 0x100241a3, // mov ra6, ra7 ; mul24 r3, ra7, rb10 -+/* [0x00001f10] */ 0x4d08443e, 0x180248a0, // sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a -+/* [0x00001f18] */ 0x8f0c25f6, 0xd00241c7, // asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3 -+/* [0x00001f20] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c -+/* [0x00001f28] */ 0x4c08723e, 0x1e024860, // add r1, r1, r0 ; mul24 r0, rb7, ra2.8d -+/* [0x00001f30] */ 0x4d108237, 0x100248a0, // sub r2, r1, r0 ; mul24 r0, ra4, rb8 -+/* [0x00001f38] */ 0x4d149637, 0x10024860, // 
sub r1, r3, r0 ; mul24 r0, ra5, rb9 -+/* [0x00001f40] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 -+/* [0x00001f48] */ 0x4d527216, 0x12024862, // sub r1, r1, r0 ; mul24 r2, r2, ra_k256 -+/* [0x00001f50] */ 0x4f50e5ce, 0xd20248a1, // asr r2, r2, 14 ; mul24 r1, r1, ra_k256 -+/* [0x00001f58] */ 0x4f58e3d6, 0xd2024862, // asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0 -+/* [0x00001f60] */ 0x4c48c5ce, 0x120248a1, // add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1 -+/* [0x00001f68] */ 0x8c5e72b6, 0x1c024863, // add r1, r1, r2 ; mov r3, ra_blk_height -+/* [0x00001f70] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 -+/* [0x00001f78] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00001f80] */ 0xef40d3f3, 0x12024860, // asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3 -+/* [0x00001f88] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x00001f90] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x00001f98] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 -+/* [0x00001fa0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x00001fa8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x00001fb0] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest -+/* [0x00001fb8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x00001fc0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00001fc8] */ 0xfffffe18, 0xf0f809e7, // brr -, r:1b -+/* [0x00001fd0] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 -+/* [0x00001fd8] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 -+/* [0x00001fe0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++/* [0x00001d50] */ 0x0d9d1bc0, 0xa00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu0 ++/* [0x00001d58] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, ra_xshift ; mov.ifz rb_base2, rb_base2_next ++/* [0x00001d60] */ 
0x8e4d05f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next ++/* [0x00001d68] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next ++/* [0x00001d70] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y ++/* [0x00001d78] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 ++/* [0x00001d80] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 ++/* [0x00001d88] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++/* [0x00001d90] */ 0x8c618cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask ++/* [0x00001d98] */ 0x540183f0, 0x18024862, // and r1, r1, rb_pmask ; mul24 r2, ra0.8a, r0 ++/* [0x00001da0] */ 0x4d01feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra0.8d, r1 ++/* [0x00001da8] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001db0] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00001db8] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00001dc0] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001dc8] */ 0x4c0854fe, 0xb8025804, // add r0, r2, r3 ; mul24 ra4, rb5, ra2.8a ; ldtmu1 ++/* [0x00001dd0] */ 0x8e2869bf, 0x10024885, // shr r2, r4, ra10 ; mov rb5, rb6 ++/* [0x00001dd8] */ 0x8e4505f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2 ++/* [0x00001de0] */ 0x8e1c21f6, 0xd00241c6, // shr ra7, r0, v_bit_depth - 8 ; mov rb6, ra7 ++/* [0x00001de8] */ 0x8c531789, 0xda124460, // add ra_y2, r3, ra_k1 ; mov r0, r1 << 15 ++/* [0x00001df0] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++/* [0x00001df8] */ 0x925de7ce, 0x120248e1, // min r3, r3, rb_max_y ; v8min r1, r1, ra_pmax ++/* [0x00001e00] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++/* [0x00001e08] */ 0x8c5cfec6, 
0x12024f20, // add t1s, rb_base2, r3 ; v8min r0, r0, ra_pmax ++/* [0x00001e10] */ 0x4c041bf0, 0xd8025962, // add r5rep, r5, 1 ; mul24 r2, ra1.8a, r0 ++/* [0x00001e18] */ 0x4d05feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra1.8d, r1 ++/* [0x00001e20] */ 0x4d07e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001e28] */ 0x40074031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00001e30] */ 0x4c07c6b0, 0xdc0248a3, // add r2, r3, r2 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00001e38] */ 0x4c072b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001e40] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001e48] */ 0x4c0c94fe, 0x180248a0, // add r2, r2, r3 ; mul24 r0, rb9, ra3.8a ++/* [0x00001e50] */ 0x550caffe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra3.8b ++/* [0x00001e58] */ 0x8e2c25f6, 0xd00242ca, // shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 ++/* [0x00001e60] */ 0x4d08523e, 0x1a0248a1, // sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b ++/* [0x00001e68] */ 0x8d112bf6, 0x100269e0, // sub.setf -, r5, rb_lcount ; mov r0, ra4 ++/* [0x00001e70] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++/* [0x00001e78] */ 0x4c1c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb7 ++/* [0x00001e80] */ 0x4d0ca23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c ++/* [0x00001e88] */ 0x4c2cb437, 0x100248a0, // add r2, r2, r0 ; mul24 r0, ra11, rb11 ++/* [0x00001e90] */ 0x0d9e7400, 0x100208a7, // sub r2, r2, r0 ++/* [0x00001e98] */ 0x0e9c63c0, 0xd0020867, // shr r1, r1, 6 ++/* [0x00001ea0] */ 0x4e5865ce, 0xd20248a0, // shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00001ea8] */ 0x4c4a7456, 0x120248a1, // add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1 ++/* [0x00001eb0] */ 0x4c667216, 0x14024862, // add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add ++/* [0x00001eb8] */ 0x8d5e72b6, 0x1c024863, // 
sub r1, r1, r2 ; mov r3, ra_blk_height ++/* [0x00001ec0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00001ec8] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001ed0] */ 0x0f667380, 0x18020867, // asr r1, r1, ra_wt_den_p7 ++/* [0x00001ed8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001ee0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001ee8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x00001ef0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001ef8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00001f00] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x00001f08] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001f10] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b ++/* [0x00001f18] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001f20] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00001f28] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init +// ::mc_sync10_q0 -+/* [0x00001fe8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00001ff0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00001ff8] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002000] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002008] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002010] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00002018] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002020] */ 0x00000001, 0xe80009e7, // mov dst, srel(i) -+/* [0x00002028] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i) ++/* [0x00001f30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00001f38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00001f40] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00001f48] */ 0x00000010, 
0xe80009e7, // mov dst, sacq(i) ++/* [0x00001f50] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00001f58] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00001f60] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00001f68] */ 0x00000001, 0xe80009e7, // mov dst, srel(i) ++/* [0x00001f70] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync10_q1 -+/* [0x00002030] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00002038] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00002040] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00002048] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) -+/* [0x00002050] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002058] */ 0x00000002, 0xe80009e7, // mov dst, srel(i) ++/* [0x00001f78] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00001f80] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00001f88] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00001f90] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x00001f98] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00001fa0] */ 0x00000002, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync10_q2 -+/* [0x00002060] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00002068] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00002070] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00002078] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) -+/* [0x00002080] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002088] */ 0x00000003, 0xe80009e7, // mov dst, srel(i) ++/* [0x00001fa8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00001fb0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00001fb8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00001fc0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x00001fc8] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00001fd0] */ 0x00000003, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync10_q3 -+/* [0x00002090] */ 0x15827d80, 
0x100207a7, // mov ra_link, unif -+/* [0x00002098] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x000020a0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x000020a8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) -+/* [0x000020b0] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i) -+/* [0x000020b8] */ 0x009e7000, 0x100009e7, // nop ++/* [0x00001fd8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00001fe0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00001fe8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00001ff0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x00001ff8] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002000] */ 0x009e7000, 0x100009e7, // nop +// ::mc_sync10_q4 -+/* [0x000020c0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x000020c8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x000020d0] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) -+/* [0x000020d8] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) -+/* [0x000020e0] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) -+/* [0x000020e8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x000020f0] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i) -+/* [0x000020f8] */ 0x00000005, 0xe80009e7, // mov dst, srel(i) -+/* [0x00002100] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002008] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002010] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002018] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002020] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002028] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002030] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002038] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002040] */ 0x00000005, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002048] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync10_q5 -+/* [0x00002108] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* 
[0x00002110] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00002118] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00002120] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) -+/* [0x00002128] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002130] */ 0x00000006, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002050] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002058] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002060] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002068] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002070] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002078] */ 0x00000006, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync10_q6 -+/* [0x00002138] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00002140] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00002148] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00002150] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) -+/* [0x00002158] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002160] */ 0x00000007, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002080] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002088] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002090] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002098] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x000020a0] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000020a8] */ 0x00000007, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync10_q7 -+/* [0x00002168] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00002170] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00002178] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00002180] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) -+/* [0x00002188] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002190] */ 0x009e7000, 0x100009e7, // nop ++/* [0x000020b0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000020b8] */ 0x159f2fc0, 0x100009e7, 
// mov -, vw_wait ++/* [0x000020c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000020c8] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x000020d0] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000020d8] */ 0x009e7000, 0x100009e7, // nop +// ::mc_sync10_q8 -+/* [0x00002198] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x000021a0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x000021a8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) -+/* [0x000021b0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) -+/* [0x000021b8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) -+/* [0x000021c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x000021c8] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i) -+/* [0x000021d0] */ 0x00000009, 0xe80009e7, // mov dst, srel(i) -+/* [0x000021d8] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++/* [0x000020e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000020e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000020f0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000020f8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002100] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002108] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002110] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002118] */ 0x00000009, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002120] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync10_q9 -+/* [0x000021e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x000021e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x000021f0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x000021f8] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) -+/* [0x00002200] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002208] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002128] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002130] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002138] 
*/ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002140] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002148] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002150] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync10_q10 -+/* [0x00002210] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00002218] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00002220] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00002228] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) -+/* [0x00002230] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002238] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002158] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002160] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002168] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002170] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002178] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002180] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync10_q11 -+/* [0x00002240] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00002248] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00002250] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00002258] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) -+/* [0x00002260] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002268] */ 0x009e7000, 0x100009e7, // nop ++/* [0x00002188] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002190] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002198] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000021a0] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x000021a8] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000021b0] */ 0x009e7000, 0x100009e7, // nop +// ::mc_exit_c10_q0 +// ::mc_exit_y10_q0 -+/* [0x00002270] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1 ++/* [0x000021b8] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1 +// :1 
-+/* [0x00002278] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b -+/* [0x00002280] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 -+/* [0x00002288] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 -+/* [0x00002290] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 -+/* [0x00002298] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x000022a0] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) -+/* [0x000022a8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x000022b0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1 -+/* [0x000022b8] */ 0x009e7000, 0x100009e7, // nop ++/* [0x000021c0] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x000021c8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x000021d0] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 ++/* [0x000021d8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x000021e0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000021e8] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000021f0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x000021f8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1 ++/* [0x00002200] */ 0x009e7000, 0x100009e7, // nop +// ::mc_exit_c10_qn +// ::mc_exit_y10_qn -+/* [0x000022c0] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1 ++/* [0x00002208] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1 +// :1 -+/* [0x000022c8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b -+/* [0x000022d0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 -+/* [0x000022d8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 -+/* [0x000022e0] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 -+/* [0x000022e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x000022f0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x000022f8] */ 0x009e7000, 0x100009e7, // nop -+/* [0x00002300] */ 0x009e7000, 0x100009e7, // nop ++/* [0x00002210] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x00002218] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* 
[0x00002220] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 ++/* [0x00002228] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00002230] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002238] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x00002240] */ 0x009e7000, 0x100009e7, // nop ++/* [0x00002248] */ 0x009e7000, 0x100009e7, // nop +// ::mc_setup_y10_q0 -+/* [0x00002308] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002250] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) +// ::mc_setup_y10_qn -+/* [0x00002310] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif -+/* [0x00002318] */ 0x15827d80, 0x10020267, // mov ra9, unif -+/* [0x00002320] */ 0x15827d80, 0x10020067, // mov ra1, unif -+/* [0x00002328] */ 0x15827d80, 0x100202e7, // mov ra11, unif -+/* [0x00002330] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] -+/* [0x00002338] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30 -+/* [0x00002340] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100 -+/* [0x00002348] */ 0x0000ffff, 0xe00215a7, // mov rb_pmask, v_pmask -+/* [0x00002350] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) -+/* [0x00002358] */ 0x15827d80, 0x100200e7, // mov ra3, unif -+/* [0x00002360] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif -+/* [0x00002368] */ 0x0d0c1dc0, 0xd4020827, // sub r0, ra3.16b, 1 -+/* [0x00002370] */ 0x119c11c0, 0xd0021667, // shl rb_max_x, r0, v_x_shift -+/* [0x00002378] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 -+/* [0x00002380] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif -+/* [0x00002388] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) -+/* [0x00002390] */ 0x159d03c0, 0x10021627, // or rb_dma1_base, r1, rb_pitch -+/* [0x00002398] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num -+/* [0x000023a0] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 -+/* [0x000023a8] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift 
-+/* [0x000023b0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x000023b8] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x000023c0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x000023c8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 -+/* [0x000023d0] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch -+/* [0x000023d8] */ 0x149e7080, 0x10020867, // and r1, r0, r2 -+/* [0x000023e0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x000023e8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x000023f0] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 -+/* [0x000023f8] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 -+/* [0x00002400] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift -+/* [0x00002408] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00002410] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00002418] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 -+/* [0x00002420] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00002428] */ 0x149e7080, 0x10020867, // and r1, r0, r2 -+/* [0x00002430] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00002438] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00002440] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0 -+/* [0x00002448] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a -+/* [0x00002450] */ 0x95044ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a ++/* [0x00002258] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif ++/* [0x00002260] */ 0x15827d80, 0x10020267, // mov ra9, unif ++/* [0x00002268] */ 0x15827d80, 0x10020067, // mov ra1, unif ++/* [0x00002270] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++/* [0x00002278] */ 0x9181e1f6, 0xd00250cb, // shl rb_ef, r0, i_shift30 ; mov ra11, unif ++/* [0x00002280] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100 ++/* [0x00002288] */ 0x0000ffff, 0xe0021627, // mov 
rb_pmask, v_pmask ++/* [0x00002290] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++/* [0x00002298] */ 0x00010000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) ++/* [0x000022a0] */ 0x4000000c, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) ++/* [0x000022a8] */ 0x050b0a00, 0xe0021567, // mov rb_y_coeffs_2, 0x050b0a00 ++/* [0x000022b0] */ 0x11283a40, 0xe00215a7, // mov rb_y_coeffs_3, 0x11283a40 ++/* [0x000022b8] */ 0x0a0b0500, 0xe00215e7, // mov rb_y_coeffs_5, 0x0a0b0500 ++/* [0x000022c0] */ 0x15827d80, 0x100200e7, // mov ra3, unif ++/* [0x000022c8] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif ++/* [0x000022d0] */ 0x0d0c1dc0, 0xd4020827, // sub r0, ra3.16b, 1 ++/* [0x000022d8] */ 0x119c11c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift ++/* [0x000022e0] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 ++/* [0x000022e8] */ 0x959a0dbf, 0x100248d0, // mov r3, elem_num ; mov rb_pitch, unif ++/* [0x000022f0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x000022f8] */ 0x159d03c0, 0x10021667, // or rb_dma1_base, r1, rb_pitch ++/* [0x00002300] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 ++/* [0x00002308] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002310] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00002318] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00002320] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00002328] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 ++/* [0x00002330] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch ++/* [0x00002338] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00002340] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00002348] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00002350] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 ++/* [0x00002358] */ 
0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x00002360] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002368] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00002370] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00002378] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00002380] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00002388] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00002390] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00002398] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x000023a0] */ 0x0c2e7c00, 0x100213e7, // add rb_base2, ra11, r0 ++/* [0x000023a8] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a ++/* [0x000023b0] */ 0x95043ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a +// :1 -+/* [0x00002458] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 -+/* [0x00002460] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 -+/* [0x00002468] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x00002470] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch -+/* [0x00002478] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 -+/* [0x00002480] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 -+/* [0x00002488] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b -+/* [0x00002490] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x00002498] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch -+/* [0x000024a0] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2 -+/* [0x000024a8] */ 0x0c80ddc0, 0xd0021367, // add rb_wt_den_p15, unif, 23 - v_bit_depth -+/* [0x000024b0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num -+/* [0x000024b8] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 -+/* [0x000024c0] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4 -+/* [0x000024c8] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 -+/* [0x000024d0] */ 0x159e7040, 0x10020827, // or r0, r0, r1 -+/* [0x000024d8] 
*/ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) -+/* [0x000024e0] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 -+/* [0x000024e8] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) -+/* [0x000024f0] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 -+/* [0x000024f8] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 -+/* [0x00002500] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00002508] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 -+/* [0x00002510] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00002518] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 -+/* [0x00002520] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 -+/* [0x00002528] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 ++/* [0x000023b8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x000023c0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 ++/* [0x000023c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x000023d0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x000023d8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 ++/* [0x000023e0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 ++/* [0x000023e8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x000023f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x000023f8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00002400] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2 ++/* [0x00002408] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x00002410] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 ++/* [0x00002418] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4 ++/* [0x00002420] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 ++/* [0x00002428] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x00002430] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) ++/* [0x00002438] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x00002440] 
*/ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) ++/* [0x00002448] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 ++/* [0x00002450] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00002458] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002460] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 ++/* [0x00002468] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002470] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 ++/* [0x00002478] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 ++/* [0x00002480] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 +// :per_block_setup_10 -+/* [0x00002530] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift -+/* [0x00002538] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next -+/* [0x00002540] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00002548] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x00002550] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00002558] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif -+/* [0x00002560] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a -+/* [0x00002568] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00002570] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif -+/* [0x00002578] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0 -+/* [0x00002580] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 -+/* [0x00002588] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift -+/* [0x00002590] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a -+/* [0x00002598] */ 0x928191f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif -+/* [0x000025a0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 -+/* [0x000025a8] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif -+/* [0x000025b0] */ 0x949dc0bf, 0x10024871, // and r1, r0, 
r2 ; mov vw_setup, rb_vpm_init -+/* [0x000025b8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x000025c0] */ 0x4c402077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul -+/* [0x000025c8] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0 -+/* [0x000025d0] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height -+/* [0x000025d8] */ 0x8c5c31c6, 0xdc025460, // add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height -+/* [0x000025e0] */ 0x0c9c71c0, 0xd00214a7, // add rb_lcount, r0, 7 -+/* [0x000025e8] */ 0x119c81c0, 0xd0020827, // shl r0, r0, v_dma_h_shift -+/* [0x000025f0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x000025f8] */ 0x119cf1c0, 0xd0020827, // shl r0, r0, v_dma_wh_shift -+/* [0x00002600] */ 0x8c81b1f6, 0x100256a0, // add rb_dma0, r0, rb_dma0_base ; mov r0, unif -+/* [0x00002608] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif -+/* [0x00002610] */ 0x915031f6, 0xde024223, // shl ra8, r0, 3 ; mov r3, ra_k255 -+/* [0x00002618] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 -+/* [0x00002620] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d -+/* [0x00002628] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c -+/* [0x00002630] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400 -+/* [0x00002638] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d -+/* [0x00002640] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c -+/* [0x00002648] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00 -+/* [0x00002650] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d -+/* [0x00002658] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c -+/* [0x00002660] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40 -+/* [0x00002668] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d -+/* [0x00002670] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c -+/* [0x00002678] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 -+/* [0x00002680] */ 0x902203bf, 
0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif -+/* [0x00002688] */ 0x90227383, 0x1c424044, // ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3 -+/* [0x00002690] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500 -+/* [0x00002698] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d -+/* [0x000026a0] */ 0x90227383, 0x1c524045, // ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3 -+/* [0x000026a8] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 -+/* [0x000026b0] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d -+/* [0x000026b8] */ 0x90227383, 0x1c624046, // ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3 -+/* [0x000026c0] */ 0x954a0dbf, 0x10084597, // mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif -+/* [0x000026c8] */ 0x01010000, 0xe0020867, // mov r1,0x01010000 -+/* [0x000026d0] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d -+/* [0x000026d8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x000026e0] */ 0x90227383, 0x1c724047, // ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3 -+/* [0x000026e8] */ 0x1158ddc0, 0x14020827, // shl r0, ra_wt_off_l0, rb_wt_den_p15 -+/* [0x000026f0] */ 0x8f8091f6, 0xd002531e, // asr rb_wt_off, r0, 9 ; mov ra_link, unif ++/* [0x00002488] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002490] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++/* [0x00002498] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x000024a0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x000024a8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x000024b0] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif ++/* [0x000024b8] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a ++/* [0x000024c0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000024c8] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif ++/* [0x000024d0] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0 ++/* [0x000024d8] */ 
0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x000024e0] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x000024e8] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a ++/* [0x000024f0] */ 0x9281a1f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif ++/* [0x000024f8] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00002500] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif ++/* [0x00002508] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init ++/* [0x00002510] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00002518] */ 0x4c402077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul ++/* [0x00002520] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0 ++/* [0x00002528] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++/* [0x00002530] */ 0x8c5dc1c6, 0xdc025460, // add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height ++/* [0x00002538] */ 0x0c9df1c0, 0xd00214a7, // add rb_lcount, r0, (7-8) ++/* [0x00002540] */ 0x916481f6, 0xd4024823, // shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add ++/* [0x00002548] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00002550] */ 0x9164f1f6, 0xd4024822, // shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val ++/* [0x00002558] */ 0x8c81b1f6, 0x100246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, unif ++/* [0x00002560] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif ++/* [0x00002568] */ 0x915031f6, 0xde024205, // shl ra8, r0, 3 ; mov rb5, ra_k255 ++/* [0x00002570] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400 ++/* [0x00002578] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d ++/* [0x00002580] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c ++/* [0x00002588] */ 0x10215f80, 0x1e6200a7, // ror ra2.8c, rb_y_coeffs_2, ra8.8d ++/* [0x00002590] */ 0x10215f80, 0x1c620027, // ror ra0.8c, 
rb_y_coeffs_2, ra8.8c ++/* [0x00002598] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 ++/* [0x000025a0] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif ++/* [0x000025a8] */ 0x90205387, 0x1c424004, // ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5 ++/* [0x000025b0] */ 0x914883f6, 0xd0031856, // shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ++/* [0x000025b8] */ 0x902203bf, 0x1e02581c, // ror r0, r1, ra8.8d ; mov ra_dest, unif ++/* [0x000025c0] */ 0x90205387, 0x1c72404b, // ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5 ++/* [0x000025c8] */ 0x10216f80, 0x1e7200a7, // ror ra2.8d, rb_y_coeffs_3, ra8.8d ++/* [0x000025d0] */ 0x10216f80, 0x1c720027, // ror ra0.8d, rb_y_coeffs_3, ra8.8c ++/* [0x000025d8] */ 0x10217f80, 0x1e5200e7, // ror ra3.8b, rb_y_coeffs_5, ra8.8d ++/* [0x000025e0] */ 0x10217f80, 0x1c520067, // ror ra1.8b, rb_y_coeffs_5, ra8.8c ++/* [0x000025e8] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 ++/* [0x000025f0] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d ++/* [0x000025f8] */ 0x902183bf, 0xdc624065, // ror ra1.8c, r1, ra8.8c ; mov r5rep, -8 ++/* [0x00002600] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002608] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 ++/* [0x00002610] */ 0x902203bf, 0x1e02581e, // ror r0, r1, ra8.8d ; mov ra_link, unif ++/* [0x00002618] */ 0x90205387, 0x1c424048, // ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5 +// ::mc_filter_y10_pxx -+/* [0x000026f8] */ 0xfffffe18, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 -+/* [0x00002700] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num -+/* [0x00002708] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 -+/* [0x00002710] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next -+/* [0x00002718] */ 0x11581dc0, 0xd21205a7, // shl ra_wt_mul_l0, ra_wt_mul_l0, 1 ++/* [0x00002620] */ 0xfffffe48, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 ++/* [0x00002628] */ 
0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002630] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 ++/* [0x00002638] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00002640] */ 0x1158adc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p5 ++/* [0x00002648] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 ++/* [0x00002650] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 +// :1 -+/* [0x00002720] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 -+/* [0x00002728] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 -+/* [0x00002730] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch -+/* [0x00002738] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 -+/* [0x00002740] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next -+/* [0x00002748] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00002750] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next -+/* [0x00002758] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 -+/* [0x00002760] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8 -+/* [0x00002768] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+/* [0x00002770] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask -+/* [0x00002778] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9 -+/* [0x00002780] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 -+/* [0x00002788] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x00002790] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x00002798] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, 
ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x000027a0] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x000027a8] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x000027b0] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x000027b8] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x000027c0] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+/* [0x000027c8] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+/* [0x000027d0] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+/* [0x000027d8] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+/* [0x000027e0] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+/* [0x000027e8] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x000027f0] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+/* [0x000027f8] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+/* [0x00002800] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10 -+/* [0x00002808] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a -+/* [0x00002810] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00002818] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b -+/* [0x00002820] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 -+/* [0x00002828] */ 0x8f1c25f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 -+/* [0x00002830] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+/* [0x00002838] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; 
mul24 r0, rb11, ra2.8d -+/* [0x00002840] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 -+/* [0x00002848] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 -+/* [0x00002850] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+/* [0x00002858] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 -+/* [0x00002860] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 -+/* [0x00002868] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 -+/* [0x00002870] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x00002878] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 -+/* [0x00002880] */ 0x8c5cc3f6, 0x1c024863, // add r1, r1, rb_wt_off ; mov r3, ra_blk_height -+/* [0x00002888] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3 -+/* [0x00002890] */ 0xfffffe70, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00002898] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 -+/* [0x000028a0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x000028a8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x000028b0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 -+/* [0x000028b8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x000028c0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x000028c8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest -+/* [0x000028d0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x000028d8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x000028e0] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b -+/* [0x000028e8] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 -+/* [0x000028f0] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 -+/* [0x000028f8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++/* [0x00002658] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, 
ra_ef ; mul24 ra4, rb5, ra_ef ++/* [0x00002660] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0 ++/* [0x00002668] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1 ++/* [0x00002670] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 ++/* [0x00002678] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6 ++/* [0x00002680] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7 ++/* [0x00002688] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 ++/* [0x00002690] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8 ++/* [0x00002698] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax ++/* [0x000026a0] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch ++/* [0x000026a8] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9 ++/* [0x000026b0] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x000026b8] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 ++/* [0x000026c0] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x000026c8] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x000026d0] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x000026d8] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x000026e0] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x000026e8] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x000026f0] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x000026f8] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ 
"mul_used", 0 ++/* [0x00002700] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x00002708] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x00002710] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x00002718] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00002720] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x00002728] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x00002730] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002738] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++/* [0x00002740] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b ++/* [0x00002748] */ 0x8f2c25f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 ++/* [0x00002750] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++/* [0x00002758] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++/* [0x00002760] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8 ++/* [0x00002768] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c ++/* [0x00002770] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11 ++/* [0x00002778] */ 0x8d5d1bf6, 0x1c0269e3, // sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height ++/* [0x00002780] */ 0x8d1133bf, 0x1002884f, // sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next ++/* [0x00002788] */ 0x8d6a7236, 0x10029858, // sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next ++/* [0x00002790] */ 0x8f4c63f6, 0xd0029851, // asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next ++/* [0x00002798] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x000027a0] */ 0x4c64c1ce, 0x14024821, // add r0, r0, 
rb_wt_off ; mul24 r1, r1, ra_kmul_add ++/* [0x000027a8] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 ++/* [0x000027b0] */ 0xfffffe88, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000027b8] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 ++/* [0x000027c0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000027c8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000027d0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x000027d8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000027e0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000027e8] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x000027f0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x000027f8] */ 0xfffffe40, 0xf0f809e7, // brr -, r:1b ++/* [0x00002800] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00002808] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00002810] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init +// ::mc_filter_y10_p00 -+/* [0x00002900] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num -+/* [0x00002908] */ 0x15567d80, 0x14120567, // mov ra_xshift, ra_xshift_next -+/* [0x00002910] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 -+/* [0x00002918] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift -+/* [0x00002920] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00002928] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00002930] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x00002938] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 -+/* [0x00002940] */ 0x8d8105f6, 0x1002589a, // sub r2, r2, rb_pitch ; mov ra_base_next, unif -+/* [0x00002948] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a -+/* [0x00002950] */ 0x569d404f, 0x10024821, // 
xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00002958] */ 0x8c827076, 0x10025810, // add r0, r0, r1 ; mov ra_width_height, unif -+/* [0x00002960] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init -+/* [0x00002968] */ 0x11401dc0, 0xd4020867, // shl r1, ra_width, v_x_shift -+/* [0x00002970] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height -+/* [0x00002978] */ 0x8d5c41c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height -+/* [0x00002980] */ 0x919c81c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 -+/* [0x00002988] */ 0x8c827076, 0x10025816, // add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif -+/* [0x00002990] */ 0x9180f1f6, 0xd0024817, // shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif -+/* [0x00002998] */ 0x0c9db1c0, 0x100216a7, // add rb_dma0, r0, rb_dma0_base -+/* [0x000029a0] */ 0xf158dddb, 0x14024825, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 -+/* [0x000029a8] */ 0x8f8011f6, 0xd002531e, // asr rb_wt_off, r0, 1 ; mov ra_link, unif ++/* [0x00002818] */ 0x959a0ff6, 0x10024020, // mov ra0, unif ; mov r0, elem_num ++/* [0x00002820] */ 0xf5567dad, 0x14124565, // mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5 ++/* [0x00002828] */ 0x8c020c3f, 0x1402581a, // add r0, ra0.16b, r0 ; mov ra_base_next, unif ++/* [0x00002830] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002838] */ 0x93027176, 0x12225813, // max r0, r0, r5 ; mov ra_y_next, ra0.16a ++/* [0x00002840] */ 0x9281a1f6, 0x10025810, // min r0, r0, rb_max_x ; mov ra_width_height, unif ++/* [0x00002848] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00002850] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00002858] */ 0x8d810bf6, 0x10025896, // sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif ++/* [0x00002860] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00002868] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* 
[0x00002870] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif ++/* [0x00002878] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init ++/* [0x00002880] */ 0x11401dc0, 0xd4020867, // shl r1, ra_width, v_x_shift ++/* [0x00002888] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++/* [0x00002890] */ 0x8d5c31c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height ++/* [0x00002898] */ 0x919c81c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 ++/* [0x000028a0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x000028a8] */ 0x1158edc0, 0xd4021327, // shl rb_wt_off, ra_wt_off_l0, DENOM + 7 ++/* [0x000028b0] */ 0x9180f1f6, 0xd002581e, // shl r0, r0, v_dma_wh_shift ; mov ra_link, unif ++/* [0x000028b8] */ 0x0c9db1c0, 0x100206e7, // add ra_dma0, r0, rb_dma0_base +// :1 -+/* [0x000029b0] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 -+/* [0x000029b8] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 -+/* [0x000029c0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch -+/* [0x000029c8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 -+/* [0x000029d0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next -+/* [0x000029d8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x000029e0] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask -+/* [0x000029e8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 -+/* [0x000029f0] */ 0x915cd3f6, 0xdc024863, // shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height -+/* [0x000029f8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 -+/* [0x00002a00] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00002a08] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 -+/* [0x00002a10] */ 0x925f23bf, 0x12020867, // 
min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x00002a18] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x00002a20] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 -+/* [0x00002a28] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x00002a30] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x00002a38] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest -+/* [0x00002a40] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x00002a48] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00002a50] */ 0xffffff40, 0xf0f809e7, // brr -, r:1b -+/* [0x00002a58] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 -+/* [0x00002a60] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 -+/* [0x00002a68] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++/* [0x000028c0] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ++/* [0x000028c8] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 ++/* [0x000028d0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x000028d8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x000028e0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x000028e8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x000028f0] */ 0x8c618c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask ++/* [0x000028f8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 ++/* [0x00002900] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height ++/* [0x00002908] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00002910] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002918] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, DENOM + 8 ++/* [0x00002920] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax 
; mov -, vw_wait ++/* [0x00002928] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00002930] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x00002938] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00002940] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00002948] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x00002950] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00002958] */ 0xffffff48, 0xf0f809e7, // brr -, r:1b ++/* [0x00002960] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00002968] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00002970] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init +// ::mc_filter_y10_bxx -+/* [0x00002a70] */ 0xfffffaa0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 -+/* [0x00002a78] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num -+/* [0x00002a80] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 -+/* [0x00002a88] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00002978] */ 0xfffffaf0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 ++/* [0x00002980] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002988] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 ++/* [0x00002990] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00002998] */ 0x1158bdc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p6 ++/* [0x000029a0] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 ++/* [0x000029a8] */ 0x4d4a7216, 0x12024860, // sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1 ++/* [0x000029b0] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 +// :1 -+/* [0x00002a90] */ 0xcd511bee, 0xba0269e5, 
// sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 -+/* [0x00002a98] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 -+/* [0x00002aa0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch -+/* [0x00002aa8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 -+/* [0x00002ab0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next -+/* [0x00002ab8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00002ac0] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next -+/* [0x00002ac8] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 -+/* [0x00002ad0] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8 -+/* [0x00002ad8] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+/* [0x00002ae0] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask -+/* [0x00002ae8] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9 -+/* [0x00002af0] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 -+/* [0x00002af8] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x00002b00] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x00002b08] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x00002b10] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00002b18] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x00002b20] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x00002b28] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x00002b30] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+/* 
[0x00002b38] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00002b40] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+/* [0x00002b48] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+/* [0x00002b50] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+/* [0x00002b58] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00002b60] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+/* [0x00002b68] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+/* [0x00002b70] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10 -+/* [0x00002b78] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a -+/* [0x00002b80] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00002b88] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b -+/* [0x00002b90] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 -+/* [0x00002b98] */ 0x8f1c25f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 -+/* [0x00002ba0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+/* [0x00002ba8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d -+/* [0x00002bb0] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 -+/* [0x00002bb8] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 -+/* [0x00002bc0] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+/* [0x00002bc8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 -+/* [0x00002bd0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off -+/* [0x00002bd8] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 -+/* [0x00002be0] */ 0x0f9ce3c0, 0xd0020867, // 
asr r1, r1, 14 -+/* [0x00002be8] */ 0x405a700e, 0x120049e0, // nop ; mul24 r0, r1, ra_wt_mul_l0 -+/* [0x00002bf0] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0 -+/* [0x00002bf8] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height -+/* [0x00002c00] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3 -+/* [0x00002c08] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00002c10] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 -+/* [0x00002c18] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x00002c20] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x00002c28] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 -+/* [0x00002c30] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x00002c38] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x00002c40] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest -+/* [0x00002c48] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x00002c50] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00002c58] */ 0xfffffe18, 0xf0f809e7, // brr -, r:1b -+/* [0x00002c60] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 -+/* [0x00002c68] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 -+/* [0x00002c70] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++/* [0x000029b8] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef ++/* [0x000029c0] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0 ++/* [0x000029c8] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1 ++/* [0x000029d0] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 ++/* [0x000029d8] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6 ++/* [0x000029e0] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7 
++/* [0x000029e8] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 ++/* [0x000029f0] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8 ++/* [0x000029f8] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax ++/* [0x00002a00] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch ++/* [0x00002a08] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9 ++/* [0x00002a10] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x00002a18] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 ++/* [0x00002a20] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00002a28] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x00002a30] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00002a38] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x00002a40] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x00002a48] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x00002a50] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00002a58] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00002a60] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x00002a68] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x00002a70] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x00002a78] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00002a80] 
*/ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x00002a88] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x00002a90] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002a98] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++/* [0x00002aa0] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b ++/* [0x00002aa8] */ 0x8f2c25f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 ++/* [0x00002ab0] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++/* [0x00002ab8] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++/* [0x00002ac0] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8 ++/* [0x00002ac8] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c ++/* [0x00002ad0] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11 ++/* [0x00002ad8] */ 0x0d127380, 0x10020867, // sub r1, r1, ra4 ++/* [0x00002ae0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off ++/* [0x00002ae8] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6 ++/* [0x00002af0] */ 0x4d591bce, 0x120269e0, // sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00002af8] */ 0x55653fce, 0x140453e1, // mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add ++/* [0x00002b00] */ 0x8d4e7076, 0x10029851, // sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next ++/* [0x00002b08] */ 0x8d692bf6, 0x1002b9d8, // sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next ++/* [0x00002b10] */ 0x8c9f8289, 0xd0024860, // add r1, r1, r2 ; mov r0, r1 << 8 ++/* [0x00002b18] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height ++/* [0x00002b20] */ 0xfffffe78, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002b28] */ 0x4f65039f, 0x18024862, // asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch ++/* [0x00002b30] */ 0x925f23bf, 0x12020867, // min r1, r1, 
ra_pmax ; mov -, vw_wait ++/* [0x00002b38] */ 0xf34003f3, 0xd2024c20, // max vpm, r1, 0 ; v8subs r0, ra_height, r3 ++/* [0x00002b40] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x00002b48] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00002b50] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00002b58] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x00002b60] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00002b68] */ 0xfffffe30, 0xf0f809e7, // brr -, r:1b ++/* [0x00002b70] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00002b78] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00002b80] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init +// ::mc_filter_y10_b00 -+/* [0x00002c78] */ 0xfffff898, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 -+/* [0x00002c80] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num -+/* [0x00002c88] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 -+/* [0x00002c90] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next -+/* [0x00002c98] */ 0x00000007, 0xe0020827, // mov r0, 7 -+/* [0x00002ca0] */ 0x0d9d1e00, 0x10021467, // sub rb_i_tmu, rb_i_tmu, r0 -+/* [0x00002ca8] */ 0x0d9d2e00, 0x100214a7, // sub rb_lcount, rb_lcount, r0 -+/* [0x00002cb0] */ 0x95588ff6, 0xd0024821, // mov r0, 8 ; mov r1, ra_wt_off_mul_l0 -+/* [0x00002cb8] */ 0x119cce00, 0x10021327, // shl rb_wt_off, rb_wt_off, r0 -+/* [0x00002cc0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 ++/* [0x00002b88] */ 0xfffff8e0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 ++/* [0x00002b90] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002b98] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 ++/* [0x00002ba0] */ 0x8c001cff, 0x14024800, // add r0, 
ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00002ba8] */ 0x00000001, 0xe00208a7, // mov r2, 1 ++/* [0x00002bb0] */ 0x8c591eb6, 0x10025461, // add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0 ++/* [0x00002bb8] */ 0xf158fded, 0xd4025325, // shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5 ++/* [0x00002bc0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 +// :1 -+/* [0x00002cc8] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 -+/* [0x00002cd0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 -+/* [0x00002cd8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch -+/* [0x00002ce0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 -+/* [0x00002ce8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next -+/* [0x00002cf0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00002cf8] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next -+/* [0x00002d00] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 -+/* [0x00002d08] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y -+/* [0x00002d10] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+/* [0x00002d18] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask -+/* [0x00002d20] */ 0x545963c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 -+/* [0x00002d28] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 -+/* [0x00002d30] */ 0x0c9e7040, 0x10020867, // add r1, r0, r1 -+/* [0x00002d38] */ 0x915cc3f6, 0xdc024863, // shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height -+/* [0x00002d40] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 -+/* [0x00002d48] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00002d50] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 -+/* 
[0x00002d58] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x00002d60] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x00002d68] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 -+/* [0x00002d70] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x00002d78] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x00002d80] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest -+/* [0x00002d88] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x00002d90] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00002d98] */ 0xffffff10, 0xf0f809e7, // brr -, r:1b -+/* [0x00002da0] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 -+/* [0x00002da8] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 -+/* [0x00002db0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++/* [0x00002bc8] */ 0x0d9d1bc0, 0xb00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1 ++/* [0x00002bd0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00002bd8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00002be0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00002be8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00002bf0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00002bf8] */ 0x8c613cbf, 0x10028e0f, // add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next ++/* [0x00002c00] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00002c08] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00002c10] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00002c18] */ 0x8c5cfe86, 0x12024f20, // add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax ++/* [0x00002c20] */ 0x545983c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 ++/* 
[0x00002c28] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 ++/* [0x00002c30] */ 0xcc52706e, 0x1a024865, // add r1, r0, r1 ; v8adds r5rep, r5, ra_k1 ++/* [0x00002c38] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height ++/* [0x00002c40] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00002c48] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002c50] */ 0x0f9d03c0, 0xd0020867, // asr r1, r1, (DENOM + 9) - 32 ++/* [0x00002c58] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00002c60] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00002c68] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x00002c70] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00002c78] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00002c80] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x00002c88] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00002c90] */ 0xffffff18, 0xf0f809e7, // brr -, r:1b ++/* [0x00002c98] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00002ca0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00002ca8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init +// ::mc_end +}; +#ifdef __HIGHC__ @@ -24927,7 +24863,7 @@ index 0000000000..fe506c8ad0 +#endif diff --git a/libavcodec/rpi_hevc_shader.h b/libavcodec/rpi_hevc_shader.h new file mode 100644 -index 0000000000..ddb351782d +index 0000000000..79651c9b6c --- /dev/null +++ b/libavcodec/rpi_hevc_shader.h @@ -0,0 +1,63 @@ @@ -24939,67 +24875,73 @@ index 0000000000..ddb351782d +#define mc_setup_c_q0 (ff_hevc_rpi_shader + 0) +#define mc_start (ff_hevc_rpi_shader + 0) +#define mc_setup_c_qn (ff_hevc_rpi_shader + 2) -+#define mc_filter_c_p (ff_hevc_rpi_shader + 142) -+#define mc_filter_c_p_l1 
(ff_hevc_rpi_shader + 272) -+#define mc_filter_c_b (ff_hevc_rpi_shader + 402) -+#define mc_sync_q0 (ff_hevc_rpi_shader + 590) -+#define mc_sync_q1 (ff_hevc_rpi_shader + 608) -+#define mc_sync_q2 (ff_hevc_rpi_shader + 620) -+#define mc_sync_q3 (ff_hevc_rpi_shader + 632) -+#define mc_sync_q4 (ff_hevc_rpi_shader + 644) -+#define mc_sync_q5 (ff_hevc_rpi_shader + 662) -+#define mc_sync_q6 (ff_hevc_rpi_shader + 674) -+#define mc_sync_q7 (ff_hevc_rpi_shader + 686) -+#define mc_sync_q8 (ff_hevc_rpi_shader + 698) -+#define mc_sync_q9 (ff_hevc_rpi_shader + 716) -+#define mc_sync_q10 (ff_hevc_rpi_shader + 728) -+#define mc_sync_q11 (ff_hevc_rpi_shader + 740) -+#define mc_exit_c_qn (ff_hevc_rpi_shader + 752) -+#define mc_exit_y_qn (ff_hevc_rpi_shader + 752) -+#define mc_exit_c_q0 (ff_hevc_rpi_shader + 770) -+#define mc_exit_y_q0 (ff_hevc_rpi_shader + 770) -+#define mc_setup_y_q0 (ff_hevc_rpi_shader + 790) -+#define mc_setup_y_qn (ff_hevc_rpi_shader + 792) -+#define mc_filter_y_pxx (ff_hevc_rpi_shader + 1032) -+#define mc_filter_y_bxx (ff_hevc_rpi_shader + 1162) -+#define mc_filter_y_p00 (ff_hevc_rpi_shader + 1292) -+#define mc_filter_y_b00 (ff_hevc_rpi_shader + 1382) -+#define mc_setup_c10_q0 (ff_hevc_rpi_shader + 1462) -+#define mc_setup_c10_qn (ff_hevc_rpi_shader + 1464) -+#define mc_filter_c10_p (ff_hevc_rpi_shader + 1600) -+#define mc_filter_c10_p_l1 (ff_hevc_rpi_shader + 1728) -+#define mc_filter_c10_b (ff_hevc_rpi_shader + 1856) -+#define mc_sync10_q0 (ff_hevc_rpi_shader + 2042) -+#define mc_sync10_q1 (ff_hevc_rpi_shader + 2060) -+#define mc_sync10_q2 (ff_hevc_rpi_shader + 2072) -+#define mc_sync10_q3 (ff_hevc_rpi_shader + 2084) -+#define mc_sync10_q4 (ff_hevc_rpi_shader + 2096) -+#define mc_sync10_q5 (ff_hevc_rpi_shader + 2114) -+#define mc_sync10_q6 (ff_hevc_rpi_shader + 2126) -+#define mc_sync10_q7 (ff_hevc_rpi_shader + 2138) -+#define mc_sync10_q8 (ff_hevc_rpi_shader + 2150) -+#define mc_sync10_q9 (ff_hevc_rpi_shader + 2168) -+#define mc_sync10_q10 
(ff_hevc_rpi_shader + 2180) -+#define mc_sync10_q11 (ff_hevc_rpi_shader + 2192) -+#define mc_exit_c10_q0 (ff_hevc_rpi_shader + 2204) -+#define mc_exit_y10_q0 (ff_hevc_rpi_shader + 2204) -+#define mc_exit_c10_qn (ff_hevc_rpi_shader + 2224) -+#define mc_exit_y10_qn (ff_hevc_rpi_shader + 2224) -+#define mc_setup_y10_q0 (ff_hevc_rpi_shader + 2242) -+#define mc_setup_y10_qn (ff_hevc_rpi_shader + 2244) -+#define mc_filter_y10_pxx (ff_hevc_rpi_shader + 2494) -+#define mc_filter_y10_p00 (ff_hevc_rpi_shader + 2624) -+#define mc_filter_y10_bxx (ff_hevc_rpi_shader + 2716) -+#define mc_filter_y10_b00 (ff_hevc_rpi_shader + 2846) -+#define mc_end (ff_hevc_rpi_shader + 2926) ++#define mc_filter_c_p (ff_hevc_rpi_shader + 134) ++#define mc_filter_c_p_l1 (ff_hevc_rpi_shader + 260) ++#define mc_filter_c_b (ff_hevc_rpi_shader + 386) ++#define mc_sync_q0 (ff_hevc_rpi_shader + 580) ++#define mc_sync_q1 (ff_hevc_rpi_shader + 598) ++#define mc_sync_q2 (ff_hevc_rpi_shader + 610) ++#define mc_sync_q3 (ff_hevc_rpi_shader + 622) ++#define mc_sync_q4 (ff_hevc_rpi_shader + 634) ++#define mc_sync_q5 (ff_hevc_rpi_shader + 652) ++#define mc_sync_q6 (ff_hevc_rpi_shader + 664) ++#define mc_sync_q7 (ff_hevc_rpi_shader + 676) ++#define mc_sync_q8 (ff_hevc_rpi_shader + 688) ++#define mc_sync_q9 (ff_hevc_rpi_shader + 706) ++#define mc_sync_q10 (ff_hevc_rpi_shader + 718) ++#define mc_sync_q11 (ff_hevc_rpi_shader + 730) ++#define mc_exit_c_qn (ff_hevc_rpi_shader + 742) ++#define mc_exit_y_qn (ff_hevc_rpi_shader + 742) ++#define mc_exit_c_q0 (ff_hevc_rpi_shader + 760) ++#define mc_exit_y_q0 (ff_hevc_rpi_shader + 760) ++#define mc_setup_y_q0 (ff_hevc_rpi_shader + 780) ++#define mc_setup_y_qn (ff_hevc_rpi_shader + 782) ++#define mc_filter_y_pxx (ff_hevc_rpi_shader + 1014) ++#define mc_filter_y_bxx (ff_hevc_rpi_shader + 1140) ++#define mc_filter_y_p00 (ff_hevc_rpi_shader + 1272) ++#define mc_filter_y_b00 (ff_hevc_rpi_shader + 1358) ++#define mc_setup_c10_q0 (ff_hevc_rpi_shader + 1432) ++#define 
mc_setup_c10_qn (ff_hevc_rpi_shader + 1434) ++#define mc_filter_c10_p (ff_hevc_rpi_shader + 1562) ++#define mc_filter_c10_p_l1 (ff_hevc_rpi_shader + 1684) ++#define mc_filter_c10_b (ff_hevc_rpi_shader + 1806) ++#define mc_sync10_q0 (ff_hevc_rpi_shader + 1996) ++#define mc_sync10_q1 (ff_hevc_rpi_shader + 2014) ++#define mc_sync10_q2 (ff_hevc_rpi_shader + 2026) ++#define mc_sync10_q3 (ff_hevc_rpi_shader + 2038) ++#define mc_sync10_q4 (ff_hevc_rpi_shader + 2050) ++#define mc_sync10_q5 (ff_hevc_rpi_shader + 2068) ++#define mc_sync10_q6 (ff_hevc_rpi_shader + 2080) ++#define mc_sync10_q7 (ff_hevc_rpi_shader + 2092) ++#define mc_sync10_q8 (ff_hevc_rpi_shader + 2104) ++#define mc_sync10_q9 (ff_hevc_rpi_shader + 2122) ++#define mc_sync10_q10 (ff_hevc_rpi_shader + 2134) ++#define mc_sync10_q11 (ff_hevc_rpi_shader + 2146) ++#define mc_exit_c10_q0 (ff_hevc_rpi_shader + 2158) ++#define mc_exit_y10_q0 (ff_hevc_rpi_shader + 2158) ++#define mc_exit_c10_qn (ff_hevc_rpi_shader + 2178) ++#define mc_exit_y10_qn (ff_hevc_rpi_shader + 2178) ++#define mc_setup_y10_q0 (ff_hevc_rpi_shader + 2196) ++#define mc_setup_y10_qn (ff_hevc_rpi_shader + 2198) ++#define mc_filter_y10_pxx (ff_hevc_rpi_shader + 2440) ++#define mc_filter_y10_p00 (ff_hevc_rpi_shader + 2566) ++#define mc_filter_y10_bxx (ff_hevc_rpi_shader + 2654) ++#define mc_filter_y10_b00 (ff_hevc_rpi_shader + 2786) ++#define mc_end (ff_hevc_rpi_shader + 2860) + +#endif diff --git a/libavcodec/rpi_hevc_shader.qasm b/libavcodec/rpi_hevc_shader.qasm new file mode 100644 -index 0000000000..f8572cdebe +index 0000000000..77946a0443 --- /dev/null +++ b/libavcodec/rpi_hevc_shader.qasm -@@ -0,0 +1,1741 @@ +@@ -0,0 +1,1821 @@ ++# Inter pred asm ++# ++# Logic here should be good to 14 bits without modification ++# but only 8 & 10 are currently instantiated & tested ++# 15 & 16 bits have different shift1, shift2 calc & I also suspect overflow ++# in _p00 & _b00 + +# The @ "mul_used", 0 annotations that occur by various mul blocks suppress +# the 
warning that we are using rotation & ra/rb registers. r0..3 can be @@ -25007,6 +24949,34 @@ index 0000000000..f8572cdebe +# local 4. As it happens this is what is wanted here as we do not want the +# constants from the other half of the calc. + ++# Number limits in P/B calculation ++# ++# In order to avoid issues with mul24 being an unsigned 24->32 bit multiplier ++# we offset our intermediates s.t. they always end up +ve before the next ++# multiply (may be -ve whilst summing but that doesn't matter). ++# ++# Range calc for up to 14 bits (Y-B pred): ++# ++# denom: [0, 7] ++# bmax = (1 << bits) - 1 ++# off: [-(1 << (bits-1)), (1 << (bits-1)) - 1] ++# ++# wt_mul: [-128, 255] ++# wt_off = off * 2 + 1: [-bmax, bmax] ++# ++# pel: [0, bmax] ++# H-filter: [(-22*pel + 88*pel) >> (bits-8) + 0x4000] = [0x2a00, 0x97ff] ++# V-filter: [(-22*hf + 88*hf) >> 6] = [0x580, 0xc28e] ++# mul_t = (V_L0 + V_l1) * (wt_mul + 128): [0, 0x24624e6] ++# mul_t - (V_l0 + V_l1)* 128: [-0xc28e00, 0x18396e4] ++# adj_wt_off = (wt_off << ((denom + 6) - (bits - 8))) - 0x4000 * (wt_mul * 2): ++# [wt_off << (21 - bits)] - [wt_mul << 15] = [-0x1fffff, 0x1fffff] - [-0x400000, 0x7f8000] ++# ++# This all looks good and is mostly bit depth independent - and as we manage ++# to do unsigned multiplies everywhere (now) this should be good for any bit ++# depth up to 14 (we could probably do 16 - but that requires a few tweaks ++# to the shifts we don't currently have logic for) ++ +# PREREAD is the number of requests that we have sitting in the TMU request +# queue. +# @@ -25016,10 +24986,26 @@ index 0000000000..f8572cdebe +# In s/w we are effectively limited to the min vertical read which is >= 4 +# so output FIFO is the limit. +# -+# However in the current world there seems to be no benefit (and a small -+# overhead) in setting this bigger than 2.
++# As the test for read-next is in the main part of the Luma loop (rather than ++# the preload FIFO part) we are limited to min_luma_height - 1 ++# Min_luma_height is 4 so we can only have a preload of 3 ++# Beware that min_chroma_height (and_width) is 2 so we can't do the same trick ++# in chroma without abandoning preload pretty much entirely (which would be bad) ++# ++# Timing tests vs preload of 4 suggests this doesn't hurt us much ++# Could have preread 4 for Chroma but when tested it didn't help + -+.set PREREAD, 4 ++.set PREREAD, 3 ++ ++# Offset added (effectively) at the exit of the H FIR filter ++# This is enough to force the result +ve ++# Is good if it is a power of 2 as that allows for >> without loss ++# ++# Worst case for a single Y FIR is *-22 so we need an offset of 256*22 ++# But we need twice offset to survive both H & V = 256*22*2 = 0x2c00 ++# Round up to next power of 2 ++ ++.set FIR_OFFSET, 0x4000 + +# Block heights - 8 & 16 are the only numbers we currently support + @@ -25036,6 +25022,15 @@ index 0000000000..f8572cdebe +.set N_QPU_8, 12 +.set N_QPU_16, 12 + ++# Value to add to the weight multiplier to convert it into an unsigned value ++# Should be power of two for convenience ++ ++.set LOG2_MUL_ADD, 14 ++.set MUL_ADD, (1 << LOG2_MUL_ADD) ++ ++# Fixed denom (max that it can be set to) ++.set DENOM, 7 ++ +# register allocation +# + @@ -25043,16 +25038,16 @@ index 0000000000..f8572cdebe +# Used as temp and may be loop filter coeffs (split into .8s) +# or temp in loop. Check usage on an individual basis.
+ -+# ra4-7 -+# C: L0 H filter out FIFO -+# otherwise -- free -- ++# ra4-11 ++# V FIFO / temp / free + -+# ra8-11 -+# temp in some places - check usage -+# Y: (with rb8-11) horiz out FIFO ++# -- free -- ra12 + -+# ra12-15 -+# -- free -- ++# -- free -- ra13 ++ ++# -- free -- ra14 ++ ++# -- free -- ra15 + +# uniform: width:height +.set ra_width_height, ra16 @@ -25076,11 +25071,11 @@ index 0000000000..f8572cdebe +.set ra_y2_next, ra19.16a + +# Setup: consts - subdivide a single register -+.set ra_kff100100, ra20 ++.set ra_kff800100, ra20 +.set ra_k256, ra20.16a +.set ra_k0, ra20.8a +.set ra_k1, ra20.8b -+.set ra_k16, ra20.8c ++.set ra_k128, ra20.8c +.set ra_k255, ra20.8d + +# Loop: xshifts @@ -25099,20 +25094,38 @@ index 0000000000..f8572cdebe +.set ra_blk_height_pmax, ra23 +.set ra_pmax, ra23.16a +.set ra_blk_height, ra23.8c -+# -- free -- ra23.8d ++# --free -- ra23.8d + +# Loop: src frame base (L0) +.set ra_base, ra24 + -+# Loop: src frame base (L1) -+.set ra_base2, ra25 ++# Misc offsets ++.set ra_fir_off_val_wt_den_p7, ra25 ++.set ra_wt_den_p7, ra25.8a ++# -- free -- ra25.8b ++.set ra_fir_off_val, ra25.16b ++ ++# As it happens these constants are the same ++.if FIR_OFFSET == MUL_ADD ++# Weight multiplier unsigned add ++.set ra_kmul_add, ra_fir_off_val ++.else ++.error "FIR_OFFSET != MUL_ADD: Need new register & init" ++.endif + +# Loop: next src frame base (L0) +.set ra_base_next, ra26 + -+# -- free -- ra27 -+# -- free -- ra28 -+# -- free -- ra29 ++# Loop: height<<23 + width<<16 + vdw_setup_0 ++.set ra_dma0, ra27 ++ ++# Loop: destination address ++.set ra_dest, ra28 ++ ++# Setup: Dup of rb_ef ++# Lo bits are used as Y coeff 0 as that lefts us combine test & coeff mul ++# (top bits are ignored by mul24) ++.set ra_ef, ra29 + +# Use an even numbered register as a link register to avoid corrupting flags +.set ra_link, ra30 @@ -25127,25 +25140,22 @@ index 0000000000..f8572cdebe + +# El Flags +# After adding to self we to have el even/odd on nc/c and lo/hi on nn/n ++# 
Duped into ra_ef as sometimes that is easier to use +.set rb_ef, rb3 + -+# rb4-7 -+# C-B: L1 H filter out FIFO -+# Y: (with ra2.8x) Y vertical filter coeffs -+ -+# rb8-11 -+# C: Vertical filter coeffs -+# Y: (with ra8-11) horiz out FIFO ++# rb4-11 ++# Loop: V filter FIFO or V filter coeff + +# Loop var: offset to add before shift (round + weighting offsets) +# Exact value varies by loop +.set rb_wt_off, rb12 + -+# Setup: denom + 6 + 9 -+.set rb_wt_den_p15, rb13 ++# -- free -- rb13 + +# -- free -- rb14 -+# -- free -- rb15 ++ ++# Loop: src frame base (L1) ++.set rb_base2, rb15 + +# Line pitch (128 for sand128) +.set rb_pitch, rb16 @@ -25164,23 +25174,22 @@ index 0000000000..f8572cdebe +# offset to the slice +.set rb_xpitch, rb20 + -+# -- free -- rb21 ++# These 3 consts each save 1 instruction in Y loop setup ++# so whilst they are worthwhile they should be the 1st to die if we need ++# another b reg ++.set rb_y_coeffs_2, rb21 # 0x050b0a00 ++.set rb_y_coeffs_3, rb22 # 0x11283a40 ++.set rb_y_coeffs_5, rb23 # 0x0a0b0500 + +# Setup: 0xff (8-bit) / 0xffff (9+ bit) -+.set rb_pmask, rb22 -+ -+# Loop: destination address -+.set rb_dest, rb23 ++.set rb_pmask, rb24 + +# vdw_setup_1(dst_pitch) -+.set rb_dma1_base, rb24 ++.set rb_dma1_base, rb25 + +# Setup: pic width - 1 +# In bytes so 8 bit luma is (width - 1)*1, 16 bit chroma is (width -1)*4 etc. -+.set rb_max_x, rb25 -+ -+# Loop: height<<23 + width<<16 + vdw_setup_0 -+.set rb_dma0, rb26 ++.set rb_max_x, rb26 + +# vdw_setup_0 (depends on QPU number) +.set rb_dma0_base, rb27 @@ -25194,9 +25203,8 @@ index 0000000000..f8572cdebe +# Setup: pic_height - 1 +.set rb_max_y, rb30 + -+# -- free -- rb31 -+ -+ ++# Setup: FIR H offset ++.set rb_fir_off_h, rb31 + + +# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc. 
@@ -25256,7 +25264,22 @@ index 0000000000..f8572cdebe +::mc_start + +################################################################################ -+# mc_setup_uv(next_kernel, x, y, ref_c_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id) ++# mc_setup_c ++# ++# typedef struct qpu_mc_pred_c_s_s { ++# int16_t y; ++# int16_t x; ++# uint32_t base; ++# uint32_t pic_cw; // C Width (== Y width / 2) ++# uint32_t pic_ch; // C Height (== Y Height / 2) ++# uint32_t stride2; ++# uint32_t stride1; ++# uint32_t wdenom; ++# int16_t y2; ++# int16_t x2; ++# uint32_t base2; ++# uint32_t next_fn; ++# } qpu_mc_pred_c_s_t; + +.macro m_setup_c, v_bit_depth + @@ -25271,34 +25294,31 @@ index 0000000000..f8572cdebe +.set v_blk_height, C_BLK_HEIGHT_16 +.endif + -+ mov tmurs, 1 # No swap TMUs -+ -+# Load first request location -+ mov ra0, unif # next_x_y ++ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y + + mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] -+ shl rb_ef, r0, i_shift30 -+ -+ mov ra_base, unif # Store frame c base ++ shl rb_ef, r0, i_shift30 ; mov ra_base, unif # ; ref_c_base + +# Read image dimensions -+ sub r0, unif, 1 # pic c width -+ shl rb_max_x, r0, v_x_shift # rb_max_x in bytes -+ sub rb_max_y, unif, 1 # pic c height ++ sub r0, unif, 1 # pic c width ++ shl rb_max_x, r0, v_x_shift # rb_max_x in bytes ++ sub rb_max_y, unif, 1 # pic c height + +# load constants -+ mov ra_kff100100, 0xff100100 ++ mov ra_kff800100, 0xff800100 + mov rb_pmask, v_pmask + mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++ mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) ++ mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) + +# get source pitch -+ mov rb_xpitch, unif # stride2 -+ mov rb_pitch, unif # stride1 -+ mov r1, vdw_setup_1(0) # [rb_pitch delay] Merged with dst_stride shortly -+ add rb_dma1_base, r1, rb_pitch # vdw_setup_1 ++ mov ra_ef, rb_ef ; mov rb_xpitch, unif # ; stride2 ++ mov rb_pitch, unif # stride1 ++ mov 
r1, vdw_setup_1(0) # [rb_pitch delay] Merged with dst_stride shortly ++ add rb_dma1_base, r1, rb_pitch # vdw_setup_1 + + and r0, 1, elem_num -+ nop ; mul24 r0, r0, 5 ++ nop ; mul24 r0, r0, 5 +.if v_bit_depth <= 8 + add rb_elem_x, r0, elem_num +.else @@ -25310,9 +25330,9 @@ index 0000000000..f8572cdebe +# ra_base ends up with t0s base +# ra_base2 ends up with t1s base + -+ shl r0, ra0.16b, v_x_shift # [rb_elem_x delay] -+ add r0, r0, rb_elem_x # Add elem no to x to get X for this slice -+ max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y ++ shl r0, ra0.16b, v_x_shift # [rb_elem_x delay] ++ add r0, r0, rb_elem_x # Add elem no to x to get X for this slice ++ max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y + min r0, r0, rb_max_x + +# Get shift @@ -25322,7 +25342,7 @@ index 0000000000..f8572cdebe +.if v_bit_depth <= 8 + shl ra_xshift_next, r0, 3 +.else -+ mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0 ++ mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0 +.endif + +# In a single 32 bit word we get 1 or 2 UV pairs so mask bottom bits of xs if we need to @@ -25332,30 +25352,23 @@ index 0000000000..f8572cdebe +.endif + sub r1, ra_k0, rb_pitch + and r1, r0, r1 -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov ra0, unif # ; next_x2_y2 + add ra_base, ra_base, r0 + -+ add rb_wt_den_p15, 23 - v_bit_depth, unif # denominator -+ +# Compute part of VPM to use for DMA output +# * We only get 8 QPUs if 16 bit - maybe reduce height and auto-loop? 
+ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base + +# And again for L1, but only worrying about frame2 stuff + -+# Load first request location -+ mov ra0, unif # next_x_y -+ -+ mov ra_base2, unif # [ra0 delay] Store frame c base -+ +# Compute base address for first and second access +# ra_base ends up with t0s base -+# ra_base2 ends up with t1s base ++# rb_base2 ends up with t1s base + + shl r0, ra0.16b, v_x_shift -+ add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a # Add QPU slice offset -+ max r0, r0, 0 ++ add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a # Add QPU slice offset ++ max r0, r0, 0 ; mov rb_base2, unif # ref_c_base2 + min r0, r0, rb_max_x + +# Get shift (already zero if 9+ bit so ignore) @@ -25369,37 +25382,35 @@ index 0000000000..f8572cdebe + and r0, r0, -4 +.endif + sub r1, ra_k0, rb_pitch -+ and r1, r0, r1 -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov r2, ra_y2 -+ add ra_base2, ra_base2, r0 ++ and r1, r0, r1 ; mov r3, PREREAD ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov r2, ra_y2 ++ add rb_base2, rb_base2, r0 ; mov r0, ra_y + +# Do preloads -+# r0 = ra_y, r2 = ra_y2 -+ mov r3, PREREAD ; mov r0, ra_y ++# r0 = ra_y, r2 = ra_y2, r3 = PREREAD + +:1 + sub.setf r3, r3, 1 + max r1, r0, 0 + min r1, r1, rb_max_y -+ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch -+ add t0s, ra_base, r1 ; mov ra_y, r0 ++ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t0s, ra_base, r1 ; mov ra_y, r0 + + max r1, r2, 0 + brr.anynz -, r:1b + min r1, r1, rb_max_y -+ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch -+ add t1s, ra_base2, r1 ; mov ra_y2, r2 ++ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t1s, rb_base2, r1 ; mov ra_y2, r2 +# >>> .anynz 1b + -+ mov ra_link, unif # link -+# touch registers to keep simulator happy -+ # ra/b4..7: B0 -> B stash registers -+ mov ra4, 0 ; mov rb4, 0 ++ mov ra_link, unif # link ++# touch registers to keep simulator happy (and fills in delay slots) ++ mov ra4, 0 ; mov rb4, 0 + bra -, ra_link 
-+ mov ra5, 0 ; mov rb5, 0 -+ mov ra6, 0 ; mov rb6, 0 -+ mov ra7, 0 ; mov rb7, 0 ++ mov ra5, 0 ; mov rb5, 0 ++ mov ra6, 0 ; mov rb6, 0 ++ mov ra7, 0 ; mov rb7, 0 +# >>> ra_link +.endm + @@ -25409,11 +25420,22 @@ index 0000000000..f8572cdebe + m_setup_c 8 + +################################################################################ -+ -+# mc_filter_uv(next_kernel, x, y, frame_c_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst) -+ -+# At this point we have already issued two pairs of texture requests for the current block -+# ra_x, ra_x16_base point to the current coordinates for this block ++# ++# mc_filter_c_p ++# ++# typedef struct qpu_mc_pred_c_p_s { ++# int16_t y; ++# int16_t x; ++# uint32_t base; ++# uint16_t h; ++# uint16_t w; ++# uint32_t coeffs_x; ++# uint32_t coeffs_y; ++# uint32_t wo_u; ++# uint32_t wo_v; ++# uint32_t dst_addr_c; ++# uint32_t next_fn; ++# } qpu_mc_pred_c_p_t; + +.macro m_filter_c_p, v_tmu, v_bit_depth + @@ -25447,55 +25469,58 @@ index 0000000000..f8572cdebe +.set vra_y_next, ra_y2_next +.set vrx_base_next, rb_base2_next +.set vra_y, ra_y2 -+.set vra_base, ra_base2 ++.set vra_base, rb_base2 +.set vr_txs, t1s +.endif + ++# denom shift values ++.set i_wt_den_p5, (DENOM + 13 - v_bit_depth) ++.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) ++ +# per-channel shifts were calculated on the *previous* invocation +# get base addresses and per-channel shifts for *next* invocation -+ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y ++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y + -+ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; base ++ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; base + -+ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 # r5 = 0 -+ add r0, r0, rb_elem_x ; mov ra_width_height, unif # r1=pitch2 mask ; width_height -+ sub r1, r5, rb_pitch ; mov ra0, unif # ; H filter coeffs -+ max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next -+ min r0, r0, rb_max_x ; mov 
vra_y_next, ra2.16a ++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 # r5 = 0 ++ add r0, r0, rb_elem_x ; mov ra_width_height, unif # r1=pitch2 mask ; width_height ++ sub r1, r5, rb_pitch ; mov ra0, unif # ; H filter coeffs ++ max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next ++ min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a + +.if v_bit_depth <= 8 + shl vrx_xshift_next, r0, 3 + and r0, r0, -4 +.endif -+ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs) ** x*2 already calced! -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov ra3, unif # ; V filter coeffs ++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs) ** x*2 already calced! ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov ra3, unif # ; V filter coeffs + add vrx_base_next, r3, r0 ; mov r1, ra_height + +# set up VPM write -+ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U offset/weight -+ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height -+ add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif # ; V offset/weight ++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U offset/weight ++ add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height ++ add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif # ; V offset/weight + -+# ; unpack filter coefficients ++# Misc final setup... 
+ -+ shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a -+ add r0, r0, r2 ; mov rb9, ra3.8b # Combine width and height of destination area (r0=h<<8, r2=w*2) -+ shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register -+ add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 # ; r1=weight ++ shl r0, r1, v_dma_h_shift ; mov ra_dest, unif # ; dst_addr ++ add r0, r0, r2 ; mov r2, ra_fir_off_val # Combine width and height of destination area (r0=h<<8, r2=w*2) ++ shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register ++ add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 # ; r1=weight ++ shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 ++ sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add ++ add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 # ; loop counter (V FIFO fill = 4) ++ mov rb11, ra3.8d ; mov ra_link, unif # ; Link + -+ mov rb_dest, unif ; mov ra9, rb_max_y # dst_addr ; alias rb_max_y ++# r5 = -4 (loop counter) ++# ra_wt_mul_l0 = weight L0 + 128 (now unsigned) ++# rb_wt_off = (offset * 2 + 1) << (wt_den + 5) ++# rb31 = FIR value offset + -+ shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d -+ -+ asr rb_wt_off, r1, 2 ; mov ra_link, unif # ; Link -+ sub ra3, rb_wt_den_p15, ra_k1 -+ -+# r5 = 0 (loop counter) -+# ra9 = alias for rb_max_y -+# ra_wt_mul_l0 = weight L0 -+# ra3 = weight denom + 22 - bit_depth [= rb_wt_den_p15 - 1, max 19] -+# rb_wt_off = (offset * 2 + 1) << (ra3 - 1) ++# FIFO: rb4, ra5, rb6, ra7 ++# Coeffs in ra3.8a, ra3.8b, rb10, rb11 + +# We want (r0r1) +# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2U5 : ... 
@@ -25507,117 +25532,92 @@ index 0000000000..f8572cdebe +# then submit two more texture requests + +.if v_tmu == 0 -+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment -+ shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next -+ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y -+ add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next ++ sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0 ++ shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next ++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++ add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next +.else -+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 # loop counter increment -+ shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next -+ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y -+ add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next ++ sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu1 ++ shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next ++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++ add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next # [r1 << delay] +.endif + -+ add vra_y, r3, ra_k1 ; mov r0, r1 << 15 -+ max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 -+ min r3, r3, ra9 ; mov.ifnc r0, r2 ++ add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++ max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++ min r3, r3, rb_max_y ; mov.ifnc r0, r2 + -+ mov ra4, ra5 ; mul24 r2, r3, rb_pitch -+ add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte ++ and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch ++.if v_tmu == 0 ++ add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask # ; mask bytes ++.else ++ add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax # ; mask bytes ++.endif + +# apply horizontal filter +# The filter coeffs for the two halves of this are the same (unlike in the +# Y case) so it doesn't matter which ra0 we get them from +# Also as the two halves are locked together we don't need to separate the 1st -+# r0 mul or the last r1 mul as they are vaild for all QPUs ++# r0 mul or the last r1 
mul as they are valid for all QPUs + -+ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 -+ nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 -+ sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1 ++ add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 ++ sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 ++ sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++ add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 + -+# V filter =- ra4 * rb8-+ ra5 * rb9 + ra6 * rb10 - ra7 * rb11 (post FIFO shift) -+# Have to dup block as we need to move the brr - code is more common than it -+# looks at first glance -+.if v_bit_depth <= 8 ++# V filter = - r4 * a + r5 * b + r6 * c - r7 * d (post FIFO shift) ++# We would like to save the r5->r4 shift but we need a delay slot ++# for both r7 & r6 which we can't find anything to put in if we have ++# already multiplied r4 & r5! 
+ brr.anyn -, r:1b -+ add r2, r2, r3 ; mov ra5, ra6 -+ mov ra6, ra7 ; mul24 r1, ra7, rb10 -+ sub ra7, r2, r0 ; mul24 r0, ra4, rb8 -+.else -+ add r2, r2, r3 ; mov ra5, ra6 -+ brr.anyn -, r:1b -+ mov ra6, ra7 ; mul24 r1, ra7, rb10 -+ sub r2, r2, r0 ; mul24 r0, ra4, rb8 -+ asr ra7, r2, v_bit_depth - 8 -+.endif ++ add r2, r2, r3 ; mul24 r0, ra7, rb10 # r6 post ++ mov ra5, rb6 ; mul24 r1, rb6, ra3.8b # r5 post ++ asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 +# >>> .anyn 1b + -+ sub r1, r1, r0 ; mul24 r0, ra5, rb9 # [ra7 delay] -+ add r1, r1, r0 ; mul24 r0, ra7, rb11 ++ add r1, r1, r0 ; mul24 r0, rb4, ra3.8a # [ra7 delay] ++ sub r1, r1, r0 ; mul24 r0, ra7, rb11 + sub r1, r1, r0 -+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 -+ asr r1, r1, 14 -+ nop ; mul24 r1, r1, ra_wt_mul_l0 -+ shl r1, r1, 8 ; mov r3, ra_blk_height -+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++ ++ asr r1, r1, 6 ; mov r3, ra_blk_height # ; NxtLoop ++ sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 ++ add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add ++ sub r1, r0, r1 ; v8subs r0, ra_height, r3 # ; NxtLoop + brr.anyn -, r:1b -+ asr r1, r1, ra3 -+ min r1, r1, ra_pmax ; mov -, vw_wait -+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++ asr r1, r1, i_wt_den_p6 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop +# >>> .anyn 1b + +# r0 = remaining height (min 0) +# r2 = r3 * rb_pitch -+# r3 = block_height (currently always 16) ++# r3 = block_height + +# If looping again then we consumed 16 height last loop +# rb_dma1 (stride) remains constant +# rb_i_tmu remains const (based on total height) -+# recalc rb_dma0, rb_lcount based on new segment height ++# recalc ra_dma0, rb_lcount based on new segment height + -+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 ++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 + +# DMA out + bra.anyz -, ra_link -+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride -+ sub r1, r0, r3 ; mov 
vw_addr, rb_dest # start the VDW ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW + shl r1, r1, i_shift23 +# >>> .anyz ra_link + +# Here r1 = cur_blk_height - 16 so it will be 0 or -ve +# We add to dma0 to reduce the number of output lines in the final block -+ add rb_lcount, rb_lcount, r0 + brr -, r:1b -+ add rb_dma0, rb_dma0, r1 -+ add rb_dest, rb_dest, r2 -+ mov vw_setup, rb_vpm_init # Reset our VDM write pointer ++ add rb_lcount, rb_lcount, r0 ++ add ra_dma0, ra_dma0, r1 ++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer +# >>> 1b +.endm + -+# At 10 bits -+# Worst case +ve after 1st filter = 74 * 0x3ff >> 2 = 18925 0x49ed (15 bits) -+# Worst case -ve after 1st filter = -10 * 0x3ff >> 2 = -10230 -+# after 2nd (really we can't get this) = 74 * 18925 + 10 * 10230 >> 6 = 23480 = 0x5bb8 (15 bits) -+# (P) -+# * weight (255) = 5987400 = 0x5b5c48 (23 bits) -+# + 0x3ff << (13 - bit_depth + 7) = 0x6b5848 (23 bits) -+# ... should be OK -+# -+# (B) -+# *2 (L0+L1) = 5963920 = 0x5b0090 (23 bits) -+# + (offset * 2 + 1) << (15 - bit_depth + 7) = 5963920 + (0x3ff << 12) = 5963920 + 4190208 = 10154128 = 0x9af090 (24 bits) -+# So signed overflow if we sign extend here :-( -+# -+# In practice this doesn't happen (we need a maximal offset and a very unlucky -+# filter). -+# -+# This could be fixed by offsetting the filters s.t. 
they are unsigned until -+# weight mul and then removing the offset with the weighting offset (I think -+# this should work) or splitting the rounding & offsetting -+ +::mc_filter_c_p + m_filter_c_p 0, 8 + @@ -25625,200 +25625,217 @@ index 0000000000..f8572cdebe + m_filter_c_p 1, 8 + +################################################################################ -+ ++# +# mc_filter_c_b -+ -+# At this point we have already issued two pairs of texture requests for the current block -+# ra_x, ra_x16_base point to the current coordinates for this block ++# ++# typedef struct qpu_mc_pred_c_b_s { ++# int16_t y; ++# int16_t x; ++# uint32_t base; ++# uint16_t h; ++# uint16_t w; ++# uint32_t coeffs_x1; ++# uint32_t coeffs_y1; ++# int16_t weight_u1; ++# int16_t weight_v1; ++# int16_t y2; ++# int16_t x2; ++# uint32_t base2; ++# uint32_t coeffs_x2; ++# uint32_t coeffs_y2; ++# uint32_t wo_u2; ++# uint32_t wo_v2; ++# uint32_t dst_addr_c; ++# uint32_t next_fn; ++# } qpu_mc_pred_c_b_t; + +.macro m_filter_c_b, v_bit_depth + +.if v_bit_depth <= 8 +.set v_x_shift, 1 +.set v_v_shift, 8 -+# Shifts to get width & height in the right place in rb_dma0 ++# Shifts to get width & height in the right place in ra_dma0 +.set v_dma_h_shift, 7 +.set v_dma_wh_shift, i_shift16 +.else +.set v_x_shift, 2 +.set v_v_shift, i_shift16 -+# Shifts to get width & height in the right place in rb_dma0 ++# Shifts to get width & height in the right place in ra_dma0 +.set v_dma_h_shift, 8 +.set v_dma_wh_shift, 15 +.endif +.set v_x_mul, (1 << v_x_shift) + ++# denom shift values ++.set i_wt_den_p5, (DENOM + 13 - v_bit_depth) ++.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) ++ +# per-channel shifts were calculated on the *previous* invocation + +# get base addresses and per-channel shifts for *next* invocation -+ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y ++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y + -+ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; r3=base ++ add.setf -, rb_ef, rb_ef ; mov 
r3, unif # [ra2 delay] ; r3=base + -+ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 # x ; r5=0 -+ add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a -+ sub r1, r5, rb_pitch ; mov ra_width_height, unif # r1=pitch2 mask ; width_height -+ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next -+ min r0, r0, rb_max_x ; mov ra0, unif # L0 H filter coeffs ++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 # x ; r5=0 ++ add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a ++ sub r1, r5, rb_pitch ; mov ra_width_height, unif # r1=pitch2 mask ; width_height ++ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++ min r0, r0, rb_max_x ; mov ra0, unif # ; L0 H filter coeffs + +.if v_bit_depth <= 8 + shl ra_xshift_next, r0, 3 +.endif + -+ and r0, r0, -4 ; mov ra2, unif # ; L0 V filter coeffs -+ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=x*2 (we are working in pel pairs) -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov r1, ra_height # Add stripe offsets ; r1=height -+ add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B ++ and r0, r0, -4 ; mov ra2, unif # ; L0 V filter coeffs ++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=x*2 (we are working in pel pairs) ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov r1, ra_height # Add stripe offsets ; r1=height ++ add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B + +# set up VPM write + -+ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U weight -+ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height -+ add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif # ; V weight ++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U weight ++ add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height ++ add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0 # ; V weight + -+ shl r0, r1, v_dma_h_shift ; mov ra3, unif # ; x2_y2 -+ add r0, 
r0, r2 ; mov r3, unif # [ra3 delay] ; base -+ shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a # Shift into bits 16 upwards of the vdw_setup0 register -+ add rb_dma0, r0, rb_dma0_base ; mov ra1, unif # ; H filter coeffs ++ shl r0, r1, v_dma_h_shift ; mov ra3, unif # ; x2_y2 ++ add r0, r0, r2 ; mov r3, unif # [ra3 delay] ; base ++ shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a # Shift into bits 16 upwards of the vdw_setup0 register ++ add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b # r0=x + +# L1 - uniform layout could possibly be optimized + -+ shl r0, ra3.16b, v_x_shift # r0=x*2 -+ add r0, r0, rb_elem_x ; mov ra3, unif # ; V filter coeffs -+ sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif # [ra3 delay] r1=pitch2 mask ; U offset/weight -+ max r0, r0, r5 ; mov rb8, ra3.8a # ; start unpacking filter coeffs -+ min r0, r0, rb_max_x ; mov rb9, ra3.8b ++ shl r0, r0, v_x_shift ; mov ra1, unif # r0=x<>> .anyn 1b + -+ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c # [rb7 delay] -+ add r1, r1, r0 ; mul24 r0, rb7, ra2.8d -+ sub r2, r1, r0 ; mul24 r0, ra4, rb8 -+ sub r1, r3, r0 ; mul24 r0, ra5, rb9 -+ add r1, r1, r0 ; mul24 r0, ra7, rb11 -+ sub r1, r1, r0 ; mul24 r2, r2, ra_k256 ++ sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b # L1 ; L0 ++ sub.setf -, r5, rb_lcount ; mov r0, ra4 ++ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++ add r1, r1, r0 ; mul24 r0, ra7, rb7 + -+ asr r2, r2, 14 ; mul24 r1, r1, ra_k256 -+ asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0 ++ sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c # L1 ++ add r2, r2, r0 ; mul24 r0, ra11, rb11 # L1 ++ sub r2, r2, r0 + -+ add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1 # rb_wt_off = (offsetL0 + offsetL1 + 1) << (rb_wt_den_p15 - 9) -+ add r1, r1, r2 ; mov r3, ra_blk_height -+ -+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend ++ shr r1, r1, 6 ++ shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0 ++ add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1 ++ add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add ++ sub r1, r1, r2 ; 
mov r3, ra_blk_height # ; NxtLoop ++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 # ; NxtLoop + + brr.anyn -, r:1b -+ asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3 -+ min r1, r1, ra_pmax ; mov -, vw_wait -+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++ asr r1, r1, ra_wt_den_p7 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop +# >>> .anyn 1b + +# r0 = remaining height (min 0) +# r2 = r3 * rb_pitch -+# r3 = block_height (currently always 16) ++# r3 = block_height + +# If looping again then we consumed 16 height last loop +# rb_dma1 (stride) remains constant +# rb_i_tmu remains const (based on total height) -+# recalc rb_dma0, rb_lcount based on new segment height ++# recalc ra_dma0, rb_lcount based on new segment height + -+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 ++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # ; VDW setup 0 + +# DMA out + bra.anyz -, ra_link -+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride -+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # ; Stride ++ sub r1, r0, r3 ; mov vw_addr, ra_dest # ; start the VDW + shl r1, r1, i_shift23 +# >>> .anyz ra_link + +# Here r1 = cur_blk_height - 16 so it will be 0 or -ve +# We add to dma0 to reduce the number of output lines in the final block -+ add rb_lcount, rb_lcount, r0 + brr -, r:1b -+ add rb_dma0, rb_dma0, r1 -+ add rb_dest, rb_dest, r2 -+ mov vw_setup, rb_vpm_init # Reset our VDM write pointer ++ add rb_lcount, rb_lcount, r0 ++ add ra_dma0, ra_dma0, r1 ++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer +# >>> 1b +.endm + @@ -25992,25 +26009,29 @@ index 0000000000..f8572cdebe + + + # Need to save these because we need to know the frame dimensions before computing texture coordinates -+ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y -+ mov ra9, unif # ref_y_base -+ mov ra1, unif # x2_y2 -+ mov ra11, unif # ref_y2_base ++ mov tmurs, 
1 ; mov ra0, unif # No TMU swap ; x_y ++ mov ra9, unif # ref_y_base ++ mov ra1, unif # x2_y2 ++ + +# load constants + mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] -+ shl rb_ef, r0, i_shift30 ++ shl rb_ef, r0, i_shift30 ; mov ra11, unif # ; ref_y2_base + -+ -+ mov ra_kff100100, 0xff100100 ++ mov ra_kff800100, 0xff800100 + mov rb_pmask, v_pmask + mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++ mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) ++ mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) ++ mov rb_y_coeffs_2, 0x050b0a00 ++ mov rb_y_coeffs_3, 0x11283a40 ++ mov rb_y_coeffs_5, 0x0a0b0500 + +# Compute part of VPM to use + +# Read image dimensions -+ mov ra3, unif # width_height -+ mov rb_xpitch, unif # stride2 ++ mov ra3, unif # width_height ++ mov ra_ef, rb_ef ; mov rb_xpitch, unif # [ra3 delay] ; stride2 +.if v_x_shift == 0 + sub rb_max_x, ra3.16b, 1 +.else @@ -26018,77 +26039,74 @@ index 0000000000..f8572cdebe + shl rb_max_x, r0, v_x_shift +.endif + sub rb_max_y, ra3.16a, 1 -+ mov rb_pitch, unif # stride1 ++ mov r3, elem_num ; mov rb_pitch, unif # stride1 + +# get destination pitch -+ mov r1, vdw_setup_1(0) ++ mov r1, vdw_setup_1(0) # [rb_pitch delay] + or rb_dma1_base, r1, rb_pitch + +# Compute base address for first and second access -+ mov r3, elem_num -+ add r0, ra0.16b, r3 # Load x + elem_num ++ add r0, ra0.16b, r3 # Load x + elem_num +.if v_x_shift != 0 + shl r0, r0, v_x_shift +.endif + max r0, r0, 0 + min r0, r0, rb_max_x -+ shl ra_xshift_next, r0, 3 # Compute shifts ++ shl ra_xshift_next, r0, 3 # Compute shifts + +# X is byte offset - we can only load words - mask + -+ and r0, r0, -4 ; v8subs r2, r2, r2 ++ and r0, r0, -4 ; v8subs r2, r2, r2 + sub r2, r2, rb_pitch + and r1, r0, r2 -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 # Add stripe offsets ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 # Add stripe offsets + add ra_base, ra9, r0 + + # r3 still contains elem_num -+ 
add r0, ra1.16b, r3 # Load x ++ add r0, ra1.16b, r3 # Load x +.if v_x_shift != 0 + shl r0, r0, v_x_shift +.endif + max r0, r0, 0 + min r0, r0, rb_max_x -+ shl rb_xshift2_next, r0, 3 # Compute shifts ++ shl rb_xshift2_next, r0, 3 # Compute shifts + + # r2 still contains mask + and r0, r0, -4 + and r1, r0, r2 -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 # Add stripe offsets -+ add ra_base2, ra11, r0 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 # Add stripe offsets ++ add rb_base2, ra11, r0 + +# Do preloads -+ nop ; mov r0, ra0.16a # ; r0 = y -+ mov r3, PREREAD ; mov r2, ra1.16a # ; r2 = y2 ++ nop ; mov r0, ra0.16a # ; r0 = y ++ mov r3, PREREAD ; mov r2, ra1.16a # ; r2 = y2 + +:1 + sub.setf r3, r3, 1 + max r1, r0, 0 + min r1, r1, rb_max_y -+ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch -+ add t0s, ra_base, r1 ; mov ra_y, r0 ++ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t0s, ra_base, r1 ; mov ra_y, r0 + + max r1, r2, 0 + brr.anynz -, r:1b + min r1, r1, rb_max_y -+ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch -+ add t1s, ra_base2, r1 ; mov ra_y2, r2 ++ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t1s, rb_base2, r1 ; mov ra_y2, r2 +# >>> .anynz 1b + -+ add rb_wt_den_p15, unif, 23 - v_bit_depth # weight denom -+ + m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base + -+ mov ra_link, unif # Next fn ++ mov ra_link, unif # Next fn + +# touch vertical context to keep simulator happy -+ mov ra8, 0 ; mov rb8, 0 ++ mov ra8, 0 ; mov rb8, 0 # [ra_link delay] + bra -, ra_link -+ mov ra9, 0 ; mov rb9, 0 -+ mov ra10, 0 ; mov rb10, 0 -+ mov ra11, 0 ; mov rb11, 0 ++ mov ra9, 0 ; mov rb9, 0 ++ mov ra10, 0 ; mov rb10, 0 ++ mov ra11, 0 ; mov rb11, 0 +# >>> ra_link +.endm + @@ -26102,8 +26120,6 @@ index 0000000000..f8572cdebe +# Start of per-block setup code +# P and B blocks share the same setup code to save on Icache space + -+# luma_setup_delay3 done in delay slots of branch that got us here -+ +# get base addresses and 
per-channel shifts for *next* invocation +# per-channel shifts were calculated on the *previous* invocation + @@ -26129,9 +26145,9 @@ index 0000000000..f8572cdebe +.elif v_bit_depth == 10 + brr ra_link, r:per_block_setup_10 +.endif -+ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack?? -+ add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 # [ra0 delay] ; r5 = 0 -+ add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack?? ++ add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 # [ra0 delay] ; r5 = 0 ++ add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next +.endm + +.macro m_per_block_setup, v_bit_depth @@ -26139,13 +26155,13 @@ index 0000000000..f8572cdebe +.if v_bit_depth <= 8 +.set v_x_shift, 0 +.set v_x_mul, 1 -+# Shifts to get width & height in the right place in rb_dma0 ++# Shifts to get width & height in the right place in ra_dma0 +.set v_dma_h_shift, 7 +.set v_dma_wh_shift, i_shift16 +.else +.set v_x_shift, 1 +.set v_x_mul, 2 -+# Shifts to get width & height in the right place in rb_dma0 ++# Shifts to get width & height in the right place in ra_dma0 +.set v_dma_h_shift, 8 +.set v_dma_wh_shift, 15 +.endif @@ -26153,95 +26169,86 @@ index 0000000000..f8572cdebe +.if v_x_shift != 0 + shl r0, r0, v_x_shift +.endif -+ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next + min r0, r0, rb_max_x + -+ shl ra_xshift_next, r0, 3 # Compute shifts ++ shl ra_xshift_next, r0, 3 # Compute shifts + and r0, r0, -4 -+ sub r2, r5, rb_pitch ; mov ra_base_next, unif # src1.base -+ and r1, r0, r2 ; mov ra_y_next, ra0.16a -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov ra1, unif # Add stripe offsets ; src2.x_y -+ add ra_base_next, ra_base_next, r0 # [ra1 delay] ++ sub r2, r5, rb_pitch ; mov ra_base_next, unif # ; src1.base ++ and r1, r0, r2 ; mov ra_y_next, ra0.16a ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov ra1, 
unif # Add stripe offsets ; src2.x_y ++ add ra_base_next, ra_base_next, r0 # [ra1 delay] + -+ add r0, ra1.16b, r3 # Load x2 ++ add r0, ra1.16b, r3 # Load x2 +.if v_x_shift != 0 + shl r0, r0, v_x_shift +.endif -+ max r0, r0, r5 ; mov ra_y2_next, ra1.16a -+ min r0, r0, rb_max_x ; mov rb_base2_next, unif # ; src2.base -+ shl rb_xshift2_next, r0, 3 # Compute shifts -+ and r0, r0, -4 ; mov ra_width_height, unif # ; width_height -+ and r1, r0, r2 ; mov vw_setup, rb_vpm_init # ; set up VPM write -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul # Add stripe offsets ; r1 = x in bytes ++ max r0, r0, r5 ; mov ra_y2_next, ra1.16a ++ min r0, r0, rb_max_x ; mov rb_base2_next, unif # ; src2.base ++ shl rb_xshift2_next, r0, 3 # Compute shifts ++ and r0, r0, -4 ; mov ra_width_height, unif # ; width_height ++ and r1, r0, r2 ; mov vw_setup, rb_vpm_init # ; set up VPM write ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul # Add stripe offsets ; r1 = x in bytes + add rb_base2_next, rb_base2_next, r0 + +# get width,height of block (unif load above), r1 = width * pel_size -+ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height # Compute vdw_setup1(dst_pitch-width) -+ add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height -+ add rb_lcount, r0, 7 -+ shl r0, r0, v_dma_h_shift -+ add r0, r0, r1 # Combine width and height of destination area -+ shl r0, r0, v_dma_wh_shift # Shift into bits 16 upwards of the vdw_setup0 register -+ add rb_dma0, r0, rb_dma0_base ; mov r0, unif # ; Packed filter offsets ++ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height # Compute vdw_setup1(dst_pitch-width) ++ add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height ++ add rb_lcount, r0, (7-8) ++ shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add # ; r3 return val ++ add r0, r0, r1 # Combine width and height of destination area ++ shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val # Shift into bits 16 upwards of the 
vdw_setup0 register ; r2 return val ++ add ra_dma0, r0, rb_dma0_base ; mov r0, unif # ; Packed filter offsets + +# get filter coefficients and discard unused B frame values -+ shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight -+ shl ra8, r0, 3 ; mov r3, ra_k255 ++ shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight ++ shl ra8, r0, 3 ; mov rb5, ra_k255 + -+# Pack the 1st 4 filter coefs for H & V tightly +# Coeffs are all abs values here as that means mul24 works (no sign extend from .8) + -+ mov r1,0x00010100 # -ve [ra8 delay] -+ ror ra2.8a, r1, ra8.8d -+ ror ra0.8a, r1, ra8.8c ++# 2nd half coeffs same as first if we can swap 8<->24 in the rotate val ++# but I can't see a way of doing that that is cheap enough to be worth it + -+ mov r1, 0x01040400 ++# Picked out in a slightly random order to space out uniform loads ++ ++ # 1 ++ mov r1, 0x01040400 # [ra8 delay] + ror ra2.8b, r1, ra8.8d + ror ra0.8b, r1, ra8.8c -+ -+ mov r1,0x050b0a00 # -ve -+ ror ra2.8c, r1, ra8.8d -+ ror ra0.8c, r1, ra8.8c -+ -+ mov r1,0x11283a40 -+ ror ra2.8d, r1, ra8.8d -+ ror ra0.8d, r1, ra8.8c -+ -+# In the 2nd vertical half we use b registers due to using a-side fifo regs -+ -+ mov r1,0x3a281100 -+ ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif -+ ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3 -+ -+ mov r1,0x0a0b0500 # -ve -+ ror r0, r1, ra8.8d -+ ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3 -+ ++ # 2 ++ ror ra2.8c, rb_y_coeffs_2, ra8.8d ++ ror ra0.8c, rb_y_coeffs_2, ra8.8c ++ # 0 ++ mov r1,0x00010100 # -ve [ra8 delay] ++ ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif # ; L1 Wt/Offset ++ ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5 ++ # 7 ++ shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 # r1 = 0x01010000 ++ ror r0, r1, ra8.8d ; mov ra_dest, unif # ; Destination address ++ ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5 ++ # 3 ++ ror ra2.8d, rb_y_coeffs_3, ra8.8d ++ ror ra0.8d, rb_y_coeffs_3, ra8.8c ++ # 
5 ++ ror ra3.8b, rb_y_coeffs_5, ra8.8d ++ ror ra1.8b, rb_y_coeffs_5, ra8.8c ++ # 6 + mov r1,0x04040100 -+ ror r0, r1, ra8.8d -+ ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3 -+ -+ mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif # ; Destination address -+ -+ mov r1,0x01010000 # -ve -+ ror r0, r1, ra8.8d ++ ror ra3.8c, r1, ra8.8d ++ ror ra1.8c, r1, ra8.8c ; mov r5rep, -8 # ; r5 return val + + bra -, ra_link -+ ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3 -+ -+ shl r0, ra_wt_off_l0, rb_wt_den_p15 # Offset calc -+ # For B l1 & L0 offsets should be identical so it doesn't matter which we use -+ asr rb_wt_off, r0, 9 ; mov ra_link, unif # ; link - load after we've used its previous val ++ # 4 ++ mov r1,0x3a281100 ++ ror r0, r1, ra8.8d ; mov ra_link, unif # ; link - load after we've used its previous val ++ ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5 +# >>> branch ra_link + -+# r5 = 0 -+# ra_wt_mul_l1 = weight L1 -+# ra5.16a = weight L0/L1 depending on side (wanted for 2x mono-pred) -+# rb_wt_off = (((is P) ? offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb_wt_den_p15 - 1) -+# rb_wt_den_p15 = weight denom + 6 + 9 -+# rb_wt_mul_l0 = weight L0 ++# r5 = -8 ++# r2 = fir_off_val ++# r3 = 128 +.endm + +:per_block_setup_8 @@ -26250,117 +26257,120 @@ index 0000000000..f8572cdebe + + +################################################################################ -+# mc_filter(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) -+# In a P block, y2_x2 should be y_x+8 -+# At this point we have already issued two pairs of texture requests for the current block ++# ++# mc_filter_y_pxx ++# ++# Setup (& therefore uniform struct) shared with _bxx ++# Struct in m_luma_setup ++# ++# We can have 2 separate P reqs here as long as they mate to generate a ++# rectangular output block (i.e. 
h0 = h1, w0 = 8) ++# ++# At this point we have already issued PREREAD pairs of texture requests for the current block + +.macro m_filter_y_pxx, v_bit_depth ++ ++# denom shift values ++.set i_wt_den_p5, (DENOM + 13 - v_bit_depth) ++.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) ++ + m_luma_setup v_bit_depth + -+ shl ra_wt_mul_l0, ra_wt_mul_l0, 1 ++ shl r1, ra_wt_off_l0, i_wt_den_p5 ++ add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 # r2 = 0x4000 so mul24 safe even with -ve wt_mul ++ sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 + -+# r5 = 0 (loop count) -+ -+:1 +# retrieve texture results and pick out bytes +# then submit two more texture requests + -+# N.B. Whilst y == y2 as far as this loop is concerned we will start -+# the grab for the next block before we finish with this block and that -+# might be B where y != y2 so we must do full processing on both y and y2 ++# This loop is identical to the B loop from here ---> ++:1 ++ add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef + -+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 -+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 -+ shr r0, r4, ra_xshift ; mov r3, rb_pitch ++ max r2, ra_y, 0 ; mov r1, 0 ++ min r2, r2, rb_max_y ; mov r3, ra_k1 ++ add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 ++ add t0s, ra_base, r2 ; mov rb5, rb6 ++ shr r0, r4, ra_xshift ; mov rb6, rb7 + -+ max r2, ra_y, 0 # y -+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next -+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next -+ -+ max r2, ra_y2, 0 -+ min r2, r2, rb_max_y ; mov ra7, ra8 -+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte -+ -+ add.setf -, rb_ef, rb_ef ; mov ra8, ra9 ++ max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 # ; masks out all but wanted bytes ++ shr r1, r4, rb_xshift2 ; mov rb7, ra8 ++ min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax ++ add ra_y2, 
ra_y2, r3 ; mul24 r2, r2, rb_pitch ++ add t1s, rb_base2, r2 ; mov ra8, ra9 + +# apply horizontal filter -+ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 -+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+ nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++ add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++ mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 ++ sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++ 
add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++ add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 + -+ sub.setf -, r5, 8 ; mov ra9, ra10 -+ sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a + brr.anyn -, r:1b -+ mov rb9, rb10 ; mul24 r1, rb10, ra2.8b -+ mov ra10, ra11 ; mov rb10, rb11 -+ asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 -+ # >>> .anyn 1b ++ sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++ mov ra9, rb10 ; mul24 r0, rb10, ra3.8b ++ asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 ++ # >>> .anyn 1b (r5 + r5) + + # apply vertical filter and write to VPM -+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d -+ add r1, r1, r0 ; mul24 r0, ra8, rb4 -+ add r1, r1, r0 ; mul24 r0, ra9, rb5 -+ sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+ add r1, r1, r0 ; mul24 r0, ra11, rb7 -+ sub r1, r1, r0 -+# At this point r1 is a 22-bit signed quantity: 8 (original sample), -+# +6, +6 (each pass), +1 (the passes can overflow slightly), +1 (sign) -+# The top 8 bits have rubbish in them as mul24 is unsigned -+# The low 6 bits need discard before weighting -+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # x256 - sign extend & discard rubbish -+ asr r1, r1, 14 -+ nop ; mul24 r1, r1, ra_wt_mul_l0 -+ add r1, r1, rb_wt_off ; mov r3, ra_blk_height # ; r3 = block height for outside loop ++ # - r4* + r5 - r6 + r7 + r8 - r9 + r10 - r11 ++ ++ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++ sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++ add r1, r1, r0 ; mul24 r0, ra8, rb8 ++ add r1, r1, r0 ; mul24 r0, rb10, ra3.8c ++ add r1, r1, r0 ; mul24 r0, ra11, rb11 ++# <--- to here ++ sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height # ; NxtLoop: r3 = block height ++ sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next ++ sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next ++ ++ asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next ++ sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 ++ add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add ++ sub r1, r0, r1 ; v8subs r0, 
ra_height, r3 # ; NxtLoop: r0 = remaining height (0 saturate) + -+ shl r1, r1, 8 ; v8subs r0, ra_height, r3 + brr.anyn -, r:1b -+ asr r1, r1, rb_wt_den_p15 -+ min r1, r1, ra_pmax ; mov -, vw_wait -+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+ -+# >>> branch.anyn yloop ++ asr r1, r1, i_wt_den_p6 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop ++# >>> branch.anyn 1b (r5 - rb_lcount) + +# r0 = remaining height (min 0) +# r2 = r3 * rb_pitch -+# r3 = block_height (currently always 16) ++# r3 = block_height + +# If looping again then we consumed 16 height last loop +# rb_dma1 (stride) remains constant +# rb_i_tmu remains const (based on total height) -+# recalc rb_dma0, rb_lcount based on new segment height ++# recalc ra_dma0, rb_lcount based on new segment height + -+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 ++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 + +# DMA out + bra.anyz -, ra_link -+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride -+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW + shl r1, r1, i_shift23 +# >>> .anyz ra_link + +# Here r1 = cur_blk_height - 16 so it will be 0 or -ve +# We add to dma0 to reduce the number of output lines in the final block -+ add rb_lcount, rb_lcount, r0 + brr -, r:1b -+ add rb_dma0, rb_dma0, r1 -+ add rb_dest, rb_dest, r2 -+ mov vw_setup, rb_vpm_init # Reset our VDM write pointer ++ add rb_lcount, rb_lcount, r0 ++ add ra_dma0, ra_dma0, r1 ++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer +# >>> 1b +.endm + @@ -26371,106 +26381,118 @@ index 0000000000..f8572cdebe +################################################################################ + +# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) -+# In a P block, only the first half of 
coefficients contain used information. -+# At this point we have already issued two pairs of texture requests for the current block -+# Perhaps can unpack coefficients in a more efficient manner by doing H/V for a and b at the same time? -+# Or possibly by taking advantage of symmetry? ++# ++# Setup (& therefore uniform struct) shared with _pxx ++# Struct in m_luma_setup ++# ++# l0 calc in els 0-7, L1 in 8-15 ++# Only els 0-7 write data that is stored back to ram (els 8-15 may write tosh) ++# ++# At this point we have already issued PREREAD pairs of texture requests for the current block + +.macro m_filter_y_bxx, v_bit_depth ++ ++# denom shift values ++.set i_wt_den_p5, (DENOM + 13 - v_bit_depth) ++.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) ++ + m_luma_setup v_bit_depth + ++ shl r1, ra_wt_off_l0, i_wt_den_p6 ++ add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 ++ sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1 ++ sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 ++ ++# This loop is identical to the P loop from here ---> +:1 -+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 -+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 -+ shr r0, r4, ra_xshift ; mov r3, rb_pitch ++ add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef + -+ max r2, ra_y, 0 # y -+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next -+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++ max r2, ra_y, 0 ; mov r1, 0 ++ min r2, r2, rb_max_y ; mov r3, ra_k1 ++ add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 ++ add t0s, ra_base, r2 ; mov rb5, rb6 ++ shr r0, r4, ra_xshift ; mov rb6, rb7 + -+ max r2, ra_y2, 0 -+ min r2, r2, rb_max_y ; mov ra7, ra8 -+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte -+ -+ add.setf -, rb_ef, rb_ef ; mov ra8, ra9 ++ max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 # ; masks out all but wanted bytes ++ shr r1, r4, 
rb_xshift2 ; mov rb7, ra8 ++ min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax ++ add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch ++ add t1s, rb_base2, r2 ; mov ra8, ra9 + +# apply horizontal filter -+ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 -+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+ nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++ add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++ mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 ++ sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 
<< 6 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++ add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 + -+ sub.setf -, r5, 8 ; mov ra9, ra10 -+ sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a + brr.anyn -, r:1b -+ mov rb9, rb10 ; mul24 r1, rb10, ra2.8b -+ mov ra10, ra11 ; mov rb10, rb11 -+ asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 -+ # >>> .anyn 1b ++ sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++ mov ra9, rb10 ; mul24 r0, rb10, ra3.8b ++ asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 ++ # >>> .anyn 1b (r5 + r5) + + # apply vertical filter and write to VPM -+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d -+ add r1, r1, r0 ; mul24 r0, ra8, rb4 -+ add r1, r1, r0 ; mul24 r0, ra9, rb5 -+ sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+ add r1, r1, r0 ; mul24 r0, ra11, rb7 -+ sub r1, r1, r0 ; mov r2, rb_wt_off -+# As with P-pred r1 is a 22-bit signed quantity in 32-bits -+# Top 8 bits are bad - low 6 bits should be discarded -+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++ # - r4* + r5 - r6 + r7 + r8 - r9 + r10 - r11 + -+ asr r1, r1, 14 -+ nop ; mul24 r0, r1, ra_wt_mul_l0 -+ add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0 ++ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++ sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++ add r1, r1, r0 ; mul24 r0, ra8, rb8 ++ add r1, r1, r0 ; mul24 r0, rb10, ra3.8c ++ add r1, r1, r0 ; mul24 r0, ra11, rb11 ++# <--- to here ++ sub r1, r1, ra4 ++ sub r1, r1, r0 ; mov r2, rb_wt_off ++ ++ asr r1, r1, 6 ++ sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0 ++ mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add ++ sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next ++ sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next ++ add r1, r1, r2 ; mov r0, r1 << 8 ++ add r1, r1, r0 ; mov r3, ra_blk_height # ; NxtLoop: r3 = block height + -+ add r1, r1, r0 ; mov r3, ra_blk_height -+ shl 
r1, r1, 8 ; v8subs r0, ra_height, r3 + brr.anyn -, r:1b -+ asr r1, r1, rb_wt_den_p15 -+ min r1, r1, ra_pmax ; mov -, vw_wait -+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+# >>> branch.anyn 1b ++ asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch # ; NxtLoop ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, 0 ; v8subs r0, ra_height, r3 # ; NxtLoop: r0 = remaining height (0 saturate) ++# >>> branch.anyn 1b (r5 - rb_lcount) + +# r0 = remaining height (min 0) +# r2 = r3 * rb_pitch -+# r3 = block_height (currently always 16) ++# r3 = block_height + -+# If looping again then we consumed 16 height last loop ++# If looping again then we consumed block_height last loop +# rb_dma1 (stride) remains constant +# rb_i_tmu remains const (based on total height) -+# recalc rb_dma0, rb_lcount based on new segment height ++# recalc ra_dma0, rb_lcount based on new segment height + -+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 ++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 + +# DMA out + bra.anyz -, ra_link -+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride -+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW + shl r1, r1, i_shift23 -+# >>> .anyz ra_link ++# >>> .anyz ra_link (ra_height - remaining height) + -+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# Here r1 = cur_blk_height - blk_height so it will be 0 or -ve +# We add to dma0 to reduce the number of output lines in the final block -+ add rb_lcount, rb_lcount, r0 + brr -, r:1b -+ add rb_dma0, rb_dma0, r1 -+ add rb_dest, rb_dest, r2 -+ mov vw_setup, rb_vpm_init # Reset our VDM write pointer ++ add rb_lcount, rb_lcount, r0 ++ add ra_dma0, ra_dma0, r1 ++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer +# >>> 1b +.endm + @@ -26493,94 +26515,90 @@ index 0000000000..f8572cdebe +.if v_bit_depth <= 8 +.set v_x_shift, 0 +.set v_x_mul, 1 
-+# Shifts to get width & height in the right place in rb_dma0 ++# Shifts to get width & height in the right place in ra_dma0 +.set v_dma_h_shift, 7 +.set v_dma_wh_shift, i_shift16 +.else +.set v_x_shift, 1 +.set v_x_mul, 2 -+# Shifts to get width & height in the right place in rb_dma0 ++# Shifts to get width & height in the right place in ra_dma0 +.set v_dma_h_shift, 8 +.set v_dma_wh_shift, 15 +.endif + -+ mov ra0, unif ; mov r3, elem_num # y_x -+ mov ra_xshift, ra_xshift_next # [ra0 delay] -+ add r0, ra0.16b, r3 ++ mov ra0, unif ; mov r0, elem_num # y_x ++ mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5 # [ra0 delay] ; r5 = 0 ++ add r0, ra0.16b, r0 ; mov ra_base_next, unif # ; src1.base +.if v_x_shift != 0 + shl r0, r0, v_x_shift +.endif + -+ max r0, r0, 0 -+ min r0, r0, rb_max_x ++ max r0, r0, r5 ; mov ra_y_next, ra0.16a # ; width_height ++ min r0, r0, rb_max_x ; mov ra_width_height, unif + -+ shl ra_xshift_next, r0, 3 # Compute shifts -+ and r0, r0, -4 ; v8subs r2, r2, r2 -+ sub r2, r2, rb_pitch ; mov ra_base_next, unif # src1.base -+ and r1, r0, r2 ; mov ra_y_next, ra0.16a -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov ra_width_height, unif # Add stripe offsets ; width_height ++ shl ra_xshift_next, r0, 3 # Compute shifts ++ and r0, r0, -4 ++ sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif # ; weight_offset ++ and r1, r0, r2 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov ra_dest, unif # Add stripe offsets ; dest addr + add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init # [ra_width delay] ; set up VPM write + +# get width,height of block (unif load above) +# Compute vdw_setup1(dst_pitch-width) + shl r1, ra_width, v_x_shift + sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height -+ sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height -+ shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 -+ add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif # Combine width and height of destination area ; weight_offset -+ shl r0, r0, 
v_dma_wh_shift ; mov rb_dest, unif # Shift into bits 16 upwards of the vdw_setup0 register ; dest addr -+ add rb_dma0, r0, rb_dma0_base -+ -+ shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 # Offset calc ; r5 = 0 -+ # For B l1 & L0 offsets should be identical so it doesn't matter which we use -+ asr rb_wt_off, r0, 1 ; mov ra_link, unif # ; link ++ sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height ++ shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 ++ add r0, r0, r1 # Combine width and height of destination area ++ shl rb_wt_off, ra_wt_off_l0, DENOM + 7 ++ shl r0, r0, v_dma_wh_shift ; mov ra_link, unif # Shift into bits 16 upwards of the vdw_setup0 register ; link ++ add ra_dma0, r0, rb_dma0_base + +:1 -+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 -+ nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 -+ shr r0, r4, ra_xshift ; mov r3, rb_pitch ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ++ nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 ++ shr r0, r4, ra_xshift ; mov r3, rb_pitch + + max r2, ra_y, 0 # y -+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next -+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask ++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++ add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask + -+ sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 -+ shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height -+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++ sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 ++ shl r1, r1, 8 ; mov r3, ra_blk_height ++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 + + brr.anyn -, r:1b -+ asr r1, r1, rb_wt_den_p15 -+ min r1, r1, ra_pmax ; mov -, vw_wait -+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++ asr r1, r1, DENOM + 8 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +# >>> branch.anyn 1b + +# r0 = remaining height (min 0) +# r2 = r3 * rb_pitch -+# r3 = block_height (currently 
always 16) ++# r3 = block_height + +# If looping again then we consumed 16 height last loop +# rb_dma1 (stride) remains constant +# rb_i_tmu remains const (based on total height) -+# recalc rb_dma0, rb_lcount based on new segment height ++# recalc ra_dma0, rb_lcount based on new segment height + -+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 ++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 + +# DMA out + bra.anyz -, ra_link + min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride -+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW + shl r1, r1, i_shift23 +# >>> .anyz ra_link + +# Here r1 = cur_blk_height - 16 so it will be 0 or -ve +# We add to dma0 to reduce the number of output lines in the final block -+ add rb_lcount, rb_lcount, r0 + brr -, r:1b -+ add rb_dma0, rb_dma0, r1 -+ add rb_dest, rb_dest, r2 -+ mov vw_setup, rb_vpm_init # Reset our VDM write pointer ++ add rb_lcount, rb_lcount, r0 ++ add ra_dma0, ra_dma0, r1 ++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer +# >>> 1b +.endm + @@ -26595,65 +26613,63 @@ index 0000000000..f8572cdebe + m_luma_setup v_bit_depth + +# Fix up vals that were expecting a filter (somewhat icky) -+ mov r0, 7 -+ sub rb_i_tmu, rb_i_tmu, r0 -+ sub rb_lcount, rb_lcount, r0 -+ mov r0, 8 ; mov r1, ra_wt_off_mul_l0 -+ shl rb_wt_off, rb_wt_off, r0 -+ nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 ++ mov r2, 1 ++ add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0 # Need in rX rather than raX for <<8 to do what we want ++ shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5 # [r1 << delay] ; r5quad OK for zero ++ nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 + +:1 -+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 -+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 -+ shr r0, r4, ra_xshift ; mov r3, rb_pitch ++ sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1 ++ shr r1, r4, rb_xshift2 ; mov.ifz 
ra_y_y2, ra_y_y2_next ; ldtmu0 ++ shr r0, r4, ra_xshift ; mov r3, rb_pitch + + max r2, ra_y, 0 # y -+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next -+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++ add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next + + max r2, ra_y2, 0 + min r2, r2, rb_max_y -+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte -+ and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 ++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++ add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax # v8subs masks out all but bottom byte ++ and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 + -+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 -+ add r1, r0, r1 -+ shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height -+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 ++ add r1, r0, r1 ; v8adds r5rep, r5, ra_k1 ++ ++ shl r1, r1, 8 ; mov r3, ra_blk_height ++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 + + brr.anyn -, r:1b -+ asr r1, r1, rb_wt_den_p15 -+ min r1, r1, ra_pmax ; mov -, vw_wait -+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++ asr r1, r1, (DENOM + 9) - 32 # -32 to get valid shift immediate ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +# >>> branch.anyn 1b + +# r0 = remaining height (min 0) +# r2 = r3 * rb_pitch -+# r3 = block_height (currently always 16) ++# r3 = block_height + +# If looping again then we consumed 16 height last loop +# rb_dma1 (stride) remains constant +# rb_i_tmu remains const (based on total height) -+# recalc rb_dma0, rb_lcount based on new segment height ++# recalc ra_dma0, rb_lcount based on new segment height + -+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 ++ mov.setf ra_height, r0 ; mov 
vw_setup, ra_dma0 # ; VDW setup 0 + +# DMA out + bra.anyz -, ra_link -+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride -+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # ; Stride ++ sub r1, r0, r3 ; mov vw_addr, ra_dest # ; start the VDW + shl r1, r1, i_shift23 +# >>> .anyz ra_link + +# Here r1 = cur_blk_height - 16 so it will be 0 or -ve +# We add to dma0 to reduce the number of output lines in the final block -+ add rb_lcount, rb_lcount, r0 + brr -, r:1b -+ add rb_dma0, rb_dma0, r1 -+ add rb_dest, rb_dest, r2 -+ mov vw_setup, rb_vpm_init # Reset our VDM write pointer ++ add rb_lcount, rb_lcount, r0 ++ add ra_dma0, ra_dma0, r1 ++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer +# >>> 1b +.endm + @@ -26743,7 +26759,7 @@ index 0000000000..f8572cdebe +# Do not add code here because mc_end must appear after all other code. diff --git a/libavcodec/rpi_hevc_shader_cmd.h b/libavcodec/rpi_hevc_shader_cmd.h new file mode 100644 -index 0000000000..9f8983da52 +index 0000000000..2f06987bb9 --- /dev/null +++ b/libavcodec/rpi_hevc_shader_cmd.h @@ -0,0 +1,128 @@ @@ -26787,8 +26803,8 @@ index 0000000000..9f8983da52 + uint16_t w; + uint32_t coeffs_x1; + uint32_t coeffs_y1; -+ uint32_t weight_u1; -+ uint32_t weight_v1; ++ int16_t weight_u1; ++ int16_t weight_v1; + qpu_mc_src_t next_src2; + uint32_t coeffs_x2; + uint32_t coeffs_y2; @@ -26804,7 +26820,6 @@ index 0000000000..9f8983da52 + uint32_t pic_ch; // C Height (== Y Height / 2) + uint32_t stride2; + uint32_t stride1; -+ uint32_t wdenom; + qpu_mc_src_t next_src2; + uint32_t next_fn; +} qpu_mc_pred_c_s_t; @@ -26846,7 +26861,6 @@ index 0000000000..9f8983da52 + uint16_t pic_w; + uint32_t stride2; + uint32_t stride1; -+ uint32_t wdenom; + uint32_t next_fn; +} qpu_mc_pred_y_s_t; + @@ -26871,16 +26885,18 @@ index 0000000000..9f8983da52 +#define QPU_MC_PRED_N_Y10 12 +#define QPU_MC_PRED_N_C10 12 + ++#define QPU_MC_DENOM 7 ++ +#pragma pack(pop) + 
+#endif + diff --git a/libavcodec/rpi_hevc_shader_template.c b/libavcodec/rpi_hevc_shader_template.c new file mode 100644 -index 0000000000..0c80cf4de0 +index 0000000000..577850a6b4 --- /dev/null +++ b/libavcodec/rpi_hevc_shader_template.c -@@ -0,0 +1,62 @@ +@@ -0,0 +1,61 @@ +#include "hevc.h" +#include "rpi_hevcdec.h" +#include "libavutil/rpi_sand_fns.h" @@ -26896,7 +26912,6 @@ index 0000000000..0c80cf4de0 + uint32_t height; + uint32_t stride2; + uint32_t stride1; -+ uint32_t wdenom; +} shader_track_t; + +static int wtoidx(const unsigned int w) @@ -26973,10 +26988,10 @@ index 0000000000..304d73ea4a + diff --git a/libavcodec/rpi_hevc_shader_template_fn.h b/libavcodec/rpi_hevc_shader_template_fn.h new file mode 100644 -index 0000000000..b9e7c07fe3 +index 0000000000..59b00d537b --- /dev/null +++ b/libavcodec/rpi_hevc_shader_template_fn.h -@@ -0,0 +1,477 @@ +@@ -0,0 +1,475 @@ +#define STRCAT(x,y) x##y + +#if PW == 1 @@ -27214,7 +27229,6 @@ index 0000000000..b9e7c07fe3 + st->width = c->pic_w * PW; + st->stride1 = c->stride1; + st->stride2 = c->stride2; -+ st->wdenom = c->wdenom; + st->last_l0 = &c->next_src1; + st->last_l1 = &c->next_src2; + cmd = (const qpu_mc_pred_cmd_t *)(c + 1); @@ -27227,7 +27241,6 @@ index 0000000000..b9e7c07fe3 + st->width = c->pic_cw * PW; + st->stride1 = c->stride1; + st->stride2 = c->stride2; -+ st->wdenom = c->wdenom; + st->last_l0 = &c->next_src1; + st->last_l1 = &c->next_src2; + cmd = (const qpu_mc_pred_cmd_t *)(c + 1); @@ -27255,11 +27268,11 @@ index 0000000000..b9e7c07fe3 + // wo[offset] = offset*2+1 + s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w1)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0]( + (uint8_t *)c->dst_addr, st->stride1, patch_y1 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, -+ c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), w1); ++ c->h, QPU_MC_DENOM, wweight(c->wo1), woff_p(s, c->wo1), (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), w1); + if (w2 > 0) { + 
s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w2)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( + (uint8_t *)c->dst_addr + 8 * PW, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, -+ c->h, st->wdenom, wweight(c->wo2), woff_p(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), w2); ++ c->h, QPU_MC_DENOM, wweight(c->wo2), woff_p(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), w2); + } + st->last_l0 = &c->next_src1; + st->last_l1 = &c->next_src2; @@ -27287,7 +27300,7 @@ index 0000000000..b9e7c07fe3 + + s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( + (uint8_t *)c->dst_addr, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, patch_y3, -+ c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2), ++ c->h, QPU_MC_DENOM, wweight(c->wo1), wweight(c->wo2), + 0, woff_b(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), c->w); + st->last_l0 = &c->next_src1; + st->last_l1 = &c->next_src2; @@ -27306,7 +27319,7 @@ index 0000000000..b9e7c07fe3 + // wo[offset] = offset*2+1 + s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(c->w)][0][0]( + (uint8_t *)c->dst_addr, st->stride1, patch_y1, PATCH_STRIDE, -+ c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), 0, 0, c->w); ++ c->h, QPU_MC_DENOM, wweight(c->wo1), woff_p(s, c->wo1), 0, 0, c->w); + + st->last_l0 = &c->next_src1; + cmd = (const qpu_mc_pred_cmd_t *)(c + 1); @@ -27335,7 +27348,7 @@ index 0000000000..b9e7c07fe3 + + s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][0][0]( + (uint8_t *)c->dst_addr, st->stride1, patch_y2, PATCH_STRIDE, patch_y3, -+ c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2), ++ c->h, QPU_MC_DENOM, wweight(c->wo1), wweight(c->wo2), + 0, woff_b(s, c->wo2), 0, 0, c->w); + st->last_l0 = &c->next_src1; + st->last_l1 = &c->next_src2; @@ -27355,10 +27368,10 @@ index 0000000000..b9e7c07fe3 + + s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( + patch_u3, 8 * PW, patch_u1 + 
PATCH_STRIDE + PW, PATCH_STRIDE, -+ c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); ++ c->h, QPU_MC_DENOM, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); + s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( + patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, -+ c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); ++ c->h, QPU_MC_DENOM, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); + + FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); + @@ -27379,10 +27392,10 @@ index 0000000000..b9e7c07fe3 + + s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( + patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, -+ c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); ++ c->h, QPU_MC_DENOM, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); + s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( + patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, -+ c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); ++ c->h, QPU_MC_DENOM, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); + + FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); + @@ -27417,11 +27430,11 @@ index 0000000000..b9e7c07fe3 + + s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( + patch_u3, 8 * PW, patch_u2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_u4, -+ c->h, st->wdenom, c->weight_u1, wweight(c->wo_u2), ++ c->h, QPU_MC_DENOM, c->weight_u1, wweight(c->wo_u2), + 0, woff_b(s, c->wo_u2), mx2, my2, c->w); + s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( + patch_v3, 8 * PW, patch_v2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_v4, -+ c->h, st->wdenom, c->weight_v1, wweight(c->wo_v2), ++ c->h, QPU_MC_DENOM, c->weight_v1, wweight(c->wo_v2), + 0, woff_b(s, 
c->wo_v2), mx2, my2, c->w); + + FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); @@ -28106,10 +28119,10 @@ index 0000000000..1128a2c054 +}; diff --git a/libavcodec/rpi_hevcdec.c b/libavcodec/rpi_hevcdec.c new file mode 100644 -index 0000000000..25ae294ff4 +index 0000000000..39a63c77de --- /dev/null +++ b/libavcodec/rpi_hevcdec.c -@@ -0,0 +1,6013 @@ +@@ -0,0 +1,6016 @@ +/* + * HEVC video Decoder + * @@ -29209,136 +29222,130 @@ index 0000000000..25ae294ff4 +static void default_pred_weight_table(HEVCRpiContext * const s) +{ + unsigned int i; ++ const unsigned int wt = 1 << QPU_MC_DENOM; + s->sh.luma_log2_weight_denom = 0; + s->sh.chroma_log2_weight_denom = 0; + for (i = 0; i < s->sh.nb_refs[L0]; i++) { -+ s->sh.luma_weight_l0[i] = 1; ++ s->sh.luma_weight_l0[i] = wt; + s->sh.luma_offset_l0[i] = 0; -+ s->sh.chroma_weight_l0[i][0] = 1; ++ s->sh.chroma_weight_l0[i][0] = wt; ++ s->sh.chroma_weight_l0[i][1] = wt; + s->sh.chroma_offset_l0[i][0] = 0; -+ s->sh.chroma_weight_l0[i][1] = 1; + s->sh.chroma_offset_l0[i][1] = 0; + } + for (i = 0; i < s->sh.nb_refs[L1]; i++) { -+ s->sh.luma_weight_l1[i] = 1; ++ s->sh.luma_weight_l1[i] = wt; + s->sh.luma_offset_l1[i] = 0; -+ s->sh.chroma_weight_l1[i][0] = 1; ++ s->sh.chroma_weight_l1[i][0] = wt; ++ s->sh.chroma_weight_l1[i][1] = wt; + s->sh.chroma_offset_l1[i][0] = 0; -+ s->sh.chroma_weight_l1[i][1] = 1; + s->sh.chroma_offset_l1[i][1] = 0; + } +} + -+static int pred_weight_table(HEVCRpiContext *s, GetBitContext *gb) ++static int get_weights(HEVCRpiContext * const s, GetBitContext * const gb, ++ const unsigned int refs, ++ int16_t * luma_weight, int16_t * luma_offset, ++ int16_t * chroma_weight, int16_t * chroma_offset) +{ -+ int i = 0; -+ int j = 0; -+ uint8_t luma_weight_l0_flag[16]; -+ uint8_t chroma_weight_l0_flag[16]; -+ uint8_t luma_weight_l1_flag[16]; -+ uint8_t chroma_weight_l1_flag[16]; -+ unsigned int luma_log2_weight_denom; ++ 
unsigned int luma_flags; ++ unsigned int chroma_flags; ++ unsigned int i; ++ const unsigned int wp_offset_bd_shift = s->ps.sps->high_precision_offsets_enabled_flag ? 0 : (s->ps.sps->bit_depth - 8); ++ const int wp_offset_half_range = s->ps.sps->wp_offset_half_range; ++ const unsigned int luma_weight_base = 1 << QPU_MC_DENOM; ++ const unsigned int chroma_weight_base = 1 << QPU_MC_DENOM; ++ const unsigned int luma_weight_shift = (QPU_MC_DENOM - s->sh.luma_log2_weight_denom); ++ const unsigned int chroma_weight_shift = (QPU_MC_DENOM - s->sh.chroma_log2_weight_denom); + -+ luma_log2_weight_denom = get_ue_golomb_long(gb); -+ if (luma_log2_weight_denom > 7) { -+ av_log(s->avctx, AV_LOG_ERROR, "luma_log2_weight_denom %d is invalid\n", luma_log2_weight_denom); -+ return AVERROR_INVALIDDATA; -+ } -+ s->sh.luma_log2_weight_denom = luma_log2_weight_denom; -+ if (ctx_cfmt(s) != 0) { -+ const unsigned int chroma_log2_weight_denom = luma_log2_weight_denom + get_se_golomb(gb); -+ if (chroma_log2_weight_denom > 7) ++ if (refs == 0) ++ return 0; ++ ++ luma_flags = get_bits(gb, refs); ++ chroma_flags = ctx_cfmt(s) == 0 ? 
0 : get_bits(gb, refs); ++ i = 1 << (refs - 1); ++ ++ do ++ { ++ if ((luma_flags & i) != 0) + { -+ av_log(s->avctx, AV_LOG_ERROR, "chroma_log2_weight_denom %d is invalid\n", chroma_log2_weight_denom); -+ return AVERROR_INVALIDDATA; ++ const int delta_weight = get_se_golomb(gb); ++ const int offset = get_se_golomb(gb); ++ if (delta_weight < -128 || delta_weight > 127 || ++ offset < -wp_offset_half_range || offset >= wp_offset_half_range) ++ { ++ return AVERROR_INVALIDDATA; ++ } ++ *luma_weight++ = luma_weight_base + (delta_weight << luma_weight_shift); ++ *luma_offset++ = offset << wp_offset_bd_shift; ++ } ++ else ++ { ++ *luma_weight++ = luma_weight_base; ++ *luma_offset++ = 0; + } -+ s->sh.chroma_log2_weight_denom = chroma_log2_weight_denom; -+ } + -+ for (i = 0; i < s->sh.nb_refs[L0]; i++) { -+ luma_weight_l0_flag[i] = get_bits1(gb); -+ if (!luma_weight_l0_flag[i]) { -+ s->sh.luma_weight_l0[i] = 1 << s->sh.luma_log2_weight_denom; -+ s->sh.luma_offset_l0[i] = 0; -+ } -+ } -+ if (ctx_cfmt(s) != 0) { -+ for (i = 0; i < s->sh.nb_refs[L0]; i++) -+ chroma_weight_l0_flag[i] = get_bits1(gb); -+ } else { -+ for (i = 0; i < s->sh.nb_refs[L0]; i++) -+ chroma_weight_l0_flag[i] = 0; -+ } -+ for (i = 0; i < s->sh.nb_refs[L0]; i++) { -+ if (luma_weight_l0_flag[i]) { -+ int delta_luma_weight_l0 = get_se_golomb(gb); -+ s->sh.luma_weight_l0[i] = (1 << s->sh.luma_log2_weight_denom) + delta_luma_weight_l0; -+ s->sh.luma_offset_l0[i] = get_se_golomb(gb); -+ } -+ if (chroma_weight_l0_flag[i]) { -+ for (j = 0; j < 2; j++) { -+ int delta_chroma_weight_l0 = get_se_golomb(gb); -+ int delta_chroma_offset_l0 = get_se_golomb(gb); ++ if ((chroma_flags & i) != 0) ++ { ++ unsigned int j; ++ for (j = 0; j != 2; ++j) ++ { ++ const int delta_weight = get_se_golomb(gb); ++ const int delta_offset = get_se_golomb(gb); + -+ if ( (int8_t)delta_chroma_weight_l0 != delta_chroma_weight_l0 -+ || delta_chroma_offset_l0 < -(1<<17) || delta_chroma_offset_l0 > (1<<17)) { ++ if (delta_weight < -128 || 
delta_weight > 127 || ++ delta_offset < -4 * wp_offset_half_range || delta_offset >= 4 * wp_offset_half_range) ++ { + return AVERROR_INVALIDDATA; + } + -+ s->sh.chroma_weight_l0[i][j] = (1 << s->sh.chroma_log2_weight_denom) + delta_chroma_weight_l0; -+ s->sh.chroma_offset_l0[i][j] = av_clip((delta_chroma_offset_l0 - ((128 * s->sh.chroma_weight_l0[i][j]) -+ >> s->sh.chroma_log2_weight_denom) + 128), -128, 127); -+ } -+ } else { -+ s->sh.chroma_weight_l0[i][0] = 1 << s->sh.chroma_log2_weight_denom; -+ s->sh.chroma_offset_l0[i][0] = 0; -+ s->sh.chroma_weight_l0[i][1] = 1 << s->sh.chroma_log2_weight_denom; -+ s->sh.chroma_offset_l0[i][1] = 0; -+ } -+ } -+ if (s->sh.slice_type == HEVC_SLICE_B) { -+ for (i = 0; i < s->sh.nb_refs[L1]; i++) { -+ luma_weight_l1_flag[i] = get_bits1(gb); -+ if (!luma_weight_l1_flag[i]) { -+ s->sh.luma_weight_l1[i] = 1 << s->sh.luma_log2_weight_denom; -+ s->sh.luma_offset_l1[i] = 0; ++ *chroma_weight++ = chroma_weight_base + (delta_weight << chroma_weight_shift); ++ *chroma_offset++ = av_clip( ++ wp_offset_half_range + delta_offset - ++ ((wp_offset_half_range * ((1 << s->sh.chroma_log2_weight_denom) + delta_weight)) >> s->sh.chroma_log2_weight_denom), ++ -wp_offset_half_range, wp_offset_half_range - 1) << wp_offset_bd_shift; + } + } -+ if (ctx_cfmt(s) != 0) { -+ for (i = 0; i < s->sh.nb_refs[L1]; i++) -+ chroma_weight_l1_flag[i] = get_bits1(gb); -+ } else { -+ for (i = 0; i < s->sh.nb_refs[L1]; i++) -+ chroma_weight_l1_flag[i] = 0; ++ else ++ { ++ *chroma_weight++ = chroma_weight_base; ++ *chroma_weight++ = chroma_weight_base; ++ *chroma_offset++ = 0; ++ *chroma_offset++ = 0; + } -+ for (i = 0; i < s->sh.nb_refs[L1]; i++) { -+ if (luma_weight_l1_flag[i]) { -+ int delta_luma_weight_l1 = get_se_golomb(gb); -+ s->sh.luma_weight_l1[i] = (1 << s->sh.luma_log2_weight_denom) + delta_luma_weight_l1; -+ s->sh.luma_offset_l1[i] = get_se_golomb(gb); -+ } -+ if (chroma_weight_l1_flag[i]) { -+ for (j = 0; j < 2; j++) { -+ int delta_chroma_weight_l1 = 
get_se_golomb(gb); -+ int delta_chroma_offset_l1 = get_se_golomb(gb); ++ } while ((i >>= 1) != 0); + -+ if ( (int8_t)delta_chroma_weight_l1 != delta_chroma_weight_l1 -+ || delta_chroma_offset_l1 < -(1<<17) || delta_chroma_offset_l1 > (1<<17)) { -+ return AVERROR_INVALIDDATA; -+ } ++ return 0; ++} + -+ s->sh.chroma_weight_l1[i][j] = (1 << s->sh.chroma_log2_weight_denom) + delta_chroma_weight_l1; -+ s->sh.chroma_offset_l1[i][j] = av_clip((delta_chroma_offset_l1 - ((128 * s->sh.chroma_weight_l1[i][j]) -+ >> s->sh.chroma_log2_weight_denom) + 128), -128, 127); -+ } -+ } else { -+ s->sh.chroma_weight_l1[i][0] = 1 << s->sh.chroma_log2_weight_denom; -+ s->sh.chroma_offset_l1[i][0] = 0; -+ s->sh.chroma_weight_l1[i][1] = 1 << s->sh.chroma_log2_weight_denom; -+ s->sh.chroma_offset_l1[i][1] = 0; -+ } -+ } ++static int pred_weight_table(HEVCRpiContext *s, GetBitContext *gb) ++{ ++ int err; ++ const unsigned int luma_log2_weight_denom = get_ue_golomb_long(gb); ++ const unsigned int chroma_log2_weight_denom = (ctx_cfmt(s) == 0) ? 
0 : luma_log2_weight_denom + get_se_golomb(gb); ++ ++ if (luma_log2_weight_denom > 7 || ++ chroma_log2_weight_denom > 7) ++ { ++ av_log(s->avctx, AV_LOG_ERROR, "Invalid prediction weight denom: luma=%d, chroma=%d\n", ++ luma_log2_weight_denom, chroma_log2_weight_denom); ++ return AVERROR_INVALIDDATA; + } ++ ++ s->sh.luma_log2_weight_denom = luma_log2_weight_denom; ++ s->sh.chroma_log2_weight_denom = chroma_log2_weight_denom; ++ ++ if ((err = get_weights(s, gb, s->sh.nb_refs[L0], ++ s->sh.luma_weight_l0, s->sh.luma_offset_l0, ++ s->sh.chroma_weight_l0[0], s->sh.chroma_offset_l0[0])) != 0 || ++ (err = get_weights(s, gb, s->sh.nb_refs[L1], ++ s->sh.luma_weight_l1, s->sh.luma_offset_l1, ++ s->sh.chroma_weight_l1[0], s->sh.chroma_offset_l1[0])) != 0) ++ { ++ av_log(s->avctx, AV_LOG_ERROR, "Invalid prediction weight or offset\n"); ++ return err; ++ } ++ + return 0; +} + @@ -29404,7 +29411,7 @@ index 0000000000..25ae294ff4 + const HEVCRpiSPS *sps) +{ + const HEVCRpiVPS *vps = (const HEVCRpiVPS*)ps->vps_list[sps->vps_id]->data; -+ const HEVCWindow *ow = &sps->output_window; ++ const HEVCRpiWindow *ow = &sps->output_window; + unsigned int num = 0, den = 0; + + avctx->pix_fmt = sps->pix_fmt; @@ -29805,15 +29812,15 @@ index 0000000000..25ae294ff4 + } + + if ((s->ps.pps->weighted_pred_flag && sh->slice_type == HEVC_SLICE_P) || -+ (s->ps.pps->weighted_bipred_flag && sh->slice_type == HEVC_SLICE_B)) { -+ int ret = pred_weight_table(s, gb); -+ if (ret < 0) ++ (s->ps.pps->weighted_bipred_flag && sh->slice_type == HEVC_SLICE_B)) ++ { ++ if ((ret = pred_weight_table(s, gb)) != 0) + return ret; + } + else + { -+ // Give us unit weights -+ default_pred_weight_table(s); ++ // Give us unit weights ++ default_pred_weight_table(s); + } + + sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb); @@ -30633,12 +30640,17 @@ index 0000000000..25ae294ff4 +#define get_mc_address_u(f) get_vc_address_u(f) +#endif + -+static inline int offset_depth_adj(const HEVCRpiContext *const s, const int wt) 
++static inline uint32_t pack_wo_p(const int off, const int mul) +{ -+ return s->ps.sps->high_precision_offsets_enabled_flag ? wt : -+ wt << (s->ps.sps->bit_depth - 8); ++ return PACK2(off * 2 + 1, mul); +} + ++static inline uint32_t pack_wo_b(const int off0, const int off1, const int mul) ++{ ++ return PACK2(off0 + off1 + 1, mul); ++} ++ ++ +static void +rpi_pred_y(const HEVCRpiContext *const s, HEVCRpiJob * const jb, + const int x0, const int y0, @@ -30655,7 +30667,7 @@ index 0000000000..25ae294ff4 + const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx; + const qpu_mc_src_addr_t src_vc_address_y = get_mc_address_y(src_frame); + qpu_mc_dst_addr_t dst_addr = get_mc_address_y(s->frame) + y_off; -+ const uint32_t wo = PACK2(offset_depth_adj(s, weight_offset) * 2 + 1, weight_mul); ++ const uint32_t wo = pack_wo_p(weight_offset, weight_mul); + HEVCRpiInterPredEnv * const ipe = &jb->luma_ip; + const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); + @@ -30823,10 +30835,8 @@ index 0000000000..25ae294ff4 + const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; + const unsigned int ref_idx0 = mv_field->ref_idx[0]; + const unsigned int ref_idx1 = mv_field->ref_idx[1]; -+ const uint32_t wt_offset = -+ offset_depth_adj(s, s->sh.luma_offset_l0[ref_idx0] + s->sh.luma_offset_l1[ref_idx1]) + 1; -+ const uint32_t wo1 = PACK2(wt_offset, s->sh.luma_weight_l0[ref_idx0]); -+ const uint32_t wo2 = PACK2(wt_offset, s->sh.luma_weight_l1[ref_idx1]); ++ const uint32_t wo1 = pack_wo_b(s->sh.luma_offset_l0[ref_idx0], s->sh.luma_offset_l1[ref_idx1], s->sh.luma_weight_l0[ref_idx0]); ++ const uint32_t wo2 = pack_wo_b(s->sh.luma_offset_l0[ref_idx0], s->sh.luma_offset_l1[ref_idx1], s->sh.luma_weight_l1[ref_idx1]); + + const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); + qpu_mc_dst_addr_t dst = get_mc_address_y(s->frame) + y_off; @@ -30952,8 +30962,8 @@ index 0000000000..25ae294ff4 + const qpu_mc_src_addr_t src_base_u = get_mc_address_u(src_frame); + const uint32_t x_coeffs = 
rpi_filter_coefs[av_mod_uintp2(MV_X(mv), 2 + hshift) << (1 - hshift)]; + const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_Y(mv), 2 + vshift) << (1 - vshift)]; -+ const uint32_t wo_u = PACK2(offset_depth_adj(s, c_offsets[0]) * 2 + 1, c_weights[0]); -+ const uint32_t wo_v = PACK2(offset_depth_adj(s, c_offsets[1]) * 2 + 1, c_weights[1]); ++ const uint32_t wo_u = pack_wo_p(c_offsets[0], c_weights[0]); ++ const uint32_t wo_v = pack_wo_p(c_offsets[1], c_weights[1]); + qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; + HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip; + const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; @@ -31018,8 +31028,8 @@ index 0000000000..25ae294ff4 + const int x2_c = x0_c + (MV_X(mv2) >> (2 + hshift)) - 1; + const int y2_c = y0_c + (MV_Y(mv2) >> (2 + hshift)) - 1; + -+ const uint32_t wo_u2 = PACK2(offset_depth_adj(s, c_offsets[0] + c_offsets2[0]) + 1, c_weights2[0]); -+ const uint32_t wo_v2 = PACK2(offset_depth_adj(s, c_offsets[1] + c_offsets2[1]) + 1, c_weights2[1]); ++ const uint32_t wo_u2 = pack_wo_b(c_offsets[0], c_offsets2[0], c_weights2[0]); ++ const uint32_t wo_v2 = pack_wo_b(c_offsets[1], c_offsets2[1], c_weights2[1]); + + const qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; + const qpu_mc_src_addr_t src1_base = get_mc_address_u(src_frame); @@ -31297,16 +31307,17 @@ index 0000000000..25ae294ff4 + const unsigned int idx) +{ + const unsigned int ctb_mask = ~(~0U << s->ps.sps->log2_ctb_size); -+ int xb_pu = (x0 & ctb_mask) >> LOG2_MIN_PU_SIZE; -+ int yb_pu = (y0 & ctb_mask) >> LOG2_MIN_PU_SIZE; ++ const unsigned int xb_pu = (x0 & ctb_mask) >> LOG2_MIN_PU_SIZE; ++ const unsigned int yb_pu = (y0 & ctb_mask) >> LOG2_MIN_PU_SIZE; + + // Up does not cross boundries so as we always scan 1 slice-tile-line in an + // lc we can just keep 1 CTB lR stashes ++ // Left is reset to DC @ Start of Line/Tile/Slice in fill_job + const unsigned int cand_up = yb_pu == 0 ? 
INTRA_DC : lc->ipm_up[xb_pu]; -+ const unsigned int cand_left = ((lc->ctb_avail & AVAIL_L) == 0 && xb_pu == 0) ? INTRA_DC : lc->ipm_left[yb_pu]; ++ const unsigned int cand_left = lc->ipm_left[yb_pu]; + -+ int intra_pred_mode; -+ int a, b, c; ++ unsigned int intra_pred_mode; ++ unsigned int a, b, c; + + if (cand_left == cand_up) { + if (cand_left < 2) { @@ -31349,7 +31360,6 @@ index 0000000000..25ae294ff4 + } + + /* write the intra prediction units into the mv array */ -+ + set_ipm(s, lc, x0, y0, log2_pu_size, intra_pred_mode); + return intra_pred_mode; +} @@ -31844,7 +31854,6 @@ index 0000000000..25ae294ff4 + u->pic_ch = pic_height_c; + u->stride2 = av_rpi_sand_frame_stride2(s->frame); + u->stride1 = av_rpi_sand_frame_stride1(s->frame); -+ u->wdenom = s->sh.chroma_log2_weight_denom; + cp->last_l0 = &u->next_src1; + + u->next_fn = 0; @@ -31871,7 +31880,6 @@ index 0000000000..25ae294ff4 + y->pic_w = pic_width_y; + y->stride2 = av_rpi_sand_frame_stride2(s->frame); + y->stride1 = av_rpi_sand_frame_stride1(s->frame); -+ y->wdenom = s->sh.luma_log2_weight_denom; + y->next_fn = 0; + yp->last_l0 = &y->next_src1; + yp->last_l1 = &y->next_src2; @@ -32437,17 +32445,19 @@ index 0000000000..25ae294ff4 + +static int fill_job(HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, unsigned int max_blocks) +{ -+ const int ctb_size = (1 << s->ps.sps->log2_ctb_size); ++ const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size; ++ const unsigned int ctb_size = (1 << log2_ctb_size); + HEVCRpiJob * const jb = lc->jb0; + int more_data = 1; -+ int ctb_addr_ts = lc->ts; ++ unsigned int ctb_addr_ts = lc->ts; ++ unsigned int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; ++ unsigned int x_ctb = (ctb_addr_rs % s->ps.sps->ctb_width) << log2_ctb_size; ++ const unsigned int y_ctb = (ctb_addr_rs / s->ps.sps->ctb_width) << log2_ctb_size; + + lc->unit_done = 0; ++ + while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) + { -+ const int ctb_addr_rs = 
s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; -+ const int x_ctb = (ctb_addr_rs % s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size; -+ const int y_ctb = (ctb_addr_rs / s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size; + int q_full; + const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts]; + @@ -32455,7 +32465,7 @@ index 0000000000..25ae294ff4 + + ff_hevc_rpi_cabac_init(s, lc, ctb_flags); + -+ hls_sao_param(s, lc, x_ctb >> s->ps.sps->log2_ctb_size, y_ctb >> s->ps.sps->log2_ctb_size); ++ hls_sao_param(s, lc, x_ctb >> log2_ctb_size, y_ctb >> log2_ctb_size); + + s->deblock[ctb_addr_rs].beta_offset = s->sh.beta_offset; + s->deblock[ctb_addr_rs].tc_offset = s->sh.tc_offset; @@ -32463,9 +32473,12 @@ index 0000000000..25ae294ff4 + + // Zap stashes if navail + if ((lc->ctb_avail & AVAIL_U) == 0) -+ zap_cabac_stash(s->cabac_stash_up + (x_ctb >> 3), s->ps.sps->log2_ctb_size - 3); ++ zap_cabac_stash(s->cabac_stash_up + (x_ctb >> 3), log2_ctb_size - 3); + if ((lc->ctb_avail & AVAIL_L) == 0) -+ zap_cabac_stash(s->cabac_stash_left + (y_ctb >> 3), s->ps.sps->log2_ctb_size - 3); ++ { ++ memset(lc->ipm_left, INTRA_DC, IPM_TAB_SIZE); ++ zap_cabac_stash(s->cabac_stash_left + (y_ctb >> 3), log2_ctb_size - 3); ++ } +#if MVF_STASH_WIDTH > 64 + // Restore left mvf stash at start of tile if not at start of line + if ((ctb_flags & CTB_TS_FLAGS_SOTL) != 0 && x_ctb != 0 && !s->is_irap) @@ -32487,7 +32500,7 @@ index 0000000000..25ae294ff4 + lc->tu.cu_chroma_qp_offset_wanted = 0; + + // Decode -+ more_data = hls_coding_quadtree(s, lc, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0); ++ more_data = hls_coding_quadtree(s, lc, x_ctb, y_ctb, log2_ctb_size, 0); + + if (ff_hevc_rpi_cabac_overflow(lc)) + { @@ -32514,8 +32527,8 @@ index 0000000000..25ae294ff4 + // --- Post CTB processing + + // Stash rpl top/left for deblock that needs to remember such things cross-slice -+ s->rpl_up[x_ctb >> s->ps.sps->log2_ctb_size] = s->refPicList; -+ s->rpl_left[y_ctb >> s->ps.sps->log2_ctb_size] = 
s->refPicList; ++ s->rpl_up[x_ctb >> log2_ctb_size] = s->refPicList; ++ s->rpl_left[y_ctb >> log2_ctb_size] = s->refPicList; + + if (!s->is_irap) + { @@ -32580,6 +32593,8 @@ index 0000000000..25ae294ff4 + + // Inc TS to next. + ctb_addr_ts++; ++ ctb_addr_rs++; ++ x_ctb += ctb_size; + + if (q_full) + { @@ -32642,6 +32657,7 @@ index 0000000000..25ae294ff4 + if (is_dep) + { + dst_lc->qPy_pred = src_lc->qPy_pred; ++ memcpy(dst_lc->ipm_left, src_lc->ipm_left, sizeof(src_lc->ipm_left)); + memcpy(dst_lc->cabac_state, src_lc->cabac_state, sizeof(src_lc->cabac_state)); + memcpy(dst_lc->stat_coeff, src_lc->stat_coeff, sizeof(src_lc->stat_coeff)); + } @@ -32794,7 +32810,7 @@ index 0000000000..25ae294ff4 + } + else + { -+ movlc(s->HEVClcList[0], lc, s->ps.pps->dependent_slice_segments_enabled_flag); ++ movlc(s->HEVClcList[0], lc, s->ps.pps->dependent_slice_segments_enabled_flag); // * & not EoT +#if MVF_STASH_WIDTH > 64 + // Horrid calculations to work out what we want but luckily this should almost never execute + // **** Move to movlc @@ -43505,10 +43521,10 @@ index 0000000000..b1e99a6a89 + diff --git a/pi-util/conf_h265.2016.csv b/pi-util/conf_h265.2016.csv new file mode 100644 -index 0000000000..b5381794be +index 0000000000..3e90f6893f --- /dev/null +++ b/pi-util/conf_h265.2016.csv -@@ -0,0 +1,194 @@ +@@ -0,0 +1,195 @@ +1,HEVC_v1/AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5 +1,HEVC_v1/AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5 +1,HEVC_v1/AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 @@ -43703,6 +43719,7 @@ index 0000000000..b5381794be +1,local/dblk_cu16_mobile_344x280,dblk_cu16_mobile_344x280.265,dblk_cu16_mobile_344x280.md5 +1,local/dblksao_cu16_mobile_344x280,dblksao_cu16_mobile_344x280.265,dblksao_cu16_mobile_344x280.md5 +1,local/dblk_pu32_horses_832x448,dblk_pu32_horses_832x448.265,dblk_pu32_horses_832x448.md5 ++1,local/intra_pred_21_laps,intra_pred_21_laps.265,intra_pred_21_laps.md5 diff --git 
a/pi-util/conf_h265.2016_HEVC_v1.csv b/pi-util/conf_h265.2016_HEVC_v1.csv new file mode 100644 index 0000000000..6082641271 @@ -44421,18 +44438,125 @@ index 0000000000..67b22d2d51 + print >>out, ' // %04x' % (i - 8) + print >>out,'};' + +diff --git a/pi-util/perfcmp.py b/pi-util/perfcmp.py +new file mode 100644 +index 0000000000..e44cfa0c3c +--- /dev/null ++++ b/pi-util/perfcmp.py +@@ -0,0 +1,101 @@ ++#!/usr/bin/env python3 ++ ++import time ++import string ++import os ++import tempfile ++import subprocess ++import re ++import argparse ++import sys ++import csv ++from stat import * ++ ++class tstats: ++ close_threshold = 0.01 ++ ++ def __init__(self, stats_dict=None): ++ if stats_dict != None: ++ self.name = stats_dict["name"] ++ self.elapsed = float(stats_dict["elapsed"]) ++ self.user = float(stats_dict["user"]) ++ self.sys = float(stats_dict["sys"]) ++ ++ def times_str(self): ++ ctime = self.sys + self.user ++ return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed) ++ ++ def dict(self): ++ return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys} ++ ++ def is_close(self, other): ++ return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold ++ ++ def __lt__(self, other): ++ return self.elapsed < other.elapsed ++ def __gt__(self, other): ++ return self.elapsed > other.elapsed ++ ++ def time_file(name, prefix): ++ stats = tstats() ++ stats.name = name ++ start_time = time.clock_gettime(time.CLOCK_MONOTONIC); ++ cproc = subprocess.Popen(["./ffmpeg", "-t", "30", "-i", prefix + name, ++ "-f", "null", os.devnull], bufsize=-1, stdout=flog, stderr=flog); ++ pinfo = os.wait4(cproc.pid, 0) ++ end_time = time.clock_gettime(time.CLOCK_MONOTONIC); ++ stats.elapsed = end_time - start_time ++ stats.user = pinfo[2].ru_utime ++ stats.sys = pinfo[2].ru_stime ++ return stats ++ ++ ++def common_prefix(s1, s2): ++ for i in range(min(len(s1),len(s2))): ++ if s1[i] != s2[i]: ++ return s1[:i] ++ return 
s1[:i+1] ++ ++def main(): ++ argp = argparse.ArgumentParser(description="FFmpeg performance compare") ++ ++ argp.add_argument("stream0", help="CSV to compare") ++ argp.add_argument("stream1", nargs='?', default="ffperf_out.csv", help="CSV to compare") ++ ++ args = argp.parse_args() ++ ++ with open(args.stream0, 'r', newline='') as f_in: ++ stats0 = {x["name"]:tstats(x) for x in csv.DictReader(f_in)} ++ with open(args.stream1, 'r', newline='') as f_in: ++ stats1 = {x["name"]:tstats(x) for x in csv.DictReader(f_in)} ++ ++ print (args.stream0, "<<-->>", args.stream1) ++ print () ++ ++ for f in sorted(stats0.keys() | stats1.keys(), key=lambda x : "~" * x.count(os.sep) + x.lower()): ++ if not (f in stats0) : ++ print (" XX :", f) ++ continue ++ if not (f in stats1) : ++ print (" XX :", f) ++ continue ++ ++ s0 = stats0[f] ++ s1 = stats1[f] ++ ++ pcent = ((s0.elapsed - s1.elapsed) / s0.elapsed) * 100.0 ++ thresh = 0.3 ++ tc = 6 ++ ++ nchar = min(tc - 1, int(abs(pcent) / thresh)) ++ cc = " -- " if nchar == 0 else "<" * nchar + " " * (tc - nchar) if pcent < 0 else " " * (tc - nchar) + ">" * nchar ++ ++ print ("%6.2f %s%6.2f (%+5.2f) : %s" % ++ (s0.elapsed, cc, s1.elapsed, pcent, f)) ++ ++ return 0 ++ ++ ++if __name__ == '__main__': ++ exit(main()) ++ diff --git a/pi-util/qem.sh b/pi-util/qem.sh new file mode 100755 -index 0000000000..5ce2eeaf72 +index 0000000000..a4dbb6eacd --- /dev/null +++ b/pi-util/qem.sh @@ -0,0 +1,9 @@ +TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex +QASM=python\ ../local/bin/qasm.py -+SRC_FILE=libavcodec/rpi_shader.qasm ++SRC_FILE=libavcodec/rpi_hevc_shader.qasm +DST_BASE=shader + -+cp libavcodec/rpi_shader_cmd.h $TARGET_DIR ++cp libavcodec/rpi_hevc_shader_cmd.h $TARGET_DIR +$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c +$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h +