gentoo/dev-libs/blake3/files/blake3-1.8.2-x32.patch
Sam James 73b7b2969f
dev-libs/blake3: fix x32 compat
Closes: https://bugs.gentoo.org/942562
Signed-off-by: Sam James <sam@gentoo.org>
2025-09-24 02:14:10 +01:00

539 lines
18 KiB
Diff

https://bugs.gentoo.org/942562
https://github.com/BLAKE3-team/BLAKE3/issues/499
https://github.com/BLAKE3-team/BLAKE3/pull/500
From 93958a2775a8453f0549ed3560c82c3d487b24d8 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald@gigawatt.nl>
Date: Sat, 19 Jul 2025 11:43:32 +0100
Subject: [PATCH] [x32] Fix assembly
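The x86-64 assembly implementations of BLAKE3 are used both in 64-bit
and in 32-bit pointer mode (x32, _ILP32), but they only worked in 64-bit
pointer mode. This PR makes them work in 32-bit pointer mode as well:
when _ILP32 is defined, rsi and rdx are zero-extended from their low
32 bits on entry, and the pointer table at rdi is read as 4-byte entries
(dword loads, with rdi advanced by 4 bytes per input instead of 8).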
---
c/blake3_avx2_x86-64_unix.S | 43 +++++++++++++++++
c/blake3_avx512_x86-64_unix.S | 91 +++++++++++++++++++++++++++++++++++
c/blake3_sse2_x86-64_unix.S | 28 +++++++++++
c/blake3_sse41_x86-64_unix.S | 28 +++++++++++
4 files changed, 190 insertions(+)
diff --git a/blake3_avx2_x86-64_unix.S b/blake3_avx2_x86-64_unix.S
index 812bb856..e977627c 100644
--- a/blake3_avx2_x86-64_unix.S
+++ b/blake3_avx2_x86-64_unix.S
@@ -33,6 +33,10 @@ blake3_hash_many_avx2:
mov rbp, rsp
sub rsp, 680
and rsp, 0xFFFFFFFFFFFFFFC0
+#ifdef _ILP32
+ mov esi, esi
+ mov edx, edx
+#endif
neg r9d
vmovd xmm0, r9d
vpbroadcastd ymm0, xmm0
@@ -65,6 +69,7 @@ blake3_hash_many_avx2:
vpbroadcastd ymm5, dword ptr [rcx+0x14]
vpbroadcastd ymm6, dword ptr [rcx+0x18]
vpbroadcastd ymm7, dword ptr [rcx+0x1C]
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
@@ -73,6 +78,16 @@ blake3_hash_many_avx2:
mov r13, qword ptr [rdi+0x28]
mov r14, qword ptr [rdi+0x30]
mov r15, qword ptr [rdi+0x38]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+ mov r10d, dword ptr [rdi+0x8]
+ mov r11d, dword ptr [rdi+0xc]
+ mov r12d, dword ptr [rdi+0x10]
+ mov r13d, dword ptr [rdi+0x14]
+ mov r14d, dword ptr [rdi+0x18]
+ mov r15d, dword ptr [rdi+0x1c]
+#endif
movzx eax, byte ptr [rbp+0x38]
movzx ebx, byte ptr [rbp+0x40]
or eax, ebx
@@ -1293,7 +1308,11 @@ blake3_hash_many_avx2:
vmovdqa ymm0, ymmword ptr [rsp+0x260]
vpsubd ymm2, ymm0, ymm2
vmovdqa ymmword ptr [rsp+0x260], ymm2
+#ifndef _ILP32
add rdi, 64
+#else
+ add rdi, 32
+#endif
add rbx, 256
mov qword ptr [rbp+0x50], rbx
sub rsi, 8
@@ -1334,10 +1353,17 @@ blake3_hash_many_avx2:
vpblendd ymm15, ymm15, ymm12, 0x44
vmovdqa ymmword ptr [rsp], ymm14
vmovdqa ymmword ptr [rsp+0x20], ymm15
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
mov r11, qword ptr [rdi+0x18]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+ mov r10d, dword ptr [rdi+0x8]
+ mov r11d, dword ptr [rdi+0xc]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
@@ -1545,7 +1571,11 @@ blake3_hash_many_avx2:
vmovaps xmmword ptr [rsp+0x240], xmm0
vmovaps xmmword ptr [rsp+0x260], xmm2
add rbx, 128
+#ifndef _ILP32
add rdi, 32
+#else
+ add rdi, 16
+#endif
sub rsi, 4
3:
test rsi, 0x2
@@ -1561,8 +1591,13 @@ blake3_hash_many_avx2:
vinserti128 ymm13, ymm13, xmm14, 0x01
vbroadcasti128 ymm14, xmmword ptr [ROT16+rip]
vbroadcasti128 ymm15, xmmword ptr [ROT8+rip]
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
@@ -1671,7 +1706,11 @@ blake3_hash_many_avx2:
vmovaps ymmword ptr [rsp+0x240], ymm0
vmovaps ymmword ptr [rsp+0x260], ymm2
add rbx, 64
+#ifndef _ILP32
add rdi, 16
+#else
+ add rdi, 8
+#endif
sub rsi, 2
3:
test rsi, 0x1
@@ -1683,7 +1722,11 @@ blake3_hash_many_avx2:
vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
vmovdqa xmm14, xmmword ptr [ROT16+rip]
vmovdqa xmm15, xmmword ptr [ROT8+rip]
+#ifndef _ILP32
mov r8, qword ptr [rdi]
+#else
+ mov r8d, dword ptr [rdi]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
diff --git a/blake3_avx512_x86-64_unix.S b/blake3_avx512_x86-64_unix.S
index 9642e413..7c09704e 100644
--- a/blake3_avx512_x86-64_unix.S
+++ b/blake3_avx512_x86-64_unix.S
@@ -41,6 +41,10 @@ blake3_hash_many_avx512:
sub rsp, 144
and rsp, 0xFFFFFFFFFFFFFFC0
neg r9
+#ifdef _ILP32
+ mov esi, esi
+ mov edx, edx
+#endif
kmovw k1, r9d
vmovd xmm0, r8d
vpbroadcastd ymm0, xmm0
@@ -89,6 +93,7 @@ blake3_hash_many_avx512:
cmp rdx, qword ptr [rsp+0x80]
cmove eax, ebx
mov dword ptr [rsp+0x88], eax
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
@@ -97,6 +102,16 @@ blake3_hash_many_avx512:
mov r13, qword ptr [rdi+0x48]
mov r14, qword ptr [rdi+0x50]
mov r15, qword ptr [rdi+0x58]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+ mov r10d, dword ptr [rdi+0x8]
+ mov r11d, dword ptr [rdi+0xc]
+ mov r12d, dword ptr [rdi+0x20]
+ mov r13d, dword ptr [rdi+0x24]
+ mov r14d, dword ptr [rdi+0x28]
+ mov r15d, dword ptr [rdi+0x2c]
+#endif
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
@@ -109,6 +124,7 @@ blake3_hash_many_avx512:
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
vpunpcklqdq zmm10, zmm18, zmm19
vpunpckhqdq zmm11, zmm18, zmm19
+#ifndef _ILP32
mov r8, qword ptr [rdi+0x20]
mov r9, qword ptr [rdi+0x28]
mov r10, qword ptr [rdi+0x30]
@@ -117,6 +133,16 @@ blake3_hash_many_avx512:
mov r13, qword ptr [rdi+0x68]
mov r14, qword ptr [rdi+0x70]
mov r15, qword ptr [rdi+0x78]
+#else
+ mov r8d, dword ptr [rdi+0x10]
+ mov r9d, dword ptr [rdi+0x14]
+ mov r10d, dword ptr [rdi+0x18]
+ mov r11d, dword ptr [rdi+0x1c]
+ mov r12d, dword ptr [rdi+0x30]
+ mov r13d, dword ptr [rdi+0x34]
+ mov r14d, dword ptr [rdi+0x38]
+ mov r15d, dword ptr [rdi+0x3c]
+#endif
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
@@ -151,6 +177,7 @@ blake3_hash_many_avx512:
vmovdqa32 zmm23, zmm19
vpermt2d zmm19, zmm27, zmm8
vpermt2d zmm23, zmm31, zmm8
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
@@ -159,6 +186,16 @@ blake3_hash_many_avx512:
mov r13, qword ptr [rdi+0x48]
mov r14, qword ptr [rdi+0x50]
mov r15, qword ptr [rdi+0x58]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+ mov r10d, dword ptr [rdi+0x8]
+ mov r11d, dword ptr [rdi+0xc]
+ mov r12d, dword ptr [rdi+0x20]
+ mov r13d, dword ptr [rdi+0x24]
+ mov r14d, dword ptr [rdi+0x28]
+ mov r15d, dword ptr [rdi+0x2c]
+#endif
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
@@ -179,6 +216,7 @@ blake3_hash_many_avx512:
prefetcht0 [r14+rdx+0x80]
prefetcht0 [r11+rdx+0x80]
prefetcht0 [r15+rdx+0x80]
+#ifndef _ILP32
mov r8, qword ptr [rdi+0x20]
mov r9, qword ptr [rdi+0x28]
mov r10, qword ptr [rdi+0x30]
@@ -187,6 +225,16 @@ blake3_hash_many_avx512:
mov r13, qword ptr [rdi+0x68]
mov r14, qword ptr [rdi+0x70]
mov r15, qword ptr [rdi+0x78]
+#else
+ mov r8d, dword ptr [rdi+0x10]
+ mov r9d, dword ptr [rdi+0x14]
+ mov r10d, dword ptr [rdi+0x18]
+ mov r11d, dword ptr [rdi+0x1c]
+ mov r12d, dword ptr [rdi+0x30]
+ mov r13d, dword ptr [rdi+0x34]
+ mov r14d, dword ptr [rdi+0x38]
+ mov r15d, dword ptr [rdi+0x3c]
+#endif
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
@@ -1077,7 +1125,11 @@ blake3_hash_many_avx512:
vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16}
vmovdqa32 zmmword ptr [rsp], zmm2
vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1
+#ifndef _ILP32
add rdi, 128
+#else
+ add rdi, 64
+#endif
add rbx, 512
mov qword ptr [rbp+0x50], rbx
sub rsi, 16
@@ -1107,6 +1159,7 @@ blake3_hash_many_avx512:
vpbroadcastd ymm5, dword ptr [rcx+0x14]
vpbroadcastd ymm6, dword ptr [rcx+0x18]
vpbroadcastd ymm7, dword ptr [rcx+0x1C]
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
@@ -1115,6 +1168,16 @@ blake3_hash_many_avx512:
mov r13, qword ptr [rdi+0x28]
mov r14, qword ptr [rdi+0x30]
mov r15, qword ptr [rdi+0x38]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+ mov r10d, dword ptr [rdi+0x8]
+ mov r11d, dword ptr [rdi+0xc]
+ mov r12d, dword ptr [rdi+0x10]
+ mov r13d, dword ptr [rdi+0x14]
+ mov r14d, dword ptr [rdi+0x18]
+ mov r15d, dword ptr [rdi+0x1c]
+#endif
movzx eax, byte ptr [rbp+0x38]
movzx ebx, byte ptr [rbp+0x40]
or eax, ebx
@@ -2037,7 +2100,11 @@ blake3_hash_many_avx512:
vmovdqa ymmword ptr [rsp+0x2*0x20], ymm2
add rbx, 256
mov qword ptr [rbp+0x50], rbx
+#ifndef _ILP32
add rdi, 64
+#else
+ add rdi, 32
+#endif
sub rsi, 8
3:
mov rbx, qword ptr [rbp+0x50]
@@ -2060,10 +2127,17 @@ blake3_hash_many_avx512:
kmovw k2, eax
vpblendmd zmm13 {k2}, zmm13, zmm12
vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip]
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
mov r11, qword ptr [rdi+0x18]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+ mov r10d, dword ptr [rdi+0x8]
+ mov r11d, dword ptr [rdi+0xc]
+#endif
mov eax, 43690
kmovw k3, eax
mov eax, 34952
@@ -2177,7 +2251,11 @@ blake3_hash_many_avx512:
vmovdqa xmmword ptr [rsp], xmm0
vmovdqa xmmword ptr [rsp+0x40], xmm2
add rbx, 128
+#ifndef _ILP32
add rdi, 32
+#else
+ add rdi, 16
+#endif
sub rsi, 4
3:
test esi, 0x2
@@ -2191,8 +2269,13 @@ blake3_hash_many_avx512:
vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1
vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
vinserti128 ymm13, ymm13, xmm14, 0x01
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
@@ -2290,7 +2373,11 @@ blake3_hash_many_avx512:
vmovdqa xmmword ptr [rsp], xmm0
vmovdqa xmmword ptr [rsp+0x4*0x10], xmm2
add rbx, 64
+#ifndef _ILP32
add rdi, 16
+#else
+ add rdi, 8
+#endif
sub rsi, 2
3:
test esi, 0x1
@@ -2301,7 +2388,11 @@ blake3_hash_many_avx512:
vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1
vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip]
+#ifndef _ILP32
mov r8, qword ptr [rdi]
+#else
+ mov r8d, dword ptr [rdi]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
diff --git a/blake3_sse2_x86-64_unix.S b/blake3_sse2_x86-64_unix.S
index 99f033fe..b3d368c4 100644
--- a/blake3_sse2_x86-64_unix.S
+++ b/blake3_sse2_x86-64_unix.S
@@ -38,6 +38,10 @@ blake3_hash_many_sse2:
sub rsp, 360
and rsp, 0xFFFFFFFFFFFFFFC0
neg r9d
+#ifdef _ILP32
+ mov esi, esi
+ mov edx, edx
+#endif
movd xmm0, r9d
pshufd xmm0, xmm0, 0x00
movdqa xmmword ptr [rsp+0x130], xmm0
@@ -75,10 +79,17 @@ blake3_hash_many_sse2:
pshufd xmm5, xmm7, 0x55
pshufd xmm6, xmm7, 0xAA
pshufd xmm7, xmm7, 0xFF
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
mov r11, qword ptr [rdi+0x18]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+ mov r10d, dword ptr [rdi+0x8]
+ mov r11d, dword ptr [rdi+0xc]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
@@ -1632,7 +1643,11 @@ blake3_hash_many_sse2:
psubd xmm1, xmm0
movdqa xmmword ptr [rsp+0x120], xmm1
add rbx, 128
+#ifndef _ILP32
add rdi, 32
+#else
+ add rdi, 16
+#endif
sub rsi, 4
cmp rsi, 4
jnc 2b
@@ -1663,8 +1678,13 @@ blake3_hash_many_sse2:
movd xmm13, dword ptr [rsp+0x124]
punpckldq xmm14, xmm13
movaps xmmword ptr [rsp+0x10], xmm14
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
@@ -1893,7 +1913,11 @@ blake3_hash_many_sse2:
mov r11d, dword ptr [rsp+0x120+8*rax]
mov dword ptr [rsp+0x110], r10d
mov dword ptr [rsp+0x120], r11d
+#ifndef _ILP32
add rdi, 16
+#else
+ add rdi, 8
+#endif
add rbx, 64
sub rsi, 2
3:
@@ -1904,7 +1928,11 @@ blake3_hash_many_sse2:
movd xmm13, dword ptr [rsp+0x110]
movd xmm14, dword ptr [rsp+0x120]
punpckldq xmm13, xmm14
+#ifndef _ILP32
mov r8, qword ptr [rdi]
+#else
+ mov r8d, dword ptr [rdi]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
diff --git a/blake3_sse41_x86-64_unix.S b/blake3_sse41_x86-64_unix.S
index a3ff6426..9f797299 100644
--- a/blake3_sse41_x86-64_unix.S
+++ b/blake3_sse41_x86-64_unix.S
@@ -38,6 +38,10 @@ blake3_hash_many_sse41:
sub rsp, 360
and rsp, 0xFFFFFFFFFFFFFFC0
neg r9d
+#ifdef _ILP32
+ mov esi, esi
+ mov edx, edx
+#endif
movd xmm0, r9d
pshufd xmm0, xmm0, 0x00
movdqa xmmword ptr [rsp+0x130], xmm0
@@ -75,10 +79,17 @@ blake3_hash_many_sse41:
pshufd xmm5, xmm7, 0x55
pshufd xmm6, xmm7, 0xAA
pshufd xmm7, xmm7, 0xFF
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
mov r11, qword ptr [rdi+0x18]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+ mov r10d, dword ptr [rdi+0x8]
+ mov r11d, dword ptr [rdi+0xc]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
@@ -1436,7 +1447,11 @@ blake3_hash_many_sse41:
psubd xmm1, xmm0
movdqa xmmword ptr [rsp+0x120], xmm1
add rbx, 128
+#ifndef _ILP32
add rdi, 32
+#else
+ add rdi, 16
+#endif
sub rsi, 4
cmp rsi, 4
jnc 2b
@@ -1467,8 +1482,13 @@ blake3_hash_many_sse41:
pinsrd xmm14, dword ptr [rsp+0x124], 1
pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
movaps xmmword ptr [rsp+0x10], xmm14
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
@@ -1670,7 +1690,11 @@ blake3_hash_many_sse41:
blendvps xmm2, xmm4, xmm0
movdqa xmmword ptr [rsp+0x110], xmm1
movdqa xmmword ptr [rsp+0x120], xmm2
+#ifndef _ILP32
add rdi, 16
+#else
+ add rdi, 8
+#endif
add rbx, 64
sub rsi, 2
3:
@@ -1683,7 +1707,11 @@ blake3_hash_many_sse41:
pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
movaps xmm14, xmmword ptr [ROT8+rip]
movaps xmm15, xmmword ptr [ROT16+rip]
+#ifndef _ILP32
mov r8, qword ptr [rdi]
+#else
+ mov r8d, dword ptr [rdi]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx