p3_field/packed/interleaves.rs
//! A collection of architecture-specific interleaving functions.
//! Used by `PackedField` implementations to provide their interleaving operations.

#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
pub mod interleave {
    use core::arch::aarch64::{self, uint32x4_t};

    #[inline]
    #[must_use]
    /// Interleave two vectors of 32-bit integers.
    ///
    /// Maps `[a0, ..., a3], [b0, ..., b3]` to `[a0, b0, ...], [..., a3, b3]`.
    pub fn interleave_u32(v0: uint32x4_t, v1: uint32x4_t) -> (uint32x4_t, uint32x4_t) {
        // We want this to compile to:
        //      trn1  res0.4s, v0.4s, v1.4s
        //      trn2  res1.4s, v0.4s, v1.4s
        // throughput: .5 cyc/2 vec (16 els/cyc)
        // latency: 2 cyc
        unsafe {
            // Safety: If this code got compiled then NEON intrinsics are available.
            (aarch64::vtrn1q_u32(v0, v1), aarch64::vtrn2q_u32(v0, v1))
        }
    }

    #[inline]
    #[must_use]
    /// Interleave two vectors of 64-bit integers.
    ///
    /// Maps `[a0, a1], [b0, b1]` to `[a0, b0], [a1, b1]`.
    pub fn interleave_u64(v0: uint32x4_t, v1: uint32x4_t) -> (uint32x4_t, uint32x4_t) {
        // We want this to compile to:
        //      trn1  res0.2d, v0.2d, v1.2d
        //      trn2  res1.2d, v0.2d, v1.2d
        // throughput: .5 cyc/2 vec (16 els/cyc)
        // latency: 2 cyc

        // To transpose 64-bit blocks, cast the [u32; 4] vectors to [u64; 2], transpose, and cast back.
        unsafe {
            // Safety: If this code got compiled then NEON intrinsics are available.
            let v0 = aarch64::vreinterpretq_u64_u32(v0);
            let v1 = aarch64::vreinterpretq_u64_u32(v1);
            (
                aarch64::vreinterpretq_u32_u64(aarch64::vtrn1q_u64(v0, v1)),
                aarch64::vreinterpretq_u32_u64(aarch64::vtrn2q_u64(v0, v1)),
            )
        }
    }
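
    // A minimal sanity-check sketch, not part of the original API: it asserts the
    // documented mappings on concrete values, assuming the test harness runs on a
    // NEON-capable target (this module is only compiled there).
    #[cfg(test)]
    mod tests {
        use super::*;

        fn to_array(v: uint32x4_t) -> [u32; 4] {
            // Safety: `uint32x4_t` and `[u32; 4]` are both 16 bytes of plain data.
            unsafe { core::mem::transmute(v) }
        }

        fn from_array(a: [u32; 4]) -> uint32x4_t {
            // Safety: as in `to_array`.
            unsafe { core::mem::transmute(a) }
        }

        #[test]
        fn interleavings_match_documented_mappings() {
            let a = from_array([0, 1, 2, 3]);
            let b = from_array([10, 11, 12, 13]);

            let (r0, r1) = interleave_u32(a, b);
            assert_eq!(to_array(r0), [0, 10, 2, 12]);
            assert_eq!(to_array(r1), [1, 11, 3, 13]);

            let (r0, r1) = interleave_u64(a, b);
            assert_eq!(to_array(r0), [0, 1, 10, 11]);
            assert_eq!(to_array(r1), [2, 3, 12, 13]);
        }
    }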
}

#[cfg(all(
    target_arch = "x86_64",
    target_feature = "avx2",
    not(target_feature = "avx512f")
))]
pub mod interleave {
    use core::arch::x86_64::{self, __m256i};

    #[inline]
    #[must_use]
    /// Interleave two vectors of 32-bit integers.
    ///
    /// Maps `[a0, ..., a7], [b0, ..., b7]` to `[a0, b0, ...], [..., a7, b7]`.
    pub fn interleave_u32(a: __m256i, b: __m256i) -> (__m256i, __m256i) {
        // We want this to compile to:
        //      vpsrlq    t, a, 32
        //      vpsllq    u, b, 32
        //      vpblendd  res0, a, u, aah
        //      vpblendd  res1, t, b, aah
        // throughput: 1.33 cyc/2 vec (12 els/cyc)
        // latency: (1 -> 1)  1 cyc
        //          (1 -> 2)  2 cyc
        //          (2 -> 1)  2 cyc
        //          (2 -> 2)  1 cyc
        unsafe {
            // Safety: If this code got compiled then AVX2 intrinsics are available.

            // We currently have:
            //   a = [ a0 a1 a2 a3 a4 a5 a6 a7 ],
            //   b = [ b0 b1 b2 b3 b4 b5 b6 b7 ].
            // First form
            //   t = [ a1  0 a3  0 a5  0 a7  0 ].
            //   u = [  0 b0  0 b2  0 b4  0 b6 ].
            let t = x86_64::_mm256_srli_epi64::<32>(a);
            let u = x86_64::_mm256_slli_epi64::<32>(b);

            // Then
            //   res0 = [ a0 b0 a2 b2 a4 b4 a6 b6 ],
            //   res1 = [ a1 b1 a3 b3 a5 b5 a7 b7 ].
            (
                x86_64::_mm256_blend_epi32::<0b10101010>(a, u),
                x86_64::_mm256_blend_epi32::<0b10101010>(t, b),
            )
        }
    }

    #[inline]
    #[must_use]
    /// Interleave two vectors of 64-bit integers.
    ///
    /// Maps `[a0, ..., a3], [b0, ..., b3]` to `[a0, b0, ...], [..., a3, b3]`.
    pub fn interleave_u64(a: __m256i, b: __m256i) -> (__m256i, __m256i) {
        // We want this to compile to:
        //      vpunpcklqdq  res0, a, b
        //      vpunpckhqdq  res1, a, b
        // throughput: 1 cyc/2 vec (16 els/cyc)
        // latency: 1 cyc

        unsafe {
            // Safety: If this code got compiled then AVX2 intrinsics are available.
            (
                x86_64::_mm256_unpacklo_epi64(a, b),
                x86_64::_mm256_unpackhi_epi64(a, b),
            )
        }
    }

    #[inline]
    #[must_use]
    /// Interleave two vectors of 128-bit integers.
    ///
    /// Maps `[a0, a1], [b0, b1]` to `[a0, b0], [a1, b1]`.
    pub fn interleave_u128(a: __m256i, b: __m256i) -> (__m256i, __m256i) {
        // We want this to compile to:
        //      vperm2i128  t, a, b, 21h
        //      vpblendd    res0, a, t, f0h
        //      vpblendd    res1, t, b, f0h
        // throughput: 1 cyc/2 vec (16 els/cyc)
        // latency: 4 cyc

        unsafe {
            // Safety: If this code got compiled then AVX2 intrinsics are available.

            // We currently have:
            //   a = [ a0 a1 a2 a3 a4 a5 a6 a7 ],
            //   b = [ b0 b1 b2 b3 b4 b5 b6 b7 ].
            // First form
            //   t = [ a4 a5 a6 a7 b0 b1 b2 b3 ].
            let t = x86_64::_mm256_permute2x128_si256::<0x21>(a, b);

            // Then
            //   res0 = [ a0 a1 a2 a3 b0 b1 b2 b3 ],
            //   res1 = [ a4 a5 a6 a7 b4 b5 b6 b7 ].
            (
                x86_64::_mm256_blend_epi32::<0b11110000>(a, t),
                x86_64::_mm256_blend_epi32::<0b11110000>(t, b),
            )
        }
    }
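
    // A minimal sanity-check sketch, not part of the original API: it asserts the
    // documented mappings on concrete values, assuming the test harness runs with
    // AVX2 enabled (this module is only compiled in that configuration).
    #[cfg(test)]
    mod tests {
        use super::*;

        const A: [u32; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
        const B: [u32; 8] = [10, 11, 12, 13, 14, 15, 16, 17];

        fn check(f: fn(__m256i, __m256i) -> (__m256i, __m256i), exp0: [u32; 8], exp1: [u32; 8]) {
            // Safety: `__m256i` and `[u32; 8]` are both 32 bytes of plain data.
            unsafe {
                let (r0, r1) = f(core::mem::transmute(A), core::mem::transmute(B));
                assert_eq!(core::mem::transmute::<_, [u32; 8]>(r0), exp0);
                assert_eq!(core::mem::transmute::<_, [u32; 8]>(r1), exp1);
            }
        }

        #[test]
        fn interleavings_match_documented_mappings() {
            check(
                interleave_u32,
                [0, 10, 2, 12, 4, 14, 6, 16],
                [1, 11, 3, 13, 5, 15, 7, 17],
            );
            check(
                interleave_u64,
                [0, 1, 10, 11, 4, 5, 14, 15],
                [2, 3, 12, 13, 6, 7, 16, 17],
            );
            check(
                interleave_u128,
                [0, 1, 2, 3, 10, 11, 12, 13],
                [4, 5, 6, 7, 14, 15, 16, 17],
            );
        }
    }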
}

#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
pub mod interleave {
    use core::arch::x86_64::{self, __m512i, __mmask8, __mmask16};
    use core::mem::transmute;

    const EVENS: __mmask16 = 0b0101010101010101;
    const EVENS4: __mmask16 = 0x0f0f;

    // vpshrdq requires AVX-512VBMI2.
    #[cfg(target_feature = "avx512vbmi2")]
    #[inline]
    #[must_use]
    fn interleave1_antidiagonal(x: __m512i, y: __m512i) -> __m512i {
        unsafe {
            // Safety: If this code got compiled then AVX-512VBMI2 intrinsics are available.
            x86_64::_mm512_shrdi_epi64::<32>(x, y)
        }
    }

    // If we can't use vpshrdq, then do a vpermi2d, but we waste a register and double the latency.
    #[cfg(not(target_feature = "avx512vbmi2"))]
    #[inline]
    #[must_use]
    fn interleave1_antidiagonal(x: __m512i, y: __m512i) -> __m512i {
        const INTERLEAVE1_INDICES: __m512i = unsafe {
            // Safety: `[u32; 16]` is trivially transmutable to `__m512i`.
            transmute::<[u32; 16], _>([
                0x01, 0x10, 0x03, 0x12, 0x05, 0x14, 0x07, 0x16, 0x09, 0x18, 0x0b, 0x1a, 0x0d, 0x1c,
                0x0f, 0x1e,
            ])
        };
        unsafe {
            // Safety: If this code got compiled then AVX-512F intrinsics are available.
            x86_64::_mm512_permutex2var_epi32(x, INTERLEAVE1_INDICES, y)
        }
    }

    #[inline]
    #[must_use]
    /// Interleave two vectors of 32-bit integers.
    ///
    /// Maps `[a0, ..., a15], [b0, ..., b15]` to `[a0, b0, ...], [..., a15, b15]`.
    pub fn interleave_u32(x: __m512i, y: __m512i) -> (__m512i, __m512i) {
        // If we have AVX-512VBMI2, we want this to compile to:
        //      vpshrdq    t, x, y, 32
        //      vpblendmd  res0 {EVENS}, t, x
        //      vpblendmd  res1 {EVENS}, y, t
        // throughput: 1.5 cyc/2 vec (21.33 els/cyc)
        // latency: 2 cyc
        //
        // Otherwise, we want it to compile to:
        //      vmovdqa32  t, INTERLEAVE1_INDICES
        //      vpermi2d   t, x, y
        //      vpblendmd  res0 {EVENS}, t, x
        //      vpblendmd  res1 {EVENS}, y, t
        // throughput: 1.5 cyc/2 vec (21.33 els/cyc)
        // latency: 4 cyc

        // We currently have:
        //   x = [ x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xa xb xc xd xe xf ],
        //   y = [ y0 y1 y2 y3 y4 y5 y6 y7 y8 y9 ya yb yc yd ye yf ].
        // First form
        //   t = [ x1 y0 x3 y2 x5 y4 x7 y6 x9 y8 xb ya xd yc xf ye ].
        let t = interleave1_antidiagonal(x, y);

        unsafe {
            // Safety: If this code got compiled then AVX-512F intrinsics are available.

            // Then
            //   res0 = [ x0 y0 x2 y2 x4 y4 x6 y6 x8 y8 xa ya xc yc xe ye ],
            //   res1 = [ x1 y1 x3 y3 x5 y5 x7 y7 x9 y9 xb yb xd yd xf yf ].
            (
                x86_64::_mm512_mask_blend_epi32(EVENS, t, x),
                x86_64::_mm512_mask_blend_epi32(EVENS, y, t),
            )
        }
    }

    #[inline]
    #[must_use]
    fn shuffle_epi64<const MASK: i32>(a: __m512i, b: __m512i) -> __m512i {
        // The instruction is only available in the floating-point flavor; this distinction is only for
        // historical reasons and no longer matters. We cast to floats, do the thing, and cast back.
        unsafe {
            // Safety: If this code got compiled then AVX-512F intrinsics are available.
            let a = x86_64::_mm512_castsi512_pd(a);
            let b = x86_64::_mm512_castsi512_pd(b);
            x86_64::_mm512_castpd_si512(x86_64::_mm512_shuffle_pd::<MASK>(a, b))
        }
    }

    #[inline]
    #[must_use]
    /// Interleave two vectors of 64-bit integers.
    ///
    /// Maps `[a0, ..., a7], [b0, ..., b7]` to `[a0, b0, ...], [..., a7, b7]`.
    pub fn interleave_u64(x: __m512i, y: __m512i) -> (__m512i, __m512i) {
        // We want this to compile to:
        //      vshufpd    t, x, y, 55h
        //      vpblendmq  res0 {EVENS}, t, x
        //      vpblendmq  res1 {EVENS}, y, t
        // throughput: 1.5 cyc/2 vec (21.33 els/cyc)
        // latency: 2 cyc

        unsafe {
            // Safety: If this code got compiled then AVX-512F intrinsics are available.

            // We currently have:
            //   x = [ x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xa xb xc xd xe xf ],
            //   y = [ y0 y1 y2 y3 y4 y5 y6 y7 y8 y9 ya yb yc yd ye yf ].
            // First form
            //   t = [ x2 x3 y0 y1 x6 x7 y4 y5 xa xb y8 y9 xe xf yc yd ].
            let t = shuffle_epi64::<0b01010101>(x, y);

            // Then
            //   res0 = [ x0 x1 y0 y1 x4 x5 y4 y5 x8 x9 y8 y9 xc xd yc yd ],
            //   res1 = [ x2 x3 y2 y3 x6 x7 y6 y7 xa xb ya yb xe xf ye yf ].
            (
                x86_64::_mm512_mask_blend_epi64(EVENS as __mmask8, t, x),
                x86_64::_mm512_mask_blend_epi64(EVENS as __mmask8, y, t),
            )
        }
    }

    #[inline]
    #[must_use]
    /// Interleave two vectors of 128-bit integers.
    ///
    /// Maps `[a0, ..., a3], [b0, ..., b3]` to `[a0, b0, ...], [..., a3, b3]`.
    pub fn interleave_u128(x: __m512i, y: __m512i) -> (__m512i, __m512i) {
        // We want this to compile to:
        //      vmovdqa64  t, INTERLEAVE4_INDICES
        //      vpermi2q   t, x, y
        //      vpblendmd  res0 {EVENS4}, t, x
        //      vpblendmd  res1 {EVENS4}, y, t
        // throughput: 1.5 cyc/2 vec (21.33 els/cyc)
        // latency: 4 cyc

        const INTERLEAVE4_INDICES: __m512i = unsafe {
            // Safety: `[u64; 8]` is trivially transmutable to `__m512i`.
            transmute::<[u64; 8], _>([0o02, 0o03, 0o10, 0o11, 0o06, 0o07, 0o14, 0o15])
        };

        unsafe {
            // Safety: If this code got compiled then AVX-512F intrinsics are available.

            // We currently have:
            //   x = [ x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xa xb xc xd xe xf ],
            //   y = [ y0 y1 y2 y3 y4 y5 y6 y7 y8 y9 ya yb yc yd ye yf ].
            // First form
            //   t = [ x4 x5 x6 x7 y0 y1 y2 y3 xc xd xe xf y8 y9 ya yb ].
            let t = x86_64::_mm512_permutex2var_epi64(x, INTERLEAVE4_INDICES, y);

            // Then
            //   res0 = [ x0 x1 x2 x3 y0 y1 y2 y3 x8 x9 xa xb y8 y9 ya yb ],
            //   res1 = [ x4 x5 x6 x7 y4 y5 y6 y7 xc xd xe xf yc yd ye yf ].
            (
                x86_64::_mm512_mask_blend_epi32(EVENS4, t, x),
                x86_64::_mm512_mask_blend_epi32(EVENS4, y, t),
            )
        }
    }

    #[inline]
    #[must_use]
    /// Interleave two vectors of 256-bit integers.
    ///
    /// Maps `[a0, a1], [b0, b1]` to `[a0, b0], [a1, b1]`.
    pub fn interleave_u256(x: __m512i, y: __m512i) -> (__m512i, __m512i) {
        // We want this to compile to:
        //      vshufi64x2  t, x, y, 4eh
        //      vpblendmq   res0 {EVENS4}, t, x
        //      vpblendmq   res1 {EVENS4}, y, t
        // throughput: 1.5 cyc/2 vec (21.33 els/cyc)
        // latency: 4 cyc

        unsafe {
            // Safety: If this code got compiled then AVX-512F intrinsics are available.

            // We currently have:
            //   x = [ x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xa xb xc xd xe xf ],
            //   y = [ y0 y1 y2 y3 y4 y5 y6 y7 y8 y9 ya yb yc yd ye yf ].
            // First form
            //   t = [ x8 x9 xa xb xc xd xe xf y0 y1 y2 y3 y4 y5 y6 y7 ].
            let t = x86_64::_mm512_shuffle_i64x2::<0b01_00_11_10>(x, y);

            // Then
            //   res0 = [ x0 x1 x2 x3 x4 x5 x6 x7 y0 y1 y2 y3 y4 y5 y6 y7 ],
            //   res1 = [ x8 x9 xa xb xc xd xe xf y8 y9 ya yb yc yd ye yf ].
            (
                x86_64::_mm512_mask_blend_epi64(EVENS4 as __mmask8, t, x),
                x86_64::_mm512_mask_blend_epi64(EVENS4 as __mmask8, y, t),
            )
        }
    }
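
    // A minimal sanity-check sketch, not part of the original API: it asserts the
    // documented mappings on concrete values, assuming the test harness runs with
    // AVX-512F enabled (this module is only compiled in that configuration).
    #[cfg(test)]
    mod tests {
        use super::*;

        const X: [u32; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
        const Y: [u32; 16] = [
            100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
        ];

        fn check(f: fn(__m512i, __m512i) -> (__m512i, __m512i), exp0: [u32; 16], exp1: [u32; 16]) {
            // Safety: `__m512i` and `[u32; 16]` are both 64 bytes of plain data.
            unsafe {
                let (r0, r1) = f(core::mem::transmute(X), core::mem::transmute(Y));
                assert_eq!(core::mem::transmute::<_, [u32; 16]>(r0), exp0);
                assert_eq!(core::mem::transmute::<_, [u32; 16]>(r1), exp1);
            }
        }

        #[test]
        fn interleavings_match_documented_mappings() {
            check(
                interleave_u32,
                [0, 100, 2, 102, 4, 104, 6, 106, 8, 108, 10, 110, 12, 112, 14, 114],
                [1, 101, 3, 103, 5, 105, 7, 107, 9, 109, 11, 111, 13, 113, 15, 115],
            );
            check(
                interleave_u64,
                [0, 1, 100, 101, 4, 5, 104, 105, 8, 9, 108, 109, 12, 13, 112, 113],
                [2, 3, 102, 103, 6, 7, 106, 107, 10, 11, 110, 111, 14, 15, 114, 115],
            );
            check(
                interleave_u128,
                [0, 1, 2, 3, 100, 101, 102, 103, 8, 9, 10, 11, 108, 109, 110, 111],
                [4, 5, 6, 7, 104, 105, 106, 107, 12, 13, 14, 15, 112, 113, 114, 115],
            );
            check(
                interleave_u256,
                [0, 1, 2, 3, 4, 5, 6, 7, 100, 101, 102, 103, 104, 105, 106, 107],
                [8, 9, 10, 11, 12, 13, 14, 15, 108, 109, 110, 111, 112, 113, 114, 115],
            );
        }
    }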
}

/// A macro to implement the `PackedFieldPow2` trait for a packed field type. The macro assumes that the
/// type has `to_vector` and `from_vector` methods, which convert between the packed field and its
/// underlying packed vector.
///
/// # Arguments
/// - `$type`: The packed field type.
/// - `($type_param, $param_name)`: An optional trait bound and the name of the generic type parameter it
///   constrains, if the packed field type needs one.
/// - `; [ ($block_len, $func), ... ]`: A list of block lengths and their corresponding interleaving functions.
/// - `$width`: The width of the packed field, which is also the largest possible block length.
///
/// For example, calling this macro with:
/// ```rust,ignore
/// impl_packed_field_pow_2!(
///     PackedMontyField31Neon, (FieldParameters, FP);
///     [
///         (1, interleave_u32),
///         (2, interleave_u64),
///     ],
///     4
/// );
/// ```
/// creates the code:
/// ```rust,ignore
/// unsafe impl<FP: FieldParameters> PackedFieldPow2 for PackedMontyField31Neon<FP> {
///     #[inline]
///     fn interleave(&self, other: Self, block_len: usize) -> (Self, Self) {
///         let (v0, v1) = (self.to_vector(), other.to_vector());
///         let (res0, res1) = match block_len {
///             1 => interleave_u32(v0, v1),
///             2 => interleave_u64(v0, v1),
///             4 => (v0, v1),
///             _ => panic!("unsupported block_len"),
///         };
///         unsafe {
///             // Safety: We haven't changed any values, just moved data around
///             // so all entries still represent valid field elements.
///             (Self::from_vector(res0), Self::from_vector(res1))
///         }
///     }
/// }
/// ```
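///
/// The type parameter is optional. A hypothetical invocation for a packed field with no generic
/// parameter and width 8 (the type name `PackedExampleAvx2` is purely illustrative, not a type
/// defined in this crate) would look like:
/// ```rust,ignore
/// impl_packed_field_pow_2!(
///     PackedExampleAvx2;
///     [
///         (1, interleave_u32),
///         (2, interleave_u64),
///         (4, interleave_u128),
///     ],
///     8
/// );
/// ```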
#[macro_export]
macro_rules! impl_packed_field_pow_2 {
    // Accepts: the packed field type, an optional (trait bound, type parameter name) pair,
    // a list of (block_len, function) pairs, and the width.
    (
        $type:ty
        $(, ($type_param:path, $param_name:ident))?
        ; [ $( ($block_len:expr, $func:ident) ),* $(,)? ],
        $width:expr
    ) => {
        paste::paste! {
            unsafe impl$(<$param_name: $type_param>)? PackedFieldPow2 for $type$(<$param_name>)? {
                #[inline]
                fn interleave(&self, other: Self, block_len: usize) -> (Self, Self) {
                    let (v0, v1) = (self.to_vector(), other.to_vector());
                    let (res0, res1) = match block_len {
                        $(
                            $block_len => $func(v0, v1),
                        )*
                        $width => (v0, v1),
                        _ => panic!("unsupported block_len"),
                    };
                    unsafe {
                        // Safety: We haven't changed any values, just moved data around
                        // so all entries still represent valid field elements.
                        (Self::from_vector(res0), Self::from_vector(res1))
                    }
                }
            }
        }
    };
}

pub use impl_packed_field_pow_2;