p3_field/packed/interleaves.rs
//! A collection of architecture-specific interleaving functions.
//! Used by `PackedField` implementations to provide their interleaving operations.

#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
pub mod interleave {
    use core::arch::aarch64::{self, uint32x4_t};

    #[inline]
    #[must_use]
    /// Interleave two vectors of 32-bit integers.
    ///
    /// Maps `[a0, ..., a3], [b0, ..., b3]` to `[a0, b0, ...], [..., a3, b3]`.
    pub fn interleave_u32(v0: uint32x4_t, v1: uint32x4_t) -> (uint32x4_t, uint32x4_t) {
        // We want this to compile to:
        //      trn1  res0.4s, v0.4s, v1.4s
        //      trn2  res1.4s, v0.4s, v1.4s
        // throughput: .5 cyc/2 vec (16 els/cyc)
        // latency: 2 cyc
        unsafe {
            // Safety: If this code got compiled then NEON intrinsics are available.
            (aarch64::vtrn1q_u32(v0, v1), aarch64::vtrn2q_u32(v0, v1))
        }
    }

    #[inline]
    #[must_use]
    /// Interleave two vectors of 64-bit integers.
    ///
    /// Maps `[a0, a1], [b0, b1]` to `[a0, b0], [a1, b1]`.
    pub fn interleave_u64(v0: uint32x4_t, v1: uint32x4_t) -> (uint32x4_t, uint32x4_t) {
        // We want this to compile to:
        //      trn1  res0.2d, v0.2d, v1.2d
        //      trn2  res1.2d, v0.2d, v1.2d
        // throughput: .5 cyc/2 vec (16 els/cyc)
        // latency: 2 cyc

        // To transpose 64-bit blocks, cast the [u32; 4] vectors to [u64; 2], transpose, and cast back.
        unsafe {
            // Safety: If this code got compiled then NEON intrinsics are available.
            let v0 = aarch64::vreinterpretq_u64_u32(v0);
            let v1 = aarch64::vreinterpretq_u64_u32(v1);
            (
                aarch64::vreinterpretq_u32_u64(aarch64::vtrn1q_u64(v0, v1)),
                aarch64::vreinterpretq_u32_u64(aarch64::vtrn2q_u64(v0, v1)),
            )
        }
    }
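
    // A minimal sanity-check sketch, not part of the original API: it asserts the
    // documented mappings on concrete values, assuming the test harness runs on a
    // NEON-capable target (this module is only compiled there).
    #[cfg(test)]
    mod tests {
        use super::*;

        fn to_array(v: uint32x4_t) -> [u32; 4] {
            // Safety: `uint32x4_t` and `[u32; 4]` are both 16 bytes of plain data.
            unsafe { core::mem::transmute(v) }
        }

        fn from_array(a: [u32; 4]) -> uint32x4_t {
            // Safety: as in `to_array`.
            unsafe { core::mem::transmute(a) }
        }

        #[test]
        fn interleavings_match_documented_mappings() {
            let a = from_array([0, 1, 2, 3]);
            let b = from_array([10, 11, 12, 13]);

            let (r0, r1) = interleave_u32(a, b);
            assert_eq!(to_array(r0), [0, 10, 2, 12]);
            assert_eq!(to_array(r1), [1, 11, 3, 13]);

            let (r0, r1) = interleave_u64(a, b);
            assert_eq!(to_array(r0), [0, 1, 10, 11]);
            assert_eq!(to_array(r1), [2, 3, 12, 13]);
        }
    }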
}

#[cfg(all(
    target_arch = "x86_64",
    target_feature = "avx2",
    not(target_feature = "avx512f")
))]
pub mod interleave {
    use core::arch::x86_64::{self, __m256i};

    #[inline]
    #[must_use]
    /// Interleave two vectors of 32-bit integers.
    ///
    /// Maps `[a0, ..., a7], [b0, ..., b7]` to `[a0, b0, ...], [..., a7, b7]`.
    pub fn interleave_u32(a: __m256i, b: __m256i) -> (__m256i, __m256i) {
        // We want this to compile to:
        //      vpsrlq    t, a, 32
        //      vpsllq    u, b, 32
        //      vpblendd  res0, a, u, aah
        //      vpblendd  res1, t, b, aah
        // throughput: 1.33 cyc/2 vec (12 els/cyc)
        // latency: (1 -> 1)  1 cyc
        //          (1 -> 2)  2 cyc
        //          (2 -> 1)  2 cyc
        //          (2 -> 2)  1 cyc
        unsafe {
            // Safety: If this code got compiled then AVX2 intrinsics are available.

            // We currently have:
            //   a = [ a0 a1 a2 a3 a4 a5 a6 a7 ],
            //   b = [ b0 b1 b2 b3 b4 b5 b6 b7 ].
            // First form
            //   t = [ a1  0 a3  0 a5  0 a7  0 ].
            //   u = [  0 b0  0 b2  0 b4  0 b6 ].
            let t = x86_64::_mm256_srli_epi64::<32>(a);
            let u = x86_64::_mm256_slli_epi64::<32>(b);

            // Then
            //   res0 = [ a0 b0 a2 b2 a4 b4 a6 b6 ],
            //   res1 = [ a1 b1 a3 b3 a5 b5 a7 b7 ].
            (
                x86_64::_mm256_blend_epi32::<0b10101010>(a, u),
                x86_64::_mm256_blend_epi32::<0b10101010>(t, b),
            )
        }
    }

    #[inline]
    #[must_use]
    /// Interleave two vectors of 64-bit integers.
    ///
    /// Maps `[a0, ..., a3], [b0, ..., b3]` to `[a0, b0, ...], [..., a3, b3]`.
    pub fn interleave_u64(a: __m256i, b: __m256i) -> (__m256i, __m256i) {
        // We want this to compile to:
        //      vpunpcklqdq  res0, a, b
        //      vpunpckhqdq  res1, a, b
        // throughput: 1 cyc/2 vec (16 els/cyc)
        // latency: 1 cyc

        unsafe {
            // Safety: If this code got compiled then AVX2 intrinsics are available.
            (
                x86_64::_mm256_unpacklo_epi64(a, b),
                x86_64::_mm256_unpackhi_epi64(a, b),
            )
        }
    }

    #[inline]
    #[must_use]
    /// Interleave two vectors of 128-bit integers.
    ///
    /// Maps `[a0, a1], [b0, b1]` to `[a0, b0], [a1, b1]`.
    pub fn interleave_u128(a: __m256i, b: __m256i) -> (__m256i, __m256i) {
        // We want this to compile to:
        //      vperm2i128  t, a, b, 21h
        //      vpblendd    res0, a, t, f0h
        //      vpblendd    res1, t, b, f0h
        // throughput: 1 cyc/2 vec (16 els/cyc)
        // latency: 4 cyc

        unsafe {
            // Safety: If this code got compiled then AVX2 intrinsics are available.

            // We currently have:
            //   a = [ a0 a1 a2 a3 a4 a5 a6 a7 ],
            //   b = [ b0 b1 b2 b3 b4 b5 b6 b7 ].
            // First form
            //   t = [ a4 a5 a6 a7 b0 b1 b2 b3 ].
            let t = x86_64::_mm256_permute2x128_si256::<0x21>(a, b);

            // Then
            //   res0 = [ a0 a1 a2 a3 b0 b1 b2 b3 ],
            //   res1 = [ a4 a5 a6 a7 b4 b5 b6 b7 ].
            (
                x86_64::_mm256_blend_epi32::<0b11110000>(a, t),
                x86_64::_mm256_blend_epi32::<0b11110000>(t, b),
            )
        }
    }
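
    // A minimal sanity-check sketch, not part of the original API: it asserts the
    // documented mappings on concrete values, assuming the test harness runs with
    // AVX2 enabled (this module is only compiled in that configuration).
    #[cfg(test)]
    mod tests {
        use super::*;

        const A: [u32; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
        const B: [u32; 8] = [10, 11, 12, 13, 14, 15, 16, 17];

        fn check(f: fn(__m256i, __m256i) -> (__m256i, __m256i), exp0: [u32; 8], exp1: [u32; 8]) {
            // Safety: `__m256i` and `[u32; 8]` are both 32 bytes of plain data.
            unsafe {
                let (r0, r1) = f(core::mem::transmute(A), core::mem::transmute(B));
                assert_eq!(core::mem::transmute::<_, [u32; 8]>(r0), exp0);
                assert_eq!(core::mem::transmute::<_, [u32; 8]>(r1), exp1);
            }
        }

        #[test]
        fn interleavings_match_documented_mappings() {
            check(
                interleave_u32,
                [0, 10, 2, 12, 4, 14, 6, 16],
                [1, 11, 3, 13, 5, 15, 7, 17],
            );
            check(
                interleave_u64,
                [0, 1, 10, 11, 4, 5, 14, 15],
                [2, 3, 12, 13, 6, 7, 16, 17],
            );
            check(
                interleave_u128,
                [0, 1, 2, 3, 10, 11, 12, 13],
                [4, 5, 6, 7, 14, 15, 16, 17],
            );
        }
    }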
}

#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
pub mod interleave {
    use core::arch::x86_64::{self, __m512i, __mmask8, __mmask16};
    use core::mem::transmute;

    const EVENS: __mmask16 = 0b0101010101010101;
    const EVENS4: __mmask16 = 0x0f0f;

    // vpshrdq requires AVX-512VBMI2.
    #[cfg(target_feature = "avx512vbmi2")]
    #[inline]
    #[must_use]
    fn interleave1_antidiagonal(x: __m512i, y: __m512i) -> __m512i {
        unsafe {
            // Safety: If this code got compiled then AVX-512VBMI2 intrinsics are available.
            x86_64::_mm512_shrdi_epi64::<32>(x, y)
        }
    }

    // If we can't use vpshrdq, then do a vpermi2d, but we waste a register and double the latency.
    #[cfg(not(target_feature = "avx512vbmi2"))]
    #[inline]
    #[must_use]
    fn interleave1_antidiagonal(x: __m512i, y: __m512i) -> __m512i {
        const INTERLEAVE1_INDICES: __m512i = unsafe {
            // Safety: `[u32; 16]` is trivially transmutable to `__m512i`.
            transmute::<[u32; 16], _>([
                0x01, 0x10, 0x03, 0x12, 0x05, 0x14, 0x07, 0x16, 0x09, 0x18, 0x0b, 0x1a, 0x0d, 0x1c,
                0x0f, 0x1e,
            ])
        };
        unsafe {
            // Safety: If this code got compiled then AVX-512F intrinsics are available.
            x86_64::_mm512_permutex2var_epi32(x, INTERLEAVE1_INDICES, y)
        }
    }

    #[inline]
    #[must_use]
    /// Interleave two vectors of 32-bit integers.
    ///
    /// Maps `[a0, ..., a15], [b0, ..., b15]` to `[a0, b0, ...], [..., a15, b15]`.
    pub fn interleave_u32(x: __m512i, y: __m512i) -> (__m512i, __m512i) {
        // If we have AVX-512VBMI2, we want this to compile to:
        //      vpshrdq    t, x, y, 32
        //      vpblendmd  res0 {EVENS}, t, x
        //      vpblendmd  res1 {EVENS}, y, t
        // throughput: 1.5 cyc/2 vec (21.33 els/cyc)
        // latency: 2 cyc
        //
        // Otherwise, we want it to compile to:
        //      vmovdqa32  t, INTERLEAVE1_INDICES
        //      vpermi2d   t, x, y
        //      vpblendmd  res0 {EVENS}, t, x
        //      vpblendmd  res1 {EVENS}, y, t
        // throughput: 1.5 cyc/2 vec (21.33 els/cyc)
        // latency: 4 cyc

        // We currently have:
        //   x = [ x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xa xb xc xd xe xf ],
        //   y = [ y0 y1 y2 y3 y4 y5 y6 y7 y8 y9 ya yb yc yd ye yf ].
        // First form
        //   t = [ x1 y0 x3 y2 x5 y4 x7 y6 x9 y8 xb ya xd yc xf ye ].
        let t = interleave1_antidiagonal(x, y);

        unsafe {
            // Safety: If this code got compiled then AVX-512F intrinsics are available.

            // Then
            //   res0 = [ x0 y0 x2 y2 x4 y4 x6 y6 x8 y8 xa ya xc yc xe ye ],
            //   res1 = [ x1 y1 x3 y3 x5 y5 x7 y7 x9 y9 xb yb xd yd xf yf ].
            (
                x86_64::_mm512_mask_blend_epi32(EVENS, t, x),
                x86_64::_mm512_mask_blend_epi32(EVENS, y, t),
            )
        }
    }

    #[inline]
    #[must_use]
    fn shuffle_epi64<const MASK: i32>(a: __m512i, b: __m512i) -> __m512i {
        // The instruction is only available in the floating-point flavor; this distinction is only for
        // historical reasons and no longer matters. We cast to floats, do the thing, and cast back.
        unsafe {
            // Safety: If this code got compiled then AVX-512F intrinsics are available.
            let a = x86_64::_mm512_castsi512_pd(a);
            let b = x86_64::_mm512_castsi512_pd(b);
            x86_64::_mm512_castpd_si512(x86_64::_mm512_shuffle_pd::<MASK>(a, b))
        }
    }

    #[inline]
    #[must_use]
    /// Interleave two vectors of 64-bit integers.
    ///
    /// Maps `[a0, ..., a7], [b0, ..., b7]` to `[a0, b0, ...], [..., a7, b7]`.
    pub fn interleave_u64(x: __m512i, y: __m512i) -> (__m512i, __m512i) {
        // We want this to compile to:
        //      vshufpd    t, x, y, 55h
        //      vpblendmq  res0 {EVENS}, t, x
        //      vpblendmq  res1 {EVENS}, y, t
        // throughput: 1.5 cyc/2 vec (21.33 els/cyc)
        // latency: 2 cyc

        unsafe {
            // Safety: If this code got compiled then AVX-512F intrinsics are available.

            // We currently have:
            //   x = [ x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xa xb xc xd xe xf ],
            //   y = [ y0 y1 y2 y3 y4 y5 y6 y7 y8 y9 ya yb yc yd ye yf ].
            // First form
            //   t = [ x2 x3 y0 y1 x6 x7 y4 y5 xa xb y8 y9 xe xf yc yd ].
            let t = shuffle_epi64::<0b01010101>(x, y);

            // Then
            //   res0 = [ x0 x1 y0 y1 x4 x5 y4 y5 x8 x9 y8 y9 xc xd yc yd ],
            //   res1 = [ x2 x3 y2 y3 x6 x7 y6 y7 xa xb ya yb xe xf ye yf ].
            (
                x86_64::_mm512_mask_blend_epi64(EVENS as __mmask8, t, x),
                x86_64::_mm512_mask_blend_epi64(EVENS as __mmask8, y, t),
            )
        }
    }

    #[inline]
    #[must_use]
    /// Interleave two vectors of 128-bit integers.
    ///
    /// Maps `[a0, ..., a3], [b0, ..., b3]` to `[a0, b0, ...], [..., a3, b3]`.
    pub fn interleave_u128(x: __m512i, y: __m512i) -> (__m512i, __m512i) {
        // We want this to compile to:
        //      vmovdqa64  t, INTERLEAVE4_INDICES
        //      vpermi2q   t, x, y
        //      vpblendmd  res0 {EVENS4}, t, x
        //      vpblendmd  res1 {EVENS4}, y, t
        // throughput: 1.5 cyc/2 vec (21.33 els/cyc)
        // latency: 4 cyc

        const INTERLEAVE4_INDICES: __m512i = unsafe {
            // Safety: `[u64; 8]` is trivially transmutable to `__m512i`.
            transmute::<[u64; 8], _>([0o02, 0o03, 0o10, 0o11, 0o06, 0o07, 0o14, 0o15])
        };

        unsafe {
            // Safety: If this code got compiled then AVX-512F intrinsics are available.

            // We currently have:
            //   x = [ x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xa xb xc xd xe xf ],
            //   y = [ y0 y1 y2 y3 y4 y5 y6 y7 y8 y9 ya yb yc yd ye yf ].
            // First form
            //   t = [ x4 x5 x6 x7 y0 y1 y2 y3 xc xd xe xf y8 y9 ya yb ].
            let t = x86_64::_mm512_permutex2var_epi64(x, INTERLEAVE4_INDICES, y);

            // Then
            //   res0 = [ x0 x1 x2 x3 y0 y1 y2 y3 x8 x9 xa xb y8 y9 ya yb ],
            //   res1 = [ x4 x5 x6 x7 y4 y5 y6 y7 xc xd xe xf yc yd ye yf ].
            (
                x86_64::_mm512_mask_blend_epi32(EVENS4, t, x),
                x86_64::_mm512_mask_blend_epi32(EVENS4, y, t),
            )
        }
    }

    #[inline]
    #[must_use]
    /// Interleave two vectors of 256-bit integers.
    ///
    /// Maps `[a0, a1], [b0, b1]` to `[a0, b0], [a1, b1]`.
    pub fn interleave_u256(x: __m512i, y: __m512i) -> (__m512i, __m512i) {
        // We want this to compile to:
        //      vshufi64x2  t, x, y, 4eh
        //      vpblendmq   res0 {EVENS4}, t, x
        //      vpblendmq   res1 {EVENS4}, y, t
        // throughput: 1.5 cyc/2 vec (21.33 els/cyc)
        // latency: 4 cyc

        unsafe {
            // Safety: If this code got compiled then AVX-512F intrinsics are available.

            // We currently have:
            //   x = [ x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xa xb xc xd xe xf ],
            //   y = [ y0 y1 y2 y3 y4 y5 y6 y7 y8 y9 ya yb yc yd ye yf ].
            // First form
            //   t = [ x8 x9 xa xb xc xd xe xf y0 y1 y2 y3 y4 y5 y6 y7 ].
            let t = x86_64::_mm512_shuffle_i64x2::<0b01_00_11_10>(x, y);

            // Then
            //   res0 = [ x0 x1 x2 x3 x4 x5 x6 x7 y0 y1 y2 y3 y4 y5 y6 y7 ],
            //   res1 = [ x8 x9 xa xb xc xd xe xf y8 y9 ya yb yc yd ye yf ].
            (
                x86_64::_mm512_mask_blend_epi64(EVENS4 as __mmask8, t, x),
                x86_64::_mm512_mask_blend_epi64(EVENS4 as __mmask8, y, t),
            )
        }
    }
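
    // A minimal sanity-check sketch, not part of the original API: it asserts the
    // documented mappings on concrete values, assuming the test harness runs with
    // AVX-512F enabled (this module is only compiled in that configuration).
    #[cfg(test)]
    mod tests {
        use super::*;

        const X: [u32; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
        const Y: [u32; 16] = [
            100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
        ];

        fn check(f: fn(__m512i, __m512i) -> (__m512i, __m512i), exp0: [u32; 16], exp1: [u32; 16]) {
            // Safety: `__m512i` and `[u32; 16]` are both 64 bytes of plain data.
            unsafe {
                let (r0, r1) = f(core::mem::transmute(X), core::mem::transmute(Y));
                assert_eq!(core::mem::transmute::<_, [u32; 16]>(r0), exp0);
                assert_eq!(core::mem::transmute::<_, [u32; 16]>(r1), exp1);
            }
        }

        #[test]
        fn interleavings_match_documented_mappings() {
            check(
                interleave_u32,
                [0, 100, 2, 102, 4, 104, 6, 106, 8, 108, 10, 110, 12, 112, 14, 114],
                [1, 101, 3, 103, 5, 105, 7, 107, 9, 109, 11, 111, 13, 113, 15, 115],
            );
            check(
                interleave_u64,
                [0, 1, 100, 101, 4, 5, 104, 105, 8, 9, 108, 109, 12, 13, 112, 113],
                [2, 3, 102, 103, 6, 7, 106, 107, 10, 11, 110, 111, 14, 15, 114, 115],
            );
            check(
                interleave_u128,
                [0, 1, 2, 3, 100, 101, 102, 103, 8, 9, 10, 11, 108, 109, 110, 111],
                [4, 5, 6, 7, 104, 105, 106, 107, 12, 13, 14, 15, 112, 113, 114, 115],
            );
            check(
                interleave_u256,
                [0, 1, 2, 3, 4, 5, 6, 7, 100, 101, 102, 103, 104, 105, 106, 107],
                [8, 9, 10, 11, 12, 13, 14, 15, 108, 109, 110, 111, 112, 113, 114, 115],
            );
        }
    }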
}

/// A macro to implement the `PackedFieldPow2` trait for a packed field type. The macro assumes that the
/// type has `to_vector` and `from_vector` methods, which convert between the packed field and its
/// underlying packed vector.
///
/// # Arguments
/// - `$type`: The packed field type.
/// - `($type_param, $param_name)`: An optional trait bound and the name of the generic type parameter it
///   constrains, if the packed field type needs one.
/// - `; [ ($block_len, $func), ... ]`: A list of block lengths and their corresponding interleaving functions.
/// - `$width`: The width of the packed field, which is also the largest possible block length.
///
/// For example, calling this macro with:
/// ```rust,ignore
/// impl_packed_field_pow_2!(
///     PackedMontyField31Neon, (FieldParameters, FP);
///     [
///         (1, interleave_u32),
///         (2, interleave_u64),
///     ],
///     4
/// );
/// ```
/// creates the code:
/// ```rust,ignore
/// unsafe impl<FP: FieldParameters> PackedFieldPow2 for PackedMontyField31Neon<FP> {
///     #[inline]
///     fn interleave(&self, other: Self, block_len: usize) -> (Self, Self) {
///         let (v0, v1) = (self.to_vector(), other.to_vector());
///         let (res0, res1) = match block_len {
///             1 => interleave_u32(v0, v1),
///             2 => interleave_u64(v0, v1),
///             4 => (v0, v1),
///             _ => panic!("unsupported block_len"),
///         };
///         unsafe {
///             // Safety: We haven't changed any values, just moved data around
///             // so all entries still represent valid field elements.
///             (Self::from_vector(res0), Self::from_vector(res1))
///         }
///     }
/// }
/// ```
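///
/// The type parameter is optional. A hypothetical invocation for a packed field with no generic
/// parameter and width 8 (the type name `PackedExampleAvx2` is purely illustrative, not a type
/// defined in this crate) would look like:
/// ```rust,ignore
/// impl_packed_field_pow_2!(
///     PackedExampleAvx2;
///     [
///         (1, interleave_u32),
///         (2, interleave_u64),
///         (4, interleave_u128),
///     ],
///     8
/// );
/// ```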
#[macro_export]
macro_rules! impl_packed_field_pow_2 {
    // Accepts: the packed field type, an optional (trait bound, type parameter name) pair,
    // a list of (block_len, function) pairs, and the width.
    (
        $type:ty
        $(, ($type_param:path, $param_name:ident))?
        ; [ $( ($block_len:expr, $func:ident) ),* $(,)? ],
        $width:expr
    ) => {
        paste::paste! {
            unsafe impl$(<$param_name: $type_param>)? PackedFieldPow2 for $type$(<$param_name>)? {
                #[inline]
                fn interleave(&self, other: Self, block_len: usize) -> (Self, Self) {
                    let (v0, v1) = (self.to_vector(), other.to_vector());
                    let (res0, res1) = match block_len {
                        $(
                            $block_len => $func(v0, v1),
                        )*
                        $width => (v0, v1),
                        _ => panic!("unsupported block_len"),
                    };
                    unsafe {
                        // Safety: We haven't changed any values, just moved data around
                        // so all entries still represent valid field elements.
                        (Self::from_vector(res0), Self::from_vector(res1))
                    }
                }
            }
        }
    };
}

pub use impl_packed_field_pow_2;