// ppv_lite86/x86_64/sse2.rs

1use crate::soft::{x2, x4};
2use crate::types::*;
3use crate::vec128_storage;
4use crate::x86_64::Avx2Machine;
5use crate::x86_64::SseMachine as Machine86;
6use crate::x86_64::{NoS3, NoS4, YesS3, YesS4};
7use core::arch::x86_64::*;
8use core::marker::PhantomData;
9use core::ops::{
10    Add, AddAssign, BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not,
11};
12use zerocopy::transmute;
13
// Implements a plain binary-operator trait for a vector type by delegating
// to a single intrinsic on the raw `__m128i` payload.
//
// `$vec`     — vector type to implement for (e.g. `u32x4_sse2`)
// `$trait`   — the operator trait being implemented (e.g. `BitXor`, `Add`)
// `$fn`      — that trait's method name
// `$impl_fn` — the intrinsic performing the lane-wise operation
macro_rules! impl_binop {
    ($vec:ident, $trait:ident, $fn:ident, $impl_fn:ident) => {
        impl<S3, S4, NI> $trait for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn $fn(self, rhs: Self) -> Self::Output {
                // The intrinsic call is `unsafe` only because core::arch marks
                // it so; SSE2 is part of the x86_64 baseline.
                Self::new(unsafe { $impl_fn(self.x, rhs.x) })
            }
        }
    };
}
25
// Implements the `*Assign` form of an operator in terms of the already
// implemented by-value form (`*self = self.$fn(rhs)`).
//
// `$vec`       — vector type to implement for
// `$trait`     — the assignment trait (e.g. `BitXorAssign`)
// `$fn_assign` — the assignment trait's method name
// `$fn`        — the by-value method it delegates to
macro_rules! impl_binop_assign {
    ($vec:ident, $trait:ident, $fn_assign:ident, $fn:ident) => {
        impl<S3, S4, NI> $trait for $vec<S3, S4, NI>
        where
            // Copy is needed so `self.$fn(rhs)` can consume a copy of `*self`.
            $vec<S3, S4, NI>: Copy,
        {
            #[inline(always)]
            fn $fn_assign(&mut self, rhs: Self) {
                *self = self.$fn(rhs);
            }
        }
    };
}
39
// Defines one SSE2-backed vector type (`$vec`) nominally holding `$word`
// lanes, plus its storage, byte-order, default, and bitwise-op impls.
// The three PhantomData parameters record target capabilities:
// S3 = SSSE3 available, S4 = SSE4.1 available, NI = AES-NI available.
macro_rules! def_vec {
    ($vec:ident, $word:ident) => {
        zerocopy::cryptocorrosion_derive_traits! {
            #[repr(transparent)]
            #[allow(non_camel_case_types)]
            #[derive(Copy, Clone)]
            pub struct $vec<S3, S4, NI> {
                // The actual 128-bit register payload; everything else is
                // zero-sized compile-time capability tagging.
                x: __m128i,
                s3: PhantomData<S3>,
                s4: PhantomData<S4>,
                ni: PhantomData<NI>,
            }
        }

        // Reinterpret the generic 128-bit storage union as this vector type.
        impl<S3, S4, NI> Store<vec128_storage> for $vec<S3, S4, NI> {
            #[inline(always)]
            unsafe fn unpack(x: vec128_storage) -> Self {
                Self::new(x.sse2)
            }
        }
        impl<S3, S4, NI> From<$vec<S3, S4, NI>> for vec128_storage {
            #[inline(always)]
            fn from(x: $vec<S3, S4, NI>) -> Self {
                vec128_storage { sse2: x.x }
            }
        }
        impl<S3, S4, NI> $vec<S3, S4, NI> {
            // Internal constructor: wrap a raw register, materializing the
            // zero-sized capability markers.
            #[inline(always)]
            fn new(x: __m128i) -> Self {
                $vec {
                    x,
                    s3: PhantomData,
                    s4: PhantomData,
                    ni: PhantomData,
                }
            }
        }

        // Unaligned 16-byte loads/stores; the `_be` variants byte-swap via
        // the type's `BSwap` impl so lanes read/write in big-endian order.
        impl<S3, S4, NI> StoreBytes for $vec<S3, S4, NI>
        where
            Self: BSwap,
        {
            #[inline(always)]
            unsafe fn unsafe_read_le(input: &[u8]) -> Self {
                assert_eq!(input.len(), 16);
                Self::new(_mm_loadu_si128(input.as_ptr() as *const _))
            }
            #[inline(always)]
            unsafe fn unsafe_read_be(input: &[u8]) -> Self {
                assert_eq!(input.len(), 16);
                Self::new(_mm_loadu_si128(input.as_ptr() as *const _)).bswap()
            }
            #[inline(always)]
            fn write_le(self, out: &mut [u8]) {
                assert_eq!(out.len(), 16);
                unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut _, self.x) }
            }
            #[inline(always)]
            fn write_be(self, out: &mut [u8]) {
                assert_eq!(out.len(), 16);
                let x = self.bswap().x;
                unsafe {
                    _mm_storeu_si128(out.as_mut_ptr() as *mut _, x);
                }
            }
        }

        // All-zero vector.
        impl<S3, S4, NI> Default for $vec<S3, S4, NI> {
            #[inline(always)]
            fn default() -> Self {
                Self::new(unsafe { _mm_setzero_si128() })
            }
        }

        // Bitwise NOT implemented as XOR with all-ones.
        impl<S3, S4, NI> Not for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn not(self) -> Self::Output {
                unsafe {
                    let ff = _mm_set1_epi64x(-1i64);
                    self ^ Self::new(ff)
                }
            }
        }

        impl<S3: Copy, S4: Copy, NI: Copy> BitOps0 for $vec<S3, S4, NI> {}
        impl_binop!($vec, BitAnd, bitand, _mm_and_si128);
        impl_binop!($vec, BitOr, bitor, _mm_or_si128);
        impl_binop!($vec, BitXor, bitxor, _mm_xor_si128);
        impl_binop_assign!($vec, BitAndAssign, bitand_assign, bitand);
        impl_binop_assign!($vec, BitOrAssign, bitor_assign, bitor);
        impl_binop_assign!($vec, BitXorAssign, bitxor_assign, bitxor);
        // andnot(a, b) = !a & b — note PANDN negates its *first* operand.
        impl<S3: Copy, S4: Copy, NI: Copy> AndNot for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn andnot(self, rhs: Self) -> Self {
                Self::new(unsafe { _mm_andnot_si128(self.x, rhs.x) })
            }
        }
    };
}
141
// Marker-trait plumbing: each macro emits an empty impl whose where-clause
// asserts the rotation capability required for that lane width. The wider
// macros also emit the narrower impls, so `impl_bitops128!` covers all three.
macro_rules! impl_bitops32 {
    ($vec:ident) => {
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps32 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord32
        {
        }
    };
}

macro_rules! impl_bitops64 {
    ($vec:ident) => {
        impl_bitops32!($vec);
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps64 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord64 + RotateEachWord32
        {
        }
    };
}

macro_rules! impl_bitops128 {
    ($vec:ident) => {
        impl_bitops64!($vec);
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps128 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord128
        {
        }
    };
}
170
// SSSE3 path: rotate each 32-bit lane right by a multiple of 8 bits with a
// single PSHUFB byte permutation. `$k0`/`$k1` are the high/low 64-bit halves
// of the shuffle-control mask (each mask byte names a source byte index).
macro_rules! rotr_32_s3 {
    ($name:ident, $k0:expr, $k1:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
        }
    };
}
// Generic SSE2 path: per-32-bit-lane rotate right by $i bits via
// (x >> i) | (x << (32 - i)).
macro_rules! rotr_32 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_epi32(self.x, $i as i32),
                    _mm_slli_epi32(self.x, 32 - $i as i32),
                )
            })
        }
    };
}
// Per-32-bit-lane right rotations. With SSSE3, byte-aligned amounts
// (8/16/24) use one PSHUFB; the rest use the shift+OR fallback.
impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<YesS3, S4, NI> {
    rotr_32!(rotate_each_word_right7, 7);
    rotr_32_s3!(
        rotate_each_word_right8,
        0x0c0f_0e0d_080b_0a09,
        0x0407_0605_0003_0201
    );
    rotr_32!(rotate_each_word_right11, 11);
    rotr_32!(rotate_each_word_right12, 12);
    rotr_32_s3!(
        rotate_each_word_right16,
        0x0d0c_0f0e_0908_0b0a,
        0x0504_0706_0100_0302
    );
    rotr_32!(rotate_each_word_right20, 20);
    rotr_32_s3!(
        rotate_each_word_right24,
        0x0e0d_0c0f_0a09_080b,
        0x0605_0407_0201_0003
    );
    rotr_32!(rotate_each_word_right25, 25);
}
// SSE2-only fallback: no PSHUFB available, so everything is shift+OR except
// the 16-bit rotation, which is cheaper as a pair of 16-bit word shuffles.
impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<NoS3, S4, NI> {
    rotr_32!(rotate_each_word_right7, 7);
    rotr_32!(rotate_each_word_right8, 8);
    rotr_32!(rotate_each_word_right11, 11);
    rotr_32!(rotate_each_word_right12, 12);
    #[inline(always)]
    fn rotate_each_word_right16(self) -> Self {
        Self::new(swap16_s2(self.x))
    }
    rotr_32!(rotate_each_word_right20, 20);
    rotr_32!(rotate_each_word_right24, 24);
    rotr_32!(rotate_each_word_right25, 25);
}
227
// SSSE3 path: rotate each 64-bit lane right by a multiple of 8 bits with a
// single PSHUFB; `$k0`/`$k1` are the high/low halves of the byte-index mask.
macro_rules! rotr_64_s3 {
    ($name:ident, $k0:expr, $k1:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
        }
    };
}
// Generic SSE2 path: per-64-bit-lane rotate right by $i bits via
// (x >> i) | (x << (64 - i)).
macro_rules! rotr_64 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_epi64(self.x, $i as i32),
                    _mm_slli_epi64(self.x, 64 - $i as i32),
                )
            })
        }
    };
}
// Per-64-bit-lane right rotations. SSSE3 gets PSHUFB for byte-aligned
// amounts; SSE2-only falls back to shift+OR (and word shuffles for 16).
impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<YesS3, S4, NI> {
    rotr_64!(rotate_each_word_right7, 7);
    rotr_64_s3!(
        rotate_each_word_right8,
        0x080f_0e0d_0c0b_0a09,
        0x0007_0605_0403_0201
    );
    rotr_64!(rotate_each_word_right11, 11);
    rotr_64!(rotate_each_word_right12, 12);
    rotr_64_s3!(
        rotate_each_word_right16,
        0x0908_0f0e_0d0c_0b0a,
        0x0100_0706_0504_0302
    );
    rotr_64!(rotate_each_word_right20, 20);
    rotr_64_s3!(
        rotate_each_word_right24,
        0x0a09_080f_0e0d_0c0b,
        0x0201_0007_0605_0403
    );
    rotr_64!(rotate_each_word_right25, 25);
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<NoS3, S4, NI> {
    rotr_64!(rotate_each_word_right7, 7);
    rotr_64!(rotate_each_word_right8, 8);
    rotr_64!(rotate_each_word_right11, 11);
    rotr_64!(rotate_each_word_right12, 12);
    #[inline(always)]
    fn rotate_each_word_right16(self) -> Self {
        // Swap adjacent 16-bit units in every 32-bit (hence 64-bit) lane.
        Self::new(swap16_s2(self.x))
    }
    rotr_64!(rotate_each_word_right20, 20);
    rotr_64!(rotate_each_word_right24, 24);
    rotr_64!(rotate_each_word_right25, 25);
}
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    fn rotate_each_word_right32(self) -> Self {
        // PSHUFD control 0b10_11_00_01 swaps the two 32-bit halves of each
        // 64-bit lane, which is exactly a 32-bit rotation of that lane.
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b10110001) })
    }
}
290
// Rotate the whole 128-bit lane right by $i bits (valid for 0 < $i < 64,
// which covers every use in this file: 7..32).
//
// BUG FIX: the previous body used `_mm_srli_si128` / `_mm_slli_si128`,
// which shift by *bytes*, not bits — and a byte count above 15 (here
// `128 - $i`) yields zero — so it did not implement a bit rotation at all.
//
// Correct approach: shift both 64-bit halves right by $i, shift them left
// by 64 - $i to capture the carried-out bits, then swap the two halves of
// the carry so each half's carried bits land in the *other* half, and OR.
macro_rules! rotr_128 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                let shifted = _mm_srli_epi64(self.x, $i as i32);
                let carry = _mm_slli_epi64(self.x, 64 - $i as i32);
                // PSHUFD control 0b01_00_11_10 swaps the 64-bit halves.
                _mm_or_si128(shifted, _mm_shuffle_epi32(carry, 0b0100_1110))
            })
        }
    };
}
304// TODO: completely unoptimized
// For u128x1 the single "word" is the whole 128-bit lane; all rotations
// delegate to the rotr_128! macro above.
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord32 for u128x1_sse2<S3, S4, NI> {
    rotr_128!(rotate_each_word_right7, 7);
    rotr_128!(rotate_each_word_right8, 8);
    rotr_128!(rotate_each_word_right11, 11);
    rotr_128!(rotate_each_word_right12, 12);
    rotr_128!(rotate_each_word_right16, 16);
    rotr_128!(rotate_each_word_right20, 20);
    rotr_128!(rotate_each_word_right24, 24);
    rotr_128!(rotate_each_word_right25, 25);
}
// TODO: completely unoptimized
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u128x1_sse2<S3, S4, NI> {
    rotr_128!(rotate_each_word_right32, 32);
}
// Marker: no additional methods required at the 128-bit level.
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord128 for u128x1_sse2<S3, S4, NI> {}
320
321def_vec!(u32x4_sse2, u32);
322def_vec!(u64x2_sse2, u64);
323def_vec!(u128x1_sse2, u128);
324
// Scalar lane access for u32x4. YesS4 uses the SSE4.1 64-bit
// extract/insert instructions; NoS4 emulates them with shuffles and the
// baseline 64-bit move (`_mm_cvtsi128_si64` reads lane 0).
impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            // Pull both 64-bit halves, then split each into two u32 lanes.
            let x = _mm_cvtsi128_si64(self.x) as u64;
            let y = _mm_extract_epi64(self.x, 1) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            // Pack lane pairs into 64-bit halves, then build the register.
            let mut x = _mm_cvtsi64_si128((xs[0] as u64 | ((xs[1] as u64) << 32)) as i64);
            x = _mm_insert_epi64(x, (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64, 1);
            Self::new(x)
        }
    }
}
impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            let x = _mm_cvtsi128_si64(self.x) as u64;
            // Shuffle the high 64 bits down to lane 0 so cvtsi can read them.
            let y = _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            let x = (xs[0] as u64 | ((xs[1] as u64) << 32)) as i64;
            let y = (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64;
            let x = _mm_cvtsi64_si128(x);
            // Place the upper half via an 8-byte register shift, then OR.
            let y = _mm_slli_si128(_mm_cvtsi64_si128(y), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
// Scalar lane access for u64x2; same YesS4/NoS4 split as the u32x4 impls.
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            [
                _mm_cvtsi128_si64(self.x) as u64,
                _mm_extract_epi64(self.x, 1) as u64,
            ]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            let mut x = _mm_cvtsi64_si128(xs[0] as i64);
            x = _mm_insert_epi64(x, xs[1] as i64, 1);
            Self::new(x)
        }
    }
}
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            [
                _mm_cvtsi128_si64(self.x) as u64,
                // Byte-shift the high half down so cvtsi can read it.
                _mm_cvtsi128_si64(_mm_srli_si128(self.x, 8)) as u64,
            ]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            let x = _mm_cvtsi64_si128(xs[0] as i64);
            let y = _mm_slli_si128(_mm_cvtsi64_si128(xs[1] as i64), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
// Lane access for u128x1 is deliberately unimplemented — both methods
// panic if reached. (Callers use the u32x4/u64x2 views instead.)
impl<S3, S4, NI> MultiLane<[u128; 1]> for u128x1_sse2<S3, S4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u128; 1] {
        unimplemented!()
    }
    #[inline(always)]
    fn from_lanes(xs: [u128; 1]) -> Self {
        unimplemented!("{:?}", xs)
    }
}
411
412impl<S3, S4, NI> MultiLane<[u64; 4]> for u64x4_sse2<S3, S4, NI>
413where
414    u64x2_sse2<S3, S4, NI>: MultiLane<[u64; 2]> + Copy,
415{
416    #[inline(always)]
417    fn to_lanes(self) -> [u64; 4] {
418        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
419        [a[0], a[1], b[0], b[1]]
420    }
421    #[inline(always)]
422    fn from_lanes(xs: [u64; 4]) -> Self {
423        let (a, b) = (
424            u64x2_sse2::from_lanes([xs[0], xs[1]]),
425            u64x2_sse2::from_lanes([xs[2], xs[3]]),
426        );
427        x2::new([a, b])
428    }
429}
430
// Bitwise reinterpretation between vector types sharing the same
// `__m128i` payload (no data movement, just a type change).
macro_rules! impl_into {
    ($from:ident, $to:ident) => {
        impl<S3, S4, NI> From<$from<S3, S4, NI>> for $to<S3, S4, NI> {
            #[inline(always)]
            fn from(x: $from<S3, S4, NI>) -> Self {
                $to::new(x.x)
            }
        }
    };
}
441
// u128x1 can be viewed bitwise as either narrower lane layout.
impl_into!(u128x1_sse2, u32x4_sse2);
impl_into!(u128x1_sse2, u64x2_sse2);

// Bit-operation marker traits for each lane width.
impl_bitops32!(u32x4_sse2);
impl_bitops64!(u64x2_sse2);
impl_bitops128!(u128x1_sse2);

// Empty marker impls; the arithmetic itself comes from the Add/AddAssign
// impls emitted just below.
impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u32x4_sse2<S3, S4, NI> where
    u32x4_sse2<S3, S4, NI>: BSwap
{
}
impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u64x2_sse2<S3, S4, NI> where
    u64x2_sse2<S3, S4, NI>: BSwap
{
}
// Lane-wise wrapping addition (32- and 64-bit lanes respectively).
impl_binop!(u32x4_sse2, Add, add, _mm_add_epi32);
impl_binop!(u64x2_sse2, Add, add, _mm_add_epi64);
impl_binop_assign!(u32x4_sse2, AddAssign, add_assign, add);
impl_binop_assign!(u64x2_sse2, AddAssign, add_assign, add);
461
// Empty marker impls registering these concrete SSE2 types as the vector
// types of the machine interfaces; the where-clauses enumerate the
// capabilities each machine-level trait demands.
impl<S3: Copy, S4: Copy, NI: Copy> u32x4<Machine86<S3, S4, NI>> for u32x4_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
    Machine86<S3, S4, NI>: Machine,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2<Machine86<S3, S4, NI>> for u64x2_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>:
        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
    Machine86<S3, S4, NI>: Machine,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x1<Machine86<S3, S4, NI>> for u128x1_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4>,
    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}

// Avx2Machine reuses these 128-bit types for its scalar-width vectors,
// always with SSSE3 + SSE4.1 available (YesS3, YesS4).
// NOTE(review): the bounds below name Machine86<YesS3, YesS4, NI> rather
// than Avx2Machine<NI> — presumably intentional since the associated types
// coincide; confirm against x86_64/mod.rs.
impl<NI: Copy> u32x4<Avx2Machine<NI>> for u32x4_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
    Machine86<YesS3, YesS4, NI>: Machine,
{
}
impl<NI: Copy> u64x2<Avx2Machine<NI>> for u64x2_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>:
        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
    Machine86<YesS3, YesS4, NI>: Machine,
{
}
impl<NI: Copy> u128x1<Avx2Machine<NI>> for u128x1_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<YesS3, YesS4, NI>: Machine,
    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u32x4>,
    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u64x2>,
{
}
505
// Build a u32x4 from scalars. `_mm_set_epi32` takes arguments high lane
// first, so xs[3] leads and xs[0] lands in lane 0.
impl<S3, S4, NI> UnsafeFrom<[u32; 4]> for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    unsafe fn unsafe_from(xs: [u32; 4]) -> Self {
        Self::new(_mm_set_epi32(
            xs[3] as i32,
            xs[2] as i32,
            xs[1] as i32,
            xs[0] as i32,
        ))
    }
}
517
// Indexed lane get/set with SSE4.1 available. The match is needed because
// _mm_insert_epi32's lane index must be a compile-time constant.
impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, YesS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u32 {
        // Spill all lanes and index; panics if i > 3 (array bounds).
        self.to_lanes()[i as usize]
    }
    #[inline(always)]
    fn insert(self, v: u32, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi32(self.x, v as i32, 0),
                1 => _mm_insert_epi32(self.x, v as i32, 1),
                2 => _mm_insert_epi32(self.x, v as i32, 2),
                3 => _mm_insert_epi32(self.x, v as i32, 3),
                _ => unreachable!(),
            }
        })
    }
}
// Indexed lane get/set without SSE4.1 (no PINSRD). Each insert arm
// shuffles the target lane out of the way, injects `v` into lane 0 via
// _mm_cvtsi32_si128, and shuffles everything back into place.
impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, NoS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u32 {
        self.to_lanes()[i as usize]
    }
    #[inline(always)]
    fn insert(self, v: u32, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => {
                    // Clear lane 0 (cvtsi32_si128(-1) masks only the low
                    // 32 bits), then OR the new value in.
                    let x = _mm_andnot_si128(_mm_cvtsi32_si128(-1), self.x);
                    _mm_or_si128(x, _mm_cvtsi32_si128(v as i32))
                }
                1 => {
                    // Rotate lane 1 out, drop v into lane 0, rotate back.
                    let mut x = _mm_shuffle_epi32(self.x, 0b0111_1000);
                    x = _mm_slli_si128(x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b1110_0001)
                }
                2 => {
                    let mut x = _mm_shuffle_epi32(self.x, 0b1011_0100);
                    x = _mm_slli_si128(x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b1100_1001)
                }
                3 => {
                    // Lane 3 falls off the top of the 4-byte shift; insert
                    // v at lane 0 and rotate it up into position 3.
                    let mut x = _mm_slli_si128(self.x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b0011_1001)
                }
                _ => unreachable!(),
            }
        })
    }
}
577
// For a 128-bit vector the "lane words" are just the four 32-bit words,
// so these delegate directly to the Words4 shuffles below.
impl<S3, S4, NI> LaneWords4 for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn shuffle_lane_words2301(self) -> Self {
        self.shuffle2301()
    }
    #[inline(always)]
    fn shuffle_lane_words1230(self) -> Self {
        self.shuffle1230()
    }
    #[inline(always)]
    fn shuffle_lane_words3012(self) -> Self {
        self.shuffle3012()
    }
}
592
// 32-bit word permutations, each a single PSHUFD.
impl<S3, S4, NI> Words4 for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        // Control 0b01_00_11_10: result lanes (low→high) = src[2,3,0,1],
        // i.e. swap the two 64-bit halves.
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        // Control 0b10_01_00_11: result lanes (low→high) = src[3,0,1,2].
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b1001_0011) })
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        // Control 0b00_11_10_01: result lanes (low→high) = src[1,2,3,0].
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0011_1001) })
    }
}
607
// 64-bit word permutations across the emulated 256-bit vector
// (self.0[0] = lanes 0-1, self.0[1] = lanes 2-3).
// SSSE3 version uses PALIGNR: _mm_alignr_epi8(a, b, 8) = [b_hi, a_lo].
impl<S4, NI> Words4 for u64x4_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        // Swap the two 128-bit halves: lanes become [2,3,0,1].
        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        // Result lanes (low→high): [1,2,3,0].
        unsafe {
            x2::new([
                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
            ])
        }
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        // Result lanes (low→high): [3,0,1,2].
        unsafe {
            x2::new([
                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
            ])
        }
    }
}
// SSE2-only version: emulate PALIGNR with byte shifts and ORs.
// a/c hold the high lane of each half moved down; b/d hold the low lane
// moved up, so da = [lane1, lane2] and bc = [lane3, lane0].
impl<S4, NI> Words4 for u64x4_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        // Result lanes (low→high): [1,2,3,0].
        unsafe {
            let a = _mm_srli_si128(self.0[0].x, 8);
            let b = _mm_slli_si128(self.0[0].x, 8);
            let c = _mm_srli_si128(self.0[1].x, 8);
            let d = _mm_slli_si128(self.0[1].x, 8);
            let da = _mm_or_si128(d, a);
            let bc = _mm_or_si128(b, c);
            x2::new([u64x2_sse2::new(da), u64x2_sse2::new(bc)])
        }
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        // Result lanes (low→high): [3,0,1,2] — same pieces, halves swapped.
        unsafe {
            let a = _mm_srli_si128(self.0[0].x, 8);
            let b = _mm_slli_si128(self.0[0].x, 8);
            let c = _mm_srli_si128(self.0[1].x, 8);
            let d = _mm_slli_si128(self.0[1].x, 8);
            let da = _mm_or_si128(d, a);
            let bc = _mm_or_si128(b, c);
            x2::new([u64x2_sse2::new(bc), u64x2_sse2::new(da)])
        }
    }
}
662
// Build a u64x2 from scalars; _mm_set_epi64x takes the high lane first.
impl<S3, S4, NI> UnsafeFrom<[u64; 2]> for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    unsafe fn unsafe_from(xs: [u64; 2]) -> Self {
        Self::new(_mm_set_epi64x(xs[1] as i64, xs[0] as i64))
    }
}
669
// Indexed 64-bit lane get/set. YesS4 has PEXTRQ/PINSRQ; NoS4 emulates
// them with shuffles, byte shifts, and masks. Lane indices must be
// compile-time constants, hence the matches.
impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                1 => _mm_extract_epi64(self.x, 1) as u64,
                _ => unreachable!(),
            }
        }
    }
    #[inline(always)]
    fn insert(self, x: u64, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi64(self.x, x as i64, 0),
                1 => _mm_insert_epi64(self.x, x as i64, 1),
                _ => unreachable!(),
            }
        })
    }
}
impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                // Move the high lane down to lane 0, then read it.
                1 => _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64,
                _ => unreachable!(),
            }
        }
    }
    #[inline(always)]
    fn insert(self, x: u64, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                // Clear lane 0 (cvtsi64_si128(-1) masks only the low 64
                // bits), then OR in the new value.
                0 => _mm_or_si128(
                    _mm_andnot_si128(_mm_cvtsi64_si128(-1), self.x),
                    _mm_cvtsi64_si128(x as i64),
                ),
                // _mm_move_epi64 keeps lane 0 and zeroes lane 1; OR the new
                // value shifted up into lane 1.
                1 => _mm_or_si128(
                    _mm_move_epi64(self.x),
                    _mm_slli_si128(_mm_cvtsi64_si128(x as i64), 8),
                ),
                _ => unreachable!(),
            }
        })
    }
}
720
// Byte-swap each 32-bit lane with one PSHUFB (mask byte i names the
// source byte: 3,2,1,0 within each 4-byte group).
impl<S4, NI> BSwap for u32x4_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
/// Byte-swap each 32-bit word of `x` using only SSE2 instructions
/// (no PSHUFB): widen every byte to a 16-bit word, reverse the word order
/// within each 64-bit half, then pack back down. Packing with unsigned
/// saturation is lossless here since every word holds a value <= 255.
#[inline(always)]
fn bswap32_s2(x: __m128i) -> __m128i {
    unsafe {
        let zero = _mm_setzero_si128();
        let lo = _mm_shufflelo_epi16(
            _mm_shufflehi_epi16(_mm_unpacklo_epi8(x, zero), 0b0001_1011),
            0b0001_1011,
        );
        let hi = _mm_shufflelo_epi16(
            _mm_shufflehi_epi16(_mm_unpackhi_epi8(x, zero), 0b0001_1011),
            0b0001_1011,
        );
        _mm_packus_epi16(lo, hi)
    }
}
// SSE2-only u32 byte swap via the helper above.
impl<S4, NI> BSwap for u32x4_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(bswap32_s2(self.x))
    }
}

// Byte-swap each 64-bit lane: one PSHUFB with SSSE3 (mask bytes 7..0 per
// 8-byte group), or a 32-bit-half swap followed by the 32-bit byte swap
// on plain SSE2.
impl<S4, NI> BSwap for u64x2_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
impl<S4, NI> BSwap for u64x2_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe { bswap32_s2(_mm_shuffle_epi32(self.x, 0b1011_0001)) })
    }
}
764
765impl<S4, NI> BSwap for u128x1_sse2<YesS3, S4, NI> {
766    #[inline(always)]
767    fn bswap(self) -> Self {
768        Self::new(unsafe {
769            let k = _mm_set_epi64x(0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100);
770            _mm_shuffle_epi8(self.x, k)
771        })
772    }
773}
// No SSE2-only 128-bit byte swap is provided; this panics if ever called.
impl<S4, NI> BSwap for u128x1_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        unimplemented!()
    }
}
780
// Swap adjacent $i-bit groups across the whole register. `$k` marks the
// *high* group of each pair within a byte (e.g. 0xaa for single bits):
// (x & k) >> i moves high groups down; (x << i) & k moves low groups up.
// 16-bit shifts are used because SSE2 has no 8-bit shift; the mask strips
// any bits that crossed a byte boundary inside the 16-bit lane.
macro_rules! swapi {
    ($x:expr, $i:expr, $k:expr) => {
        unsafe {
            const K: u8 = $k;
            let k = _mm_set1_epi8(K as i8);
            u128x1_sse2::new(_mm_or_si128(
                _mm_srli_epi16(_mm_and_si128($x.x, k), $i),
                _mm_and_si128(_mm_slli_epi16($x.x, $i), k),
            ))
        }
    };
}
/// Exchange the two 16-bit halves of every 32-bit lane using only SSE2
/// (PSHUFLW + PSHUFHW with control 0b10_11_00_01, which swaps adjacent
/// 16-bit units in each 64-bit half).
#[inline(always)]
fn swap16_s2(x: __m128i) -> __m128i {
    unsafe {
        let low_done = _mm_shufflelo_epi16(x, 0b1011_0001);
        _mm_shufflehi_epi16(low_done, 0b1011_0001)
    }
}
// Swap64 family: exchange adjacent groups of 1/2/4/8/16/32/64 bits across
// the 128-bit lane. Bit-level swaps (1/2/4) share the swapi! shift+mask
// trick; byte-and-larger swaps use shuffles (PSHUFB on SSSE3, PSHUFLW/HW
// and PSHUFD otherwise).
impl<S4, NI> Swap64 for u128x1_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn swap1(self) -> Self {
        swapi!(self, 1, 0xaa)
    }
    #[inline(always)]
    fn swap2(self) -> Self {
        swapi!(self, 2, 0xcc)
    }
    #[inline(always)]
    fn swap4(self) -> Self {
        swapi!(self, 4, 0xf0)
    }
    #[inline(always)]
    fn swap8(self) -> Self {
        // PSHUFB mask exchanging each byte with its neighbor (1,0,3,2,...).
        u128x1_sse2::new(unsafe {
            let k = _mm_set_epi64x(0x0e0f_0c0d_0a0b_0809, 0x0607_0405_0203_0001);
            _mm_shuffle_epi8(self.x, k)
        })
    }
    #[inline(always)]
    fn swap16(self) -> Self {
        // PSHUFB mask exchanging adjacent 16-bit units (2,3,0,1,...).
        u128x1_sse2::new(unsafe {
            let k = _mm_set_epi64x(0x0d0c_0f0e_0908_0b0a, 0x0504_0706_0100_0302);
            _mm_shuffle_epi8(self.x, k)
        })
    }
    #[inline(always)]
    fn swap32(self) -> Self {
        // PSHUFD 0b10_11_00_01: swap 32-bit units within each 64-bit half.
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
    #[inline(always)]
    fn swap64(self) -> Self {
        // PSHUFD 0b01_00_11_10: swap the two 64-bit halves.
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
}
// SSE2-only variant: identical except swap8/swap16, which cannot use
// PSHUFB and fall back to 16-bit shifts / word shuffles.
impl<S4, NI> Swap64 for u128x1_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn swap1(self) -> Self {
        swapi!(self, 1, 0xaa)
    }
    #[inline(always)]
    fn swap2(self) -> Self {
        swapi!(self, 2, 0xcc)
    }
    #[inline(always)]
    fn swap4(self) -> Self {
        swapi!(self, 4, 0xf0)
    }
    #[inline(always)]
    fn swap8(self) -> Self {
        // Byte swap within each 16-bit unit: (x << 8) | (x >> 8) per lane.
        u128x1_sse2::new(unsafe {
            _mm_or_si128(_mm_slli_epi16(self.x, 8), _mm_srli_epi16(self.x, 8))
        })
    }
    #[inline(always)]
    fn swap16(self) -> Self {
        u128x1_sse2::new(swap16_s2(self.x))
    }
    #[inline(always)]
    fn swap32(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
    #[inline(always)]
    fn swap64(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
}
865
// Zero-sized tag types giving otherwise-identical x2 instantiations
// distinct identities (so e.g. u64x2x2 and u64x4 are different types
// despite sharing the same representation).
#[derive(Copy, Clone)]
pub struct G0;
#[derive(Copy, Clone)]
pub struct G1;

// Wider vectors emulated as arrays of 128-bit vectors: x2 = two halves,
// x4 = four quarters.
#[allow(non_camel_case_types)]
pub type u32x4x2_sse2<S3, S4, NI> = x2<u32x4_sse2<S3, S4, NI>, G0>;
#[allow(non_camel_case_types)]
pub type u64x2x2_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G0>;
#[allow(non_camel_case_types)]
pub type u64x4_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G1>;
#[allow(non_camel_case_types)]
pub type u128x2_sse2<S3, S4, NI> = x2<u128x1_sse2<S3, S4, NI>, G0>;

#[allow(non_camel_case_types)]
pub type u32x4x4_sse2<S3, S4, NI> = x4<u32x4_sse2<S3, S4, NI>>;
#[allow(non_camel_case_types)]
pub type u64x2x4_sse2<S3, S4, NI> = x4<u64x2_sse2<S3, S4, NI>>;
#[allow(non_camel_case_types)]
pub type u128x4_sse2<S3, S4, NI> = x4<u128x1_sse2<S3, S4, NI>>;
886
// Reinterpret the 4x-u32x4 vector as 16 scalar u32s via zerocopy's
// compile-time-checked transmute (no copy, layout must match exactly).
impl<S3, S4, NI> Vector<[u32; 16]> for u32x4x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn to_scalars(self) -> [u32; 16] {
        transmute!(self)
    }
}
893
// Marker-trait impls wiring the paired SSE2 types into the generic
// `Machine86` machine. The trait bodies are intentionally empty: all required
// behavior (word rotation, byte swap, lane access, conversions) is supplied
// by the `where`-clause bounds, which these types satisfy elsewhere in this
// file. The impls merely assert "this type may serve as the machine's u32x4x2
// / u64x2x2 / u64x4 / u128x2".
impl<S3: Copy, S4: Copy, NI: Copy> u32x4x2<Machine86<S3, S4, NI>> for u32x4x2_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u32x4x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 2]>,
    u32x4x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u32x4>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2x2<Machine86<S3, S4, NI>> for u64x2x2_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x2x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 2]>,
    u64x2x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x4<Machine86<S3, S4, NI>> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x4_sse2<S3, S4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
{
}
// u128x2 additionally requires reinterpretation into the narrower-lane views.
impl<S3: Copy, S4: Copy, NI: Copy> u128x2<Machine86<S3, S4, NI>> for u128x2_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 2]>,
    u128x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u128x1>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x2>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x2>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x4>,
{
}
928
// Same marker impls, but targeting `Avx2Machine`. These only apply to the
// fully-featured SSE2 variants (`YesS3`, `YesS4`), since an AVX2-capable CPU
// always has SSSE3 and SSE4.1. NOTE(review): `u32x4x2` for Avx2Machine is
// normally provided by the native `u32x4x2_avx2` type in the `avx2` module
// below; this impl covers the two-register fallback representation.
impl<NI: Copy> u32x4x2<Avx2Machine<NI>> for u32x4x2_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u32x4x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 2]>,
    u32x4x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u32x4>,
{
}
impl<NI: Copy> u64x2x2<Avx2Machine<NI>> for u64x2x2_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x2x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 2]>,
    u64x2x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u64x2>,
{
}
impl<NI: Copy> u64x4<Avx2Machine<NI>> for u64x4_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x4_sse2<YesS3, YesS4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
{
}
impl<NI: Copy> u128x2<Avx2Machine<NI>> for u128x2_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
    Avx2Machine<NI>: Machine,
    u128x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 2]>,
    u128x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u128x1>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x2>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x2>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x4>,
{
}
963
964impl<S3, S4, NI> Vec4<u64> for u64x4_sse2<S3, S4, NI>
965where
966    u64x2_sse2<S3, S4, NI>: Copy + Vec2<u64>,
967{
968    #[inline(always)]
969    fn extract(self, i: u32) -> u64 {
970        match i {
971            0 => self.0[0].extract(0),
972            1 => self.0[0].extract(1),
973            2 => self.0[1].extract(0),
974            3 => self.0[1].extract(1),
975            _ => panic!(),
976        }
977    }
978    #[inline(always)]
979    fn insert(mut self, w: u64, i: u32) -> Self {
980        match i {
981            0 => self.0[0] = self.0[0].insert(w, 0),
982            1 => self.0[0] = self.0[0].insert(w, 1),
983            2 => self.0[1] = self.0[1].insert(w, 0),
984            3 => self.0[1] = self.0[1].insert(w, 1),
985            _ => panic!(),
986        };
987        self
988    }
989}
990
// Marker-trait impls for the quadruple-width (x4) compound types, first for
// the generic `Machine86`, then for `Avx2Machine` restricted to the
// fully-featured SSSE3+SSE4.1 variants. As above, the bodies are empty: the
// `where` bounds carry all of the behavioral requirements.
impl<S3: Copy, S4: Copy, NI: Copy> u32x4x4<Machine86<S3, S4, NI>> for u32x4x4_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u32x4x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 4]>,
    u32x4x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u32x4>,
    u32x4x4_sse2<S3, S4, NI>: Vec4Ext<<Machine86<S3, S4, NI> as Machine>::u32x4>,
    u32x4x4_sse2<S3, S4, NI>: Vector<[u32; 16]>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2x4<Machine86<S3, S4, NI>> for u64x2x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x2x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 4]>,
    u64x2x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x4<Machine86<S3, S4, NI>> for u128x4_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 4]>,
    u128x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u128x1>,
    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x4>,
    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x4>,
{
}

impl<NI: Copy> u64x2x4<Avx2Machine<NI>> for u64x2x4_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x2x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 4]>,
    u64x2x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u64x2>,
{
}
impl<NI: Copy> u128x4<Avx2Machine<NI>> for u128x4_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
    Avx2Machine<NI>: Machine,
    u128x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 4]>,
    u128x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u128x1>,
    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x4>,
    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x4>,
{
}
1038
// Generates `From` conversions between the `x2`/`x4` wrappers of two SSE2
// vector types (e.g. an x4 of u128x1 into an x4 of u32x4), converting lane
// by lane via the element types' own `From` impls. These are pure
// reinterpretations: no data movement beyond the per-lane casts.
macro_rules! impl_into_x {
    ($from:ident, $to:ident) => {
        // x2 conversion; the group tags `Gf`/`Gt` are independent phantom
        // parameters, so conversions may also retag the pair.
        impl<S3: Copy, S4: Copy, NI: Copy, Gf, Gt> From<x2<$from<S3, S4, NI>, Gf>>
            for x2<$to<S3, S4, NI>, Gt>
        {
            #[inline(always)]
            fn from(x: x2<$from<S3, S4, NI>, Gf>) -> Self {
                x2::new([$to::from(x.0[0]), $to::from(x.0[1])])
            }
        }
        // x4 conversion: all four lanes converted in order.
        impl<S3: Copy, S4: Copy, NI: Copy> From<x4<$from<S3, S4, NI>>> for x4<$to<S3, S4, NI>> {
            #[inline(always)]
            fn from(x: x4<$from<S3, S4, NI>>) -> Self {
                x4::new([
                    $to::from(x.0[0]),
                    $to::from(x.0[1]),
                    $to::from(x.0[2]),
                    $to::from(x.0[3]),
                ])
            }
        }
    };
}
impl_into_x!(u128x1_sse2, u64x2_sse2);
impl_into_x!(u128x1_sse2, u32x4_sse2);
1064
1065///// Debugging
1066
1067use core::fmt::{Debug, Formatter, Result};
1068
1069impl<W: PartialEq, G> PartialEq for x2<W, G> {
1070    #[inline(always)]
1071    fn eq(&self, rhs: &Self) -> bool {
1072        self.0[0] == rhs.0[0] && self.0[1] == rhs.0[1]
1073    }
1074}
1075
/// Whole-register equality using the SSE4.1 64-bit compare (`pcmpeqq`).
///
/// `_mm_cmpeq_epi64` yields an all-ones/all-zeros mask per 64-bit lane. The
/// shuffle control `0b1100_0110` selects dwords `[2, 1, 0, 3]`, so the low
/// 64 bits of `q` contain one 32-bit word from each lane's mask; extracting
/// them as an `i64` gives -1 iff both 64-bit lanes compared equal.
#[allow(unused)]
#[inline(always)]
unsafe fn eq128_s4(x: __m128i, y: __m128i) -> bool {
    let q = _mm_shuffle_epi32(_mm_cmpeq_epi64(x, y), 0b1100_0110);
    _mm_cvtsi128_si64(q) == -1
}
1082
/// Whole-register equality using only SSE2 (`pcmpeqd`).
///
/// The 32-bit-lane comparison mask is split into its high (`p`) and low (`q`)
/// 64-bit halves; ANDing them yields -1 iff all four lanes compared equal.
/// Comparing per 32-bit lane is also valid for 64-bit element types, since
/// 64-bit equality is exactly equality of both constituent 32-bit words.
#[inline(always)]
unsafe fn eq128_s2(x: __m128i, y: __m128i) -> bool {
    let q = _mm_cmpeq_epi32(x, y);
    let p = _mm_cvtsi128_si64(_mm_srli_si128(q, 8));
    let q = _mm_cvtsi128_si64(q);
    (p & q) == -1
}
1090
impl<S3, S4, NI> PartialEq for u32x4_sse2<S3, S4, NI> {
    /// Lane-wise equality of all four 32-bit words, via the SSE2-only helper.
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
1097impl<S3, S4, NI> Debug for u32x4_sse2<S3, S4, NI>
1098where
1099    Self: Copy + MultiLane<[u32; 4]>,
1100{
1101    #[cold]
1102    fn fmt(&self, fmt: &mut Formatter) -> Result {
1103        fmt.write_fmt(format_args!("{:08x?}", &self.to_lanes()))
1104    }
1105}
1106
impl<S3, S4, NI> PartialEq for u64x2_sse2<S3, S4, NI> {
    /// Lane-wise equality. The 32-bit comparison helper is sufficient here:
    /// two u64 lanes are equal iff all four underlying u32 words are equal.
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
1113impl<S3, S4, NI> Debug for u64x2_sse2<S3, S4, NI>
1114where
1115    Self: Copy + MultiLane<[u64; 2]>,
1116{
1117    #[cold]
1118    fn fmt(&self, fmt: &mut Formatter) -> Result {
1119        fmt.write_fmt(format_args!("{:016x?}", &self.to_lanes()))
1120    }
1121}
1122
1123impl<S3, S4, NI> Debug for u64x4_sse2<S3, S4, NI>
1124where
1125    u64x2_sse2<S3, S4, NI>: Copy + MultiLane<[u64; 2]>,
1126{
1127    #[cold]
1128    fn fmt(&self, fmt: &mut Formatter) -> Result {
1129        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
1130        fmt.write_fmt(format_args!("{:016x?}", &[a[0], a[1], b[0], b[1]]))
1131    }
1132}
1133
#[cfg(test)]
#[cfg(target_arch = "x86_64")]
mod test {
    use super::*;
    use crate::x86_64::{SSE2, SSE41, SSSE3};
    use crate::Machine;

    // These tests compare the SSE2 fallback code paths against the
    // SSSE3/SSE4.1 paths and against hand-computed expectations. The two
    // machines' vector types differ only in phantom parameters, so results
    // are compared through `transmute!`.
    //
    // FIX(review): several assertions previously read
    // `assert_eq!(x_s3, transmute!(x_s3))` — a value compared against
    // itself, which is vacuously true and verifies nothing. They now compare
    // the SSSE3 result against the SSE2 result, matching the pattern already
    // used in `test_bswap32_s2_vs_s3`.

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_bswap32_s2_vs_s3() {
        let xs = [0x0f0e_0d0c, 0x0b0a_0908, 0x0706_0504, 0x0302_0100];
        let ys = [0x0c0d_0e0f, 0x0809_0a0b, 0x0405_0607, 0x0001_0203];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.bswap()
        };

        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.bswap()
        };

        assert_eq!(x_s2, transmute!(x_s3));
        assert_eq!(x_s2, s2.vec(ys));
    }

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_bswap64_s2_vs_s3() {
        let xs = [0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100];
        let ys = [0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
            x_s2.bswap()
        };

        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
            x_s3.bswap()
        };

        assert_eq!(x_s2, s2.vec(ys));
        assert_eq!(x_s2, transmute!(x_s3));
    }

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_shuffle32_s2_vs_s3() {
        let xs = [0x0, 0x1, 0x2, 0x3];
        let ys = [0x2, 0x3, 0x0, 0x1];
        let zs = [0x1, 0x2, 0x3, 0x0];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.shuffle2301()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.shuffle2301()
        };
        assert_eq!(x_s2, s2.vec(ys));
        assert_eq!(x_s2, transmute!(x_s3));

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.shuffle3012()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.shuffle3012()
        };
        assert_eq!(x_s2, s2.vec(zs));
        assert_eq!(x_s2, transmute!(x_s3));

        // shuffle1230 applied after shuffle3012 must round-trip to the input.
        let x_s2 = x_s2.shuffle1230();
        let x_s3 = x_s3.shuffle1230();
        assert_eq!(x_s2, s2.vec(xs));
        assert_eq!(x_s2, transmute!(x_s3));
    }

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_shuffle64_s2_vs_s3() {
        let xs = [0x0, 0x1, 0x2, 0x3];
        let ys = [0x2, 0x3, 0x0, 0x1];
        let zs = [0x1, 0x2, 0x3, 0x0];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
            x_s2.shuffle2301()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
            x_s3.shuffle2301()
        };
        assert_eq!(x_s2, s2.vec(ys));
        assert_eq!(x_s2, transmute!(x_s3));

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
            x_s2.shuffle3012()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
            x_s3.shuffle3012()
        };
        assert_eq!(x_s2, s2.vec(zs));
        assert_eq!(x_s2, transmute!(x_s3));

        // shuffle1230 applied after shuffle3012 must round-trip to the input.
        let x_s2 = x_s2.shuffle1230();
        let x_s3 = x_s3.shuffle1230();
        assert_eq!(x_s2, s2.vec(xs));
        assert_eq!(x_s2, transmute!(x_s3));
    }

    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    #[test]
    fn test_lanes_u32x4() {
        let xs = [0x1, 0x2, 0x3, 0x4];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };
        let s4 = unsafe { SSE41::instance() };

        {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            let y_s2 = <SSE2 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s2, y_s2);
            assert_eq!(xs, y_s2.to_lanes());
        }

        {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            let y_s3 = <SSSE3 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s3, y_s3);
            assert_eq!(xs, y_s3.to_lanes());
        }

        {
            let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
            let y_s4 = <SSE41 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s4, y_s4);
            assert_eq!(xs, y_s4.to_lanes());
        }
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_lanes_u64x2() {
        let xs = [0x1, 0x2];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };
        let s4 = unsafe { SSE41::instance() };

        {
            let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
            let y_s2 = <SSE2 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s2, y_s2);
            assert_eq!(xs, y_s2.to_lanes());
        }

        {
            let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
            let y_s3 = <SSSE3 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s3, y_s3);
            assert_eq!(xs, y_s3.to_lanes());
        }

        {
            let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
            let y_s4 = <SSE41 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s4, y_s4);
            assert_eq!(xs, y_s4.to_lanes());
        }
    }

    #[test]
    fn test_vec4_u32x4_s2() {
        let xs = [1, 2, 3, 4];
        let s2 = unsafe { SSE2::instance() };
        let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
        assert_eq!(x_s2.extract(0), 1);
        assert_eq!(x_s2.extract(1), 2);
        assert_eq!(x_s2.extract(2), 3);
        assert_eq!(x_s2.extract(3), 4);
        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2, 3, 4]));
        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf, 3, 4]));
        assert_eq!(x_s2.insert(0xf, 2), s2.vec([1, 2, 0xf, 4]));
        assert_eq!(x_s2.insert(0xf, 3), s2.vec([1, 2, 3, 0xf]));
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_vec4_u32x4_s4() {
        let xs = [1, 2, 3, 4];
        let s4 = unsafe { SSE41::instance() };
        let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
        assert_eq!(x_s4.extract(0), 1);
        assert_eq!(x_s4.extract(1), 2);
        assert_eq!(x_s4.extract(2), 3);
        assert_eq!(x_s4.extract(3), 4);
        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2, 3, 4]));
        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf, 3, 4]));
        assert_eq!(x_s4.insert(0xf, 2), s4.vec([1, 2, 0xf, 4]));
        assert_eq!(x_s4.insert(0xf, 3), s4.vec([1, 2, 3, 0xf]));
    }

    #[test]
    fn test_vec2_u64x2_s2() {
        let xs = [0x1, 0x2];
        let s2 = unsafe { SSE2::instance() };
        let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
        assert_eq!(x_s2.extract(0), 1);
        assert_eq!(x_s2.extract(1), 2);
        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2]));
        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf]));
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_vec4_u64x2_s4() {
        let xs = [0x1, 0x2];
        let s4 = unsafe { SSE41::instance() };
        let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
        assert_eq!(x_s4.extract(0), 1);
        assert_eq!(x_s4.extract(1), 2);
        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2]));
        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf]));
    }
}
1379
1380pub mod avx2 {
1381    #![allow(non_camel_case_types)]
1382    use crate::soft::{x2, x4};
1383    use crate::types::*;
1384    use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2, G0};
1385    use crate::x86_64::{vec256_storage, vec512_storage, Avx2Machine, YesS3, YesS4};
1386    use core::arch::x86_64::*;
1387    use core::marker::PhantomData;
1388    use core::ops::*;
1389    use zerocopy::transmute;
1390
1391    zerocopy::cryptocorrosion_derive_traits! {
1392        #[repr(transparent)]
1393        #[derive(Copy, Clone)]
1394        pub struct u32x4x2_avx2<NI> {
1395            x: __m256i,
1396            ni: PhantomData<NI>,
1397        }
1398    }
1399
1400    impl<NI> u32x4x2_avx2<NI> {
1401        #[inline(always)]
1402        fn new(x: __m256i) -> Self {
1403            Self { x, ni: PhantomData }
1404        }
1405    }
1406
1407    impl<NI> u32x4x2<Avx2Machine<NI>> for u32x4x2_avx2<NI> where NI: Copy {}
1408    impl<NI> Store<vec256_storage> for u32x4x2_avx2<NI> {
1409        #[inline(always)]
1410        unsafe fn unpack(p: vec256_storage) -> Self {
1411            Self::new(p.avx)
1412        }
1413    }
1414    impl<NI> StoreBytes for u32x4x2_avx2<NI> {
1415        #[inline(always)]
1416        unsafe fn unsafe_read_le(input: &[u8]) -> Self {
1417            assert_eq!(input.len(), 32);
1418            Self::new(_mm256_loadu_si256(input.as_ptr() as *const _))
1419        }
1420        #[inline(always)]
1421        unsafe fn unsafe_read_be(input: &[u8]) -> Self {
1422            Self::unsafe_read_le(input).bswap()
1423        }
1424        #[inline(always)]
1425        fn write_le(self, out: &mut [u8]) {
1426            unsafe {
1427                assert_eq!(out.len(), 32);
1428                _mm256_storeu_si256(out.as_mut_ptr() as *mut _, self.x)
1429            }
1430        }
1431        #[inline(always)]
1432        fn write_be(self, out: &mut [u8]) {
1433            self.bswap().write_le(out)
1434        }
1435    }
1436    impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 2]> for u32x4x2_avx2<NI> {
1437        #[inline(always)]
1438        fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 2] {
1439            unsafe {
1440                [
1441                    u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)),
1442                    u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)),
1443                ]
1444            }
1445        }
1446        #[inline(always)]
1447        fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 2]) -> Self {
1448            Self::new(unsafe { _mm256_setr_m128i(x[0].x, x[1].x) })
1449        }
1450    }
1451    impl<NI> Vec2<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x2_avx2<NI> {
1452        #[inline(always)]
1453        fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
1454            unsafe {
1455                match i {
1456                    0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)),
1457                    1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)),
1458                    _ => panic!(),
1459                }
1460            }
1461        }
1462        #[inline(always)]
1463        fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
1464            Self::new(unsafe {
1465                match i {
1466                    0 => _mm256_inserti128_si256(self.x, w.x, 0),
1467                    1 => _mm256_inserti128_si256(self.x, w.x, 1),
1468                    _ => panic!(),
1469                }
1470            })
1471        }
1472    }
1473    impl<NI> BitOps32 for u32x4x2_avx2<NI> where NI: Copy {}
1474    impl<NI> ArithOps for u32x4x2_avx2<NI> where NI: Copy {}
1475    macro_rules! shuf_lane_bytes {
1476        ($name:ident, $k0:expr, $k1:expr) => {
1477            #[inline(always)]
1478            fn $name(self) -> Self {
1479                Self::new(unsafe {
1480                    _mm256_shuffle_epi8(self.x, _mm256_set_epi64x($k0, $k1, $k0, $k1))
1481                })
1482            }
1483        };
1484    }
1485    macro_rules! rotr_32 {
1486        ($name:ident, $i:expr) => {
1487            #[inline(always)]
1488            fn $name(self) -> Self {
1489                Self::new(unsafe {
1490                    _mm256_or_si256(
1491                        _mm256_srli_epi32(self.x, $i as i32),
1492                        _mm256_slli_epi32(self.x, 32 - $i as i32),
1493                    )
1494                })
1495            }
1496        };
1497    }
1498    impl<NI: Copy> RotateEachWord32 for u32x4x2_avx2<NI> {
1499        rotr_32!(rotate_each_word_right7, 7);
1500        shuf_lane_bytes!(
1501            rotate_each_word_right8,
1502            0x0c0f_0e0d_080b_0a09,
1503            0x0407_0605_0003_0201
1504        );
1505        rotr_32!(rotate_each_word_right11, 11);
1506        rotr_32!(rotate_each_word_right12, 12);
1507        shuf_lane_bytes!(
1508            rotate_each_word_right16,
1509            0x0d0c_0f0e_0908_0b0a,
1510            0x0504_0706_0100_0302
1511        );
1512        rotr_32!(rotate_each_word_right20, 20);
1513        shuf_lane_bytes!(
1514            rotate_each_word_right24,
1515            0x0e0d_0c0f_0a09_080b,
1516            0x0605_0407_0201_0003
1517        );
1518        rotr_32!(rotate_each_word_right25, 25);
1519    }
1520    impl<NI> BitOps0 for u32x4x2_avx2<NI> where NI: Copy {}
1521    impl<NI> From<u32x4x2_avx2<NI>> for vec256_storage {
1522        #[inline(always)]
1523        fn from(x: u32x4x2_avx2<NI>) -> Self {
1524            Self { avx: x.x }
1525        }
1526    }
1527
1528    macro_rules! impl_assign {
1529        ($vec:ident, $Assign:ident, $assign_fn:ident, $bin_fn:ident) => {
1530            impl<NI> $Assign for $vec<NI>
1531            where
1532                NI: Copy,
1533            {
1534                #[inline(always)]
1535                fn $assign_fn(&mut self, rhs: Self) {
1536                    *self = self.$bin_fn(rhs);
1537                }
1538            }
1539        };
1540    }
1541    impl_assign!(u32x4x2_avx2, BitXorAssign, bitxor_assign, bitxor);
1542    impl_assign!(u32x4x2_avx2, BitOrAssign, bitor_assign, bitor);
1543    impl_assign!(u32x4x2_avx2, BitAndAssign, bitand_assign, bitand);
1544    impl_assign!(u32x4x2_avx2, AddAssign, add_assign, add);
1545
1546    macro_rules! impl_bitop {
1547        ($vec:ident, $Op:ident, $op_fn:ident, $impl_fn:ident) => {
1548            impl<NI> $Op for $vec<NI> {
1549                type Output = Self;
1550                #[inline(always)]
1551                fn $op_fn(self, rhs: Self) -> Self::Output {
1552                    Self::new(unsafe { $impl_fn(self.x, rhs.x) })
1553                }
1554            }
1555        };
1556    }
1557    impl_bitop!(u32x4x2_avx2, BitXor, bitxor, _mm256_xor_si256);
1558    impl_bitop!(u32x4x2_avx2, BitOr, bitor, _mm256_or_si256);
1559    impl_bitop!(u32x4x2_avx2, BitAnd, bitand, _mm256_and_si256);
1560    impl_bitop!(u32x4x2_avx2, AndNot, andnot, _mm256_andnot_si256);
1561    impl_bitop!(u32x4x2_avx2, Add, add, _mm256_add_epi32);
1562
1563    impl<NI> Not for u32x4x2_avx2<NI> {
1564        type Output = Self;
1565        #[inline(always)]
1566        fn not(self) -> Self::Output {
1567            unsafe {
1568                let f = _mm256_set1_epi8(-0x7f);
1569                Self::new(f) ^ self
1570            }
1571        }
1572    }
1573
1574    impl<NI> BSwap for u32x4x2_avx2<NI> {
1575        shuf_lane_bytes!(bswap, 0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
1576    }
1577
1578    impl<NI> From<x2<u128x1_sse2<YesS3, YesS4, NI>, G0>> for u32x4x2_avx2<NI>
1579    where
1580        NI: Copy,
1581    {
1582        #[inline(always)]
1583        fn from(x: x2<u128x1_sse2<YesS3, YesS4, NI>, G0>) -> Self {
1584            Self::new(unsafe { _mm256_setr_m128i(x.0[0].x, x.0[1].x) })
1585        }
1586    }
1587
1588    impl<NI> LaneWords4 for u32x4x2_avx2<NI> {
1589        #[inline(always)]
1590        fn shuffle_lane_words1230(self) -> Self {
1591            Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b1001_0011) })
1592        }
1593        #[inline(always)]
1594        fn shuffle_lane_words2301(self) -> Self {
1595            Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0100_1110) })
1596        }
1597        #[inline(always)]
1598        fn shuffle_lane_words3012(self) -> Self {
1599            Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0011_1001) })
1600        }
1601    }
1602
1603    ///////////////////////////////////////////////////////////////////////////////////////////
1604
1605    pub type u32x4x4_avx2<NI> = x2<u32x4x2_avx2<NI>, G0>;
1606    impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> {}
1607
1608    impl<NI: Copy> Store<vec512_storage> for u32x4x4_avx2<NI> {
1609        #[inline(always)]
1610        unsafe fn unpack(p: vec512_storage) -> Self {
1611            Self::new([
1612                u32x4x2_avx2::unpack(p.avx[0]),
1613                u32x4x2_avx2::unpack(p.avx[1]),
1614            ])
1615        }
1616    }
1617    impl<NI: Copy> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> {
1618        #[inline(always)]
1619        fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] {
1620            let [a, b] = self.0[0].to_lanes();
1621            let [c, d] = self.0[1].to_lanes();
1622            [a, b, c, d]
1623        }
1624        #[inline(always)]
1625        fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self {
1626            let ab = u32x4x2_avx2::from_lanes([x[0], x[1]]);
1627            let cd = u32x4x2_avx2::from_lanes([x[2], x[3]]);
1628            Self::new([ab, cd])
1629        }
1630    }
1631    impl<NI: Copy> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
1632        #[inline(always)]
1633        fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
1634            match i {
1635                0 => self.0[0].extract(0),
1636                1 => self.0[0].extract(1),
1637                2 => self.0[1].extract(0),
1638                3 => self.0[1].extract(1),
1639                _ => panic!(),
1640            }
1641        }
1642        #[inline(always)]
1643        fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
1644            Self::new(match i {
1645                0 | 1 => [self.0[0].insert(w, i), self.0[1]],
1646                2 | 3 => [self.0[0], self.0[1].insert(w, i - 2)],
1647                _ => panic!(),
1648            })
1649        }
1650    }
1651    impl<NI: Copy> Vec4Ext<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
1652        #[inline(always)]
1653        fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) {
1654            /*
1655             * a00:a01 a10:a11
1656             * b00:b01 b10:b11
1657             * c00:c01 c10:c11
1658             * d00:d01 d10:d11
1659             *       =>
1660             * a00:b00 c00:d00
1661             * a01:b01 c01:d01
1662             * a10:b10 c10:d10
1663             * a11:b11 c11:d11
1664             */
1665            unsafe {
1666                let ab00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x20));
1667                let ab01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x31));
1668                let ab10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x20));
1669                let ab11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x31));
1670                let cd00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x20));
1671                let cd01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x31));
1672                let cd10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x20));
1673                let cd11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x31));
1674                (
1675                    Self::new([ab00, cd00]),
1676                    Self::new([ab01, cd01]),
1677                    Self::new([ab10, cd10]),
1678                    Self::new([ab11, cd11]),
1679                )
1680            }
1681        }
1682    }
1683    impl<NI: Copy> Vector<[u32; 16]> for u32x4x4_avx2<NI> {
1684        #[inline(always)]
1685        fn to_scalars(self) -> [u32; 16] {
1686            transmute!(self)
1687        }
1688    }
1689    impl<NI: Copy> From<u32x4x4_avx2<NI>> for vec512_storage {
1690        #[inline(always)]
1691        fn from(x: u32x4x4_avx2<NI>) -> Self {
1692            Self {
1693                avx: [
1694                    vec256_storage { avx: x.0[0].x },
1695                    vec256_storage { avx: x.0[1].x },
1696                ],
1697            }
1698        }
1699    }
1700    impl<NI: Copy> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI> {
1701        #[inline(always)]
1702        fn from(x: x4<u128x1_sse2<YesS3, YesS4, NI>>) -> Self {
1703            Self::new(unsafe {
1704                [
1705                    u32x4x2_avx2::new(_mm256_setr_m128i(x.0[0].x, x.0[1].x)),
1706                    u32x4x2_avx2::new(_mm256_setr_m128i(x.0[2].x, x.0[3].x)),
1707                ]
1708            })
1709        }
1710    }
1711}