use crate::soft::{x2, x4};
use crate::types::*;
use crate::vec128_storage;
use crate::x86_64::Avx2Machine;
use crate::x86_64::SseMachine as Machine86;
use crate::x86_64::{NoS3, NoS4, YesS3, YesS4};
use core::arch::x86_64::*;
use core::marker::PhantomData;
use core::ops::{
    Add, AddAssign, BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not,
};
use zerocopy::transmute;
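
// SSE2-level implementations of the crate's 128-bit vector types. The marker
// type parameters `S3` and `S4` (`YesS3`/`NoS3`, `YesS4`/`NoS4`) select SSSE3
// and SSE4.1 specializations at compile time; `NI` plays the same role for
// the native-instruction (AES-NI) level. 256-bit AVX2 forms live in the
// `avx2` submodule at the end of this file.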

macro_rules! impl_binop {
    ($vec:ident, $trait:ident, $fn:ident, $impl_fn:ident) => {
        impl<S3, S4, NI> $trait for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn $fn(self, rhs: Self) -> Self::Output {
                Self::new(unsafe { $impl_fn(self.x, rhs.x) })
            }
        }
    };
}

macro_rules! impl_binop_assign {
    ($vec:ident, $trait:ident, $fn_assign:ident, $fn:ident) => {
        impl<S3, S4, NI> $trait for $vec<S3, S4, NI>
        where
            $vec<S3, S4, NI>: Copy,
        {
            #[inline(always)]
            fn $fn_assign(&mut self, rhs: Self) {
                *self = self.$fn(rhs);
            }
        }
    };
}
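
// As a sketch of what these macros generate, `impl_binop!(u32x4_sse2, Add,
// add, _mm_add_epi32)` expands to roughly:
//
//     impl<S3, S4, NI> Add for u32x4_sse2<S3, S4, NI> {
//         type Output = Self;
//         #[inline(always)]
//         fn add(self, rhs: Self) -> Self::Output {
//             Self::new(unsafe { _mm_add_epi32(self.x, rhs.x) })
//         }
//     }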

macro_rules! def_vec {
    ($vec:ident, $word:ident) => {
        zerocopy::cryptocorrosion_derive_traits! {
            #[repr(transparent)]
            #[allow(non_camel_case_types)]
            #[derive(Copy, Clone)]
            pub struct $vec<S3, S4, NI> {
                x: __m128i,
                s3: PhantomData<S3>,
                s4: PhantomData<S4>,
                ni: PhantomData<NI>,
            }
        }

        impl<S3, S4, NI> Store<vec128_storage> for $vec<S3, S4, NI> {
            #[inline(always)]
            unsafe fn unpack(x: vec128_storage) -> Self {
                Self::new(x.sse2)
            }
        }
        impl<S3, S4, NI> From<$vec<S3, S4, NI>> for vec128_storage {
            #[inline(always)]
            fn from(x: $vec<S3, S4, NI>) -> Self {
                vec128_storage { sse2: x.x }
            }
        }
        impl<S3, S4, NI> $vec<S3, S4, NI> {
            #[inline(always)]
            fn new(x: __m128i) -> Self {
                $vec {
                    x,
                    s3: PhantomData,
                    s4: PhantomData,
                    ni: PhantomData,
                }
            }
        }

        impl<S3, S4, NI> StoreBytes for $vec<S3, S4, NI>
        where
            Self: BSwap,
        {
            #[inline(always)]
            unsafe fn unsafe_read_le(input: &[u8]) -> Self {
                assert_eq!(input.len(), 16);
                Self::new(_mm_loadu_si128(input.as_ptr() as *const _))
            }
            #[inline(always)]
            unsafe fn unsafe_read_be(input: &[u8]) -> Self {
                assert_eq!(input.len(), 16);
                Self::new(_mm_loadu_si128(input.as_ptr() as *const _)).bswap()
            }
            #[inline(always)]
            fn write_le(self, out: &mut [u8]) {
                assert_eq!(out.len(), 16);
                unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut _, self.x) }
            }
            #[inline(always)]
            fn write_be(self, out: &mut [u8]) {
                assert_eq!(out.len(), 16);
                let x = self.bswap().x;
                unsafe {
                    _mm_storeu_si128(out.as_mut_ptr() as *mut _, x);
                }
            }
        }

        impl<S3, S4, NI> Default for $vec<S3, S4, NI> {
            #[inline(always)]
            fn default() -> Self {
                Self::new(unsafe { _mm_setzero_si128() })
            }
        }

        impl<S3, S4, NI> Not for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn not(self) -> Self::Output {
                unsafe {
                    let ff = _mm_set1_epi64x(-1i64);
                    self ^ Self::new(ff)
                }
            }
        }

        impl<S3: Copy, S4: Copy, NI: Copy> BitOps0 for $vec<S3, S4, NI> {}
        impl_binop!($vec, BitAnd, bitand, _mm_and_si128);
        impl_binop!($vec, BitOr, bitor, _mm_or_si128);
        impl_binop!($vec, BitXor, bitxor, _mm_xor_si128);
        impl_binop_assign!($vec, BitAndAssign, bitand_assign, bitand);
        impl_binop_assign!($vec, BitOrAssign, bitor_assign, bitor);
        impl_binop_assign!($vec, BitXorAssign, bitxor_assign, bitxor);
        impl<S3: Copy, S4: Copy, NI: Copy> AndNot for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn andnot(self, rhs: Self) -> Self {
                Self::new(unsafe { _mm_andnot_si128(self.x, rhs.x) })
            }
        }
    };
}

macro_rules! impl_bitops32 {
    ($vec:ident) => {
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps32 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord32
        {
        }
    };
}

macro_rules! impl_bitops64 {
    ($vec:ident) => {
        impl_bitops32!($vec);
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps64 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord64 + RotateEachWord32
        {
        }
    };
}

macro_rules! impl_bitops128 {
    ($vec:ident) => {
        impl_bitops64!($vec);
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps128 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord128
        {
        }
    };
}

macro_rules! rotr_32_s3 {
    ($name:ident, $k0:expr, $k1:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
        }
    };
}
macro_rules! rotr_32 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_epi32(self.x, $i as i32),
                    _mm_slli_epi32(self.x, 32 - $i as i32),
                )
            })
        }
    };
}
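// Two strategies for per-word rotations: plain SSE2 does shift-shift-OR
// (`rotr_32`), while a rotation by a whole number of bytes becomes a single
// `pshufb` on SSSE3 (`rotr_32_s3`). E.g. rotating each 32-bit word right by
// 8 maps the bytes [0 1 2 3] of every word to [1 2 3 0], which is exactly
// what the shuffle constants passed below encode.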
impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<YesS3, S4, NI> {
    rotr_32!(rotate_each_word_right7, 7);
    rotr_32_s3!(
        rotate_each_word_right8,
        0x0c0f_0e0d_080b_0a09,
        0x0407_0605_0003_0201
    );
    rotr_32!(rotate_each_word_right11, 11);
    rotr_32!(rotate_each_word_right12, 12);
    rotr_32_s3!(
        rotate_each_word_right16,
        0x0d0c_0f0e_0908_0b0a,
        0x0504_0706_0100_0302
    );
    rotr_32!(rotate_each_word_right20, 20);
    rotr_32_s3!(
        rotate_each_word_right24,
        0x0e0d_0c0f_0a09_080b,
        0x0605_0407_0201_0003
    );
    rotr_32!(rotate_each_word_right25, 25);
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<NoS3, S4, NI> {
    rotr_32!(rotate_each_word_right7, 7);
    rotr_32!(rotate_each_word_right8, 8);
    rotr_32!(rotate_each_word_right11, 11);
    rotr_32!(rotate_each_word_right12, 12);
    #[inline(always)]
    fn rotate_each_word_right16(self) -> Self {
        Self::new(swap16_s2(self.x))
    }
    rotr_32!(rotate_each_word_right20, 20);
    rotr_32!(rotate_each_word_right24, 24);
    rotr_32!(rotate_each_word_right25, 25);
}

macro_rules! rotr_64_s3 {
    ($name:ident, $k0:expr, $k1:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
        }
    };
}
macro_rules! rotr_64 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_epi64(self.x, $i as i32),
                    _mm_slli_epi64(self.x, 64 - $i as i32),
                )
            })
        }
    };
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<YesS3, S4, NI> {
    rotr_64!(rotate_each_word_right7, 7);
    rotr_64_s3!(
        rotate_each_word_right8,
        0x080f_0e0d_0c0b_0a09,
        0x0007_0605_0403_0201
    );
    rotr_64!(rotate_each_word_right11, 11);
    rotr_64!(rotate_each_word_right12, 12);
    rotr_64_s3!(
        rotate_each_word_right16,
        0x0908_0f0e_0d0c_0b0a,
        0x0100_0706_0504_0302
    );
    rotr_64!(rotate_each_word_right20, 20);
    rotr_64_s3!(
        rotate_each_word_right24,
        0x0a09_080f_0e0d_0c0b,
        0x0201_0007_0605_0403
    );
    rotr_64!(rotate_each_word_right25, 25);
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<NoS3, S4, NI> {
    rotr_64!(rotate_each_word_right7, 7);
    rotr_64!(rotate_each_word_right8, 8);
    rotr_64!(rotate_each_word_right11, 11);
    rotr_64!(rotate_each_word_right12, 12);
    #[inline(always)]
    fn rotate_each_word_right16(self) -> Self {
        Self::new(swap16_s2(self.x))
    }
    rotr_64!(rotate_each_word_right20, 20);
    rotr_64!(rotate_each_word_right24, 24);
    rotr_64!(rotate_each_word_right25, 25);
}
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    fn rotate_each_word_right32(self) -> Self {
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
}

macro_rules! rotr_128 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_si128(self.x, $i as i32),
                    _mm_slli_si128(self.x, 128 - $i as i32),
                )
            })
        }
    };
}
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord32 for u128x1_sse2<S3, S4, NI> {
    rotr_128!(rotate_each_word_right7, 7);
    rotr_128!(rotate_each_word_right8, 8);
    rotr_128!(rotate_each_word_right11, 11);
    rotr_128!(rotate_each_word_right12, 12);
    rotr_128!(rotate_each_word_right16, 16);
    rotr_128!(rotate_each_word_right20, 20);
    rotr_128!(rotate_each_word_right24, 24);
    rotr_128!(rotate_each_word_right25, 25);
}
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u128x1_sse2<S3, S4, NI> {
    rotr_128!(rotate_each_word_right32, 32);
}
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord128 for u128x1_sse2<S3, S4, NI> {}

def_vec!(u32x4_sse2, u32);
def_vec!(u64x2_sse2, u64);
def_vec!(u128x1_sse2, u128);

impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            let x = _mm_cvtsi128_si64(self.x) as u64;
            let y = _mm_extract_epi64(self.x, 1) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            let mut x = _mm_cvtsi64_si128((xs[0] as u64 | ((xs[1] as u64) << 32)) as i64);
            x = _mm_insert_epi64(x, (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64, 1);
            Self::new(x)
        }
    }
}
impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            let x = _mm_cvtsi128_si64(self.x) as u64;
            let y = _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b1110_1110)) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            let x = (xs[0] as u64 | ((xs[1] as u64) << 32)) as i64;
            let y = (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64;
            let x = _mm_cvtsi64_si128(x);
            let y = _mm_slli_si128(_mm_cvtsi64_si128(y), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            [
                _mm_cvtsi128_si64(self.x) as u64,
                _mm_extract_epi64(self.x, 1) as u64,
            ]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            let mut x = _mm_cvtsi64_si128(xs[0] as i64);
            x = _mm_insert_epi64(x, xs[1] as i64, 1);
            Self::new(x)
        }
    }
}
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            [
                _mm_cvtsi128_si64(self.x) as u64,
                _mm_cvtsi128_si64(_mm_srli_si128(self.x, 8)) as u64,
            ]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            let x = _mm_cvtsi64_si128(xs[0] as i64);
            let y = _mm_slli_si128(_mm_cvtsi64_si128(xs[1] as i64), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
impl<S3, S4, NI> MultiLane<[u128; 1]> for u128x1_sse2<S3, S4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u128; 1] {
        unimplemented!()
    }
    #[inline(always)]
    fn from_lanes(xs: [u128; 1]) -> Self {
        unimplemented!("{:?}", xs)
    }
}

impl<S3, S4, NI> MultiLane<[u64; 4]> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: MultiLane<[u64; 2]> + Copy,
{
    #[inline(always)]
    fn to_lanes(self) -> [u64; 4] {
        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
        [a[0], a[1], b[0], b[1]]
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 4]) -> Self {
        let (a, b) = (
            u64x2_sse2::from_lanes([xs[0], xs[1]]),
            u64x2_sse2::from_lanes([xs[2], xs[3]]),
        );
        x2::new([a, b])
    }
}

macro_rules! impl_into {
    ($from:ident, $to:ident) => {
        impl<S3, S4, NI> From<$from<S3, S4, NI>> for $to<S3, S4, NI> {
            #[inline(always)]
            fn from(x: $from<S3, S4, NI>) -> Self {
                $to::new(x.x)
            }
        }
    };
}

impl_into!(u128x1_sse2, u32x4_sse2);
impl_into!(u128x1_sse2, u64x2_sse2);

impl_bitops32!(u32x4_sse2);
impl_bitops64!(u64x2_sse2);
impl_bitops128!(u128x1_sse2);

impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u32x4_sse2<S3, S4, NI> where
    u32x4_sse2<S3, S4, NI>: BSwap
{
}
impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u64x2_sse2<S3, S4, NI> where
    u64x2_sse2<S3, S4, NI>: BSwap
{
}
impl_binop!(u32x4_sse2, Add, add, _mm_add_epi32);
impl_binop!(u64x2_sse2, Add, add, _mm_add_epi64);
impl_binop_assign!(u32x4_sse2, AddAssign, add_assign, add);
impl_binop_assign!(u64x2_sse2, AddAssign, add_assign, add);

impl<S3: Copy, S4: Copy, NI: Copy> u32x4<Machine86<S3, S4, NI>> for u32x4_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
    Machine86<S3, S4, NI>: Machine,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2<Machine86<S3, S4, NI>> for u64x2_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>:
        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
    Machine86<S3, S4, NI>: Machine,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x1<Machine86<S3, S4, NI>> for u128x1_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4>,
    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}

impl<NI: Copy> u32x4<Avx2Machine<NI>> for u32x4_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
    Machine86<YesS3, YesS4, NI>: Machine,
{
}
impl<NI: Copy> u64x2<Avx2Machine<NI>> for u64x2_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>:
        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
    Machine86<YesS3, YesS4, NI>: Machine,
{
}
impl<NI: Copy> u128x1<Avx2Machine<NI>> for u128x1_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<YesS3, YesS4, NI>: Machine,
    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u32x4>,
    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u64x2>,
{
}

impl<S3, S4, NI> UnsafeFrom<[u32; 4]> for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    unsafe fn unsafe_from(xs: [u32; 4]) -> Self {
        Self::new(_mm_set_epi32(
            xs[3] as i32,
            xs[2] as i32,
            xs[1] as i32,
            xs[0] as i32,
        ))
    }
}

impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, YesS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u32 {
        self.to_lanes()[i as usize]
    }
    #[inline(always)]
    fn insert(self, v: u32, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi32(self.x, v as i32, 0),
                1 => _mm_insert_epi32(self.x, v as i32, 1),
                2 => _mm_insert_epi32(self.x, v as i32, 2),
                3 => _mm_insert_epi32(self.x, v as i32, 3),
                _ => unreachable!(),
            }
        })
    }
}
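// Without SSE4.1's `pinsrd`, inserting into an arbitrary lane takes a short
// dance: shuffle the other lanes out of the way, shift the vector left one
// lane so lane 0 is vacant, OR in the new value, then shuffle everything
// back into order. Lane 0 needs only a masked OR, and lane 3 needs no
// pre-shuffle.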
impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, NoS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u32 {
        self.to_lanes()[i as usize]
    }
    #[inline(always)]
    fn insert(self, v: u32, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => {
                    let x = _mm_andnot_si128(_mm_cvtsi32_si128(-1), self.x);
                    _mm_or_si128(x, _mm_cvtsi32_si128(v as i32))
                }
                1 => {
                    let mut x = _mm_shuffle_epi32(self.x, 0b0111_1000);
                    x = _mm_slli_si128(x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b1110_0001)
                }
                2 => {
                    let mut x = _mm_shuffle_epi32(self.x, 0b1011_0100);
                    x = _mm_slli_si128(x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b1100_1001)
                }
                3 => {
                    let mut x = _mm_slli_si128(self.x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b0011_1001)
                }
                _ => unreachable!(),
            }
        })
    }
}

impl<S3, S4, NI> LaneWords4 for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn shuffle_lane_words2301(self) -> Self {
        self.shuffle2301()
    }
    #[inline(always)]
    fn shuffle_lane_words1230(self) -> Self {
        self.shuffle1230()
    }
    #[inline(always)]
    fn shuffle_lane_words3012(self) -> Self {
        self.shuffle3012()
    }
}

impl<S3, S4, NI> Words4 for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b1001_0011) })
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0011_1001) })
    }
}

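// A u64x4 spans two XMM registers, so its lane shuffles must move data
// across registers. With SSSE3, `_mm_alignr_epi8` (palignr) extracts a
// byte-shifted window from the concatenation of two registers, producing
// each rotated half in one instruction; the plain-SSE2 impl that follows
// builds the same halves out of byte shifts and ORs.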
impl<S4, NI> Words4 for u64x4_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        unsafe {
            x2::new([
                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
            ])
        }
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        unsafe {
            x2::new([
                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
            ])
        }
    }
}
impl<S4, NI> Words4 for u64x4_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        unsafe {
            let a = _mm_srli_si128(self.0[0].x, 8);
            let b = _mm_slli_si128(self.0[0].x, 8);
            let c = _mm_srli_si128(self.0[1].x, 8);
            let d = _mm_slli_si128(self.0[1].x, 8);
            let da = _mm_or_si128(d, a);
            let bc = _mm_or_si128(b, c);
            x2::new([u64x2_sse2::new(da), u64x2_sse2::new(bc)])
        }
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        unsafe {
            let a = _mm_srli_si128(self.0[0].x, 8);
            let b = _mm_slli_si128(self.0[0].x, 8);
            let c = _mm_srli_si128(self.0[1].x, 8);
            let d = _mm_slli_si128(self.0[1].x, 8);
            let da = _mm_or_si128(d, a);
            let bc = _mm_or_si128(b, c);
            x2::new([u64x2_sse2::new(bc), u64x2_sse2::new(da)])
        }
    }
}

impl<S3, S4, NI> UnsafeFrom<[u64; 2]> for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    unsafe fn unsafe_from(xs: [u64; 2]) -> Self {
        Self::new(_mm_set_epi64x(xs[1] as i64, xs[0] as i64))
    }
}

impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                1 => _mm_extract_epi64(self.x, 1) as u64,
                _ => unreachable!(),
            }
        }
    }
    #[inline(always)]
    fn insert(self, x: u64, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi64(self.x, x as i64, 0),
                1 => _mm_insert_epi64(self.x, x as i64, 1),
                _ => unreachable!(),
            }
        })
    }
}
impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                1 => _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b1110_1110)) as u64,
                _ => unreachable!(),
            }
        }
    }
    #[inline(always)]
    fn insert(self, x: u64, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_or_si128(
                    _mm_andnot_si128(_mm_cvtsi64_si128(-1), self.x),
                    _mm_cvtsi64_si128(x as i64),
                ),
                1 => _mm_or_si128(
                    _mm_move_epi64(self.x),
                    _mm_slli_si128(_mm_cvtsi64_si128(x as i64), 8),
                ),
                _ => unreachable!(),
            }
        })
    }
}

impl<S4, NI> BSwap for u32x4_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
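// SSE2 fallback for 32-bit byte-swapping without `pshufb`: zero-extend the
// bytes of each half to 16 bits, reverse the four 16-bit units within each
// half with `pshuflw`/`pshufhw`, then pack the halves back down to bytes.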
#[inline(always)]
fn bswap32_s2(x: __m128i) -> __m128i {
    unsafe {
        let mut y = _mm_unpacklo_epi8(x, _mm_setzero_si128());
        y = _mm_shufflehi_epi16(y, 0b0001_1011);
        y = _mm_shufflelo_epi16(y, 0b0001_1011);
        let mut z = _mm_unpackhi_epi8(x, _mm_setzero_si128());
        z = _mm_shufflehi_epi16(z, 0b0001_1011);
        z = _mm_shufflelo_epi16(z, 0b0001_1011);
        _mm_packus_epi16(y, z)
    }
}
impl<S4, NI> BSwap for u32x4_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(bswap32_s2(self.x))
    }
}

impl<S4, NI> BSwap for u64x2_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
impl<S4, NI> BSwap for u64x2_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe { bswap32_s2(_mm_shuffle_epi32(self.x, 0b1011_0001)) })
    }
}

impl<S4, NI> BSwap for u128x1_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            // Reverse all 16 bytes of the single 128-bit word.
            let k = _mm_set_epi64x(0x0001_0203_0405_0607, 0x0809_0a0b_0c0d_0e0f);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
impl<S4, NI> BSwap for u128x1_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        unimplemented!()
    }
}

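// Swaps below byte granularity use the masked-shift identity
// swap(x) = ((x & k) >> i) | ((x << i) & k), where k selects the high half
// of each 2*i-bit group (0xaa, 0xcc, 0xf0 for i = 1, 2, 4). Byte-level and
// larger swaps are done as shuffles instead, per level below.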
macro_rules! swapi {
    ($x:expr, $i:expr, $k:expr) => {
        unsafe {
            const K: u8 = $k;
            let k = _mm_set1_epi8(K as i8);
            u128x1_sse2::new(_mm_or_si128(
                _mm_srli_epi16(_mm_and_si128($x.x, k), $i),
                _mm_and_si128(_mm_slli_epi16($x.x, $i), k),
            ))
        }
    };
}
#[inline(always)]
fn swap16_s2(x: __m128i) -> __m128i {
    unsafe { _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0b1011_0001), 0b1011_0001) }
}
impl<S4, NI> Swap64 for u128x1_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn swap1(self) -> Self {
        swapi!(self, 1, 0xaa)
    }
    #[inline(always)]
    fn swap2(self) -> Self {
        swapi!(self, 2, 0xcc)
    }
    #[inline(always)]
    fn swap4(self) -> Self {
        swapi!(self, 4, 0xf0)
    }
    #[inline(always)]
    fn swap8(self) -> Self {
        u128x1_sse2::new(unsafe {
            let k = _mm_set_epi64x(0x0e0f_0c0d_0a0b_0809, 0x0607_0405_0203_0001);
            _mm_shuffle_epi8(self.x, k)
        })
    }
    #[inline(always)]
    fn swap16(self) -> Self {
        u128x1_sse2::new(unsafe {
            let k = _mm_set_epi64x(0x0d0c_0f0e_0908_0b0a, 0x0504_0706_0100_0302);
            _mm_shuffle_epi8(self.x, k)
        })
    }
    #[inline(always)]
    fn swap32(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
    #[inline(always)]
    fn swap64(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
}
impl<S4, NI> Swap64 for u128x1_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn swap1(self) -> Self {
        swapi!(self, 1, 0xaa)
    }
    #[inline(always)]
    fn swap2(self) -> Self {
        swapi!(self, 2, 0xcc)
    }
    #[inline(always)]
    fn swap4(self) -> Self {
        swapi!(self, 4, 0xf0)
    }
    #[inline(always)]
    fn swap8(self) -> Self {
        u128x1_sse2::new(unsafe {
            _mm_or_si128(_mm_slli_epi16(self.x, 8), _mm_srli_epi16(self.x, 8))
        })
    }
    #[inline(always)]
    fn swap16(self) -> Self {
        u128x1_sse2::new(swap16_s2(self.x))
    }
    #[inline(always)]
    fn swap32(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
    #[inline(always)]
    fn swap64(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
}

#[derive(Copy, Clone)]
pub struct G0;
#[derive(Copy, Clone)]
pub struct G1;

#[allow(non_camel_case_types)]
pub type u32x4x2_sse2<S3, S4, NI> = x2<u32x4_sse2<S3, S4, NI>, G0>;
#[allow(non_camel_case_types)]
pub type u64x2x2_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G0>;
#[allow(non_camel_case_types)]
pub type u64x4_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G1>;
#[allow(non_camel_case_types)]
pub type u128x2_sse2<S3, S4, NI> = x2<u128x1_sse2<S3, S4, NI>, G0>;

#[allow(non_camel_case_types)]
pub type u32x4x4_sse2<S3, S4, NI> = x4<u32x4_sse2<S3, S4, NI>>;
#[allow(non_camel_case_types)]
pub type u64x2x4_sse2<S3, S4, NI> = x4<u64x2_sse2<S3, S4, NI>>;
#[allow(non_camel_case_types)]
pub type u128x4_sse2<S3, S4, NI> = x4<u128x1_sse2<S3, S4, NI>>;
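
// The wider vectors above are "soft" compositions: an x2 or x4 array of
// 128-bit vectors. The G0/G1 markers keep otherwise-identical x2
// compositions apart, so that e.g. u64x4 (G1) can carry Words4 shuffle
// impls that u64x2x2 (G0) does not.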

impl<S3, S4, NI> Vector<[u32; 16]> for u32x4x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn to_scalars(self) -> [u32; 16] {
        transmute!(self)
    }
}

impl<S3: Copy, S4: Copy, NI: Copy> u32x4x2<Machine86<S3, S4, NI>> for u32x4x2_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u32x4x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 2]>,
    u32x4x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u32x4>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2x2<Machine86<S3, S4, NI>> for u64x2x2_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x2x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 2]>,
    u64x2x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x4<Machine86<S3, S4, NI>> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x4_sse2<S3, S4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x2<Machine86<S3, S4, NI>> for u128x2_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 2]>,
    u128x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u128x1>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x2>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x2>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x4>,
{
}

impl<NI: Copy> u32x4x2<Avx2Machine<NI>> for u32x4x2_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u32x4x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 2]>,
    u32x4x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u32x4>,
{
}
impl<NI: Copy> u64x2x2<Avx2Machine<NI>> for u64x2x2_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x2x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 2]>,
    u64x2x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u64x2>,
{
}
impl<NI: Copy> u64x4<Avx2Machine<NI>> for u64x4_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x4_sse2<YesS3, YesS4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
{
}
impl<NI: Copy> u128x2<Avx2Machine<NI>> for u128x2_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
    Avx2Machine<NI>: Machine,
    u128x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 2]>,
    u128x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u128x1>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x2>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x2>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x4>,
{
}

impl<S3, S4, NI> Vec4<u64> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: Copy + Vec2<u64>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        match i {
            0 => self.0[0].extract(0),
            1 => self.0[0].extract(1),
            2 => self.0[1].extract(0),
            3 => self.0[1].extract(1),
            _ => panic!(),
        }
    }
    #[inline(always)]
    fn insert(mut self, w: u64, i: u32) -> Self {
        match i {
            0 => self.0[0] = self.0[0].insert(w, 0),
            1 => self.0[0] = self.0[0].insert(w, 1),
            2 => self.0[1] = self.0[1].insert(w, 0),
            3 => self.0[1] = self.0[1].insert(w, 1),
            _ => panic!(),
        };
        self
    }
}

impl<S3: Copy, S4: Copy, NI: Copy> u32x4x4<Machine86<S3, S4, NI>> for u32x4x4_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u32x4x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 4]>,
    u32x4x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u32x4>,
    u32x4x4_sse2<S3, S4, NI>: Vec4Ext<<Machine86<S3, S4, NI> as Machine>::u32x4>,
    u32x4x4_sse2<S3, S4, NI>: Vector<[u32; 16]>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2x4<Machine86<S3, S4, NI>> for u64x2x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x2x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 4]>,
    u64x2x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x4<Machine86<S3, S4, NI>> for u128x4_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 4]>,
    u128x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u128x1>,
    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x4>,
    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x4>,
{
}

impl<NI: Copy> u64x2x4<Avx2Machine<NI>> for u64x2x4_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x2x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 4]>,
    u64x2x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u64x2>,
{
}
impl<NI: Copy> u128x4<Avx2Machine<NI>> for u128x4_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
    Avx2Machine<NI>: Machine,
    u128x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 4]>,
    u128x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u128x1>,
    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x4>,
    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x4>,
{
}

macro_rules! impl_into_x {
    ($from:ident, $to:ident) => {
        impl<S3: Copy, S4: Copy, NI: Copy, Gf, Gt> From<x2<$from<S3, S4, NI>, Gf>>
            for x2<$to<S3, S4, NI>, Gt>
        {
            #[inline(always)]
            fn from(x: x2<$from<S3, S4, NI>, Gf>) -> Self {
                x2::new([$to::from(x.0[0]), $to::from(x.0[1])])
            }
        }
        impl<S3: Copy, S4: Copy, NI: Copy> From<x4<$from<S3, S4, NI>>> for x4<$to<S3, S4, NI>> {
            #[inline(always)]
            fn from(x: x4<$from<S3, S4, NI>>) -> Self {
                x4::new([
                    $to::from(x.0[0]),
                    $to::from(x.0[1]),
                    $to::from(x.0[2]),
                    $to::from(x.0[3]),
                ])
            }
        }
    };
}
impl_into_x!(u128x1_sse2, u64x2_sse2);
impl_into_x!(u128x1_sse2, u32x4_sse2);

use core::fmt::{Debug, Formatter, Result};

impl<W: PartialEq, G> PartialEq for x2<W, G> {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        self.0[0] == rhs.0[0] && self.0[1] == rhs.0[1]
    }
}

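// 128-bit equality reductions: compare lanewise, then require every
// comparison lane to be all-ones. `eq128_s4` needs SSE4.1's `pcmpeqq`;
// `eq128_s2` sticks to `pcmpeqd` and ANDs the two 64-bit halves together.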
#[allow(unused)]
#[inline(always)]
unsafe fn eq128_s4(x: __m128i, y: __m128i) -> bool {
    let q = _mm_shuffle_epi32(_mm_cmpeq_epi64(x, y), 0b1100_0110);
    _mm_cvtsi128_si64(q) == -1
}

#[inline(always)]
unsafe fn eq128_s2(x: __m128i, y: __m128i) -> bool {
    let q = _mm_cmpeq_epi32(x, y);
    let p = _mm_cvtsi128_si64(_mm_srli_si128(q, 8));
    let q = _mm_cvtsi128_si64(q);
    (p & q) == -1
}

impl<S3, S4, NI> PartialEq for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
impl<S3, S4, NI> Debug for u32x4_sse2<S3, S4, NI>
where
    Self: Copy + MultiLane<[u32; 4]>,
{
    #[cold]
    fn fmt(&self, fmt: &mut Formatter) -> Result {
        fmt.write_fmt(format_args!("{:08x?}", &self.to_lanes()))
    }
}

impl<S3, S4, NI> PartialEq for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
impl<S3, S4, NI> Debug for u64x2_sse2<S3, S4, NI>
where
    Self: Copy + MultiLane<[u64; 2]>,
{
    #[cold]
    fn fmt(&self, fmt: &mut Formatter) -> Result {
        fmt.write_fmt(format_args!("{:016x?}", &self.to_lanes()))
    }
}

impl<S3, S4, NI> Debug for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: Copy + MultiLane<[u64; 2]>,
{
    #[cold]
    fn fmt(&self, fmt: &mut Formatter) -> Result {
        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
        fmt.write_fmt(format_args!("{:016x?}", &[a[0], a[1], b[0], b[1]]))
    }
}

#[cfg(test)]
#[cfg(target_arch = "x86_64")]
mod test {
    use super::*;
    use crate::x86_64::{SSE2, SSE41, SSSE3};
    use crate::Machine;

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_bswap32_s2_vs_s3() {
        let xs = [0x0f0e_0d0c, 0x0b0a_0908, 0x0706_0504, 0x0302_0100];
        let ys = [0x0c0d_0e0f, 0x0809_0a0b, 0x0405_0607, 0x0001_0203];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.bswap()
        };

        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.bswap()
        };

        assert_eq!(x_s2, transmute!(x_s3));
        assert_eq!(x_s2, s2.vec(ys));
    }

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_bswap64_s2_vs_s3() {
        let xs = [0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100];
        let ys = [0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
            x_s2.bswap()
        };

        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
            x_s3.bswap()
        };

        assert_eq!(x_s2, s2.vec(ys));
        assert_eq!(x_s2, transmute!(x_s3));
    }

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_shuffle32_s2_vs_s3() {
        let xs = [0x0, 0x1, 0x2, 0x3];
        let ys = [0x2, 0x3, 0x0, 0x1];
        let zs = [0x1, 0x2, 0x3, 0x0];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.shuffle2301()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.shuffle2301()
        };
        assert_eq!(x_s2, s2.vec(ys));
        assert_eq!(x_s2, transmute!(x_s3));

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.shuffle3012()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.shuffle3012()
        };
        assert_eq!(x_s2, s2.vec(zs));
        assert_eq!(x_s2, transmute!(x_s3));

        let x_s2 = x_s2.shuffle1230();
        let x_s3 = x_s3.shuffle1230();
        assert_eq!(x_s2, s2.vec(xs));
        assert_eq!(x_s2, transmute!(x_s3));
    }

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_shuffle64_s2_vs_s3() {
        let xs = [0x0, 0x1, 0x2, 0x3];
        let ys = [0x2, 0x3, 0x0, 0x1];
        let zs = [0x1, 0x2, 0x3, 0x0];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
            x_s2.shuffle2301()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
            x_s3.shuffle2301()
        };
        assert_eq!(x_s2, s2.vec(ys));
        assert_eq!(x_s2, transmute!(x_s3));

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
            x_s2.shuffle3012()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
            x_s3.shuffle3012()
        };
        assert_eq!(x_s2, s2.vec(zs));
        assert_eq!(x_s2, transmute!(x_s3));

        let x_s2 = x_s2.shuffle1230();
        let x_s3 = x_s3.shuffle1230();
        assert_eq!(x_s2, s2.vec(xs));
        assert_eq!(x_s2, transmute!(x_s3));
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_lanes_u32x4() {
        let xs = [0x1, 0x2, 0x3, 0x4];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };
        let s4 = unsafe { SSE41::instance() };

        {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            let y_s2 = <SSE2 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s2, y_s2);
            assert_eq!(xs, y_s2.to_lanes());
        }

        {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            let y_s3 = <SSSE3 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s3, y_s3);
            assert_eq!(xs, y_s3.to_lanes());
        }

        {
            let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
            let y_s4 = <SSE41 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s4, y_s4);
            assert_eq!(xs, y_s4.to_lanes());
        }
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_lanes_u64x2() {
        let xs = [0x1, 0x2];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };
        let s4 = unsafe { SSE41::instance() };

        {
            let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
            let y_s2 = <SSE2 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s2, y_s2);
            assert_eq!(xs, y_s2.to_lanes());
        }

        {
            let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
            let y_s3 = <SSSE3 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s3, y_s3);
            assert_eq!(xs, y_s3.to_lanes());
        }

        {
            let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
            let y_s4 = <SSE41 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s4, y_s4);
            assert_eq!(xs, y_s4.to_lanes());
        }
    }

    #[test]
    fn test_vec4_u32x4_s2() {
        let xs = [1, 2, 3, 4];
        let s2 = unsafe { SSE2::instance() };
        let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
        assert_eq!(x_s2.extract(0), 1);
        assert_eq!(x_s2.extract(1), 2);
        assert_eq!(x_s2.extract(2), 3);
        assert_eq!(x_s2.extract(3), 4);
        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2, 3, 4]));
        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf, 3, 4]));
        assert_eq!(x_s2.insert(0xf, 2), s2.vec([1, 2, 0xf, 4]));
        assert_eq!(x_s2.insert(0xf, 3), s2.vec([1, 2, 3, 0xf]));
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_vec4_u32x4_s4() {
        let xs = [1, 2, 3, 4];
        let s4 = unsafe { SSE41::instance() };
        let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
        assert_eq!(x_s4.extract(0), 1);
        assert_eq!(x_s4.extract(1), 2);
        assert_eq!(x_s4.extract(2), 3);
        assert_eq!(x_s4.extract(3), 4);
        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2, 3, 4]));
        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf, 3, 4]));
        assert_eq!(x_s4.insert(0xf, 2), s4.vec([1, 2, 0xf, 4]));
        assert_eq!(x_s4.insert(0xf, 3), s4.vec([1, 2, 3, 0xf]));
    }

    #[test]
    fn test_vec2_u64x2_s2() {
        let xs = [0x1, 0x2];
        let s2 = unsafe { SSE2::instance() };
        let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
        assert_eq!(x_s2.extract(0), 1);
        assert_eq!(x_s2.extract(1), 2);
        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2]));
        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf]));
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_vec2_u64x2_s4() {
        let xs = [0x1, 0x2];
        let s4 = unsafe { SSE41::instance() };
        let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
        assert_eq!(x_s4.extract(0), 1);
        assert_eq!(x_s4.extract(1), 2);
        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2]));
        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf]));
    }
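
    // Sanity check for the composite u64x4 path (two u64x2 halves): extract
    // and insert should behave like the u32x4/u64x2 cases above. Plain SSE2
    // is assumed, mirroring test_vec2_u64x2_s2.
    #[test]
    fn test_vec4_u64x4_s2() {
        let xs = [0x1, 0x2, 0x3, 0x4];
        let s2 = unsafe { SSE2::instance() };
        let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
        assert_eq!(x_s2.extract(0), 1);
        assert_eq!(x_s2.extract(3), 4);
        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2, 3, 4]));
        assert_eq!(x_s2.insert(0xf, 3), s2.vec([1, 2, 3, 0xf]));
    }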
}

pub mod avx2 {
    #![allow(non_camel_case_types)]
    use crate::soft::{x2, x4};
    use crate::types::*;
    use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2, G0};
    use crate::x86_64::{vec256_storage, vec512_storage, Avx2Machine, YesS3, YesS4};
    use core::arch::x86_64::*;
    use core::marker::PhantomData;
    use core::ops::*;
    use zerocopy::transmute;

    zerocopy::cryptocorrosion_derive_traits! {
        #[repr(transparent)]
        #[derive(Copy, Clone)]
        pub struct u32x4x2_avx2<NI> {
            x: __m256i,
            ni: PhantomData<NI>,
        }
    }

    impl<NI> u32x4x2_avx2<NI> {
        #[inline(always)]
        fn new(x: __m256i) -> Self {
            Self { x, ni: PhantomData }
        }
    }

    impl<NI> u32x4x2<Avx2Machine<NI>> for u32x4x2_avx2<NI> where NI: Copy {}
    impl<NI> Store<vec256_storage> for u32x4x2_avx2<NI> {
        #[inline(always)]
        unsafe fn unpack(p: vec256_storage) -> Self {
            Self::new(p.avx)
        }
    }
    impl<NI> StoreBytes for u32x4x2_avx2<NI> {
        #[inline(always)]
        unsafe fn unsafe_read_le(input: &[u8]) -> Self {
            assert_eq!(input.len(), 32);
            Self::new(_mm256_loadu_si256(input.as_ptr() as *const _))
        }
        #[inline(always)]
        unsafe fn unsafe_read_be(input: &[u8]) -> Self {
            Self::unsafe_read_le(input).bswap()
        }
        #[inline(always)]
        fn write_le(self, out: &mut [u8]) {
            unsafe {
                assert_eq!(out.len(), 32);
                _mm256_storeu_si256(out.as_mut_ptr() as *mut _, self.x)
            }
        }
        #[inline(always)]
        fn write_be(self, out: &mut [u8]) {
            self.bswap().write_le(out)
        }
    }
    impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 2]> for u32x4x2_avx2<NI> {
        #[inline(always)]
        fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 2] {
            unsafe {
                [
                    u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)),
                    u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)),
                ]
            }
        }
        #[inline(always)]
        fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 2]) -> Self {
            Self::new(unsafe { _mm256_setr_m128i(x[0].x, x[1].x) })
        }
    }
    impl<NI> Vec2<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x2_avx2<NI> {
        #[inline(always)]
        fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
            unsafe {
                match i {
                    0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)),
                    1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)),
                    _ => panic!(),
                }
            }
        }
        #[inline(always)]
        fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
            Self::new(unsafe {
                match i {
                    0 => _mm256_inserti128_si256(self.x, w.x, 0),
                    1 => _mm256_inserti128_si256(self.x, w.x, 1),
                    _ => panic!(),
                }
            })
        }
    }
    impl<NI> BitOps32 for u32x4x2_avx2<NI> where NI: Copy {}
    impl<NI> ArithOps for u32x4x2_avx2<NI> where NI: Copy {}
    macro_rules! shuf_lane_bytes {
        ($name:ident, $k0:expr, $k1:expr) => {
            #[inline(always)]
            fn $name(self) -> Self {
                Self::new(unsafe {
                    _mm256_shuffle_epi8(self.x, _mm256_set_epi64x($k0, $k1, $k0, $k1))
                })
            }
        };
    }
    macro_rules! rotr_32 {
        ($name:ident, $i:expr) => {
            #[inline(always)]
            fn $name(self) -> Self {
                Self::new(unsafe {
                    _mm256_or_si256(
                        _mm256_srli_epi32(self.x, $i as i32),
                        _mm256_slli_epi32(self.x, 32 - $i as i32),
                    )
                })
            }
        };
    }
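    // `vpshufb` shuffles bytes only within each 128-bit lane, so
    // `shuf_lane_bytes` repeats the same 16-byte pattern ($k0, $k1) in both
    // halves of the 256-bit control; the rotation constants used below match
    // the SSSE3 ones earlier in this file.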
    impl<NI: Copy> RotateEachWord32 for u32x4x2_avx2<NI> {
        rotr_32!(rotate_each_word_right7, 7);
        shuf_lane_bytes!(
            rotate_each_word_right8,
            0x0c0f_0e0d_080b_0a09,
            0x0407_0605_0003_0201
        );
        rotr_32!(rotate_each_word_right11, 11);
        rotr_32!(rotate_each_word_right12, 12);
        shuf_lane_bytes!(
            rotate_each_word_right16,
            0x0d0c_0f0e_0908_0b0a,
            0x0504_0706_0100_0302
        );
        rotr_32!(rotate_each_word_right20, 20);
        shuf_lane_bytes!(
            rotate_each_word_right24,
            0x0e0d_0c0f_0a09_080b,
            0x0605_0407_0201_0003
        );
        rotr_32!(rotate_each_word_right25, 25);
    }
    impl<NI> BitOps0 for u32x4x2_avx2<NI> where NI: Copy {}
    impl<NI> From<u32x4x2_avx2<NI>> for vec256_storage {
        #[inline(always)]
        fn from(x: u32x4x2_avx2<NI>) -> Self {
            Self { avx: x.x }
        }
    }

    macro_rules! impl_assign {
        ($vec:ident, $Assign:ident, $assign_fn:ident, $bin_fn:ident) => {
            impl<NI> $Assign for $vec<NI>
            where
                NI: Copy,
            {
                #[inline(always)]
                fn $assign_fn(&mut self, rhs: Self) {
                    *self = self.$bin_fn(rhs);
                }
            }
        };
    }
    impl_assign!(u32x4x2_avx2, BitXorAssign, bitxor_assign, bitxor);
    impl_assign!(u32x4x2_avx2, BitOrAssign, bitor_assign, bitor);
    impl_assign!(u32x4x2_avx2, BitAndAssign, bitand_assign, bitand);
    impl_assign!(u32x4x2_avx2, AddAssign, add_assign, add);

    macro_rules! impl_bitop {
        ($vec:ident, $Op:ident, $op_fn:ident, $impl_fn:ident) => {
            impl<NI> $Op for $vec<NI> {
                type Output = Self;
                #[inline(always)]
                fn $op_fn(self, rhs: Self) -> Self::Output {
                    Self::new(unsafe { $impl_fn(self.x, rhs.x) })
                }
            }
        };
    }
    impl_bitop!(u32x4x2_avx2, BitXor, bitxor, _mm256_xor_si256);
    impl_bitop!(u32x4x2_avx2, BitOr, bitor, _mm256_or_si256);
    impl_bitop!(u32x4x2_avx2, BitAnd, bitand, _mm256_and_si256);
    impl_bitop!(u32x4x2_avx2, AndNot, andnot, _mm256_andnot_si256);
    impl_bitop!(u32x4x2_avx2, Add, add, _mm256_add_epi32);

    impl<NI> Not for u32x4x2_avx2<NI> {
        type Output = Self;
        #[inline(always)]
        fn not(self) -> Self::Output {
            unsafe {
                // XOR against an all-ones mask.
                let f = _mm256_set1_epi64x(-1);
                Self::new(f) ^ self
            }
        }
    }

    impl<NI> BSwap for u32x4x2_avx2<NI> {
        shuf_lane_bytes!(bswap, 0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
    }

    impl<NI> From<x2<u128x1_sse2<YesS3, YesS4, NI>, G0>> for u32x4x2_avx2<NI>
    where
        NI: Copy,
    {
        #[inline(always)]
        fn from(x: x2<u128x1_sse2<YesS3, YesS4, NI>, G0>) -> Self {
            Self::new(unsafe { _mm256_setr_m128i(x.0[0].x, x.0[1].x) })
        }
    }

    impl<NI> LaneWords4 for u32x4x2_avx2<NI> {
        #[inline(always)]
        fn shuffle_lane_words1230(self) -> Self {
            Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b1001_0011) })
        }
        #[inline(always)]
        fn shuffle_lane_words2301(self) -> Self {
            Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0100_1110) })
        }
        #[inline(always)]
        fn shuffle_lane_words3012(self) -> Self {
            Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0011_1001) })
        }
    }

    pub type u32x4x4_avx2<NI> = x2<u32x4x2_avx2<NI>, G0>;
    impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> {}

    impl<NI: Copy> Store<vec512_storage> for u32x4x4_avx2<NI> {
        #[inline(always)]
        unsafe fn unpack(p: vec512_storage) -> Self {
            Self::new([
                u32x4x2_avx2::unpack(p.avx[0]),
                u32x4x2_avx2::unpack(p.avx[1]),
            ])
        }
    }
    impl<NI: Copy> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> {
        #[inline(always)]
        fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] {
            let [a, b] = self.0[0].to_lanes();
            let [c, d] = self.0[1].to_lanes();
            [a, b, c, d]
        }
        #[inline(always)]
        fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self {
            let ab = u32x4x2_avx2::from_lanes([x[0], x[1]]);
            let cd = u32x4x2_avx2::from_lanes([x[2], x[3]]);
            Self::new([ab, cd])
        }
    }
    impl<NI: Copy> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
        #[inline(always)]
        fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
            match i {
                0 => self.0[0].extract(0),
                1 => self.0[0].extract(1),
                2 => self.0[1].extract(0),
                3 => self.0[1].extract(1),
                _ => panic!(),
            }
        }
        #[inline(always)]
        fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
            Self::new(match i {
                0 | 1 => [self.0[0].insert(w, i), self.0[1]],
                2 | 3 => [self.0[0], self.0[1].insert(w, i - 2)],
                _ => panic!(),
            })
        }
    }
    impl<NI: Copy> Vec4Ext<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
        #[inline(always)]
        fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) {
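            // Each input holds four 128-bit rows: a = [a0 a1 a2 a3], etc.
            // The outputs are the transposed rows [a0 b0 c0 d0],
            // [a1 b1 c1 d1], [a2 b2 c2 d2], [a3 b3 c3 d3]. `vperm2i128`
            // control 0x20 takes the low 128-bit lanes of its two operands
            // and 0x31 the high lanes, pairing rows from a/b and from c/d.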
            unsafe {
                let ab00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x20));
                let ab01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x31));
                let ab10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x20));
                let ab11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x31));
                let cd00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x20));
                let cd01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x31));
                let cd10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x20));
                let cd11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x31));
                (
                    Self::new([ab00, cd00]),
                    Self::new([ab01, cd01]),
                    Self::new([ab10, cd10]),
                    Self::new([ab11, cd11]),
                )
            }
        }
    }
    impl<NI: Copy> Vector<[u32; 16]> for u32x4x4_avx2<NI> {
        #[inline(always)]
        fn to_scalars(self) -> [u32; 16] {
            transmute!(self)
        }
    }
    impl<NI: Copy> From<u32x4x4_avx2<NI>> for vec512_storage {
        #[inline(always)]
        fn from(x: u32x4x4_avx2<NI>) -> Self {
            Self {
                avx: [
                    vec256_storage { avx: x.0[0].x },
                    vec256_storage { avx: x.0[1].x },
                ],
            }
        }
    }
    impl<NI: Copy> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI> {
        #[inline(always)]
        fn from(x: x4<u128x1_sse2<YesS3, YesS4, NI>>) -> Self {
            Self::new(unsafe {
                [
                    u32x4x2_avx2::new(_mm256_setr_m128i(x.0[0].x, x.0[1].x)),
                    u32x4x2_avx2::new(_mm256_setr_m128i(x.0[2].x, x.0[3].x)),
                ]
            })
        }
    }
}