blake3/
platform.rs

1use crate::{portable, CVWords, IncrementCounter, BLOCK_LEN};
2use arrayref::{array_mut_ref, array_ref};
3
4cfg_if::cfg_if! {
5    if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
6        cfg_if::cfg_if! {
7            if #[cfg(blake3_avx512_ffi)] {
8                pub const MAX_SIMD_DEGREE: usize = 16;
9            } else {
10                pub const MAX_SIMD_DEGREE: usize = 8;
11            }
12        }
13    } else if #[cfg(blake3_neon)] {
14        pub const MAX_SIMD_DEGREE: usize = 4;
15    } else if #[cfg(blake3_wasm32_simd)] {
16        pub const MAX_SIMD_DEGREE: usize = 4;
17    } else {
18        pub const MAX_SIMD_DEGREE: usize = 1;
19    }
20}
21
22// There are some places where we want a static size that's equal to the
23// MAX_SIMD_DEGREE, but also at least 2. Constant contexts aren't currently
24// allowed to use cmp::max, so we have to hardcode this additional constant
25// value. Get rid of this once cmp::max is a const fn.
26cfg_if::cfg_if! {
27    if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
28        cfg_if::cfg_if! {
29            if #[cfg(blake3_avx512_ffi)] {
30                pub const MAX_SIMD_DEGREE_OR_2: usize = 16;
31            } else {
32                pub const MAX_SIMD_DEGREE_OR_2: usize = 8;
33            }
34        }
35    } else if #[cfg(blake3_neon)] {
36        pub const MAX_SIMD_DEGREE_OR_2: usize = 4;
37    } else if #[cfg(blake3_wasm32_simd)] {
38        pub const MAX_SIMD_DEGREE_OR_2: usize = 4;
39    } else {
40        pub const MAX_SIMD_DEGREE_OR_2: usize = 2;
41    }
42}
43
/// The hashing backends this build can dispatch to.
///
/// `Portable` always exists. Every other variant is compiled in only when
/// its `cfg` condition holds, so `match` arms over this enum must carry the
/// same `cfg` attributes as the variants they name.
#[derive(Debug, Clone, Copy)]
pub enum Platform {
    /// Backed by the `portable` module.
    Portable,
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    SSE2,
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    SSE41,
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    AVX2,
    #[cfg(all(blake3_avx512_ffi, any(target_arch = "x86", target_arch = "x86_64")))]
    AVX512,
    #[cfg(blake3_neon)]
    NEON,
    #[cfg(blake3_wasm32_simd)]
    #[allow(non_camel_case_types)]
    WASM32_SIMD,
}
62
63impl Platform {
64    #[allow(unreachable_code)]
65    pub fn detect() -> Self {
66        #[cfg(miri)]
67        {
68            return Platform::Portable;
69        }
70
71        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
72        {
73            #[cfg(blake3_avx512_ffi)]
74            {
75                if avx512_detected() {
76                    return Platform::AVX512;
77                }
78            }
79            if avx2_detected() {
80                return Platform::AVX2;
81            }
82            if sse41_detected() {
83                return Platform::SSE41;
84            }
85            if sse2_detected() {
86                return Platform::SSE2;
87            }
88        }
89        // We don't use dynamic feature detection for NEON. If the "neon"
90        // feature is on, NEON is assumed to be supported.
91        #[cfg(blake3_neon)]
92        {
93            return Platform::NEON;
94        }
95        #[cfg(blake3_wasm32_simd)]
96        {
97            return Platform::WASM32_SIMD;
98        }
99        Platform::Portable
100    }
101
102    pub fn simd_degree(&self) -> usize {
103        let degree = match self {
104            Platform::Portable => 1,
105            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
106            Platform::SSE2 => 4,
107            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
108            Platform::SSE41 => 4,
109            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
110            Platform::AVX2 => 8,
111            #[cfg(blake3_avx512_ffi)]
112            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
113            Platform::AVX512 => 16,
114            #[cfg(blake3_neon)]
115            Platform::NEON => 4,
116            #[cfg(blake3_wasm32_simd)]
117            Platform::WASM32_SIMD => 4,
118        };
119        debug_assert!(degree <= MAX_SIMD_DEGREE);
120        degree
121    }
122
123    pub fn compress_in_place(
124        &self,
125        cv: &mut CVWords,
126        block: &[u8; BLOCK_LEN],
127        block_len: u8,
128        counter: u64,
129        flags: u8,
130    ) {
131        match self {
132            Platform::Portable => portable::compress_in_place(cv, block, block_len, counter, flags),
133            // Safe because detect() checked for platform support.
134            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
135            Platform::SSE2 => unsafe {
136                crate::sse2::compress_in_place(cv, block, block_len, counter, flags)
137            },
138            // Safe because detect() checked for platform support.
139            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
140            Platform::SSE41 | Platform::AVX2 => unsafe {
141                crate::sse41::compress_in_place(cv, block, block_len, counter, flags)
142            },
143            // Safe because detect() checked for platform support.
144            #[cfg(blake3_avx512_ffi)]
145            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
146            Platform::AVX512 => unsafe {
147                crate::avx512::compress_in_place(cv, block, block_len, counter, flags)
148            },
149            // No NEON compress_in_place() implementation yet.
150            #[cfg(blake3_neon)]
151            Platform::NEON => portable::compress_in_place(cv, block, block_len, counter, flags),
152            #[cfg(blake3_wasm32_simd)]
153            Platform::WASM32_SIMD => {
154                crate::wasm32_simd::compress_in_place(cv, block, block_len, counter, flags)
155            }
156        }
157    }
158
159    pub fn compress_xof(
160        &self,
161        cv: &CVWords,
162        block: &[u8; BLOCK_LEN],
163        block_len: u8,
164        counter: u64,
165        flags: u8,
166    ) -> [u8; 64] {
167        match self {
168            Platform::Portable => portable::compress_xof(cv, block, block_len, counter, flags),
169            // Safe because detect() checked for platform support.
170            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
171            Platform::SSE2 => unsafe {
172                crate::sse2::compress_xof(cv, block, block_len, counter, flags)
173            },
174            // Safe because detect() checked for platform support.
175            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
176            Platform::SSE41 | Platform::AVX2 => unsafe {
177                crate::sse41::compress_xof(cv, block, block_len, counter, flags)
178            },
179            // Safe because detect() checked for platform support.
180            #[cfg(blake3_avx512_ffi)]
181            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
182            Platform::AVX512 => unsafe {
183                crate::avx512::compress_xof(cv, block, block_len, counter, flags)
184            },
185            // No NEON compress_xof() implementation yet.
186            #[cfg(blake3_neon)]
187            Platform::NEON => portable::compress_xof(cv, block, block_len, counter, flags),
188            #[cfg(blake3_wasm32_simd)]
189            Platform::WASM32_SIMD => {
190                crate::wasm32_simd::compress_xof(cv, block, block_len, counter, flags)
191            }
192        }
193    }
194
195    // IMPLEMENTATION NOTE
196    // ===================
197    // hash_many() applies two optimizations. The critically important
198    // optimization is the high-performance parallel SIMD hashing mode,
199    // described in detail in the spec. This more than doubles throughput per
200    // thread. Another optimization is keeping the state vectors transposed
201    // from block to block within a chunk. When state vectors are transposed
202    // after every block, there's a small but measurable performance loss.
203    // Compressing chunks with a dedicated loop avoids this.
204
205    pub fn hash_many<const N: usize>(
206        &self,
207        inputs: &[&[u8; N]],
208        key: &CVWords,
209        counter: u64,
210        increment_counter: IncrementCounter,
211        flags: u8,
212        flags_start: u8,
213        flags_end: u8,
214        out: &mut [u8],
215    ) {
216        match self {
217            Platform::Portable => portable::hash_many(
218                inputs,
219                key,
220                counter,
221                increment_counter,
222                flags,
223                flags_start,
224                flags_end,
225                out,
226            ),
227            // Safe because detect() checked for platform support.
228            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
229            Platform::SSE2 => unsafe {
230                crate::sse2::hash_many(
231                    inputs,
232                    key,
233                    counter,
234                    increment_counter,
235                    flags,
236                    flags_start,
237                    flags_end,
238                    out,
239                )
240            },
241            // Safe because detect() checked for platform support.
242            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
243            Platform::SSE41 => unsafe {
244                crate::sse41::hash_many(
245                    inputs,
246                    key,
247                    counter,
248                    increment_counter,
249                    flags,
250                    flags_start,
251                    flags_end,
252                    out,
253                )
254            },
255            // Safe because detect() checked for platform support.
256            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
257            Platform::AVX2 => unsafe {
258                crate::avx2::hash_many(
259                    inputs,
260                    key,
261                    counter,
262                    increment_counter,
263                    flags,
264                    flags_start,
265                    flags_end,
266                    out,
267                )
268            },
269            // Safe because detect() checked for platform support.
270            #[cfg(blake3_avx512_ffi)]
271            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
272            Platform::AVX512 => unsafe {
273                crate::avx512::hash_many(
274                    inputs,
275                    key,
276                    counter,
277                    increment_counter,
278                    flags,
279                    flags_start,
280                    flags_end,
281                    out,
282                )
283            },
284            // Assumed to be safe if the "neon" feature is on.
285            #[cfg(blake3_neon)]
286            Platform::NEON => unsafe {
287                crate::neon::hash_many(
288                    inputs,
289                    key,
290                    counter,
291                    increment_counter,
292                    flags,
293                    flags_start,
294                    flags_end,
295                    out,
296                )
297            },
298            // Assumed to be safe if the "wasm32_simd" feature is on.
299            #[cfg(blake3_wasm32_simd)]
300            Platform::WASM32_SIMD => unsafe {
301                crate::wasm32_simd::hash_many(
302                    inputs,
303                    key,
304                    counter,
305                    increment_counter,
306                    flags,
307                    flags_start,
308                    flags_end,
309                    out,
310                )
311            },
312        }
313    }
314
315    pub fn xof_many(
316        &self,
317        cv: &CVWords,
318        block: &[u8; BLOCK_LEN],
319        block_len: u8,
320        mut counter: u64,
321        flags: u8,
322        out: &mut [u8],
323    ) {
324        debug_assert_eq!(0, out.len() % BLOCK_LEN, "whole blocks only");
325        if out.is_empty() {
326            // The current assembly implementation always outputs at least 1 block.
327            return;
328        }
329        match self {
330            // Safe because detect() checked for platform support.
331            #[cfg(blake3_avx512_ffi)]
332            #[cfg(unix)]
333            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
334            Platform::AVX512 => unsafe {
335                crate::avx512::xof_many(cv, block, block_len, counter, flags, out)
336            },
337            _ => {
338                // For platforms without an optimized xof_many, fall back to a loop over
339                // compress_xof. This is still faster than portable code.
340                for out_block in out.chunks_exact_mut(BLOCK_LEN) {
341                    // TODO: Use array_chunks_mut here once that's stable.
342                    let out_array: &mut [u8; BLOCK_LEN] = out_block.try_into().unwrap();
343                    *out_array = self.compress_xof(cv, block, block_len, counter, flags);
344                    counter += 1;
345                }
346            }
347        }
348    }
349
350    // Explicit platform constructors, for benchmarks.
351
352    pub fn portable() -> Self {
353        Self::Portable
354    }
355
356    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
357    pub fn sse2() -> Option<Self> {
358        if sse2_detected() {
359            Some(Self::SSE2)
360        } else {
361            None
362        }
363    }
364
365    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
366    pub fn sse41() -> Option<Self> {
367        if sse41_detected() {
368            Some(Self::SSE41)
369        } else {
370            None
371        }
372    }
373
374    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
375    pub fn avx2() -> Option<Self> {
376        if avx2_detected() {
377            Some(Self::AVX2)
378        } else {
379            None
380        }
381    }
382
383    #[cfg(blake3_avx512_ffi)]
384    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
385    pub fn avx512() -> Option<Self> {
386        if avx512_detected() {
387            Some(Self::AVX512)
388        } else {
389            None
390        }
391    }
392
393    #[cfg(blake3_neon)]
394    pub fn neon() -> Option<Self> {
395        // Assumed to be safe if the "neon" feature is on.
396        Some(Self::NEON)
397    }
398
399    #[cfg(blake3_wasm32_simd)]
400    pub fn wasm32_simd() -> Option<Self> {
401        // Assumed to be safe if the "wasm32_simd" feature is on.
402        Some(Self::WASM32_SIMD)
403    }
404}
405
// AVX-512 is divided into multiple feature sets. Two of them are used here,
// F and VL, and both must be present for this to report support.
#[cfg(blake3_avx512_ffi)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
#[allow(unreachable_code)]
pub fn avx512_detected() -> bool {
    // Never under Miri, and never when the testing-only "no_avx512" feature
    // is switched on.
    if cfg!(any(miri, feature = "no_avx512")) {
        return false;
    }
    // Static check, e.g. for building with target-cpu=native.
    if cfg!(all(target_feature = "avx512f", target_feature = "avx512vl")) {
        return true;
    }
    // Dynamic check, if std is enabled. Without std there is nothing left to
    // ask, so the answer is no.
    #[cfg(feature = "std")]
    {
        return is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl");
    }
    false
}
435
/// Reports whether AVX2 may be used.
///
/// Always `false` under Miri or with the testing-only "no_avx2" feature.
/// Otherwise `true` if AVX2 is enabled at compile time, or, when the "std"
/// feature is on, if runtime detection finds it.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
#[allow(unreachable_code)]
pub fn avx2_detected() -> bool {
    if cfg!(any(miri, feature = "no_avx2")) {
        return false;
    }
    // Static check, e.g. for building with target-cpu=native.
    if cfg!(target_feature = "avx2") {
        return true;
    }
    // Dynamic check, if std is enabled.
    #[cfg(feature = "std")]
    {
        return is_x86_feature_detected!("avx2");
    }
    false
}
462
/// Reports whether SSE4.1 may be used.
///
/// Always `false` under Miri or with the testing-only "no_sse41" feature.
/// Otherwise `true` if SSE4.1 is enabled at compile time, or, when the "std"
/// feature is on, if runtime detection finds it.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
#[allow(unreachable_code)]
pub fn sse41_detected() -> bool {
    if cfg!(any(miri, feature = "no_sse41")) {
        return false;
    }
    // Static check, e.g. for building with target-cpu=native.
    if cfg!(target_feature = "sse4.1") {
        return true;
    }
    // Dynamic check, if std is enabled.
    #[cfg(feature = "std")]
    {
        return is_x86_feature_detected!("sse4.1");
    }
    false
}
489
/// Reports whether SSE2 may be used.
///
/// Always `false` under Miri or with the testing-only "no_sse2" feature.
/// Otherwise `true` if SSE2 is enabled at compile time, or, when the "std"
/// feature is on, if runtime detection finds it.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
#[allow(unreachable_code)]
pub fn sse2_detected() -> bool {
    if cfg!(any(miri, feature = "no_sse2")) {
        return false;
    }
    // Static check, e.g. for building with target-cpu=native.
    if cfg!(target_feature = "sse2") {
        return true;
    }
    // Dynamic check, if std is enabled.
    #[cfg(feature = "std")]
    {
        return is_x86_feature_detected!("sse2");
    }
    false
}
516
/// Decodes 32 bytes into eight `u32` words, reading each consecutive group of
/// four bytes as one little-endian word.
#[inline(always)]
pub fn words_from_le_bytes_32(bytes: &[u8; 32]) -> [u32; 8] {
    let mut out = [0u32; 8];
    // 32 bytes split into exactly eight 4-byte chunks, one per output word.
    for (word, chunk) in out.iter_mut().zip(bytes.chunks_exact(4)) {
        *word = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
    }
    out
}
530
/// Decodes 64 bytes into sixteen `u32` words, reading each consecutive group
/// of four bytes as one little-endian word.
#[inline(always)]
pub fn words_from_le_bytes_64(bytes: &[u8; 64]) -> [u32; 16] {
    let mut out = [0u32; 16];
    // 64 bytes split into exactly sixteen 4-byte chunks, one per output word.
    for (word, chunk) in out.iter_mut().zip(bytes.chunks_exact(4)) {
        *word = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
    }
    out
}
552
/// Encodes eight `u32` words into 32 bytes, writing each word as four
/// little-endian bytes in order.
#[inline(always)]
pub fn le_bytes_from_words_32(words: &[u32; 8]) -> [u8; 32] {
    let mut out = [0u8; 32];
    // Each word fills exactly one 4-byte chunk of the output.
    for (chunk, word) in out.chunks_exact_mut(4).zip(words.iter()) {
        chunk.copy_from_slice(&word.to_le_bytes());
    }
    out
}
566
/// Encodes sixteen `u32` words into 64 bytes, writing each word as four
/// little-endian bytes in order.
#[inline(always)]
pub fn le_bytes_from_words_64(words: &[u32; 16]) -> [u8; 64] {
    let mut out = [0u8; 64];
    // Each word fills exactly one 4-byte chunk of the output.
    for (chunk, word) in out.chunks_exact_mut(4).zip(words.iter()) {
        chunk.copy_from_slice(&word.to_le_bytes());
    }
    out
}