Skip to main content

core/stdarch/crates/core_arch/src/x86/
avx.rs

1//! Advanced Vector Extensions (AVX)
2//!
3//! The references are:
4//!
5//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
6//!   Instruction Set Reference, A-Z][intel64_ref]. - [AMD64 Architecture
7//!   Programmer's Manual, Volume 3: General-Purpose and System
8//!   Instructions][amd64_ref].
9//!
10//! [Wikipedia][wiki] provides a quick overview of the instructions available.
11//!
12//! [intel64_ref]: https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
13//! [amd64_ref]: https://docs.amd.com/v/u/en-US/24594_3.37
14//! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
15
16use crate::{
17    core_arch::{simd::*, x86::*},
18    intrinsics::simd::*,
19    mem, ptr,
20};
21
22#[cfg(test)]
23use stdarch_test::assert_instr;
24
/// Adds packed double-precision (64-bit) floating-point elements
/// in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d {
    // Lane-wise `a + b` over the four f64 lanes via the portable SIMD add.
    unsafe { simd_add(a, b) }
}
37
/// Adds packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 {
    // Lane-wise `a + b` over the eight f32 lanes via the portable SIMD add.
    unsafe { simd_add(a, b) }
}
50
/// Computes the bitwise AND of packed double-precision (64-bit)
/// floating-point elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_pd)
#[inline]
#[target_feature(enable = "avx")]
// The mnemonic is truncated to `vandp` — presumably because LLVM may emit
// either `vandps` or `vandpd` here. See <https://github.com/rust-lang/stdarch/issues/71>.
#[cfg_attr(test, assert_instr(vandp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe {
        // Bitwise ops are not defined on float vectors, so reinterpret the
        // bits as u64 lanes, AND them, and reinterpret the result back.
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_and(a, b))
    }
}
68
/// Computes the bitwise AND of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 {
    unsafe {
        // Reinterpret the float lanes as u32 bit patterns, AND, and
        // reinterpret back — a pure bit operation, no FP semantics involved.
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_and(a, b))
    }
}
85
/// Computes the bitwise OR packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_pd)
#[inline]
#[target_feature(enable = "avx")]
// Truncated `vorp` mnemonic — presumably matches both `vorps` and `vorpd`.
// See <https://github.com/rust-lang/stdarch/issues/71>.
#[cfg_attr(test, assert_instr(vorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe {
        // Bitwise OR on the raw u64 bit patterns of each f64 lane.
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_or(a, b))
    }
}
103
/// Computes the bitwise OR packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 {
    unsafe {
        // Bitwise OR on the raw u32 bit patterns of each f32 lane.
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_or(a, b))
    }
}
120
/// Shuffles double-precision (64-bit) floating-point elements within 128-bit
/// lanes using the control in `imm8` (the `MASK` const parameter).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_shuffle_pd<const MASK: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(MASK, 8);
    unsafe {
        // Shuffle indices address the concatenation [a0,a1,a2,a3,b0,b1,b2,b3].
        // Within each 128-bit lane, one MASK bit picks element 0 or 1:
        //   out[0] from a's low lane, out[1] from b's low lane (+4),
        //   out[2] from a's high lane (+2), out[3] from b's high lane (+6).
        simd_shuffle!(
            a,
            b,
            [
                MASK as u32 & 0b1,
                ((MASK as u32 >> 1) & 0b1) + 4,
                ((MASK as u32 >> 2) & 0b1) + 2,
                ((MASK as u32 >> 3) & 0b1) + 6,
            ],
        )
    }
}
146
/// Shuffles single-precision (32-bit) floating-point elements in `a` within
/// 128-bit lanes using the control in `imm8` (the `MASK` const parameter).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_shuffle_ps<const MASK: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(MASK, 8);
    unsafe {
        // Indices address [a0..a7, b8..b15]. Each 2-bit MASK field selects an
        // element within a 128-bit lane; the same field values are reused for
        // both halves (+4 for a's high lane, +12 for b's high lane). The low
        // two output elements of each lane come from `a`, the high two from `b`.
        simd_shuffle!(
            a,
            b,
            [
                MASK as u32 & 0b11,
                (MASK as u32 >> 2) & 0b11,
                ((MASK as u32 >> 4) & 0b11) + 8,
                ((MASK as u32 >> 6) & 0b11) + 8,
                (MASK as u32 & 0b11) + 4,
                ((MASK as u32 >> 2) & 0b11) + 4,
                ((MASK as u32 >> 4) & 0b11) + 12,
                ((MASK as u32 >> 6) & 0b11) + 12,
            ],
        )
    }
}
176
/// Computes the bitwise NOT of packed double-precision (64-bit) floating-point
/// elements in `a`, and then AND with `b` (i.e. `(!a) & b` per lane).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandnp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        // NOT is expressed as XOR with all-ones, since there is no simd_not.
        transmute(simd_and(simd_xor(u64x4::splat(!(0_u64)), a), b))
    }
}
193
/// Computes the bitwise NOT of packed single-precision (32-bit) floating-point
/// elements in `a`
/// and then AND with `b` (i.e. `(!a) & b` per lane).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 {
    unsafe {
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        // NOT is expressed as XOR with all-ones, since there is no simd_not.
        transmute(simd_and(simd_xor(u32x8::splat(!(0_u32)), a), b))
    }
}
211
/// Compares packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and returns packed maximum values
///
/// NOTE(review): forwards to the `vmaxpd` hardware intrinsic, so NaN and
/// signed-zero handling follows the instruction, not Rust's `f64::max` —
/// confirm against Intel's VMAXPD documentation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaxpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { vmaxpd(a, b) }
}
223
/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns packed maximum values
///
/// NOTE(review): forwards to the `vmaxps` hardware intrinsic, so NaN and
/// signed-zero handling follows the instruction, not Rust's `f32::max` —
/// confirm against Intel's VMAXPS documentation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaxps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { vmaxps(a, b) }
}
235
/// Compares packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and returns packed minimum values
///
/// NOTE(review): forwards to the `vminpd` hardware intrinsic, so NaN and
/// signed-zero handling follows the instruction — confirm against Intel's
/// VMINPD documentation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vminpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { vminpd(a, b) }
}
247
/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns packed minimum values
///
/// NOTE(review): forwards to the `vminps` hardware intrinsic, so NaN and
/// signed-zero handling follows the instruction — confirm against Intel's
/// VMINPS documentation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vminps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { vminps(a, b) }
}
259
/// Multiplies packed double-precision (64-bit) floating-point elements
/// in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmulpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d {
    // Lane-wise `a * b` via the portable SIMD multiply.
    unsafe { simd_mul(a, b) }
}
272
/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmulps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 {
    // Lane-wise `a * b` via the portable SIMD multiply.
    unsafe { simd_mul(a, b) }
}
285
/// Alternatively adds and subtracts packed double-precision (64-bit)
/// floating-point elements in `a` to/from packed elements in `b`:
/// even-indexed lanes are `a - b`, odd-indexed lanes are `a + b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe {
        let a = a.as_f64x4();
        let b = b.as_f64x4();
        // Compute both full results, then interleave: indices 4,6 pick the
        // `sub` vector (second operand of the shuffle) for even output lanes,
        // 1,3 pick the `add` vector for odd output lanes.
        let add = simd_add(a, b);
        let sub = simd_sub(a, b);
        simd_shuffle!(add, sub, [4, 1, 6, 3])
    }
}
304
/// Alternatively adds and subtracts packed single-precision (32-bit)
/// floating-point elements in `a` to/from packed elements in `b`:
/// even-indexed lanes are `a - b`, odd-indexed lanes are `a + b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 {
    unsafe {
        let a = a.as_f32x8();
        let b = b.as_f32x8();
        // Interleave: indices >= 8 select from `sub` (even output lanes),
        // indices < 8 select from `add` (odd output lanes).
        let add = simd_add(a, b);
        let sub = simd_sub(a, b);
        simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7])
    }
}
323
/// Subtracts packed double-precision (64-bit) floating-point elements in `b`
/// from packed elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d {
    // Lane-wise `a - b` via the portable SIMD subtract.
    unsafe { simd_sub(a, b) }
}
336
/// Subtracts packed single-precision (32-bit) floating-point elements in `b`
/// from packed elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 {
    // Lane-wise `a - b` via the portable SIMD subtract.
    unsafe { simd_sub(a, b) }
}
349
/// Computes the division of each of the 8 packed 32-bit floating-point elements
/// in `a` by the corresponding packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdivps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 {
    // Lane-wise `a / b` via the portable SIMD divide.
    unsafe { simd_div(a, b) }
}
362
/// Computes the division of each of the 4 packed 64-bit floating-point elements
/// in `a` by the corresponding packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdivpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d {
    // Lane-wise `a / b` via the portable SIMD divide.
    unsafe { simd_div(a, b) }
}
375
/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
///
/// - `0x00`: Round to the nearest whole number.
/// - `0x01`: Round down, toward negative infinity.
/// - `0x02`: Round up, toward positive infinity.
/// - `0x03`: Truncate the values.
///
/// For a complete list of options, check [the LLVM docs][llvm_docs].
///
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd, ROUNDING = 0x3))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_round_pd<const ROUNDING: i32>(a: __m256d) -> __m256d {
    // VROUNDPD takes a 4-bit immediate; reject anything wider at compile time,
    // then forward the mode to the underlying LLVM intrinsic.
    static_assert_uimm_bits!(ROUNDING, 4);
    unsafe { roundpd256(a, ROUNDING) }
}
398
/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// toward positive infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_ceil_pd(a: __m256d) -> __m256d {
    // Lane-wise ceiling; lowers to VROUNDPD with the round-up immediate.
    unsafe { simd_ceil(a) }
}
411
/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// toward negative infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_floor_pd(a: __m256d) -> __m256d {
    // Lane-wise floor; lowers to VROUNDPD with the round-down immediate.
    unsafe { simd_floor(a) }
}
424
/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
///
/// - `0x00`: Round to the nearest whole number.
/// - `0x01`: Round down, toward negative infinity.
/// - `0x02`: Round up, toward positive infinity.
/// - `0x03`: Truncate the values.
///
/// For a complete list of options, check [the LLVM docs][llvm_docs].
///
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps, ROUNDING = 0x00))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_round_ps<const ROUNDING: i32>(a: __m256) -> __m256 {
    // VROUNDPS takes a 4-bit immediate; reject anything wider at compile time,
    // then forward the mode to the underlying LLVM intrinsic.
    static_assert_uimm_bits!(ROUNDING, 4);
    unsafe { roundps256(a, ROUNDING) }
}
447
/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// toward positive infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_ceil_ps(a: __m256) -> __m256 {
    // Lane-wise ceiling; lowers to VROUNDPS with the round-up immediate.
    unsafe { simd_ceil(a) }
}
460
/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// toward negative infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_floor_ps(a: __m256) -> __m256 {
    // Lane-wise floor; lowers to VROUNDPS with the round-down immediate.
    unsafe { simd_floor(a) }
}
473
/// Returns the square root of packed single-precision (32-bit) floating point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sqrt_ps(a: __m256) -> __m256 {
    // Lane-wise square root via the portable SIMD sqrt intrinsic.
    unsafe { simd_fsqrt(a) }
}
485
/// Returns the square root of packed double-precision (64-bit) floating point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsqrtpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sqrt_pd(a: __m256d) -> __m256d {
    // Lane-wise square root via the portable SIMD sqrt intrinsic.
    unsafe { simd_fsqrt(a) }
}
497
/// Blends packed double-precision (64-bit) floating-point elements from
/// `a` and `b` using control mask `imm8` (the `IMM4` const parameter):
/// output lane `i` is `b[i]` if bit `i` of the mask is set, else `a[i]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_pd)
#[inline]
#[target_feature(enable = "avx")]
// Note: LLVM7 prefers single-precision blend instructions when
// possible, see: https://bugs.llvm.org/show_bug.cgi?id=38194
// #[cfg_attr(test, assert_instr(vblendpd, imm8 = 9))]
#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_blend_pd<const IMM4: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM4, 4);
    unsafe {
        // Indices address [a0..a3, b4..b7]: a set bit adds 4, switching lane
        // `i` from `a[i]` to `b[i]`.
        simd_shuffle!(
            a,
            b,
            [
                ((IMM4 as u32 >> 0) & 1) * 4 + 0,
                ((IMM4 as u32 >> 1) & 1) * 4 + 1,
                ((IMM4 as u32 >> 2) & 1) * 4 + 2,
                ((IMM4 as u32 >> 3) & 1) * 4 + 3,
            ],
        )
    }
}
526
/// Blends packed single-precision (32-bit) floating-point elements from
/// `a` and `b` using control mask `imm8` (the `IMM8` const parameter):
/// output lane `i` is `b[i]` if bit `i` of the mask is set, else `a[i]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_blend_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // Indices address [a0..a7, b8..b15]: a set bit adds 8, switching lane
        // `i` from `a[i]` to `b[i]`.
        simd_shuffle!(
            a,
            b,
            [
                ((IMM8 as u32 >> 0) & 1) * 8 + 0,
                ((IMM8 as u32 >> 1) & 1) * 8 + 1,
                ((IMM8 as u32 >> 2) & 1) * 8 + 2,
                ((IMM8 as u32 >> 3) & 1) * 8 + 3,
                ((IMM8 as u32 >> 4) & 1) * 8 + 4,
                ((IMM8 as u32 >> 5) & 1) * 8 + 5,
                ((IMM8 as u32 >> 6) & 1) * 8 + 6,
                ((IMM8 as u32 >> 7) & 1) * 8 + 7,
            ],
        )
    }
}
556
/// Blends packed double-precision (64-bit) floating-point elements from
/// `a` and `b` using `c` as a mask: lanes of `c` with the sign bit set
/// select the corresponding element from `b`, others from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
    unsafe {
        // `c` viewed as i64 lanes is negative exactly when its sign bit is
        // set, matching VBLENDVPD's mask semantics.
        let mask: i64x4 = simd_lt(transmute::<_, i64x4>(c), i64x4::ZERO);
        transmute(simd_select(mask, b.as_f64x4(), a.as_f64x4()))
    }
}
572
/// Blends packed single-precision (32-bit) floating-point elements from
/// `a` and `b` using `c` as a mask: lanes of `c` with the sign bit set
/// select the corresponding element from `b`, others from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
    unsafe {
        // `c` viewed as i32 lanes is negative exactly when its sign bit is
        // set, matching VBLENDVPS's mask semantics.
        let mask: i32x8 = simd_lt(transmute::<_, i32x8>(c), i32x8::ZERO);
        transmute(simd_select(mask, b.as_f32x8(), a.as_f32x8()))
    }
}
588
/// Conditionally multiplies the packed single-precision (32-bit) floating-point
/// elements in `a` and `b` using the high 4 bits in `imm8`,
/// sum the four products, and conditionally return the sum
///  using the low 4 bits of `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdpps, IMM8 = 0x0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_dp_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    // Validate the 8-bit immediate at compile time, then forward to the
    // VDPPS LLVM intrinsic (which takes the immediate as i8).
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { vdpps(a, b, IMM8 as i8) }
}
604
/// Horizontal addition of adjacent pairs in the two packed vectors
/// of 4 64-bit floating points `a` and `b`.
/// In the result, sums of elements from `a` are returned in even locations,
/// while sums of elements from `b` are returned in odd locations.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhaddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe {
        // Gather the even-indexed and odd-indexed members of each adjacent
        // pair (`even` = [a0,b0,a2,b2], `odd` = [a1,b1,a3,b3]), then add,
        // yielding [a0+a1, b0+b1, a2+a3, b2+b3].
        let even = simd_shuffle!(a, b, [0, 4, 2, 6]);
        let odd = simd_shuffle!(a, b, [1, 5, 3, 7]);
        simd_add(even, odd)
    }
}
623
/// Horizontal addition of adjacent pairs in the two packed vectors
/// of 8 32-bit floating points `a` and `b`.
/// In the result, sums of elements from `a` are returned in locations of
/// indices 0, 1, 4, 5; while sums of elements from `b` are locations
/// 2, 3, 6, 7.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhaddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
    unsafe {
        // Split each adjacent pair into its first and second members
        // (per 128-bit lane, matching VHADDPS), then add pairwise.
        let even = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
        let odd = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
        simd_add(even, odd)
    }
}
643
/// Horizontal subtraction of adjacent pairs in the two packed vectors
/// of 4 64-bit floating points `a` and `b`.
/// In the result, differences of adjacent pairs from `a` are returned in
/// even locations, while differences from `b` are returned in odd locations.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe {
        // `even` = [a0,b0,a2,b2], `odd` = [a1,b1,a3,b3]; subtracting gives
        // [a0-a1, b0-b1, a2-a3, b2-b3].
        let even = simd_shuffle!(a, b, [0, 4, 2, 6]);
        let odd = simd_shuffle!(a, b, [1, 5, 3, 7]);
        simd_sub(even, odd)
    }
}
662
/// Horizontal subtraction of adjacent pairs in the two packed vectors
/// of 8 32-bit floating points `a` and `b`.
/// In the result, differences of adjacent pairs from `a` are returned in
/// locations of indices 0, 1, 4, 5; while differences from `b` are in
/// locations 2, 3, 6, 7.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 {
    unsafe {
        // Split each adjacent pair into its first and second members
        // (per 128-bit lane, matching VHSUBPS), then subtract pairwise.
        let even = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
        let odd = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
        simd_sub(even, odd)
    }
}
682
/// Computes the bitwise XOR of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_pd)
#[inline]
#[target_feature(enable = "avx")]
// Truncated `vxorp` mnemonic — presumably matches both `vxorps` and `vxorpd`,
// as with the AND/OR variants above.
#[cfg_attr(test, assert_instr(vxorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe {
        // Bitwise XOR on the raw u64 bit patterns of each f64 lane.
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_xor(a, b))
    }
}
699
/// Computes the bitwise XOR of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_xor_ps(a: __m256, b: __m256) -> __m256 {
    unsafe {
        // Bitwise XOR on the raw u32 bit patterns of each f32 lane.
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_xor(a, b))
    }
}
716
// Comparison-predicate constants. NOTE(review): based on the `_CMP_*` naming
// these are presumably the immediate operands for the VCMP-based `_mm*_cmp_*`
// intrinsics, where "ordered"/"unordered" controls the result when an operand
// is NaN and "signaling"/"non-signaling" controls whether QNaN operands raise
// an invalid-operation exception — confirm against the Intel SDM (VCMPPD).

/// Equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_OQ: i32 = 0x00;
/// Less-than (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LT_OS: i32 = 0x01;
/// Less-than-or-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LE_OS: i32 = 0x02;
/// Unordered (non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_UNORD_Q: i32 = 0x03;
/// Not-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_UQ: i32 = 0x04;
/// Not-less-than (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLT_US: i32 = 0x05;
/// Not-less-than-or-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLE_US: i32 = 0x06;
/// Ordered (non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_ORD_Q: i32 = 0x07;
/// Equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_UQ: i32 = 0x08;
/// Not-greater-than-or-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGE_US: i32 = 0x09;
/// Not-greater-than (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGT_US: i32 = 0x0a;
/// False (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_FALSE_OQ: i32 = 0x0b;
/// Not-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_OQ: i32 = 0x0c;
/// Greater-than-or-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GE_OS: i32 = 0x0d;
/// Greater-than (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GT_OS: i32 = 0x0e;
/// True (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_TRUE_UQ: i32 = 0x0f;
/// Equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_OS: i32 = 0x10;
/// Less-than (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LT_OQ: i32 = 0x11;
/// Less-than-or-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LE_OQ: i32 = 0x12;
/// Unordered (signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_UNORD_S: i32 = 0x13;
/// Not-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_US: i32 = 0x14;
/// Not-less-than (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLT_UQ: i32 = 0x15;
/// Not-less-than-or-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLE_UQ: i32 = 0x16;
/// Ordered (signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_ORD_S: i32 = 0x17;
/// Equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_US: i32 = 0x18;
/// Not-greater-than-or-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGE_UQ: i32 = 0x19;
795/// Not-greater-than (unordered, non-signaling)
796#[stable(feature = "simd_x86", since = "1.27.0")]
797pub const _CMP_NGT_UQ: i32 = 0x1a;
798/// False (ordered, signaling)
799#[stable(feature = "simd_x86", since = "1.27.0")]
800pub const _CMP_FALSE_OS: i32 = 0x1b;
801/// Not-equal (ordered, signaling)
802#[stable(feature = "simd_x86", since = "1.27.0")]
803pub const _CMP_NEQ_OS: i32 = 0x1c;
804/// Greater-than-or-equal (ordered, non-signaling)
805#[stable(feature = "simd_x86", since = "1.27.0")]
806pub const _CMP_GE_OQ: i32 = 0x1d;
807/// Greater-than (ordered, non-signaling)
808#[stable(feature = "simd_x86", since = "1.27.0")]
809pub const _CMP_GT_OQ: i32 = 0x1e;
810/// True (unordered, signaling)
811#[stable(feature = "simd_x86", since = "1.27.0")]
812pub const _CMP_TRUE_US: i32 = 0x1f;
813
/// Compares packed double-precision (64-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// `IMM5` is one of the `_CMP_*` predicate constants defined in this module.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmp_pd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM5, 5);
    // `const { .. }` forces the predicate into a compile-time constant so it
    // can be encoded as the instruction's immediate operand.
    unsafe { vcmppd(a, b, const { IMM5 as i8 }) }
}
828
829/// Compares packed double-precision (64-bit) floating-point
830/// elements in `a` and `b` based on the comparison operand
831/// specified by `IMM5`.
832///
833/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_pd)
834#[inline]
835#[target_feature(enable = "avx")]
836#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
837#[rustc_legacy_const_generics(2)]
838#[stable(feature = "simd_x86", since = "1.27.0")]
839pub fn _mm256_cmp_pd<const IMM5: i32>(a: __m256d, b: __m256d) -> __m256d {
840    static_assert_uimm_bits!(IMM5, 5);
841    unsafe { vcmppd256(a, b, IMM5 as u8) }
842}
843
/// Compares packed single-precision (32-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// `IMM5` is one of the `_CMP_*` predicate constants defined in this module.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmp_ps<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM5, 5);
    // `const { .. }` forces the predicate into a compile-time constant so it
    // can be encoded as the instruction's immediate operand.
    unsafe { vcmpps(a, b, const { IMM5 as i8 }) }
}
858
/// Compares packed single-precision (32-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// `IMM5` is one of the `_CMP_*` predicate constants defined in this module.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmp_ps<const IMM5: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM5, 5);
    // `const { .. }` forces the predicate into a compile-time constant so it
    // can be encoded as the instruction's immediate operand.
    unsafe { vcmpps256(a, b, const { IMM5 as u8 }) }
}
873
874/// Compares the lower double-precision (64-bit) floating-point element in
875/// `a` and `b` based on the comparison operand specified by `IMM5`,
876/// store the result in the lower element of returned vector,
877/// and copies the upper element from `a` to the upper element of returned
878/// vector.
879///
880/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_sd)
881#[inline]
882#[target_feature(enable = "avx")]
883#[cfg_attr(test, assert_instr(vcmpeqsd, IMM5 = 0))] // TODO Validate vcmpsd
884#[rustc_legacy_const_generics(2)]
885#[stable(feature = "simd_x86", since = "1.27.0")]
886pub fn _mm_cmp_sd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
887    static_assert_uimm_bits!(IMM5, 5);
888    unsafe { vcmpsd(a, b, IMM5 as i8) }
889}
890
891/// Compares the lower single-precision (32-bit) floating-point element in
892/// `a` and `b` based on the comparison operand specified by `IMM5`,
893/// store the result in the lower element of returned vector,
894/// and copies the upper 3 packed elements from `a` to the upper elements of
895/// returned vector.
896///
897/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ss)
898#[inline]
899#[target_feature(enable = "avx")]
900#[cfg_attr(test, assert_instr(vcmpeqss, IMM5 = 0))] // TODO Validate vcmpss
901#[rustc_legacy_const_generics(2)]
902#[stable(feature = "simd_x86", since = "1.27.0")]
903pub fn _mm_cmp_ss<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
904    static_assert_uimm_bits!(IMM5, 5);
905    unsafe { vcmpss(a, b, IMM5 as i8) }
906}
907
/// Converts packed 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtdq2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d {
    // Widens each of the 4 i32 lanes to f64; every i32 is exactly
    // representable as f64, so the conversion is lossless.
    unsafe { simd_cast(a.as_i32x4()) }
}
920
/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtdq2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 {
    // Lane-wise i32 -> f32 conversion of all 8 lanes.
    unsafe { simd_cast(a.as_i32x8()) }
}
933
/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed single-precision (32-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtpd2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtpd_ps(a: __m256d) -> __m128 {
    // Narrows the 4 f64 lanes to 4 f32 lanes (result is a 128-bit vector).
    unsafe { simd_cast(a) }
}
946
/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtps_epi32(a: __m256) -> __m256i {
    // Unlike `_mm256_cvttps_epi32` below, this variant converts with
    // rounding rather than truncation (hence the dedicated intrinsic
    // instead of a plain `simd_cast`).
    unsafe { transmute(vcvtps2dq(a)) }
}
958
/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed double-precision (64-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtps2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtps_pd(a: __m128) -> __m256d {
    // Widens each f32 lane to f64; f32 -> f64 is always exact.
    unsafe { simd_cast(a) }
}
971
/// Returns the first element of the input vector of `[4 x double]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsd_f64)
#[inline]
#[target_feature(enable = "avx")]
//#[cfg_attr(test, assert_instr(movsd))] FIXME
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtsd_f64(a: __m256d) -> f64 {
    // Extracts lane 0 (the lowest f64) of the vector.
    unsafe { simd_extract!(a, 0) }
}
983
/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttpd_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvttpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i {
    // Truncating variant (`tt`) of `_mm256_cvtpd_epi32` below.
    unsafe { transmute(vcvttpd2dq(a)) }
}
995
/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i {
    // Rounding variant; see `_mm256_cvttpd_epi32` above for truncation.
    unsafe { transmute(vcvtpd2dq(a)) }
}
1007
/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttps_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvttps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvttps_epi32(a: __m256) -> __m256i {
    // Truncating variant (`tt`) of `_mm256_cvtps_epi32` above.
    unsafe { transmute(vcvttps2dq(a)) }
}
1019
/// Extracts 128 bits (composed of 4 packed single-precision (32-bit)
/// floating-point elements) from `a`, selected with `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_extractf128_ps<const IMM1: i32>(a: __m256) -> __m128 {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        // IMM1 == 0 selects the low half (lanes 0..=3), IMM1 == 1 the high
        // half (lanes 4..=7); the second shuffle operand is never indexed.
        simd_shuffle!(
            a,
            _mm256_undefined_ps(),
            [[0, 1, 2, 3], [4, 5, 6, 7]][IMM1 as usize],
        )
    }
}
1040
/// Extracts 128 bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `a`, selected with `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_extractf128_pd<const IMM1: i32>(a: __m256d) -> __m128d {
    static_assert_uimm_bits!(IMM1, 1);
    // IMM1 == 0 selects the low half (lanes 0..=1), IMM1 == 1 the high half
    // (lanes 2..=3); the second shuffle operand is never indexed.
    unsafe { simd_shuffle!(a, _mm256_undefined_pd(), [[0, 1], [2, 3]][IMM1 as usize]) }
}
1055
/// Extracts 128 bits (composed of integer data) from `a`, selected with `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_extractf128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        // Viewed as 4 x i64: IMM1 == 0 selects lanes 0..=1 (low 128 bits),
        // IMM1 == 1 selects lanes 2..=3 (high 128 bits). The zero operand is
        // never indexed.
        let dst: i64x2 = simd_shuffle!(a.as_i64x4(), i64x4::ZERO, [[0, 1], [2, 3]][IMM1 as usize],);
        transmute(dst)
    }
}
1072
/// Extracts a 32-bit integer from `a`, selected with `INDEX`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi32)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_extract_epi32<const INDEX: i32>(a: __m256i) -> i32 {
    // INDEX is a 3-bit immediate selecting one of the 8 i32 lanes.
    static_assert_uimm_bits!(INDEX, 3);
    unsafe { simd_extract!(a.as_i32x8(), INDEX as u32) }
}
1086
/// Returns the first element of the input vector of `[8 x i32]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsi256_si32)
#[inline]
#[target_feature(enable = "avx")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtsi256_si32(a: __m256i) -> i32 {
    // Extracts lane 0 (the lowest i32) of the vector.
    unsafe { simd_extract!(a.as_i32x8(), 0) }
}
1097
/// Zeroes the contents of all XMM or YMM registers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroall)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vzeroall))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_zeroall() {
    // Pure register side effect; delegates directly to the `vzeroall`
    // intrinsic.
    unsafe { vzeroall() }
}
1108
/// Zeroes the upper 128 bits of all YMM registers;
/// the lower 128-bits of the registers are unmodified.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroupper)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vzeroupper))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_zeroupper() {
    // Pure register side effect; delegates directly to the `vzeroupper`
    // intrinsic.
    unsafe { vzeroupper() }
}
1120
/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 {
    // Runtime (non-immediate) control; the per-lane selectors come from `b`.
    unsafe { vpermilps256(a, b.as_i32x8()) }
}
1132
/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 {
    // Runtime (non-immediate) control; the per-lane selectors come from `b`.
    unsafe { vpermilps(a, b.as_i32x4()) }
}
1144
/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // Each 2-bit field of IMM8 selects a source lane within a 128-bit
        // half; the same selector pattern is applied to both halves (the
        // upper half uses the same indices offset by +4).
        simd_shuffle!(
            a,
            _mm256_undefined_ps(),
            [
                (IMM8 as u32 >> 0) & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
                ((IMM8 as u32 >> 0) & 0b11) + 4,
                ((IMM8 as u32 >> 2) & 0b11) + 4,
                ((IMM8 as u32 >> 4) & 0b11) + 4,
                ((IMM8 as u32 >> 6) & 0b11) + 4,
            ],
        )
    }
}
1174
/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_permute_ps<const IMM8: i32>(a: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // Each 2-bit field of IMM8 selects the source lane for one of the
        // 4 output lanes.
        simd_shuffle!(
            a,
            _mm_undefined_ps(),
            [
                (IMM8 as u32 >> 0) & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
            ],
        )
    }
}
1200
/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d {
    // Runtime (non-immediate) control; `vpermilpd` selects within each
    // 128-bit lane, like the immediate `_mm256_permute_pd` variant below.
    unsafe { vpermilpd256(a, b.as_i64x4()) }
}
1212
/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d {
    // Runtime (non-immediate) control; the per-lane selectors come from `b`.
    unsafe { vpermilpd(a, b.as_i64x2()) }
}
1224
/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, IMM4 = 0x1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_permute_pd<const IMM4: i32>(a: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM4, 4);
    unsafe {
        // One bit of IMM4 per output lane: bits 0-1 pick within the low
        // 128-bit half (lanes 0..=1), bits 2-3 within the high half
        // (lanes 2..=3, hence the +2 offsets).
        simd_shuffle!(
            a,
            _mm256_undefined_pd(),
            [
                ((IMM4 as u32 >> 0) & 1),
                ((IMM4 as u32 >> 1) & 1),
                ((IMM4 as u32 >> 2) & 1) + 2,
                ((IMM4 as u32 >> 3) & 1) + 2,
            ],
        )
    }
}
1250
/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, IMM2 = 0x1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_permute_pd<const IMM2: i32>(a: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM2, 2);
    unsafe {
        // One bit of IMM2 per output lane selects which of the two f64
        // lanes of `a` to copy.
        simd_shuffle!(
            a,
            _mm_undefined_pd(),
            [(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1],
        )
    }
}
1271
/// Shuffles 256 bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) selected by `imm8` from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x5))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_permute2f128_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    // Delegates to the integer variant; the casts only reinterpret bits,
    // so the shuffle semantics are identical.
    _mm256_castsi256_ps(_mm256_permute2f128_si256::<IMM8>(
        _mm256_castps_si256(a),
        _mm256_castps_si256(b),
    ))
}
1289
/// Shuffles 256 bits (composed of 4 packed double-precision (64-bit)
/// floating-point elements) selected by `imm8` from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_permute2f128_pd<const IMM8: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM8, 8);
    // Delegates to the integer variant; the casts only reinterpret bits,
    // so the shuffle semantics are identical.
    _mm256_castsi256_pd(_mm256_permute2f128_si256::<IMM8>(
        _mm256_castpd_si256(a),
        _mm256_castpd_si256(b),
    ))
}
1307
/// Shuffles 128-bits (composed of integer data) selected by `imm8`
/// from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_permute2f128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // The low nibble of `imm8` controls the low 128-bit half of the result
    // (output i64 lanes 0..=1), the high nibble controls the high half
    // (lanes 2..=3). Within each nibble, bits 0-1 select one of the four
    // 128-bit halves of `a:b`.
    //
    // `idx` maps output i64 lane `pos` to a source lane of the concatenated
    // `a:b` (as 8 x i64 shuffle inputs).
    const fn idx(imm8: i32, pos: u32) -> u32 {
        let part = if pos < 2 {
            imm8 & 0xf
        } else {
            (imm8 & 0xf0) >> 4
        };
        2 * (part as u32 & 0b11) + (pos & 1)
    }
    // `idx0` implements the zeroing behavior: if bit 3 of the controlling
    // nibble is set, index 4 selects a lane from `i64x4::ZERO` instead of
    // keeping the lane (`pos`) from the first shuffle's result.
    const fn idx0(imm8: i32, pos: u32) -> u32 {
        let part = if pos < 2 {
            imm8 & 0xf
        } else {
            (imm8 & 0xf0) >> 4
        };
        if part & 0b1000 != 0 { 4 } else { pos }
    }
    unsafe {
        // First shuffle performs the half selection...
        let r = simd_shuffle!(
            a.as_i64x4(),
            b.as_i64x4(),
            [idx(IMM8, 0), idx(IMM8, 1), idx(IMM8, 2), idx(IMM8, 3)]
        );
        // ...second shuffle applies the optional per-half zeroing.
        let r: i64x4 = simd_shuffle!(
            r,
            i64x4::ZERO,
            [idx0(IMM8, 0), idx0(IMM8, 1), idx0(IMM8, 2), idx0(IMM8, 3)]
        );
        r.as_m256i()
    }
}
1350
/// Broadcasts a single-precision (32-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ss)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcast_ss(f: &f32) -> __m256 {
    // Reads the scalar through the reference and splats it to all 8 lanes.
    _mm256_set1_ps(*f)
}
1364
/// Broadcasts a single-precision (32-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcast_ss)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcast_ss(f: &f32) -> __m128 {
    // Reads the scalar through the reference and splats it to all 4 lanes.
    _mm_set1_ps(*f)
}
1378
/// Broadcasts a double-precision (64-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_sd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcast_sd(f: &f64) -> __m256d {
    // Reads the scalar through the reference and splats it to all 4 lanes.
    _mm256_set1_pd(*f)
}
1392
/// Broadcasts 128 bits from memory (composed of 4 packed single-precision
/// (32-bit) floating-point elements) to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcast_ps(a: &__m128) -> __m256 {
    // Duplicates the 128-bit source into both halves of the result; the
    // zero operand is never indexed (all indices refer to `*a`).
    unsafe { simd_shuffle!(*a, _mm_setzero_ps(), [0, 1, 2, 3, 0, 1, 2, 3]) }
}
1405
/// Broadcasts 128 bits from memory (composed of 2 packed double-precision
/// (64-bit) floating-point elements) to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcast_pd(a: &__m128d) -> __m256d {
    // Duplicates the 128-bit source into both halves of the result; the
    // zero operand is never indexed (all indices refer to `*a`).
    unsafe { simd_shuffle!(*a, _mm_setzero_pd(), [0, 1, 0, 1]) }
}
1418
/// Copies `a` to result, then inserts 128 bits (composed of 4 packed
/// single-precision (32-bit) floating-point elements) from `b` into result
/// at the location specified by `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m256 {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        // Indices 0..=7 refer to `a`, indices 8..=11 to the low half of the
        // widened `b`. IMM1 == 0 replaces the low half of `a`, IMM1 == 1
        // the high half.
        simd_shuffle!(
            a,
            _mm256_castps128_ps256(b),
            [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]][IMM1 as usize],
        )
    }
}
1440
/// Copies `a` to result, then inserts 128 bits (composed of 2 packed
/// double-precision (64-bit) floating-point elements) from `b` into result
/// at the location specified by `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> __m256d {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        // Indices 0..=3 refer to `a`, indices 4..=5 to the low half of the
        // widened `b`. IMM1 == 0 replaces the low half of `a`, IMM1 == 1
        // the high half.
        simd_shuffle!(
            a,
            _mm256_castpd128_pd256(b),
            [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
        )
    }
}
1462
/// Copies `a` to result, then inserts 128 bits from `b` into result
/// at the location specified by `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_insertf128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        // Viewed as 4 x i64: indices 0..=3 refer to `a`, indices 4..=5 to
        // the low half of the widened `b`. IMM1 == 0 replaces the low half
        // of `a`, IMM1 == 1 the high half.
        let dst: i64x4 = simd_shuffle!(
            a.as_i64x4(),
            _mm256_castsi128_si256(b).as_i64x4(),
            [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
        );
        transmute(dst)
    }
}
1484
/// Copies `a` to result, and inserts the 8-bit integer `i` into result
/// at the location specified by `index`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi8)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_insert_epi8<const INDEX: i32>(a: __m256i, i: i8) -> __m256i {
    // 5 bits: INDEX selects one of the 32 byte lanes of the vector.
    static_assert_uimm_bits!(INDEX, 5);
    // View `a` as 32 x i8, replace lane INDEX with `i`, and convert back.
    unsafe { transmute(simd_insert!(a.as_i8x32(), INDEX as u32, i)) }
}
1499
/// Copies `a` to result, and inserts the 16-bit integer `i` into result
/// at the location specified by `index`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi16)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_insert_epi16<const INDEX: i32>(a: __m256i, i: i16) -> __m256i {
    // 4 bits: INDEX selects one of the 16 word (16-bit) lanes.
    static_assert_uimm_bits!(INDEX, 4);
    // View `a` as 16 x i16, replace lane INDEX with `i`, and convert back.
    unsafe { transmute(simd_insert!(a.as_i16x16(), INDEX as u32, i)) }
}
1514
/// Copies `a` to result, and inserts the 32-bit integer `i` into result
/// at the location specified by `index`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi32)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_insert_epi32<const INDEX: i32>(a: __m256i, i: i32) -> __m256i {
    // 3 bits: INDEX selects one of the 8 dword (32-bit) lanes.
    static_assert_uimm_bits!(INDEX, 3);
    // View `a` as 8 x i32, replace lane INDEX with `i`, and convert back.
    unsafe { transmute(simd_insert!(a.as_i32x8(), INDEX as u32, i)) }
}
1529
/// Loads 256-bits (composed of 4 packed double-precision (64-bit)
/// floating-point elements) from memory into result.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovap)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d {
    // Plain aligned vector load: the caller guarantees (per the docs above)
    // that `mem_addr` is valid for a 32-byte read and 32-byte aligned.
    *(mem_addr as *const __m256d)
}
1548
/// Stores 256-bits (composed of 4 packed double-precision (64-bit)
/// floating-point elements) from `a` into memory.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovap)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_store_pd(mem_addr: *mut f64, a: __m256d) {
    // Plain aligned vector store: the caller guarantees that `mem_addr` is
    // valid for a 32-byte write and 32-byte aligned.
    *(mem_addr as *mut __m256d) = a;
}
1567
/// Loads 256-bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) from memory into result.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256 {
    // Plain aligned vector load: the caller guarantees that `mem_addr` is
    // valid for a 32-byte read and 32-byte aligned.
    *(mem_addr as *const __m256)
}
1586
/// Stores 256-bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) from `a` into memory.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_store_ps(mem_addr: *mut f32, a: __m256) {
    // Plain aligned vector store: the caller guarantees that `mem_addr` is
    // valid for a 32-byte write and 32-byte aligned.
    *(mem_addr as *mut __m256) = a;
}
1605
/// Loads 256-bits (composed of 4 packed double-precision (64-bit)
/// floating-point elements) from memory into result.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> __m256d {
    let mut dst = _mm256_undefined_pd();
    // A byte-wise copy into a local expresses an unaligned 32-byte load:
    // `copy_nonoverlapping` on `u8` imposes no alignment requirement on
    // `mem_addr`.
    ptr::copy_nonoverlapping(
        mem_addr as *const u8,
        ptr::addr_of_mut!(dst) as *mut u8,
        mem::size_of::<__m256d>(),
    );
    dst
}
1625
/// Stores 256-bits (composed of 4 packed double-precision (64-bit)
/// floating-point elements) from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_storeu_pd(mem_addr: *mut f64, a: __m256d) {
    // `write_unaligned` performs the 32-byte store without requiring any
    // particular alignment of `mem_addr`.
    mem_addr.cast::<__m256d>().write_unaligned(a);
}
1639
/// Loads 256-bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) from memory into result.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> __m256 {
    let mut dst = _mm256_undefined_ps();
    // Byte-wise copy into a local: expresses an unaligned 32-byte load with
    // no alignment requirement on `mem_addr`.
    ptr::copy_nonoverlapping(
        mem_addr as *const u8,
        ptr::addr_of_mut!(dst) as *mut u8,
        mem::size_of::<__m256>(),
    );
    dst
}
1659
/// Stores 256-bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_storeu_ps(mem_addr: *mut f32, a: __m256) {
    // `write_unaligned` performs the 32-byte store without requiring any
    // particular alignment of `mem_addr`.
    mem_addr.cast::<__m256>().write_unaligned(a);
}
1673
/// Loads 256-bits of integer data from memory into result.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovaps)
)] // FIXME vmovdqa expected
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i {
    // Plain aligned vector load: the caller guarantees that `mem_addr` is
    // valid for a 32-byte read and 32-byte aligned.
    *mem_addr
}
1690
/// Stores 256-bits of integer data from `a` into memory.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovaps)
)] // FIXME vmovdqa expected
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_store_si256(mem_addr: *mut __m256i, a: __m256i) {
    // Plain aligned vector store: the caller guarantees that `mem_addr` is
    // valid for a 32-byte write and 32-byte aligned.
    *mem_addr = a;
}
1707
/// Loads 256-bits of integer data from memory into result.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i {
    let mut dst = _mm256_undefined_si256();
    // Byte-wise copy into a local: expresses an unaligned 32-byte load with
    // no alignment requirement on `mem_addr`.
    ptr::copy_nonoverlapping(
        mem_addr as *const u8,
        ptr::addr_of_mut!(dst) as *mut u8,
        mem::size_of::<__m256i>(),
    );
    dst
}
1726
/// Stores 256-bits of integer data from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) {
    // `write_unaligned` performs the 32-byte store without requiring any
    // particular alignment of `mem_addr`.
    mem_addr.write_unaligned(a);
}
1739
/// Loads packed double-precision (64-bit) floating-point elements from memory
/// into result using `mask` (elements are zeroed out when the high bit of the
/// corresponding element is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d {
    // Arithmetic right shift by 63 broadcasts each lane's sign (high) bit,
    // turning the per-element high bit into an all-ones/all-zeros lane mask.
    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
    // Masked-off lanes are not read from memory; they take the zero fallback.
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm256_setzero_pd())
}
1754
/// Stores packed double-precision (64-bit) floating-point elements from `a`
/// into memory using `mask`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_maskstore_pd(mem_addr: *mut f64, mask: __m256i, a: __m256d) {
    // Broadcast each lane's high bit into an all-ones/all-zeros lane mask.
    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
    // Only lanes whose mask is set are written; the rest of memory is untouched.
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
}
1768
/// Loads packed double-precision (64-bit) floating-point elements from memory
/// into result using `mask` (elements are zeroed out when the high bit of the
/// corresponding element is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_maskload_pd(mem_addr: *const f64, mask: __m128i) -> __m128d {
    // 128-bit variant: broadcast each 64-bit lane's high bit into a lane mask.
    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
    // Masked-off lanes are not read from memory; they take the zero fallback.
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm_setzero_pd())
}
1783
/// Stores packed double-precision (64-bit) floating-point elements from `a`
/// into memory using `mask`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) {
    // 128-bit variant: broadcast each 64-bit lane's high bit into a lane mask.
    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
    // Only lanes whose mask is set are written to memory.
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
}
1797
/// Loads packed single-precision (32-bit) floating-point elements from memory
/// into result using `mask` (elements are zeroed out when the high bit of the
/// corresponding element is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256 {
    // Arithmetic right shift by 31 broadcasts each 32-bit lane's high bit,
    // producing an all-ones/all-zeros lane mask.
    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
    // Masked-off lanes are not read from memory; they take the zero fallback.
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm256_setzero_ps())
}
1812
/// Stores packed single-precision (32-bit) floating-point elements from `a`
/// into memory using `mask`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_maskstore_ps(mem_addr: *mut f32, mask: __m256i, a: __m256) {
    // Broadcast each 32-bit lane's high bit into an all-ones/all-zeros mask.
    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
    // Only lanes whose mask is set are written to memory.
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
}
1826
/// Loads packed single-precision (32-bit) floating-point elements from memory
/// into result using `mask` (elements are zeroed out when the high bit of the
/// corresponding element is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_maskload_ps(mem_addr: *const f32, mask: __m128i) -> __m128 {
    // 128-bit variant: broadcast each 32-bit lane's high bit into a lane mask.
    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
    // Masked-off lanes are not read from memory; they take the zero fallback.
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm_setzero_ps())
}
1841
/// Stores packed single-precision (32-bit) floating-point elements from `a`
/// into memory using `mask`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) {
    // 128-bit variant: broadcast each 32-bit lane's high bit into a lane mask.
    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
    // Only lanes whose mask is set are written to memory.
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
}
1855
/// Duplicate odd-indexed single-precision (32-bit) floating-point elements
/// from `a`, and returns the results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movehdup_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovshdup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_movehdup_ps(a: __m256) -> __m256 {
    // Result lanes: [a1, a1, a3, a3, a5, a5, a7, a7] — each odd lane copied
    // into the even slot below it.
    unsafe { simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7]) }
}
1868
/// Duplicate even-indexed single-precision (32-bit) floating-point elements
/// from `a`, and returns the results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_moveldup_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovsldup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_moveldup_ps(a: __m256) -> __m256 {
    // Result lanes: [a0, a0, a2, a2, a4, a4, a6, a6] — each even lane copied
    // into the odd slot above it.
    unsafe { simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]) }
}
1881
/// Duplicate even-indexed double-precision (64-bit) floating-point elements
/// from `a`, and returns the results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movedup_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_movedup_pd(a: __m256d) -> __m256d {
    // Result lanes: [a0, a0, a2, a2] — the even element of each 128-bit half
    // is duplicated into the odd slot.
    unsafe { simd_shuffle!(a, a, [0, 0, 2, 2]) }
}
1894
/// Loads 256-bits of integer data from unaligned memory into result.
/// This intrinsic may perform better than `_mm256_loadu_si256` when the
/// data crosses a cache line boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lddqu_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vlddqu))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i {
    // Delegates to the dedicated `vlddqu` intrinsic (which takes a byte
    // pointer); `transmute` converts its result back to `__m256i`.
    transmute(vlddqu(mem_addr as *const i8))
}
1907
/// Moves integer data from a 256-bit integer vector to a 32-byte
/// aligned memory location. To minimize caching, the data is flagged as
/// non-temporal (unlikely to be used again soon)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_si256)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovntdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    // Emits `vmovntdq` directly so the non-temporal hint cannot be lost by
    // the compiler; `vps!` presumably builds the `[ptr]` memory-operand
    // syntax for the `p` register operand below — see the macro definition.
    crate::arch::asm!(
        vps!("vmovntdq", ",{a}"),
        p = in(reg) mem_addr,
        a = in(ymm_reg) a,
        options(nostack, preserves_flags),
    );
}
1935
/// Moves double-precision values from a 256-bit vector of `[4 x double]`
/// to a 32-byte aligned memory location. To minimize caching, the data is
/// flagged as non-temporal (unlikely to be used again soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_pd)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovntpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    // Emits `vmovntpd` directly so the non-temporal hint cannot be lost by
    // the compiler; same operand scheme as `_mm256_stream_si256`.
    crate::arch::asm!(
        vps!("vmovntpd", ",{a}"),
        p = in(reg) mem_addr,
        a = in(ymm_reg) a,
        options(nostack, preserves_flags),
    );
}
1964
/// Moves single-precision floating point values from a 256-bit vector
/// of `[8 x float]` to a 32-byte aligned memory location. To minimize
/// caching, the data is flagged as non-temporal (unlikely to be used again
/// soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_ps)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovntps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    // Emits `vmovntps` directly so the non-temporal hint cannot be lost by
    // the compiler; same operand scheme as `_mm256_stream_si256`.
    crate::arch::asm!(
        vps!("vmovntps", ",{a}"),
        p = in(reg) mem_addr,
        a = in(ymm_reg) a,
        options(nostack, preserves_flags),
    );
}
1994
/// Computes the approximate reciprocal of packed single-precision (32-bit)
/// floating-point elements in `a`, and returns the results. The maximum
/// relative error for this approximation is less than 1.5*2^-12.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rcp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vrcpps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_rcp_ps(a: __m256) -> __m256 {
    // Delegates to the `vrcpps` intrinsic; the hardware approximation is not
    // expressible with generic simd operations.
    unsafe { vrcpps(a) }
}
2007
/// Computes the approximate reciprocal square root of packed single-precision
/// (32-bit) floating-point elements in `a`, and returns the results.
/// The maximum relative error for this approximation is less than 1.5*2^-12.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rsqrt_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vrsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_rsqrt_ps(a: __m256) -> __m256 {
    // Delegates to the `vrsqrtps` intrinsic; the hardware approximation is
    // not expressible with generic simd operations.
    unsafe { vrsqrtps(a) }
}
2020
/// Unpacks and interleave double-precision (64-bit) floating-point elements
/// from the high half of each 128-bit lane in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vunpckhpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d {
    // Indices 0..=3 select from `a`, 4..=7 from `b`.
    // Result lanes: [a1, b1, a3, b3] — high element of each 128-bit lane,
    // interleaved.
    unsafe { simd_shuffle!(a, b, [1, 5, 3, 7]) }
}
2033
/// Unpacks and interleave single-precision (32-bit) floating-point elements
/// from the high half of each 128-bit lane in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vunpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 {
    // Indices 0..=7 select from `a`, 8..=15 from `b`.
    // Result lanes: [a2, b2, a3, b3, a6, b6, a7, b7] — high pair of each
    // 128-bit lane, interleaved.
    unsafe { simd_shuffle!(a, b, [2, 10, 3, 11, 6, 14, 7, 15]) }
}
2046
/// Unpacks and interleave double-precision (64-bit) floating-point elements
/// from the low half of each 128-bit lane in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vunpcklpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d {
    // Indices 0..=3 select from `a`, 4..=7 from `b`.
    // Result lanes: [a0, b0, a2, b2] — low element of each 128-bit lane,
    // interleaved.
    unsafe { simd_shuffle!(a, b, [0, 4, 2, 6]) }
}
2059
/// Unpacks and interleave single-precision (32-bit) floating-point elements
/// from the low half of each 128-bit lane in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vunpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 {
    // Indices 0..=7 select from `a`, 8..=15 from `b`.
    // Result lanes: [a0, b0, a1, b1, a4, b4, a5, b5] — low pair of each
    // 128-bit lane, interleaved.
    unsafe { simd_shuffle!(a, b, [0, 8, 1, 9, 4, 12, 5, 13]) }
}
2072
/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
/// the result is zero, otherwise set `CF` to 0. Return the `ZF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 {
    unsafe {
        // ZF = ((a & b) == 0): reduce-OR over the 64-bit lanes of the AND
        // detects whether any bit survived.
        let r = simd_and(a.as_i64x4(), b.as_i64x4());
        (0i64 == simd_reduce_or(r)) as i32
    }
}
2090
/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
/// the result is zero, otherwise set `CF` to 0. Return the `CF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 {
    unsafe {
        // CF = ((!a & b) == 0): `!a` is formed by XOR with all-ones, then
        // a reduce-OR over the AND with `b` detects any surviving bit.
        let r = simd_and(simd_xor(a.as_i64x4(), i64x4::splat(!0)), b.as_i64x4());
        (0i64 == simd_reduce_or(r)) as i32
    }
}
2108
/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
/// the result is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and
/// `CF` values are zero, otherwise return 0.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 {
    // Delegates to the dedicated `ptestnzc` intrinsic, which combines the
    // ZF and CF conditions in one `vptest`.
    unsafe { ptestnzc256(a.as_i64x4(), b.as_i64x4()) }
}
2123
2124/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
2125/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2126/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
2127/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2128/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2129/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2130/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
2131///
2132/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 {
    // Delegates to the `vtestpd` intrinsic binding (declared elsewhere in
    // this module); returns the ZF value computed from the lane sign bits.
    unsafe { vtestzpd256(a, b) }
}
2140
2141/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
2142/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2143/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
2144/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2145/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2146/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2147/// is zero, otherwise set `CF` to 0. Return the `CF` value.
2148///
2149/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 {
    // Delegates to the `vtestpd` intrinsic binding (declared elsewhere in
    // this module); returns the CF value computed from ((NOT a) AND b).
    unsafe { vtestcpd256(a, b) }
}
2157
2158/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
2159/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2160/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
2161/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2162/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2163/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2164/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
2165/// are zero, otherwise return 0.
2166///
2167/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 {
    // Delegates to the `vtestpd` intrinsic binding (declared elsewhere in
    // this module); returns 1 only when both ZF and CF are clear.
    unsafe { vtestnzcpd256(a, b) }
}
2175
2176/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
2177/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2178/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
2179/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2180/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2181/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2182/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
2183///
2184/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 {
    unsafe {
        // ZF: signed `< 0` on each 64-bit lane of (a AND b) yields an
        // all-ones lane exactly where the sign bit is set; the OR-reduction
        // is zero only when no lane had its sign bit set.
        let r: i64x2 = simd_lt(transmute(_mm_and_pd(a, b)), i64x2::ZERO);
        (0i64 == simd_reduce_or(r)) as i32
    }
}
2196
2197/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
2198/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2199/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
2200/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2201/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2202/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2203/// is zero, otherwise set `CF` to 0. Return the `CF` value.
2204///
2205/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 {
    unsafe {
        // CF: same sign-bit reduction as `_mm_testz_pd`, but over
        // ((NOT a) AND b) via `_mm_andnot_pd`.
        let r: i64x2 = simd_lt(transmute(_mm_andnot_pd(a, b)), i64x2::ZERO);
        (0i64 == simd_reduce_or(r)) as i32
    }
}
2217
2218/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
2219/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2220/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
2221/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2222/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2223/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2224/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
2225/// are zero, otherwise return 0.
2226///
2227/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 {
    // Delegates to the 128-bit `vtestpd` intrinsic binding (declared
    // elsewhere in this module); returns 1 only when both ZF and CF are clear.
    unsafe { vtestnzcpd(a, b) }
}
2235
2236/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
2237/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2238/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
2239/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2240/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2241/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2242/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
2243///
2244/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 {
    // Delegates to the `vtestps` intrinsic binding (declared elsewhere in
    // this module); returns the ZF value computed from the lane sign bits.
    unsafe { vtestzps256(a, b) }
}
2252
2253/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
2254/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2255/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
2256/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2257/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2258/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2259/// is zero, otherwise set `CF` to 0. Return the `CF` value.
2260///
2261/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 {
    // Delegates to the `vtestps` intrinsic binding (declared elsewhere in
    // this module); returns the CF value computed from ((NOT a) AND b).
    unsafe { vtestcps256(a, b) }
}
2269
2270/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
2271/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2272/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
2273/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2274/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2275/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2276/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
2277/// are zero, otherwise return 0.
2278///
2279/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 {
    // Delegates to the `vtestps` intrinsic binding (declared elsewhere in
    // this module); returns 1 only when both ZF and CF are clear.
    unsafe { vtestnzcps256(a, b) }
}
2287
2288/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
2289/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2290/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
2291/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2292/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2293/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2294/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
2295///
2296/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_testz_ps(a: __m128, b: __m128) -> i32 {
    unsafe {
        // ZF: signed `< 0` on each 32-bit lane of (a AND b) marks lanes
        // whose sign bit is set; the OR-reduction is zero only when none are.
        let r: i32x4 = simd_lt(transmute(_mm_and_ps(a, b)), i32x4::ZERO);
        (0i32 == simd_reduce_or(r)) as i32
    }
}
2308
2309/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
2310/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2311/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
2312/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2313/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2314/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2315/// is zero, otherwise set `CF` to 0. Return the `CF` value.
2316///
2317/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_testc_ps(a: __m128, b: __m128) -> i32 {
    unsafe {
        // CF: same sign-bit reduction as `_mm_testz_ps`, but over
        // ((NOT a) AND b) via `_mm_andnot_ps`.
        let r: i32x4 = simd_lt(transmute(_mm_andnot_ps(a, b)), i32x4::ZERO);
        (0i32 == simd_reduce_or(r)) as i32
    }
}
2329
2330/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
2331/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2332/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
2333/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2334/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2335/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2336/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
2337/// are zero, otherwise return 0.
2338///
2339/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 {
    // Delegates to the 128-bit `vtestps` intrinsic binding (declared
    // elsewhere in this module); returns 1 only when both ZF and CF are clear.
    unsafe { vtestnzcps(a, b) }
}
2347
2348/// Sets each bit of the returned mask based on the most significant bit of the
2349/// corresponding packed double-precision (64-bit) floating-point element in
2350/// `a`.
2351///
2352/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovmskpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_movemask_pd(a: __m256d) -> i32 {
    // Propagate the highest bit to the rest, because simd_bitmask
    // requires all-1 or all-0.
    unsafe {
        // Signed `< 0` yields an all-ones i64 lane exactly where the sign
        // bit is set; simd_bitmask then packs one bit per lane (4 bits)
        // into the low bits of a u8.
        let mask: i64x4 = simd_lt(transmute(a), i64x4::ZERO);
        simd_bitmask::<i64x4, u8>(mask) as i32
    }
}
2366
2367/// Sets each bit of the returned mask based on the most significant bit of the
2368/// corresponding packed single-precision (32-bit) floating-point element in
2369/// `a`.
2370///
2371/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovmskps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_movemask_ps(a: __m256) -> i32 {
    // Propagate the highest bit to the rest, because simd_bitmask
    // requires all-1 or all-0.
    unsafe {
        // Signed `< 0` yields an all-ones i32 lane exactly where the sign
        // bit is set; simd_bitmask then packs one bit per lane (8 bits)
        // into a u8.
        let mask: i32x8 = simd_lt(transmute(a), i32x8::ZERO);
        simd_bitmask::<i32x8, u8>(mask) as i32
    }
}
2385
2386/// Returns vector of type __m256d with all elements set to zero.
2387///
2388/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setzero_pd() -> __m256d {
    // All-zero bytes are a valid value for this vector type; the `const`
    // block makes the zeroing happen at compile time.
    const { unsafe { mem::zeroed() } }
}
2397
2398/// Returns vector of type __m256 with all elements set to zero.
2399///
2400/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setzero_ps() -> __m256 {
    // All-zero bytes are a valid value for this vector type; the `const`
    // block makes the zeroing happen at compile time.
    const { unsafe { mem::zeroed() } }
}
2409
2410/// Returns vector of type __m256i with all elements set to zero.
2411///
2412/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxor))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setzero_si256() -> __m256i {
    // All-zero bytes are a valid value for this vector type; the `const`
    // block makes the zeroing happen at compile time.
    const { unsafe { mem::zeroed() } }
}
2421
2422/// Sets packed double-precision (64-bit) floating-point elements in returned
2423/// vector with the supplied values.
2424///
2425/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_pd)
2426#[inline]
2427#[target_feature(enable = "avx")]
2428// This intrinsic has no corresponding instruction.
2429#[cfg_attr(test, assert_instr(vinsertf128))]
2430#[stable(feature = "simd_x86", since = "1.27.0")]
2431#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2432pub const fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
2433    _mm256_setr_pd(d, c, b, a)
2434}
2435
2436/// Sets packed single-precision (32-bit) floating-point elements in returned
2437/// vector with the supplied values.
2438///
2439/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_ps)
2440#[inline]
2441#[target_feature(enable = "avx")]
2442// This intrinsic has no corresponding instruction.
2443#[stable(feature = "simd_x86", since = "1.27.0")]
2444#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2445pub const fn _mm256_set_ps(
2446    a: f32,
2447    b: f32,
2448    c: f32,
2449    d: f32,
2450    e: f32,
2451    f: f32,
2452    g: f32,
2453    h: f32,
2454) -> __m256 {
2455    _mm256_setr_ps(h, g, f, e, d, c, b, a)
2456}
2457
2458/// Sets packed 8-bit integers in returned vector with the supplied values.
2459///
2460/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi8)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set_epi8(
    e00: i8,
    e01: i8,
    e02: i8,
    e03: i8,
    e04: i8,
    e05: i8,
    e06: i8,
    e07: i8,
    e08: i8,
    e09: i8,
    e10: i8,
    e11: i8,
    e12: i8,
    e13: i8,
    e14: i8,
    e15: i8,
    e16: i8,
    e17: i8,
    e18: i8,
    e19: i8,
    e20: i8,
    e21: i8,
    e22: i8,
    e23: i8,
    e24: i8,
    e25: i8,
    e26: i8,
    e27: i8,
    e28: i8,
    e29: i8,
    e30: i8,
    e31: i8,
) -> __m256i {
    // Arguments are supplied high-to-low; reverse them and delegate to the
    // low-to-high `setr` variant.
    #[rustfmt::skip]
    _mm256_setr_epi8(
        e31, e30, e29, e28, e27, e26, e25, e24,
        e23, e22, e21, e20, e19, e18, e17, e16,
        e15, e14, e13, e12, e11, e10, e09, e08,
        e07, e06, e05, e04, e03, e02, e01, e00,
    )
}
2508
2509/// Sets packed 16-bit integers in returned vector with the supplied values.
2510///
2511/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi16)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set_epi16(
    e00: i16,
    e01: i16,
    e02: i16,
    e03: i16,
    e04: i16,
    e05: i16,
    e06: i16,
    e07: i16,
    e08: i16,
    e09: i16,
    e10: i16,
    e11: i16,
    e12: i16,
    e13: i16,
    e14: i16,
    e15: i16,
) -> __m256i {
    // Arguments are supplied high-to-low; reverse them and delegate to the
    // low-to-high `setr` variant.
    #[rustfmt::skip]
    _mm256_setr_epi16(
        e15, e14, e13, e12,
        e11, e10, e09, e08,
        e07, e06, e05, e04,
        e03, e02, e01, e00,
    )
}
2543
2544/// Sets packed 32-bit integers in returned vector with the supplied values.
2545///
2546/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi32)
2547#[inline]
2548#[target_feature(enable = "avx")]
2549// This intrinsic has no corresponding instruction.
2550#[stable(feature = "simd_x86", since = "1.27.0")]
2551#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2552pub const fn _mm256_set_epi32(
2553    e0: i32,
2554    e1: i32,
2555    e2: i32,
2556    e3: i32,
2557    e4: i32,
2558    e5: i32,
2559    e6: i32,
2560    e7: i32,
2561) -> __m256i {
2562    _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0)
2563}
2564
2565/// Sets packed 64-bit integers in returned vector with the supplied values.
2566///
2567/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi64x)
2568#[inline]
2569#[target_feature(enable = "avx")]
2570// This intrinsic has no corresponding instruction.
2571#[stable(feature = "simd_x86", since = "1.27.0")]
2572#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2573pub const fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
2574    _mm256_setr_epi64x(d, c, b, a)
2575}
2576
2577/// Sets packed double-precision (64-bit) floating-point elements in returned
2578/// vector with the supplied values in reverse order.
2579///
2580/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_pd)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
    // Low-to-high construction: `a` is element 0, `d` is element 3.
    __m256d([a, b, c, d])
}
2589
2590/// Sets packed single-precision (32-bit) floating-point elements in returned
2591/// vector with the supplied values in reverse order.
2592///
2593/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_ps)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setr_ps(
    a: f32,
    b: f32,
    c: f32,
    d: f32,
    e: f32,
    f: f32,
    g: f32,
    h: f32,
) -> __m256 {
    // Low-to-high construction: `a` is element 0, `h` is element 7.
    __m256([a, b, c, d, e, f, g, h])
}
2611
2612/// Sets packed 8-bit integers in returned vector with the supplied values in
2613/// reverse order.
2614///
2615/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi8)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setr_epi8(
    e00: i8,
    e01: i8,
    e02: i8,
    e03: i8,
    e04: i8,
    e05: i8,
    e06: i8,
    e07: i8,
    e08: i8,
    e09: i8,
    e10: i8,
    e11: i8,
    e12: i8,
    e13: i8,
    e14: i8,
    e15: i8,
    e16: i8,
    e17: i8,
    e18: i8,
    e19: i8,
    e20: i8,
    e21: i8,
    e22: i8,
    e23: i8,
    e24: i8,
    e25: i8,
    e26: i8,
    e27: i8,
    e28: i8,
    e29: i8,
    e30: i8,
    e31: i8,
) -> __m256i {
    unsafe {
        // Low-to-high construction of the lane vector, then a bit-level
        // reinterpretation into the opaque __m256i wrapper.
        #[rustfmt::skip]
        transmute(i8x32::new(
            e00, e01, e02, e03, e04, e05, e06, e07,
            e08, e09, e10, e11, e12, e13, e14, e15,
            e16, e17, e18, e19, e20, e21, e22, e23,
            e24, e25, e26, e27, e28, e29, e30, e31,
        ))
    }
}
2665
2666/// Sets packed 16-bit integers in returned vector with the supplied values in
2667/// reverse order.
2668///
2669/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi16)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setr_epi16(
    e00: i16,
    e01: i16,
    e02: i16,
    e03: i16,
    e04: i16,
    e05: i16,
    e06: i16,
    e07: i16,
    e08: i16,
    e09: i16,
    e10: i16,
    e11: i16,
    e12: i16,
    e13: i16,
    e14: i16,
    e15: i16,
) -> __m256i {
    unsafe {
        // Low-to-high construction of the lane vector, then a bit-level
        // reinterpretation into the opaque __m256i wrapper.
        #[rustfmt::skip]
        transmute(i16x16::new(
            e00, e01, e02, e03,
            e04, e05, e06, e07,
            e08, e09, e10, e11,
            e12, e13, e14, e15,
        ))
    }
}
2703
2704/// Sets packed 32-bit integers in returned vector with the supplied values in
2705/// reverse order.
2706///
2707/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi32)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setr_epi32(
    e0: i32,
    e1: i32,
    e2: i32,
    e3: i32,
    e4: i32,
    e5: i32,
    e6: i32,
    e7: i32,
) -> __m256i {
    // Low-to-high construction, reinterpreted into the opaque wrapper.
    unsafe { transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) }
}
2725
2726/// Sets packed 64-bit integers in returned vector with the supplied values in
2727/// reverse order.
2728///
2729/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi64x)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
    // Low-to-high construction, reinterpreted into the opaque wrapper.
    unsafe { transmute(i64x4::new(a, b, c, d)) }
}
2738
2739/// Broadcasts double-precision (64-bit) floating-point value `a` to all
2740/// elements of returned vector.
2741///
2742/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_pd)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set1_pd(a: f64) -> __m256d {
    // Replicate `a` into all 4 lanes, then wrap as __m256d.
    f64x4::splat(a).as_m256d()
}
2751
2752/// Broadcasts single-precision (32-bit) floating-point value `a` to all
2753/// elements of returned vector.
2754///
2755/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_ps)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set1_ps(a: f32) -> __m256 {
    // Replicate `a` into all 8 lanes, then wrap as __m256.
    f32x8::splat(a).as_m256()
}
2764
2765/// Broadcasts 8-bit integer `a` to all elements of returned vector.
2766/// This intrinsic may generate the `vpbroadcastb`.
2767///
2768/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi8)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set1_epi8(a: i8) -> __m256i {
    // Replicate `a` into all 32 lanes, then wrap as __m256i.
    i8x32::splat(a).as_m256i()
}
2777
2778/// Broadcasts 16-bit integer `a` to all elements of returned vector.
2779/// This intrinsic may generate the `vpbroadcastw`.
2780///
2781/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16)
#[inline]
#[target_feature(enable = "avx")]
//#[cfg_attr(test, assert_instr(vpshufb))]
#[cfg_attr(test, assert_instr(vinsertf128))]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set1_epi16(a: i16) -> __m256i {
    // Replicate `a` into all 16 lanes, then wrap as __m256i.
    i16x16::splat(a).as_m256i()
}
2792
2793/// Broadcasts 32-bit integer `a` to all elements of returned vector.
2794/// This intrinsic may generate the `vpbroadcastd`.
2795///
2796/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi32)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set1_epi32(a: i32) -> __m256i {
    // Replicate `a` into all 8 lanes, then wrap as __m256i.
    i32x8::splat(a).as_m256i()
}
2805
2806/// Broadcasts 64-bit integer `a` to all elements of returned vector.
2807/// This intrinsic may generate the `vpbroadcastq`.
2808///
2809/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi64x)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(vinsertf128))]
#[cfg_attr(all(test, target_arch = "x86"), assert_instr(vbroadcastsd))]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set1_epi64x(a: i64) -> __m256i {
    // Replicate `a` into all 4 lanes, then wrap as __m256i.
    i64x4::splat(a).as_m256i()
}
2820
2821/// Cast vector of type __m256d to type __m256.
2822///
2823/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_ps)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castpd_ps(a: __m256d) -> __m256 {
    // Bit-level reinterpretation only; no value conversion is performed.
    unsafe { transmute(a) }
}
2833
2834/// Cast vector of type __m256 to type __m256d.
2835///
2836/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_pd)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castps_pd(a: __m256) -> __m256d {
    // Bit-level reinterpretation only; no value conversion is performed.
    unsafe { transmute(a) }
}
2846
2847/// Casts vector of type __m256 to type __m256i.
2848///
2849/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_si256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castps_si256(a: __m256) -> __m256i {
    // Bit-level reinterpretation only; no value conversion is performed.
    unsafe { transmute(a) }
}
2859
2860/// Casts vector of type __m256i to type __m256.
2861///
2862/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_ps)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castsi256_ps(a: __m256i) -> __m256 {
    // Bit-level reinterpretation only; no value conversion is performed.
    unsafe { transmute(a) }
}
2872
2873/// Casts vector of type __m256d to type __m256i.
2874///
2875/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_si256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castpd_si256(a: __m256d) -> __m256i {
    // Bit-level reinterpretation only; no value conversion is performed.
    unsafe { transmute(a) }
}
2885
/// Casts vector of type __m256i to type __m256d.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_pd)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castsi256_pd(a: __m256i) -> __m256d {
    // SAFETY: `__m256i` and `__m256d` are both 256-bit vector types for which
    // every bit pattern is valid, so reinterpreting the bits is sound.
    unsafe { transmute(a) }
}
2898
/// Casts vector of type __m256 to type __m128.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps256_ps128)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castps256_ps128(a: __m256) -> __m128 {
    // Lanes 0-3 select the low 128 bits of `a`; the upper half is discarded.
    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) }
}
2911
/// Casts vector of type __m256d to type __m128d.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd256_pd128)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castpd256_pd128(a: __m256d) -> __m128d {
    // Lanes 0-1 select the low 128 bits of `a`; the upper half is discarded.
    unsafe { simd_shuffle!(a, a, [0, 1]) }
}
2924
/// Casts vector of type __m256i to type __m128i.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_si128)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
    unsafe {
        // View the opaque integer vector as four i64 lanes so it can be
        // shuffled; lanes 0-1 are the low 128 bits, the upper half is dropped.
        let a = a.as_i64x4();
        let dst: i64x2 = simd_shuffle!(a, a, [0, 1]);
        transmute(dst)
    }
}
2941
/// Casts vector of type __m128 to type __m256;
/// the upper 128 bits of the result are indeterminate.
///
/// In the Intel documentation, the upper bits are declared to be "undefined".
/// This is not equivalent to [`mem::MaybeUninit`]; instead, these bits are non-deterministically
/// set to some valid value. In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps128_ps256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castps128_ps256(a: __m128) -> __m256 {
    // Indices 0-3 take `a` for the low half; index 4 (a lane of the
    // "undefined" second operand) fills the indeterminate upper half.
    unsafe { simd_shuffle!(a, _mm_undefined_ps(), [0, 1, 2, 3, 4, 4, 4, 4]) }
}
2959
/// Casts vector of type __m128d to type __m256d;
/// the upper 128 bits of the result are indeterminate.
///
/// In the Intel documentation, the upper bits are declared to be "undefined".
/// This is not equivalent to [`mem::MaybeUninit`]; instead, these bits are non-deterministically
/// set to some valid value. In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd128_pd256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castpd128_pd256(a: __m128d) -> __m256d {
    // Indices 0-1 take `a` for the low half; index 2 (a lane of the
    // "undefined" second operand) fills the indeterminate upper half.
    unsafe { simd_shuffle!(a, _mm_undefined_pd(), [0, 1, 2, 2]) }
}
2977
/// Casts vector of type __m128i to type __m256i;
/// the upper 128 bits of the result are indeterminate.
///
/// In the Intel documentation, the upper bits are declared to be "undefined".
/// This is not equivalent to [`mem::MaybeUninit`]; instead, these bits are non-deterministically
/// set to some valid value. In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi128_si256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_i64x2();
        // All-zero is used here as the concrete choice for the "undefined"
        // upper half (indices 2-3 select lanes of `undefined`).
        let undefined = i64x2::ZERO;
        let dst: i64x4 = simd_shuffle!(a, undefined, [0, 1, 2, 2]);
        transmute(dst)
    }
}
3000
/// Constructs a 256-bit floating-point vector of `[8 x float]` from a
/// 128-bit floating-point vector of `[4 x float]`. The lower 128 bits contain
/// the value of the source vector. The upper 128 bits are set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextps128_ps256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
    // Indices 0-3 take `a`; indices 4-7 take lanes of the zero vector,
    // guaranteeing (unlike the `cast` variant) that the upper half is zero.
    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7]) }
}
3015
/// Constructs a 256-bit integer vector from a 128-bit integer vector.
/// The lower 128 bits contain the value of the source vector. The upper
/// 128 bits are set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextsi128_si256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
    unsafe {
        // Indices 0-1 take `a`; indices 2-3 take lanes of the zero vector,
        // guaranteeing (unlike the `cast` variant) that the upper half is zero.
        let b = i64x2::ZERO;
        let dst: i64x4 = simd_shuffle!(a.as_i64x2(), b, [0, 1, 2, 3]);
        transmute(dst)
    }
}
3034
/// Constructs a 256-bit floating-point vector of `[4 x double]` from a
/// 128-bit floating-point vector of `[2 x double]`. The lower 128 bits
/// contain the value of the source vector. The upper 128 bits are set
/// to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextpd128_pd256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
    // Indices 0-1 take `a`; indices 2-3 take lanes of the zero vector,
    // guaranteeing (unlike the `cast` variant) that the upper half is zero.
    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0, 1, 2, 3]) }
}
3050
/// Returns vector of type `__m256` with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
/// In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_ps)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_undefined_ps() -> __m256 {
    // An all-zero vector is the concrete "undefined" value chosen here; the
    // `const` block evaluates it at compile time.
    const { unsafe { mem::zeroed() } }
}
3065
/// Returns vector of type `__m256d` with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
/// In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_pd)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_undefined_pd() -> __m256d {
    // An all-zero vector is the concrete "undefined" value chosen here; the
    // `const` block evaluates it at compile time.
    const { unsafe { mem::zeroed() } }
}
3080
/// Returns vector of type __m256i with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
/// In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_si256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_undefined_si256() -> __m256i {
    // An all-zero vector is the concrete "undefined" value chosen here; the
    // `const` block evaluates it at compile time.
    const { unsafe { mem::zeroed() } }
}
3095
/// Sets packed __m256 returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 {
    // Indices 0-3 place `lo` in the low 128 bits and indices 4-7 place `hi`
    // in the high 128 bits; this pattern lowers to `vinsertf128`.
    unsafe { simd_shuffle!(lo, hi, [0, 1, 2, 3, 4, 5, 6, 7]) }
}
3107
/// Sets packed __m256d returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128d)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d {
    unsafe {
        // SAFETY: `__m128d` and `__m128` are both 128-bit vector types, so the
        // halves can be reinterpreted, combined via `_mm256_set_m128`, and the
        // 256-bit result reinterpreted back.
        let hi: __m128 = transmute(hi);
        let lo: __m128 = transmute(lo);
        transmute(_mm256_set_m128(hi, lo))
    }
}
3123
/// Sets packed __m256i returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128i)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i {
    unsafe {
        // SAFETY: `__m128i` and `__m128` are both 128-bit vector types, so the
        // halves can be reinterpreted, combined via `_mm256_set_m128`, and the
        // 256-bit result reinterpreted back.
        let hi: __m128 = transmute(hi);
        let lo: __m128 = transmute(lo);
        transmute(_mm256_set_m128(hi, lo))
    }
}
3139
/// Sets packed __m256 returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 {
    // Identical to `_mm256_set_m128` with the argument order reversed.
    _mm256_set_m128(hi, lo)
}
3151
/// Sets packed __m256d returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128d)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d {
    // Identical to `_mm256_set_m128d` with the argument order reversed.
    _mm256_set_m128d(hi, lo)
}
3163
/// Sets packed __m256i returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128i)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i {
    // Identical to `_mm256_set_m128i` with the argument order reversed.
    _mm256_set_m128i(hi, lo)
}
3175
/// Loads two 128-bit values (composed of 4 packed single-precision (32-bit)
/// floating-point elements) from memory, and combine them into a 256-bit
/// value.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
///
/// # Safety
///
/// `hiaddr` and `loaddr` must each be valid for an unaligned 128-bit read.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_loadu2_m128(hiaddr: *const f32, loaddr: *const f32) -> __m256 {
    // Unaligned-load the low half, then insert the high half into lane 1.
    let a = _mm256_castps128_ps256(_mm_loadu_ps(loaddr));
    _mm256_insertf128_ps::<1>(a, _mm_loadu_ps(hiaddr))
}
3191
/// Loads two 128-bit values (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from memory, and combine them into a 256-bit
/// value.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
///
/// # Safety
///
/// `hiaddr` and `loaddr` must each be valid for an unaligned 128-bit read.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128d)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> __m256d {
    // Unaligned-load the low half, then insert the high half into lane 1.
    let a = _mm256_castpd128_pd256(_mm_loadu_pd(loaddr));
    _mm256_insertf128_pd::<1>(a, _mm_loadu_pd(hiaddr))
}
3207
/// Loads two 128-bit values (composed of integer data) from memory, and combine
/// them into a 256-bit value.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
///
/// # Safety
///
/// `hiaddr` and `loaddr` must each be valid for an unaligned 128-bit read.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128i)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i) -> __m256i {
    // Unaligned-load the low half, then insert the high half into lane 1.
    let a = _mm256_castsi128_si256(_mm_loadu_si128(loaddr));
    _mm256_insertf128_si256::<1>(a, _mm_loadu_si128(hiaddr))
}
3222
/// Stores the high and low 128-bit halves (each composed of 4 packed
/// single-precision (32-bit) floating-point elements) from `a` into memory two
/// different 128-bit locations.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
///
/// # Safety
///
/// `hiaddr` and `loaddr` must each be valid for an unaligned 128-bit write.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: __m256) {
    // Low half via a no-op cast, high half via a lane-1 extract.
    let lo = _mm256_castps256_ps128(a);
    _mm_storeu_ps(loaddr, lo);
    let hi = _mm256_extractf128_ps::<1>(a);
    _mm_storeu_ps(hiaddr, hi);
}
3240
/// Stores the high and low 128-bit halves (each composed of 2 packed
/// double-precision (64-bit) floating-point elements) from `a` into memory two
/// different 128-bit locations.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
///
/// # Safety
///
/// `hiaddr` and `loaddr` must each be valid for an unaligned 128-bit write.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128d)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: __m256d) {
    // Low half via a no-op cast, high half via a lane-1 extract.
    let lo = _mm256_castpd256_pd128(a);
    _mm_storeu_pd(loaddr, lo);
    let hi = _mm256_extractf128_pd::<1>(a);
    _mm_storeu_pd(hiaddr, hi);
}
3258
/// Stores the high and low 128-bit halves (each composed of integer data) from
/// `a` into memory two different 128-bit locations.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
///
/// # Safety
///
/// `hiaddr` and `loaddr` must each be valid for an unaligned 128-bit write.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128i)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_storeu2_m128i(hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i) {
    // Low half via a no-op cast, high half via a lane-1 extract.
    let lo = _mm256_castsi256_si128(a);
    _mm_storeu_si128(loaddr, lo);
    let hi = _mm256_extractf128_si256::<1>(a);
    _mm_storeu_si128(hiaddr, hi);
}
3275
/// Returns the first element of the input vector of `[8 x float]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtss_f32)
#[inline]
#[target_feature(enable = "avx")]
//#[cfg_attr(test, assert_instr(movss))] FIXME
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtss_f32(a: __m256) -> f32 {
    // Extract lane 0 (the lowest 32 bits) of the vector.
    unsafe { simd_extract!(a, 0) }
}
3287
// LLVM intrinsics used in the above functions
// Each declaration is bound to an LLVM builtin via `link_name`; the Rust
// signatures must stay ABI-compatible with LLVM's definitions, so do not
// change parameter or return types here.
#[allow(improper_ctypes)]
unsafe extern "C" {
    #[link_name = "llvm.x86.avx.round.pd.256"]
    fn roundpd256(a: __m256d, b: i32) -> __m256d;
    #[link_name = "llvm.x86.avx.round.ps.256"]
    fn roundps256(a: __m256, b: i32) -> __m256;
    #[link_name = "llvm.x86.avx.dp.ps.256"]
    fn vdpps(a: __m256, b: __m256, imm8: i8) -> __m256;
    #[link_name = "llvm.x86.sse2.cmp.pd"]
    fn vcmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
    #[link_name = "llvm.x86.avx.cmp.pd.256"]
    fn vcmppd256(a: __m256d, b: __m256d, imm8: u8) -> __m256d;
    #[link_name = "llvm.x86.sse.cmp.ps"]
    fn vcmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
    #[link_name = "llvm.x86.avx.cmp.ps.256"]
    fn vcmpps256(a: __m256, b: __m256, imm8: u8) -> __m256;
    #[link_name = "llvm.x86.sse2.cmp.sd"]
    fn vcmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
    #[link_name = "llvm.x86.sse.cmp.ss"]
    fn vcmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
    #[link_name = "llvm.x86.avx.cvt.ps2dq.256"]
    fn vcvtps2dq(a: __m256) -> i32x8;
    #[link_name = "llvm.x86.avx.cvtt.pd2dq.256"]
    fn vcvttpd2dq(a: __m256d) -> i32x4;
    #[link_name = "llvm.x86.avx.cvt.pd2dq.256"]
    fn vcvtpd2dq(a: __m256d) -> i32x4;
    #[link_name = "llvm.x86.avx.cvtt.ps2dq.256"]
    fn vcvttps2dq(a: __m256) -> i32x8;
    #[link_name = "llvm.x86.avx.vzeroall"]
    fn vzeroall();
    #[link_name = "llvm.x86.avx.vzeroupper"]
    fn vzeroupper();
    #[link_name = "llvm.x86.avx.vpermilvar.ps.256"]
    fn vpermilps256(a: __m256, b: i32x8) -> __m256;
    #[link_name = "llvm.x86.avx.vpermilvar.ps"]
    fn vpermilps(a: __m128, b: i32x4) -> __m128;
    #[link_name = "llvm.x86.avx.vpermilvar.pd.256"]
    fn vpermilpd256(a: __m256d, b: i64x4) -> __m256d;
    #[link_name = "llvm.x86.avx.vpermilvar.pd"]
    fn vpermilpd(a: __m128d, b: i64x2) -> __m128d;
    #[link_name = "llvm.x86.avx.ldu.dq.256"]
    fn vlddqu(mem_addr: *const i8) -> i8x32;
    #[link_name = "llvm.x86.avx.rcp.ps.256"]
    fn vrcpps(a: __m256) -> __m256;
    #[link_name = "llvm.x86.avx.rsqrt.ps.256"]
    fn vrsqrtps(a: __m256) -> __m256;
    #[link_name = "llvm.x86.avx.ptestnzc.256"]
    fn ptestnzc256(a: i64x4, b: i64x4) -> i32;
    #[link_name = "llvm.x86.avx.vtestz.pd.256"]
    fn vtestzpd256(a: __m256d, b: __m256d) -> i32;
    #[link_name = "llvm.x86.avx.vtestc.pd.256"]
    fn vtestcpd256(a: __m256d, b: __m256d) -> i32;
    #[link_name = "llvm.x86.avx.vtestnzc.pd.256"]
    fn vtestnzcpd256(a: __m256d, b: __m256d) -> i32;
    #[link_name = "llvm.x86.avx.vtestnzc.pd"]
    fn vtestnzcpd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.avx.vtestz.ps.256"]
    fn vtestzps256(a: __m256, b: __m256) -> i32;
    #[link_name = "llvm.x86.avx.vtestc.ps.256"]
    fn vtestcps256(a: __m256, b: __m256) -> i32;
    #[link_name = "llvm.x86.avx.vtestnzc.ps.256"]
    fn vtestnzcps256(a: __m256, b: __m256) -> i32;
    #[link_name = "llvm.x86.avx.vtestnzc.ps"]
    fn vtestnzcps(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.avx.min.ps.256"]
    fn vminps(a: __m256, b: __m256) -> __m256;
    #[link_name = "llvm.x86.avx.max.ps.256"]
    fn vmaxps(a: __m256, b: __m256) -> __m256;
    #[link_name = "llvm.x86.avx.min.pd.256"]
    fn vminpd(a: __m256d, b: __m256d) -> __m256d;
    #[link_name = "llvm.x86.avx.max.pd.256"]
    fn vmaxpd(a: __m256d, b: __m256d) -> __m256d;
}
3362
3363#[cfg(test)]
3364mod tests {
3365    use crate::core_arch::assert_eq_const as assert_eq;
3366    use crate::hint::black_box;
3367    use crate::ptr;
3368    use stdarch_test::simd_test;
3369
3370    use crate::core_arch::x86::*;
3371
    #[simd_test(enable = "avx")]
    const fn test_mm256_add_pd() {
        // Element-wise f64 addition across all four lanes.
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_add_pd(a, b);
        let e = _mm256_setr_pd(6., 8., 10., 12.);
        assert_eq_m256d(r, e);
    }
3380
    #[simd_test(enable = "avx")]
    const fn test_mm256_add_ps() {
        // Element-wise f32 addition across all eight lanes.
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
        let r = _mm256_add_ps(a, b);
        let e = _mm256_setr_ps(10., 12., 14., 16., 18., 20., 22., 24.);
        assert_eq_m256(r, e);
    }
3389
    #[simd_test(enable = "avx")]
    const fn test_mm256_and_pd() {
        // Bitwise AND of the IEEE-754 encodings: the bits of 1.0 ANDed with
        // the bits of 0.6 give the bit pattern of 0.5.
        let a = _mm256_set1_pd(1.);
        let b = _mm256_set1_pd(0.6);
        let r = _mm256_and_pd(a, b);
        let e = _mm256_set1_pd(0.5);
        assert_eq_m256d(r, e);
    }
3398
    #[simd_test(enable = "avx")]
    const fn test_mm256_and_ps() {
        // Bitwise AND of the IEEE-754 encodings: the bits of 1.0 ANDed with
        // the bits of 0.6 give the bit pattern of 0.5.
        let a = _mm256_set1_ps(1.);
        let b = _mm256_set1_ps(0.6);
        let r = _mm256_and_ps(a, b);
        let e = _mm256_set1_ps(0.5);
        assert_eq_m256(r, e);
    }
3407
    #[simd_test(enable = "avx")]
    const fn test_mm256_or_pd() {
        // Bitwise OR of the IEEE-754 encodings: the bits of 1.0 ORed with the
        // bits of 0.6 give the bit pattern of 1.2.
        let a = _mm256_set1_pd(1.);
        let b = _mm256_set1_pd(0.6);
        let r = _mm256_or_pd(a, b);
        let e = _mm256_set1_pd(1.2);
        assert_eq_m256d(r, e);
    }
3416
    #[simd_test(enable = "avx")]
    const fn test_mm256_or_ps() {
        // Bitwise OR of the IEEE-754 encodings: the bits of 1.0 ORed with the
        // bits of 0.6 give the bit pattern of 1.2.
        let a = _mm256_set1_ps(1.);
        let b = _mm256_set1_ps(0.6);
        let r = _mm256_or_ps(a, b);
        let e = _mm256_set1_ps(1.2);
        assert_eq_m256(r, e);
    }
3425
    #[simd_test(enable = "avx")]
    const fn test_mm256_shuffle_pd() {
        // All mask bits set: each 128-bit lane takes the odd (upper) element
        // of `a` then of `b`, giving (a1, b1, a3, b3).
        let a = _mm256_setr_pd(1., 4., 5., 8.);
        let b = _mm256_setr_pd(2., 3., 6., 7.);
        let r = _mm256_shuffle_pd::<0b11_11_11_11>(a, b);
        let e = _mm256_setr_pd(4., 3., 8., 7.);
        assert_eq_m256d(r, e);
    }
3434
    #[simd_test(enable = "avx")]
    const fn test_mm256_shuffle_ps() {
        // Mask 0b00_00_11_11: within each 128-bit lane the result is
        // (a3, a3, b0, b0).
        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
        let r = _mm256_shuffle_ps::<0b00_00_11_11>(a, b);
        let e = _mm256_setr_ps(8., 8., 2., 2., 16., 16., 10., 10.);
        assert_eq_m256(r, e);
    }
3443
    #[simd_test(enable = "avx")]
    const fn test_mm256_andnot_pd() {
        // andnot computes `(!a) & b`; with `a` all-zero bits the result is `b`.
        let a = _mm256_set1_pd(0.);
        let b = _mm256_set1_pd(0.6);
        let r = _mm256_andnot_pd(a, b);
        assert_eq_m256d(r, b);
    }
3451
    #[simd_test(enable = "avx")]
    const fn test_mm256_andnot_ps() {
        // andnot computes `(!a) & b`; with `a` all-zero bits the result is `b`.
        let a = _mm256_set1_ps(0.);
        let b = _mm256_set1_ps(0.6);
        let r = _mm256_andnot_ps(a, b);
        assert_eq_m256(r, b);
    }
3459
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_max_pd() {
        // `vmaxpd` is not an IEEE max: for ±0.0 ties and NaN inputs it always
        // returns the second (source) operand, as verified below.
        let a = _mm256_setr_pd(1., 4., 5., 8.);
        let b = _mm256_setr_pd(2., 3., 6., 7.);
        let r = _mm256_max_pd(a, b);
        let e = _mm256_setr_pd(2., 4., 6., 8.);
        assert_eq_m256d(r, e);
        // > If the values being compared are both 0.0s (of either sign), the
        // > value in the second operand (source operand) is returned.
        let w = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
        let x = _mm256_max_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
        let wu: [u64; 4] = transmute(w);
        let xu: [u64; 4] = transmute(x);
        assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]);
        assert_eq!(xu, [0u64; 4]);
        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
        // > second operand (source operand), either a NaN or a valid
        // > floating-point value, is written to the result.
        let y = _mm256_max_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
        let z = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
        let yf: [f64; 4] = transmute(y);
        let zf: [f64; 4] = transmute(z);
        assert_eq!(yf, [0.0; 4]);
        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
    }
3485
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_max_ps() {
        // `vmaxps` is not an IEEE max: for ±0.0 ties and NaN inputs it always
        // returns the second (source) operand, as verified below.
        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
        let r = _mm256_max_ps(a, b);
        let e = _mm256_setr_ps(2., 4., 6., 8., 10., 12., 14., 16.);
        assert_eq_m256(r, e);
        // > If the values being compared are both 0.0s (of either sign), the
        // > value in the second operand (source operand) is returned.
        let w = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
        let x = _mm256_max_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
        let wu: [u32; 8] = transmute(w);
        let xu: [u32; 8] = transmute(x);
        assert_eq!(wu, [0x8000_0000u32; 8]);
        assert_eq!(xu, [0u32; 8]);
        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
        // > second operand (source operand), either a NaN or a valid
        // > floating-point value, is written to the result.
        let y = _mm256_max_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
        let z = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
        let yf: [f32; 8] = transmute(y);
        let zf: [f32; 8] = transmute(z);
        assert_eq!(yf, [0.0; 8]);
        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
    }
3511
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_min_pd() {
        // `vminpd` is not an IEEE min: for ±0.0 ties and NaN inputs it always
        // returns the second (source) operand, as verified below.
        let a = _mm256_setr_pd(1., 4., 5., 8.);
        let b = _mm256_setr_pd(2., 3., 6., 7.);
        let r = _mm256_min_pd(a, b);
        let e = _mm256_setr_pd(1., 3., 5., 7.);
        assert_eq_m256d(r, e);
        // > If the values being compared are both 0.0s (of either sign), the
        // > value in the second operand (source operand) is returned.
        let w = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
        let x = _mm256_min_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
        let wu: [u64; 4] = transmute(w);
        let xu: [u64; 4] = transmute(x);
        assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]);
        assert_eq!(xu, [0u64; 4]);
        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
        // > second operand (source operand), either a NaN or a valid
        // > floating-point value, is written to the result.
        let y = _mm256_min_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
        let z = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
        let yf: [f64; 4] = transmute(y);
        let zf: [f64; 4] = transmute(z);
        assert_eq!(yf, [0.0; 4]);
        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
    }
3537
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_min_ps() {
        // `vminps` is not an IEEE min: for ±0.0 ties and NaN inputs it always
        // returns the second (source) operand, as verified below.
        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
        let r = _mm256_min_ps(a, b);
        let e = _mm256_setr_ps(1., 3., 5., 7., 9., 11., 13., 15.);
        assert_eq_m256(r, e);
        // > If the values being compared are both 0.0s (of either sign), the
        // > value in the second operand (source operand) is returned.
        let w = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
        let x = _mm256_min_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
        let wu: [u32; 8] = transmute(w);
        let xu: [u32; 8] = transmute(x);
        assert_eq!(wu, [0x8000_0000u32; 8]);
        assert_eq!(xu, [0u32; 8]);
        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
        // > second operand (source operand), either a NaN or a valid
        // > floating-point value, is written to the result.
        let y = _mm256_min_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
        let z = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
        let yf: [f32; 8] = transmute(y);
        let zf: [f32; 8] = transmute(z);
        assert_eq!(yf, [0.0; 8]);
        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
    }
3563
3564    #[simd_test(enable = "avx")]
3565    const fn test_mm256_mul_pd() {
3566        let a = _mm256_setr_pd(1., 2., 3., 4.);
3567        let b = _mm256_setr_pd(5., 6., 7., 8.);
3568        let r = _mm256_mul_pd(a, b);
3569        let e = _mm256_setr_pd(5., 12., 21., 32.);
3570        assert_eq_m256d(r, e);
3571    }
3572
3573    #[simd_test(enable = "avx")]
3574    const fn test_mm256_mul_ps() {
3575        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
3576        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
3577        let r = _mm256_mul_ps(a, b);
3578        let e = _mm256_setr_ps(9., 20., 33., 48., 65., 84., 105., 128.);
3579        assert_eq_m256(r, e);
3580    }
3581
3582    #[simd_test(enable = "avx")]
3583    const fn test_mm256_addsub_pd() {
3584        let a = _mm256_setr_pd(1., 2., 3., 4.);
3585        let b = _mm256_setr_pd(5., 6., 7., 8.);
3586        let r = _mm256_addsub_pd(a, b);
3587        let e = _mm256_setr_pd(-4., 8., -4., 12.);
3588        assert_eq_m256d(r, e);
3589    }
3590
3591    #[simd_test(enable = "avx")]
3592    const fn test_mm256_addsub_ps() {
3593        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3594        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3595        let r = _mm256_addsub_ps(a, b);
3596        let e = _mm256_setr_ps(-4., 8., -4., 12., -4., 8., -4., 12.);
3597        assert_eq_m256(r, e);
3598    }
3599
3600    #[simd_test(enable = "avx")]
3601    const fn test_mm256_sub_pd() {
3602        let a = _mm256_setr_pd(1., 2., 3., 4.);
3603        let b = _mm256_setr_pd(5., 6., 7., 8.);
3604        let r = _mm256_sub_pd(a, b);
3605        let e = _mm256_setr_pd(-4., -4., -4., -4.);
3606        assert_eq_m256d(r, e);
3607    }
3608
3609    #[simd_test(enable = "avx")]
3610    const fn test_mm256_sub_ps() {
3611        let a = _mm256_setr_ps(1., 2., 3., 4., -1., -2., -3., -4.);
3612        let b = _mm256_setr_ps(5., 6., 7., 8., 3., 2., 1., 0.);
3613        let r = _mm256_sub_ps(a, b);
3614        let e = _mm256_setr_ps(-4., -4., -4., -4., -4., -4., -4., -4.);
3615        assert_eq_m256(r, e);
3616    }
3617
3618    #[simd_test(enable = "avx")]
3619    fn test_mm256_round_pd() {
3620        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3621        let result_closest = _mm256_round_pd::<0b0000>(a);
3622        let result_down = _mm256_round_pd::<0b0001>(a);
3623        let result_up = _mm256_round_pd::<0b0010>(a);
3624        let expected_closest = _mm256_setr_pd(2., 2., 4., -1.);
3625        let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
3626        let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
3627        assert_eq_m256d(result_closest, expected_closest);
3628        assert_eq_m256d(result_down, expected_down);
3629        assert_eq_m256d(result_up, expected_up);
3630    }
3631
3632    #[simd_test(enable = "avx")]
3633    const fn test_mm256_floor_pd() {
3634        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3635        let result_down = _mm256_floor_pd(a);
3636        let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
3637        assert_eq_m256d(result_down, expected_down);
3638    }
3639
3640    #[simd_test(enable = "avx")]
3641    const fn test_mm256_ceil_pd() {
3642        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3643        let result_up = _mm256_ceil_pd(a);
3644        let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
3645        assert_eq_m256d(result_up, expected_up);
3646    }
3647
3648    #[simd_test(enable = "avx")]
3649    fn test_mm256_round_ps() {
3650        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3651        let result_closest = _mm256_round_ps::<0b0000>(a);
3652        let result_down = _mm256_round_ps::<0b0001>(a);
3653        let result_up = _mm256_round_ps::<0b0010>(a);
3654        let expected_closest = _mm256_setr_ps(2., 2., 4., -1., 2., 2., 4., -1.);
3655        let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
3656        let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
3657        assert_eq_m256(result_closest, expected_closest);
3658        assert_eq_m256(result_down, expected_down);
3659        assert_eq_m256(result_up, expected_up);
3660    }
3661
3662    #[simd_test(enable = "avx")]
3663    const fn test_mm256_floor_ps() {
3664        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3665        let result_down = _mm256_floor_ps(a);
3666        let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
3667        assert_eq_m256(result_down, expected_down);
3668    }
3669
3670    #[simd_test(enable = "avx")]
3671    const fn test_mm256_ceil_ps() {
3672        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3673        let result_up = _mm256_ceil_ps(a);
3674        let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
3675        assert_eq_m256(result_up, expected_up);
3676    }
3677
3678    #[simd_test(enable = "avx")]
3679    fn test_mm256_sqrt_pd() {
3680        let a = _mm256_setr_pd(4., 9., 16., 25.);
3681        let r = _mm256_sqrt_pd(a);
3682        let e = _mm256_setr_pd(2., 3., 4., 5.);
3683        assert_eq_m256d(r, e);
3684    }
3685
3686    #[simd_test(enable = "avx")]
3687    fn test_mm256_sqrt_ps() {
3688        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3689        let r = _mm256_sqrt_ps(a);
3690        let e = _mm256_setr_ps(2., 3., 4., 5., 2., 3., 4., 5.);
3691        assert_eq_m256(r, e);
3692    }
3693
3694    #[simd_test(enable = "avx")]
3695    const fn test_mm256_div_ps() {
3696        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3697        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3698        let r = _mm256_div_ps(a, b);
3699        let e = _mm256_setr_ps(1., 3., 8., 5., 0.5, 1., 0.25, 0.5);
3700        assert_eq_m256(r, e);
3701    }
3702
3703    #[simd_test(enable = "avx")]
3704    const fn test_mm256_div_pd() {
3705        let a = _mm256_setr_pd(4., 9., 16., 25.);
3706        let b = _mm256_setr_pd(4., 3., 2., 5.);
3707        let r = _mm256_div_pd(a, b);
3708        let e = _mm256_setr_pd(1., 3., 8., 5.);
3709        assert_eq_m256d(r, e);
3710    }
3711
3712    #[simd_test(enable = "avx")]
3713    const fn test_mm256_blend_pd() {
3714        let a = _mm256_setr_pd(4., 9., 16., 25.);
3715        let b = _mm256_setr_pd(4., 3., 2., 5.);
3716        let r = _mm256_blend_pd::<0x0>(a, b);
3717        assert_eq_m256d(r, _mm256_setr_pd(4., 9., 16., 25.));
3718        let r = _mm256_blend_pd::<0x3>(a, b);
3719        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 16., 25.));
3720        let r = _mm256_blend_pd::<0xF>(a, b);
3721        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 5.));
3722    }
3723
3724    #[simd_test(enable = "avx")]
3725    const fn test_mm256_blend_ps() {
3726        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3727        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3728        let r = _mm256_blend_ps::<0x0>(a, b);
3729        assert_eq_m256(r, _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.));
3730        let r = _mm256_blend_ps::<0x3>(a, b);
3731        assert_eq_m256(r, _mm256_setr_ps(2., 3., 5., 8., 9., 12., 13., 16.));
3732        let r = _mm256_blend_ps::<0xF>(a, b);
3733        assert_eq_m256(r, _mm256_setr_ps(2., 3., 6., 7., 9., 12., 13., 16.));
3734    }
3735
3736    #[simd_test(enable = "avx")]
3737    const fn test_mm256_blendv_pd() {
3738        let a = _mm256_setr_pd(4., 9., 16., 25.);
3739        let b = _mm256_setr_pd(4., 3., 2., 5.);
3740        let c = _mm256_setr_pd(0., 0., !0 as f64, !0 as f64);
3741        let r = _mm256_blendv_pd(a, b, c);
3742        let e = _mm256_setr_pd(4., 9., 2., 5.);
3743        assert_eq_m256d(r, e);
3744    }
3745
3746    #[simd_test(enable = "avx")]
3747    const fn test_mm256_blendv_ps() {
3748        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3749        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3750        #[rustfmt::skip]
3751        let c = _mm256_setr_ps(
3752            0., 0., 0., 0., !0 as f32, !0 as f32, !0 as f32, !0 as f32,
3753        );
3754        let r = _mm256_blendv_ps(a, b, c);
3755        let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
3756        assert_eq_m256(r, e);
3757    }
3758
3759    #[simd_test(enable = "avx")]
3760    fn test_mm256_dp_ps() {
3761        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3762        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3763        let r = _mm256_dp_ps::<0xFF>(a, b);
3764        let e = _mm256_setr_ps(200., 200., 200., 200., 2387., 2387., 2387., 2387.);
3765        assert_eq_m256(r, e);
3766    }
3767
3768    #[simd_test(enable = "avx")]
3769    const fn test_mm256_hadd_pd() {
3770        let a = _mm256_setr_pd(4., 9., 16., 25.);
3771        let b = _mm256_setr_pd(4., 3., 2., 5.);
3772        let r = _mm256_hadd_pd(a, b);
3773        let e = _mm256_setr_pd(13., 7., 41., 7.);
3774        assert_eq_m256d(r, e);
3775
3776        let a = _mm256_setr_pd(1., 2., 3., 4.);
3777        let b = _mm256_setr_pd(5., 6., 7., 8.);
3778        let r = _mm256_hadd_pd(a, b);
3779        let e = _mm256_setr_pd(3., 11., 7., 15.);
3780        assert_eq_m256d(r, e);
3781    }
3782
3783    #[simd_test(enable = "avx")]
3784    const fn test_mm256_hadd_ps() {
3785        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3786        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3787        let r = _mm256_hadd_ps(a, b);
3788        let e = _mm256_setr_ps(13., 41., 7., 7., 13., 41., 17., 114.);
3789        assert_eq_m256(r, e);
3790
3791        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3792        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3793        let r = _mm256_hadd_ps(a, b);
3794        let e = _mm256_setr_ps(3., 7., 11., 15., 3., 7., 11., 15.);
3795        assert_eq_m256(r, e);
3796    }
3797
3798    #[simd_test(enable = "avx")]
3799    const fn test_mm256_hsub_pd() {
3800        let a = _mm256_setr_pd(4., 9., 16., 25.);
3801        let b = _mm256_setr_pd(4., 3., 2., 5.);
3802        let r = _mm256_hsub_pd(a, b);
3803        let e = _mm256_setr_pd(-5., 1., -9., -3.);
3804        assert_eq_m256d(r, e);
3805
3806        let a = _mm256_setr_pd(1., 2., 3., 4.);
3807        let b = _mm256_setr_pd(5., 6., 7., 8.);
3808        let r = _mm256_hsub_pd(a, b);
3809        let e = _mm256_setr_pd(-1., -1., -1., -1.);
3810        assert_eq_m256d(r, e);
3811    }
3812
3813    #[simd_test(enable = "avx")]
3814    const fn test_mm256_hsub_ps() {
3815        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3816        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3817        let r = _mm256_hsub_ps(a, b);
3818        let e = _mm256_setr_ps(-5., -9., 1., -3., -5., -9., -1., 14.);
3819        assert_eq_m256(r, e);
3820
3821        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3822        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3823        let r = _mm256_hsub_ps(a, b);
3824        let e = _mm256_setr_ps(-1., -1., -1., -1., -1., -1., -1., -1.);
3825        assert_eq_m256(r, e);
3826    }
3827
3828    #[simd_test(enable = "avx")]
3829    const fn test_mm256_xor_pd() {
3830        let a = _mm256_setr_pd(4., 9., 16., 25.);
3831        let b = _mm256_set1_pd(0.);
3832        let r = _mm256_xor_pd(a, b);
3833        assert_eq_m256d(r, a);
3834    }
3835
3836    #[simd_test(enable = "avx")]
3837    const fn test_mm256_xor_ps() {
3838        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3839        let b = _mm256_set1_ps(0.);
3840        let r = _mm256_xor_ps(a, b);
3841        assert_eq_m256(r, a);
3842    }
3843
3844    #[simd_test(enable = "avx")]
3845    fn test_mm_cmp_pd() {
3846        let a = _mm_setr_pd(4., 9.);
3847        let b = _mm_setr_pd(4., 3.);
3848        let r = _mm_cmp_pd::<_CMP_GE_OS>(a, b);
3849        assert!(get_m128d(r, 0).is_nan());
3850        assert!(get_m128d(r, 1).is_nan());
3851    }
3852
3853    #[simd_test(enable = "avx")]
3854    fn test_mm256_cmp_pd() {
3855        let a = _mm256_setr_pd(1., 2., 3., 4.);
3856        let b = _mm256_setr_pd(5., 6., 7., 8.);
3857        let r = _mm256_cmp_pd::<_CMP_GE_OS>(a, b);
3858        let e = _mm256_set1_pd(0.);
3859        assert_eq_m256d(r, e);
3860    }
3861
3862    #[simd_test(enable = "avx")]
3863    fn test_mm_cmp_ps() {
3864        let a = _mm_setr_ps(4., 3., 2., 5.);
3865        let b = _mm_setr_ps(4., 9., 16., 25.);
3866        let r = _mm_cmp_ps::<_CMP_GE_OS>(a, b);
3867        assert!(get_m128(r, 0).is_nan());
3868        assert_eq!(get_m128(r, 1), 0.);
3869        assert_eq!(get_m128(r, 2), 0.);
3870        assert_eq!(get_m128(r, 3), 0.);
3871    }
3872
3873    #[simd_test(enable = "avx")]
3874    fn test_mm256_cmp_ps() {
3875        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3876        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3877        let r = _mm256_cmp_ps::<_CMP_GE_OS>(a, b);
3878        let e = _mm256_set1_ps(0.);
3879        assert_eq_m256(r, e);
3880    }
3881
3882    #[simd_test(enable = "avx")]
3883    fn test_mm_cmp_sd() {
3884        let a = _mm_setr_pd(4., 9.);
3885        let b = _mm_setr_pd(4., 3.);
3886        let r = _mm_cmp_sd::<_CMP_GE_OS>(a, b);
3887        assert!(get_m128d(r, 0).is_nan());
3888        assert_eq!(get_m128d(r, 1), 9.);
3889    }
3890
3891    #[simd_test(enable = "avx")]
3892    fn test_mm_cmp_ss() {
3893        let a = _mm_setr_ps(4., 3., 2., 5.);
3894        let b = _mm_setr_ps(4., 9., 16., 25.);
3895        let r = _mm_cmp_ss::<_CMP_GE_OS>(a, b);
3896        assert!(get_m128(r, 0).is_nan());
3897        assert_eq!(get_m128(r, 1), 3.);
3898        assert_eq!(get_m128(r, 2), 2.);
3899        assert_eq!(get_m128(r, 3), 5.);
3900    }
3901
3902    #[simd_test(enable = "avx")]
3903    const fn test_mm256_cvtepi32_pd() {
3904        let a = _mm_setr_epi32(4, 9, 16, 25);
3905        let r = _mm256_cvtepi32_pd(a);
3906        let e = _mm256_setr_pd(4., 9., 16., 25.);
3907        assert_eq_m256d(r, e);
3908    }
3909
3910    #[simd_test(enable = "avx")]
3911    const fn test_mm256_cvtepi32_ps() {
3912        let a = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3913        let r = _mm256_cvtepi32_ps(a);
3914        let e = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3915        assert_eq_m256(r, e);
3916    }
3917
3918    #[simd_test(enable = "avx")]
3919    const fn test_mm256_cvtpd_ps() {
3920        let a = _mm256_setr_pd(4., 9., 16., 25.);
3921        let r = _mm256_cvtpd_ps(a);
3922        let e = _mm_setr_ps(4., 9., 16., 25.);
3923        assert_eq_m128(r, e);
3924    }
3925
3926    #[simd_test(enable = "avx")]
3927    fn test_mm256_cvtps_epi32() {
3928        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3929        let r = _mm256_cvtps_epi32(a);
3930        let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3931        assert_eq_m256i(r, e);
3932    }
3933
3934    #[simd_test(enable = "avx")]
3935    const fn test_mm256_cvtps_pd() {
3936        let a = _mm_setr_ps(4., 9., 16., 25.);
3937        let r = _mm256_cvtps_pd(a);
3938        let e = _mm256_setr_pd(4., 9., 16., 25.);
3939        assert_eq_m256d(r, e);
3940    }
3941
3942    #[simd_test(enable = "avx")]
3943    const fn test_mm256_cvtsd_f64() {
3944        let a = _mm256_setr_pd(1., 2., 3., 4.);
3945        let r = _mm256_cvtsd_f64(a);
3946        assert_eq!(r, 1.);
3947    }
3948
3949    #[simd_test(enable = "avx")]
3950    fn test_mm256_cvttpd_epi32() {
3951        let a = _mm256_setr_pd(4., 9., 16., 25.);
3952        let r = _mm256_cvttpd_epi32(a);
3953        let e = _mm_setr_epi32(4, 9, 16, 25);
3954        assert_eq_m128i(r, e);
3955    }
3956
3957    #[simd_test(enable = "avx")]
3958    fn test_mm256_cvtpd_epi32() {
3959        let a = _mm256_setr_pd(4., 9., 16., 25.);
3960        let r = _mm256_cvtpd_epi32(a);
3961        let e = _mm_setr_epi32(4, 9, 16, 25);
3962        assert_eq_m128i(r, e);
3963    }
3964
3965    #[simd_test(enable = "avx")]
3966    fn test_mm256_cvttps_epi32() {
3967        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3968        let r = _mm256_cvttps_epi32(a);
3969        let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3970        assert_eq_m256i(r, e);
3971    }
3972
3973    #[simd_test(enable = "avx")]
3974    const fn test_mm256_extractf128_ps() {
3975        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3976        let r = _mm256_extractf128_ps::<0>(a);
3977        let e = _mm_setr_ps(4., 3., 2., 5.);
3978        assert_eq_m128(r, e);
3979    }
3980
3981    #[simd_test(enable = "avx")]
3982    const fn test_mm256_extractf128_pd() {
3983        let a = _mm256_setr_pd(4., 3., 2., 5.);
3984        let r = _mm256_extractf128_pd::<0>(a);
3985        let e = _mm_setr_pd(4., 3.);
3986        assert_eq_m128d(r, e);
3987    }
3988
3989    #[simd_test(enable = "avx")]
3990    const fn test_mm256_extractf128_si256() {
3991        let a = _mm256_setr_epi64x(4, 3, 2, 5);
3992        let r = _mm256_extractf128_si256::<0>(a);
3993        let e = _mm_setr_epi64x(4, 3);
3994        assert_eq_m128i(r, e);
3995    }
3996
3997    #[simd_test(enable = "avx")]
3998    const fn test_mm256_extract_epi32() {
3999        let a = _mm256_setr_epi32(-1, 1, 2, 3, 4, 5, 6, 7);
4000        let r1 = _mm256_extract_epi32::<0>(a);
4001        let r2 = _mm256_extract_epi32::<3>(a);
4002        assert_eq!(r1, -1);
4003        assert_eq!(r2, 3);
4004    }
4005
4006    #[simd_test(enable = "avx")]
4007    const fn test_mm256_cvtsi256_si32() {
4008        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4009        let r = _mm256_cvtsi256_si32(a);
4010        assert_eq!(r, 1);
4011    }
4012
    #[simd_test(enable = "avx")]
    #[cfg_attr(miri, ignore)] // Register-level operation not supported by Miri
    fn test_mm256_zeroall() {
        // Smoke test only: the intrinsic zeroes YMM register state and has no
        // value-level result to assert on; we just check it executes cleanly.
        _mm256_zeroall();
    }

    #[simd_test(enable = "avx")]
    #[cfg_attr(miri, ignore)] // Register-level operation not supported by Miri
    fn test_mm256_zeroupper() {
        // Smoke test only: the intrinsic clears the upper YMM register halves
        // and has no value-level result to assert on.
        _mm256_zeroupper();
    }
4024
4025    #[simd_test(enable = "avx")]
4026    fn test_mm256_permutevar_ps() {
4027        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4028        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4029        let r = _mm256_permutevar_ps(a, b);
4030        let e = _mm256_setr_ps(3., 2., 5., 4., 9., 64., 50., 8.);
4031        assert_eq_m256(r, e);
4032    }
4033
4034    #[simd_test(enable = "avx")]
4035    fn test_mm_permutevar_ps() {
4036        let a = _mm_setr_ps(4., 3., 2., 5.);
4037        let b = _mm_setr_epi32(1, 2, 3, 4);
4038        let r = _mm_permutevar_ps(a, b);
4039        let e = _mm_setr_ps(3., 2., 5., 4.);
4040        assert_eq_m128(r, e);
4041    }
4042
4043    #[simd_test(enable = "avx")]
4044    const fn test_mm256_permute_ps() {
4045        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4046        let r = _mm256_permute_ps::<0x1b>(a);
4047        let e = _mm256_setr_ps(5., 2., 3., 4., 50., 64., 9., 8.);
4048        assert_eq_m256(r, e);
4049    }
4050
4051    #[simd_test(enable = "avx")]
4052    const fn test_mm_permute_ps() {
4053        let a = _mm_setr_ps(4., 3., 2., 5.);
4054        let r = _mm_permute_ps::<0x1b>(a);
4055        let e = _mm_setr_ps(5., 2., 3., 4.);
4056        assert_eq_m128(r, e);
4057    }
4058
4059    #[simd_test(enable = "avx")]
4060    fn test_mm256_permutevar_pd() {
4061        let a = _mm256_setr_pd(4., 3., 2., 5.);
4062        let b = _mm256_setr_epi64x(1, 2, 3, 4);
4063        let r = _mm256_permutevar_pd(a, b);
4064        let e = _mm256_setr_pd(4., 3., 5., 2.);
4065        assert_eq_m256d(r, e);
4066    }
4067
4068    #[simd_test(enable = "avx")]
4069    fn test_mm_permutevar_pd() {
4070        let a = _mm_setr_pd(4., 3.);
4071        let b = _mm_setr_epi64x(3, 0);
4072        let r = _mm_permutevar_pd(a, b);
4073        let e = _mm_setr_pd(3., 4.);
4074        assert_eq_m128d(r, e);
4075    }
4076
4077    #[simd_test(enable = "avx")]
4078    const fn test_mm256_permute_pd() {
4079        let a = _mm256_setr_pd(4., 3., 2., 5.);
4080        let r = _mm256_permute_pd::<5>(a);
4081        let e = _mm256_setr_pd(3., 4., 5., 2.);
4082        assert_eq_m256d(r, e);
4083    }
4084
4085    #[simd_test(enable = "avx")]
4086    const fn test_mm_permute_pd() {
4087        let a = _mm_setr_pd(4., 3.);
4088        let r = _mm_permute_pd::<1>(a);
4089        let e = _mm_setr_pd(3., 4.);
4090        assert_eq_m128d(r, e);
4091    }
4092
4093    #[simd_test(enable = "avx")]
4094    const fn test_mm256_permute2f128_ps() {
4095        let a = _mm256_setr_ps(11., 12., 13., 14., 15., 16., 17., 18.);
4096        let b = _mm256_setr_ps(21., 22., 23., 24., 25., 26., 27., 28.);
4097        let r = _mm256_permute2f128_ps::<0b0001_0011>(a, b);
4098        let e = _mm256_setr_ps(25., 26., 27., 28., 15., 16., 17., 18.);
4099        assert_eq_m256(r, e);
4100
4101        // Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field.
4102        let r = _mm256_permute2f128_ps::<0b1001_1011>(a, b);
4103        let z = _mm256_setr_ps(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);
4104        assert_eq_m256(r, z);
4105    }
4106
4107    #[simd_test(enable = "avx")]
4108    const fn test_mm256_permute2f128_pd() {
4109        let a = _mm256_setr_pd(1., 2., 3., 4.);
4110        let b = _mm256_setr_pd(5., 6., 7., 8.);
4111        let r = _mm256_permute2f128_pd::<0b0011_0001>(a, b);
4112        let e = _mm256_setr_pd(3., 4., 7., 8.);
4113        assert_eq_m256d(r, e);
4114
4115        // Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field.
4116        let r = _mm256_permute2f128_pd::<0b1011_1001>(a, b);
4117        let e = _mm256_setr_pd(0.0, 0.0, 0.0, 0.0);
4118        assert_eq_m256d(r, e);
4119    }
4120
4121    #[simd_test(enable = "avx")]
4122    const fn test_mm256_permute2f128_si256() {
4123        let a = _mm256_setr_epi32(11, 12, 13, 14, 15, 16, 17, 18);
4124        let b = _mm256_setr_epi32(21, 22, 23, 24, 25, 26, 27, 28);
4125        let r = _mm256_permute2f128_si256::<0b0010_0000>(a, b);
4126        let e = _mm256_setr_epi32(11, 12, 13, 14, 21, 22, 23, 24);
4127        assert_eq_m256i(r, e);
4128
4129        // Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field.
4130        let r = _mm256_permute2f128_si256::<0b1010_1000>(a, b);
4131        let e = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0);
4132        assert_eq_m256i(r, e);
4133    }
4134
4135    #[simd_test(enable = "avx")]
4136    const fn test_mm256_broadcast_ss() {
4137        let r = _mm256_broadcast_ss(&3.);
4138        let e = _mm256_set1_ps(3.);
4139        assert_eq_m256(r, e);
4140    }
4141
4142    #[simd_test(enable = "avx")]
4143    const fn test_mm_broadcast_ss() {
4144        let r = _mm_broadcast_ss(&3.);
4145        let e = _mm_set1_ps(3.);
4146        assert_eq_m128(r, e);
4147    }
4148
4149    #[simd_test(enable = "avx")]
4150    const fn test_mm256_broadcast_sd() {
4151        let r = _mm256_broadcast_sd(&3.);
4152        let e = _mm256_set1_pd(3.);
4153        assert_eq_m256d(r, e);
4154    }
4155
4156    #[simd_test(enable = "avx")]
4157    const fn test_mm256_broadcast_ps() {
4158        let a = _mm_setr_ps(4., 3., 2., 5.);
4159        let r = _mm256_broadcast_ps(&a);
4160        let e = _mm256_setr_ps(4., 3., 2., 5., 4., 3., 2., 5.);
4161        assert_eq_m256(r, e);
4162    }
4163
4164    #[simd_test(enable = "avx")]
4165    const fn test_mm256_broadcast_pd() {
4166        let a = _mm_setr_pd(4., 3.);
4167        let r = _mm256_broadcast_pd(&a);
4168        let e = _mm256_setr_pd(4., 3., 4., 3.);
4169        assert_eq_m256d(r, e);
4170    }
4171
4172    #[simd_test(enable = "avx")]
4173    const fn test_mm256_insertf128_ps() {
4174        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4175        let b = _mm_setr_ps(4., 9., 16., 25.);
4176        let r = _mm256_insertf128_ps::<0>(a, b);
4177        let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
4178        assert_eq_m256(r, e);
4179    }
4180
4181    #[simd_test(enable = "avx")]
4182    const fn test_mm256_insertf128_pd() {
4183        let a = _mm256_setr_pd(1., 2., 3., 4.);
4184        let b = _mm_setr_pd(5., 6.);
4185        let r = _mm256_insertf128_pd::<0>(a, b);
4186        let e = _mm256_setr_pd(5., 6., 3., 4.);
4187        assert_eq_m256d(r, e);
4188    }
4189
4190    #[simd_test(enable = "avx")]
4191    const fn test_mm256_insertf128_si256() {
4192        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4193        let b = _mm_setr_epi64x(5, 6);
4194        let r = _mm256_insertf128_si256::<0>(a, b);
4195        let e = _mm256_setr_epi64x(5, 6, 3, 4);
4196        assert_eq_m256i(r, e);
4197    }
4198
4199    #[simd_test(enable = "avx")]
4200    const fn test_mm256_insert_epi8() {
4201        #[rustfmt::skip]
4202        let a = _mm256_setr_epi8(
4203            1, 2, 3, 4, 5, 6, 7, 8,
4204            9, 10, 11, 12, 13, 14, 15, 16,
4205            17, 18, 19, 20, 21, 22, 23, 24,
4206            25, 26, 27, 28, 29, 30, 31, 32,
4207        );
4208        let r = _mm256_insert_epi8::<31>(a, 0);
4209        #[rustfmt::skip]
4210        let e = _mm256_setr_epi8(
4211            1, 2, 3, 4, 5, 6, 7, 8,
4212            9, 10, 11, 12, 13, 14, 15, 16,
4213            17, 18, 19, 20, 21, 22, 23, 24,
4214            25, 26, 27, 28, 29, 30, 31, 0,
4215        );
4216        assert_eq_m256i(r, e);
4217    }
4218
4219    #[simd_test(enable = "avx")]
4220    const fn test_mm256_insert_epi16() {
4221        #[rustfmt::skip]
4222        let a = _mm256_setr_epi16(
4223            0, 1, 2, 3, 4, 5, 6, 7,
4224            8, 9, 10, 11, 12, 13, 14, 15,
4225        );
4226        let r = _mm256_insert_epi16::<15>(a, 0);
4227        #[rustfmt::skip]
4228        let e = _mm256_setr_epi16(
4229            0, 1, 2, 3, 4, 5, 6, 7,
4230            8, 9, 10, 11, 12, 13, 14, 0,
4231        );
4232        assert_eq_m256i(r, e);
4233    }
4234
4235    #[simd_test(enable = "avx")]
4236    const fn test_mm256_insert_epi32() {
4237        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4238        let r = _mm256_insert_epi32::<7>(a, 0);
4239        let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
4240        assert_eq_m256i(r, e);
4241    }
4242
4243    #[simd_test(enable = "avx")]
4244    const unsafe fn test_mm256_load_pd() {
4245        let a = _mm256_setr_pd(1., 2., 3., 4.);
4246        let p = ptr::addr_of!(a) as *const f64;
4247        let r = _mm256_load_pd(p);
4248        let e = _mm256_setr_pd(1., 2., 3., 4.);
4249        assert_eq_m256d(r, e);
4250    }
4251
4252    #[simd_test(enable = "avx")]
4253    const unsafe fn test_mm256_store_pd() {
4254        let a = _mm256_setr_pd(1., 2., 3., 4.);
4255        let mut r = _mm256_undefined_pd();
4256        _mm256_store_pd(ptr::addr_of_mut!(r) as *mut f64, a);
4257        assert_eq_m256d(r, a);
4258    }
4259
4260    #[simd_test(enable = "avx")]
4261    const unsafe fn test_mm256_load_ps() {
4262        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4263        let p = ptr::addr_of!(a) as *const f32;
4264        let r = _mm256_load_ps(p);
4265        let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4266        assert_eq_m256(r, e);
4267    }
4268
4269    #[simd_test(enable = "avx")]
4270    const unsafe fn test_mm256_store_ps() {
4271        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4272        let mut r = _mm256_undefined_ps();
4273        _mm256_store_ps(ptr::addr_of_mut!(r) as *mut f32, a);
4274        assert_eq_m256(r, a);
4275    }
4276
4277    #[simd_test(enable = "avx")]
4278    const unsafe fn test_mm256_loadu_pd() {
4279        let a = &[1.0f64, 2., 3., 4.];
4280        let p = a.as_ptr();
4281        let r = _mm256_loadu_pd(black_box(p));
4282        let e = _mm256_setr_pd(1., 2., 3., 4.);
4283        assert_eq_m256d(r, e);
4284    }
4285
4286    #[simd_test(enable = "avx")]
4287    const unsafe fn test_mm256_storeu_pd() {
4288        let a = _mm256_set1_pd(9.);
4289        let mut r = _mm256_undefined_pd();
4290        _mm256_storeu_pd(ptr::addr_of_mut!(r) as *mut f64, a);
4291        assert_eq_m256d(r, a);
4292    }
4293
4294    #[simd_test(enable = "avx")]
4295    const unsafe fn test_mm256_loadu_ps() {
4296        let a = &[4., 3., 2., 5., 8., 9., 64., 50.];
4297        let p = a.as_ptr();
4298        let r = _mm256_loadu_ps(black_box(p));
4299        let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4300        assert_eq_m256(r, e);
4301    }
4302
4303    #[simd_test(enable = "avx")]
4304    const unsafe fn test_mm256_storeu_ps() {
4305        let a = _mm256_set1_ps(9.);
4306        let mut r = _mm256_undefined_ps();
4307        _mm256_storeu_ps(ptr::addr_of_mut!(r) as *mut f32, a);
4308        assert_eq_m256(r, a);
4309    }
4310
4311    #[simd_test(enable = "avx")]
4312    const unsafe fn test_mm256_load_si256() {
4313        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4314        let p = ptr::addr_of!(a);
4315        let r = _mm256_load_si256(p);
4316        let e = _mm256_setr_epi64x(1, 2, 3, 4);
4317        assert_eq_m256i(r, e);
4318    }
4319
4320    #[simd_test(enable = "avx")]
4321    const unsafe fn test_mm256_store_si256() {
4322        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4323        let mut r = _mm256_undefined_si256();
4324        _mm256_store_si256(ptr::addr_of_mut!(r), a);
4325        assert_eq_m256i(r, a);
4326    }
4327
4328    #[simd_test(enable = "avx")]
4329    const unsafe fn test_mm256_loadu_si256() {
4330        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4331        let p = ptr::addr_of!(a);
4332        let r = _mm256_loadu_si256(black_box(p));
4333        let e = _mm256_setr_epi64x(1, 2, 3, 4);
4334        assert_eq_m256i(r, e);
4335    }
4336
4337    #[simd_test(enable = "avx")]
4338    const unsafe fn test_mm256_storeu_si256() {
4339        let a = _mm256_set1_epi8(9);
4340        let mut r = _mm256_undefined_si256();
4341        _mm256_storeu_si256(ptr::addr_of_mut!(r), a);
4342        assert_eq_m256i(r, a);
4343    }
4344
4345    #[simd_test(enable = "avx")]
4346    const unsafe fn test_mm256_maskload_pd() {
4347        let a = &[1.0f64, 2., 3., 4.];
4348        let p = a.as_ptr();
4349        let mask = _mm256_setr_epi64x(0, !0, 0, !0);
4350        let r = _mm256_maskload_pd(black_box(p), mask);
4351        let e = _mm256_setr_pd(0., 2., 0., 4.);
4352        assert_eq_m256d(r, e);
4353    }
4354
    // vmaskmovpd store: only lanes with the mask sign bit set (!0) are written;
    // masked-off lanes keep the destination's prior value (0.0 here).
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_maskstore_pd() {
        let mut r = _mm256_set1_pd(0.);
        let mask = _mm256_setr_epi64x(0, !0, 0, !0);
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        _mm256_maskstore_pd(ptr::addr_of_mut!(r) as *mut f64, mask, a);
        let e = _mm256_setr_pd(0., 2., 0., 4.);
        assert_eq_m256d(r, e);
    }
4364
    // 128-bit variant of the masked double load: mask !0 loads, mask 0 zeroes.
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm_maskload_pd() {
        let a = &[1.0f64, 2.];
        let p = a.as_ptr();
        let mask = _mm_setr_epi64x(0, !0);
        let r = _mm_maskload_pd(black_box(p), mask);
        let e = _mm_setr_pd(0., 2.);
        assert_eq_m128d(r, e);
    }
4374
    // 128-bit variant of the masked double store: only !0-masked lanes are
    // written; the 0-masked lane keeps its initial 0.0.
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm_maskstore_pd() {
        let mut r = _mm_set1_pd(0.);
        let mask = _mm_setr_epi64x(0, !0);
        let a = _mm_setr_pd(1., 2.);
        _mm_maskstore_pd(ptr::addr_of_mut!(r) as *mut f64, mask, a);
        let e = _mm_setr_pd(0., 2.);
        assert_eq_m128d(r, e);
    }
4384
    // vmaskmovps load over eight f32 lanes: alternating mask loads every other
    // element, the rest read as 0.0.
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_maskload_ps() {
        let a = &[1.0f32, 2., 3., 4., 5., 6., 7., 8.];
        let p = a.as_ptr();
        let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
        let r = _mm256_maskload_ps(black_box(p), mask);
        let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
        assert_eq_m256(r, e);
    }
4394
    // vmaskmovps store over eight f32 lanes: alternating mask writes every
    // other element, masked-off lanes keep the initial 0.0.
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_maskstore_ps() {
        let mut r = _mm256_set1_ps(0.);
        let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        _mm256_maskstore_ps(ptr::addr_of_mut!(r) as *mut f32, mask, a);
        let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
        assert_eq_m256(r, e);
    }
4404
    // 128-bit masked f32 load: mask !0 loads the element, mask 0 yields 0.0.
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm_maskload_ps() {
        let a = &[1.0f32, 2., 3., 4.];
        let p = a.as_ptr();
        let mask = _mm_setr_epi32(0, !0, 0, !0);
        let r = _mm_maskload_ps(black_box(p), mask);
        let e = _mm_setr_ps(0., 2., 0., 4.);
        assert_eq_m128(r, e);
    }
4414
    // 128-bit masked f32 store: only !0-masked lanes are written; the rest
    // keep their initial 0.0.
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm_maskstore_ps() {
        let mut r = _mm_set1_ps(0.);
        let mask = _mm_setr_epi32(0, !0, 0, !0);
        let a = _mm_setr_ps(1., 2., 3., 4.);
        _mm_maskstore_ps(ptr::addr_of_mut!(r) as *mut f32, mask, a);
        let e = _mm_setr_ps(0., 2., 0., 4.);
        assert_eq_m128(r, e);
    }
4424
4425    #[simd_test(enable = "avx")]
4426    const fn test_mm256_movehdup_ps() {
4427        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4428        let r = _mm256_movehdup_ps(a);
4429        let e = _mm256_setr_ps(2., 2., 4., 4., 6., 6., 8., 8.);
4430        assert_eq_m256(r, e);
4431    }
4432
4433    #[simd_test(enable = "avx")]
4434    const fn test_mm256_moveldup_ps() {
4435        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4436        let r = _mm256_moveldup_ps(a);
4437        let e = _mm256_setr_ps(1., 1., 3., 3., 5., 5., 7., 7.);
4438        assert_eq_m256(r, e);
4439    }
4440
4441    #[simd_test(enable = "avx")]
4442    const fn test_mm256_movedup_pd() {
4443        let a = _mm256_setr_pd(1., 2., 3., 4.);
4444        let r = _mm256_movedup_pd(a);
4445        let e = _mm256_setr_pd(1., 1., 3., 3.);
4446        assert_eq_m256d(r, e);
4447    }
4448
    // vlddqu: unaligned 256-bit load tuned for data crossing cache lines;
    // must return the bytes unchanged, just like a plain unaligned load.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_lddqu_si256() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let p = ptr::addr_of!(a);
        let r = _mm256_lddqu_si256(black_box(p));
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m256i(r, e);
    }
4469
    // vmovntdq non-temporal store; the sfence makes the weakly-ordered write
    // visible before the value is read back.
    #[simd_test(enable = "avx")]
    #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
    unsafe fn test_mm256_stream_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let mut r = _mm256_undefined_si256();
        _mm256_stream_si256(ptr::addr_of_mut!(r), a);
        _mm_sfence();
        assert_eq_m256i(r, a);
    }
4479
4480    #[simd_test(enable = "avx")]
4481    #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
4482    unsafe fn test_mm256_stream_pd() {
4483        #[repr(align(32))]
4484        struct Memory {
4485            pub data: [f64; 4],
4486        }
4487        let a = _mm256_set1_pd(7.0);
4488        let mut mem = Memory { data: [-1.0; 4] };
4489
4490        _mm256_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
4491        _mm_sfence();
4492        for i in 0..4 {
4493            assert_eq!(mem.data[i], get_m256d(a, i));
4494        }
4495    }
4496
4497    #[simd_test(enable = "avx")]
4498    #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
4499    unsafe fn test_mm256_stream_ps() {
4500        #[repr(align(32))]
4501        struct Memory {
4502            pub data: [f32; 8],
4503        }
4504        let a = _mm256_set1_ps(7.0);
4505        let mut mem = Memory { data: [-1.0; 8] };
4506
4507        _mm256_stream_ps(ptr::addr_of_mut!(mem.data[0]), a);
4508        _mm_sfence();
4509        for i in 0..8 {
4510            assert_eq!(mem.data[i], get_m256(a, i));
4511        }
4512    }
4513
4514    #[simd_test(enable = "avx")]
4515    fn test_mm256_rcp_ps() {
4516        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4517        let r = _mm256_rcp_ps(a);
4518        #[rustfmt::skip]
4519        let e = _mm256_setr_ps(
4520            0.99975586, 0.49987793, 0.33325195, 0.24993896,
4521            0.19995117, 0.16662598, 0.14282227, 0.12496948,
4522        );
4523        let rel_err = 0.00048828125;
4524        for i in 0..8 {
4525            assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
4526        }
4527    }
4528
4529    #[simd_test(enable = "avx")]
4530    fn test_mm256_rsqrt_ps() {
4531        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4532        let r = _mm256_rsqrt_ps(a);
4533        #[rustfmt::skip]
4534        let e = _mm256_setr_ps(
4535            0.99975586, 0.7069092, 0.5772705, 0.49987793,
4536            0.44714355, 0.40820313, 0.3779297, 0.3534546,
4537        );
4538        let rel_err = 0.00048828125;
4539        for i in 0..8 {
4540            assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
4541        }
4542    }
4543
4544    #[simd_test(enable = "avx")]
4545    const fn test_mm256_unpackhi_pd() {
4546        let a = _mm256_setr_pd(1., 2., 3., 4.);
4547        let b = _mm256_setr_pd(5., 6., 7., 8.);
4548        let r = _mm256_unpackhi_pd(a, b);
4549        let e = _mm256_setr_pd(2., 6., 4., 8.);
4550        assert_eq_m256d(r, e);
4551    }
4552
    // vunpckhps interleaves the upper two floats of each 128-bit half:
    // -> [a2, b2, a3, b3, a6, b6, a7, b7].
    #[simd_test(enable = "avx")]
    const fn test_mm256_unpackhi_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
        let r = _mm256_unpackhi_ps(a, b);
        let e = _mm256_setr_ps(3., 11., 4., 12., 7., 15., 8., 16.);
        assert_eq_m256(r, e);
    }
4561
    // vunpcklpd interleaves the low double of each 128-bit half:
    // -> [a0, b0, a2, b2].
    #[simd_test(enable = "avx")]
    const fn test_mm256_unpacklo_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_unpacklo_pd(a, b);
        let e = _mm256_setr_pd(1., 5., 3., 7.);
        assert_eq_m256d(r, e);
    }
4570
    // vunpcklps interleaves the lower two floats of each 128-bit half:
    // -> [a0, b0, a1, b1, a4, b4, a5, b5].
    #[simd_test(enable = "avx")]
    const fn test_mm256_unpacklo_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
        let r = _mm256_unpacklo_ps(a, b);
        let e = _mm256_setr_ps(1., 9., 2., 10., 5., 13., 6., 14.);
        assert_eq_m256(r, e);
    }
4579
    // vptest ZF path: returns 1 iff (a & b) is all zeros. Overlapping bit
    // patterns give 0; AND with the zero vector gives 1.
    #[simd_test(enable = "avx")]
    const fn test_mm256_testz_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let b = _mm256_setr_epi64x(5, 6, 7, 8);
        let r = _mm256_testz_si256(a, b);
        assert_eq!(r, 0);
        let b = _mm256_set1_epi64x(0);
        let r = _mm256_testz_si256(a, b);
        assert_eq!(r, 1);
    }
4590
    // vptest CF path: returns 1 iff (!a & b) is all zeros, i.e. every bit of
    // `b` is also set in `a`. The zero vector is trivially contained.
    #[simd_test(enable = "avx")]
    const fn test_mm256_testc_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let b = _mm256_setr_epi64x(5, 6, 7, 8);
        let r = _mm256_testc_si256(a, b);
        assert_eq!(r, 0);
        let b = _mm256_set1_epi64x(0);
        let r = _mm256_testc_si256(a, b);
        assert_eq!(r, 1);
    }
4601
    // vptest ZF=0 && CF=0 path: returns 1 iff both (a & b) and (!a & b) have a
    // set bit. All-zero inputs set both flags, yielding 0.
    #[simd_test(enable = "avx")]
    fn test_mm256_testnzc_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let b = _mm256_setr_epi64x(5, 6, 7, 8);
        let r = _mm256_testnzc_si256(a, b);
        assert_eq!(r, 1);
        let a = _mm256_setr_epi64x(0, 0, 0, 0);
        let b = _mm256_setr_epi64x(0, 0, 0, 0);
        let r = _mm256_testnzc_si256(a, b);
        assert_eq!(r, 0);
    }
4613
    // vtestpd ZF path: only the sign bit of each double participates.
    // All-positive operands have no common sign bits -> 1; negative values
    // share sign bits with themselves -> 0.
    #[simd_test(enable = "avx")]
    fn test_mm256_testz_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_testz_pd(a, b);
        assert_eq!(r, 1);
        let a = _mm256_set1_pd(-1.);
        let r = _mm256_testz_pd(a, a);
        assert_eq!(r, 0);
    }
4624
    // vtestpd CF path: 1 iff every sign bit set in `b` is also set in `a`.
    // Positive `b` has no sign bits -> 1; negative `b` vs positive `a` -> 0.
    #[simd_test(enable = "avx")]
    fn test_mm256_testc_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_testc_pd(a, b);
        assert_eq!(r, 1);
        let a = _mm256_set1_pd(1.);
        let b = _mm256_set1_pd(-1.);
        let r = _mm256_testc_pd(a, b);
        assert_eq!(r, 0);
    }
4636
    // vtestpd ZF=0 && CF=0 path on sign bits: needs both a shared sign bit and
    // a sign bit in `b` that `a` lacks.
    #[simd_test(enable = "avx")]
    fn test_mm256_testnzc_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_testnzc_pd(a, b);
        assert_eq!(r, 0);
        let a = _mm256_setr_pd(1., -1., -1., -1.);
        let b = _mm256_setr_pd(-1., -1., 1., 1.);
        let r = _mm256_testnzc_pd(a, b);
        assert_eq!(r, 1);
    }
4648
    // 128-bit vtestpd ZF path (sign bits only); mirrors the 256-bit case.
    #[simd_test(enable = "avx")]
    const fn test_mm_testz_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 6.);
        let r = _mm_testz_pd(a, b);
        assert_eq!(r, 1);
        let a = _mm_set1_pd(-1.);
        let r = _mm_testz_pd(a, a);
        assert_eq!(r, 0);
    }
4659
    // 128-bit vtestpd CF path (sign bits only); mirrors the 256-bit case.
    #[simd_test(enable = "avx")]
    const fn test_mm_testc_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 6.);
        let r = _mm_testc_pd(a, b);
        assert_eq!(r, 1);
        let a = _mm_set1_pd(1.);
        let b = _mm_set1_pd(-1.);
        let r = _mm_testc_pd(a, b);
        assert_eq!(r, 0);
    }
4671
    // 128-bit vtestpd ZF=0 && CF=0 path (sign bits only).
    #[simd_test(enable = "avx")]
    fn test_mm_testnzc_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 6.);
        let r = _mm_testnzc_pd(a, b);
        assert_eq!(r, 0);
        let a = _mm_setr_pd(1., -1.);
        let b = _mm_setr_pd(-1., -1.);
        let r = _mm_testnzc_pd(a, b);
        assert_eq!(r, 1);
    }
4683
    // vtestps ZF path: positive floats carry no sign bits -> 1; negative
    // floats ANDed with themselves keep the sign bits -> 0.
    #[simd_test(enable = "avx")]
    fn test_mm256_testz_ps() {
        let a = _mm256_set1_ps(1.);
        let r = _mm256_testz_ps(a, a);
        assert_eq!(r, 1);
        let a = _mm256_set1_ps(-1.);
        let r = _mm256_testz_ps(a, a);
        assert_eq!(r, 0);
    }
4693
    // vtestps CF path: 1 iff every sign bit set in `b` is also set in `a`.
    #[simd_test(enable = "avx")]
    fn test_mm256_testc_ps() {
        let a = _mm256_set1_ps(1.);
        let r = _mm256_testc_ps(a, a);
        assert_eq!(r, 1);
        let b = _mm256_set1_ps(-1.);
        let r = _mm256_testc_ps(a, b);
        assert_eq!(r, 0);
    }
4703
    // vtestps ZF=0 && CF=0 path: needs both a shared sign bit and a sign bit
    // in `b` missing from `a`.
    #[simd_test(enable = "avx")]
    fn test_mm256_testnzc_ps() {
        let a = _mm256_set1_ps(1.);
        let r = _mm256_testnzc_ps(a, a);
        assert_eq!(r, 0);
        let a = _mm256_setr_ps(1., -1., -1., -1., -1., -1., -1., -1.);
        let b = _mm256_setr_ps(-1., -1., 1., 1., 1., 1., 1., 1.);
        let r = _mm256_testnzc_ps(a, b);
        assert_eq!(r, 1);
    }
4714
    // 128-bit vtestps ZF path (sign bits only); mirrors the 256-bit case.
    #[simd_test(enable = "avx")]
    const fn test_mm_testz_ps() {
        let a = _mm_set1_ps(1.);
        let r = _mm_testz_ps(a, a);
        assert_eq!(r, 1);
        let a = _mm_set1_ps(-1.);
        let r = _mm_testz_ps(a, a);
        assert_eq!(r, 0);
    }
4724
    // 128-bit vtestps CF path (sign bits only); mirrors the 256-bit case.
    #[simd_test(enable = "avx")]
    const fn test_mm_testc_ps() {
        let a = _mm_set1_ps(1.);
        let r = _mm_testc_ps(a, a);
        assert_eq!(r, 1);
        let b = _mm_set1_ps(-1.);
        let r = _mm_testc_ps(a, b);
        assert_eq!(r, 0);
    }
4734
    // 128-bit vtestps ZF=0 && CF=0 path (sign bits only).
    #[simd_test(enable = "avx")]
    fn test_mm_testnzc_ps() {
        let a = _mm_set1_ps(1.);
        let r = _mm_testnzc_ps(a, a);
        assert_eq!(r, 0);
        let a = _mm_setr_ps(1., -1., -1., -1.);
        let b = _mm_setr_ps(-1., -1., 1., 1.);
        let r = _mm_testnzc_ps(a, b);
        assert_eq!(r, 1);
    }
4745
    // vmovmskpd gathers the four double sign bits into the low bits of the
    // result; negatives at indices 1 and 3 -> 0b1010.
    #[simd_test(enable = "avx")]
    const fn test_mm256_movemask_pd() {
        let a = _mm256_setr_pd(1., -2., 3., -4.);
        let r = _mm256_movemask_pd(a);
        assert_eq!(r, 0xA);
    }
4752
    // vmovmskps gathers the eight float sign bits; negatives at the odd
    // indices -> 0b10101010.
    #[simd_test(enable = "avx")]
    const fn test_mm256_movemask_ps() {
        let a = _mm256_setr_ps(1., -2., 3., -4., 1., -2., 3., -4.);
        let r = _mm256_movemask_ps(a);
        assert_eq!(r, 0xAA);
    }
4759
    // vxorpd-style zeroing: all four doubles must be 0.0.
    #[simd_test(enable = "avx")]
    const fn test_mm256_setzero_pd() {
        let r = _mm256_setzero_pd();
        assert_eq_m256d(r, _mm256_set1_pd(0.));
    }
4765
    // vxorps-style zeroing: all eight floats must be 0.0.
    #[simd_test(enable = "avx")]
    const fn test_mm256_setzero_ps() {
        let r = _mm256_setzero_ps();
        assert_eq_m256(r, _mm256_set1_ps(0.));
    }
4771
    // Integer zeroing: all 32 bytes must be 0.
    #[simd_test(enable = "avx")]
    const fn test_mm256_setzero_si256() {
        let r = _mm256_setzero_si256();
        assert_eq_m256i(r, _mm256_set1_epi8(0));
    }
4777
    // `set` takes arguments from highest to lowest lane, so it equals the
    // reversed `setr` ordering.
    #[simd_test(enable = "avx")]
    const fn test_mm256_set_pd() {
        let r = _mm256_set_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 1.));
    }
4783
    // `set` takes arguments from highest to lowest lane, so it equals the
    // reversed `setr` ordering.
    #[simd_test(enable = "avx")]
    const fn test_mm256_set_ps() {
        let r = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, _mm256_setr_ps(8., 7., 6., 5., 4., 3., 2., 1.));
    }
4789
    // `set_epi8` takes bytes from highest to lowest lane, so it equals the
    // reversed `setr_epi8` ordering.
    #[simd_test(enable = "avx")]
    const fn test_mm256_set_epi8() {
        #[rustfmt::skip]
        let r = _mm256_set_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            32, 31, 30, 29, 28, 27, 26, 25,
            24, 23, 22, 21, 20, 19, 18, 17,
            16, 15, 14, 13, 12, 11, 10, 9,
            8, 7, 6, 5, 4, 3, 2, 1
        );
        assert_eq_m256i(r, e);
    }
4808
    // `set_epi16` takes words from highest to lowest lane, so it equals the
    // reversed `setr_epi16` ordering.
    #[simd_test(enable = "avx")]
    const fn test_mm256_set_epi16() {
        #[rustfmt::skip]
        let r = _mm256_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            16, 15, 14, 13, 12, 11, 10, 9, 8,
            7, 6, 5, 4, 3, 2, 1,
        );
        assert_eq_m256i(r, e);
    }
4823
4824    #[simd_test(enable = "avx")]
4825    const fn test_mm256_set_epi32() {
4826        let r = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4827        assert_eq_m256i(r, _mm256_setr_epi32(8, 7, 6, 5, 4, 3, 2, 1));
4828    }
4829
4830    #[simd_test(enable = "avx")]
4831    const fn test_mm256_set_epi64x() {
4832        let r = _mm256_set_epi64x(1, 2, 3, 4);
4833        assert_eq_m256i(r, _mm256_setr_epi64x(4, 3, 2, 1));
4834    }
4835
    // `setr` places arguments in lane order (index 0 first); sanity check
    // against itself for determinism.
    #[simd_test(enable = "avx")]
    const fn test_mm256_setr_pd() {
        let r = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, _mm256_setr_pd(1., 2., 3., 4.));
    }
4841
    // `setr` places arguments in lane order (index 0 first).
    #[simd_test(enable = "avx")]
    const fn test_mm256_setr_ps() {
        let r = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.));
    }
4847
    // `setr_epi8` places bytes in lane order (index 0 first).
    #[simd_test(enable = "avx")]
    const fn test_mm256_setr_epi8() {
        #[rustfmt::skip]
        let r = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32
        );

        assert_eq_m256i(r, e);
    }
4867
    // `setr_epi16` places words in lane order (index 0 first).
    #[simd_test(enable = "avx")]
    const fn test_mm256_setr_epi16() {
        #[rustfmt::skip]
        let r = _mm256_setr_epi16(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
        );
        assert_eq_m256i(r, e);
    }
4882
    // `setr_epi32` places dwords in lane order (index 0 first).
    #[simd_test(enable = "avx")]
    const fn test_mm256_setr_epi32() {
        let r = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m256i(r, _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8));
    }
4888
    // `setr_epi64x` places qwords in lane order (index 0 first).
    #[simd_test(enable = "avx")]
    const fn test_mm256_setr_epi64x() {
        let r = _mm256_setr_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, _mm256_setr_epi64x(1, 2, 3, 4));
    }
4894
    // Broadcast: all four doubles equal the scalar.
    #[simd_test(enable = "avx")]
    const fn test_mm256_set1_pd() {
        let r = _mm256_set1_pd(1.);
        assert_eq_m256d(r, _mm256_set1_pd(1.));
    }
4900
    // Broadcast: all eight floats equal the scalar.
    #[simd_test(enable = "avx")]
    const fn test_mm256_set1_ps() {
        let r = _mm256_set1_ps(1.);
        assert_eq_m256(r, _mm256_set1_ps(1.));
    }
4906
    // Broadcast: all 32 bytes equal the scalar.
    #[simd_test(enable = "avx")]
    const fn test_mm256_set1_epi8() {
        let r = _mm256_set1_epi8(1);
        assert_eq_m256i(r, _mm256_set1_epi8(1));
    }
4912
    // Broadcast: all 16 words equal the scalar.
    #[simd_test(enable = "avx")]
    const fn test_mm256_set1_epi16() {
        let r = _mm256_set1_epi16(1);
        assert_eq_m256i(r, _mm256_set1_epi16(1));
    }
4918
    // Broadcast: all eight dwords equal the scalar.
    #[simd_test(enable = "avx")]
    const fn test_mm256_set1_epi32() {
        let r = _mm256_set1_epi32(1);
        assert_eq_m256i(r, _mm256_set1_epi32(1));
    }
4924
    // Broadcast: all four qwords equal the scalar.
    #[simd_test(enable = "avx")]
    const fn test_mm256_set1_epi64x() {
        let r = _mm256_set1_epi64x(1);
        assert_eq_m256i(r, _mm256_set1_epi64x(1));
    }
4930
    // Bitcast pd -> ps: each f64 splits into two f32 views of its bits, e.g.
    // 1.0f64 = 0x3FF0_0000_0000_0000 -> low half 0.0f32, high half 1.875f32.
    #[simd_test(enable = "avx")]
    const fn test_mm256_castpd_ps() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let r = _mm256_castpd_ps(a);
        let e = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
        assert_eq_m256(r, e);
    }
4938
    // Bitcast ps -> pd: the inverse of test_mm256_castpd_ps — the same float
    // bit pattern reassembles into the doubles 1..4.
    #[simd_test(enable = "avx")]
    const fn test_mm256_castps_pd() {
        let a = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
        let r = _mm256_castps_pd(a);
        let e = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, e);
    }
4946
    // Bitcast ps -> si256: the byte list is the little-endian IEEE-754
    // encoding of 1.0..8.0f32 (e.g. 1.0 = 00 00 80 3F).
    #[simd_test(enable = "avx")]
    const fn test_mm256_castps_si256() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_castps_si256(a);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            0, 0, -128, 63, 0, 0, 0, 64,
            0, 0, 64, 64, 0, 0, -128, 64,
            0, 0, -96, 64, 0, 0, -64, 64,
            0, 0, -32, 64, 0, 0, 0, 65,
        );
        assert_eq_m256i(r, e);
    }
4960
    // Bitcast si256 -> ps: inverse of test_mm256_castps_si256 — the same
    // little-endian bytes decode back to 1.0..8.0f32.
    #[simd_test(enable = "avx")]
    const fn test_mm256_castsi256_ps() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            0, 0, -128, 63, 0, 0, 0, 64,
            0, 0, 64, 64, 0, 0, -128, 64,
            0, 0, -96, 64, 0, 0, -64, 64,
            0, 0, -32, 64, 0, 0, 0, 65,
        );
        let r = _mm256_castsi256_ps(a);
        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, e);
    }
4974
    // Bitcast pd -> si256 is a no-op on the bits; transmuting back must yield
    // the original doubles.
    #[simd_test(enable = "avx")]
    const fn test_mm256_castpd_si256() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let r = _mm256_castpd_si256(a);
        assert_eq_m256d(unsafe { transmute(r) }, a);
    }
4981
    // Bitcast si256 -> pd is a no-op on the bits; must equal a raw transmute.
    #[simd_test(enable = "avx")]
    const fn test_mm256_castsi256_pd() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let r = _mm256_castsi256_pd(a);
        assert_eq_m256d(r, unsafe { transmute(a) });
    }
4988
    // Truncating cast keeps the low 128 bits (lanes 0..4).
    #[simd_test(enable = "avx")]
    const fn test_mm256_castps256_ps128() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_castps256_ps128(a);
        assert_eq_m128(r, _mm_setr_ps(1., 2., 3., 4.));
    }
4995
    // Truncating cast keeps the low 128 bits (lanes 0..2).
    #[simd_test(enable = "avx")]
    const fn test_mm256_castpd256_pd128() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let r = _mm256_castpd256_pd128(a);
        assert_eq_m128d(r, _mm_setr_pd(1., 2.));
    }
5002
    // Truncating cast keeps the low 128 bits (qwords 0..2).
    #[simd_test(enable = "avx")]
    const fn test_mm256_castsi256_si128() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let r = _mm256_castsi256_si128(a);
        assert_eq_m128i(r, _mm_setr_epi64x(1, 2));
    }
5009
    // Widening cast: the low 128 bits are preserved; the upper half is
    // undefined, so only the low half is checked.
    #[simd_test(enable = "avx")]
    const fn test_mm256_castps128_ps256() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let r = _mm256_castps128_ps256(a);
        assert_eq_m128(_mm256_castps256_ps128(r), a);
    }
5016
    // Widening cast: the low 128 bits are preserved; the upper half is
    // undefined, so only the low half is checked.
    #[simd_test(enable = "avx")]
    const fn test_mm256_castpd128_pd256() {
        let a = _mm_setr_pd(1., 2.);
        let r = _mm256_castpd128_pd256(a);
        assert_eq_m128d(_mm256_castpd256_pd128(r), a);
    }
5023
    // Widening cast: the low 128 bits are preserved; the upper half is
    // undefined, so only the low half is checked.
    #[simd_test(enable = "avx")]
    const fn test_mm256_castsi128_si256() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let r = _mm256_castsi128_si256(a);
        assert_eq_m128i(_mm256_castsi256_si128(r), a);
    }
5030
    // Zero-extending widen: unlike the plain cast, the upper 128 bits are
    // guaranteed to be zero, so the whole vector can be checked.
    #[simd_test(enable = "avx")]
    const fn test_mm256_zextps128_ps256() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let r = _mm256_zextps128_ps256(a);
        let e = _mm256_setr_ps(1., 2., 3., 4., 0., 0., 0., 0.);
        assert_eq_m256(r, e);
    }
5038
    // Zero-extending widen: the upper 128 bits are guaranteed to be zero.
    #[simd_test(enable = "avx")]
    const fn test_mm256_zextsi128_si256() {
        let a = _mm_setr_epi64x(1, 2);
        let r = _mm256_zextsi128_si256(a);
        let e = _mm256_setr_epi64x(1, 2, 0, 0);
        assert_eq_m256i(r, e);
    }
5046
    // Zero-extending widen: the upper 128 bits are guaranteed to be zero.
    #[simd_test(enable = "avx")]
    const fn test_mm256_zextpd128_pd256() {
        let a = _mm_setr_pd(1., 2.);
        let r = _mm256_zextpd128_pd256(a);
        let e = _mm256_setr_pd(1., 2., 0., 0.);
        assert_eq_m256d(r, e);
    }
5054
    // set_m128(hi, lo): `lo` becomes the low 128 bits, `hi` the high 128 bits.
    #[simd_test(enable = "avx")]
    const fn test_mm256_set_m128() {
        let hi = _mm_setr_ps(5., 6., 7., 8.);
        let lo = _mm_setr_ps(1., 2., 3., 4.);
        let r = _mm256_set_m128(hi, lo);
        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, e);
    }
5063
    // set_m128d(hi, lo): `lo` becomes the low 128 bits, `hi` the high 128 bits.
    #[simd_test(enable = "avx")]
    const fn test_mm256_set_m128d() {
        let hi = _mm_setr_pd(3., 4.);
        let lo = _mm_setr_pd(1., 2.);
        let r = _mm256_set_m128d(hi, lo);
        let e = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, e);
    }
5072
    // set_m128i(hi, lo): `lo` becomes the low 128 bits, `hi` the high 128 bits.
    #[simd_test(enable = "avx")]
    const fn test_mm256_set_m128i() {
        #[rustfmt::skip]
        let hi = _mm_setr_epi8(
            17, 18, 19, 20,
            21, 22, 23, 24,
            25, 26, 27, 28,
            29, 30, 31, 32,
        );
        #[rustfmt::skip]
        let lo = _mm_setr_epi8(
            1, 2, 3, 4,
            5, 6, 7, 8,
            9, 10, 11, 12,
            13, 14, 15, 16,
        );
        let r = _mm256_set_m128i(hi, lo);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m256i(r, e);
    }
5099
    // setr_m128(lo, hi): argument order is low half first (reverse of set_m128).
    #[simd_test(enable = "avx")]
    const fn test_mm256_setr_m128() {
        let lo = _mm_setr_ps(1., 2., 3., 4.);
        let hi = _mm_setr_ps(5., 6., 7., 8.);
        let r = _mm256_setr_m128(lo, hi);
        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, e);
    }
5108
    // setr_m128d(lo, hi): argument order is low half first (reverse of set_m128d).
    #[simd_test(enable = "avx")]
    const fn test_mm256_setr_m128d() {
        let lo = _mm_setr_pd(1., 2.);
        let hi = _mm_setr_pd(3., 4.);
        let r = _mm256_setr_m128d(lo, hi);
        let e = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, e);
    }
5117
    // setr_m128i(lo, hi): argument order is low half first (reverse of set_m128i).
    #[simd_test(enable = "avx")]
    const fn test_mm256_setr_m128i() {
        #[rustfmt::skip]
        let lo = _mm_setr_epi8(
            1, 2, 3, 4,
            5, 6, 7, 8,
            9, 10, 11, 12,
            13, 14, 15, 16,
        );
        #[rustfmt::skip]
        let hi = _mm_setr_epi8(
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm256_setr_m128i(lo, hi);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m256i(r, e);
    }
5142
    // Two unaligned 128-bit loads combined: `loaddr` fills the low half,
    // `hiaddr` the high half.
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_loadu2_m128() {
        let hi = &[5., 6., 7., 8.];
        let hiaddr = hi.as_ptr();
        let lo = &[1., 2., 3., 4.];
        let loaddr = lo.as_ptr();
        let r = _mm256_loadu2_m128(hiaddr, loaddr);
        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, e);
    }
5153
    // Two unaligned 128-bit double loads combined: `loaddr` fills the low
    // half, `hiaddr` the high half.
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_loadu2_m128d() {
        let hi = &[3., 4.];
        let hiaddr = hi.as_ptr();
        let lo = &[1., 2.];
        let loaddr = lo.as_ptr();
        let r = _mm256_loadu2_m128d(hiaddr, loaddr);
        let e = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, e);
    }
5164
    // Two unaligned 128-bit integer loads combined: `lo` fills the low half,
    // `hi` the high half.
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_loadu2_m128i() {
        #[rustfmt::skip]
        let hi = _mm_setr_epi8(
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        #[rustfmt::skip]
        let lo = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
        );
        let r = _mm256_loadu2_m128i(ptr::addr_of!(hi) as *const _, ptr::addr_of!(lo) as *const _);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m256i(r, e);
    }
5187
    // Splits a 256-bit vector into two unaligned 128-bit stores: the high
    // half goes to the first pointer, the low half to the second.
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_storeu2_m128() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let mut hi = _mm_undefined_ps();
        let mut lo = _mm_undefined_ps();
        _mm256_storeu2_m128(
            ptr::addr_of_mut!(hi) as *mut f32,
            ptr::addr_of_mut!(lo) as *mut f32,
            a,
        );
        assert_eq_m128(hi, _mm_setr_ps(5., 6., 7., 8.));
        assert_eq_m128(lo, _mm_setr_ps(1., 2., 3., 4.));
    }
5201
    // Splits a 256-bit double vector into two unaligned 128-bit stores: high
    // half to the first pointer, low half to the second.
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_storeu2_m128d() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let mut hi = _mm_undefined_pd();
        let mut lo = _mm_undefined_pd();
        _mm256_storeu2_m128d(
            ptr::addr_of_mut!(hi) as *mut f64,
            ptr::addr_of_mut!(lo) as *mut f64,
            a,
        );
        assert_eq_m128d(hi, _mm_setr_pd(3., 4.));
        assert_eq_m128d(lo, _mm_setr_pd(1., 2.));
    }
5215
    // Splits a 256-bit integer vector into two unaligned 128-bit stores: high
    // half to the first pointer, low half to the second.
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_storeu2_m128i() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let mut hi = _mm_undefined_si128();
        let mut lo = _mm_undefined_si128();
        _mm256_storeu2_m128i(ptr::addr_of_mut!(hi), ptr::addr_of_mut!(lo), a);
        #[rustfmt::skip]
        let e_hi = _mm_setr_epi8(
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32
        );
        #[rustfmt::skip]
        let e_lo = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16
        );

        assert_eq_m128i(hi, e_hi);
        assert_eq_m128i(lo, e_lo);
    }
5242
    // Extracts lane 0 of the vector as a scalar f32.
    #[simd_test(enable = "avx")]
    const fn test_mm256_cvtss_f32() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_cvtss_f32(a);
        assert_eq!(r, 1.);
    }
5249}