// core/stdarch/crates/core_arch/src/x86/avx.rs
//! Advanced Vector Extensions (AVX)
//!
//! The references are:
//!
//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
//!   Instruction Set Reference, A-Z][intel64_ref]. - [AMD64 Architecture
//!   Programmer's Manual, Volume 3: General-Purpose and System
//!   Instructions][amd64_ref].
//!
//! [Wikipedia][wiki] provides a quick overview of the instructions available.
//!
//! [intel64_ref]: https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
//! [amd64_ref]: https://docs.amd.com/v/u/en-US/24594_3.37
//! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions

16use crate::{
17    core_arch::{simd::*, x86::*},
18    intrinsics::simd::*,
19    mem, ptr,
20};
21
22#[cfg(test)]
23use stdarch_test::assert_instr;
24
/// Adds packed double-precision (64-bit) floating-point elements
/// in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d {
    // Lane-wise IEEE addition; lowers to a single `vaddpd` (asserted above).
    unsafe { simd_add(a, b) }
}

/// Adds packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 {
    // Lane-wise IEEE addition; lowers to a single `vaddps` (asserted above).
    unsafe { simd_add(a, b) }
}

/// Computes the bitwise AND of packed double-precision (64-bit)
/// floating-point elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_pd)
#[inline]
#[target_feature(enable = "avx")]
// See https://github.com/rust-lang/stdarch/issues/71
#[cfg_attr(test, assert_instr(vandp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d {
    // Bitwise ops are not defined on float vectors, so bit-cast to 64-bit
    // integer lanes, AND, and bit-cast back; the float bit patterns are
    // preserved exactly.
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_and(a, b))
    }
}

/// Computes the bitwise AND of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 {
    // Bit-cast to 32-bit integer lanes for the AND, then cast back; the
    // float bit patterns are preserved exactly.
    unsafe {
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_and(a, b))
    }
}

/// Computes the bitwise OR of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_pd)
#[inline]
#[target_feature(enable = "avx")]
// See <https://github.com/rust-lang/stdarch/issues/71>.
#[cfg_attr(test, assert_instr(vorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d {
    // Bit-cast to 64-bit integer lanes for the OR, then cast back; the
    // float bit patterns are preserved exactly.
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_or(a, b))
    }
}

/// Computes the bitwise OR of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 {
    // Bit-cast to 32-bit integer lanes for the OR, then cast back; the
    // float bit patterns are preserved exactly.
    unsafe {
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_or(a, b))
    }
}

/// Shuffles double-precision (64-bit) floating-point elements within 128-bit
/// lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_shuffle_pd<const MASK: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(MASK, 8);
    // In `simd_shuffle`, indices 0..=3 select from `a` and 4..=7 from `b`.
    // Each of the low four mask bits picks the low/high element of the
    // corresponding 128-bit lane: result = [a-lo, b-lo, a-hi, b-hi] where
    // the `+2`/`+4`/`+6` offsets steer each pick into the right source lane.
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                MASK as u32 & 0b1,
                ((MASK as u32 >> 1) & 0b1) + 4,
                ((MASK as u32 >> 2) & 0b1) + 2,
                ((MASK as u32 >> 3) & 0b1) + 6,
            ],
        )
    }
}

/// Shuffles single-precision (32-bit) floating-point elements in `a` within
/// 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_shuffle_ps<const MASK: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(MASK, 8);
    // In `simd_shuffle`, indices 0..=7 select from `a` and 8..=15 from `b`.
    // The same four 2-bit selectors are applied to both 128-bit lanes: the
    // first two picks of each lane come from `a`, the last two from `b`
    // (hence the `+8`/`+12` offsets), mirroring `vshufps` semantics.
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                MASK as u32 & 0b11,
                (MASK as u32 >> 2) & 0b11,
                ((MASK as u32 >> 4) & 0b11) + 8,
                ((MASK as u32 >> 6) & 0b11) + 8,
                (MASK as u32 & 0b11) + 4,
                ((MASK as u32 >> 2) & 0b11) + 4,
                ((MASK as u32 >> 4) & 0b11) + 12,
                ((MASK as u32 >> 6) & 0b11) + 12,
            ],
        )
    }
}

/// Computes the bitwise NOT of packed double-precision (64-bit) floating-point
/// elements in `a`, and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandnp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d {
    // Computes `!a & b` on the raw bits: NOT is expressed as XOR with an
    // all-ones vector, then the result is ANDed with `b`.
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_and(simd_xor(u64x4::splat(!(0_u64)), a), b))
    }
}

/// Computes the bitwise NOT of packed single-precision (32-bit) floating-point
/// elements in `a`
/// and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 {
    // Computes `!a & b` on the raw bits: NOT is expressed as XOR with an
    // all-ones vector, then the result is ANDed with `b`.
    unsafe {
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_and(simd_xor(u32x8::splat(!(0_u32)), a), b))
    }
}

/// Compares packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and returns packed maximum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaxpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d {
    // NOTE(review): delegates to the raw `vmaxpd` intrinsic rather than a
    // generic simd max, presumably to keep the instruction's exact NaN and
    // signed-zero behavior — see Intel's documentation for the semantics.
    unsafe { vmaxpd(a, b) }
}

/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns packed maximum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaxps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 {
    // NOTE(review): delegates to the raw `vmaxps` intrinsic rather than a
    // generic simd max, presumably to keep the instruction's exact NaN and
    // signed-zero behavior — see Intel's documentation for the semantics.
    unsafe { vmaxps(a, b) }
}

/// Compares packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and returns packed minimum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vminpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d {
    // NOTE(review): delegates to the raw `vminpd` intrinsic rather than a
    // generic simd min, presumably to keep the instruction's exact NaN and
    // signed-zero behavior — see Intel's documentation for the semantics.
    unsafe { vminpd(a, b) }
}

/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns packed minimum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vminps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 {
    // NOTE(review): delegates to the raw `vminps` intrinsic rather than a
    // generic simd min, presumably to keep the instruction's exact NaN and
    // signed-zero behavior — see Intel's documentation for the semantics.
    unsafe { vminps(a, b) }
}

/// Multiplies packed double-precision (64-bit) floating-point elements
/// in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmulpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d {
    // Lane-wise IEEE multiplication; lowers to a single `vmulpd`.
    unsafe { simd_mul(a, b) }
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmulps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 {
    // Lane-wise IEEE multiplication; lowers to a single `vmulps`.
    unsafe { simd_mul(a, b) }
}

/// Alternatively adds and subtracts packed double-precision (64-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d {
    // Compute both full results, then interleave: in the shuffle of
    // (add, sub) indices 0..=3 are `add` and 4..=7 are `sub`, so even
    // result lanes take the subtraction and odd lanes take the addition.
    unsafe {
        let a = a.as_f64x4();
        let b = b.as_f64x4();
        let add = simd_add(a, b);
        let sub = simd_sub(a, b);
        simd_shuffle!(add, sub, [4, 1, 6, 3])
    }
}

/// Alternatively adds and subtracts packed single-precision (32-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 {
    // Compute both full results, then interleave: in the shuffle of
    // (add, sub) indices 0..=7 are `add` and 8..=15 are `sub`, so even
    // result lanes take the subtraction and odd lanes take the addition.
    unsafe {
        let a = a.as_f32x8();
        let b = b.as_f32x8();
        let add = simd_add(a, b);
        let sub = simd_sub(a, b);
        simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7])
    }
}

/// Subtracts packed double-precision (64-bit) floating-point elements in `b`
/// from packed elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d {
    // Lane-wise IEEE subtraction (`a - b`); lowers to a single `vsubpd`.
    unsafe { simd_sub(a, b) }
}

/// Subtracts packed single-precision (32-bit) floating-point elements in `b`
/// from packed elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 {
    // Lane-wise IEEE subtraction (`a - b`); lowers to a single `vsubps`.
    unsafe { simd_sub(a, b) }
}

/// Computes the division of each of the 8 packed 32-bit floating-point elements
/// in `a` by the corresponding packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdivps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 {
    // Lane-wise IEEE division (`a / b`); lowers to a single `vdivps`.
    unsafe { simd_div(a, b) }
}

/// Computes the division of each of the 4 packed 64-bit floating-point elements
/// in `a` by the corresponding packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdivpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d {
    // Lane-wise IEEE division (`a / b`); lowers to a single `vdivpd`.
    unsafe { simd_div(a, b) }
}

/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
///
/// - `0x00`: Round to the nearest whole number.
/// - `0x01`: Round down, toward negative infinity.
/// - `0x02`: Round up, toward positive infinity.
/// - `0x03`: Truncate the values.
///
/// For a complete list of options, check [the LLVM docs][llvm_docs].
///
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd, ROUNDING = 0x3))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_round_pd<const ROUNDING: i32>(a: __m256d) -> __m256d {
    // ROUNDING must fit in 4 bits; it is forwarded as the immediate operand
    // of `vroundpd`.
    static_assert_uimm_bits!(ROUNDING, 4);
    unsafe { roundpd256(a, ROUNDING) }
}

/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// toward positive infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_ceil_pd(a: __m256d) -> __m256d {
    // Lane-wise ceiling; lowers to `vroundpd` (asserted above).
    unsafe { simd_ceil(a) }
}

/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// toward negative infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_floor_pd(a: __m256d) -> __m256d {
    // Lane-wise floor; lowers to `vroundpd` (asserted above).
    unsafe { simd_floor(a) }
}

/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
///
/// - `0x00`: Round to the nearest whole number.
/// - `0x01`: Round down, toward negative infinity.
/// - `0x02`: Round up, toward positive infinity.
/// - `0x03`: Truncate the values.
///
/// For a complete list of options, check [the LLVM docs][llvm_docs].
///
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps, ROUNDING = 0x00))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_round_ps<const ROUNDING: i32>(a: __m256) -> __m256 {
    // ROUNDING must fit in 4 bits; it is forwarded as the immediate operand
    // of `vroundps`.
    static_assert_uimm_bits!(ROUNDING, 4);
    unsafe { roundps256(a, ROUNDING) }
}

/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// toward positive infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_ceil_ps(a: __m256) -> __m256 {
    // Lane-wise ceiling; lowers to `vroundps` (asserted above).
    unsafe { simd_ceil(a) }
}

/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// toward negative infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_floor_ps(a: __m256) -> __m256 {
    // Lane-wise floor; lowers to `vroundps` (asserted above).
    unsafe { simd_floor(a) }
}

/// Returns the square root of packed single-precision (32-bit) floating point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sqrt_ps(a: __m256) -> __m256 {
    // Lane-wise square root; lowers to a single `vsqrtps`.
    unsafe { simd_fsqrt(a) }
}

/// Returns the square root of packed double-precision (64-bit) floating point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsqrtpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sqrt_pd(a: __m256d) -> __m256d {
    // Lane-wise square root; lowers to a single `vsqrtpd`.
    unsafe { simd_fsqrt(a) }
}

/// Blends packed double-precision (64-bit) floating-point elements from
/// `a` and `b` using control mask `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_pd)
#[inline]
#[target_feature(enable = "avx")]
// Note: LLVM7 prefers single-precision blend instructions when
// possible, see: https://bugs.llvm.org/show_bug.cgi?id=38194
// #[cfg_attr(test, assert_instr(vblendpd, imm8 = 9))]
#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_blend_pd<const IMM4: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM4, 4);
    // In `simd_shuffle`, indices 0..=3 select from `a` and 4..=7 from `b`.
    // Bit i of the immediate chooses the source for element i: when set,
    // the `* 4` term shifts the index into `b`'s range.
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                ((IMM4 as u32 >> 0) & 1) * 4 + 0,
                ((IMM4 as u32 >> 1) & 1) * 4 + 1,
                ((IMM4 as u32 >> 2) & 1) * 4 + 2,
                ((IMM4 as u32 >> 3) & 1) * 4 + 3,
            ],
        )
    }
}

/// Blends packed single-precision (32-bit) floating-point elements from
/// `a` and `b` using control mask `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_blend_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    // In `simd_shuffle`, indices 0..=7 select from `a` and 8..=15 from `b`.
    // Bit i of the immediate chooses the source for element i: when set,
    // the `* 8` term shifts the index into `b`'s range.
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                ((IMM8 as u32 >> 0) & 1) * 8 + 0,
                ((IMM8 as u32 >> 1) & 1) * 8 + 1,
                ((IMM8 as u32 >> 2) & 1) * 8 + 2,
                ((IMM8 as u32 >> 3) & 1) * 8 + 3,
                ((IMM8 as u32 >> 4) & 1) * 8 + 4,
                ((IMM8 as u32 >> 5) & 1) * 8 + 5,
                ((IMM8 as u32 >> 6) & 1) * 8 + 6,
                ((IMM8 as u32 >> 7) & 1) * 8 + 7,
            ],
        )
    }
}

/// Blends packed double-precision (64-bit) floating-point elements from
/// `a` and `b` using `c` as a mask.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
    // The selection is driven by the sign bit of each lane of `c`: a signed
    // `< 0` compare on the bit-cast lanes yields an all-ones/all-zeros mask,
    // which then selects `b` where set and `a` otherwise.
    unsafe {
        let mask: i64x4 = simd_lt(transmute::<_, i64x4>(c), i64x4::ZERO);
        transmute(simd_select(mask, b.as_f64x4(), a.as_f64x4()))
    }
}

/// Blends packed single-precision (32-bit) floating-point elements from
/// `a` and `b` using `c` as a mask.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
    // The selection is driven by the sign bit of each lane of `c`: a signed
    // `< 0` compare on the bit-cast lanes yields an all-ones/all-zeros mask,
    // which then selects `b` where set and `a` otherwise.
    unsafe {
        let mask: i32x8 = simd_lt(transmute::<_, i32x8>(c), i32x8::ZERO);
        transmute(simd_select(mask, b.as_f32x8(), a.as_f32x8()))
    }
}

/// Conditionally multiplies the packed single-precision (32-bit) floating-point
/// elements in `a` and `b` using the high 4 bits in `imm8`,
/// sum the four products, and conditionally return the sum
///  using the low 4 bits of `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdpps, IMM8 = 0x0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_dp_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    // The validated 8-bit immediate is forwarded directly to the `vdpps`
    // intrinsic, which interprets it per the doc comment above.
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { vdpps(a, b, IMM8 as i8) }
}

/// Horizontal addition of adjacent pairs in the two packed vectors
/// of 4 64-bit floating points `a` and `b`.
/// In the result, sums of elements from `a` are returned in even locations,
/// while sums of elements from `b` are returned in odd locations.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhaddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
    // Gather the even-indexed and odd-indexed member of every adjacent pair
    // (shuffle indices 0..=3 are `a`, 4..=7 are `b`), then add element-wise
    // so each output lane holds the sum of one pair.
    unsafe {
        let even = simd_shuffle!(a, b, [0, 4, 2, 6]);
        let odd = simd_shuffle!(a, b, [1, 5, 3, 7]);
        simd_add(even, odd)
    }
}

/// Horizontal addition of adjacent pairs in the two packed vectors
/// of 8 32-bit floating points `a` and `b`.
/// In the result, sums of elements from `a` are returned in locations of
/// indices 0, 1, 4, 5; while sums of elements from `b` are returned in
/// locations 2, 3, 6, 7.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhaddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
    // Gather the even-indexed and odd-indexed member of every adjacent pair
    // (shuffle indices 0..=7 are `a`, 8..=15 are `b`), then add element-wise
    // so each output lane holds the sum of one pair.
    unsafe {
        let even = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
        let odd = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
        simd_add(even, odd)
    }
}

/// Horizontal subtraction of adjacent pairs in the two packed vectors
/// of 4 64-bit floating points `a` and `b`.
/// In the result, differences of elements from `a` are returned in even
/// locations, while differences of elements from `b` are returned in odd
/// locations.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
    // Gather the even-indexed and odd-indexed member of every adjacent pair
    // (shuffle indices 0..=3 are `a`, 4..=7 are `b`), then subtract
    // element-wise so each output lane holds `even - odd` of one pair.
    unsafe {
        let even = simd_shuffle!(a, b, [0, 4, 2, 6]);
        let odd = simd_shuffle!(a, b, [1, 5, 3, 7]);
        simd_sub(even, odd)
    }
}

/// Horizontal subtraction of adjacent pairs in the two packed vectors
/// of 8 32-bit floating points `a` and `b`.
/// In the result, differences of elements from `a` are returned in locations
/// of indices 0, 1, 4, 5; while differences of elements from `b` are returned
/// in locations 2, 3, 6, 7.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 {
    // Gather the even-indexed and odd-indexed member of every adjacent pair
    // (shuffle indices 0..=7 are `a`, 8..=15 are `b`), then subtract
    // element-wise so each output lane holds `even - odd` of one pair.
    unsafe {
        let even = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
        let odd = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
        simd_sub(even, odd)
    }
}

/// Computes the bitwise XOR of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d {
    // Bit-cast to 64-bit integer lanes for the XOR, then cast back; the
    // float bit patterns are preserved exactly.
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_xor(a, b))
    }
}

/// Computes the bitwise XOR of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_xor_ps(a: __m256, b: __m256) -> __m256 {
    // Bit-cast to 32-bit integer lanes for the XOR, then cast back; the
    // float bit patterns are preserved exactly.
    unsafe {
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_xor(a, b))
    }
}

// Comparison predicate values, used as the immediate predicate operand of
// the AVX compare intrinsics. Each doc comment gives the predicate and
// whether it is ordered/unordered and signaling/non-signaling on NaN inputs
// (see Intel's documentation for the full semantics).

/// Equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_OQ: i32 = 0x00;
/// Less-than (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LT_OS: i32 = 0x01;
/// Less-than-or-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LE_OS: i32 = 0x02;
/// Unordered (non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_UNORD_Q: i32 = 0x03;
/// Not-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_UQ: i32 = 0x04;
/// Not-less-than (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLT_US: i32 = 0x05;
/// Not-less-than-or-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLE_US: i32 = 0x06;
/// Ordered (non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_ORD_Q: i32 = 0x07;
/// Equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_UQ: i32 = 0x08;
/// Not-greater-than-or-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGE_US: i32 = 0x09;
/// Not-greater-than (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGT_US: i32 = 0x0a;
/// False (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_FALSE_OQ: i32 = 0x0b;
/// Not-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_OQ: i32 = 0x0c;
/// Greater-than-or-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GE_OS: i32 = 0x0d;
/// Greater-than (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GT_OS: i32 = 0x0e;
/// True (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_TRUE_UQ: i32 = 0x0f;
/// Equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_OS: i32 = 0x10;
/// Less-than (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LT_OQ: i32 = 0x11;
/// Less-than-or-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LE_OQ: i32 = 0x12;
/// Unordered (signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_UNORD_S: i32 = 0x13;
/// Not-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_US: i32 = 0x14;
/// Not-less-than (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLT_UQ: i32 = 0x15;
/// Not-less-than-or-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLE_UQ: i32 = 0x16;
/// Ordered (signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_ORD_S: i32 = 0x17;
/// Equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_US: i32 = 0x18;
/// Not-greater-than-or-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGE_UQ: i32 = 0x19;
795/// Not-greater-than (unordered, non-signaling)
796#[stable(feature = "simd_x86", since = "1.27.0")]
797pub const _CMP_NGT_UQ: i32 = 0x1a;
798/// False (ordered, signaling)
799#[stable(feature = "simd_x86", since = "1.27.0")]
800pub const _CMP_FALSE_OS: i32 = 0x1b;
801/// Not-equal (ordered, signaling)
802#[stable(feature = "simd_x86", since = "1.27.0")]
803pub const _CMP_NEQ_OS: i32 = 0x1c;
804/// Greater-than-or-equal (ordered, non-signaling)
805#[stable(feature = "simd_x86", since = "1.27.0")]
806pub const _CMP_GE_OQ: i32 = 0x1d;
807/// Greater-than (ordered, non-signaling)
808#[stable(feature = "simd_x86", since = "1.27.0")]
809pub const _CMP_GT_OQ: i32 = 0x1e;
810/// True (unordered, signaling)
811#[stable(feature = "simd_x86", since = "1.27.0")]
812pub const _CMP_TRUE_US: i32 = 0x1f;
813
/// Compares packed double-precision (64-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// Use one of the `_CMP_*` constants as the `IMM5` predicate.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmp_pd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM5, 5);
    // IMM5 is checked to fit in 5 bits above; the inline `const` block keeps
    // the cast a compile-time immediate for the underlying intrinsic.
    unsafe { vcmppd(a, b, const { IMM5 as i8 }) }
}
828
829/// Compares packed double-precision (64-bit) floating-point
830/// elements in `a` and `b` based on the comparison operand
831/// specified by `IMM5`.
832///
833/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_pd)
834#[inline]
835#[target_feature(enable = "avx")]
836#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
837#[rustc_legacy_const_generics(2)]
838#[stable(feature = "simd_x86", since = "1.27.0")]
839pub fn _mm256_cmp_pd<const IMM5: i32>(a: __m256d, b: __m256d) -> __m256d {
840    static_assert_uimm_bits!(IMM5, 5);
841    unsafe { vcmppd256(a, b, IMM5 as u8) }
842}
843
/// Compares packed single-precision (32-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// Use one of the `_CMP_*` constants as the `IMM5` predicate.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmp_ps<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM5, 5);
    // IMM5 is checked to fit in 5 bits above; the inline `const` block keeps
    // the cast a compile-time immediate for the underlying intrinsic.
    unsafe { vcmpps(a, b, const { IMM5 as i8 }) }
}
858
/// Compares packed single-precision (32-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// Use one of the `_CMP_*` constants as the `IMM5` predicate.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmp_ps<const IMM5: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM5, 5);
    // IMM5 is checked to fit in 5 bits above; the inline `const` block keeps
    // the cast a compile-time immediate for the underlying intrinsic.
    unsafe { vcmpps256(a, b, const { IMM5 as u8 }) }
}
873
874/// Compares the lower double-precision (64-bit) floating-point element in
875/// `a` and `b` based on the comparison operand specified by `IMM5`,
876/// store the result in the lower element of returned vector,
877/// and copies the upper element from `a` to the upper element of returned
878/// vector.
879///
880/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_sd)
881#[inline]
882#[target_feature(enable = "avx")]
883#[cfg_attr(test, assert_instr(vcmpeqsd, IMM5 = 0))] // TODO Validate vcmpsd
884#[rustc_legacy_const_generics(2)]
885#[stable(feature = "simd_x86", since = "1.27.0")]
886pub fn _mm_cmp_sd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
887    static_assert_uimm_bits!(IMM5, 5);
888    unsafe { vcmpsd(a, b, IMM5 as i8) }
889}
890
891/// Compares the lower single-precision (32-bit) floating-point element in
892/// `a` and `b` based on the comparison operand specified by `IMM5`,
893/// store the result in the lower element of returned vector,
894/// and copies the upper 3 packed elements from `a` to the upper elements of
895/// returned vector.
896///
897/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ss)
898#[inline]
899#[target_feature(enable = "avx")]
900#[cfg_attr(test, assert_instr(vcmpeqss, IMM5 = 0))] // TODO Validate vcmpss
901#[rustc_legacy_const_generics(2)]
902#[stable(feature = "simd_x86", since = "1.27.0")]
903pub fn _mm_cmp_ss<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
904    static_assert_uimm_bits!(IMM5, 5);
905    unsafe { vcmpss(a, b, IMM5 as i8) }
906}
907
/// Converts packed 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtdq2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d {
    // Lane-wise i32 -> f64 cast; every i32 is exactly representable as f64,
    // so this conversion is lossless.
    unsafe { simd_cast(a.as_i32x4()) }
}
920
/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtdq2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 {
    // Lane-wise i32 -> f32 cast; values with more than 24 significant bits
    // cannot be represented exactly in f32 and are rounded.
    unsafe { simd_cast(a.as_i32x8()) }
}
933
/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed single-precision (32-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtpd2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtpd_ps(a: __m256d) -> __m128 {
    // Narrowing f64 -> f32 cast on all four lanes; the result has half the
    // vector width (four f32 lanes in an __m128).
    unsafe { simd_cast(a) }
}
946
/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtps_epi32(a: __m256) -> __m256i {
    // `vcvtps2dq` rounds (per the instruction's semantics), in contrast with
    // the truncating `_mm256_cvttps_epi32` below.
    unsafe { transmute(vcvtps2dq(a)) }
}
958
/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed double-precision (64-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtps2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtps_pd(a: __m128) -> __m256d {
    // Widening f32 -> f64 cast on all four lanes; every f32 is exactly
    // representable as f64, so this conversion is lossless.
    unsafe { simd_cast(a) }
}
971
/// Returns the first element of the input vector of `[4 x double]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsd_f64)
#[inline]
#[target_feature(enable = "avx")]
//#[cfg_attr(test, assert_instr(movsd))] FIXME
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtsd_f64(a: __m256d) -> f64 {
    // Extract lane 0 (the lowest-addressed element) as a scalar.
    unsafe { simd_extract!(a, 0) }
}
983
/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttpd_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvttpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i {
    // The `tt` (truncating) variant: fractional parts are discarded instead
    // of rounded, unlike `_mm256_cvtpd_epi32` below.
    unsafe { transmute(vcvttpd2dq(a)) }
}
995
/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i {
    // `vcvtpd2dq` rounds (per the instruction's semantics), in contrast with
    // the truncating `_mm256_cvttpd_epi32` above.
    unsafe { transmute(vcvtpd2dq(a)) }
}
1007
/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttps_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvttps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvttps_epi32(a: __m256) -> __m256i {
    // The `tt` (truncating) variant: fractional parts are discarded instead
    // of rounded, unlike `_mm256_cvtps_epi32` above.
    unsafe { transmute(vcvttps2dq(a)) }
}
1019
/// Extracts 128 bits (composed of 4 packed single-precision (32-bit)
/// floating-point elements) from `a`, selected with `IMM1`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_extractf128_ps<const IMM1: i32>(a: __m256) -> __m128 {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        // IMM1 selects the low (lanes 0-3) or high (lanes 4-7) 128-bit half.
        // All chosen indices are < 8, i.e. they only refer to `a`; the
        // undefined second shuffle operand is never actually read.
        simd_shuffle!(
            a,
            _mm256_undefined_ps(),
            [[0, 1, 2, 3], [4, 5, 6, 7]][IMM1 as usize],
        )
    }
}
1040
/// Extracts 128 bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `a`, selected with `IMM1`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_extractf128_pd<const IMM1: i32>(a: __m256d) -> __m128d {
    static_assert_uimm_bits!(IMM1, 1);
    // IMM1 selects the low (lanes 0-1) or high (lanes 2-3) 128-bit half; all
    // indices refer to `a`, so the undefined second operand is never read.
    unsafe { simd_shuffle!(a, _mm256_undefined_pd(), [[0, 1], [2, 3]][IMM1 as usize]) }
}
1055
/// Extracts 128 bits (composed of integer data) from `a`, selected with `IMM1`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_extractf128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        // View `a` as four i64 lanes and pick the low (0-1) or high (2-3)
        // pair; the zero second operand is never selected by these indices.
        let dst: i64x2 = simd_shuffle!(a.as_i64x4(), i64x4::ZERO, [[0, 1], [2, 3]][IMM1 as usize],);
        transmute(dst)
    }
}
1072
/// Extracts a 32-bit integer from `a`, selected with `INDEX` (0-7).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi32)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_extract_epi32<const INDEX: i32>(a: __m256i) -> i32 {
    // 3 bits cover all eight i32 lanes of a 256-bit vector.
    static_assert_uimm_bits!(INDEX, 3);
    unsafe { simd_extract!(a.as_i32x8(), INDEX as u32) }
}
1086
/// Returns the first element of the input vector of `[8 x i32]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsi256_si32)
#[inline]
#[target_feature(enable = "avx")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtsi256_si32(a: __m256i) -> i32 {
    // Extract lane 0 of the vector reinterpreted as eight i32 lanes.
    unsafe { simd_extract!(a.as_i32x8(), 0) }
}
1097
/// Zeroes the contents of all XMM or YMM registers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroall)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vzeroall))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_zeroall() {
    // Thin wrapper over the raw `vzeroall` intrinsic.
    unsafe { vzeroall() }
}
1108
/// Zeroes the upper 128 bits of all YMM registers;
/// the lower 128-bits of the registers are unmodified.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroupper)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vzeroupper))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_zeroupper() {
    // Thin wrapper over the raw `vzeroupper` intrinsic.
    unsafe { vzeroupper() }
}
1120
/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 {
    // `b` supplies a per-lane selector; delegated to the raw intrinsic.
    unsafe { vpermilps256(a, b.as_i32x8()) }
}
1132
/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 {
    // `b` supplies a per-lane selector; delegated to the raw intrinsic.
    unsafe { vpermilps(a, b.as_i32x4()) }
}
1144
/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // Each 2-bit field of IMM8 selects one of the four f32 lanes inside a
        // 128-bit half; the same selection is applied to both halves (the
        // `+ 4` entries index the high half). All indices refer to `a`, so
        // the undefined second shuffle operand is never read.
        simd_shuffle!(
            a,
            _mm256_undefined_ps(),
            [
                (IMM8 as u32 >> 0) & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
                ((IMM8 as u32 >> 0) & 0b11) + 4,
                ((IMM8 as u32 >> 2) & 0b11) + 4,
                ((IMM8 as u32 >> 4) & 0b11) + 4,
                ((IMM8 as u32 >> 6) & 0b11) + 4,
            ],
        )
    }
}
1174
/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_permute_ps<const IMM8: i32>(a: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // Each 2-bit field of IMM8 selects one of the four f32 lanes of `a`;
        // the undefined second shuffle operand is never read.
        simd_shuffle!(
            a,
            _mm_undefined_ps(),
            [
                (IMM8 as u32 >> 0) & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
            ],
        )
    }
}
1200
/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d {
    // `b` supplies a per-lane selector; `vpermilpd` permutes within each
    // 128-bit lane (see Intel's documentation), matching `_mm256_permute_pd`.
    unsafe { vpermilpd256(a, b.as_i64x4()) }
}
1212
/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d {
    // `b` supplies a per-lane selector; delegated to the raw intrinsic.
    unsafe { vpermilpd(a, b.as_i64x2()) }
}
1224
/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, IMM4 = 0x1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_permute_pd<const IMM4: i32>(a: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM4, 4);
    unsafe {
        // One control bit per output lane: bits 0-1 pick within the low
        // 128-bit half (lanes 0-1), bits 2-3 within the high half (the `+ 2`
        // entries). All indices refer to `a`, so the undefined second
        // shuffle operand is never read.
        simd_shuffle!(
            a,
            _mm256_undefined_pd(),
            [
                ((IMM4 as u32 >> 0) & 1),
                ((IMM4 as u32 >> 1) & 1),
                ((IMM4 as u32 >> 2) & 1) + 2,
                ((IMM4 as u32 >> 3) & 1) + 2,
            ],
        )
    }
}
1250
/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, IMM2 = 0x1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_permute_pd<const IMM2: i32>(a: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM2, 2);
    unsafe {
        // One control bit per output lane selects either f64 lane of `a`;
        // the undefined second shuffle operand is never read.
        simd_shuffle!(
            a,
            _mm_undefined_pd(),
            [(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1],
        )
    }
}
1271
/// Shuffles 256 bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) selected by `imm8` from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x5))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_permute2f128_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    // Bit-cast to integer vectors and reuse the shared si256 implementation;
    // the operation only moves 128-bit halves, so the element type is
    // irrelevant.
    _mm256_castsi256_ps(_mm256_permute2f128_si256::<IMM8>(
        _mm256_castps_si256(a),
        _mm256_castps_si256(b),
    ))
}
1289
/// Shuffles 256 bits (composed of 4 packed double-precision (64-bit)
/// floating-point elements) selected by `imm8` from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_permute2f128_pd<const IMM8: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM8, 8);
    // Bit-cast to integer vectors and reuse the shared si256 implementation;
    // the operation only moves 128-bit halves, so the element type is
    // irrelevant.
    _mm256_castsi256_pd(_mm256_permute2f128_si256::<IMM8>(
        _mm256_castpd_si256(a),
        _mm256_castpd_si256(b),
    ))
}
1307
/// Shuffles 128-bits (composed of integer data) selected by `imm8`
/// from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_permute2f128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // Maps output i64 lane `pos` (0-3) to a source lane in `a:b`.
    // The low nibble of imm8 controls the result's low 128-bit half
    // (pos 0-1), the high nibble the high half (pos 2-3). Bits 0-1 of the
    // nibble pick one of the four 128-bit halves of `a:b` (0-1 from `a`,
    // 2-3 from `b`); `pos & 1` picks the i64 within that half.
    const fn idx(imm8: i32, pos: u32) -> u32 {
        let part = if pos < 2 {
            imm8 & 0xf
        } else {
            (imm8 & 0xf0) >> 4
        };
        2 * (part as u32 & 0b11) + (pos & 1)
    }
    // Second pass: bit 3 of the controlling nibble zeroes that output half.
    // Index 4 selects lane 0 of the all-zero second shuffle operand below;
    // otherwise the lane produced by the first pass is kept.
    const fn idx0(imm8: i32, pos: u32) -> u32 {
        let part = if pos < 2 {
            imm8 & 0xf
        } else {
            (imm8 & 0xf0) >> 4
        };
        if part & 0b1000 != 0 { 4 } else { pos }
    }
    unsafe {
        // Pass 1: gather the selected 128-bit halves from `a` and `b`.
        let r = simd_shuffle!(
            a.as_i64x4(),
            b.as_i64x4(),
            [idx(IMM8, 0), idx(IMM8, 1), idx(IMM8, 2), idx(IMM8, 3)]
        );
        // Pass 2: apply the zeroing bits.
        let r: i64x4 = simd_shuffle!(
            r,
            i64x4::ZERO,
            [idx0(IMM8, 0), idx0(IMM8, 1), idx0(IMM8, 2), idx0(IMM8, 3)]
        );
        r.as_m256i()
    }
}
1350
/// Broadcasts a single-precision (32-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ss)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcast_ss(f: &f32) -> __m256 {
    // The by-reference parameter mirrors Intel's `float const *` signature;
    // the splat itself is expressed through `_mm256_set1_ps`.
    _mm256_set1_ps(*f)
}
1364
/// Broadcasts a single-precision (32-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcast_ss)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcast_ss(f: &f32) -> __m128 {
    // The by-reference parameter mirrors Intel's `float const *` signature;
    // the splat itself is expressed through `_mm_set1_ps`.
    _mm_set1_ps(*f)
}
1378
/// Broadcasts a double-precision (64-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_sd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcast_sd(f: &f64) -> __m256d {
    // The by-reference parameter mirrors Intel's `double const *` signature;
    // the splat itself is expressed through `_mm256_set1_pd`.
    _mm256_set1_pd(*f)
}
1392
/// Broadcasts 128 bits from memory (composed of 4 packed single-precision
/// (32-bit) floating-point elements) to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcast_ps(a: &__m128) -> __m256 {
    // Duplicate `a`'s four lanes into both 128-bit halves; all shuffle
    // indices are < 4, so the zero second operand is never read.
    unsafe { simd_shuffle!(*a, _mm_setzero_ps(), [0, 1, 2, 3, 0, 1, 2, 3]) }
}
1405
/// Broadcasts 128 bits from memory (composed of 2 packed double-precision
/// (64-bit) floating-point elements) to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcast_pd(a: &__m128d) -> __m256d {
    // Duplicate `a`'s two lanes into both 128-bit halves; all shuffle
    // indices are < 2, so the zero second operand is never read.
    unsafe { simd_shuffle!(*a, _mm_setzero_pd(), [0, 1, 0, 1]) }
}
1418
/// Copies `a` to result, then inserts 128 bits (composed of 4 packed
/// single-precision (32-bit) floating-point elements) from `b` into result
/// at the location specified by `IMM1`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m256 {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        // `b` is widened to 256 bits so the shuffle can address it: indices
        // 0-7 refer to `a`, indices 8-11 to `b`'s four lanes. IMM1 = 0
        // replaces `a`'s low half, IMM1 = 1 its high half.
        simd_shuffle!(
            a,
            _mm256_castps128_ps256(b),
            [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]][IMM1 as usize],
        )
    }
}
1440
/// Copies `a` to result, then inserts 128 bits (composed of 2 packed
/// double-precision (64-bit) floating-point elements) from `b` into result
/// at the location specified by `IMM1`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> __m256d {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        // `b` is widened to 256 bits so the shuffle can address it: indices
        // 0-3 refer to `a`, indices 4-5 to `b`'s two lanes. IMM1 = 0
        // replaces `a`'s low half, IMM1 = 1 its high half.
        simd_shuffle!(
            a,
            _mm256_castpd128_pd256(b),
            [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
        )
    }
}
1462
/// Copies `a` to result, then inserts 128 bits from `b` into result
/// at the location specified by `IMM1`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_insertf128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        // Operate on i64 lanes: indices 0-3 refer to `a`, indices 4-5 to the
        // widened `b`'s two lanes. IMM1 = 0 replaces `a`'s low half,
        // IMM1 = 1 its high half.
        let dst: i64x4 = simd_shuffle!(
            a.as_i64x4(),
            _mm256_castsi128_si256(b).as_i64x4(),
            [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
        );
        transmute(dst)
    }
}
1484
/// Copies `a` to result, and inserts the 8-bit integer `i` into result
/// at the location specified by `index`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi8)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_insert_epi8<const INDEX: i32>(a: __m256i, i: i8) -> __m256i {
    // 5 bits: INDEX is a byte-lane index in 0..32.
    static_assert_uimm_bits!(INDEX, 5);
    unsafe { transmute(simd_insert!(a.as_i8x32(), INDEX as u32, i)) }
}
1499
/// Copies `a` to result, and inserts the 16-bit integer `i` into result
/// at the location specified by `index`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi16)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_insert_epi16<const INDEX: i32>(a: __m256i, i: i16) -> __m256i {
    // 4 bits: INDEX is a 16-bit-lane index in 0..16.
    static_assert_uimm_bits!(INDEX, 4);
    unsafe { transmute(simd_insert!(a.as_i16x16(), INDEX as u32, i)) }
}
1514
/// Copies `a` to result, and inserts the 32-bit integer `i` into result
/// at the location specified by `index`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi32)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_insert_epi32<const INDEX: i32>(a: __m256i, i: i32) -> __m256i {
    // 3 bits: INDEX is a 32-bit-lane index in 0..8.
    static_assert_uimm_bits!(INDEX, 3);
    unsafe { transmute(simd_insert!(a.as_i32x8(), INDEX as u32, i)) }
}
1529
/// Loads 256-bits (composed of 4 packed double-precision (64-bit)
/// floating-point elements) from memory into result.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovap)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d {
    // Plain aligned dereference; the 32-byte alignment requirement is the
    // caller's obligation (see the safety note in the doc comment).
    *(mem_addr as *const __m256d)
}
1548
/// Stores 256-bits (composed of 4 packed double-precision (64-bit)
/// floating-point elements) from `a` into memory.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovap)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_store_pd(mem_addr: *mut f64, a: __m256d) {
    // Plain aligned store; alignment is the caller's obligation.
    *(mem_addr as *mut __m256d) = a;
}
1567
/// Loads 256-bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) from memory into result.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256 {
    // Plain aligned dereference; alignment is the caller's obligation.
    *(mem_addr as *const __m256)
}
1586
/// Stores 256-bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) from `a` into memory.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_store_ps(mem_addr: *mut f32, a: __m256) {
    // Plain aligned store; alignment is the caller's obligation.
    *(mem_addr as *mut __m256) = a;
}
1605
1606/// Loads 256-bits (composed of 4 packed double-precision (64-bit)
1607/// floating-point elements) from memory into result.
1608/// `mem_addr` does not need to be aligned on any particular boundary.
1609///
1610/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_pd)
1611#[inline]
1612#[target_feature(enable = "avx")]
1613#[cfg_attr(test, assert_instr(vmovup))]
1614#[stable(feature = "simd_x86", since = "1.27.0")]
1615#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1616pub const unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> __m256d {
1617    let mut dst = _mm256_undefined_pd();
1618    ptr::copy_nonoverlapping(
1619        mem_addr as *const u8,
1620        ptr::addr_of_mut!(dst) as *mut u8,
1621        mem::size_of::<__m256d>(),
1622    );
1623    dst
1624}
1625
/// Stores 256-bits (composed of 4 packed double-precision (64-bit)
/// floating-point elements) from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_storeu_pd(mem_addr: *mut f64, a: __m256d) {
    // Bytewise unaligned store; no alignment requirement on `mem_addr`.
    mem_addr.cast::<__m256d>().write_unaligned(a);
}
1639
1640/// Loads 256-bits (composed of 8 packed single-precision (32-bit)
1641/// floating-point elements) from memory into result.
1642/// `mem_addr` does not need to be aligned on any particular boundary.
1643///
1644/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_ps)
1645#[inline]
1646#[target_feature(enable = "avx")]
1647#[cfg_attr(test, assert_instr(vmovups))]
1648#[stable(feature = "simd_x86", since = "1.27.0")]
1649#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1650pub const unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> __m256 {
1651    let mut dst = _mm256_undefined_ps();
1652    ptr::copy_nonoverlapping(
1653        mem_addr as *const u8,
1654        ptr::addr_of_mut!(dst) as *mut u8,
1655        mem::size_of::<__m256>(),
1656    );
1657    dst
1658}
1659
/// Stores 256-bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_storeu_ps(mem_addr: *mut f32, a: __m256) {
    // Bytewise unaligned store; no alignment requirement on `mem_addr`.
    mem_addr.cast::<__m256>().write_unaligned(a);
}
1673
/// Loads 256-bits of integer data from memory into result.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovaps)
)] // FIXME vmovdqa expected
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i {
    // Plain aligned dereference; alignment is the caller's obligation.
    *mem_addr
}
1690
/// Stores 256-bits of integer data from `a` into memory.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovaps)
)] // FIXME vmovdqa expected
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_store_si256(mem_addr: *mut __m256i, a: __m256i) {
    // Plain aligned store; alignment is the caller's obligation.
    *mem_addr = a;
}
1707
1708/// Loads 256-bits of integer data from memory into result.
1709/// `mem_addr` does not need to be aligned on any particular boundary.
1710///
1711/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_si256)
1712#[inline]
1713#[target_feature(enable = "avx")]
1714#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
1715#[stable(feature = "simd_x86", since = "1.27.0")]
1716#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1717pub const unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i {
1718    let mut dst = _mm256_undefined_si256();
1719    ptr::copy_nonoverlapping(
1720        mem_addr as *const u8,
1721        ptr::addr_of_mut!(dst) as *mut u8,
1722        mem::size_of::<__m256i>(),
1723    );
1724    dst
1725}
1726
/// Stores 256-bits of integer data from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) {
    // Bytewise unaligned store; no alignment requirement on `mem_addr`.
    mem_addr.write_unaligned(a);
}
1739
/// Loads packed double-precision (64-bit) floating-point elements from memory
/// into result using `mask` (elements are zeroed out when the high bit of the
/// corresponding element is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d {
    // Arithmetic shift right by 63 broadcasts each element's sign bit:
    // all-ones where the high bit was set, all-zeros otherwise.
    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
    // Masked-off lanes take the zero from `_mm256_setzero_pd()`.
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm256_setzero_pd())
}
1754
/// Stores packed double-precision (64-bit) floating-point elements from `a`
/// into memory using `mask`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_maskstore_pd(mem_addr: *mut f64, mask: __m256i, a: __m256d) {
    // Broadcast each element's sign bit across the element (see maskload).
    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
}
1768
/// Loads packed double-precision (64-bit) floating-point elements from memory
/// into result using `mask` (elements are zeroed out when the high bit of the
/// corresponding element is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_maskload_pd(mem_addr: *const f64, mask: __m128i) -> __m128d {
    // Broadcast each element's sign bit across the element (shr by 63).
    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm_setzero_pd())
}
1783
/// Stores packed double-precision (64-bit) floating-point elements from `a`
/// into memory using `mask`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) {
    // Broadcast each element's sign bit across the element (shr by 63).
    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
}
1797
/// Loads packed single-precision (32-bit) floating-point elements from memory
/// into result using `mask` (elements are zeroed out when the high bit of the
/// corresponding element is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256 {
    // Broadcast each element's sign bit across the element (shr by 31).
    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm256_setzero_ps())
}
1812
/// Stores packed single-precision (32-bit) floating-point elements from `a`
/// into memory using `mask`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_maskstore_ps(mem_addr: *mut f32, mask: __m256i, a: __m256) {
    // Broadcast each element's sign bit across the element (shr by 31).
    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
}
1826
/// Loads packed single-precision (32-bit) floating-point elements from memory
/// into result using `mask` (elements are zeroed out when the high bit of the
/// corresponding element is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_maskload_ps(mem_addr: *const f32, mask: __m128i) -> __m128 {
    // Broadcast each element's sign bit across the element (shr by 31).
    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm_setzero_ps())
}
1841
/// Stores packed single-precision (32-bit) floating-point elements from `a`
/// into memory using `mask`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) {
    // Broadcast each element's sign bit across the element (shr by 31).
    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
}
1855
/// Duplicate odd-indexed single-precision (32-bit) floating-point elements
/// from `a`, and returns the results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movehdup_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovshdup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_movehdup_ps(a: __m256) -> __m256 {
    // Each odd element is duplicated into the even slot below it.
    unsafe { simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7]) }
}
1868
/// Duplicate even-indexed single-precision (32-bit) floating-point elements
/// from `a`, and returns the results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_moveldup_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovsldup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_moveldup_ps(a: __m256) -> __m256 {
    // Each even element is duplicated into the odd slot above it.
    unsafe { simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]) }
}
1881
/// Duplicate even-indexed double-precision (64-bit) floating-point elements
/// from `a`, and returns the results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movedup_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_movedup_pd(a: __m256d) -> __m256d {
    // Elements 0 and 2 are each duplicated into the adjacent odd slot.
    unsafe { simd_shuffle!(a, a, [0, 0, 2, 2]) }
}
1894
/// Loads 256-bits of integer data from unaligned memory into result.
/// This intrinsic may perform better than `_mm256_loadu_si256` when the
/// data crosses a cache line boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lddqu_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vlddqu))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i {
    // Forwards to the `vlddqu` intrinsic; the raw byte pointer result is
    // reinterpreted as a __m256i.
    transmute(vlddqu(mem_addr as *const i8))
}
1907
/// Moves integer data from a 256-bit integer vector to a 32-byte
/// aligned memory location. To minimize caching, the data is flagged as
/// non-temporal (unlikely to be used again soon)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_si256)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovntdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    // Emits `vmovntdq` with `a` as the register operand. NOTE(review): the
    // `vps!` macro appears to build the memory-operand syntax around the `p`
    // pointer operand — confirm against the macro's definition.
    crate::arch::asm!(
        vps!("vmovntdq", ",{a}"),
        p = in(reg) mem_addr,
        a = in(ymm_reg) a,
        options(nostack, preserves_flags),
    );
}
1935
/// Moves double-precision values from a 256-bit vector of `[4 x double]`
/// to a 32-byte aligned memory location. To minimize caching, the data is
/// flagged as non-temporal (unlikely to be used again soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_pd)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovntpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    // Emits `vmovntpd` with `a` as the register operand (same asm pattern as
    // `_mm256_stream_si256`).
    crate::arch::asm!(
        vps!("vmovntpd", ",{a}"),
        p = in(reg) mem_addr,
        a = in(ymm_reg) a,
        options(nostack, preserves_flags),
    );
}
1964
/// Moves single-precision floating point values from a 256-bit vector
/// of `[8 x float]` to a 32-byte aligned memory location. To minimize
/// caching, the data is flagged as non-temporal (unlikely to be used again
/// soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_ps)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovntps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    // Emits `vmovntps` with `a` as the register operand (same asm pattern as
    // `_mm256_stream_si256`).
    crate::arch::asm!(
        vps!("vmovntps", ",{a}"),
        p = in(reg) mem_addr,
        a = in(ymm_reg) a,
        options(nostack, preserves_flags),
    );
}
1994
/// Computes the approximate reciprocal of packed single-precision (32-bit)
/// floating-point elements in `a`, and returns the results. The maximum
/// relative error for this approximation is less than 1.5*2^-12.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rcp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vrcpps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_rcp_ps(a: __m256) -> __m256 {
    // Forwards to the `vrcpps` hardware approximation (see precision note
    // in the doc comment).
    unsafe { vrcpps(a) }
}
2007
/// Computes the approximate reciprocal square root of packed single-precision
/// (32-bit) floating-point elements in `a`, and returns the results.
/// The maximum relative error for this approximation is less than 1.5*2^-12.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rsqrt_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vrsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_rsqrt_ps(a: __m256) -> __m256 {
    // Forwards to the `vrsqrtps` hardware approximation (see precision note
    // in the doc comment).
    unsafe { vrsqrtps(a) }
}
2020
/// Unpacks and interleave double-precision (64-bit) floating-point elements
/// from the high half of each 128-bit lane in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vunpckhpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d {
    // Result lanes: a[1], b[1] (low 128-bit lane) | a[3], b[3] (high lane).
    unsafe { simd_shuffle!(a, b, [1, 5, 3, 7]) }
}
2033
/// Unpacks and interleave single-precision (32-bit) floating-point elements
/// from the high half of each 128-bit lane in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vunpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 {
    // Result: a[2], b[2], a[3], b[3] (low lane) | a[6], b[6], a[7], b[7].
    unsafe { simd_shuffle!(a, b, [2, 10, 3, 11, 6, 14, 7, 15]) }
}
2046
/// Unpacks and interleave double-precision (64-bit) floating-point elements
/// from the low half of each 128-bit lane in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vunpcklpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d {
    // Result lanes: a[0], b[0] (low 128-bit lane) | a[2], b[2] (high lane).
    unsafe { simd_shuffle!(a, b, [0, 4, 2, 6]) }
}
2059
/// Unpacks and interleave single-precision (32-bit) floating-point elements
/// from the low half of each 128-bit lane in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vunpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 {
    // Result: a[0], b[0], a[1], b[1] (low lane) | a[4], b[4], a[5], b[5].
    unsafe { simd_shuffle!(a, b, [0, 8, 1, 9, 4, 12, 5, 13]) }
}
2072
/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
/// the result is zero, otherwise set `CF` to 0. Return the `ZF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 {
    unsafe {
        // ZF = ((a & b) == 0): AND the vectors, OR-reduce to a single i64,
        // and compare against zero.
        let r = simd_and(a.as_i64x4(), b.as_i64x4());
        (0i64 == simd_reduce_or(r)) as i32
    }
}
2090
/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
/// the result is zero, otherwise set `CF` to 0. Return the `CF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 {
    unsafe {
        // CF = ((!a & b) == 0): `!a` is computed as `a XOR all-ones`, then
        // ANDed with `b`, OR-reduced, and compared against zero.
        let r = simd_and(simd_xor(a.as_i64x4(), i64x4::splat(!0)), b.as_i64x4());
        (0i64 == simd_reduce_or(r)) as i32
    }
}
2108
/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
/// the result is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and
/// `CF` values are zero, otherwise return 0.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 {
    // Forwards to the `ptestnzc256` intrinsic on the i64x4 views.
    unsafe { ptestnzc256(a.as_i64x4(), b.as_i64x4()) }
}
2123
/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 {
    // Forwards to the `vtestzpd256` intrinsic (sign-bit-only test).
    unsafe { vtestzpd256(a, b) }
}
2140
/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `CF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 {
    // Forwards to the `vtestcpd256` compiler intrinsic (declared elsewhere in
    // this file); it returns the CF result of `vtestpd`.
    unsafe { vtestcpd256(a, b) }
}
2157
/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
/// are zero, otherwise return 0.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 {
    // Forwards to the `vtestnzcpd256` compiler intrinsic (declared elsewhere
    // in this file); it returns the "neither ZF nor CF" result of `vtestpd`.
    unsafe { vtestnzcpd256(a, b) }
}
2175
/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 {
    unsafe {
        // A lane of `a AND b` compares `< 0` as i64 exactly when its sign bit
        // is set; ZF is 1 iff no lane has its sign bit set. This branch-free
        // shape is lowered to a single `vtestpd` (see `assert_instr` above).
        let r: i64x2 = simd_lt(transmute(_mm_and_pd(a, b)), i64x2::ZERO);
        (0i64 == simd_reduce_or(r)) as i32
    }
}
2196
/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `CF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 {
    unsafe {
        // CF is 1 iff no lane of `(NOT a) AND b` has its sign bit set; the
        // `simd_lt` against zero extracts the sign bits as a lane mask.
        let r: i64x2 = simd_lt(transmute(_mm_andnot_pd(a, b)), i64x2::ZERO);
        (0i64 == simd_reduce_or(r)) as i32
    }
}
2217
/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
/// are zero, otherwise return 0.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 {
    // Forwards to the `vtestnzcpd` compiler intrinsic (declared elsewhere in
    // this file); it returns the "neither ZF nor CF" result of `vtestpd`.
    unsafe { vtestnzcpd(a, b) }
}
2235
/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 {
    // Forwards to the `vtestzps256` compiler intrinsic (declared elsewhere in
    // this file); it returns the ZF result of `vtestps`.
    unsafe { vtestzps256(a, b) }
}
2252
/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `CF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 {
    // Forwards to the `vtestcps256` compiler intrinsic (declared elsewhere in
    // this file); it returns the CF result of `vtestps`.
    unsafe { vtestcps256(a, b) }
}
2269
/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
/// are zero, otherwise return 0.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 {
    // Forwards to the `vtestnzcps256` compiler intrinsic (declared elsewhere
    // in this file); it returns the "neither ZF nor CF" result of `vtestps`.
    unsafe { vtestnzcps256(a, b) }
}
2287
/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_testz_ps(a: __m128, b: __m128) -> i32 {
    unsafe {
        // A lane of `a AND b` compares `< 0` as i32 exactly when its sign bit
        // is set; ZF is 1 iff no lane has its sign bit set. This branch-free
        // shape is lowered to a single `vtestps` (see `assert_instr` above).
        let r: i32x4 = simd_lt(transmute(_mm_and_ps(a, b)), i32x4::ZERO);
        (0i32 == simd_reduce_or(r)) as i32
    }
}
2308
/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `CF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_testc_ps(a: __m128, b: __m128) -> i32 {
    unsafe {
        // CF is 1 iff no lane of `(NOT a) AND b` has its sign bit set; the
        // `simd_lt` against zero extracts the sign bits as a lane mask.
        let r: i32x4 = simd_lt(transmute(_mm_andnot_ps(a, b)), i32x4::ZERO);
        (0i32 == simd_reduce_or(r)) as i32
    }
}
2329
/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
/// are zero, otherwise return 0.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 {
    // Forwards to the `vtestnzcps` compiler intrinsic (declared elsewhere in
    // this file); it returns the "neither ZF nor CF" result of `vtestps`.
    unsafe { vtestnzcps(a, b) }
}
2347
/// Sets each bit of the returned mask based on the most significant bit of the
/// corresponding packed double-precision (64-bit) floating-point element in
/// `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovmskpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_movemask_pd(a: __m256d) -> i32 {
    // Propagate the highest bit to the rest, because simd_bitmask
    // requires all-1 or all-0.
    unsafe {
        let mask: i64x4 = simd_lt(transmute(a), i64x4::ZERO);
        // Only the low 4 bits of the u8 bitmask can be set (one per lane).
        simd_bitmask::<i64x4, u8>(mask) as i32
    }
}
2366
/// Sets each bit of the returned mask based on the most significant bit of the
/// corresponding packed single-precision (32-bit) floating-point element in
/// `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovmskps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_movemask_ps(a: __m256) -> i32 {
    // Propagate the highest bit to the rest, because simd_bitmask
    // requires all-1 or all-0.
    unsafe {
        let mask: i32x8 = simd_lt(transmute(a), i32x8::ZERO);
        // One bit per lane, so the full u8 range may be used here.
        simd_bitmask::<i32x8, u8>(mask) as i32
    }
}
2385
/// Returns vector of type __m256d with all elements set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setzero_pd() -> __m256d {
    // SAFETY: an all-zeros bit pattern is a valid `__m256d`; the `const`
    // block evaluates the zero at compile time.
    const { unsafe { mem::zeroed() } }
}
2397
/// Returns vector of type __m256 with all elements set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setzero_ps() -> __m256 {
    // SAFETY: an all-zeros bit pattern is a valid `__m256`; the `const`
    // block evaluates the zero at compile time.
    const { unsafe { mem::zeroed() } }
}
2409
/// Returns vector of type __m256i with all elements set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxor))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setzero_si256() -> __m256i {
    // SAFETY: an all-zeros bit pattern is a valid `__m256i`; the `const`
    // block evaluates the zero at compile time.
    const { unsafe { mem::zeroed() } }
}
2421
2422/// Sets packed double-precision (64-bit) floating-point elements in returned
2423/// vector with the supplied values.
2424///
2425/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_pd)
2426#[inline]
2427#[target_feature(enable = "avx")]
2428// This intrinsic has no corresponding instruction.
2429#[stable(feature = "simd_x86", since = "1.27.0")]
2430#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2431pub const fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
2432    _mm256_setr_pd(d, c, b, a)
2433}
2434
2435/// Sets packed single-precision (32-bit) floating-point elements in returned
2436/// vector with the supplied values.
2437///
2438/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_ps)
2439#[inline]
2440#[target_feature(enable = "avx")]
2441// This intrinsic has no corresponding instruction.
2442#[stable(feature = "simd_x86", since = "1.27.0")]
2443#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2444pub const fn _mm256_set_ps(
2445    a: f32,
2446    b: f32,
2447    c: f32,
2448    d: f32,
2449    e: f32,
2450    f: f32,
2451    g: f32,
2452    h: f32,
2453) -> __m256 {
2454    _mm256_setr_ps(h, g, f, e, d, c, b, a)
2455}
2456
2457/// Sets packed 8-bit integers in returned vector with the supplied values.
2458///
2459/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi8)
2460#[inline]
2461#[target_feature(enable = "avx")]
2462// This intrinsic has no corresponding instruction.
2463#[stable(feature = "simd_x86", since = "1.27.0")]
2464#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2465pub const fn _mm256_set_epi8(
2466    e00: i8,
2467    e01: i8,
2468    e02: i8,
2469    e03: i8,
2470    e04: i8,
2471    e05: i8,
2472    e06: i8,
2473    e07: i8,
2474    e08: i8,
2475    e09: i8,
2476    e10: i8,
2477    e11: i8,
2478    e12: i8,
2479    e13: i8,
2480    e14: i8,
2481    e15: i8,
2482    e16: i8,
2483    e17: i8,
2484    e18: i8,
2485    e19: i8,
2486    e20: i8,
2487    e21: i8,
2488    e22: i8,
2489    e23: i8,
2490    e24: i8,
2491    e25: i8,
2492    e26: i8,
2493    e27: i8,
2494    e28: i8,
2495    e29: i8,
2496    e30: i8,
2497    e31: i8,
2498) -> __m256i {
2499    #[rustfmt::skip]
2500    _mm256_setr_epi8(
2501        e31, e30, e29, e28, e27, e26, e25, e24,
2502        e23, e22, e21, e20, e19, e18, e17, e16,
2503        e15, e14, e13, e12, e11, e10, e09, e08,
2504        e07, e06, e05, e04, e03, e02, e01, e00,
2505    )
2506}
2507
2508/// Sets packed 16-bit integers in returned vector with the supplied values.
2509///
2510/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi16)
2511#[inline]
2512#[target_feature(enable = "avx")]
2513// This intrinsic has no corresponding instruction.
2514#[stable(feature = "simd_x86", since = "1.27.0")]
2515#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2516pub const fn _mm256_set_epi16(
2517    e00: i16,
2518    e01: i16,
2519    e02: i16,
2520    e03: i16,
2521    e04: i16,
2522    e05: i16,
2523    e06: i16,
2524    e07: i16,
2525    e08: i16,
2526    e09: i16,
2527    e10: i16,
2528    e11: i16,
2529    e12: i16,
2530    e13: i16,
2531    e14: i16,
2532    e15: i16,
2533) -> __m256i {
2534    #[rustfmt::skip]
2535    _mm256_setr_epi16(
2536        e15, e14, e13, e12,
2537        e11, e10, e09, e08,
2538        e07, e06, e05, e04,
2539        e03, e02, e01, e00,
2540    )
2541}
2542
2543/// Sets packed 32-bit integers in returned vector with the supplied values.
2544///
2545/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi32)
2546#[inline]
2547#[target_feature(enable = "avx")]
2548// This intrinsic has no corresponding instruction.
2549#[stable(feature = "simd_x86", since = "1.27.0")]
2550#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2551pub const fn _mm256_set_epi32(
2552    e0: i32,
2553    e1: i32,
2554    e2: i32,
2555    e3: i32,
2556    e4: i32,
2557    e5: i32,
2558    e6: i32,
2559    e7: i32,
2560) -> __m256i {
2561    _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0)
2562}
2563
2564/// Sets packed 64-bit integers in returned vector with the supplied values.
2565///
2566/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi64x)
2567#[inline]
2568#[target_feature(enable = "avx")]
2569// This intrinsic has no corresponding instruction.
2570#[stable(feature = "simd_x86", since = "1.27.0")]
2571#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2572pub const fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
2573    _mm256_setr_epi64x(d, c, b, a)
2574}
2575
2576/// Sets packed double-precision (64-bit) floating-point elements in returned
2577/// vector with the supplied values in reverse order.
2578///
2579/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_pd)
2580#[inline]
2581#[target_feature(enable = "avx")]
2582// This intrinsic has no corresponding instruction.
2583#[stable(feature = "simd_x86", since = "1.27.0")]
2584#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2585pub const fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
2586    __m256d([a, b, c, d])
2587}
2588
2589/// Sets packed single-precision (32-bit) floating-point elements in returned
2590/// vector with the supplied values in reverse order.
2591///
2592/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_ps)
2593#[inline]
2594#[target_feature(enable = "avx")]
2595// This intrinsic has no corresponding instruction.
2596#[stable(feature = "simd_x86", since = "1.27.0")]
2597#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2598pub const fn _mm256_setr_ps(
2599    a: f32,
2600    b: f32,
2601    c: f32,
2602    d: f32,
2603    e: f32,
2604    f: f32,
2605    g: f32,
2606    h: f32,
2607) -> __m256 {
2608    __m256([a, b, c, d, e, f, g, h])
2609}
2610
2611/// Sets packed 8-bit integers in returned vector with the supplied values in
2612/// reverse order.
2613///
2614/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi8)
2615#[inline]
2616#[target_feature(enable = "avx")]
2617// This intrinsic has no corresponding instruction.
2618#[stable(feature = "simd_x86", since = "1.27.0")]
2619#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2620pub const fn _mm256_setr_epi8(
2621    e00: i8,
2622    e01: i8,
2623    e02: i8,
2624    e03: i8,
2625    e04: i8,
2626    e05: i8,
2627    e06: i8,
2628    e07: i8,
2629    e08: i8,
2630    e09: i8,
2631    e10: i8,
2632    e11: i8,
2633    e12: i8,
2634    e13: i8,
2635    e14: i8,
2636    e15: i8,
2637    e16: i8,
2638    e17: i8,
2639    e18: i8,
2640    e19: i8,
2641    e20: i8,
2642    e21: i8,
2643    e22: i8,
2644    e23: i8,
2645    e24: i8,
2646    e25: i8,
2647    e26: i8,
2648    e27: i8,
2649    e28: i8,
2650    e29: i8,
2651    e30: i8,
2652    e31: i8,
2653) -> __m256i {
2654    unsafe {
2655        #[rustfmt::skip]
2656        transmute(i8x32::new(
2657            e00, e01, e02, e03, e04, e05, e06, e07,
2658            e08, e09, e10, e11, e12, e13, e14, e15,
2659            e16, e17, e18, e19, e20, e21, e22, e23,
2660            e24, e25, e26, e27, e28, e29, e30, e31,
2661        ))
2662    }
2663}
2664
2665/// Sets packed 16-bit integers in returned vector with the supplied values in
2666/// reverse order.
2667///
2668/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi16)
2669#[inline]
2670#[target_feature(enable = "avx")]
2671// This intrinsic has no corresponding instruction.
2672#[stable(feature = "simd_x86", since = "1.27.0")]
2673#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2674pub const fn _mm256_setr_epi16(
2675    e00: i16,
2676    e01: i16,
2677    e02: i16,
2678    e03: i16,
2679    e04: i16,
2680    e05: i16,
2681    e06: i16,
2682    e07: i16,
2683    e08: i16,
2684    e09: i16,
2685    e10: i16,
2686    e11: i16,
2687    e12: i16,
2688    e13: i16,
2689    e14: i16,
2690    e15: i16,
2691) -> __m256i {
2692    unsafe {
2693        #[rustfmt::skip]
2694        transmute(i16x16::new(
2695            e00, e01, e02, e03,
2696            e04, e05, e06, e07,
2697            e08, e09, e10, e11,
2698            e12, e13, e14, e15,
2699        ))
2700    }
2701}
2702
2703/// Sets packed 32-bit integers in returned vector with the supplied values in
2704/// reverse order.
2705///
2706/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi32)
2707#[inline]
2708#[target_feature(enable = "avx")]
2709// This intrinsic has no corresponding instruction.
2710#[stable(feature = "simd_x86", since = "1.27.0")]
2711#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2712pub const fn _mm256_setr_epi32(
2713    e0: i32,
2714    e1: i32,
2715    e2: i32,
2716    e3: i32,
2717    e4: i32,
2718    e5: i32,
2719    e6: i32,
2720    e7: i32,
2721) -> __m256i {
2722    unsafe { transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) }
2723}
2724
2725/// Sets packed 64-bit integers in returned vector with the supplied values in
2726/// reverse order.
2727///
2728/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi64x)
2729#[inline]
2730#[target_feature(enable = "avx")]
2731// This intrinsic has no corresponding instruction.
2732#[stable(feature = "simd_x86", since = "1.27.0")]
2733#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2734pub const fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
2735    unsafe { transmute(i64x4::new(a, b, c, d)) }
2736}
2737
2738/// Broadcasts double-precision (64-bit) floating-point value `a` to all
2739/// elements of returned vector.
2740///
2741/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_pd)
2742#[inline]
2743#[target_feature(enable = "avx")]
2744// This intrinsic has no corresponding instruction.
2745#[stable(feature = "simd_x86", since = "1.27.0")]
2746#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2747pub const fn _mm256_set1_pd(a: f64) -> __m256d {
2748    f64x4::splat(a).as_m256d()
2749}
2750
2751/// Broadcasts single-precision (32-bit) floating-point value `a` to all
2752/// elements of returned vector.
2753///
2754/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_ps)
2755#[inline]
2756#[target_feature(enable = "avx")]
2757// This intrinsic has no corresponding instruction.
2758#[stable(feature = "simd_x86", since = "1.27.0")]
2759#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2760pub const fn _mm256_set1_ps(a: f32) -> __m256 {
2761    f32x8::splat(a).as_m256()
2762}
2763
2764/// Broadcasts 8-bit integer `a` to all elements of returned vector.
2765/// This intrinsic may generate the `vpbroadcastb`.
2766///
2767/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi8)
2768#[inline]
2769#[target_feature(enable = "avx")]
2770// This intrinsic has no corresponding instruction.
2771#[stable(feature = "simd_x86", since = "1.27.0")]
2772#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2773pub const fn _mm256_set1_epi8(a: i8) -> __m256i {
2774    i8x32::splat(a).as_m256i()
2775}
2776
/// Broadcasts 16-bit integer `a` to all elements of returned vector.
/// This intrinsic may generate the `vpbroadcastw`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16)
#[inline]
#[target_feature(enable = "avx")]
// No dedicated pre-AVX2 broadcast instruction exists; the compiler currently
// materializes the splat via `vinsertf128` (it has at times used `vpshufb`),
// which is what the assertion below checks.
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set1_epi16(a: i16) -> __m256i {
    i16x16::splat(a).as_m256i()
}
2791
2792/// Broadcasts 32-bit integer `a` to all elements of returned vector.
2793/// This intrinsic may generate the `vpbroadcastd`.
2794///
2795/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi32)
2796#[inline]
2797#[target_feature(enable = "avx")]
2798// This intrinsic has no corresponding instruction.
2799#[stable(feature = "simd_x86", since = "1.27.0")]
2800#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2801pub const fn _mm256_set1_epi32(a: i32) -> __m256i {
2802    i32x8::splat(a).as_m256i()
2803}
2804
/// Broadcasts 64-bit integer `a` to all elements of returned vector.
/// This intrinsic may generate the `vpbroadcastq`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi64x)
#[inline]
#[target_feature(enable = "avx")]
// Codegen differs per target: 64-bit targets currently emit `vinsertf128`,
// while 32-bit targets emit `vbroadcastsd` (there is no 64-bit GPR to splat
// from), hence the two arch-specific assertions below.
#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(vinsertf128))]
#[cfg_attr(all(test, target_arch = "x86"), assert_instr(vbroadcastsd))]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set1_epi64x(a: i64) -> __m256i {
    i64x4::splat(a).as_m256i()
}
2819
2820/// Cast vector of type __m256d to type __m256.
2821///
2822/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_ps)
2823#[inline]
2824#[target_feature(enable = "avx")]
2825// This intrinsic is only used for compilation and does not generate any
2826// instructions, thus it has zero latency.
2827#[stable(feature = "simd_x86", since = "1.27.0")]
2828#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2829pub const fn _mm256_castpd_ps(a: __m256d) -> __m256 {
2830    unsafe { transmute(a) }
2831}
2832
2833/// Cast vector of type __m256 to type __m256d.
2834///
2835/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_pd)
2836#[inline]
2837#[target_feature(enable = "avx")]
2838// This intrinsic is only used for compilation and does not generate any
2839// instructions, thus it has zero latency.
2840#[stable(feature = "simd_x86", since = "1.27.0")]
2841#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2842pub const fn _mm256_castps_pd(a: __m256) -> __m256d {
2843    unsafe { transmute(a) }
2844}
2845
2846/// Casts vector of type __m256 to type __m256i.
2847///
2848/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_si256)
2849#[inline]
2850#[target_feature(enable = "avx")]
2851// This intrinsic is only used for compilation and does not generate any
2852// instructions, thus it has zero latency.
2853#[stable(feature = "simd_x86", since = "1.27.0")]
2854#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2855pub const fn _mm256_castps_si256(a: __m256) -> __m256i {
2856    unsafe { transmute(a) }
2857}
2858
2859/// Casts vector of type __m256i to type __m256.
2860///
2861/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_ps)
2862#[inline]
2863#[target_feature(enable = "avx")]
2864// This intrinsic is only used for compilation and does not generate any
2865// instructions, thus it has zero latency.
2866#[stable(feature = "simd_x86", since = "1.27.0")]
2867#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2868pub const fn _mm256_castsi256_ps(a: __m256i) -> __m256 {
2869    unsafe { transmute(a) }
2870}
2871
/// Casts vector of type __m256d to type __m256i.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_si256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castpd_si256(a: __m256d) -> __m256i {
    // Pure bit-level reinterpretation: both types are 256 bits wide, so the
    // transmute is valid and compiles to nothing.
    unsafe { transmute(a) }
}
2884
/// Casts vector of type __m256i to type __m256d.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_pd)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castsi256_pd(a: __m256i) -> __m256d {
    // Pure bit-level reinterpretation of a 256-bit value; no conversion of
    // the lane contents takes place.
    unsafe { transmute(a) }
}
2897
/// Casts vector of type __m256 to type __m128.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps256_ps128)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castps256_ps128(a: __m256) -> __m128 {
    // Keep lanes 0..=3 (the low 128 bits); the upper half of `a` is dropped.
    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) }
}
2910
/// Casts vector of type __m256d to type __m128d.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd256_pd128)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castpd256_pd128(a: __m256d) -> __m128d {
    // Keep lanes 0 and 1 (the low 128 bits); the upper half of `a` is dropped.
    unsafe { simd_shuffle!(a, a, [0, 1]) }
}
2923
/// Casts vector of type __m256i to type __m128i.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_si128)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
    unsafe {
        // View the 256-bit integer vector as four 64-bit lanes and keep the
        // low two, i.e. the low 128 bits.
        let a = a.as_i64x4();
        let dst: i64x2 = simd_shuffle!(a, a, [0, 1]);
        transmute(dst)
    }
}
2940
/// Casts vector of type __m128 to type __m256;
/// the upper 128 bits of the result are indeterminate.
///
/// In the Intel documentation, the upper bits are declared to be "undefined".
/// This is not equivalent to [`mem::MaybeUninit`]; instead, these bits are non-deterministically
/// set to some valid value. In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps128_ps256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castps128_ps256(a: __m128) -> __m256 {
    // Low half comes from `a` (indices 0..=3); index 4 selects lane 0 of the
    // second operand, so the upper four lanes replicate a lane of the
    // "undefined" vector — a valid but indeterminate value.
    unsafe { simd_shuffle!(a, _mm_undefined_ps(), [0, 1, 2, 3, 4, 4, 4, 4]) }
}
2958
/// Casts vector of type __m128d to type __m256d;
/// the upper 128 bits of the result are indeterminate.
///
/// In the Intel documentation, the upper bits are declared to be "undefined".
/// This is not equivalent to [`mem::MaybeUninit`]; instead, these bits are non-deterministically
/// set to some valid value. In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd128_pd256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castpd128_pd256(a: __m128d) -> __m256d {
    // Low half comes from `a` (indices 0, 1); index 2 selects lane 0 of the
    // "undefined" vector for both upper lanes — valid but indeterminate.
    unsafe { simd_shuffle!(a, _mm_undefined_pd(), [0, 1, 2, 2]) }
}
2976
/// Casts vector of type __m128i to type __m256i;
/// the upper 128 bits of the result are indeterminate.
///
/// In the Intel documentation, the upper bits are declared to be "undefined".
/// This is not equivalent to [`mem::MaybeUninit`]; instead, these bits are non-deterministically
/// set to some valid value. In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi128_si256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_i64x2();
        // An all-zero vector is one valid choice for the "undefined" upper
        // half; index 2 selects its lane 0 for both upper lanes.
        let undefined = i64x2::ZERO;
        let dst: i64x4 = simd_shuffle!(a, undefined, [0, 1, 2, 2]);
        transmute(dst)
    }
}
2999
/// Constructs a 256-bit floating-point vector of `[8 x float]` from a
/// 128-bit floating-point vector of `[4 x float]`. The lower 128 bits contain
/// the value of the source vector. The upper 128 bits are set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextps128_ps256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
    // Indices 0..=3 take the lanes of `a`; indices 4..=7 take lanes of the
    // zero vector, guaranteeing a zeroed upper half (unlike the cast variant).
    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7]) }
}
3014
/// Constructs a 256-bit integer vector from a 128-bit integer vector.
/// The lower 128 bits contain the value of the source vector. The upper
/// 128 bits are set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextsi128_si256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
    unsafe {
        // Concatenate the two 64-bit lanes of `a` with two zero lanes, so the
        // upper 128 bits of the result are guaranteed zero.
        let b = i64x2::ZERO;
        let dst: i64x4 = simd_shuffle!(a.as_i64x2(), b, [0, 1, 2, 3]);
        transmute(dst)
    }
}
3033
/// Constructs a 256-bit floating-point vector of `[4 x double]` from a
/// 128-bit floating-point vector of `[2 x double]`. The lower 128 bits
/// contain the value of the source vector. The upper 128 bits are set
/// to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextpd128_pd256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
    // Indices 0, 1 take the lanes of `a`; indices 2, 3 take lanes of the zero
    // vector, guaranteeing a zeroed upper half (unlike the cast variant).
    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0, 1, 2, 3]) }
}
3049
/// Returns vector of type `__m256` with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
/// In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_ps)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_undefined_ps() -> __m256 {
    // An all-zero bit pattern is a valid `__m256`; the `const` block folds it
    // to a compile-time constant.
    const { unsafe { mem::zeroed() } }
}
3064
/// Returns vector of type `__m256d` with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
/// In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_pd)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_undefined_pd() -> __m256d {
    // An all-zero bit pattern is a valid `__m256d`; the `const` block folds it
    // to a compile-time constant.
    const { unsafe { mem::zeroed() } }
}
3079
/// Returns vector of type __m256i with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
/// In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_si256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_undefined_si256() -> __m256i {
    // An all-zero bit pattern is a valid `__m256i`; the `const` block folds it
    // to a compile-time constant.
    const { unsafe { mem::zeroed() } }
}
3094
/// Sets packed __m256 returned vector with the supplied values.
///
/// `lo` becomes the low 128 bits of the result and `hi` the high 128 bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 {
    // Indices 0..=3 select the lanes of `lo`, 4..=7 the lanes of `hi`,
    // concatenating the halves; compiles to a single VINSERTF128.
    unsafe { simd_shuffle!(lo, hi, [0, 1, 2, 3, 4, 5, 6, 7]) }
}
3106
/// Sets packed __m256d returned vector with the supplied values.
///
/// `lo` becomes the low 128 bits of the result and `hi` the high 128 bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128d)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d {
    unsafe {
        // Reinterpret both halves as `__m128` and delegate to the `ps`
        // variant; the bit pattern is preserved end to end.
        let hi: __m128 = transmute(hi);
        let lo: __m128 = transmute(lo);
        transmute(_mm256_set_m128(hi, lo))
    }
}
3122
/// Sets packed __m256i returned vector with the supplied values.
///
/// `lo` becomes the low 128 bits of the result and `hi` the high 128 bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128i)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i {
    unsafe {
        // Reinterpret both halves as `__m128` and delegate to the `ps`
        // variant; the bit pattern is preserved end to end.
        let hi: __m128 = transmute(hi);
        let lo: __m128 = transmute(lo);
        transmute(_mm256_set_m128(hi, lo))
    }
}
3138
/// Sets packed __m256 returned vector with the supplied values.
///
/// Same as [`_mm256_set_m128`] but with the arguments in reverse
/// (memory) order: `lo` first, then `hi`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 {
    _mm256_set_m128(hi, lo)
}
3150
/// Sets packed __m256d returned vector with the supplied values.
///
/// Same as [`_mm256_set_m128d`] but with the arguments in reverse
/// (memory) order: `lo` first, then `hi`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128d)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d {
    _mm256_set_m128d(hi, lo)
}
3162
/// Sets packed __m256i returned vector with the supplied values.
///
/// Same as [`_mm256_set_m128i`] but with the arguments in reverse
/// (memory) order: `lo` first, then `hi`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128i)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i {
    _mm256_set_m128i(hi, lo)
}
3174
/// Loads two 128-bit values (composed of 4 packed single-precision (32-bit)
/// floating-point elements) from memory, and combine them into a 256-bit
/// value.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
///
/// # Safety
///
/// `hiaddr` and `loaddr` must each be valid for a 16-byte read; no alignment
/// is required.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_loadu2_m128(hiaddr: *const f32, loaddr: *const f32) -> __m256 {
    // Unaligned-load the low half, then insert the high half into bits
    // 255:128 (lane index 1).
    let a = _mm256_castps128_ps256(_mm_loadu_ps(loaddr));
    _mm256_insertf128_ps::<1>(a, _mm_loadu_ps(hiaddr))
}
3190
/// Loads two 128-bit values (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from memory, and combine them into a 256-bit
/// value.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
///
/// # Safety
///
/// `hiaddr` and `loaddr` must each be valid for a 16-byte read; no alignment
/// is required.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128d)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> __m256d {
    // Unaligned-load the low half, then insert the high half into bits
    // 255:128 (lane index 1).
    let a = _mm256_castpd128_pd256(_mm_loadu_pd(loaddr));
    _mm256_insertf128_pd::<1>(a, _mm_loadu_pd(hiaddr))
}
3206
/// Loads two 128-bit values (composed of integer data) from memory, and combine
/// them into a 256-bit value.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
///
/// # Safety
///
/// `hiaddr` and `loaddr` must each be valid for a 16-byte read; no alignment
/// is required.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128i)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i) -> __m256i {
    // Unaligned-load the low half, then insert the high half into bits
    // 255:128 (lane index 1).
    let a = _mm256_castsi128_si256(_mm_loadu_si128(loaddr));
    _mm256_insertf128_si256::<1>(a, _mm_loadu_si128(hiaddr))
}
3221
/// Stores the high and low 128-bit halves (each composed of 4 packed
/// single-precision (32-bit) floating-point elements) from `a` into memory two
/// different 128-bit locations.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
///
/// # Safety
///
/// `hiaddr` and `loaddr` must each be valid for a 16-byte write; no alignment
/// is required.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: __m256) {
    // Write bits 127:0 to `loaddr`, then extract and write bits 255:128.
    let lo = _mm256_castps256_ps128(a);
    _mm_storeu_ps(loaddr, lo);
    let hi = _mm256_extractf128_ps::<1>(a);
    _mm_storeu_ps(hiaddr, hi);
}
3239
/// Stores the high and low 128-bit halves (each composed of 2 packed
/// double-precision (64-bit) floating-point elements) from `a` into memory two
/// different 128-bit locations.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
///
/// # Safety
///
/// `hiaddr` and `loaddr` must each be valid for a 16-byte write; no alignment
/// is required.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128d)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: __m256d) {
    // Write bits 127:0 to `loaddr`, then extract and write bits 255:128.
    let lo = _mm256_castpd256_pd128(a);
    _mm_storeu_pd(loaddr, lo);
    let hi = _mm256_extractf128_pd::<1>(a);
    _mm_storeu_pd(hiaddr, hi);
}
3257
/// Stores the high and low 128-bit halves (each composed of integer data) from
/// `a` into memory two different 128-bit locations.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
///
/// # Safety
///
/// `hiaddr` and `loaddr` must each be valid for a 16-byte write; no alignment
/// is required.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128i)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_storeu2_m128i(hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i) {
    // Write bits 127:0 to `loaddr`, then extract and write bits 255:128.
    let lo = _mm256_castsi256_si128(a);
    _mm_storeu_si128(loaddr, lo);
    let hi = _mm256_extractf128_si256::<1>(a);
    _mm_storeu_si128(hiaddr, hi);
}
3274
/// Returns the first element of the input vector of `[8 x float]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtss_f32)
#[inline]
#[target_feature(enable = "avx")]
//#[cfg_attr(test, assert_instr(movss))] FIXME
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtss_f32(a: __m256) -> f32 {
    // Extract lane 0 (the lowest 32 bits) as a scalar.
    unsafe { simd_extract!(a, 0) }
}
3286
// LLVM intrinsics used in the above functions.
//
// Each `link_name` binds directly to an LLVM x86 intrinsic; the Rust
// signatures must match LLVM's expectations exactly, so do not alter them.
// `improper_ctypes` is allowed because SIMD vector types are not ordinary
// FFI-safe C types, but these are compiler intrinsics, not real C calls.
#[allow(improper_ctypes)]
unsafe extern "C" {
    // Rounding and dot product.
    #[link_name = "llvm.x86.avx.round.pd.256"]
    fn roundpd256(a: __m256d, b: i32) -> __m256d;
    #[link_name = "llvm.x86.avx.round.ps.256"]
    fn roundps256(a: __m256, b: i32) -> __m256;
    #[link_name = "llvm.x86.avx.dp.ps.256"]
    fn vdpps(a: __m256, b: __m256, imm8: i8) -> __m256;
    // Comparisons; `imm8` selects the comparison predicate.
    #[link_name = "llvm.x86.sse2.cmp.pd"]
    fn vcmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
    #[link_name = "llvm.x86.avx.cmp.pd.256"]
    fn vcmppd256(a: __m256d, b: __m256d, imm8: u8) -> __m256d;
    #[link_name = "llvm.x86.sse.cmp.ps"]
    fn vcmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
    #[link_name = "llvm.x86.avx.cmp.ps.256"]
    fn vcmpps256(a: __m256, b: __m256, imm8: u8) -> __m256;
    #[link_name = "llvm.x86.sse2.cmp.sd"]
    fn vcmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
    #[link_name = "llvm.x86.sse.cmp.ss"]
    fn vcmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
    // Float <-> integer conversions (`cvtt` = truncating).
    #[link_name = "llvm.x86.avx.cvt.ps2dq.256"]
    fn vcvtps2dq(a: __m256) -> i32x8;
    #[link_name = "llvm.x86.avx.cvtt.pd2dq.256"]
    fn vcvttpd2dq(a: __m256d) -> i32x4;
    #[link_name = "llvm.x86.avx.cvt.pd2dq.256"]
    fn vcvtpd2dq(a: __m256d) -> i32x4;
    #[link_name = "llvm.x86.avx.cvtt.ps2dq.256"]
    fn vcvttps2dq(a: __m256) -> i32x8;
    // AVX state management.
    #[link_name = "llvm.x86.avx.vzeroall"]
    fn vzeroall();
    #[link_name = "llvm.x86.avx.vzeroupper"]
    fn vzeroupper();
    // Variable (per-lane) permutes.
    #[link_name = "llvm.x86.avx.vpermilvar.ps.256"]
    fn vpermilps256(a: __m256, b: i32x8) -> __m256;
    #[link_name = "llvm.x86.avx.vpermilvar.ps"]
    fn vpermilps(a: __m128, b: i32x4) -> __m128;
    #[link_name = "llvm.x86.avx.vpermilvar.pd.256"]
    fn vpermilpd256(a: __m256d, b: i64x4) -> __m256d;
    #[link_name = "llvm.x86.avx.vpermilvar.pd"]
    fn vpermilpd(a: __m128d, b: i64x2) -> __m128d;
    // Unaligned load and approximate math.
    #[link_name = "llvm.x86.avx.ldu.dq.256"]
    fn vlddqu(mem_addr: *const i8) -> i8x32;
    #[link_name = "llvm.x86.avx.rcp.ps.256"]
    fn vrcpps(a: __m256) -> __m256;
    #[link_name = "llvm.x86.avx.rsqrt.ps.256"]
    fn vrsqrtps(a: __m256) -> __m256;
    // Flag-setting test intrinsics (PTEST/VTESTPS/VTESTPD families),
    // returning 0 or 1 depending on the tested condition.
    #[link_name = "llvm.x86.avx.ptestnzc.256"]
    fn ptestnzc256(a: i64x4, b: i64x4) -> i32;
    #[link_name = "llvm.x86.avx.vtestz.pd.256"]
    fn vtestzpd256(a: __m256d, b: __m256d) -> i32;
    #[link_name = "llvm.x86.avx.vtestc.pd.256"]
    fn vtestcpd256(a: __m256d, b: __m256d) -> i32;
    #[link_name = "llvm.x86.avx.vtestnzc.pd.256"]
    fn vtestnzcpd256(a: __m256d, b: __m256d) -> i32;
    #[link_name = "llvm.x86.avx.vtestnzc.pd"]
    fn vtestnzcpd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.avx.vtestz.ps.256"]
    fn vtestzps256(a: __m256, b: __m256) -> i32;
    #[link_name = "llvm.x86.avx.vtestc.ps.256"]
    fn vtestcps256(a: __m256, b: __m256) -> i32;
    #[link_name = "llvm.x86.avx.vtestnzc.ps.256"]
    fn vtestnzcps256(a: __m256, b: __m256) -> i32;
    #[link_name = "llvm.x86.avx.vtestnzc.ps"]
    fn vtestnzcps(a: __m128, b: __m128) -> i32;
    // IEEE min/max with x86 (second-operand-wins) NaN/zero semantics.
    #[link_name = "llvm.x86.avx.min.ps.256"]
    fn vminps(a: __m256, b: __m256) -> __m256;
    #[link_name = "llvm.x86.avx.max.ps.256"]
    fn vmaxps(a: __m256, b: __m256) -> __m256;
    #[link_name = "llvm.x86.avx.min.pd.256"]
    fn vminpd(a: __m256d, b: __m256d) -> __m256d;
    #[link_name = "llvm.x86.avx.max.pd.256"]
    fn vmaxpd(a: __m256d, b: __m256d) -> __m256d;
}
3361
3362#[cfg(test)]
3363mod tests {
3364    use crate::core_arch::assert_eq_const as assert_eq;
3365    use crate::core_arch::simd::*;
3366    use crate::hint::black_box;
3367    use crate::ptr;
3368    use stdarch_test::simd_test;
3369
3370    use crate::core_arch::x86::*;
3371
    #[simd_test(enable = "avx")]
    const fn test_mm256_add_pd() {
        // Lane-wise addition: 1+5, 2+6, 3+7, 4+8.
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_add_pd(a, b);
        let e = _mm256_setr_pd(6., 8., 10., 12.);
        assert_eq_m256d(r, e);
    }
3380
    #[simd_test(enable = "avx")]
    const fn test_mm256_add_ps() {
        // Lane-wise addition across all eight f32 lanes.
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
        let r = _mm256_add_ps(a, b);
        let e = _mm256_setr_ps(10., 12., 14., 16., 18., 20., 22., 24.);
        assert_eq_m256(r, e);
    }
3389
    #[simd_test(enable = "avx")]
    const fn test_mm256_and_pd() {
        // Bitwise AND of the IEEE-754 bit patterns: 1.0 & 0.6 == 0.5 per lane.
        let a = _mm256_set1_pd(1.);
        let b = _mm256_set1_pd(0.6);
        let r = _mm256_and_pd(a, b);
        let e = _mm256_set1_pd(0.5);
        assert_eq_m256d(r, e);
    }
3398
    #[simd_test(enable = "avx")]
    const fn test_mm256_and_ps() {
        // Bitwise AND of the IEEE-754 bit patterns: 1.0 & 0.6 == 0.5 per lane.
        let a = _mm256_set1_ps(1.);
        let b = _mm256_set1_ps(0.6);
        let r = _mm256_and_ps(a, b);
        let e = _mm256_set1_ps(0.5);
        assert_eq_m256(r, e);
    }
3407
    #[simd_test(enable = "avx")]
    const fn test_mm256_or_pd() {
        // Bitwise OR of the IEEE-754 bit patterns: 1.0 | 0.6 == 1.2 per lane.
        let a = _mm256_set1_pd(1.);
        let b = _mm256_set1_pd(0.6);
        let r = _mm256_or_pd(a, b);
        let e = _mm256_set1_pd(1.2);
        assert_eq_m256d(r, e);
    }
3416
    #[simd_test(enable = "avx")]
    const fn test_mm256_or_ps() {
        // Bitwise OR of the IEEE-754 bit patterns: 1.0 | 0.6 == 1.2 per lane.
        let a = _mm256_set1_ps(1.);
        let b = _mm256_set1_ps(0.6);
        let r = _mm256_or_ps(a, b);
        let e = _mm256_set1_ps(1.2);
        assert_eq_m256(r, e);
    }
3425
    #[simd_test(enable = "avx")]
    const fn test_mm256_shuffle_pd() {
        // All-ones control selects the high element of each source within
        // each 128-bit half: [a1, b1, a3, b3].
        let a = _mm256_setr_pd(1., 4., 5., 8.);
        let b = _mm256_setr_pd(2., 3., 6., 7.);
        let r = _mm256_shuffle_pd::<0b11_11_11_11>(a, b);
        let e = _mm256_setr_pd(4., 3., 8., 7.);
        assert_eq_m256d(r, e);
    }
3434
    #[simd_test(enable = "avx")]
    const fn test_mm256_shuffle_ps() {
        // Control 0b00_00_11_11: within each 128-bit half, pick lane 3 of `a`
        // twice, then lane 0 of `b` twice.
        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
        let r = _mm256_shuffle_ps::<0b00_00_11_11>(a, b);
        let e = _mm256_setr_ps(8., 8., 2., 2., 16., 16., 10., 10.);
        assert_eq_m256(r, e);
    }
3443
    #[simd_test(enable = "avx")]
    const fn test_mm256_andnot_pd() {
        // (!a) & b with a == 0 leaves b unchanged.
        let a = _mm256_set1_pd(0.);
        let b = _mm256_set1_pd(0.6);
        let r = _mm256_andnot_pd(a, b);
        assert_eq_m256d(r, b);
    }
3451
    #[simd_test(enable = "avx")]
    const fn test_mm256_andnot_ps() {
        // (!a) & b with a == 0 leaves b unchanged.
        let a = _mm256_set1_ps(0.);
        let b = _mm256_set1_ps(0.6);
        let r = _mm256_andnot_ps(a, b);
        assert_eq_m256(r, b);
    }
3459
    #[simd_test(enable = "avx")]
    fn test_mm256_max_pd() {
        // Basic lane-wise max, then the VMAXPD corner cases quoted from the
        // Intel SDM: signed zeros and single-NaN operands both return the
        // second (source) operand.
        let a = _mm256_setr_pd(1., 4., 5., 8.);
        let b = _mm256_setr_pd(2., 3., 6., 7.);
        let r = _mm256_max_pd(a, b);
        let e = _mm256_setr_pd(2., 4., 6., 8.);
        assert_eq_m256d(r, e);
        // > If the values being compared are both 0.0s (of either sign), the
        // > value in the second operand (source operand) is returned.
        let w = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
        let x = _mm256_max_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
        let wu = _mm256_castpd_si256(w).as_u64x4();
        let xu = _mm256_castpd_si256(x).as_u64x4();
        assert_eq!(wu, u64x4::splat(0x8000_0000_0000_0000u64));
        assert_eq!(xu, u64x4::splat(0u64));
        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
        // > second operand (source operand), either a NaN or a valid
        // > floating-point value, is written to the result.
        let y = _mm256_max_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
        let z = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
        assert_eq_m256d(y, _mm256_set1_pd(0.0));
        let zf = *z.as_f64x4().as_array();
        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
    }
3484
    #[simd_test(enable = "avx")]
    fn test_mm256_max_ps() {
        // Basic lane-wise max, then the VMAXPS corner cases quoted from the
        // Intel SDM: signed zeros and single-NaN operands both return the
        // second (source) operand.
        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
        let r = _mm256_max_ps(a, b);
        let e = _mm256_setr_ps(2., 4., 6., 8., 10., 12., 14., 16.);
        assert_eq_m256(r, e);
        // > If the values being compared are both 0.0s (of either sign), the
        // > value in the second operand (source operand) is returned.
        let w = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
        let x = _mm256_max_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
        let wu = _mm256_castps_si256(w).as_u32x8();
        let xu = _mm256_castps_si256(x).as_u32x8();
        assert_eq!(wu, u32x8::splat(0x8000_0000u32));
        assert_eq!(xu, u32x8::splat(0u32));
        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
        // > second operand (source operand), either a NaN or a valid
        // > floating-point value, is written to the result.
        let y = _mm256_max_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
        let z = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
        assert_eq_m256(y, _mm256_set1_ps(0.0));
        let zf = *z.as_f32x8().as_array();
        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
    }
3509
    #[simd_test(enable = "avx")]
    fn test_mm256_min_pd() {
        // Basic lane-wise min, then the VMINPD corner cases quoted from the
        // Intel SDM: signed zeros and single-NaN operands both return the
        // second (source) operand.
        let a = _mm256_setr_pd(1., 4., 5., 8.);
        let b = _mm256_setr_pd(2., 3., 6., 7.);
        let r = _mm256_min_pd(a, b);
        let e = _mm256_setr_pd(1., 3., 5., 7.);
        assert_eq_m256d(r, e);
        // > If the values being compared are both 0.0s (of either sign), the
        // > value in the second operand (source operand) is returned.
        let w = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
        let x = _mm256_min_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
        let wu = _mm256_castpd_si256(w).as_u64x4();
        let xu = _mm256_castpd_si256(x).as_u64x4();
        assert_eq!(wu, u64x4::splat(0x8000_0000_0000_0000u64));
        assert_eq!(xu, u64x4::splat(0u64));
        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
        // > second operand (source operand), either a NaN or a valid
        // > floating-point value, is written to the result.
        let y = _mm256_min_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
        let z = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
        assert_eq_m256d(y, _mm256_set1_pd(0.0));
        let zf = *z.as_f64x4().as_array();
        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
    }
3534
    #[simd_test(enable = "avx")]
    fn test_mm256_min_ps() {
        // Basic lane-wise min, then the VMINPS corner cases quoted from the
        // Intel SDM: signed zeros and single-NaN operands both return the
        // second (source) operand.
        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
        let r = _mm256_min_ps(a, b);
        let e = _mm256_setr_ps(1., 3., 5., 7., 9., 11., 13., 15.);
        assert_eq_m256(r, e);
        // > If the values being compared are both 0.0s (of either sign), the
        // > value in the second operand (source operand) is returned.
        let w = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
        let x = _mm256_min_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
        let wu = _mm256_castps_si256(w).as_u32x8();
        let xu = _mm256_castps_si256(x).as_u32x8();
        assert_eq!(wu, u32x8::splat(0x8000_0000u32));
        assert_eq!(xu, u32x8::splat(0u32));
        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
        // > second operand (source operand), either a NaN or a valid
        // > floating-point value, is written to the result.
        let y = _mm256_min_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
        let z = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
        assert_eq_m256(y, _mm256_set1_ps(0.0));
        let zf = *z.as_f32x8().as_array();
        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
    }
3559
    // Lane-wise product of packed f64 values.
    #[simd_test(enable = "avx")]
    const fn test_mm256_mul_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_mul_pd(a, b);
        let e = _mm256_setr_pd(5., 12., 21., 32.);
        assert_eq_m256d(r, e);
    }

    // Lane-wise product of packed f32 values.
    #[simd_test(enable = "avx")]
    const fn test_mm256_mul_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
        let r = _mm256_mul_ps(a, b);
        let e = _mm256_setr_ps(9., 20., 33., 48., 65., 84., 105., 128.);
        assert_eq_m256(r, e);
    }
3577
    // addsub alternates per lane: even-indexed lanes are a - b, odd-indexed
    // lanes are a + b (e.g. lane 0: 1 - 5 = -4, lane 1: 2 + 6 = 8).
    #[simd_test(enable = "avx")]
    const fn test_mm256_addsub_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_addsub_pd(a, b);
        let e = _mm256_setr_pd(-4., 8., -4., 12.);
        assert_eq_m256d(r, e);
    }

    // Same even-subtract / odd-add pattern for packed f32.
    #[simd_test(enable = "avx")]
    const fn test_mm256_addsub_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
        let r = _mm256_addsub_ps(a, b);
        let e = _mm256_setr_ps(-4., 8., -4., 12., -4., 8., -4., 12.);
        assert_eq_m256(r, e);
    }
3595
3596    #[simd_test(enable = "avx")]
3597    const fn test_mm256_sub_pd() {
3598        let a = _mm256_setr_pd(1., 2., 3., 4.);
3599        let b = _mm256_setr_pd(5., 6., 7., 8.);
3600        let r = _mm256_sub_pd(a, b);
3601        let e = _mm256_setr_pd(-4., -4., -4., -4.);
3602        assert_eq_m256d(r, e);
3603    }
3604
3605    #[simd_test(enable = "avx")]
3606    const fn test_mm256_sub_ps() {
3607        let a = _mm256_setr_ps(1., 2., 3., 4., -1., -2., -3., -4.);
3608        let b = _mm256_setr_ps(5., 6., 7., 8., 3., 2., 1., 0.);
3609        let r = _mm256_sub_ps(a, b);
3610        let e = _mm256_setr_ps(-4., -4., -4., -4., -4., -4., -4., -4.);
3611        assert_eq_m256(r, e);
3612    }
3613
3614    #[simd_test(enable = "avx")]
3615    fn test_mm256_round_pd() {
3616        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3617        let result_closest = _mm256_round_pd::<0b0000>(a);
3618        let result_down = _mm256_round_pd::<0b0001>(a);
3619        let result_up = _mm256_round_pd::<0b0010>(a);
3620        let expected_closest = _mm256_setr_pd(2., 2., 4., -1.);
3621        let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
3622        let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
3623        assert_eq_m256d(result_closest, expected_closest);
3624        assert_eq_m256d(result_down, expected_down);
3625        assert_eq_m256d(result_up, expected_up);
3626    }
3627
    // floor rounds every lane toward negative infinity (note -1.2 -> -2).
    #[simd_test(enable = "avx")]
    const fn test_mm256_floor_pd() {
        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
        let result_down = _mm256_floor_pd(a);
        let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
        assert_eq_m256d(result_down, expected_down);
    }

    // ceil rounds every lane toward positive infinity (note -1.2 -> -1).
    #[simd_test(enable = "avx")]
    const fn test_mm256_ceil_pd() {
        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
        let result_up = _mm256_ceil_pd(a);
        let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
        assert_eq_m256d(result_up, expected_up);
    }
3643
3644    #[simd_test(enable = "avx")]
3645    fn test_mm256_round_ps() {
3646        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3647        let result_closest = _mm256_round_ps::<0b0000>(a);
3648        let result_down = _mm256_round_ps::<0b0001>(a);
3649        let result_up = _mm256_round_ps::<0b0010>(a);
3650        let expected_closest = _mm256_setr_ps(2., 2., 4., -1., 2., 2., 4., -1.);
3651        let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
3652        let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
3653        assert_eq_m256(result_closest, expected_closest);
3654        assert_eq_m256(result_down, expected_down);
3655        assert_eq_m256(result_up, expected_up);
3656    }
3657
    // floor rounds every f32 lane toward negative infinity (note -1.2 -> -2).
    #[simd_test(enable = "avx")]
    const fn test_mm256_floor_ps() {
        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
        let result_down = _mm256_floor_ps(a);
        let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
        assert_eq_m256(result_down, expected_down);
    }

    // ceil rounds every f32 lane toward positive infinity (note -1.2 -> -1).
    #[simd_test(enable = "avx")]
    const fn test_mm256_ceil_ps() {
        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
        let result_up = _mm256_ceil_ps(a);
        let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
        assert_eq_m256(result_up, expected_up);
    }
3673
    // Lane-wise square root of packed f64; inputs are perfect squares so the
    // expected results are exact.
    #[simd_test(enable = "avx")]
    fn test_mm256_sqrt_pd() {
        let a = _mm256_setr_pd(4., 9., 16., 25.);
        let r = _mm256_sqrt_pd(a);
        let e = _mm256_setr_pd(2., 3., 4., 5.);
        assert_eq_m256d(r, e);
    }

    // Lane-wise square root of packed f32; inputs are perfect squares.
    #[simd_test(enable = "avx")]
    fn test_mm256_sqrt_ps() {
        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
        let r = _mm256_sqrt_ps(a);
        let e = _mm256_setr_ps(2., 3., 4., 5., 2., 3., 4., 5.);
        assert_eq_m256(r, e);
    }
3689
    // Lane-wise quotient of packed f32; all quotients are exactly
    // representable, so exact equality is valid.
    #[simd_test(enable = "avx")]
    const fn test_mm256_div_ps() {
        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        let r = _mm256_div_ps(a, b);
        let e = _mm256_setr_ps(1., 3., 8., 5., 0.5, 1., 0.25, 0.5);
        assert_eq_m256(r, e);
    }

    // Lane-wise quotient of packed f64 with exactly-representable results.
    #[simd_test(enable = "avx")]
    const fn test_mm256_div_pd() {
        let a = _mm256_setr_pd(4., 9., 16., 25.);
        let b = _mm256_setr_pd(4., 3., 2., 5.);
        let r = _mm256_div_pd(a, b);
        let e = _mm256_setr_pd(1., 3., 8., 5.);
        assert_eq_m256d(r, e);
    }
3707
    // blend selects per lane from the immediate mask: bit i set picks b's
    // lane i, clear picks a's. 0x0 = all a, 0xF = all b.
    #[simd_test(enable = "avx")]
    const fn test_mm256_blend_pd() {
        let a = _mm256_setr_pd(4., 9., 16., 25.);
        let b = _mm256_setr_pd(4., 3., 2., 5.);
        let r = _mm256_blend_pd::<0x0>(a, b);
        assert_eq_m256d(r, _mm256_setr_pd(4., 9., 16., 25.));
        // 0x3 = lanes 0 and 1 from b, lanes 2 and 3 from a.
        let r = _mm256_blend_pd::<0x3>(a, b);
        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 16., 25.));
        let r = _mm256_blend_pd::<0xF>(a, b);
        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 5.));
    }

    // Same bit-per-lane selection for packed f32 (8 lanes, so 0xF covers
    // only the lower four).
    #[simd_test(enable = "avx")]
    const fn test_mm256_blend_ps() {
        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
        let r = _mm256_blend_ps::<0x0>(a, b);
        assert_eq_m256(r, _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.));
        let r = _mm256_blend_ps::<0x3>(a, b);
        assert_eq_m256(r, _mm256_setr_ps(2., 3., 5., 8., 9., 12., 13., 16.));
        let r = _mm256_blend_ps::<0xF>(a, b);
        assert_eq_m256(r, _mm256_setr_ps(2., 3., 6., 7., 9., 12., 13., 16.));
    }
3731
    // blendv selects per lane by the sign bit of the mask vector `c`:
    // `!0 as f64` is -1.0 (sign bit set) and selects from b; 0. selects a.
    #[simd_test(enable = "avx")]
    const fn test_mm256_blendv_pd() {
        let a = _mm256_setr_pd(4., 9., 16., 25.);
        let b = _mm256_setr_pd(4., 3., 2., 5.);
        let c = _mm256_setr_pd(0., 0., !0 as f64, !0 as f64);
        let r = _mm256_blendv_pd(a, b, c);
        let e = _mm256_setr_pd(4., 9., 2., 5.);
        assert_eq_m256d(r, e);
    }

    // Same sign-bit selection for packed f32: lower half from a, upper from b.
    #[simd_test(enable = "avx")]
    const fn test_mm256_blendv_ps() {
        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        #[rustfmt::skip]
        let c = _mm256_setr_ps(
            0., 0., 0., 0., !0 as f32, !0 as f32, !0 as f32, !0 as f32,
        );
        let r = _mm256_blendv_ps(a, b, c);
        let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
        assert_eq_m256(r, e);
    }
3754
    // Dot product with mask 0xFF: all four products within each 128-bit half
    // are summed and broadcast to that half; the halves do not mix
    // (lower: 16+27+32+125 = 200, upper: 32+81+1024+1250 = 2387).
    #[simd_test(enable = "avx")]
    fn test_mm256_dp_ps() {
        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        let r = _mm256_dp_ps::<0xFF>(a, b);
        let e = _mm256_setr_ps(200., 200., 200., 200., 2387., 2387., 2387., 2387.);
        assert_eq_m256(r, e);
    }
3763
    // Horizontal add for f64 interleaves per 128-bit half:
    // [a0+a1, b0+b1, a2+a3, b2+b3].
    #[simd_test(enable = "avx")]
    const fn test_mm256_hadd_pd() {
        let a = _mm256_setr_pd(4., 9., 16., 25.);
        let b = _mm256_setr_pd(4., 3., 2., 5.);
        let r = _mm256_hadd_pd(a, b);
        let e = _mm256_setr_pd(13., 7., 41., 7.);
        assert_eq_m256d(r, e);

        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_hadd_pd(a, b);
        let e = _mm256_setr_pd(3., 11., 7., 15.);
        assert_eq_m256d(r, e);
    }

    // Horizontal add for f32 per 128-bit half:
    // [a0+a1, a2+a3, b0+b1, b2+b3 | a4+a5, a6+a7, b4+b5, b6+b7].
    #[simd_test(enable = "avx")]
    const fn test_mm256_hadd_ps() {
        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        let r = _mm256_hadd_ps(a, b);
        let e = _mm256_setr_ps(13., 41., 7., 7., 13., 41., 17., 114.);
        assert_eq_m256(r, e);

        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
        let r = _mm256_hadd_ps(a, b);
        let e = _mm256_setr_ps(3., 7., 11., 15., 3., 7., 11., 15.);
        assert_eq_m256(r, e);
    }
3793
    // Horizontal subtract for f64, same interleave as hadd:
    // [a0-a1, b0-b1, a2-a3, b2-b3].
    #[simd_test(enable = "avx")]
    const fn test_mm256_hsub_pd() {
        let a = _mm256_setr_pd(4., 9., 16., 25.);
        let b = _mm256_setr_pd(4., 3., 2., 5.);
        let r = _mm256_hsub_pd(a, b);
        let e = _mm256_setr_pd(-5., 1., -9., -3.);
        assert_eq_m256d(r, e);

        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_hsub_pd(a, b);
        let e = _mm256_setr_pd(-1., -1., -1., -1.);
        assert_eq_m256d(r, e);
    }

    // Horizontal subtract for f32 per 128-bit half:
    // [a0-a1, a2-a3, b0-b1, b2-b3 | a4-a5, a6-a7, b4-b5, b6-b7].
    #[simd_test(enable = "avx")]
    const fn test_mm256_hsub_ps() {
        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        let r = _mm256_hsub_ps(a, b);
        let e = _mm256_setr_ps(-5., -9., 1., -3., -5., -9., -1., 14.);
        assert_eq_m256(r, e);

        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
        let r = _mm256_hsub_ps(a, b);
        let e = _mm256_setr_ps(-1., -1., -1., -1., -1., -1., -1., -1.);
        assert_eq_m256(r, e);
    }
3823
3824    #[simd_test(enable = "avx")]
3825    const fn test_mm256_xor_pd() {
3826        let a = _mm256_setr_pd(4., 9., 16., 25.);
3827        let b = _mm256_set1_pd(0.);
3828        let r = _mm256_xor_pd(a, b);
3829        assert_eq_m256d(r, a);
3830    }
3831
3832    #[simd_test(enable = "avx")]
3833    const fn test_mm256_xor_ps() {
3834        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3835        let b = _mm256_set1_ps(0.);
3836        let r = _mm256_xor_ps(a, b);
3837        assert_eq_m256(r, a);
3838    }
3839
    // For the vector compare intrinsics a "true" lane is all-ones — which
    // reads as NaN when reinterpreted as a float — and a "false" lane is
    // all-zeros (0.0). The tests below rely on that encoding.

    // 128-bit f64 compare, both lanes true (4 >= 4 and 9 >= 3).
    #[simd_test(enable = "avx")]
    fn test_mm_cmp_pd() {
        let a = _mm_setr_pd(4., 9.);
        let b = _mm_setr_pd(4., 3.);
        let r = _mm_cmp_pd::<_CMP_GE_OS>(a, b);
        assert!(get_m128d(r, 0).is_nan());
        assert!(get_m128d(r, 1).is_nan());
    }

    // 256-bit f64 compare, all lanes false (every a lane < its b lane).
    #[simd_test(enable = "avx")]
    fn test_mm256_cmp_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_cmp_pd::<_CMP_GE_OS>(a, b);
        let e = _mm256_set1_pd(0.);
        assert_eq_m256d(r, e);
    }

    // 128-bit f32 compare, only lane 0 true (4 >= 4).
    #[simd_test(enable = "avx")]
    fn test_mm_cmp_ps() {
        let a = _mm_setr_ps(4., 3., 2., 5.);
        let b = _mm_setr_ps(4., 9., 16., 25.);
        let r = _mm_cmp_ps::<_CMP_GE_OS>(a, b);
        assert!(get_m128(r, 0).is_nan());
        assert_eq!(get_m128(r, 1), 0.);
        assert_eq!(get_m128(r, 2), 0.);
        assert_eq!(get_m128(r, 3), 0.);
    }

    // 256-bit f32 compare, all lanes false.
    #[simd_test(enable = "avx")]
    fn test_mm256_cmp_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
        let r = _mm256_cmp_ps::<_CMP_GE_OS>(a, b);
        let e = _mm256_set1_ps(0.);
        assert_eq_m256(r, e);
    }

    // Scalar f64 compare: only lane 0 is compared; the upper lane is
    // copied through from `a`.
    #[simd_test(enable = "avx")]
    fn test_mm_cmp_sd() {
        let a = _mm_setr_pd(4., 9.);
        let b = _mm_setr_pd(4., 3.);
        let r = _mm_cmp_sd::<_CMP_GE_OS>(a, b);
        assert!(get_m128d(r, 0).is_nan());
        assert_eq!(get_m128d(r, 1), 9.);
    }

    // Scalar f32 compare: only lane 0 is compared; lanes 1-3 are copied
    // through from `a`.
    #[simd_test(enable = "avx")]
    fn test_mm_cmp_ss() {
        let a = _mm_setr_ps(4., 3., 2., 5.);
        let b = _mm_setr_ps(4., 9., 16., 25.);
        let r = _mm_cmp_ss::<_CMP_GE_OS>(a, b);
        assert!(get_m128(r, 0).is_nan());
        assert_eq!(get_m128(r, 1), 3.);
        assert_eq!(get_m128(r, 2), 2.);
        assert_eq!(get_m128(r, 3), 5.);
    }
3897
    // i32x4 -> f64x4 widening conversion.
    #[simd_test(enable = "avx")]
    const fn test_mm256_cvtepi32_pd() {
        let a = _mm_setr_epi32(4, 9, 16, 25);
        let r = _mm256_cvtepi32_pd(a);
        let e = _mm256_setr_pd(4., 9., 16., 25.);
        assert_eq_m256d(r, e);
    }

    // i32x8 -> f32x8 lane-wise conversion.
    #[simd_test(enable = "avx")]
    const fn test_mm256_cvtepi32_ps() {
        let a = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
        let r = _mm256_cvtepi32_ps(a);
        let e = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
        assert_eq_m256(r, e);
    }

    // f64x4 -> f32x4 narrowing conversion.
    #[simd_test(enable = "avx")]
    const fn test_mm256_cvtpd_ps() {
        let a = _mm256_setr_pd(4., 9., 16., 25.);
        let r = _mm256_cvtpd_ps(a);
        let e = _mm_setr_ps(4., 9., 16., 25.);
        assert_eq_m128(r, e);
    }

    // f32x8 -> i32x8 conversion; inputs are integral so rounding mode
    // is irrelevant here.
    #[simd_test(enable = "avx")]
    fn test_mm256_cvtps_epi32() {
        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
        let r = _mm256_cvtps_epi32(a);
        let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
        assert_eq_m256i(r, e);
    }

    // f32x4 -> f64x4 widening conversion (always exact).
    #[simd_test(enable = "avx")]
    const fn test_mm256_cvtps_pd() {
        let a = _mm_setr_ps(4., 9., 16., 25.);
        let r = _mm256_cvtps_pd(a);
        let e = _mm256_setr_pd(4., 9., 16., 25.);
        assert_eq_m256d(r, e);
    }

    // Extracts the lowest f64 lane as a scalar.
    #[simd_test(enable = "avx")]
    const fn test_mm256_cvtsd_f64() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let r = _mm256_cvtsd_f64(a);
        assert_eq!(r, 1.);
    }

    // f64x4 -> i32x4 with truncation (the extra `t`); integral inputs
    // make it indistinguishable from the rounding variant below.
    #[simd_test(enable = "avx")]
    fn test_mm256_cvttpd_epi32() {
        let a = _mm256_setr_pd(4., 9., 16., 25.);
        let r = _mm256_cvttpd_epi32(a);
        let e = _mm_setr_epi32(4, 9, 16, 25);
        assert_eq_m128i(r, e);
    }

    // f64x4 -> i32x4 with the current rounding mode.
    #[simd_test(enable = "avx")]
    fn test_mm256_cvtpd_epi32() {
        let a = _mm256_setr_pd(4., 9., 16., 25.);
        let r = _mm256_cvtpd_epi32(a);
        let e = _mm_setr_epi32(4, 9, 16, 25);
        assert_eq_m128i(r, e);
    }

    // f32x8 -> i32x8 with truncation.
    #[simd_test(enable = "avx")]
    fn test_mm256_cvttps_epi32() {
        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
        let r = _mm256_cvttps_epi32(a);
        let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
        assert_eq_m256i(r, e);
    }
3968
    // Index 0 extracts the lower 128 bits of the f32 vector.
    #[simd_test(enable = "avx")]
    const fn test_mm256_extractf128_ps() {
        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        let r = _mm256_extractf128_ps::<0>(a);
        let e = _mm_setr_ps(4., 3., 2., 5.);
        assert_eq_m128(r, e);
    }

    // Index 0 extracts the lower 128 bits of the f64 vector.
    #[simd_test(enable = "avx")]
    const fn test_mm256_extractf128_pd() {
        let a = _mm256_setr_pd(4., 3., 2., 5.);
        let r = _mm256_extractf128_pd::<0>(a);
        let e = _mm_setr_pd(4., 3.);
        assert_eq_m128d(r, e);
    }

    // Index 0 extracts the lower 128 bits of the integer vector.
    #[simd_test(enable = "avx")]
    const fn test_mm256_extractf128_si256() {
        let a = _mm256_setr_epi64x(4, 3, 2, 5);
        let r = _mm256_extractf128_si256::<0>(a);
        let e = _mm_setr_epi64x(4, 3);
        assert_eq_m128i(r, e);
    }

    // Extracts individual 32-bit lanes by constant index; the -1 input
    // checks that the value is sign-preserved as i32.
    #[simd_test(enable = "avx")]
    const fn test_mm256_extract_epi32() {
        let a = _mm256_setr_epi32(-1, 1, 2, 3, 4, 5, 6, 7);
        let r1 = _mm256_extract_epi32::<0>(a);
        let r2 = _mm256_extract_epi32::<3>(a);
        assert_eq!(r1, -1);
        assert_eq!(r2, 3);
    }

    // Returns the lowest 32-bit lane as a scalar.
    #[simd_test(enable = "avx")]
    const fn test_mm256_cvtsi256_si32() {
        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm256_cvtsi256_si32(a);
        assert_eq!(r, 1);
    }
4008
    // These intrinsics only mutate register state, so there is nothing to
    // assert; the tests just verify the calls execute without faulting.
    #[simd_test(enable = "avx")]
    #[cfg_attr(miri, ignore)] // Register-level operation not supported by Miri
    fn test_mm256_zeroall() {
        _mm256_zeroall();
    }

    #[simd_test(enable = "avx")]
    #[cfg_attr(miri, ignore)] // Register-level operation not supported by Miri
    fn test_mm256_zeroupper() {
        _mm256_zeroupper();
    }
4020
    // vpermilps with a variable control: each f32 lane is selected by the low
    // two bits of the corresponding control dword, within its own 128-bit
    // half (controls 4..8 wrap to 0..3 of the upper half).
    #[simd_test(enable = "avx")]
    fn test_mm256_permutevar_ps() {
        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm256_permutevar_ps(a, b);
        let e = _mm256_setr_ps(3., 2., 5., 4., 9., 64., 50., 8.);
        assert_eq_m256(r, e);
    }

    // 128-bit variant: control 4 wraps to element 0.
    #[simd_test(enable = "avx")]
    fn test_mm_permutevar_ps() {
        let a = _mm_setr_ps(4., 3., 2., 5.);
        let b = _mm_setr_epi32(1, 2, 3, 4);
        let r = _mm_permutevar_ps(a, b);
        let e = _mm_setr_ps(3., 2., 5., 4.);
        assert_eq_m128(r, e);
    }

    // Immediate control 0x1b = 0b00_01_10_11 reverses the four f32 lanes
    // within each 128-bit half.
    #[simd_test(enable = "avx")]
    const fn test_mm256_permute_ps() {
        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        let r = _mm256_permute_ps::<0x1b>(a);
        let e = _mm256_setr_ps(5., 2., 3., 4., 50., 64., 9., 8.);
        assert_eq_m256(r, e);
    }

    // 128-bit variant of the lane reversal.
    #[simd_test(enable = "avx")]
    const fn test_mm_permute_ps() {
        let a = _mm_setr_ps(4., 3., 2., 5.);
        let r = _mm_permute_ps::<0x1b>(a);
        let e = _mm_setr_ps(5., 2., 3., 4.);
        assert_eq_m128(r, e);
    }

    // vpermilpd with a variable control: only bit 1 of each control qword is
    // used (control 1 -> element 0, control 2 -> element 1), per 128-bit half.
    #[simd_test(enable = "avx")]
    fn test_mm256_permutevar_pd() {
        let a = _mm256_setr_pd(4., 3., 2., 5.);
        let b = _mm256_setr_epi64x(1, 2, 3, 4);
        let r = _mm256_permutevar_pd(a, b);
        let e = _mm256_setr_pd(4., 3., 5., 2.);
        assert_eq_m256d(r, e);
    }

    // 128-bit variant: controls 3 and 0 swap the two lanes.
    #[simd_test(enable = "avx")]
    fn test_mm_permutevar_pd() {
        let a = _mm_setr_pd(4., 3.);
        let b = _mm_setr_epi64x(3, 0);
        let r = _mm_permutevar_pd(a, b);
        let e = _mm_setr_pd(3., 4.);
        assert_eq_m128d(r, e);
    }

    // Immediate 5 = 0b0101: one selector bit per lane; swaps the f64 pair
    // within each 128-bit half.
    #[simd_test(enable = "avx")]
    const fn test_mm256_permute_pd() {
        let a = _mm256_setr_pd(4., 3., 2., 5.);
        let r = _mm256_permute_pd::<5>(a);
        let e = _mm256_setr_pd(3., 4., 5., 2.);
        assert_eq_m256d(r, e);
    }

    // 128-bit variant: immediate 1 swaps the two lanes.
    #[simd_test(enable = "avx")]
    const fn test_mm_permute_pd() {
        let a = _mm_setr_pd(4., 3.);
        let r = _mm_permute_pd::<1>(a);
        let e = _mm_setr_pd(3., 4.);
        assert_eq_m128d(r, e);
    }
4088
    // permute2f128 selects whole 128-bit fields: the low nibble picks the
    // result's lower half, the high nibble the upper half, from
    // {0: a.lo, 1: a.hi, 2: b.lo, 3: b.hi}.
    #[simd_test(enable = "avx")]
    const fn test_mm256_permute2f128_ps() {
        let a = _mm256_setr_ps(11., 12., 13., 14., 15., 16., 17., 18.);
        let b = _mm256_setr_ps(21., 22., 23., 24., 25., 26., 27., 28.);
        let r = _mm256_permute2f128_ps::<0b0001_0011>(a, b);
        let e = _mm256_setr_ps(25., 26., 27., 28., 15., 16., 17., 18.);
        assert_eq_m256(r, e);

        // Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field.
        let r = _mm256_permute2f128_ps::<0b1001_1011>(a, b);
        let z = _mm256_setr_ps(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);
        assert_eq_m256(r, z);
    }

    // Same field selection viewed as packed f64.
    #[simd_test(enable = "avx")]
    const fn test_mm256_permute2f128_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_permute2f128_pd::<0b0011_0001>(a, b);
        let e = _mm256_setr_pd(3., 4., 7., 8.);
        assert_eq_m256d(r, e);

        // Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field.
        let r = _mm256_permute2f128_pd::<0b1011_1001>(a, b);
        let e = _mm256_setr_pd(0.0, 0.0, 0.0, 0.0);
        assert_eq_m256d(r, e);
    }

    // Same field selection on the integer vector type.
    #[simd_test(enable = "avx")]
    const fn test_mm256_permute2f128_si256() {
        let a = _mm256_setr_epi32(11, 12, 13, 14, 15, 16, 17, 18);
        let b = _mm256_setr_epi32(21, 22, 23, 24, 25, 26, 27, 28);
        let r = _mm256_permute2f128_si256::<0b0010_0000>(a, b);
        let e = _mm256_setr_epi32(11, 12, 13, 14, 21, 22, 23, 24);
        assert_eq_m256i(r, e);

        // Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field.
        let r = _mm256_permute2f128_si256::<0b1010_1000>(a, b);
        let e = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m256i(r, e);
    }
4130
4131    #[simd_test(enable = "avx")]
4132    const fn test_mm256_broadcast_ss() {
4133        let r = _mm256_broadcast_ss(&3.);
4134        let e = _mm256_set1_ps(3.);
4135        assert_eq_m256(r, e);
4136    }
4137
4138    #[simd_test(enable = "avx")]
4139    const fn test_mm_broadcast_ss() {
4140        let r = _mm_broadcast_ss(&3.);
4141        let e = _mm_set1_ps(3.);
4142        assert_eq_m128(r, e);
4143    }
4144
4145    #[simd_test(enable = "avx")]
4146    const fn test_mm256_broadcast_sd() {
4147        let r = _mm256_broadcast_sd(&3.);
4148        let e = _mm256_set1_pd(3.);
4149        assert_eq_m256d(r, e);
4150    }
4151
4152    #[simd_test(enable = "avx")]
4153    const fn test_mm256_broadcast_ps() {
4154        let a = _mm_setr_ps(4., 3., 2., 5.);
4155        let r = _mm256_broadcast_ps(&a);
4156        let e = _mm256_setr_ps(4., 3., 2., 5., 4., 3., 2., 5.);
4157        assert_eq_m256(r, e);
4158    }
4159
4160    #[simd_test(enable = "avx")]
4161    const fn test_mm256_broadcast_pd() {
4162        let a = _mm_setr_pd(4., 3.);
4163        let r = _mm256_broadcast_pd(&a);
4164        let e = _mm256_setr_pd(4., 3., 4., 3.);
4165        assert_eq_m256d(r, e);
4166    }
4167
    // Index 0 replaces the lower 128 bits and keeps the upper half of `a`.
    #[simd_test(enable = "avx")]
    const fn test_mm256_insertf128_ps() {
        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        let b = _mm_setr_ps(4., 9., 16., 25.);
        let r = _mm256_insertf128_ps::<0>(a, b);
        let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
        assert_eq_m256(r, e);
    }

    // Same lower-half replacement for packed f64.
    #[simd_test(enable = "avx")]
    const fn test_mm256_insertf128_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm_setr_pd(5., 6.);
        let r = _mm256_insertf128_pd::<0>(a, b);
        let e = _mm256_setr_pd(5., 6., 3., 4.);
        assert_eq_m256d(r, e);
    }

    // Same lower-half replacement for the integer vector type.
    #[simd_test(enable = "avx")]
    const fn test_mm256_insertf128_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let b = _mm_setr_epi64x(5, 6);
        let r = _mm256_insertf128_si256::<0>(a, b);
        let e = _mm256_setr_epi64x(5, 6, 3, 4);
        assert_eq_m256i(r, e);
    }
4194
    // Replaces a single 8-bit lane (here the last, index 31) and leaves all
    // other lanes untouched.
    #[simd_test(enable = "avx")]
    const fn test_mm256_insert_epi8() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm256_insert_epi8::<31>(a, 0);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 0,
        );
        assert_eq_m256i(r, e);
    }

    // Replaces a single 16-bit lane (the last, index 15).
    #[simd_test(enable = "avx")]
    const fn test_mm256_insert_epi16() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        let r = _mm256_insert_epi16::<15>(a, 0);
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 0,
        );
        assert_eq_m256i(r, e);
    }

    // Replaces a single 32-bit lane (the last, index 7).
    #[simd_test(enable = "avx")]
    const fn test_mm256_insert_epi32() {
        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm256_insert_epi32::<7>(a, 0);
        let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
        assert_eq_m256i(r, e);
    }
4238
    // The aligned load/store tests take their pointer from a value of the
    // 32-byte vector type itself, so the required alignment is guaranteed.

    // Aligned load of four f64 values.
    #[simd_test(enable = "avx")]
    const fn test_mm256_load_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let p = ptr::addr_of!(a) as *const f64;
        let r = unsafe { _mm256_load_pd(p) };
        let e = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, e);
    }

    // Aligned store of four f64 values into an uninitialized vector.
    #[simd_test(enable = "avx")]
    const fn test_mm256_store_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let mut r = _mm256_undefined_pd();
        unsafe {
            _mm256_store_pd(ptr::addr_of_mut!(r) as *mut f64, a);
        }
        assert_eq_m256d(r, a);
    }

    // Aligned load of eight f32 values.
    #[simd_test(enable = "avx")]
    const fn test_mm256_load_ps() {
        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        let p = ptr::addr_of!(a) as *const f32;
        let r = unsafe { _mm256_load_ps(p) };
        let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        assert_eq_m256(r, e);
    }

    // Aligned store of eight f32 values.
    #[simd_test(enable = "avx")]
    const fn test_mm256_store_ps() {
        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        let mut r = _mm256_undefined_ps();
        unsafe {
            _mm256_store_ps(ptr::addr_of_mut!(r) as *mut f32, a);
        }
        assert_eq_m256(r, a);
    }
4276
    // Unaligned load of four f64 values from a plain slice; `black_box`
    // keeps the pointer opaque to the optimizer.
    #[simd_test(enable = "avx")]
    const fn test_mm256_loadu_pd() {
        let a = &[1.0f64, 2., 3., 4.];
        let p = a.as_ptr();
        let r = unsafe { _mm256_loadu_pd(black_box(p)) };
        let e = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, e);
    }

    // Unaligned store of four f64 values.
    #[simd_test(enable = "avx")]
    const fn test_mm256_storeu_pd() {
        let a = _mm256_set1_pd(9.);
        let mut r = _mm256_undefined_pd();
        unsafe {
            _mm256_storeu_pd(ptr::addr_of_mut!(r) as *mut f64, a);
        }
        assert_eq_m256d(r, a);
    }

    // Unaligned load of eight f32 values from a plain slice.
    #[simd_test(enable = "avx")]
    const fn test_mm256_loadu_ps() {
        let a = &[4., 3., 2., 5., 8., 9., 64., 50.];
        let p = a.as_ptr();
        let r = unsafe { _mm256_loadu_ps(black_box(p)) };
        let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        assert_eq_m256(r, e);
    }

    // Unaligned store of eight f32 values.
    #[simd_test(enable = "avx")]
    const fn test_mm256_storeu_ps() {
        let a = _mm256_set1_ps(9.);
        let mut r = _mm256_undefined_ps();
        unsafe {
            _mm256_storeu_ps(ptr::addr_of_mut!(r) as *mut f32, a);
        }
        assert_eq_m256(r, a);
    }
4314
    // Aligned integer load; the pointer comes from a `__m256i` value, so
    // alignment is guaranteed.
    #[simd_test(enable = "avx")]
    const fn test_mm256_load_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let p = ptr::addr_of!(a);
        let r = unsafe { _mm256_load_si256(p) };
        let e = _mm256_setr_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, e);
    }

    // Aligned integer store.
    #[simd_test(enable = "avx")]
    const fn test_mm256_store_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let mut r = _mm256_undefined_si256();
        unsafe {
            _mm256_store_si256(ptr::addr_of_mut!(r), a);
        }
        assert_eq_m256i(r, a);
    }

    // Unaligned integer load; `black_box` keeps the pointer opaque.
    #[simd_test(enable = "avx")]
    const fn test_mm256_loadu_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let p = ptr::addr_of!(a);
        let r = unsafe { _mm256_loadu_si256(black_box(p)) };
        let e = _mm256_setr_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, e);
    }

    // Unaligned integer store.
    #[simd_test(enable = "avx")]
    const fn test_mm256_storeu_si256() {
        let a = _mm256_set1_epi8(9);
        let mut r = _mm256_undefined_si256();
        unsafe {
            _mm256_storeu_si256(ptr::addr_of_mut!(r), a);
        }
        assert_eq_m256i(r, a);
    }
4352
    // Masked loads read only the elements whose mask qword has its sign bit
    // set (`!0`); the remaining lanes become 0.0.
    #[simd_test(enable = "avx")]
    const fn test_mm256_maskload_pd() {
        let a = &[1.0f64, 2., 3., 4.];
        let p = a.as_ptr();
        let mask = _mm256_setr_epi64x(0, !0, 0, !0);
        let r = unsafe { _mm256_maskload_pd(black_box(p), mask) };
        let e = _mm256_setr_pd(0., 2., 0., 4.);
        assert_eq_m256d(r, e);
    }

    // Masked stores write only the sign-bit-set lanes; `r` starts zeroed so
    // the untouched lanes remain 0.0.
    #[simd_test(enable = "avx")]
    const fn test_mm256_maskstore_pd() {
        let mut r = _mm256_set1_pd(0.);
        let mask = _mm256_setr_epi64x(0, !0, 0, !0);
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        unsafe {
            _mm256_maskstore_pd(ptr::addr_of_mut!(r) as *mut f64, mask, a);
        }
        let e = _mm256_setr_pd(0., 2., 0., 4.);
        assert_eq_m256d(r, e);
    }

    // 128-bit masked load: only the second element is read.
    #[simd_test(enable = "avx")]
    const fn test_mm_maskload_pd() {
        let a = &[1.0f64, 2.];
        let p = a.as_ptr();
        let mask = _mm_setr_epi64x(0, !0);
        let r = unsafe { _mm_maskload_pd(black_box(p), mask) };
        let e = _mm_setr_pd(0., 2.);
        assert_eq_m128d(r, e);
    }

    // 128-bit masked store: only the second element is written.
    #[simd_test(enable = "avx")]
    const fn test_mm_maskstore_pd() {
        let mut r = _mm_set1_pd(0.);
        let mask = _mm_setr_epi64x(0, !0);
        let a = _mm_setr_pd(1., 2.);
        unsafe {
            _mm_maskstore_pd(ptr::addr_of_mut!(r) as *mut f64, mask, a);
        }
        let e = _mm_setr_pd(0., 2.);
        assert_eq_m128d(r, e);
    }
4396
4397    #[simd_test(enable = "avx")]
4398    const fn test_mm256_maskload_ps() {
4399        let a = &[1.0f32, 2., 3., 4., 5., 6., 7., 8.];
4400        let p = a.as_ptr();
4401        let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
4402        let r = unsafe { _mm256_maskload_ps(black_box(p), mask) };
4403        let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
4404        assert_eq_m256(r, e);
4405    }
4406
4407    #[simd_test(enable = "avx")]
4408    const fn test_mm256_maskstore_ps() {
4409        let mut r = _mm256_set1_ps(0.);
4410        let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
4411        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4412        unsafe {
4413            _mm256_maskstore_ps(ptr::addr_of_mut!(r) as *mut f32, mask, a);
4414        }
4415        let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
4416        assert_eq_m256(r, e);
4417    }
4418
4419    #[simd_test(enable = "avx")]
4420    const fn test_mm_maskload_ps() {
4421        let a = &[1.0f32, 2., 3., 4.];
4422        let p = a.as_ptr();
4423        let mask = _mm_setr_epi32(0, !0, 0, !0);
4424        let r = unsafe { _mm_maskload_ps(black_box(p), mask) };
4425        let e = _mm_setr_ps(0., 2., 0., 4.);
4426        assert_eq_m128(r, e);
4427    }
4428
4429    #[simd_test(enable = "avx")]
4430    const fn test_mm_maskstore_ps() {
4431        let mut r = _mm_set1_ps(0.);
4432        let mask = _mm_setr_epi32(0, !0, 0, !0);
4433        let a = _mm_setr_ps(1., 2., 3., 4.);
4434        unsafe {
4435            _mm_maskstore_ps(ptr::addr_of_mut!(r) as *mut f32, mask, a);
4436        }
4437        let e = _mm_setr_ps(0., 2., 0., 4.);
4438        assert_eq_m128(r, e);
4439    }
4440
    // vmovshdup: duplicates the odd-indexed f32 lanes into the even slots.
    #[simd_test(enable = "avx")]
    const fn test_mm256_movehdup_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_movehdup_ps(a);
        let e = _mm256_setr_ps(2., 2., 4., 4., 6., 6., 8., 8.);
        assert_eq_m256(r, e);
    }

    // vmovsldup: duplicates the even-indexed f32 lanes into the odd slots.
    #[simd_test(enable = "avx")]
    const fn test_mm256_moveldup_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_moveldup_ps(a);
        let e = _mm256_setr_ps(1., 1., 3., 3., 5., 5., 7., 7.);
        assert_eq_m256(r, e);
    }

    // vmovddup: duplicates the even-indexed f64 lanes into the odd slots.
    #[simd_test(enable = "avx")]
    const fn test_mm256_movedup_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let r = _mm256_movedup_pd(a);
        let e = _mm256_setr_pd(1., 1., 3., 3.);
        assert_eq_m256d(r, e);
    }
4464
    // vlddqu: unaligned integer load; the result must be byte-identical to the
    // source. `black_box` keeps the pointer opaque to the optimizer.
    #[simd_test(enable = "avx")]
    fn test_mm256_lddqu_si256() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let p = ptr::addr_of!(a);
        let r = unsafe { _mm256_lddqu_si256(black_box(p)) };
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m256i(r, e);
    }
4485
    // Non-temporal 256-bit integer store. `_mm_sfence` orders the streaming
    // store before the subsequent read-back.
    #[simd_test(enable = "avx")]
    #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
    fn test_mm256_stream_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let mut r = _mm256_undefined_si256();
        unsafe {
            _mm256_stream_si256(ptr::addr_of_mut!(r), a);
        }
        _mm_sfence();
        assert_eq_m256i(r, a);
    }

    // Non-temporal f64 store; the destination buffer is explicitly 32-byte
    // aligned as required by vmovntpd.
    #[simd_test(enable = "avx")]
    #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
    fn test_mm256_stream_pd() {
        #[repr(align(32))]
        struct Memory {
            pub data: [f64; 4],
        }
        let a = _mm256_set1_pd(7.0);
        let mut mem = Memory { data: [-1.0; 4] };

        unsafe {
            _mm256_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
        }
        _mm_sfence();
        for i in 0..4 {
            assert_eq!(mem.data[i], get_m256d(a, i));
        }
    }

    // Non-temporal f32 store into a 32-byte-aligned buffer; each of the eight
    // lanes is checked individually after the fence.
    #[simd_test(enable = "avx")]
    #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
    fn test_mm256_stream_ps() {
        #[repr(align(32))]
        struct Memory {
            pub data: [f32; 8],
        }
        let a = _mm256_set1_ps(7.0);
        let mut mem = Memory { data: [-1.0; 8] };

        unsafe {
            _mm256_stream_ps(ptr::addr_of_mut!(mem.data[0]), a);
        }
        _mm_sfence();
        for i in 0..8 {
            assert_eq!(mem.data[i], get_m256(a, i));
        }
    }
4535
    // vrcpps is an approximate reciprocal: the expected lanes are reference
    // approximation outputs, and each lane is compared within `2 * rel_err`
    // (rel_err = 2^-11) rather than exactly, since the hardware only
    // guarantees a bounded relative error.
    #[simd_test(enable = "avx")]
    fn test_mm256_rcp_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_rcp_ps(a);
        #[rustfmt::skip]
        let e = _mm256_setr_ps(
            0.99975586, 0.49987793, 0.33325195, 0.24993896,
            0.19995117, 0.16662598, 0.14282227, 0.12496948,
        );
        let rel_err = 0.00048828125;
        for i in 0..8 {
            assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
        }
    }

    // vrsqrtps is an approximate reciprocal square root; same tolerance-based
    // per-lane comparison as the rcp test above.
    #[simd_test(enable = "avx")]
    fn test_mm256_rsqrt_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_rsqrt_ps(a);
        #[rustfmt::skip]
        let e = _mm256_setr_ps(
            0.99975586, 0.7069092, 0.5772705, 0.49987793,
            0.44714355, 0.40820313, 0.3779297, 0.3534546,
        );
        let rel_err = 0.00048828125;
        for i in 0..8 {
            assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
        }
    }
4565
    // vunpckhpd interleaves the high f64 of each 128-bit half of `a` and `b`.
    #[simd_test(enable = "avx")]
    const fn test_mm256_unpackhi_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_unpackhi_pd(a, b);
        let e = _mm256_setr_pd(2., 6., 4., 8.);
        assert_eq_m256d(r, e);
    }

    // vunpckhps interleaves the upper two f32s of each 128-bit half.
    #[simd_test(enable = "avx")]
    const fn test_mm256_unpackhi_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
        let r = _mm256_unpackhi_ps(a, b);
        let e = _mm256_setr_ps(3., 11., 4., 12., 7., 15., 8., 16.);
        assert_eq_m256(r, e);
    }

    // vunpcklpd interleaves the low f64 of each 128-bit half of `a` and `b`.
    #[simd_test(enable = "avx")]
    const fn test_mm256_unpacklo_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_unpacklo_pd(a, b);
        let e = _mm256_setr_pd(1., 5., 3., 7.);
        assert_eq_m256d(r, e);
    }

    // vunpcklps interleaves the lower two f32s of each 128-bit half.
    #[simd_test(enable = "avx")]
    const fn test_mm256_unpacklo_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
        let r = _mm256_unpacklo_ps(a, b);
        let e = _mm256_setr_ps(1., 9., 2., 10., 5., 13., 6., 14.);
        assert_eq_m256(r, e);
    }
4601
    // vptest ZF path: returns 1 iff (a & b) is all zero. With b = 0 the AND is
    // trivially zero, so the second check returns 1.
    #[simd_test(enable = "avx")]
    const fn test_mm256_testz_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let b = _mm256_setr_epi64x(5, 6, 7, 8);
        let r = _mm256_testz_si256(a, b);
        assert_eq!(r, 0);
        let b = _mm256_set1_epi64x(0);
        let r = _mm256_testz_si256(a, b);
        assert_eq!(r, 1);
    }

    // vptest CF path: returns 1 iff (!a & b) is all zero, i.e. every bit of
    // `b` is also set in `a`. b = 0 trivially satisfies that.
    #[simd_test(enable = "avx")]
    const fn test_mm256_testc_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let b = _mm256_setr_epi64x(5, 6, 7, 8);
        let r = _mm256_testc_si256(a, b);
        assert_eq!(r, 0);
        let b = _mm256_set1_epi64x(0);
        let r = _mm256_testc_si256(a, b);
        assert_eq!(r, 1);
    }

    // vptest "not zero and not carry": 1 iff both (a & b) and (!a & b) are
    // nonzero; all-zero inputs make both zero, hence 0.
    #[simd_test(enable = "avx")]
    fn test_mm256_testnzc_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let b = _mm256_setr_epi64x(5, 6, 7, 8);
        let r = _mm256_testnzc_si256(a, b);
        assert_eq!(r, 1);
        let a = _mm256_setr_epi64x(0, 0, 0, 0);
        let b = _mm256_setr_epi64x(0, 0, 0, 0);
        let r = _mm256_testnzc_si256(a, b);
        assert_eq!(r, 0);
    }

    // vtestpd only looks at the sign bits: all-positive inputs give ZF = 1;
    // a negative value in both operands makes the AND of sign bits nonzero.
    #[simd_test(enable = "avx")]
    fn test_mm256_testz_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_testz_pd(a, b);
        assert_eq!(r, 1);
        let a = _mm256_set1_pd(-1.);
        let r = _mm256_testz_pd(a, a);
        assert_eq!(r, 0);
    }

    // vtestpd CF path on sign bits: 1 iff no lane has the sign bit set in `b`
    // but clear in `a`.
    #[simd_test(enable = "avx")]
    fn test_mm256_testc_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_testc_pd(a, b);
        assert_eq!(r, 1);
        let a = _mm256_set1_pd(1.);
        let b = _mm256_set1_pd(-1.);
        let r = _mm256_testc_pd(a, b);
        assert_eq!(r, 0);
    }

    // vtestpd NZC: 1 iff some lane has both sign bits set AND some lane has
    // the sign bit only in `b`. The second input pair exhibits both cases.
    #[simd_test(enable = "avx")]
    fn test_mm256_testnzc_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_testnzc_pd(a, b);
        assert_eq!(r, 0);
        let a = _mm256_setr_pd(1., -1., -1., -1.);
        let b = _mm256_setr_pd(-1., -1., 1., 1.);
        let r = _mm256_testnzc_pd(a, b);
        assert_eq!(r, 1);
    }
4670
    // 128-bit vtestpd ZF: sign bits of (a & b) — all-positive inputs give 1.
    #[simd_test(enable = "avx")]
    const fn test_mm_testz_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 6.);
        let r = _mm_testz_pd(a, b);
        assert_eq!(r, 1);
        let a = _mm_set1_pd(-1.);
        let r = _mm_testz_pd(a, a);
        assert_eq!(r, 0);
    }

    // 128-bit vtestpd CF: 1 iff no lane has the sign bit in `b` but not `a`.
    #[simd_test(enable = "avx")]
    const fn test_mm_testc_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 6.);
        let r = _mm_testc_pd(a, b);
        assert_eq!(r, 1);
        let a = _mm_set1_pd(1.);
        let b = _mm_set1_pd(-1.);
        let r = _mm_testc_pd(a, b);
        assert_eq!(r, 0);
    }

    // 128-bit vtestpd NZC: needs a shared negative lane plus a b-only
    // negative lane, which the second input pair provides.
    #[simd_test(enable = "avx")]
    fn test_mm_testnzc_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 6.);
        let r = _mm_testnzc_pd(a, b);
        assert_eq!(r, 0);
        let a = _mm_setr_pd(1., -1.);
        let b = _mm_setr_pd(-1., -1.);
        let r = _mm_testnzc_pd(a, b);
        assert_eq!(r, 1);
    }

    // 256-bit vtestps ZF on sign bits: positive vs. itself → 1, negative → 0.
    #[simd_test(enable = "avx")]
    fn test_mm256_testz_ps() {
        let a = _mm256_set1_ps(1.);
        let r = _mm256_testz_ps(a, a);
        assert_eq!(r, 1);
        let a = _mm256_set1_ps(-1.);
        let r = _mm256_testz_ps(a, a);
        assert_eq!(r, 0);
    }

    // 256-bit vtestps CF: fails once `b` carries sign bits that `a` lacks.
    #[simd_test(enable = "avx")]
    fn test_mm256_testc_ps() {
        let a = _mm256_set1_ps(1.);
        let r = _mm256_testc_ps(a, a);
        assert_eq!(r, 1);
        let b = _mm256_set1_ps(-1.);
        let r = _mm256_testc_ps(a, b);
        assert_eq!(r, 0);
    }

    // 256-bit vtestps NZC: second input pair has both a shared negative lane
    // and a b-only negative lane, producing 1.
    #[simd_test(enable = "avx")]
    fn test_mm256_testnzc_ps() {
        let a = _mm256_set1_ps(1.);
        let r = _mm256_testnzc_ps(a, a);
        assert_eq!(r, 0);
        let a = _mm256_setr_ps(1., -1., -1., -1., -1., -1., -1., -1.);
        let b = _mm256_setr_ps(-1., -1., 1., 1., 1., 1., 1., 1.);
        let r = _mm256_testnzc_ps(a, b);
        assert_eq!(r, 1);
    }

    // 128-bit vtestps ZF on sign bits.
    #[simd_test(enable = "avx")]
    const fn test_mm_testz_ps() {
        let a = _mm_set1_ps(1.);
        let r = _mm_testz_ps(a, a);
        assert_eq!(r, 1);
        let a = _mm_set1_ps(-1.);
        let r = _mm_testz_ps(a, a);
        assert_eq!(r, 0);
    }

    // 128-bit vtestps CF on sign bits.
    #[simd_test(enable = "avx")]
    const fn test_mm_testc_ps() {
        let a = _mm_set1_ps(1.);
        let r = _mm_testc_ps(a, a);
        assert_eq!(r, 1);
        let b = _mm_set1_ps(-1.);
        let r = _mm_testc_ps(a, b);
        assert_eq!(r, 0);
    }

    // 128-bit vtestps NZC on sign bits.
    #[simd_test(enable = "avx")]
    fn test_mm_testnzc_ps() {
        let a = _mm_set1_ps(1.);
        let r = _mm_testnzc_ps(a, a);
        assert_eq!(r, 0);
        let a = _mm_setr_ps(1., -1., -1., -1.);
        let b = _mm_setr_ps(-1., -1., 1., 1.);
        let r = _mm_testnzc_ps(a, b);
        assert_eq!(r, 1);
    }
4767
    // vmovmskpd collects the four f64 sign bits; negatives in lanes 1 and 3
    // give the bit pattern 0b1010 = 0xA.
    #[simd_test(enable = "avx")]
    const fn test_mm256_movemask_pd() {
        let a = _mm256_setr_pd(1., -2., 3., -4.);
        let r = _mm256_movemask_pd(a);
        assert_eq!(r, 0xA);
    }

    // vmovmskps collects the eight f32 sign bits; alternating negatives give
    // 0b10101010 = 0xAA.
    #[simd_test(enable = "avx")]
    const fn test_mm256_movemask_ps() {
        let a = _mm256_setr_ps(1., -2., 3., -4., 1., -2., 3., -4.);
        let r = _mm256_movemask_ps(a);
        assert_eq!(r, 0xAA);
    }
4781
4782    #[simd_test(enable = "avx")]
4783    const fn test_mm256_setzero_pd() {
4784        let r = _mm256_setzero_pd();
4785        assert_eq_m256d(r, _mm256_set1_pd(0.));
4786    }
4787
4788    #[simd_test(enable = "avx")]
4789    const fn test_mm256_setzero_ps() {
4790        let r = _mm256_setzero_ps();
4791        assert_eq_m256(r, _mm256_set1_ps(0.));
4792    }
4793
4794    #[simd_test(enable = "avx")]
4795    const fn test_mm256_setzero_si256() {
4796        let r = _mm256_setzero_si256();
4797        assert_eq_m256i(r, _mm256_set1_epi8(0));
4798    }
4799
    // `_mm256_set_*` takes arguments from the highest lane down, so the result
    // equals the reversed-argument `_mm256_setr_*` vector.
    #[simd_test(enable = "avx")]
    const fn test_mm256_set_pd() {
        let r = _mm256_set_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 1.));
    }

    // High-to-low f32 constructor vs. reversed low-to-high constructor.
    #[simd_test(enable = "avx")]
    const fn test_mm256_set_ps() {
        let r = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, _mm256_setr_ps(8., 7., 6., 5., 4., 3., 2., 1.));
    }

    // High-to-low i8 constructor vs. reversed low-to-high constructor.
    #[simd_test(enable = "avx")]
    const fn test_mm256_set_epi8() {
        #[rustfmt::skip]
        let r = _mm256_set_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            32, 31, 30, 29, 28, 27, 26, 25,
            24, 23, 22, 21, 20, 19, 18, 17,
            16, 15, 14, 13, 12, 11, 10, 9,
            8, 7, 6, 5, 4, 3, 2, 1
        );
        assert_eq_m256i(r, e);
    }

    // High-to-low i16 constructor vs. reversed low-to-high constructor.
    #[simd_test(enable = "avx")]
    const fn test_mm256_set_epi16() {
        #[rustfmt::skip]
        let r = _mm256_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            16, 15, 14, 13, 12, 11, 10, 9, 8,
            7, 6, 5, 4, 3, 2, 1,
        );
        assert_eq_m256i(r, e);
    }

    // High-to-low i32 constructor vs. reversed low-to-high constructor.
    #[simd_test(enable = "avx")]
    const fn test_mm256_set_epi32() {
        let r = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m256i(r, _mm256_setr_epi32(8, 7, 6, 5, 4, 3, 2, 1));
    }

    // High-to-low i64 constructor vs. reversed low-to-high constructor.
    #[simd_test(enable = "avx")]
    const fn test_mm256_set_epi64x() {
        let r = _mm256_set_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, _mm256_setr_epi64x(4, 3, 2, 1));
    }
4857
    // `_mm256_setr_*` constructors are self-consistent: building the same
    // vector twice must compare equal (also exercises assert_eq_m256* paths).
    #[simd_test(enable = "avx")]
    const fn test_mm256_setr_pd() {
        let r = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, _mm256_setr_pd(1., 2., 3., 4.));
    }

    // Low-to-high f32 constructor self-consistency.
    #[simd_test(enable = "avx")]
    const fn test_mm256_setr_ps() {
        let r = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.));
    }

    // Low-to-high i8 constructor self-consistency.
    #[simd_test(enable = "avx")]
    const fn test_mm256_setr_epi8() {
        #[rustfmt::skip]
        let r = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32
        );

        assert_eq_m256i(r, e);
    }

    // Low-to-high i16 constructor self-consistency.
    #[simd_test(enable = "avx")]
    const fn test_mm256_setr_epi16() {
        #[rustfmt::skip]
        let r = _mm256_setr_epi16(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
        );
        assert_eq_m256i(r, e);
    }

    // Low-to-high i32 constructor self-consistency.
    #[simd_test(enable = "avx")]
    const fn test_mm256_setr_epi32() {
        let r = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m256i(r, _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8));
    }

    // Low-to-high i64 constructor self-consistency.
    #[simd_test(enable = "avx")]
    const fn test_mm256_setr_epi64x() {
        let r = _mm256_setr_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, _mm256_setr_epi64x(1, 2, 3, 4));
    }
4916
4917    #[simd_test(enable = "avx")]
4918    const fn test_mm256_set1_pd() {
4919        let r = _mm256_set1_pd(1.);
4920        assert_eq_m256d(r, _mm256_set1_pd(1.));
4921    }
4922
4923    #[simd_test(enable = "avx")]
4924    const fn test_mm256_set1_ps() {
4925        let r = _mm256_set1_ps(1.);
4926        assert_eq_m256(r, _mm256_set1_ps(1.));
4927    }
4928
4929    #[simd_test(enable = "avx")]
4930    const fn test_mm256_set1_epi8() {
4931        let r = _mm256_set1_epi8(1);
4932        assert_eq_m256i(r, _mm256_set1_epi8(1));
4933    }
4934
4935    #[simd_test(enable = "avx")]
4936    const fn test_mm256_set1_epi16() {
4937        let r = _mm256_set1_epi16(1);
4938        assert_eq_m256i(r, _mm256_set1_epi16(1));
4939    }
4940
4941    #[simd_test(enable = "avx")]
4942    const fn test_mm256_set1_epi32() {
4943        let r = _mm256_set1_epi32(1);
4944        assert_eq_m256i(r, _mm256_set1_epi32(1));
4945    }
4946
4947    #[simd_test(enable = "avx")]
4948    const fn test_mm256_set1_epi64x() {
4949        let r = _mm256_set1_epi64x(1);
4950        assert_eq_m256i(r, _mm256_set1_epi64x(1));
4951    }
4952
    // Bit-cast f64 → f32 lanes: the expected f32 values are just the raw
    // little-endian IEEE-754 bytes of 1.0..4.0 f64 reinterpreted as f32 pairs.
    #[simd_test(enable = "avx")]
    const fn test_mm256_castpd_ps() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let r = _mm256_castpd_ps(a);
        let e = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
        assert_eq_m256(r, e);
    }

    // Inverse of the test above: the same f32 bit pattern casts back to the
    // f64 vector (1., 2., 3., 4.).
    #[simd_test(enable = "avx")]
    const fn test_mm256_castps_pd() {
        let a = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
        let r = _mm256_castps_pd(a);
        let e = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, e);
    }

    // Bit-cast f32 → integer: the i8 lanes are the little-endian IEEE-754
    // encodings of 1.0..8.0 (e.g. 0x3f800000 = [0, 0, -128, 63]).
    #[simd_test(enable = "avx")]
    const fn test_mm256_castps_si256() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_castps_si256(a);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            0, 0, -128, 63, 0, 0, 0, 64,
            0, 0, 64, 64, 0, 0, -128, 64,
            0, 0, -96, 64, 0, 0, -64, 64,
            0, 0, -32, 64, 0, 0, 0, 65,
        );
        assert_eq_m256i(r, e);
    }

    // Inverse of the test above: integer bytes cast back to the f32 vector.
    #[simd_test(enable = "avx")]
    const fn test_mm256_castsi256_ps() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            0, 0, -128, 63, 0, 0, 0, 64,
            0, 0, 64, 64, 0, 0, -128, 64,
            0, 0, -96, 64, 0, 0, -64, 64,
            0, 0, -32, 64, 0, 0, 0, 65,
        );
        let r = _mm256_castsi256_ps(a);
        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, e);
    }

    // f64 → integer cast is a pure reinterpretation, so transmuting the result
    // back must reproduce the original vector bit-for-bit.
    #[simd_test(enable = "avx")]
    const fn test_mm256_castpd_si256() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let r = _mm256_castpd_si256(a);
        assert_eq_m256d(unsafe { transmute(r) }, a);
    }

    // Integer → f64 cast checked the same way, via transmute of the input.
    #[simd_test(enable = "avx")]
    const fn test_mm256_castsi256_pd() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let r = _mm256_castsi256_pd(a);
        assert_eq_m256d(r, unsafe { transmute(a) });
    }
5010
    // Truncating cast 256 → 128 keeps the low four f32 lanes.
    #[simd_test(enable = "avx")]
    const fn test_mm256_castps256_ps128() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_castps256_ps128(a);
        assert_eq_m128(r, _mm_setr_ps(1., 2., 3., 4.));
    }

    // Truncating cast 256 → 128 keeps the low two f64 lanes.
    #[simd_test(enable = "avx")]
    const fn test_mm256_castpd256_pd128() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let r = _mm256_castpd256_pd128(a);
        assert_eq_m128d(r, _mm_setr_pd(1., 2.));
    }

    // Truncating cast 256 → 128 keeps the low 128 integer bits.
    #[simd_test(enable = "avx")]
    const fn test_mm256_castsi256_si128() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let r = _mm256_castsi256_si128(a);
        assert_eq_m128i(r, _mm_setr_epi64x(1, 2));
    }

    // Widening cast 128 → 256 leaves the upper half undefined, so only the
    // low half is checked (by casting back down).
    #[simd_test(enable = "avx")]
    const fn test_mm256_castps128_ps256() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let r = _mm256_castps128_ps256(a);
        assert_eq_m128(_mm256_castps256_ps128(r), a);
    }

    // Widening f64 cast: only the defined low half is compared.
    #[simd_test(enable = "avx")]
    const fn test_mm256_castpd128_pd256() {
        let a = _mm_setr_pd(1., 2.);
        let r = _mm256_castpd128_pd256(a);
        assert_eq_m128d(_mm256_castpd256_pd128(r), a);
    }

    // Widening integer cast: only the defined low half is compared.
    #[simd_test(enable = "avx")]
    const fn test_mm256_castsi128_si256() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let r = _mm256_castsi128_si256(a);
        assert_eq_m128i(_mm256_castsi256_si128(r), a);
    }
5052
    // Unlike the plain cast, zext guarantees the upper 128 bits are zeroed,
    // so the full 256-bit result can be compared.
    #[simd_test(enable = "avx")]
    const fn test_mm256_zextps128_ps256() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let r = _mm256_zextps128_ps256(a);
        let e = _mm256_setr_ps(1., 2., 3., 4., 0., 0., 0., 0.);
        assert_eq_m256(r, e);
    }

    // Integer zero-extension 128 → 256: high two i64 lanes must be zero.
    #[simd_test(enable = "avx")]
    const fn test_mm256_zextsi128_si256() {
        let a = _mm_setr_epi64x(1, 2);
        let r = _mm256_zextsi128_si256(a);
        let e = _mm256_setr_epi64x(1, 2, 0, 0);
        assert_eq_m256i(r, e);
    }

    // f64 zero-extension 128 → 256: high two lanes must be 0.0.
    #[simd_test(enable = "avx")]
    const fn test_mm256_zextpd128_pd256() {
        let a = _mm_setr_pd(1., 2.);
        let r = _mm256_zextpd128_pd256(a);
        let e = _mm256_setr_pd(1., 2., 0., 0.);
        assert_eq_m256d(r, e);
    }
5076
    // `_mm256_set_m128(hi, lo)` puts `lo` in the low 128 bits and `hi` in the
    // high 128 bits, matching the element order of the setr expectation.
    #[simd_test(enable = "avx")]
    const fn test_mm256_set_m128() {
        let hi = _mm_setr_ps(5., 6., 7., 8.);
        let lo = _mm_setr_ps(1., 2., 3., 4.);
        let r = _mm256_set_m128(hi, lo);
        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, e);
    }

    // f64 variant: (hi, lo) halves combine into lo-then-hi lane order.
    #[simd_test(enable = "avx")]
    const fn test_mm256_set_m128d() {
        let hi = _mm_setr_pd(3., 4.);
        let lo = _mm_setr_pd(1., 2.);
        let r = _mm256_set_m128d(hi, lo);
        let e = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, e);
    }

    // Integer variant: bytes 1..16 form the low half, 17..32 the high half.
    #[simd_test(enable = "avx")]
    const fn test_mm256_set_m128i() {
        #[rustfmt::skip]
        let hi = _mm_setr_epi8(
            17, 18, 19, 20,
            21, 22, 23, 24,
            25, 26, 27, 28,
            29, 30, 31, 32,
        );
        #[rustfmt::skip]
        let lo = _mm_setr_epi8(
            1, 2, 3, 4,
            5, 6, 7, 8,
            9, 10, 11, 12,
            13, 14, 15, 16,
        );
        let r = _mm256_set_m128i(hi, lo);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m256i(r, e);
    }

    // `_mm256_setr_m128(lo, hi)` takes the halves in memory order (low first).
    #[simd_test(enable = "avx")]
    const fn test_mm256_setr_m128() {
        let lo = _mm_setr_ps(1., 2., 3., 4.);
        let hi = _mm_setr_ps(5., 6., 7., 8.);
        let r = _mm256_setr_m128(lo, hi);
        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, e);
    }

    // f64 variant of the memory-order (lo, hi) combiner.
    #[simd_test(enable = "avx")]
    const fn test_mm256_setr_m128d() {
        let lo = _mm_setr_pd(1., 2.);
        let hi = _mm_setr_pd(3., 4.);
        let r = _mm256_setr_m128d(lo, hi);
        let e = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, e);
    }

    // Integer variant of the memory-order (lo, hi) combiner.
    #[simd_test(enable = "avx")]
    const fn test_mm256_setr_m128i() {
        #[rustfmt::skip]
        let lo = _mm_setr_epi8(
            1, 2, 3, 4,
            5, 6, 7, 8,
            9, 10, 11, 12,
            13, 14, 15, 16,
        );
        #[rustfmt::skip]
        let hi = _mm_setr_epi8(
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm256_setr_m128i(lo, hi);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m256i(r, e);
    }
5164
    // Two unaligned 128-bit loads combined into one 256-bit vector: the `lo`
    // pointer fills the low half, the `hi` pointer the high half.
    #[simd_test(enable = "avx")]
    const fn test_mm256_loadu2_m128() {
        let hi = &[5., 6., 7., 8.];
        let hiaddr = hi.as_ptr();
        let lo = &[1., 2., 3., 4.];
        let loaddr = lo.as_ptr();
        let r = unsafe { _mm256_loadu2_m128(hiaddr, loaddr) };
        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, e);
    }

    // f64 variant of the split two-pointer load.
    #[simd_test(enable = "avx")]
    const fn test_mm256_loadu2_m128d() {
        let hi = &[3., 4.];
        let hiaddr = hi.as_ptr();
        let lo = &[1., 2.];
        let loaddr = lo.as_ptr();
        let r = unsafe { _mm256_loadu2_m128d(hiaddr, loaddr) };
        let e = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, e);
    }

    // Integer variant: loads two separate __m128i values through raw pointers
    // and concatenates them as (lo, hi).
    #[simd_test(enable = "avx")]
    const fn test_mm256_loadu2_m128i() {
        #[rustfmt::skip]
        let hi = _mm_setr_epi8(
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        #[rustfmt::skip]
        let lo = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
        );
        let r = unsafe {
            _mm256_loadu2_m128i(ptr::addr_of!(hi) as *const _, ptr::addr_of!(lo) as *const _)
        };
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m256i(r, e);
    }
5211
    // Split store: the low 128 bits of `a` go to the `lo` pointer and the
    // high 128 bits to the `hi` pointer (note the hi-then-lo argument order).
    #[simd_test(enable = "avx")]
    const fn test_mm256_storeu2_m128() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let mut hi = _mm_undefined_ps();
        let mut lo = _mm_undefined_ps();
        unsafe {
            _mm256_storeu2_m128(
                ptr::addr_of_mut!(hi) as *mut f32,
                ptr::addr_of_mut!(lo) as *mut f32,
                a,
            );
        }
        assert_eq_m128(hi, _mm_setr_ps(5., 6., 7., 8.));
        assert_eq_m128(lo, _mm_setr_ps(1., 2., 3., 4.));
    }

    // f64 variant of the split two-pointer store.
    #[simd_test(enable = "avx")]
    const fn test_mm256_storeu2_m128d() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let mut hi = _mm_undefined_pd();
        let mut lo = _mm_undefined_pd();
        unsafe {
            _mm256_storeu2_m128d(
                ptr::addr_of_mut!(hi) as *mut f64,
                ptr::addr_of_mut!(lo) as *mut f64,
                a,
            );
        }
        assert_eq_m128d(hi, _mm_setr_pd(3., 4.));
        assert_eq_m128d(lo, _mm_setr_pd(1., 2.));
    }

    // Integer variant: bytes 1..16 land at `lo`, bytes 17..32 at `hi`.
    #[simd_test(enable = "avx")]
    const fn test_mm256_storeu2_m128i() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let mut hi = _mm_undefined_si128();
        let mut lo = _mm_undefined_si128();
        unsafe {
            _mm256_storeu2_m128i(ptr::addr_of_mut!(hi), ptr::addr_of_mut!(lo), a);
        }
        #[rustfmt::skip]
        let e_hi = _mm_setr_epi8(
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32
        );
        #[rustfmt::skip]
        let e_lo = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16
        );

        assert_eq_m128i(hi, e_hi);
        assert_eq_m128i(lo, e_lo);
    }
5272
5273    #[simd_test(enable = "avx")]
5274    const fn test_mm256_cvtss_f32() {
5275        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5276        let r = _mm256_cvtss_f32(a);
5277        assert_eq!(r, 1.);
5278    }
5279}