core/stdarch/crates/core_arch/src/x86/
avx.rs

1//! Advanced Vector Extensions (AVX)
2//!
3//! The references are:
4//!
5//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
6//!   Instruction Set Reference, A-Z][intel64_ref]. - [AMD64 Architecture
7//!   Programmer's Manual, Volume 3: General-Purpose and System
8//!   Instructions][amd64_ref].
9//!
10//! [Wikipedia][wiki] provides a quick overview of the instructions available.
11//!
12//! [intel64_ref]: https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
13//! [amd64_ref]: https://docs.amd.com/v/u/en-US/24594_3.37
14//! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
15
16use crate::{
17    core_arch::{simd::*, x86::*},
18    intrinsics::simd::*,
19    mem, ptr,
20};
21
22#[cfg(test)]
23use stdarch_test::assert_instr;
24
/// Adds packed double-precision (64-bit) floating-point elements
/// in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d {
    // SAFETY: `simd_add` is defined for all f64x4 inputs; the `avx`
    // requirement is enforced on callers by `#[target_feature]`.
    unsafe { simd_add(a, b) }
}
37
/// Adds packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 {
    // SAFETY: `simd_add` is defined for all f32x8 inputs; the `avx`
    // requirement is enforced on callers by `#[target_feature]`.
    unsafe { simd_add(a, b) }
}
50
/// Computes the bitwise AND of a packed double-precision (64-bit)
/// floating-point elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_pd)
#[inline]
#[target_feature(enable = "avx")]
// See https://github.com/rust-lang/stdarch/issues/71
#[cfg_attr(test, assert_instr(vandp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d {
    // The AND is performed on the raw bits, so the operands are viewed as
    // integer vectors for the duration of the operation.
    // SAFETY: `__m256d` and `u64x4` are both plain 256-bit vectors, so the
    // transmutes are valid for every bit pattern.
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_and(a, b))
    }
}
68
/// Computes the bitwise AND of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 {
    // The AND is performed on the raw bits, so the operands are viewed as
    // integer vectors for the duration of the operation.
    // SAFETY: `__m256` and `u32x8` are both plain 256-bit vectors, so the
    // transmutes are valid for every bit pattern.
    unsafe {
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_and(a, b))
    }
}
85
/// Computes the bitwise OR packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_pd)
#[inline]
#[target_feature(enable = "avx")]
// See <https://github.com/rust-lang/stdarch/issues/71>.
#[cfg_attr(test, assert_instr(vorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d {
    // The OR is performed on the raw bits, so the operands are viewed as
    // integer vectors for the duration of the operation.
    // SAFETY: `__m256d` and `u64x4` are both plain 256-bit vectors, so the
    // transmutes are valid for every bit pattern.
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_or(a, b))
    }
}
103
/// Computes the bitwise OR packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 {
    // The OR is performed on the raw bits, so the operands are viewed as
    // integer vectors for the duration of the operation.
    // SAFETY: `__m256` and `u32x8` are both plain 256-bit vectors, so the
    // transmutes are valid for every bit pattern.
    unsafe {
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_or(a, b))
    }
}
120
/// Shuffles double-precision (64-bit) floating-point elements within 128-bit
/// lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_shuffle_pd<const MASK: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(MASK, 8);
    // Shuffle indices 0..=3 pick from `a`, 4..=7 pick from `b`. Result lanes
    // 0 and 1 come from the low 128-bit halves (bit 0 picks a[0]/a[1], bit 1
    // picks b[0]/b[1]); lanes 2 and 3 come from the high halves (bits 2 and
    // 3, offsets +2 for `a` and +6 for `b`).
    // SAFETY: each index is (bit & 1) plus an offset <= 6, so always < 8.
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                MASK as u32 & 0b1,
                ((MASK as u32 >> 1) & 0b1) + 4,
                ((MASK as u32 >> 2) & 0b1) + 2,
                ((MASK as u32 >> 3) & 0b1) + 6,
            ],
        )
    }
}
146
/// Shuffles single-precision (32-bit) floating-point elements in `a` within
/// 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_shuffle_ps<const MASK: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(MASK, 8);
    // Shuffle indices 0..=7 pick from `a`, 8..=15 pick from `b`. Each 128-bit
    // lane gets two elements of `a` (selected by MASK bits 0-3) followed by
    // two elements of `b` (selected by MASK bits 4-7); the `+4`/`+12` offsets
    // repeat the same selection in the high lane.
    // SAFETY: each index is (2-bit field) plus an offset <= 12, so always < 16.
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                MASK as u32 & 0b11,
                (MASK as u32 >> 2) & 0b11,
                ((MASK as u32 >> 4) & 0b11) + 8,
                ((MASK as u32 >> 6) & 0b11) + 8,
                (MASK as u32 & 0b11) + 4,
                ((MASK as u32 >> 2) & 0b11) + 4,
                ((MASK as u32 >> 4) & 0b11) + 12,
                ((MASK as u32 >> 6) & 0b11) + 12,
            ],
        )
    }
}
176
/// Computes the bitwise NOT of packed double-precision (64-bit) floating-point
/// elements in `a`, and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandnp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d {
    // Computed on the raw bits as `(!a) & b`, where NOT is expressed as a
    // XOR against an all-ones vector.
    // SAFETY: `__m256d` and `u64x4` are both plain 256-bit vectors, so the
    // transmutes are valid for every bit pattern.
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_and(simd_xor(u64x4::splat(!(0_u64)), a), b))
    }
}
193
/// Computes the bitwise NOT of packed single-precision (32-bit) floating-point
/// elements in `a`
/// and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 {
    // Computed on the raw bits as `(!a) & b`, where NOT is expressed as a
    // XOR against an all-ones vector.
    // SAFETY: `__m256` and `u32x8` are both plain 256-bit vectors, so the
    // transmutes are valid for every bit pattern.
    unsafe {
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_and(simd_xor(u32x8::splat(!(0_u32)), a), b))
    }
}
211
/// Compares packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and returns packed maximum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaxpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d {
    // Delegates to the raw `vmaxpd` intrinsic so the exact x86 instruction
    // semantics (including its NaN/signed-zero handling) are preserved.
    // SAFETY: `#[target_feature]` guarantees AVX is available in callers.
    unsafe { vmaxpd(a, b) }
}
223
/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns packed maximum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaxps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 {
    // Delegates to the raw `vmaxps` intrinsic so the exact x86 instruction
    // semantics (including its NaN/signed-zero handling) are preserved.
    // SAFETY: `#[target_feature]` guarantees AVX is available in callers.
    unsafe { vmaxps(a, b) }
}
235
/// Compares packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and returns packed minimum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vminpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d {
    // Delegates to the raw `vminpd` intrinsic so the exact x86 instruction
    // semantics (including its NaN/signed-zero handling) are preserved.
    // SAFETY: `#[target_feature]` guarantees AVX is available in callers.
    unsafe { vminpd(a, b) }
}
247
/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns packed minimum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vminps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 {
    // Delegates to the raw `vminps` intrinsic so the exact x86 instruction
    // semantics (including its NaN/signed-zero handling) are preserved.
    // SAFETY: `#[target_feature]` guarantees AVX is available in callers.
    unsafe { vminps(a, b) }
}
259
/// Multiplies packed double-precision (64-bit) floating-point elements
/// in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmulpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d {
    // SAFETY: `simd_mul` is defined for all f64x4 inputs; the `avx`
    // requirement is enforced on callers by `#[target_feature]`.
    unsafe { simd_mul(a, b) }
}
272
/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmulps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 {
    // SAFETY: `simd_mul` is defined for all f32x8 inputs; the `avx`
    // requirement is enforced on callers by `#[target_feature]`.
    unsafe { simd_mul(a, b) }
}
285
/// Alternatively adds and subtracts packed double-precision (64-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d {
    // Compute both the full sum and the full difference, then interleave:
    // even result lanes take the difference, odd lanes take the sum.
    // SAFETY: shuffle indices [4, 1, 6, 3] are all < 8, the valid range for
    // a shuffle of two 4-element vectors (0..4 => add, 4..8 => sub).
    unsafe {
        let a = a.as_f64x4();
        let b = b.as_f64x4();
        let add = simd_add(a, b);
        let sub = simd_sub(a, b);
        simd_shuffle!(add, sub, [4, 1, 6, 3])
    }
}
304
/// Alternatively adds and subtracts packed single-precision (32-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 {
    // Compute both the full sum and the full difference, then interleave:
    // even result lanes take the difference, odd lanes take the sum.
    // SAFETY: all shuffle indices are < 16, the valid range for a shuffle of
    // two 8-element vectors (0..8 => add, 8..16 => sub).
    unsafe {
        let a = a.as_f32x8();
        let b = b.as_f32x8();
        let add = simd_add(a, b);
        let sub = simd_sub(a, b);
        simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7])
    }
}
323
/// Subtracts packed double-precision (64-bit) floating-point elements in `b`
/// from packed elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d {
    // SAFETY: `simd_sub` is defined for all f64x4 inputs; the `avx`
    // requirement is enforced on callers by `#[target_feature]`.
    unsafe { simd_sub(a, b) }
}
336
/// Subtracts packed single-precision (32-bit) floating-point elements in `b`
/// from packed elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 {
    // SAFETY: `simd_sub` is defined for all f32x8 inputs; the `avx`
    // requirement is enforced on callers by `#[target_feature]`.
    unsafe { simd_sub(a, b) }
}
349
/// Computes the division of each of the 8 packed 32-bit floating-point elements
/// in `a` by the corresponding packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdivps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 {
    // SAFETY: `simd_div` is defined for all f32x8 inputs (floating-point
    // division cannot trap); AVX is guaranteed by `#[target_feature]`.
    unsafe { simd_div(a, b) }
}
362
/// Computes the division of each of the 4 packed 64-bit floating-point elements
/// in `a` by the corresponding packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdivpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d {
    // SAFETY: `simd_div` is defined for all f64x4 inputs (floating-point
    // division cannot trap); AVX is guaranteed by `#[target_feature]`.
    unsafe { simd_div(a, b) }
}
375
/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
///
/// - `0x00`: Round to the nearest whole number.
/// - `0x01`: Round down, toward negative infinity.
/// - `0x02`: Round up, toward positive infinity.
/// - `0x03`: Truncate the values.
///
/// For a complete list of options, check [the LLVM docs][llvm_docs].
///
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd, ROUNDING = 0x3))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_round_pd<const ROUNDING: i32>(a: __m256d) -> __m256d {
    // The immediate is validated at compile time to fit in 4 bits.
    static_assert_uimm_bits!(ROUNDING, 4);
    // SAFETY: `ROUNDING` is a valid 4-bit immediate for `vroundpd`; AVX is
    // guaranteed by `#[target_feature]`.
    unsafe { roundpd256(a, ROUNDING) }
}
398
/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// toward positive infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_ceil_pd(a: __m256d) -> __m256d {
    // SAFETY: `simd_ceil` is defined for all f64x4 inputs; AVX is guaranteed
    // by `#[target_feature]`.
    unsafe { simd_ceil(a) }
}
411
/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// toward negative infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_floor_pd(a: __m256d) -> __m256d {
    // SAFETY: `simd_floor` is defined for all f64x4 inputs; AVX is
    // guaranteed by `#[target_feature]`.
    unsafe { simd_floor(a) }
}
424
/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
///
/// - `0x00`: Round to the nearest whole number.
/// - `0x01`: Round down, toward negative infinity.
/// - `0x02`: Round up, toward positive infinity.
/// - `0x03`: Truncate the values.
///
/// For a complete list of options, check [the LLVM docs][llvm_docs].
///
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps, ROUNDING = 0x00))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_round_ps<const ROUNDING: i32>(a: __m256) -> __m256 {
    // The immediate is validated at compile time to fit in 4 bits.
    static_assert_uimm_bits!(ROUNDING, 4);
    // SAFETY: `ROUNDING` is a valid 4-bit immediate for `vroundps`; AVX is
    // guaranteed by `#[target_feature]`.
    unsafe { roundps256(a, ROUNDING) }
}
447
/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// toward positive infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_ceil_ps(a: __m256) -> __m256 {
    // SAFETY: `simd_ceil` is defined for all f32x8 inputs; AVX is guaranteed
    // by `#[target_feature]`.
    unsafe { simd_ceil(a) }
}
460
/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// toward negative infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_floor_ps(a: __m256) -> __m256 {
    // SAFETY: `simd_floor` is defined for all f32x8 inputs; AVX is
    // guaranteed by `#[target_feature]`.
    unsafe { simd_floor(a) }
}
473
/// Returns the square root of packed single-precision (32-bit) floating point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sqrt_ps(a: __m256) -> __m256 {
    // SAFETY: `simd_fsqrt` is defined for all f32x8 inputs; AVX is
    // guaranteed by `#[target_feature]`.
    unsafe { simd_fsqrt(a) }
}
485
/// Returns the square root of packed double-precision (64-bit) floating point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsqrtpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sqrt_pd(a: __m256d) -> __m256d {
    // SAFETY: `simd_fsqrt` is defined for all f64x4 inputs; AVX is
    // guaranteed by `#[target_feature]`.
    unsafe { simd_fsqrt(a) }
}
497
/// Blends packed double-precision (64-bit) floating-point elements from
/// `a` and `b` using control mask `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_pd)
#[inline]
#[target_feature(enable = "avx")]
// Note: LLVM7 prefers single-precision blend instructions when
// possible, see: https://bugs.llvm.org/show_bug.cgi?id=38194
// #[cfg_attr(test, assert_instr(vblendpd, imm8 = 9))]
#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_blend_pd<const IMM4: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM4, 4);
    // Lane i of the result comes from `b` (shuffle index i + 4) when bit i
    // of IMM4 is set, otherwise from `a` (shuffle index i).
    // SAFETY: each index is (bit & 1) * 4 + i with i < 4, so always < 8.
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                ((IMM4 as u32 >> 0) & 1) * 4 + 0,
                ((IMM4 as u32 >> 1) & 1) * 4 + 1,
                ((IMM4 as u32 >> 2) & 1) * 4 + 2,
                ((IMM4 as u32 >> 3) & 1) * 4 + 3,
            ],
        )
    }
}
526
/// Blends packed single-precision (32-bit) floating-point elements from
/// `a` and `b` using control mask `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_blend_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    // Lane i of the result comes from `b` (shuffle index i + 8) when bit i
    // of IMM8 is set, otherwise from `a` (shuffle index i).
    // SAFETY: each index is (bit & 1) * 8 + i with i < 8, so always < 16.
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                ((IMM8 as u32 >> 0) & 1) * 8 + 0,
                ((IMM8 as u32 >> 1) & 1) * 8 + 1,
                ((IMM8 as u32 >> 2) & 1) * 8 + 2,
                ((IMM8 as u32 >> 3) & 1) * 8 + 3,
                ((IMM8 as u32 >> 4) & 1) * 8 + 4,
                ((IMM8 as u32 >> 5) & 1) * 8 + 5,
                ((IMM8 as u32 >> 6) & 1) * 8 + 6,
                ((IMM8 as u32 >> 7) & 1) * 8 + 7,
            ],
        )
    }
}
556
/// Blends packed double-precision (64-bit) floating-point elements from
/// `a` and `b` using `c` as a mask.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
    // An element is taken from `b` exactly when the sign bit of the
    // corresponding element of `c` is set (i.e. `c` viewed as i64 is < 0),
    // otherwise from `a`.
    // SAFETY: `__m256d` and `i64x4` are both plain 256-bit vectors, so the
    // transmutes are valid for every bit pattern.
    unsafe {
        let mask: i64x4 = simd_lt(transmute::<_, i64x4>(c), i64x4::ZERO);
        transmute(simd_select(mask, b.as_f64x4(), a.as_f64x4()))
    }
}
572
/// Blends packed single-precision (32-bit) floating-point elements from
/// `a` and `b` using `c` as a mask.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
    // An element is taken from `b` exactly when the sign bit of the
    // corresponding element of `c` is set (i.e. `c` viewed as i32 is < 0),
    // otherwise from `a`.
    // SAFETY: `__m256` and `i32x8` are both plain 256-bit vectors, so the
    // transmutes are valid for every bit pattern.
    unsafe {
        let mask: i32x8 = simd_lt(transmute::<_, i32x8>(c), i32x8::ZERO);
        transmute(simd_select(mask, b.as_f32x8(), a.as_f32x8()))
    }
}
588
/// Conditionally multiplies the packed single-precision (32-bit) floating-point
/// elements in `a` and `b` using the high 4 bits in `imm8`,
/// sum the four products, and conditionally return the sum
///  using the low 4 bits of `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdpps, IMM8 = 0x0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_dp_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    // The immediate is validated at compile time to fit in 8 bits, so the
    // `as i8` cast below cannot lose information relevant to the instruction.
    static_assert_uimm_bits!(IMM8, 8);
    // SAFETY: delegates to the `vdpps` intrinsic with a validated 8-bit
    // immediate; AVX is guaranteed by `#[target_feature]`.
    unsafe { vdpps(a, b, IMM8 as i8) }
}
604
/// Horizontal addition of adjacent pairs in the two packed vectors
/// of 4 64-bit floating points `a` and `b`.
/// In the result, sums of elements from `a` are returned in even locations,
/// while sums of elements from `b` are returned in odd locations.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhaddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
    // Gather the first element of each adjacent pair into `even` and the
    // second into `odd` (interleaving a and b per 128-bit lane), then add:
    // result = [a0+a1, b0+b1, a2+a3, b2+b3].
    // SAFETY: all shuffle indices are < 8, the valid range for a shuffle of
    // two 4-element vectors.
    unsafe {
        let even = simd_shuffle!(a, b, [0, 4, 2, 6]);
        let odd = simd_shuffle!(a, b, [1, 5, 3, 7]);
        simd_add(even, odd)
    }
}
623
/// Horizontal addition of adjacent pairs in the two packed vectors
/// of 8 32-bit floating points `a` and `b`.
/// In the result, sums of elements from `a` are returned in locations of
/// indices 0, 1, 4, 5; while sums of elements from `b` are locations
/// 2, 3, 6, 7.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhaddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
    // Gather the first element of each adjacent pair into `even` and the
    // second into `odd` (two pairs from `a`, then two from `b`, per 128-bit
    // lane), then add element-wise.
    // SAFETY: all shuffle indices are < 16, the valid range for a shuffle of
    // two 8-element vectors.
    unsafe {
        let even = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
        let odd = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
        simd_add(even, odd)
    }
}
643
/// Horizontal subtraction of adjacent pairs in the two packed vectors
/// of 4 64-bit floating points `a` and `b`.
/// In the result, sums of elements from `a` are returned in even locations,
/// while sums of elements from `b` are returned in odd locations.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
    // Same interleave as hadd, but the second element of each pair is
    // subtracted from the first: result = [a0-a1, b0-b1, a2-a3, b2-b3].
    // SAFETY: all shuffle indices are < 8, the valid range for a shuffle of
    // two 4-element vectors.
    unsafe {
        let even = simd_shuffle!(a, b, [0, 4, 2, 6]);
        let odd = simd_shuffle!(a, b, [1, 5, 3, 7]);
        simd_sub(even, odd)
    }
}
662
/// Horizontal subtraction of adjacent pairs in the two packed vectors
/// of 8 32-bit floating points `a` and `b`.
/// In the result, sums of elements from `a` are returned in locations of
/// indices 0, 1, 4, 5; while sums of elements from `b` are locations
/// 2, 3, 6, 7.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 {
    // Same interleave as hadd, but the second element of each pair is
    // subtracted from the first.
    // SAFETY: all shuffle indices are < 16, the valid range for a shuffle of
    // two 8-element vectors.
    unsafe {
        let even = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
        let odd = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
        simd_sub(even, odd)
    }
}
682
/// Computes the bitwise XOR of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d {
    // The XOR is performed on the raw bits, so the operands are viewed as
    // integer vectors for the duration of the operation.
    // SAFETY: `__m256d` and `u64x4` are both plain 256-bit vectors, so the
    // transmutes are valid for every bit pattern.
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_xor(a, b))
    }
}
699
/// Computes the bitwise XOR of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_xor_ps(a: __m256, b: __m256) -> __m256 {
    // The XOR is performed on the raw bits, so the operands are viewed as
    // integer vectors for the duration of the operation.
    // SAFETY: `__m256` and `u32x8` are both plain 256-bit vectors, so the
    // transmutes are valid for every bit pattern.
    unsafe {
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_xor(a, b))
    }
}
716
// Intel `_CMP_*` comparison-predicate constants. NOTE(review): these appear
// to be the immediate operands for the AVX compare intrinsics (`_mm*_cmp_*`)
// — confirm against the rest of this file. Naming convention per the doc
// comments below: "ordered"/"unordered" describes NaN handling, and
// "signaling"/"non-signaling" whether a quiet NaN operand raises an invalid
// exception.

/// Equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_OQ: i32 = 0x00;
/// Less-than (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LT_OS: i32 = 0x01;
/// Less-than-or-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LE_OS: i32 = 0x02;
/// Unordered (non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_UNORD_Q: i32 = 0x03;
/// Not-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_UQ: i32 = 0x04;
/// Not-less-than (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLT_US: i32 = 0x05;
/// Not-less-than-or-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLE_US: i32 = 0x06;
/// Ordered (non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_ORD_Q: i32 = 0x07;
/// Equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_UQ: i32 = 0x08;
/// Not-greater-than-or-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGE_US: i32 = 0x09;
/// Not-greater-than (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGT_US: i32 = 0x0a;
/// False (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_FALSE_OQ: i32 = 0x0b;
/// Not-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_OQ: i32 = 0x0c;
/// Greater-than-or-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GE_OS: i32 = 0x0d;
/// Greater-than (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GT_OS: i32 = 0x0e;
/// True (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_TRUE_UQ: i32 = 0x0f;
/// Equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_OS: i32 = 0x10;
/// Less-than (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LT_OQ: i32 = 0x11;
/// Less-than-or-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LE_OQ: i32 = 0x12;
/// Unordered (signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_UNORD_S: i32 = 0x13;
/// Not-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_US: i32 = 0x14;
/// Not-less-than (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLT_UQ: i32 = 0x15;
/// Not-less-than-or-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLE_UQ: i32 = 0x16;
/// Ordered (signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_ORD_S: i32 = 0x17;
/// Equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_US: i32 = 0x18;
/// Not-greater-than-or-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGE_UQ: i32 = 0x19;
795/// Not-greater-than (unordered, non-signaling)
796#[stable(feature = "simd_x86", since = "1.27.0")]
797pub const _CMP_NGT_UQ: i32 = 0x1a;
798/// False (ordered, signaling)
799#[stable(feature = "simd_x86", since = "1.27.0")]
800pub const _CMP_FALSE_OS: i32 = 0x1b;
801/// Not-equal (ordered, signaling)
802#[stable(feature = "simd_x86", since = "1.27.0")]
803pub const _CMP_NEQ_OS: i32 = 0x1c;
804/// Greater-than-or-equal (ordered, non-signaling)
805#[stable(feature = "simd_x86", since = "1.27.0")]
806pub const _CMP_GE_OQ: i32 = 0x1d;
807/// Greater-than (ordered, non-signaling)
808#[stable(feature = "simd_x86", since = "1.27.0")]
809pub const _CMP_GT_OQ: i32 = 0x1e;
810/// True (unordered, signaling)
811#[stable(feature = "simd_x86", since = "1.27.0")]
812pub const _CMP_TRUE_US: i32 = 0x1f;
813
/// Compares packed double-precision (64-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// `IMM5` is a 5-bit comparison predicate; use one of the `_CMP_*`
/// constants defined in this module (e.g. [`_CMP_EQ_OQ`]).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmp_pd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM5, 5);
    // The inline `const` block guarantees the narrowing cast happens at
    // compile time, so the LLVM intrinsic receives a constant immediate.
    unsafe { vcmppd(a, b, const { IMM5 as i8 }) }
}
828
829/// Compares packed double-precision (64-bit) floating-point
830/// elements in `a` and `b` based on the comparison operand
831/// specified by `IMM5`.
832///
833/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_pd)
834#[inline]
835#[target_feature(enable = "avx")]
836#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
837#[rustc_legacy_const_generics(2)]
838#[stable(feature = "simd_x86", since = "1.27.0")]
839pub fn _mm256_cmp_pd<const IMM5: i32>(a: __m256d, b: __m256d) -> __m256d {
840    static_assert_uimm_bits!(IMM5, 5);
841    unsafe { vcmppd256(a, b, IMM5 as u8) }
842}
843
/// Compares packed single-precision (32-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// `IMM5` is a 5-bit comparison predicate; use one of the `_CMP_*`
/// constants defined in this module (e.g. [`_CMP_EQ_OQ`]).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmp_ps<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM5, 5);
    // The inline `const` block guarantees the narrowing cast happens at
    // compile time, so the LLVM intrinsic receives a constant immediate.
    unsafe { vcmpps(a, b, const { IMM5 as i8 }) }
}
858
/// Compares packed single-precision (32-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// `IMM5` is a 5-bit comparison predicate; use one of the `_CMP_*`
/// constants defined in this module (e.g. [`_CMP_EQ_OQ`]).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmp_ps<const IMM5: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM5, 5);
    // The inline `const` block guarantees the narrowing cast happens at
    // compile time, so the LLVM intrinsic receives a constant immediate.
    unsafe { vcmpps256(a, b, const { IMM5 as u8 }) }
}
873
874/// Compares the lower double-precision (64-bit) floating-point element in
875/// `a` and `b` based on the comparison operand specified by `IMM5`,
876/// store the result in the lower element of returned vector,
877/// and copies the upper element from `a` to the upper element of returned
878/// vector.
879///
880/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_sd)
881#[inline]
882#[target_feature(enable = "avx")]
883#[cfg_attr(test, assert_instr(vcmpeqsd, IMM5 = 0))] // TODO Validate vcmpsd
884#[rustc_legacy_const_generics(2)]
885#[stable(feature = "simd_x86", since = "1.27.0")]
886pub fn _mm_cmp_sd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
887    static_assert_uimm_bits!(IMM5, 5);
888    unsafe { vcmpsd(a, b, IMM5 as i8) }
889}
890
891/// Compares the lower single-precision (32-bit) floating-point element in
892/// `a` and `b` based on the comparison operand specified by `IMM5`,
893/// store the result in the lower element of returned vector,
894/// and copies the upper 3 packed elements from `a` to the upper elements of
895/// returned vector.
896///
897/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ss)
898#[inline]
899#[target_feature(enable = "avx")]
900#[cfg_attr(test, assert_instr(vcmpeqss, IMM5 = 0))] // TODO Validate vcmpss
901#[rustc_legacy_const_generics(2)]
902#[stable(feature = "simd_x86", since = "1.27.0")]
903pub fn _mm_cmp_ss<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
904    static_assert_uimm_bits!(IMM5, 5);
905    unsafe { vcmpss(a, b, IMM5 as i8) }
906}
907
/// Converts packed 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements.
///
/// This is a widening conversion: every `i32` value is exactly representable
/// as an `f64`, so no rounding occurs.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtdq2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d {
    // Reinterpret as 4 lanes of i32 and let the portable cast widen to f64x4.
    unsafe { simd_cast(a.as_i32x4()) }
}
920
/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtdq2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 {
    // Reinterpret as 8 lanes of i32 and let the portable cast produce f32x8.
    unsafe { simd_cast(a.as_i32x8()) }
}
933
/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed single-precision (32-bit) floating-point elements.
///
/// The four `f64` lanes narrow to four `f32` lanes in a 128-bit result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtpd2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtpd_ps(a: __m256d) -> __m128 {
    unsafe { simd_cast(a) }
}
946
/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// Lowers to `vcvtps2dq`, which per Intel's documentation rounds according
/// to the current MXCSR rounding mode (round-to-nearest-even by default).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtps_epi32(a: __m256) -> __m256i {
    unsafe { transmute(vcvtps2dq(a)) }
}
958
/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed double-precision (64-bit) floating-point elements.
///
/// This is a widening conversion (`f32` -> `f64`), so it is exact.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtps2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtps_pd(a: __m128) -> __m256d {
    unsafe { simd_cast(a) }
}
971
/// Returns the first element of the input vector of `[4 x double]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsd_f64)
#[inline]
#[target_feature(enable = "avx")]
//#[cfg_attr(test, assert_instr(movsd))] FIXME
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtsd_f64(a: __m256d) -> f64 {
    // Extract lane 0 (the lowest 64 bits of the vector).
    unsafe { simd_extract!(a, 0) }
}
983
/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation.
///
/// "With truncation" means rounding toward zero (the extra `t` in
/// `vcvttpd2dq`), independent of the current rounding mode.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttpd_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvttpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i {
    unsafe { transmute(vcvttpd2dq(a)) }
}
995
/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// Lowers to `vcvtpd2dq`, which per Intel's documentation rounds according
/// to the current MXCSR rounding mode (round-to-nearest-even by default).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i {
    unsafe { transmute(vcvtpd2dq(a)) }
}
1007
/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation.
///
/// "With truncation" means rounding toward zero (the extra `t` in
/// `vcvttps2dq`), independent of the current rounding mode.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttps_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvttps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvttps_epi32(a: __m256) -> __m256i {
    unsafe { transmute(vcvttps2dq(a)) }
}
1019
/// Extracts 128 bits (composed of 4 packed single-precision (32-bit)
/// floating-point elements) from `a`, selected with `imm8`.
///
/// `IMM1 == 0` selects the lower half of `a`; `IMM1 == 1` the upper half.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_extractf128_ps<const IMM1: i32>(a: __m256) -> __m128 {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        // All selected indices (0..=7) refer to lanes of `a`, so the second
        // shuffle operand (`_mm256_undefined_ps()`) is never actually read.
        simd_shuffle!(
            a,
            _mm256_undefined_ps(),
            [[0, 1, 2, 3], [4, 5, 6, 7]][IMM1 as usize],
        )
    }
}
1040
/// Extracts 128 bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `a`, selected with `imm8`.
///
/// `IMM1 == 0` selects the lower half of `a`; `IMM1 == 1` the upper half.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_extractf128_pd<const IMM1: i32>(a: __m256d) -> __m128d {
    static_assert_uimm_bits!(IMM1, 1);
    // Indices 0..=3 all refer to lanes of `a`; the undefined second operand
    // is never read.
    unsafe { simd_shuffle!(a, _mm256_undefined_pd(), [[0, 1], [2, 3]][IMM1 as usize]) }
}
1055
/// Extracts 128 bits (composed of integer data) from `a`, selected with `imm8`.
///
/// `IMM1 == 0` selects the lower half of `a`; `IMM1 == 1` the upper half.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_extractf128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        // View `a` as 4 x i64 and pick either the low pair [0, 1] or the
        // high pair [2, 3]; the zero vector operand is never selected.
        let dst: i64x2 = simd_shuffle!(a.as_i64x4(), i64x4::ZERO, [[0, 1], [2, 3]][IMM1 as usize],);
        transmute(dst)
    }
}
1072
/// Extracts a 32-bit integer from `a`, selected with `INDEX`.
///
/// `INDEX` must be in `0..=7` (enforced at compile time).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi32)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_extract_epi32<const INDEX: i32>(a: __m256i) -> i32 {
    static_assert_uimm_bits!(INDEX, 3);
    unsafe { simd_extract!(a.as_i32x8(), INDEX as u32) }
}
1086
/// Returns the first element of the input vector of `[8 x i32]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsi256_si32)
#[inline]
#[target_feature(enable = "avx")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtsi256_si32(a: __m256i) -> i32 {
    // Extract lane 0 (the lowest 32 bits of the vector).
    unsafe { simd_extract!(a.as_i32x8(), 0) }
}
1097
/// Zeroes the contents of all XMM or YMM registers.
///
/// This only affects CPU register state; it takes no arguments and returns
/// nothing.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroall)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vzeroall))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_zeroall() {
    unsafe { vzeroall() }
}
1108
/// Zeroes the upper 128 bits of all YMM registers;
/// the lower 128-bits of the registers are unmodified.
///
/// This only affects CPU register state; it takes no arguments and returns
/// nothing.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroupper)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vzeroupper))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_zeroupper() {
    unsafe { vzeroupper() }
}
1120
/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `b`.
///
/// Per Intel's documentation, only the low bits of each 32-bit element of
/// `b` select the source lane; selection never crosses a 128-bit lane.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 {
    unsafe { vpermilps256(a, b.as_i32x8()) }
}
1132
/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// using the control in `b`.
///
/// Per Intel's documentation, only the low bits of each 32-bit element of
/// `b` select the source lane.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 {
    unsafe { vpermilps(a, b.as_i32x4()) }
}
1144
/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `imm8`.
///
/// `IMM8` is split into four 2-bit fields; field `i` selects the source
/// lane for output lane `i`. The same pattern is applied to both 128-bit
/// halves (the `+ 4` indices address the upper half).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // All indices select from `a`; the undefined operand is never read.
        simd_shuffle!(
            a,
            _mm256_undefined_ps(),
            [
                (IMM8 as u32 >> 0) & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
                ((IMM8 as u32 >> 0) & 0b11) + 4,
                ((IMM8 as u32 >> 2) & 0b11) + 4,
                ((IMM8 as u32 >> 4) & 0b11) + 4,
                ((IMM8 as u32 >> 6) & 0b11) + 4,
            ],
        )
    }
}
1174
/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// using the control in `imm8`.
///
/// `IMM8` is split into four 2-bit fields; field `i` selects the source
/// lane for output lane `i`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_permute_ps<const IMM8: i32>(a: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // All indices select from `a`; the undefined operand is never read.
        simd_shuffle!(
            a,
            _mm_undefined_ps(),
            [
                (IMM8 as u32 >> 0) & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
            ],
        )
    }
}
1200
/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// within 256-bit lanes using the control in `b`.
///
/// Per Intel's documentation, bit 1 of each 64-bit element of `b` selects
/// the source element within the corresponding 128-bit lane.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d {
    unsafe { vpermilpd256(a, b.as_i64x4()) }
}
1212
/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// using the control in `b`.
///
/// Per Intel's documentation, bit 1 of each 64-bit element of `b` selects
/// the source element.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d {
    unsafe { vpermilpd(a, b.as_i64x2()) }
}
1224
/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `imm8`.
///
/// Bit `i` of `IMM4` selects the source element (within its 128-bit lane)
/// for output element `i`; bits 0-1 control the lower lane and bits 2-3 the
/// upper lane (hence the `+ 2` offsets).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, IMM4 = 0x1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_permute_pd<const IMM4: i32>(a: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM4, 4);
    unsafe {
        // All indices select from `a`; the undefined operand is never read.
        simd_shuffle!(
            a,
            _mm256_undefined_pd(),
            [
                ((IMM4 as u32 >> 0) & 1),
                ((IMM4 as u32 >> 1) & 1),
                ((IMM4 as u32 >> 2) & 1) + 2,
                ((IMM4 as u32 >> 3) & 1) + 2,
            ],
        )
    }
}
1250
/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// using the control in `imm8`.
///
/// Bit `i` of `IMM2` selects the source element for output element `i`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, IMM2 = 0x1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_permute_pd<const IMM2: i32>(a: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM2, 2);
    unsafe {
        // All indices select from `a`; the undefined operand is never read.
        simd_shuffle!(
            a,
            _mm_undefined_pd(),
            [(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1],
        )
    }
}
1271
1272/// Shuffles 256 bits (composed of 8 packed single-precision (32-bit)
1273/// floating-point elements) selected by `imm8` from `a` and `b`.
1274///
1275/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_ps)
1276#[inline]
1277#[target_feature(enable = "avx")]
1278#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x5))]
1279#[rustc_legacy_const_generics(2)]
1280#[stable(feature = "simd_x86", since = "1.27.0")]
1281#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1282pub const fn _mm256_permute2f128_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
1283    static_assert_uimm_bits!(IMM8, 8);
1284    _mm256_castsi256_ps(_mm256_permute2f128_si256::<IMM8>(
1285        _mm256_castps_si256(a),
1286        _mm256_castps_si256(b),
1287    ))
1288}
1289
1290/// Shuffles 256 bits (composed of 4 packed double-precision (64-bit)
1291/// floating-point elements) selected by `imm8` from `a` and `b`.
1292///
1293/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_pd)
1294#[inline]
1295#[target_feature(enable = "avx")]
1296#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
1297#[rustc_legacy_const_generics(2)]
1298#[stable(feature = "simd_x86", since = "1.27.0")]
1299#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1300pub const fn _mm256_permute2f128_pd<const IMM8: i32>(a: __m256d, b: __m256d) -> __m256d {
1301    static_assert_uimm_bits!(IMM8, 8);
1302    _mm256_castsi256_pd(_mm256_permute2f128_si256::<IMM8>(
1303        _mm256_castpd_si256(a),
1304        _mm256_castpd_si256(b),
1305    ))
1306}
1307
/// Shuffles 128-bits (composed of integer data) selected by `imm8`
/// from `a` and `b`.
///
/// The low nibble of `IMM8` controls the lower 128-bit lane of the result
/// and the high nibble the upper lane. Within each nibble, bits 0-1 select
/// one of the four 128-bit source lanes (a-low, a-high, b-low, b-high) and
/// bit 3 forces that output lane to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_permute2f128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // Maps output i64 position `pos` (0..4) to an index into the
    // concatenation [a, b] viewed as i64x4 each: the relevant nibble's low
    // two bits pick a 128-bit lane, scaled by 2 because each lane is two
    // i64 elements.
    const fn idx(imm8: i32, pos: u32) -> u32 {
        let part = if pos < 2 {
            imm8 & 0xf
        } else {
            (imm8 & 0xf0) >> 4
        };
        2 * (part as u32 & 0b11) + (pos & 1)
    }
    // Second pass: if bit 3 of the relevant nibble is set, index 4 selects
    // from the zero vector, zeroing that output element; otherwise keep it.
    const fn idx0(imm8: i32, pos: u32) -> u32 {
        let part = if pos < 2 {
            imm8 & 0xf
        } else {
            (imm8 & 0xf0) >> 4
        };
        if part & 0b1000 != 0 { 4 } else { pos }
    }
    unsafe {
        // First shuffle: gather the selected 128-bit lanes from a/b.
        let r = simd_shuffle!(
            a.as_i64x4(),
            b.as_i64x4(),
            [idx(IMM8, 0), idx(IMM8, 1), idx(IMM8, 2), idx(IMM8, 3)]
        );
        // Second shuffle: apply the per-nibble zeroing bits.
        let r: i64x4 = simd_shuffle!(
            r,
            i64x4::ZERO,
            [idx0(IMM8, 0), idx0(IMM8, 1), idx0(IMM8, 2), idx0(IMM8, 3)]
        );
        r.as_m256i()
    }
}
1350
/// Broadcasts a single-precision (32-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// The by-reference parameter mirrors Intel's signature (`float const *`);
/// the implementation simply splats the loaded value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ss)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcast_ss(f: &f32) -> __m256 {
    _mm256_set1_ps(*f)
}
1364
/// Broadcasts a single-precision (32-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// The by-reference parameter mirrors Intel's signature (`float const *`);
/// the implementation simply splats the loaded value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcast_ss)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcast_ss(f: &f32) -> __m128 {
    _mm_set1_ps(*f)
}
1378
/// Broadcasts a double-precision (64-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// The by-reference parameter mirrors Intel's signature (`double const *`);
/// the implementation simply splats the loaded value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_sd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcast_sd(f: &f64) -> __m256d {
    _mm256_set1_pd(*f)
}
1392
/// Broadcasts 128 bits from memory (composed of 4 packed single-precision
/// (32-bit) floating-point elements) to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcast_ps(a: &__m128) -> __m256 {
    // All indices (0..=3) select from `*a`, repeated for both halves; the
    // zero vector operand is never actually read.
    unsafe { simd_shuffle!(*a, _mm_setzero_ps(), [0, 1, 2, 3, 0, 1, 2, 3]) }
}
1405
/// Broadcasts 128 bits from memory (composed of 2 packed double-precision
/// (64-bit) floating-point elements) to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcast_pd(a: &__m128d) -> __m256d {
    // All indices (0..=1) select from `*a`, repeated for both halves; the
    // zero vector operand is never actually read.
    unsafe { simd_shuffle!(*a, _mm_setzero_pd(), [0, 1, 0, 1]) }
}
1418
/// Copies `a` to result, then inserts 128 bits (composed of 4 packed
/// single-precision (32-bit) floating-point elements) from `b` into result
/// at the location specified by `imm8`.
///
/// `IMM1 == 0` replaces the lower half with `b`; `IMM1 == 1` the upper half.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m256 {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        // `b` is widened to 256 bits; shuffle indices 0..=7 select lanes of
        // `a` and indices 8..=11 select the (defined) low lanes of widened
        // `b`.
        simd_shuffle!(
            a,
            _mm256_castps128_ps256(b),
            [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]][IMM1 as usize],
        )
    }
}
1440
/// Copies `a` to result, then inserts 128 bits (composed of 2 packed
/// double-precision (64-bit) floating-point elements) from `b` into result
/// at the location specified by `imm8`.
///
/// `IMM1 == 0` replaces the lower half with `b`; `IMM1 == 1` the upper half.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> __m256d {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        // `b` is widened to 256 bits; indices 0..=3 select lanes of `a` and
        // indices 4..=5 select the (defined) low lanes of widened `b`.
        simd_shuffle!(
            a,
            _mm256_castpd128_pd256(b),
            [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
        )
    }
}
1462
/// Copies `a` to result, then inserts 128 bits from `b` into result
/// at the location specified by `imm8`.
///
/// `IMM1 == 0` replaces the lower half with `b`; `IMM1 == 1` the upper half.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_insertf128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        // Operate on i64x4 views; `b` is widened to 256 bits. Indices 0..=3
        // select lanes of `a`, indices 4..=5 the (defined) low lanes of
        // widened `b`.
        let dst: i64x4 = simd_shuffle!(
            a.as_i64x4(),
            _mm256_castsi128_si256(b).as_i64x4(),
            [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
        );
        transmute(dst)
    }
}
1484
/// Copies `a` to result, and inserts the 8-bit integer `i` into result
/// at the location specified by `index`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi8)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_insert_epi8<const INDEX: i32>(a: __m256i, i: i8) -> __m256i {
    // 32 byte lanes in a 256-bit vector, so the index must fit in 5 bits (0..=31).
    static_assert_uimm_bits!(INDEX, 5);
    unsafe { transmute(simd_insert!(a.as_i8x32(), INDEX as u32, i)) }
}
1499
/// Copies `a` to result, and inserts the 16-bit integer `i` into result
/// at the location specified by `index`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi16)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_insert_epi16<const INDEX: i32>(a: __m256i, i: i16) -> __m256i {
    // 16 word lanes in a 256-bit vector, so the index must fit in 4 bits (0..=15).
    static_assert_uimm_bits!(INDEX, 4);
    unsafe { transmute(simd_insert!(a.as_i16x16(), INDEX as u32, i)) }
}
1514
/// Copies `a` to result, and inserts the 32-bit integer `i` into result
/// at the location specified by `index`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi32)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_insert_epi32<const INDEX: i32>(a: __m256i, i: i32) -> __m256i {
    // 8 dword lanes in a 256-bit vector, so the index must fit in 3 bits (0..=7).
    static_assert_uimm_bits!(INDEX, 3);
    unsafe { transmute(simd_insert!(a.as_i32x8(), INDEX as u32, i)) }
}
1529
/// Loads 256-bits (composed of 4 packed double-precision (64-bit)
/// floating-point elements) from memory into result.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovap)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d {
    // Plain aligned dereference; validity and 32-byte alignment are the
    // caller's obligation (see the doc comment above).
    *(mem_addr as *const __m256d)
}
1548
/// Stores 256-bits (composed of 4 packed double-precision (64-bit)
/// floating-point elements) from `a` into memory.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovap)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_store_pd(mem_addr: *mut f64, a: __m256d) {
    // Plain aligned store; validity and 32-byte alignment are the
    // caller's obligation (see the doc comment above).
    *(mem_addr as *mut __m256d) = a;
}
1567
/// Loads 256-bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) from memory into result.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256 {
    // Plain aligned dereference; validity and 32-byte alignment are the
    // caller's obligation (see the doc comment above).
    *(mem_addr as *const __m256)
}
1586
/// Stores 256-bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) from `a` into memory.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_store_ps(mem_addr: *mut f32, a: __m256) {
    // Plain aligned store; validity and 32-byte alignment are the
    // caller's obligation (see the doc comment above).
    *(mem_addr as *mut __m256) = a;
}
1605
/// Loads 256-bits (composed of 4 packed double-precision (64-bit)
/// floating-point elements) from memory into result.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> __m256d {
    // Express the unaligned load as a byte-wise copy into a local vector;
    // this has no alignment requirement on `mem_addr`.
    let mut dst = _mm256_undefined_pd();
    ptr::copy_nonoverlapping(
        mem_addr as *const u8,
        ptr::addr_of_mut!(dst) as *mut u8,
        mem::size_of::<__m256d>(),
    );
    dst
}
1625
/// Stores 256-bits (composed of 4 packed double-precision (64-bit)
/// floating-point elements) from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_storeu_pd(mem_addr: *mut f64, a: __m256d) {
    // `write_unaligned` places no alignment requirement on `mem_addr`.
    mem_addr.cast::<__m256d>().write_unaligned(a);
}
1639
/// Loads 256-bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) from memory into result.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> __m256 {
    // Express the unaligned load as a byte-wise copy into a local vector;
    // this has no alignment requirement on `mem_addr`.
    let mut dst = _mm256_undefined_ps();
    ptr::copy_nonoverlapping(
        mem_addr as *const u8,
        ptr::addr_of_mut!(dst) as *mut u8,
        mem::size_of::<__m256>(),
    );
    dst
}
1659
/// Stores 256-bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_storeu_ps(mem_addr: *mut f32, a: __m256) {
    // `write_unaligned` places no alignment requirement on `mem_addr`.
    mem_addr.cast::<__m256>().write_unaligned(a);
}
1673
/// Loads 256-bits of integer data from memory into result.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovaps)
)] // FIXME vmovdqa expected
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i {
    // Plain aligned dereference; validity and 32-byte alignment are the
    // caller's obligation (see the doc comment above).
    *mem_addr
}
1690
/// Stores 256-bits of integer data from `a` into memory.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovaps)
)] // FIXME vmovdqa expected
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_store_si256(mem_addr: *mut __m256i, a: __m256i) {
    // Plain aligned store; validity and 32-byte alignment are the
    // caller's obligation (see the doc comment above).
    *mem_addr = a;
}
1707
/// Loads 256-bits of integer data from memory into result.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i {
    // Express the unaligned load as a byte-wise copy into a local vector;
    // this has no alignment requirement on `mem_addr`.
    let mut dst = _mm256_undefined_si256();
    ptr::copy_nonoverlapping(
        mem_addr as *const u8,
        ptr::addr_of_mut!(dst) as *mut u8,
        mem::size_of::<__m256i>(),
    );
    dst
}
1726
/// Stores 256-bits of integer data from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) {
    // `write_unaligned` places no alignment requirement on `mem_addr`.
    mem_addr.write_unaligned(a);
}
1739
/// Loads packed double-precision (64-bit) floating-point elements from memory
/// into result using `mask` (elements are zeroed out when the high bit of the
/// corresponding element is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d {
    // Arithmetic shift by 63 broadcasts each lane's sign bit: all-ones where
    // the mask's high bit is set, all-zeros otherwise.
    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
    // Masked-out lanes take the zero passthrough, matching vmaskmovpd semantics.
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm256_setzero_pd())
}
1754
/// Stores packed double-precision (64-bit) floating-point elements from `a`
/// into memory using `mask`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_maskstore_pd(mem_addr: *mut f64, mask: __m256i, a: __m256d) {
    // Arithmetic shift by 63 broadcasts each lane's sign bit: all-ones where
    // the mask's high bit is set, all-zeros otherwise.
    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
}
1768
/// Loads packed double-precision (64-bit) floating-point elements from memory
/// into result using `mask` (elements are zeroed out when the high bit of the
/// corresponding element is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_maskload_pd(mem_addr: *const f64, mask: __m128i) -> __m128d {
    // Arithmetic shift by 63 broadcasts each lane's sign bit into a full-lane mask.
    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
    // Masked-out lanes take the zero passthrough, matching vmaskmovpd semantics.
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm_setzero_pd())
}
1783
/// Stores packed double-precision (64-bit) floating-point elements from `a`
/// into memory using `mask`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) {
    // Arithmetic shift by 63 broadcasts each lane's sign bit into a full-lane mask.
    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
}
1797
/// Loads packed single-precision (32-bit) floating-point elements from memory
/// into result using `mask` (elements are zeroed out when the high bit of the
/// corresponding element is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256 {
    // Arithmetic shift by 31 broadcasts each lane's sign bit into a full-lane mask.
    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
    // Masked-out lanes take the zero passthrough, matching vmaskmovps semantics.
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm256_setzero_ps())
}
1812
/// Stores packed single-precision (32-bit) floating-point elements from `a`
/// into memory using `mask`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_maskstore_ps(mem_addr: *mut f32, mask: __m256i, a: __m256) {
    // Arithmetic shift by 31 broadcasts each lane's sign bit into a full-lane mask.
    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
}
1826
/// Loads packed single-precision (32-bit) floating-point elements from memory
/// into result using `mask` (elements are zeroed out when the high bit of the
/// corresponding element is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_maskload_ps(mem_addr: *const f32, mask: __m128i) -> __m128 {
    // Arithmetic shift by 31 broadcasts each lane's sign bit into a full-lane mask.
    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
    // Masked-out lanes take the zero passthrough, matching vmaskmovps semantics.
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm_setzero_ps())
}
1841
/// Stores packed single-precision (32-bit) floating-point elements from `a`
/// into memory using `mask`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) {
    // Arithmetic shift by 31 broadcasts each lane's sign bit into a full-lane mask.
    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
}
1855
/// Duplicate odd-indexed single-precision (32-bit) floating-point elements
/// from `a`, and returns the results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movehdup_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovshdup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_movehdup_ps(a: __m256) -> __m256 {
    // Each odd lane is duplicated into the even lane below it:
    // result = [a1, a1, a3, a3, a5, a5, a7, a7].
    unsafe { simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7]) }
}
1868
/// Duplicate even-indexed single-precision (32-bit) floating-point elements
/// from `a`, and returns the results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_moveldup_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovsldup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_moveldup_ps(a: __m256) -> __m256 {
    // Each even lane is duplicated into the odd lane above it:
    // result = [a0, a0, a2, a2, a4, a4, a6, a6].
    unsafe { simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]) }
}
1881
/// Duplicate even-indexed double-precision (64-bit) floating-point elements
/// from `a`, and returns the results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movedup_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_movedup_pd(a: __m256d) -> __m256d {
    // The even lane of each 128-bit half is duplicated:
    // result = [a0, a0, a2, a2].
    unsafe { simd_shuffle!(a, a, [0, 0, 2, 2]) }
}
1894
/// Loads 256-bits of integer data from unaligned memory into result.
/// This intrinsic may perform better than `_mm256_loadu_si256` when the
/// data crosses a cache line boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lddqu_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vlddqu))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i {
    // Delegates to the raw `vlddqu` intrinsic so the dedicated instruction is
    // emitted instead of an ordinary unaligned load.
    transmute(vlddqu(mem_addr as *const i8))
}
1907
/// Moves integer data from a 256-bit integer vector to a 32-byte
/// aligned memory location. To minimize caching, the data is flagged as
/// non-temporal (unlikely to be used again soon)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_si256)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovntdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    // The `vps!` helper expands to the memory-operand form of the mnemonic
    // using the `p` pointer operand; the store bypasses the cache hierarchy.
    crate::arch::asm!(
        vps!("vmovntdq", ",{a}"),
        p = in(reg) mem_addr,
        a = in(ymm_reg) a,
        options(nostack, preserves_flags),
    );
}
1935
/// Moves double-precision values from a 256-bit vector of `[4 x double]`
/// to a 32-byte aligned memory location. To minimize caching, the data is
/// flagged as non-temporal (unlikely to be used again soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_pd)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovntpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    // The `vps!` helper expands to the memory-operand form of the mnemonic
    // using the `p` pointer operand; the store bypasses the cache hierarchy.
    crate::arch::asm!(
        vps!("vmovntpd", ",{a}"),
        p = in(reg) mem_addr,
        a = in(ymm_reg) a,
        options(nostack, preserves_flags),
    );
}
1964
/// Moves single-precision floating point values from a 256-bit vector
/// of `[8 x float]` to a 32-byte aligned memory location. To minimize
/// caching, the data is flagged as non-temporal (unlikely to be used again
/// soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_ps)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovntps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    // The `vps!` helper expands to the memory-operand form of the mnemonic
    // using the `p` pointer operand; the store bypasses the cache hierarchy.
    crate::arch::asm!(
        vps!("vmovntps", ",{a}"),
        p = in(reg) mem_addr,
        a = in(ymm_reg) a,
        options(nostack, preserves_flags),
    );
}
1994
/// Computes the approximate reciprocal of packed single-precision (32-bit)
/// floating-point elements in `a`, and returns the results. The maximum
/// relative error for this approximation is less than 1.5*2^-12.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rcp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vrcpps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_rcp_ps(a: __m256) -> __m256 {
    // Delegates to the raw `vrcpps` intrinsic; the result is a hardware
    // approximation, not an exact IEEE division.
    unsafe { vrcpps(a) }
}
2007
/// Computes the approximate reciprocal square root of packed single-precision
/// (32-bit) floating-point elements in `a`, and returns the results.
/// The maximum relative error for this approximation is less than 1.5*2^-12.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rsqrt_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vrsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_rsqrt_ps(a: __m256) -> __m256 {
    // Delegates to the raw `vrsqrtps` intrinsic; the result is a hardware
    // approximation, not an exact IEEE computation.
    unsafe { vrsqrtps(a) }
}
2020
/// Unpacks and interleave double-precision (64-bit) floating-point elements
/// from the high half of each 128-bit lane in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vunpckhpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d {
    // Indices >= 4 refer to `b`; result = [a1, b1, a3, b3], i.e. the high
    // element of each 128-bit lane, interleaved a-then-b.
    unsafe { simd_shuffle!(a, b, [1, 5, 3, 7]) }
}
2033
/// Unpacks and interleave single-precision (32-bit) floating-point elements
/// from the high half of each 128-bit lane in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vunpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 {
    // Indices >= 8 refer to `b`; result = [a2, b2, a3, b3, a6, b6, a7, b7],
    // i.e. the upper two elements of each 128-bit lane, interleaved a-then-b.
    unsafe { simd_shuffle!(a, b, [2, 10, 3, 11, 6, 14, 7, 15]) }
}
2046
/// Unpacks and interleave double-precision (64-bit) floating-point elements
/// from the low half of each 128-bit lane in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vunpcklpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d {
    // Indices >= 4 refer to `b`; result = [a0, b0, a2, b2], i.e. the low
    // element of each 128-bit lane, interleaved a-then-b.
    unsafe { simd_shuffle!(a, b, [0, 4, 2, 6]) }
}
2059
/// Unpacks and interleave single-precision (32-bit) floating-point elements
/// from the low half of each 128-bit lane in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vunpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 {
    // Indices >= 8 refer to `b`; result = [a0, b0, a1, b1, a4, b4, a5, b5],
    // i.e. the lower two elements of each 128-bit lane, interleaved a-then-b.
    unsafe { simd_shuffle!(a, b, [0, 8, 1, 9, 4, 12, 5, 13]) }
}
2072
/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
/// the result is zero, otherwise set `CF` to 0. Return the `ZF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 {
    unsafe {
        // Emulates the ZF computation of VPTEST: ZF = ((a AND b) == 0).
        let r = simd_and(a.as_i64x4(), b.as_i64x4());
        // OR-reduce the lanes; the whole AND result is zero iff the reduction is.
        (0i64 == simd_reduce_or(r)) as i32
    }
}
2090
/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
/// the result is zero, otherwise set `CF` to 0. Return the `CF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 {
    unsafe {
        // Emulates the CF computation of VPTEST: CF = (((NOT a) AND b) == 0).
        // XOR with all-ones is the bitwise NOT of `a`.
        let r = simd_and(simd_xor(a.as_i64x4(), i64x4::splat(!0)), b.as_i64x4());
        // OR-reduce the lanes; the whole ANDN result is zero iff the reduction is.
        (0i64 == simd_reduce_or(r)) as i32
    }
}
2108
/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
/// the result is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and
/// `CF` values are zero, otherwise return 0.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 {
    // Delegates to the raw `ptestnzc` intrinsic, which returns !ZF && !CF.
    unsafe { ptestnzc256(a.as_i64x4(), b.as_i64x4()) }
}
2123
/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 {
    // Delegates to the raw `vtestpd` intrinsic variant that returns ZF.
    unsafe { vtestzpd256(a, b) }
}
2140
/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `CF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 {
    // Delegates to the `vtestpd` intrinsic binding (declared elsewhere in
    // this module); returns CF, i.e. 1 iff no sign bit of `!a & b` is set.
    unsafe { vtestcpd256(a, b) }
}
2157
/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
/// are zero, otherwise return 0.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 {
    // Delegates to the `vtestpd` intrinsic binding (declared elsewhere in
    // this module); returns 1 only when ZF = 0 and CF = 0.
    unsafe { vtestnzcpd256(a, b) }
}
2175
/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 {
    unsafe {
        // A lane of `a & b`, reinterpreted as i64, is negative exactly when
        // its sign bit is set; ZF is 1 iff no lane's sign bit is set.
        let r: i64x2 = simd_lt(transmute(_mm_and_pd(a, b)), i64x2::ZERO);
        (0i64 == simd_reduce_or(r)) as i32
    }
}
2196
/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `CF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 {
    unsafe {
        // CF is 1 iff no lane of `!a & b` has its sign bit set; a set sign
        // bit makes the lane negative when reinterpreted as i64.
        let r: i64x2 = simd_lt(transmute(_mm_andnot_pd(a, b)), i64x2::ZERO);
        (0i64 == simd_reduce_or(r)) as i32
    }
}
2217
/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
/// are zero, otherwise return 0.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 {
    // Delegates to the 128-bit `vtestpd` intrinsic binding (declared
    // elsewhere in this module); returns 1 only when ZF = 0 and CF = 0.
    unsafe { vtestnzcpd(a, b) }
}
2235
/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 {
    // Delegates to the `vtestps` intrinsic binding (declared elsewhere in
    // this module); only the per-lane sign bits participate in the test.
    unsafe { vtestzps256(a, b) }
}
2252
/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `CF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 {
    // Delegates to the `vtestps` intrinsic binding (declared elsewhere in
    // this module); returns CF, i.e. 1 iff no sign bit of `!a & b` is set.
    unsafe { vtestcps256(a, b) }
}
2269
/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
/// are zero, otherwise return 0.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 {
    // Delegates to the `vtestps` intrinsic binding (declared elsewhere in
    // this module); returns 1 only when ZF = 0 and CF = 0.
    unsafe { vtestnzcps256(a, b) }
}
2287
/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_testz_ps(a: __m128, b: __m128) -> i32 {
    unsafe {
        // A lane of `a & b`, reinterpreted as i32, is negative exactly when
        // its sign bit is set; ZF is 1 iff no lane's sign bit is set.
        let r: i32x4 = simd_lt(transmute(_mm_and_ps(a, b)), i32x4::ZERO);
        (0i32 == simd_reduce_or(r)) as i32
    }
}
2308
/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `CF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_testc_ps(a: __m128, b: __m128) -> i32 {
    unsafe {
        // CF is 1 iff no lane of `!a & b` has its sign bit set; a set sign
        // bit makes the lane negative when reinterpreted as i32.
        let r: i32x4 = simd_lt(transmute(_mm_andnot_ps(a, b)), i32x4::ZERO);
        (0i32 == simd_reduce_or(r)) as i32
    }
}
2329
/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
/// are zero, otherwise return 0.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 {
    // Delegates to the 128-bit `vtestps` intrinsic binding (declared
    // elsewhere in this module); returns 1 only when ZF = 0 and CF = 0.
    unsafe { vtestnzcps(a, b) }
}
2347
/// Sets each bit of the returned mask based on the most significant bit of the
/// corresponding packed double-precision (64-bit) floating-point element in
/// `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovmskpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_movemask_pd(a: __m256d) -> i32 {
    // Propagate the highest bit to the rest, because simd_bitmask
    // requires all-1 or all-0.
    unsafe {
        // `simd_lt` against zero yields an all-ones lane exactly when the
        // lane's sign bit is set; `simd_bitmask` then packs one bit per lane
        // (4 lanes -> low 4 bits of the u8).
        let mask: i64x4 = simd_lt(transmute(a), i64x4::ZERO);
        simd_bitmask::<i64x4, u8>(mask) as i32
    }
}
2366
/// Sets each bit of the returned mask based on the most significant bit of the
/// corresponding packed single-precision (32-bit) floating-point element in
/// `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovmskps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_movemask_ps(a: __m256) -> i32 {
    // Propagate the highest bit to the rest, because simd_bitmask
    // requires all-1 or all-0.
    unsafe {
        // `simd_lt` against zero yields an all-ones lane exactly when the
        // lane's sign bit is set; `simd_bitmask` then packs one bit per lane
        // (8 lanes -> all 8 bits of the u8).
        let mask: i32x8 = simd_lt(transmute(a), i32x8::ZERO);
        simd_bitmask::<i32x8, u8>(mask) as i32
    }
}
2385
/// Returns vector of type __m256d with all elements set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setzero_pd() -> __m256d {
    // SAFETY: the all-zero bit pattern is a valid __m256d (four 0.0 lanes).
    // The `const` block forces compile-time evaluation of the zeroed value.
    const { unsafe { mem::zeroed() } }
}
2397
/// Returns vector of type __m256 with all elements set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setzero_ps() -> __m256 {
    // SAFETY: the all-zero bit pattern is a valid __m256 (eight 0.0 lanes).
    // The `const` block forces compile-time evaluation of the zeroed value.
    const { unsafe { mem::zeroed() } }
}
2409
/// Returns vector of type __m256i with all elements set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxor))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setzero_si256() -> __m256i {
    // SAFETY: the all-zero bit pattern is a valid __m256i.
    // The `const` block forces compile-time evaluation of the zeroed value.
    const { unsafe { mem::zeroed() } }
}
2421
/// Sets packed double-precision (64-bit) floating-point elements in returned
/// vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_pd)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
    // `_mm256_set_*` takes arguments from highest lane to lowest, so reverse
    // them into the lowest-first order that `_mm256_setr_pd` expects.
    _mm256_setr_pd(d, c, b, a)
}
2435
/// Sets packed single-precision (32-bit) floating-point elements in returned
/// vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_ps)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set_ps(
    a: f32,
    b: f32,
    c: f32,
    d: f32,
    e: f32,
    f: f32,
    g: f32,
    h: f32,
) -> __m256 {
    // Arguments are highest-lane-first; reverse into `setr` (lowest-first)
    // order.
    _mm256_setr_ps(h, g, f, e, d, c, b, a)
}
2457
/// Sets packed 8-bit integers in returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi8)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set_epi8(
    e00: i8,
    e01: i8,
    e02: i8,
    e03: i8,
    e04: i8,
    e05: i8,
    e06: i8,
    e07: i8,
    e08: i8,
    e09: i8,
    e10: i8,
    e11: i8,
    e12: i8,
    e13: i8,
    e14: i8,
    e15: i8,
    e16: i8,
    e17: i8,
    e18: i8,
    e19: i8,
    e20: i8,
    e21: i8,
    e22: i8,
    e23: i8,
    e24: i8,
    e25: i8,
    e26: i8,
    e27: i8,
    e28: i8,
    e29: i8,
    e30: i8,
    e31: i8,
) -> __m256i {
    // `e00` is the highest lane, `e31` the lowest; reverse into `setr`
    // (lowest-first) order.
    #[rustfmt::skip]
    _mm256_setr_epi8(
        e31, e30, e29, e28, e27, e26, e25, e24,
        e23, e22, e21, e20, e19, e18, e17, e16,
        e15, e14, e13, e12, e11, e10, e09, e08,
        e07, e06, e05, e04, e03, e02, e01, e00,
    )
}
2508
/// Sets packed 16-bit integers in returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi16)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set_epi16(
    e00: i16,
    e01: i16,
    e02: i16,
    e03: i16,
    e04: i16,
    e05: i16,
    e06: i16,
    e07: i16,
    e08: i16,
    e09: i16,
    e10: i16,
    e11: i16,
    e12: i16,
    e13: i16,
    e14: i16,
    e15: i16,
) -> __m256i {
    // `e00` is the highest lane, `e15` the lowest; reverse into `setr`
    // (lowest-first) order.
    #[rustfmt::skip]
    _mm256_setr_epi16(
        e15, e14, e13, e12,
        e11, e10, e09, e08,
        e07, e06, e05, e04,
        e03, e02, e01, e00,
    )
}
2543
/// Sets packed 32-bit integers in returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi32)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set_epi32(
    e0: i32,
    e1: i32,
    e2: i32,
    e3: i32,
    e4: i32,
    e5: i32,
    e6: i32,
    e7: i32,
) -> __m256i {
    // `e0` is the highest lane, `e7` the lowest; reverse into `setr`
    // (lowest-first) order.
    _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0)
}
2564
/// Sets packed 64-bit integers in returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi64x)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
    // `a` is the highest lane, `d` the lowest; reverse into `setr` order.
    _mm256_setr_epi64x(d, c, b, a)
}
2576
/// Sets packed double-precision (64-bit) floating-point elements in returned
/// vector with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_pd)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
    // Lanes are stored lowest-first: `a` becomes lane 0, `d` lane 3.
    __m256d([a, b, c, d])
}
2589
/// Sets packed single-precision (32-bit) floating-point elements in returned
/// vector with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_ps)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setr_ps(
    a: f32,
    b: f32,
    c: f32,
    d: f32,
    e: f32,
    f: f32,
    g: f32,
    h: f32,
) -> __m256 {
    // Lanes are stored lowest-first: `a` becomes lane 0, `h` lane 7.
    __m256([a, b, c, d, e, f, g, h])
}
2611
/// Sets packed 8-bit integers in returned vector with the supplied values in
/// reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi8)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setr_epi8(
    e00: i8,
    e01: i8,
    e02: i8,
    e03: i8,
    e04: i8,
    e05: i8,
    e06: i8,
    e07: i8,
    e08: i8,
    e09: i8,
    e10: i8,
    e11: i8,
    e12: i8,
    e13: i8,
    e14: i8,
    e15: i8,
    e16: i8,
    e17: i8,
    e18: i8,
    e19: i8,
    e20: i8,
    e21: i8,
    e22: i8,
    e23: i8,
    e24: i8,
    e25: i8,
    e26: i8,
    e27: i8,
    e28: i8,
    e29: i8,
    e30: i8,
    e31: i8,
) -> __m256i {
    // SAFETY: `i8x32` and `__m256i` are both 256-bit vector types, so the
    // transmute is a plain bit-level reinterpretation. Lanes are
    // lowest-first: `e00` is lane 0.
    unsafe {
        #[rustfmt::skip]
        transmute(i8x32::new(
            e00, e01, e02, e03, e04, e05, e06, e07,
            e08, e09, e10, e11, e12, e13, e14, e15,
            e16, e17, e18, e19, e20, e21, e22, e23,
            e24, e25, e26, e27, e28, e29, e30, e31,
        ))
    }
}
2665
/// Sets packed 16-bit integers in returned vector with the supplied values in
/// reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi16)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setr_epi16(
    e00: i16,
    e01: i16,
    e02: i16,
    e03: i16,
    e04: i16,
    e05: i16,
    e06: i16,
    e07: i16,
    e08: i16,
    e09: i16,
    e10: i16,
    e11: i16,
    e12: i16,
    e13: i16,
    e14: i16,
    e15: i16,
) -> __m256i {
    // SAFETY: `i16x16` and `__m256i` are both 256-bit vector types, so the
    // transmute is a plain bit-level reinterpretation. Lanes are
    // lowest-first: `e00` is lane 0.
    unsafe {
        #[rustfmt::skip]
        transmute(i16x16::new(
            e00, e01, e02, e03,
            e04, e05, e06, e07,
            e08, e09, e10, e11,
            e12, e13, e14, e15,
        ))
    }
}
2703
/// Sets packed 32-bit integers in returned vector with the supplied values in
/// reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi32)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setr_epi32(
    e0: i32,
    e1: i32,
    e2: i32,
    e3: i32,
    e4: i32,
    e5: i32,
    e6: i32,
    e7: i32,
) -> __m256i {
    // SAFETY: `i32x8` and `__m256i` are both 256-bit vector types; lanes are
    // lowest-first (`e0` is lane 0).
    unsafe { transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) }
}
2725
/// Sets packed 64-bit integers in returned vector with the supplied values in
/// reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi64x)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
    // SAFETY: `i64x4` and `__m256i` are both 256-bit vector types; lanes are
    // lowest-first (`a` is lane 0).
    unsafe { transmute(i64x4::new(a, b, c, d)) }
}
2738
/// Broadcasts double-precision (64-bit) floating-point value `a` to all
/// elements of returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_pd)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set1_pd(a: f64) -> __m256d {
    // Broadcast by filling all four lanes with the same value.
    _mm256_setr_pd(a, a, a, a)
}
2751
/// Broadcasts single-precision (32-bit) floating-point value `a` to all
/// elements of returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_ps)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set1_ps(a: f32) -> __m256 {
    // Broadcast by filling all eight lanes with the same value.
    _mm256_setr_ps(a, a, a, a, a, a, a, a)
}
2764
/// Broadcasts 8-bit integer `a` to all elements of returned vector.
/// This intrinsic may generate the `vpbroadcastb`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi8)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set1_epi8(a: i8) -> __m256i {
    // Broadcast by filling all 32 lanes with the same value.
    #[rustfmt::skip]
    _mm256_setr_epi8(
        a, a, a, a, a, a, a, a,
        a, a, a, a, a, a, a, a,
        a, a, a, a, a, a, a, a,
        a, a, a, a, a, a, a, a,
    )
}
2783
/// Broadcasts 16-bit integer `a` to all elements of returned vector.
/// This intrinsic may generate the `vpbroadcastw`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16)
#[inline]
#[target_feature(enable = "avx")]
//#[cfg_attr(test, assert_instr(vpshufb))]
#[cfg_attr(test, assert_instr(vinsertf128))]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set1_epi16(a: i16) -> __m256i {
    // Broadcast by filling all 16 lanes with the same value.
    _mm256_setr_epi16(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a)
}
2798
/// Broadcasts 32-bit integer `a` to all elements of returned vector.
/// This intrinsic may generate the `vpbroadcastd`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi32)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set1_epi32(a: i32) -> __m256i {
    // Broadcast by filling all eight lanes with the same value.
    _mm256_setr_epi32(a, a, a, a, a, a, a, a)
}
2811
/// Broadcasts 64-bit integer `a` to all elements of returned vector.
/// This intrinsic may generate the `vpbroadcastq`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi64x)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(vinsertf128))]
#[cfg_attr(all(test, target_arch = "x86"), assert_instr(vbroadcastsd))]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set1_epi64x(a: i64) -> __m256i {
    // Broadcast by filling all four lanes with the same value.
    _mm256_setr_epi64x(a, a, a, a)
}
2826
2827/// Cast vector of type __m256d to type __m256.
2828///
2829/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_ps)
2830#[inline]
2831#[target_feature(enable = "avx")]
2832// This intrinsic is only used for compilation and does not generate any
2833// instructions, thus it has zero latency.
2834#[stable(feature = "simd_x86", since = "1.27.0")]
2835#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2836pub const fn _mm256_castpd_ps(a: __m256d) -> __m256 {
2837    unsafe { transmute(a) }
2838}
2839
2840/// Cast vector of type __m256 to type __m256d.
2841///
2842/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_pd)
2843#[inline]
2844#[target_feature(enable = "avx")]
2845// This intrinsic is only used for compilation and does not generate any
2846// instructions, thus it has zero latency.
2847#[stable(feature = "simd_x86", since = "1.27.0")]
2848#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2849pub const fn _mm256_castps_pd(a: __m256) -> __m256d {
2850    unsafe { transmute(a) }
2851}
2852
2853/// Casts vector of type __m256 to type __m256i.
2854///
2855/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_si256)
2856#[inline]
2857#[target_feature(enable = "avx")]
2858// This intrinsic is only used for compilation and does not generate any
2859// instructions, thus it has zero latency.
2860#[stable(feature = "simd_x86", since = "1.27.0")]
2861#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2862pub const fn _mm256_castps_si256(a: __m256) -> __m256i {
2863    unsafe { transmute(a) }
2864}
2865
2866/// Casts vector of type __m256i to type __m256.
2867///
2868/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_ps)
2869#[inline]
2870#[target_feature(enable = "avx")]
2871// This intrinsic is only used for compilation and does not generate any
2872// instructions, thus it has zero latency.
2873#[stable(feature = "simd_x86", since = "1.27.0")]
2874#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2875pub const fn _mm256_castsi256_ps(a: __m256i) -> __m256 {
2876    unsafe { transmute(a) }
2877}
2878
/// Casts vector of type __m256d to type __m256i.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_si256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castpd_si256(a: __m256d) -> __m256i {
    // SAFETY: both types are plain 256-bit SIMD vectors; this is a pure
    // bit-pattern reinterpretation.
    unsafe { transmute(a) }
}
2891
/// Casts vector of type __m256i to type __m256d.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_pd)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castsi256_pd(a: __m256i) -> __m256d {
    // SAFETY: both types are plain 256-bit SIMD vectors; this is a pure
    // bit-pattern reinterpretation.
    unsafe { transmute(a) }
}
2904
/// Casts vector of type __m256 to type __m128.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps256_ps128)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castps256_ps128(a: __m256) -> __m128 {
    // Keep lanes 0..=3, i.e. the low 128 bits of `a`; the upper half is
    // simply dropped.
    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) }
}
2917
/// Casts vector of type __m256d to type __m128d.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd256_pd128)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castpd256_pd128(a: __m256d) -> __m128d {
    // Keep lanes 0 and 1, i.e. the low 128 bits of `a`.
    unsafe { simd_shuffle!(a, a, [0, 1]) }
}
2930
/// Casts vector of type __m256i to type __m128i.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_si128)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
    unsafe {
        // View the input as four i64 lanes and keep lanes 0 and 1, i.e. the
        // low 128 bits of `a`.
        let a = a.as_i64x4();
        let dst: i64x2 = simd_shuffle!(a, a, [0, 1]);
        transmute(dst)
    }
}
2947
/// Casts vector of type __m128 to type __m256;
/// the upper 128 bits of the result are undefined.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps128_ps256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castps128_ps256(a: __m128) -> __m256 {
    // Lanes 0..=3 come from `a`; the repeated index 4 fills the upper half
    // from the "undefined" second operand (its lane 0).
    unsafe { simd_shuffle!(a, _mm_undefined_ps(), [0, 1, 2, 3, 4, 4, 4, 4]) }
}
2961
/// Casts vector of type __m128d to type __m256d;
/// the upper 128 bits of the result are undefined.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd128_pd256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castpd128_pd256(a: __m128d) -> __m256d {
    // Lanes 0 and 1 come from `a`; the repeated index 2 fills the upper half
    // from the "undefined" second operand (its lane 0).
    unsafe { simd_shuffle!(a, _mm_undefined_pd(), [0, 1, 2, 2]) }
}
2975
/// Casts vector of type __m128i to type __m256i;
/// the upper 128 bits of the result are undefined.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi128_si256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_i64x2();
        // The "undefined" upper half is conservatively materialized as zero
        // here; index 2 pulls lane 0 of this zero vector into lanes 2 and 3.
        let undefined = i64x2::ZERO;
        let dst: i64x4 = simd_shuffle!(a, undefined, [0, 1, 2, 2]);
        transmute(dst)
    }
}
2994
/// Constructs a 256-bit floating-point vector of `[8 x float]` from a
/// 128-bit floating-point vector of `[4 x float]`. The lower 128 bits contain
/// the value of the source vector. The upper 128 bits are set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextps128_ps256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
    // Indices 4..=7 select from the all-zero second operand, guaranteeing a
    // zeroed upper half (unlike the `cast` variants, which leave it undefined).
    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7]) }
}
3009
/// Constructs a 256-bit integer vector from a 128-bit integer vector.
/// The lower 128 bits contain the value of the source vector. The upper
/// 128 bits are set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextsi128_si256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
    unsafe {
        // Lanes 2 and 3 come from the all-zero vector `b`, zeroing the
        // upper 128 bits of the result.
        let b = i64x2::ZERO;
        let dst: i64x4 = simd_shuffle!(a.as_i64x2(), b, [0, 1, 2, 3]);
        transmute(dst)
    }
}
3028
/// Constructs a 256-bit floating-point vector of `[4 x double]` from a
/// 128-bit floating-point vector of `[2 x double]`. The lower 128 bits
/// contain the value of the source vector. The upper 128 bits are set
/// to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextpd128_pd256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
    // Indices 2 and 3 select from the all-zero second operand, guaranteeing
    // a zeroed upper half.
    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0, 1, 2, 3]) }
}
3044
/// Returns vector of type `__m256` with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
/// In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_ps)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_undefined_ps() -> __m256 {
    // The `const` block evaluates to an all-zero vector at compile time, so
    // the "undefined" value is in fact a deterministic zero.
    const { unsafe { mem::zeroed() } }
}
3059
/// Returns vector of type `__m256d` with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
/// In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_pd)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_undefined_pd() -> __m256d {
    // The `const` block evaluates to an all-zero vector at compile time, so
    // the "undefined" value is in fact a deterministic zero.
    const { unsafe { mem::zeroed() } }
}
3074
/// Returns vector of type __m256i with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
/// In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_si256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_undefined_si256() -> __m256i {
    // The `const` block evaluates to an all-zero vector at compile time, so
    // the "undefined" value is in fact a deterministic zero.
    const { unsafe { mem::zeroed() } }
}
3089
/// Sets packed __m256 returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 {
    // Concatenation: indices 0..=3 take `lo` (low half of the result),
    // indices 4..=7 take `hi` (high half). Lowers to a single vinsertf128.
    unsafe { simd_shuffle!(lo, hi, [0, 1, 2, 3, 4, 5, 6, 7]) }
}
3101
3102/// Sets packed __m256d returned vector with the supplied values.
3103///
3104/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128d)
3105#[inline]
3106#[target_feature(enable = "avx")]
3107#[cfg_attr(test, assert_instr(vinsertf128))]
3108#[stable(feature = "simd_x86", since = "1.27.0")]
3109#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3110pub const fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d {
3111    unsafe {
3112        let hi: __m128 = transmute(hi);
3113        let lo: __m128 = transmute(lo);
3114        transmute(_mm256_set_m128(hi, lo))
3115    }
3116}
3117
3118/// Sets packed __m256i returned vector with the supplied values.
3119///
3120/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128i)
3121#[inline]
3122#[target_feature(enable = "avx")]
3123#[cfg_attr(test, assert_instr(vinsertf128))]
3124#[stable(feature = "simd_x86", since = "1.27.0")]
3125#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3126pub const fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i {
3127    unsafe {
3128        let hi: __m128 = transmute(hi);
3129        let lo: __m128 = transmute(lo);
3130        transmute(_mm256_set_m128(hi, lo))
3131    }
3132}
3133
/// Sets packed __m256 returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 {
    // Same as `_mm256_set_m128` but with arguments in memory (reversed)
    // order: low half first.
    _mm256_set_m128(hi, lo)
}
3145
/// Sets packed __m256d returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128d)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d {
    // Same as `_mm256_set_m128d` but with arguments in memory (reversed)
    // order: low half first.
    _mm256_set_m128d(hi, lo)
}
3157
/// Sets packed __m256i returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128i)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i {
    // Same as `_mm256_set_m128i` but with arguments in memory (reversed)
    // order: low half first.
    _mm256_set_m128i(hi, lo)
}
3169
3170/// Loads two 128-bit values (composed of 4 packed single-precision (32-bit)
3171/// floating-point elements) from memory, and combine them into a 256-bit
3172/// value.
3173/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3174///
3175/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128)
3176#[inline]
3177#[target_feature(enable = "avx")]
3178// This intrinsic has no corresponding instruction.
3179#[stable(feature = "simd_x86", since = "1.27.0")]
3180#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3181pub const unsafe fn _mm256_loadu2_m128(hiaddr: *const f32, loaddr: *const f32) -> __m256 {
3182    let a = _mm256_castps128_ps256(_mm_loadu_ps(loaddr));
3183    _mm256_insertf128_ps::<1>(a, _mm_loadu_ps(hiaddr))
3184}
3185
3186/// Loads two 128-bit values (composed of 2 packed double-precision (64-bit)
3187/// floating-point elements) from memory, and combine them into a 256-bit
3188/// value.
3189/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3190///
3191/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128d)
3192#[inline]
3193#[target_feature(enable = "avx")]
3194// This intrinsic has no corresponding instruction.
3195#[stable(feature = "simd_x86", since = "1.27.0")]
3196#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3197pub const unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> __m256d {
3198    let a = _mm256_castpd128_pd256(_mm_loadu_pd(loaddr));
3199    _mm256_insertf128_pd::<1>(a, _mm_loadu_pd(hiaddr))
3200}
3201
3202/// Loads two 128-bit values (composed of integer data) from memory, and combine
3203/// them into a 256-bit value.
3204/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3205///
3206/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128i)
3207#[inline]
3208#[target_feature(enable = "avx")]
3209// This intrinsic has no corresponding instruction.
3210#[stable(feature = "simd_x86", since = "1.27.0")]
3211#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3212pub const unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i) -> __m256i {
3213    let a = _mm256_castsi128_si256(_mm_loadu_si128(loaddr));
3214    _mm256_insertf128_si256::<1>(a, _mm_loadu_si128(hiaddr))
3215}
3216
3217/// Stores the high and low 128-bit halves (each composed of 4 packed
3218/// single-precision (32-bit) floating-point elements) from `a` into memory two
3219/// different 128-bit locations.
3220/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3221///
3222/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128)
3223#[inline]
3224#[target_feature(enable = "avx")]
3225// This intrinsic has no corresponding instruction.
3226#[stable(feature = "simd_x86", since = "1.27.0")]
3227#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3228pub const unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: __m256) {
3229    let lo = _mm256_castps256_ps128(a);
3230    _mm_storeu_ps(loaddr, lo);
3231    let hi = _mm256_extractf128_ps::<1>(a);
3232    _mm_storeu_ps(hiaddr, hi);
3233}
3234
3235/// Stores the high and low 128-bit halves (each composed of 2 packed
3236/// double-precision (64-bit) floating-point elements) from `a` into memory two
3237/// different 128-bit locations.
3238/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3239///
3240/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128d)
3241#[inline]
3242#[target_feature(enable = "avx")]
3243// This intrinsic has no corresponding instruction.
3244#[stable(feature = "simd_x86", since = "1.27.0")]
3245#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3246pub const unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: __m256d) {
3247    let lo = _mm256_castpd256_pd128(a);
3248    _mm_storeu_pd(loaddr, lo);
3249    let hi = _mm256_extractf128_pd::<1>(a);
3250    _mm_storeu_pd(hiaddr, hi);
3251}
3252
3253/// Stores the high and low 128-bit halves (each composed of integer data) from
3254/// `a` into memory two different 128-bit locations.
3255/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3256///
3257/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128i)
3258#[inline]
3259#[target_feature(enable = "avx")]
3260// This intrinsic has no corresponding instruction.
3261#[stable(feature = "simd_x86", since = "1.27.0")]
3262#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3263pub const unsafe fn _mm256_storeu2_m128i(hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i) {
3264    let lo = _mm256_castsi256_si128(a);
3265    _mm_storeu_si128(loaddr, lo);
3266    let hi = _mm256_extractf128_si256::<1>(a);
3267    _mm_storeu_si128(hiaddr, hi);
3268}
3269
/// Returns the first element of the input vector of `[8 x float]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtss_f32)
#[inline]
#[target_feature(enable = "avx")]
//#[cfg_attr(test, assert_instr(movss))] FIXME
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtss_f32(a: __m256) -> f32 {
    // Extract lane 0 (the lowest 32 bits) of the vector.
    unsafe { simd_extract!(a, 0) }
}
3281
// LLVM intrinsics used in the above functions. Each `link_name` must match
// the LLVM intrinsic name exactly; signatures mirror the LLVM declarations.
#[allow(improper_ctypes)]
unsafe extern "C" {
    // Rounding and dot product.
    #[link_name = "llvm.x86.avx.round.pd.256"]
    fn roundpd256(a: __m256d, b: i32) -> __m256d;
    #[link_name = "llvm.x86.avx.round.ps.256"]
    fn roundps256(a: __m256, b: i32) -> __m256;
    #[link_name = "llvm.x86.avx.dp.ps.256"]
    fn vdpps(a: __m256, b: __m256, imm8: i8) -> __m256;
    // Packed and scalar comparisons (SSE/SSE2 variants are used by the
    // 128-bit `_mm_cmp_*` AVX intrinsics).
    #[link_name = "llvm.x86.sse2.cmp.pd"]
    fn vcmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
    #[link_name = "llvm.x86.avx.cmp.pd.256"]
    fn vcmppd256(a: __m256d, b: __m256d, imm8: u8) -> __m256d;
    #[link_name = "llvm.x86.sse.cmp.ps"]
    fn vcmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
    #[link_name = "llvm.x86.avx.cmp.ps.256"]
    fn vcmpps256(a: __m256, b: __m256, imm8: u8) -> __m256;
    #[link_name = "llvm.x86.sse2.cmp.sd"]
    fn vcmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
    #[link_name = "llvm.x86.sse.cmp.ss"]
    fn vcmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
    // Float <-> integer conversions (truncating and rounding forms).
    #[link_name = "llvm.x86.avx.cvt.ps2dq.256"]
    fn vcvtps2dq(a: __m256) -> i32x8;
    #[link_name = "llvm.x86.avx.cvtt.pd2dq.256"]
    fn vcvttpd2dq(a: __m256d) -> i32x4;
    #[link_name = "llvm.x86.avx.cvt.pd2dq.256"]
    fn vcvtpd2dq(a: __m256d) -> i32x4;
    #[link_name = "llvm.x86.avx.cvtt.ps2dq.256"]
    fn vcvttps2dq(a: __m256) -> i32x8;
    // AVX state management.
    #[link_name = "llvm.x86.avx.vzeroall"]
    fn vzeroall();
    #[link_name = "llvm.x86.avx.vzeroupper"]
    fn vzeroupper();
    // Variable permutes.
    #[link_name = "llvm.x86.avx.vpermilvar.ps.256"]
    fn vpermilps256(a: __m256, b: i32x8) -> __m256;
    #[link_name = "llvm.x86.avx.vpermilvar.ps"]
    fn vpermilps(a: __m128, b: i32x4) -> __m128;
    #[link_name = "llvm.x86.avx.vpermilvar.pd.256"]
    fn vpermilpd256(a: __m256d, b: i64x4) -> __m256d;
    #[link_name = "llvm.x86.avx.vpermilvar.pd"]
    fn vpermilpd(a: __m128d, b: i64x2) -> __m128d;
    // Unaligned load and reciprocal approximations.
    #[link_name = "llvm.x86.avx.ldu.dq.256"]
    fn vlddqu(mem_addr: *const i8) -> i8x32;
    #[link_name = "llvm.x86.avx.rcp.ps.256"]
    fn vrcpps(a: __m256) -> __m256;
    #[link_name = "llvm.x86.avx.rsqrt.ps.256"]
    fn vrsqrtps(a: __m256) -> __m256;
    // PTEST / VTESTP{S,D} flag-setting tests.
    #[link_name = "llvm.x86.avx.ptestnzc.256"]
    fn ptestnzc256(a: i64x4, b: i64x4) -> i32;
    #[link_name = "llvm.x86.avx.vtestz.pd.256"]
    fn vtestzpd256(a: __m256d, b: __m256d) -> i32;
    #[link_name = "llvm.x86.avx.vtestc.pd.256"]
    fn vtestcpd256(a: __m256d, b: __m256d) -> i32;
    #[link_name = "llvm.x86.avx.vtestnzc.pd.256"]
    fn vtestnzcpd256(a: __m256d, b: __m256d) -> i32;
    #[link_name = "llvm.x86.avx.vtestnzc.pd"]
    fn vtestnzcpd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.avx.vtestz.ps.256"]
    fn vtestzps256(a: __m256, b: __m256) -> i32;
    #[link_name = "llvm.x86.avx.vtestc.ps.256"]
    fn vtestcps256(a: __m256, b: __m256) -> i32;
    #[link_name = "llvm.x86.avx.vtestnzc.ps.256"]
    fn vtestnzcps256(a: __m256, b: __m256) -> i32;
    #[link_name = "llvm.x86.avx.vtestnzc.ps"]
    fn vtestnzcps(a: __m128, b: __m128) -> i32;
    // Min/max with IEEE-divergent (second-operand-wins) NaN/zero semantics.
    #[link_name = "llvm.x86.avx.min.ps.256"]
    fn vminps(a: __m256, b: __m256) -> __m256;
    #[link_name = "llvm.x86.avx.max.ps.256"]
    fn vmaxps(a: __m256, b: __m256) -> __m256;
    #[link_name = "llvm.x86.avx.min.pd.256"]
    fn vminpd(a: __m256d, b: __m256d) -> __m256d;
    #[link_name = "llvm.x86.avx.max.pd.256"]
    fn vmaxpd(a: __m256d, b: __m256d) -> __m256d;
}
3356
3357#[cfg(test)]
3358mod tests {
3359    use crate::core_arch::assert_eq_const as assert_eq;
3360    use crate::hint::black_box;
3361    use crate::ptr;
3362    use stdarch_test::simd_test;
3363
3364    use crate::core_arch::x86::*;
3365
3366    #[simd_test(enable = "avx")]
3367    const unsafe fn test_mm256_add_pd() {
3368        let a = _mm256_setr_pd(1., 2., 3., 4.);
3369        let b = _mm256_setr_pd(5., 6., 7., 8.);
3370        let r = _mm256_add_pd(a, b);
3371        let e = _mm256_setr_pd(6., 8., 10., 12.);
3372        assert_eq_m256d(r, e);
3373    }
3374
3375    #[simd_test(enable = "avx")]
3376    const unsafe fn test_mm256_add_ps() {
3377        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
3378        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
3379        let r = _mm256_add_ps(a, b);
3380        let e = _mm256_setr_ps(10., 12., 14., 16., 18., 20., 22., 24.);
3381        assert_eq_m256(r, e);
3382    }
3383
3384    #[simd_test(enable = "avx")]
3385    const unsafe fn test_mm256_and_pd() {
3386        let a = _mm256_set1_pd(1.);
3387        let b = _mm256_set1_pd(0.6);
3388        let r = _mm256_and_pd(a, b);
3389        let e = _mm256_set1_pd(0.5);
3390        assert_eq_m256d(r, e);
3391    }
3392
3393    #[simd_test(enable = "avx")]
3394    const unsafe fn test_mm256_and_ps() {
3395        let a = _mm256_set1_ps(1.);
3396        let b = _mm256_set1_ps(0.6);
3397        let r = _mm256_and_ps(a, b);
3398        let e = _mm256_set1_ps(0.5);
3399        assert_eq_m256(r, e);
3400    }
3401
3402    #[simd_test(enable = "avx")]
3403    const unsafe fn test_mm256_or_pd() {
3404        let a = _mm256_set1_pd(1.);
3405        let b = _mm256_set1_pd(0.6);
3406        let r = _mm256_or_pd(a, b);
3407        let e = _mm256_set1_pd(1.2);
3408        assert_eq_m256d(r, e);
3409    }
3410
3411    #[simd_test(enable = "avx")]
3412    const unsafe fn test_mm256_or_ps() {
3413        let a = _mm256_set1_ps(1.);
3414        let b = _mm256_set1_ps(0.6);
3415        let r = _mm256_or_ps(a, b);
3416        let e = _mm256_set1_ps(1.2);
3417        assert_eq_m256(r, e);
3418    }
3419
3420    #[simd_test(enable = "avx")]
3421    const unsafe fn test_mm256_shuffle_pd() {
3422        let a = _mm256_setr_pd(1., 4., 5., 8.);
3423        let b = _mm256_setr_pd(2., 3., 6., 7.);
3424        let r = _mm256_shuffle_pd::<0b11_11_11_11>(a, b);
3425        let e = _mm256_setr_pd(4., 3., 8., 7.);
3426        assert_eq_m256d(r, e);
3427    }
3428
3429    #[simd_test(enable = "avx")]
3430    const unsafe fn test_mm256_shuffle_ps() {
3431        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3432        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3433        let r = _mm256_shuffle_ps::<0b00_00_11_11>(a, b);
3434        let e = _mm256_setr_ps(8., 8., 2., 2., 16., 16., 10., 10.);
3435        assert_eq_m256(r, e);
3436    }
3437
3438    #[simd_test(enable = "avx")]
3439    const unsafe fn test_mm256_andnot_pd() {
3440        let a = _mm256_set1_pd(0.);
3441        let b = _mm256_set1_pd(0.6);
3442        let r = _mm256_andnot_pd(a, b);
3443        assert_eq_m256d(r, b);
3444    }
3445
3446    #[simd_test(enable = "avx")]
3447    const unsafe fn test_mm256_andnot_ps() {
3448        let a = _mm256_set1_ps(0.);
3449        let b = _mm256_set1_ps(0.6);
3450        let r = _mm256_andnot_ps(a, b);
3451        assert_eq_m256(r, b);
3452    }
3453
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_max_pd() {
        // Lane-wise maximum, plus the SDM's asymmetric edge cases: for
        // (+/-0, -/+0) and (NaN, x) pairs VMAXPD returns the *second*
        // operand, not the IEEE-754 maximum.
        let a = _mm256_setr_pd(1., 4., 5., 8.);
        let b = _mm256_setr_pd(2., 3., 6., 7.);
        let r = _mm256_max_pd(a, b);
        let e = _mm256_setr_pd(2., 4., 6., 8.);
        assert_eq_m256d(r, e);
        // > If the values being compared are both 0.0s (of either sign), the
        // > value in the second operand (source operand) is returned.
        let w = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
        let x = _mm256_max_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
        // Compare bit patterns so -0.0 and +0.0 are distinguished.
        let wu: [u64; 4] = transmute(w);
        let xu: [u64; 4] = transmute(x);
        assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]);
        assert_eq!(xu, [0u64; 4]);
        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
        // > second operand (source operand), either a NaN or a valid
        // > floating-point value, is written to the result.
        let y = _mm256_max_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
        let z = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
        let yf: [f64; 4] = transmute(y);
        let zf: [f64; 4] = transmute(z);
        assert_eq!(yf, [0.0; 4]);
        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
    }
3479
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_max_ps() {
        // Lane-wise maximum, plus the SDM's asymmetric edge cases: for
        // (+/-0, -/+0) and (NaN, x) pairs VMAXPS returns the *second*
        // operand, not the IEEE-754 maximum.
        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
        let r = _mm256_max_ps(a, b);
        let e = _mm256_setr_ps(2., 4., 6., 8., 10., 12., 14., 16.);
        assert_eq_m256(r, e);
        // > If the values being compared are both 0.0s (of either sign), the
        // > value in the second operand (source operand) is returned.
        let w = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
        let x = _mm256_max_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
        // Compare bit patterns so -0.0 and +0.0 are distinguished.
        let wu: [u32; 8] = transmute(w);
        let xu: [u32; 8] = transmute(x);
        assert_eq!(wu, [0x8000_0000u32; 8]);
        assert_eq!(xu, [0u32; 8]);
        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
        // > second operand (source operand), either a NaN or a valid
        // > floating-point value, is written to the result.
        let y = _mm256_max_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
        let z = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
        let yf: [f32; 8] = transmute(y);
        let zf: [f32; 8] = transmute(z);
        assert_eq!(yf, [0.0; 8]);
        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
    }
3505
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_min_pd() {
        // Lane-wise minimum, plus the SDM's asymmetric edge cases: for
        // (+/-0, -/+0) and (NaN, x) pairs VMINPD returns the *second*
        // operand, not the IEEE-754 minimum.
        let a = _mm256_setr_pd(1., 4., 5., 8.);
        let b = _mm256_setr_pd(2., 3., 6., 7.);
        let r = _mm256_min_pd(a, b);
        let e = _mm256_setr_pd(1., 3., 5., 7.);
        assert_eq_m256d(r, e);
        // > If the values being compared are both 0.0s (of either sign), the
        // > value in the second operand (source operand) is returned.
        let w = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
        let x = _mm256_min_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
        // Compare bit patterns so -0.0 and +0.0 are distinguished.
        let wu: [u64; 4] = transmute(w);
        let xu: [u64; 4] = transmute(x);
        assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]);
        assert_eq!(xu, [0u64; 4]);
        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
        // > second operand (source operand), either a NaN or a valid
        // > floating-point value, is written to the result.
        let y = _mm256_min_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
        let z = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
        let yf: [f64; 4] = transmute(y);
        let zf: [f64; 4] = transmute(z);
        assert_eq!(yf, [0.0; 4]);
        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
    }
3531
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_min_ps() {
        // Lane-wise minimum, plus the SDM's asymmetric edge cases: for
        // (+/-0, -/+0) and (NaN, x) pairs VMINPS returns the *second*
        // operand, not the IEEE-754 minimum.
        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
        let r = _mm256_min_ps(a, b);
        let e = _mm256_setr_ps(1., 3., 5., 7., 9., 11., 13., 15.);
        assert_eq_m256(r, e);
        // > If the values being compared are both 0.0s (of either sign), the
        // > value in the second operand (source operand) is returned.
        let w = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
        let x = _mm256_min_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
        // Compare bit patterns so -0.0 and +0.0 are distinguished.
        let wu: [u32; 8] = transmute(w);
        let xu: [u32; 8] = transmute(x);
        assert_eq!(wu, [0x8000_0000u32; 8]);
        assert_eq!(xu, [0u32; 8]);
        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
        // > second operand (source operand), either a NaN or a valid
        // > floating-point value, is written to the result.
        let y = _mm256_min_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
        let z = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
        let yf: [f32; 8] = transmute(y);
        let zf: [f32; 8] = transmute(z);
        assert_eq!(yf, [0.0; 8]);
        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
    }
3557
3558    #[simd_test(enable = "avx")]
3559    const unsafe fn test_mm256_mul_pd() {
3560        let a = _mm256_setr_pd(1., 2., 3., 4.);
3561        let b = _mm256_setr_pd(5., 6., 7., 8.);
3562        let r = _mm256_mul_pd(a, b);
3563        let e = _mm256_setr_pd(5., 12., 21., 32.);
3564        assert_eq_m256d(r, e);
3565    }
3566
3567    #[simd_test(enable = "avx")]
3568    const unsafe fn test_mm256_mul_ps() {
3569        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
3570        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
3571        let r = _mm256_mul_ps(a, b);
3572        let e = _mm256_setr_ps(9., 20., 33., 48., 65., 84., 105., 128.);
3573        assert_eq_m256(r, e);
3574    }
3575
3576    #[simd_test(enable = "avx")]
3577    const unsafe fn test_mm256_addsub_pd() {
3578        let a = _mm256_setr_pd(1., 2., 3., 4.);
3579        let b = _mm256_setr_pd(5., 6., 7., 8.);
3580        let r = _mm256_addsub_pd(a, b);
3581        let e = _mm256_setr_pd(-4., 8., -4., 12.);
3582        assert_eq_m256d(r, e);
3583    }
3584
3585    #[simd_test(enable = "avx")]
3586    const unsafe fn test_mm256_addsub_ps() {
3587        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3588        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3589        let r = _mm256_addsub_ps(a, b);
3590        let e = _mm256_setr_ps(-4., 8., -4., 12., -4., 8., -4., 12.);
3591        assert_eq_m256(r, e);
3592    }
3593
3594    #[simd_test(enable = "avx")]
3595    const unsafe fn test_mm256_sub_pd() {
3596        let a = _mm256_setr_pd(1., 2., 3., 4.);
3597        let b = _mm256_setr_pd(5., 6., 7., 8.);
3598        let r = _mm256_sub_pd(a, b);
3599        let e = _mm256_setr_pd(-4., -4., -4., -4.);
3600        assert_eq_m256d(r, e);
3601    }
3602
3603    #[simd_test(enable = "avx")]
3604    const unsafe fn test_mm256_sub_ps() {
3605        let a = _mm256_setr_ps(1., 2., 3., 4., -1., -2., -3., -4.);
3606        let b = _mm256_setr_ps(5., 6., 7., 8., 3., 2., 1., 0.);
3607        let r = _mm256_sub_ps(a, b);
3608        let e = _mm256_setr_ps(-4., -4., -4., -4., -4., -4., -4., -4.);
3609        assert_eq_m256(r, e);
3610    }
3611
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_round_pd() {
        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
        // The const generic is the ROUNDPD immediate's low bits:
        // 0b00 = round to nearest (even), 0b01 = toward -inf, 0b10 = toward +inf.
        let result_closest = _mm256_round_pd::<0b0000>(a);
        let result_down = _mm256_round_pd::<0b0001>(a);
        let result_up = _mm256_round_pd::<0b0010>(a);
        let expected_closest = _mm256_setr_pd(2., 2., 4., -1.);
        let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
        let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
        assert_eq_m256d(result_closest, expected_closest);
        assert_eq_m256d(result_down, expected_down);
        assert_eq_m256d(result_up, expected_up);
    }
3625
3626    #[simd_test(enable = "avx")]
3627    const unsafe fn test_mm256_floor_pd() {
3628        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3629        let result_down = _mm256_floor_pd(a);
3630        let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
3631        assert_eq_m256d(result_down, expected_down);
3632    }
3633
3634    #[simd_test(enable = "avx")]
3635    const unsafe fn test_mm256_ceil_pd() {
3636        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3637        let result_up = _mm256_ceil_pd(a);
3638        let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
3639        assert_eq_m256d(result_up, expected_up);
3640    }
3641
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_round_ps() {
        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
        // The const generic is the ROUNDPS immediate's low bits:
        // 0b00 = round to nearest (even), 0b01 = toward -inf, 0b10 = toward +inf.
        let result_closest = _mm256_round_ps::<0b0000>(a);
        let result_down = _mm256_round_ps::<0b0001>(a);
        let result_up = _mm256_round_ps::<0b0010>(a);
        let expected_closest = _mm256_setr_ps(2., 2., 4., -1., 2., 2., 4., -1.);
        let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
        let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
        assert_eq_m256(result_closest, expected_closest);
        assert_eq_m256(result_down, expected_down);
        assert_eq_m256(result_up, expected_up);
    }
3655
3656    #[simd_test(enable = "avx")]
3657    const unsafe fn test_mm256_floor_ps() {
3658        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3659        let result_down = _mm256_floor_ps(a);
3660        let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
3661        assert_eq_m256(result_down, expected_down);
3662    }
3663
3664    #[simd_test(enable = "avx")]
3665    const unsafe fn test_mm256_ceil_ps() {
3666        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3667        let result_up = _mm256_ceil_ps(a);
3668        let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
3669        assert_eq_m256(result_up, expected_up);
3670    }
3671
3672    #[simd_test(enable = "avx")]
3673    unsafe fn test_mm256_sqrt_pd() {
3674        let a = _mm256_setr_pd(4., 9., 16., 25.);
3675        let r = _mm256_sqrt_pd(a);
3676        let e = _mm256_setr_pd(2., 3., 4., 5.);
3677        assert_eq_m256d(r, e);
3678    }
3679
3680    #[simd_test(enable = "avx")]
3681    unsafe fn test_mm256_sqrt_ps() {
3682        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3683        let r = _mm256_sqrt_ps(a);
3684        let e = _mm256_setr_ps(2., 3., 4., 5., 2., 3., 4., 5.);
3685        assert_eq_m256(r, e);
3686    }
3687
3688    #[simd_test(enable = "avx")]
3689    const unsafe fn test_mm256_div_ps() {
3690        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3691        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3692        let r = _mm256_div_ps(a, b);
3693        let e = _mm256_setr_ps(1., 3., 8., 5., 0.5, 1., 0.25, 0.5);
3694        assert_eq_m256(r, e);
3695    }
3696
3697    #[simd_test(enable = "avx")]
3698    const unsafe fn test_mm256_div_pd() {
3699        let a = _mm256_setr_pd(4., 9., 16., 25.);
3700        let b = _mm256_setr_pd(4., 3., 2., 5.);
3701        let r = _mm256_div_pd(a, b);
3702        let e = _mm256_setr_pd(1., 3., 8., 5.);
3703        assert_eq_m256d(r, e);
3704    }
3705
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_blend_pd() {
        let a = _mm256_setr_pd(4., 9., 16., 25.);
        let b = _mm256_setr_pd(4., 3., 2., 5.);
        // Immediate bit i selects lane i from `b` (set) or `a` (clear).
        // 0x0: all lanes from `a`.
        let r = _mm256_blend_pd::<0x0>(a, b);
        assert_eq_m256d(r, _mm256_setr_pd(4., 9., 16., 25.));
        // 0x3: lanes 0-1 from `b`, lanes 2-3 from `a`.
        let r = _mm256_blend_pd::<0x3>(a, b);
        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 16., 25.));
        // 0xF: all lanes from `b`.
        let r = _mm256_blend_pd::<0xF>(a, b);
        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 5.));
    }
3717
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_blend_ps() {
        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
        // Immediate bit i selects lane i from `b` (set) or `a` (clear);
        // only 8 bits are used, one per f32 lane.
        let r = _mm256_blend_ps::<0x0>(a, b);
        assert_eq_m256(r, _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.));
        let r = _mm256_blend_ps::<0x3>(a, b);
        assert_eq_m256(r, _mm256_setr_ps(2., 3., 5., 8., 9., 12., 13., 16.));
        let r = _mm256_blend_ps::<0xF>(a, b);
        assert_eq_m256(r, _mm256_setr_ps(2., 3., 6., 7., 9., 12., 13., 16.));
    }
3729
3730    #[simd_test(enable = "avx")]
3731    const unsafe fn test_mm256_blendv_pd() {
3732        let a = _mm256_setr_pd(4., 9., 16., 25.);
3733        let b = _mm256_setr_pd(4., 3., 2., 5.);
3734        let c = _mm256_setr_pd(0., 0., !0 as f64, !0 as f64);
3735        let r = _mm256_blendv_pd(a, b, c);
3736        let e = _mm256_setr_pd(4., 9., 2., 5.);
3737        assert_eq_m256d(r, e);
3738    }
3739
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_blendv_ps() {
        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        // The mask's per-lane sign bit selects the source: a negative value
        // (`!0 as f32` == -1.0) picks `b`, 0.0 picks `a`.
        #[rustfmt::skip]
        let c = _mm256_setr_ps(
            0., 0., 0., 0., !0 as f32, !0 as f32, !0 as f32, !0 as f32,
        );
        let r = _mm256_blendv_ps(a, b, c);
        // Low half from `a`, high half from `b`.
        let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
        assert_eq_m256(r, e);
    }
3752
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_dp_ps() {
        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        // Imm 0xFF: multiply all four lanes per 128-bit half and broadcast the
        // sum to every lane of that half. The two halves are independent:
        // low  = 4*4 + 9*3 + 16*2 + 25*5  = 200
        // high = 4*8 + 9*9 + 16*64 + 25*50 = 2387
        let r = _mm256_dp_ps::<0xFF>(a, b);
        let e = _mm256_setr_ps(200., 200., 200., 200., 2387., 2387., 2387., 2387.);
        assert_eq_m256(r, e);
    }
3761
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_hadd_pd() {
        // vhaddpd interleaves horizontal sums per 128-bit half:
        // [a0+a1, b0+b1, a2+a3, b2+b3].
        let a = _mm256_setr_pd(4., 9., 16., 25.);
        let b = _mm256_setr_pd(4., 3., 2., 5.);
        let r = _mm256_hadd_pd(a, b);
        let e = _mm256_setr_pd(13., 7., 41., 7.);
        assert_eq_m256d(r, e);

        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_hadd_pd(a, b);
        let e = _mm256_setr_pd(3., 11., 7., 15.);
        assert_eq_m256d(r, e);
    }
3776
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_hadd_ps() {
        // vhaddps sums adjacent pairs per 128-bit half:
        // [a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7].
        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        let r = _mm256_hadd_ps(a, b);
        let e = _mm256_setr_ps(13., 41., 7., 7., 13., 41., 17., 114.);
        assert_eq_m256(r, e);

        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
        let r = _mm256_hadd_ps(a, b);
        let e = _mm256_setr_ps(3., 7., 11., 15., 3., 7., 11., 15.);
        assert_eq_m256(r, e);
    }
3791
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_hsub_pd() {
        // vhsubpd interleaves horizontal differences per 128-bit half:
        // [a0-a1, b0-b1, a2-a3, b2-b3].
        let a = _mm256_setr_pd(4., 9., 16., 25.);
        let b = _mm256_setr_pd(4., 3., 2., 5.);
        let r = _mm256_hsub_pd(a, b);
        let e = _mm256_setr_pd(-5., 1., -9., -3.);
        assert_eq_m256d(r, e);

        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_hsub_pd(a, b);
        let e = _mm256_setr_pd(-1., -1., -1., -1.);
        assert_eq_m256d(r, e);
    }
3806
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_hsub_ps() {
        // vhsubps subtracts adjacent pairs per 128-bit half:
        // [a0-a1, a2-a3, b0-b1, b2-b3, a4-a5, a6-a7, b4-b5, b6-b7].
        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        let r = _mm256_hsub_ps(a, b);
        let e = _mm256_setr_ps(-5., -9., 1., -3., -5., -9., -1., 14.);
        assert_eq_m256(r, e);

        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
        let r = _mm256_hsub_ps(a, b);
        let e = _mm256_setr_ps(-1., -1., -1., -1., -1., -1., -1., -1.);
        assert_eq_m256(r, e);
    }
3821
3822    #[simd_test(enable = "avx")]
3823    const unsafe fn test_mm256_xor_pd() {
3824        let a = _mm256_setr_pd(4., 9., 16., 25.);
3825        let b = _mm256_set1_pd(0.);
3826        let r = _mm256_xor_pd(a, b);
3827        assert_eq_m256d(r, a);
3828    }
3829
3830    #[simd_test(enable = "avx")]
3831    const unsafe fn test_mm256_xor_ps() {
3832        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3833        let b = _mm256_set1_ps(0.);
3834        let r = _mm256_xor_ps(a, b);
3835        assert_eq_m256(r, a);
3836    }
3837
    #[simd_test(enable = "avx")]
    unsafe fn test_mm_cmp_pd() {
        // 4 >= 4 and 9 >= 3 both hold, so each lane is all-ones,
        // which reinterpreted as f64 is a NaN bit pattern.
        let a = _mm_setr_pd(4., 9.);
        let b = _mm_setr_pd(4., 3.);
        let r = _mm_cmp_pd::<_CMP_GE_OS>(a, b);
        assert!(get_m128d(r, 0).is_nan());
        assert!(get_m128d(r, 1).is_nan());
    }
3846
3847    #[simd_test(enable = "avx")]
3848    unsafe fn test_mm256_cmp_pd() {
3849        let a = _mm256_setr_pd(1., 2., 3., 4.);
3850        let b = _mm256_setr_pd(5., 6., 7., 8.);
3851        let r = _mm256_cmp_pd::<_CMP_GE_OS>(a, b);
3852        let e = _mm256_set1_pd(0.);
3853        assert_eq_m256d(r, e);
3854    }
3855
    #[simd_test(enable = "avx")]
    unsafe fn test_mm_cmp_ps() {
        let a = _mm_setr_ps(4., 3., 2., 5.);
        let b = _mm_setr_ps(4., 9., 16., 25.);
        let r = _mm_cmp_ps::<_CMP_GE_OS>(a, b);
        // Only lane 0 satisfies a >= b (4 >= 4); a true lane is all-ones,
        // which reinterpreted as f32 is a NaN bit pattern.
        assert!(get_m128(r, 0).is_nan());
        assert_eq!(get_m128(r, 1), 0.);
        assert_eq!(get_m128(r, 2), 0.);
        assert_eq!(get_m128(r, 3), 0.);
    }
3866
3867    #[simd_test(enable = "avx")]
3868    unsafe fn test_mm256_cmp_ps() {
3869        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3870        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3871        let r = _mm256_cmp_ps::<_CMP_GE_OS>(a, b);
3872        let e = _mm256_set1_ps(0.);
3873        assert_eq_m256(r, e);
3874    }
3875
    #[simd_test(enable = "avx")]
    unsafe fn test_mm_cmp_sd() {
        let a = _mm_setr_pd(4., 9.);
        let b = _mm_setr_pd(4., 3.);
        let r = _mm_cmp_sd::<_CMP_GE_OS>(a, b);
        // Scalar compare: lane 0 holds the mask (4 >= 4 → all-ones, a NaN
        // bit pattern); the upper lane is passed through from `a` unchanged.
        assert!(get_m128d(r, 0).is_nan());
        assert_eq!(get_m128d(r, 1), 9.);
    }
3884
    #[simd_test(enable = "avx")]
    unsafe fn test_mm_cmp_ss() {
        let a = _mm_setr_ps(4., 3., 2., 5.);
        let b = _mm_setr_ps(4., 9., 16., 25.);
        let r = _mm_cmp_ss::<_CMP_GE_OS>(a, b);
        // Scalar compare: lane 0 holds the mask (4 >= 4 → all-ones, a NaN
        // bit pattern); lanes 1-3 are passed through from `a` unchanged.
        assert!(get_m128(r, 0).is_nan());
        assert_eq!(get_m128(r, 1), 3.);
        assert_eq!(get_m128(r, 2), 2.);
        assert_eq!(get_m128(r, 3), 5.);
    }
3895
3896    #[simd_test(enable = "avx")]
3897    const unsafe fn test_mm256_cvtepi32_pd() {
3898        let a = _mm_setr_epi32(4, 9, 16, 25);
3899        let r = _mm256_cvtepi32_pd(a);
3900        let e = _mm256_setr_pd(4., 9., 16., 25.);
3901        assert_eq_m256d(r, e);
3902    }
3903
3904    #[simd_test(enable = "avx")]
3905    const unsafe fn test_mm256_cvtepi32_ps() {
3906        let a = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3907        let r = _mm256_cvtepi32_ps(a);
3908        let e = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3909        assert_eq_m256(r, e);
3910    }
3911
3912    #[simd_test(enable = "avx")]
3913    const unsafe fn test_mm256_cvtpd_ps() {
3914        let a = _mm256_setr_pd(4., 9., 16., 25.);
3915        let r = _mm256_cvtpd_ps(a);
3916        let e = _mm_setr_ps(4., 9., 16., 25.);
3917        assert_eq_m128(r, e);
3918    }
3919
3920    #[simd_test(enable = "avx")]
3921    unsafe fn test_mm256_cvtps_epi32() {
3922        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3923        let r = _mm256_cvtps_epi32(a);
3924        let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3925        assert_eq_m256i(r, e);
3926    }
3927
3928    #[simd_test(enable = "avx")]
3929    const unsafe fn test_mm256_cvtps_pd() {
3930        let a = _mm_setr_ps(4., 9., 16., 25.);
3931        let r = _mm256_cvtps_pd(a);
3932        let e = _mm256_setr_pd(4., 9., 16., 25.);
3933        assert_eq_m256d(r, e);
3934    }
3935
3936    #[simd_test(enable = "avx")]
3937    const unsafe fn test_mm256_cvtsd_f64() {
3938        let a = _mm256_setr_pd(1., 2., 3., 4.);
3939        let r = _mm256_cvtsd_f64(a);
3940        assert_eq!(r, 1.);
3941    }
3942
3943    #[simd_test(enable = "avx")]
3944    unsafe fn test_mm256_cvttpd_epi32() {
3945        let a = _mm256_setr_pd(4., 9., 16., 25.);
3946        let r = _mm256_cvttpd_epi32(a);
3947        let e = _mm_setr_epi32(4, 9, 16, 25);
3948        assert_eq_m128i(r, e);
3949    }
3950
3951    #[simd_test(enable = "avx")]
3952    unsafe fn test_mm256_cvtpd_epi32() {
3953        let a = _mm256_setr_pd(4., 9., 16., 25.);
3954        let r = _mm256_cvtpd_epi32(a);
3955        let e = _mm_setr_epi32(4, 9, 16, 25);
3956        assert_eq_m128i(r, e);
3957    }
3958
3959    #[simd_test(enable = "avx")]
3960    unsafe fn test_mm256_cvttps_epi32() {
3961        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3962        let r = _mm256_cvttps_epi32(a);
3963        let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3964        assert_eq_m256i(r, e);
3965    }
3966
3967    #[simd_test(enable = "avx")]
3968    const unsafe fn test_mm256_extractf128_ps() {
3969        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3970        let r = _mm256_extractf128_ps::<0>(a);
3971        let e = _mm_setr_ps(4., 3., 2., 5.);
3972        assert_eq_m128(r, e);
3973    }
3974
3975    #[simd_test(enable = "avx")]
3976    const unsafe fn test_mm256_extractf128_pd() {
3977        let a = _mm256_setr_pd(4., 3., 2., 5.);
3978        let r = _mm256_extractf128_pd::<0>(a);
3979        let e = _mm_setr_pd(4., 3.);
3980        assert_eq_m128d(r, e);
3981    }
3982
3983    #[simd_test(enable = "avx")]
3984    const unsafe fn test_mm256_extractf128_si256() {
3985        let a = _mm256_setr_epi64x(4, 3, 2, 5);
3986        let r = _mm256_extractf128_si256::<0>(a);
3987        let e = _mm_setr_epi64x(4, 3);
3988        assert_eq_m128i(r, e);
3989    }
3990
3991    #[simd_test(enable = "avx")]
3992    const unsafe fn test_mm256_extract_epi32() {
3993        let a = _mm256_setr_epi32(-1, 1, 2, 3, 4, 5, 6, 7);
3994        let r1 = _mm256_extract_epi32::<0>(a);
3995        let r2 = _mm256_extract_epi32::<3>(a);
3996        assert_eq!(r1, -1);
3997        assert_eq!(r2, 3);
3998    }
3999
4000    #[simd_test(enable = "avx")]
4001    const unsafe fn test_mm256_cvtsi256_si32() {
4002        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4003        let r = _mm256_cvtsi256_si32(a);
4004        assert_eq!(r, 1);
4005    }
4006
    #[simd_test(enable = "avx")]
    #[cfg_attr(miri, ignore)] // Register-level operation not supported by Miri
    unsafe fn test_mm256_zeroall() {
        // Smoke test: vzeroall affects only register state, so there is no
        // observable value to assert — just verify the intrinsic executes.
        _mm256_zeroall();
    }
4012
    #[simd_test(enable = "avx")]
    #[cfg_attr(miri, ignore)] // Register-level operation not supported by Miri
    unsafe fn test_mm256_zeroupper() {
        // Smoke test: vzeroupper affects only register state, so there is no
        // observable value to assert — just verify the intrinsic executes.
        _mm256_zeroupper();
    }
4018
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_permutevar_ps() {
        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        // Only the low 2 bits of each selector are used, and selection stays
        // within the element's own 128-bit half (e.g. 4 & 0b11 == 0).
        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm256_permutevar_ps(a, b);
        let e = _mm256_setr_ps(3., 2., 5., 4., 9., 64., 50., 8.);
        assert_eq_m256(r, e);
    }
4027
    #[simd_test(enable = "avx")]
    unsafe fn test_mm_permutevar_ps() {
        let a = _mm_setr_ps(4., 3., 2., 5.);
        // Only the low 2 bits of each selector are used (4 & 0b11 == 0).
        let b = _mm_setr_epi32(1, 2, 3, 4);
        let r = _mm_permutevar_ps(a, b);
        let e = _mm_setr_ps(3., 2., 5., 4.);
        assert_eq_m128(r, e);
    }
4036
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_permute_ps() {
        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        // Imm 0x1b = 0b00_01_10_11 reverses the four lanes of each half.
        let r = _mm256_permute_ps::<0x1b>(a);
        let e = _mm256_setr_ps(5., 2., 3., 4., 50., 64., 9., 8.);
        assert_eq_m256(r, e);
    }
4044
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm_permute_ps() {
        let a = _mm_setr_ps(4., 3., 2., 5.);
        // Imm 0x1b = 0b00_01_10_11 reverses the four lanes.
        let r = _mm_permute_ps::<0x1b>(a);
        let e = _mm_setr_ps(5., 2., 3., 4.);
        assert_eq_m128(r, e);
    }
4052
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_permutevar_pd() {
        let a = _mm256_setr_pd(4., 3., 2., 5.);
        // For f64 lanes, bit 1 (not bit 0) of each selector chooses the
        // element within that 128-bit half: 1→0, 2→1, 3→1, 4→0.
        let b = _mm256_setr_epi64x(1, 2, 3, 4);
        let r = _mm256_permutevar_pd(a, b);
        let e = _mm256_setr_pd(4., 3., 5., 2.);
        assert_eq_m256d(r, e);
    }
4061
    #[simd_test(enable = "avx")]
    unsafe fn test_mm_permutevar_pd() {
        let a = _mm_setr_pd(4., 3.);
        // For f64 lanes, bit 1 of each selector picks the element: 3→1, 0→0.
        let b = _mm_setr_epi64x(3, 0);
        let r = _mm_permutevar_pd(a, b);
        let e = _mm_setr_pd(3., 4.);
        assert_eq_m128d(r, e);
    }
4070
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_permute_pd() {
        let a = _mm256_setr_pd(4., 3., 2., 5.);
        // Imm 5 = 0b0101: one selector bit per lane; swaps the pair in each half.
        let r = _mm256_permute_pd::<5>(a);
        let e = _mm256_setr_pd(3., 4., 5., 2.);
        assert_eq_m256d(r, e);
    }
4078
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm_permute_pd() {
        let a = _mm_setr_pd(4., 3.);
        // Imm 1 = 0b01: swaps the two f64 lanes.
        let r = _mm_permute_pd::<1>(a);
        let e = _mm_setr_pd(3., 4.);
        assert_eq_m128d(r, e);
    }
4086
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_permute2f128_ps() {
        let a = _mm256_setr_ps(11., 12., 13., 14., 15., 16., 17., 18.);
        let b = _mm256_setr_ps(21., 22., 23., 24., 25., 26., 27., 28.);
        // The imm's low nibble selects the result's low 128-bit half, the high
        // nibble its high half: 0=a.lo, 1=a.hi, 2=b.lo, 3=b.hi.
        // 0b0001_0011 → low half = b.hi, high half = a.hi.
        let r = _mm256_permute2f128_ps::<0b0001_0011>(a, b);
        let e = _mm256_setr_ps(25., 26., 27., 28., 15., 16., 17., 18.);
        assert_eq_m256(r, e);

        // Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field.
        let r = _mm256_permute2f128_ps::<0b1001_1011>(a, b);
        let z = _mm256_setr_ps(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);
        assert_eq_m256(r, z);
    }
4100
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_permute2f128_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        // The imm's low nibble selects the result's low 128-bit half, the high
        // nibble its high half: 0=a.lo, 1=a.hi, 2=b.lo, 3=b.hi.
        // 0b0011_0001 → low half = a.hi, high half = b.hi.
        let r = _mm256_permute2f128_pd::<0b0011_0001>(a, b);
        let e = _mm256_setr_pd(3., 4., 7., 8.);
        assert_eq_m256d(r, e);

        // Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field.
        let r = _mm256_permute2f128_pd::<0b1011_1001>(a, b);
        let e = _mm256_setr_pd(0.0, 0.0, 0.0, 0.0);
        assert_eq_m256d(r, e);
    }
4114
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_permute2f128_si256() {
        let a = _mm256_setr_epi32(11, 12, 13, 14, 15, 16, 17, 18);
        let b = _mm256_setr_epi32(21, 22, 23, 24, 25, 26, 27, 28);
        // The imm's low nibble selects the result's low 128-bit half, the high
        // nibble its high half: 0=a.lo, 1=a.hi, 2=b.lo, 3=b.hi.
        // 0b0010_0000 → low half = a.lo, high half = b.lo.
        let r = _mm256_permute2f128_si256::<0b0010_0000>(a, b);
        let e = _mm256_setr_epi32(11, 12, 13, 14, 21, 22, 23, 24);
        assert_eq_m256i(r, e);

        // Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field.
        let r = _mm256_permute2f128_si256::<0b1010_1000>(a, b);
        let e = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m256i(r, e);
    }
4128
4129    #[simd_test(enable = "avx")]
4130    const unsafe fn test_mm256_broadcast_ss() {
4131        let r = _mm256_broadcast_ss(&3.);
4132        let e = _mm256_set1_ps(3.);
4133        assert_eq_m256(r, e);
4134    }
4135
4136    #[simd_test(enable = "avx")]
4137    const unsafe fn test_mm_broadcast_ss() {
4138        let r = _mm_broadcast_ss(&3.);
4139        let e = _mm_set1_ps(3.);
4140        assert_eq_m128(r, e);
4141    }
4142
4143    #[simd_test(enable = "avx")]
4144    const unsafe fn test_mm256_broadcast_sd() {
4145        let r = _mm256_broadcast_sd(&3.);
4146        let e = _mm256_set1_pd(3.);
4147        assert_eq_m256d(r, e);
4148    }
4149
4150    #[simd_test(enable = "avx")]
4151    const unsafe fn test_mm256_broadcast_ps() {
4152        let a = _mm_setr_ps(4., 3., 2., 5.);
4153        let r = _mm256_broadcast_ps(&a);
4154        let e = _mm256_setr_ps(4., 3., 2., 5., 4., 3., 2., 5.);
4155        assert_eq_m256(r, e);
4156    }
4157
4158    #[simd_test(enable = "avx")]
4159    const unsafe fn test_mm256_broadcast_pd() {
4160        let a = _mm_setr_pd(4., 3.);
4161        let r = _mm256_broadcast_pd(&a);
4162        let e = _mm256_setr_pd(4., 3., 4., 3.);
4163        assert_eq_m256d(r, e);
4164    }
4165
4166    #[simd_test(enable = "avx")]
4167    const unsafe fn test_mm256_insertf128_ps() {
4168        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4169        let b = _mm_setr_ps(4., 9., 16., 25.);
4170        let r = _mm256_insertf128_ps::<0>(a, b);
4171        let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
4172        assert_eq_m256(r, e);
4173    }
4174
4175    #[simd_test(enable = "avx")]
4176    const unsafe fn test_mm256_insertf128_pd() {
4177        let a = _mm256_setr_pd(1., 2., 3., 4.);
4178        let b = _mm_setr_pd(5., 6.);
4179        let r = _mm256_insertf128_pd::<0>(a, b);
4180        let e = _mm256_setr_pd(5., 6., 3., 4.);
4181        assert_eq_m256d(r, e);
4182    }
4183
4184    #[simd_test(enable = "avx")]
4185    const unsafe fn test_mm256_insertf128_si256() {
4186        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4187        let b = _mm_setr_epi64x(5, 6);
4188        let r = _mm256_insertf128_si256::<0>(a, b);
4189        let e = _mm256_setr_epi64x(5, 6, 3, 4);
4190        assert_eq_m256i(r, e);
4191    }
4192
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_insert_epi8() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        // Overwrite the last (index 31) i8 lane with 0; all others unchanged.
        let r = _mm256_insert_epi8::<31>(a, 0);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 0,
        );
        assert_eq_m256i(r, e);
    }
4212
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_insert_epi16() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        // Overwrite the last (index 15) i16 lane with 0; all others unchanged.
        let r = _mm256_insert_epi16::<15>(a, 0);
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 0,
        );
        assert_eq_m256i(r, e);
    }
4228
4229    #[simd_test(enable = "avx")]
4230    const unsafe fn test_mm256_insert_epi32() {
4231        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4232        let r = _mm256_insert_epi32::<7>(a, 0);
4233        let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
4234        assert_eq_m256i(r, e);
4235    }
4236
4237    #[simd_test(enable = "avx")]
4238    const unsafe fn test_mm256_load_pd() {
4239        let a = _mm256_setr_pd(1., 2., 3., 4.);
4240        let p = ptr::addr_of!(a) as *const f64;
4241        let r = _mm256_load_pd(p);
4242        let e = _mm256_setr_pd(1., 2., 3., 4.);
4243        assert_eq_m256d(r, e);
4244    }
4245
4246    #[simd_test(enable = "avx")]
4247    const unsafe fn test_mm256_store_pd() {
4248        let a = _mm256_setr_pd(1., 2., 3., 4.);
4249        let mut r = _mm256_undefined_pd();
4250        _mm256_store_pd(ptr::addr_of_mut!(r) as *mut f64, a);
4251        assert_eq_m256d(r, a);
4252    }
4253
4254    #[simd_test(enable = "avx")]
4255    const unsafe fn test_mm256_load_ps() {
4256        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4257        let p = ptr::addr_of!(a) as *const f32;
4258        let r = _mm256_load_ps(p);
4259        let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4260        assert_eq_m256(r, e);
4261    }
4262
4263    #[simd_test(enable = "avx")]
4264    const unsafe fn test_mm256_store_ps() {
4265        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4266        let mut r = _mm256_undefined_ps();
4267        _mm256_store_ps(ptr::addr_of_mut!(r) as *mut f32, a);
4268        assert_eq_m256(r, a);
4269    }
4270
4271    #[simd_test(enable = "avx")]
4272    const unsafe fn test_mm256_loadu_pd() {
4273        let a = &[1.0f64, 2., 3., 4.];
4274        let p = a.as_ptr();
4275        let r = _mm256_loadu_pd(black_box(p));
4276        let e = _mm256_setr_pd(1., 2., 3., 4.);
4277        assert_eq_m256d(r, e);
4278    }
4279
4280    #[simd_test(enable = "avx")]
4281    const unsafe fn test_mm256_storeu_pd() {
4282        let a = _mm256_set1_pd(9.);
4283        let mut r = _mm256_undefined_pd();
4284        _mm256_storeu_pd(ptr::addr_of_mut!(r) as *mut f64, a);
4285        assert_eq_m256d(r, a);
4286    }
4287
4288    #[simd_test(enable = "avx")]
4289    const unsafe fn test_mm256_loadu_ps() {
4290        let a = &[4., 3., 2., 5., 8., 9., 64., 50.];
4291        let p = a.as_ptr();
4292        let r = _mm256_loadu_ps(black_box(p));
4293        let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4294        assert_eq_m256(r, e);
4295    }
4296
4297    #[simd_test(enable = "avx")]
4298    const unsafe fn test_mm256_storeu_ps() {
4299        let a = _mm256_set1_ps(9.);
4300        let mut r = _mm256_undefined_ps();
4301        _mm256_storeu_ps(ptr::addr_of_mut!(r) as *mut f32, a);
4302        assert_eq_m256(r, a);
4303    }
4304
4305    #[simd_test(enable = "avx")]
4306    const unsafe fn test_mm256_load_si256() {
4307        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4308        let p = ptr::addr_of!(a);
4309        let r = _mm256_load_si256(p);
4310        let e = _mm256_setr_epi64x(1, 2, 3, 4);
4311        assert_eq_m256i(r, e);
4312    }
4313
4314    #[simd_test(enable = "avx")]
4315    const unsafe fn test_mm256_store_si256() {
4316        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4317        let mut r = _mm256_undefined_si256();
4318        _mm256_store_si256(ptr::addr_of_mut!(r), a);
4319        assert_eq_m256i(r, a);
4320    }
4321
4322    #[simd_test(enable = "avx")]
4323    const unsafe fn test_mm256_loadu_si256() {
4324        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4325        let p = ptr::addr_of!(a);
4326        let r = _mm256_loadu_si256(black_box(p));
4327        let e = _mm256_setr_epi64x(1, 2, 3, 4);
4328        assert_eq_m256i(r, e);
4329    }
4330
4331    #[simd_test(enable = "avx")]
4332    const unsafe fn test_mm256_storeu_si256() {
4333        let a = _mm256_set1_epi8(9);
4334        let mut r = _mm256_undefined_si256();
4335        _mm256_storeu_si256(ptr::addr_of_mut!(r), a);
4336        assert_eq_m256i(r, a);
4337    }
4338
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_maskload_pd() {
        let a = &[1.0f64, 2., 3., 4.];
        let p = a.as_ptr();
        // The mask's per-lane high bit gates the load: set (!0) loads the
        // element, clear (0) yields 0.0 in that lane.
        let mask = _mm256_setr_epi64x(0, !0, 0, !0);
        let r = _mm256_maskload_pd(black_box(p), mask);
        let e = _mm256_setr_pd(0., 2., 0., 4.);
        assert_eq_m256d(r, e);
    }
4348
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_maskstore_pd() {
        let mut r = _mm256_set1_pd(0.);
        // The mask's per-lane high bit gates the store: set (!0) writes the
        // element, clear (0) leaves memory (initialized to 0.0) untouched.
        let mask = _mm256_setr_epi64x(0, !0, 0, !0);
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        _mm256_maskstore_pd(ptr::addr_of_mut!(r) as *mut f64, mask, a);
        let e = _mm256_setr_pd(0., 2., 0., 4.);
        assert_eq_m256d(r, e);
    }
4358
4359    #[simd_test(enable = "avx")]
4360    const unsafe fn test_mm_maskload_pd() {
4361        let a = &[1.0f64, 2.];
4362        let p = a.as_ptr();
4363        let mask = _mm_setr_epi64x(0, !0);
4364        let r = _mm_maskload_pd(black_box(p), mask);
4365        let e = _mm_setr_pd(0., 2.);
4366        assert_eq_m128d(r, e);
4367    }
4368
4369    #[simd_test(enable = "avx")]
4370    const unsafe fn test_mm_maskstore_pd() {
4371        let mut r = _mm_set1_pd(0.);
4372        let mask = _mm_setr_epi64x(0, !0);
4373        let a = _mm_setr_pd(1., 2.);
4374        _mm_maskstore_pd(ptr::addr_of_mut!(r) as *mut f64, mask, a);
4375        let e = _mm_setr_pd(0., 2.);
4376        assert_eq_m128d(r, e);
4377    }
4378
4379    #[simd_test(enable = "avx")]
4380    const unsafe fn test_mm256_maskload_ps() {
4381        let a = &[1.0f32, 2., 3., 4., 5., 6., 7., 8.];
4382        let p = a.as_ptr();
4383        let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
4384        let r = _mm256_maskload_ps(black_box(p), mask);
4385        let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
4386        assert_eq_m256(r, e);
4387    }
4388
4389    #[simd_test(enable = "avx")]
4390    const unsafe fn test_mm256_maskstore_ps() {
4391        let mut r = _mm256_set1_ps(0.);
4392        let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
4393        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4394        _mm256_maskstore_ps(ptr::addr_of_mut!(r) as *mut f32, mask, a);
4395        let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
4396        assert_eq_m256(r, e);
4397    }
4398
4399    #[simd_test(enable = "avx")]
4400    const unsafe fn test_mm_maskload_ps() {
4401        let a = &[1.0f32, 2., 3., 4.];
4402        let p = a.as_ptr();
4403        let mask = _mm_setr_epi32(0, !0, 0, !0);
4404        let r = _mm_maskload_ps(black_box(p), mask);
4405        let e = _mm_setr_ps(0., 2., 0., 4.);
4406        assert_eq_m128(r, e);
4407    }
4408
4409    #[simd_test(enable = "avx")]
4410    const unsafe fn test_mm_maskstore_ps() {
4411        let mut r = _mm_set1_ps(0.);
4412        let mask = _mm_setr_epi32(0, !0, 0, !0);
4413        let a = _mm_setr_ps(1., 2., 3., 4.);
4414        _mm_maskstore_ps(ptr::addr_of_mut!(r) as *mut f32, mask, a);
4415        let e = _mm_setr_ps(0., 2., 0., 4.);
4416        assert_eq_m128(r, e);
4417    }
4418
    // movehdup duplicates each odd-indexed f32 lane into the lane below it:
    // (1,2,3,4,...) -> (2,2,4,4,...).
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_movehdup_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_movehdup_ps(a);
        let e = _mm256_setr_ps(2., 2., 4., 4., 6., 6., 8., 8.);
        assert_eq_m256(r, e);
    }

    // moveldup is the mirror image: even-indexed lanes are duplicated upward.
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_moveldup_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_moveldup_ps(a);
        let e = _mm256_setr_ps(1., 1., 3., 3., 5., 5., 7., 7.);
        assert_eq_m256(r, e);
    }

    // movedup duplicates the even-indexed f64 lane of each 128-bit half.
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_movedup_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let r = _mm256_movedup_pd(a);
        let e = _mm256_setr_pd(1., 1., 3., 3.);
        assert_eq_m256d(r, e);
    }
4442
    // vlddqu (unaligned integer load tuned for cacheline splits) must return
    // exactly the bytes at the source; `black_box` keeps the load live.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_lddqu_si256() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let p = ptr::addr_of!(a);
        let r = _mm256_lddqu_si256(black_box(p));
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m256i(r, e);
    }
4463
    // Non-temporal (streaming) stores. Each test issues `_mm_sfence()` after
    // the store so the weakly-ordered write is globally visible before the
    // result is read back. Ignored under Miri, which does not model
    // non-temporal stores.
    #[simd_test(enable = "avx")]
    #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
    unsafe fn test_mm256_stream_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let mut r = _mm256_undefined_si256();
        _mm256_stream_si256(ptr::addr_of_mut!(r), a);
        _mm_sfence();
        assert_eq_m256i(r, a);
    }

    #[simd_test(enable = "avx")]
    #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
    unsafe fn test_mm256_stream_pd() {
        // vmovntpd requires a 32-byte-aligned destination, hence the wrapper.
        #[repr(align(32))]
        struct Memory {
            pub data: [f64; 4],
        }
        let a = _mm256_set1_pd(7.0);
        let mut mem = Memory { data: [-1.0; 4] };

        _mm256_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
        _mm_sfence();
        for i in 0..4 {
            assert_eq!(mem.data[i], get_m256d(a, i));
        }
    }

    #[simd_test(enable = "avx")]
    #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
    unsafe fn test_mm256_stream_ps() {
        // vmovntps likewise requires 32-byte alignment.
        #[repr(align(32))]
        struct Memory {
            pub data: [f32; 8],
        }
        let a = _mm256_set1_ps(7.0);
        let mut mem = Memory { data: [-1.0; 8] };

        _mm256_stream_ps(ptr::addr_of_mut!(mem.data[0]), a);
        _mm_sfence();
        for i in 0..8 {
            assert_eq!(mem.data[i], get_m256(a, i));
        }
    }
4507
    // vrcpps computes an *approximate* reciprocal, so the result is compared
    // against reference values with a 2 * 2^-11 relative tolerance rather
    // than exactly.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_rcp_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_rcp_ps(a);
        #[rustfmt::skip]
        let e = _mm256_setr_ps(
            0.99975586, 0.49987793, 0.33325195, 0.24993896,
            0.19995117, 0.16662598, 0.14282227, 0.12496948,
        );
        let rel_err = 0.00048828125; // 2^-11
        for i in 0..8 {
            assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
        }
    }

    // vrsqrtps: approximate reciprocal square root, same tolerance scheme.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_rsqrt_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_rsqrt_ps(a);
        #[rustfmt::skip]
        let e = _mm256_setr_ps(
            0.99975586, 0.7069092, 0.5772705, 0.49987793,
            0.44714355, 0.40820313, 0.3779297, 0.3534546,
        );
        let rel_err = 0.00048828125; // 2^-11
        for i in 0..8 {
            assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
        }
    }
4537
    // Unpack tests: the 256-bit unpack instructions interleave within each
    // 128-bit half independently, which is why the expected vectors are not a
    // simple full-width interleave.
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_unpackhi_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_unpackhi_pd(a, b);
        let e = _mm256_setr_pd(2., 6., 4., 8.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_unpackhi_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
        let r = _mm256_unpackhi_ps(a, b);
        let e = _mm256_setr_ps(3., 11., 4., 12., 7., 15., 8., 16.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_unpacklo_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_unpacklo_pd(a, b);
        let e = _mm256_setr_pd(1., 5., 3., 7.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_unpacklo_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
        let r = _mm256_unpacklo_ps(a, b);
        let e = _mm256_setr_ps(1., 9., 2., 10., 5., 13., 6., 14.);
        assert_eq_m256(r, e);
    }
4573
    // vptest / vtestpd / vtestps family. Each test exercises both outcomes:
    //   testz  -> 1 iff (a AND b) has no set bits (for the pd/ps variants,
    //             only the sign bits participate — note that 1.0 & 5.0 share
    //             mantissa/exponent bits yet testz still reports 1);
    //   testc  -> 1 iff ((NOT a) AND b) has no set bits;
    //   testnzc -> 1 iff both of the above are 0.
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_testz_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let b = _mm256_setr_epi64x(5, 6, 7, 8);
        let r = _mm256_testz_si256(a, b);
        assert_eq!(r, 0);
        let b = _mm256_set1_epi64x(0);
        let r = _mm256_testz_si256(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_testc_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let b = _mm256_setr_epi64x(5, 6, 7, 8);
        let r = _mm256_testc_si256(a, b);
        assert_eq!(r, 0);
        let b = _mm256_set1_epi64x(0);
        let r = _mm256_testc_si256(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_testnzc_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let b = _mm256_setr_epi64x(5, 6, 7, 8);
        let r = _mm256_testnzc_si256(a, b);
        assert_eq!(r, 1);
        // All-zero inputs make both ZF and CF set, so testnzc is 0.
        let a = _mm256_setr_epi64x(0, 0, 0, 0);
        let b = _mm256_setr_epi64x(0, 0, 0, 0);
        let r = _mm256_testnzc_si256(a, b);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_testz_pd() {
        // All-positive inputs: no sign bits set anywhere -> 1.
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_testz_pd(a, b);
        assert_eq!(r, 1);
        // Negative in both operands: sign bits overlap -> 0.
        let a = _mm256_set1_pd(-1.);
        let r = _mm256_testz_pd(a, a);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_testc_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_testc_pd(a, b);
        assert_eq!(r, 1);
        // b has sign bits where a does not -> carry condition fails.
        let a = _mm256_set1_pd(1.);
        let b = _mm256_set1_pd(-1.);
        let r = _mm256_testc_pd(a, b);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_testnzc_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_testnzc_pd(a, b);
        assert_eq!(r, 0);
        // Mixed signs: some sign bits overlap and some appear only in b -> 1.
        let a = _mm256_setr_pd(1., -1., -1., -1.);
        let b = _mm256_setr_pd(-1., -1., 1., 1.);
        let r = _mm256_testnzc_pd(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm_testz_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 6.);
        let r = _mm_testz_pd(a, b);
        assert_eq!(r, 1);
        let a = _mm_set1_pd(-1.);
        let r = _mm_testz_pd(a, a);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm_testc_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 6.);
        let r = _mm_testc_pd(a, b);
        assert_eq!(r, 1);
        let a = _mm_set1_pd(1.);
        let b = _mm_set1_pd(-1.);
        let r = _mm_testc_pd(a, b);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_testnzc_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 6.);
        let r = _mm_testnzc_pd(a, b);
        assert_eq!(r, 0);
        let a = _mm_setr_pd(1., -1.);
        let b = _mm_setr_pd(-1., -1.);
        let r = _mm_testnzc_pd(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_testz_ps() {
        let a = _mm256_set1_ps(1.);
        let r = _mm256_testz_ps(a, a);
        assert_eq!(r, 1);
        let a = _mm256_set1_ps(-1.);
        let r = _mm256_testz_ps(a, a);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_testc_ps() {
        let a = _mm256_set1_ps(1.);
        let r = _mm256_testc_ps(a, a);
        assert_eq!(r, 1);
        let b = _mm256_set1_ps(-1.);
        let r = _mm256_testc_ps(a, b);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_testnzc_ps() {
        let a = _mm256_set1_ps(1.);
        let r = _mm256_testnzc_ps(a, a);
        assert_eq!(r, 0);
        let a = _mm256_setr_ps(1., -1., -1., -1., -1., -1., -1., -1.);
        let b = _mm256_setr_ps(-1., -1., 1., 1., 1., 1., 1., 1.);
        let r = _mm256_testnzc_ps(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm_testz_ps() {
        let a = _mm_set1_ps(1.);
        let r = _mm_testz_ps(a, a);
        assert_eq!(r, 1);
        let a = _mm_set1_ps(-1.);
        let r = _mm_testz_ps(a, a);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm_testc_ps() {
        let a = _mm_set1_ps(1.);
        let r = _mm_testc_ps(a, a);
        assert_eq!(r, 1);
        let b = _mm_set1_ps(-1.);
        let r = _mm_testc_ps(a, b);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_testnzc_ps() {
        let a = _mm_set1_ps(1.);
        let r = _mm_testnzc_ps(a, a);
        assert_eq!(r, 0);
        let a = _mm_setr_ps(1., -1., -1., -1.);
        let b = _mm_setr_ps(-1., -1., 1., 1.);
        let r = _mm_testnzc_ps(a, b);
        assert_eq!(r, 1);
    }
4739
    // movemask gathers the sign bit of each lane into an integer: signs
    // (+,-,+,-) give bits 1 and 3 -> 0b1010 = 0xA.
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_movemask_pd() {
        let a = _mm256_setr_pd(1., -2., 3., -4.);
        let r = _mm256_movemask_pd(a);
        assert_eq!(r, 0xA);
    }

    // Same pattern over 8 f32 lanes: alternating signs -> 0b1010_1010 = 0xAA.
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_movemask_ps() {
        let a = _mm256_setr_ps(1., -2., 3., -4., 1., -2., 3., -4.);
        let r = _mm256_movemask_ps(a);
        assert_eq!(r, 0xAA);
    }
4753
    // Constructor smoke tests. `setzero_*` must equal an all-zero splat.
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_setzero_pd() {
        let r = _mm256_setzero_pd();
        assert_eq_m256d(r, _mm256_set1_pd(0.));
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_setzero_ps() {
        let r = _mm256_setzero_ps();
        assert_eq_m256(r, _mm256_set1_ps(0.));
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_setzero_si256() {
        let r = _mm256_setzero_si256();
        assert_eq_m256i(r, _mm256_set1_epi8(0));
    }

    // `set_*` takes arguments highest-lane-first, so it must equal `setr_*`
    // (lowest-lane-first) called with the arguments reversed.
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_set_pd() {
        let r = _mm256_set_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 1.));
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_set_ps() {
        let r = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, _mm256_setr_ps(8., 7., 6., 5., 4., 3., 2., 1.));
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_set_epi8() {
        #[rustfmt::skip]
        let r = _mm256_set_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            32, 31, 30, 29, 28, 27, 26, 25,
            24, 23, 22, 21, 20, 19, 18, 17,
            16, 15, 14, 13, 12, 11, 10, 9,
            8, 7, 6, 5, 4, 3, 2, 1
        );
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_set_epi16() {
        #[rustfmt::skip]
        let r = _mm256_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            16, 15, 14, 13, 12, 11, 10, 9, 8,
            7, 6, 5, 4, 3, 2, 1,
        );
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_set_epi32() {
        let r = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m256i(r, _mm256_setr_epi32(8, 7, 6, 5, 4, 3, 2, 1));
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_set_epi64x() {
        let r = _mm256_set_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, _mm256_setr_epi64x(4, 3, 2, 1));
    }

    // `setr_*` self-consistency smoke tests.
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_setr_pd() {
        let r = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, _mm256_setr_pd(1., 2., 3., 4.));
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_setr_ps() {
        let r = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.));
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_setr_epi8() {
        #[rustfmt::skip]
        let r = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32
        );

        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_setr_epi16() {
        #[rustfmt::skip]
        let r = _mm256_setr_epi16(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
        );
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_setr_epi32() {
        let r = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m256i(r, _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8));
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_setr_epi64x() {
        let r = _mm256_setr_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, _mm256_setr_epi64x(1, 2, 3, 4));
    }

    // `set1_*` splat smoke tests.
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_set1_pd() {
        let r = _mm256_set1_pd(1.);
        assert_eq_m256d(r, _mm256_set1_pd(1.));
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_set1_ps() {
        let r = _mm256_set1_ps(1.);
        assert_eq_m256(r, _mm256_set1_ps(1.));
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_set1_epi8() {
        let r = _mm256_set1_epi8(1);
        assert_eq_m256i(r, _mm256_set1_epi8(1));
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_set1_epi16() {
        let r = _mm256_set1_epi16(1);
        assert_eq_m256i(r, _mm256_set1_epi16(1));
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_set1_epi32() {
        let r = _mm256_set1_epi32(1);
        assert_eq_m256i(r, _mm256_set1_epi32(1));
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_set1_epi64x() {
        let r = _mm256_set1_epi64x(1);
        assert_eq_m256i(r, _mm256_set1_epi64x(1));
    }
4924
    // Bit-reinterpreting casts (no value conversion). E.g. 1.0f64 is
    // 0x3FF0_0000_0000_0000; read as two f32s that is 0.0 (low half) and
    // 1.875 (0x3FF00000, high half) — the expected vectors below spell out
    // those exact bit patterns.
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_castpd_ps() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let r = _mm256_castpd_ps(a);
        let e = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
        assert_eq_m256(r, e);
    }

    // Inverse direction of the cast above.
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_castps_pd() {
        let a = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
        let r = _mm256_castps_pd(a);
        let e = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, e);
    }

    // f32 -> integer reinterpret: the epi8 constants are the little-endian
    // bytes of 1.0f32 .. 8.0f32.
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_castps_si256() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_castps_si256(a);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            0, 0, -128, 63, 0, 0, 0, 64,
            0, 0, 64, 64, 0, 0, -128, 64,
            0, 0, -96, 64, 0, 0, -64, 64,
            0, 0, -32, 64, 0, 0, 0, 65,
        );
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_castsi256_ps() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            0, 0, -128, 63, 0, 0, 0, 64,
            0, 0, 64, 64, 0, 0, -128, 64,
            0, 0, -96, 64, 0, 0, -64, 64,
            0, 0, -32, 64, 0, 0, 0, 65,
        );
        let r = _mm256_castsi256_ps(a);
        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, e);
    }

    // pd <-> si256 casts are checked against `transmute`, the definition of a
    // pure bit reinterpretation.
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_castpd_si256() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let r = _mm256_castpd_si256(a);
        assert_eq_m256d(transmute(r), a);
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_castsi256_pd() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let r = _mm256_castsi256_pd(a);
        assert_eq_m256d(r, transmute(a));
    }
4982
    // Width-changing casts. Narrowing (256 -> 128) keeps the low half;
    // widening (128 -> 256) preserves the low half and is checked by casting
    // back down (the upper half is unspecified, so it is not asserted).
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_castps256_ps128() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_castps256_ps128(a);
        assert_eq_m128(r, _mm_setr_ps(1., 2., 3., 4.));
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_castpd256_pd128() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let r = _mm256_castpd256_pd128(a);
        assert_eq_m128d(r, _mm_setr_pd(1., 2.));
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_castsi256_si128() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let r = _mm256_castsi256_si128(a);
        assert_eq_m128i(r, _mm_setr_epi64x(1, 2));
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_castps128_ps256() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let r = _mm256_castps128_ps256(a);
        assert_eq_m128(_mm256_castps256_ps128(r), a);
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_castpd128_pd256() {
        let a = _mm_setr_pd(1., 2.);
        let r = _mm256_castpd128_pd256(a);
        assert_eq_m128d(_mm256_castpd256_pd128(r), a);
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_castsi128_si256() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let r = _mm256_castsi128_si256(a);
        assert_eq_m128i(_mm256_castsi256_si128(r), a);
    }
5024
    // Unlike the plain 128->256 casts above, the `zext` variants guarantee a
    // zeroed upper half, so the full 256-bit result can be asserted.
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_zextps128_ps256() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let r = _mm256_zextps128_ps256(a);
        let e = _mm256_setr_ps(1., 2., 3., 4., 0., 0., 0., 0.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_zextsi128_si256() {
        let a = _mm_setr_epi64x(1, 2);
        let r = _mm256_zextsi128_si256(a);
        let e = _mm256_setr_epi64x(1, 2, 0, 0);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_zextpd128_pd256() {
        let a = _mm_setr_pd(1., 2.);
        let r = _mm256_zextpd128_pd256(a);
        let e = _mm256_setr_pd(1., 2., 0., 0.);
        assert_eq_m256d(r, e);
    }
5048
    // Combining two 128-bit halves into a 256-bit vector: `set_m128*` takes
    // (hi, lo) while `setr_m128*` takes (lo, hi); both must place `lo` in the
    // low 128 bits.
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_set_m128() {
        let hi = _mm_setr_ps(5., 6., 7., 8.);
        let lo = _mm_setr_ps(1., 2., 3., 4.);
        let r = _mm256_set_m128(hi, lo);
        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_set_m128d() {
        let hi = _mm_setr_pd(3., 4.);
        let lo = _mm_setr_pd(1., 2.);
        let r = _mm256_set_m128d(hi, lo);
        let e = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_set_m128i() {
        #[rustfmt::skip]
        let hi = _mm_setr_epi8(
            17, 18, 19, 20,
            21, 22, 23, 24,
            25, 26, 27, 28,
            29, 30, 31, 32,
        );
        #[rustfmt::skip]
        let lo = _mm_setr_epi8(
            1, 2, 3, 4,
            5, 6, 7, 8,
            9, 10, 11, 12,
            13, 14, 15, 16,
        );
        let r = _mm256_set_m128i(hi, lo);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_setr_m128() {
        let lo = _mm_setr_ps(1., 2., 3., 4.);
        let hi = _mm_setr_ps(5., 6., 7., 8.);
        let r = _mm256_setr_m128(lo, hi);
        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_setr_m128d() {
        let lo = _mm_setr_pd(1., 2.);
        let hi = _mm_setr_pd(3., 4.);
        let r = _mm256_setr_m128d(lo, hi);
        let e = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_setr_m128i() {
        #[rustfmt::skip]
        let lo = _mm_setr_epi8(
            1, 2, 3, 4,
            5, 6, 7, 8,
            9, 10, 11, 12,
            13, 14, 15, 16,
        );
        #[rustfmt::skip]
        let hi = _mm_setr_epi8(
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm256_setr_m128i(lo, hi);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m256i(r, e);
    }
5136
    // loadu2: two independent unaligned 128-bit loads combined into one
    // 256-bit value; the first pointer argument supplies the HIGH half.
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_loadu2_m128() {
        let hi = &[5., 6., 7., 8.];
        let hiaddr = hi.as_ptr();
        let lo = &[1., 2., 3., 4.];
        let loaddr = lo.as_ptr();
        let r = _mm256_loadu2_m128(hiaddr, loaddr);
        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_loadu2_m128d() {
        let hi = &[3., 4.];
        let hiaddr = hi.as_ptr();
        let lo = &[1., 2.];
        let loaddr = lo.as_ptr();
        let r = _mm256_loadu2_m128d(hiaddr, loaddr);
        let e = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_loadu2_m128i() {
        #[rustfmt::skip]
        let hi = _mm_setr_epi8(
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        #[rustfmt::skip]
        let lo = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
        );
        let r = _mm256_loadu2_m128i(ptr::addr_of!(hi) as *const _, ptr::addr_of!(lo) as *const _);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m256i(r, e);
    }
5181
    // storeu2: the mirror of loadu2 — splits a 256-bit value into two
    // unaligned 128-bit stores; the first pointer receives the HIGH half.
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_storeu2_m128() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let mut hi = _mm_undefined_ps();
        let mut lo = _mm_undefined_ps();
        _mm256_storeu2_m128(
            ptr::addr_of_mut!(hi) as *mut f32,
            ptr::addr_of_mut!(lo) as *mut f32,
            a,
        );
        assert_eq_m128(hi, _mm_setr_ps(5., 6., 7., 8.));
        assert_eq_m128(lo, _mm_setr_ps(1., 2., 3., 4.));
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_storeu2_m128d() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let mut hi = _mm_undefined_pd();
        let mut lo = _mm_undefined_pd();
        _mm256_storeu2_m128d(
            ptr::addr_of_mut!(hi) as *mut f64,
            ptr::addr_of_mut!(lo) as *mut f64,
            a,
        );
        assert_eq_m128d(hi, _mm_setr_pd(3., 4.));
        assert_eq_m128d(lo, _mm_setr_pd(1., 2.));
    }

    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_storeu2_m128i() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let mut hi = _mm_undefined_si128();
        let mut lo = _mm_undefined_si128();
        _mm256_storeu2_m128i(ptr::addr_of_mut!(hi), ptr::addr_of_mut!(lo), a);
        #[rustfmt::skip]
        let e_hi = _mm_setr_epi8(
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32
        );
        #[rustfmt::skip]
        let e_lo = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16
        );

        assert_eq_m128i(hi, e_hi);
        assert_eq_m128i(lo, e_lo);
    }
5236
    // cvtss_f32 extracts the lowest f32 lane as a scalar.
    #[simd_test(enable = "avx")]
    const unsafe fn test_mm256_cvtss_f32() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_cvtss_f32(a);
        assert_eq!(r, 1.);
    }
5243}