core/stdarch/crates/core_arch/src/x86/avx2.rs

//! Advanced Vector Extensions 2 (AVX2)
//!
//! AVX2 expands most AVX commands to 256-bit wide vector registers and
//! adds [FMA](https://en.wikipedia.org/wiki/Fused_multiply-accumulate).
//!
//! The references are:
//!
//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
//!   Instruction Set Reference, A-Z][intel64_ref].
//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
//!   System Instructions][amd64_ref].
//!
//! Wikipedia's [AVX][wiki_avx] and [FMA][wiki_fma] pages provide a quick
//! overview of the instructions available.
//!
//! [intel64_ref]: https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
//! [amd64_ref]: https://docs.amd.com/v/u/en-US/24594_3.37
//! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate

use crate::core_arch::{simd::*, x86::*};
use crate::intrinsics::simd::*;

#[cfg(test)]
use stdarch_test::assert_instr;

/// Computes the absolute values of packed 32-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpabsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_abs_epi32(a: __m256i) -> __m256i {
    unsafe {
        let a = a.as_i32x8();
        let r = simd_select::<m32x8, _>(simd_lt(a, i32x8::ZERO), simd_neg(a), a);
        transmute(r)
    }
}

/// Computes the absolute values of packed 16-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpabsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_abs_epi16(a: __m256i) -> __m256i {
    unsafe {
        let a = a.as_i16x16();
        let r = simd_select::<m16x16, _>(simd_lt(a, i16x16::ZERO), simd_neg(a), a);
        transmute(r)
    }
}

/// Computes the absolute values of packed 8-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpabsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_abs_epi8(a: __m256i) -> __m256i {
    unsafe {
        let a = a.as_i8x32();
        let r = simd_select::<m8x32, _>(simd_lt(a, i8x32::ZERO), simd_neg(a), a);
        transmute(r)
    }
}
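
// Editorial usage sketch (not part of the upstream file): the `abs`
// intrinsics above are lane-wise and, like the hardware instructions, wrap on
// the minimum value. This assumes the `simd_test` macro and `assert_eq_m256i`
// helper used elsewhere in this crate's test suite; `simd_test` skips the
// test unless AVX2 is detected at runtime.
#[cfg(test)]
mod abs_usage_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx2")]
    unsafe fn abs_wraps_on_min() {
        let v = _mm256_setr_epi32(-1, 2, -3, 4, -5, 6, -7, i32::MIN);
        let r = _mm256_abs_epi32(v);
        // `abs(i32::MIN)` cannot be represented and wraps back to `i32::MIN`.
        let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, i32::MIN);
        assert_eq_m256i(r, e);
    }
}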

/// Adds packed 64-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_add(a.as_i64x4(), b.as_i64x4())) }
}

/// Adds packed 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_add(a.as_i32x8(), b.as_i32x8())) }
}

/// Adds packed 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_add(a.as_i16x16(), b.as_i16x16())) }
}

/// Adds packed 8-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_add(a.as_i8x32(), b.as_i8x32())) }
}

/// Adds packed 8-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_saturating_add(a.as_i8x32(), b.as_i8x32())) }
}

/// Adds packed 16-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_saturating_add(a.as_i16x16(), b.as_i16x16())) }
}

/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddusb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_saturating_add(a.as_u8x32(), b.as_u8x32())) }
}

/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddusw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_saturating_add(a.as_u16x16(), b.as_u16x16())) }
}
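
// Editorial usage sketch (not part of the upstream file): contrasts wrapping
// `_mm256_add_epi8` with saturating `_mm256_adds_epi8` on the same inputs,
// using the crate's `simd_test`/`assert_eq_m256i` test conventions.
#[cfg(test)]
mod adds_usage_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx2")]
    unsafe fn wrapping_vs_saturating() {
        let a = _mm256_set1_epi8(120);
        let b = _mm256_set1_epi8(20);
        // 120 + 20 wraps to -116 in every lane...
        assert_eq_m256i(_mm256_add_epi8(a, b), _mm256_set1_epi8(-116));
        // ...but clamps to i8::MAX with the saturating variant.
        assert_eq_m256i(_mm256_adds_epi8(a, b), _mm256_set1_epi8(127));
    }
}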

/// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary
/// result, shifts the result right by `IMM8` bytes, and returns the low 16 bytes.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 7))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);

    // If palignr is shifting the pair of vectors more than the size of two
    // lanes, emit zero.
    if IMM8 >= 32 {
        return _mm256_setzero_si256();
    }
    // If palignr is shifting the pair of input vectors more than one lane,
    // but less than two lanes, convert to shifting in zeroes.
    let (a, b) = if IMM8 > 16 {
        (_mm256_setzero_si256(), a)
    } else {
        (a, b)
    };
    unsafe {
        if IMM8 == 16 {
            return transmute(a);
        }
    }
    const fn mask(shift: u32, i: u32) -> u32 {
        let shift = shift % 16;
        let mod_i = i % 16;
        if mod_i < (16 - shift) {
            i + shift
        } else {
            i + 16 + shift
        }
    }

    unsafe {
        let r: i8x32 = simd_shuffle!(
            b.as_i8x32(),
            a.as_i8x32(),
            [
                mask(IMM8 as u32, 0),
                mask(IMM8 as u32, 1),
                mask(IMM8 as u32, 2),
                mask(IMM8 as u32, 3),
                mask(IMM8 as u32, 4),
                mask(IMM8 as u32, 5),
                mask(IMM8 as u32, 6),
                mask(IMM8 as u32, 7),
                mask(IMM8 as u32, 8),
                mask(IMM8 as u32, 9),
                mask(IMM8 as u32, 10),
                mask(IMM8 as u32, 11),
                mask(IMM8 as u32, 12),
                mask(IMM8 as u32, 13),
                mask(IMM8 as u32, 14),
                mask(IMM8 as u32, 15),
                mask(IMM8 as u32, 16),
                mask(IMM8 as u32, 17),
                mask(IMM8 as u32, 18),
                mask(IMM8 as u32, 19),
                mask(IMM8 as u32, 20),
                mask(IMM8 as u32, 21),
                mask(IMM8 as u32, 22),
                mask(IMM8 as u32, 23),
                mask(IMM8 as u32, 24),
                mask(IMM8 as u32, 25),
                mask(IMM8 as u32, 26),
                mask(IMM8 as u32, 27),
                mask(IMM8 as u32, 28),
                mask(IMM8 as u32, 29),
                mask(IMM8 as u32, 30),
                mask(IMM8 as u32, 31),
            ],
        );
        transmute(r)
    }
}
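
// Editorial usage sketch (not part of the upstream file): `_mm256_alignr_epi8`
// works on each 128-bit lane independently, so with `IMM8 = 4` each lane of
// the result starts at byte 4 of the corresponding lane of `b` and is topped
// up from the matching lane of `a`. Uses the crate's test conventions.
#[cfg(test)]
mod alignr_usage_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx2")]
    unsafe fn alignr_by_four() {
        #[rustfmt::skip]
        let b = _mm256_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        );
        let a = _mm256_set1_epi8(-1);
        let r = _mm256_alignr_epi8::<4>(a, b);
        // Low lane: bytes 4..=15 of `b`, then 4 bytes from `a` (all -1);
        // high lane: bytes 20..=31 of `b`, then 4 bytes from `a`.
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1,
            20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, -1, -1, -1, -1,
        );
        assert_eq_m256i(r, e);
    }
}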

/// Computes the bitwise AND of 256 bits (representing integer data)
/// in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vandps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_and(a.as_i64x4(), b.as_i64x4())) }
}

/// Computes the bitwise NOT of 256 bits (representing integer data)
/// in `a` and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vandnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        let all_ones = _mm256_set1_epi8(-1);
        transmute(simd_and(
            simd_xor(a.as_i64x4(), all_ones.as_i64x4()),
            b.as_i64x4(),
        ))
    }
}
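
// Editorial usage sketch (not part of the upstream file): `andnot` computes
// `(!a) & b`, which is handy for clearing the bits of `b` that are set in a
// mask `a`. Uses the crate's test conventions.
#[cfg(test)]
mod andnot_usage_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx2")]
    unsafe fn andnot_clears_masked_bits() {
        let mask = _mm256_set1_epi32(0x0000_00ff);
        let data = _mm256_set1_epi32(0x1234_5678);
        // Clear the low byte of every 32-bit lane.
        let r = _mm256_andnot_si256(mask, data);
        assert_eq_m256i(r, _mm256_set1_epi32(0x1234_5600));
    }
}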

/// Averages packed unsigned 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpavgw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        let a = simd_cast::<_, u32x16>(a.as_u16x16());
        let b = simd_cast::<_, u32x16>(b.as_u16x16());
        let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1));
        transmute(simd_cast::<_, u16x16>(r))
    }
}

/// Averages packed unsigned 8-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpavgb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        let a = simd_cast::<_, u16x32>(a.as_u8x32());
        let b = simd_cast::<_, u16x32>(b.as_u8x32());
        let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1));
        transmute(simd_cast::<_, u8x32>(r))
    }
}
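
// Editorial usage sketch (not part of the upstream file): as the widened
// `(a + b + 1) >> 1` in the bodies above shows, the averages round up and
// cannot overflow. Uses the crate's test conventions.
#[cfg(test)]
mod avg_usage_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx2")]
    unsafe fn avg_rounds_up() {
        let a = _mm256_set1_epi8(-1); // 255 as an unsigned byte
        let b = _mm256_set1_epi8(0);
        // (255 + 0 + 1) >> 1 == 128, i.e. -128 reinterpreted as i8.
        assert_eq_m256i(_mm256_avg_epu8(a, b), _mm256_set1_epi8(-128));
    }
}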

/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_blend_epi32<const IMM4: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM4, 4);
    unsafe {
        let a = a.as_i32x4();
        let b = b.as_i32x4();
        let r: i32x4 = simd_shuffle!(
            a,
            b,
            [
                [0, 4, 0, 4][IMM4 as usize & 0b11],
                [1, 1, 5, 5][IMM4 as usize & 0b11],
                [2, 6, 2, 6][(IMM4 as usize >> 2) & 0b11],
                [3, 3, 7, 7][(IMM4 as usize >> 2) & 0b11],
            ],
        );
        transmute(r)
    }
}

/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_blend_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i32x8();
        let b = b.as_i32x8();
        let r: i32x8 = simd_shuffle!(
            a,
            b,
            [
                [0, 8, 0, 8][IMM8 as usize & 0b11],
                [1, 1, 9, 9][IMM8 as usize & 0b11],
                [2, 10, 2, 10][(IMM8 as usize >> 2) & 0b11],
                [3, 3, 11, 11][(IMM8 as usize >> 2) & 0b11],
                [4, 12, 4, 12][(IMM8 as usize >> 4) & 0b11],
                [5, 5, 13, 13][(IMM8 as usize >> 4) & 0b11],
                [6, 14, 6, 14][(IMM8 as usize >> 6) & 0b11],
                [7, 7, 15, 15][(IMM8 as usize >> 6) & 0b11],
            ],
        );
        transmute(r)
    }
}

/// Blends packed 16-bit integers from `a` and `b` using control mask `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpblendw, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_blend_epi16<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i16x16();
        let b = b.as_i16x16();

        let r: i16x16 = simd_shuffle!(
            a,
            b,
            [
                [0, 16, 0, 16][IMM8 as usize & 0b11],
                [1, 1, 17, 17][IMM8 as usize & 0b11],
                [2, 18, 2, 18][(IMM8 as usize >> 2) & 0b11],
                [3, 3, 19, 19][(IMM8 as usize >> 2) & 0b11],
                [4, 20, 4, 20][(IMM8 as usize >> 4) & 0b11],
                [5, 5, 21, 21][(IMM8 as usize >> 4) & 0b11],
                [6, 22, 6, 22][(IMM8 as usize >> 6) & 0b11],
                [7, 7, 23, 23][(IMM8 as usize >> 6) & 0b11],
                [8, 24, 8, 24][IMM8 as usize & 0b11],
                [9, 9, 25, 25][IMM8 as usize & 0b11],
                [10, 26, 10, 26][(IMM8 as usize >> 2) & 0b11],
                [11, 11, 27, 27][(IMM8 as usize >> 2) & 0b11],
                [12, 28, 12, 28][(IMM8 as usize >> 4) & 0b11],
                [13, 13, 29, 29][(IMM8 as usize >> 4) & 0b11],
                [14, 30, 14, 30][(IMM8 as usize >> 6) & 0b11],
                [15, 15, 31, 31][(IMM8 as usize >> 6) & 0b11],
            ],
        );
        transmute(r)
    }
}

/// Blends packed 8-bit integers from `a` and `b` using `mask`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
    unsafe {
        let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::ZERO);
        transmute(simd_select(mask, b.as_i8x32(), a.as_i8x32()))
    }
}
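
// Editorial usage sketch (not part of the upstream file): `blend` selects
// lanes by a compile-time bit mask (bit i set picks lane i of `b`), while
// `blendv` selects bytes by the sign bit of a runtime mask. Uses the crate's
// test conventions.
#[cfg(test)]
mod blend_usage_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx2")]
    unsafe fn blend_and_blendv() {
        let a = _mm256_set1_epi32(1);
        let b = _mm256_set1_epi32(2);
        // 0b1010_1010 picks `b` for the odd lanes.
        let r = _mm256_blend_epi32::<0b1010_1010>(a, b);
        assert_eq_m256i(r, _mm256_setr_epi32(1, 2, 1, 2, 1, 2, 1, 2));

        // `blendv` takes `b` wherever the mask byte is negative.
        let mask = _mm256_set1_epi8(-1);
        assert_eq_m256i(_mm256_blendv_epi8(a, b, mask), b);
    }
}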

/// Broadcasts the low packed 8-bit integer from `a` to all elements of
/// the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastb_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
    unsafe {
        let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 16]);
        transmute::<i8x16, _>(ret)
    }
}

/// Broadcasts the low packed 8-bit integer from `a` to all elements of
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastb_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
    unsafe {
        let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 32]);
        transmute::<i8x32, _>(ret)
    }
}

// N.B., `simd_shuffle4` with integer data types for `a` and `b` is
// often compiled to `vbroadcastss`.
/// Broadcasts the low packed 32-bit integer from `a` to all elements of
/// the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastd_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
    unsafe {
        let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 4]);
        transmute::<i32x4, _>(ret)
    }
}

// N.B., `simd_shuffle4` with integer data types for `a` and `b` is
// often compiled to `vbroadcastss`.
/// Broadcasts the low packed 32-bit integer from `a` to all elements of
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastd_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
    unsafe {
        let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 8]);
        transmute::<i32x8, _>(ret)
    }
}

/// Broadcasts the low packed 64-bit integer from `a` to all elements of
/// the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastq_epi64)
#[inline]
#[target_feature(enable = "avx2")]
// Emits `vmovddup` instead of `vpbroadcastq`
// See https://github.com/rust-lang/stdarch/issues/791
#[cfg_attr(test, assert_instr(vmovddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
    unsafe {
        let ret = simd_shuffle!(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]);
        transmute::<i64x2, _>(ret)
    }
}

/// Broadcasts the low packed 64-bit integer from `a` to all elements of
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastq_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
    unsafe {
        let ret = simd_shuffle!(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]);
        transmute::<i64x4, _>(ret)
    }
}

/// Broadcasts the low double-precision (64-bit) floating-point element
/// from `a` to all elements of the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsd_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vmovddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 2]) }
}

/// Broadcasts the low double-precision (64-bit) floating-point element
/// from `a` to all elements of the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsd_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 4]) }
}

/// Broadcasts 128 bits of integer data from `a` to all 128-bit lanes in
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsi128_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i {
    unsafe {
        let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
        transmute::<i64x4, _>(ret)
    }
}

// N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or
// `vbroadcastf128`.
/// Broadcasts 128 bits of integer data from `a` to all 128-bit lanes in
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsi128_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
    unsafe {
        let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
        transmute::<i64x4, _>(ret)
    }
}

/// Broadcasts the low single-precision (32-bit) floating-point element
/// from `a` to all elements of the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastss_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcastss_ps(a: __m128) -> __m128 {
    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0_u32; 4]) }
}

/// Broadcasts the low single-precision (32-bit) floating-point element
/// from `a` to all elements of the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastss_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0_u32; 8]) }
}

/// Broadcasts the low packed 16-bit integer from `a` to all elements of
/// the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastw_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
    unsafe {
        let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 8]);
        transmute::<i16x8, _>(ret)
    }
}

/// Broadcasts the low packed 16-bit integer from `a` to all elements of
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastw_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
    unsafe {
        let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 16]);
        transmute::<i16x16, _>(ret)
    }
}
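
// Editorial usage sketch (not part of the upstream file): every broadcast
// intrinsic above splats element 0 of its source into all lanes of the
// result. Uses the crate's test conventions.
#[cfg(test)]
mod broadcast_usage_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx2")]
    unsafe fn broadcastd_splats_lane_zero() {
        let a = _mm_setr_epi32(7, 8, 9, 10);
        assert_eq_m256i(_mm256_broadcastd_epi32(a), _mm256_set1_epi32(7));
    }
}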

/// Compares packed 64-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpeqq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i64x4, _>(simd_eq(a.as_i64x4(), b.as_i64x4())) }
}

/// Compares packed 32-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpeqd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i32x8, _>(simd_eq(a.as_i32x8(), b.as_i32x8())) }
}

/// Compares packed 16-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpeqw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i16x16, _>(simd_eq(a.as_i16x16(), b.as_i16x16())) }
}

/// Compares packed 8-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpeqb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i8x32, _>(simd_eq(a.as_i8x32(), b.as_i8x32())) }
}

/// Compares packed 64-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpgtq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i64x4, _>(simd_gt(a.as_i64x4(), b.as_i64x4())) }
}

/// Compares packed 32-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpgtd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i32x8, _>(simd_gt(a.as_i32x8(), b.as_i32x8())) }
}

/// Compares packed 16-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpgtw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i16x16, _>(simd_gt(a.as_i16x16(), b.as_i16x16())) }
}

/// Compares packed 8-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpgtb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i8x32, _>(simd_gt(a.as_i8x32(), b.as_i8x32())) }
}
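
// Editorial usage sketch (not part of the upstream file): comparisons return
// all-ones (-1) in lanes where the predicate holds and zero elsewhere, so the
// result is directly usable as a mask for `blendv` or `and`. Uses the crate's
// test conventions.
#[cfg(test)]
mod cmp_usage_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx2")]
    unsafe fn cmpgt_produces_lane_masks() {
        let a = _mm256_setr_epi32(5, 0, 5, 0, 5, 0, 5, 0);
        let b = _mm256_set1_epi32(3);
        let r = _mm256_cmpgt_epi32(a, b);
        assert_eq_m256i(r, _mm256_setr_epi32(-1, 0, -1, 0, -1, 0, -1, 0));
    }
}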

/// Sign-extend 16-bit integers to 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
    unsafe { transmute::<i32x8, _>(simd_cast(a.as_i16x8())) }
}

/// Sign-extend 16-bit integers to 64-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_i16x8();
        let v64: i16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute::<i64x4, _>(simd_cast(v64))
    }
}

/// Sign-extend 32-bit integers to 64-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i {
    unsafe { transmute::<i64x4, _>(simd_cast(a.as_i32x4())) }
}

/// Sign-extend 8-bit integers to 16-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i {
    unsafe { transmute::<i16x16, _>(simd_cast(a.as_i8x16())) }
}

/// Sign-extend 8-bit integers to 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_i8x16();
        let v64: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        transmute::<i32x8, _>(simd_cast(v64))
    }
}

/// Sign-extend 8-bit integers to 64-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_i8x16();
        let v32: i8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute::<i64x4, _>(simd_cast(v32))
    }
}

/// Zero-extend unsigned 16-bit integers in `a` to 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i {
    unsafe { transmute::<i32x8, _>(simd_cast(a.as_u16x8())) }
}

/// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit
/// integers. The upper four elements of `a` are unused.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_u16x8();
        let v64: u16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute::<i64x4, _>(simd_cast(v64))
    }
}

/// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu32_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i {
    unsafe { transmute::<i64x4, _>(simd_cast(a.as_u32x4())) }
}

/// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i {
    unsafe { transmute::<i16x16, _>(simd_cast(a.as_u8x16())) }
}

/// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit
/// integers. The upper eight elements of `a` are unused.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_u8x16();
        let v64: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        transmute::<i32x8, _>(simd_cast(v64))
    }
}

/// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit
/// integers. The upper twelve elements of `a` are unused.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_u8x16();
        let v32: u8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute::<i64x4, _>(simd_cast(v32))
    }
}
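
// Editorial usage sketch (not part of the upstream file): sign- and
// zero-extension differ only in how the upper bits are filled, which matters
// for negative inputs. Uses the crate's test conventions.
#[cfg(test)]
mod cvtep_usage_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx2")]
    unsafe fn sign_vs_zero_extension() {
        let a = _mm_set1_epi16(-1);
        // Sign-extension preserves the value: -1i16 -> -1i32.
        assert_eq_m256i(_mm256_cvtepi16_epi32(a), _mm256_set1_epi32(-1));
        // Zero-extension preserves the bit pattern: 0xFFFF -> 65535.
        assert_eq_m256i(_mm256_cvtepu16_epi32(a), _mm256_set1_epi32(0xFFFF));
    }
}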

/// Extracts 128 bits (of integer data) from `a` selected with `IMM1`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extracti128_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_extracti128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        let a = a.as_i64x4();
        let b = i64x4::ZERO;
        let dst: i64x2 = simd_shuffle!(a, b, [[0, 1], [2, 3]][IMM1 as usize]);
        transmute(dst)
    }
}
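
// Editorial usage sketch (not part of the upstream file): `IMM1` selects the
// low (0) or high (1) 128-bit half. Uses the crate's test conventions,
// including the `assert_eq_m128i` helper.
#[cfg(test)]
mod extract_usage_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx2")]
    unsafe fn extract_high_half() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let hi = _mm256_extracti128_si256::<1>(a);
        // `_mm_set_epi64x` takes its arguments high-to-low.
        assert_eq_m128i(hi, _mm_set_epi64x(4, 3));
    }
}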

/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphaddw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
    let a = a.as_i16x16();
    let b = b.as_i16x16();
    unsafe {
        let even: i16x16 = simd_shuffle!(
            a,
            b,
            [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
        );
        let odd: i16x16 = simd_shuffle!(
            a,
            b,
            [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
        );
        simd_add(even, odd).as_m256i()
    }
}

/// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphaddd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
    let a = a.as_i32x8();
    let b = b.as_i32x8();
    unsafe {
        let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
        let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
        simd_add(even, odd).as_m256i()
    }
}

/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadds_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphaddsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
    let a = a.as_i16x16();
    let b = b.as_i16x16();
    unsafe {
        let even: i16x16 = simd_shuffle!(
            a,
            b,
            [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
        );
        let odd: i16x16 = simd_shuffle!(
            a,
            b,
            [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
        );
        simd_saturating_add(even, odd).as_m256i()
    }
}

/// Horizontally subtracts adjacent pairs of 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphsubw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
    let a = a.as_i16x16();
    let b = b.as_i16x16();
    unsafe {
        let even: i16x16 = simd_shuffle!(
            a,
            b,
            [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
        );
        let odd: i16x16 = simd_shuffle!(
            a,
            b,
            [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
        );
        simd_sub(even, odd).as_m256i()
    }
}

/// Horizontally subtracts adjacent pairs of 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphsubd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
    let a = a.as_i32x8();
    let b = b.as_i32x8();
    unsafe {
        let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
        let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
        simd_sub(even, odd).as_m256i()
    }
}

/// Horizontally subtracts adjacent pairs of 16-bit integers in `a` and `b`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsubs_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphsubsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i {
    let a = a.as_i16x16();
    let b = b.as_i16x16();
    unsafe {
        let even: i16x16 = simd_shuffle!(
            a,
            b,
            [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
        );
        let odd: i16x16 = simd_shuffle!(
            a,
            b,
            [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
        );
        simd_saturating_sub(even, odd).as_m256i()
    }
}
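
// Editorial usage sketch (not part of the upstream file): the horizontal ops
// interleave results from `a` and `b` per 128-bit lane, which is easy to
// misread. For `hadd_epi32` the layout is
// [a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7].
// Uses the crate's test conventions.
#[cfg(test)]
mod hadd_usage_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx2")]
    unsafe fn hadd_lane_layout() {
        let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80);
        let r = _mm256_hadd_epi32(a, b);
        assert_eq_m256i(r, _mm256_setr_epi32(1, 5, 30, 70, 9, 13, 110, 150));
    }
}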

/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i32gather_epi32<const SCALE: i32>(
    slice: *const i32,
    offsets: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    let zero = i32x4::ZERO;
    let neg_one = _mm_set1_epi32(-1).as_i32x4();
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    let r = pgatherdd(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
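
// Editorial usage sketch (not part of the upstream file): the gathers are
// `unsafe` because every computed address `slice + offset * SCALE` must be in
// bounds. `SCALE` is in bytes, so `4` indexes an `i32` slice element-wise.
// Uses the crate's test conventions.
#[cfg(test)]
mod gather_usage_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx2")]
    unsafe fn gather_from_slice() {
        let data: [i32; 8] = [10, 11, 12, 13, 14, 15, 16, 17];
        let idx = _mm_setr_epi32(0, 2, 4, 6);
        // SCALE = 4: offsets are element indices into the i32 data.
        let r = _mm_i32gather_epi32::<4>(data.as_ptr(), idx);
        assert_eq_m128i(r, _mm_setr_epi32(10, 12, 14, 16));
    }
}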
1103
1104/// Returns values from `slice` at offsets determined by `offsets * scale`,
1105/// where
1106/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
1107/// that position instead.
1108///
1109/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_epi32)
1110#[inline]
1111#[target_feature(enable = "avx2")]
1112#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
1113#[rustc_legacy_const_generics(4)]
1114#[stable(feature = "simd_x86", since = "1.27.0")]
1115pub unsafe fn _mm_mask_i32gather_epi32<const SCALE: i32>(
1116    src: __m128i,
1117    slice: *const i32,
1118    offsets: __m128i,
1119    mask: __m128i,
1120) -> __m128i {
1121    static_assert_imm8_scale!(SCALE);
1122    let src = src.as_i32x4();
1123    let mask = mask.as_i32x4();
1124    let offsets = offsets.as_i32x4();
1125    let slice = slice as *const i8;
1126    let r = pgatherdd(src, slice, offsets, mask, SCALE as i8);
1127    transmute(r)
1128}
1129
1130/// Returns values from `slice` at offsets determined by `offsets * scale`,
1131/// where
1132/// `scale` should be 1, 2, 4 or 8.
1133///
1134/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_epi32)
1135#[inline]
1136#[target_feature(enable = "avx2")]
1137#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
1138#[rustc_legacy_const_generics(2)]
1139#[stable(feature = "simd_x86", since = "1.27.0")]
1140pub unsafe fn _mm256_i32gather_epi32<const SCALE: i32>(
1141    slice: *const i32,
1142    offsets: __m256i,
1143) -> __m256i {
1144    static_assert_imm8_scale!(SCALE);
1145    let zero = i32x8::ZERO;
1146    let neg_one = _mm256_set1_epi32(-1).as_i32x8();
1147    let offsets = offsets.as_i32x8();
1148    let slice = slice as *const i8;
1149    let r = vpgatherdd(zero, slice, offsets, neg_one, SCALE as i8);
1150    transmute(r)
1151}
1152
1153/// Returns values from `slice` at offsets determined by `offsets * scale`,
1154/// where
1155/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
1156/// that position instead.
1157///
1158/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_epi32)
1159#[inline]
1160#[target_feature(enable = "avx2")]
1161#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
1162#[rustc_legacy_const_generics(4)]
1163#[stable(feature = "simd_x86", since = "1.27.0")]
1164pub unsafe fn _mm256_mask_i32gather_epi32<const SCALE: i32>(
1165    src: __m256i,
1166    slice: *const i32,
1167    offsets: __m256i,
1168    mask: __m256i,
1169) -> __m256i {
1170    static_assert_imm8_scale!(SCALE);
1171    let src = src.as_i32x8();
1172    let mask = mask.as_i32x8();
1173    let offsets = offsets.as_i32x8();
1174    let slice = slice as *const i8;
1175    let r = vpgatherdd(src, slice, offsets, mask, SCALE as i8);
1176    transmute(r)
1177}
1178
1179/// Returns values from `slice` at offsets determined by `offsets * scale`,
1180/// where
1181/// `scale` should be 1, 2, 4 or 8.
1182///
1183/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_ps)
1184#[inline]
1185#[target_feature(enable = "avx2")]
1186#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
1187#[rustc_legacy_const_generics(2)]
1188#[stable(feature = "simd_x86", since = "1.27.0")]
1189pub unsafe fn _mm_i32gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m128i) -> __m128 {
1190    static_assert_imm8_scale!(SCALE);
1191    let zero = _mm_setzero_ps();
1192    let neg_one = _mm_set1_ps(-1.0);
1193    let offsets = offsets.as_i32x4();
1194    let slice = slice as *const i8;
1195    pgatherdps(zero, slice, offsets, neg_one, SCALE as i8)
1196}
1197
1198/// Returns values from `slice` at offsets determined by `offsets * scale`,
1199/// where
1200/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
1201/// that position instead.
1202///
1203/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_ps)
1204#[inline]
1205#[target_feature(enable = "avx2")]
1206#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
1207#[rustc_legacy_const_generics(4)]
1208#[stable(feature = "simd_x86", since = "1.27.0")]
1209pub unsafe fn _mm_mask_i32gather_ps<const SCALE: i32>(
1210    src: __m128,
1211    slice: *const f32,
1212    offsets: __m128i,
1213    mask: __m128,
1214) -> __m128 {
1215    static_assert_imm8_scale!(SCALE);
1216    let offsets = offsets.as_i32x4();
1217    let slice = slice as *const i8;
1218    pgatherdps(src, slice, offsets, mask, SCALE as i8)
1219}
1220
1221/// Returns values from `slice` at offsets determined by `offsets * scale`,
1222/// where
1223/// `scale` should be 1, 2, 4 or 8.
1224///
1225/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_ps)
1226#[inline]
1227#[target_feature(enable = "avx2")]
1228#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
1229#[rustc_legacy_const_generics(2)]
1230#[stable(feature = "simd_x86", since = "1.27.0")]
1231pub unsafe fn _mm256_i32gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m256i) -> __m256 {
1232    static_assert_imm8_scale!(SCALE);
1233    let zero = _mm256_setzero_ps();
1234    let neg_one = _mm256_set1_ps(-1.0);
1235    let offsets = offsets.as_i32x8();
1236    let slice = slice as *const i8;
1237    vpgatherdps(zero, slice, offsets, neg_one, SCALE as i8)
1238}
1239
1240/// Returns values from `slice` at offsets determined by `offsets * scale`,
1241/// where
1242/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
1243/// that position instead.
1244///
1245/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_ps)
1246#[inline]
1247#[target_feature(enable = "avx2")]
1248#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
1249#[rustc_legacy_const_generics(4)]
1250#[stable(feature = "simd_x86", since = "1.27.0")]
1251pub unsafe fn _mm256_mask_i32gather_ps<const SCALE: i32>(
1252    src: __m256,
1253    slice: *const f32,
1254    offsets: __m256i,
1255    mask: __m256,
1256) -> __m256 {
1257    static_assert_imm8_scale!(SCALE);
1258    let offsets = offsets.as_i32x8();
1259    let slice = slice as *const i8;
1260    vpgatherdps(src, slice, offsets, mask, SCALE as i8)
1261}
1262
1263/// Returns values from `slice` at offsets determined by `offsets * scale`,
1264/// where
1265/// `scale` should be 1, 2, 4 or 8.
1266///
1267/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_epi64)
1268#[inline]
1269#[target_feature(enable = "avx2")]
1270#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
1271#[rustc_legacy_const_generics(2)]
1272#[stable(feature = "simd_x86", since = "1.27.0")]
1273pub unsafe fn _mm_i32gather_epi64<const SCALE: i32>(
1274    slice: *const i64,
1275    offsets: __m128i,
1276) -> __m128i {
1277    static_assert_imm8_scale!(SCALE);
1278    let zero = i64x2::ZERO;
1279    let neg_one = _mm_set1_epi64x(-1).as_i64x2();
1280    let offsets = offsets.as_i32x4();
1281    let slice = slice as *const i8;
1282    let r = pgatherdq(zero, slice, offsets, neg_one, SCALE as i8);
1283    transmute(r)
1284}
1285
1286/// Returns values from `slice` at offsets determined by `offsets * scale`,
1287/// where
1288/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
1289/// that position instead.
1290///
1291/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_epi64)
1292#[inline]
1293#[target_feature(enable = "avx2")]
1294#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
1295#[rustc_legacy_const_generics(4)]
1296#[stable(feature = "simd_x86", since = "1.27.0")]
1297pub unsafe fn _mm_mask_i32gather_epi64<const SCALE: i32>(
1298    src: __m128i,
1299    slice: *const i64,
1300    offsets: __m128i,
1301    mask: __m128i,
1302) -> __m128i {
1303    static_assert_imm8_scale!(SCALE);
1304    let src = src.as_i64x2();
1305    let mask = mask.as_i64x2();
1306    let offsets = offsets.as_i32x4();
1307    let slice = slice as *const i8;
1308    let r = pgatherdq(src, slice, offsets, mask, SCALE as i8);
1309    transmute(r)
1310}
1311
1312/// Returns values from `slice` at offsets determined by `offsets * scale`,
1313/// where
1314/// `scale` should be 1, 2, 4 or 8.
1315///
1316/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_epi64)
1317#[inline]
1318#[target_feature(enable = "avx2")]
1319#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
1320#[rustc_legacy_const_generics(2)]
1321#[stable(feature = "simd_x86", since = "1.27.0")]
1322pub unsafe fn _mm256_i32gather_epi64<const SCALE: i32>(
1323    slice: *const i64,
1324    offsets: __m128i,
1325) -> __m256i {
1326    static_assert_imm8_scale!(SCALE);
1327    let zero = i64x4::ZERO;
1328    let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
1329    let offsets = offsets.as_i32x4();
1330    let slice = slice as *const i8;
1331    let r = vpgatherdq(zero, slice, offsets, neg_one, SCALE as i8);
1332    transmute(r)
1333}
1334
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
/// corresponding element of `mask`, the value from `src` is loaded into that
/// position instead.
1339///
1340/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_epi64)
1341#[inline]
1342#[target_feature(enable = "avx2")]
1343#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
1344#[rustc_legacy_const_generics(4)]
1345#[stable(feature = "simd_x86", since = "1.27.0")]
1346pub unsafe fn _mm256_mask_i32gather_epi64<const SCALE: i32>(
1347    src: __m256i,
1348    slice: *const i64,
1349    offsets: __m128i,
1350    mask: __m256i,
1351) -> __m256i {
1352    static_assert_imm8_scale!(SCALE);
1353    let src = src.as_i64x4();
1354    let mask = mask.as_i64x4();
1355    let offsets = offsets.as_i32x4();
1356    let slice = slice as *const i8;
1357    let r = vpgatherdq(src, slice, offsets, mask, SCALE as i8);
1358    transmute(r)
1359}
1360
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1364///
1365/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_pd)
1366#[inline]
1367#[target_feature(enable = "avx2")]
1368#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
1369#[rustc_legacy_const_generics(2)]
1370#[stable(feature = "simd_x86", since = "1.27.0")]
1371pub unsafe fn _mm_i32gather_pd<const SCALE: i32>(slice: *const f64, offsets: __m128i) -> __m128d {
1372    static_assert_imm8_scale!(SCALE);
1373    let zero = _mm_setzero_pd();
1374    let neg_one = _mm_set1_pd(-1.0);
1375    let offsets = offsets.as_i32x4();
1376    let slice = slice as *const i8;
1377    pgatherdpd(zero, slice, offsets, neg_one, SCALE as i8)
1378}
1379
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
/// corresponding element of `mask`, the value from `src` is loaded into that
/// position instead.
1384///
1385/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_pd)
1386#[inline]
1387#[target_feature(enable = "avx2")]
1388#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
1389#[rustc_legacy_const_generics(4)]
1390#[stable(feature = "simd_x86", since = "1.27.0")]
1391pub unsafe fn _mm_mask_i32gather_pd<const SCALE: i32>(
1392    src: __m128d,
1393    slice: *const f64,
1394    offsets: __m128i,
1395    mask: __m128d,
1396) -> __m128d {
1397    static_assert_imm8_scale!(SCALE);
1398    let offsets = offsets.as_i32x4();
1399    let slice = slice as *const i8;
1400    pgatherdpd(src, slice, offsets, mask, SCALE as i8)
1401}
1402
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1406///
1407/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_pd)
1408#[inline]
1409#[target_feature(enable = "avx2")]
1410#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
1411#[rustc_legacy_const_generics(2)]
1412#[stable(feature = "simd_x86", since = "1.27.0")]
1413pub unsafe fn _mm256_i32gather_pd<const SCALE: i32>(
1414    slice: *const f64,
1415    offsets: __m128i,
1416) -> __m256d {
1417    static_assert_imm8_scale!(SCALE);
1418    let zero = _mm256_setzero_pd();
1419    let neg_one = _mm256_set1_pd(-1.0);
1420    let offsets = offsets.as_i32x4();
1421    let slice = slice as *const i8;
1422    vpgatherdpd(zero, slice, offsets, neg_one, SCALE as i8)
1423}
1424
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
/// corresponding element of `mask`, the value from `src` is loaded into that
/// position instead.
1429///
1430/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_pd)
1431#[inline]
1432#[target_feature(enable = "avx2")]
1433#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
1434#[rustc_legacy_const_generics(4)]
1435#[stable(feature = "simd_x86", since = "1.27.0")]
1436pub unsafe fn _mm256_mask_i32gather_pd<const SCALE: i32>(
1437    src: __m256d,
1438    slice: *const f64,
1439    offsets: __m128i,
1440    mask: __m256d,
1441) -> __m256d {
1442    static_assert_imm8_scale!(SCALE);
1443    let offsets = offsets.as_i32x4();
1444    let slice = slice as *const i8;
1445    vpgatherdpd(src, slice, offsets, mask, SCALE as i8)
1446}
1447
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1451///
1452/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_epi32)
1453#[inline]
1454#[target_feature(enable = "avx2")]
1455#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1456#[rustc_legacy_const_generics(2)]
1457#[stable(feature = "simd_x86", since = "1.27.0")]
1458pub unsafe fn _mm_i64gather_epi32<const SCALE: i32>(
1459    slice: *const i32,
1460    offsets: __m128i,
1461) -> __m128i {
1462    static_assert_imm8_scale!(SCALE);
1463    let zero = i32x4::ZERO;
1464    let neg_one = _mm_set1_epi64x(-1).as_i32x4();
1465    let offsets = offsets.as_i64x2();
1466    let slice = slice as *const i8;
1467    let r = pgatherqd(zero, slice, offsets, neg_one, SCALE as i8);
1468    transmute(r)
1469}
1470
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
/// corresponding element of `mask`, the value from `src` is loaded into that
/// position instead.
1475///
1476/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_epi32)
1477#[inline]
1478#[target_feature(enable = "avx2")]
1479#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1480#[rustc_legacy_const_generics(4)]
1481#[stable(feature = "simd_x86", since = "1.27.0")]
1482pub unsafe fn _mm_mask_i64gather_epi32<const SCALE: i32>(
1483    src: __m128i,
1484    slice: *const i32,
1485    offsets: __m128i,
1486    mask: __m128i,
1487) -> __m128i {
1488    static_assert_imm8_scale!(SCALE);
1489    let src = src.as_i32x4();
1490    let mask = mask.as_i32x4();
1491    let offsets = offsets.as_i64x2();
1492    let slice = slice as *const i8;
1493    let r = pgatherqd(src, slice, offsets, mask, SCALE as i8);
1494    transmute(r)
1495}
1496
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1500///
1501/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_epi32)
1502#[inline]
1503#[target_feature(enable = "avx2")]
1504#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1505#[rustc_legacy_const_generics(2)]
1506#[stable(feature = "simd_x86", since = "1.27.0")]
1507pub unsafe fn _mm256_i64gather_epi32<const SCALE: i32>(
1508    slice: *const i32,
1509    offsets: __m256i,
1510) -> __m128i {
1511    static_assert_imm8_scale!(SCALE);
1512    let zero = i32x4::ZERO;
1513    let neg_one = _mm_set1_epi64x(-1).as_i32x4();
1514    let offsets = offsets.as_i64x4();
1515    let slice = slice as *const i8;
1516    let r = vpgatherqd(zero, slice, offsets, neg_one, SCALE as i8);
1517    transmute(r)
1518}
1519
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
/// corresponding element of `mask`, the value from `src` is loaded into that
/// position instead.
1524///
1525/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_epi32)
1526#[inline]
1527#[target_feature(enable = "avx2")]
1528#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1529#[rustc_legacy_const_generics(4)]
1530#[stable(feature = "simd_x86", since = "1.27.0")]
1531pub unsafe fn _mm256_mask_i64gather_epi32<const SCALE: i32>(
1532    src: __m128i,
1533    slice: *const i32,
1534    offsets: __m256i,
1535    mask: __m128i,
1536) -> __m128i {
1537    static_assert_imm8_scale!(SCALE);
1538    let src = src.as_i32x4();
1539    let mask = mask.as_i32x4();
1540    let offsets = offsets.as_i64x4();
1541    let slice = slice as *const i8;
1542    let r = vpgatherqd(src, slice, offsets, mask, SCALE as i8);
1543    transmute(r)
1544}
1545
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1549///
1550/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_ps)
1551#[inline]
1552#[target_feature(enable = "avx2")]
1553#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1554#[rustc_legacy_const_generics(2)]
1555#[stable(feature = "simd_x86", since = "1.27.0")]
1556pub unsafe fn _mm_i64gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m128i) -> __m128 {
1557    static_assert_imm8_scale!(SCALE);
1558    let zero = _mm_setzero_ps();
1559    let neg_one = _mm_set1_ps(-1.0);
1560    let offsets = offsets.as_i64x2();
1561    let slice = slice as *const i8;
1562    pgatherqps(zero, slice, offsets, neg_one, SCALE as i8)
1563}
1564
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
/// corresponding element of `mask`, the value from `src` is loaded into that
/// position instead.
1569///
1570/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_ps)
1571#[inline]
1572#[target_feature(enable = "avx2")]
1573#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1574#[rustc_legacy_const_generics(4)]
1575#[stable(feature = "simd_x86", since = "1.27.0")]
1576pub unsafe fn _mm_mask_i64gather_ps<const SCALE: i32>(
1577    src: __m128,
1578    slice: *const f32,
1579    offsets: __m128i,
1580    mask: __m128,
1581) -> __m128 {
1582    static_assert_imm8_scale!(SCALE);
1583    let offsets = offsets.as_i64x2();
1584    let slice = slice as *const i8;
1585    pgatherqps(src, slice, offsets, mask, SCALE as i8)
1586}
1587
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1591///
1592/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_ps)
1593#[inline]
1594#[target_feature(enable = "avx2")]
1595#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1596#[rustc_legacy_const_generics(2)]
1597#[stable(feature = "simd_x86", since = "1.27.0")]
1598pub unsafe fn _mm256_i64gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m256i) -> __m128 {
1599    static_assert_imm8_scale!(SCALE);
1600    let zero = _mm_setzero_ps();
1601    let neg_one = _mm_set1_ps(-1.0);
1602    let offsets = offsets.as_i64x4();
1603    let slice = slice as *const i8;
1604    vpgatherqps(zero, slice, offsets, neg_one, SCALE as i8)
1605}
1606
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
/// corresponding element of `mask`, the value from `src` is loaded into that
/// position instead.
1611///
1612/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_ps)
1613#[inline]
1614#[target_feature(enable = "avx2")]
1615#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1616#[rustc_legacy_const_generics(4)]
1617#[stable(feature = "simd_x86", since = "1.27.0")]
1618pub unsafe fn _mm256_mask_i64gather_ps<const SCALE: i32>(
1619    src: __m128,
1620    slice: *const f32,
1621    offsets: __m256i,
1622    mask: __m128,
1623) -> __m128 {
1624    static_assert_imm8_scale!(SCALE);
1625    let offsets = offsets.as_i64x4();
1626    let slice = slice as *const i8;
1627    vpgatherqps(src, slice, offsets, mask, SCALE as i8)
1628}
1629
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1633///
1634/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_epi64)
1635#[inline]
1636#[target_feature(enable = "avx2")]
1637#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1638#[rustc_legacy_const_generics(2)]
1639#[stable(feature = "simd_x86", since = "1.27.0")]
1640pub unsafe fn _mm_i64gather_epi64<const SCALE: i32>(
1641    slice: *const i64,
1642    offsets: __m128i,
1643) -> __m128i {
1644    static_assert_imm8_scale!(SCALE);
1645    let zero = i64x2::ZERO;
1646    let neg_one = _mm_set1_epi64x(-1).as_i64x2();
1647    let slice = slice as *const i8;
1648    let offsets = offsets.as_i64x2();
1649    let r = pgatherqq(zero, slice, offsets, neg_one, SCALE as i8);
1650    transmute(r)
1651}
1652
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
/// corresponding element of `mask`, the value from `src` is loaded into that
/// position instead.
1657///
1658/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_epi64)
1659#[inline]
1660#[target_feature(enable = "avx2")]
1661#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1662#[rustc_legacy_const_generics(4)]
1663#[stable(feature = "simd_x86", since = "1.27.0")]
1664pub unsafe fn _mm_mask_i64gather_epi64<const SCALE: i32>(
1665    src: __m128i,
1666    slice: *const i64,
1667    offsets: __m128i,
1668    mask: __m128i,
1669) -> __m128i {
1670    static_assert_imm8_scale!(SCALE);
1671    let src = src.as_i64x2();
1672    let mask = mask.as_i64x2();
1673    let offsets = offsets.as_i64x2();
1674    let slice = slice as *const i8;
1675    let r = pgatherqq(src, slice, offsets, mask, SCALE as i8);
1676    transmute(r)
1677}
1678
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1682///
1683/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_epi64)
1684#[inline]
1685#[target_feature(enable = "avx2")]
1686#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1687#[rustc_legacy_const_generics(2)]
1688#[stable(feature = "simd_x86", since = "1.27.0")]
1689pub unsafe fn _mm256_i64gather_epi64<const SCALE: i32>(
1690    slice: *const i64,
1691    offsets: __m256i,
1692) -> __m256i {
1693    static_assert_imm8_scale!(SCALE);
1694    let zero = i64x4::ZERO;
1695    let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
1696    let slice = slice as *const i8;
1697    let offsets = offsets.as_i64x4();
1698    let r = vpgatherqq(zero, slice, offsets, neg_one, SCALE as i8);
1699    transmute(r)
1700}
1701
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
/// corresponding element of `mask`, the value from `src` is loaded into that
/// position instead.
1706///
1707/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_epi64)
1708#[inline]
1709#[target_feature(enable = "avx2")]
1710#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1711#[rustc_legacy_const_generics(4)]
1712#[stable(feature = "simd_x86", since = "1.27.0")]
1713pub unsafe fn _mm256_mask_i64gather_epi64<const SCALE: i32>(
1714    src: __m256i,
1715    slice: *const i64,
1716    offsets: __m256i,
1717    mask: __m256i,
1718) -> __m256i {
1719    static_assert_imm8_scale!(SCALE);
1720    let src = src.as_i64x4();
1721    let mask = mask.as_i64x4();
1722    let offsets = offsets.as_i64x4();
1723    let slice = slice as *const i8;
1724    let r = vpgatherqq(src, slice, offsets, mask, SCALE as i8);
1725    transmute(r)
1726}
1727
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1731///
1732/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_pd)
1733#[inline]
1734#[target_feature(enable = "avx2")]
1735#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1736#[rustc_legacy_const_generics(2)]
1737#[stable(feature = "simd_x86", since = "1.27.0")]
1738pub unsafe fn _mm_i64gather_pd<const SCALE: i32>(slice: *const f64, offsets: __m128i) -> __m128d {
1739    static_assert_imm8_scale!(SCALE);
1740    let zero = _mm_setzero_pd();
1741    let neg_one = _mm_set1_pd(-1.0);
1742    let slice = slice as *const i8;
1743    let offsets = offsets.as_i64x2();
1744    pgatherqpd(zero, slice, offsets, neg_one, SCALE as i8)
1745}
1746
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
/// corresponding element of `mask`, the value from `src` is loaded into that
/// position instead.
1751///
1752/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_pd)
1753#[inline]
1754#[target_feature(enable = "avx2")]
1755#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1756#[rustc_legacy_const_generics(4)]
1757#[stable(feature = "simd_x86", since = "1.27.0")]
1758pub unsafe fn _mm_mask_i64gather_pd<const SCALE: i32>(
1759    src: __m128d,
1760    slice: *const f64,
1761    offsets: __m128i,
1762    mask: __m128d,
1763) -> __m128d {
1764    static_assert_imm8_scale!(SCALE);
1765    let slice = slice as *const i8;
1766    let offsets = offsets.as_i64x2();
1767    pgatherqpd(src, slice, offsets, mask, SCALE as i8)
1768}
1769
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1773///
1774/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_pd)
1775#[inline]
1776#[target_feature(enable = "avx2")]
1777#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1778#[rustc_legacy_const_generics(2)]
1779#[stable(feature = "simd_x86", since = "1.27.0")]
1780pub unsafe fn _mm256_i64gather_pd<const SCALE: i32>(
1781    slice: *const f64,
1782    offsets: __m256i,
1783) -> __m256d {
1784    static_assert_imm8_scale!(SCALE);
1785    let zero = _mm256_setzero_pd();
1786    let neg_one = _mm256_set1_pd(-1.0);
1787    let slice = slice as *const i8;
1788    let offsets = offsets.as_i64x4();
1789    vpgatherqpd(zero, slice, offsets, neg_one, SCALE as i8)
1790}
1791
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
/// corresponding element of `mask`, the value from `src` is loaded into that
/// position instead.
1796///
1797/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_pd)
1798#[inline]
1799#[target_feature(enable = "avx2")]
1800#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1801#[rustc_legacy_const_generics(4)]
1802#[stable(feature = "simd_x86", since = "1.27.0")]
1803pub unsafe fn _mm256_mask_i64gather_pd<const SCALE: i32>(
1804    src: __m256d,
1805    slice: *const f64,
1806    offsets: __m256i,
1807    mask: __m256d,
1808) -> __m256d {
1809    static_assert_imm8_scale!(SCALE);
1810    let slice = slice as *const i8;
1811    let offsets = offsets.as_i64x4();
1812    vpgatherqpd(src, slice, offsets, mask, SCALE as i8)
1813}
1814
/// Copies `a` to `dst`, then inserts 128 bits (of integer data) from `b` at
/// the location specified by `IMM1`.
1817///
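/// A usage sketch (illustrative values):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
/// let b = _mm_set_epi64x(9, 8); // b = [8, 9]
/// // IMM1 = 1 replaces the upper 128-bit half of `a` with `b`.
/// let r = _mm256_inserti128_si256::<1>(a, b);
/// let mut out = [0_i64; 4];
/// unsafe { _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r) };
/// assert_eq!(out, [0, 1, 8, 9]);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///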
1818/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_inserti128_si256)
1819#[inline]
1820#[target_feature(enable = "avx2")]
1821#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
1822#[rustc_legacy_const_generics(2)]
1823#[stable(feature = "simd_x86", since = "1.27.0")]
1824#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1825pub const fn _mm256_inserti128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
1826    static_assert_uimm_bits!(IMM1, 1);
1827    unsafe {
1828        let a = a.as_i64x4();
1829        let b = _mm256_castsi128_si256(b).as_i64x4();
1830        let dst: i64x4 = simd_shuffle!(a, b, [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]);
1831        transmute(dst)
1832    }
1833}
1834
/// Multiplies packed signed 16-bit integers in `a` and `b`, producing
/// intermediate signed 32-bit integers. Horizontally adds adjacent pairs
/// of intermediate 32-bit integers.
1838///
1839/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16)
1840#[inline]
1841#[target_feature(enable = "avx2")]
1842#[cfg_attr(test, assert_instr(vpmaddwd))]
1843#[stable(feature = "simd_x86", since = "1.27.0")]
1844pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
    // Multiplying by 1 is a trick used in the Adler-32 algorithm to perform a widening addition:
1846    //
1847    // ```rust
1848    // #[target_feature(enable = "avx2")]
1849    // unsafe fn widening_add(mad: __m256i) -> __m256i {
1850    //     _mm256_madd_epi16(mad, _mm256_set1_epi16(1))
1851    // }
1852    // ```
1853    //
1854    // If we implement this using generic vector intrinsics, the optimizer
1855    // will eliminate this pattern, and `vpmaddwd` will no longer be emitted.
1856    // For this reason, we use x86 intrinsics.
1857    unsafe { transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) }
1858}
1859
/// Vertically multiplies each unsigned 8-bit integer from `a` with the
/// corresponding signed 8-bit integer from `b`, producing intermediate
/// signed 16-bit integers. Horizontally adds adjacent pairs of intermediate
/// signed 16-bit integers.
1864///
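/// A usage sketch (illustrative values):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// // Unsigned bytes from `a` (100) times signed bytes from `b` (-2),
/// // summed in pairs: 100 * -2 + 100 * -2 = -400 in every 16-bit lane.
/// let r = _mm256_maddubs_epi16(_mm256_set1_epi8(100), _mm256_set1_epi8(-2));
/// assert_eq!(
///     _mm256_movemask_epi8(_mm256_cmpeq_epi16(r, _mm256_set1_epi16(-400))),
///     !0
/// );
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///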
1865/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16)
1866#[inline]
1867#[target_feature(enable = "avx2")]
1868#[cfg_attr(test, assert_instr(vpmaddubsw))]
1869#[stable(feature = "simd_x86", since = "1.27.0")]
1870pub fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i {
1871    unsafe { transmute(pmaddubsw(a.as_u8x32(), b.as_i8x32())) }
1872}
1873
/// Loads packed 32-bit integers from memory pointed to by `mem_addr` using `mask`
1875/// (elements are zeroed out when the highest bit is not set in the
1876/// corresponding element).
1877///
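/// A usage sketch (illustrative values):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let data = [1_i32, 2, 3, 4];
/// // Load only the first three elements; the last lane becomes 0.
/// let mask = _mm_setr_epi32(-1, -1, -1, 0);
/// let r = unsafe { _mm_maskload_epi32(data.as_ptr(), mask) };
/// let mut out = [0_i32; 4];
/// unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r) };
/// assert_eq!(out, [1, 2, 3, 0]);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///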
1878/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_epi32)
1879#[inline]
1880#[target_feature(enable = "avx2")]
1881#[cfg_attr(test, assert_instr(vpmaskmovd))]
1882#[stable(feature = "simd_x86", since = "1.27.0")]
1883#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1884pub const unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i {
1885    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
1886    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i32x4::ZERO).as_m128i()
1887}
1888
/// Loads packed 32-bit integers from memory pointed to by `mem_addr` using `mask`
1890/// (elements are zeroed out when the highest bit is not set in the
1891/// corresponding element).
1892///
1893/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_epi32)
1894#[inline]
1895#[target_feature(enable = "avx2")]
1896#[cfg_attr(test, assert_instr(vpmaskmovd))]
1897#[stable(feature = "simd_x86", since = "1.27.0")]
1898#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1899pub const unsafe fn _mm256_maskload_epi32(mem_addr: *const i32, mask: __m256i) -> __m256i {
1900    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
1901    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i32x8::ZERO).as_m256i()
1902}
1903
/// Loads packed 64-bit integers from memory pointed to by `mem_addr` using `mask`
1905/// (elements are zeroed out when the highest bit is not set in the
1906/// corresponding element).
1907///
1908/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_epi64)
1909#[inline]
1910#[target_feature(enable = "avx2")]
1911#[cfg_attr(test, assert_instr(vpmaskmovq))]
1912#[stable(feature = "simd_x86", since = "1.27.0")]
1913#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1914pub const unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i {
1915    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
1916    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i64x2::ZERO).as_m128i()
1917}
1918
/// Loads packed 64-bit integers from memory pointed to by `mem_addr` using `mask`
1920/// (elements are zeroed out when the highest bit is not set in the
1921/// corresponding element).
1922///
1923/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_epi64)
1924#[inline]
1925#[target_feature(enable = "avx2")]
1926#[cfg_attr(test, assert_instr(vpmaskmovq))]
1927#[stable(feature = "simd_x86", since = "1.27.0")]
1928#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1929pub const unsafe fn _mm256_maskload_epi64(mem_addr: *const i64, mask: __m256i) -> __m256i {
1930    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
1931    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i64x4::ZERO).as_m256i()
1932}
1933
/// Stores packed 32-bit integers from `a` into memory pointed to by `mem_addr`
1935/// using `mask` (elements are not stored when the highest bit is not set
1936/// in the corresponding element).
1937///
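/// A usage sketch (illustrative values):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let mut out = [9_i32; 4];
/// let a = _mm_setr_epi32(1, 2, 3, 4);
/// // Store only the first two lanes; the rest of `out` is left untouched.
/// let mask = _mm_setr_epi32(-1, -1, 0, 0);
/// unsafe { _mm_maskstore_epi32(out.as_mut_ptr(), mask, a) };
/// assert_eq!(out, [1, 2, 9, 9]);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///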
1938/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_epi32)
1939#[inline]
1940#[target_feature(enable = "avx2")]
1941#[cfg_attr(test, assert_instr(vpmaskmovd))]
1942#[stable(feature = "simd_x86", since = "1.27.0")]
1943#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1944pub const unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) {
1945    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
1946    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i32x4())
1947}
1948
/// Stores packed 32-bit integers from `a` into memory pointed to by `mem_addr`
1950/// using `mask` (elements are not stored when the highest bit is not set
1951/// in the corresponding element).
1952///
1953/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_epi32)
1954#[inline]
1955#[target_feature(enable = "avx2")]
1956#[cfg_attr(test, assert_instr(vpmaskmovd))]
1957#[stable(feature = "simd_x86", since = "1.27.0")]
1958#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1959pub const unsafe fn _mm256_maskstore_epi32(mem_addr: *mut i32, mask: __m256i, a: __m256i) {
1960    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
1961    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i32x8())
1962}
1963
/// Stores packed 64-bit integers from `a` into memory pointed to by `mem_addr`
1965/// using `mask` (elements are not stored when the highest bit is not set
1966/// in the corresponding element).
1967///
1968/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_epi64)
1969#[inline]
1970#[target_feature(enable = "avx2")]
1971#[cfg_attr(test, assert_instr(vpmaskmovq))]
1972#[stable(feature = "simd_x86", since = "1.27.0")]
1973#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1974pub const unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) {
1975    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
1976    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i64x2())
1977}
1978
/// Stores packed 64-bit integers from `a` into memory pointed to by `mem_addr`
1980/// using `mask` (elements are not stored when the highest bit is not set
1981/// in the corresponding element).
1982///
1983/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_epi64)
1984#[inline]
1985#[target_feature(enable = "avx2")]
1986#[cfg_attr(test, assert_instr(vpmaskmovq))]
1987#[stable(feature = "simd_x86", since = "1.27.0")]
1988#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1989pub const unsafe fn _mm256_maskstore_epi64(mem_addr: *mut i64, mask: __m256i, a: __m256i) {
1990    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
1991    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i64x4())
1992}
1993
1994/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
1995/// maximum values.
1996///
1997/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi16)
1998#[inline]
1999#[target_feature(enable = "avx2")]
2000#[cfg_attr(test, assert_instr(vpmaxsw))]
2001#[stable(feature = "simd_x86", since = "1.27.0")]
2002#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2003pub const fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i {
2004    unsafe { simd_imax(a.as_i16x16(), b.as_i16x16()).as_m256i() }
2005}
2006
2007/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
2008/// maximum values.
2009///
2010/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi32)
2011#[inline]
2012#[target_feature(enable = "avx2")]
2013#[cfg_attr(test, assert_instr(vpmaxsd))]
2014#[stable(feature = "simd_x86", since = "1.27.0")]
2015#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2016pub const fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i {
2017    unsafe { simd_imax(a.as_i32x8(), b.as_i32x8()).as_m256i() }
2018}
2019
2020/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
2021/// maximum values.
2022///
2023/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi8)
2024#[inline]
2025#[target_feature(enable = "avx2")]
2026#[cfg_attr(test, assert_instr(vpmaxsb))]
2027#[stable(feature = "simd_x86", since = "1.27.0")]
2028#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2029pub const fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i {
2030    unsafe { simd_imax(a.as_i8x32(), b.as_i8x32()).as_m256i() }
2031}
2032
2033/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
2034/// the packed maximum values.
2035///
2036/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu16)
2037#[inline]
2038#[target_feature(enable = "avx2")]
2039#[cfg_attr(test, assert_instr(vpmaxuw))]
2040#[stable(feature = "simd_x86", since = "1.27.0")]
2041#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2042pub const fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i {
2043    unsafe { simd_imax(a.as_u16x16(), b.as_u16x16()).as_m256i() }
2044}
2045
2046/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
2047/// the packed maximum values.
2048///
2049/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu32)
2050#[inline]
2051#[target_feature(enable = "avx2")]
2052#[cfg_attr(test, assert_instr(vpmaxud))]
2053#[stable(feature = "simd_x86", since = "1.27.0")]
2054#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2055pub const fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i {
2056    unsafe { simd_imax(a.as_u32x8(), b.as_u32x8()).as_m256i() }
2057}
2058
2059/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
2060/// the packed maximum values.
2061///
2062/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu8)
2063#[inline]
2064#[target_feature(enable = "avx2")]
2065#[cfg_attr(test, assert_instr(vpmaxub))]
2066#[stable(feature = "simd_x86", since = "1.27.0")]
2067#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2068pub const fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i {
2069    unsafe { simd_imax(a.as_u8x32(), b.as_u8x32()).as_m256i() }
2070}
2071
2072/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
2073/// minimum values.
2074///
2075/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi16)
2076#[inline]
2077#[target_feature(enable = "avx2")]
2078#[cfg_attr(test, assert_instr(vpminsw))]
2079#[stable(feature = "simd_x86", since = "1.27.0")]
2080#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2081pub const fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i {
2082    unsafe { simd_imin(a.as_i16x16(), b.as_i16x16()).as_m256i() }
2083}
2084
2085/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
2086/// minimum values.
2087///
2088/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi32)
2089#[inline]
2090#[target_feature(enable = "avx2")]
2091#[cfg_attr(test, assert_instr(vpminsd))]
2092#[stable(feature = "simd_x86", since = "1.27.0")]
2093#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2094pub const fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i {
2095    unsafe { simd_imin(a.as_i32x8(), b.as_i32x8()).as_m256i() }
2096}
2097
2098/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
2099/// minimum values.
2100///
2101/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi8)
2102#[inline]
2103#[target_feature(enable = "avx2")]
2104#[cfg_attr(test, assert_instr(vpminsb))]
2105#[stable(feature = "simd_x86", since = "1.27.0")]
2106#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2107pub const fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i {
2108    unsafe { simd_imin(a.as_i8x32(), b.as_i8x32()).as_m256i() }
2109}
2110
2111/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
2112/// the packed minimum values.
2113///
2114/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu16)
2115#[inline]
2116#[target_feature(enable = "avx2")]
2117#[cfg_attr(test, assert_instr(vpminuw))]
2118#[stable(feature = "simd_x86", since = "1.27.0")]
2119#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2120pub const fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i {
2121    unsafe { simd_imin(a.as_u16x16(), b.as_u16x16()).as_m256i() }
2122}
2123
2124/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
2125/// the packed minimum values.
2126///
2127/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu32)
2128#[inline]
2129#[target_feature(enable = "avx2")]
2130#[cfg_attr(test, assert_instr(vpminud))]
2131#[stable(feature = "simd_x86", since = "1.27.0")]
2132#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2133pub const fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i {
2134    unsafe { simd_imin(a.as_u32x8(), b.as_u32x8()).as_m256i() }
2135}
2136
2137/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
2138/// the packed minimum values.
2139///
2140/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu8)
2141#[inline]
2142#[target_feature(enable = "avx2")]
2143#[cfg_attr(test, assert_instr(vpminub))]
2144#[stable(feature = "simd_x86", since = "1.27.0")]
2145#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2146pub const fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
2147    unsafe { simd_imin(a.as_u8x32(), b.as_u8x32()).as_m256i() }
2148}
2149
/// Creates a mask from the most significant bit of each 8-bit element in `a`,
/// returning the result.
2152///
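/// A usage sketch (illustrative values):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi8(
///     0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
///     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1,
/// );
/// // Bytes 1, 3 and 31 have their sign bit set.
/// assert_eq!(_mm256_movemask_epi8(a) as u32, 0x8000_000A);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///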
2153/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_epi8)
2154#[inline]
2155#[target_feature(enable = "avx2")]
2156#[cfg_attr(test, assert_instr(vpmovmskb))]
2157#[stable(feature = "simd_x86", since = "1.27.0")]
2158#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2159pub const fn _mm256_movemask_epi8(a: __m256i) -> i32 {
2160    unsafe {
2161        let z = i8x32::ZERO;
2162        let m: i8x32 = simd_lt(a.as_i8x32(), z);
2163        simd_bitmask::<_, u32>(m) as i32
2164    }
2165}
2166
/// Computes the sum of absolute differences (SADs) of quadruplets of unsigned
/// 8-bit integers in `a` compared to those in `b`, and stores the 16-bit
/// results in `dst`. Eight SADs are performed for each 128-bit lane using one
/// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
/// selected from `b` starting at the offset specified in `imm8`. Eight
/// quadruplets are formed from sequential 8-bit integers selected from `a`
/// starting at the offset specified in `imm8`.
2174///
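/// Picturing `a` and `b` as `[u8; 32]` and the result as `[u16; 16]`,
/// `_mm256_mpsadbw_epu8` is logically equivalent to the following (a sketch
/// reconstructed from Intel's pseudocode; the low lane is controlled by
/// `imm8` bits `2:0` and the high lane by bits `5:3`):
///
/// ```
/// fn mm256_mpsadbw_epu8(a: [u8; 32], b: [u8; 32], imm8: u8) -> [u16; 16] {
///     let mut r = [0u16; 16];
///     for lane in 0..2usize {
///         let ctl = imm8 >> (3 * lane) & 0b111;
///         // Bit 2 selects the starting quadruplet of `a`,
///         // bits 1:0 select the quadruplet of `b`.
///         let a_off = lane * 16 + ((ctl >> 2) & 1) as usize * 4;
///         let b_off = lane * 16 + (ctl & 0b11) as usize * 4;
///         for j in 0..8 {
///             let mut sum = 0u16;
///             for k in 0..4 {
///                 sum += (a[a_off + j + k] as i16 - b[b_off + k] as i16).unsigned_abs();
///             }
///             r[lane * 8 + j] = sum;
///         }
///     }
///     r
/// }
/// ```
///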
2175/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mpsadbw_epu8)
2176#[inline]
2177#[target_feature(enable = "avx2")]
2178#[cfg_attr(test, assert_instr(vmpsadbw, IMM8 = 0))]
2179#[rustc_legacy_const_generics(2)]
2180#[stable(feature = "simd_x86", since = "1.27.0")]
2181pub fn _mm256_mpsadbw_epu8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
2182    static_assert_uimm_bits!(IMM8, 8);
2183    unsafe { transmute(mpsadbw(a.as_u8x32(), b.as_u8x32(), IMM8 as i8)) }
2184}
2185
2186/// Multiplies the low 32-bit integers from each packed 64-bit element in
/// `a` and `b`.
2188///
2189/// Returns the 64-bit results.
2190///
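/// A usage sketch (illustrative values; the upper halves 77 and 88 are ignored):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi32(-1, 77, 2, 77, 3, 77, 4, 77);
/// let b = _mm256_setr_epi32(10, 88, 10, 88, 10, 88, 10, 88);
/// // Each 64-bit result is the product of the low 32 bits of its lane.
/// let r = _mm256_mul_epi32(a, b);
/// let mut out = [0_i64; 4];
/// unsafe { _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r) };
/// assert_eq!(out, [-10, 20, 30, 40]);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///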
2191/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epi32)
2192#[inline]
2193#[target_feature(enable = "avx2")]
2194#[cfg_attr(test, assert_instr(vpmuldq))]
2195#[stable(feature = "simd_x86", since = "1.27.0")]
2196#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2197pub const fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
2198    unsafe {
2199        let a = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(a.as_i64x4()));
2200        let b = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(b.as_i64x4()));
2201        transmute(simd_mul(a, b))
2202    }
2203}
2204
2205/// Multiplies the low unsigned 32-bit integers from each packed 64-bit
/// element in `a` and `b`.
2207///
2208/// Returns the unsigned 64-bit results.
2209///
2210/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epu32)
2211#[inline]
2212#[target_feature(enable = "avx2")]
2213#[cfg_attr(test, assert_instr(vpmuludq))]
2214#[stable(feature = "simd_x86", since = "1.27.0")]
2215#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2216pub const fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i {
2217    unsafe {
2218        let a = a.as_u64x4();
2219        let b = b.as_u64x4();
2220        let mask = u64x4::splat(u32::MAX as u64);
2221        transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
2222    }
2223}
2224
2225/// Multiplies the packed 16-bit integers in `a` and `b`, producing
2226/// intermediate 32-bit integers and returning the high 16 bits of the
2227/// intermediate integers.
2228///
2229/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epi16)
2230#[inline]
2231#[target_feature(enable = "avx2")]
2232#[cfg_attr(test, assert_instr(vpmulhw))]
2233#[stable(feature = "simd_x86", since = "1.27.0")]
2234#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2235pub const fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i {
2236    unsafe {
2237        let a = simd_cast::<_, i32x16>(a.as_i16x16());
2238        let b = simd_cast::<_, i32x16>(b.as_i16x16());
2239        let r = simd_shr(simd_mul(a, b), i32x16::splat(16));
2240        transmute(simd_cast::<i32x16, i16x16>(r))
2241    }
2242}
2243
2244/// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing
2245/// intermediate 32-bit integers and returning the high 16 bits of the
2246/// intermediate integers.
2247///
2248/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epu16)
2249#[inline]
2250#[target_feature(enable = "avx2")]
2251#[cfg_attr(test, assert_instr(vpmulhuw))]
2252#[stable(feature = "simd_x86", since = "1.27.0")]
2253#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2254pub const fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i {
2255    unsafe {
2256        let a = simd_cast::<_, u32x16>(a.as_u16x16());
2257        let b = simd_cast::<_, u32x16>(b.as_u16x16());
2258        let r = simd_shr(simd_mul(a, b), u32x16::splat(16));
2259        transmute(simd_cast::<u32x16, u16x16>(r))
2260    }
2261}
2262
2263/// Multiplies the packed 16-bit integers in `a` and `b`, producing
2264/// intermediate 32-bit integers, and returns the low 16 bits of the
/// intermediate integers.
2266///
2267/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi16)
2268#[inline]
2269#[target_feature(enable = "avx2")]
2270#[cfg_attr(test, assert_instr(vpmullw))]
2271#[stable(feature = "simd_x86", since = "1.27.0")]
2272#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2273pub const fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i {
2274    unsafe { transmute(simd_mul(a.as_i16x16(), b.as_i16x16())) }
2275}
2276
2277/// Multiplies the packed 32-bit integers in `a` and `b`, producing
2278/// intermediate 64-bit integers, and returns the low 32 bits of the
/// intermediate integers.
2280///
2281/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi32)
2282#[inline]
2283#[target_feature(enable = "avx2")]
2284#[cfg_attr(test, assert_instr(vpmulld))]
2285#[stable(feature = "simd_x86", since = "1.27.0")]
2286#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2287pub const fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i {
2288    unsafe { transmute(simd_mul(a.as_i32x8(), b.as_i32x8())) }
2289}
2290
/// Multiplies packed 16-bit integers in `a` and `b`, producing
/// intermediate signed 32-bit integers. Truncates each intermediate
/// integer to the 18 most significant bits, rounds by adding 1, and
/// returns bits `[16:1]`.
2295///
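/// Equivalently, each lane computes `(a * b + (1 << 14)) >> 15`, a rounded
/// Q15 fixed-point multiply. A worked sketch with illustrative values:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// // In Q15 fixed point: 16384 = 0.5, 8192 = 0.25, and 0.5 * 0.25 = 0.125 = 4096.
/// let r = _mm256_mulhrs_epi16(_mm256_set1_epi16(16384), _mm256_set1_epi16(8192));
/// assert_eq!(
///     _mm256_movemask_epi8(_mm256_cmpeq_epi16(r, _mm256_set1_epi16(4096))),
///     !0
/// );
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///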
2296/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhrs_epi16)
2297#[inline]
2298#[target_feature(enable = "avx2")]
2299#[cfg_attr(test, assert_instr(vpmulhrsw))]
2300#[stable(feature = "simd_x86", since = "1.27.0")]
2301pub fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i {
2302    unsafe { transmute(pmulhrsw(a.as_i16x16(), b.as_i16x16())) }
2303}
2304
2305/// Computes the bitwise OR of 256 bits (representing integer data) in `a`
2306/// and `b`
2307///
2308/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_si256)
2309#[inline]
2310#[target_feature(enable = "avx2")]
2311#[cfg_attr(test, assert_instr(vorps))]
2312#[stable(feature = "simd_x86", since = "1.27.0")]
2313#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2314pub const fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
2315    unsafe { transmute(simd_or(a.as_i32x8(), b.as_i32x8())) }
2316}
2317
2318/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
2319/// using signed saturation
2320///
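/// A usage sketch (illustrative values). Note that the sources are interleaved
/// per 128-bit lane: 8 saturated bytes from `a`, then 8 from `b`, repeated for
/// the high lane:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let r = _mm256_packs_epi16(_mm256_set1_epi16(300), _mm256_set1_epi16(-300));
/// // 300 saturates to 127 and -300 saturates to -128.
/// let expected = _mm256_setr_epi8(
///     127, 127, 127, 127, 127, 127, 127, 127,
///     -128, -128, -128, -128, -128, -128, -128, -128,
///     127, 127, 127, 127, 127, 127, 127, 127,
///     -128, -128, -128, -128, -128, -128, -128, -128,
/// );
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///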
2321/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16)
2322#[inline]
2323#[target_feature(enable = "avx2")]
2324#[cfg_attr(test, assert_instr(vpacksswb))]
2325#[stable(feature = "simd_x86", since = "1.27.0")]
2326pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i {
2327    unsafe { transmute(packsswb(a.as_i16x16(), b.as_i16x16())) }
2328}
2329
2330/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
2331/// using signed saturation
2332///
2333/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32)
2334#[inline]
2335#[target_feature(enable = "avx2")]
2336#[cfg_attr(test, assert_instr(vpackssdw))]
2337#[stable(feature = "simd_x86", since = "1.27.0")]
2338pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i {
2339    unsafe { transmute(packssdw(a.as_i32x8(), b.as_i32x8())) }
2340}
2341
2342/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
2343/// using unsigned saturation
2344///
2345/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16)
2346#[inline]
2347#[target_feature(enable = "avx2")]
2348#[cfg_attr(test, assert_instr(vpackuswb))]
2349#[stable(feature = "simd_x86", since = "1.27.0")]
2350pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i {
2351    unsafe { transmute(packuswb(a.as_i16x16(), b.as_i16x16())) }
2352}
2353
2354/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
2355/// using unsigned saturation
2356///
2357/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32)
2358#[inline]
2359#[target_feature(enable = "avx2")]
2360#[cfg_attr(test, assert_instr(vpackusdw))]
2361#[stable(feature = "simd_x86", since = "1.27.0")]
2362pub fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i {
2363    unsafe { transmute(packusdw(a.as_i32x8(), b.as_i32x8())) }
2364}
2365
2366/// Permutes packed 32-bit integers from `a` according to the content of `b`.
2367///
/// The low 3 bits of each integer of `b` are used as addresses into the 8
/// integers of `a`.
2370///
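/// A usage sketch (illustrative values):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi32(10, 11, 12, 13, 14, 15, 16, 17);
/// // Reverse the eight elements; indices may cross the 128-bit lane boundary.
/// let idx = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
/// let r = _mm256_permutevar8x32_epi32(a, idx);
/// let mut out = [0_i32; 8];
/// unsafe { _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r) };
/// assert_eq!(out, [17, 16, 15, 14, 13, 12, 11, 10]);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///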
2371/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_epi32)
2372#[inline]
2373#[target_feature(enable = "avx2")]
2374#[cfg_attr(test, assert_instr(vpermps))]
2375#[stable(feature = "simd_x86", since = "1.27.0")]
2376pub fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i {
2377    unsafe { transmute(permd(a.as_u32x8(), b.as_u32x8())) }
2378}
2379
2380/// Permutes 64-bit integers from `a` using control mask `imm8`.
2381///
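/// A usage sketch (illustrative values):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi64x(100, 101, 102, 103);
/// // Two bits of `IMM8` select the source element for each destination lane.
/// let r = _mm256_permute4x64_epi64::<0b11_00_10_01>(a);
/// let mut out = [0_i64; 4];
/// unsafe { _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r) };
/// assert_eq!(out, [101, 102, 100, 103]);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///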
2382/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_epi64)
2383#[inline]
2384#[target_feature(enable = "avx2")]
2385#[cfg_attr(test, assert_instr(vpermpd, IMM8 = 9))]
2386#[rustc_legacy_const_generics(1)]
2387#[stable(feature = "simd_x86", since = "1.27.0")]
2388#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2389pub const fn _mm256_permute4x64_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
2390    static_assert_uimm_bits!(IMM8, 8);
2391    unsafe {
2392        let zero = i64x4::ZERO;
2393        let r: i64x4 = simd_shuffle!(
2394            a.as_i64x4(),
2395            zero,
2396            [
2397                IMM8 as u32 & 0b11,
2398                (IMM8 as u32 >> 2) & 0b11,
2399                (IMM8 as u32 >> 4) & 0b11,
2400                (IMM8 as u32 >> 6) & 0b11,
2401            ],
2402        );
2403        transmute(r)
2404    }
2405}
2406
2407/// Shuffles 128-bits of integer data selected by `imm8` from `a` and `b`.
2408///
2409/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256)
2410#[inline]
2411#[target_feature(enable = "avx2")]
2412#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 9))]
2413#[rustc_legacy_const_generics(2)]
2414#[stable(feature = "simd_x86", since = "1.27.0")]
2415#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2416pub const fn _mm256_permute2x128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
2417    static_assert_uimm_bits!(IMM8, 8);
2418    _mm256_permute2f128_si256::<IMM8>(a, b)
2419}
2420
2421/// Shuffles 64-bit floating-point elements in `a` across lanes using the
2422/// control in `imm8`.
2423///
2424/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_pd)
2425#[inline]
2426#[target_feature(enable = "avx2")]
2427#[cfg_attr(test, assert_instr(vpermpd, IMM8 = 1))]
2428#[rustc_legacy_const_generics(1)]
2429#[stable(feature = "simd_x86", since = "1.27.0")]
2430#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2431pub const fn _mm256_permute4x64_pd<const IMM8: i32>(a: __m256d) -> __m256d {
2432    static_assert_uimm_bits!(IMM8, 8);
2433    unsafe {
2434        simd_shuffle!(
2435            a,
2436            _mm256_undefined_pd(),
2437            [
2438                IMM8 as u32 & 0b11,
2439                (IMM8 as u32 >> 2) & 0b11,
2440                (IMM8 as u32 >> 4) & 0b11,
2441                (IMM8 as u32 >> 6) & 0b11,
2442            ],
2443        )
2444    }
2445}
2446
2447/// Shuffles eight 32-bit floating-point elements in `a` across lanes using
2448/// the corresponding 32-bit integer index in `idx`.
2449///
2450/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_ps)
2451#[inline]
2452#[target_feature(enable = "avx2")]
2453#[cfg_attr(test, assert_instr(vpermps))]
2454#[stable(feature = "simd_x86", since = "1.27.0")]
2455pub fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 {
2456    unsafe { permps(a, idx.as_i32x8()) }
2457}
2458
/// Computes the absolute differences of packed unsigned 8-bit integers in `a`
/// and `b`, then horizontally sums each group of 8 consecutive differences to
/// produce four unsigned 16-bit integers, and packs these unsigned 16-bit
/// integers in the low 16 bits of each 64-bit element of the return value.
2463///
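/// A usage sketch (illustrative values):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi8(9);
/// let b = _mm256_set1_epi8(3);
/// // Each 64-bit element holds the sum of eight |9 - 3| differences: 48.
/// let r = _mm256_sad_epu8(a, b);
/// let mut out = [0_i64; 4];
/// unsafe { _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r) };
/// assert_eq!(out, [48, 48, 48, 48]);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///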
2464/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8)
2465#[inline]
2466#[target_feature(enable = "avx2")]
2467#[cfg_attr(test, assert_instr(vpsadbw))]
2468#[stable(feature = "simd_x86", since = "1.27.0")]
2469pub fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i {
2470    unsafe { transmute(psadbw(a.as_u8x32(), b.as_u8x32())) }
2471}
2472
2473/// Shuffles bytes from `a` according to the content of `b`.
2474///
/// For each of the 128-bit low and high halves of the vectors, the low 4 bits
/// of each byte of `b` are used as addresses into the respective low or high
/// 16 bytes of `a`. That is, the halves are shuffled separately.
///
/// In addition, if the most significant bit of a byte of `b` is set, the
/// respective destination byte is set to 0.
2481///
2482/// Picturing `a` and `b` as `[u8; 32]`, `_mm256_shuffle_epi8` is logically
2483/// equivalent to:
2484///
2485/// ```
2486/// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] {
2487///     let mut r = [0; 32];
2488///     for i in 0..16 {
2489///         // if the most significant bit of b is set,
2490///         // then the destination byte is set to 0.
2491///         if b[i] & 0x80 == 0u8 {
2492///             r[i] = a[(b[i] % 16) as usize];
2493///         }
2494///         if b[i + 16] & 0x80 == 0u8 {
2495///             r[i + 16] = a[(b[i + 16] % 16 + 16) as usize];
2496///         }
2497///     }
2498///     r
2499/// }
2500/// ```
2501///
2502/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi8)
2503#[inline]
2504#[target_feature(enable = "avx2")]
2505#[cfg_attr(test, assert_instr(vpshufb))]
2506#[stable(feature = "simd_x86", since = "1.27.0")]
2507pub fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i {
2508    unsafe { transmute(pshufb(a.as_u8x32(), b.as_u8x32())) }
2509}
2510
2511/// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in
2512/// `imm8`.
2513///
2514/// ```rust
2515/// #[cfg(target_arch = "x86")]
2516/// use std::arch::x86::*;
2517/// #[cfg(target_arch = "x86_64")]
2518/// use std::arch::x86_64::*;
2519///
2520/// # fn main() {
2521/// #     if is_x86_feature_detected!("avx2") {
2522/// #         #[target_feature(enable = "avx2")]
2523/// #         unsafe fn worker() {
2524/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
2525///
2526/// let c1 = _mm256_shuffle_epi32(a, 0b00_11_10_01);
2527/// let c2 = _mm256_shuffle_epi32(a, 0b01_00_10_11);
2528///
2529/// let expected1 = _mm256_setr_epi32(1, 2, 3, 0, 5, 6, 7, 4);
2530/// let expected2 = _mm256_setr_epi32(3, 2, 0, 1, 7, 6, 4, 5);
2531///
2532/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c1, expected1)), !0);
2533/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c2, expected2)), !0);
2534/// #         }
2535/// #         unsafe { worker(); }
2536/// #     }
2537/// # }
2538/// ```
2539///
2540/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi32)
2541#[inline]
2542#[target_feature(enable = "avx2")]
2543#[cfg_attr(test, assert_instr(vshufps, MASK = 9))]
2544#[rustc_legacy_const_generics(1)]
2545#[stable(feature = "simd_x86", since = "1.27.0")]
2546#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2547pub const fn _mm256_shuffle_epi32<const MASK: i32>(a: __m256i) -> __m256i {
2548    static_assert_uimm_bits!(MASK, 8);
2549    unsafe {
2550        let r: i32x8 = simd_shuffle!(
2551            a.as_i32x8(),
2552            a.as_i32x8(),
2553            [
2554                MASK as u32 & 0b11,
2555                (MASK as u32 >> 2) & 0b11,
2556                (MASK as u32 >> 4) & 0b11,
2557                (MASK as u32 >> 6) & 0b11,
2558                (MASK as u32 & 0b11) + 4,
2559                ((MASK as u32 >> 2) & 0b11) + 4,
2560                ((MASK as u32 >> 4) & 0b11) + 4,
2561                ((MASK as u32 >> 6) & 0b11) + 4,
2562            ],
2563        );
2564        transmute(r)
2565    }
2566}
2567
2568/// Shuffles 16-bit integers in the high 64 bits of 128-bit lanes of `a` using
2569/// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied
2570/// to the output.
2571///
2572/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflehi_epi16)
2573#[inline]
2574#[target_feature(enable = "avx2")]
2575#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 9))]
2576#[rustc_legacy_const_generics(1)]
2577#[stable(feature = "simd_x86", since = "1.27.0")]
2578#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2579pub const fn _mm256_shufflehi_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2580    static_assert_uimm_bits!(IMM8, 8);
2581    unsafe {
2582        let a = a.as_i16x16();
2583        let r: i16x16 = simd_shuffle!(
2584            a,
2585            a,
2586            [
2587                0,
2588                1,
2589                2,
2590                3,
2591                4 + (IMM8 as u32 & 0b11),
2592                4 + ((IMM8 as u32 >> 2) & 0b11),
2593                4 + ((IMM8 as u32 >> 4) & 0b11),
2594                4 + ((IMM8 as u32 >> 6) & 0b11),
2595                8,
2596                9,
2597                10,
2598                11,
2599                12 + (IMM8 as u32 & 0b11),
2600                12 + ((IMM8 as u32 >> 2) & 0b11),
2601                12 + ((IMM8 as u32 >> 4) & 0b11),
2602                12 + ((IMM8 as u32 >> 6) & 0b11),
2603            ],
2604        );
2605        transmute(r)
2606    }
2607}
2608
2609/// Shuffles 16-bit integers in the low 64 bits of 128-bit lanes of `a` using
2610/// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied
2611/// to the output.
2612///
2613/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflelo_epi16)
2614#[inline]
2615#[target_feature(enable = "avx2")]
2616#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 9))]
2617#[rustc_legacy_const_generics(1)]
2618#[stable(feature = "simd_x86", since = "1.27.0")]
2619#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2620pub const fn _mm256_shufflelo_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2621    static_assert_uimm_bits!(IMM8, 8);
2622    unsafe {
2623        let a = a.as_i16x16();
2624        let r: i16x16 = simd_shuffle!(
2625            a,
2626            a,
2627            [
2628                0 + (IMM8 as u32 & 0b11),
2629                0 + ((IMM8 as u32 >> 2) & 0b11),
2630                0 + ((IMM8 as u32 >> 4) & 0b11),
2631                0 + ((IMM8 as u32 >> 6) & 0b11),
2632                4,
2633                5,
2634                6,
2635                7,
2636                8 + (IMM8 as u32 & 0b11),
2637                8 + ((IMM8 as u32 >> 2) & 0b11),
2638                8 + ((IMM8 as u32 >> 4) & 0b11),
2639                8 + ((IMM8 as u32 >> 6) & 0b11),
2640                12,
2641                13,
2642                14,
2643                15,
2644            ],
2645        );
2646        transmute(r)
2647    }
2648}
2649
2650/// Negates packed 16-bit integers in `a` when the corresponding signed
2651/// 16-bit integer in `b` is negative, and returns the results.
2652/// Results are zeroed out when the corresponding element in `b` is zero.
2653///
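/// A minimal usage sketch (values are illustrative), covering all three
/// cases: negative, positive, and zero elements in `b`.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi16(
///     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
/// );
/// let b = _mm256_setr_epi16(
///     -1, 1, 0, -1, 1, 0, -1, 1, 0, -1, 1, 0, -1, 1, 0, -1,
/// );
///
/// let c = _mm256_sign_epi16(a, b);
///
/// let expected = _mm256_setr_epi16(
///     -1, 2, 0, -4, 5, 0, -7, 8, 0, -10, 11, 0, -13, 14, 0, -16,
/// );
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///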
2654/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi16)
2655#[inline]
2656#[target_feature(enable = "avx2")]
2657#[cfg_attr(test, assert_instr(vpsignw))]
2658#[stable(feature = "simd_x86", since = "1.27.0")]
2659pub fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i {
2660    unsafe { transmute(psignw(a.as_i16x16(), b.as_i16x16())) }
2661}
2662
2663/// Negates packed 32-bit integers in `a` when the corresponding signed
2664/// 32-bit integer in `b` is negative, and returns the results.
2665/// Results are zeroed out when the corresponding element in `b` is zero.
2666///
2667/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi32)
2668#[inline]
2669#[target_feature(enable = "avx2")]
2670#[cfg_attr(test, assert_instr(vpsignd))]
2671#[stable(feature = "simd_x86", since = "1.27.0")]
2672pub fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i {
2673    unsafe { transmute(psignd(a.as_i32x8(), b.as_i32x8())) }
2674}
2675
2676/// Negates packed 8-bit integers in `a` when the corresponding signed
2677/// 8-bit integer in `b` is negative, and returns the results.
2678/// Results are zeroed out when the corresponding element in `b` is zero.
2679///
2680/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi8)
2681#[inline]
2682#[target_feature(enable = "avx2")]
2683#[cfg_attr(test, assert_instr(vpsignb))]
2684#[stable(feature = "simd_x86", since = "1.27.0")]
2685pub fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i {
2686    unsafe { transmute(psignb(a.as_i8x32(), b.as_i8x32())) }
2687}
2688
2689/// Shifts packed 16-bit integers in `a` left by `count` while
/// shifting in zeros, and returns the result.
2691///
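/// A minimal usage sketch (values are illustrative); the shift amount is
/// taken from the low 64 bits of `count`.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi16(0b0101);
/// let count = _mm_set_epi64x(0, 2);
///
/// let c = _mm256_sll_epi16(a, count);
///
/// let expected = _mm256_set1_epi16(0b010100);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///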
2692/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi16)
2693#[inline]
2694#[target_feature(enable = "avx2")]
2695#[cfg_attr(test, assert_instr(vpsllw))]
2696#[stable(feature = "simd_x86", since = "1.27.0")]
2697pub fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i {
2698    unsafe { transmute(psllw(a.as_i16x16(), count.as_i16x8())) }
2699}
2700
2701/// Shifts packed 32-bit integers in `a` left by `count` while
/// shifting in zeros, and returns the result.
2703///
2704/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi32)
2705#[inline]
2706#[target_feature(enable = "avx2")]
2707#[cfg_attr(test, assert_instr(vpslld))]
2708#[stable(feature = "simd_x86", since = "1.27.0")]
2709pub fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i {
2710    unsafe { transmute(pslld(a.as_i32x8(), count.as_i32x4())) }
2711}
2712
2713/// Shifts packed 64-bit integers in `a` left by `count` while
/// shifting in zeros, and returns the result.
2715///
2716/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi64)
2717#[inline]
2718#[target_feature(enable = "avx2")]
2719#[cfg_attr(test, assert_instr(vpsllq))]
2720#[stable(feature = "simd_x86", since = "1.27.0")]
2721pub fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i {
2722    unsafe { transmute(psllq(a.as_i64x4(), count.as_i64x2())) }
2723}
2724
2725/// Shifts packed 16-bit integers in `a` left by `IMM8` while
/// shifting in zeros, and returns the results.
2727///
2728/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi16)
2729#[inline]
2730#[target_feature(enable = "avx2")]
2731#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 7))]
2732#[rustc_legacy_const_generics(1)]
2733#[stable(feature = "simd_x86", since = "1.27.0")]
2734#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2735pub const fn _mm256_slli_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2736    static_assert_uimm_bits!(IMM8, 8);
2737    unsafe {
2738        if IMM8 >= 16 {
2739            _mm256_setzero_si256()
2740        } else {
2741            transmute(simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16)))
2742        }
2743    }
2744}
2745
2746/// Shifts packed 32-bit integers in `a` left by `IMM8` while
/// shifting in zeros, and returns the results.
2748///
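/// A minimal usage sketch (values are illustrative):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
///
/// let c = _mm256_slli_epi32(a, 4);
///
/// let expected = _mm256_setr_epi32(16, 32, 48, 64, 80, 96, 112, 128);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///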
2749/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi32)
2750#[inline]
2751#[target_feature(enable = "avx2")]
2752#[cfg_attr(test, assert_instr(vpslld, IMM8 = 7))]
2753#[rustc_legacy_const_generics(1)]
2754#[stable(feature = "simd_x86", since = "1.27.0")]
2755#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2756pub const fn _mm256_slli_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
2759        if IMM8 >= 32 {
2760            _mm256_setzero_si256()
2761        } else {
2762            transmute(simd_shl(a.as_u32x8(), u32x8::splat(IMM8 as u32)))
2763        }
2764    }
2765}
2766
2767/// Shifts packed 64-bit integers in `a` left by `IMM8` while
/// shifting in zeros, and returns the results.
2769///
2770/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi64)
2771#[inline]
2772#[target_feature(enable = "avx2")]
2773#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 7))]
2774#[rustc_legacy_const_generics(1)]
2775#[stable(feature = "simd_x86", since = "1.27.0")]
2776#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2777pub const fn _mm256_slli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
2780        if IMM8 >= 64 {
2781            _mm256_setzero_si256()
2782        } else {
2783            transmute(simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64)))
2784        }
2785    }
2786}
2787
2788/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
2789///
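/// A minimal usage sketch (values are illustrative); note that the two
/// 128-bit lanes are shifted independently, so zero bytes appear at the
/// bottom of each lane.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi8(1);
///
/// let c = _mm256_slli_si256(a, 3);
///
/// let expected = _mm256_setr_epi8(
///     0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
///     0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/// );
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///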
2790/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_si256)
2791#[inline]
2792#[target_feature(enable = "avx2")]
2793#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
2794#[rustc_legacy_const_generics(1)]
2795#[stable(feature = "simd_x86", since = "1.27.0")]
2796#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2797pub const fn _mm256_slli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
2798    static_assert_uimm_bits!(IMM8, 8);
2799    _mm256_bslli_epi128::<IMM8>(a)
2800}
2801
2802/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
2803///
2804/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bslli_epi128)
2805#[inline]
2806#[target_feature(enable = "avx2")]
2807#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
2808#[rustc_legacy_const_generics(1)]
2809#[stable(feature = "simd_x86", since = "1.27.0")]
2810#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2811pub const fn _mm256_bslli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
2812    static_assert_uimm_bits!(IMM8, 8);
2813    const fn mask(shift: i32, i: u32) -> u32 {
2814        let shift = shift as u32 & 0xff;
2815        if shift > 15 || i % 16 < shift {
2816            0
2817        } else {
2818            32 + (i - shift)
2819        }
2820    }
2821    unsafe {
2822        let a = a.as_i8x32();
2823        let r: i8x32 = simd_shuffle!(
2824            i8x32::ZERO,
2825            a,
2826            [
2827                mask(IMM8, 0),
2828                mask(IMM8, 1),
2829                mask(IMM8, 2),
2830                mask(IMM8, 3),
2831                mask(IMM8, 4),
2832                mask(IMM8, 5),
2833                mask(IMM8, 6),
2834                mask(IMM8, 7),
2835                mask(IMM8, 8),
2836                mask(IMM8, 9),
2837                mask(IMM8, 10),
2838                mask(IMM8, 11),
2839                mask(IMM8, 12),
2840                mask(IMM8, 13),
2841                mask(IMM8, 14),
2842                mask(IMM8, 15),
2843                mask(IMM8, 16),
2844                mask(IMM8, 17),
2845                mask(IMM8, 18),
2846                mask(IMM8, 19),
2847                mask(IMM8, 20),
2848                mask(IMM8, 21),
2849                mask(IMM8, 22),
2850                mask(IMM8, 23),
2851                mask(IMM8, 24),
2852                mask(IMM8, 25),
2853                mask(IMM8, 26),
2854                mask(IMM8, 27),
2855                mask(IMM8, 28),
2856                mask(IMM8, 29),
2857                mask(IMM8, 30),
2858                mask(IMM8, 31),
2859            ],
2860        );
2861        transmute(r)
2862    }
2863}
2864
2865/// Shifts packed 32-bit integers in `a` left by the amount
2866/// specified by the corresponding element in `count` while
2867/// shifting in zeros, and returns the result.
2868///
2869/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi32)
2870#[inline]
2871#[target_feature(enable = "avx2")]
2872#[cfg_attr(test, assert_instr(vpsllvd))]
2873#[stable(feature = "simd_x86", since = "1.27.0")]
2874#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2875pub const fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i {
2876    unsafe {
2877        let count = count.as_u32x4();
2878        let no_overflow: u32x4 = simd_lt(count, u32x4::splat(u32::BITS));
2879        let count = simd_select(no_overflow, count, u32x4::ZERO);
2880        simd_select(no_overflow, simd_shl(a.as_u32x4(), count), u32x4::ZERO).as_m128i()
2881    }
2882}
2883
2884/// Shifts packed 32-bit integers in `a` left by the amount
2885/// specified by the corresponding element in `count` while
2886/// shifting in zeros, and returns the result.
2887///
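/// A minimal usage sketch (values are illustrative); counts of 32 or more
/// zero the corresponding element.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi32(1);
/// let count = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 32);
///
/// let c = _mm256_sllv_epi32(a, count);
///
/// let expected = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 0);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///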
2888/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi32)
2889#[inline]
2890#[target_feature(enable = "avx2")]
2891#[cfg_attr(test, assert_instr(vpsllvd))]
2892#[stable(feature = "simd_x86", since = "1.27.0")]
2893#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2894pub const fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i {
2895    unsafe {
2896        let count = count.as_u32x8();
2897        let no_overflow: u32x8 = simd_lt(count, u32x8::splat(u32::BITS));
2898        let count = simd_select(no_overflow, count, u32x8::ZERO);
2899        simd_select(no_overflow, simd_shl(a.as_u32x8(), count), u32x8::ZERO).as_m256i()
2900    }
2901}
2902
2903/// Shifts packed 64-bit integers in `a` left by the amount
2904/// specified by the corresponding element in `count` while
2905/// shifting in zeros, and returns the result.
2906///
2907/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi64)
2908#[inline]
2909#[target_feature(enable = "avx2")]
2910#[cfg_attr(test, assert_instr(vpsllvq))]
2911#[stable(feature = "simd_x86", since = "1.27.0")]
2912#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2913pub const fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i {
2914    unsafe {
2915        let count = count.as_u64x2();
2916        let no_overflow: u64x2 = simd_lt(count, u64x2::splat(u64::BITS as u64));
2917        let count = simd_select(no_overflow, count, u64x2::ZERO);
2918        simd_select(no_overflow, simd_shl(a.as_u64x2(), count), u64x2::ZERO).as_m128i()
2919    }
2920}
2921
2922/// Shifts packed 64-bit integers in `a` left by the amount
2923/// specified by the corresponding element in `count` while
2924/// shifting in zeros, and returns the result.
2925///
2926/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi64)
2927#[inline]
2928#[target_feature(enable = "avx2")]
2929#[cfg_attr(test, assert_instr(vpsllvq))]
2930#[stable(feature = "simd_x86", since = "1.27.0")]
2931#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2932pub const fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i {
2933    unsafe {
2934        let count = count.as_u64x4();
2935        let no_overflow: u64x4 = simd_lt(count, u64x4::splat(u64::BITS as u64));
2936        let count = simd_select(no_overflow, count, u64x4::ZERO);
2937        simd_select(no_overflow, simd_shl(a.as_u64x4(), count), u64x4::ZERO).as_m256i()
2938    }
2939}
2940
2941/// Shifts packed 16-bit integers in `a` right by `count` while
2942/// shifting in sign bits.
2943///
2944/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi16)
2945#[inline]
2946#[target_feature(enable = "avx2")]
2947#[cfg_attr(test, assert_instr(vpsraw))]
2948#[stable(feature = "simd_x86", since = "1.27.0")]
2949pub fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i {
2950    unsafe { transmute(psraw(a.as_i16x16(), count.as_i16x8())) }
2951}
2952
2953/// Shifts packed 32-bit integers in `a` right by `count` while
2954/// shifting in sign bits.
2955///
2956/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi32)
2957#[inline]
2958#[target_feature(enable = "avx2")]
2959#[cfg_attr(test, assert_instr(vpsrad))]
2960#[stable(feature = "simd_x86", since = "1.27.0")]
2961pub fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i {
2962    unsafe { transmute(psrad(a.as_i32x8(), count.as_i32x4())) }
2963}
2964
2965/// Shifts packed 16-bit integers in `a` right by `IMM8` while
2966/// shifting in sign bits.
2967///
2968/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi16)
2969#[inline]
2970#[target_feature(enable = "avx2")]
2971#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 7))]
2972#[rustc_legacy_const_generics(1)]
2973#[stable(feature = "simd_x86", since = "1.27.0")]
2974#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2975pub const fn _mm256_srai_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2976    static_assert_uimm_bits!(IMM8, 8);
2977    unsafe { transmute(simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16))) }
2978}
2979
2980/// Shifts packed 32-bit integers in `a` right by `IMM8` while
2981/// shifting in sign bits.
2982///
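/// A minimal usage sketch (values are illustrative); negative elements stay
/// negative because the sign bit is replicated.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi32(-32, -16, -8, -4, 4, 8, 16, 32);
///
/// let c = _mm256_srai_epi32(a, 2);
///
/// let expected = _mm256_setr_epi32(-8, -4, -2, -1, 1, 2, 4, 8);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///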
2983/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi32)
2984#[inline]
2985#[target_feature(enable = "avx2")]
2986#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 7))]
2987#[rustc_legacy_const_generics(1)]
2988#[stable(feature = "simd_x86", since = "1.27.0")]
2989#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2990pub const fn _mm256_srai_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
2991    static_assert_uimm_bits!(IMM8, 8);
2992    unsafe { transmute(simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31)))) }
2993}
2994
2995/// Shifts packed 32-bit integers in `a` right by the amount specified by the
2996/// corresponding element in `count` while shifting in sign bits.
2997///
2998/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srav_epi32)
2999#[inline]
3000#[target_feature(enable = "avx2")]
3001#[cfg_attr(test, assert_instr(vpsravd))]
3002#[stable(feature = "simd_x86", since = "1.27.0")]
3003#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3004pub const fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i {
3005    unsafe {
3006        let count = count.as_u32x4();
3007        let no_overflow: u32x4 = simd_lt(count, u32x4::splat(u32::BITS));
3008        let count = simd_select(no_overflow, transmute(count), i32x4::splat(31));
3009        simd_shr(a.as_i32x4(), count).as_m128i()
3010    }
3011}
3012
3013/// Shifts packed 32-bit integers in `a` right by the amount specified by the
3014/// corresponding element in `count` while shifting in sign bits.
3015///
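/// A minimal usage sketch (values are illustrative):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi32(-64, -64, -64, -64, 64, 64, 64, 64);
/// let count = _mm256_setr_epi32(0, 1, 2, 3, 0, 1, 2, 3);
///
/// let c = _mm256_srav_epi32(a, count);
///
/// let expected = _mm256_setr_epi32(-64, -32, -16, -8, 64, 32, 16, 8);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///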
3016/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srav_epi32)
3017#[inline]
3018#[target_feature(enable = "avx2")]
3019#[cfg_attr(test, assert_instr(vpsravd))]
3020#[stable(feature = "simd_x86", since = "1.27.0")]
3021#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3022pub const fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i {
3023    unsafe {
3024        let count = count.as_u32x8();
3025        let no_overflow: u32x8 = simd_lt(count, u32x8::splat(u32::BITS));
3026        let count = simd_select(no_overflow, transmute(count), i32x8::splat(31));
3027        simd_shr(a.as_i32x8(), count).as_m256i()
3028    }
3029}
3030
3031/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
3032///
3033/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_si256)
3034#[inline]
3035#[target_feature(enable = "avx2")]
3036#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
3037#[rustc_legacy_const_generics(1)]
3038#[stable(feature = "simd_x86", since = "1.27.0")]
3039#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3040pub const fn _mm256_srli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
3041    static_assert_uimm_bits!(IMM8, 8);
3042    _mm256_bsrli_epi128::<IMM8>(a)
3043}
3044
3045/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
3046///
3047/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128)
3048#[inline]
3049#[target_feature(enable = "avx2")]
3050#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
3051#[rustc_legacy_const_generics(1)]
3052#[stable(feature = "simd_x86", since = "1.27.0")]
3053#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3054pub const fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
3055    static_assert_uimm_bits!(IMM8, 8);
3056    const fn mask(shift: i32, i: u32) -> u32 {
3057        let shift = shift as u32 & 0xff;
3058        if shift > 15 || (15 - (i % 16)) < shift {
3059            0
3060        } else {
3061            32 + (i + shift)
3062        }
3063    }
3064    unsafe {
3065        let a = a.as_i8x32();
3066        let r: i8x32 = simd_shuffle!(
3067            i8x32::ZERO,
3068            a,
3069            [
3070                mask(IMM8, 0),
3071                mask(IMM8, 1),
3072                mask(IMM8, 2),
3073                mask(IMM8, 3),
3074                mask(IMM8, 4),
3075                mask(IMM8, 5),
3076                mask(IMM8, 6),
3077                mask(IMM8, 7),
3078                mask(IMM8, 8),
3079                mask(IMM8, 9),
3080                mask(IMM8, 10),
3081                mask(IMM8, 11),
3082                mask(IMM8, 12),
3083                mask(IMM8, 13),
3084                mask(IMM8, 14),
3085                mask(IMM8, 15),
3086                mask(IMM8, 16),
3087                mask(IMM8, 17),
3088                mask(IMM8, 18),
3089                mask(IMM8, 19),
3090                mask(IMM8, 20),
3091                mask(IMM8, 21),
3092                mask(IMM8, 22),
3093                mask(IMM8, 23),
3094                mask(IMM8, 24),
3095                mask(IMM8, 25),
3096                mask(IMM8, 26),
3097                mask(IMM8, 27),
3098                mask(IMM8, 28),
3099                mask(IMM8, 29),
3100                mask(IMM8, 30),
3101                mask(IMM8, 31),
3102            ],
3103        );
3104        transmute(r)
3105    }
3106}
3107
3108/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
3109/// zeros.
3110///
3111/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi16)
3112#[inline]
3113#[target_feature(enable = "avx2")]
3114#[cfg_attr(test, assert_instr(vpsrlw))]
3115#[stable(feature = "simd_x86", since = "1.27.0")]
3116pub fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i {
3117    unsafe { transmute(psrlw(a.as_i16x16(), count.as_i16x8())) }
3118}
3119
3120/// Shifts packed 32-bit integers in `a` right by `count` while shifting in
3121/// zeros.
3122///
3123/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi32)
3124#[inline]
3125#[target_feature(enable = "avx2")]
3126#[cfg_attr(test, assert_instr(vpsrld))]
3127#[stable(feature = "simd_x86", since = "1.27.0")]
3128pub fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i {
3129    unsafe { transmute(psrld(a.as_i32x8(), count.as_i32x4())) }
3130}
3131
3132/// Shifts packed 64-bit integers in `a` right by `count` while shifting in
3133/// zeros.
3134///
3135/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi64)
3136#[inline]
3137#[target_feature(enable = "avx2")]
3138#[cfg_attr(test, assert_instr(vpsrlq))]
3139#[stable(feature = "simd_x86", since = "1.27.0")]
3140pub fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i {
3141    unsafe { transmute(psrlq(a.as_i64x4(), count.as_i64x2())) }
3142}
3143
3144/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
3146///
3147/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi16)
3148#[inline]
3149#[target_feature(enable = "avx2")]
3150#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 7))]
3151#[rustc_legacy_const_generics(1)]
3152#[stable(feature = "simd_x86", since = "1.27.0")]
3153#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3154pub const fn _mm256_srli_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
3155    static_assert_uimm_bits!(IMM8, 8);
3156    unsafe {
3157        if IMM8 >= 16 {
3158            _mm256_setzero_si256()
3159        } else {
3160            transmute(simd_shr(a.as_u16x16(), u16x16::splat(IMM8 as u16)))
3161        }
3162    }
3163}
3164
3165/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
3167///
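/// A minimal usage sketch (values are illustrative):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi32(16, 32, 48, 64, 80, 96, 112, 128);
///
/// let c = _mm256_srli_epi32(a, 4);
///
/// let expected = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///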
3168/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi32)
3169#[inline]
3170#[target_feature(enable = "avx2")]
3171#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 7))]
3172#[rustc_legacy_const_generics(1)]
3173#[stable(feature = "simd_x86", since = "1.27.0")]
3174#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3175pub const fn _mm256_srli_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
3176    static_assert_uimm_bits!(IMM8, 8);
3177    unsafe {
3178        if IMM8 >= 32 {
3179            _mm256_setzero_si256()
3180        } else {
3181            transmute(simd_shr(a.as_u32x8(), u32x8::splat(IMM8 as u32)))
3182        }
3183    }
3184}
3185
3186/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
3188///
3189/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi64)
3190#[inline]
3191#[target_feature(enable = "avx2")]
3192#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 7))]
3193#[rustc_legacy_const_generics(1)]
3194#[stable(feature = "simd_x86", since = "1.27.0")]
3195#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3196pub const fn _mm256_srli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
3197    static_assert_uimm_bits!(IMM8, 8);
3198    unsafe {
3199        if IMM8 >= 64 {
3200            _mm256_setzero_si256()
3201        } else {
3202            transmute(simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64)))
3203        }
3204    }
3205}
3206
3207/// Shifts packed 32-bit integers in `a` right by the amount specified by
/// the corresponding element in `count` while shifting in zeros.
3209///
3210/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi32)
3211#[inline]
3212#[target_feature(enable = "avx2")]
3213#[cfg_attr(test, assert_instr(vpsrlvd))]
3214#[stable(feature = "simd_x86", since = "1.27.0")]
3215#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3216pub const fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i {
3217    unsafe {
3218        let count = count.as_u32x4();
3219        let no_overflow: u32x4 = simd_lt(count, u32x4::splat(u32::BITS));
3220        let count = simd_select(no_overflow, count, u32x4::ZERO);
3221        simd_select(no_overflow, simd_shr(a.as_u32x4(), count), u32x4::ZERO).as_m128i()
3222    }
3223}
3224
3225/// Shifts packed 32-bit integers in `a` right by the amount specified by
/// the corresponding element in `count` while shifting in zeros.
3227///
3228/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi32)
3229#[inline]
3230#[target_feature(enable = "avx2")]
3231#[cfg_attr(test, assert_instr(vpsrlvd))]
3232#[stable(feature = "simd_x86", since = "1.27.0")]
3233#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3234pub const fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i {
3235    unsafe {
3236        let count = count.as_u32x8();
3237        let no_overflow: u32x8 = simd_lt(count, u32x8::splat(u32::BITS));
3238        let count = simd_select(no_overflow, count, u32x8::ZERO);
3239        simd_select(no_overflow, simd_shr(a.as_u32x8(), count), u32x8::ZERO).as_m256i()
3240    }
3241}
3242
3243/// Shifts packed 64-bit integers in `a` right by the amount specified by
/// the corresponding element in `count` while shifting in zeros.
3245///
3246/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi64)
3247#[inline]
3248#[target_feature(enable = "avx2")]
3249#[cfg_attr(test, assert_instr(vpsrlvq))]
3250#[stable(feature = "simd_x86", since = "1.27.0")]
3251#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3252pub const fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i {
3253    unsafe {
3254        let count = count.as_u64x2();
3255        let no_overflow: u64x2 = simd_lt(count, u64x2::splat(u64::BITS as u64));
3256        let count = simd_select(no_overflow, count, u64x2::ZERO);
3257        simd_select(no_overflow, simd_shr(a.as_u64x2(), count), u64x2::ZERO).as_m128i()
3258    }
3259}
3260
3261/// Shifts packed 64-bit integers in `a` right by the amount specified by
/// the corresponding element in `count` while shifting in zeros.
3263///
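/// A minimal usage sketch (values are illustrative); counts of 64 or more
/// zero the corresponding element.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi64x(64);
/// let count = _mm256_setr_epi64x(0, 2, 4, 64);
///
/// let c = _mm256_srlv_epi64(a, count);
///
/// let expected = _mm256_setr_epi64x(64, 16, 4, 0);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///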
3264/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi64)
3265#[inline]
3266#[target_feature(enable = "avx2")]
3267#[cfg_attr(test, assert_instr(vpsrlvq))]
3268#[stable(feature = "simd_x86", since = "1.27.0")]
3269#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3270pub const fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
3271    unsafe {
3272        let count = count.as_u64x4();
3273        let no_overflow: u64x4 = simd_lt(count, u64x4::splat(u64::BITS as u64));
3274        let count = simd_select(no_overflow, count, u64x4::ZERO);
3275        simd_select(no_overflow, simd_shr(a.as_u64x4(), count), u64x4::ZERO).as_m256i()
3276    }
3277}
3278
/// Loads 256 bits of integer data from memory into `dst` using a non-temporal memory hint.
/// `mem_addr` must be aligned on a 32-byte boundary or a general-protection exception may be
/// generated. To minimize caching, the data is flagged as non-temporal (unlikely to be used
/// again soon).
3282///
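/// A minimal usage sketch; the `Aligned` wrapper is only an illustrative way
/// to satisfy the 32-byte alignment requirement, any suitably aligned source
/// works.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// // Illustrative wrapper guaranteeing the required 32-byte alignment.
/// #[repr(align(32))]
/// struct Aligned([i64; 4]);
///
/// let data = Aligned([1, 2, 3, 4]);
/// let v = unsafe { _mm256_stream_load_si256(data.0.as_ptr() as *const __m256i) };
///
/// let expected = _mm256_setr_epi64x(1, 2, 3, 4);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///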
3283/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_load_si256)
3284#[inline]
3285#[target_feature(enable = "avx2")]
3286#[cfg_attr(test, assert_instr(vmovntdqa))]
3287#[stable(feature = "simd_x86_updates", since = "1.82.0")]
3288pub unsafe fn _mm256_stream_load_si256(mem_addr: *const __m256i) -> __m256i {
3289    let dst: __m256i;
3290    crate::arch::asm!(
3291        vpl!("vmovntdqa {a}"),
3292        a = out(ymm_reg) dst,
3293        p = in(reg) mem_addr,
3294        options(pure, readonly, nostack, preserves_flags),
3295    );
3296    dst
3297}
3298
/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`.
3300///
3301/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi16)
3302#[inline]
3303#[target_feature(enable = "avx2")]
3304#[cfg_attr(test, assert_instr(vpsubw))]
3305#[stable(feature = "simd_x86", since = "1.27.0")]
3306#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3307pub const fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i {
3308    unsafe { transmute(simd_sub(a.as_i16x16(), b.as_i16x16())) }
3309}
3310
/// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`.
3312///
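/// A minimal usage sketch (values are illustrative):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
/// let b = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
///
/// let c = _mm256_sub_epi32(a, b);
///
/// let expected = _mm256_setr_epi32(-7, -5, -3, -1, 1, 3, 5, 7);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///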
3313/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi32)
3314#[inline]
3315#[target_feature(enable = "avx2")]
3316#[cfg_attr(test, assert_instr(vpsubd))]
3317#[stable(feature = "simd_x86", since = "1.27.0")]
3318#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3319pub const fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i {
3320    unsafe { transmute(simd_sub(a.as_i32x8(), b.as_i32x8())) }
3321}
3322
/// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`.
3324///
3325/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi64)
3326#[inline]
3327#[target_feature(enable = "avx2")]
3328#[cfg_attr(test, assert_instr(vpsubq))]
3329#[stable(feature = "simd_x86", since = "1.27.0")]
3330#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3331pub const fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i {
3332    unsafe { transmute(simd_sub(a.as_i64x4(), b.as_i64x4())) }
3333}
3334
/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`.
3336///
3337/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi8)
3338#[inline]
3339#[target_feature(enable = "avx2")]
3340#[cfg_attr(test, assert_instr(vpsubb))]
3341#[stable(feature = "simd_x86", since = "1.27.0")]
3342#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3343pub const fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i {
3344    unsafe { transmute(simd_sub(a.as_i8x32(), b.as_i8x32())) }
3345}
3346
/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in
3348/// `a` using saturation.
3349///
3350/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi16)
3351#[inline]
3352#[target_feature(enable = "avx2")]
3353#[cfg_attr(test, assert_instr(vpsubsw))]
3354#[stable(feature = "simd_x86", since = "1.27.0")]
3355#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3356pub const fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i {
3357    unsafe { transmute(simd_saturating_sub(a.as_i16x16(), b.as_i16x16())) }
3358}
3359
/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in
3361/// `a` using saturation.
3362///
3363/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi8)
3364#[inline]
3365#[target_feature(enable = "avx2")]
3366#[cfg_attr(test, assert_instr(vpsubsb))]
3367#[stable(feature = "simd_x86", since = "1.27.0")]
3368#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3369pub const fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i {
3370    unsafe { transmute(simd_saturating_sub(a.as_i8x32(), b.as_i8x32())) }
3371}
3372
/// Subtracts packed unsigned 16-bit integers in `b` from packed unsigned
/// 16-bit integers in `a` using saturation.
3375///
3376/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu16)
3377#[inline]
3378#[target_feature(enable = "avx2")]
3379#[cfg_attr(test, assert_instr(vpsubusw))]
3380#[stable(feature = "simd_x86", since = "1.27.0")]
3381#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3382pub const fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i {
3383    unsafe { transmute(simd_saturating_sub(a.as_u16x16(), b.as_u16x16())) }
3384}
3385
/// Subtracts packed unsigned 8-bit integers in `b` from packed unsigned
/// 8-bit integers in `a` using saturation.
3388///
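/// A minimal usage sketch (values are illustrative); differences that would
/// underflow saturate at zero.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi8(10);
/// let b = _mm256_set1_epi8(100);
///
/// let c = _mm256_subs_epu8(a, b);
///
/// let expected = _mm256_set1_epi8(0);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///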
3389/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu8)
3390#[inline]
3391#[target_feature(enable = "avx2")]
3392#[cfg_attr(test, assert_instr(vpsubusb))]
3393#[stable(feature = "simd_x86", since = "1.27.0")]
3394#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3395pub const fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i {
3396    unsafe { transmute(simd_saturating_sub(a.as_u8x32(), b.as_u8x32())) }
3397}
3398
/// Unpacks and interleaves 8-bit integers from the high half of each
/// 128-bit lane of `a` and `b`.
3401///
3402/// ```rust
3403/// #[cfg(target_arch = "x86")]
3404/// use std::arch::x86::*;
3405/// #[cfg(target_arch = "x86_64")]
3406/// use std::arch::x86_64::*;
3407///
3408/// # fn main() {
3409/// #     if is_x86_feature_detected!("avx2") {
3410/// #         #[target_feature(enable = "avx2")]
3411/// #         unsafe fn worker() {
3412/// let a = _mm256_setr_epi8(
3413///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3414///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3415/// );
3416/// let b = _mm256_setr_epi8(
3417///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3418///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3419///     -30, -31,
3420/// );
3421///
3422/// let c = _mm256_unpackhi_epi8(a, b);
3423///
3424/// let expected = _mm256_setr_epi8(
3425///     8, -8, 9, -9, 10, -10, 11, -11, 12, -12, 13, -13, 14, -14, 15, -15,
3426///     24, -24, 25, -25, 26, -26, 27, -27, 28, -28, 29, -29, 30, -30, 31,
3427///     -31,
3428/// );
3429/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3430///
3431/// #         }
3432/// #         unsafe { worker(); }
3433/// #     }
3434/// # }
3435/// ```
3436///
3437/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi8)
3438#[inline]
3439#[target_feature(enable = "avx2")]
3440#[cfg_attr(test, assert_instr(vpunpckhbw))]
3441#[stable(feature = "simd_x86", since = "1.27.0")]
3442#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3443pub const fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i {
3444    unsafe {
3445        #[rustfmt::skip]
3446        let r: i8x32 = simd_shuffle!(a.as_i8x32(), b.as_i8x32(), [
3447                8, 40, 9, 41, 10, 42, 11, 43,
3448                12, 44, 13, 45, 14, 46, 15, 47,
3449                24, 56, 25, 57, 26, 58, 27, 59,
3450                28, 60, 29, 61, 30, 62, 31, 63,
3451        ]);
3452        transmute(r)
3453    }
3454}
3455
/// Unpacks and interleaves 8-bit integers from the low half of each
3457/// 128-bit lane of `a` and `b`.
3458///
3459/// ```rust
3460/// #[cfg(target_arch = "x86")]
3461/// use std::arch::x86::*;
3462/// #[cfg(target_arch = "x86_64")]
3463/// use std::arch::x86_64::*;
3464///
3465/// # fn main() {
3466/// #     if is_x86_feature_detected!("avx2") {
3467/// #         #[target_feature(enable = "avx2")]
3468/// #         unsafe fn worker() {
3469/// let a = _mm256_setr_epi8(
3470///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3471///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3472/// );
3473/// let b = _mm256_setr_epi8(
3474///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3475///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3476///     -30, -31,
3477/// );
3478///
3479/// let c = _mm256_unpacklo_epi8(a, b);
3480///
3481/// let expected = _mm256_setr_epi8(
3482///     0, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 16, -16, 17,
3483///     -17, 18, -18, 19, -19, 20, -20, 21, -21, 22, -22, 23, -23,
3484/// );
3485/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3486///
3487/// #         }
3488/// #         unsafe { worker(); }
3489/// #     }
3490/// # }
3491/// ```
3492///
3493/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi8)
3494#[inline]
3495#[target_feature(enable = "avx2")]
3496#[cfg_attr(test, assert_instr(vpunpcklbw))]
3497#[stable(feature = "simd_x86", since = "1.27.0")]
3498#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3499pub const fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i {
3500    unsafe {
3501        #[rustfmt::skip]
3502        let r: i8x32 = simd_shuffle!(a.as_i8x32(), b.as_i8x32(), [
3503            0, 32, 1, 33, 2, 34, 3, 35,
3504            4, 36, 5, 37, 6, 38, 7, 39,
3505            16, 48, 17, 49, 18, 50, 19, 51,
3506            20, 52, 21, 53, 22, 54, 23, 55,
3507        ]);
3508        transmute(r)
3509    }
3510}
3511
/// Unpacks and interleaves 16-bit integers from the high half of each
3513/// 128-bit lane of `a` and `b`.
3514///
3515/// ```rust
3516/// #[cfg(target_arch = "x86")]
3517/// use std::arch::x86::*;
3518/// #[cfg(target_arch = "x86_64")]
3519/// use std::arch::x86_64::*;
3520///
3521/// # fn main() {
3522/// #     if is_x86_feature_detected!("avx2") {
3523/// #         #[target_feature(enable = "avx2")]
3524/// #         unsafe fn worker() {
3525/// let a = _mm256_setr_epi16(
3526///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3527/// );
3528/// let b = _mm256_setr_epi16(
3529///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3530/// );
3531///
3532/// let c = _mm256_unpackhi_epi16(a, b);
3533///
3534/// let expected = _mm256_setr_epi16(
3535///     4, -4, 5, -5, 6, -6, 7, -7, 12, -12, 13, -13, 14, -14, 15, -15,
3536/// );
3537/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3538///
3539/// #         }
3540/// #         unsafe { worker(); }
3541/// #     }
3542/// # }
3543/// ```
3544///
3545/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi16)
3546#[inline]
3547#[target_feature(enable = "avx2")]
3548#[cfg_attr(test, assert_instr(vpunpckhwd))]
3549#[stable(feature = "simd_x86", since = "1.27.0")]
3550#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3551pub const fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
3552    unsafe {
3553        let r: i16x16 = simd_shuffle!(
3554            a.as_i16x16(),
3555            b.as_i16x16(),
3556            [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31],
3557        );
3558        transmute(r)
3559    }
3560}
3561
/// Unpacks and interleaves 16-bit integers from the low half of each
3563/// 128-bit lane of `a` and `b`.
3564///
3565/// ```rust
3566/// #[cfg(target_arch = "x86")]
3567/// use std::arch::x86::*;
3568/// #[cfg(target_arch = "x86_64")]
3569/// use std::arch::x86_64::*;
3570///
3571/// # fn main() {
3572/// #     if is_x86_feature_detected!("avx2") {
3573/// #         #[target_feature(enable = "avx2")]
3574/// #         unsafe fn worker() {
3576/// let a = _mm256_setr_epi16(
3577///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3578/// );
3579/// let b = _mm256_setr_epi16(
3580///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3581/// );
3582///
3583/// let c = _mm256_unpacklo_epi16(a, b);
3584///
3585/// let expected = _mm256_setr_epi16(
3586///     0, 0, 1, -1, 2, -2, 3, -3, 8, -8, 9, -9, 10, -10, 11, -11,
3587/// );
3588/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3589///
3590/// #         }
3591/// #         unsafe { worker(); }
3592/// #     }
3593/// # }
3594/// ```
3595///
3596/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi16)
3597#[inline]
3598#[target_feature(enable = "avx2")]
3599#[cfg_attr(test, assert_instr(vpunpcklwd))]
3600#[stable(feature = "simd_x86", since = "1.27.0")]
3601#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3602pub const fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
3603    unsafe {
3604        let r: i16x16 = simd_shuffle!(
3605            a.as_i16x16(),
3606            b.as_i16x16(),
3607            [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27],
3608        );
3609        transmute(r)
3610    }
3611}
3612
/// Unpacks and interleaves 32-bit integers from the high half of each
3614/// 128-bit lane of `a` and `b`.
3615///
3616/// ```rust
3617/// #[cfg(target_arch = "x86")]
3618/// use std::arch::x86::*;
3619/// #[cfg(target_arch = "x86_64")]
3620/// use std::arch::x86_64::*;
3621///
3622/// # fn main() {
3623/// #     if is_x86_feature_detected!("avx2") {
3624/// #         #[target_feature(enable = "avx2")]
3625/// #         unsafe fn worker() {
3626/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3627/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3628///
3629/// let c = _mm256_unpackhi_epi32(a, b);
3630///
3631/// let expected = _mm256_setr_epi32(2, -2, 3, -3, 6, -6, 7, -7);
3632/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3633///
3634/// #         }
3635/// #         unsafe { worker(); }
3636/// #     }
3637/// # }
3638/// ```
3639///
3640/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi32)
3641#[inline]
3642#[target_feature(enable = "avx2")]
3643#[cfg_attr(test, assert_instr(vunpckhps))]
3644#[stable(feature = "simd_x86", since = "1.27.0")]
3645#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3646pub const fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
3647    unsafe {
3648        let r: i32x8 = simd_shuffle!(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]);
3649        transmute(r)
3650    }
3651}
3652
/// Unpacks and interleaves 32-bit integers from the low half of each
3654/// 128-bit lane of `a` and `b`.
3655///
3656/// ```rust
3657/// #[cfg(target_arch = "x86")]
3658/// use std::arch::x86::*;
3659/// #[cfg(target_arch = "x86_64")]
3660/// use std::arch::x86_64::*;
3661///
3662/// # fn main() {
3663/// #     if is_x86_feature_detected!("avx2") {
3664/// #         #[target_feature(enable = "avx2")]
3665/// #         unsafe fn worker() {
3666/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3667/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3668///
3669/// let c = _mm256_unpacklo_epi32(a, b);
3670///
3671/// let expected = _mm256_setr_epi32(0, 0, 1, -1, 4, -4, 5, -5);
3672/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3673///
3674/// #         }
3675/// #         unsafe { worker(); }
3676/// #     }
3677/// # }
3678/// ```
3679///
3680/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi32)
3681#[inline]
3682#[target_feature(enable = "avx2")]
3683#[cfg_attr(test, assert_instr(vunpcklps))]
3684#[stable(feature = "simd_x86", since = "1.27.0")]
3685#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3686pub const fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
3687    unsafe {
3688        let r: i32x8 = simd_shuffle!(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]);
3689        transmute(r)
3690    }
3691}
3692
/// Unpacks and interleaves 64-bit integers from the high half of each
3694/// 128-bit lane of `a` and `b`.
3695///
3696/// ```rust
3697/// #[cfg(target_arch = "x86")]
3698/// use std::arch::x86::*;
3699/// #[cfg(target_arch = "x86_64")]
3700/// use std::arch::x86_64::*;
3701///
3702/// # fn main() {
3703/// #     if is_x86_feature_detected!("avx2") {
3704/// #         #[target_feature(enable = "avx2")]
3705/// #         unsafe fn worker() {
3706/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3707/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3708///
3709/// let c = _mm256_unpackhi_epi64(a, b);
3710///
3711/// let expected = _mm256_setr_epi64x(1, -1, 3, -3);
3712/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3713///
3714/// #         }
3715/// #         unsafe { worker(); }
3716/// #     }
3717/// # }
3718/// ```
3719///
3720/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi64)
3721#[inline]
3722#[target_feature(enable = "avx2")]
3723#[cfg_attr(test, assert_instr(vunpckhpd))]
3724#[stable(feature = "simd_x86", since = "1.27.0")]
3725#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3726pub const fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i {
3727    unsafe {
3728        let r: i64x4 = simd_shuffle!(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]);
3729        transmute(r)
3730    }
3731}
3732
/// Unpacks and interleaves 64-bit integers from the low half of each
3734/// 128-bit lane of `a` and `b`.
3735///
3736/// ```rust
3737/// #[cfg(target_arch = "x86")]
3738/// use std::arch::x86::*;
3739/// #[cfg(target_arch = "x86_64")]
3740/// use std::arch::x86_64::*;
3741///
3742/// # fn main() {
3743/// #     if is_x86_feature_detected!("avx2") {
3744/// #         #[target_feature(enable = "avx2")]
3745/// #         unsafe fn worker() {
3746/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3747/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3748///
3749/// let c = _mm256_unpacklo_epi64(a, b);
3750///
3751/// let expected = _mm256_setr_epi64x(0, 0, 2, -2);
3752/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3753///
3754/// #         }
3755/// #         unsafe { worker(); }
3756/// #     }
3757/// # }
3758/// ```
3759///
3760/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi64)
3761#[inline]
3762#[target_feature(enable = "avx2")]
3763#[cfg_attr(test, assert_instr(vunpcklpd))]
3764#[stable(feature = "simd_x86", since = "1.27.0")]
3765#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3766pub const fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i {
3767    unsafe {
3768        let r: i64x4 = simd_shuffle!(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]);
3769        transmute(r)
3770    }
3771}
3772
3773/// Computes the bitwise XOR of 256 bits (representing integer data)
3774/// in `a` and `b`
3775///
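/// A minimal usage sketch (values are illustrative):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi8(0b0101);
/// let b = _mm256_set1_epi8(0b0011);
///
/// let c = _mm256_xor_si256(a, b);
///
/// let expected = _mm256_set1_epi8(0b0110);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///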
3776/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_si256)
3777#[inline]
3778#[target_feature(enable = "avx2")]
3779#[cfg_attr(test, assert_instr(vxorps))]
3780#[stable(feature = "simd_x86", since = "1.27.0")]
3781#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3782pub const fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
3783    unsafe { transmute(simd_xor(a.as_i64x4(), b.as_i64x4())) }
3784}
3785
3786/// Extracts an 8-bit integer from `a`, selected with `INDEX`. Returns a 32-bit
3787/// integer containing the zero-extended integer data.
3788///
3789/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3790///
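/// A minimal usage sketch (values are illustrative); the extracted byte is
/// zero-extended, so `-1` comes back as `255`.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi8(-1);
/// let r = _mm256_extract_epi8(a, 3);
/// assert_eq!(r, 255);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///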
3791/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi8)
3792#[inline]
3793#[target_feature(enable = "avx2")]
3794// This intrinsic has no corresponding instruction.
3795#[rustc_legacy_const_generics(1)]
3796#[stable(feature = "simd_x86", since = "1.27.0")]
3797#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3798pub const fn _mm256_extract_epi8<const INDEX: i32>(a: __m256i) -> i32 {
3799    static_assert_uimm_bits!(INDEX, 5);
3800    unsafe { simd_extract!(a.as_u8x32(), INDEX as u32, u8) as i32 }
3801}
3802
3803/// Extracts a 16-bit integer from `a`, selected with `INDEX`. Returns a 32-bit
3804/// integer containing the zero-extended integer data.
3805///
3806/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3807///
3808/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi16)
3809#[inline]
3810#[target_feature(enable = "avx2")]
3811// This intrinsic has no corresponding instruction.
3812#[rustc_legacy_const_generics(1)]
3813#[stable(feature = "simd_x86", since = "1.27.0")]
3814#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3815pub const fn _mm256_extract_epi16<const INDEX: i32>(a: __m256i) -> i32 {
3816    static_assert_uimm_bits!(INDEX, 4);
3817    unsafe { simd_extract!(a.as_u16x16(), INDEX as u32, u16) as i32 }
3818}
3819
3820#[allow(improper_ctypes)]
3821unsafe extern "C" {
3822    #[link_name = "llvm.x86.avx2.pmadd.wd"]
3823    fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
3824    #[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
3825    fn pmaddubsw(a: u8x32, b: i8x32) -> i16x16;
3826    #[link_name = "llvm.x86.avx2.mpsadbw"]
3827    fn mpsadbw(a: u8x32, b: u8x32, imm8: i8) -> u16x16;
3828    #[link_name = "llvm.x86.avx2.pmul.hr.sw"]
3829    fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16;
3830    #[link_name = "llvm.x86.avx2.packsswb"]
3831    fn packsswb(a: i16x16, b: i16x16) -> i8x32;
3832    #[link_name = "llvm.x86.avx2.packssdw"]
3833    fn packssdw(a: i32x8, b: i32x8) -> i16x16;
3834    #[link_name = "llvm.x86.avx2.packuswb"]
3835    fn packuswb(a: i16x16, b: i16x16) -> u8x32;
3836    #[link_name = "llvm.x86.avx2.packusdw"]
3837    fn packusdw(a: i32x8, b: i32x8) -> u16x16;
3838    #[link_name = "llvm.x86.avx2.psad.bw"]
3839    fn psadbw(a: u8x32, b: u8x32) -> u64x4;
3840    #[link_name = "llvm.x86.avx2.psign.b"]
3841    fn psignb(a: i8x32, b: i8x32) -> i8x32;
3842    #[link_name = "llvm.x86.avx2.psign.w"]
3843    fn psignw(a: i16x16, b: i16x16) -> i16x16;
3844    #[link_name = "llvm.x86.avx2.psign.d"]
3845    fn psignd(a: i32x8, b: i32x8) -> i32x8;
3846    #[link_name = "llvm.x86.avx2.psll.w"]
3847    fn psllw(a: i16x16, count: i16x8) -> i16x16;
3848    #[link_name = "llvm.x86.avx2.psll.d"]
3849    fn pslld(a: i32x8, count: i32x4) -> i32x8;
3850    #[link_name = "llvm.x86.avx2.psll.q"]
3851    fn psllq(a: i64x4, count: i64x2) -> i64x4;
3852    #[link_name = "llvm.x86.avx2.psra.w"]
3853    fn psraw(a: i16x16, count: i16x8) -> i16x16;
3854    #[link_name = "llvm.x86.avx2.psra.d"]
3855    fn psrad(a: i32x8, count: i32x4) -> i32x8;
3856    #[link_name = "llvm.x86.avx2.psrl.w"]
3857    fn psrlw(a: i16x16, count: i16x8) -> i16x16;
3858    #[link_name = "llvm.x86.avx2.psrl.d"]
3859    fn psrld(a: i32x8, count: i32x4) -> i32x8;
3860    #[link_name = "llvm.x86.avx2.psrl.q"]
3861    fn psrlq(a: i64x4, count: i64x2) -> i64x4;
3862    #[link_name = "llvm.x86.avx2.pshuf.b"]
3863    fn pshufb(a: u8x32, b: u8x32) -> u8x32;
3864    #[link_name = "llvm.x86.avx2.permd"]
3865    fn permd(a: u32x8, b: u32x8) -> u32x8;
3866    #[link_name = "llvm.x86.avx2.permps"]
3867    fn permps(a: __m256, b: i32x8) -> __m256;
3868    #[link_name = "llvm.x86.avx2.gather.d.d"]
3869    fn pgatherdd(src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8) -> i32x4;
3870    #[link_name = "llvm.x86.avx2.gather.d.d.256"]
3871    fn vpgatherdd(src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8) -> i32x8;
3872    #[link_name = "llvm.x86.avx2.gather.d.q"]
3873    fn pgatherdq(src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8) -> i64x2;
3874    #[link_name = "llvm.x86.avx2.gather.d.q.256"]
3875    fn vpgatherdq(src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8) -> i64x4;
3876    #[link_name = "llvm.x86.avx2.gather.q.d"]
3877    fn pgatherqd(src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8) -> i32x4;
3878    #[link_name = "llvm.x86.avx2.gather.q.d.256"]
3879    fn vpgatherqd(src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8) -> i32x4;
3880    #[link_name = "llvm.x86.avx2.gather.q.q"]
3881    fn pgatherqq(src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8) -> i64x2;
3882    #[link_name = "llvm.x86.avx2.gather.q.q.256"]
3883    fn vpgatherqq(src: i64x4, slice: *const i8, offsets: i64x4, mask: i64x4, scale: i8) -> i64x4;
3884    #[link_name = "llvm.x86.avx2.gather.d.pd"]
3885    fn pgatherdpd(
3886        src: __m128d,
3887        slice: *const i8,
3888        offsets: i32x4,
3889        mask: __m128d,
3890        scale: i8,
3891    ) -> __m128d;
3892    #[link_name = "llvm.x86.avx2.gather.d.pd.256"]
3893    fn vpgatherdpd(
3894        src: __m256d,
3895        slice: *const i8,
3896        offsets: i32x4,
3897        mask: __m256d,
3898        scale: i8,
3899    ) -> __m256d;
3900    #[link_name = "llvm.x86.avx2.gather.q.pd"]
3901    fn pgatherqpd(
3902        src: __m128d,
3903        slice: *const i8,
3904        offsets: i64x2,
3905        mask: __m128d,
3906        scale: i8,
3907    ) -> __m128d;
3908    #[link_name = "llvm.x86.avx2.gather.q.pd.256"]
3909    fn vpgatherqpd(
3910        src: __m256d,
3911        slice: *const i8,
3912        offsets: i64x4,
3913        mask: __m256d,
3914        scale: i8,
3915    ) -> __m256d;
3916    #[link_name = "llvm.x86.avx2.gather.d.ps"]
3917    fn pgatherdps(src: __m128, slice: *const i8, offsets: i32x4, mask: __m128, scale: i8)
3918    -> __m128;
3919    #[link_name = "llvm.x86.avx2.gather.d.ps.256"]
3920    fn vpgatherdps(
3921        src: __m256,
3922        slice: *const i8,
3923        offsets: i32x8,
3924        mask: __m256,
3925        scale: i8,
3926    ) -> __m256;
3927    #[link_name = "llvm.x86.avx2.gather.q.ps"]
3928    fn pgatherqps(src: __m128, slice: *const i8, offsets: i64x2, mask: __m128, scale: i8)
3929    -> __m128;
3930    #[link_name = "llvm.x86.avx2.gather.q.ps.256"]
3931    fn vpgatherqps(
3932        src: __m128,
3933        slice: *const i8,
3934        offsets: i64x4,
3935        mask: __m128,
3936        scale: i8,
3937    ) -> __m128;
3938}
3939
3940#[cfg(test)]
3941mod tests {
3942    use crate::core_arch::assert_eq_const as assert_eq;
3943
3944    use stdarch_test::simd_test;
3945
3946    use crate::core_arch::x86::*;
3947
3948    #[simd_test(enable = "avx2")]
3949    const fn test_mm256_abs_epi32() {
3950        #[rustfmt::skip]
3951        let a = _mm256_setr_epi32(
3952            0, 1, -1, i32::MAX,
3953            i32::MIN, 100, -100, -32,
3954        );
3955        let r = _mm256_abs_epi32(a);
3956        #[rustfmt::skip]
3957        let e = _mm256_setr_epi32(
3958            0, 1, 1, i32::MAX,
3959            i32::MAX.wrapping_add(1), 100, 100, 32,
3960        );
3961        assert_eq_m256i(r, e);
3962    }
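
    // Note on the `i32::MIN` lane above: two's-complement absolute value
    // wraps, so `abs(i32::MIN)` is `i32::MIN` itself, spelled
    // `i32::MAX.wrapping_add(1)` here to make the wrap explicit. The epi16 and
    // epi8 tests below use the same convention for `i16::MIN` and `i8::MIN`.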
3963
3964    #[simd_test(enable = "avx2")]
3965    const fn test_mm256_abs_epi16() {
3966        #[rustfmt::skip]
3967        let a = _mm256_setr_epi16(
3968            0,  1, -1, 2, -2, 3, -3, 4,
3969            -4, 5, -5, i16::MAX, i16::MIN, 100, -100, -32,
3970        );
3971        let r = _mm256_abs_epi16(a);
3972        #[rustfmt::skip]
3973        let e = _mm256_setr_epi16(
3974            0, 1, 1, 2, 2, 3, 3, 4,
3975            4, 5, 5, i16::MAX, i16::MAX.wrapping_add(1), 100, 100, 32,
3976        );
3977        assert_eq_m256i(r, e);
3978    }
3979
3980    #[simd_test(enable = "avx2")]
3981    const fn test_mm256_abs_epi8() {
3982        #[rustfmt::skip]
3983        let a = _mm256_setr_epi8(
3984            0, 1, -1, 2, -2, 3, -3, 4,
3985            -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
3986            0, 1, -1, 2, -2, 3, -3, 4,
3987            -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
3988        );
3989        let r = _mm256_abs_epi8(a);
3990        #[rustfmt::skip]
3991        let e = _mm256_setr_epi8(
3992            0, 1, 1, 2, 2, 3, 3, 4,
3993            4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
3994            0, 1, 1, 2, 2, 3, 3, 4,
3995            4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
3996        );
3997        assert_eq_m256i(r, e);
3998    }
3999
4000    #[simd_test(enable = "avx2")]
4001    const fn test_mm256_add_epi64() {
4002        let a = _mm256_setr_epi64x(-10, 0, 100, 1_000_000_000);
4003        let b = _mm256_setr_epi64x(-1, 0, 1, 2);
4004        let r = _mm256_add_epi64(a, b);
4005        let e = _mm256_setr_epi64x(-11, 0, 101, 1_000_000_002);
4006        assert_eq_m256i(r, e);
4007    }
4008
4009    #[simd_test(enable = "avx2")]
4010    const fn test_mm256_add_epi32() {
4011        let a = _mm256_setr_epi32(-1, 0, 1, 2, 3, 4, 5, 6);
4012        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4013        let r = _mm256_add_epi32(a, b);
4014        let e = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
4015        assert_eq_m256i(r, e);
4016    }
4017
4018    #[simd_test(enable = "avx2")]
4019    const fn test_mm256_add_epi16() {
4020        #[rustfmt::skip]
4021        let a = _mm256_setr_epi16(
4022            0, 1, 2, 3, 4, 5, 6, 7,
4023            8, 9, 10, 11, 12, 13, 14, 15,
4024        );
4025        #[rustfmt::skip]
4026        let b = _mm256_setr_epi16(
4027            0, 1, 2, 3, 4, 5, 6, 7,
4028            8, 9, 10, 11, 12, 13, 14, 15,
4029        );
4030        let r = _mm256_add_epi16(a, b);
4031        #[rustfmt::skip]
4032        let e = _mm256_setr_epi16(
4033            0, 2, 4, 6, 8, 10, 12, 14,
4034            16, 18, 20, 22, 24, 26, 28, 30,
4035        );
4036        assert_eq_m256i(r, e);
4037    }
4038
4039    #[simd_test(enable = "avx2")]
4040    const fn test_mm256_add_epi8() {
4041        #[rustfmt::skip]
4042        let a = _mm256_setr_epi8(
4043            0, 1, 2, 3, 4, 5, 6, 7,
4044            8, 9, 10, 11, 12, 13, 14, 15,
4045            16, 17, 18, 19, 20, 21, 22, 23,
4046            24, 25, 26, 27, 28, 29, 30, 31,
4047        );
4048        #[rustfmt::skip]
4049        let b = _mm256_setr_epi8(
4050            0, 1, 2, 3, 4, 5, 6, 7,
4051            8, 9, 10, 11, 12, 13, 14, 15,
4052            16, 17, 18, 19, 20, 21, 22, 23,
4053            24, 25, 26, 27, 28, 29, 30, 31,
4054        );
4055        let r = _mm256_add_epi8(a, b);
4056        #[rustfmt::skip]
4057        let e = _mm256_setr_epi8(
4058            0, 2, 4, 6, 8, 10, 12, 14,
4059            16, 18, 20, 22, 24, 26, 28, 30,
4060            32, 34, 36, 38, 40, 42, 44, 46,
4061            48, 50, 52, 54, 56, 58, 60, 62,
4062        );
4063        assert_eq_m256i(r, e);
4064    }
4065
4066    #[simd_test(enable = "avx2")]
4067    const fn test_mm256_adds_epi8() {
4068        #[rustfmt::skip]
4069        let a = _mm256_setr_epi8(
4070            0, 1, 2, 3, 4, 5, 6, 7,
4071            8, 9, 10, 11, 12, 13, 14, 15,
4072            16, 17, 18, 19, 20, 21, 22, 23,
4073            24, 25, 26, 27, 28, 29, 30, 31,
4074        );
4075        #[rustfmt::skip]
4076        let b = _mm256_setr_epi8(
4077            32, 33, 34, 35, 36, 37, 38, 39,
4078            40, 41, 42, 43, 44, 45, 46, 47,
4079            48, 49, 50, 51, 52, 53, 54, 55,
4080            56, 57, 58, 59, 60, 61, 62, 63,
4081        );
4082        let r = _mm256_adds_epi8(a, b);
4083        #[rustfmt::skip]
4084        let e = _mm256_setr_epi8(
4085            32, 34, 36, 38, 40, 42, 44, 46,
4086            48, 50, 52, 54, 56, 58, 60, 62,
4087            64, 66, 68, 70, 72, 74, 76, 78,
4088            80, 82, 84, 86, 88, 90, 92, 94,
4089        );
4090        assert_eq_m256i(r, e);
4091    }
4092
4093    #[simd_test(enable = "avx2")]
4094    fn test_mm256_adds_epi8_saturate_positive() {
4095        let a = _mm256_set1_epi8(0x7F);
4096        let b = _mm256_set1_epi8(1);
4097        let r = _mm256_adds_epi8(a, b);
4098        assert_eq_m256i(r, a);
4099    }
4100
4101    #[simd_test(enable = "avx2")]
4102    fn test_mm256_adds_epi8_saturate_negative() {
4103        let a = _mm256_set1_epi8(-0x80);
4104        let b = _mm256_set1_epi8(-1);
4105        let r = _mm256_adds_epi8(a, b);
4106        assert_eq_m256i(r, a);
4107    }
4108
4109    #[simd_test(enable = "avx2")]
4110    const fn test_mm256_adds_epi16() {
4111        #[rustfmt::skip]
4112        let a = _mm256_setr_epi16(
4113            0, 1, 2, 3, 4, 5, 6, 7,
4114            8, 9, 10, 11, 12, 13, 14, 15,
4115        );
4116        #[rustfmt::skip]
4117        let b = _mm256_setr_epi16(
4118            32, 33, 34, 35, 36, 37, 38, 39,
4119            40, 41, 42, 43, 44, 45, 46, 47,
4120        );
4121        let r = _mm256_adds_epi16(a, b);
4122        #[rustfmt::skip]
4123        let e = _mm256_setr_epi16(
4124            32, 34, 36, 38, 40, 42, 44, 46,
4125            48, 50, 52, 54, 56, 58, 60, 62,
4126        );
4127
4128        assert_eq_m256i(r, e);
4129    }
4130
4131    #[simd_test(enable = "avx2")]
4132    fn test_mm256_adds_epi16_saturate_positive() {
4133        let a = _mm256_set1_epi16(0x7FFF);
4134        let b = _mm256_set1_epi16(1);
4135        let r = _mm256_adds_epi16(a, b);
4136        assert_eq_m256i(r, a);
4137    }
4138
4139    #[simd_test(enable = "avx2")]
4140    fn test_mm256_adds_epi16_saturate_negative() {
4141        let a = _mm256_set1_epi16(-0x8000);
4142        let b = _mm256_set1_epi16(-1);
4143        let r = _mm256_adds_epi16(a, b);
4144        assert_eq_m256i(r, a);
4145    }
4146
4147    #[simd_test(enable = "avx2")]
4148    const fn test_mm256_adds_epu8() {
4149        #[rustfmt::skip]
4150        let a = _mm256_setr_epi8(
4151            0, 1, 2, 3, 4, 5, 6, 7,
4152            8, 9, 10, 11, 12, 13, 14, 15,
4153            16, 17, 18, 19, 20, 21, 22, 23,
4154            24, 25, 26, 27, 28, 29, 30, 31,
4155        );
4156        #[rustfmt::skip]
4157        let b = _mm256_setr_epi8(
4158            32, 33, 34, 35, 36, 37, 38, 39,
4159            40, 41, 42, 43, 44, 45, 46, 47,
4160            48, 49, 50, 51, 52, 53, 54, 55,
4161            56, 57, 58, 59, 60, 61, 62, 63,
4162        );
4163        let r = _mm256_adds_epu8(a, b);
4164        #[rustfmt::skip]
4165        let e = _mm256_setr_epi8(
4166            32, 34, 36, 38, 40, 42, 44, 46,
4167            48, 50, 52, 54, 56, 58, 60, 62,
4168            64, 66, 68, 70, 72, 74, 76, 78,
4169            80, 82, 84, 86, 88, 90, 92, 94,
4170        );
4171        assert_eq_m256i(r, e);
4172    }
4173
4174    #[simd_test(enable = "avx2")]
4175    fn test_mm256_adds_epu8_saturate() {
4176        let a = _mm256_set1_epi8(!0);
4177        let b = _mm256_set1_epi8(1);
4178        let r = _mm256_adds_epu8(a, b);
4179        assert_eq_m256i(r, a);
4180    }
4181
4182    #[simd_test(enable = "avx2")]
4183    const fn test_mm256_adds_epu16() {
4184        #[rustfmt::skip]
4185        let a = _mm256_setr_epi16(
4186            0, 1, 2, 3, 4, 5, 6, 7,
4187            8, 9, 10, 11, 12, 13, 14, 15,
4188        );
4189        #[rustfmt::skip]
4190        let b = _mm256_setr_epi16(
4191            32, 33, 34, 35, 36, 37, 38, 39,
4192            40, 41, 42, 43, 44, 45, 46, 47,
4193        );
4194        let r = _mm256_adds_epu16(a, b);
4195        #[rustfmt::skip]
4196        let e = _mm256_setr_epi16(
4197            32, 34, 36, 38, 40, 42, 44, 46,
4198            48, 50, 52, 54, 56, 58, 60, 62,
4199        );
4200
4201        assert_eq_m256i(r, e);
4202    }
4203
4204    #[simd_test(enable = "avx2")]
4205    fn test_mm256_adds_epu16_saturate() {
4206        let a = _mm256_set1_epi16(!0);
4207        let b = _mm256_set1_epi16(1);
4208        let r = _mm256_adds_epu16(a, b);
4209        assert_eq_m256i(r, a);
4210    }
4211
4212    #[simd_test(enable = "avx2")]
4213    const fn test_mm256_and_si256() {
4214        let a = _mm256_set1_epi8(5);
4215        let b = _mm256_set1_epi8(3);
4216        let got = _mm256_and_si256(a, b);
4217        assert_eq_m256i(got, _mm256_set1_epi8(1));
4218    }
4219
4220    #[simd_test(enable = "avx2")]
4221    const fn test_mm256_andnot_si256() {
4222        let a = _mm256_set1_epi8(5);
4223        let b = _mm256_set1_epi8(3);
4224        let got = _mm256_andnot_si256(a, b);
4225        assert_eq_m256i(got, _mm256_set1_epi8(2));
4226    }
4227
4228    #[simd_test(enable = "avx2")]
4229    const fn test_mm256_avg_epu8() {
4230        let (a, b) = (_mm256_set1_epi8(3), _mm256_set1_epi8(9));
4231        let r = _mm256_avg_epu8(a, b);
4232        assert_eq_m256i(r, _mm256_set1_epi8(6));
4233    }
4234
4235    #[simd_test(enable = "avx2")]
4236    const fn test_mm256_avg_epu16() {
4237        let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
4238        let r = _mm256_avg_epu16(a, b);
4239        assert_eq_m256i(r, _mm256_set1_epi16(6));
4240    }
4241
4242    #[simd_test(enable = "avx2")]
4243    const fn test_mm_blend_epi32() {
4244        let (a, b) = (_mm_set1_epi32(3), _mm_set1_epi32(9));
4245        let e = _mm_setr_epi32(9, 3, 3, 3);
4246        let r = _mm_blend_epi32::<0x01>(a, b);
4247        assert_eq_m128i(r, e);
4248
4249        let r = _mm_blend_epi32::<0x0E>(b, a);
4250        assert_eq_m128i(r, e);
4251    }
4252
4253    #[simd_test(enable = "avx2")]
4254    const fn test_mm256_blend_epi32() {
4255        let (a, b) = (_mm256_set1_epi32(3), _mm256_set1_epi32(9));
4256        let e = _mm256_setr_epi32(9, 3, 3, 3, 3, 3, 3, 3);
4257        let r = _mm256_blend_epi32::<0x01>(a, b);
4258        assert_eq_m256i(r, e);
4259
4260        let e = _mm256_setr_epi32(3, 9, 3, 3, 3, 3, 3, 9);
4261        let r = _mm256_blend_epi32::<0x82>(a, b);
4262        assert_eq_m256i(r, e);
4263
4264        let e = _mm256_setr_epi32(3, 3, 9, 9, 9, 9, 9, 3);
4265        let r = _mm256_blend_epi32::<0x7C>(a, b);
4266        assert_eq_m256i(r, e);
4267    }
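
    // How the masks above decode: bit i of the immediate selects result lane i
    // from `b` when set and from `a` when clear, so 0x82 = 0b1000_0010 takes
    // lanes 1 and 7 from `b`, and 0x7C = 0b0111_1100 takes lanes 2 through 6.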
4268
4269    #[simd_test(enable = "avx2")]
4270    const fn test_mm256_blend_epi16() {
4271        let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
4272        let e = _mm256_setr_epi16(9, 3, 3, 3, 3, 3, 3, 3, 9, 3, 3, 3, 3, 3, 3, 3);
4273        let r = _mm256_blend_epi16::<0x01>(a, b);
4274        assert_eq_m256i(r, e);
4275
4276        let r = _mm256_blend_epi16::<0xFE>(b, a);
4277        assert_eq_m256i(r, e);
4278    }
4279
4280    #[simd_test(enable = "avx2")]
4281    const fn test_mm256_blendv_epi8() {
4282        let (a, b) = (_mm256_set1_epi8(4), _mm256_set1_epi8(2));
4283        let mask = _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), -1);
4284        let e = _mm256_insert_epi8::<2>(_mm256_set1_epi8(4), 2);
4285        let r = _mm256_blendv_epi8(a, b, mask);
4286        assert_eq_m256i(r, e);
4287    }
4288
4289    #[simd_test(enable = "avx2")]
4290    const fn test_mm_broadcastb_epi8() {
4291        let a = _mm_insert_epi8::<0>(_mm_set1_epi8(0x00), 0x2a);
4292        let res = _mm_broadcastb_epi8(a);
4293        assert_eq_m128i(res, _mm_set1_epi8(0x2a));
4294    }
4295
4296    #[simd_test(enable = "avx2")]
4297    const fn test_mm256_broadcastb_epi8() {
4298        let a = _mm_insert_epi8::<0>(_mm_set1_epi8(0x00), 0x2a);
4299        let res = _mm256_broadcastb_epi8(a);
4300        assert_eq_m256i(res, _mm256_set1_epi8(0x2a));
4301    }
4302
4303    #[simd_test(enable = "avx2")]
4304    const fn test_mm_broadcastd_epi32() {
4305        let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
4306        let res = _mm_broadcastd_epi32(a);
4307        assert_eq_m128i(res, _mm_set1_epi32(0x2a));
4308    }
4309
4310    #[simd_test(enable = "avx2")]
4311    const fn test_mm256_broadcastd_epi32() {
4312        let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
4313        let res = _mm256_broadcastd_epi32(a);
4314        assert_eq_m256i(res, _mm256_set1_epi32(0x2a));
4315    }
4316
4317    #[simd_test(enable = "avx2")]
4318    const fn test_mm_broadcastq_epi64() {
4319        let a = _mm_setr_epi64x(0x1ffffffff, 0);
4320        let res = _mm_broadcastq_epi64(a);
4321        assert_eq_m128i(res, _mm_set1_epi64x(0x1ffffffff));
4322    }
4323
4324    #[simd_test(enable = "avx2")]
4325    const fn test_mm256_broadcastq_epi64() {
4326        let a = _mm_setr_epi64x(0x1ffffffff, 0);
4327        let res = _mm256_broadcastq_epi64(a);
4328        assert_eq_m256i(res, _mm256_set1_epi64x(0x1ffffffff));
4329    }
4330
4331    #[simd_test(enable = "avx2")]
4332    const fn test_mm_broadcastsd_pd() {
4333        let a = _mm_setr_pd(6.88, 3.44);
4334        let res = _mm_broadcastsd_pd(a);
4335        assert_eq_m128d(res, _mm_set1_pd(6.88));
4336    }
4337
4338    #[simd_test(enable = "avx2")]
4339    const fn test_mm256_broadcastsd_pd() {
4340        let a = _mm_setr_pd(6.88, 3.44);
4341        let res = _mm256_broadcastsd_pd(a);
4342        assert_eq_m256d(res, _mm256_set1_pd(6.88));
4343    }
4344
4345    #[simd_test(enable = "avx2")]
4346    const fn test_mm_broadcastsi128_si256() {
4347        let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
4348        let res = _mm_broadcastsi128_si256(a);
4349        let retval = _mm256_setr_epi64x(
4350            0x0987654321012334,
4351            0x5678909876543210,
4352            0x0987654321012334,
4353            0x5678909876543210,
4354        );
4355        assert_eq_m256i(res, retval);
4356    }
4357
4358    #[simd_test(enable = "avx2")]
4359    const fn test_mm256_broadcastsi128_si256() {
4360        let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
4361        let res = _mm256_broadcastsi128_si256(a);
4362        let retval = _mm256_setr_epi64x(
4363            0x0987654321012334,
4364            0x5678909876543210,
4365            0x0987654321012334,
4366            0x5678909876543210,
4367        );
4368        assert_eq_m256i(res, retval);
4369    }
4370
4371    #[simd_test(enable = "avx2")]
4372    const fn test_mm_broadcastss_ps() {
4373        let a = _mm_setr_ps(6.88, 3.44, 0.0, 0.0);
4374        let res = _mm_broadcastss_ps(a);
4375        assert_eq_m128(res, _mm_set1_ps(6.88));
4376    }
4377
4378    #[simd_test(enable = "avx2")]
4379    const fn test_mm256_broadcastss_ps() {
4380        let a = _mm_setr_ps(6.88, 3.44, 0.0, 0.0);
4381        let res = _mm256_broadcastss_ps(a);
4382        assert_eq_m256(res, _mm256_set1_ps(6.88));
4383    }
4384
4385    #[simd_test(enable = "avx2")]
4386    const fn test_mm_broadcastw_epi16() {
4387        let a = _mm_insert_epi16::<0>(_mm_set1_epi16(0x2a), 0x22b);
4388        let res = _mm_broadcastw_epi16(a);
4389        assert_eq_m128i(res, _mm_set1_epi16(0x22b));
4390    }
4391
4392    #[simd_test(enable = "avx2")]
4393    const fn test_mm256_broadcastw_epi16() {
4394        let a = _mm_insert_epi16::<0>(_mm_set1_epi16(0x2a), 0x22b);
4395        let res = _mm256_broadcastw_epi16(a);
4396        assert_eq_m256i(res, _mm256_set1_epi16(0x22b));
4397    }
4398
4399    #[simd_test(enable = "avx2")]
4400    const fn test_mm256_cmpeq_epi8() {
4401        #[rustfmt::skip]
4402        let a = _mm256_setr_epi8(
4403            0, 1, 2, 3, 4, 5, 6, 7,
4404            8, 9, 10, 11, 12, 13, 14, 15,
4405            16, 17, 18, 19, 20, 21, 22, 23,
4406            24, 25, 26, 27, 28, 29, 30, 31,
4407        );
4408        #[rustfmt::skip]
4409        let b = _mm256_setr_epi8(
4410            31, 30, 2, 28, 27, 26, 25, 24,
4411            23, 22, 21, 20, 19, 18, 17, 16,
4412            15, 14, 13, 12, 11, 10, 9, 8,
4413            7, 6, 5, 4, 3, 2, 1, 0,
4414        );
4415        let r = _mm256_cmpeq_epi8(a, b);
4416        assert_eq_m256i(r, _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), !0));
4417    }
4418
4419    #[simd_test(enable = "avx2")]
4420    const fn test_mm256_cmpeq_epi16() {
4421        #[rustfmt::skip]
4422        let a = _mm256_setr_epi16(
4423            0, 1, 2, 3, 4, 5, 6, 7,
4424            8, 9, 10, 11, 12, 13, 14, 15,
4425        );
4426        #[rustfmt::skip]
4427        let b = _mm256_setr_epi16(
4428            15, 14, 2, 12, 11, 10, 9, 8,
4429            7, 6, 5, 4, 3, 2, 1, 0,
4430        );
4431        let r = _mm256_cmpeq_epi16(a, b);
4432        assert_eq_m256i(r, _mm256_insert_epi16::<2>(_mm256_set1_epi16(0), !0));
4433    }
4434
4435    #[simd_test(enable = "avx2")]
4436    const fn test_mm256_cmpeq_epi32() {
4437        let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4438        let b = _mm256_setr_epi32(7, 6, 2, 4, 3, 2, 1, 0);
4439        let r = _mm256_cmpeq_epi32(a, b);
4440        let e = _mm256_set1_epi32(0);
4441        let e = _mm256_insert_epi32::<2>(e, !0);
4442        assert_eq_m256i(r, e);
4443    }
4444
4445    #[simd_test(enable = "avx2")]
4446    const fn test_mm256_cmpeq_epi64() {
4447        let a = _mm256_setr_epi64x(0, 1, 2, 3);
4448        let b = _mm256_setr_epi64x(3, 2, 2, 0);
4449        let r = _mm256_cmpeq_epi64(a, b);
4450        assert_eq_m256i(r, _mm256_insert_epi64::<2>(_mm256_set1_epi64x(0), !0));
4451    }
4452
4453    #[simd_test(enable = "avx2")]
4454    const fn test_mm256_cmpgt_epi8() {
4455        let a = _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), 5);
4456        let b = _mm256_set1_epi8(0);
4457        let r = _mm256_cmpgt_epi8(a, b);
4458        assert_eq_m256i(r, _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), !0));
4459    }
4460
4461    #[simd_test(enable = "avx2")]
4462    const fn test_mm256_cmpgt_epi16() {
4463        let a = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 5);
4464        let b = _mm256_set1_epi16(0);
4465        let r = _mm256_cmpgt_epi16(a, b);
4466        assert_eq_m256i(r, _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), !0));
4467    }
4468
4469    #[simd_test(enable = "avx2")]
4470    const fn test_mm256_cmpgt_epi32() {
4471        let a = _mm256_insert_epi32::<0>(_mm256_set1_epi32(0), 5);
4472        let b = _mm256_set1_epi32(0);
4473        let r = _mm256_cmpgt_epi32(a, b);
4474        assert_eq_m256i(r, _mm256_insert_epi32::<0>(_mm256_set1_epi32(0), !0));
4475    }
4476
4477    #[simd_test(enable = "avx2")]
4478    const fn test_mm256_cmpgt_epi64() {
4479        let a = _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), 5);
4480        let b = _mm256_set1_epi64x(0);
4481        let r = _mm256_cmpgt_epi64(a, b);
4482        assert_eq_m256i(r, _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), !0));
4483    }
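
    // The comparison intrinsics return -1 (all bits set) in lanes where the
    // predicate holds and 0 elsewhere, which is why the expected vectors above
    // are built by inserting `!0` into a zeroed vector.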
4484
4485    #[simd_test(enable = "avx2")]
4486    const fn test_mm256_cvtepi8_epi16() {
4487        #[rustfmt::skip]
4488        let a = _mm_setr_epi8(
4489            0, 0, -1, 1, -2, 2, -3, 3,
4490            -4, 4, -5, 5, -6, 6, -7, 7,
4491        );
4492        #[rustfmt::skip]
4493        let r = _mm256_setr_epi16(
4494            0, 0, -1, 1, -2, 2, -3, 3,
4495            -4, 4, -5, 5, -6, 6, -7, 7,
4496        );
4497        assert_eq_m256i(r, _mm256_cvtepi8_epi16(a));
4498    }
4499
4500    #[simd_test(enable = "avx2")]
4501    const fn test_mm256_cvtepi8_epi32() {
4502        #[rustfmt::skip]
4503        let a = _mm_setr_epi8(
4504            0, 0, -1, 1, -2, 2, -3, 3,
4505            -4, 4, -5, 5, -6, 6, -7, 7,
4506        );
4507        let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
4508        assert_eq_m256i(r, _mm256_cvtepi8_epi32(a));
4509    }
4510
4511    #[simd_test(enable = "avx2")]
4512    const fn test_mm256_cvtepi8_epi64() {
4513        #[rustfmt::skip]
4514        let a = _mm_setr_epi8(
4515            0, 0, -1, 1, -2, 2, -3, 3,
4516            -4, 4, -5, 5, -6, 6, -7, 7,
4517        );
4518        let r = _mm256_setr_epi64x(0, 0, -1, 1);
4519        assert_eq_m256i(r, _mm256_cvtepi8_epi64(a));
4520    }
4521
4522    #[simd_test(enable = "avx2")]
4523    const fn test_mm256_cvtepi16_epi32() {
4524        let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
4525        let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
4526        assert_eq_m256i(r, _mm256_cvtepi16_epi32(a));
4527    }
4528
4529    #[simd_test(enable = "avx2")]
4530    const fn test_mm256_cvtepi16_epi64() {
4531        let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
4532        let r = _mm256_setr_epi64x(0, 0, -1, 1);
4533        assert_eq_m256i(r, _mm256_cvtepi16_epi64(a));
4534    }
4535
4536    #[simd_test(enable = "avx2")]
4537    const fn test_mm256_cvtepi32_epi64() {
4538        let a = _mm_setr_epi32(0, 0, -1, 1);
4539        let r = _mm256_setr_epi64x(0, 0, -1, 1);
4540        assert_eq_m256i(r, _mm256_cvtepi32_epi64(a));
4541    }
4542
4543    #[simd_test(enable = "avx2")]
4544    const fn test_mm256_cvtepu16_epi32() {
4545        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4546        let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4547        assert_eq_m256i(r, _mm256_cvtepu16_epi32(a));
4548    }
4549
4550    #[simd_test(enable = "avx2")]
4551    const fn test_mm256_cvtepu16_epi64() {
4552        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4553        let r = _mm256_setr_epi64x(0, 1, 2, 3);
4554        assert_eq_m256i(r, _mm256_cvtepu16_epi64(a));
4555    }
4556
4557    #[simd_test(enable = "avx2")]
4558    const fn test_mm256_cvtepu32_epi64() {
4559        let a = _mm_setr_epi32(0, 1, 2, 3);
4560        let r = _mm256_setr_epi64x(0, 1, 2, 3);
4561        assert_eq_m256i(r, _mm256_cvtepu32_epi64(a));
4562    }
4563
4564    #[simd_test(enable = "avx2")]
4565    const fn test_mm256_cvtepu8_epi16() {
4566        #[rustfmt::skip]
4567        let a = _mm_setr_epi8(
4568            0, 1, 2, 3, 4, 5, 6, 7,
4569            8, 9, 10, 11, 12, 13, 14, 15,
4570        );
4571        #[rustfmt::skip]
4572        let r = _mm256_setr_epi16(
4573            0, 1, 2, 3, 4, 5, 6, 7,
4574            8, 9, 10, 11, 12, 13, 14, 15,
4575        );
4576        assert_eq_m256i(r, _mm256_cvtepu8_epi16(a));
4577    }
4578
4579    #[simd_test(enable = "avx2")]
4580    const fn test_mm256_cvtepu8_epi32() {
4581        #[rustfmt::skip]
4582        let a = _mm_setr_epi8(
4583            0, 1, 2, 3, 4, 5, 6, 7,
4584            8, 9, 10, 11, 12, 13, 14, 15,
4585        );
4586        let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4587        assert_eq_m256i(r, _mm256_cvtepu8_epi32(a));
4588    }
4589
4590    #[simd_test(enable = "avx2")]
4591    const fn test_mm256_cvtepu8_epi64() {
4592        #[rustfmt::skip]
4593        let a = _mm_setr_epi8(
4594            0, 1, 2, 3, 4, 5, 6, 7,
4595            8, 9, 10, 11, 12, 13, 14, 15,
4596        );
4597        let r = _mm256_setr_epi64x(0, 1, 2, 3);
4598        assert_eq_m256i(r, _mm256_cvtepu8_epi64(a));
4599    }
4600
4601    #[simd_test(enable = "avx2")]
4602    const fn test_mm256_extracti128_si256() {
4603        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4604        let r = _mm256_extracti128_si256::<1>(a);
4605        let e = _mm_setr_epi64x(3, 4);
4606        assert_eq_m128i(r, e);
4607    }
4608
4609    #[simd_test(enable = "avx2")]
4610    const fn test_mm256_hadd_epi16() {
4611        let a = _mm256_set1_epi16(2);
4612        let b = _mm256_set1_epi16(4);
4613        let r = _mm256_hadd_epi16(a, b);
4614        let e = _mm256_setr_epi16(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
4615        assert_eq_m256i(r, e);
4616    }
4617
4618    #[simd_test(enable = "avx2")]
4619    const fn test_mm256_hadd_epi32() {
4620        let a = _mm256_set1_epi32(2);
4621        let b = _mm256_set1_epi32(4);
4622        let r = _mm256_hadd_epi32(a, b);
4623        let e = _mm256_setr_epi32(4, 4, 8, 8, 4, 4, 8, 8);
4624        assert_eq_m256i(r, e);
4625    }
4626
4627    #[simd_test(enable = "avx2")]
4628    fn test_mm256_hadds_epi16() {
4629        let a = _mm256_set1_epi16(2);
4630        let a = _mm256_insert_epi16::<0>(a, 0x7fff);
4631        let a = _mm256_insert_epi16::<1>(a, 1);
4632        let b = _mm256_set1_epi16(4);
4633        let r = _mm256_hadds_epi16(a, b);
4634        #[rustfmt::skip]
4635        let e = _mm256_setr_epi16(
4636            0x7FFF, 4, 4, 4, 8, 8, 8, 8,
4637            4, 4, 4, 4, 8, 8, 8, 8,
4638        );
4639        assert_eq_m256i(r, e);
4640    }
4641
4642    #[simd_test(enable = "avx2")]
4643    const fn test_mm256_hsub_epi16() {
4644        let a = _mm256_set1_epi16(2);
4645        let b = _mm256_set1_epi16(4);
4646        let r = _mm256_hsub_epi16(a, b);
4647        let e = _mm256_set1_epi16(0);
4648        assert_eq_m256i(r, e);
4649    }
4650
4651    #[simd_test(enable = "avx2")]
4652    const fn test_mm256_hsub_epi32() {
4653        let a = _mm256_set1_epi32(2);
4654        let b = _mm256_set1_epi32(4);
4655        let r = _mm256_hsub_epi32(a, b);
4656        let e = _mm256_set1_epi32(0);
4657        assert_eq_m256i(r, e);
4658    }
4659
4660    #[simd_test(enable = "avx2")]
4661    fn test_mm256_hsubs_epi16() {
4662        let a = _mm256_set1_epi16(2);
4663        let a = _mm256_insert_epi16::<0>(a, 0x7fff);
4664        let a = _mm256_insert_epi16::<1>(a, -1);
4665        let b = _mm256_set1_epi16(4);
4666        let r = _mm256_hsubs_epi16(a, b);
4667        let e = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 0x7FFF);
4668        assert_eq_m256i(r, e);
4669    }
4670
4671    #[simd_test(enable = "avx2")]
4672    fn test_mm256_madd_epi16() {
4673        let a = _mm256_set1_epi16(2);
4674        let b = _mm256_set1_epi16(4);
4675        let r = _mm256_madd_epi16(a, b);
4676        let e = _mm256_set1_epi32(16);
4677        assert_eq_m256i(r, e);
4678    }
4679
4680    #[target_feature(enable = "avx2")]
4681    #[cfg_attr(test, assert_instr(vpmaddwd))]
4682    unsafe fn test_mm256_madd_epi16_mul_one(v: __m256i) -> __m256i {
4683        // This is a trick used in the adler32 algorithm to get a widening addition. The
4684        // multiplication by 1 is trivial, but must not be optimized out because then the vpmaddwd
4685        // instruction is no longer selected. The assert_instr verifies that this is the case.
4686        let one_v = _mm256_set1_epi16(1);
4687        _mm256_madd_epi16(v, one_v)
4688    }
4689
4690    #[target_feature(enable = "avx2")]
4691    #[cfg_attr(test, assert_instr(vpmaddwd))]
4692    unsafe fn test_mm256_madd_epi16_shl(v: __m256i) -> __m256i {
4693        // This is a trick used in the base64 algorithm to get a widening addition. Instead of a
4694        // multiplication, a vector shl is used (LLVM turns the multiply by `1 << 12` into a shift).
4695        // In LLVM 22 that breaks the pattern recognition for the automatic optimization to vpmaddwd.
4696        let shift_value = _mm256_set1_epi32(1 << 12);
4697        _mm256_madd_epi16(v, shift_value)
4698    }
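
    // Scalar model of the vpmaddwd lane arithmetic exercised by the two
    // helpers above (an illustrative sketch; the name is ours, not stdarch
    // API). Each i16 product is widened to i32 before the add, so the sum can
    // only wrap in the single case where all four inputs are i16::MIN:
    #[allow(dead_code)]
    fn madd_epi16_scalar_model(a: [i16; 2], b: [i16; 2]) -> i32 {
        (a[0] as i32 * b[0] as i32).wrapping_add(a[1] as i32 * b[1] as i32)
    }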
4699
4700    #[simd_test(enable = "avx2")]
4701    const fn test_mm256_inserti128_si256() {
4702        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4703        let b = _mm_setr_epi64x(7, 8);
4704        let r = _mm256_inserti128_si256::<1>(a, b);
4705        let e = _mm256_setr_epi64x(1, 2, 7, 8);
4706        assert_eq_m256i(r, e);
4707    }
4708
4709    #[simd_test(enable = "avx2")]
4710    fn test_mm256_maddubs_epi16() {
4711        let a = _mm256_set1_epi8(2);
4712        let b = _mm256_set1_epi8(4);
4713        let r = _mm256_maddubs_epi16(a, b);
4714        let e = _mm256_set1_epi16(16);
4715        assert_eq_m256i(r, e);
4716    }
4717
4718    #[simd_test(enable = "avx2")]
4719    const fn test_mm_maskload_epi32() {
4720        let nums = [1, 2, 3, 4];
4721        let a = &nums as *const i32;
4722        let mask = _mm_setr_epi32(-1, 0, 0, -1);
4723        let r = unsafe { _mm_maskload_epi32(a, mask) };
4724        let e = _mm_setr_epi32(1, 0, 0, 4);
4725        assert_eq_m128i(r, e);
4726    }
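
    // Only the sign bit of each mask element is consulted by the maskload and
    // maskstore family, so any negative lane value (not just -1) enables that
    // lane.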
4727
4728    #[simd_test(enable = "avx2")]
4729    const fn test_mm256_maskload_epi32() {
4730        let nums = [1, 2, 3, 4, 5, 6, 7, 8];
4731        let a = &nums as *const i32;
4732        let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
4733        let r = unsafe { _mm256_maskload_epi32(a, mask) };
4734        let e = _mm256_setr_epi32(1, 0, 0, 4, 0, 6, 7, 0);
4735        assert_eq_m256i(r, e);
4736    }
4737
4738    #[simd_test(enable = "avx2")]
4739    const fn test_mm_maskload_epi64() {
4740        let nums = [1_i64, 2_i64];
4741        let a = &nums as *const i64;
4742        let mask = _mm_setr_epi64x(0, -1);
4743        let r = unsafe { _mm_maskload_epi64(a, mask) };
4744        let e = _mm_setr_epi64x(0, 2);
4745        assert_eq_m128i(r, e);
4746    }
4747
4748    #[simd_test(enable = "avx2")]
4749    const fn test_mm256_maskload_epi64() {
4750        let nums = [1_i64, 2_i64, 3_i64, 4_i64];
4751        let a = &nums as *const i64;
4752        let mask = _mm256_setr_epi64x(0, -1, -1, 0);
4753        let r = unsafe { _mm256_maskload_epi64(a, mask) };
4754        let e = _mm256_setr_epi64x(0, 2, 3, 0);
4755        assert_eq_m256i(r, e);
4756    }
4757
4758    #[simd_test(enable = "avx2")]
4759    const fn test_mm_maskstore_epi32() {
4760        let a = _mm_setr_epi32(1, 2, 3, 4);
4761        let mut arr = [-1, -1, -1, -1];
4762        let mask = _mm_setr_epi32(-1, 0, 0, -1);
4763        unsafe {
4764            _mm_maskstore_epi32(arr.as_mut_ptr(), mask, a);
4765        }
4766        let e = [1, -1, -1, 4];
4767        assert_eq!(arr, e);
4768    }
4769
4770    #[simd_test(enable = "avx2")]
4771    const fn test_mm256_maskstore_epi32() {
4772        let a = _mm256_setr_epi32(1, 0x6d726f, 3, 42, 0x777161, 6, 7, 8);
4773        let mut arr = [-1, -1, -1, 0x776173, -1, 0x68657265, -1, -1];
4774        let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
4775        unsafe {
4776            _mm256_maskstore_epi32(arr.as_mut_ptr(), mask, a);
4777        }
4778        let e = [1, -1, -1, 42, -1, 6, 7, -1];
4779        assert_eq!(arr, e);
4780    }
4781
4782    #[simd_test(enable = "avx2")]
4783    const fn test_mm_maskstore_epi64() {
4784        let a = _mm_setr_epi64x(1_i64, 2_i64);
4785        let mut arr = [-1_i64, -1_i64];
4786        let mask = _mm_setr_epi64x(0, -1);
4787        unsafe {
4788            _mm_maskstore_epi64(arr.as_mut_ptr(), mask, a);
4789        }
4790        let e = [-1, 2];
4791        assert_eq!(arr, e);
4792    }
4793
4794    #[simd_test(enable = "avx2")]
4795    const fn test_mm256_maskstore_epi64() {
4796        let a = _mm256_setr_epi64x(1_i64, 2_i64, 3_i64, 4_i64);
4797        let mut arr = [-1_i64, -1_i64, -1_i64, -1_i64];
4798        let mask = _mm256_setr_epi64x(0, -1, -1, 0);
4799        unsafe {
4800            _mm256_maskstore_epi64(arr.as_mut_ptr(), mask, a);
4801        }
4802        let e = [-1, 2, 3, -1];
4803        assert_eq!(arr, e);
4804    }
4805
4806    #[simd_test(enable = "avx2")]
4807    const fn test_mm256_max_epi16() {
4808        let a = _mm256_set1_epi16(2);
4809        let b = _mm256_set1_epi16(4);
4810        let r = _mm256_max_epi16(a, b);
4811        assert_eq_m256i(r, b);
4812    }
4813
4814    #[simd_test(enable = "avx2")]
4815    const fn test_mm256_max_epi32() {
4816        let a = _mm256_set1_epi32(2);
4817        let b = _mm256_set1_epi32(4);
4818        let r = _mm256_max_epi32(a, b);
4819        assert_eq_m256i(r, b);
4820    }
4821
4822    #[simd_test(enable = "avx2")]
4823    const fn test_mm256_max_epi8() {
4824        let a = _mm256_set1_epi8(2);
4825        let b = _mm256_set1_epi8(4);
4826        let r = _mm256_max_epi8(a, b);
4827        assert_eq_m256i(r, b);
4828    }
4829
4830    #[simd_test(enable = "avx2")]
4831    const fn test_mm256_max_epu16() {
4832        let a = _mm256_set1_epi16(2);
4833        let b = _mm256_set1_epi16(4);
4834        let r = _mm256_max_epu16(a, b);
4835        assert_eq_m256i(r, b);
4836    }
4837
4838    #[simd_test(enable = "avx2")]
4839    const fn test_mm256_max_epu32() {
4840        let a = _mm256_set1_epi32(2);
4841        let b = _mm256_set1_epi32(4);
4842        let r = _mm256_max_epu32(a, b);
4843        assert_eq_m256i(r, b);
4844    }
4845
4846    #[simd_test(enable = "avx2")]
4847    const fn test_mm256_max_epu8() {
4848        let a = _mm256_set1_epi8(2);
4849        let b = _mm256_set1_epi8(4);
4850        let r = _mm256_max_epu8(a, b);
4851        assert_eq_m256i(r, b);
4852    }
4853
4854    #[simd_test(enable = "avx2")]
4855    const fn test_mm256_min_epi16() {
4856        let a = _mm256_set1_epi16(2);
4857        let b = _mm256_set1_epi16(4);
4858        let r = _mm256_min_epi16(a, b);
4859        assert_eq_m256i(r, a);
4860    }
4861
4862    #[simd_test(enable = "avx2")]
4863    const fn test_mm256_min_epi32() {
4864        let a = _mm256_set1_epi32(2);
4865        let b = _mm256_set1_epi32(4);
4866        let r = _mm256_min_epi32(a, b);
4867        assert_eq_m256i(r, a);
4868    }
4869
4870    #[simd_test(enable = "avx2")]
4871    const fn test_mm256_min_epi8() {
4872        let a = _mm256_set1_epi8(2);
4873        let b = _mm256_set1_epi8(4);
4874        let r = _mm256_min_epi8(a, b);
4875        assert_eq_m256i(r, a);
4876    }
4877
4878    #[simd_test(enable = "avx2")]
4879    const fn test_mm256_min_epu16() {
4880        let a = _mm256_set1_epi16(2);
4881        let b = _mm256_set1_epi16(4);
4882        let r = _mm256_min_epu16(a, b);
4883        assert_eq_m256i(r, a);
4884    }
4885
4886    #[simd_test(enable = "avx2")]
4887    const fn test_mm256_min_epu32() {
4888        let a = _mm256_set1_epi32(2);
4889        let b = _mm256_set1_epi32(4);
4890        let r = _mm256_min_epu32(a, b);
4891        assert_eq_m256i(r, a);
4892    }
4893
4894    #[simd_test(enable = "avx2")]
4895    const fn test_mm256_min_epu8() {
4896        let a = _mm256_set1_epi8(2);
4897        let b = _mm256_set1_epi8(4);
4898        let r = _mm256_min_epu8(a, b);
4899        assert_eq_m256i(r, a);
4900    }
4901
4902    #[simd_test(enable = "avx2")]
4903    const fn test_mm256_movemask_epi8() {
4904        let a = _mm256_set1_epi8(-1);
4905        let r = _mm256_movemask_epi8(a);
4906        let e = -1;
4907        assert_eq!(r, e);
4908    }
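
    // `_mm256_movemask_epi8` packs the 32 per-byte sign bits into the i32
    // result, so an all-negative input yields 0xFFFF_FFFF, i.e. -1.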
4909
4910    #[simd_test(enable = "avx2")]
4911    fn test_mm256_mpsadbw_epu8() {
4912        let a = _mm256_set1_epi8(2);
4913        let b = _mm256_set1_epi8(4);
4914        let r = _mm256_mpsadbw_epu8::<0>(a, b);
4915        let e = _mm256_set1_epi16(8);
4916        assert_eq_m256i(r, e);
4917    }
4918
4919    #[simd_test(enable = "avx2")]
4920    const fn test_mm256_mul_epi32() {
4921        let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
4922        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4923        let r = _mm256_mul_epi32(a, b);
4924        let e = _mm256_setr_epi64x(0, 0, 10, 14);
4925        assert_eq_m256i(r, e);
4926    }
4927
4928    #[simd_test(enable = "avx2")]
4929    const fn test_mm256_mul_epu32() {
4930        let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
4931        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4932        let r = _mm256_mul_epu32(a, b);
4933        let e = _mm256_setr_epi64x(0, 0, 10, 14);
4934        assert_eq_m256i(r, e);
4935    }
4936
4937    #[simd_test(enable = "avx2")]
4938    const fn test_mm256_mulhi_epi16() {
4939        let a = _mm256_set1_epi16(6535);
4940        let b = _mm256_set1_epi16(6535);
4941        let r = _mm256_mulhi_epi16(a, b);
4942        let e = _mm256_set1_epi16(651);
4943        assert_eq_m256i(r, e);
4944    }
4945
4946    #[simd_test(enable = "avx2")]
4947    const fn test_mm256_mulhi_epu16() {
4948        let a = _mm256_set1_epi16(6535);
4949        let b = _mm256_set1_epi16(6535);
4950        let r = _mm256_mulhi_epu16(a, b);
4951        let e = _mm256_set1_epi16(651);
4952        assert_eq_m256i(r, e);
4953    }
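
    // Worked example for the two tests above: 6535 * 6535 = 42_706_225 =
    // 0x028B_A531, so the high 16 bits are 0x028B = 651 for both the signed
    // and unsigned variants (6535 is in range for both).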
4954
4955    #[simd_test(enable = "avx2")]
4956    const fn test_mm256_mullo_epi16() {
4957        let a = _mm256_set1_epi16(2);
4958        let b = _mm256_set1_epi16(4);
4959        let r = _mm256_mullo_epi16(a, b);
4960        let e = _mm256_set1_epi16(8);
4961        assert_eq_m256i(r, e);
4962    }
4963
4964    #[simd_test(enable = "avx2")]
4965    const fn test_mm256_mullo_epi32() {
4966        let a = _mm256_set1_epi32(2);
4967        let b = _mm256_set1_epi32(4);
4968        let r = _mm256_mullo_epi32(a, b);
4969        let e = _mm256_set1_epi32(8);
4970        assert_eq_m256i(r, e);
4971    }
4972
4973    #[simd_test(enable = "avx2")]
4974    fn test_mm256_mulhrs_epi16() {
4975        let a = _mm256_set1_epi16(0x4000); // 0.5 in Q15 fixed point
4976        let b = _mm256_set1_epi16(0x2000); // 0.25 in Q15
4977        let r = _mm256_mulhrs_epi16(a, b);
4978        let e = _mm256_set1_epi16(0x1000); // 0.125 in Q15
4979        assert_eq_m256i(r, e);
4980    }
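
    // Per Intel's documentation, `_mm256_mulhrs_epi16` computes
    // `(a * b + (1 << 14)) >> 15` per lane, i.e. a Q15 fixed-point multiply
    // with rounding, hence 0.5 * 0.25 = 0.125 above.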
4981
4982    #[simd_test(enable = "avx2")]
4983    const fn test_mm256_or_si256() {
4984        let a = _mm256_set1_epi8(-1);
4985        let b = _mm256_set1_epi8(0);
4986        let r = _mm256_or_si256(a, b);
4987        assert_eq_m256i(r, a);
4988    }
4989
4990    #[simd_test(enable = "avx2")]
4991    fn test_mm256_packs_epi16() {
4992        let a = _mm256_set1_epi16(2);
4993        let b = _mm256_set1_epi16(4);
4994        let r = _mm256_packs_epi16(a, b);
4995        #[rustfmt::skip]
4996        let e = _mm256_setr_epi8(
4997            2, 2, 2, 2, 2, 2, 2, 2,
4998            4, 4, 4, 4, 4, 4, 4, 4,
4999            2, 2, 2, 2, 2, 2, 2, 2,
5000            4, 4, 4, 4, 4, 4, 4, 4,
5001        );
5002
5003        assert_eq_m256i(r, e);
5004    }
5005
5006    #[simd_test(enable = "avx2")]
5007    fn test_mm256_packs_epi32() {
5008        let a = _mm256_set1_epi32(2);
5009        let b = _mm256_set1_epi32(4);
5010        let r = _mm256_packs_epi32(a, b);
5011        let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
5012
5013        assert_eq_m256i(r, e);
5014    }
5015
5016    #[simd_test(enable = "avx2")]
5017    fn test_mm256_packus_epi16() {
5018        let a = _mm256_set1_epi16(2);
5019        let b = _mm256_set1_epi16(4);
5020        let r = _mm256_packus_epi16(a, b);
5021        #[rustfmt::skip]
5022        let e = _mm256_setr_epi8(
5023            2, 2, 2, 2, 2, 2, 2, 2,
5024            4, 4, 4, 4, 4, 4, 4, 4,
5025            2, 2, 2, 2, 2, 2, 2, 2,
5026            4, 4, 4, 4, 4, 4, 4, 4,
5027        );
5028
5029        assert_eq_m256i(r, e);
5030    }
5031
5032    #[simd_test(enable = "avx2")]
5033    fn test_mm256_packus_epi32() {
5034        let a = _mm256_set1_epi32(2);
5035        let b = _mm256_set1_epi32(4);
5036        let r = _mm256_packus_epi32(a, b);
5037        let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
5038
5039        assert_eq_m256i(r, e);
5040    }
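
    // The pack intrinsics narrow and interleave per 128-bit lane (a's lane,
    // then b's lane), which is why each half of the expected vectors above
    // alternates a run of 2s with a run of 4s rather than putting all 2s
    // before all 4s.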
5041
5042    #[simd_test(enable = "avx2")]
5043    fn test_mm256_sad_epu8() {
5044        let a = _mm256_set1_epi8(2);
5045        let b = _mm256_set1_epi8(4);
5046        let r = _mm256_sad_epu8(a, b);
5047        let e = _mm256_set1_epi64x(16);
5048        assert_eq_m256i(r, e);
5049    }
5050
5051    #[simd_test(enable = "avx2")]
5052    const fn test_mm256_shufflehi_epi16() {
5053        #[rustfmt::skip]
5054        let a = _mm256_setr_epi16(
5055            0, 1, 2, 3, 11, 22, 33, 44,
5056            4, 5, 6, 7, 55, 66, 77, 88,
5057        );
5058        #[rustfmt::skip]
5059        let e = _mm256_setr_epi16(
5060            0, 1, 2, 3, 44, 22, 22, 11,
5061            4, 5, 6, 7, 88, 66, 66, 55,
5062        );
5063        let r = _mm256_shufflehi_epi16::<0b00_01_01_11>(a);
5064        assert_eq_m256i(r, e);
5065    }
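
    // Decoding the immediate above: 0b00_01_01_11 is four two-bit source
    // indices, lowest field first, so the high four words of each 128-bit lane
    // become words (3, 1, 1, 0) of that group: 44, 22, 22, 11. The shufflelo
    // test below applies the same immediate to the low four words.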
5066
5067    #[simd_test(enable = "avx2")]
5068    const fn test_mm256_shufflelo_epi16() {
5069        #[rustfmt::skip]
5070        let a = _mm256_setr_epi16(
5071            11, 22, 33, 44, 0, 1, 2, 3,
5072            55, 66, 77, 88, 4, 5, 6, 7,
5073        );
5074        #[rustfmt::skip]
5075        let e = _mm256_setr_epi16(
5076            44, 22, 22, 11, 0, 1, 2, 3,
5077            88, 66, 66, 55, 4, 5, 6, 7,
5078        );
5079        let r = _mm256_shufflelo_epi16::<0b00_01_01_11>(a);
5080        assert_eq_m256i(r, e);
5081    }
5082
5083    #[simd_test(enable = "avx2")]
5084    fn test_mm256_sign_epi16() {
5085        let a = _mm256_set1_epi16(2);
5086        let b = _mm256_set1_epi16(-1);
5087        let r = _mm256_sign_epi16(a, b);
5088        let e = _mm256_set1_epi16(-2);
5089        assert_eq_m256i(r, e);
5090    }
5091
5092    #[simd_test(enable = "avx2")]
5093    fn test_mm256_sign_epi32() {
5094        let a = _mm256_set1_epi32(2);
5095        let b = _mm256_set1_epi32(-1);
5096        let r = _mm256_sign_epi32(a, b);
5097        let e = _mm256_set1_epi32(-2);
5098        assert_eq_m256i(r, e);
5099    }
5100
5101    #[simd_test(enable = "avx2")]
5102    fn test_mm256_sign_epi8() {
5103        let a = _mm256_set1_epi8(2);
5104        let b = _mm256_set1_epi8(-1);
5105        let r = _mm256_sign_epi8(a, b);
5106        let e = _mm256_set1_epi8(-2);
5107        assert_eq_m256i(r, e);
5108    }
5109
5110    #[simd_test(enable = "avx2")]
5111    fn test_mm256_sll_epi16() {
5112        let a = _mm256_set1_epi16(0xFF);
5113        let b = _mm_insert_epi16::<0>(_mm_set1_epi16(0), 4);
5114        let r = _mm256_sll_epi16(a, b);
5115        assert_eq_m256i(r, _mm256_set1_epi16(0xFF0));
5116    }
5117
5118    #[simd_test(enable = "avx2")]
5119    fn test_mm256_sll_epi32() {
5120        let a = _mm256_set1_epi32(0xFFFF);
5121        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 4);
5122        let r = _mm256_sll_epi32(a, b);
5123        assert_eq_m256i(r, _mm256_set1_epi32(0xFFFF0));
5124    }
5125
5126    #[simd_test(enable = "avx2")]
5127    fn test_mm256_sll_epi64() {
5128        let a = _mm256_set1_epi64x(0xFFFFFFFF);
5129        let b = _mm_insert_epi64::<0>(_mm_set1_epi64x(0), 4);
5130        let r = _mm256_sll_epi64(a, b);
5131        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF0));
5132    }
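
    // The sll/srl/sra shift-by-vector forms read a single shift count from the
    // low 64 bits of the second operand, which is why these tests zero every
    // element of the count vector except the first.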
5133
5134    #[simd_test(enable = "avx2")]
5135    const fn test_mm256_slli_epi16() {
5136        assert_eq_m256i(
5137            _mm256_slli_epi16::<4>(_mm256_set1_epi16(0xFF)),
5138            _mm256_set1_epi16(0xFF0),
5139        );
5140    }
5141
5142    #[simd_test(enable = "avx2")]
5143    const fn test_mm256_slli_epi32() {
5144        assert_eq_m256i(
5145            _mm256_slli_epi32::<4>(_mm256_set1_epi32(0xFFFF)),
5146            _mm256_set1_epi32(0xFFFF0),
5147        );
5148    }
5149
5150    #[simd_test(enable = "avx2")]
5151    const fn test_mm256_slli_epi64() {
5152        assert_eq_m256i(
5153            _mm256_slli_epi64::<4>(_mm256_set1_epi64x(0xFFFFFFFF)),
5154            _mm256_set1_epi64x(0xFFFFFFFF0),
5155        );
5156    }
5157
5158    #[simd_test(enable = "avx2")]
5159    const fn test_mm256_slli_si256() {
5160        let a = _mm256_set1_epi64x(0xFFFFFFFF);
5161        let r = _mm256_slli_si256::<3>(a);
5162        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF000000));
5163    }
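
    // `_mm256_slli_si256` shifts bytes within each 128-bit lane independently;
    // nothing crosses the lane boundary. The srli_si256 test further down
    // shows the same laning, with zeros appearing in the middle of the vector.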
5164
5165    #[simd_test(enable = "avx2")]
5166    const fn test_mm_sllv_epi32() {
5167        let a = _mm_set1_epi32(2);
5168        let b = _mm_set1_epi32(1);
5169        let r = _mm_sllv_epi32(a, b);
5170        let e = _mm_set1_epi32(4);
5171        assert_eq_m128i(r, e);
5172    }
5173
5174    #[simd_test(enable = "avx2")]
5175    const fn test_mm256_sllv_epi32() {
5176        let a = _mm256_set1_epi32(2);
5177        let b = _mm256_set1_epi32(1);
5178        let r = _mm256_sllv_epi32(a, b);
5179        let e = _mm256_set1_epi32(4);
5180        assert_eq_m256i(r, e);
5181    }
5182
5183    #[simd_test(enable = "avx2")]
5184    const fn test_mm_sllv_epi64() {
5185        let a = _mm_set1_epi64x(2);
5186        let b = _mm_set1_epi64x(1);
5187        let r = _mm_sllv_epi64(a, b);
5188        let e = _mm_set1_epi64x(4);
5189        assert_eq_m128i(r, e);
5190    }
5191
5192    #[simd_test(enable = "avx2")]
5193    const fn test_mm256_sllv_epi64() {
5194        let a = _mm256_set1_epi64x(2);
5195        let b = _mm256_set1_epi64x(1);
5196        let r = _mm256_sllv_epi64(a, b);
5197        let e = _mm256_set1_epi64x(4);
5198        assert_eq_m256i(r, e);
5199    }
5200
5201    #[simd_test(enable = "avx2")]
5202    fn test_mm256_sra_epi16() {
5203        let a = _mm256_set1_epi16(-1);
5204        let b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
5205        let r = _mm256_sra_epi16(a, b);
5206        assert_eq_m256i(r, _mm256_set1_epi16(-1));
5207    }
5208
5209    #[simd_test(enable = "avx2")]
5210    fn test_mm256_sra_epi32() {
5211        let a = _mm256_set1_epi32(-1);
5212        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 1);
5213        let r = _mm256_sra_epi32(a, b);
5214        assert_eq_m256i(r, _mm256_set1_epi32(-1));
5215    }
5216
5217    #[simd_test(enable = "avx2")]
5218    const fn test_mm256_srai_epi16() {
5219        assert_eq_m256i(
5220            _mm256_srai_epi16::<1>(_mm256_set1_epi16(-1)),
5221            _mm256_set1_epi16(-1),
5222        );
5223    }
5224
5225    #[simd_test(enable = "avx2")]
5226    const fn test_mm256_srai_epi32() {
5227        assert_eq_m256i(
5228            _mm256_srai_epi32::<1>(_mm256_set1_epi32(-1)),
5229            _mm256_set1_epi32(-1),
5230        );
5231    }
5232
5233    #[simd_test(enable = "avx2")]
5234    const fn test_mm_srav_epi32() {
5235        let a = _mm_set1_epi32(4);
5236        let count = _mm_set1_epi32(1);
5237        let r = _mm_srav_epi32(a, count);
5238        let e = _mm_set1_epi32(2);
5239        assert_eq_m128i(r, e);
5240    }
5241
5242    #[simd_test(enable = "avx2")]
5243    const fn test_mm256_srav_epi32() {
5244        let a = _mm256_set1_epi32(4);
5245        let count = _mm256_set1_epi32(1);
5246        let r = _mm256_srav_epi32(a, count);
5247        let e = _mm256_set1_epi32(2);
5248        assert_eq_m256i(r, e);
5249    }
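
    // Unlike the shift-by-vector forms, the variable-shift intrinsics (sllv,
    // srlv, srav) apply an independent count per lane; counts at or above the
    // element width yield 0 for logical shifts and a sign fill for arithmetic
    // ones.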
5250
5251    #[simd_test(enable = "avx2")]
5252    const fn test_mm256_srli_si256() {
5253        #[rustfmt::skip]
5254        let a = _mm256_setr_epi8(
5255            1, 2, 3, 4, 5, 6, 7, 8,
5256            9, 10, 11, 12, 13, 14, 15, 16,
5257            17, 18, 19, 20, 21, 22, 23, 24,
5258            25, 26, 27, 28, 29, 30, 31, 32,
5259        );
5260        let r = _mm256_srli_si256::<3>(a);
5261        #[rustfmt::skip]
5262        let e = _mm256_setr_epi8(
5263            4, 5, 6, 7, 8, 9, 10, 11,
5264            12, 13, 14, 15, 16, 0, 0, 0,
5265            20, 21, 22, 23, 24, 25, 26, 27,
5266            28, 29, 30, 31, 32, 0, 0, 0,
5267        );
5268        assert_eq_m256i(r, e);
5269    }
5270
5271    #[simd_test(enable = "avx2")]
5272    fn test_mm256_srl_epi16() {
5273        let a = _mm256_set1_epi16(0xFF);
5274        let b = _mm_insert_epi16::<0>(_mm_set1_epi16(0), 4);
5275        let r = _mm256_srl_epi16(a, b);
5276        assert_eq_m256i(r, _mm256_set1_epi16(0xF));
5277    }
5278
5279    #[simd_test(enable = "avx2")]
5280    fn test_mm256_srl_epi32() {
5281        let a = _mm256_set1_epi32(0xFFFF);
5282        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 4);
5283        let r = _mm256_srl_epi32(a, b);
5284        assert_eq_m256i(r, _mm256_set1_epi32(0xFFF));
5285    }
5286
5287    #[simd_test(enable = "avx2")]
5288    fn test_mm256_srl_epi64() {
5289        let a = _mm256_set1_epi64x(0xFFFFFFFF);
5290        let b = _mm_setr_epi64x(4, 0);
5291        let r = _mm256_srl_epi64(a, b);
5292        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFF));
5293    }
5294
5295    #[simd_test(enable = "avx2")]
5296    const fn test_mm256_srli_epi16() {
5297        assert_eq_m256i(
5298            _mm256_srli_epi16::<4>(_mm256_set1_epi16(0xFF)),
5299            _mm256_set1_epi16(0xF),
5300        );
5301    }
5302
5303    #[simd_test(enable = "avx2")]
5304    const fn test_mm256_srli_epi32() {
5305        assert_eq_m256i(
5306            _mm256_srli_epi32::<4>(_mm256_set1_epi32(0xFFFF)),
5307            _mm256_set1_epi32(0xFFF),
5308        );
5309    }
5310
5311    #[simd_test(enable = "avx2")]
5312    const fn test_mm256_srli_epi64() {
5313        assert_eq_m256i(
5314            _mm256_srli_epi64::<4>(_mm256_set1_epi64x(0xFFFFFFFF)),
5315            _mm256_set1_epi64x(0xFFFFFFF),
5316        );
5317    }
5318
5319    #[simd_test(enable = "avx2")]
5320    const fn test_mm_srlv_epi32() {
5321        let a = _mm_set1_epi32(2);
5322        let count = _mm_set1_epi32(1);
5323        let r = _mm_srlv_epi32(a, count);
5324        let e = _mm_set1_epi32(1);
5325        assert_eq_m128i(r, e);
5326    }
5327
5328    #[simd_test(enable = "avx2")]
5329    const fn test_mm256_srlv_epi32() {
5330        let a = _mm256_set1_epi32(2);
5331        let count = _mm256_set1_epi32(1);
5332        let r = _mm256_srlv_epi32(a, count);
5333        let e = _mm256_set1_epi32(1);
5334        assert_eq_m256i(r, e);
5335    }
5336
5337    #[simd_test(enable = "avx2")]
5338    const fn test_mm_srlv_epi64() {
5339        let a = _mm_set1_epi64x(2);
5340        let count = _mm_set1_epi64x(1);
5341        let r = _mm_srlv_epi64(a, count);
5342        let e = _mm_set1_epi64x(1);
5343        assert_eq_m128i(r, e);
5344    }
5345
5346    #[simd_test(enable = "avx2")]
5347    const fn test_mm256_srlv_epi64() {
5348        let a = _mm256_set1_epi64x(2);
5349        let count = _mm256_set1_epi64x(1);
5350        let r = _mm256_srlv_epi64(a, count);
5351        let e = _mm256_set1_epi64x(1);
5352        assert_eq_m256i(r, e);
5353    }
5354
5355    #[simd_test(enable = "avx2")]
5356    fn test_mm256_stream_load_si256() {
5357        let a = _mm256_set_epi64x(5, 6, 7, 8);
5358        let r = unsafe { _mm256_stream_load_si256(core::ptr::addr_of!(a) as *const _) };
5359        assert_eq_m256i(a, r);
5360    }
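
    // `_mm256_stream_load_si256` requires a 32-byte aligned pointer; `a` is an
    // `__m256i`, whose alignment is 32, so its address satisfies that.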
5361
5362    #[simd_test(enable = "avx2")]
5363    const fn test_mm256_sub_epi16() {
5364        let a = _mm256_set1_epi16(4);
5365        let b = _mm256_set1_epi16(2);
5366        let r = _mm256_sub_epi16(a, b);
5367        assert_eq_m256i(r, b);
5368    }
5369
5370    #[simd_test(enable = "avx2")]
5371    const fn test_mm256_sub_epi32() {
5372        let a = _mm256_set1_epi32(4);
5373        let b = _mm256_set1_epi32(2);
5374        let r = _mm256_sub_epi32(a, b);
5375        assert_eq_m256i(r, b);
5376    }
5377
5378    #[simd_test(enable = "avx2")]
5379    const fn test_mm256_sub_epi64() {
5380        let a = _mm256_set1_epi64x(4);
5381        let b = _mm256_set1_epi64x(2);
5382        let r = _mm256_sub_epi64(a, b);
5383        assert_eq_m256i(r, b);
5384    }
5385
5386    #[simd_test(enable = "avx2")]
5387    const fn test_mm256_sub_epi8() {
5388        let a = _mm256_set1_epi8(4);
5389        let b = _mm256_set1_epi8(2);
5390        let r = _mm256_sub_epi8(a, b);
5391        assert_eq_m256i(r, b);
5392    }
5393
5394    #[simd_test(enable = "avx2")]
5395    const fn test_mm256_subs_epi16() {
5396        let a = _mm256_set1_epi16(4);
5397        let b = _mm256_set1_epi16(2);
5398        let r = _mm256_subs_epi16(a, b);
5399        assert_eq_m256i(r, b);
5400    }
5401
5402    #[simd_test(enable = "avx2")]
5403    const fn test_mm256_subs_epi8() {
5404        let a = _mm256_set1_epi8(4);
5405        let b = _mm256_set1_epi8(2);
5406        let r = _mm256_subs_epi8(a, b);
5407        assert_eq_m256i(r, b);
5408    }
5409
5410    #[simd_test(enable = "avx2")]
5411    const fn test_mm256_subs_epu16() {
5412        let a = _mm256_set1_epi16(4);
5413        let b = _mm256_set1_epi16(2);
5414        let r = _mm256_subs_epu16(a, b);
5415        assert_eq_m256i(r, b);
5416    }
5417
5418    #[simd_test(enable = "avx2")]
5419    const fn test_mm256_subs_epu8() {
5420        let a = _mm256_set1_epi8(4);
5421        let b = _mm256_set1_epi8(2);
5422        let r = _mm256_subs_epu8(a, b);
5423        assert_eq_m256i(r, b);
5424    }
5425
5426    #[simd_test(enable = "avx2")]
5427    const fn test_mm256_xor_si256() {
5428        let a = _mm256_set1_epi8(5);
5429        let b = _mm256_set1_epi8(3);
5430        let r = _mm256_xor_si256(a, b);
5431        assert_eq_m256i(r, _mm256_set1_epi8(6));
5432    }
5433
5434    #[simd_test(enable = "avx2")]
5435    const fn test_mm256_alignr_epi8() {
5436        #[rustfmt::skip]
5437        let a = _mm256_setr_epi8(
5438            1, 2, 3, 4, 5, 6, 7, 8,
5439            9, 10, 11, 12, 13, 14, 15, 16,
5440            17, 18, 19, 20, 21, 22, 23, 24,
5441            25, 26, 27, 28, 29, 30, 31, 32,
5442        );
5443        #[rustfmt::skip]
5444        let b = _mm256_setr_epi8(
5445            -1, -2, -3, -4, -5, -6, -7, -8,
5446            -9, -10, -11, -12, -13, -14, -15, -16,
5447            -17, -18, -19, -20, -21, -22, -23, -24,
5448            -25, -26, -27, -28, -29, -30, -31, -32,
5449        );
5450        let r = _mm256_alignr_epi8::<33>(a, b);
5451        assert_eq_m256i(r, _mm256_set1_epi8(0));
5452
5453        let r = _mm256_alignr_epi8::<17>(a, b);
5454        #[rustfmt::skip]
5455        let expected = _mm256_setr_epi8(
5456            2, 3, 4, 5, 6, 7, 8, 9,
5457            10, 11, 12, 13, 14, 15, 16, 0,
5458            18, 19, 20, 21, 22, 23, 24, 25,
5459            26, 27, 28, 29, 30, 31, 32, 0,
5460        );
5461        assert_eq_m256i(r, expected);
5462
5463        let r = _mm256_alignr_epi8::<4>(a, b);
5464        #[rustfmt::skip]
5465        let expected = _mm256_setr_epi8(
5466            -5, -6, -7, -8, -9, -10, -11, -12,
5467            -13, -14, -15, -16, 1, 2, 3, 4,
5468            -21, -22, -23, -24, -25, -26, -27, -28,
5469            -29, -30, -31, -32, 17, 18, 19, 20,
5470        );
5471        assert_eq_m256i(r, expected);
5472
5473        let r = _mm256_alignr_epi8::<15>(a, b);
5474        #[rustfmt::skip]
5475        let expected = _mm256_setr_epi8(
5476            -16, 1, 2, 3, 4, 5, 6, 7,
5477            8, 9, 10, 11, 12, 13, 14, 15,
5478            -32, 17, 18, 19, 20, 21, 22, 23,
5479            24, 25, 26, 27, 28, 29, 30, 31,
5480        );
5481        assert_eq_m256i(r, expected);
5482
5483        let r = _mm256_alignr_epi8::<0>(a, b);
5484        assert_eq_m256i(r, b);
5485
5486        let r = _mm256_alignr_epi8::<16>(a, b);
5487        assert_eq_m256i(r, a);
5488    }
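
    // `_mm256_alignr_epi8` also operates per 128-bit lane: each result lane is
    // the 16 bytes starting at offset IMM8 within the 32-byte concatenation of
    // the lanes of `a` (high) and `b` (low), zero-filled past the top, which
    // is why 0 returns `b`, 16 returns `a`, and 33 is all zeros.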
5489
5490    #[simd_test(enable = "avx2")]
5491    fn test_mm256_shuffle_epi8() {
5492        #[rustfmt::skip]
5493        let a = _mm256_setr_epi8(
5494            1, 2, 3, 4, 5, 6, 7, 8,
5495            9, 10, 11, 12, 13, 14, 15, 16,
5496            17, 18, 19, 20, 21, 22, 23, 24,
5497            25, 26, 27, 28, 29, 30, 31, 32,
5498        );
5499        #[rustfmt::skip]
5500        let b = _mm256_setr_epi8(
5501            4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
5502            12, 5, 5, 10, 4, 1, 8, 0,
5503            4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
5504            12, 5, 5, 10, 4, 1, 8, 0,
5505        );
5506        #[rustfmt::skip]
5507        let expected = _mm256_setr_epi8(
5508            5, 0, 5, 4, 9, 13, 7, 4,
5509            13, 6, 6, 11, 5, 2, 9, 1,
5510            21, 0, 21, 20, 25, 29, 23, 20,
5511            29, 22, 22, 27, 21, 18, 25, 17,
5512        );
5513        let r = _mm256_shuffle_epi8(a, b);
5514        assert_eq_m256i(r, expected);
5515    }
5516
5517    #[simd_test(enable = "avx2")]
5518    fn test_mm256_permutevar8x32_epi32() {
5519        let a = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800);
5520        let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5521        let expected = _mm256_setr_epi32(600, 100, 600, 200, 800, 700, 400, 500);
5522        let r = _mm256_permutevar8x32_epi32(a, b);
5523        assert_eq_m256i(r, expected);
5524    }
5525
5526    #[simd_test(enable = "avx2")]
5527    const fn test_mm256_permute4x64_epi64() {
5528        let a = _mm256_setr_epi64x(100, 200, 300, 400);
        let expected = _mm256_setr_epi64x(400, 100, 200, 100);
        let r = _mm256_permute4x64_epi64::<0b00010011>(a);
        assert_eq_m256i(r, expected);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_permute2x128_si256() {
        let a = _mm256_setr_epi64x(100, 200, 500, 600);
        let b = _mm256_setr_epi64x(300, 400, 700, 800);
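        // The low two bits of each 4-bit IMM8 field pick a 128-bit half from
        // [a.lo, a.hi, b.lo, b.hi] (bit 3 would zero the half instead): 0b0011
        // selects b.hi for the low half and 0b0001 selects a.hi for the high.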
        let r = _mm256_permute2x128_si256::<0b00_01_00_11>(a, b);
        let e = _mm256_setr_epi64x(700, 800, 500, 600);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_permute4x64_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
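        // Same 2-bit field encoding as _mm256_permute4x64_epi64, applied to
        // f64 elements: fields 3, 0, 1, 0 select 4., 1., 2., 1.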
        let r = _mm256_permute4x64_pd::<0b00_01_00_11>(a);
        let e = _mm256_setr_pd(4., 1., 2., 1.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_permutevar8x32_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
        let r = _mm256_permutevar8x32_ps(a, b);
        let e = _mm256_setr_ps(6., 1., 6., 2., 8., 7., 4., 5.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx2")]
    fn test_mm_i32gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
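        // Gathers are `unsafe`: every address `ptr + index * SCALE` computed
        // from the index vector must stay in bounds of `arr`.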
        // A scale of 4 addresses whole i32 elements
        let r = unsafe { _mm_i32gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48)) };
        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm_mask_i32gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A scale of 4 addresses whole i32 elements
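        // Lanes whose mask element has its sign bit set are loaded from
        // memory; the remaining lanes keep the corresponding element of the
        // `src` argument (here 256).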
        let r = unsafe {
            _mm_mask_i32gather_epi32::<4>(
                _mm_set1_epi32(256),
                arr.as_ptr(),
                _mm_setr_epi32(0, 16, 64, 96),
                _mm_setr_epi32(-1, -1, -1, 0),
            )
        };
        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_i32gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A scale of 4 addresses whole i32 elements
        let r = unsafe {
            _mm256_i32gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4))
        };
        assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_mask_i32gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A scale of 4 addresses whole i32 elements
        let r = unsafe {
            _mm256_mask_i32gather_epi32::<4>(
                _mm256_set1_epi32(256),
                arr.as_ptr(),
                _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
                _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0),
            )
        };
        assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 64, 256, 256, 256, 256, 256));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm_i32gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A scale of 4 addresses whole f32 elements
        let r = unsafe { _mm_i32gather_ps::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48)) };
        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm_mask_i32gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A scale of 4 addresses whole f32 elements
        let r = unsafe {
            _mm_mask_i32gather_ps::<4>(
                _mm_set1_ps(256.0),
                arr.as_ptr(),
                _mm_setr_epi32(0, 16, 64, 96),
                _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
            )
        };
        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_i32gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A scale of 4 addresses whole f32 elements
        let r = unsafe {
            _mm256_i32gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4))
        };
        assert_eq_m256(r, _mm256_setr_ps(0.0, 16.0, 32.0, 48.0, 1.0, 2.0, 3.0, 4.0));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_mask_i32gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A scale of 4 addresses whole f32 elements
        let r = unsafe {
            _mm256_mask_i32gather_ps::<4>(
                _mm256_set1_ps(256.0),
                arr.as_ptr(),
                _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
                _mm256_setr_ps(-1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0),
            )
        };
        assert_eq_m256(
            r,
            _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0),
        );
    }

    #[simd_test(enable = "avx2")]
    fn test_mm_i32gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A scale of 8 addresses whole i64 elements
        let r = unsafe { _mm_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0)) };
        assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm_mask_i32gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A scale of 8 addresses whole i64 elements
        let r = unsafe {
            _mm_mask_i32gather_epi64::<8>(
                _mm_set1_epi64x(256),
                arr.as_ptr(),
                _mm_setr_epi32(16, 16, 16, 16),
                _mm_setr_epi64x(-1, 0),
            )
        };
        assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_i32gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A scale of 8 addresses whole i64 elements
        let r = unsafe { _mm256_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48)) };
        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_mask_i32gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A scale of 8 addresses whole i64 elements
        let r = unsafe {
            _mm256_mask_i32gather_epi64::<8>(
                _mm256_set1_epi64x(256),
                arr.as_ptr(),
                _mm_setr_epi32(0, 16, 64, 96),
                _mm256_setr_epi64x(-1, -1, -1, 0),
            )
        };
        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm_i32gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A scale of 8 addresses whole f64 elements
        let r = unsafe { _mm_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0)) };
        assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm_mask_i32gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A scale of 8 addresses whole f64 elements
        let r = unsafe {
            _mm_mask_i32gather_pd::<8>(
                _mm_set1_pd(256.0),
                arr.as_ptr(),
                _mm_setr_epi32(16, 16, 16, 16),
                _mm_setr_pd(-1.0, 0.0),
            )
        };
        assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_i32gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A scale of 8 addresses whole f64 elements
        let r = unsafe { _mm256_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48)) };
        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_mask_i32gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A scale of 8 addresses whole f64 elements
        let r = unsafe {
            _mm256_mask_i32gather_pd::<8>(
                _mm256_set1_pd(256.0),
                arr.as_ptr(),
                _mm_setr_epi32(0, 16, 64, 96),
                _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
            )
        };
        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm_i64gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A scale of 4 addresses whole i32 elements
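        // Only two 64-bit indices fit in a __m128i, so just two i32 lanes are
        // gathered; the upper two lanes of the result are zeroed.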
        let r = unsafe { _mm_i64gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16)) };
        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 0, 0));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm_mask_i64gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A scale of 4 addresses whole i32 elements
        let r = unsafe {
            _mm_mask_i64gather_epi32::<4>(
                _mm_set1_epi32(256),
                arr.as_ptr(),
                _mm_setr_epi64x(0, 16),
                _mm_setr_epi32(-1, 0, -1, 0),
            )
        };
        assert_eq_m128i(r, _mm_setr_epi32(0, 256, 0, 0));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_i64gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A scale of 4 addresses whole i32 elements
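        // Four 64-bit indices gather four i32 elements, so the result narrows
        // to a __m128i.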
        let r =
            unsafe { _mm256_i64gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48)) };
        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_mask_i64gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A scale of 4 addresses whole i32 elements
        let r = unsafe {
            _mm256_mask_i64gather_epi32::<4>(
                _mm_set1_epi32(256),
                arr.as_ptr(),
                _mm256_setr_epi64x(0, 16, 64, 96),
                _mm_setr_epi32(-1, -1, -1, 0),
            )
        };
        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm_i64gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A scale of 4 addresses whole f32 elements
        let r = unsafe { _mm_i64gather_ps::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16)) };
        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 0.0, 0.0));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm_mask_i64gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A scale of 4 addresses whole f32 elements
        let r = unsafe {
            _mm_mask_i64gather_ps::<4>(
                _mm_set1_ps(256.0),
                arr.as_ptr(),
                _mm_setr_epi64x(0, 16),
                _mm_setr_ps(-1.0, 0.0, -1.0, 0.0),
            )
        };
        assert_eq_m128(r, _mm_setr_ps(0.0, 256.0, 0.0, 0.0));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_i64gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A scale of 4 addresses whole f32 elements
        let r =
            unsafe { _mm256_i64gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48)) };
        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_mask_i64gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A scale of 4 addresses whole f32 elements
        let r = unsafe {
            _mm256_mask_i64gather_ps::<4>(
                _mm_set1_ps(256.0),
                arr.as_ptr(),
                _mm256_setr_epi64x(0, 16, 64, 96),
                _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
            )
        };
        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm_i64gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A scale of 8 addresses whole i64 elements
        let r = unsafe { _mm_i64gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16)) };
        assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm_mask_i64gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A scale of 8 addresses whole i64 elements
        let r = unsafe {
            _mm_mask_i64gather_epi64::<8>(
                _mm_set1_epi64x(256),
                arr.as_ptr(),
                _mm_setr_epi64x(16, 16),
                _mm_setr_epi64x(-1, 0),
            )
        };
        assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_i64gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A scale of 8 addresses whole i64 elements
        let r =
            unsafe { _mm256_i64gather_epi64::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48)) };
        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_mask_i64gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A scale of 8 addresses whole i64 elements
        let r = unsafe {
            _mm256_mask_i64gather_epi64::<8>(
                _mm256_set1_epi64x(256),
                arr.as_ptr(),
                _mm256_setr_epi64x(0, 16, 64, 96),
                _mm256_setr_epi64x(-1, -1, -1, 0),
            )
        };
        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm_i64gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A scale of 8 addresses whole f64 elements
        let r = unsafe { _mm_i64gather_pd::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16)) };
        assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm_mask_i64gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A scale of 8 addresses whole f64 elements
        let r = unsafe {
            _mm_mask_i64gather_pd::<8>(
                _mm_set1_pd(256.0),
                arr.as_ptr(),
                _mm_setr_epi64x(16, 16),
                _mm_setr_pd(-1.0, 0.0),
            )
        };
        assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_i64gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A scale of 8 addresses whole f64 elements
        let r =
            unsafe { _mm256_i64gather_pd::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48)) };
        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_mask_i64gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A scale of 8 addresses whole f64 elements
        let r = unsafe {
            _mm256_mask_i64gather_pd::<8>(
                _mm256_set1_pd(256.0),
                arr.as_ptr(),
                _mm256_setr_epi64x(0, 16, 64, 96),
                _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
            )
        };
        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_extract_epi8() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            -1, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31
        );
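        // The extracted byte is zero-extended to i32, so the -1 element reads
        // back as 0xFF rather than -1.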
        let r1 = _mm256_extract_epi8::<0>(a);
        let r2 = _mm256_extract_epi8::<3>(a);
        assert_eq!(r1, 0xFF);
        assert_eq!(r2, 3);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_extract_epi16() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            -1, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
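        // Likewise zero-extended: the -1 element reads back as 0xFFFF.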
        let r1 = _mm256_extract_epi16::<0>(a);
        let r2 = _mm256_extract_epi16::<3>(a);
        assert_eq!(r1, 0xFFFF);
        assert_eq!(r2, 3);
    }
}