core/stdarch/crates/core_arch/src/x86/
avx2.rs

//! Advanced Vector Extensions 2 (AVX2)
2//!
3//! AVX2 expands most AVX commands to 256-bit wide vector registers and
4//! adds [FMA](https://en.wikipedia.org/wiki/Fused_multiply-accumulate).
5//!
6//! The references are:
7//!
8//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
9//!   Instruction Set Reference, A-Z][intel64_ref].
10//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
11//!   System Instructions][amd64_ref].
12//!
13//! Wikipedia's [AVX][wiki_avx] and [FMA][wiki_fma] pages provide a quick
14//! overview of the instructions available.
15//!
16//! [intel64_ref]: https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
17//! [amd64_ref]: https://docs.amd.com/v/u/en-US/24594_3.37
18//! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
19//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate
20
21use crate::core_arch::{simd::*, x86::*};
22use crate::intrinsics::simd::*;
23
24#[cfg(test)]
25use stdarch_test::assert_instr;
26
27/// Computes the absolute values of packed 32-bit integers in `a`.
28///
29/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi32)
30#[inline]
31#[target_feature(enable = "avx2")]
32#[cfg_attr(test, assert_instr(vpabsd))]
33#[stable(feature = "simd_x86", since = "1.27.0")]
34#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
35pub const fn _mm256_abs_epi32(a: __m256i) -> __m256i {
36    unsafe {
37        let a = a.as_i32x8();
38        let r = simd_select::<m32x8, _>(simd_lt(a, i32x8::ZERO), simd_neg(a), a);
39        transmute(r)
40    }
41}
42
43/// Computes the absolute values of packed 16-bit integers in `a`.
44///
45/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi16)
46#[inline]
47#[target_feature(enable = "avx2")]
48#[cfg_attr(test, assert_instr(vpabsw))]
49#[stable(feature = "simd_x86", since = "1.27.0")]
50#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
51pub const fn _mm256_abs_epi16(a: __m256i) -> __m256i {
52    unsafe {
53        let a = a.as_i16x16();
54        let r = simd_select::<m16x16, _>(simd_lt(a, i16x16::ZERO), simd_neg(a), a);
55        transmute(r)
56    }
57}
58
59/// Computes the absolute values of packed 8-bit integers in `a`.
60///
61/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi8)
62#[inline]
63#[target_feature(enable = "avx2")]
64#[cfg_attr(test, assert_instr(vpabsb))]
65#[stable(feature = "simd_x86", since = "1.27.0")]
66#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
67pub const fn _mm256_abs_epi8(a: __m256i) -> __m256i {
68    unsafe {
69        let a = a.as_i8x32();
70        let r = simd_select::<m8x32, _>(simd_lt(a, i8x32::ZERO), simd_neg(a), a);
71        transmute(r)
72    }
73}
74
75/// Adds packed 64-bit integers in `a` and `b`.
76///
77/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi64)
78#[inline]
79#[target_feature(enable = "avx2")]
80#[cfg_attr(test, assert_instr(vpaddq))]
81#[stable(feature = "simd_x86", since = "1.27.0")]
82#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
83pub const fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i {
84    unsafe { transmute(simd_add(a.as_i64x4(), b.as_i64x4())) }
85}
86
87/// Adds packed 32-bit integers in `a` and `b`.
88///
89/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi32)
90#[inline]
91#[target_feature(enable = "avx2")]
92#[cfg_attr(test, assert_instr(vpaddd))]
93#[stable(feature = "simd_x86", since = "1.27.0")]
94#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
95pub const fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i {
96    unsafe { transmute(simd_add(a.as_i32x8(), b.as_i32x8())) }
97}
98
99/// Adds packed 16-bit integers in `a` and `b`.
100///
101/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi16)
102#[inline]
103#[target_feature(enable = "avx2")]
104#[cfg_attr(test, assert_instr(vpaddw))]
105#[stable(feature = "simd_x86", since = "1.27.0")]
106#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
107pub const fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i {
108    unsafe { transmute(simd_add(a.as_i16x16(), b.as_i16x16())) }
109}
110
111/// Adds packed 8-bit integers in `a` and `b`.
112///
113/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi8)
114#[inline]
115#[target_feature(enable = "avx2")]
116#[cfg_attr(test, assert_instr(vpaddb))]
117#[stable(feature = "simd_x86", since = "1.27.0")]
118#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
119pub const fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i {
120    unsafe { transmute(simd_add(a.as_i8x32(), b.as_i8x32())) }
121}
122
123/// Adds packed 8-bit integers in `a` and `b` using saturation.
124///
125/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi8)
126#[inline]
127#[target_feature(enable = "avx2")]
128#[cfg_attr(test, assert_instr(vpaddsb))]
129#[stable(feature = "simd_x86", since = "1.27.0")]
130#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
131pub const fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i {
132    unsafe { transmute(simd_saturating_add(a.as_i8x32(), b.as_i8x32())) }
133}
134
135/// Adds packed 16-bit integers in `a` and `b` using saturation.
136///
137/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi16)
138#[inline]
139#[target_feature(enable = "avx2")]
140#[cfg_attr(test, assert_instr(vpaddsw))]
141#[stable(feature = "simd_x86", since = "1.27.0")]
142#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
143pub const fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i {
144    unsafe { transmute(simd_saturating_add(a.as_i16x16(), b.as_i16x16())) }
145}
146
147/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
148///
149/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu8)
150#[inline]
151#[target_feature(enable = "avx2")]
152#[cfg_attr(test, assert_instr(vpaddusb))]
153#[stable(feature = "simd_x86", since = "1.27.0")]
154#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
155pub const fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i {
156    unsafe { transmute(simd_saturating_add(a.as_u8x32(), b.as_u8x32())) }
157}
158
159/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
160///
161/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu16)
162#[inline]
163#[target_feature(enable = "avx2")]
164#[cfg_attr(test, assert_instr(vpaddusw))]
165#[stable(feature = "simd_x86", since = "1.27.0")]
166#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
167pub const fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i {
168    unsafe { transmute(simd_saturating_add(a.as_u16x16(), b.as_u16x16())) }
169}
170
/// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary
/// result, shifts the result right by `n` bytes, and returns the low 16 bytes.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 7))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);

    // If palignr is shifting the pair of vectors more than the size of two
    // lanes, emit zero.
    if IMM8 >= 32 {
        return _mm256_setzero_si256();
    }
    // If palignr is shifting the pair of input vectors more than one lane,
    // but less than two lanes, convert to shifting in zeroes.
    let (a, b) = if IMM8 > 16 {
        (_mm256_setzero_si256(), a)
    } else {
        (a, b)
    };
    // A shift of exactly one lane (16 bytes) returns `a` unchanged (possibly
    // the zero vector substituted above). `transmute` here is an identity
    // conversion between the `__m256i` bindings.
    unsafe {
        if IMM8 == 16 {
            return transmute(a);
        }
    }
    // Maps result byte `i` (0..32) to a `simd_shuffle!` source index, given a
    // per-lane byte shift. In the shuffle below, indices 0..=31 address bytes
    // of `b` (first operand) and 32..=63 address bytes of `a` (second
    // operand); each 16-byte lane is shifted independently, matching the
    // per-lane semantics of `vpalignr`.
    const fn mask(shift: u32, i: u32) -> u32 {
        // Byte position of `i` within its own 16-byte lane.
        let shift = shift % 16;
        let mod_i = i % 16;
        if mod_i < (16 - shift) {
            // Low part of the lane: sourced from `b`, shifted right.
            i + shift
        } else {
            // High part of the lane: bytes shifted in from `a`
            // (offset by 16 to land in `a`'s half of the index space).
            i + 16 + shift
        }
    }

    unsafe {
        let r: i8x32 = simd_shuffle!(
            b.as_i8x32(),
            a.as_i8x32(),
            [
                mask(IMM8 as u32, 0),
                mask(IMM8 as u32, 1),
                mask(IMM8 as u32, 2),
                mask(IMM8 as u32, 3),
                mask(IMM8 as u32, 4),
                mask(IMM8 as u32, 5),
                mask(IMM8 as u32, 6),
                mask(IMM8 as u32, 7),
                mask(IMM8 as u32, 8),
                mask(IMM8 as u32, 9),
                mask(IMM8 as u32, 10),
                mask(IMM8 as u32, 11),
                mask(IMM8 as u32, 12),
                mask(IMM8 as u32, 13),
                mask(IMM8 as u32, 14),
                mask(IMM8 as u32, 15),
                mask(IMM8 as u32, 16),
                mask(IMM8 as u32, 17),
                mask(IMM8 as u32, 18),
                mask(IMM8 as u32, 19),
                mask(IMM8 as u32, 20),
                mask(IMM8 as u32, 21),
                mask(IMM8 as u32, 22),
                mask(IMM8 as u32, 23),
                mask(IMM8 as u32, 24),
                mask(IMM8 as u32, 25),
                mask(IMM8 as u32, 26),
                mask(IMM8 as u32, 27),
                mask(IMM8 as u32, 28),
                mask(IMM8 as u32, 29),
                mask(IMM8 as u32, 30),
                mask(IMM8 as u32, 31),
            ],
        );
        transmute(r)
    }
}
253
254/// Computes the bitwise AND of 256 bits (representing integer data)
255/// in `a` and `b`.
256///
257/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_si256)
258#[inline]
259#[target_feature(enable = "avx2")]
260#[cfg_attr(test, assert_instr(vandps))]
261#[stable(feature = "simd_x86", since = "1.27.0")]
262#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
263pub const fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
264    unsafe { transmute(simd_and(a.as_i64x4(), b.as_i64x4())) }
265}
266
267/// Computes the bitwise NOT of 256 bits (representing integer data)
268/// in `a` and then AND with `b`.
269///
270/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_si256)
271#[inline]
272#[target_feature(enable = "avx2")]
273#[cfg_attr(test, assert_instr(vandnps))]
274#[stable(feature = "simd_x86", since = "1.27.0")]
275#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
276pub const fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
277    unsafe {
278        let all_ones = _mm256_set1_epi8(-1);
279        transmute(simd_and(
280            simd_xor(a.as_i64x4(), all_ones.as_i64x4()),
281            b.as_i64x4(),
282        ))
283    }
284}
285
286/// Averages packed unsigned 16-bit integers in `a` and `b`.
287///
288/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu16)
289#[inline]
290#[target_feature(enable = "avx2")]
291#[cfg_attr(test, assert_instr(vpavgw))]
292#[stable(feature = "simd_x86", since = "1.27.0")]
293#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
294pub const fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i {
295    unsafe {
296        let a = simd_cast::<_, u32x16>(a.as_u16x16());
297        let b = simd_cast::<_, u32x16>(b.as_u16x16());
298        let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1));
299        transmute(simd_cast::<_, u16x16>(r))
300    }
301}
302
303/// Averages packed unsigned 8-bit integers in `a` and `b`.
304///
305/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu8)
306#[inline]
307#[target_feature(enable = "avx2")]
308#[cfg_attr(test, assert_instr(vpavgb))]
309#[stable(feature = "simd_x86", since = "1.27.0")]
310#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
311pub const fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i {
312    unsafe {
313        let a = simd_cast::<_, u16x32>(a.as_u8x32());
314        let b = simd_cast::<_, u16x32>(b.as_u8x32());
315        let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1));
316        transmute(simd_cast::<_, u8x32>(r))
317    }
318}
319
/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_blend_epi32<const IMM4: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM4, 4);
    unsafe {
        let a = a.as_i32x4();
        let b = b.as_i32x4();
        // Shuffle indices 0..=3 select lanes of `a`, 4..=7 lanes of `b`.
        // Bit k of IMM4 picks the source of result lane k. Each inline table
        // is indexed by the pair of IMM4 bits covering two adjacent lanes and
        // yields the `a` index when the lane's bit is clear, the `b` index
        // when it is set.
        let r: i32x4 = simd_shuffle!(
            a,
            b,
            [
                [0, 4, 0, 4][IMM4 as usize & 0b11],
                [1, 1, 5, 5][IMM4 as usize & 0b11],
                [2, 6, 2, 6][(IMM4 as usize >> 2) & 0b11],
                [3, 3, 7, 7][(IMM4 as usize >> 2) & 0b11],
            ],
        );
        transmute(r)
    }
}
347
/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_blend_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i32x8();
        let b = b.as_i32x8();
        // Shuffle indices 0..=7 select lanes of `a`, 8..=15 lanes of `b`.
        // Bit k of IMM8 picks the source of result lane k. Each inline table
        // is indexed by the pair of IMM8 bits covering two adjacent lanes and
        // yields the `a` index when the lane's bit is clear, the `b` index
        // when it is set.
        let r: i32x8 = simd_shuffle!(
            a,
            b,
            [
                [0, 8, 0, 8][IMM8 as usize & 0b11],
                [1, 1, 9, 9][IMM8 as usize & 0b11],
                [2, 10, 2, 10][(IMM8 as usize >> 2) & 0b11],
                [3, 3, 11, 11][(IMM8 as usize >> 2) & 0b11],
                [4, 12, 4, 12][(IMM8 as usize >> 4) & 0b11],
                [5, 5, 13, 13][(IMM8 as usize >> 4) & 0b11],
                [6, 14, 6, 14][(IMM8 as usize >> 6) & 0b11],
                [7, 7, 15, 15][(IMM8 as usize >> 6) & 0b11],
            ],
        );
        transmute(r)
    }
}
379
/// Blends packed 16-bit integers from `a` and `b` using control mask `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpblendw, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_blend_epi16<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i16x16();
        let b = b.as_i16x16();

        // Shuffle indices 0..=15 select lanes of `a`, 16..=31 lanes of `b`.
        // Bit k of IMM8 picks the source of result lanes k and k+8 — like
        // `vpblendw`, the 8-bit mask is applied to each 128-bit half
        // independently, so the same bits are reused for lanes 8..=15.
        // Each inline table is indexed by the pair of IMM8 bits covering two
        // adjacent lanes and yields the `a` index when the lane's bit is
        // clear, the `b` index when it is set.
        let r: i16x16 = simd_shuffle!(
            a,
            b,
            [
                [0, 16, 0, 16][IMM8 as usize & 0b11],
                [1, 1, 17, 17][IMM8 as usize & 0b11],
                [2, 18, 2, 18][(IMM8 as usize >> 2) & 0b11],
                [3, 3, 19, 19][(IMM8 as usize >> 2) & 0b11],
                [4, 20, 4, 20][(IMM8 as usize >> 4) & 0b11],
                [5, 5, 21, 21][(IMM8 as usize >> 4) & 0b11],
                [6, 22, 6, 22][(IMM8 as usize >> 6) & 0b11],
                [7, 7, 23, 23][(IMM8 as usize >> 6) & 0b11],
                [8, 24, 8, 24][IMM8 as usize & 0b11],
                [9, 9, 25, 25][IMM8 as usize & 0b11],
                [10, 26, 10, 26][(IMM8 as usize >> 2) & 0b11],
                [11, 11, 27, 27][(IMM8 as usize >> 2) & 0b11],
                [12, 28, 12, 28][(IMM8 as usize >> 4) & 0b11],
                [13, 13, 29, 29][(IMM8 as usize >> 4) & 0b11],
                [14, 30, 14, 30][(IMM8 as usize >> 6) & 0b11],
                [15, 15, 31, 31][(IMM8 as usize >> 6) & 0b11],
            ],
        );
        transmute(r)
    }
}
420
421/// Blends packed 8-bit integers from `a` and `b` using `mask`.
422///
423/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_epi8)
424#[inline]
425#[target_feature(enable = "avx2")]
426#[cfg_attr(test, assert_instr(vpblendvb))]
427#[stable(feature = "simd_x86", since = "1.27.0")]
428#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
429pub const fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
430    unsafe {
431        let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::ZERO);
432        transmute(simd_select(mask, b.as_i8x32(), a.as_i8x32()))
433    }
434}
435
436/// Broadcasts the low packed 8-bit integer from `a` to all elements of
437/// the 128-bit returned value.
438///
439/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastb_epi8)
440#[inline]
441#[target_feature(enable = "avx2")]
442#[cfg_attr(test, assert_instr(vpbroadcastb))]
443#[stable(feature = "simd_x86", since = "1.27.0")]
444#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
445pub const fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
446    unsafe {
447        let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 16]);
448        transmute::<i8x16, _>(ret)
449    }
450}
451
452/// Broadcasts the low packed 8-bit integer from `a` to all elements of
453/// the 256-bit returned value.
454///
455/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastb_epi8)
456#[inline]
457#[target_feature(enable = "avx2")]
458#[cfg_attr(test, assert_instr(vpbroadcastb))]
459#[stable(feature = "simd_x86", since = "1.27.0")]
460#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
461pub const fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
462    unsafe {
463        let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 32]);
464        transmute::<i8x32, _>(ret)
465    }
466}
467
468// N.B., `simd_shuffle4` with integer data types for `a` and `b` is
469// often compiled to `vbroadcastss`.
470/// Broadcasts the low packed 32-bit integer from `a` to all elements of
471/// the 128-bit returned value.
472///
473/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastd_epi32)
474#[inline]
475#[target_feature(enable = "avx2")]
476#[cfg_attr(test, assert_instr(vbroadcastss))]
477#[stable(feature = "simd_x86", since = "1.27.0")]
478#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
479pub const fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
480    unsafe {
481        let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 4]);
482        transmute::<i32x4, _>(ret)
483    }
484}
485
486// N.B., `simd_shuffle4`` with integer data types for `a` and `b` is
487// often compiled to `vbroadcastss`.
488/// Broadcasts the low packed 32-bit integer from `a` to all elements of
489/// the 256-bit returned value.
490///
491/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastd_epi32)
492#[inline]
493#[target_feature(enable = "avx2")]
494#[cfg_attr(test, assert_instr(vbroadcastss))]
495#[stable(feature = "simd_x86", since = "1.27.0")]
496#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
497pub const fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
498    unsafe {
499        let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 8]);
500        transmute::<i32x8, _>(ret)
501    }
502}
503
504/// Broadcasts the low packed 64-bit integer from `a` to all elements of
505/// the 128-bit returned value.
506///
507/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastq_epi64)
508#[inline]
509#[target_feature(enable = "avx2")]
510// Emits `vmovddup` instead of `vpbroadcastq`
511// See https://github.com/rust-lang/stdarch/issues/791
512#[cfg_attr(test, assert_instr(vmovddup))]
513#[stable(feature = "simd_x86", since = "1.27.0")]
514#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
515pub const fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
516    unsafe {
517        let ret = simd_shuffle!(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]);
518        transmute::<i64x2, _>(ret)
519    }
520}
521
522/// Broadcasts the low packed 64-bit integer from `a` to all elements of
523/// the 256-bit returned value.
524///
525/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastq_epi64)
526#[inline]
527#[target_feature(enable = "avx2")]
528#[cfg_attr(test, assert_instr(vbroadcastsd))]
529#[stable(feature = "simd_x86", since = "1.27.0")]
530#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
531pub const fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
532    unsafe {
533        let ret = simd_shuffle!(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]);
534        transmute::<i64x4, _>(ret)
535    }
536}
537
538/// Broadcasts the low double-precision (64-bit) floating-point element
539/// from `a` to all elements of the 128-bit returned value.
540///
541/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsd_pd)
542#[inline]
543#[target_feature(enable = "avx2")]
544#[cfg_attr(test, assert_instr(vmovddup))]
545#[stable(feature = "simd_x86", since = "1.27.0")]
546#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
547pub const fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
548    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 2]) }
549}
550
551/// Broadcasts the low double-precision (64-bit) floating-point element
552/// from `a` to all elements of the 256-bit returned value.
553///
554/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsd_pd)
555#[inline]
556#[target_feature(enable = "avx2")]
557#[cfg_attr(test, assert_instr(vbroadcastsd))]
558#[stable(feature = "simd_x86", since = "1.27.0")]
559#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
560pub const fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
561    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 4]) }
562}
563
564/// Broadcasts 128 bits of integer data from a to all 128-bit lanes in
565/// the 256-bit returned value.
566///
567/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsi128_si256)
568#[inline]
569#[target_feature(enable = "avx2")]
570#[stable(feature = "simd_x86_updates", since = "1.82.0")]
571#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
572pub const fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i {
573    unsafe {
574        let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
575        transmute::<i64x4, _>(ret)
576    }
577}
578
579// N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or
580// `vbroadcastf128`.
581/// Broadcasts 128 bits of integer data from a to all 128-bit lanes in
582/// the 256-bit returned value.
583///
584/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsi128_si256)
585#[inline]
586#[target_feature(enable = "avx2")]
587#[stable(feature = "simd_x86", since = "1.27.0")]
588#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
589pub const fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
590    unsafe {
591        let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
592        transmute::<i64x4, _>(ret)
593    }
594}
595
596/// Broadcasts the low single-precision (32-bit) floating-point element
597/// from `a` to all elements of the 128-bit returned value.
598///
599/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastss_ps)
600#[inline]
601#[target_feature(enable = "avx2")]
602#[cfg_attr(test, assert_instr(vbroadcastss))]
603#[stable(feature = "simd_x86", since = "1.27.0")]
604#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
605pub const fn _mm_broadcastss_ps(a: __m128) -> __m128 {
606    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0_u32; 4]) }
607}
608
609/// Broadcasts the low single-precision (32-bit) floating-point element
610/// from `a` to all elements of the 256-bit returned value.
611///
612/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastss_ps)
613#[inline]
614#[target_feature(enable = "avx2")]
615#[cfg_attr(test, assert_instr(vbroadcastss))]
616#[stable(feature = "simd_x86", since = "1.27.0")]
617#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
618pub const fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
619    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0_u32; 8]) }
620}
621
622/// Broadcasts the low packed 16-bit integer from a to all elements of
623/// the 128-bit returned value
624///
625/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastw_epi16)
626#[inline]
627#[target_feature(enable = "avx2")]
628#[cfg_attr(test, assert_instr(vpbroadcastw))]
629#[stable(feature = "simd_x86", since = "1.27.0")]
630#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
631pub const fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
632    unsafe {
633        let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 8]);
634        transmute::<i16x8, _>(ret)
635    }
636}
637
638/// Broadcasts the low packed 16-bit integer from a to all elements of
639/// the 256-bit returned value
640///
641/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastw_epi16)
642#[inline]
643#[target_feature(enable = "avx2")]
644#[cfg_attr(test, assert_instr(vpbroadcastw))]
645#[stable(feature = "simd_x86", since = "1.27.0")]
646#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
647pub const fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
648    unsafe {
649        let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 16]);
650        transmute::<i16x16, _>(ret)
651    }
652}
653
654/// Compares packed 64-bit integers in `a` and `b` for equality.
655///
656/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi64)
657#[inline]
658#[target_feature(enable = "avx2")]
659#[cfg_attr(test, assert_instr(vpcmpeqq))]
660#[stable(feature = "simd_x86", since = "1.27.0")]
661#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
662pub const fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i {
663    unsafe { transmute::<i64x4, _>(simd_eq(a.as_i64x4(), b.as_i64x4())) }
664}
665
666/// Compares packed 32-bit integers in `a` and `b` for equality.
667///
668/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi32)
669#[inline]
670#[target_feature(enable = "avx2")]
671#[cfg_attr(test, assert_instr(vpcmpeqd))]
672#[stable(feature = "simd_x86", since = "1.27.0")]
673#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
674pub const fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i {
675    unsafe { transmute::<i32x8, _>(simd_eq(a.as_i32x8(), b.as_i32x8())) }
676}
677
678/// Compares packed 16-bit integers in `a` and `b` for equality.
679///
680/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi16)
681#[inline]
682#[target_feature(enable = "avx2")]
683#[cfg_attr(test, assert_instr(vpcmpeqw))]
684#[stable(feature = "simd_x86", since = "1.27.0")]
685#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
686pub const fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i {
687    unsafe { transmute::<i16x16, _>(simd_eq(a.as_i16x16(), b.as_i16x16())) }
688}
689
690/// Compares packed 8-bit integers in `a` and `b` for equality.
691///
692/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi8)
693#[inline]
694#[target_feature(enable = "avx2")]
695#[cfg_attr(test, assert_instr(vpcmpeqb))]
696#[stable(feature = "simd_x86", since = "1.27.0")]
697#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
698pub const fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i {
699    unsafe { transmute::<i8x32, _>(simd_eq(a.as_i8x32(), b.as_i8x32())) }
700}
701
702/// Compares packed 64-bit integers in `a` and `b` for greater-than.
703///
704/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi64)
705#[inline]
706#[target_feature(enable = "avx2")]
707#[cfg_attr(test, assert_instr(vpcmpgtq))]
708#[stable(feature = "simd_x86", since = "1.27.0")]
709#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
710pub const fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i {
711    unsafe { transmute::<i64x4, _>(simd_gt(a.as_i64x4(), b.as_i64x4())) }
712}
713
714/// Compares packed 32-bit integers in `a` and `b` for greater-than.
715///
716/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi32)
717#[inline]
718#[target_feature(enable = "avx2")]
719#[cfg_attr(test, assert_instr(vpcmpgtd))]
720#[stable(feature = "simd_x86", since = "1.27.0")]
721#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
722pub const fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i {
723    unsafe { transmute::<i32x8, _>(simd_gt(a.as_i32x8(), b.as_i32x8())) }
724}
725
726/// Compares packed 16-bit integers in `a` and `b` for greater-than.
727///
728/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi16)
729#[inline]
730#[target_feature(enable = "avx2")]
731#[cfg_attr(test, assert_instr(vpcmpgtw))]
732#[stable(feature = "simd_x86", since = "1.27.0")]
733#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
734pub const fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i {
735    unsafe { transmute::<i16x16, _>(simd_gt(a.as_i16x16(), b.as_i16x16())) }
736}
737
738/// Compares packed 8-bit integers in `a` and `b` for greater-than.
739///
740/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi8)
741#[inline]
742#[target_feature(enable = "avx2")]
743#[cfg_attr(test, assert_instr(vpcmpgtb))]
744#[stable(feature = "simd_x86", since = "1.27.0")]
745#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
746pub const fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i {
747    unsafe { transmute::<i8x32, _>(simd_gt(a.as_i8x32(), b.as_i8x32())) }
748}
749
750/// Sign-extend 16-bit integers to 32-bit integers.
751///
752/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi32)
753#[inline]
754#[target_feature(enable = "avx2")]
755#[cfg_attr(test, assert_instr(vpmovsxwd))]
756#[stable(feature = "simd_x86", since = "1.27.0")]
757#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
758pub const fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
759    unsafe { transmute::<i32x8, _>(simd_cast(a.as_i16x8())) }
760}
761
762/// Sign-extend 16-bit integers to 64-bit integers.
763///
764/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi64)
765#[inline]
766#[target_feature(enable = "avx2")]
767#[cfg_attr(test, assert_instr(vpmovsxwq))]
768#[stable(feature = "simd_x86", since = "1.27.0")]
769#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
770pub const fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i {
771    unsafe {
772        let a = a.as_i16x8();
773        let v64: i16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
774        transmute::<i64x4, _>(simd_cast(v64))
775    }
776}
777
778/// Sign-extend 32-bit integers to 64-bit integers.
779///
780/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_epi64)
781#[inline]
782#[target_feature(enable = "avx2")]
783#[cfg_attr(test, assert_instr(vpmovsxdq))]
784#[stable(feature = "simd_x86", since = "1.27.0")]
785#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
786pub const fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i {
787    unsafe { transmute::<i64x4, _>(simd_cast(a.as_i32x4())) }
788}
789
790/// Sign-extend 8-bit integers to 16-bit integers.
791///
792/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi16)
793#[inline]
794#[target_feature(enable = "avx2")]
795#[cfg_attr(test, assert_instr(vpmovsxbw))]
796#[stable(feature = "simd_x86", since = "1.27.0")]
797#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
798pub const fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i {
799    unsafe { transmute::<i16x16, _>(simd_cast(a.as_i8x16())) }
800}
801
802/// Sign-extend 8-bit integers to 32-bit integers.
803///
804/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi32)
805#[inline]
806#[target_feature(enable = "avx2")]
807#[cfg_attr(test, assert_instr(vpmovsxbd))]
808#[stable(feature = "simd_x86", since = "1.27.0")]
809#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
810pub const fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
811    unsafe {
812        let a = a.as_i8x16();
813        let v64: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
814        transmute::<i32x8, _>(simd_cast(v64))
815    }
816}
817
818/// Sign-extend 8-bit integers to 64-bit integers.
819///
820/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi64)
821#[inline]
822#[target_feature(enable = "avx2")]
823#[cfg_attr(test, assert_instr(vpmovsxbq))]
824#[stable(feature = "simd_x86", since = "1.27.0")]
825#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
826pub const fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i {
827    unsafe {
828        let a = a.as_i8x16();
829        let v32: i8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
830        transmute::<i64x4, _>(simd_cast(v32))
831    }
832}
833
834/// Zeroes extend packed unsigned 16-bit integers in `a` to packed 32-bit
835/// integers, and stores the results in `dst`.
836///
837/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi32)
838#[inline]
839#[target_feature(enable = "avx2")]
840#[cfg_attr(test, assert_instr(vpmovzxwd))]
841#[stable(feature = "simd_x86", since = "1.27.0")]
842#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
843pub const fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i {
844    unsafe { transmute::<i32x8, _>(simd_cast(a.as_u16x8())) }
845}
846
847/// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit
848/// integers. The upper four elements of `a` are unused.
849///
850/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi64)
851#[inline]
852#[target_feature(enable = "avx2")]
853#[cfg_attr(test, assert_instr(vpmovzxwq))]
854#[stable(feature = "simd_x86", since = "1.27.0")]
855#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
856pub const fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i {
857    unsafe {
858        let a = a.as_u16x8();
859        let v64: u16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
860        transmute::<i64x4, _>(simd_cast(v64))
861    }
862}
863
864/// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers.
865///
866/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu32_epi64)
867#[inline]
868#[target_feature(enable = "avx2")]
869#[cfg_attr(test, assert_instr(vpmovzxdq))]
870#[stable(feature = "simd_x86", since = "1.27.0")]
871#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
872pub const fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i {
873    unsafe { transmute::<i64x4, _>(simd_cast(a.as_u32x4())) }
874}
875
876/// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers.
877///
878/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi16)
879#[inline]
880#[target_feature(enable = "avx2")]
881#[cfg_attr(test, assert_instr(vpmovzxbw))]
882#[stable(feature = "simd_x86", since = "1.27.0")]
883#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
884pub const fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i {
885    unsafe { transmute::<i16x16, _>(simd_cast(a.as_u8x16())) }
886}
887
888/// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit
889/// integers. The upper eight elements of `a` are unused.
890///
891/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi32)
892#[inline]
893#[target_feature(enable = "avx2")]
894#[cfg_attr(test, assert_instr(vpmovzxbd))]
895#[stable(feature = "simd_x86", since = "1.27.0")]
896#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
897pub const fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
898    unsafe {
899        let a = a.as_u8x16();
900        let v64: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
901        transmute::<i32x8, _>(simd_cast(v64))
902    }
903}
904
905/// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit
906/// integers. The upper twelve elements of `a` are unused.
907///
908/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi64)
909#[inline]
910#[target_feature(enable = "avx2")]
911#[cfg_attr(test, assert_instr(vpmovzxbq))]
912#[stable(feature = "simd_x86", since = "1.27.0")]
913#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
914pub const fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i {
915    unsafe {
916        let a = a.as_u8x16();
917        let v32: u8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
918        transmute::<i64x4, _>(simd_cast(v32))
919    }
920}
921
/// Extracts 128 bits (of integer data) from `a` selected with `IMM1`:
/// the low 128 bits when `IMM1 == 0`, the high 128 bits when `IMM1 == 1`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extracti128_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_extracti128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
    static_assert_uimm_bits!(IMM1, 1); // IMM1 is a single bit: 0 or 1
    unsafe {
        let a = a.as_i64x4();
        let b = i64x4::ZERO; // second shuffle operand is unused filler
        // Select 64-bit lanes [0, 1] (low half) or [2, 3] (high half) of `a`,
        // indexed by the compile-time constant IMM1.
        let dst: i64x2 = simd_shuffle!(a, b, [[0, 1], [2, 3]][IMM1 as usize]);
        transmute(dst)
    }
}
940
/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphaddw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
    let a = a.as_i16x16();
    let b = b.as_i16x16();
    unsafe {
        // Gather the even-indexed and odd-indexed elements of each adjacent
        // pair. The index arrays interleave per 128-bit half (a-low, b-low,
        // a-high, b-high: indices 16..32 refer to `b`), matching vphaddw's
        // per-lane operation; adding `even + odd` then yields each pair sum.
        let even: i16x16 = simd_shuffle!(
            a,
            b,
            [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
        );
        let odd: i16x16 = simd_shuffle!(
            a,
            b,
            [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
        );
        simd_add(even, odd).as_m256i()
    }
}
966
/// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphaddd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
    let a = a.as_i32x8();
    let b = b.as_i32x8();
    unsafe {
        // Gather even- and odd-indexed elements of each adjacent pair,
        // interleaved per 128-bit half (a-low, b-low, a-high, b-high;
        // indices 8..16 refer to `b`) to match vphaddd's per-lane behavior.
        let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
        let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
        simd_add(even, odd).as_m256i()
    }
}
984
/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadds_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphaddsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
    // Saturating arithmetic has no portable simd_* form here, so this
    // forwards directly to the LLVM vphaddsw intrinsic.
    unsafe { transmute(phaddsw(a.as_i16x16(), b.as_i16x16())) }
}
996
/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphsubw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
    let a = a.as_i16x16();
    let b = b.as_i16x16();
    unsafe {
        // Same lane layout as _mm256_hadd_epi16 (a-low, b-low, a-high,
        // b-high per 128-bit half; indices 16..32 refer to `b`), but each
        // pair computes `even - odd` to match vphsubw.
        let even: i16x16 = simd_shuffle!(
            a,
            b,
            [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
        );
        let odd: i16x16 = simd_shuffle!(
            a,
            b,
            [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
        );
        simd_sub(even, odd).as_m256i()
    }
}
1022
/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphsubd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
    let a = a.as_i32x8();
    let b = b.as_i32x8();
    unsafe {
        // Same lane layout as _mm256_hadd_epi32 (indices 8..16 refer to
        // `b`), but each pair computes `even - odd` to match vphsubd.
        let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
        let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
        simd_sub(even, odd).as_m256i()
    }
}
1040
/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsubs_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphsubsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i {
    // Saturating arithmetic has no portable simd_* form here, so this
    // forwards directly to the LLVM vphsubsw intrinsic.
    unsafe { transmute(phsubsw(a.as_i16x16(), b.as_i16x16())) }
}
1052
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i32gather_epi32<const SCALE: i32>(
    slice: *const i32,
    offsets: __m128i,
) -> __m128i {
    // SCALE is restricted to 1, 2, 4 or 8 at compile time.
    static_assert_imm8_scale!(SCALE);
    let zero = i32x4::ZERO;
    // All-ones mask: every lane is gathered from memory.
    let neg_one = _mm_set1_epi32(-1).as_i32x4();
    let offsets = offsets.as_i32x4();
    // Byte pointer: each gathered address is `slice + offset * SCALE` bytes.
    let slice = slice as *const i8;
    let r = pgatherdd(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
1075
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Per the underlying VPGATHERDD
/// instruction, lanes whose `mask` element has its most significant bit set
/// are gathered from memory; the remaining lanes are copied from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i32gather_epi32<const SCALE: i32>(
    src: __m128i,
    slice: *const i32,
    offsets: __m128i,
    mask: __m128i,
) -> __m128i {
    // SCALE is restricted to 1, 2, 4 or 8 at compile time.
    static_assert_imm8_scale!(SCALE);
    let src = src.as_i32x4();
    let mask = mask.as_i32x4();
    let offsets = offsets.as_i32x4();
    // Byte pointer: each gathered address is `slice + offset * SCALE` bytes.
    let slice = slice as *const i8;
    let r = pgatherdd(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1101
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i32gather_epi32<const SCALE: i32>(
    slice: *const i32,
    offsets: __m256i,
) -> __m256i {
    // SCALE is restricted to 1, 2, 4 or 8 at compile time.
    static_assert_imm8_scale!(SCALE);
    let zero = i32x8::ZERO;
    // All-ones mask: every lane is gathered from memory.
    let neg_one = _mm256_set1_epi32(-1).as_i32x8();
    let offsets = offsets.as_i32x8();
    // Byte pointer: each gathered address is `slice + offset * SCALE` bytes.
    let slice = slice as *const i8;
    let r = vpgatherdd(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
1124
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Per the underlying VPGATHERDD
/// instruction, lanes whose `mask` element has its most significant bit set
/// are gathered from memory; the remaining lanes are copied from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i32gather_epi32<const SCALE: i32>(
    src: __m256i,
    slice: *const i32,
    offsets: __m256i,
    mask: __m256i,
) -> __m256i {
    // SCALE is restricted to 1, 2, 4 or 8 at compile time.
    static_assert_imm8_scale!(SCALE);
    let src = src.as_i32x8();
    let mask = mask.as_i32x8();
    let offsets = offsets.as_i32x8();
    // Byte pointer: each gathered address is `slice + offset * SCALE` bytes.
    let slice = slice as *const i8;
    let r = vpgatherdd(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1150
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i32gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m128i) -> __m128 {
    // SCALE is restricted to 1, 2, 4 or 8 at compile time.
    static_assert_imm8_scale!(SCALE);
    let zero = _mm_setzero_ps();
    // -1.0 has its sign bit set in every lane, so every lane is gathered.
    let neg_one = _mm_set1_ps(-1.0);
    let offsets = offsets.as_i32x4();
    // Byte pointer: each gathered address is `slice + offset * SCALE` bytes.
    let slice = slice as *const i8;
    pgatherdps(zero, slice, offsets, neg_one, SCALE as i8)
}
1169
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Per the underlying VGATHERDPS
/// instruction, lanes whose `mask` element has its sign bit set are
/// gathered from memory; the remaining lanes are copied from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i32gather_ps<const SCALE: i32>(
    src: __m128,
    slice: *const f32,
    offsets: __m128i,
    mask: __m128,
) -> __m128 {
    // SCALE is restricted to 1, 2, 4 or 8 at compile time.
    static_assert_imm8_scale!(SCALE);
    let offsets = offsets.as_i32x4();
    // Byte pointer: each gathered address is `slice + offset * SCALE` bytes.
    let slice = slice as *const i8;
    pgatherdps(src, slice, offsets, mask, SCALE as i8)
}
1192
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i32gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m256i) -> __m256 {
    // SCALE is restricted to 1, 2, 4 or 8 at compile time.
    static_assert_imm8_scale!(SCALE);
    let zero = _mm256_setzero_ps();
    // -1.0 has its sign bit set in every lane, so every lane is gathered.
    let neg_one = _mm256_set1_ps(-1.0);
    let offsets = offsets.as_i32x8();
    // Byte pointer: each gathered address is `slice + offset * SCALE` bytes.
    let slice = slice as *const i8;
    vpgatherdps(zero, slice, offsets, neg_one, SCALE as i8)
}
1211
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Per the underlying VGATHERDPS
/// instruction, lanes whose `mask` element has its sign bit set are
/// gathered from memory; the remaining lanes are copied from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i32gather_ps<const SCALE: i32>(
    src: __m256,
    slice: *const f32,
    offsets: __m256i,
    mask: __m256,
) -> __m256 {
    // SCALE is restricted to 1, 2, 4 or 8 at compile time.
    static_assert_imm8_scale!(SCALE);
    let offsets = offsets.as_i32x8();
    // Byte pointer: each gathered address is `slice + offset * SCALE` bytes.
    let slice = slice as *const i8;
    vpgatherdps(src, slice, offsets, mask, SCALE as i8)
}
1234
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Only the two low 32-bit offsets
/// are used for the two 64-bit result lanes.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i32gather_epi64<const SCALE: i32>(
    slice: *const i64,
    offsets: __m128i,
) -> __m128i {
    // SCALE is restricted to 1, 2, 4 or 8 at compile time.
    static_assert_imm8_scale!(SCALE);
    let zero = i64x2::ZERO;
    // All-ones mask: every lane is gathered from memory.
    let neg_one = _mm_set1_epi64x(-1).as_i64x2();
    let offsets = offsets.as_i32x4();
    // Byte pointer: each gathered address is `slice + offset * SCALE` bytes.
    let slice = slice as *const i8;
    let r = pgatherdq(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
1257
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Per the underlying VPGATHERDQ
/// instruction, lanes whose `mask` element has its most significant bit set
/// are gathered from memory; the remaining lanes are copied from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i32gather_epi64<const SCALE: i32>(
    src: __m128i,
    slice: *const i64,
    offsets: __m128i,
    mask: __m128i,
) -> __m128i {
    // SCALE is restricted to 1, 2, 4 or 8 at compile time.
    static_assert_imm8_scale!(SCALE);
    let src = src.as_i64x2();
    let mask = mask.as_i64x2();
    let offsets = offsets.as_i32x4();
    // Byte pointer: each gathered address is `slice + offset * SCALE` bytes.
    let slice = slice as *const i8;
    let r = pgatherdq(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1283
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i32gather_epi64<const SCALE: i32>(
    slice: *const i64,
    offsets: __m128i,
) -> __m256i {
    // SCALE is restricted to 1, 2, 4 or 8 at compile time.
    static_assert_imm8_scale!(SCALE);
    let zero = i64x4::ZERO;
    // All-ones mask: every lane is gathered from memory.
    let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
    let offsets = offsets.as_i32x4();
    // Byte pointer: each gathered address is `slice + offset * SCALE` bytes.
    let slice = slice as *const i8;
    let r = vpgatherdq(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
1306
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Per the underlying VPGATHERDQ
/// instruction, lanes whose `mask` element has its most significant bit set
/// are gathered from memory; the remaining lanes are copied from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i32gather_epi64<const SCALE: i32>(
    src: __m256i,
    slice: *const i64,
    offsets: __m128i,
    mask: __m256i,
) -> __m256i {
    // SCALE is restricted to 1, 2, 4 or 8 at compile time.
    static_assert_imm8_scale!(SCALE);
    let src = src.as_i64x4();
    let mask = mask.as_i64x4();
    let offsets = offsets.as_i32x4();
    // Byte pointer: each gathered address is `slice + offset * SCALE` bytes.
    let slice = slice as *const i8;
    let r = vpgatherdq(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1332
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Only the two low 32-bit offsets
/// are used for the two 64-bit result lanes.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i32gather_pd<const SCALE: i32>(slice: *const f64, offsets: __m128i) -> __m128d {
    // SCALE is restricted to 1, 2, 4 or 8 at compile time.
    static_assert_imm8_scale!(SCALE);
    let zero = _mm_setzero_pd();
    // -1.0 has its sign bit set in every lane, so every lane is gathered.
    let neg_one = _mm_set1_pd(-1.0);
    let offsets = offsets.as_i32x4();
    // Byte pointer: each gathered address is `slice + offset * SCALE` bytes.
    let slice = slice as *const i8;
    pgatherdpd(zero, slice, offsets, neg_one, SCALE as i8)
}
1351
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Per the underlying VGATHERDPD
/// instruction, lanes whose `mask` element has its sign bit set are
/// gathered from memory; the remaining lanes are copied from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i32gather_pd<const SCALE: i32>(
    src: __m128d,
    slice: *const f64,
    offsets: __m128i,
    mask: __m128d,
) -> __m128d {
    // SCALE is restricted to 1, 2, 4 or 8 at compile time.
    static_assert_imm8_scale!(SCALE);
    let offsets = offsets.as_i32x4();
    // Byte pointer: each gathered address is `slice + offset * SCALE` bytes.
    let slice = slice as *const i8;
    pgatherdpd(src, slice, offsets, mask, SCALE as i8)
}
1374
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i32gather_pd<const SCALE: i32>(
    slice: *const f64,
    offsets: __m128i,
) -> __m256d {
    // SCALE is restricted to 1, 2, 4 or 8 at compile time.
    static_assert_imm8_scale!(SCALE);
    let zero = _mm256_setzero_pd();
    // -1.0 has its sign bit set in every lane, so every lane is gathered.
    let neg_one = _mm256_set1_pd(-1.0);
    let offsets = offsets.as_i32x4();
    // Byte pointer: each gathered address is `slice + offset * SCALE` bytes.
    let slice = slice as *const i8;
    vpgatherdpd(zero, slice, offsets, neg_one, SCALE as i8)
}
1396
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Per the underlying VGATHERDPD
/// instruction, lanes whose `mask` element has its sign bit set are
/// gathered from memory; the remaining lanes are copied from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i32gather_pd<const SCALE: i32>(
    src: __m256d,
    slice: *const f64,
    offsets: __m128i,
    mask: __m256d,
) -> __m256d {
    // SCALE is restricted to 1, 2, 4 or 8 at compile time.
    static_assert_imm8_scale!(SCALE);
    let offsets = offsets.as_i32x4();
    // Byte pointer: each gathered address is `slice + offset * SCALE` bytes.
    let slice = slice as *const i8;
    vpgatherdpd(src, slice, offsets, mask, SCALE as i8)
}
1419
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. The two 64-bit offsets produce the
/// two low 32-bit result lanes.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i64gather_epi32<const SCALE: i32>(
    slice: *const i32,
    offsets: __m128i,
) -> __m128i {
    // SCALE is restricted to 1, 2, 4 or 8 at compile time.
    static_assert_imm8_scale!(SCALE);
    let zero = i32x4::ZERO;
    // All bits set (built as 64-bit lanes, viewed as i32x4): gather all lanes.
    let neg_one = _mm_set1_epi64x(-1).as_i32x4();
    let offsets = offsets.as_i64x2();
    // Byte pointer: each gathered address is `slice + offset * SCALE` bytes.
    let slice = slice as *const i8;
    let r = pgatherqd(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
1442
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Per the underlying VPGATHERQD
/// instruction, lanes whose `mask` element has its most significant bit set
/// are gathered from memory; the remaining lanes are copied from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i64gather_epi32<const SCALE: i32>(
    src: __m128i,
    slice: *const i32,
    offsets: __m128i,
    mask: __m128i,
) -> __m128i {
    // SCALE is restricted to 1, 2, 4 or 8 at compile time.
    static_assert_imm8_scale!(SCALE);
    let src = src.as_i32x4();
    let mask = mask.as_i32x4();
    let offsets = offsets.as_i64x2();
    // Byte pointer: each gathered address is `slice + offset * SCALE` bytes.
    let slice = slice as *const i8;
    let r = pgatherqd(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1468
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Four 64-bit offsets produce a
/// 128-bit result of four 32-bit lanes.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i64gather_epi32<const SCALE: i32>(
    slice: *const i32,
    offsets: __m256i,
) -> __m128i {
    // SCALE is restricted to 1, 2, 4 or 8 at compile time.
    static_assert_imm8_scale!(SCALE);
    let zero = i32x4::ZERO;
    // All bits set (built as 64-bit lanes, viewed as i32x4): gather all lanes.
    let neg_one = _mm_set1_epi64x(-1).as_i32x4();
    let offsets = offsets.as_i64x4();
    // Byte pointer: each gathered address is `slice + offset * SCALE` bytes.
    let slice = slice as *const i8;
    let r = vpgatherqd(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
1491
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Lanes whose corresponding element in
/// `mask` has its highest bit set are gathered from memory; all other lanes
/// are copied from `src` instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i64gather_epi32<const SCALE: i32>(
    src: __m128i,
    slice: *const i32,
    offsets: __m256i,
    mask: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    let src = src.as_i32x4();
    let mask = mask.as_i32x4();
    let offsets = offsets.as_i64x4();
    // Byte pointer: the instruction multiplies the offsets by `SCALE` itself.
    let slice = slice as *const i8;
    let r = vpgatherqd(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1517
1518/// Returns values from `slice` at offsets determined by `offsets * scale`,
1519/// where
1520/// `scale` should be 1, 2, 4 or 8.
1521///
1522/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_ps)
1523#[inline]
1524#[target_feature(enable = "avx2")]
1525#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1526#[rustc_legacy_const_generics(2)]
1527#[stable(feature = "simd_x86", since = "1.27.0")]
1528pub unsafe fn _mm_i64gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m128i) -> __m128 {
1529    static_assert_imm8_scale!(SCALE);
1530    let zero = _mm_setzero_ps();
1531    let neg_one = _mm_set1_ps(-1.0);
1532    let offsets = offsets.as_i64x2();
1533    let slice = slice as *const i8;
1534    pgatherqps(zero, slice, offsets, neg_one, SCALE as i8)
1535}
1536
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Lanes whose corresponding element in
/// `mask` has its highest (sign) bit set are gathered from memory; all other
/// lanes are copied from `src` instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i64gather_ps<const SCALE: i32>(
    src: __m128,
    slice: *const f32,
    offsets: __m128i,
    mask: __m128,
) -> __m128 {
    static_assert_imm8_scale!(SCALE);
    let offsets = offsets.as_i64x2();
    // Byte pointer: the instruction multiplies the offsets by `SCALE` itself.
    let slice = slice as *const i8;
    pgatherqps(src, slice, offsets, mask, SCALE as i8)
}
1559
1560/// Returns values from `slice` at offsets determined by `offsets * scale`,
1561/// where
1562/// `scale` should be 1, 2, 4 or 8.
1563///
1564/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_ps)
1565#[inline]
1566#[target_feature(enable = "avx2")]
1567#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1568#[rustc_legacy_const_generics(2)]
1569#[stable(feature = "simd_x86", since = "1.27.0")]
1570pub unsafe fn _mm256_i64gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m256i) -> __m128 {
1571    static_assert_imm8_scale!(SCALE);
1572    let zero = _mm_setzero_ps();
1573    let neg_one = _mm_set1_ps(-1.0);
1574    let offsets = offsets.as_i64x4();
1575    let slice = slice as *const i8;
1576    vpgatherqps(zero, slice, offsets, neg_one, SCALE as i8)
1577}
1578
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Lanes whose corresponding element in
/// `mask` has its highest (sign) bit set are gathered from memory; all other
/// lanes are copied from `src` instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i64gather_ps<const SCALE: i32>(
    src: __m128,
    slice: *const f32,
    offsets: __m256i,
    mask: __m128,
) -> __m128 {
    static_assert_imm8_scale!(SCALE);
    let offsets = offsets.as_i64x4();
    // Byte pointer: the instruction multiplies the offsets by `SCALE` itself.
    let slice = slice as *const i8;
    vpgatherqps(src, slice, offsets, mask, SCALE as i8)
}
1601
1602/// Returns values from `slice` at offsets determined by `offsets * scale`,
1603/// where
1604/// `scale` should be 1, 2, 4 or 8.
1605///
1606/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_epi64)
1607#[inline]
1608#[target_feature(enable = "avx2")]
1609#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1610#[rustc_legacy_const_generics(2)]
1611#[stable(feature = "simd_x86", since = "1.27.0")]
1612pub unsafe fn _mm_i64gather_epi64<const SCALE: i32>(
1613    slice: *const i64,
1614    offsets: __m128i,
1615) -> __m128i {
1616    static_assert_imm8_scale!(SCALE);
1617    let zero = i64x2::ZERO;
1618    let neg_one = _mm_set1_epi64x(-1).as_i64x2();
1619    let slice = slice as *const i8;
1620    let offsets = offsets.as_i64x2();
1621    let r = pgatherqq(zero, slice, offsets, neg_one, SCALE as i8);
1622    transmute(r)
1623}
1624
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Lanes whose corresponding element in
/// `mask` has its highest bit set are gathered from memory; all other lanes
/// are copied from `src` instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i64gather_epi64<const SCALE: i32>(
    src: __m128i,
    slice: *const i64,
    offsets: __m128i,
    mask: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    let src = src.as_i64x2();
    let mask = mask.as_i64x2();
    let offsets = offsets.as_i64x2();
    // Byte pointer: the instruction multiplies the offsets by `SCALE` itself.
    let slice = slice as *const i8;
    let r = pgatherqq(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1650
1651/// Returns values from `slice` at offsets determined by `offsets * scale`,
1652/// where
1653/// `scale` should be 1, 2, 4 or 8.
1654///
1655/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_epi64)
1656#[inline]
1657#[target_feature(enable = "avx2")]
1658#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1659#[rustc_legacy_const_generics(2)]
1660#[stable(feature = "simd_x86", since = "1.27.0")]
1661pub unsafe fn _mm256_i64gather_epi64<const SCALE: i32>(
1662    slice: *const i64,
1663    offsets: __m256i,
1664) -> __m256i {
1665    static_assert_imm8_scale!(SCALE);
1666    let zero = i64x4::ZERO;
1667    let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
1668    let slice = slice as *const i8;
1669    let offsets = offsets.as_i64x4();
1670    let r = vpgatherqq(zero, slice, offsets, neg_one, SCALE as i8);
1671    transmute(r)
1672}
1673
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Lanes whose corresponding element in
/// `mask` has its highest bit set are gathered from memory; all other lanes
/// are copied from `src` instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i64gather_epi64<const SCALE: i32>(
    src: __m256i,
    slice: *const i64,
    offsets: __m256i,
    mask: __m256i,
) -> __m256i {
    static_assert_imm8_scale!(SCALE);
    let src = src.as_i64x4();
    let mask = mask.as_i64x4();
    let offsets = offsets.as_i64x4();
    // Byte pointer: the instruction multiplies the offsets by `SCALE` itself.
    let slice = slice as *const i8;
    let r = vpgatherqq(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1699
1700/// Returns values from `slice` at offsets determined by `offsets * scale`,
1701/// where
1702/// `scale` should be 1, 2, 4 or 8.
1703///
1704/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_pd)
1705#[inline]
1706#[target_feature(enable = "avx2")]
1707#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1708#[rustc_legacy_const_generics(2)]
1709#[stable(feature = "simd_x86", since = "1.27.0")]
1710pub unsafe fn _mm_i64gather_pd<const SCALE: i32>(slice: *const f64, offsets: __m128i) -> __m128d {
1711    static_assert_imm8_scale!(SCALE);
1712    let zero = _mm_setzero_pd();
1713    let neg_one = _mm_set1_pd(-1.0);
1714    let slice = slice as *const i8;
1715    let offsets = offsets.as_i64x2();
1716    pgatherqpd(zero, slice, offsets, neg_one, SCALE as i8)
1717}
1718
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Lanes whose corresponding element in
/// `mask` has its highest (sign) bit set are gathered from memory; all other
/// lanes are copied from `src` instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i64gather_pd<const SCALE: i32>(
    src: __m128d,
    slice: *const f64,
    offsets: __m128i,
    mask: __m128d,
) -> __m128d {
    static_assert_imm8_scale!(SCALE);
    // Byte pointer: the instruction multiplies the offsets by `SCALE` itself.
    let slice = slice as *const i8;
    let offsets = offsets.as_i64x2();
    pgatherqpd(src, slice, offsets, mask, SCALE as i8)
}
1741
1742/// Returns values from `slice` at offsets determined by `offsets * scale`,
1743/// where
1744/// `scale` should be 1, 2, 4 or 8.
1745///
1746/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_pd)
1747#[inline]
1748#[target_feature(enable = "avx2")]
1749#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1750#[rustc_legacy_const_generics(2)]
1751#[stable(feature = "simd_x86", since = "1.27.0")]
1752pub unsafe fn _mm256_i64gather_pd<const SCALE: i32>(
1753    slice: *const f64,
1754    offsets: __m256i,
1755) -> __m256d {
1756    static_assert_imm8_scale!(SCALE);
1757    let zero = _mm256_setzero_pd();
1758    let neg_one = _mm256_set1_pd(-1.0);
1759    let slice = slice as *const i8;
1760    let offsets = offsets.as_i64x4();
1761    vpgatherqpd(zero, slice, offsets, neg_one, SCALE as i8)
1762}
1763
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Lanes whose corresponding element in
/// `mask` has its highest (sign) bit set are gathered from memory; all other
/// lanes are copied from `src` instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i64gather_pd<const SCALE: i32>(
    src: __m256d,
    slice: *const f64,
    offsets: __m256i,
    mask: __m256d,
) -> __m256d {
    static_assert_imm8_scale!(SCALE);
    // Byte pointer: the instruction multiplies the offsets by `SCALE` itself.
    let slice = slice as *const i8;
    let offsets = offsets.as_i64x4();
    vpgatherqpd(src, slice, offsets, mask, SCALE as i8)
}
1786
/// Copies `a` to `dst`, then insert 128 bits (of integer data) from `b` at the
/// location specified by `IMM1`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_inserti128_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_inserti128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        let a = a.as_i64x4();
        // Widen `b` to 256 bits so both shuffle inputs have the same lane
        // count; only its low two 64-bit lanes (shuffle indices 4 and 5) are
        // ever selected below.
        let b = _mm256_castsi128_si256(b).as_i64x4();
        // IMM1 == 0 replaces the low 128 bits of `a`, IMM1 == 1 the high 128.
        let dst: i64x4 = simd_shuffle!(a, b, [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]);
        transmute(dst)
    }
}
1806
/// Multiplies packed signed 16-bit integers in `a` and `b`, producing
/// intermediate signed 32-bit integers. Horizontally add adjacent pairs
/// of intermediate 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaddwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Sign-extend both inputs to 32 bits before multiplying, so each
        // 16x16 product is computed exactly in 32-bit lanes.
        let r: i32x16 = simd_mul(simd_cast(a.as_i16x16()), simd_cast(b.as_i16x16()));
        // Split the products into even/odd positions and add them pairwise,
        // matching `vpmaddwd`'s horizontal add of adjacent products.
        let even: i32x8 = simd_shuffle!(r, r, [0, 2, 4, 6, 8, 10, 12, 14]);
        let odd: i32x8 = simd_shuffle!(r, r, [1, 3, 5, 7, 9, 11, 13, 15]);
        simd_add(even, odd).as_m256i()
    }
}
1825
/// Vertically multiplies each unsigned 8-bit integer from `a` with the
/// corresponding signed 8-bit integer from `b`, producing intermediate
/// signed 16-bit integers. Horizontally add adjacent pairs of intermediate
/// signed 16-bit integers with signed saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaddubsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i {
    // Delegates to the `vpmaddubsw` intrinsic directly; the saturating
    // horizontal add has no portable simd_* equivalent.
    unsafe { transmute(pmaddubsw(a.as_u8x32(), b.as_i8x32())) }
}
1839
/// Loads packed 32-bit integers from memory pointed by `mem_addr` using `mask`
/// (elements are zeroed out when the highest bit is not set in the
/// corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i {
    // Arithmetic shift by 31 broadcasts each element's sign bit across the
    // whole lane, producing the all-ones/all-zeros mask the load expects.
    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i32x4::ZERO).as_m128i()
}
1854
/// Loads packed 32-bit integers from memory pointed by `mem_addr` using `mask`
/// (elements are zeroed out when the highest bit is not set in the
/// corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_maskload_epi32(mem_addr: *const i32, mask: __m256i) -> __m256i {
    // Arithmetic shift by 31 broadcasts each element's sign bit across the
    // whole lane, producing the all-ones/all-zeros mask the load expects.
    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i32x8::ZERO).as_m256i()
}
1869
/// Loads packed 64-bit integers from memory pointed by `mem_addr` using `mask`
/// (elements are zeroed out when the highest bit is not set in the
/// corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i {
    // Arithmetic shift by 63 broadcasts each element's sign bit across the
    // whole lane, producing the all-ones/all-zeros mask the load expects.
    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i64x2::ZERO).as_m128i()
}
1884
/// Loads packed 64-bit integers from memory pointed by `mem_addr` using `mask`
/// (elements are zeroed out when the highest bit is not set in the
/// corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_maskload_epi64(mem_addr: *const i64, mask: __m256i) -> __m256i {
    // Arithmetic shift by 63 broadcasts each element's sign bit across the
    // whole lane, producing the all-ones/all-zeros mask the load expects.
    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i64x4::ZERO).as_m256i()
}
1899
/// Stores packed 32-bit integers from `a` into memory pointed by `mem_addr`
/// using `mask` (elements are not stored when the highest bit is not set
/// in the corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) {
    // Arithmetic shift by 31 broadcasts each element's sign bit across the
    // whole lane, producing the all-ones/all-zeros mask the store expects.
    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i32x4())
}
1914
/// Stores packed 32-bit integers from `a` into memory pointed by `mem_addr`
/// using `mask` (elements are not stored when the highest bit is not set
/// in the corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_maskstore_epi32(mem_addr: *mut i32, mask: __m256i, a: __m256i) {
    // Arithmetic shift by 31 broadcasts each element's sign bit across the
    // whole lane, producing the all-ones/all-zeros mask the store expects.
    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i32x8())
}
1929
/// Stores packed 64-bit integers from `a` into memory pointed by `mem_addr`
/// using `mask` (elements are not stored when the highest bit is not set
/// in the corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) {
    // Arithmetic shift by 63 broadcasts each element's sign bit across the
    // whole lane, producing the all-ones/all-zeros mask the store expects.
    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i64x2())
}
1944
/// Stores packed 64-bit integers from `a` into memory pointed by `mem_addr`
/// using `mask` (elements are not stored when the highest bit is not set
/// in the corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_maskstore_epi64(mem_addr: *mut i64, mask: __m256i, a: __m256i) {
    // Arithmetic shift by 63 broadcasts each element's sign bit across the
    // whole lane, producing the all-ones/all-zeros mask the store expects.
    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i64x4())
}
1959
1960/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
1961/// maximum values.
1962///
1963/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi16)
1964#[inline]
1965#[target_feature(enable = "avx2")]
1966#[cfg_attr(test, assert_instr(vpmaxsw))]
1967#[stable(feature = "simd_x86", since = "1.27.0")]
1968#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1969pub const fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i {
1970    unsafe { simd_imax(a.as_i16x16(), b.as_i16x16()).as_m256i() }
1971}
1972
1973/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
1974/// maximum values.
1975///
1976/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi32)
1977#[inline]
1978#[target_feature(enable = "avx2")]
1979#[cfg_attr(test, assert_instr(vpmaxsd))]
1980#[stable(feature = "simd_x86", since = "1.27.0")]
1981#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1982pub const fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i {
1983    unsafe { simd_imax(a.as_i32x8(), b.as_i32x8()).as_m256i() }
1984}
1985
1986/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
1987/// maximum values.
1988///
1989/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi8)
1990#[inline]
1991#[target_feature(enable = "avx2")]
1992#[cfg_attr(test, assert_instr(vpmaxsb))]
1993#[stable(feature = "simd_x86", since = "1.27.0")]
1994#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1995pub const fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i {
1996    unsafe { simd_imax(a.as_i8x32(), b.as_i8x32()).as_m256i() }
1997}
1998
1999/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
2000/// the packed maximum values.
2001///
2002/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu16)
2003#[inline]
2004#[target_feature(enable = "avx2")]
2005#[cfg_attr(test, assert_instr(vpmaxuw))]
2006#[stable(feature = "simd_x86", since = "1.27.0")]
2007#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2008pub const fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i {
2009    unsafe { simd_imax(a.as_u16x16(), b.as_u16x16()).as_m256i() }
2010}
2011
2012/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
2013/// the packed maximum values.
2014///
2015/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu32)
2016#[inline]
2017#[target_feature(enable = "avx2")]
2018#[cfg_attr(test, assert_instr(vpmaxud))]
2019#[stable(feature = "simd_x86", since = "1.27.0")]
2020#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2021pub const fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i {
2022    unsafe { simd_imax(a.as_u32x8(), b.as_u32x8()).as_m256i() }
2023}
2024
2025/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
2026/// the packed maximum values.
2027///
2028/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu8)
2029#[inline]
2030#[target_feature(enable = "avx2")]
2031#[cfg_attr(test, assert_instr(vpmaxub))]
2032#[stable(feature = "simd_x86", since = "1.27.0")]
2033#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2034pub const fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i {
2035    unsafe { simd_imax(a.as_u8x32(), b.as_u8x32()).as_m256i() }
2036}
2037
2038/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
2039/// minimum values.
2040///
2041/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi16)
2042#[inline]
2043#[target_feature(enable = "avx2")]
2044#[cfg_attr(test, assert_instr(vpminsw))]
2045#[stable(feature = "simd_x86", since = "1.27.0")]
2046#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2047pub const fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i {
2048    unsafe { simd_imin(a.as_i16x16(), b.as_i16x16()).as_m256i() }
2049}
2050
2051/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
2052/// minimum values.
2053///
2054/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi32)
2055#[inline]
2056#[target_feature(enable = "avx2")]
2057#[cfg_attr(test, assert_instr(vpminsd))]
2058#[stable(feature = "simd_x86", since = "1.27.0")]
2059#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2060pub const fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i {
2061    unsafe { simd_imin(a.as_i32x8(), b.as_i32x8()).as_m256i() }
2062}
2063
2064/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
2065/// minimum values.
2066///
2067/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi8)
2068#[inline]
2069#[target_feature(enable = "avx2")]
2070#[cfg_attr(test, assert_instr(vpminsb))]
2071#[stable(feature = "simd_x86", since = "1.27.0")]
2072#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2073pub const fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i {
2074    unsafe { simd_imin(a.as_i8x32(), b.as_i8x32()).as_m256i() }
2075}
2076
2077/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
2078/// the packed minimum values.
2079///
2080/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu16)
2081#[inline]
2082#[target_feature(enable = "avx2")]
2083#[cfg_attr(test, assert_instr(vpminuw))]
2084#[stable(feature = "simd_x86", since = "1.27.0")]
2085#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2086pub const fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i {
2087    unsafe { simd_imin(a.as_u16x16(), b.as_u16x16()).as_m256i() }
2088}
2089
2090/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
2091/// the packed minimum values.
2092///
2093/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu32)
2094#[inline]
2095#[target_feature(enable = "avx2")]
2096#[cfg_attr(test, assert_instr(vpminud))]
2097#[stable(feature = "simd_x86", since = "1.27.0")]
2098#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2099pub const fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i {
2100    unsafe { simd_imin(a.as_u32x8(), b.as_u32x8()).as_m256i() }
2101}
2102
2103/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
2104/// the packed minimum values.
2105///
2106/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu8)
2107#[inline]
2108#[target_feature(enable = "avx2")]
2109#[cfg_attr(test, assert_instr(vpminub))]
2110#[stable(feature = "simd_x86", since = "1.27.0")]
2111#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2112pub const fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
2113    unsafe { simd_imin(a.as_u8x32(), b.as_u8x32()).as_m256i() }
2114}
2115
2116/// Creates mask from the most significant bit of each 8-bit element in `a`,
2117/// return the result.
2118///
2119/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_epi8)
2120#[inline]
2121#[target_feature(enable = "avx2")]
2122#[cfg_attr(test, assert_instr(vpmovmskb))]
2123#[stable(feature = "simd_x86", since = "1.27.0")]
2124#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2125pub const fn _mm256_movemask_epi8(a: __m256i) -> i32 {
2126    unsafe {
2127        let z = i8x32::ZERO;
2128        let m: i8x32 = simd_lt(a.as_i8x32(), z);
2129        simd_bitmask::<_, u32>(m) as i32
2130    }
2131}
2132
/// Computes the sum of absolute differences (SADs) of quadruplets of unsigned
/// 8-bit integers in `a` compared to those in `b`, and stores the 16-bit
/// results in dst. Eight SADs are performed for each 128-bit lane using one
/// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
/// selected from `b` starting at on the offset specified in `imm8`. Eight
/// quadruplets are formed from sequential 8-bit integers selected from `a`
/// starting at the offset specified in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mpsadbw_epu8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vmpsadbw, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_mpsadbw_epu8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // Delegates to the `vmpsadbw` intrinsic; `IMM8` encodes the per-lane
    // quadruplet offsets for both operands.
    unsafe { transmute(mpsadbw(a.as_u8x32(), b.as_u8x32(), IMM8 as i8)) }
}
2151
/// Multiplies the low 32-bit integers from each packed 64-bit element in
/// `a` and `b`
///
/// Returns the 64-bit results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmuldq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Truncate each 64-bit lane to its low 32 bits, then sign-extend back
        // to 64 bits; the full-width multiply below then cannot overflow.
        let a = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(a.as_i64x4()));
        let b = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(b.as_i64x4()));
        transmute(simd_mul(a, b))
    }
}
2170
/// Multiplies the low unsigned 32-bit integers from each packed 64-bit
/// element in `a` and `b`
///
/// Returns the unsigned 64-bit results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epu32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmuludq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        let a = a.as_u64x4();
        let b = b.as_u64x4();
        // Zero the high 32 bits of each 64-bit lane; the 32x32 -> 64-bit
        // product of the low halves then fits without wrapping.
        let mask = u64x4::splat(u32::MAX as u64);
        transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
    }
}
2190
/// Multiplies the packed 16-bit integers in `a` and `b`, producing
/// intermediate 32-bit integers and returning the high 16 bits of the
/// intermediate integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmulhw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Sign-extend lanes to 32 bits, take the full product, then an
        // arithmetic shift right by 16 leaves the high half of each product.
        let a = simd_cast::<_, i32x16>(a.as_i16x16());
        let b = simd_cast::<_, i32x16>(b.as_i16x16());
        let r = simd_shr(simd_mul(a, b), i32x16::splat(16));
        // Truncating cast back to 16-bit lanes keeps exactly those high halves.
        transmute(simd_cast::<i32x16, i16x16>(r))
    }
}
2209
/// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing
/// intermediate 32-bit integers and returning the high 16 bits of the
/// intermediate integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epu16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmulhuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Same scheme as the signed variant, but with zero-extension and a
        // logical shift: widen, multiply, keep the high 16 bits.
        let a = simd_cast::<_, u32x16>(a.as_u16x16());
        let b = simd_cast::<_, u32x16>(b.as_u16x16());
        let r = simd_shr(simd_mul(a, b), u32x16::splat(16));
        transmute(simd_cast::<u32x16, u16x16>(r))
    }
}
2228
2229/// Multiplies the packed 16-bit integers in `a` and `b`, producing
2230/// intermediate 32-bit integers, and returns the low 16 bits of the
2231/// intermediate integers
2232///
2233/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi16)
2234#[inline]
2235#[target_feature(enable = "avx2")]
2236#[cfg_attr(test, assert_instr(vpmullw))]
2237#[stable(feature = "simd_x86", since = "1.27.0")]
2238#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2239pub const fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i {
2240    unsafe { transmute(simd_mul(a.as_i16x16(), b.as_i16x16())) }
2241}
2242
2243/// Multiplies the packed 32-bit integers in `a` and `b`, producing
2244/// intermediate 64-bit integers, and returns the low 32 bits of the
2245/// intermediate integers
2246///
2247/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi32)
2248#[inline]
2249#[target_feature(enable = "avx2")]
2250#[cfg_attr(test, assert_instr(vpmulld))]
2251#[stable(feature = "simd_x86", since = "1.27.0")]
2252#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2253pub const fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i {
2254    unsafe { transmute(simd_mul(a.as_i32x8(), b.as_i32x8())) }
2255}
2256
/// Multiplies packed 16-bit integers in `a` and `b`, producing
/// intermediate signed 32-bit integers. Truncate each intermediate
/// integer to the 18 most significant bits, round by adding 1, and
/// return bits `[16:1]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhrs_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmulhrsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i {
    // The round-and-scale behavior has no portable `simd_*` equivalent, so
    // this delegates to the LLVM `pmulhrsw` intrinsic directly.
    unsafe { transmute(pmulhrsw(a.as_i16x16(), b.as_i16x16())) }
}
2270
2271/// Computes the bitwise OR of 256 bits (representing integer data) in `a`
2272/// and `b`
2273///
2274/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_si256)
2275#[inline]
2276#[target_feature(enable = "avx2")]
2277#[cfg_attr(test, assert_instr(vorps))]
2278#[stable(feature = "simd_x86", since = "1.27.0")]
2279#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2280pub const fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
2281    unsafe { transmute(simd_or(a.as_i32x8(), b.as_i32x8())) }
2282}
2283
/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
/// using signed saturation
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpacksswb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i {
    // Saturating pack has no portable `simd_*` form; use the LLVM intrinsic.
    unsafe { transmute(packsswb(a.as_i16x16(), b.as_i16x16())) }
}
2295
/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using signed saturation
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpackssdw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i {
    // Saturating pack has no portable `simd_*` form; use the LLVM intrinsic.
    unsafe { transmute(packssdw(a.as_i32x8(), b.as_i32x8())) }
}
2307
/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
/// using unsigned saturation
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpackuswb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i {
    // Saturating pack has no portable `simd_*` form; use the LLVM intrinsic.
    unsafe { transmute(packuswb(a.as_i16x16(), b.as_i16x16())) }
}
2319
/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using unsigned saturation
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpackusdw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i {
    // Saturating pack has no portable `simd_*` form; use the LLVM intrinsic.
    unsafe { transmute(packusdw(a.as_i32x8(), b.as_i32x8())) }
}
2331
/// Permutes packed 32-bit integers from `a` according to the content of `b`.
///
/// The last 3 bits of each integer of `b` are used as addresses into the 8
/// integers of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpermps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i {
    // Runtime-variable indices cannot use `simd_shuffle!` (indices must be
    // const), so delegate to the LLVM `permd` intrinsic.
    unsafe { transmute(permd(a.as_u32x8(), b.as_u32x8())) }
}
2345
/// Permutes 64-bit integers from `a` using control mask `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpermpd, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_permute4x64_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // Second shuffle operand is never selected (all indices are < 4);
        // a zero vector serves as the required placeholder.
        let zero = i64x4::ZERO;
        // Each 2-bit field of the immediate selects the source lane for one
        // destination lane.
        let r: i64x4 = simd_shuffle!(
            a.as_i64x4(),
            zero,
            [
                IMM8 as u32 & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
            ],
        );
        transmute(r)
    }
}
2372
/// Shuffles 128-bits of integer data selected by `imm8` from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_permute2x128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // The AVX2 integer form is operationally identical to the AVX
    // floating-point-domain version; reuse it.
    _mm256_permute2f128_si256::<IMM8>(a, b)
}
2386
/// Shuffles 64-bit floating-point elements in `a` across lanes using the
/// control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpermpd, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_permute4x64_pd<const IMM8: i32>(a: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // The second operand is a placeholder only — every index is < 4, so
        // no lane of the undefined vector is ever selected.
        simd_shuffle!(
            a,
            _mm256_undefined_pd(),
            [
                IMM8 as u32 & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
            ],
        )
    }
}
2412
/// Shuffles eight 32-bit floating-point elements in `a` across lanes using
/// the corresponding 32-bit integer index in `idx`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpermps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 {
    // Runtime-variable indices require the LLVM `permps` intrinsic.
    unsafe { permps(a, idx.as_i32x8()) }
}
2424
/// Computes the absolute differences of packed unsigned 8-bit integers in `a`
/// and `b`, then horizontally sums each consecutive 8 differences to
/// produce four unsigned 16-bit integers, and packs these unsigned 16-bit
/// integers in the low 16 bits of the four 64-bit elements of the return
/// value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsadbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i {
    // Horizontal sum-of-absolute-differences maps directly to `vpsadbw`.
    unsafe { transmute(psadbw(a.as_u8x32(), b.as_u8x32())) }
}
2438
/// Shuffles bytes from `a` according to the content of `b`.
///
/// For each of the 128-bit low and high halves of the vectors, the last
/// 4 bits of each byte of `b` are used as addresses into the respective
/// low or high 16 bytes of `a`. That is, the halves are shuffled separately.
///
/// In addition, if the highest significant bit of a byte of `b` is set, the
/// respective destination byte is set to 0.
///
/// Picturing `a` and `b` as `[u8; 32]`, `_mm256_shuffle_epi8` is logically
/// equivalent to:
///
/// ```
/// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] {
///     let mut r = [0; 32];
///     for i in 0..16 {
///         // if the most significant bit of b is set,
///         // then the destination byte is set to 0.
///         if b[i] & 0x80 == 0u8 {
///             r[i] = a[(b[i] % 16) as usize];
///         }
///         if b[i + 16] & 0x80 == 0u8 {
///             r[i + 16] = a[(b[i + 16] % 16 + 16) as usize];
///         }
///     }
///     r
/// }
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpshufb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i {
    // Runtime-variable byte shuffles require the LLVM `pshufb` intrinsic.
    unsafe { transmute(pshufb(a.as_u8x32(), b.as_u8x32())) }
}
2476
/// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in
/// `imm8`.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
///
/// let c1 = _mm256_shuffle_epi32(a, 0b00_11_10_01);
/// let c2 = _mm256_shuffle_epi32(a, 0b01_00_10_11);
///
/// let expected1 = _mm256_setr_epi32(1, 2, 3, 0, 5, 6, 7, 4);
/// let expected2 = _mm256_setr_epi32(3, 2, 0, 1, 7, 6, 4, 5);
///
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c1, expected1)), !0);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c2, expected2)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vshufps, MASK = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_shuffle_epi32<const MASK: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(MASK, 8);
    unsafe {
        // The same 2-bit selectors are applied to both 128-bit lanes:
        // indices 0..4 address the low lane, the `+ 4` copies address the
        // high lane.
        let r: i32x8 = simd_shuffle!(
            a.as_i32x8(),
            a.as_i32x8(),
            [
                MASK as u32 & 0b11,
                (MASK as u32 >> 2) & 0b11,
                (MASK as u32 >> 4) & 0b11,
                (MASK as u32 >> 6) & 0b11,
                (MASK as u32 & 0b11) + 4,
                ((MASK as u32 >> 2) & 0b11) + 4,
                ((MASK as u32 >> 4) & 0b11) + 4,
                ((MASK as u32 >> 6) & 0b11) + 4,
            ],
        );
        transmute(r)
    }
}
2533
/// Shuffles 16-bit integers in the high 64 bits of 128-bit lanes of `a` using
/// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied
/// to the output.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflehi_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_shufflehi_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i16x16();
        // Indices 0..3 and 8..11 (low quadword of each 128-bit lane) pass
        // through unchanged; the `4 +` / `12 +` entries apply the 2-bit
        // selectors from the immediate within each lane's high quadword.
        let r: i16x16 = simd_shuffle!(
            a,
            a,
            [
                0,
                1,
                2,
                3,
                4 + (IMM8 as u32 & 0b11),
                4 + ((IMM8 as u32 >> 2) & 0b11),
                4 + ((IMM8 as u32 >> 4) & 0b11),
                4 + ((IMM8 as u32 >> 6) & 0b11),
                8,
                9,
                10,
                11,
                12 + (IMM8 as u32 & 0b11),
                12 + ((IMM8 as u32 >> 2) & 0b11),
                12 + ((IMM8 as u32 >> 4) & 0b11),
                12 + ((IMM8 as u32 >> 6) & 0b11),
            ],
        );
        transmute(r)
    }
}
2574
/// Shuffles 16-bit integers in the low 64 bits of 128-bit lanes of `a` using
/// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied
/// to the output.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflelo_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_shufflelo_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i16x16();
        // Mirror of `_mm256_shufflehi_epi16`: the `0 +` / `8 +` entries apply
        // the immediate's 2-bit selectors within each lane's low quadword,
        // while indices 4..7 and 12..15 (high quadwords) pass through.
        let r: i16x16 = simd_shuffle!(
            a,
            a,
            [
                0 + (IMM8 as u32 & 0b11),
                0 + ((IMM8 as u32 >> 2) & 0b11),
                0 + ((IMM8 as u32 >> 4) & 0b11),
                0 + ((IMM8 as u32 >> 6) & 0b11),
                4,
                5,
                6,
                7,
                8 + (IMM8 as u32 & 0b11),
                8 + ((IMM8 as u32 >> 2) & 0b11),
                8 + ((IMM8 as u32 >> 4) & 0b11),
                8 + ((IMM8 as u32 >> 6) & 0b11),
                12,
                13,
                14,
                15,
            ],
        );
        transmute(r)
    }
}
2615
/// Negates packed 16-bit integers in `a` when the corresponding signed
/// 16-bit integer in `b` is negative, and returns the results.
/// Results are zeroed out when the corresponding element in `b` is zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsignw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i {
    // Conditional negate/zero maps directly to the LLVM `psignw` intrinsic.
    unsafe { transmute(psignw(a.as_i16x16(), b.as_i16x16())) }
}
2628
/// Negates packed 32-bit integers in `a` when the corresponding signed
/// 32-bit integer in `b` is negative, and returns the results.
/// Results are zeroed out when the corresponding element in `b` is zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsignd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i {
    // Conditional negate/zero maps directly to the LLVM `psignd` intrinsic.
    unsafe { transmute(psignd(a.as_i32x8(), b.as_i32x8())) }
}
2641
/// Negates packed 8-bit integers in `a` when the corresponding signed
/// 8-bit integer in `b` is negative, and returns the results.
/// Results are zeroed out when the corresponding element in `b` is zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsignb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i {
    // Conditional negate/zero maps directly to the LLVM `psignb` intrinsic.
    unsafe { transmute(psignb(a.as_i8x32(), b.as_i8x32())) }
}
2654
/// Shifts packed 16-bit integers in `a` left by `count` while
/// shifting in zeros, and returns the result
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsllw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i {
    // All lanes shift by the scalar count in `count`; the hardware behavior
    // (including zeroing for large counts) lives in the `psllw` intrinsic.
    unsafe { transmute(psllw(a.as_i16x16(), count.as_i16x8())) }
}
2666
/// Shifts packed 32-bit integers in `a` left by `count` while
/// shifting in zeros, and returns the result
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpslld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i {
    // All lanes shift by the scalar count in `count`; the hardware behavior
    // (including zeroing for large counts) lives in the `pslld` intrinsic.
    unsafe { transmute(pslld(a.as_i32x8(), count.as_i32x4())) }
}
2678
/// Shifts packed 64-bit integers in `a` left by `count` while
/// shifting in zeros, and returns the result
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsllq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i {
    // All lanes shift by the scalar count in `count`; the hardware behavior
    // (including zeroing for large counts) lives in the `psllq` intrinsic.
    unsafe { transmute(psllq(a.as_i64x4(), count.as_i64x2())) }
}
2690
/// Shifts packed 16-bit integers in `a` left by `IMM8` while
/// shifting in zeros, return the results;
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_slli_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // Shift counts >= lane width zero the result (hardware semantics);
        // `simd_shl` by >= 16 would be UB, so that case is handled explicitly.
        if IMM8 >= 16 {
            _mm256_setzero_si256()
        } else {
            transmute(simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16)))
        }
    }
}
2711
2712/// Shifts packed 32-bit integers in `a` left by `IMM8` while
2713/// shifting in zeros, return the results;
2714///
2715/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi32)
2716#[inline]
2717#[target_feature(enable = "avx2")]
2718#[cfg_attr(test, assert_instr(vpslld, IMM8 = 7))]
2719#[rustc_legacy_const_generics(1)]
2720#[stable(feature = "simd_x86", since = "1.27.0")]
2721#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2722pub const fn _mm256_slli_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
2723    unsafe {
2724        static_assert_uimm_bits!(IMM8, 8);
2725        if IMM8 >= 32 {
2726            _mm256_setzero_si256()
2727        } else {
2728            transmute(simd_shl(a.as_u32x8(), u32x8::splat(IMM8 as u32)))
2729        }
2730    }
2731}
2732
2733/// Shifts packed 64-bit integers in `a` left by `IMM8` while
2734/// shifting in zeros, return the results;
2735///
2736/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi64)
2737#[inline]
2738#[target_feature(enable = "avx2")]
2739#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 7))]
2740#[rustc_legacy_const_generics(1)]
2741#[stable(feature = "simd_x86", since = "1.27.0")]
2742#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2743pub const fn _mm256_slli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
2744    unsafe {
2745        static_assert_uimm_bits!(IMM8, 8);
2746        if IMM8 >= 64 {
2747            _mm256_setzero_si256()
2748        } else {
2749            transmute(simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64)))
2750        }
2751    }
2752}
2753
/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_slli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // `_mm256_slli_si256` and `_mm256_bslli_epi128` are two names for the
    // same operation; forward to the shared implementation.
    _mm256_bslli_epi128::<IMM8>(a)
}
2767
/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bslli_epi128)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_bslli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // Computes the `simd_shuffle!` source index for output byte `i`:
    // index 0 selects a byte of the zero vector (first operand), while
    // `32 + k` selects byte `k` of `a` (second operand). Bytes shifted in
    // from below, or any shift of 16+ bytes, read from the zero vector;
    // `i % 16` keeps the shift confined within each 128-bit lane.
    const fn mask(shift: i32, i: u32) -> u32 {
        let shift = shift as u32 & 0xff;
        if shift > 15 || i % 16 < shift {
            0
        } else {
            32 + (i - shift)
        }
    }
    unsafe {
        let a = a.as_i8x32();
        let r: i8x32 = simd_shuffle!(
            i8x32::ZERO,
            a,
            [
                mask(IMM8, 0),
                mask(IMM8, 1),
                mask(IMM8, 2),
                mask(IMM8, 3),
                mask(IMM8, 4),
                mask(IMM8, 5),
                mask(IMM8, 6),
                mask(IMM8, 7),
                mask(IMM8, 8),
                mask(IMM8, 9),
                mask(IMM8, 10),
                mask(IMM8, 11),
                mask(IMM8, 12),
                mask(IMM8, 13),
                mask(IMM8, 14),
                mask(IMM8, 15),
                mask(IMM8, 16),
                mask(IMM8, 17),
                mask(IMM8, 18),
                mask(IMM8, 19),
                mask(IMM8, 20),
                mask(IMM8, 21),
                mask(IMM8, 22),
                mask(IMM8, 23),
                mask(IMM8, 24),
                mask(IMM8, 25),
                mask(IMM8, 26),
                mask(IMM8, 27),
                mask(IMM8, 28),
                mask(IMM8, 29),
                mask(IMM8, 30),
                mask(IMM8, 31),
            ],
        );
        transmute(r)
    }
}
2830
2831/// Shifts packed 32-bit integers in `a` left by the amount
2832/// specified by the corresponding element in `count` while
2833/// shifting in zeros, and returns the result.
2834///
2835/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi32)
2836#[inline]
2837#[target_feature(enable = "avx2")]
2838#[cfg_attr(test, assert_instr(vpsllvd))]
2839#[stable(feature = "simd_x86", since = "1.27.0")]
2840#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2841pub const fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i {
2842    unsafe {
2843        let count = count.as_u32x4();
2844        let no_overflow: u32x4 = simd_lt(count, u32x4::splat(u32::BITS));
2845        let count = simd_select(no_overflow, count, u32x4::ZERO);
2846        simd_select(no_overflow, simd_shl(a.as_u32x4(), count), u32x4::ZERO).as_m128i()
2847    }
2848}
2849
2850/// Shifts packed 32-bit integers in `a` left by the amount
2851/// specified by the corresponding element in `count` while
2852/// shifting in zeros, and returns the result.
2853///
2854/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi32)
2855#[inline]
2856#[target_feature(enable = "avx2")]
2857#[cfg_attr(test, assert_instr(vpsllvd))]
2858#[stable(feature = "simd_x86", since = "1.27.0")]
2859#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2860pub const fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i {
2861    unsafe {
2862        let count = count.as_u32x8();
2863        let no_overflow: u32x8 = simd_lt(count, u32x8::splat(u32::BITS));
2864        let count = simd_select(no_overflow, count, u32x8::ZERO);
2865        simd_select(no_overflow, simd_shl(a.as_u32x8(), count), u32x8::ZERO).as_m256i()
2866    }
2867}
2868
2869/// Shifts packed 64-bit integers in `a` left by the amount
2870/// specified by the corresponding element in `count` while
2871/// shifting in zeros, and returns the result.
2872///
2873/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi64)
2874#[inline]
2875#[target_feature(enable = "avx2")]
2876#[cfg_attr(test, assert_instr(vpsllvq))]
2877#[stable(feature = "simd_x86", since = "1.27.0")]
2878#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2879pub const fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i {
2880    unsafe {
2881        let count = count.as_u64x2();
2882        let no_overflow: u64x2 = simd_lt(count, u64x2::splat(u64::BITS as u64));
2883        let count = simd_select(no_overflow, count, u64x2::ZERO);
2884        simd_select(no_overflow, simd_shl(a.as_u64x2(), count), u64x2::ZERO).as_m128i()
2885    }
2886}
2887
2888/// Shifts packed 64-bit integers in `a` left by the amount
2889/// specified by the corresponding element in `count` while
2890/// shifting in zeros, and returns the result.
2891///
2892/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi64)
2893#[inline]
2894#[target_feature(enable = "avx2")]
2895#[cfg_attr(test, assert_instr(vpsllvq))]
2896#[stable(feature = "simd_x86", since = "1.27.0")]
2897#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2898pub const fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i {
2899    unsafe {
2900        let count = count.as_u64x4();
2901        let no_overflow: u64x4 = simd_lt(count, u64x4::splat(u64::BITS as u64));
2902        let count = simd_select(no_overflow, count, u64x4::ZERO);
2903        simd_select(no_overflow, simd_shl(a.as_u64x4(), count), u64x4::ZERO).as_m256i()
2904    }
2905}
2906
/// Shifts packed 16-bit integers in `a` right by `count` while
/// shifting in sign bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsraw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i {
    // All lanes shift by the scalar count in `count`; saturation of large
    // counts is handled by the `psraw` intrinsic's hardware semantics.
    unsafe { transmute(psraw(a.as_i16x16(), count.as_i16x8())) }
}
2918
/// Shifts packed 32-bit integers in `a` right by `count` while
/// shifting in sign bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsrad))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i {
    // All lanes shift by the scalar count in `count`; saturation of large
    // counts is handled by the `psrad` intrinsic's hardware semantics.
    unsafe { transmute(psrad(a.as_i32x8(), count.as_i32x4())) }
}
2930
2931/// Shifts packed 16-bit integers in `a` right by `IMM8` while
2932/// shifting in sign bits.
2933///
2934/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi16)
2935#[inline]
2936#[target_feature(enable = "avx2")]
2937#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 7))]
2938#[rustc_legacy_const_generics(1)]
2939#[stable(feature = "simd_x86", since = "1.27.0")]
2940#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2941pub const fn _mm256_srai_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2942    static_assert_uimm_bits!(IMM8, 8);
2943    unsafe { transmute(simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16))) }
2944}
2945
2946/// Shifts packed 32-bit integers in `a` right by `IMM8` while
2947/// shifting in sign bits.
2948///
2949/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi32)
2950#[inline]
2951#[target_feature(enable = "avx2")]
2952#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 7))]
2953#[rustc_legacy_const_generics(1)]
2954#[stable(feature = "simd_x86", since = "1.27.0")]
2955#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2956pub const fn _mm256_srai_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
2957    static_assert_uimm_bits!(IMM8, 8);
2958    unsafe { transmute(simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31)))) }
2959}
2960
2961/// Shifts packed 32-bit integers in `a` right by the amount specified by the
2962/// corresponding element in `count` while shifting in sign bits.
2963///
2964/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srav_epi32)
2965#[inline]
2966#[target_feature(enable = "avx2")]
2967#[cfg_attr(test, assert_instr(vpsravd))]
2968#[stable(feature = "simd_x86", since = "1.27.0")]
2969#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2970pub const fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i {
2971    unsafe {
2972        let count = count.as_u32x4();
2973        let no_overflow: u32x4 = simd_lt(count, u32x4::splat(u32::BITS));
2974        let count = simd_select(no_overflow, transmute(count), i32x4::splat(31));
2975        simd_shr(a.as_i32x4(), count).as_m128i()
2976    }
2977}
2978
2979/// Shifts packed 32-bit integers in `a` right by the amount specified by the
2980/// corresponding element in `count` while shifting in sign bits.
2981///
2982/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srav_epi32)
2983#[inline]
2984#[target_feature(enable = "avx2")]
2985#[cfg_attr(test, assert_instr(vpsravd))]
2986#[stable(feature = "simd_x86", since = "1.27.0")]
2987#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2988pub const fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i {
2989    unsafe {
2990        let count = count.as_u32x8();
2991        let no_overflow: u32x8 = simd_lt(count, u32x8::splat(u32::BITS));
2992        let count = simd_select(no_overflow, transmute(count), i32x8::splat(31));
2993        simd_shr(a.as_i32x8(), count).as_m256i()
2994    }
2995}
2996
/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
///
/// Each of the two 128-bit lanes is shifted independently (see the per-lane
/// index computation in [`_mm256_bsrli_epi128`]).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_srli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // Identical operation under a second Intel name; both lower to `vpsrldq`.
    _mm256_bsrli_epi128::<IMM8>(a)
}
3010
/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // Computes the shuffle index for result byte `i` (0..32).
    //
    // The `simd_shuffle!` below draws from a 64-element view: indices 0..32
    // are bytes of an all-zero vector, indices 32..64 are bytes of `a`.
    // Result byte `i` is `a`'s byte `i + shift` (index `32 + (i + shift)`)
    // while that source byte stays within the same 128-bit lane as `i`;
    // otherwise — the source would cross the lane boundary, or `shift > 15`
    // empties the lane entirely — index 0 selects a zero byte instead.
    const fn mask(shift: i32, i: u32) -> u32 {
        let shift = shift as u32 & 0xff;
        if shift > 15 || (15 - (i % 16)) < shift {
            0
        } else {
            32 + (i + shift)
        }
    }
    unsafe {
        let a = a.as_i8x32();
        let r: i8x32 = simd_shuffle!(
            i8x32::ZERO,
            a,
            [
                mask(IMM8, 0),
                mask(IMM8, 1),
                mask(IMM8, 2),
                mask(IMM8, 3),
                mask(IMM8, 4),
                mask(IMM8, 5),
                mask(IMM8, 6),
                mask(IMM8, 7),
                mask(IMM8, 8),
                mask(IMM8, 9),
                mask(IMM8, 10),
                mask(IMM8, 11),
                mask(IMM8, 12),
                mask(IMM8, 13),
                mask(IMM8, 14),
                mask(IMM8, 15),
                mask(IMM8, 16),
                mask(IMM8, 17),
                mask(IMM8, 18),
                mask(IMM8, 19),
                mask(IMM8, 20),
                mask(IMM8, 21),
                mask(IMM8, 22),
                mask(IMM8, 23),
                mask(IMM8, 24),
                mask(IMM8, 25),
                mask(IMM8, 26),
                mask(IMM8, 27),
                mask(IMM8, 28),
                mask(IMM8, 29),
                mask(IMM8, 30),
                mask(IMM8, 31),
            ],
        );
        transmute(r)
    }
}
3073
3074/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
3075/// zeros.
3076///
3077/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi16)
3078#[inline]
3079#[target_feature(enable = "avx2")]
3080#[cfg_attr(test, assert_instr(vpsrlw))]
3081#[stable(feature = "simd_x86", since = "1.27.0")]
3082pub fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i {
3083    unsafe { transmute(psrlw(a.as_i16x16(), count.as_i16x8())) }
3084}
3085
3086/// Shifts packed 32-bit integers in `a` right by `count` while shifting in
3087/// zeros.
3088///
3089/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi32)
3090#[inline]
3091#[target_feature(enable = "avx2")]
3092#[cfg_attr(test, assert_instr(vpsrld))]
3093#[stable(feature = "simd_x86", since = "1.27.0")]
3094pub fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i {
3095    unsafe { transmute(psrld(a.as_i32x8(), count.as_i32x4())) }
3096}
3097
3098/// Shifts packed 64-bit integers in `a` right by `count` while shifting in
3099/// zeros.
3100///
3101/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi64)
3102#[inline]
3103#[target_feature(enable = "avx2")]
3104#[cfg_attr(test, assert_instr(vpsrlq))]
3105#[stable(feature = "simd_x86", since = "1.27.0")]
3106pub fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i {
3107    unsafe { transmute(psrlq(a.as_i64x4(), count.as_i64x2())) }
3108}
3109
3110/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in
3111/// zeros
3112///
3113/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi16)
3114#[inline]
3115#[target_feature(enable = "avx2")]
3116#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 7))]
3117#[rustc_legacy_const_generics(1)]
3118#[stable(feature = "simd_x86", since = "1.27.0")]
3119#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3120pub const fn _mm256_srli_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
3121    static_assert_uimm_bits!(IMM8, 8);
3122    unsafe {
3123        if IMM8 >= 16 {
3124            _mm256_setzero_si256()
3125        } else {
3126            transmute(simd_shr(a.as_u16x16(), u16x16::splat(IMM8 as u16)))
3127        }
3128    }
3129}
3130
3131/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in
3132/// zeros
3133///
3134/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi32)
3135#[inline]
3136#[target_feature(enable = "avx2")]
3137#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 7))]
3138#[rustc_legacy_const_generics(1)]
3139#[stable(feature = "simd_x86", since = "1.27.0")]
3140#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3141pub const fn _mm256_srli_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
3142    static_assert_uimm_bits!(IMM8, 8);
3143    unsafe {
3144        if IMM8 >= 32 {
3145            _mm256_setzero_si256()
3146        } else {
3147            transmute(simd_shr(a.as_u32x8(), u32x8::splat(IMM8 as u32)))
3148        }
3149    }
3150}
3151
3152/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in
3153/// zeros
3154///
3155/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi64)
3156#[inline]
3157#[target_feature(enable = "avx2")]
3158#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 7))]
3159#[rustc_legacy_const_generics(1)]
3160#[stable(feature = "simd_x86", since = "1.27.0")]
3161#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3162pub const fn _mm256_srli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
3163    static_assert_uimm_bits!(IMM8, 8);
3164    unsafe {
3165        if IMM8 >= 64 {
3166            _mm256_setzero_si256()
3167        } else {
3168            transmute(simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64)))
3169        }
3170    }
3171}
3172
3173/// Shifts packed 32-bit integers in `a` right by the amount specified by
3174/// the corresponding element in `count` while shifting in zeros,
3175///
3176/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi32)
3177#[inline]
3178#[target_feature(enable = "avx2")]
3179#[cfg_attr(test, assert_instr(vpsrlvd))]
3180#[stable(feature = "simd_x86", since = "1.27.0")]
3181#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3182pub const fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i {
3183    unsafe {
3184        let count = count.as_u32x4();
3185        let no_overflow: u32x4 = simd_lt(count, u32x4::splat(u32::BITS));
3186        let count = simd_select(no_overflow, count, u32x4::ZERO);
3187        simd_select(no_overflow, simd_shr(a.as_u32x4(), count), u32x4::ZERO).as_m128i()
3188    }
3189}
3190
3191/// Shifts packed 32-bit integers in `a` right by the amount specified by
3192/// the corresponding element in `count` while shifting in zeros,
3193///
3194/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi32)
3195#[inline]
3196#[target_feature(enable = "avx2")]
3197#[cfg_attr(test, assert_instr(vpsrlvd))]
3198#[stable(feature = "simd_x86", since = "1.27.0")]
3199#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3200pub const fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i {
3201    unsafe {
3202        let count = count.as_u32x8();
3203        let no_overflow: u32x8 = simd_lt(count, u32x8::splat(u32::BITS));
3204        let count = simd_select(no_overflow, count, u32x8::ZERO);
3205        simd_select(no_overflow, simd_shr(a.as_u32x8(), count), u32x8::ZERO).as_m256i()
3206    }
3207}
3208
3209/// Shifts packed 64-bit integers in `a` right by the amount specified by
3210/// the corresponding element in `count` while shifting in zeros,
3211///
3212/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi64)
3213#[inline]
3214#[target_feature(enable = "avx2")]
3215#[cfg_attr(test, assert_instr(vpsrlvq))]
3216#[stable(feature = "simd_x86", since = "1.27.0")]
3217#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3218pub const fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i {
3219    unsafe {
3220        let count = count.as_u64x2();
3221        let no_overflow: u64x2 = simd_lt(count, u64x2::splat(u64::BITS as u64));
3222        let count = simd_select(no_overflow, count, u64x2::ZERO);
3223        simd_select(no_overflow, simd_shr(a.as_u64x2(), count), u64x2::ZERO).as_m128i()
3224    }
3225}
3226
3227/// Shifts packed 64-bit integers in `a` right by the amount specified by
3228/// the corresponding element in `count` while shifting in zeros,
3229///
3230/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi64)
3231#[inline]
3232#[target_feature(enable = "avx2")]
3233#[cfg_attr(test, assert_instr(vpsrlvq))]
3234#[stable(feature = "simd_x86", since = "1.27.0")]
3235#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3236pub const fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
3237    unsafe {
3238        let count = count.as_u64x4();
3239        let no_overflow: u64x4 = simd_lt(count, u64x4::splat(u64::BITS as u64));
3240        let count = simd_select(no_overflow, count, u64x4::ZERO);
3241        simd_select(no_overflow, simd_shr(a.as_u64x4(), count), u64x4::ZERO).as_m256i()
3242    }
3243}
3244
/// Load 256-bits of integer data from memory into dst using a non-temporal memory hint. mem_addr
/// must be aligned on a 32-byte boundary or a general-protection exception may be generated. To
/// minimize caching, the data is flagged as non-temporal (unlikely to be used again soon)
///
/// # Safety
///
/// `mem_addr` must be valid for a 32-byte read and aligned to a 32-byte
/// boundary, in addition to the `avx2` target-feature requirement.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_load_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vmovntdqa))]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
pub unsafe fn _mm256_stream_load_si256(mem_addr: *const __m256i) -> __m256i {
    let dst: __m256i;
    // Emits `vmovntdqa` directly via inline assembly; `vpl!` presumably
    // expands the template with the memory operand for `p` — TODO confirm
    // against the macro's definition.
    crate::arch::asm!(
        vpl!("vmovntdqa {a}"),
        a = out(ymm_reg) dst,
        p = in(reg) mem_addr,
        // pure + readonly: the result depends only on the pointed-to memory,
        // letting the compiler cache or reorder calls like an ordinary load.
        options(pure, readonly, nostack, preserves_flags),
    );
    dst
}
3264
3265/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`
3266///
3267/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi16)
3268#[inline]
3269#[target_feature(enable = "avx2")]
3270#[cfg_attr(test, assert_instr(vpsubw))]
3271#[stable(feature = "simd_x86", since = "1.27.0")]
3272#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3273pub const fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i {
3274    unsafe { transmute(simd_sub(a.as_i16x16(), b.as_i16x16())) }
3275}
3276
3277/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`
3278///
3279/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi32)
3280#[inline]
3281#[target_feature(enable = "avx2")]
3282#[cfg_attr(test, assert_instr(vpsubd))]
3283#[stable(feature = "simd_x86", since = "1.27.0")]
3284#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3285pub const fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i {
3286    unsafe { transmute(simd_sub(a.as_i32x8(), b.as_i32x8())) }
3287}
3288
3289/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`
3290///
3291/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi64)
3292#[inline]
3293#[target_feature(enable = "avx2")]
3294#[cfg_attr(test, assert_instr(vpsubq))]
3295#[stable(feature = "simd_x86", since = "1.27.0")]
3296#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3297pub const fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i {
3298    unsafe { transmute(simd_sub(a.as_i64x4(), b.as_i64x4())) }
3299}
3300
3301/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`
3302///
3303/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi8)
3304#[inline]
3305#[target_feature(enable = "avx2")]
3306#[cfg_attr(test, assert_instr(vpsubb))]
3307#[stable(feature = "simd_x86", since = "1.27.0")]
3308#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3309pub const fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i {
3310    unsafe { transmute(simd_sub(a.as_i8x32(), b.as_i8x32())) }
3311}
3312
3313/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in
3314/// `a` using saturation.
3315///
3316/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi16)
3317#[inline]
3318#[target_feature(enable = "avx2")]
3319#[cfg_attr(test, assert_instr(vpsubsw))]
3320#[stable(feature = "simd_x86", since = "1.27.0")]
3321#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3322pub const fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i {
3323    unsafe { transmute(simd_saturating_sub(a.as_i16x16(), b.as_i16x16())) }
3324}
3325
3326/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in
3327/// `a` using saturation.
3328///
3329/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi8)
3330#[inline]
3331#[target_feature(enable = "avx2")]
3332#[cfg_attr(test, assert_instr(vpsubsb))]
3333#[stable(feature = "simd_x86", since = "1.27.0")]
3334#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3335pub const fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i {
3336    unsafe { transmute(simd_saturating_sub(a.as_i8x32(), b.as_i8x32())) }
3337}
3338
3339/// Subtract packed unsigned 16-bit integers in `b` from packed 16-bit
3340/// integers in `a` using saturation.
3341///
3342/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu16)
3343#[inline]
3344#[target_feature(enable = "avx2")]
3345#[cfg_attr(test, assert_instr(vpsubusw))]
3346#[stable(feature = "simd_x86", since = "1.27.0")]
3347#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3348pub const fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i {
3349    unsafe { transmute(simd_saturating_sub(a.as_u16x16(), b.as_u16x16())) }
3350}
3351
3352/// Subtract packed unsigned 8-bit integers in `b` from packed 8-bit
3353/// integers in `a` using saturation.
3354///
3355/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu8)
3356#[inline]
3357#[target_feature(enable = "avx2")]
3358#[cfg_attr(test, assert_instr(vpsubusb))]
3359#[stable(feature = "simd_x86", since = "1.27.0")]
3360#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3361pub const fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i {
3362    unsafe { transmute(simd_saturating_sub(a.as_u8x32(), b.as_u8x32())) }
3363}
3364
3365/// Unpacks and interleave 8-bit integers from the high half of each
3366/// 128-bit lane in `a` and `b`.
3367///
3368/// ```rust
3369/// #[cfg(target_arch = "x86")]
3370/// use std::arch::x86::*;
3371/// #[cfg(target_arch = "x86_64")]
3372/// use std::arch::x86_64::*;
3373///
3374/// # fn main() {
3375/// #     if is_x86_feature_detected!("avx2") {
3376/// #         #[target_feature(enable = "avx2")]
3377/// #         unsafe fn worker() {
3378/// let a = _mm256_setr_epi8(
3379///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3380///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3381/// );
3382/// let b = _mm256_setr_epi8(
3383///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3384///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3385///     -30, -31,
3386/// );
3387///
3388/// let c = _mm256_unpackhi_epi8(a, b);
3389///
3390/// let expected = _mm256_setr_epi8(
3391///     8, -8, 9, -9, 10, -10, 11, -11, 12, -12, 13, -13, 14, -14, 15, -15,
3392///     24, -24, 25, -25, 26, -26, 27, -27, 28, -28, 29, -29, 30, -30, 31,
3393///     -31,
3394/// );
3395/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3396///
3397/// #         }
3398/// #         unsafe { worker(); }
3399/// #     }
3400/// # }
3401/// ```
3402///
3403/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpunpckhbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Shuffle indices 0..32 select bytes of `a`, 32..64 bytes of `b`.
        // Each pair of rows interleaves the high 8 bytes of one 128-bit lane
        // of `a` (bytes 8..16, then 24..32) with the matching bytes of `b`.
        #[rustfmt::skip]
        let r: i8x32 = simd_shuffle!(a.as_i8x32(), b.as_i8x32(), [
                8, 40, 9, 41, 10, 42, 11, 43,
                12, 44, 13, 45, 14, 46, 15, 47,
                24, 56, 25, 57, 26, 58, 27, 59,
                28, 60, 29, 61, 30, 62, 31, 63,
        ]);
        transmute(r)
    }
}
3421
3422/// Unpacks and interleave 8-bit integers from the low half of each
3423/// 128-bit lane of `a` and `b`.
3424///
3425/// ```rust
3426/// #[cfg(target_arch = "x86")]
3427/// use std::arch::x86::*;
3428/// #[cfg(target_arch = "x86_64")]
3429/// use std::arch::x86_64::*;
3430///
3431/// # fn main() {
3432/// #     if is_x86_feature_detected!("avx2") {
3433/// #         #[target_feature(enable = "avx2")]
3434/// #         unsafe fn worker() {
3435/// let a = _mm256_setr_epi8(
3436///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3437///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3438/// );
3439/// let b = _mm256_setr_epi8(
3440///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3441///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3442///     -30, -31,
3443/// );
3444///
3445/// let c = _mm256_unpacklo_epi8(a, b);
3446///
3447/// let expected = _mm256_setr_epi8(
3448///     0, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 16, -16, 17,
3449///     -17, 18, -18, 19, -19, 20, -20, 21, -21, 22, -22, 23, -23,
3450/// );
3451/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3452///
3453/// #         }
3454/// #         unsafe { worker(); }
3455/// #     }
3456/// # }
3457/// ```
3458///
3459/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpunpcklbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Shuffle indices 0..32 select bytes of `a`, 32..64 bytes of `b`.
        // Each pair of rows interleaves the low 8 bytes of one 128-bit lane
        // of `a` (bytes 0..8, then 16..24) with the matching bytes of `b`.
        #[rustfmt::skip]
        let r: i8x32 = simd_shuffle!(a.as_i8x32(), b.as_i8x32(), [
            0, 32, 1, 33, 2, 34, 3, 35,
            4, 36, 5, 37, 6, 38, 7, 39,
            16, 48, 17, 49, 18, 50, 19, 51,
            20, 52, 21, 53, 22, 54, 23, 55,
        ]);
        transmute(r)
    }
}
3477
3478/// Unpacks and interleave 16-bit integers from the high half of each
3479/// 128-bit lane of `a` and `b`.
3480///
3481/// ```rust
3482/// #[cfg(target_arch = "x86")]
3483/// use std::arch::x86::*;
3484/// #[cfg(target_arch = "x86_64")]
3485/// use std::arch::x86_64::*;
3486///
3487/// # fn main() {
3488/// #     if is_x86_feature_detected!("avx2") {
3489/// #         #[target_feature(enable = "avx2")]
3490/// #         unsafe fn worker() {
3491/// let a = _mm256_setr_epi16(
3492///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3493/// );
3494/// let b = _mm256_setr_epi16(
3495///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3496/// );
3497///
3498/// let c = _mm256_unpackhi_epi16(a, b);
3499///
3500/// let expected = _mm256_setr_epi16(
3501///     4, -4, 5, -5, 6, -6, 7, -7, 12, -12, 13, -13, 14, -14, 15, -15,
3502/// );
3503/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3504///
3505/// #         }
3506/// #         unsafe { worker(); }
3507/// #     }
3508/// # }
3509/// ```
3510///
3511/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpunpckhwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Indices 0..16 select words of `a`, 16..32 words of `b`; the upper
        // four words of each 128-bit lane of `a` (words 4..8 and 12..16) are
        // interleaved with the matching words of `b`.
        let r: i16x16 = simd_shuffle!(
            a.as_i16x16(),
            b.as_i16x16(),
            [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31],
        );
        transmute(r)
    }
}
3527
3528/// Unpacks and interleave 16-bit integers from the low half of each
3529/// 128-bit lane of `a` and `b`.
3530///
3531/// ```rust
3532/// #[cfg(target_arch = "x86")]
3533/// use std::arch::x86::*;
3534/// #[cfg(target_arch = "x86_64")]
3535/// use std::arch::x86_64::*;
3536///
3537/// # fn main() {
3538/// #     if is_x86_feature_detected!("avx2") {
3539/// #         #[target_feature(enable = "avx2")]
3540/// #         unsafe fn worker() {
3541///
3542/// let a = _mm256_setr_epi16(
3543///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3544/// );
3545/// let b = _mm256_setr_epi16(
3546///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3547/// );
3548///
3549/// let c = _mm256_unpacklo_epi16(a, b);
3550///
3551/// let expected = _mm256_setr_epi16(
3552///     0, 0, 1, -1, 2, -2, 3, -3, 8, -8, 9, -9, 10, -10, 11, -11,
3553/// );
3554/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3555///
3556/// #         }
3557/// #         unsafe { worker(); }
3558/// #     }
3559/// # }
3560/// ```
3561///
3562/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpunpcklwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Indices 0..16 select words of `a`, 16..32 words of `b`; the lower
        // four words of each 128-bit lane of `a` (words 0..4 and 8..12) are
        // interleaved with the matching words of `b`.
        let r: i16x16 = simd_shuffle!(
            a.as_i16x16(),
            b.as_i16x16(),
            [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27],
        );
        transmute(r)
    }
}
3578
3579/// Unpacks and interleave 32-bit integers from the high half of each
3580/// 128-bit lane of `a` and `b`.
3581///
3582/// ```rust
3583/// #[cfg(target_arch = "x86")]
3584/// use std::arch::x86::*;
3585/// #[cfg(target_arch = "x86_64")]
3586/// use std::arch::x86_64::*;
3587///
3588/// # fn main() {
3589/// #     if is_x86_feature_detected!("avx2") {
3590/// #         #[target_feature(enable = "avx2")]
3591/// #         unsafe fn worker() {
3592/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3593/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3594///
3595/// let c = _mm256_unpackhi_epi32(a, b);
3596///
3597/// let expected = _mm256_setr_epi32(2, -2, 3, -3, 6, -6, 7, -7);
3598/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3599///
3600/// #         }
3601/// #         unsafe { worker(); }
3602/// #     }
3603/// # }
3604/// ```
3605///
3606/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vunpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Indices 0..8 select dwords of `a`, 8..16 dwords of `b`; the upper
        // two dwords of each 128-bit lane are interleaved.
        let r: i32x8 = simd_shuffle!(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]);
        transmute(r)
    }
}
3618
3619/// Unpacks and interleave 32-bit integers from the low half of each
3620/// 128-bit lane of `a` and `b`.
3621///
3622/// ```rust
3623/// #[cfg(target_arch = "x86")]
3624/// use std::arch::x86::*;
3625/// #[cfg(target_arch = "x86_64")]
3626/// use std::arch::x86_64::*;
3627///
3628/// # fn main() {
3629/// #     if is_x86_feature_detected!("avx2") {
3630/// #         #[target_feature(enable = "avx2")]
3631/// #         unsafe fn worker() {
3632/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3633/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3634///
3635/// let c = _mm256_unpacklo_epi32(a, b);
3636///
3637/// let expected = _mm256_setr_epi32(0, 0, 1, -1, 4, -4, 5, -5);
3638/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3639///
3640/// #         }
3641/// #         unsafe { worker(); }
3642/// #     }
3643/// # }
3644/// ```
3645///
3646/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vunpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Indices 0..8 select dwords of `a`, 8..16 dwords of `b`; the lower
        // two dwords of each 128-bit lane are interleaved.
        let r: i32x8 = simd_shuffle!(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]);
        transmute(r)
    }
}
3658
3659/// Unpacks and interleave 64-bit integers from the high half of each
3660/// 128-bit lane of `a` and `b`.
3661///
3662/// ```rust
3663/// #[cfg(target_arch = "x86")]
3664/// use std::arch::x86::*;
3665/// #[cfg(target_arch = "x86_64")]
3666/// use std::arch::x86_64::*;
3667///
3668/// # fn main() {
3669/// #     if is_x86_feature_detected!("avx2") {
3670/// #         #[target_feature(enable = "avx2")]
3671/// #         unsafe fn worker() {
3672/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3673/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3674///
3675/// let c = _mm256_unpackhi_epi64(a, b);
3676///
3677/// let expected = _mm256_setr_epi64x(1, -1, 3, -3);
3678/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3679///
3680/// #         }
3681/// #         unsafe { worker(); }
3682/// #     }
3683/// # }
3684/// ```
3685///
3686/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vunpckhpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Indices 0..4 select qwords of `a`, 4..8 qwords of `b`; the high
        // qword of each 128-bit lane of `a` is paired with `b`'s.
        let r: i64x4 = simd_shuffle!(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]);
        transmute(r)
    }
}
3698
3699/// Unpacks and interleave 64-bit integers from the low half of each
3700/// 128-bit lane of `a` and `b`.
3701///
3702/// ```rust
3703/// #[cfg(target_arch = "x86")]
3704/// use std::arch::x86::*;
3705/// #[cfg(target_arch = "x86_64")]
3706/// use std::arch::x86_64::*;
3707///
3708/// # fn main() {
3709/// #     if is_x86_feature_detected!("avx2") {
3710/// #         #[target_feature(enable = "avx2")]
3711/// #         unsafe fn worker() {
3712/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3713/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3714///
3715/// let c = _mm256_unpacklo_epi64(a, b);
3716///
3717/// let expected = _mm256_setr_epi64x(0, 0, 2, -2);
3718/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3719///
3720/// #         }
3721/// #         unsafe { worker(); }
3722/// #     }
3723/// # }
3724/// ```
3725///
3726/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi64)
3727#[inline]
3728#[target_feature(enable = "avx2")]
3729#[cfg_attr(test, assert_instr(vunpcklpd))]
3730#[stable(feature = "simd_x86", since = "1.27.0")]
3731#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3732pub const fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i {
3733    unsafe {
3734        let r: i64x4 = simd_shuffle!(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]);
3735        transmute(r)
3736    }
3737}
3738
3739/// Computes the bitwise XOR of 256 bits (representing integer data)
3740/// in `a` and `b`
3741///
3742/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_si256)
3743#[inline]
3744#[target_feature(enable = "avx2")]
3745#[cfg_attr(test, assert_instr(vxorps))]
3746#[stable(feature = "simd_x86", since = "1.27.0")]
3747#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3748pub const fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
3749    unsafe { transmute(simd_xor(a.as_i64x4(), b.as_i64x4())) }
3750}
3751
3752/// Extracts an 8-bit integer from `a`, selected with `INDEX`. Returns a 32-bit
3753/// integer containing the zero-extended integer data.
3754///
3755/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3756///
3757/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi8)
3758#[inline]
3759#[target_feature(enable = "avx2")]
3760// This intrinsic has no corresponding instruction.
3761#[rustc_legacy_const_generics(1)]
3762#[stable(feature = "simd_x86", since = "1.27.0")]
3763#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3764pub const fn _mm256_extract_epi8<const INDEX: i32>(a: __m256i) -> i32 {
3765    static_assert_uimm_bits!(INDEX, 5);
3766    unsafe { simd_extract!(a.as_u8x32(), INDEX as u32, u8) as i32 }
3767}
3768
3769/// Extracts a 16-bit integer from `a`, selected with `INDEX`. Returns a 32-bit
3770/// integer containing the zero-extended integer data.
3771///
3772/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3773///
3774/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi16)
3775#[inline]
3776#[target_feature(enable = "avx2")]
3777// This intrinsic has no corresponding instruction.
3778#[rustc_legacy_const_generics(1)]
3779#[stable(feature = "simd_x86", since = "1.27.0")]
3780#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3781pub const fn _mm256_extract_epi16<const INDEX: i32>(a: __m256i) -> i32 {
3782    static_assert_uimm_bits!(INDEX, 4);
3783    unsafe { simd_extract!(a.as_u16x16(), INDEX as u32, u16) as i32 }
3784}
3785
// Raw LLVM intrinsic declarations backing the AVX2 intrinsics above. Each
// `link_name` must match the LLVM intrinsic exactly; signatures mirror the
// instruction operands, so do not reorder or retype these.
#[allow(improper_ctypes)]
unsafe extern "C" {
    // Horizontal (pairwise) add/subtract with signed saturation.
    #[link_name = "llvm.x86.avx2.phadd.sw"]
    fn phaddsw(a: i16x16, b: i16x16) -> i16x16;
    #[link_name = "llvm.x86.avx2.phsub.sw"]
    fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
    // Multiply-add of unsigned × signed bytes into saturated words.
    #[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
    fn pmaddubsw(a: u8x32, b: i8x32) -> i16x16;
    // Multiple sums of absolute differences (block selection via `imm8`).
    #[link_name = "llvm.x86.avx2.mpsadbw"]
    fn mpsadbw(a: u8x32, b: u8x32, imm8: i8) -> u16x16;
    // High-part multiply with round-and-scale.
    #[link_name = "llvm.x86.avx2.pmul.hr.sw"]
    fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16;
    // Narrowing packs: `packs*` saturate signed, `packus*` saturate unsigned.
    #[link_name = "llvm.x86.avx2.packsswb"]
    fn packsswb(a: i16x16, b: i16x16) -> i8x32;
    #[link_name = "llvm.x86.avx2.packssdw"]
    fn packssdw(a: i32x8, b: i32x8) -> i16x16;
    #[link_name = "llvm.x86.avx2.packuswb"]
    fn packuswb(a: i16x16, b: i16x16) -> u8x32;
    #[link_name = "llvm.x86.avx2.packusdw"]
    fn packusdw(a: i32x8, b: i32x8) -> u16x16;
    // Sum of absolute differences of bytes.
    #[link_name = "llvm.x86.avx2.psad.bw"]
    fn psadbw(a: u8x32, b: u8x32) -> u64x4;
    // Sign operations (negate/zero/keep each lane of `a` per sign of `b`).
    #[link_name = "llvm.x86.avx2.psign.b"]
    fn psignb(a: i8x32, b: i8x32) -> i8x32;
    #[link_name = "llvm.x86.avx2.psign.w"]
    fn psignw(a: i16x16, b: i16x16) -> i16x16;
    #[link_name = "llvm.x86.avx2.psign.d"]
    fn psignd(a: i32x8, b: i32x8) -> i32x8;
    // Shifts by a count held in the low element of a 128-bit vector:
    // psll* = logical left, psra* = arithmetic right, psrl* = logical right.
    #[link_name = "llvm.x86.avx2.psll.w"]
    fn psllw(a: i16x16, count: i16x8) -> i16x16;
    #[link_name = "llvm.x86.avx2.psll.d"]
    fn pslld(a: i32x8, count: i32x4) -> i32x8;
    #[link_name = "llvm.x86.avx2.psll.q"]
    fn psllq(a: i64x4, count: i64x2) -> i64x4;
    #[link_name = "llvm.x86.avx2.psra.w"]
    fn psraw(a: i16x16, count: i16x8) -> i16x16;
    #[link_name = "llvm.x86.avx2.psra.d"]
    fn psrad(a: i32x8, count: i32x4) -> i32x8;
    #[link_name = "llvm.x86.avx2.psrl.w"]
    fn psrlw(a: i16x16, count: i16x8) -> i16x16;
    #[link_name = "llvm.x86.avx2.psrl.d"]
    fn psrld(a: i32x8, count: i32x4) -> i32x8;
    #[link_name = "llvm.x86.avx2.psrl.q"]
    fn psrlq(a: i64x4, count: i64x2) -> i64x4;
    // Byte shuffle and cross-lane permutes.
    #[link_name = "llvm.x86.avx2.pshuf.b"]
    fn pshufb(a: u8x32, b: u8x32) -> u8x32;
    #[link_name = "llvm.x86.avx2.permd"]
    fn permd(a: u32x8, b: u32x8) -> u32x8;
    #[link_name = "llvm.x86.avx2.permps"]
    fn permps(a: __m256, b: i32x8) -> __m256;
    // Masked gathers. Naming: gather.<index width>.<element type>, with the
    // `.256` variants operating on 256-bit destinations. `scale` is the
    // byte multiplier applied to each offset.
    #[link_name = "llvm.x86.avx2.gather.d.d"]
    fn pgatherdd(src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8) -> i32x4;
    #[link_name = "llvm.x86.avx2.gather.d.d.256"]
    fn vpgatherdd(src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8) -> i32x8;
    #[link_name = "llvm.x86.avx2.gather.d.q"]
    fn pgatherdq(src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8) -> i64x2;
    #[link_name = "llvm.x86.avx2.gather.d.q.256"]
    fn vpgatherdq(src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8) -> i64x4;
    #[link_name = "llvm.x86.avx2.gather.q.d"]
    fn pgatherqd(src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8) -> i32x4;
    #[link_name = "llvm.x86.avx2.gather.q.d.256"]
    fn vpgatherqd(src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8) -> i32x4;
    #[link_name = "llvm.x86.avx2.gather.q.q"]
    fn pgatherqq(src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8) -> i64x2;
    #[link_name = "llvm.x86.avx2.gather.q.q.256"]
    fn vpgatherqq(src: i64x4, slice: *const i8, offsets: i64x4, mask: i64x4, scale: i8) -> i64x4;
    #[link_name = "llvm.x86.avx2.gather.d.pd"]
    fn pgatherdpd(
        src: __m128d,
        slice: *const i8,
        offsets: i32x4,
        mask: __m128d,
        scale: i8,
    ) -> __m128d;
    #[link_name = "llvm.x86.avx2.gather.d.pd.256"]
    fn vpgatherdpd(
        src: __m256d,
        slice: *const i8,
        offsets: i32x4,
        mask: __m256d,
        scale: i8,
    ) -> __m256d;
    #[link_name = "llvm.x86.avx2.gather.q.pd"]
    fn pgatherqpd(
        src: __m128d,
        slice: *const i8,
        offsets: i64x2,
        mask: __m128d,
        scale: i8,
    ) -> __m128d;
    #[link_name = "llvm.x86.avx2.gather.q.pd.256"]
    fn vpgatherqpd(
        src: __m256d,
        slice: *const i8,
        offsets: i64x4,
        mask: __m256d,
        scale: i8,
    ) -> __m256d;
    #[link_name = "llvm.x86.avx2.gather.d.ps"]
    fn pgatherdps(src: __m128, slice: *const i8, offsets: i32x4, mask: __m128, scale: i8)
    -> __m128;
    #[link_name = "llvm.x86.avx2.gather.d.ps.256"]
    fn vpgatherdps(
        src: __m256,
        slice: *const i8,
        offsets: i32x8,
        mask: __m256,
        scale: i8,
    ) -> __m256;
    #[link_name = "llvm.x86.avx2.gather.q.ps"]
    fn pgatherqps(src: __m128, slice: *const i8, offsets: i64x2, mask: __m128, scale: i8)
    -> __m128;
    #[link_name = "llvm.x86.avx2.gather.q.ps.256"]
    fn vpgatherqps(
        src: __m128,
        slice: *const i8,
        offsets: i64x4,
        mask: __m128,
        scale: i8,
    ) -> __m128;
}
3907
3908#[cfg(test)]
3909mod tests {
3910    use crate::core_arch::assert_eq_const as assert_eq;
3911
3912    use stdarch_test::simd_test;
3913
3914    use crate::core_arch::x86::*;
3915
    // Lane-wise absolute value. `abs(MIN)` is unrepresentable and wraps back
    // to `MIN`; the fixtures spell that out as `MAX.wrapping_add(1)`.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_abs_epi32() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi32(
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
        );
        let r = _mm256_abs_epi32(a);
        #[rustfmt::skip]
        let e = _mm256_setr_epi32(
            0, 1, 1, i32::MAX,
            i32::MAX.wrapping_add(1), 100, 100, 32,
        );
        assert_eq_m256i(r, e);
    }

    // Same pattern for 16-bit lanes, including the i16::MIN wrap case.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_abs_epi16() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            0,  1, -1, 2, -2, 3, -3, 4,
            -4, 5, -5, i16::MAX, i16::MIN, 100, -100, -32,
        );
        let r = _mm256_abs_epi16(a);
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            0, 1, 1, 2, 2, 3, 3, 4,
            4, 5, 5, i16::MAX, i16::MAX.wrapping_add(1), 100, 100, 32,
        );
        assert_eq_m256i(r, e);
    }

    // Same pattern for 8-bit lanes; the 16-value pattern repeats in both halves.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_abs_epi8() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            0, 1, -1, 2, -2, 3, -3, 4,
            -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
            0, 1, -1, 2, -2, 3, -3, 4,
            -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
        );
        let r = _mm256_abs_epi8(a);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            0, 1, 1, 2, 2, 3, 3, 4,
            4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
            0, 1, 1, 2, 2, 3, 3, 4,
            4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
        );
        assert_eq_m256i(r, e);
    }
3967
    // Lane-wise (wrapping) addition at each element width; none of the
    // fixtures overflow, so the results are the plain sums.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_add_epi64() {
        let a = _mm256_setr_epi64x(-10, 0, 100, 1_000_000_000);
        let b = _mm256_setr_epi64x(-1, 0, 1, 2);
        let r = _mm256_add_epi64(a, b);
        let e = _mm256_setr_epi64x(-11, 0, 101, 1_000_000_002);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_add_epi32() {
        let a = _mm256_setr_epi32(-1, 0, 1, 2, 3, 4, 5, 6);
        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm256_add_epi32(a, b);
        let e = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_add_epi16() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm256_setr_epi16(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        let r = _mm256_add_epi16(a, b);
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            0, 2, 4, 6, 8, 10, 12, 14,
            16, 18, 20, 22, 24, 26, 28, 30,
        );
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_add_epi8() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31,
        );
        #[rustfmt::skip]
        let b = _mm256_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31,
        );
        let r = _mm256_add_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            0, 2, 4, 6, 8, 10, 12, 14,
            16, 18, 20, 22, 24, 26, 28, 30,
            32, 34, 36, 38, 40, 42, 44, 46,
            48, 50, 52, 54, 56, 58, 60, 62,
        );
        assert_eq_m256i(r, e);
    }
4033
    // Saturating addition. The plain tests stay in range (results equal the
    // wrapping sums); the `*_saturate*` tests drive each lane to the clamp
    // boundary and expect the input back unchanged.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_adds_epi8() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31,
        );
        #[rustfmt::skip]
        let b = _mm256_setr_epi8(
            32, 33, 34, 35, 36, 37, 38, 39,
            40, 41, 42, 43, 44, 45, 46, 47,
            48, 49, 50, 51, 52, 53, 54, 55,
            56, 57, 58, 59, 60, 61, 62, 63,
        );
        let r = _mm256_adds_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            32, 34, 36, 38, 40, 42, 44, 46,
            48, 50, 52, 54, 56, 58, 60, 62,
            64, 66, 68, 70, 72, 74, 76, 78,
            80, 82, 84, 86, 88, 90, 92, 94,
        );
        assert_eq_m256i(r, e);
    }

    // i8::MAX + 1 must clamp to i8::MAX.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_adds_epi8_saturate_positive() {
        let a = _mm256_set1_epi8(0x7F);
        let b = _mm256_set1_epi8(1);
        let r = _mm256_adds_epi8(a, b);
        assert_eq_m256i(r, a);
    }

    // i8::MIN + (-1) must clamp to i8::MIN.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_adds_epi8_saturate_negative() {
        let a = _mm256_set1_epi8(-0x80);
        let b = _mm256_set1_epi8(-1);
        let r = _mm256_adds_epi8(a, b);
        assert_eq_m256i(r, a);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_adds_epi16() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm256_setr_epi16(
            32, 33, 34, 35, 36, 37, 38, 39,
            40, 41, 42, 43, 44, 45, 46, 47,
        );
        let r = _mm256_adds_epi16(a, b);
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            32, 34, 36, 38, 40, 42, 44, 46,
            48, 50, 52, 54, 56, 58, 60, 62,
        );

        assert_eq_m256i(r, e);
    }

    // i16::MAX + 1 must clamp to i16::MAX.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_adds_epi16_saturate_positive() {
        let a = _mm256_set1_epi16(0x7FFF);
        let b = _mm256_set1_epi16(1);
        let r = _mm256_adds_epi16(a, b);
        assert_eq_m256i(r, a);
    }

    // i16::MIN + (-1) must clamp to i16::MIN.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_adds_epi16_saturate_negative() {
        let a = _mm256_set1_epi16(-0x8000);
        let b = _mm256_set1_epi16(-1);
        let r = _mm256_adds_epi16(a, b);
        assert_eq_m256i(r, a);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_adds_epu8() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31,
        );
        #[rustfmt::skip]
        let b = _mm256_setr_epi8(
            32, 33, 34, 35, 36, 37, 38, 39,
            40, 41, 42, 43, 44, 45, 46, 47,
            48, 49, 50, 51, 52, 53, 54, 55,
            56, 57, 58, 59, 60, 61, 62, 63,
        );
        let r = _mm256_adds_epu8(a, b);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            32, 34, 36, 38, 40, 42, 44, 46,
            48, 50, 52, 54, 56, 58, 60, 62,
            64, 66, 68, 70, 72, 74, 76, 78,
            80, 82, 84, 86, 88, 90, 92, 94,
        );
        assert_eq_m256i(r, e);
    }

    // u8::MAX (!0) + 1 must clamp to u8::MAX.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_adds_epu8_saturate() {
        let a = _mm256_set1_epi8(!0);
        let b = _mm256_set1_epi8(1);
        let r = _mm256_adds_epu8(a, b);
        assert_eq_m256i(r, a);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_adds_epu16() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm256_setr_epi16(
            32, 33, 34, 35, 36, 37, 38, 39,
            40, 41, 42, 43, 44, 45, 46, 47,
        );
        let r = _mm256_adds_epu16(a, b);
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            32, 34, 36, 38, 40, 42, 44, 46,
            48, 50, 52, 54, 56, 58, 60, 62,
        );

        assert_eq_m256i(r, e);
    }

    // u16::MAX (!0) + 1 must clamp to u16::MAX.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_adds_epu16_saturate() {
        let a = _mm256_set1_epi16(!0);
        let b = _mm256_set1_epi16(1);
        let r = _mm256_adds_epu16(a, b);
        assert_eq_m256i(r, a);
    }
4179
    // Bitwise AND: 0b101 & 0b011 == 0b001 in every byte.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_and_si256() {
        let a = _mm256_set1_epi8(5);
        let b = _mm256_set1_epi8(3);
        let got = _mm256_and_si256(a, b);
        assert_eq_m256i(got, _mm256_set1_epi8(1));
    }

    // ANDNOT computes `!a & b`: !0b101 & 0b011 == 0b010.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_andnot_si256() {
        let a = _mm256_set1_epi8(5);
        let b = _mm256_set1_epi8(3);
        let got = _mm256_andnot_si256(a, b);
        assert_eq_m256i(got, _mm256_set1_epi8(2));
    }

    // Unsigned average per lane: avg(3, 9) == 6.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_avg_epu8() {
        let (a, b) = (_mm256_set1_epi8(3), _mm256_set1_epi8(9));
        let r = _mm256_avg_epu8(a, b);
        assert_eq_m256i(r, _mm256_set1_epi8(6));
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_avg_epu16() {
        let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
        let r = _mm256_avg_epu16(a, b);
        assert_eq_m256i(r, _mm256_set1_epi16(6));
    }
4209
    // Immediate blends: bit i of the immediate selects lane i from `b`.
    // Each test also checks the complementary mask with swapped operands.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm_blend_epi32() {
        let (a, b) = (_mm_set1_epi32(3), _mm_set1_epi32(9));
        let e = _mm_setr_epi32(9, 3, 3, 3);
        let r = _mm_blend_epi32::<0x01>(a, b);
        assert_eq_m128i(r, e);

        let r = _mm_blend_epi32::<0x0E>(b, a);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_blend_epi32() {
        let (a, b) = (_mm256_set1_epi32(3), _mm256_set1_epi32(9));
        let e = _mm256_setr_epi32(9, 3, 3, 3, 3, 3, 3, 3);
        let r = _mm256_blend_epi32::<0x01>(a, b);
        assert_eq_m256i(r, e);

        let e = _mm256_setr_epi32(3, 9, 3, 3, 3, 3, 3, 9);
        let r = _mm256_blend_epi32::<0x82>(a, b);
        assert_eq_m256i(r, e);

        let e = _mm256_setr_epi32(3, 3, 9, 9, 9, 9, 9, 3);
        let r = _mm256_blend_epi32::<0x7C>(a, b);
        assert_eq_m256i(r, e);
    }

    // For 16-bit blends the 8-bit immediate is reused for each 128-bit lane,
    // hence 0x01 selects `b` at word positions 0 AND 8.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_blend_epi16() {
        let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
        let e = _mm256_setr_epi16(9, 3, 3, 3, 3, 3, 3, 3, 9, 3, 3, 3, 3, 3, 3, 3);
        let r = _mm256_blend_epi16::<0x01>(a, b);
        assert_eq_m256i(r, e);

        let r = _mm256_blend_epi16::<0xFE>(b, a);
        assert_eq_m256i(r, e);
    }

    // Variable blend: the sign bit of each mask byte selects `b` for that byte.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_blendv_epi8() {
        let (a, b) = (_mm256_set1_epi8(4), _mm256_set1_epi8(2));
        let mask = _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), -1);
        let e = _mm256_insert_epi8::<2>(_mm256_set1_epi8(4), 2);
        let r = _mm256_blendv_epi8(a, b, mask);
        assert_eq_m256i(r, e);
    }
4256
    // Broadcast family: the lowest element of the 128-bit source is copied to
    // every lane of the destination. Sources place a distinctive value in
    // element 0 and junk elsewhere to prove only element 0 is read.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm_broadcastb_epi8() {
        let a = _mm_insert_epi8::<0>(_mm_set1_epi8(0x00), 0x2a);
        let res = _mm_broadcastb_epi8(a);
        assert_eq_m128i(res, _mm_set1_epi8(0x2a));
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_broadcastb_epi8() {
        let a = _mm_insert_epi8::<0>(_mm_set1_epi8(0x00), 0x2a);
        let res = _mm256_broadcastb_epi8(a);
        assert_eq_m256i(res, _mm256_set1_epi8(0x2a));
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm_broadcastd_epi32() {
        let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
        let res = _mm_broadcastd_epi32(a);
        assert_eq_m128i(res, _mm_set1_epi32(0x2a));
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_broadcastd_epi32() {
        let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
        let res = _mm256_broadcastd_epi32(a);
        assert_eq_m256i(res, _mm256_set1_epi32(0x2a));
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm_broadcastq_epi64() {
        let a = _mm_setr_epi64x(0x1ffffffff, 0);
        let res = _mm_broadcastq_epi64(a);
        assert_eq_m128i(res, _mm_set1_epi64x(0x1ffffffff));
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_broadcastq_epi64() {
        let a = _mm_setr_epi64x(0x1ffffffff, 0);
        let res = _mm256_broadcastq_epi64(a);
        assert_eq_m256i(res, _mm256_set1_epi64x(0x1ffffffff));
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm_broadcastsd_pd() {
        let a = _mm_setr_pd(6.88, 3.44);
        let res = _mm_broadcastsd_pd(a);
        assert_eq_m128d(res, _mm_set1_pd(6.88));
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_broadcastsd_pd() {
        let a = _mm_setr_pd(6.88, 3.44);
        let res = _mm256_broadcastsd_pd(a);
        assert_eq_m256d(res, _mm256_set1_pd(6.88f64));
    }

    // si128 broadcasts copy the full 128-bit source into both halves.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm_broadcastsi128_si256() {
        let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
        let res = _mm_broadcastsi128_si256(a);
        let retval = _mm256_setr_epi64x(
            0x0987654321012334,
            0x5678909876543210,
            0x0987654321012334,
            0x5678909876543210,
        );
        assert_eq_m256i(res, retval);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_broadcastsi128_si256() {
        let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
        let res = _mm256_broadcastsi128_si256(a);
        let retval = _mm256_setr_epi64x(
            0x0987654321012334,
            0x5678909876543210,
            0x0987654321012334,
            0x5678909876543210,
        );
        assert_eq_m256i(res, retval);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm_broadcastss_ps() {
        let a = _mm_setr_ps(6.88, 3.44, 0.0, 0.0);
        let res = _mm_broadcastss_ps(a);
        assert_eq_m128(res, _mm_set1_ps(6.88));
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_broadcastss_ps() {
        let a = _mm_setr_ps(6.88, 3.44, 0.0, 0.0);
        let res = _mm256_broadcastss_ps(a);
        assert_eq_m256(res, _mm256_set1_ps(6.88));
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm_broadcastw_epi16() {
        let a = _mm_insert_epi16::<0>(_mm_set1_epi16(0x2a), 0x22b);
        let res = _mm_broadcastw_epi16(a);
        assert_eq_m128i(res, _mm_set1_epi16(0x22b));
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_broadcastw_epi16() {
        let a = _mm_insert_epi16::<0>(_mm_set1_epi16(0x2a), 0x22b);
        let res = _mm256_broadcastw_epi16(a);
        assert_eq_m256i(res, _mm256_set1_epi16(0x22b));
    }
4366
    // Comparisons yield all-ones (!0) in lanes where the predicate holds and
    // zero elsewhere. The fixtures arrange exactly one matching lane.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_cmpeq_epi8() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31,
        );
        #[rustfmt::skip]
        let b = _mm256_setr_epi8(
            31, 30, 2, 28, 27, 26, 25, 24,
            23, 22, 21, 20, 19, 18, 17, 16,
            15, 14, 13, 12, 11, 10, 9, 8,
            7, 6, 5, 4, 3, 2, 1, 0,
        );
        let r = _mm256_cmpeq_epi8(a, b);
        // Only lane 2 is equal (both hold 2).
        assert_eq_m256i(r, _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), !0));
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_cmpeq_epi16() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm256_setr_epi16(
            15, 14, 2, 12, 11, 10, 9, 8,
            7, 6, 5, 4, 3, 2, 1, 0,
        );
        let r = _mm256_cmpeq_epi16(a, b);
        assert_eq_m256i(r, _mm256_insert_epi16::<2>(_mm256_set1_epi16(0), !0));
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_cmpeq_epi32() {
        let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm256_setr_epi32(7, 6, 2, 4, 3, 2, 1, 0);
        let r = _mm256_cmpeq_epi32(a, b);
        let e = _mm256_set1_epi32(0);
        let e = _mm256_insert_epi32::<2>(e, !0);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_cmpeq_epi64() {
        let a = _mm256_setr_epi64x(0, 1, 2, 3);
        let b = _mm256_setr_epi64x(3, 2, 2, 0);
        let r = _mm256_cmpeq_epi64(a, b);
        assert_eq_m256i(r, _mm256_insert_epi64::<2>(_mm256_set1_epi64x(0), !0));
    }

    // Signed greater-than: only lane 0 (5 > 0) is set.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_cmpgt_epi8() {
        let a = _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), 5);
        let b = _mm256_set1_epi8(0);
        let r = _mm256_cmpgt_epi8(a, b);
        assert_eq_m256i(r, _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), !0));
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_cmpgt_epi16() {
        let a = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 5);
        let b = _mm256_set1_epi16(0);
        let r = _mm256_cmpgt_epi16(a, b);
        assert_eq_m256i(r, _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), !0));
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_cmpgt_epi32() {
        let a = _mm256_insert_epi32::<0>(_mm256_set1_epi32(0), 5);
        let b = _mm256_set1_epi32(0);
        let r = _mm256_cmpgt_epi32(a, b);
        assert_eq_m256i(r, _mm256_insert_epi32::<0>(_mm256_set1_epi32(0), !0));
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_cmpgt_epi64() {
        let a = _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), 5);
        let b = _mm256_set1_epi64x(0);
        let r = _mm256_cmpgt_epi64(a, b);
        assert_eq_m256i(r, _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), !0));
    }
4452
4453    #[simd_test(enable = "avx2")]
4454    const unsafe fn test_mm256_cvtepi8_epi16() {
4455        #[rustfmt::skip]
4456        let a = _mm_setr_epi8(
4457            0, 0, -1, 1, -2, 2, -3, 3,
4458            -4, 4, -5, 5, -6, 6, -7, 7,
4459        );
4460        #[rustfmt::skip]
4461        let r = _mm256_setr_epi16(
4462            0, 0, -1, 1, -2, 2, -3, 3,
4463            -4, 4, -5, 5, -6, 6, -7, 7,
4464        );
4465        assert_eq_m256i(r, _mm256_cvtepi8_epi16(a));
4466    }
4467
    // Sign-extension (`cvtepi*`) tests: widening a lane must preserve the
    // sign, so negative bytes/words stay negative in the wider lanes.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_cvtepi8_epi32() {
        // Only the low 8 bytes of `a` are widened to the 8 i32 lanes.
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 0, -1, 1, -2, 2, -3, 3,
            -4, 4, -5, 5, -6, 6, -7, 7,
        );
        let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
        assert_eq_m256i(r, _mm256_cvtepi8_epi32(a));
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_cvtepi8_epi64() {
        // Only the low 4 bytes of `a` are widened to the 4 i64 lanes.
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 0, -1, 1, -2, 2, -3, 3,
            -4, 4, -5, 5, -6, 6, -7, 7,
        );
        let r = _mm256_setr_epi64x(0, 0, -1, 1);
        assert_eq_m256i(r, _mm256_cvtepi8_epi64(a));
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_cvtepi16_epi32() {
        let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
        let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
        assert_eq_m256i(r, _mm256_cvtepi16_epi32(a));
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_cvtepi16_epi64() {
        // Only the low 4 words of `a` are widened to the 4 i64 lanes.
        let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
        let r = _mm256_setr_epi64x(0, 0, -1, 1);
        assert_eq_m256i(r, _mm256_cvtepi16_epi64(a));
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_cvtepi32_epi64() {
        let a = _mm_setr_epi32(0, 0, -1, 1);
        let r = _mm256_setr_epi64x(0, 0, -1, 1);
        assert_eq_m256i(r, _mm256_cvtepi32_epi64(a));
    }

    // Zero-extension (`cvtepu*`) tests use non-negative values only, so the
    // widened lanes equal the inputs numerically.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_cvtepu16_epi32() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        assert_eq_m256i(r, _mm256_cvtepu16_epi32(a));
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_cvtepu16_epi64() {
        // Only the low 4 words of `a` are widened to the 4 i64 lanes.
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let r = _mm256_setr_epi64x(0, 1, 2, 3);
        assert_eq_m256i(r, _mm256_cvtepu16_epi64(a));
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_cvtepu32_epi64() {
        let a = _mm_setr_epi32(0, 1, 2, 3);
        let r = _mm256_setr_epi64x(0, 1, 2, 3);
        assert_eq_m256i(r, _mm256_cvtepu32_epi64(a));
    }
4531
    // `cvtepu8_*` zero-extension tests: bytes 0..=15 widen to equal values.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_cvtepu8_epi16() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let r = _mm256_setr_epi16(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        assert_eq_m256i(r, _mm256_cvtepu8_epi16(a));
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_cvtepu8_epi32() {
        // Only the low 8 bytes of `a` are widened to the 8 i32 lanes.
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        assert_eq_m256i(r, _mm256_cvtepu8_epi32(a));
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_cvtepu8_epi64() {
        // Only the low 4 bytes of `a` are widened to the 4 i64 lanes.
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        let r = _mm256_setr_epi64x(0, 1, 2, 3);
        assert_eq_m256i(r, _mm256_cvtepu8_epi64(a));
    }
4568
4569    #[simd_test(enable = "avx2")]
4570    const unsafe fn test_mm256_extracti128_si256() {
4571        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4572        let r = _mm256_extracti128_si256::<1>(a);
4573        let e = _mm_setr_epi64x(3, 4);
4574        assert_eq_m128i(r, e);
4575    }
4576
    // Horizontal add: within each 128-bit half, adjacent pairs of `a` then
    // adjacent pairs of `b` are summed (visible in the 4,4,..,8,8,.. layout).
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_hadd_epi16() {
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_hadd_epi16(a, b);
        let e = _mm256_setr_epi16(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_hadd_epi32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(4);
        let r = _mm256_hadd_epi32(a, b);
        let e = _mm256_setr_epi32(4, 4, 8, 8, 4, 4, 8, 8);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_hadds_epi16() {
        // Saturating horizontal add: 0x7fff + 1 must clamp to i16::MAX
        // (0x7FFF) in lane 0 instead of wrapping.
        let a = _mm256_set1_epi16(2);
        let a = _mm256_insert_epi16::<0>(a, 0x7fff);
        let a = _mm256_insert_epi16::<1>(a, 1);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_hadds_epi16(a, b);
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            0x7FFF, 4, 4, 4, 8, 8, 8, 8,
            4, 4, 4, 4, 8, 8, 8, 8,
        );
        assert_eq_m256i(r, e);
    }

    // Horizontal subtract of equal adjacent lanes yields all zeros.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_hsub_epi16() {
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_hsub_epi16(a, b);
        let e = _mm256_set1_epi16(0);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_hsub_epi32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(4);
        let r = _mm256_hsub_epi32(a, b);
        let e = _mm256_set1_epi32(0);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_hsubs_epi16() {
        // Saturating horizontal subtract: 0x7fff - (-1) must clamp to
        // i16::MAX in lane 0; all other adjacent pairs cancel to 0.
        let a = _mm256_set1_epi16(2);
        let a = _mm256_insert_epi16::<0>(a, 0x7fff);
        let a = _mm256_insert_epi16::<1>(a, -1);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_hsubs_epi16(a, b);
        let e = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 0x7FFF);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_madd_epi16() {
        // Multiply adjacent 16-bit pairs and add: 2*4 + 2*4 = 16 per i32.
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_madd_epi16(a, b);
        let e = _mm256_set1_epi32(16);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_inserti128_si256() {
        // Inserting at lane index 1 replaces the two high 64-bit elements.
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let b = _mm_setr_epi64x(7, 8);
        let r = _mm256_inserti128_si256::<1>(a, b);
        let e = _mm256_setr_epi64x(1, 2, 7, 8);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_maddubs_epi16() {
        // Unsigned-by-signed multiply of adjacent bytes, then pairwise add:
        // 2*4 + 2*4 = 16 per i16.
        let a = _mm256_set1_epi8(2);
        let b = _mm256_set1_epi8(4);
        let r = _mm256_maddubs_epi16(a, b);
        let e = _mm256_set1_epi16(16);
        assert_eq_m256i(r, e);
    }
4665
    // Masked loads: lanes with a set mask (here -1, i.e. MSB set) are read
    // from memory, the rest are zeroed in the result.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm_maskload_epi32() {
        let nums = [1, 2, 3, 4];
        let a = &nums as *const i32;
        let mask = _mm_setr_epi32(-1, 0, 0, -1);
        let r = _mm_maskload_epi32(a, mask);
        let e = _mm_setr_epi32(1, 0, 0, 4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_maskload_epi32() {
        let nums = [1, 2, 3, 4, 5, 6, 7, 8];
        let a = &nums as *const i32;
        let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
        let r = _mm256_maskload_epi32(a, mask);
        let e = _mm256_setr_epi32(1, 0, 0, 4, 0, 6, 7, 0);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm_maskload_epi64() {
        let nums = [1_i64, 2_i64];
        let a = &nums as *const i64;
        let mask = _mm_setr_epi64x(0, -1);
        let r = _mm_maskload_epi64(a, mask);
        let e = _mm_setr_epi64x(0, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_maskload_epi64() {
        let nums = [1_i64, 2_i64, 3_i64, 4_i64];
        let a = &nums as *const i64;
        let mask = _mm256_setr_epi64x(0, -1, -1, 0);
        let r = _mm256_maskload_epi64(a, mask);
        let e = _mm256_setr_epi64x(0, 2, 3, 0);
        assert_eq_m256i(r, e);
    }

    // Masked stores: only lanes with a set mask are written; the other
    // destination slots must keep their prior contents.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm_maskstore_epi32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let mut arr = [-1, -1, -1, -1];
        let mask = _mm_setr_epi32(-1, 0, 0, -1);
        _mm_maskstore_epi32(arr.as_mut_ptr(), mask, a);
        let e = [1, -1, -1, 4];
        assert_eq!(arr, e);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_maskstore_epi32() {
        let a = _mm256_setr_epi32(1, 0x6d726f, 3, 42, 0x777161, 6, 7, 8);
        let mut arr = [-1, -1, -1, 0x776173, -1, 0x68657265, -1, -1];
        let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
        _mm256_maskstore_epi32(arr.as_mut_ptr(), mask, a);
        let e = [1, -1, -1, 42, -1, 6, 7, -1];
        assert_eq!(arr, e);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm_maskstore_epi64() {
        let a = _mm_setr_epi64x(1_i64, 2_i64);
        let mut arr = [-1_i64, -1_i64];
        let mask = _mm_setr_epi64x(0, -1);
        _mm_maskstore_epi64(arr.as_mut_ptr(), mask, a);
        let e = [-1, 2];
        assert_eq!(arr, e);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_maskstore_epi64() {
        let a = _mm256_setr_epi64x(1_i64, 2_i64, 3_i64, 4_i64);
        let mut arr = [-1_i64, -1_i64, -1_i64, -1_i64];
        let mask = _mm256_setr_epi64x(0, -1, -1, 0);
        _mm256_maskstore_epi64(arr.as_mut_ptr(), mask, a);
        let e = [-1, 2, 3, -1];
        assert_eq!(arr, e);
    }
4745
    // Lanewise max tests: with a = 2 and b = 4 everywhere, the result must
    // equal `b` for every element width and signedness.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_max_epi16() {
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_max_epi16(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_max_epi32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(4);
        let r = _mm256_max_epi32(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_max_epi8() {
        let a = _mm256_set1_epi8(2);
        let b = _mm256_set1_epi8(4);
        let r = _mm256_max_epi8(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_max_epu16() {
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_max_epu16(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_max_epu32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(4);
        let r = _mm256_max_epu32(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_max_epu8() {
        let a = _mm256_set1_epi8(2);
        let b = _mm256_set1_epi8(4);
        let r = _mm256_max_epu8(a, b);
        assert_eq_m256i(r, b);
    }

    // Lanewise min tests: same inputs, result must equal `a`.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_min_epi16() {
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_min_epi16(a, b);
        assert_eq_m256i(r, a);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_min_epi32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(4);
        let r = _mm256_min_epi32(a, b);
        assert_eq_m256i(r, a);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_min_epi8() {
        let a = _mm256_set1_epi8(2);
        let b = _mm256_set1_epi8(4);
        let r = _mm256_min_epi8(a, b);
        assert_eq_m256i(r, a);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_min_epu16() {
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_min_epu16(a, b);
        assert_eq_m256i(r, a);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_min_epu32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(4);
        let r = _mm256_min_epu32(a, b);
        assert_eq_m256i(r, a);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_min_epu8() {
        let a = _mm256_set1_epi8(2);
        let b = _mm256_set1_epi8(4);
        let r = _mm256_min_epu8(a, b);
        assert_eq_m256i(r, a);
    }
4841
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_movemask_epi8() {
        // Every byte has its MSB set, so all 32 mask bits are 1 => -1 as i32.
        let a = _mm256_set1_epi8(-1);
        let r = _mm256_movemask_epi8(a);
        let e = -1;
        assert_eq!(r, e);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mpsadbw_epu8() {
        // Sum of absolute differences over quads of bytes: |2-4| * 4 = 8.
        let a = _mm256_set1_epi8(2);
        let b = _mm256_set1_epi8(4);
        let r = _mm256_mpsadbw_epu8::<0>(a, b);
        let e = _mm256_set1_epi16(8);
        assert_eq_m256i(r, e);
    }

    // Widening multiplies use only the even-indexed (low) 32-bit lanes:
    // a lanes 0,2,4,6 = (0,0,2,2) times b lanes 0,2,4,6 = (1,3,5,7).
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_mul_epi32() {
        let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm256_mul_epi32(a, b);
        let e = _mm256_setr_epi64x(0, 0, 10, 14);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_mul_epu32() {
        let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm256_mul_epu32(a, b);
        let e = _mm256_setr_epi64x(0, 0, 10, 14);
        assert_eq_m256i(r, e);
    }

    // High-half multiply: 6535 * 6535 = 42_706_225; upper 16 bits = 651.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_mulhi_epi16() {
        let a = _mm256_set1_epi16(6535);
        let b = _mm256_set1_epi16(6535);
        let r = _mm256_mulhi_epi16(a, b);
        let e = _mm256_set1_epi16(651);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_mulhi_epu16() {
        let a = _mm256_set1_epi16(6535);
        let b = _mm256_set1_epi16(6535);
        let r = _mm256_mulhi_epu16(a, b);
        let e = _mm256_set1_epi16(651);
        assert_eq_m256i(r, e);
    }

    // Low-half multiply: 2 * 4 = 8 fits the lane, so no truncation occurs.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_mullo_epi16() {
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_mullo_epi16(a, b);
        let e = _mm256_set1_epi16(8);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_mullo_epi32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(4);
        let r = _mm256_mullo_epi32(a, b);
        let e = _mm256_set1_epi32(8);
        assert_eq_m256i(r, e);
    }
4912
4913    #[simd_test(enable = "avx2")]
4914    unsafe fn test_mm256_mulhrs_epi16() {
4915        let a = _mm256_set1_epi16(2);
4916        let b = _mm256_set1_epi16(4);
4917        let r = _mm256_mullo_epi16(a, b);
4918        let e = _mm256_set1_epi16(8);
4919        assert_eq_m256i(r, e);
4920    }
4921
4922    #[simd_test(enable = "avx2")]
4923    const unsafe fn test_mm256_or_si256() {
4924        let a = _mm256_set1_epi8(-1);
4925        let b = _mm256_set1_epi8(0);
4926        let r = _mm256_or_si256(a, b);
4927        assert_eq_m256i(r, a);
4928    }
4929
    // Pack tests: within each 128-bit half, the narrowed lanes of `a` come
    // first, then those of `b` — hence the 2,2,..,4,4,.. interleaving.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_packs_epi16() {
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_packs_epi16(a, b);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            2, 2, 2, 2, 2, 2, 2, 2,
            4, 4, 4, 4, 4, 4, 4, 4,
            2, 2, 2, 2, 2, 2, 2, 2,
            4, 4, 4, 4, 4, 4, 4, 4,
        );

        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_packs_epi32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(4);
        let r = _mm256_packs_epi32(a, b);
        let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);

        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_packus_epi16() {
        // Unsigned saturation; the small positive inputs pass through as-is.
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_packus_epi16(a, b);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            2, 2, 2, 2, 2, 2, 2, 2,
            4, 4, 4, 4, 4, 4, 4, 4,
            2, 2, 2, 2, 2, 2, 2, 2,
            4, 4, 4, 4, 4, 4, 4, 4,
        );

        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_packus_epi32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(4);
        let r = _mm256_packus_epi32(a, b);
        let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);

        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_sad_epu8() {
        // Sum of absolute byte differences per 64-bit lane: |2-4| * 8 = 16.
        let a = _mm256_set1_epi8(2);
        let b = _mm256_set1_epi8(4);
        let r = _mm256_sad_epu8(a, b);
        let e = _mm256_set1_epi64x(16);
        assert_eq_m256i(r, e);
    }
4990
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_shufflehi_epi16() {
        // Imm 0b00_01_01_11 picks indices 3,1,1,0 from the high quad of each
        // 128-bit half; the low quads (0..3 and 4..7) pass through unchanged.
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            0, 1, 2, 3, 11, 22, 33, 44,
            4, 5, 6, 7, 55, 66, 77, 88,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            0, 1, 2, 3, 44, 22, 22, 11,
            4, 5, 6, 7, 88, 66, 66, 55,
        );
        let r = _mm256_shufflehi_epi16::<0b00_01_01_11>(a);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_shufflelo_epi16() {
        // Same immediate as above, but applied to the low quad of each half.
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            11, 22, 33, 44, 0, 1, 2, 3,
            55, 66, 77, 88, 4, 5, 6, 7,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            44, 22, 22, 11, 0, 1, 2, 3,
            88, 66, 66, 55, 4, 5, 6, 7,
        );
        let r = _mm256_shufflelo_epi16::<0b00_01_01_11>(a);
        assert_eq_m256i(r, e);
    }

    // Sign tests: a negative lane in `b` negates the corresponding lane of `a`.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_sign_epi16() {
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(-1);
        let r = _mm256_sign_epi16(a, b);
        let e = _mm256_set1_epi16(-2);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_sign_epi32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(-1);
        let r = _mm256_sign_epi32(a, b);
        let e = _mm256_set1_epi32(-2);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_sign_epi8() {
        let a = _mm256_set1_epi8(2);
        let b = _mm256_set1_epi8(-1);
        let r = _mm256_sign_epi8(a, b);
        let e = _mm256_set1_epi8(-2);
        assert_eq_m256i(r, e);
    }
5049
    // `sll` tests: shift count comes from the low 64 bits of the second
    // operand; here the count 4 multiplies each lane by 16.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_sll_epi16() {
        let a = _mm256_set1_epi16(0xFF);
        let b = _mm_insert_epi16::<0>(_mm_set1_epi16(0), 4);
        let r = _mm256_sll_epi16(a, b);
        assert_eq_m256i(r, _mm256_set1_epi16(0xFF0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_sll_epi32() {
        let a = _mm256_set1_epi32(0xFFFF);
        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 4);
        let r = _mm256_sll_epi32(a, b);
        assert_eq_m256i(r, _mm256_set1_epi32(0xFFFF0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_sll_epi64() {
        let a = _mm256_set1_epi64x(0xFFFFFFFF);
        let b = _mm_insert_epi64::<0>(_mm_set1_epi64x(0), 4);
        let r = _mm256_sll_epi64(a, b);
        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF0));
    }

    // `slli` tests: same shifts but with an immediate count.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_slli_epi16() {
        assert_eq_m256i(
            _mm256_slli_epi16::<4>(_mm256_set1_epi16(0xFF)),
            _mm256_set1_epi16(0xFF0),
        );
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_slli_epi32() {
        assert_eq_m256i(
            _mm256_slli_epi32::<4>(_mm256_set1_epi32(0xFFFF)),
            _mm256_set1_epi32(0xFFFF0),
        );
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_slli_epi64() {
        assert_eq_m256i(
            _mm256_slli_epi64::<4>(_mm256_set1_epi64x(0xFFFFFFFF)),
            _mm256_set1_epi64x(0xFFFFFFFF0),
        );
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_slli_si256() {
        // Byte-wise shift left by 3 within each 128-bit lane.
        let a = _mm256_set1_epi64x(0xFFFFFFFF);
        let r = _mm256_slli_si256::<3>(a);
        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF000000));
    }

    // `sllv` tests: per-lane variable shift counts; 2 << 1 == 4 everywhere.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm_sllv_epi32() {
        let a = _mm_set1_epi32(2);
        let b = _mm_set1_epi32(1);
        let r = _mm_sllv_epi32(a, b);
        let e = _mm_set1_epi32(4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_sllv_epi32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(1);
        let r = _mm256_sllv_epi32(a, b);
        let e = _mm256_set1_epi32(4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm_sllv_epi64() {
        let a = _mm_set1_epi64x(2);
        let b = _mm_set1_epi64x(1);
        let r = _mm_sllv_epi64(a, b);
        let e = _mm_set1_epi64x(4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_sllv_epi64() {
        let a = _mm256_set1_epi64x(2);
        let b = _mm256_set1_epi64x(1);
        let r = _mm256_sllv_epi64(a, b);
        let e = _mm256_set1_epi64x(4);
        assert_eq_m256i(r, e);
    }
5140
    // Arithmetic right shifts sign-fill, so -1 >> 1 stays -1.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_sra_epi16() {
        let a = _mm256_set1_epi16(-1);
        let b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
        let r = _mm256_sra_epi16(a, b);
        assert_eq_m256i(r, _mm256_set1_epi16(-1));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_sra_epi32() {
        let a = _mm256_set1_epi32(-1);
        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 1);
        let r = _mm256_sra_epi32(a, b);
        assert_eq_m256i(r, _mm256_set1_epi32(-1));
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_srai_epi16() {
        assert_eq_m256i(
            _mm256_srai_epi16::<1>(_mm256_set1_epi16(-1)),
            _mm256_set1_epi16(-1),
        );
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_srai_epi32() {
        assert_eq_m256i(
            _mm256_srai_epi32::<1>(_mm256_set1_epi32(-1)),
            _mm256_set1_epi32(-1),
        );
    }

    // `srav`: per-lane variable arithmetic shift; 4 >> 1 == 2 everywhere.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm_srav_epi32() {
        let a = _mm_set1_epi32(4);
        let count = _mm_set1_epi32(1);
        let r = _mm_srav_epi32(a, count);
        let e = _mm_set1_epi32(2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_srav_epi32() {
        let a = _mm256_set1_epi32(4);
        let count = _mm256_set1_epi32(1);
        let r = _mm256_srav_epi32(a, count);
        let e = _mm256_set1_epi32(2);
        assert_eq_m256i(r, e);
    }
5190
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_srli_si256() {
        // Byte-wise shift right by 3 within each 128-bit lane independently;
        // zeros are shifted in at the top of each lane.
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm256_srli_si256::<3>(a);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            4, 5, 6, 7, 8, 9, 10, 11,
            12, 13, 14, 15, 16, 0, 0, 0,
            20, 21, 22, 23, 24, 25, 26, 27,
            28, 29, 30, 31, 32, 0, 0, 0,
        );
        assert_eq_m256i(r, e);
    }

    // `srl` tests: logical shift with the count in the second operand.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_srl_epi16() {
        let a = _mm256_set1_epi16(0xFF);
        let b = _mm_insert_epi16::<0>(_mm_set1_epi16(0), 4);
        let r = _mm256_srl_epi16(a, b);
        assert_eq_m256i(r, _mm256_set1_epi16(0xF));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_srl_epi32() {
        let a = _mm256_set1_epi32(0xFFFF);
        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 4);
        let r = _mm256_srl_epi32(a, b);
        assert_eq_m256i(r, _mm256_set1_epi32(0xFFF));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_srl_epi64() {
        let a = _mm256_set1_epi64x(0xFFFFFFFF);
        let b = _mm_setr_epi64x(4, 0);
        let r = _mm256_srl_epi64(a, b);
        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFF));
    }

    // `srli` tests: logical shift with an immediate count.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_srli_epi16() {
        assert_eq_m256i(
            _mm256_srli_epi16::<4>(_mm256_set1_epi16(0xFF)),
            _mm256_set1_epi16(0xF),
        );
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_srli_epi32() {
        assert_eq_m256i(
            _mm256_srli_epi32::<4>(_mm256_set1_epi32(0xFFFF)),
            _mm256_set1_epi32(0xFFF),
        );
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_srli_epi64() {
        assert_eq_m256i(
            _mm256_srli_epi64::<4>(_mm256_set1_epi64x(0xFFFFFFFF)),
            _mm256_set1_epi64x(0xFFFFFFF),
        );
    }

    // `srlv` tests: per-lane variable logical shift; 2 >> 1 == 1 everywhere.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm_srlv_epi32() {
        let a = _mm_set1_epi32(2);
        let count = _mm_set1_epi32(1);
        let r = _mm_srlv_epi32(a, count);
        let e = _mm_set1_epi32(1);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_srlv_epi32() {
        let a = _mm256_set1_epi32(2);
        let count = _mm256_set1_epi32(1);
        let r = _mm256_srlv_epi32(a, count);
        let e = _mm256_set1_epi32(1);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm_srlv_epi64() {
        let a = _mm_set1_epi64x(2);
        let count = _mm_set1_epi64x(1);
        let r = _mm_srlv_epi64(a, count);
        let e = _mm_set1_epi64x(1);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_srlv_epi64() {
        let a = _mm256_set1_epi64x(2);
        let count = _mm256_set1_epi64x(1);
        let r = _mm256_srlv_epi64(a, count);
        let e = _mm256_set1_epi64x(1);
        assert_eq_m256i(r, e);
    }
5294
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_stream_load_si256() {
        // Non-temporal load must round-trip the stored value exactly.
        let a = _mm256_set_epi64x(5, 6, 7, 8);
        let r = _mm256_stream_load_si256(core::ptr::addr_of!(a) as *const _);
        assert_eq_m256i(a, r);
    }

    // Subtraction tests: 4 - 2 == 2, which happens to equal `b` itself.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_sub_epi16() {
        let a = _mm256_set1_epi16(4);
        let b = _mm256_set1_epi16(2);
        let r = _mm256_sub_epi16(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_sub_epi32() {
        let a = _mm256_set1_epi32(4);
        let b = _mm256_set1_epi32(2);
        let r = _mm256_sub_epi32(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_sub_epi64() {
        let a = _mm256_set1_epi64x(4);
        let b = _mm256_set1_epi64x(2);
        let r = _mm256_sub_epi64(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_sub_epi8() {
        let a = _mm256_set1_epi8(4);
        let b = _mm256_set1_epi8(2);
        let r = _mm256_sub_epi8(a, b);
        assert_eq_m256i(r, b);
    }

    // Saturating subtraction: these inputs do not saturate, so the result
    // matches plain subtraction.
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_subs_epi16() {
        let a = _mm256_set1_epi16(4);
        let b = _mm256_set1_epi16(2);
        let r = _mm256_subs_epi16(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_subs_epi8() {
        let a = _mm256_set1_epi8(4);
        let b = _mm256_set1_epi8(2);
        let r = _mm256_subs_epi8(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_subs_epu16() {
        let a = _mm256_set1_epi16(4);
        let b = _mm256_set1_epi16(2);
        let r = _mm256_subs_epu16(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_subs_epu8() {
        let a = _mm256_set1_epi8(4);
        let b = _mm256_set1_epi8(2);
        let r = _mm256_subs_epu8(a, b);
        assert_eq_m256i(r, b);
    }
5365
5366    #[simd_test(enable = "avx2")]
5367    const unsafe fn test_mm256_xor_si256() {
5368        let a = _mm256_set1_epi8(5);
5369        let b = _mm256_set1_epi8(3);
5370        let r = _mm256_xor_si256(a, b);
5371        assert_eq_m256i(r, _mm256_set1_epi8(6));
5372    }
5373
    #[simd_test(enable = "avx2")]
    const unsafe fn test_mm256_alignr_epi8() {
        // vpalignr concatenates b:a per 128-bit lane and shifts right by the
        // immediate byte count. Edge cases checked below: count > 32 yields
        // zero, count 16 yields `a`, count 0 yields `b`.
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        #[rustfmt::skip]
        let b = _mm256_setr_epi8(
            -1, -2, -3, -4, -5, -6, -7, -8,
            -9, -10, -11, -12, -13, -14, -15, -16,
            -17, -18, -19, -20, -21, -22, -23, -24,
            -25, -26, -27, -28, -29, -30, -31, -32,
        );
        // Shift count 33 (> 32): everything is shifted out, result is zero.
        let r = _mm256_alignr_epi8::<33>(a, b);
        assert_eq_m256i(r, _mm256_set1_epi8(0));

        // Count 17 (> 16): only bytes of `a` remain, zero-filled at the top.
        let r = _mm256_alignr_epi8::<17>(a, b);
        #[rustfmt::skip]
        let expected = _mm256_setr_epi8(
            2, 3, 4, 5, 6, 7, 8, 9,
            10, 11, 12, 13, 14, 15, 16, 0,
            18, 19, 20, 21, 22, 23, 24, 25,
            26, 27, 28, 29, 30, 31, 32, 0,
        );
        assert_eq_m256i(r, expected);

        // Count 4: tail of `b` followed by the head of `a` in each lane.
        let r = _mm256_alignr_epi8::<4>(a, b);
        #[rustfmt::skip]
        let expected = _mm256_setr_epi8(
            -5, -6, -7, -8, -9, -10, -11, -12,
            -13, -14, -15, -16, 1, 2, 3, 4,
            -21, -22, -23, -24, -25, -26, -27, -28,
            -29, -30, -31, -32, 17, 18, 19, 20,
        );
        assert_eq_m256i(r, expected);

        // Count 15: one byte of `b` left, the rest from `a`.
        let r = _mm256_alignr_epi8::<15>(a, b);
        #[rustfmt::skip]
        let expected = _mm256_setr_epi8(
            -16, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            -32, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31,
        );
        assert_eq_m256i(r, expected);

        // Count 0: no shift, result is `b` unchanged.
        let r = _mm256_alignr_epi8::<0>(a, b);
        assert_eq_m256i(r, b);

        // Count 16: exactly one lane width, result is `a` unchanged.
        let r = _mm256_alignr_epi8::<16>(a, b);
        assert_eq_m256i(r, a);
    }
5429
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_shuffle_epi8() {
        // `_mm256_shuffle_epi8` shuffles bytes within each 128-bit lane of
        // `a`: output byte i is the byte of `a`'s lane selected by the low
        // four bits of `b[i]`, or 0 when the high bit of `b[i]` is set.
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        // Selector 128 has its high bit set, so that output byte is zeroed
        // (see `expected[1]` and `expected[17]` below). Selectors >= 16 wrap
        // via the low four bits (24 -> 8, 19 -> 3).
        #[rustfmt::skip]
        let b = _mm256_setr_epi8(
            4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
            12, 5, 5, 10, 4, 1, 8, 0,
            4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
            12, 5, 5, 10, 4, 1, 8, 0,
        );
        #[rustfmt::skip]
        let expected = _mm256_setr_epi8(
            5, 0, 5, 4, 9, 13, 7, 4,
            13, 6, 6, 11, 5, 2, 9, 1,
            21, 0, 21, 20, 25, 29, 23, 20,
            29, 22, 22, 27, 21, 18, 25, 17,
        );
        let r = _mm256_shuffle_epi8(a, b);
        assert_eq_m256i(r, expected);
    }
5456
5457    #[simd_test(enable = "avx2")]
5458    unsafe fn test_mm256_permutevar8x32_epi32() {
5459        let a = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800);
5460        let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5461        let expected = _mm256_setr_epi32(600, 100, 600, 200, 800, 700, 400, 500);
5462        let r = _mm256_permutevar8x32_epi32(a, b);
5463        assert_eq_m256i(r, expected);
5464    }
5465
5466    #[simd_test(enable = "avx2")]
5467    const unsafe fn test_mm256_permute4x64_epi64() {
5468        let a = _mm256_setr_epi64x(100, 200, 300, 400);
5469        let expected = _mm256_setr_epi64x(400, 100, 200, 100);
5470        let r = _mm256_permute4x64_epi64::<0b00010011>(a);
5471        assert_eq_m256i(r, expected);
5472    }
5473
5474    #[simd_test(enable = "avx2")]
5475    const unsafe fn test_mm256_permute2x128_si256() {
5476        let a = _mm256_setr_epi64x(100, 200, 500, 600);
5477        let b = _mm256_setr_epi64x(300, 400, 700, 800);
5478        let r = _mm256_permute2x128_si256::<0b00_01_00_11>(a, b);
5479        let e = _mm256_setr_epi64x(700, 800, 500, 600);
5480        assert_eq_m256i(r, e);
5481    }
5482
5483    #[simd_test(enable = "avx2")]
5484    const unsafe fn test_mm256_permute4x64_pd() {
5485        let a = _mm256_setr_pd(1., 2., 3., 4.);
5486        let r = _mm256_permute4x64_pd::<0b00_01_00_11>(a);
5487        let e = _mm256_setr_pd(4., 1., 2., 1.);
5488        assert_eq_m256d(r, e);
5489    }
5490
5491    #[simd_test(enable = "avx2")]
5492    unsafe fn test_mm256_permutevar8x32_ps() {
5493        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5494        let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5495        let r = _mm256_permutevar8x32_ps(a, b);
5496        let e = _mm256_setr_ps(6., 1., 6., 2., 8., 7., 4., 5.);
5497        assert_eq_m256(r, e);
5498    }
5499
5500    #[simd_test(enable = "avx2")]
5501    unsafe fn test_mm_i32gather_epi32() {
5502        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5503        // A multiplier of 4 is word-addressing
5504        let r = _mm_i32gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
5505        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
5506    }
5507
5508    #[simd_test(enable = "avx2")]
5509    unsafe fn test_mm_mask_i32gather_epi32() {
5510        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5511        // A multiplier of 4 is word-addressing
5512        let r = _mm_mask_i32gather_epi32::<4>(
5513            _mm_set1_epi32(256),
5514            arr.as_ptr(),
5515            _mm_setr_epi32(0, 16, 64, 96),
5516            _mm_setr_epi32(-1, -1, -1, 0),
5517        );
5518        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
5519    }
5520
5521    #[simd_test(enable = "avx2")]
5522    unsafe fn test_mm256_i32gather_epi32() {
5523        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5524        // A multiplier of 4 is word-addressing
5525        let r =
5526            _mm256_i32gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
5527        assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
5528    }
5529
5530    #[simd_test(enable = "avx2")]
5531    unsafe fn test_mm256_mask_i32gather_epi32() {
5532        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5533        // A multiplier of 4 is word-addressing
5534        let r = _mm256_mask_i32gather_epi32::<4>(
5535            _mm256_set1_epi32(256),
5536            arr.as_ptr(),
5537            _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
5538            _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0),
5539        );
5540        assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 64, 256, 256, 256, 256, 256));
5541    }
5542
5543    #[simd_test(enable = "avx2")]
5544    unsafe fn test_mm_i32gather_ps() {
5545        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5546        // A multiplier of 4 is word-addressing for f32s
5547        let r = _mm_i32gather_ps::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
5548        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
5549    }
5550
5551    #[simd_test(enable = "avx2")]
5552    unsafe fn test_mm_mask_i32gather_ps() {
5553        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5554        // A multiplier of 4 is word-addressing for f32s
5555        let r = _mm_mask_i32gather_ps::<4>(
5556            _mm_set1_ps(256.0),
5557            arr.as_ptr(),
5558            _mm_setr_epi32(0, 16, 64, 96),
5559            _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
5560        );
5561        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
5562    }
5563
5564    #[simd_test(enable = "avx2")]
5565    unsafe fn test_mm256_i32gather_ps() {
5566        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5567        // A multiplier of 4 is word-addressing for f32s
5568        let r =
5569            _mm256_i32gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
5570        assert_eq_m256(r, _mm256_setr_ps(0.0, 16.0, 32.0, 48.0, 1.0, 2.0, 3.0, 4.0));
5571    }
5572
5573    #[simd_test(enable = "avx2")]
5574    unsafe fn test_mm256_mask_i32gather_ps() {
5575        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5576        // A multiplier of 4 is word-addressing for f32s
5577        let r = _mm256_mask_i32gather_ps::<4>(
5578            _mm256_set1_ps(256.0),
5579            arr.as_ptr(),
5580            _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
5581            _mm256_setr_ps(-1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0),
5582        );
5583        assert_eq_m256(
5584            r,
5585            _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0),
5586        );
5587    }
5588
5589    #[simd_test(enable = "avx2")]
5590    unsafe fn test_mm_i32gather_epi64() {
5591        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5592        // A multiplier of 8 is word-addressing for i64s
5593        let r = _mm_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0));
5594        assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
5595    }
5596
5597    #[simd_test(enable = "avx2")]
5598    unsafe fn test_mm_mask_i32gather_epi64() {
5599        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5600        // A multiplier of 8 is word-addressing for i64s
5601        let r = _mm_mask_i32gather_epi64::<8>(
5602            _mm_set1_epi64x(256),
5603            arr.as_ptr(),
5604            _mm_setr_epi32(16, 16, 16, 16),
5605            _mm_setr_epi64x(-1, 0),
5606        );
5607        assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
5608    }
5609
5610    #[simd_test(enable = "avx2")]
5611    unsafe fn test_mm256_i32gather_epi64() {
5612        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5613        // A multiplier of 8 is word-addressing for i64s
5614        let r = _mm256_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
5615        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
5616    }
5617
5618    #[simd_test(enable = "avx2")]
5619    unsafe fn test_mm256_mask_i32gather_epi64() {
5620        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5621        // A multiplier of 8 is word-addressing for i64s
5622        let r = _mm256_mask_i32gather_epi64::<8>(
5623            _mm256_set1_epi64x(256),
5624            arr.as_ptr(),
5625            _mm_setr_epi32(0, 16, 64, 96),
5626            _mm256_setr_epi64x(-1, -1, -1, 0),
5627        );
5628        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
5629    }
5630
5631    #[simd_test(enable = "avx2")]
5632    unsafe fn test_mm_i32gather_pd() {
5633        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5634        // A multiplier of 8 is word-addressing for f64s
5635        let r = _mm_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0));
5636        assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
5637    }
5638
5639    #[simd_test(enable = "avx2")]
5640    unsafe fn test_mm_mask_i32gather_pd() {
5641        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5642        // A multiplier of 8 is word-addressing for f64s
5643        let r = _mm_mask_i32gather_pd::<8>(
5644            _mm_set1_pd(256.0),
5645            arr.as_ptr(),
5646            _mm_setr_epi32(16, 16, 16, 16),
5647            _mm_setr_pd(-1.0, 0.0),
5648        );
5649        assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
5650    }
5651
5652    #[simd_test(enable = "avx2")]
5653    unsafe fn test_mm256_i32gather_pd() {
5654        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5655        // A multiplier of 8 is word-addressing for f64s
5656        let r = _mm256_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
5657        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
5658    }
5659
5660    #[simd_test(enable = "avx2")]
5661    unsafe fn test_mm256_mask_i32gather_pd() {
5662        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5663        // A multiplier of 8 is word-addressing for f64s
5664        let r = _mm256_mask_i32gather_pd::<8>(
5665            _mm256_set1_pd(256.0),
5666            arr.as_ptr(),
5667            _mm_setr_epi32(0, 16, 64, 96),
5668            _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
5669        );
5670        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
5671    }
5672
5673    #[simd_test(enable = "avx2")]
5674    unsafe fn test_mm_i64gather_epi32() {
5675        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5676        // A multiplier of 4 is word-addressing
5677        let r = _mm_i64gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
5678        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 0, 0));
5679    }
5680
5681    #[simd_test(enable = "avx2")]
5682    unsafe fn test_mm_mask_i64gather_epi32() {
5683        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5684        // A multiplier of 4 is word-addressing
5685        let r = _mm_mask_i64gather_epi32::<4>(
5686            _mm_set1_epi32(256),
5687            arr.as_ptr(),
5688            _mm_setr_epi64x(0, 16),
5689            _mm_setr_epi32(-1, 0, -1, 0),
5690        );
5691        assert_eq_m128i(r, _mm_setr_epi32(0, 256, 0, 0));
5692    }
5693
5694    #[simd_test(enable = "avx2")]
5695    unsafe fn test_mm256_i64gather_epi32() {
5696        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5697        // A multiplier of 4 is word-addressing
5698        let r = _mm256_i64gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
5699        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
5700    }
5701
5702    #[simd_test(enable = "avx2")]
5703    unsafe fn test_mm256_mask_i64gather_epi32() {
5704        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5705        // A multiplier of 4 is word-addressing
5706        let r = _mm256_mask_i64gather_epi32::<4>(
5707            _mm_set1_epi32(256),
5708            arr.as_ptr(),
5709            _mm256_setr_epi64x(0, 16, 64, 96),
5710            _mm_setr_epi32(-1, -1, -1, 0),
5711        );
5712        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
5713    }
5714
5715    #[simd_test(enable = "avx2")]
5716    unsafe fn test_mm_i64gather_ps() {
5717        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5718        // A multiplier of 4 is word-addressing for f32s
5719        let r = _mm_i64gather_ps::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
5720        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 0.0, 0.0));
5721    }
5722
5723    #[simd_test(enable = "avx2")]
5724    unsafe fn test_mm_mask_i64gather_ps() {
5725        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5726        // A multiplier of 4 is word-addressing for f32s
5727        let r = _mm_mask_i64gather_ps::<4>(
5728            _mm_set1_ps(256.0),
5729            arr.as_ptr(),
5730            _mm_setr_epi64x(0, 16),
5731            _mm_setr_ps(-1.0, 0.0, -1.0, 0.0),
5732        );
5733        assert_eq_m128(r, _mm_setr_ps(0.0, 256.0, 0.0, 0.0));
5734    }
5735
5736    #[simd_test(enable = "avx2")]
5737    unsafe fn test_mm256_i64gather_ps() {
5738        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5739        // A multiplier of 4 is word-addressing for f32s
5740        let r = _mm256_i64gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
5741        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
5742    }
5743
5744    #[simd_test(enable = "avx2")]
5745    unsafe fn test_mm256_mask_i64gather_ps() {
5746        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5747        // A multiplier of 4 is word-addressing for f32s
5748        let r = _mm256_mask_i64gather_ps::<4>(
5749            _mm_set1_ps(256.0),
5750            arr.as_ptr(),
5751            _mm256_setr_epi64x(0, 16, 64, 96),
5752            _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
5753        );
5754        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
5755    }
5756
5757    #[simd_test(enable = "avx2")]
5758    unsafe fn test_mm_i64gather_epi64() {
5759        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5760        // A multiplier of 8 is word-addressing for i64s
5761        let r = _mm_i64gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
5762        assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
5763    }
5764
5765    #[simd_test(enable = "avx2")]
5766    unsafe fn test_mm_mask_i64gather_epi64() {
5767        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5768        // A multiplier of 8 is word-addressing for i64s
5769        let r = _mm_mask_i64gather_epi64::<8>(
5770            _mm_set1_epi64x(256),
5771            arr.as_ptr(),
5772            _mm_setr_epi64x(16, 16),
5773            _mm_setr_epi64x(-1, 0),
5774        );
5775        assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
5776    }
5777
5778    #[simd_test(enable = "avx2")]
5779    unsafe fn test_mm256_i64gather_epi64() {
5780        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5781        // A multiplier of 8 is word-addressing for i64s
5782        let r = _mm256_i64gather_epi64::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
5783        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
5784    }
5785
5786    #[simd_test(enable = "avx2")]
5787    unsafe fn test_mm256_mask_i64gather_epi64() {
5788        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5789        // A multiplier of 8 is word-addressing for i64s
5790        let r = _mm256_mask_i64gather_epi64::<8>(
5791            _mm256_set1_epi64x(256),
5792            arr.as_ptr(),
5793            _mm256_setr_epi64x(0, 16, 64, 96),
5794            _mm256_setr_epi64x(-1, -1, -1, 0),
5795        );
5796        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
5797    }
5798
5799    #[simd_test(enable = "avx2")]
5800    unsafe fn test_mm_i64gather_pd() {
5801        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5802        // A multiplier of 8 is word-addressing for f64s
5803        let r = _mm_i64gather_pd::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
5804        assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
5805    }
5806
5807    #[simd_test(enable = "avx2")]
5808    unsafe fn test_mm_mask_i64gather_pd() {
5809        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5810        // A multiplier of 8 is word-addressing for f64s
5811        let r = _mm_mask_i64gather_pd::<8>(
5812            _mm_set1_pd(256.0),
5813            arr.as_ptr(),
5814            _mm_setr_epi64x(16, 16),
5815            _mm_setr_pd(-1.0, 0.0),
5816        );
5817        assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
5818    }
5819
5820    #[simd_test(enable = "avx2")]
5821    unsafe fn test_mm256_i64gather_pd() {
5822        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5823        // A multiplier of 8 is word-addressing for f64s
5824        let r = _mm256_i64gather_pd::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
5825        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
5826    }
5827
5828    #[simd_test(enable = "avx2")]
5829    unsafe fn test_mm256_mask_i64gather_pd() {
5830        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5831        // A multiplier of 8 is word-addressing for f64s
5832        let r = _mm256_mask_i64gather_pd::<8>(
5833            _mm256_set1_pd(256.0),
5834            arr.as_ptr(),
5835            _mm256_setr_epi64x(0, 16, 64, 96),
5836            _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
5837        );
5838        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
5839    }
5840
5841    #[simd_test(enable = "avx2")]
5842    const unsafe fn test_mm256_extract_epi8() {
5843        #[rustfmt::skip]
5844        let a = _mm256_setr_epi8(
5845            -1, 1, 2, 3, 4, 5, 6, 7,
5846            8, 9, 10, 11, 12, 13, 14, 15,
5847            16, 17, 18, 19, 20, 21, 22, 23,
5848            24, 25, 26, 27, 28, 29, 30, 31
5849        );
5850        let r1 = _mm256_extract_epi8::<0>(a);
5851        let r2 = _mm256_extract_epi8::<3>(a);
5852        assert_eq!(r1, 0xFF);
5853        assert_eq!(r2, 3);
5854    }
5855
5856    #[simd_test(enable = "avx2")]
5857    const unsafe fn test_mm256_extract_epi16() {
5858        #[rustfmt::skip]
5859        let a = _mm256_setr_epi16(
5860            -1, 1, 2, 3, 4, 5, 6, 7,
5861            8, 9, 10, 11, 12, 13, 14, 15,
5862        );
5863        let r1 = _mm256_extract_epi16::<0>(a);
5864        let r2 = _mm256_extract_epi16::<3>(a);
5865        assert_eq!(r1, 0xFFFF);
5866        assert_eq!(r2, 3);
5867    }
5868}