core/stdarch/crates/core_arch/src/x86/
sse41.rs

1//! Streaming SIMD Extensions 4.1 (SSE4.1)
2
3use crate::core_arch::{simd::*, x86::*};
4use crate::intrinsics::simd::*;
5
6#[cfg(test)]
7use stdarch_test::assert_instr;
8
// SSE4 rounding constants: bits [1:0] select the rounding mode, bit 2
// (_MM_FROUND_CUR_DIRECTION) defers to MXCSR.RC instead, and bit 3
// (_MM_FROUND_NO_EXC) suppresses precision exceptions.
/// round to nearest
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEAREST_INT: i32 = 0x00;
/// round down
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEG_INF: i32 = 0x01;
/// round up
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_POS_INF: i32 = 0x02;
/// truncate
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_ZERO: i32 = 0x03;
/// use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CUR_DIRECTION: i32 = 0x04;
/// do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RAISE_EXC: i32 = 0x00;
/// suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NO_EXC: i32 = 0x08;
/// round to nearest and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NINT: i32 = 0x00;
/// round down and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_FLOOR: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF;
/// round up and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CEIL: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF;
/// truncate and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TRUNC: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO;
/// use MXCSR.RC and do not suppress exceptions; see
/// `vendor::_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RINT: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION;
/// use MXCSR.RC and suppress exceptions; see `vendor::_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NEARBYINT: i32 = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION;
50
/// Blend packed 8-bit integers from `a` and `b` using `mask`
///
/// The high bit of each corresponding mask byte determines the selection.
/// If the high bit is set, the element of `b` is selected.
/// Otherwise, the element of `a` is selected.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i {
    unsafe {
        // `x < 0` is true exactly when the byte's sign (high) bit is set,
        // turning the user mask into an all-ones/all-zeros lane mask.
        let mask: i8x16 = simd_lt(mask.as_i8x16(), i8x16::ZERO);
        transmute(simd_select(mask, b.as_i8x16(), a.as_i8x16()))
    }
}
69
/// Blend packed 16-bit integers from `a` and `b` using the mask `IMM8`.
///
/// The mask bits determine the selection. A clear bit selects the
/// corresponding element of `a`, and a set bit the corresponding
/// element of `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xB1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_blend_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // In `simd_shuffle!` index space, lanes 0..=7 come from `a` and
        // 8..=15 from `b`; bit i of IMM8 picks between lane i and lane i+8.
        transmute::<i16x8, _>(simd_shuffle!(
            a.as_i16x8(),
            b.as_i16x8(),
            [
                [0, 8][IMM8 as usize & 1],
                [1, 9][(IMM8 >> 1) as usize & 1],
                [2, 10][(IMM8 >> 2) as usize & 1],
                [3, 11][(IMM8 >> 3) as usize & 1],
                [4, 12][(IMM8 >> 4) as usize & 1],
                [5, 13][(IMM8 >> 5) as usize & 1],
                [6, 14][(IMM8 >> 6) as usize & 1],
                [7, 15][(IMM8 >> 7) as usize & 1],
            ]
        ))
    }
}
102
/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using `mask`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
    unsafe {
        // Reinterpret the float mask as integers; a lane of `b` is taken
        // where the lane's sign (high) bit is set, mirroring BLENDVPD.
        let mask: i64x2 = simd_lt(transmute::<_, i64x2>(mask), i64x2::ZERO);
        transmute(simd_select(mask, b.as_f64x2(), a.as_f64x2()))
    }
}
118
/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using `mask`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
    unsafe {
        // Reinterpret the float mask as integers; a lane of `b` is taken
        // where the lane's sign (high) bit is set, mirroring BLENDVPS.
        let mask: i32x4 = simd_lt(transmute::<_, i32x4>(mask), i32x4::ZERO);
        transmute(simd_select(mask, b.as_f32x4(), a.as_f32x4()))
    }
}
134
/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using control mask `IMM2`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
// Note: LLVM7 prefers the single-precision floating-point domain when possible
// see https://bugs.llvm.org/show_bug.cgi?id=38195
// #[cfg_attr(test, assert_instr(blendpd, IMM2 = 0b10))]
#[cfg_attr(test, assert_instr(blendps, IMM2 = 0b10))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_blend_pd<const IMM2: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM2, 2);
    unsafe {
        // Shuffle index space: lanes 0..=1 are `a`, 2..=3 are `b`;
        // bit i of IMM2 picks lane i of `a` or lane i of `b`.
        transmute::<f64x2, _>(simd_shuffle!(
            a.as_f64x2(),
            b.as_f64x2(),
            [[0, 2][IMM2 as usize & 1], [1, 3][(IMM2 >> 1) as usize & 1]]
        ))
    }
}
158
/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using mask `IMM4`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendps, IMM4 = 0b0101))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_blend_ps<const IMM4: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM4, 4);
    unsafe {
        // Shuffle index space: lanes 0..=3 are `a`, 4..=7 are `b`;
        // bit i of IMM4 picks lane i of `a` or lane i of `b`.
        transmute::<f32x4, _>(simd_shuffle!(
            a.as_f32x4(),
            b.as_f32x4(),
            [
                [0, 4][IMM4 as usize & 1],
                [1, 5][(IMM4 >> 1) as usize & 1],
                [2, 6][(IMM4 >> 2) as usize & 1],
                [3, 7][(IMM4 >> 3) as usize & 1],
            ]
        ))
    }
}
184
/// Extracts a single-precision (32-bit) floating-point element from `a`,
/// selected with `IMM8`. The returned `i32` stores the float's bit-pattern,
/// and may be converted back to a floating point number via casting.
///
/// # Example
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       #[allow(unused_unsafe)] // FIXME remove after stdarch bump in rustc
/// #       unsafe fn worker() { unsafe {
/// let mut float_store = vec![1.0, 1.0, 2.0, 3.0];
/// let simd_floats = _mm_set_ps(2.5, 5.0, 7.5, 10.0);
/// let x: i32 = _mm_extract_ps::<2>(simd_floats);
/// float_store.push(f32::from_bits(x as u32));
/// assert_eq!(float_store, vec![1.0, 1.0, 2.0, 3.0, 5.0]);
/// #       }}
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(extractps, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_extract_ps<const IMM8: i32>(a: __m128) -> i32 {
    // Only 4 lanes, so the immediate is restricted to 2 bits.
    static_assert_uimm_bits!(IMM8, 2);
    // `to_bits` preserves the float's raw bit pattern in the integer result.
    unsafe { simd_extract!(a, IMM8 as u32, f32).to_bits() as i32 }
}
221
/// Extracts an 8-bit integer from `a`, selected with `IMM8`. Returns a 32-bit
/// integer containing the zero-extended integer data.
///
/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pextrb, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_extract_epi8<const IMM8: i32>(a: __m128i) -> i32 {
    static_assert_uimm_bits!(IMM8, 4);
    // Extracting as `u8` (not `i8`) gives the zero-extension PEXTRB performs.
    unsafe { simd_extract!(a.as_u8x16(), IMM8 as u32, u8) as i32 }
}
238
/// Extracts a 32-bit integer from `a` selected with `IMM8`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(extractps, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_extract_epi32<const IMM8: i32>(a: __m128i) -> i32 {
    // Only 4 lanes, so the immediate is restricted to 2 bits.
    static_assert_uimm_bits!(IMM8, 2);
    unsafe { simd_extract!(a.as_i32x4(), IMM8 as u32, i32) }
}
252
/// Select a single value in `b` to store at some position in `a`,
/// Then zero elements according to `IMM8`.
///
/// `IMM8` specifies which bits from operand `b` will be copied, which bits in
/// the result they will be copied to, and which bits in the result will be
/// cleared. The following assignments are made:
///
/// * Bits `[7:6]` specify the bits to copy from operand `b`:
///     - `00`: Selects bits `[31:0]` from operand `b`.
///     - `01`: Selects bits `[63:32]` from operand `b`.
///     - `10`: Selects bits `[95:64]` from operand `b`.
///     - `11`: Selects bits `[127:96]` from operand `b`.
///
/// * Bits `[5:4]` specify the bits in the result to which the selected bits
///   from operand `b` are copied:
///     - `00`: Copies the selected bits from `b` to result bits `[31:0]`.
///     - `01`: Copies the selected bits from `b` to result bits `[63:32]`.
///     - `10`: Copies the selected bits from `b` to result bits `[95:64]`.
///     - `11`: Copies the selected bits from `b` to result bits `[127:96]`.
///
/// * Bits `[3:0]`: If any of these bits are set, the corresponding result
///   element is cleared.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(insertps, IMM8 = 0b1010))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_insert_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM8, 8);
    // Delegates to the LLVM `insertps` intrinsic, which implements the full
    // select/insert/zero behavior described above.
    unsafe { insertps(a, b, IMM8 as u8) }
}
286
/// Returns a copy of `a` with the 8-bit integer from `i` inserted at a
/// location specified by `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrb, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_insert_epi8<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
    static_assert_uimm_bits!(IMM8, 4);
    // PINSRB only uses the low byte of `i`, hence the `as i8` truncation.
    unsafe { transmute(simd_insert!(a.as_i8x16(), IMM8 as u32, i as i8)) }
}
301
/// Returns a copy of `a` with the 32-bit integer from `i` inserted at a
/// location specified by `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrd, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_insert_epi32<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
    // Only 4 lanes, so the immediate is restricted to 2 bits.
    static_assert_uimm_bits!(IMM8, 2);
    unsafe { transmute(simd_insert!(a.as_i32x4(), IMM8 as u32, i)) }
}
316
/// Compares packed 8-bit integers in `a` and `b` and returns packed maximum
/// values in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i {
    // Lanewise signed max on i8x16; lowers to `pmaxsb`.
    unsafe { simd_imax(a.as_i8x16(), b.as_i8x16()).as_m128i() }
}
329
/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
/// maximum.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i {
    // Lanewise unsigned max on u16x8; lowers to `pmaxuw`.
    unsafe { simd_imax(a.as_u16x8(), b.as_u16x8()).as_m128i() }
}
342
/// Compares packed 32-bit integers in `a` and `b`, and returns packed maximum
/// values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i {
    // Lanewise signed max on i32x4; lowers to `pmaxsd`.
    unsafe { simd_imax(a.as_i32x4(), b.as_i32x4()).as_m128i() }
}
355
/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
/// maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i {
    // Lanewise unsigned max on u32x4; lowers to `pmaxud`.
    unsafe { simd_imax(a.as_u32x4(), b.as_u32x4()).as_m128i() }
}
368
/// Compares packed 8-bit integers in `a` and `b` and returns packed minimum
/// values in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i {
    // Lanewise signed min on i8x16; lowers to `pminsb`.
    unsafe { simd_imin(a.as_i8x16(), b.as_i8x16()).as_m128i() }
}
381
/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
/// minimum.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i {
    // Lanewise unsigned min on u16x8; lowers to `pminuw`.
    unsafe { simd_imin(a.as_u16x8(), b.as_u16x8()).as_m128i() }
}
394
/// Compares packed 32-bit integers in `a` and `b`, and returns packed minimum
/// values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i {
    // Lanewise signed min on i32x4; lowers to `pminsd`.
    unsafe { simd_imin(a.as_i32x4(), b.as_i32x4()).as_m128i() }
}
407
/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
/// minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
    // Lanewise unsigned min on u32x4; lowers to `pminud`.
    unsafe { simd_imin(a.as_u32x4(), b.as_u32x4()).as_m128i() }
}
420
/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using unsigned saturation
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(packusdw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i {
    // Delegates to the LLVM `packusdw` intrinsic for the saturating pack.
    unsafe { transmute(packusdw(a.as_i32x4(), b.as_i32x4())) }
}
432
/// Compares packed 64-bit integers in `a` and `b` for equality
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i {
    // `simd_eq` produces all-ones for equal lanes, all-zeros otherwise.
    unsafe { transmute(simd_eq::<_, i64x2>(a.as_i64x2(), b.as_i64x2())) }
}
444
/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i {
    unsafe {
        let a = a.as_i8x16();
        // Keep only the low 8 bytes, then sign-extend each lane to 16 bits.
        let a: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        transmute(simd_cast::<_, i16x8>(a))
    }
}
460
/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i {
    unsafe {
        let a = a.as_i8x16();
        // Keep only the low 4 bytes, then sign-extend each lane to 32 bits.
        let a: i8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute(simd_cast::<_, i32x4>(a))
    }
}
476
/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed
/// 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i {
    unsafe {
        let a = a.as_i8x16();
        // Keep only the low 2 bytes, then sign-extend each lane to 64 bits.
        let a: i8x2 = simd_shuffle!(a, a, [0, 1]);
        transmute(simd_cast::<_, i64x2>(a))
    }
}
493
/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i {
    unsafe {
        let a = a.as_i16x8();
        // Keep only the low 4 words, then sign-extend each lane to 32 bits.
        let a: i16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute(simd_cast::<_, i32x4>(a))
    }
}
509
/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i {
    unsafe {
        let a = a.as_i16x8();
        // Keep only the low 2 words, then sign-extend each lane to 64 bits.
        let a: i16x2 = simd_shuffle!(a, a, [0, 1]);
        transmute(simd_cast::<_, i64x2>(a))
    }
}
525
/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i {
    unsafe {
        let a = a.as_i32x4();
        // Keep only the low 2 dwords, then sign-extend each lane to 64 bits.
        let a: i32x2 = simd_shuffle!(a, a, [0, 1]);
        transmute(simd_cast::<_, i64x2>(a))
    }
}
541
/// Zero-extends packed unsigned 8-bit integers in `a` to packed 16-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i {
    unsafe {
        let a = a.as_u8x16();
        // Keep only the low 8 bytes; the unsigned source type makes the
        // widening cast a zero-extension.
        let a: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        transmute(simd_cast::<_, i16x8>(a))
    }
}
557
/// Zero-extends packed unsigned 8-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i {
    unsafe {
        let a = a.as_u8x16();
        // Keep only the low 4 bytes; the unsigned source type makes the
        // widening cast a zero-extension.
        let a: u8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute(simd_cast::<_, i32x4>(a))
    }
}
573
/// Zero-extends packed unsigned 8-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i {
    unsafe {
        let a = a.as_u8x16();
        // Keep only the low 2 bytes; the unsigned source type makes the
        // widening cast a zero-extension.
        let a: u8x2 = simd_shuffle!(a, a, [0, 1]);
        transmute(simd_cast::<_, i64x2>(a))
    }
}
589
/// Zero-extends packed unsigned 16-bit integers in `a`
/// to packed 32-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i {
    unsafe {
        let a = a.as_u16x8();
        // Keep only the low 4 words; the unsigned source type makes the
        // widening cast a zero-extension.
        let a: u16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute(simd_cast::<_, i32x4>(a))
    }
}
606
/// Zero-extends packed unsigned 16-bit integers in `a`
/// to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i {
    unsafe {
        let a = a.as_u16x8();
        // Keep only the low 2 words; the unsigned source type makes the
        // widening cast a zero-extension.
        let a: u16x2 = simd_shuffle!(a, a, [0, 1]);
        transmute(simd_cast::<_, i64x2>(a))
    }
}
623
/// Zero-extends packed unsigned 32-bit integers in `a`
/// to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i {
    unsafe {
        let a = a.as_u32x4();
        // Keep only the low 2 dwords; the unsigned source type makes the
        // widening cast a zero-extension.
        let a: u32x2 = simd_shuffle!(a, a, [0, 1]);
        transmute(simd_cast::<_, i64x2>(a))
    }
}
640
641/// Returns the dot product of two __m128d vectors.
642///
643/// `IMM8[1:0]` is the broadcast mask, and `IMM8[5:4]` is the condition mask.
644/// If a condition mask bit is zero, the corresponding multiplication is
645/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
646/// the dot product will be stored in the return value component. Otherwise if
647/// the broadcast mask bit is zero then the return component will be zero.
648///
649/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd)
650#[inline]
651#[target_feature(enable = "sse4.1")]
652#[cfg_attr(test, assert_instr(dppd, IMM8 = 0))]
653#[rustc_legacy_const_generics(2)]
654#[stable(feature = "simd_x86", since = "1.27.0")]
655pub fn _mm_dp_pd<const IMM8: i32>(a: __m128d, b: __m128d) -> __m128d {
656    unsafe {
657        static_assert_uimm_bits!(IMM8, 8);
658        dppd(a, b, IMM8 as u8)
659    }
660}
661
/// Returns the dot product of two __m128 vectors.
///
/// `IMM8[3:0]` is the broadcast mask, and `IMM8[7:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
/// the dot product will be stored in the return value component. Otherwise if
/// the broadcast mask bit is zero then the return component will be zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dpps, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_dp_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM8, 8);
    // Delegates to the LLVM `dpps` intrinsic.
    unsafe { dpps(a, b, IMM8 as u8) }
}
680
/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_floor_pd(a: __m128d) -> __m128d {
    // Generic lanewise floor; lowers to `roundpd` with the floor immediate.
    unsafe { simd_floor(a) }
}
694
/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_floor_ps(a: __m128) -> __m128 {
    // Generic lanewise floor; lowers to `roundps` with the floor immediate.
    unsafe { simd_floor(a) }
}
708
/// Round the lower double-precision (64-bit) floating-point element in `b`
/// down to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d {
    // `roundsd` with the floor rounding control (round toward -inf,
    // precision exceptions not suppressed).
    unsafe { roundsd(a, b, _MM_FROUND_FLOOR) }
}
723
/// Round the lower single-precision (32-bit) floating-point element in `b`
/// down to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 {
    // Delegates to the LLVM `roundss` binding with a fixed round-down
    // immediate (`_MM_FROUND_FLOOR`).
    unsafe { roundss(a, b, _MM_FROUND_FLOOR) }
}
738
/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_ceil_pd(a: __m128d) -> __m128d {
    // Generic `simd_ceil` keeps this `const fn`-compatible; `assert_instr`
    // above checks it still compiles to `roundpd`.
    unsafe { simd_ceil(a) }
}
752
/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_ceil_ps(a: __m128) -> __m128 {
    // Generic `simd_ceil` keeps this `const fn`-compatible; `assert_instr`
    // above checks it still compiles to `roundps`.
    unsafe { simd_ceil(a) }
}
766
/// Round the lower double-precision (64-bit) floating-point element in `b`
/// up to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element
/// of the intrinsic result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d {
    // Delegates to the LLVM `roundsd` binding with a fixed round-up
    // immediate (`_MM_FROUND_CEIL` = raise exceptions | round toward +inf).
    unsafe { roundsd(a, b, _MM_FROUND_CEIL) }
}
781
/// Round the lower single-precision (32-bit) floating-point element in `b`
/// up to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 {
    // Delegates to the LLVM `roundss` binding with a fixed round-up
    // immediate (`_MM_FROUND_CEIL`).
    unsafe { roundss(a, b, _MM_FROUND_CEIL) }
}
796
/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// using the `ROUNDING` parameter, and stores the results as packed
/// double-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd, ROUNDING = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_round_pd<const ROUNDING: i32>(a: __m128d) -> __m128d {
    // The immediate is 4 bits wide: mode / MXCSR selection plus the
    // exception-suppression bit (see the `_MM_FROUND_*` constants).
    static_assert_uimm_bits!(ROUNDING, 4);
    unsafe { roundpd(a, ROUNDING) }
}
818
/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// using the `ROUNDING` parameter, and stores the results as packed
/// single-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps, ROUNDING = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_round_ps<const ROUNDING: i32>(a: __m128) -> __m128 {
    // The immediate is 4 bits wide: mode / MXCSR selection plus the
    // exception-suppression bit (see the `_MM_FROUND_*` constants).
    static_assert_uimm_bits!(ROUNDING, 4);
    unsafe { roundps(a, ROUNDING) }
}
840
/// Round the lower double-precision (64-bit) floating-point element in `b`
/// using the `ROUNDING` parameter, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd, ROUNDING = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d) -> __m128d {
    // The immediate is 4 bits wide: mode / MXCSR selection plus the
    // exception-suppression bit (see the `_MM_FROUND_*` constants).
    static_assert_uimm_bits!(ROUNDING, 4);
    unsafe { roundsd(a, b, ROUNDING) }
}
864
/// Round the lower single-precision (32-bit) floating-point element in `b`
/// using the `ROUNDING` parameter, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss, ROUNDING = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_round_ss<const ROUNDING: i32>(a: __m128, b: __m128) -> __m128 {
    // The immediate is 4 bits wide: mode / MXCSR selection plus the
    // exception-suppression bit (see the `_MM_FROUND_*` constants).
    static_assert_uimm_bits!(ROUNDING, 4);
    unsafe { roundss(a, b, ROUNDING) }
}
888
/// Finds the minimum unsigned 16-bit element in the 128-bit __m128i vector,
/// returning a vector containing its value in its first position, and its
/// index
/// in its second position; all other elements are set to zero.
///
/// This intrinsic corresponds to the `VPHMINPOSUW` / `PHMINPOSUW`
/// instruction.
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
///
/// Returns:
///
/// A 128-bit value where:
///
/// * bits `[15:0]` - contain the minimum value found in parameter `a`,
/// * bits `[18:16]` - contain the index of the minimum value
/// * remaining bits are set to `0`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(phminposuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_minpos_epu16(a: __m128i) -> __m128i {
    // Reinterpret as 8 unsigned 16-bit lanes for the LLVM binding, then
    // transmute the u16x8 result back to the opaque `__m128i` type.
    unsafe { transmute(phminposuw(a.as_u16x8())) }
}
917
/// Multiplies the low 32-bit integers from each packed 64-bit
/// element in `a` and `b`, and returns the signed 64-bit result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmuldq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        // Narrowing each i64 lane to i32 keeps only its low 32 bits; widening
        // back to i64 sign-extends. The product of two sign-extended 32-bit
        // values always fits in 64 bits, so `simd_mul` cannot overflow here,
        // and LLVM recognizes this pattern as `pmuldq`.
        let a = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(a.as_i64x2()));
        let b = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(b.as_i64x2()));
        transmute(simd_mul(a, b))
    }
}
934
/// Multiplies the packed 32-bit integers in `a` and `b`, producing intermediate
/// 64-bit integers, and returns the lowest 32-bit, whatever they might be,
/// reinterpreted as a signed integer. While `pmulld __m128i::splat(2),
/// __m128i::splat(2)` returns the obvious `__m128i::splat(4)`, due to wrapping
/// arithmetic `pmulld __m128i::splat(i32::MAX), __m128i::splat(2)` would
/// return a negative number.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmulld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i {
    // `simd_mul` on vectors is wrapping, matching `pmulld`'s low-32-bits
    // semantics described above.
    unsafe { transmute(simd_mul(a.as_i32x4(), b.as_i32x4())) }
}
951
/// Subtracts 8-bit unsigned integer values and computes the absolute
/// values of the differences to the corresponding bits in the destination.
/// Then sums of the absolute differences are returned according to the bit
/// fields in the immediate operand.
///
/// The following algorithm is performed:
///
/// ```ignore
/// i = IMM8[2] * 4
/// j = IMM8[1:0] * 4
/// for k := 0 to 7
///     d0 = abs(a[i + k + 0] - b[j + 0])
///     d1 = abs(a[i + k + 1] - b[j + 1])
///     d2 = abs(a[i + k + 2] - b[j + 2])
///     d3 = abs(a[i + k + 3] - b[j + 3])
///     r[k] = d0 + d1 + d2 + d3
/// ```
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
/// * `b` - A 128-bit vector of type `__m128i`.
/// * `IMM8` - An 8-bit immediate operand specifying how the absolute
///   differences are to be calculated
///     * Bit `[2]` specify the offset for operand `a`
///     * Bits `[1:0]` specify the offset for operand `b`
///
/// Returns:
///
/// * A `__m128i` vector containing the sums of the sets of   absolute
///   differences between both operands.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(mpsadbw, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mpsadbw_epu8<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
    // Only the low 3 immediate bits are meaningful (offsets described above),
    // hence the 3-bit assert rather than a full 8-bit one.
    static_assert_uimm_bits!(IMM8, 3);
    unsafe { transmute(mpsadbw(a.as_u8x16(), b.as_u8x16(), IMM8 as u8)) }
}
994
/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
    unsafe {
        // `a & mask` reduced with OR is zero exactly when every bit selected
        // by `mask` is clear in `a` (the ZF result of PTEST). Written with
        // generic simd ops so the function can be `const fn`; LLVM still
        // selects `ptest` (checked by `assert_instr`).
        let r = simd_reduce_or(simd_and(a.as_i64x2(), mask.as_i64x2()));
        (0i64 == r) as i32
    }
}
1021
/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 {
    unsafe {
        // `(!a) & mask` reduced with OR is zero exactly when every bit
        // selected by `mask` is set in `a` (the CF result of PTEST). The
        // XOR with all-ones implements bitwise NOT in const-evaluable form.
        let r = simd_reduce_or(simd_and(
            simd_xor(a.as_i64x2(), i64x2::splat(!0)),
            mask.as_i64x2(),
        ));
        (0i64 == r) as i32
    }
}
1051
/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 {
    // Unlike `_mm_testz_si128`/`_mm_testc_si128`, this delegates directly to
    // the LLVM `ptestnzc` binding (and is accordingly not a `const fn`).
    unsafe { ptestnzc(a.as_i64x2(), mask.as_i64x2()) }
}
1074
/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 {
    // Intel defines this as an alias of `_mm_testz_si128`.
    _mm_testz_si128(a, mask)
}
1098
/// Tests whether the specified bits in `a` 128-bit integer vector are all
/// ones.
///
/// Argument:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
///
/// Returns:
///
/// * `1` - if the bits specified in the operand are all set to 1,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqd))]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_test_all_ones(a: __m128i) -> i32 {
    // `_mm_cmpeq_epi32(a, a)` yields an all-ones mask, so this checks every
    // bit of `a` — matching the two instructions asserted above
    // (`pcmpeqd` to build the mask, then `ptest`).
    _mm_testc_si128(a, _mm_cmpeq_epi32(a, a))
}
1121
/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_mix_ones_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
    // Intel defines this as an alias of `_mm_testnzc_si128`.
    _mm_testnzc_si128(a, mask)
}
1144
/// Load 128-bits of integer data from memory into dst. mem_addr must be aligned on a 16-byte
/// boundary or a general-protection exception may be generated. To minimize caching, the data
/// is flagged as non-temporal (unlikely to be used again soon)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(movntdqa))]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
pub unsafe fn _mm_stream_load_si128(mem_addr: *const __m128i) -> __m128i {
    // Implemented with inline asm (not an LLVM intrinsic) so the non-temporal
    // hint of `movntdqa` is preserved.
    let dst: __m128i;
    crate::arch::asm!(
        // NOTE(review): the `p` operand below does not appear literally in
        // this template; `vpl!` presumably expands it into a `[{p}]` memory
        // operand — confirm against the macro definition in this crate.
        vpl!("movntdqa {a}"),
        a = out(xmm_reg) dst,
        p = in(reg) mem_addr,
        // pure + readonly: the asm only reads `*mem_addr` and produces `dst`,
        // so the compiler may CSE/eliminate calls with the same input.
        options(pure, readonly, nostack, preserves_flags),
    );
    dst
}
1164
#[allow(improper_ctypes)]
unsafe extern "C" {
    // FFI declarations binding the LLVM intrinsics used by the functions
    // above. The `link_name` strings must match LLVM's intrinsic names
    // exactly; signatures use the lane-typed vectors LLVM expects.
    #[link_name = "llvm.x86.sse41.insertps"]
    fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128;
    #[link_name = "llvm.x86.sse41.packusdw"]
    fn packusdw(a: i32x4, b: i32x4) -> u16x8;
    #[link_name = "llvm.x86.sse41.dppd"]
    fn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d;
    #[link_name = "llvm.x86.sse41.dpps"]
    fn dpps(a: __m128, b: __m128, imm8: u8) -> __m128;
    #[link_name = "llvm.x86.sse41.round.pd"]
    fn roundpd(a: __m128d, rounding: i32) -> __m128d;
    #[link_name = "llvm.x86.sse41.round.ps"]
    fn roundps(a: __m128, rounding: i32) -> __m128;
    #[link_name = "llvm.x86.sse41.round.sd"]
    fn roundsd(a: __m128d, b: __m128d, rounding: i32) -> __m128d;
    #[link_name = "llvm.x86.sse41.round.ss"]
    fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128;
    #[link_name = "llvm.x86.sse41.phminposuw"]
    fn phminposuw(a: u16x8) -> u16x8;
    #[link_name = "llvm.x86.sse41.mpsadbw"]
    fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8;
    #[link_name = "llvm.x86.sse41.ptestnzc"]
    fn ptestnzc(a: i64x2, mask: i64x2) -> i32;
}
1190
1191#[cfg(test)]
1192mod tests {
1193    use crate::core_arch::assert_eq_const as assert_eq;
1194    use crate::core_arch::x86::*;
1195    use std::mem;
1196    use stdarch_test::simd_test;
1197
    #[simd_test(enable = "sse4.1")]
    const unsafe fn test_mm_blendv_epi8() {
        // Mask bytes of -1 (sign bit set) select the lane from `b`; 0 keeps `a`.
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        );
        #[rustfmt::skip]
        let mask = _mm_setr_epi8(
            0, -1, 0, -1, 0, -1, 0, -1,
            0, -1, 0, -1, 0, -1, 0, -1,
        );
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31,
        );
        assert_eq_m128i(_mm_blendv_epi8(a, b, mask), e);
    }
1220
    #[simd_test(enable = "sse4.1")]
    const unsafe fn test_mm_blendv_pd() {
        // Only lane 1 of the mask has its sign bit set, so only it takes `b`.
        let a = _mm_set1_pd(0.0);
        let b = _mm_set1_pd(1.0);
        let mask = transmute(_mm_setr_epi64x(0, -1));
        let r = _mm_blendv_pd(a, b, mask);
        let e = _mm_setr_pd(0.0, 1.0);
        assert_eq_m128d(r, e);
    }
1230
    #[simd_test(enable = "sse4.1")]
    const unsafe fn test_mm_blendv_ps() {
        // Lanes 1 and 3 of the mask have their sign bits set, so they take `b`.
        let a = _mm_set1_ps(0.0);
        let b = _mm_set1_ps(1.0);
        let mask = transmute(_mm_setr_epi32(0, -1, 0, -1));
        let r = _mm_blendv_ps(a, b, mask);
        let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
        assert_eq_m128(r, e);
    }
1240
    #[simd_test(enable = "sse4.1")]
    const unsafe fn test_mm_blend_pd() {
        // Immediate bit 1 set -> lane 1 comes from `b`.
        let a = _mm_set1_pd(0.0);
        let b = _mm_set1_pd(1.0);
        let r = _mm_blend_pd::<0b10>(a, b);
        let e = _mm_setr_pd(0.0, 1.0);
        assert_eq_m128d(r, e);
    }
1249
    #[simd_test(enable = "sse4.1")]
    const unsafe fn test_mm_blend_ps() {
        // Immediate 0b1010 -> lanes 1 and 3 come from `b`.
        let a = _mm_set1_ps(0.0);
        let b = _mm_set1_ps(1.0);
        let r = _mm_blend_ps::<0b1010>(a, b);
        let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
        assert_eq_m128(r, e);
    }
1258
    #[simd_test(enable = "sse4.1")]
    const unsafe fn test_mm_blend_epi16() {
        // Immediate 0b1010_1100 -> lanes 2, 3, 5 and 7 come from `b`.
        let a = _mm_set1_epi16(0);
        let b = _mm_set1_epi16(1);
        let r = _mm_blend_epi16::<0b1010_1100>(a, b);
        let e = _mm_setr_epi16(0, 0, 1, 1, 0, 1, 0, 1);
        assert_eq_m128i(r, e);
    }
1267
    #[simd_test(enable = "sse4.1")]
    const unsafe fn test_mm_extract_ps() {
        // `_mm_extract_ps` returns the raw bit pattern of the selected lane
        // as an i32, so round-trip it through `f32::from_bits` to compare.
        let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0);
        let r: f32 = f32::from_bits(_mm_extract_ps::<1>(a) as u32);
        assert_eq!(r, 1.0);
        let r: f32 = f32::from_bits(_mm_extract_ps::<3>(a) as u32);
        assert_eq!(r, 3.0);
    }
1276
    #[simd_test(enable = "sse4.1")]
    const unsafe fn test_mm_extract_epi8() {
        // Extraction zero-extends the byte: the -1 in lane 0 becomes 0xFF.
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            -1, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15
        );
        let r1 = _mm_extract_epi8::<0>(a);
        let r2 = _mm_extract_epi8::<3>(a);
        assert_eq!(r1, 0xFF);
        assert_eq!(r2, 3);
    }
1289
    #[simd_test(enable = "sse4.1")]
    const unsafe fn test_mm_extract_epi32() {
        // Lane index is the const generic; extraction returns the lane as i32.
        let a = _mm_setr_epi32(0, 1, 2, 3);
        let r = _mm_extract_epi32::<1>(a);
        assert_eq!(r, 1);
        let r = _mm_extract_epi32::<3>(a);
        assert_eq!(r, 3);
    }
1298
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_insert_ps() {
        // Immediate layout: bits [7:6] select the source lane of `b`,
        // bits [5:4] the destination lane in `a`, bits [3:0] a zero mask.
        let a = _mm_set1_ps(1.0);
        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let r = _mm_insert_ps::<0b11_00_1100>(a, b);
        let e = _mm_setr_ps(4.0, 1.0, 0.0, 0.0);
        assert_eq_m128(r, e);

        // Zeroing takes precedence over copied value
        let a = _mm_set1_ps(1.0);
        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let r = _mm_insert_ps::<0b11_00_0001>(a, b);
        let e = _mm_setr_ps(0.0, 1.0, 1.0, 1.0);
        assert_eq_m128(r, e);
    }
1314
    #[simd_test(enable = "sse4.1")]
    const unsafe fn test_mm_insert_epi8() {
        // Inserts the scalar into the byte lane given by the const generic.
        let a = _mm_set1_epi8(0);
        let e = _mm_setr_epi8(0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        let r = _mm_insert_epi8::<1>(a, 32);
        assert_eq_m128i(r, e);
        let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0);
        let r = _mm_insert_epi8::<14>(a, 32);
        assert_eq_m128i(r, e);
    }
1325
    #[simd_test(enable = "sse4.1")]
    const unsafe fn test_mm_insert_epi32() {
        // Inserts the scalar into the 32-bit lane given by the const generic.
        let a = _mm_set1_epi32(0);
        let e = _mm_setr_epi32(0, 32, 0, 0);
        let r = _mm_insert_epi32::<1>(a, 32);
        assert_eq_m128i(r, e);
        let e = _mm_setr_epi32(0, 0, 0, 32);
        let r = _mm_insert_epi32::<3>(a, 32);
        assert_eq_m128i(r, e);
    }
1336
    #[simd_test(enable = "sse4.1")]
    const unsafe fn test_mm_max_epi8() {
        // Element-wise signed 8-bit maximum; inputs interleave so each lane
        // exercises both "a wins" and "b wins" cases.
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 4, 5, 8, 9, 12, 13, 16,
            17, 20, 21, 24, 25, 28, 29, 32,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            2, 3, 6, 7, 10, 11, 14, 15,
            18, 19, 22, 23, 26, 27, 30, 31,
        );
        let r = _mm_max_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            2, 4, 6, 8, 10, 12, 14, 16,
            18, 20, 22, 24, 26, 28, 30, 32,
        );
        assert_eq_m128i(r, e);
    }
1357
    #[simd_test(enable = "sse4.1")]
    const unsafe fn test_mm_max_epu16() {
        // Element-wise unsigned 16-bit maximum.
        let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
        let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
        let r = _mm_max_epu16(a, b);
        let e = _mm_setr_epi16(2, 4, 6, 8, 10, 12, 14, 16);
        assert_eq_m128i(r, e);
    }
1366
    #[simd_test(enable = "sse4.1")]
    const unsafe fn test_mm_max_epi32() {
        // Element-wise signed 32-bit maximum.
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_max_epi32(a, b);
        let e = _mm_setr_epi32(2, 4, 6, 8);
        assert_eq_m128i(r, e);
    }
1375
    #[simd_test(enable = "sse4.1")]
    const unsafe fn test_mm_max_epu32() {
        // Element-wise unsigned 32-bit maximum.
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_max_epu32(a, b);
        let e = _mm_setr_epi32(2, 4, 6, 8);
        assert_eq_m128i(r, e);
    }
1384
    #[simd_test(enable = "sse4.1")]
    const unsafe fn test_mm_min_epi8() {
        // Element-wise signed 8-bit minimum: first with all-positive lanes...
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 4, 5, 8, 9, 12, 13, 16,
            17, 20, 21, 24, 25, 28, 29, 32,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            2, 3, 6, 7, 10, 11, 14, 15,
            18, 19, 22, 23, 26, 27, 30, 31,
        );
        let r = _mm_min_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            1, 3, 5, 7, 9, 11, 13, 15,
            17, 19, 21, 23, 25, 27, 29, 31,
        );
        assert_eq_m128i(r, e);

        // ...then with negative lanes, to confirm the comparison is signed.
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, -4, -5, 8, -9, -12, 13, -16,
            17, 20, 21, 24, 25, 28, 29, 32,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            2, -3, -6, 7, -10, -11, 14, -15,
            18, 19, 22, 23, 26, 27, 30, 31,
        );
        let r = _mm_min_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            1, -4, -6, 7, -10, -12, 13, -16,
            17, 19, 21, 23, 25, 27, 29, 31,
        );
        assert_eq_m128i(r, e);
    }
1423
    #[simd_test(enable = "sse4.1")]
    const unsafe fn test_mm_min_epu16() {
        // Element-wise unsigned 16-bit minimum.
        let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
        let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
        let r = _mm_min_epu16(a, b);
        let e = _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15);
        assert_eq_m128i(r, e);
    }
1432
    #[simd_test(enable = "sse4.1")]
    const unsafe fn test_mm_min_epi32() {
        // Element-wise signed 32-bit minimum, positive lanes first...
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_min_epi32(a, b);
        let e = _mm_setr_epi32(1, 3, 5, 7);
        assert_eq_m128i(r, e);

        // ...then mixed signs, to confirm the comparison is signed.
        let a = _mm_setr_epi32(-1, 4, 5, -7);
        let b = _mm_setr_epi32(-2, 3, -6, 8);
        let r = _mm_min_epi32(a, b);
        let e = _mm_setr_epi32(-2, 3, -6, -7);
        assert_eq_m128i(r, e);
    }
1447
    #[simd_test(enable = "sse4.1")]
    const unsafe fn test_mm_min_epu32() {
        // Element-wise unsigned 32-bit minimum.
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_min_epu32(a, b);
        let e = _mm_setr_epi32(1, 3, 5, 7);
        assert_eq_m128i(r, e);
    }
1456
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_packus_epi32() {
        // Packing to unsigned 16-bit saturates: negative inputs become 0.
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let b = _mm_setr_epi32(-1, -2, -3, -4);
        let r = _mm_packus_epi32(a, b);
        let e = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }
1465
    #[simd_test(enable = "sse4.1")]
    const unsafe fn test_mm_cmpeq_epi64() {
        // Equal 64-bit lanes become all-ones (-1); unequal lanes become 0.
        let a = _mm_setr_epi64x(0, 1);
        let b = _mm_setr_epi64x(0, 0);
        let r = _mm_cmpeq_epi64(a, b);
        let e = _mm_setr_epi64x(-1, 0);
        assert_eq_m128i(r, e);
    }
1474
    #[simd_test(enable = "sse4.1")]
    const unsafe fn test_mm_cvtepi8_epi16() {
        // Sign-extension i8 -> i16: both positive and negative values survive.
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi16(a);
        let e = _mm_set1_epi16(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi16(a);
        let e = _mm_set1_epi16(-10);
        assert_eq_m128i(r, e);
    }
1486
    #[simd_test(enable = "sse4.1")]
    const unsafe fn test_mm_cvtepi8_epi32() {
        // Sign-extension i8 -> i32: both positive and negative values survive.
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi32(a);
        let e = _mm_set1_epi32(-10);
        assert_eq_m128i(r, e);
    }
1498
1499    #[simd_test(enable = "sse4.1")]
1500    const unsafe fn test_mm_cvtepi8_epi64() {
1501        let a = _mm_set1_epi8(10);
1502        let r = _mm_cvtepi8_epi64(a);
1503        let e = _mm_set1_epi64x(10);
1504        assert_eq_m128i(r, e);
1505        let a = _mm_set1_epi8(-10);
1506        let r = _mm_cvtepi8_epi64(a);
1507        let e = _mm_set1_epi64x(-10);
1508        assert_eq_m128i(r, e);
1509    }
1510
1511    #[simd_test(enable = "sse4.1")]
1512    const unsafe fn test_mm_cvtepi16_epi32() {
1513        let a = _mm_set1_epi16(10);
1514        let r = _mm_cvtepi16_epi32(a);
1515        let e = _mm_set1_epi32(10);
1516        assert_eq_m128i(r, e);
1517        let a = _mm_set1_epi16(-10);
1518        let r = _mm_cvtepi16_epi32(a);
1519        let e = _mm_set1_epi32(-10);
1520        assert_eq_m128i(r, e);
1521    }
1522
1523    #[simd_test(enable = "sse4.1")]
1524    const unsafe fn test_mm_cvtepi16_epi64() {
1525        let a = _mm_set1_epi16(10);
1526        let r = _mm_cvtepi16_epi64(a);
1527        let e = _mm_set1_epi64x(10);
1528        assert_eq_m128i(r, e);
1529        let a = _mm_set1_epi16(-10);
1530        let r = _mm_cvtepi16_epi64(a);
1531        let e = _mm_set1_epi64x(-10);
1532        assert_eq_m128i(r, e);
1533    }
1534
1535    #[simd_test(enable = "sse4.1")]
1536    const unsafe fn test_mm_cvtepi32_epi64() {
1537        let a = _mm_set1_epi32(10);
1538        let r = _mm_cvtepi32_epi64(a);
1539        let e = _mm_set1_epi64x(10);
1540        assert_eq_m128i(r, e);
1541        let a = _mm_set1_epi32(-10);
1542        let r = _mm_cvtepi32_epi64(a);
1543        let e = _mm_set1_epi64x(-10);
1544        assert_eq_m128i(r, e);
1545    }
1546
1547    #[simd_test(enable = "sse4.1")]
1548    const unsafe fn test_mm_cvtepu8_epi16() {
1549        let a = _mm_set1_epi8(10);
1550        let r = _mm_cvtepu8_epi16(a);
1551        let e = _mm_set1_epi16(10);
1552        assert_eq_m128i(r, e);
1553    }
1554
1555    #[simd_test(enable = "sse4.1")]
1556    const unsafe fn test_mm_cvtepu8_epi32() {
1557        let a = _mm_set1_epi8(10);
1558        let r = _mm_cvtepu8_epi32(a);
1559        let e = _mm_set1_epi32(10);
1560        assert_eq_m128i(r, e);
1561    }
1562
1563    #[simd_test(enable = "sse4.1")]
1564    const unsafe fn test_mm_cvtepu8_epi64() {
1565        let a = _mm_set1_epi8(10);
1566        let r = _mm_cvtepu8_epi64(a);
1567        let e = _mm_set1_epi64x(10);
1568        assert_eq_m128i(r, e);
1569    }
1570
1571    #[simd_test(enable = "sse4.1")]
1572    const unsafe fn test_mm_cvtepu16_epi32() {
1573        let a = _mm_set1_epi16(10);
1574        let r = _mm_cvtepu16_epi32(a);
1575        let e = _mm_set1_epi32(10);
1576        assert_eq_m128i(r, e);
1577    }
1578
1579    #[simd_test(enable = "sse4.1")]
1580    const unsafe fn test_mm_cvtepu16_epi64() {
1581        let a = _mm_set1_epi16(10);
1582        let r = _mm_cvtepu16_epi64(a);
1583        let e = _mm_set1_epi64x(10);
1584        assert_eq_m128i(r, e);
1585    }
1586
1587    #[simd_test(enable = "sse4.1")]
1588    const unsafe fn test_mm_cvtepu32_epi64() {
1589        let a = _mm_set1_epi32(10);
1590        let r = _mm_cvtepu32_epi64(a);
1591        let e = _mm_set1_epi64x(10);
1592        assert_eq_m128i(r, e);
1593    }
1594
1595    #[simd_test(enable = "sse4.1")]
1596    unsafe fn test_mm_dp_pd() {
1597        let a = _mm_setr_pd(2.0, 3.0);
1598        let b = _mm_setr_pd(1.0, 4.0);
1599        let e = _mm_setr_pd(14.0, 0.0);
1600        assert_eq_m128d(_mm_dp_pd::<0b00110001>(a, b), e);
1601    }
1602
1603    #[simd_test(enable = "sse4.1")]
1604    unsafe fn test_mm_dp_ps() {
1605        let a = _mm_setr_ps(2.0, 3.0, 1.0, 10.0);
1606        let b = _mm_setr_ps(1.0, 4.0, 0.5, 10.0);
1607        let e = _mm_setr_ps(14.5, 0.0, 14.5, 0.0);
1608        assert_eq_m128(_mm_dp_ps::<0b01110101>(a, b), e);
1609    }
1610
1611    #[simd_test(enable = "sse4.1")]
1612    const unsafe fn test_mm_floor_pd() {
1613        let a = _mm_setr_pd(2.5, 4.5);
1614        let r = _mm_floor_pd(a);
1615        let e = _mm_setr_pd(2.0, 4.0);
1616        assert_eq_m128d(r, e);
1617    }
1618
1619    #[simd_test(enable = "sse4.1")]
1620    const unsafe fn test_mm_floor_ps() {
1621        let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
1622        let r = _mm_floor_ps(a);
1623        let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
1624        assert_eq_m128(r, e);
1625    }
1626
1627    #[simd_test(enable = "sse4.1")]
1628    unsafe fn test_mm_floor_sd() {
1629        let a = _mm_setr_pd(2.5, 4.5);
1630        let b = _mm_setr_pd(-1.5, -3.5);
1631        let r = _mm_floor_sd(a, b);
1632        let e = _mm_setr_pd(-2.0, 4.5);
1633        assert_eq_m128d(r, e);
1634    }
1635
1636    #[simd_test(enable = "sse4.1")]
1637    unsafe fn test_mm_floor_ss() {
1638        let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
1639        let b = _mm_setr_ps(-1.5, -3.5, -7.5, -15.5);
1640        let r = _mm_floor_ss(a, b);
1641        let e = _mm_setr_ps(-2.0, 4.5, 8.5, 16.5);
1642        assert_eq_m128(r, e);
1643    }
1644
1645    #[simd_test(enable = "sse4.1")]
1646    const unsafe fn test_mm_ceil_pd() {
1647        let a = _mm_setr_pd(1.5, 3.5);
1648        let r = _mm_ceil_pd(a);
1649        let e = _mm_setr_pd(2.0, 4.0);
1650        assert_eq_m128d(r, e);
1651    }
1652
1653    #[simd_test(enable = "sse4.1")]
1654    const unsafe fn test_mm_ceil_ps() {
1655        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
1656        let r = _mm_ceil_ps(a);
1657        let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
1658        assert_eq_m128(r, e);
1659    }
1660
1661    #[simd_test(enable = "sse4.1")]
1662    unsafe fn test_mm_ceil_sd() {
1663        let a = _mm_setr_pd(1.5, 3.5);
1664        let b = _mm_setr_pd(-2.5, -4.5);
1665        let r = _mm_ceil_sd(a, b);
1666        let e = _mm_setr_pd(-2.0, 3.5);
1667        assert_eq_m128d(r, e);
1668    }
1669
1670    #[simd_test(enable = "sse4.1")]
1671    unsafe fn test_mm_ceil_ss() {
1672        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
1673        let b = _mm_setr_ps(-2.5, -4.5, -8.5, -16.5);
1674        let r = _mm_ceil_ss(a, b);
1675        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
1676        assert_eq_m128(r, e);
1677    }
1678
1679    #[simd_test(enable = "sse4.1")]
1680    unsafe fn test_mm_round_pd() {
1681        let a = _mm_setr_pd(1.25, 3.75);
1682        let r = _mm_round_pd::<_MM_FROUND_TO_NEAREST_INT>(a);
1683        let e = _mm_setr_pd(1.0, 4.0);
1684        assert_eq_m128d(r, e);
1685    }
1686
1687    #[simd_test(enable = "sse4.1")]
1688    unsafe fn test_mm_round_ps() {
1689        let a = _mm_setr_ps(2.25, 4.75, -1.75, -4.25);
1690        let r = _mm_round_ps::<_MM_FROUND_TO_ZERO>(a);
1691        let e = _mm_setr_ps(2.0, 4.0, -1.0, -4.0);
1692        assert_eq_m128(r, e);
1693    }
1694
1695    #[simd_test(enable = "sse4.1")]
1696    unsafe fn test_mm_round_sd() {
1697        let a = _mm_setr_pd(1.5, 3.5);
1698        let b = _mm_setr_pd(-2.5, -4.5);
1699        let r = _mm_round_sd::<_MM_FROUND_TO_NEAREST_INT>(a, b);
1700        let e = _mm_setr_pd(-2.0, 3.5);
1701        assert_eq_m128d(r, e);
1702
1703        let a = _mm_setr_pd(1.5, 3.5);
1704        let b = _mm_setr_pd(-2.5, -4.5);
1705        let r = _mm_round_sd::<_MM_FROUND_TO_NEG_INF>(a, b);
1706        let e = _mm_setr_pd(-3.0, 3.5);
1707        assert_eq_m128d(r, e);
1708
1709        let a = _mm_setr_pd(1.5, 3.5);
1710        let b = _mm_setr_pd(-2.5, -4.5);
1711        let r = _mm_round_sd::<_MM_FROUND_TO_POS_INF>(a, b);
1712        let e = _mm_setr_pd(-2.0, 3.5);
1713        assert_eq_m128d(r, e);
1714
1715        let a = _mm_setr_pd(1.5, 3.5);
1716        let b = _mm_setr_pd(-2.5, -4.5);
1717        let r = _mm_round_sd::<_MM_FROUND_TO_ZERO>(a, b);
1718        let e = _mm_setr_pd(-2.0, 3.5);
1719        assert_eq_m128d(r, e);
1720    }
1721
1722    #[simd_test(enable = "sse4.1")]
1723    unsafe fn test_mm_round_ss() {
1724        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
1725        let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
1726        let r = _mm_round_ss::<_MM_FROUND_TO_NEAREST_INT>(a, b);
1727        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
1728        assert_eq_m128(r, e);
1729
1730        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
1731        let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
1732        let r = _mm_round_ss::<_MM_FROUND_TO_NEG_INF>(a, b);
1733        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
1734        assert_eq_m128(r, e);
1735
1736        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
1737        let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
1738        let r = _mm_round_ss::<_MM_FROUND_TO_POS_INF>(a, b);
1739        let e = _mm_setr_ps(-1.0, 3.5, 7.5, 15.5);
1740        assert_eq_m128(r, e);
1741
1742        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
1743        let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
1744        let r = _mm_round_ss::<_MM_FROUND_TO_ZERO>(a, b);
1745        let e = _mm_setr_ps(-1.0, 3.5, 7.5, 15.5);
1746        assert_eq_m128(r, e);
1747    }
1748
1749    #[simd_test(enable = "sse4.1")]
1750    unsafe fn test_mm_minpos_epu16_1() {
1751        let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66);
1752        let r = _mm_minpos_epu16(a);
1753        let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0);
1754        assert_eq_m128i(r, e);
1755    }
1756
1757    #[simd_test(enable = "sse4.1")]
1758    unsafe fn test_mm_minpos_epu16_2() {
1759        let a = _mm_setr_epi16(0, 18, 44, 97, 50, 13, 67, 66);
1760        let r = _mm_minpos_epu16(a);
1761        let e = _mm_setr_epi16(0, 0, 0, 0, 0, 0, 0, 0);
1762        assert_eq_m128i(r, e);
1763    }
1764
1765    #[simd_test(enable = "sse4.1")]
1766    unsafe fn test_mm_minpos_epu16_3() {
1767        // Case where the minimum value is repeated
1768        let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 13);
1769        let r = _mm_minpos_epu16(a);
1770        let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0);
1771        assert_eq_m128i(r, e);
1772    }
1773
1774    #[simd_test(enable = "sse4.1")]
1775    const unsafe fn test_mm_mul_epi32() {
1776        {
1777            let a = _mm_setr_epi32(1, 1, 1, 1);
1778            let b = _mm_setr_epi32(1, 2, 3, 4);
1779            let r = _mm_mul_epi32(a, b);
1780            let e = _mm_setr_epi64x(1, 3);
1781            assert_eq_m128i(r, e);
1782        }
1783        {
1784            let a = _mm_setr_epi32(15, 2 /* ignored */, 1234567, 4 /* ignored */);
1785            let b = _mm_setr_epi32(
1786                -20, -256, /* ignored */
1787                666666, 666666, /* ignored */
1788            );
1789            let r = _mm_mul_epi32(a, b);
1790            let e = _mm_setr_epi64x(-300, 823043843622);
1791            assert_eq_m128i(r, e);
1792        }
1793    }
1794
1795    #[simd_test(enable = "sse4.1")]
1796    const unsafe fn test_mm_mullo_epi32() {
1797        {
1798            let a = _mm_setr_epi32(1, 1, 1, 1);
1799            let b = _mm_setr_epi32(1, 2, 3, 4);
1800            let r = _mm_mullo_epi32(a, b);
1801            let e = _mm_setr_epi32(1, 2, 3, 4);
1802            assert_eq_m128i(r, e);
1803        }
1804        {
1805            let a = _mm_setr_epi32(15, -2, 1234567, 99999);
1806            let b = _mm_setr_epi32(-20, -256, 666666, -99999);
1807            let r = _mm_mullo_epi32(a, b);
1808            // Attention, most significant bit in r[2] is treated
1809            // as a sign bit:
1810            // 1234567 * 666666 = -1589877210
1811            let e = _mm_setr_epi32(-300, 512, -1589877210, -1409865409);
1812            assert_eq_m128i(r, e);
1813        }
1814    }
1815
1816    #[simd_test(enable = "sse4.1")]
1817    unsafe fn test_mm_minpos_epu16() {
1818        let a = _mm_setr_epi16(8, 7, 6, 5, 4, 1, 2, 3);
1819        let r = _mm_minpos_epu16(a);
1820        let e = _mm_setr_epi16(1, 5, 0, 0, 0, 0, 0, 0);
1821        assert_eq_m128i(r, e);
1822    }
1823
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mpsadbw_epu8() {
        // Both operands are the bytes 0..=15 in order, so each SAD result
        // below can be checked by hand.
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );

        // imm = 0b000: compare a's 8 sliding 4-byte windows (offsets 0..7)
        // against b's bytes 0..=3. At offset k every |a[i]-b[i]| is k,
        // giving SADs of 4*k: 0, 4, 8, ...
        let r = _mm_mpsadbw_epu8::<0b000>(a, a);
        let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
        assert_eq_m128i(r, e);

        // imm bits 1:0 = 01: b's block is bytes 4..=7, so the distances
        // start at |0-4|*4 = 16 and reach 0 when the windows align.
        let r = _mm_mpsadbw_epu8::<0b001>(a, a);
        let e = _mm_setr_epi16(16, 12, 8, 4, 0, 4, 8, 12);
        assert_eq_m128i(r, e);

        // imm bit 2 = 1: a's windows start at byte 4 instead of byte 0,
        // shifting every SAD up by 16 relative to the 0b000 case.
        let r = _mm_mpsadbw_epu8::<0b100>(a, a);
        let e = _mm_setr_epi16(16, 20, 24, 28, 32, 36, 40, 44);
        assert_eq_m128i(r, e);

        // imm = 0b101: both offsets shifted by 4, so the windows align
        // again and the results match the 0b000 case.
        let r = _mm_mpsadbw_epu8::<0b101>(a, a);
        let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
        assert_eq_m128i(r, e);

        // imm = 0b111: a offset 4 vs b's bytes 12..=15; distances start at
        // |4-12|*4 = 32 and shrink as the windows approach alignment.
        let r = _mm_mpsadbw_epu8::<0b111>(a, a);
        let e = _mm_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4);
        assert_eq_m128i(r, e);
    }
1852
1853    #[simd_test(enable = "sse4.1")]
1854    const unsafe fn test_mm_testz_si128() {
1855        let a = _mm_set1_epi8(1);
1856        let mask = _mm_set1_epi8(0);
1857        let r = _mm_testz_si128(a, mask);
1858        assert_eq!(r, 1);
1859        let a = _mm_set1_epi8(0b101);
1860        let mask = _mm_set1_epi8(0b110);
1861        let r = _mm_testz_si128(a, mask);
1862        assert_eq!(r, 0);
1863        let a = _mm_set1_epi8(0b011);
1864        let mask = _mm_set1_epi8(0b100);
1865        let r = _mm_testz_si128(a, mask);
1866        assert_eq!(r, 1);
1867    }
1868
1869    #[simd_test(enable = "sse4.1")]
1870    const unsafe fn test_mm_testc_si128() {
1871        let a = _mm_set1_epi8(-1);
1872        let mask = _mm_set1_epi8(0);
1873        let r = _mm_testc_si128(a, mask);
1874        assert_eq!(r, 1);
1875        let a = _mm_set1_epi8(0b101);
1876        let mask = _mm_set1_epi8(0b110);
1877        let r = _mm_testc_si128(a, mask);
1878        assert_eq!(r, 0);
1879        let a = _mm_set1_epi8(0b101);
1880        let mask = _mm_set1_epi8(0b100);
1881        let r = _mm_testc_si128(a, mask);
1882        assert_eq!(r, 1);
1883    }
1884
1885    #[simd_test(enable = "sse4.1")]
1886    unsafe fn test_mm_testnzc_si128() {
1887        let a = _mm_set1_epi8(0);
1888        let mask = _mm_set1_epi8(1);
1889        let r = _mm_testnzc_si128(a, mask);
1890        assert_eq!(r, 0);
1891        let a = _mm_set1_epi8(-1);
1892        let mask = _mm_set1_epi8(0);
1893        let r = _mm_testnzc_si128(a, mask);
1894        assert_eq!(r, 0);
1895        let a = _mm_set1_epi8(0b101);
1896        let mask = _mm_set1_epi8(0b110);
1897        let r = _mm_testnzc_si128(a, mask);
1898        assert_eq!(r, 1);
1899        let a = _mm_set1_epi8(0b101);
1900        let mask = _mm_set1_epi8(0b101);
1901        let r = _mm_testnzc_si128(a, mask);
1902        assert_eq!(r, 0);
1903    }
1904
1905    #[simd_test(enable = "sse4.1")]
1906    const unsafe fn test_mm_test_all_zeros() {
1907        let a = _mm_set1_epi8(1);
1908        let mask = _mm_set1_epi8(0);
1909        let r = _mm_test_all_zeros(a, mask);
1910        assert_eq!(r, 1);
1911        let a = _mm_set1_epi8(0b101);
1912        let mask = _mm_set1_epi8(0b110);
1913        let r = _mm_test_all_zeros(a, mask);
1914        assert_eq!(r, 0);
1915        let a = _mm_set1_epi8(0b011);
1916        let mask = _mm_set1_epi8(0b100);
1917        let r = _mm_test_all_zeros(a, mask);
1918        assert_eq!(r, 1);
1919    }
1920
1921    #[simd_test(enable = "sse4.1")]
1922    const unsafe fn test_mm_test_all_ones() {
1923        let a = _mm_set1_epi8(-1);
1924        let r = _mm_test_all_ones(a);
1925        assert_eq!(r, 1);
1926        let a = _mm_set1_epi8(0b101);
1927        let r = _mm_test_all_ones(a);
1928        assert_eq!(r, 0);
1929    }
1930
1931    #[simd_test(enable = "sse4.1")]
1932    unsafe fn test_mm_test_mix_ones_zeros() {
1933        let a = _mm_set1_epi8(0);
1934        let mask = _mm_set1_epi8(1);
1935        let r = _mm_test_mix_ones_zeros(a, mask);
1936        assert_eq!(r, 0);
1937        let a = _mm_set1_epi8(-1);
1938        let mask = _mm_set1_epi8(0);
1939        let r = _mm_test_mix_ones_zeros(a, mask);
1940        assert_eq!(r, 0);
1941        let a = _mm_set1_epi8(0b101);
1942        let mask = _mm_set1_epi8(0b110);
1943        let r = _mm_test_mix_ones_zeros(a, mask);
1944        assert_eq!(r, 1);
1945        let a = _mm_set1_epi8(0b101);
1946        let mask = _mm_set1_epi8(0b101);
1947        let r = _mm_test_mix_ones_zeros(a, mask);
1948        assert_eq!(r, 0);
1949    }
1950
1951    #[simd_test(enable = "sse4.1")]
1952    unsafe fn test_mm_stream_load_si128() {
1953        let a = _mm_set_epi64x(5, 6);
1954        let r = _mm_stream_load_si128(core::ptr::addr_of!(a) as *const _);
1955        assert_eq_m128i(a, r);
1956    }
1957}