// core/stdarch/crates/core_arch/src/x86/sse2.rs

//! Streaming SIMD Extensions 2 (SSE2)

#[cfg(test)]
use stdarch_test::assert_instr;

use crate::{
    core_arch::{simd::*, x86::*},
    intrinsics::simd::*,
    intrinsics::sqrtf64,
    mem, ptr,
};
12
13/// Provides a hint to the processor that the code sequence is a spin-wait loop.
14///
15/// This can help improve the performance and power consumption of spin-wait
16/// loops.
17///
18/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause)
19#[inline]
20#[cfg_attr(all(test, target_feature = "sse2"), assert_instr(pause))]
21#[stable(feature = "simd_x86", since = "1.27.0")]
22pub fn _mm_pause() {
23    // note: `pause` is guaranteed to be interpreted as a `nop` by CPUs without
24    // the SSE2 target-feature - therefore it does not require any target features
25    unsafe { pause() }
26}
27
28/// Invalidates and flushes the cache line that contains `p` from all levels of
29/// the cache hierarchy.
30///
31/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush)
32#[inline]
33#[target_feature(enable = "sse2")]
34#[cfg_attr(test, assert_instr(clflush))]
35#[stable(feature = "simd_x86", since = "1.27.0")]
36pub unsafe fn _mm_clflush(p: *const u8) {
37    clflush(p)
38}
39
40/// Performs a serializing operation on all load-from-memory instructions
41/// that were issued prior to this instruction.
42///
43/// Guarantees that every load instruction that precedes, in program order, is
44/// globally visible before any load instruction which follows the fence in
45/// program order.
46///
47/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence)
48#[inline]
49#[target_feature(enable = "sse2")]
50#[cfg_attr(test, assert_instr(lfence))]
51#[stable(feature = "simd_x86", since = "1.27.0")]
52pub fn _mm_lfence() {
53    unsafe { lfence() }
54}
55
56/// Performs a serializing operation on all load-from-memory and store-to-memory
57/// instructions that were issued prior to this instruction.
58///
59/// Guarantees that every memory access that precedes, in program order, the
60/// memory fence instruction is globally visible before any memory instruction
61/// which follows the fence in program order.
62///
63/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence)
64#[inline]
65#[target_feature(enable = "sse2")]
66#[cfg_attr(test, assert_instr(mfence))]
67#[stable(feature = "simd_x86", since = "1.27.0")]
68pub fn _mm_mfence() {
69    unsafe { mfence() }
70}
71
72/// Adds packed 8-bit integers in `a` and `b`.
73///
74/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8)
75#[inline]
76#[target_feature(enable = "sse2")]
77#[cfg_attr(test, assert_instr(paddb))]
78#[stable(feature = "simd_x86", since = "1.27.0")]
79#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
80pub const fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i {
81    unsafe { transmute(simd_add(a.as_i8x16(), b.as_i8x16())) }
82}
83
84/// Adds packed 16-bit integers in `a` and `b`.
85///
86/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16)
87#[inline]
88#[target_feature(enable = "sse2")]
89#[cfg_attr(test, assert_instr(paddw))]
90#[stable(feature = "simd_x86", since = "1.27.0")]
91#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
92pub const fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i {
93    unsafe { transmute(simd_add(a.as_i16x8(), b.as_i16x8())) }
94}
95
96/// Adds packed 32-bit integers in `a` and `b`.
97///
98/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32)
99#[inline]
100#[target_feature(enable = "sse2")]
101#[cfg_attr(test, assert_instr(paddd))]
102#[stable(feature = "simd_x86", since = "1.27.0")]
103#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
104pub const fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i {
105    unsafe { transmute(simd_add(a.as_i32x4(), b.as_i32x4())) }
106}
107
108/// Adds packed 64-bit integers in `a` and `b`.
109///
110/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64)
111#[inline]
112#[target_feature(enable = "sse2")]
113#[cfg_attr(test, assert_instr(paddq))]
114#[stable(feature = "simd_x86", since = "1.27.0")]
115#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
116pub const fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i {
117    unsafe { transmute(simd_add(a.as_i64x2(), b.as_i64x2())) }
118}
119
120/// Adds packed 8-bit integers in `a` and `b` using saturation.
121///
122/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8)
123#[inline]
124#[target_feature(enable = "sse2")]
125#[cfg_attr(test, assert_instr(paddsb))]
126#[stable(feature = "simd_x86", since = "1.27.0")]
127#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
128pub const fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i {
129    unsafe { transmute(simd_saturating_add(a.as_i8x16(), b.as_i8x16())) }
130}
131
132/// Adds packed 16-bit integers in `a` and `b` using saturation.
133///
134/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16)
135#[inline]
136#[target_feature(enable = "sse2")]
137#[cfg_attr(test, assert_instr(paddsw))]
138#[stable(feature = "simd_x86", since = "1.27.0")]
139#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
140pub const fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i {
141    unsafe { transmute(simd_saturating_add(a.as_i16x8(), b.as_i16x8())) }
142}
143
144/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
145///
146/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8)
147#[inline]
148#[target_feature(enable = "sse2")]
149#[cfg_attr(test, assert_instr(paddusb))]
150#[stable(feature = "simd_x86", since = "1.27.0")]
151#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
152pub const fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i {
153    unsafe { transmute(simd_saturating_add(a.as_u8x16(), b.as_u8x16())) }
154}
155
156/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
157///
158/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16)
159#[inline]
160#[target_feature(enable = "sse2")]
161#[cfg_attr(test, assert_instr(paddusw))]
162#[stable(feature = "simd_x86", since = "1.27.0")]
163#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
164pub const fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i {
165    unsafe { transmute(simd_saturating_add(a.as_u16x8(), b.as_u16x8())) }
166}
167
168/// Averages packed unsigned 8-bit integers in `a` and `b`.
169///
170/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8)
171#[inline]
172#[target_feature(enable = "sse2")]
173#[cfg_attr(test, assert_instr(pavgb))]
174#[stable(feature = "simd_x86", since = "1.27.0")]
175#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
176pub const fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i {
177    unsafe {
178        let a = simd_cast::<_, u16x16>(a.as_u8x16());
179        let b = simd_cast::<_, u16x16>(b.as_u8x16());
180        let r = simd_shr(simd_add(simd_add(a, b), u16x16::splat(1)), u16x16::splat(1));
181        transmute(simd_cast::<_, u8x16>(r))
182    }
183}
184
185/// Averages packed unsigned 16-bit integers in `a` and `b`.
186///
187/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16)
188#[inline]
189#[target_feature(enable = "sse2")]
190#[cfg_attr(test, assert_instr(pavgw))]
191#[stable(feature = "simd_x86", since = "1.27.0")]
192#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
193pub const fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i {
194    unsafe {
195        let a = simd_cast::<_, u32x8>(a.as_u16x8());
196        let b = simd_cast::<_, u32x8>(b.as_u16x8());
197        let r = simd_shr(simd_add(simd_add(a, b), u32x8::splat(1)), u32x8::splat(1));
198        transmute(simd_cast::<_, u16x8>(r))
199    }
200}
201
202/// Multiplies and then horizontally add signed 16 bit integers in `a` and `b`.
203///
204/// Multiplies packed signed 16-bit integers in `a` and `b`, producing
205/// intermediate signed 32-bit integers. Horizontally add adjacent pairs of
206/// intermediate 32-bit integers.
207///
208/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16)
209#[inline]
210#[target_feature(enable = "sse2")]
211#[cfg_attr(test, assert_instr(pmaddwd))]
212#[stable(feature = "simd_x86", since = "1.27.0")]
213pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
214    // It's a trick used in the Adler-32 algorithm to perform a widening addition.
215    //
216    // ```rust
217    // #[target_feature(enable = "sse2")]
218    // unsafe fn widening_add(mad: __m128i) -> __m128i {
219    //     _mm_madd_epi16(mad, _mm_set1_epi16(1))
220    // }
221    // ```
222    //
223    // If we implement this using generic vector intrinsics, the optimizer
224    // will eliminate this pattern, and `pmaddwd` will no longer be emitted.
225    // For this reason, we use x86 intrinsics.
226    unsafe { transmute(pmaddwd(a.as_i16x8(), b.as_i16x8())) }
227}
228
229/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
230/// maximum values.
231///
232/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16)
233#[inline]
234#[target_feature(enable = "sse2")]
235#[cfg_attr(test, assert_instr(pmaxsw))]
236#[stable(feature = "simd_x86", since = "1.27.0")]
237#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
238pub const fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i {
239    unsafe { simd_imax(a.as_i16x8(), b.as_i16x8()).as_m128i() }
240}
241
242/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the
243/// packed maximum values.
244///
245/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8)
246#[inline]
247#[target_feature(enable = "sse2")]
248#[cfg_attr(test, assert_instr(pmaxub))]
249#[stable(feature = "simd_x86", since = "1.27.0")]
250#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
251pub const fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i {
252    unsafe { simd_imax(a.as_u8x16(), b.as_u8x16()).as_m128i() }
253}
254
255/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
256/// minimum values.
257///
258/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16)
259#[inline]
260#[target_feature(enable = "sse2")]
261#[cfg_attr(test, assert_instr(pminsw))]
262#[stable(feature = "simd_x86", since = "1.27.0")]
263#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
264pub const fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i {
265    unsafe { simd_imin(a.as_i16x8(), b.as_i16x8()).as_m128i() }
266}
267
268/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the
269/// packed minimum values.
270///
271/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8)
272#[inline]
273#[target_feature(enable = "sse2")]
274#[cfg_attr(test, assert_instr(pminub))]
275#[stable(feature = "simd_x86", since = "1.27.0")]
276#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
277pub const fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i {
278    unsafe { simd_imin(a.as_u8x16(), b.as_u8x16()).as_m128i() }
279}
280
281/// Multiplies the packed 16-bit integers in `a` and `b`.
282///
283/// The multiplication produces intermediate 32-bit integers, and returns the
284/// high 16 bits of the intermediate integers.
285///
286/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16)
287#[inline]
288#[target_feature(enable = "sse2")]
289#[cfg_attr(test, assert_instr(pmulhw))]
290#[stable(feature = "simd_x86", since = "1.27.0")]
291#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
292pub const fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i {
293    unsafe {
294        let a = simd_cast::<_, i32x8>(a.as_i16x8());
295        let b = simd_cast::<_, i32x8>(b.as_i16x8());
296        let r = simd_shr(simd_mul(a, b), i32x8::splat(16));
297        transmute(simd_cast::<i32x8, i16x8>(r))
298    }
299}
300
301/// Multiplies the packed unsigned 16-bit integers in `a` and `b`.
302///
303/// The multiplication produces intermediate 32-bit integers, and returns the
304/// high 16 bits of the intermediate integers.
305///
306/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16)
307#[inline]
308#[target_feature(enable = "sse2")]
309#[cfg_attr(test, assert_instr(pmulhuw))]
310#[stable(feature = "simd_x86", since = "1.27.0")]
311#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
312pub const fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i {
313    unsafe {
314        let a = simd_cast::<_, u32x8>(a.as_u16x8());
315        let b = simd_cast::<_, u32x8>(b.as_u16x8());
316        let r = simd_shr(simd_mul(a, b), u32x8::splat(16));
317        transmute(simd_cast::<u32x8, u16x8>(r))
318    }
319}
320
321/// Multiplies the packed 16-bit integers in `a` and `b`.
322///
323/// The multiplication produces intermediate 32-bit integers, and returns the
324/// low 16 bits of the intermediate integers.
325///
326/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16)
327#[inline]
328#[target_feature(enable = "sse2")]
329#[cfg_attr(test, assert_instr(pmullw))]
330#[stable(feature = "simd_x86", since = "1.27.0")]
331#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
332pub const fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i {
333    unsafe { transmute(simd_mul(a.as_i16x8(), b.as_i16x8())) }
334}
335
336/// Multiplies the low unsigned 32-bit integers from each packed 64-bit element
337/// in `a` and `b`.
338///
339/// Returns the unsigned 64-bit results.
340///
341/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32)
342#[inline]
343#[target_feature(enable = "sse2")]
344#[cfg_attr(test, assert_instr(pmuludq))]
345#[stable(feature = "simd_x86", since = "1.27.0")]
346#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
347pub const fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i {
348    unsafe {
349        let a = a.as_u64x2();
350        let b = b.as_u64x2();
351        let mask = u64x2::splat(u32::MAX as u64);
352        transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
353    }
354}
355
356/// Sum the absolute differences of packed unsigned 8-bit integers.
357///
358/// Computes the absolute differences of packed unsigned 8-bit integers in `a`
359/// and `b`, then horizontally sum each consecutive 8 differences to produce
360/// two unsigned 16-bit integers, and pack these unsigned 16-bit integers in
361/// the low 16 bits of 64-bit elements returned.
362///
363/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8)
364#[inline]
365#[target_feature(enable = "sse2")]
366#[cfg_attr(test, assert_instr(psadbw))]
367#[stable(feature = "simd_x86", since = "1.27.0")]
368pub fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i {
369    unsafe { transmute(psadbw(a.as_u8x16(), b.as_u8x16())) }
370}
371
372/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`.
373///
374/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8)
375#[inline]
376#[target_feature(enable = "sse2")]
377#[cfg_attr(test, assert_instr(psubb))]
378#[stable(feature = "simd_x86", since = "1.27.0")]
379#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
380pub const fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i {
381    unsafe { transmute(simd_sub(a.as_i8x16(), b.as_i8x16())) }
382}
383
384/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`.
385///
386/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16)
387#[inline]
388#[target_feature(enable = "sse2")]
389#[cfg_attr(test, assert_instr(psubw))]
390#[stable(feature = "simd_x86", since = "1.27.0")]
391#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
392pub const fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i {
393    unsafe { transmute(simd_sub(a.as_i16x8(), b.as_i16x8())) }
394}
395
396/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
397///
398/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32)
399#[inline]
400#[target_feature(enable = "sse2")]
401#[cfg_attr(test, assert_instr(psubd))]
402#[stable(feature = "simd_x86", since = "1.27.0")]
403#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
404pub const fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i {
405    unsafe { transmute(simd_sub(a.as_i32x4(), b.as_i32x4())) }
406}
407
408/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
409///
410/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64)
411#[inline]
412#[target_feature(enable = "sse2")]
413#[cfg_attr(test, assert_instr(psubq))]
414#[stable(feature = "simd_x86", since = "1.27.0")]
415#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
416pub const fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i {
417    unsafe { transmute(simd_sub(a.as_i64x2(), b.as_i64x2())) }
418}
419
420/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`
421/// using saturation.
422///
423/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8)
424#[inline]
425#[target_feature(enable = "sse2")]
426#[cfg_attr(test, assert_instr(psubsb))]
427#[stable(feature = "simd_x86", since = "1.27.0")]
428#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
429pub const fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i {
430    unsafe { transmute(simd_saturating_sub(a.as_i8x16(), b.as_i8x16())) }
431}
432
433/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`
434/// using saturation.
435///
436/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16)
437#[inline]
438#[target_feature(enable = "sse2")]
439#[cfg_attr(test, assert_instr(psubsw))]
440#[stable(feature = "simd_x86", since = "1.27.0")]
441#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
442pub const fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i {
443    unsafe { transmute(simd_saturating_sub(a.as_i16x8(), b.as_i16x8())) }
444}
445
446/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit
447/// integers in `a` using saturation.
448///
449/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8)
450#[inline]
451#[target_feature(enable = "sse2")]
452#[cfg_attr(test, assert_instr(psubusb))]
453#[stable(feature = "simd_x86", since = "1.27.0")]
454#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
455pub const fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i {
456    unsafe { transmute(simd_saturating_sub(a.as_u8x16(), b.as_u8x16())) }
457}
458
459/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit
460/// integers in `a` using saturation.
461///
462/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16)
463#[inline]
464#[target_feature(enable = "sse2")]
465#[cfg_attr(test, assert_instr(psubusw))]
466#[stable(feature = "simd_x86", since = "1.27.0")]
467#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
468pub const fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i {
469    unsafe { transmute(simd_saturating_sub(a.as_u16x8(), b.as_u16x8())) }
470}
471
472/// Shifts `a` left by `IMM8` bytes while shifting in zeros.
473///
474/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128)
475#[inline]
476#[target_feature(enable = "sse2")]
477#[cfg_attr(test, assert_instr(pslldq, IMM8 = 1))]
478#[rustc_legacy_const_generics(1)]
479#[stable(feature = "simd_x86", since = "1.27.0")]
480#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
481pub const fn _mm_slli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
482    static_assert_uimm_bits!(IMM8, 8);
483    unsafe { _mm_slli_si128_impl::<IMM8>(a) }
484}
485
486/// Implementation detail: converts the immediate argument of the
487/// `_mm_slli_si128` intrinsic into a compile-time constant.
488#[inline]
489#[target_feature(enable = "sse2")]
490#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
491const unsafe fn _mm_slli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
492    const fn mask(shift: i32, i: u32) -> u32 {
493        let shift = shift as u32 & 0xff;
494        if shift > 15 { i } else { 16 - shift + i }
495    }
496    transmute::<i8x16, _>(simd_shuffle!(
497        i8x16::ZERO,
498        a.as_i8x16(),
499        [
500            mask(IMM8, 0),
501            mask(IMM8, 1),
502            mask(IMM8, 2),
503            mask(IMM8, 3),
504            mask(IMM8, 4),
505            mask(IMM8, 5),
506            mask(IMM8, 6),
507            mask(IMM8, 7),
508            mask(IMM8, 8),
509            mask(IMM8, 9),
510            mask(IMM8, 10),
511            mask(IMM8, 11),
512            mask(IMM8, 12),
513            mask(IMM8, 13),
514            mask(IMM8, 14),
515            mask(IMM8, 15),
516        ],
517    ))
518}
519
520/// Shifts `a` left by `IMM8` bytes while shifting in zeros.
521///
522/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128)
523#[inline]
524#[target_feature(enable = "sse2")]
525#[cfg_attr(test, assert_instr(pslldq, IMM8 = 1))]
526#[rustc_legacy_const_generics(1)]
527#[stable(feature = "simd_x86", since = "1.27.0")]
528#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
529pub const fn _mm_bslli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
530    unsafe {
531        static_assert_uimm_bits!(IMM8, 8);
532        _mm_slli_si128_impl::<IMM8>(a)
533    }
534}
535
536/// Shifts `a` right by `IMM8` bytes while shifting in zeros.
537///
538/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128)
539#[inline]
540#[target_feature(enable = "sse2")]
541#[cfg_attr(test, assert_instr(psrldq, IMM8 = 1))]
542#[rustc_legacy_const_generics(1)]
543#[stable(feature = "simd_x86", since = "1.27.0")]
544#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
545pub const fn _mm_bsrli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
546    unsafe {
547        static_assert_uimm_bits!(IMM8, 8);
548        _mm_srli_si128_impl::<IMM8>(a)
549    }
550}
551
552/// Shifts packed 16-bit integers in `a` left by `IMM8` while shifting in zeros.
553///
554/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16)
555#[inline]
556#[target_feature(enable = "sse2")]
557#[cfg_attr(test, assert_instr(psllw, IMM8 = 7))]
558#[rustc_legacy_const_generics(1)]
559#[stable(feature = "simd_x86", since = "1.27.0")]
560#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
561pub const fn _mm_slli_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
562    static_assert_uimm_bits!(IMM8, 8);
563    unsafe {
564        if IMM8 >= 16 {
565            _mm_setzero_si128()
566        } else {
567            transmute(simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16)))
568        }
569    }
570}
571
572/// Shifts packed 16-bit integers in `a` left by `count` while shifting in
573/// zeros.
574///
575/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16)
576#[inline]
577#[target_feature(enable = "sse2")]
578#[cfg_attr(test, assert_instr(psllw))]
579#[stable(feature = "simd_x86", since = "1.27.0")]
580pub fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i {
581    unsafe { transmute(psllw(a.as_i16x8(), count.as_i16x8())) }
582}
583
584/// Shifts packed 32-bit integers in `a` left by `IMM8` while shifting in zeros.
585///
586/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32)
587#[inline]
588#[target_feature(enable = "sse2")]
589#[cfg_attr(test, assert_instr(pslld, IMM8 = 7))]
590#[rustc_legacy_const_generics(1)]
591#[stable(feature = "simd_x86", since = "1.27.0")]
592#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
593pub const fn _mm_slli_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
594    static_assert_uimm_bits!(IMM8, 8);
595    unsafe {
596        if IMM8 >= 32 {
597            _mm_setzero_si128()
598        } else {
599            transmute(simd_shl(a.as_u32x4(), u32x4::splat(IMM8 as u32)))
600        }
601    }
602}
603
604/// Shifts packed 32-bit integers in `a` left by `count` while shifting in
605/// zeros.
606///
607/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32)
608#[inline]
609#[target_feature(enable = "sse2")]
610#[cfg_attr(test, assert_instr(pslld))]
611#[stable(feature = "simd_x86", since = "1.27.0")]
612pub fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i {
613    unsafe { transmute(pslld(a.as_i32x4(), count.as_i32x4())) }
614}
615
616/// Shifts packed 64-bit integers in `a` left by `IMM8` while shifting in zeros.
617///
618/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64)
619#[inline]
620#[target_feature(enable = "sse2")]
621#[cfg_attr(test, assert_instr(psllq, IMM8 = 7))]
622#[rustc_legacy_const_generics(1)]
623#[stable(feature = "simd_x86", since = "1.27.0")]
624#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
625pub const fn _mm_slli_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
626    static_assert_uimm_bits!(IMM8, 8);
627    unsafe {
628        if IMM8 >= 64 {
629            _mm_setzero_si128()
630        } else {
631            transmute(simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64)))
632        }
633    }
634}
635
636/// Shifts packed 64-bit integers in `a` left by `count` while shifting in
637/// zeros.
638///
639/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64)
640#[inline]
641#[target_feature(enable = "sse2")]
642#[cfg_attr(test, assert_instr(psllq))]
643#[stable(feature = "simd_x86", since = "1.27.0")]
644pub fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i {
645    unsafe { transmute(psllq(a.as_i64x2(), count.as_i64x2())) }
646}
647
648/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in sign
649/// bits.
650///
651/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16)
652#[inline]
653#[target_feature(enable = "sse2")]
654#[cfg_attr(test, assert_instr(psraw, IMM8 = 1))]
655#[rustc_legacy_const_generics(1)]
656#[stable(feature = "simd_x86", since = "1.27.0")]
657#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
658pub const fn _mm_srai_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
659    static_assert_uimm_bits!(IMM8, 8);
660    unsafe { transmute(simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16))) }
661}
662
663/// Shifts packed 16-bit integers in `a` right by `count` while shifting in sign
664/// bits.
665///
666/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16)
667#[inline]
668#[target_feature(enable = "sse2")]
669#[cfg_attr(test, assert_instr(psraw))]
670#[stable(feature = "simd_x86", since = "1.27.0")]
671pub fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i {
672    unsafe { transmute(psraw(a.as_i16x8(), count.as_i16x8())) }
673}
674
675/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in sign
676/// bits.
677///
678/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32)
679#[inline]
680#[target_feature(enable = "sse2")]
681#[cfg_attr(test, assert_instr(psrad, IMM8 = 1))]
682#[rustc_legacy_const_generics(1)]
683#[stable(feature = "simd_x86", since = "1.27.0")]
684#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
685pub const fn _mm_srai_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
686    static_assert_uimm_bits!(IMM8, 8);
687    unsafe { transmute(simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31)))) }
688}
689
690/// Shifts packed 32-bit integers in `a` right by `count` while shifting in sign
691/// bits.
692///
693/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32)
694#[inline]
695#[target_feature(enable = "sse2")]
696#[cfg_attr(test, assert_instr(psrad))]
697#[stable(feature = "simd_x86", since = "1.27.0")]
698pub fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i {
699    unsafe { transmute(psrad(a.as_i32x4(), count.as_i32x4())) }
700}
701
702/// Shifts `a` right by `IMM8` bytes while shifting in zeros.
703///
704/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128)
705#[inline]
706#[target_feature(enable = "sse2")]
707#[cfg_attr(test, assert_instr(psrldq, IMM8 = 1))]
708#[rustc_legacy_const_generics(1)]
709#[stable(feature = "simd_x86", since = "1.27.0")]
710#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
711pub const fn _mm_srli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
712    static_assert_uimm_bits!(IMM8, 8);
713    unsafe { _mm_srli_si128_impl::<IMM8>(a) }
714}
715
/// Implementation detail: converts the immediate argument of the
/// `_mm_srli_si128` intrinsic into a compile-time constant.
#[inline]
#[target_feature(enable = "sse2")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
const unsafe fn _mm_srli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
    // Computes the shuffle index for lane `i` of the result. Indices 0..=15
    // select from `a`; indices 16..=31 select from the zero vector, so a
    // shift of more than 15 bytes yields all zeros, matching `psrldq`.
    const fn mask(shift: i32, i: u32) -> u32 {
        if (shift as u32) > 15 {
            i + 16
        } else {
            i + (shift as u32)
        }
    }
    let x: i8x16 = simd_shuffle!(
        a.as_i8x16(),
        i8x16::ZERO,
        [
            mask(IMM8, 0),
            mask(IMM8, 1),
            mask(IMM8, 2),
            mask(IMM8, 3),
            mask(IMM8, 4),
            mask(IMM8, 5),
            mask(IMM8, 6),
            mask(IMM8, 7),
            mask(IMM8, 8),
            mask(IMM8, 9),
            mask(IMM8, 10),
            mask(IMM8, 11),
            mask(IMM8, 12),
            mask(IMM8, 13),
            mask(IMM8, 14),
            mask(IMM8, 15),
        ],
    );
    transmute(x)
}
753
/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlw, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_srli_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // Shift amounts of 16 or more must be handled explicitly: `psrlw`
        // produces zero there, while an over-wide `simd_shr` would not be
        // a valid shift for the lane type.
        if IMM8 >= 16 {
            _mm_setzero_si128()
        } else {
            // Unsigned lane type gives the logical (zero-filling) shift.
            transmute(simd_shr(a.as_u16x8(), u16x8::splat(IMM8 as u16)))
        }
    }
}
774
/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i {
    // Delegates to the `psrlw` intrinsic (run-time shift count).
    unsafe { transmute(psrlw(a.as_i16x8(), count.as_i16x8())) }
}
786
/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrld, IMM8 = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_srli_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // Over-wide shifts (>= 32) zero the result, matching `psrld`;
        // they cannot be expressed directly through `simd_shr`.
        if IMM8 >= 32 {
            _mm_setzero_si128()
        } else {
            // Unsigned lane type gives the logical (zero-filling) shift.
            transmute(simd_shr(a.as_u32x4(), u32x4::splat(IMM8 as u32)))
        }
    }
}
807
/// Shifts packed 32-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i {
    // Delegates to the `psrld` intrinsic (run-time shift count).
    unsafe { transmute(psrld(a.as_i32x4(), count.as_i32x4())) }
}
819
/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_srli_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // Over-wide shifts (>= 64) zero the result, matching `psrlq`;
        // they cannot be expressed directly through `simd_shr`.
        if IMM8 >= 64 {
            _mm_setzero_si128()
        } else {
            // Unsigned lane type gives the logical (zero-filling) shift.
            transmute(simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64)))
        }
    }
}
840
/// Shifts packed 64-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i {
    // Delegates to the `psrlq` intrinsic (run-time shift count).
    unsafe { transmute(psrlq(a.as_i64x2(), count.as_i64x2())) }
}
852
/// Computes the bitwise AND of 128 bits (representing integer data) in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(andps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i {
    // Lane-wise AND over the whole 128-bit value; lane interpretation is
    // irrelevant for a pure bitwise operation.
    unsafe { simd_and(a, b) }
}
865
/// Computes the bitwise NOT of 128 bits (representing integer data) in `a` and
/// then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(andnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i {
    // `_mm_set1_epi8(-1)` is the all-ones vector; XOR with all-ones is
    // bitwise NOT, so this computes `(!a) & b`.
    unsafe { simd_and(simd_xor(_mm_set1_epi8(-1), a), b) }
}
878
/// Computes the bitwise OR of 128 bits (representing integer data) in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(orps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i {
    // Pure bitwise OR; lane interpretation is irrelevant.
    unsafe { simd_or(a, b) }
}
891
/// Computes the bitwise XOR of 128 bits (representing integer data) in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i {
    // Pure bitwise XOR; lane interpretation is irrelevant.
    unsafe { simd_xor(a, b) }
}
904
/// Compares packed 8-bit integers in `a` and `b` for equality.
///
/// Each result lane is all-ones (`-1`) when equal and all-zeros otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpeqb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i8x16, _>(simd_eq(a.as_i8x16(), b.as_i8x16())) }
}
916
/// Compares packed 16-bit integers in `a` and `b` for equality.
///
/// Each result lane is all-ones (`-1`) when equal and all-zeros otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpeqw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i16x8, _>(simd_eq(a.as_i16x8(), b.as_i16x8())) }
}
928
/// Compares packed 32-bit integers in `a` and `b` for equality.
///
/// Each result lane is all-ones (`-1`) when equal and all-zeros otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpeqd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_eq(a.as_i32x4(), b.as_i32x4())) }
}
940
/// Compares packed 8-bit integers in `a` and `b` for greater-than.
///
/// The comparison is signed (`i8x16` lanes); each result lane is all-ones
/// when `a > b` and all-zeros otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i8x16, _>(simd_gt(a.as_i8x16(), b.as_i8x16())) }
}
952
/// Compares packed 16-bit integers in `a` and `b` for greater-than.
///
/// The comparison is signed (`i16x8` lanes); each result lane is all-ones
/// when `a > b` and all-zeros otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i16x8, _>(simd_gt(a.as_i16x8(), b.as_i16x8())) }
}
964
/// Compares packed 32-bit integers in `a` and `b` for greater-than.
///
/// The comparison is signed (`i32x4` lanes); each result lane is all-ones
/// when `a > b` and all-zeros otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_gt(a.as_i32x4(), b.as_i32x4())) }
}
976
/// Compares packed 8-bit integers in `a` and `b` for less-than.
///
/// There is no dedicated less-than instruction; hardware emits `pcmpgtb`
/// with the operands swapped (hence the `assert_instr` below).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i8x16, _>(simd_lt(a.as_i8x16(), b.as_i8x16())) }
}
988
/// Compares packed 16-bit integers in `a` and `b` for less-than.
///
/// There is no dedicated less-than instruction; hardware emits `pcmpgtw`
/// with the operands swapped (hence the `assert_instr` below).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i16x8, _>(simd_lt(a.as_i16x8(), b.as_i16x8())) }
}
1000
/// Compares packed 32-bit integers in `a` and `b` for less-than.
///
/// There is no dedicated less-than instruction; hardware emits `pcmpgtd`
/// with the operands swapped (hence the `assert_instr` below).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_lt(a.as_i32x4(), b.as_i32x4())) }
}
1012
/// Converts the lower two packed 32-bit integers in `a` to packed
/// double-precision (64-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtdq2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtepi32_pd(a: __m128i) -> __m128d {
    unsafe {
        let a = a.as_i32x4();
        // Extract lanes 0 and 1 via shuffle, then widen i32 -> f64.
        // Every i32 is exactly representable as an f64, so no rounding occurs.
        simd_cast::<i32x2, __m128d>(simd_shuffle!(a, a, [0, 1]))
    }
}
1028
/// Returns `a` with its lower element replaced by `b` after converting it to
/// an `f64`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtsi2sd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d {
    // Replace lane 0 only; the upper lane of `a` is preserved. The i32 -> f64
    // conversion is exact.
    unsafe { simd_insert!(a, 0, b as f64) }
}
1041
/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtdq2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtepi32_ps(a: __m128i) -> __m128 {
    // Lane-wise i32 -> f32 cast; values beyond f32's 24-bit mantissa are
    // rounded per the conversion semantics in Intel's docs.
    unsafe { transmute(simd_cast::<_, f32x4>(a.as_i32x4())) }
}
1054
/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtps_epi32(a: __m128) -> __m128i {
    // Delegates to the `cvtps2dq` intrinsic rather than a plain cast, so the
    // hardware rounding behavior documented by Intel is preserved.
    unsafe { transmute(cvtps2dq(a)) }
}
1066
/// Returns a vector whose lowest element is `a` and all higher elements are
/// `0`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtsi32_si128(a: i32) -> __m128i {
    // `a` goes into lane 0; lanes 1..=3 are zeroed.
    unsafe { transmute(i32x4::new(a, 0, 0, 0)) }
}
1078
/// Returns the lowest element of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtsi128_si32(a: __m128i) -> i32 {
    // Extract lane 0 of the vector viewed as four i32s.
    unsafe { simd_extract!(a.as_i32x4(), 0) }
}
1089
/// Sets packed 64-bit integers with the supplied values, from highest to
/// lowest.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i {
    // Arguments are highest-element-first (Intel convention), while the
    // vector constructor takes lanes lowest-first — hence `(e0, e1)`.
    unsafe { transmute(i64x2::new(e0, e1)) }
}
1102
/// Sets packed 32-bit integers with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
    // Arguments are highest-element-first (Intel convention); the vector
    // constructor takes lanes lowest-first, so the order is reversed here.
    unsafe { transmute(i32x4::new(e0, e1, e2, e3)) }
}
1114
/// Sets packed 16-bit integers with the supplied values.
///
/// Arguments are given highest-element-first (Intel convention); `e0` ends
/// up in the lowest lane.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set_epi16(
    e7: i16,
    e6: i16,
    e5: i16,
    e4: i16,
    e3: i16,
    e2: i16,
    e1: i16,
    e0: i16,
) -> __m128i {
    // The vector constructor takes lanes lowest-first.
    unsafe { transmute(i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) }
}
1135
/// Sets packed 8-bit integers with the supplied values.
///
/// Arguments are given highest-element-first (Intel convention); `e0` ends
/// up in the lowest lane.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set_epi8(
    e15: i8,
    e14: i8,
    e13: i8,
    e12: i8,
    e11: i8,
    e10: i8,
    e9: i8,
    e8: i8,
    e7: i8,
    e6: i8,
    e5: i8,
    e4: i8,
    e3: i8,
    e2: i8,
    e1: i8,
    e0: i8,
) -> __m128i {
    unsafe {
        // The vector constructor takes lanes lowest-first.
        #[rustfmt::skip]
        transmute(i8x16::new(
            e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
        ))
    }
}
1169
/// Broadcasts 64-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set1_epi64x(a: i64) -> __m128i {
    // Splat into both lanes; no `unsafe` needed via the `as_m128i` helper.
    i64x2::splat(a).as_m128i()
}
1181
/// Broadcasts 32-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set1_epi32(a: i32) -> __m128i {
    // Splat into all four lanes; no `unsafe` needed via the `as_m128i` helper.
    i32x4::splat(a).as_m128i()
}
1193
/// Broadcasts 16-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set1_epi16(a: i16) -> __m128i {
    // Splat into all eight lanes; no `unsafe` needed via the `as_m128i` helper.
    i16x8::splat(a).as_m128i()
}
1205
/// Broadcasts 8-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set1_epi8(a: i8) -> __m128i {
    // Splat into all sixteen lanes; no `unsafe` needed via the `as_m128i` helper.
    i8x16::splat(a).as_m128i()
}
1217
/// Sets packed 32-bit integers with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
    // Simply forwards to `_mm_set_epi32` with the argument order reversed.
    _mm_set_epi32(e0, e1, e2, e3)
}
1229
/// Sets packed 16-bit integers with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_setr_epi16(
    e7: i16,
    e6: i16,
    e5: i16,
    e4: i16,
    e3: i16,
    e2: i16,
    e1: i16,
    e0: i16,
) -> __m128i {
    // Simply forwards to `_mm_set_epi16` with the argument order reversed.
    _mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7)
}
1250
/// Sets packed 8-bit integers with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_setr_epi8(
    e15: i8,
    e14: i8,
    e13: i8,
    e12: i8,
    e11: i8,
    e10: i8,
    e9: i8,
    e8: i8,
    e7: i8,
    e6: i8,
    e5: i8,
    e4: i8,
    e3: i8,
    e2: i8,
    e1: i8,
    e0: i8,
) -> __m128i {
    // Simply forwards to `_mm_set_epi8` with the argument order reversed.
    #[rustfmt::skip]
    _mm_set_epi8(
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
    )
}
1282
/// Returns a vector with all elements set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_setzero_si128() -> __m128i {
    // The all-zero bit pattern is a valid `__m128i`, so `mem::zeroed` is
    // sound; the `const` block evaluates it at compile time.
    const { unsafe { mem::zeroed() } }
}
1294
/// Loads 64-bit integer from memory into first element of returned vector.
///
/// Only the low 8 bytes at `mem_addr` are read (unaligned read is fine);
/// the upper 64 bits of the result are zeroed.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_loadl_epi64(mem_addr: *const __m128i) -> __m128i {
    _mm_set_epi64x(0, ptr::read_unaligned(mem_addr as *const i64))
}
1305
/// Loads 128-bits of integer data from memory into a new vector.
///
/// `mem_addr` must be aligned on a 16-byte boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i {
    // A plain aligned dereference; the 16-byte alignment requirement is the
    // caller's obligation (this is why the function is `unsafe`).
    *mem_addr
}
1322
/// Loads 128-bits of integer data from memory into a new vector.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i {
    // A byte-wise copy into a local sidesteps any alignment assumption on
    // `mem_addr`; every byte of `dst` is overwritten before it is returned.
    let mut dst: __m128i = _mm_undefined_si128();
    ptr::copy_nonoverlapping(
        mem_addr as *const u8,
        ptr::addr_of_mut!(dst) as *mut u8,
        mem::size_of::<__m128i>(),
    );
    dst
}
1342
/// Conditionally store 8-bit integer elements from `a` into memory using
/// `mask` flagged as non-temporal (unlikely to be used again soon).
///
/// Elements are not stored when the highest bit is not set in the
/// corresponding element.
///
/// `mem_addr` should correspond to a 128-bit memory location and does not need
/// to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(maskmovdqu))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_maskmoveu_si128(a: __m128i, mask: __m128i, mem_addr: *mut i8) {
    // Delegates to the `maskmovdqu` intrinsic; per-byte masking is done in
    // hardware based on each mask byte's sign bit.
    maskmovdqu(a.as_i8x16(), mask.as_i8x16(), mem_addr)
}
1369
/// Stores 128-bits of integer data from `a` into memory.
///
/// `mem_addr` must be aligned on a 16-byte boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) {
    // A plain aligned store; the 16-byte alignment requirement is the
    // caller's obligation (this is why the function is `unsafe`).
    *mem_addr = a;
}
1386
/// Stores 128-bits of integer data from `a` into memory.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movups))] // FIXME movdqu expected
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) {
    // `write_unaligned` performs the store without assuming any alignment.
    mem_addr.write_unaligned(a);
}
1400
/// Stores the lower 64-bit integer `a` to a memory location.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
    // Copy only the low 8 bytes of `a`; a byte-wise copy imposes no
    // alignment requirement on `mem_addr`.
    ptr::copy_nonoverlapping(ptr::addr_of!(a) as *const u8, mem_addr as *mut u8, 8);
}
1413
/// Stores a 128-bit integer vector to a 128-bit aligned memory location.
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movntdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    // NOTE: the `{p}` memory operand is supplied by the `vps!` macro
    // (defined elsewhere in this crate), which formats the full operand string.
    crate::arch::asm!(
        vps!("movntdq",  ",{a}"),
        p = in(reg) mem_addr,
        a = in(xmm_reg) a,
        options(nostack, preserves_flags),
    );
}
1441
/// Stores a 32-bit integer value in the specified memory location.
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movnti))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    // NOTE: the `{p}` memory operand is supplied by the `vps!` macro
    // (defined elsewhere in this crate).
    crate::arch::asm!(
        vps!("movnti", ",{a:e}"), // `:e` for 32bit value
        p = in(reg) mem_addr,
        a = in(reg) a,
        options(nostack, preserves_flags),
    );
}
1469
/// Returns a vector where the low element is extracted from `a` and its upper
/// element is zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64)
#[inline]
#[target_feature(enable = "sse2")]
// FIXME movd on msvc, movd on i686
#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(movq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_move_epi64(a: __m128i) -> __m128i {
    unsafe {
        // Index 0 selects `a`'s low lane; index 2 selects lane 0 of the zero
        // vector, clearing the high lane.
        let r: i64x2 = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 2]);
        transmute(r)
    }
}
1486
/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
/// using signed saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(packsswb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i {
    // Delegates to the `packsswb` intrinsic for hardware saturation semantics.
    unsafe { transmute(packsswb(a.as_i16x8(), b.as_i16x8())) }
}
1498
1499/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
1500/// using signed saturation.
1501///
1502/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32)
1503#[inline]
1504#[target_feature(enable = "sse2")]
1505#[cfg_attr(test, assert_instr(packssdw))]
1506#[stable(feature = "simd_x86", since = "1.27.0")]
1507pub fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i {
1508    unsafe { transmute(packssdw(a.as_i32x4(), b.as_i32x4())) }
1509}
1510
1511/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
1512/// using unsigned saturation.
1513///
1514/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16)
1515#[inline]
1516#[target_feature(enable = "sse2")]
1517#[cfg_attr(test, assert_instr(packuswb))]
1518#[stable(feature = "simd_x86", since = "1.27.0")]
1519pub fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i {
1520    unsafe { transmute(packuswb(a.as_i16x8(), b.as_i16x8())) }
1521}
1522
/// Returns the `imm8` element of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pextrw, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_extract_epi16<const IMM8: i32>(a: __m128i) -> i32 {
    // 8 lanes of 16 bits -> index must fit in 3 bits.
    static_assert_uimm_bits!(IMM8, 3);
    // Extract as u16 so the widening to i32 is a zero-extension, matching
    // PEXTRW's zero-extended result.
    unsafe { simd_extract!(a.as_u16x8(), IMM8 as u32, u16) as i32 }
}

/// Returns a new vector where the `imm8` element of `a` is replaced with `i`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pinsrw, IMM8 = 7))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_insert_epi16<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
    // 8 lanes of 16 bits -> index must fit in 3 bits.
    static_assert_uimm_bits!(IMM8, 3);
    // Only the low 16 bits of `i` are used, matching PINSRW.
    unsafe { transmute(simd_insert!(a.as_i16x8(), IMM8 as u32, i as i16)) }
}
1550
/// Returns a mask of the most significant bit of each element in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmovmskb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_movemask_epi8(a: __m128i) -> i32 {
    unsafe {
        // A lane's sign bit is set iff the lane compares less-than zero as a
        // signed i8; simd_lt yields an all-ones/all-zeros mask per lane.
        let z = i8x16::ZERO;
        let m: i8x16 = simd_lt(a.as_i8x16(), z);
        // Collapse the 16 lane masks to 16 bits; go through u32 so the upper
        // 16 bits of the i32 result are zero (as PMOVMSKB guarantees).
        simd_bitmask::<_, u16>(m) as u32 as i32
    }
}
1566
/// Shuffles 32-bit integers in `a` using the control in `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pshufd, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_shuffle_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i32x4();
        // Each 2-bit field of IMM8 selects the source lane for one output
        // lane: bits [1:0] -> lane 0, [3:2] -> lane 1, [5:4] -> lane 2,
        // [7:6] -> lane 3 (PSHUFD encoding).
        let x: i32x4 = simd_shuffle!(
            a,
            a,
            [
                IMM8 as u32 & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
            ],
        );
        transmute(x)
    }
}
1593
/// Shuffles 16-bit integers in the high 64 bits of `a` using the control in
/// `IMM8`.
///
/// Put the results in the high 64 bits of the returned vector, with the low 64
/// bits being copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pshufhw, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_shufflehi_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i16x8();
        // Lanes 0..=3 pass through unchanged; each 2-bit field of IMM8 picks
        // one of the high lanes (offset by +4) for output lanes 4..=7
        // (PSHUFHW encoding).
        let x: i16x8 = simd_shuffle!(
            a,
            a,
            [
                0,
                1,
                2,
                3,
                (IMM8 as u32 & 0b11) + 4,
                ((IMM8 as u32 >> 2) & 0b11) + 4,
                ((IMM8 as u32 >> 4) & 0b11) + 4,
                ((IMM8 as u32 >> 6) & 0b11) + 4,
            ],
        );
        transmute(x)
    }
}

/// Shuffles 16-bit integers in the low 64 bits of `a` using the control in
/// `IMM8`.
///
/// Put the results in the low 64 bits of the returned vector, with the high 64
/// bits being copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pshuflw, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_shufflelo_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i16x8();
        // Mirror of _mm_shufflehi_epi16: each 2-bit field of IMM8 picks a low
        // lane for output lanes 0..=3, while lanes 4..=7 pass through
        // unchanged (PSHUFLW encoding).
        let x: i16x8 = simd_shuffle!(
            a,
            a,
            [
                IMM8 as u32 & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
                4,
                5,
                6,
                7,
            ],
        );
        transmute(x)
    }
}
1663
/// Unpacks and interleave 8-bit integers from the high half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(punpckhbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        // Indices 0..=15 address `a`, 16..=31 address `b`; alternating
        // a[8..], b[8..] interleaves the high halves (PUNPCKHBW).
        transmute::<i8x16, _>(simd_shuffle!(
            a.as_i8x16(),
            b.as_i8x16(),
            [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31],
        ))
    }
}

/// Unpacks and interleave 16-bit integers from the high half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(punpckhwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        // Indices 0..=7 address `a`, 8..=15 address `b` (PUNPCKHWD).
        let x = simd_shuffle!(a.as_i16x8(), b.as_i16x8(), [4, 12, 5, 13, 6, 14, 7, 15]);
        transmute::<i16x8, _>(x)
    }
}

/// Unpacks and interleave 32-bit integers from the high half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(unpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i {
    // Indices 0..=3 address `a`, 4..=7 address `b`.
    unsafe { transmute::<i32x4, _>(simd_shuffle!(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7])) }
}

/// Unpacks and interleave 64-bit integers from the high half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(unpckhpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i {
    // Index 1 is `a`'s high i64, index 3 is `b`'s high i64.
    unsafe { transmute::<i64x2, _>(simd_shuffle!(a.as_i64x2(), b.as_i64x2(), [1, 3])) }
}
1720
/// Unpacks and interleave 8-bit integers from the low half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(punpcklbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        // Indices 0..=15 address `a`, 16..=31 address `b`; alternating
        // a[0..], b[0..] interleaves the low halves (PUNPCKLBW).
        transmute::<i8x16, _>(simd_shuffle!(
            a.as_i8x16(),
            b.as_i8x16(),
            [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23],
        ))
    }
}

/// Unpacks and interleave 16-bit integers from the low half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(punpcklwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        // Indices 0..=7 address `a`, 8..=15 address `b` (PUNPCKLWD).
        let x = simd_shuffle!(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]);
        transmute::<i16x8, _>(x)
    }
}

/// Unpacks and interleave 32-bit integers from the low half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(unpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i {
    // Indices 0..=3 address `a`, 4..=7 address `b`.
    unsafe { transmute::<i32x4, _>(simd_shuffle!(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5])) }
}

/// Unpacks and interleave 64-bit integers from the low half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movlhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i {
    // Index 0 is `a`'s low i64, index 2 is `b`'s low i64.
    unsafe { transmute::<i64x2, _>(simd_shuffle!(a.as_i64x2(), b.as_i64x2(), [0, 2])) }
}
1777
/// Returns a new vector with the low element of `a` replaced by the sum of the
/// low elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(addsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d {
    // Scalar add on the low lanes; `a`'s upper lane is preserved by
    // inserting only into lane 0.
    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) + _mm_cvtsd_f64(b)) }
}

/// Adds packed double-precision (64-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(addpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d {
    // Lane-wise f64 addition (ADDPD).
    unsafe { simd_add(a, b) }
}

/// Returns a new vector with the low element of `a` replaced by the result of
/// dividing the lower element of `a` by the lower element of `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(divsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_div_sd(a: __m128d, b: __m128d) -> __m128d {
    // Scalar divide on the low lanes; `a`'s upper lane is preserved.
    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) / _mm_cvtsd_f64(b)) }
}

/// Divide packed double-precision (64-bit) floating-point elements in `a` by
/// packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(divpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_div_pd(a: __m128d, b: __m128d) -> __m128d {
    // Lane-wise f64 division (DIVPD).
    unsafe { simd_div(a, b) }
}
1829
/// Returns a new vector with the low element of `a` replaced by the maximum
/// of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(maxsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_sd(a: __m128d, b: __m128d) -> __m128d {
    // Defers to the MAXSD intrinsic rather than a portable max so NaN and
    // signed-zero handling exactly matches the hardware instruction.
    unsafe { maxsd(a, b) }
}

/// Returns a new vector with the maximum values from corresponding elements in
/// `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(maxpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_pd(a: __m128d, b: __m128d) -> __m128d {
    // Defers to MAXPD for instruction-exact NaN/signed-zero semantics.
    unsafe { maxpd(a, b) }
}

/// Returns a new vector with the low element of `a` replaced by the minimum
/// of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(minsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_sd(a: __m128d, b: __m128d) -> __m128d {
    // Defers to MINSD for instruction-exact NaN/signed-zero semantics.
    unsafe { minsd(a, b) }
}

/// Returns a new vector with the minimum values from corresponding elements in
/// `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(minpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_pd(a: __m128d, b: __m128d) -> __m128d {
    // Defers to MINPD for instruction-exact NaN/signed-zero semantics.
    unsafe { minpd(a, b) }
}
1877
/// Returns a new vector with the low element of `a` replaced by multiplying the
/// low elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(mulsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mul_sd(a: __m128d, b: __m128d) -> __m128d {
    // Scalar multiply on the low lanes; `a`'s upper lane is preserved.
    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) * _mm_cvtsd_f64(b)) }
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
/// and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(mulpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mul_pd(a: __m128d, b: __m128d) -> __m128d {
    // Lane-wise f64 multiplication (MULPD).
    unsafe { simd_mul(a, b) }
}

/// Returns a new vector with the low element of `a` replaced by the square
/// root of the lower element of `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(sqrtsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d {
    // Note the operands: sqrt of `b`'s low lane, inserted into `a` (the
    // upper lane comes from `a`), matching SQRTSD.
    unsafe { simd_insert!(a, 0, sqrtf64(_mm_cvtsd_f64(b))) }
}

/// Returns a new vector with the square root of each of the values in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(sqrtpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sqrt_pd(a: __m128d) -> __m128d {
    // Lane-wise f64 square root (SQRTPD).
    unsafe { simd_fsqrt(a) }
}
1926
/// Returns a new vector with the low element of `a` replaced by subtracting
/// the low element of `b` from the low element of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(subsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_sub_sd(a: __m128d, b: __m128d) -> __m128d {
    // Scalar subtract on the low lanes; `a`'s upper lane is preserved.
    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) - _mm_cvtsd_f64(b)) }
}

/// Subtract packed double-precision (64-bit) floating-point elements in `b`
/// from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(subpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_sub_pd(a: __m128d, b: __m128d) -> __m128d {
    // Lane-wise f64 subtraction (SUBPD).
    unsafe { simd_sub(a, b) }
}
1952
/// Computes the bitwise AND of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(andps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_and_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe {
        // SAFETY: __m128d and __m128i are both 128-bit vector types, so
        // reinterpreting the bits to run the integer AND is sound.
        let a: __m128i = transmute(a);
        let b: __m128i = transmute(b);
        transmute(_mm_and_si128(a, b))
    }
}

/// Computes the bitwise NOT of `a` and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(andnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_andnot_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe {
        // SAFETY: bit-level reinterpretation between 128-bit vector types.
        let a: __m128i = transmute(a);
        let b: __m128i = transmute(b);
        // (!a) & b, delegated to the integer ANDN implementation.
        transmute(_mm_andnot_si128(a, b))
    }
}

/// Computes the bitwise OR of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(orps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_or_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe {
        // SAFETY: bit-level reinterpretation between 128-bit vector types.
        let a: __m128i = transmute(a);
        let b: __m128i = transmute(b);
        transmute(_mm_or_si128(a, b))
    }
}

/// Computes the bitwise XOR of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe {
        // SAFETY: bit-level reinterpretation between 128-bit vector types.
        let a: __m128i = transmute(a);
        let b: __m128i = transmute(b);
        transmute(_mm_xor_si128(a, b))
    }
}
2017
/// Returns a new vector with the low element of `a` replaced by the equality
/// comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpeqsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpeq_sd(a: __m128d, b: __m128d) -> __m128d {
    // CMPSD predicate 0 = EQ (ordered equality).
    unsafe { cmpsd(a, b, 0) }
}

/// Returns a new vector with the low element of `a` replaced by the less-than
/// comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpltsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmplt_sd(a: __m128d, b: __m128d) -> __m128d {
    // CMPSD predicate 1 = LT (less-than).
    unsafe { cmpsd(a, b, 1) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// less-than-or-equal comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmplesd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d {
    // CMPSD predicate 2 = LE (less-than-or-equal).
    unsafe { cmpsd(a, b, 2) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// greater-than comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpltsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d {
    // GT has no direct CMPSD predicate: compute b < a with swapped operands,
    // then restore `a`'s upper lane (the swap would otherwise return `b`'s).
    unsafe { simd_insert!(_mm_cmplt_sd(b, a), 1, simd_extract!(a, 1, f64)) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// greater-than-or-equal comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmplesd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d {
    // GE via swapped LE (b <= a), with `a`'s upper lane restored.
    unsafe { simd_insert!(_mm_cmple_sd(b, a), 1, simd_extract!(a, 1, f64)) }
}
2077
/// Returns a new vector with the low element of `a` replaced by the result
/// of comparing both of the lower elements of `a` and `b` to `NaN`. If
/// neither are equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpordsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpord_sd(a: __m128d, b: __m128d) -> __m128d {
    // CMPSD predicate 7 = ORD (ordered: neither operand is NaN).
    unsafe { cmpsd(a, b, 7) }
}

/// Returns a new vector with the low element of `a` replaced by the result of
/// comparing both of the lower elements of `a` and `b` to `NaN`. If either is
/// equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpunordsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpunord_sd(a: __m128d, b: __m128d) -> __m128d {
    // CMPSD predicate 3 = UNORD (unordered: at least one NaN).
    unsafe { cmpsd(a, b, 3) }
}

/// Returns a new vector with the low element of `a` replaced by the not-equal
/// comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpneqsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpneq_sd(a: __m128d, b: __m128d) -> __m128d {
    // CMPSD predicate 4 = NEQ (not-equal).
    unsafe { cmpsd(a, b, 4) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// not-less-than comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnltsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnlt_sd(a: __m128d, b: __m128d) -> __m128d {
    // CMPSD predicate 5 = NLT (not-less-than).
    unsafe { cmpsd(a, b, 5) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// not-less-than-or-equal comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnlesd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d {
    // CMPSD predicate 6 = NLE (not-less-than-or-equal).
    unsafe { cmpsd(a, b, 6) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// not-greater-than comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnltsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d {
    // NGT via swapped NLT (!(b < a)), with `a`'s upper lane restored.
    unsafe { simd_insert!(_mm_cmpnlt_sd(b, a), 1, simd_extract!(a, 1, f64)) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// not-greater-than-or-equal comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnlesd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d {
    // NGE via swapped NLE (!(b <= a)), with `a`'s upper lane restored.
    unsafe { simd_insert!(_mm_cmpnle_sd(b, a), 1, simd_extract!(a, 1, f64)) }
}
2164
/// Compares corresponding elements in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpeqpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpeq_pd(a: __m128d, b: __m128d) -> __m128d {
    // CMPPD predicate 0 = EQ.
    unsafe { cmppd(a, b, 0) }
}

/// Compares corresponding elements in `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpltpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d {
    // CMPPD predicate 1 = LT.
    unsafe { cmppd(a, b, 1) }
}

/// Compares corresponding elements in `a` and `b` for less-than-or-equal
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmplepd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmple_pd(a: __m128d, b: __m128d) -> __m128d {
    // CMPPD predicate 2 = LE.
    unsafe { cmppd(a, b, 2) }
}

/// Compares corresponding elements in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpltpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpgt_pd(a: __m128d, b: __m128d) -> __m128d {
    // GT has no direct CMPPD predicate; a > b is b < a with operands swapped
    // (no upper-lane fixup needed — both lanes are compared).
    _mm_cmplt_pd(b, a)
}

/// Compares corresponding elements in `a` and `b` for greater-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmplepd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpge_pd(a: __m128d, b: __m128d) -> __m128d {
    // GE via swapped LE: a >= b is b <= a.
    _mm_cmple_pd(b, a)
}
2219
/// Compares corresponding elements in `a` and `b` to see if neither is `NaN`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpordpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpord_pd(a: __m128d, b: __m128d) -> __m128d {
    // CMPPD predicate 7 = ORD (ordered: neither lane operand is NaN).
    unsafe { cmppd(a, b, 7) }
}

/// Compares corresponding elements in `a` and `b` to see if either is `NaN`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpunordpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpunord_pd(a: __m128d, b: __m128d) -> __m128d {
    // CMPPD predicate 3 = UNORD (unordered: at least one NaN).
    unsafe { cmppd(a, b, 3) }
}

/// Compares corresponding elements in `a` and `b` for not-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpneqpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpneq_pd(a: __m128d, b: __m128d) -> __m128d {
    // CMPPD predicate 4 = NEQ.
    unsafe { cmppd(a, b, 4) }
}

/// Compares corresponding elements in `a` and `b` for not-less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnltpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnlt_pd(a: __m128d, b: __m128d) -> __m128d {
    // CMPPD predicate 5 = NLT.
    unsafe { cmppd(a, b, 5) }
}
2263
2264/// Compares corresponding elements in `a` and `b` for not-less-than-or-equal.
2265///
2266/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd)
2267#[inline]
2268#[target_feature(enable = "sse2")]
2269#[cfg_attr(test, assert_instr(cmpnlepd))]
2270#[stable(feature = "simd_x86", since = "1.27.0")]
2271pub fn _mm_cmpnle_pd(a: __m128d, b: __m128d) -> __m128d {
2272    unsafe { cmppd(a, b, 6) }
2273}
2274
2275/// Compares corresponding elements in `a` and `b` for not-greater-than.
2276///
2277/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_pd)
2278#[inline]
2279#[target_feature(enable = "sse2")]
2280#[cfg_attr(test, assert_instr(cmpnltpd))]
2281#[stable(feature = "simd_x86", since = "1.27.0")]
2282pub fn _mm_cmpngt_pd(a: __m128d, b: __m128d) -> __m128d {
2283    _mm_cmpnlt_pd(b, a)
2284}
2285
2286/// Compares corresponding elements in `a` and `b` for
2287/// not-greater-than-or-equal.
2288///
2289/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd)
2290#[inline]
2291#[target_feature(enable = "sse2")]
2292#[cfg_attr(test, assert_instr(cmpnlepd))]
2293#[stable(feature = "simd_x86", since = "1.27.0")]
2294pub fn _mm_cmpnge_pd(a: __m128d, b: __m128d) -> __m128d {
2295    _mm_cmpnle_pd(b, a)
2296}
2297
/// Compares the lower element of `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comieq_sd(a: __m128d, b: __m128d) -> i32 {
    // Lowers to COMISD on the low f64 lanes; the comparison result is
    // materialized as an `i32`.
    unsafe { comieqsd(a, b) }
}

/// Compares the lower element of `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comilt_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { comiltsd(a, b) }
}

/// Compares the lower element of `a` and `b` for less-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comile_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { comilesd(a, b) }
}

/// Compares the lower element of `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comigt_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { comigtsd(a, b) }
}

/// Compares the lower element of `a` and `b` for greater-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comige_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { comigesd(a, b) }
}

/// Compares the lower element of `a` and `b` for not-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comineq_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { comineqsd(a, b) }
}
2363
/// Compares the lower element of `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomieq_sd(a: __m128d, b: __m128d) -> i32 {
    // Lowers to UCOMISD on the low f64 lanes (the "unordered" variant of
    // COMISD — see Intel's documentation for the NaN-signaling difference).
    unsafe { ucomieqsd(a, b) }
}

/// Compares the lower element of `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomilt_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { ucomiltsd(a, b) }
}

/// Compares the lower element of `a` and `b` for less-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomile_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { ucomilesd(a, b) }
}

/// Compares the lower element of `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomigt_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { ucomigtsd(a, b) }
}

/// Compares the lower element of `a` and `b` for greater-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomige_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { ucomigesd(a, b) }
}

/// Compares the lower element of `a` and `b` for not-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { ucomineqsd(a, b) }
}
2429
/// Converts packed double-precision (64-bit) floating-point elements in `a` to
/// packed single-precision (32-bit) floating-point elements
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtpd2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtpd_ps(a: __m128d) -> __m128 {
    unsafe {
        // Narrow the two f64 lanes to f32, then widen back to a 4-lane vector
        // with the upper two lanes zero-filled (matching CVTPD2PS semantics).
        let r = simd_cast::<_, f32x2>(a.as_f64x2());
        let zero = f32x2::ZERO;
        transmute::<f32x4, _>(simd_shuffle!(r, zero, [0, 1, 2, 3]))
    }
}

/// Converts packed single-precision (32-bit) floating-point elements in `a` to
/// packed
/// double-precision (64-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtps2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtps_pd(a: __m128) -> __m128d {
    unsafe {
        // Take the low two f32 lanes and widen each to f64; the upper two
        // lanes of `a` are discarded.
        let a = a.as_f32x4();
        transmute(simd_cast::<f32x2, f64x2>(simd_shuffle!(a, a, [0, 1])))
    }
}

/// Converts packed double-precision (64-bit) floating-point elements in `a` to
/// packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtpd_epi32(a: __m128d) -> __m128i {
    unsafe { transmute(cvtpd2dq(a)) }
}

/// Converts the lower double-precision (64-bit) floating-point element in a to
/// a 32-bit integer.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtsd2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsd_si32(a: __m128d) -> i32 {
    unsafe { cvtsd2si(a) }
}

/// Converts the lower double-precision (64-bit) floating-point element in `b`
/// to a single-precision (32-bit) floating-point element, store the result in
/// the lower element of the return value, and copies the upper element from `a`
/// to the upper element the return value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtsd2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128 {
    unsafe { cvtsd2ss(a, b) }
}

/// Returns the lower double-precision (64-bit) floating-point element of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtsd_f64(a: __m128d) -> f64 {
    // Extract lane 0 (the low element).
    unsafe { simd_extract!(a, 0) }
}

/// Converts the lower single-precision (32-bit) floating-point element in `b`
/// to a double-precision (64-bit) floating-point element, store the result in
/// the lower element of the return value, and copies the upper element from `a`
/// to the upper element the return value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtss2sd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d {
    unsafe {
        // Widen the low f32 lane of `b` and splice it into the low lane of
        // `a`; the high lane of `a` is preserved.
        let elt: f32 = simd_extract!(b, 0);
        simd_insert!(a, 0, elt as f64)
    }
}

/// Converts packed double-precision (64-bit) floating-point elements in `a` to
/// packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvttpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvttpd_epi32(a: __m128d) -> __m128i {
    unsafe { transmute(cvttpd2dq(a)) }
}

/// Converts the lower double-precision (64-bit) floating-point element in `a`
/// to a 32-bit integer with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvttsd2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvttsd_si32(a: __m128d) -> i32 {
    unsafe { cvttsd2si(a) }
}

/// Converts packed single-precision (32-bit) floating-point elements in `a` to
/// packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvttps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvttps_epi32(a: __m128) -> __m128i {
    unsafe { transmute(cvttps2dq(a)) }
}
2566
/// Copies double-precision (64-bit) floating-point element `a` to the lower
/// element of the packed 64-bit return value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set_sd(a: f64) -> __m128d {
    // `_mm_set_pd` takes (high, low): low lane = `a`, high lane = 0.0.
    _mm_set_pd(0.0, a)
}

/// Broadcasts double-precision (64-bit) floating-point value a to all elements
/// of the return value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set1_pd(a: f64) -> __m128d {
    _mm_set_pd(a, a)
}

/// Broadcasts double-precision (64-bit) floating-point value a to all elements
/// of the return value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set_pd1(a: f64) -> __m128d {
    // Alias of `_mm_set1_pd`, kept for parity with Intel's intrinsic set.
    _mm_set_pd(a, a)
}

/// Sets packed double-precision (64-bit) floating-point elements in the return
/// value with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set_pd(a: f64, b: f64) -> __m128d {
    // Intel argument order is (high, low): lane 0 holds `b`, lane 1 holds `a`.
    __m128d([b, a])
}

/// Sets packed double-precision (64-bit) floating-point elements in the return
/// value with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_setr_pd(a: f64, b: f64) -> __m128d {
    // Memory-order variant: lane 0 = `a`, lane 1 = `b`.
    _mm_set_pd(b, a)
}

/// Returns packed double-precision (64-bit) floating-point elements with all
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(xorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_setzero_pd() -> __m128d {
    // SAFETY: an all-zero bit pattern is a valid `__m128d` (two f64 zeros).
    // The `const` block forces compile-time evaluation.
    const { unsafe { mem::zeroed() } }
}
2639
/// Returns a mask of the most significant bit of each element in `a`.
///
/// The mask is stored in the 2 least significant bits of the return value.
/// All other bits are set to `0`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movmskpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_movemask_pd(a: __m128d) -> i32 {
    // Propagate the highest bit to the rest, because simd_bitmask
    // requires all-1 or all-0.
    unsafe {
        // `x < 0` on the i64 reinterpretation is true exactly when the sign
        // bit (bit 63 of each lane) is set, yielding an all-ones lane mask.
        let mask: i64x2 = simd_lt(transmute(a), i64x2::ZERO);
        // Pack the two lane masks into bits 0 and 1 of the result.
        simd_bitmask::<i64x2, u8>(mask) as i32
    }
}
2659
/// Loads 128-bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from memory into the returned vector.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
/// exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_load_pd(mem_addr: *const f64) -> __m128d {
    // Caller contract: `mem_addr` is valid for a 16-byte aligned read
    // (the clippy lint is allowed above because the cast widens alignment
    // intentionally).
    *(mem_addr as *const __m128d)
}

/// Loads a 64-bit double-precision value to the low element of a
/// 128-bit vector of `[2 x double]` and clears the upper element.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_load_sd(mem_addr: *const f64) -> __m128d {
    // Low lane = *mem_addr, high lane = 0.0 (`_mm_setr_pd` takes (low, high)).
    _mm_setr_pd(*mem_addr, 0.)
}

/// Loads a double-precision value into the high-order bits of a 128-bit
/// vector of `[2 x double]`. The low-order bits are copied from the low-order
/// bits of the first operand.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_loadh_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
    // Keep lane 0 of `a`, load lane 1 from memory.
    _mm_setr_pd(simd_extract!(a, 0), *mem_addr)
}

/// Loads a double-precision value into the low-order bits of a 128-bit
/// vector of `[2 x double]`. The high-order bits are copied from the
/// high-order bits of the first operand.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movlps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
    // Load lane 0 from memory, keep lane 1 of `a`.
    _mm_setr_pd(*mem_addr, simd_extract!(a, 1))
}
2719
/// Stores a 128-bit floating point vector of `[2 x double]` to a 128-bit
/// aligned memory location.
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movntpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    // SAFETY (caller contract): `mem_addr` must be valid for a 16-byte
    // aligned 128-bit store, per the doc comment above.
    crate::arch::asm!(
        vps!("movntpd", ",{a}"),
        p = in(reg) mem_addr,
        a = in(xmm_reg) a,
        options(nostack, preserves_flags),
    );
}
2749
/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
/// memory location.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movlps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_store_sd(mem_addr: *mut f64, a: __m128d) {
    // Write lane 0 (the low element) only.
    *mem_addr = simd_extract!(a, 0)
}

/// Stores 128-bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `a` into memory. `mem_addr` must be aligned
/// on a 16-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_store_pd(mem_addr: *mut f64, a: __m128d) {
    // Caller contract: `mem_addr` is valid for a 16-byte aligned write.
    *(mem_addr as *mut __m128d) = a;
}

/// Stores 128-bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movups))] // FIXME movupd expected
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d) {
    // `write_unaligned` removes the alignment requirement on `mem_addr`.
    mem_addr.cast::<__m128d>().write_unaligned(a);
}

/// Store 16-bit integer from the first element of a into memory.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_storeu_si16(mem_addr: *mut u8, a: __m128i) {
    // Reinterpret `a` as eight i16 lanes and store only lane 0.
    ptr::write_unaligned(mem_addr as *mut i16, simd_extract(a.as_i16x8(), 0))
}

/// Store 32-bit integer from the first element of a into memory.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_storeu_si32(mem_addr: *mut u8, a: __m128i) {
    // Reinterpret `a` as four i32 lanes and store only lane 0.
    ptr::write_unaligned(mem_addr as *mut i32, simd_extract(a.as_i32x4(), 0))
}

/// Store 64-bit integer from the first element of a into memory.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_storeu_si64(mem_addr: *mut u8, a: __m128i) {
    // Reinterpret `a` as two i64 lanes and store only lane 0.
    ptr::write_unaligned(mem_addr as *mut i64, simd_extract(a.as_i64x2(), 0))
}

/// Stores the lower double-precision (64-bit) floating-point element from `a`
/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
/// 16-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d) {
    // Broadcast the low lane into both lanes, then do one aligned
    // 128-bit store.
    let b: __m128d = simd_shuffle!(a, a, [0, 0]);
    *(mem_addr as *mut __m128d) = b;
}

/// Stores the lower double-precision (64-bit) floating-point element from `a`
/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
/// 16-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d) {
    // Same operation as `_mm_store1_pd`; kept for parity with Intel's
    // intrinsic set.
    let b: __m128d = simd_shuffle!(a, a, [0, 0]);
    *(mem_addr as *mut __m128d) = b;
}

/// Stores 2 double-precision (64-bit) floating-point elements from `a` into
/// memory in reverse order.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
/// exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: __m128d) {
    // Swap the two lanes, then do one aligned 128-bit store.
    let b: __m128d = simd_shuffle!(a, a, [1, 0]);
    *(mem_addr as *mut __m128d) = b;
}

/// Stores the upper 64 bits of a 128-bit vector of `[2 x double]` to a
/// memory location.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_storeh_pd(mem_addr: *mut f64, a: __m128d) {
    // Write lane 1 (the high element) only.
    *mem_addr = simd_extract!(a, 1);
}

/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
/// memory location.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movlps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_storel_pd(mem_addr: *mut f64, a: __m128d) {
    // Write lane 0 (the low element) only.
    *mem_addr = simd_extract!(a, 0);
}
2905
2906/// Loads a double-precision (64-bit) floating-point element from memory
2907/// into both elements of returned vector.
2908///
2909/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd)
2910#[inline]
2911#[target_feature(enable = "sse2")]
2912// #[cfg_attr(test, assert_instr(movapd))] // FIXME LLVM uses different codegen
2913#[stable(feature = "simd_x86", since = "1.27.0")]
2914#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2915pub const unsafe fn _mm_load1_pd(mem_addr: *const f64) -> __m128d {
2916    let d = *mem_addr;
2917    _mm_setr_pd(d, d)
2918}
2919
2920/// Loads a double-precision (64-bit) floating-point element from memory
2921/// into both elements of returned vector.
2922///
2923/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1)
2924#[inline]
2925#[target_feature(enable = "sse2")]
2926// #[cfg_attr(test, assert_instr(movapd))] // FIXME same as _mm_load1_pd
2927#[stable(feature = "simd_x86", since = "1.27.0")]
2928#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2929pub const unsafe fn _mm_load_pd1(mem_addr: *const f64) -> __m128d {
2930    _mm_load1_pd(mem_addr)
2931}
2932
/// Loads 2 double-precision (64-bit) floating-point elements from memory into
/// the returned vector in reverse order. `mem_addr` must be aligned on a
/// 16-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> __m128d {
    // Aligned load, then swap the two lanes.
    let a = _mm_load_pd(mem_addr);
    simd_shuffle!(a, a, [1, 0])
}
2950
/// Loads 128-bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from memory into the returned vector.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d {
    // A byte-wise copy imposes no alignment requirement on `mem_addr`,
    // unlike a plain `*mem_addr` dereference would.
    let mut dst = _mm_undefined_pd();
    ptr::copy_nonoverlapping(
        mem_addr as *const u8,
        ptr::addr_of_mut!(dst) as *mut u8,
        mem::size_of::<__m128d>(),
    );
    dst
}
2970
/// Loads unaligned 16-bits of integer data from memory into new vector.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_loadu_si16(mem_addr: *const u8) -> __m128i {
    // Unaligned 16-bit read into lane 0; the remaining seven lanes are zeroed.
    transmute(i16x8::new(
        ptr::read_unaligned(mem_addr as *const i16),
        0,
        0,
        0,
        0,
        0,
        0,
        0,
    ))
}
2992
/// Loads unaligned 32-bits of integer data from memory into new vector.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_loadu_si32(mem_addr: *const u8) -> __m128i {
    // Unaligned 32-bit read into lane 0; the remaining three lanes are zeroed.
    transmute(i32x4::new(
        ptr::read_unaligned(mem_addr as *const i32),
        0,
        0,
        0,
    ))
}
3010
/// Loads unaligned 64-bits of integer data from memory into new vector.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86_mm_loadu_si64", since = "1.46.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i {
    // Unaligned 64-bit read into lane 0; lane 1 is zeroed.
    transmute(i64x2::new(ptr::read_unaligned(mem_addr as *const i64), 0))
}
3023
/// Constructs a 128-bit floating-point vector of `[2 x double]` from two
/// 128-bit vector parameters of `[2 x double]`, using the immediate-value
/// parameter as a specifier.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(shufps, MASK = 2))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_shuffle_pd<const MASK: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(MASK, 8);
    // MASK bit 0 selects the output's low lane from `a`; MASK bit 1 selects the
    // high lane from `b` (shuffle indices 2 and 3 address `b`'s lanes). Only the
    // two low bits of MASK are used even though up to 8 bits are accepted.
    unsafe { simd_shuffle!(a, b, [MASK as u32 & 0b1, ((MASK as u32 >> 1) & 0b1) + 2]) }
}
3039
/// Constructs a 128-bit floating-point vector of `[2 x double]`. The lower
/// 64 bits are set to the lower 64 bits of the second parameter. The upper
/// 64 bits are set to the upper 64 bits of the first parameter.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_move_sd(a: __m128d, b: __m128d) -> __m128d {
    // Result = [b.low, a.high].
    unsafe { _mm_setr_pd(simd_extract!(b, 0), simd_extract!(a, 1)) }
}
3053
/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
/// floating-point vector of `[4 x float]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_castpd_ps(a: __m128d) -> __m128 {
    // Pure bit reinterpretation: the 128 bits are unchanged, no value conversion.
    unsafe { transmute(a) }
}
3065
/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
/// integer vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_castpd_si128(a: __m128d) -> __m128i {
    // Pure bit reinterpretation: the 128 bits are unchanged, no value conversion.
    unsafe { transmute(a) }
}
3077
/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
/// floating-point vector of `[2 x double]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_castps_pd(a: __m128) -> __m128d {
    // Pure bit reinterpretation: the 128 bits are unchanged, no value conversion.
    unsafe { transmute(a) }
}
3089
/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
/// integer vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_castps_si128(a: __m128) -> __m128i {
    // Pure bit reinterpretation: the 128 bits are unchanged, no value conversion.
    unsafe { transmute(a) }
}
3101
/// Casts a 128-bit integer vector into a 128-bit floating-point vector
/// of `[2 x double]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_castsi128_pd(a: __m128i) -> __m128d {
    // Pure bit reinterpretation: the 128 bits are unchanged, no value conversion.
    unsafe { transmute(a) }
}
3113
/// Casts a 128-bit integer vector into a 128-bit floating-point vector
/// of `[4 x float]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_castsi128_ps(a: __m128i) -> __m128 {
    // Pure bit reinterpretation: the 128 bits are unchanged, no value conversion.
    unsafe { transmute(a) }
}
3125
/// Returns vector of type `__m128d` with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
/// In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_undefined_pd() -> __m128d {
    // All-zeros is a valid choice of "indeterminate" value; computing it in a
    // `const` block makes it free at runtime.
    const { unsafe { mem::zeroed() } }
}
3139
/// Returns vector of type `__m128i` with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
/// In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_undefined_si128() -> __m128i {
    // All-zeros is a valid choice of "indeterminate" value; computing it in a
    // `const` block makes it free at runtime.
    const { unsafe { mem::zeroed() } }
}
3153
/// The resulting `__m128d` element is composed by the high-order values of
/// the two `__m128d` interleaved input elements, i.e.:
///
/// * The `[127:64]` bits are copied from the `[127:64]` bits of the second input
/// * The `[63:0]` bits are copied from the `[127:64]` bits of the first input
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(unpckhpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d {
    // Result = [a.high, b.high]: shuffle indices 0..=1 address `a`, 2..=3 address `b`.
    unsafe { simd_shuffle!(a, b, [1, 3]) }
}
3169
/// The resulting `__m128d` element is composed by the low-order values of
/// the two `__m128d` interleaved input elements, i.e.:
///
/// * The `[127:64]` bits are copied from the `[63:0]` bits of the second input
/// * The `[63:0]` bits are copied from the `[63:0]` bits of the first input
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movlhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d {
    // Result = [a.low, b.low]: shuffle indices 0..=1 address `a`, 2..=3 address `b`.
    unsafe { simd_shuffle!(a, b, [0, 2]) }
}
3185
// Declarations of the LLVM intrinsics that back the SSE2 functions above.
// Each `link_name` is LLVM's canonical name for the corresponding instruction;
// these must stay byte-exact or linking against the compiler builtins fails.
#[allow(improper_ctypes)]
unsafe extern "C" {
    #[link_name = "llvm.x86.sse2.pause"]
    fn pause();
    #[link_name = "llvm.x86.sse2.clflush"]
    fn clflush(p: *const u8);
    #[link_name = "llvm.x86.sse2.lfence"]
    fn lfence();
    #[link_name = "llvm.x86.sse2.mfence"]
    fn mfence();
    #[link_name = "llvm.x86.sse2.pmadd.wd"]
    fn pmaddwd(a: i16x8, b: i16x8) -> i32x4;
    #[link_name = "llvm.x86.sse2.psad.bw"]
    fn psadbw(a: u8x16, b: u8x16) -> u64x2;
    #[link_name = "llvm.x86.sse2.psll.w"]
    fn psllw(a: i16x8, count: i16x8) -> i16x8;
    #[link_name = "llvm.x86.sse2.psll.d"]
    fn pslld(a: i32x4, count: i32x4) -> i32x4;
    #[link_name = "llvm.x86.sse2.psll.q"]
    fn psllq(a: i64x2, count: i64x2) -> i64x2;
    #[link_name = "llvm.x86.sse2.psra.w"]
    fn psraw(a: i16x8, count: i16x8) -> i16x8;
    #[link_name = "llvm.x86.sse2.psra.d"]
    fn psrad(a: i32x4, count: i32x4) -> i32x4;
    #[link_name = "llvm.x86.sse2.psrl.w"]
    fn psrlw(a: i16x8, count: i16x8) -> i16x8;
    #[link_name = "llvm.x86.sse2.psrl.d"]
    fn psrld(a: i32x4, count: i32x4) -> i32x4;
    #[link_name = "llvm.x86.sse2.psrl.q"]
    fn psrlq(a: i64x2, count: i64x2) -> i64x2;
    #[link_name = "llvm.x86.sse2.cvtps2dq"]
    fn cvtps2dq(a: __m128) -> i32x4;
    #[link_name = "llvm.x86.sse2.maskmov.dqu"]
    fn maskmovdqu(a: i8x16, mask: i8x16, mem_addr: *mut i8);
    #[link_name = "llvm.x86.sse2.packsswb.128"]
    fn packsswb(a: i16x8, b: i16x8) -> i8x16;
    #[link_name = "llvm.x86.sse2.packssdw.128"]
    fn packssdw(a: i32x4, b: i32x4) -> i16x8;
    #[link_name = "llvm.x86.sse2.packuswb.128"]
    fn packuswb(a: i16x8, b: i16x8) -> u8x16;
    #[link_name = "llvm.x86.sse2.max.sd"]
    fn maxsd(a: __m128d, b: __m128d) -> __m128d;
    #[link_name = "llvm.x86.sse2.max.pd"]
    fn maxpd(a: __m128d, b: __m128d) -> __m128d;
    #[link_name = "llvm.x86.sse2.min.sd"]
    fn minsd(a: __m128d, b: __m128d) -> __m128d;
    #[link_name = "llvm.x86.sse2.min.pd"]
    fn minpd(a: __m128d, b: __m128d) -> __m128d;
    #[link_name = "llvm.x86.sse2.cmp.sd"]
    fn cmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
    #[link_name = "llvm.x86.sse2.cmp.pd"]
    fn cmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
    #[link_name = "llvm.x86.sse2.comieq.sd"]
    fn comieqsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.comilt.sd"]
    fn comiltsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.comile.sd"]
    fn comilesd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.comigt.sd"]
    fn comigtsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.comige.sd"]
    fn comigesd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.comineq.sd"]
    fn comineqsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.ucomieq.sd"]
    fn ucomieqsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.ucomilt.sd"]
    fn ucomiltsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.ucomile.sd"]
    fn ucomilesd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.ucomigt.sd"]
    fn ucomigtsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.ucomige.sd"]
    fn ucomigesd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.ucomineq.sd"]
    fn ucomineqsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.cvtpd2dq"]
    fn cvtpd2dq(a: __m128d) -> i32x4;
    #[link_name = "llvm.x86.sse2.cvtsd2si"]
    fn cvtsd2si(a: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.cvtsd2ss"]
    fn cvtsd2ss(a: __m128, b: __m128d) -> __m128;
    #[link_name = "llvm.x86.sse2.cvttpd2dq"]
    fn cvttpd2dq(a: __m128d) -> i32x4;
    #[link_name = "llvm.x86.sse2.cvttsd2si"]
    fn cvttsd2si(a: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.cvttps2dq"]
    fn cvttps2dq(a: __m128) -> i32x4;
}
3275
3276#[cfg(test)]
3277mod tests {
3278    use crate::core_arch::assert_eq_const as assert_eq;
3279    use crate::{
3280        core_arch::{simd::*, x86::*},
3281        hint::black_box,
3282    };
3283    use std::{boxed, f32, f64, mem, ptr};
3284    use stdarch_test::simd_test;
3285
3286    const NAN: f64 = f64::NAN;
3287
    // Smoke tests: these intrinsics produce no observable result, so we only
    // check that calling them does not fault.
    #[test]
    fn test_mm_pause() {
        _mm_pause()
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_clflush() {
        let x = 0_u8;
        _mm_clflush(ptr::addr_of!(x));
    }

    #[simd_test(enable = "sse2")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    #[cfg_attr(miri, ignore)]
    fn test_mm_lfence() {
        _mm_lfence();
    }

    #[simd_test(enable = "sse2")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    #[cfg_attr(miri, ignore)]
    fn test_mm_mfence() {
        _mm_mfence();
    }
3312
    // Wrapping (non-saturating) lane-wise addition for each element width.
    #[simd_test(enable = "sse2")]
    const fn test_mm_add_epi8() {
        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        );
        let r = _mm_add_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
        );
        assert_eq_m128i(r, e);
    }

    // i8::MAX + 1 must wrap around to i8::MIN, not saturate.
    #[simd_test(enable = "sse2")]
    fn test_mm_add_epi8_overflow() {
        let a = _mm_set1_epi8(0x7F);
        let b = _mm_set1_epi8(1);
        let r = _mm_add_epi8(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(-128));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_add_epi16() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
        let r = _mm_add_epi16(a, b);
        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_add_epi32() {
        let a = _mm_setr_epi32(0, 1, 2, 3);
        let b = _mm_setr_epi32(4, 5, 6, 7);
        let r = _mm_add_epi32(a, b);
        let e = _mm_setr_epi32(4, 6, 8, 10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_add_epi64() {
        let a = _mm_setr_epi64x(0, 1);
        let b = _mm_setr_epi64x(2, 3);
        let r = _mm_add_epi64(a, b);
        let e = _mm_setr_epi64x(2, 4);
        assert_eq_m128i(r, e);
    }
3362
    // Saturating addition: plain cases check in-range arithmetic; the
    // `_saturate_*` cases check clamping at the element type's bounds.
    #[simd_test(enable = "sse2")]
    const fn test_mm_adds_epi8() {
        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        );
        let r = _mm_adds_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
        );
        assert_eq_m128i(r, e);
    }

    // i8::MAX + 1 saturates to i8::MAX (result equals `a`).
    #[simd_test(enable = "sse2")]
    fn test_mm_adds_epi8_saturate_positive() {
        let a = _mm_set1_epi8(0x7F);
        let b = _mm_set1_epi8(1);
        let r = _mm_adds_epi8(a, b);
        assert_eq_m128i(r, a);
    }

    // i8::MIN + (-1) saturates to i8::MIN (result equals `a`).
    #[simd_test(enable = "sse2")]
    fn test_mm_adds_epi8_saturate_negative() {
        let a = _mm_set1_epi8(-0x80);
        let b = _mm_set1_epi8(-1);
        let r = _mm_adds_epi8(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_adds_epi16() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
        let r = _mm_adds_epi16(a, b);
        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_adds_epi16_saturate_positive() {
        let a = _mm_set1_epi16(0x7FFF);
        let b = _mm_set1_epi16(1);
        let r = _mm_adds_epi16(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_adds_epi16_saturate_negative() {
        let a = _mm_set1_epi16(-0x8000);
        let b = _mm_set1_epi16(1);
        let r = _mm_adds_epi16(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_adds_epu8() {
        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        );
        let r = _mm_adds_epu8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
        );
        assert_eq_m128i(r, e);
    }

    // u8::MAX + 1 saturates to u8::MAX (result equals `a`).
    #[simd_test(enable = "sse2")]
    fn test_mm_adds_epu8_saturate() {
        let a = _mm_set1_epi8(!0);
        let b = _mm_set1_epi8(1);
        let r = _mm_adds_epu8(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_adds_epu16() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
        let r = _mm_adds_epu16(a, b);
        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
        assert_eq_m128i(r, e);
    }

    // u16::MAX + 1 saturates to u16::MAX (result equals `a`).
    #[simd_test(enable = "sse2")]
    fn test_mm_adds_epu16_saturate() {
        let a = _mm_set1_epi16(!0);
        let b = _mm_set1_epi16(1);
        let r = _mm_adds_epu16(a, b);
        assert_eq_m128i(r, a);
    }
3458
    // Rounding average of unsigned lanes: (3 + 9 + 1) >> 1 == 6.
    #[simd_test(enable = "sse2")]
    const fn test_mm_avg_epu8() {
        let (a, b) = (_mm_set1_epi8(3), _mm_set1_epi8(9));
        let r = _mm_avg_epu8(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(6));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_avg_epu16() {
        let (a, b) = (_mm_set1_epi16(3), _mm_set1_epi16(9));
        let r = _mm_avg_epu16(a, b);
        assert_eq_m128i(r, _mm_set1_epi16(6));
    }

    // Multiply-add of adjacent i16 pairs into i32 lanes.
    #[simd_test(enable = "sse2")]
    fn test_mm_madd_epi16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let b = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm_madd_epi16(a, b);
        let e = _mm_setr_epi32(29, 81, 149, 233);
        assert_eq_m128i(r, e);

        // Test large values.
        // MIN*MIN+MIN*MIN will overflow into i32::MIN.
        let a = _mm_setr_epi16(
            i16::MAX,
            i16::MAX,
            i16::MIN,
            i16::MIN,
            i16::MIN,
            i16::MAX,
            0,
            0,
        );
        let b = _mm_setr_epi16(
            i16::MAX,
            i16::MAX,
            i16::MIN,
            i16::MIN,
            i16::MAX,
            i16::MIN,
            0,
            0,
        );
        let r = _mm_madd_epi16(a, b);
        let e = _mm_setr_epi32(0x7FFE0002, i32::MIN, -0x7FFF0000, 0);
        assert_eq_m128i(r, e);
    }
3507
    // Lane-wise max/min; the epi16 cases check signed comparison (1 > -1) and
    // the epu8 cases check unsigned comparison (!0 == 255 > 1).
    #[simd_test(enable = "sse2")]
    const fn test_mm_max_epi16() {
        let a = _mm_set1_epi16(1);
        let b = _mm_set1_epi16(-1);
        let r = _mm_max_epi16(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_max_epu8() {
        let a = _mm_set1_epi8(1);
        let b = _mm_set1_epi8(!0);
        let r = _mm_max_epu8(a, b);
        assert_eq_m128i(r, b);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_min_epi16() {
        let a = _mm_set1_epi16(1);
        let b = _mm_set1_epi16(-1);
        let r = _mm_min_epi16(a, b);
        assert_eq_m128i(r, b);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_min_epu8() {
        let a = _mm_set1_epi8(1);
        let b = _mm_set1_epi8(!0);
        let r = _mm_min_epu8(a, b);
        assert_eq_m128i(r, a);
    }
3539
    // High half of the signed 16x16 -> 32-bit product: 1000 * -1001 == -1001000,
    // whose upper 16 bits are -16.
    #[simd_test(enable = "sse2")]
    const fn test_mm_mulhi_epi16() {
        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001));
        let r = _mm_mulhi_epi16(a, b);
        assert_eq_m128i(r, _mm_set1_epi16(-16));
    }

    // Unsigned variant: upper 16 bits of 1000 * 1001 == 15.
    #[simd_test(enable = "sse2")]
    const fn test_mm_mulhi_epu16() {
        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(1001));
        let r = _mm_mulhi_epu16(a, b);
        assert_eq_m128i(r, _mm_set1_epi16(15));
    }

    // Low half of the product: -1001000 truncated to i16 is -17960.
    #[simd_test(enable = "sse2")]
    const fn test_mm_mullo_epi16() {
        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001));
        let r = _mm_mullo_epi16(a, b);
        assert_eq_m128i(r, _mm_set1_epi16(-17960));
    }

    // Only the low 32 bits of each 64-bit lane participate, so the second lane
    // (both inputs have zero low words) multiplies to 0.
    #[simd_test(enable = "sse2")]
    const fn test_mm_mul_epu32() {
        let a = _mm_setr_epi64x(1_000_000_000, 1 << 34);
        let b = _mm_setr_epi64x(1_000_000_000, 1 << 35);
        let r = _mm_mul_epu32(a, b);
        let e = _mm_setr_epi64x(1_000_000_000 * 1_000_000_000, 0);
        assert_eq_m128i(r, e);
    }
3569
    // Sum of absolute byte differences, accumulated per 8-byte half into the
    // two 64-bit result lanes.
    #[simd_test(enable = "sse2")]
    fn test_mm_sad_epu8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            255u8 as i8, 254u8 as i8, 253u8 as i8, 252u8 as i8,
            1, 2, 3, 4,
            155u8 as i8, 154u8 as i8, 153u8 as i8, 152u8 as i8,
            1, 2, 3, 4,
        );
        let b = _mm_setr_epi8(0, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2);
        let r = _mm_sad_epu8(a, b);
        let e = _mm_setr_epi64x(1020, 614);
        assert_eq_m128i(r, e);
    }
3584
    // Wrapping lane-wise subtraction for each element width: 5 - 6 == -1.
    #[simd_test(enable = "sse2")]
    const fn test_mm_sub_epi8() {
        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(6));
        let r = _mm_sub_epi8(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(-1));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_sub_epi16() {
        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(6));
        let r = _mm_sub_epi16(a, b);
        assert_eq_m128i(r, _mm_set1_epi16(-1));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_sub_epi32() {
        let (a, b) = (_mm_set1_epi32(5), _mm_set1_epi32(6));
        let r = _mm_sub_epi32(a, b);
        assert_eq_m128i(r, _mm_set1_epi32(-1));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_sub_epi64() {
        let (a, b) = (_mm_set1_epi64x(5), _mm_set1_epi64x(6));
        let r = _mm_sub_epi64(a, b);
        assert_eq_m128i(r, _mm_set1_epi64x(-1));
    }
3612
    // Saturating subtraction: plain cases check in-range arithmetic; the
    // `_saturate_*` cases check clamping at the element type's bounds.
    #[simd_test(enable = "sse2")]
    const fn test_mm_subs_epi8() {
        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(2));
        let r = _mm_subs_epi8(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(3));
    }

    // i8::MAX - (-1) saturates to i8::MAX (result equals `a`).
    #[simd_test(enable = "sse2")]
    fn test_mm_subs_epi8_saturate_positive() {
        let a = _mm_set1_epi8(0x7F);
        let b = _mm_set1_epi8(-1);
        let r = _mm_subs_epi8(a, b);
        assert_eq_m128i(r, a);
    }

    // i8::MIN - 1 saturates to i8::MIN (result equals `a`).
    #[simd_test(enable = "sse2")]
    fn test_mm_subs_epi8_saturate_negative() {
        let a = _mm_set1_epi8(-0x80);
        let b = _mm_set1_epi8(1);
        let r = _mm_subs_epi8(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_subs_epi16() {
        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(2));
        let r = _mm_subs_epi16(a, b);
        assert_eq_m128i(r, _mm_set1_epi16(3));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_subs_epi16_saturate_positive() {
        let a = _mm_set1_epi16(0x7FFF);
        let b = _mm_set1_epi16(-1);
        let r = _mm_subs_epi16(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_subs_epi16_saturate_negative() {
        let a = _mm_set1_epi16(-0x8000);
        let b = _mm_set1_epi16(1);
        let r = _mm_subs_epi16(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_subs_epu8() {
        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(2));
        let r = _mm_subs_epu8(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(3));
    }

    // Unsigned 0 - 1 saturates to 0 (result equals `a`).
    #[simd_test(enable = "sse2")]
    fn test_mm_subs_epu8_saturate() {
        let a = _mm_set1_epi8(0);
        let b = _mm_set1_epi8(1);
        let r = _mm_subs_epu8(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_subs_epu16() {
        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(2));
        let r = _mm_subs_epu16(a, b);
        assert_eq_m128i(r, _mm_set1_epi16(3));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_subs_epu16_saturate() {
        let a = _mm_set1_epi16(0);
        let b = _mm_set1_epi16(1);
        let r = _mm_subs_epu16(a, b);
        assert_eq_m128i(r, a);
    }
3688
    // Shift-left tests. The `slli_*` forms take an immediate count; the `sll_*`
    // forms take the count from the low 64 bits of a vector. Counts >= the
    // element (or vector, for si128) width must produce all-zeros, and the
    // sll forms must ignore the upper 64 bits of the count vector.
    #[simd_test(enable = "sse2")]
    const fn test_mm_slli_si128() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        );
        let r = _mm_slli_si128::<1>(a);
        let e = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m128i(r, e);

        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        );
        let r = _mm_slli_si128::<15>(a);
        let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
        assert_eq_m128i(r, e);

        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        );
        let r = _mm_slli_si128::<16>(a);
        assert_eq_m128i(r, _mm_set1_epi8(0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_slli_epi16() {
        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
        let r = _mm_slli_epi16::<4>(a);
        assert_eq_m128i(
            r,
            _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
        );
        let r = _mm_slli_epi16::<16>(a);
        assert_eq_m128i(r, _mm_set1_epi16(0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sll_epi16() {
        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 4));
        assert_eq_m128i(
            r,
            _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
        );
        let r = _mm_sll_epi16(a, _mm_set_epi64x(4, 0));
        assert_eq_m128i(r, a);
        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 16));
        assert_eq_m128i(r, _mm_set1_epi16(0));
        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, i64::MAX));
        assert_eq_m128i(r, _mm_set1_epi16(0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_slli_epi32() {
        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
        let r = _mm_slli_epi32::<4>(a);
        assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
        let r = _mm_slli_epi32::<32>(a);
        assert_eq_m128i(r, _mm_set1_epi32(0));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_sll_epi32() {
        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 4));
        assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
        let r = _mm_sll_epi32(a, _mm_set_epi64x(4, 0));
        assert_eq_m128i(r, a);
        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 32));
        assert_eq_m128i(r, _mm_set1_epi32(0));
        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, i64::MAX));
        assert_eq_m128i(r, _mm_set1_epi32(0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_slli_epi64() {
        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
        let r = _mm_slli_epi64::<4>(a);
        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
        let r = _mm_slli_epi64::<64>(a);
        assert_eq_m128i(r, _mm_set1_epi64x(0));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_sll_epi64() {
        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 4));
        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
        let r = _mm_sll_epi64(a, _mm_set_epi64x(4, 0));
        assert_eq_m128i(r, a);
        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 64));
        assert_eq_m128i(r, _mm_set1_epi64x(0));
        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, i64::MAX));
        assert_eq_m128i(r, _mm_set1_epi64x(0));
    }
3786
    #[simd_test(enable = "sse2")]
    const fn test_mm_srai_epi16() {
        // Arithmetic right shift: the sign bit is replicated, so negative
        // lanes round toward -infinity (-0xCC >> 4 == -0xD). Shifting by the
        // full width (16) collapses each lane to 0 or -1 depending on sign.
        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
        let r = _mm_srai_epi16::<4>(a);
        assert_eq_m128i(
            r,
            _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10),
        );
        let r = _mm_srai_epi16::<16>(a);
        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_sra_epi16() {
        // Vector-count variant: only the low quadword of the count is used,
        // and any count >= 16 (even i64::MAX) behaves like a shift by 16,
        // leaving 0 for non-negative lanes and -1 for negative ones.
        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 4));
        assert_eq_m128i(
            r,
            _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10),
        );
        let r = _mm_sra_epi16(a, _mm_set_epi64x(4, 0));
        assert_eq_m128i(r, a);
        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 16));
        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, i64::MAX));
        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_srai_epi32() {
        // Same sign-propagating behavior for 32-bit lanes.
        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
        let r = _mm_srai_epi32::<4>(a);
        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
        let r = _mm_srai_epi32::<32>(a);
        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_sra_epi32() {
        // Vector-count 32-bit arithmetic shift; counts >= 32 saturate the
        // result to each lane's sign (0 or -1).
        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 4));
        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
        let r = _mm_sra_epi32(a, _mm_set_epi64x(4, 0));
        assert_eq_m128i(r, a);
        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 32));
        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, i64::MAX));
        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
    }
3836
    #[simd_test(enable = "sse2")]
    const fn test_mm_srli_si128() {
        // Whole-register byte shift right: bytes move toward lane 0 and
        // zeros fill in from the high end.
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        );
        let r = _mm_srli_si128::<1>(a);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0,
        );
        assert_eq_m128i(r, e);

        // Shift by 15: only the highest byte survives, in lane 0.
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        );
        let r = _mm_srli_si128::<15>(a);
        let e = _mm_setr_epi8(16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);

        // Shift by 16 (whole register) yields all zeros.
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        );
        let r = _mm_srli_si128::<16>(a);
        assert_eq_m128i(r, _mm_set1_epi8(0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_srli_epi16() {
        // Logical right shift zero-fills: negative lanes are treated as
        // unsigned bit patterns (-0xCC >> 4 becomes 0xFF3, not -0xD).
        // Shifting by 16 zeroes every lane.
        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
        let r = _mm_srli_epi16::<4>(a);
        assert_eq_m128i(
            r,
            _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0),
        );
        let r = _mm_srli_epi16::<16>(a);
        assert_eq_m128i(r, _mm_set1_epi16(0));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_srl_epi16() {
        // Vector-count variant: only the low quadword of the count is used
        // and counts >= 16 (even i64::MAX) zero every lane.
        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 4));
        assert_eq_m128i(
            r,
            _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0),
        );
        let r = _mm_srl_epi16(a, _mm_set_epi64x(4, 0));
        assert_eq_m128i(r, a);
        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 16));
        assert_eq_m128i(r, _mm_set1_epi16(0));
        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, i64::MAX));
        assert_eq_m128i(r, _mm_set1_epi16(0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_srli_epi32() {
        // 32-bit logical right shift; zero-fill makes negative inputs large
        // positive bit patterns.
        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
        let r = _mm_srli_epi32::<4>(a);
        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
        let r = _mm_srli_epi32::<32>(a);
        assert_eq_m128i(r, _mm_set1_epi32(0));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_srl_epi32() {
        // Vector-count 32-bit logical shift; counts >= 32 zero the result.
        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 4));
        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
        let r = _mm_srl_epi32(a, _mm_set_epi64x(4, 0));
        assert_eq_m128i(r, a);
        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 32));
        assert_eq_m128i(r, _mm_set1_epi32(0));
        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, i64::MAX));
        assert_eq_m128i(r, _mm_set1_epi32(0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_srli_epi64() {
        // 64-bit logical right shift; shifting by 64 zeroes both lanes.
        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
        let r = _mm_srli_epi64::<4>(a);
        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
        let r = _mm_srli_epi64::<64>(a);
        assert_eq_m128i(r, _mm_set1_epi64x(0));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_srl_epi64() {
        // Vector-count 64-bit logical shift; counts >= 64 zero the result.
        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 4));
        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
        let r = _mm_srl_epi64(a, _mm_set_epi64x(4, 0));
        assert_eq_m128i(r, a);
        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 64));
        assert_eq_m128i(r, _mm_set1_epi64x(0));
        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, i64::MAX));
        assert_eq_m128i(r, _mm_set1_epi64x(0));
    }
3937
    #[simd_test(enable = "sse2")]
    const fn test_mm_and_si128() {
        // 5 & 3 == 0b101 & 0b011 == 1, in every byte lane.
        let a = _mm_set1_epi8(5);
        let b = _mm_set1_epi8(3);
        let r = _mm_and_si128(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(1));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_andnot_si128() {
        // andnot computes (!a) & b: (!5) & 3 == 0b010 == 2.
        let a = _mm_set1_epi8(5);
        let b = _mm_set1_epi8(3);
        let r = _mm_andnot_si128(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(2));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_or_si128() {
        // 5 | 3 == 0b111 == 7.
        let a = _mm_set1_epi8(5);
        let b = _mm_set1_epi8(3);
        let r = _mm_or_si128(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(7));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_xor_si128() {
        // 5 ^ 3 == 0b110 == 6.
        let a = _mm_set1_epi8(5);
        let b = _mm_set1_epi8(3);
        let r = _mm_xor_si128(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(6));
    }
3969
    #[simd_test(enable = "sse2")]
    const fn test_mm_cmpeq_epi8() {
        // Lanewise equality: only lane 2 matches (both 2), producing an
        // all-ones byte (0xFF) there and zero everywhere else.
        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let b = _mm_setr_epi8(15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
        let r = _mm_cmpeq_epi8(a, b);
        #[rustfmt::skip]
        assert_eq_m128i(
            r,
            _mm_setr_epi8(
                0, 0, 0xFFu8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
            )
        );
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cmpeq_epi16() {
        // 16-bit equality: only lane 2 matches; true lanes are !0.
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm_setr_epi16(7, 6, 2, 4, 3, 2, 1, 0);
        let r = _mm_cmpeq_epi16(a, b);
        assert_eq_m128i(r, _mm_setr_epi16(0, 0, !0, 0, 0, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cmpeq_epi32() {
        // 32-bit equality: only lane 2 matches.
        let a = _mm_setr_epi32(0, 1, 2, 3);
        let b = _mm_setr_epi32(3, 2, 2, 0);
        let r = _mm_cmpeq_epi32(a, b);
        assert_eq_m128i(r, _mm_setr_epi32(0, 0, !0, 0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cmpgt_epi8() {
        // Signed greater-than: only the lane holding 5 > 0 is all-ones.
        let a = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        let b = _mm_set1_epi8(0);
        let r = _mm_cmpgt_epi8(a, b);
        let e = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cmpgt_epi16() {
        // Signed greater-than, 16-bit lanes.
        let a = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0);
        let b = _mm_set1_epi16(0);
        let r = _mm_cmpgt_epi16(a, b);
        let e = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cmpgt_epi32() {
        // Signed greater-than, 32-bit lanes.
        let a = _mm_set_epi32(5, 0, 0, 0);
        let b = _mm_set1_epi32(0);
        let r = _mm_cmpgt_epi32(a, b);
        assert_eq_m128i(r, _mm_set_epi32(!0, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cmplt_epi8() {
        // Signed less-than: mirror of the cmpgt tests with swapped operands.
        let a = _mm_set1_epi8(0);
        let b = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        let r = _mm_cmplt_epi8(a, b);
        let e = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cmplt_epi16() {
        // Signed less-than, 16-bit lanes.
        let a = _mm_set1_epi16(0);
        let b = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0);
        let r = _mm_cmplt_epi16(a, b);
        let e = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cmplt_epi32() {
        // Signed less-than, 32-bit lanes.
        let a = _mm_set1_epi32(0);
        let b = _mm_set_epi32(5, 0, 0, 0);
        let r = _mm_cmplt_epi32(a, b);
        assert_eq_m128i(r, _mm_set_epi32(!0, 0, 0, 0));
    }
4051
    #[simd_test(enable = "sse2")]
    const fn test_mm_cvtepi32_pd() {
        // Converts the two LOW 32-bit lanes (5 and 15 here; `set` takes
        // arguments high-to-low) to f64; the two high lanes are dropped.
        let a = _mm_set_epi32(35, 25, 15, 5);
        let r = _mm_cvtepi32_pd(a);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 15.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cvtsi32_sd() {
        // Replaces the low f64 lane with the converted integer; the high
        // lane is copied from `a`.
        let a = _mm_set1_pd(3.5);
        let r = _mm_cvtsi32_sd(a, 5);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 3.5));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cvtepi32_ps() {
        // Converts all four i32 lanes to f32 (all values exact here).
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let r = _mm_cvtepi32_ps(a);
        assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtps_epi32() {
        // Converts all four f32 lanes to i32 (inputs are exact integers).
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let r = _mm_cvtps_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(1, 2, 3, 4));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cvtsi32_si128() {
        // Places the scalar in lane 0 and zeroes the remaining lanes.
        let r = _mm_cvtsi32_si128(5);
        assert_eq_m128i(r, _mm_setr_epi32(5, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cvtsi128_si32() {
        // Extracts lane 0 as an i32.
        let r = _mm_cvtsi128_si32(_mm_setr_epi32(5, 0, 0, 0));
        assert_eq!(r, 5);
    }
4091
    #[simd_test(enable = "sse2")]
    const fn test_mm_set_epi64x() {
        // `set` takes arguments high-to-low, so it is the reverse of `setr`.
        let r = _mm_set_epi64x(0, 1);
        assert_eq_m128i(r, _mm_setr_epi64x(1, 0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_set_epi32() {
        // High-to-low argument order, checked against the reversed `setr`.
        let r = _mm_set_epi32(0, 1, 2, 3);
        assert_eq_m128i(r, _mm_setr_epi32(3, 2, 1, 0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_set_epi16() {
        // High-to-low argument order, checked against the reversed `setr`.
        let r = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        assert_eq_m128i(r, _mm_setr_epi16(7, 6, 5, 4, 3, 2, 1, 0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_set_epi8() {
        // High-to-low argument order, checked against the reversed `setr`.
        #[rustfmt::skip]
        let r = _mm_set_epi8(
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            15, 14, 13, 12, 11, 10, 9, 8,
            7, 6, 5, 4, 3, 2, 1, 0,
        );
        assert_eq_m128i(r, e);
    }
4123
4124    #[simd_test(enable = "sse2")]
4125    const fn test_mm_set1_epi64x() {
4126        let r = _mm_set1_epi64x(1);
4127        assert_eq_m128i(r, _mm_set1_epi64x(1));
4128    }
4129
4130    #[simd_test(enable = "sse2")]
4131    const fn test_mm_set1_epi32() {
4132        let r = _mm_set1_epi32(1);
4133        assert_eq_m128i(r, _mm_set1_epi32(1));
4134    }
4135
4136    #[simd_test(enable = "sse2")]
4137    const fn test_mm_set1_epi16() {
4138        let r = _mm_set1_epi16(1);
4139        assert_eq_m128i(r, _mm_set1_epi16(1));
4140    }
4141
4142    #[simd_test(enable = "sse2")]
4143    const fn test_mm_set1_epi8() {
4144        let r = _mm_set1_epi8(1);
4145        assert_eq_m128i(r, _mm_set1_epi8(1));
4146    }
4147
4148    #[simd_test(enable = "sse2")]
4149    const fn test_mm_setr_epi32() {
4150        let r = _mm_setr_epi32(0, 1, 2, 3);
4151        assert_eq_m128i(r, _mm_setr_epi32(0, 1, 2, 3));
4152    }
4153
4154    #[simd_test(enable = "sse2")]
4155    const fn test_mm_setr_epi16() {
4156        let r = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4157        assert_eq_m128i(r, _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7));
4158    }
4159
4160    #[simd_test(enable = "sse2")]
4161    const fn test_mm_setr_epi8() {
4162        #[rustfmt::skip]
4163        let r = _mm_setr_epi8(
4164            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4165        );
4166        #[rustfmt::skip]
4167        let e = _mm_setr_epi8(
4168            0, 1, 2, 3, 4, 5, 6, 7,
4169            8, 9, 10, 11, 12, 13, 14, 15,
4170        );
4171        assert_eq_m128i(r, e);
4172    }
4173
    #[simd_test(enable = "sse2")]
    const fn test_mm_setzero_si128() {
        // All 128 bits are zero.
        let r = _mm_setzero_si128();
        assert_eq_m128i(r, _mm_set1_epi64x(0));
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_loadl_epi64() {
        // Loads only the low 64 bits from memory; the high lane is zeroed
        // (the 5 stored in the source's high lane is not read back).
        let a = _mm_setr_epi64x(6, 5);
        let r = _mm_loadl_epi64(ptr::addr_of!(a));
        assert_eq_m128i(r, _mm_setr_epi64x(6, 0));
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_load_si128() {
        // Aligned 128-bit load round-trips the value (`a` is a stack
        // __m128i, which is 16-byte aligned).
        let a = _mm_set_epi64x(5, 6);
        let r = _mm_load_si128(ptr::addr_of!(a) as *const _);
        assert_eq_m128i(a, r);
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_loadu_si128() {
        // Unaligned 128-bit load round-trips the value.
        let a = _mm_set_epi64x(5, 6);
        let r = _mm_loadu_si128(ptr::addr_of!(a) as *const _);
        assert_eq_m128i(a, r);
    }

    #[simd_test(enable = "sse2")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_maskmoveu_si128() {
        // Only bytes whose mask lane has the high bit (0x80) set are
        // written; all other destination bytes keep their old value.
        // The sfence orders the non-temporal store before the read-back.
        let a = _mm_set1_epi8(9);
        #[rustfmt::skip]
        let mask = _mm_set_epi8(
            0, 0, 0x80u8 as i8, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0,
        );
        let mut r = _mm_set1_epi8(0);
        _mm_maskmoveu_si128(a, mask, ptr::addr_of_mut!(r) as *mut i8);
        _mm_sfence();
        let e = _mm_set_epi8(0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_store_si128() {
        // Aligned 128-bit store round-trips the value.
        let a = _mm_set1_epi8(9);
        let mut r = _mm_set1_epi8(0);
        _mm_store_si128(&mut r, a);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_storeu_si128() {
        // Unaligned 128-bit store round-trips the value.
        let a = _mm_set1_epi8(9);
        let mut r = _mm_set1_epi8(0);
        _mm_storeu_si128(&mut r, a);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_storel_epi64() {
        // Writes only the low 64 bits of `a`; the destination's high lane
        // is untouched (still zero here), and `a`'s high lane (9) is lost.
        let a = _mm_setr_epi64x(2, 9);
        let mut r = _mm_set1_epi8(0);
        _mm_storel_epi64(&mut r, a);
        assert_eq_m128i(r, _mm_setr_epi64x(2, 0));
    }

    #[simd_test(enable = "sse2")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_stream_si128() {
        // Non-temporal 128-bit store; the sfence orders it before read-back.
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let mut r = _mm_undefined_si128();
        _mm_stream_si128(ptr::addr_of_mut!(r), a);
        _mm_sfence();
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_stream_si32() {
        // Non-temporal 32-bit scalar store to a heap location.
        let a: i32 = 7;
        let mut mem = boxed::Box::<i32>::new(-1);
        _mm_stream_si32(ptr::addr_of_mut!(*mem), a);
        _mm_sfence();
        assert_eq!(a, *mem);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_move_epi64() {
        // Keeps the low 64-bit lane and zeroes the high one.
        let a = _mm_setr_epi64x(5, 6);
        let r = _mm_move_epi64(a);
        assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
    }
4273
    #[simd_test(enable = "sse2")]
    fn test_mm_packs_epi16() {
        // Signed saturating pack 16->8: 0x80 saturates to i8::MAX (0x7F)
        // and -0x81 to i8::MIN (-0x80). Lanes from `a` fill the low half of
        // the result, lanes from `b` the high half.
        let a = _mm_setr_epi16(0x80, -0x81, 0, 0, 0, 0, 0, 0);
        let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -0x81, 0x80);
        let r = _mm_packs_epi16(a, b);
        #[rustfmt::skip]
        assert_eq_m128i(
            r,
            _mm_setr_epi8(
                0x7F, -0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0x80, 0x7F
            )
        );
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_packs_epi32() {
        // Signed saturating pack 32->16: 0x8000 -> i16::MAX, -0x8001 ->
        // i16::MIN.
        let a = _mm_setr_epi32(0x8000, -0x8001, 0, 0);
        let b = _mm_setr_epi32(0, 0, -0x8001, 0x8000);
        let r = _mm_packs_epi32(a, b);
        assert_eq_m128i(
            r,
            _mm_setr_epi16(0x7FFF, -0x8000, 0, 0, 0, 0, -0x8000, 0x7FFF),
        );
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_packus_epi16() {
        // Unsigned saturating pack 16->8: 0x100 saturates to 0xFF (!0 as
        // i8) and negative inputs clamp to 0.
        let a = _mm_setr_epi16(0x100, -1, 0, 0, 0, 0, 0, 0);
        let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -1, 0x100);
        let r = _mm_packus_epi16(a, b);
        assert_eq_m128i(
            r,
            _mm_setr_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, !0),
        );
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_extract_epi16() {
        // The extracted 16-bit lane is zero-extended, so the -1 in lane 0
        // reads back as 0xFFFF, not -1.
        let a = _mm_setr_epi16(-1, 1, 2, 3, 4, 5, 6, 7);
        let r1 = _mm_extract_epi16::<0>(a);
        let r2 = _mm_extract_epi16::<3>(a);
        assert_eq!(r1, 0xFFFF);
        assert_eq!(r2, 3);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_insert_epi16() {
        // Replaces the selected lane (0) and leaves the rest unchanged.
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let r = _mm_insert_epi16::<0>(a, 9);
        let e = _mm_setr_epi16(9, 1, 2, 3, 4, 5, 6, 7);
        assert_eq_m128i(r, e);
    }
4326
    #[simd_test(enable = "sse2")]
    const fn test_mm_movemask_epi8() {
        // Gathers the high (sign) bit of each byte: byte lane 0 maps to the
        // least-significant result bit.
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8, 0b01,
            0b0101, 0b1111_0000u8 as i8, 0, 0,
            0, 0b1011_0101u8 as i8, 0b1111_0000u8 as i8, 0b0101,
            0b01, 0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8,
        );
        let r = _mm_movemask_epi8(a);
        assert_eq!(r, 0b10100110_00100101);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_shuffle_epi32() {
        // IMM 0b00_01_01_11: result lane i is selected by bits (2i+1, 2i),
        // so the result picks source lanes 3, 1, 1, 0.
        let a = _mm_setr_epi32(5, 10, 15, 20);
        let r = _mm_shuffle_epi32::<0b00_01_01_11>(a);
        let e = _mm_setr_epi32(20, 10, 10, 5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_shufflehi_epi16() {
        // Shuffles only the upper four 16-bit lanes; the lower four pass
        // through unchanged.
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 10, 15, 20);
        let r = _mm_shufflehi_epi16::<0b00_01_01_11>(a);
        let e = _mm_setr_epi16(1, 2, 3, 4, 20, 10, 10, 5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_shufflelo_epi16() {
        // Shuffles only the lower four 16-bit lanes; the upper four pass
        // through unchanged.
        let a = _mm_setr_epi16(5, 10, 15, 20, 1, 2, 3, 4);
        let r = _mm_shufflelo_epi16::<0b00_01_01_11>(a);
        let e = _mm_setr_epi16(20, 10, 10, 5, 1, 2, 3, 4);
        assert_eq_m128i(r, e);
    }
4363
    #[simd_test(enable = "sse2")]
    const fn test_mm_unpackhi_epi8() {
        // Interleaves the HIGH eight bytes of `a` and `b`: a8, b8, a9, b9, ...
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        );
        let r = _mm_unpackhi_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_unpackhi_epi16() {
        // Interleaves the high four 16-bit lanes of `a` and `b`.
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
        let r = _mm_unpackhi_epi16(a, b);
        let e = _mm_setr_epi16(4, 12, 5, 13, 6, 14, 7, 15);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_unpackhi_epi32() {
        // Interleaves the high two 32-bit lanes of `a` and `b`.
        let a = _mm_setr_epi32(0, 1, 2, 3);
        let b = _mm_setr_epi32(4, 5, 6, 7);
        let r = _mm_unpackhi_epi32(a, b);
        let e = _mm_setr_epi32(2, 6, 3, 7);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_unpackhi_epi64() {
        // Result is (a.high, b.high).
        let a = _mm_setr_epi64x(0, 1);
        let b = _mm_setr_epi64x(2, 3);
        let r = _mm_unpackhi_epi64(a, b);
        let e = _mm_setr_epi64x(1, 3);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_unpacklo_epi8() {
        // Interleaves the LOW eight bytes of `a` and `b`: a0, b0, a1, b1, ...
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        );
        let r = _mm_unpacklo_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            0, 16, 1, 17, 2, 18, 3, 19,
            4, 20, 5, 21, 6, 22, 7, 23,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_unpacklo_epi16() {
        // Interleaves the low four 16-bit lanes of `a` and `b`.
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
        let r = _mm_unpacklo_epi16(a, b);
        let e = _mm_setr_epi16(0, 8, 1, 9, 2, 10, 3, 11);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_unpacklo_epi32() {
        // Interleaves the low two 32-bit lanes of `a` and `b`.
        let a = _mm_setr_epi32(0, 1, 2, 3);
        let b = _mm_setr_epi32(4, 5, 6, 7);
        let r = _mm_unpacklo_epi32(a, b);
        let e = _mm_setr_epi32(0, 4, 1, 5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_unpacklo_epi64() {
        // Result is (a.low, b.low).
        let a = _mm_setr_epi64x(0, 1);
        let b = _mm_setr_epi64x(2, 3);
        let r = _mm_unpacklo_epi64(a, b);
        let e = _mm_setr_epi64x(0, 2);
        assert_eq_m128i(r, e);
    }
4456
    #[simd_test(enable = "sse2")]
    const fn test_mm_add_sd() {
        // Scalar (`_sd`) ops act on the low lane only; the high lane is
        // copied from `a` (2.0 survives).
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_add_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(6.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_add_pd() {
        // Packed (`_pd`) ops act on both lanes.
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_add_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(6.0, 12.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_div_sd() {
        // Low lane: 1.0 / 5.0; high lane copied from `a`.
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_div_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(0.2, 2.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_div_pd() {
        // Both lanes divided elementwise.
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_div_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(0.2, 0.2));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_max_sd() {
        // Low lane: max(1.0, 5.0); high lane copied from `a`.
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_max_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_max_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_max_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 10.0));

        // Check SSE(2)-specific semantics for -0.0 handling.
        // When the operands compare equal (-0.0 == 0.0), the SECOND operand
        // is returned bit-for-bit, as the two asserts below demonstrate.
        let a = _mm_setr_pd(-0.0, 0.0);
        let b = _mm_setr_pd(0.0, 0.0);
        // Cast to __m128i to compare exact bit patterns
        let r1 = _mm_castpd_si128(_mm_max_pd(a, b));
        let r2 = _mm_castpd_si128(_mm_max_pd(b, a));
        let a = _mm_castpd_si128(a);
        let b = _mm_castpd_si128(b);
        assert_eq_m128i(r1, b);
        assert_eq_m128i(r2, a);
        assert_ne!(a.as_u8x16(), b.as_u8x16()); // sanity check that -0.0 is actually present
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_min_sd() {
        // Low lane: min(1.0, 5.0); high lane copied from `a`.
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_min_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_min_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_min_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));

        // Check SSE(2)-specific semantics for -0.0 handling.
        // Mirrors test_mm_max_pd: for equal operands (-0.0 == 0.0) the
        // SECOND operand is returned bit-for-bit.
        let a = _mm_setr_pd(-0.0, 0.0);
        let b = _mm_setr_pd(0.0, 0.0);
        // Cast to __m128i to compare exact bit patterns
        let r1 = _mm_castpd_si128(_mm_min_pd(a, b));
        let r2 = _mm_castpd_si128(_mm_min_pd(b, a));
        let a = _mm_castpd_si128(a);
        let b = _mm_castpd_si128(b);
        assert_eq_m128i(r1, b);
        assert_eq_m128i(r2, a);
        assert_ne!(a.as_u8x16(), b.as_u8x16()); // sanity check that -0.0 is actually present
    }
4544
    #[simd_test(enable = "sse2")]
    const fn test_mm_mul_sd() {
        // Low lane: 1.0 * 5.0; high lane copied from `a`.
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_mul_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_mul_pd() {
        // Both lanes multiplied elementwise.
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_mul_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 20.0));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_sqrt_sd() {
        // Low lane: sqrt of `b`'s low lane; high lane copied from `a`.
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_sqrt_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0f64.sqrt(), 2.0));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_sqrt_pd() {
        // Square root of both lanes.
        let r = _mm_sqrt_pd(_mm_setr_pd(1.0, 2.0));
        assert_eq_m128d(r, _mm_setr_pd(1.0f64.sqrt(), 2.0f64.sqrt()));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_sub_sd() {
        // Low lane: 1.0 - 5.0; high lane copied from `a`.
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_sub_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(-4.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_sub_pd() {
        // Both lanes subtracted elementwise.
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_sub_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(-4.0, -8.0));
    }
4590
    // Bitwise AND on the raw f64 bit patterns: 5 & 3 = 1 in each lane.
    #[simd_test(enable = "sse2")]
    const fn test_mm_and_pd() {
        let a = f64x2::from_bits(u64x2::splat(5)).as_m128d();
        let b = f64x2::from_bits(u64x2::splat(3)).as_m128d();
        let r = _mm_and_pd(a, b);
        let e = f64x2::from_bits(u64x2::splat(1)).as_m128d();
        assert_eq_m128d(r, e);
    }
4599
    // Bitwise AND-NOT on raw bits: !5 & 3 = 2 in each lane.
    #[simd_test(enable = "sse2")]
    const fn test_mm_andnot_pd() {
        let a = f64x2::from_bits(u64x2::splat(5)).as_m128d();
        let b = f64x2::from_bits(u64x2::splat(3)).as_m128d();
        let r = _mm_andnot_pd(a, b);
        let e = f64x2::from_bits(u64x2::splat(2)).as_m128d();
        assert_eq_m128d(r, e);
    }
4608
    // Bitwise OR on raw bits: 5 | 3 = 7 in each lane.
    #[simd_test(enable = "sse2")]
    const fn test_mm_or_pd() {
        let a = f64x2::from_bits(u64x2::splat(5)).as_m128d();
        let b = f64x2::from_bits(u64x2::splat(3)).as_m128d();
        let r = _mm_or_pd(a, b);
        let e = f64x2::from_bits(u64x2::splat(7)).as_m128d();
        assert_eq_m128d(r, e);
    }
4617
    // Bitwise XOR on raw bits: 5 ^ 3 = 6 in each lane.
    #[simd_test(enable = "sse2")]
    const fn test_mm_xor_pd() {
        let a = f64x2::from_bits(u64x2::splat(5)).as_m128d();
        let b = f64x2::from_bits(u64x2::splat(3)).as_m128d();
        let r = _mm_xor_pd(a, b);
        let e = f64x2::from_bits(u64x2::splat(6)).as_m128d();
        assert_eq_m128d(r, e);
    }
4626
    // Scalar compare-equal: 1.0 == 1.0 gives an all-ones mask in the low lane;
    // the high lane keeps `a`'s raw bits (2.0).
    #[simd_test(enable = "sse2")]
    fn test_mm_cmpeq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpeq_sd(a, b));
        assert_eq_m128i(r, e);
    }
4634
    // Scalar compare-less-than: 1.0 < 5.0 gives all-ones in the low lane;
    // high lane carries `a`'s 2.0 bits.
    #[simd_test(enable = "sse2")]
    fn test_mm_cmplt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmplt_sd(a, b));
        assert_eq_m128i(r, e);
    }
4642
    // Scalar compare-less-or-equal: 1.0 <= 1.0 gives all-ones in the low lane.
    #[simd_test(enable = "sse2")]
    fn test_mm_cmple_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmple_sd(a, b));
        assert_eq_m128i(r, e);
    }
4650
    // Scalar compare-greater-than: 5.0 > 1.0 gives all-ones in the low lane.
    #[simd_test(enable = "sse2")]
    fn test_mm_cmpgt_sd() {
        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpgt_sd(a, b));
        assert_eq_m128i(r, e);
    }
4658
    // Scalar compare-greater-or-equal: 1.0 >= 1.0 gives all-ones in the low lane.
    #[simd_test(enable = "sse2")]
    fn test_mm_cmpge_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpge_sd(a, b));
        assert_eq_m128i(r, e);
    }
4666
    // Ordered compare: NAN vs 5.0 is unordered, so the low-lane mask is zero.
    #[simd_test(enable = "sse2")]
    fn test_mm_cmpord_sd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpord_sd(a, b));
        assert_eq_m128i(r, e);
    }
4674
    // Unordered compare: NAN vs 5.0 is unordered, so the low-lane mask is all-ones.
    #[simd_test(enable = "sse2")]
    fn test_mm_cmpunord_sd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpunord_sd(a, b));
        assert_eq_m128i(r, e);
    }
4682
    // Scalar compare-not-equal: 1.0 != 5.0 gives all-ones in the low lane.
    #[simd_test(enable = "sse2")]
    fn test_mm_cmpneq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpneq_sd(a, b));
        assert_eq_m128i(r, e);
    }
4690
    // Scalar not-less-than: 1.0 IS less than 5.0, so the low-lane mask is zero.
    #[simd_test(enable = "sse2")]
    fn test_mm_cmpnlt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpnlt_sd(a, b));
        assert_eq_m128i(r, e);
    }
4698
    // Scalar not-less-or-equal: 1.0 IS <= 1.0, so the low-lane mask is zero.
    #[simd_test(enable = "sse2")]
    fn test_mm_cmpnle_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpnle_sd(a, b));
        assert_eq_m128i(r, e);
    }
4706
    // Scalar not-greater-than: 5.0 IS greater than 1.0, so the low-lane mask is zero.
    #[simd_test(enable = "sse2")]
    fn test_mm_cmpngt_sd() {
        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpngt_sd(a, b));
        assert_eq_m128i(r, e);
    }
4714
    // Scalar not-greater-or-equal: 1.0 IS >= 1.0, so the low-lane mask is zero.
    #[simd_test(enable = "sse2")]
    fn test_mm_cmpnge_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpnge_sd(a, b));
        assert_eq_m128i(r, e);
    }
4722
    // Packed compare-equal: lane 0 matches (1.0 == 1.0), lane 1 does not.
    #[simd_test(enable = "sse2")]
    fn test_mm_cmpeq_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 0);
        let r = _mm_castpd_si128(_mm_cmpeq_pd(a, b));
        assert_eq_m128i(r, e);
    }
4730
    // Packed compare-less-than: only lane 1 satisfies 2.0 < 3.0.
    #[simd_test(enable = "sse2")]
    fn test_mm_cmplt_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = _mm_castpd_si128(_mm_cmplt_pd(a, b));
        assert_eq_m128i(r, e);
    }
4738
    // Packed compare-less-or-equal: both lanes satisfy (1.0 <= 1.0, 2.0 <= 3.0).
    #[simd_test(enable = "sse2")]
    fn test_mm_cmple_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, !0);
        let r = _mm_castpd_si128(_mm_cmple_pd(a, b));
        assert_eq_m128i(r, e);
    }
4746
    // Packed compare-greater-than: neither lane satisfies it.
    #[simd_test(enable = "sse2")]
    fn test_mm_cmpgt_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 0);
        let r = _mm_castpd_si128(_mm_cmpgt_pd(a, b));
        assert_eq_m128i(r, e);
    }
4754
    // Packed compare-greater-or-equal: only lane 0 satisfies (1.0 >= 1.0).
    #[simd_test(enable = "sse2")]
    fn test_mm_cmpge_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 0);
        let r = _mm_castpd_si128(_mm_cmpge_pd(a, b));
        assert_eq_m128i(r, e);
    }
4762
    // Packed ordered compare: lane 0 contains NAN (unordered → 0),
    // lane 1 is ordered (→ all-ones).
    #[simd_test(enable = "sse2")]
    fn test_mm_cmpord_pd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = _mm_castpd_si128(_mm_cmpord_pd(a, b));
        assert_eq_m128i(r, e);
    }
4770
    // Packed unordered compare: only the NAN-containing lane 0 is unordered.
    #[simd_test(enable = "sse2")]
    fn test_mm_cmpunord_pd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, 0);
        let r = _mm_castpd_si128(_mm_cmpunord_pd(a, b));
        assert_eq_m128i(r, e);
    }
4778
    // Packed compare-not-equal: both lanes differ.
    #[simd_test(enable = "sse2")]
    fn test_mm_cmpneq_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, !0);
        let r = _mm_castpd_si128(_mm_cmpneq_pd(a, b));
        assert_eq_m128i(r, e);
    }
4786
    // Packed not-less-than: both lanes ARE less than their counterparts → all zero.
    #[simd_test(enable = "sse2")]
    fn test_mm_cmpnlt_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, 0);
        let r = _mm_castpd_si128(_mm_cmpnlt_pd(a, b));
        assert_eq_m128i(r, e);
    }
4794
    // Packed not-less-or-equal: both lanes ARE <= their counterparts → all zero.
    #[simd_test(enable = "sse2")]
    fn test_mm_cmpnle_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 0);
        let r = _mm_castpd_si128(_mm_cmpnle_pd(a, b));
        assert_eq_m128i(r, e);
    }
4802
    // Packed not-greater-than: lane 0 (5.0 > 1.0) fails, lane 1 (2.0 > 3.0) passes.
    #[simd_test(enable = "sse2")]
    fn test_mm_cmpngt_pd() {
        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = _mm_castpd_si128(_mm_cmpngt_pd(a, b));
        assert_eq_m128i(r, e);
    }
4810
    // Packed not-greater-or-equal: lane 0 (1.0 >= 1.0) fails, lane 1 passes.
    #[simd_test(enable = "sse2")]
    fn test_mm_cmpnge_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = _mm_castpd_si128(_mm_cmpnge_pd(a, b));
        assert_eq_m128i(r, e);
    }
4818
    // Ordered scalar equality (comisd): equal low lanes → nonzero;
    // NAN makes the comparison fail → zero.
    #[simd_test(enable = "sse2")]
    fn test_mm_comieq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comieq_sd(a, b) != 0);

        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comieq_sd(a, b) == 0);
    }
4827
    // Ordered scalar less-than: 1.0 < 1.0 is false → zero.
    #[simd_test(enable = "sse2")]
    fn test_mm_comilt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comilt_sd(a, b) == 0);
    }
4833
    // Ordered scalar less-or-equal: 1.0 <= 1.0 is true → nonzero.
    #[simd_test(enable = "sse2")]
    fn test_mm_comile_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comile_sd(a, b) != 0);
    }
4839
    // Ordered scalar greater-than: 1.0 > 1.0 is false → zero.
    #[simd_test(enable = "sse2")]
    fn test_mm_comigt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comigt_sd(a, b) == 0);
    }
4845
    // Ordered scalar greater-or-equal: 1.0 >= 1.0 is true → nonzero.
    #[simd_test(enable = "sse2")]
    fn test_mm_comige_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comige_sd(a, b) != 0);
    }
4851
    // Ordered scalar not-equal: 1.0 != 1.0 is false → zero.
    #[simd_test(enable = "sse2")]
    fn test_mm_comineq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comineq_sd(a, b) == 0);
    }
4857
    // Unordered scalar equality (ucomisd): equal lanes → nonzero;
    // NAN vs NAN is unordered, equality reports zero.
    #[simd_test(enable = "sse2")]
    fn test_mm_ucomieq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomieq_sd(a, b) != 0);

        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(NAN, 3.0));
        assert!(_mm_ucomieq_sd(a, b) == 0);
    }
4866
    // Unordered scalar less-than: 1.0 < 1.0 is false → zero.
    #[simd_test(enable = "sse2")]
    fn test_mm_ucomilt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomilt_sd(a, b) == 0);
    }
4872
    // Unordered scalar less-or-equal: 1.0 <= 1.0 is true → nonzero.
    #[simd_test(enable = "sse2")]
    fn test_mm_ucomile_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomile_sd(a, b) != 0);
    }
4878
    // Unordered scalar greater-than: 1.0 > 1.0 is false → zero.
    #[simd_test(enable = "sse2")]
    fn test_mm_ucomigt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomigt_sd(a, b) == 0);
    }
4884
    // Unordered scalar greater-or-equal: 1.0 >= 1.0 is true → nonzero.
    #[simd_test(enable = "sse2")]
    fn test_mm_ucomige_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomige_sd(a, b) != 0);
    }
4890
    // Unordered scalar not-equal: 1.0 != 1.0 is false → zero.
    #[simd_test(enable = "sse2")]
    fn test_mm_ucomineq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomineq_sd(a, b) == 0);
    }
4896
    // Movemask collects the sign bit of each lane: bit 0 = lane 0, bit 1 = lane 1.
    #[simd_test(enable = "sse2")]
    const fn test_mm_movemask_pd() {
        let r = _mm_movemask_pd(_mm_setr_pd(-1.0, 5.0));
        assert_eq!(r, 0b01);

        let r = _mm_movemask_pd(_mm_setr_pd(-1.0, -5.0));
        assert_eq!(r, 0b11);
    }
4905
    // 16-byte-aligned backing store used by the aligned load/store tests below
    // (`_mm_load_pd`, `_mm_store_pd`, etc. require 16-byte alignment).
    #[repr(align(16))]
    struct Memory {
        data: [f64; 4],
    }
4910
    // Aligned load of two f64 lanes from the 16-byte-aligned `Memory` buffer.
    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_load_pd() {
        let mem = Memory {
            data: [1.0f64, 2.0, 3.0, 4.0],
        };
        let vals = &mem.data;
        let d = vals.as_ptr();

        let r = _mm_load_pd(d);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
    }
4922
    // Scalar load: low lane gets the loaded value, high lane is zeroed.
    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_load_sd() {
        let a = 1.;
        let expected = _mm_setr_pd(a, 0.);
        let r = _mm_load_sd(&a);
        assert_eq_m128d(r, expected);
    }
4930
    // loadh replaces only the high lane with the loaded scalar;
    // the low lane is taken from `a`.
    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_loadh_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = 3.;
        let expected = _mm_setr_pd(_mm_cvtsd_f64(a), 3.);
        let r = _mm_loadh_pd(a, &b);
        assert_eq_m128d(r, expected);
    }
4939
    // loadl replaces only the low lane with the loaded scalar;
    // the high lane is taken from `a`.
    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_loadl_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = 3.;
        let expected = _mm_setr_pd(3., get_m128d(a, 1));
        let r = _mm_loadl_pd(a, &b);
        assert_eq_m128d(r, expected);
    }
4948
    // Non-temporal store to over-aligned memory, followed by an sfence so the
    // streamed data is visible before the readback.
    #[simd_test(enable = "sse2")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_stream_pd() {
        #[repr(align(128))]
        struct Memory {
            pub data: [f64; 2],
        }
        let a = _mm_set1_pd(7.0);
        let mut mem = Memory { data: [-1.0; 2] };

        _mm_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
        _mm_sfence();
        for i in 0..2 {
            assert_eq!(mem.data[i], get_m128d(a, i));
        }
    }
4967
    // Scalar store: writes only the low lane of `a` to the destination.
    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_store_sd() {
        let mut dest = 0.;
        let a = _mm_setr_pd(1., 2.);
        _mm_store_sd(&mut dest, a);
        assert_eq!(dest, _mm_cvtsd_f64(a));
    }
4975
    // Aligned store of both lanes; `black_box` keeps the store from being folded away.
    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_store_pd() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        _mm_store_pd(d, *black_box(&a));
        assert_eq!(vals[0], 1.0);
        assert_eq!(vals[1], 2.0);
    }
4987
    // Unaligned store: offsetting by one f64 from a 16-byte-aligned base
    // guarantees a misaligned pointer, which storeu must handle.
    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_storeu_pd() {
        // guaranteed to be aligned to 16 bytes
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);

        // so p is *not* aligned to 16 bytes
        let p = vals.as_mut_ptr().offset(1);
        _mm_storeu_pd(p, *black_box(&a));

        assert_eq!(*vals, [0.0, 1.0, 2.0, 0.0]);
    }
5001
    // Stores only the lowest 16 bits of `a`; the rest of `r` must stay untouched.
    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_storeu_si16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let mut r = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
        _mm_storeu_si16(ptr::addr_of_mut!(r).cast(), a);
        let e = _mm_setr_epi16(1, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m128i(r, e);
    }
5010
    // Stores only the lowest 32 bits of `a`; the rest of `r` must stay untouched.
    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_storeu_si32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let mut r = _mm_setr_epi32(5, 6, 7, 8);
        _mm_storeu_si32(ptr::addr_of_mut!(r).cast(), a);
        let e = _mm_setr_epi32(1, 6, 7, 8);
        assert_eq_m128i(r, e);
    }
5019
    // Stores only the lowest 64 bits of `a`; the high half of `r` must stay untouched.
    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_storeu_si64() {
        let a = _mm_setr_epi64x(1, 2);
        let mut r = _mm_setr_epi64x(3, 4);
        _mm_storeu_si64(ptr::addr_of_mut!(r).cast(), a);
        let e = _mm_setr_epi64x(1, 4);
        assert_eq_m128i(r, e);
    }
5028
    // store1 duplicates the low lane of `a` into both memory slots.
    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_store1_pd() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        _mm_store1_pd(d, *black_box(&a));
        assert_eq!(vals[0], 1.0);
        assert_eq!(vals[1], 1.0);
    }
5040
    // store_pd1 is the alias of store1: duplicates the low lane into both slots.
    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_store_pd1() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        _mm_store_pd1(d, *black_box(&a));
        assert_eq!(vals[0], 1.0);
        assert_eq!(vals[1], 1.0);
    }
5052
    // storer writes the lanes in reversed order: high lane first, then low lane.
    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_storer_pd() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        _mm_storer_pd(d, *black_box(&a));
        assert_eq!(vals[0], 2.0);
        assert_eq!(vals[1], 1.0);
    }
5064
    // storeh writes only the high lane of `a` to the destination.
    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_storeh_pd() {
        let mut dest = 0.;
        let a = _mm_setr_pd(1., 2.);
        _mm_storeh_pd(&mut dest, a);
        assert_eq!(dest, get_m128d(a, 1));
    }
5072
    // storel writes only the low lane of `a` to the destination.
    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_storel_pd() {
        let mut dest = 0.;
        let a = _mm_setr_pd(1., 2.);
        _mm_storel_pd(&mut dest, a);
        assert_eq!(dest, _mm_cvtsd_f64(a));
    }
5080
    // loadr performs an aligned load with the two lanes swapped.
    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_loadr_pd() {
        let mut mem = Memory {
            data: [1.0f64, 2.0, 3.0, 4.0],
        };
        let vals = &mut mem.data;
        let d = vals.as_ptr();

        let r = _mm_loadr_pd(d);
        assert_eq_m128d(r, _mm_setr_pd(2.0, 1.0));
    }
5092
    // Unaligned load: offsetting by one f64 from the 16-byte-aligned base
    // forces a misaligned pointer, which loadu must accept.
    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_loadu_pd() {
        // guaranteed to be aligned to 16 bytes
        let mut mem = Memory {
            data: [1.0f64, 2.0, 3.0, 4.0],
        };
        let vals = &mut mem.data;

        // so this will *not* be aligned to 16 bytes
        let d = vals.as_ptr().offset(1);

        let r = _mm_loadu_pd(d);
        let e = _mm_setr_pd(2.0, 3.0);
        assert_eq_m128d(r, e);
    }
5108
    // Loads 16 bits into the lowest element and zeroes the remaining lanes.
    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_loadu_si16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm_loadu_si16(ptr::addr_of!(a) as *const _);
        assert_eq_m128i(r, _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0));
    }
5115
    // Loads 32 bits into the lowest element and zeroes the remaining lanes.
    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_loadu_si32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let r = _mm_loadu_si32(ptr::addr_of!(a) as *const _);
        assert_eq_m128i(r, _mm_setr_epi32(1, 0, 0, 0));
    }
5122
    // Loads 64 bits into the low half and zeroes the high half.
    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_loadu_si64() {
        let a = _mm_setr_epi64x(5, 6);
        let r = _mm_loadu_si64(ptr::addr_of!(a) as *const _);
        assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
    }
5129
    // f64 → f32 narrowing: results land in the two low f32 lanes, the upper
    // lanes are zeroed; out-of-f32-range doubles round to ±infinity.
    #[simd_test(enable = "sse2")]
    const fn test_mm_cvtpd_ps() {
        let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, 5.0));
        assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, 0.0));

        let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, -5.0));
        assert_eq_m128(r, _mm_setr_ps(-1.0, -5.0, 0.0, 0.0));

        let r = _mm_cvtpd_ps(_mm_setr_pd(f64::MAX, f64::MIN));
        assert_eq_m128(r, _mm_setr_ps(f32::INFINITY, f32::NEG_INFINITY, 0.0, 0.0));

        let r = _mm_cvtpd_ps(_mm_setr_pd(f32::MAX as f64, f32::MIN as f64));
        assert_eq_m128(r, _mm_setr_ps(f32::MAX, f32::MIN, 0.0, 0.0));
    }
5144
    // f32 → f64 widening of the two low f32 lanes (upper lanes are ignored);
    // the conversion is exact, infinities stay infinities.
    #[simd_test(enable = "sse2")]
    const fn test_mm_cvtps_pd() {
        let r = _mm_cvtps_pd(_mm_setr_ps(-1.0, 2.0, -3.0, 5.0));
        assert_eq_m128d(r, _mm_setr_pd(-1.0, 2.0));

        let r = _mm_cvtps_pd(_mm_setr_ps(
            f32::MAX,
            f32::INFINITY,
            f32::NEG_INFINITY,
            f32::MIN,
        ));
        assert_eq_m128d(r, _mm_setr_pd(f32::MAX as f64, f64::INFINITY));
    }
5158
    // f64 → i32 (rounding) into the two low lanes, upper lanes zeroed;
    // out-of-range, infinite, and NaN inputs all produce i32::MIN
    // (the x86 "integer indefinite" value).
    #[simd_test(enable = "sse2")]
    fn test_mm_cvtpd_epi32() {
        let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, 5.0));
        assert_eq_m128i(r, _mm_setr_epi32(-1, 5, 0, 0));

        let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, -5.0));
        assert_eq_m128i(r, _mm_setr_epi32(-1, -5, 0, 0));

        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::MAX, f64::MIN));
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));

        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::INFINITY, f64::NEG_INFINITY));
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));

        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::NAN, f64::NAN));
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
    }
5176
    // Low-lane f64 → i32 (rounding); out-of-range and NaN give i32::MIN.
    #[simd_test(enable = "sse2")]
    fn test_mm_cvtsd_si32() {
        let r = _mm_cvtsd_si32(_mm_setr_pd(-2.0, 5.0));
        assert_eq!(r, -2);

        let r = _mm_cvtsd_si32(_mm_setr_pd(f64::MAX, f64::MIN));
        assert_eq!(r, i32::MIN);

        let r = _mm_cvtsd_si32(_mm_setr_pd(f64::NAN, f64::NAN));
        assert_eq!(r, i32::MIN);
    }
5188
    // Converts `b`'s low f64 to f32 into `a`'s lane 0; lanes 1-3 of `a` are kept.
    // The second case checks that f64 infinity narrows to f32 infinity.
    #[simd_test(enable = "sse2")]
    fn test_mm_cvtsd_ss() {
        let a = _mm_setr_ps(-1.1, -2.2, 3.3, 4.4);
        let b = _mm_setr_pd(2.0, -5.0);

        let r = _mm_cvtsd_ss(a, b);

        assert_eq_m128(r, _mm_setr_ps(2.0, -2.2, 3.3, 4.4));

        let a = _mm_setr_ps(-1.1, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY);
        let b = _mm_setr_pd(f64::INFINITY, -5.0);

        let r = _mm_cvtsd_ss(a, b);

        assert_eq_m128(
            r,
            _mm_setr_ps(
                f32::INFINITY,
                f32::NEG_INFINITY,
                f32::MAX,
                f32::NEG_INFINITY,
            ),
        );
    }
5213
    // Extracts the low f64 lane as a scalar.
    #[simd_test(enable = "sse2")]
    const fn test_mm_cvtsd_f64() {
        let r = _mm_cvtsd_f64(_mm_setr_pd(-1.1, 2.2));
        assert_eq!(r, -1.1);
    }
5219
    // Converts `b`'s low f32 to f64 into `a`'s low lane; `a`'s high lane is kept.
    // The second case checks f32 negative infinity widens to f64 negative infinity.
    #[simd_test(enable = "sse2")]
    const fn test_mm_cvtss_sd() {
        let a = _mm_setr_pd(-1.1, 2.2);
        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        let r = _mm_cvtss_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.2));

        let a = _mm_setr_pd(-1.1, f64::INFINITY);
        let b = _mm_setr_ps(f32::NEG_INFINITY, 2.0, 3.0, 4.0);

        let r = _mm_cvtss_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(f64::NEG_INFINITY, f64::INFINITY));
    }
5234
    // Truncating f64 → i32 conversion (round toward zero); infinity and NaN
    // both produce i32::MIN.
    #[simd_test(enable = "sse2")]
    fn test_mm_cvttpd_epi32() {
        let a = _mm_setr_pd(-1.1, 2.2);
        let r = _mm_cvttpd_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(-1, 2, 0, 0));

        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
        let r = _mm_cvttpd_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
    }
5245
    // Truncating low-lane f64 → i32; -1.1 truncates toward zero to -1,
    // infinity gives i32::MIN.
    #[simd_test(enable = "sse2")]
    fn test_mm_cvttsd_si32() {
        let a = _mm_setr_pd(-1.1, 2.2);
        let r = _mm_cvttsd_si32(a);
        assert_eq!(r, -1);

        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
        let r = _mm_cvttsd_si32(a);
        assert_eq!(r, i32::MIN);
    }
5256
    // Truncating packed f32 → i32 across all four lanes; out-of-range values
    // (infinities, f32::MIN/MAX) all give i32::MIN.
    #[simd_test(enable = "sse2")]
    fn test_mm_cvttps_epi32() {
        let a = _mm_setr_ps(-1.1, 2.2, -3.3, 6.6);
        let r = _mm_cvttps_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(-1, 2, -3, 6));

        let a = _mm_setr_ps(f32::NEG_INFINITY, f32::INFINITY, f32::MIN, f32::MAX);
        let r = _mm_cvttps_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, i32::MIN, i32::MIN));
    }
5267
    // set_sd places the scalar in the low lane and zeroes the high lane.
    #[simd_test(enable = "sse2")]
    const fn test_mm_set_sd() {
        let r = _mm_set_sd(-1.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, 0_f64));
    }
5273
    // set1 broadcasts the scalar to both lanes.
    #[simd_test(enable = "sse2")]
    const fn test_mm_set1_pd() {
        let r = _mm_set1_pd(-1.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, -1.0_f64));
    }
5279
    // set_pd1 is the broadcast alias of set1_pd.
    #[simd_test(enable = "sse2")]
    const fn test_mm_set_pd1() {
        let r = _mm_set_pd1(-2.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(-2.0_f64, -2.0_f64));
    }
5285
    // set_pd takes arguments high-lane-first, so it is the reverse of setr_pd.
    #[simd_test(enable = "sse2")]
    const fn test_mm_set_pd() {
        let r = _mm_set_pd(1.0_f64, 5.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(5.0_f64, 1.0_f64));
    }
5291
    // setr_pd takes arguments low-lane-first (memory order).
    #[simd_test(enable = "sse2")]
    const fn test_mm_setr_pd() {
        let r = _mm_setr_pd(1.0_f64, -5.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(1.0_f64, -5.0_f64));
    }
5297
    // setzero produces a vector of two positive zeros.
    #[simd_test(enable = "sse2")]
    const fn test_mm_setzero_pd() {
        let r = _mm_setzero_pd();
        assert_eq_m128d(r, _mm_setr_pd(0_f64, 0_f64));
    }
5303
    // load1 loads one f64 and broadcasts it to both lanes.
    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_load1_pd() {
        let d = -5.0;
        let r = _mm_load1_pd(&d);
        assert_eq_m128d(r, _mm_setr_pd(d, d));
    }
5310
    // load_pd1 is the broadcast-load alias of load1_pd.
    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_load_pd1() {
        let d = -5.0;
        let r = _mm_load_pd1(&d);
        assert_eq_m128d(r, _mm_setr_pd(d, d));
    }
5317
    // unpackhi interleaves the high lanes: [a1, b1].
    #[simd_test(enable = "sse2")]
    const fn test_mm_unpackhi_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(3.0, 4.0);
        let r = _mm_unpackhi_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(2.0, 4.0));
    }
5325
    // unpacklo interleaves the low lanes: [a0, b0].
    #[simd_test(enable = "sse2")]
    const fn test_mm_unpacklo_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(3.0, 4.0);
        let r = _mm_unpacklo_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 3.0));
    }
5333
    // Shuffle with MASK = 0: selects a[0] for the low lane and b[0] for the high lane.
    #[simd_test(enable = "sse2")]
    const fn test_mm_shuffle_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(3., 4.);
        let expected = _mm_setr_pd(1., 3.);
        let r = _mm_shuffle_pd::<0b00_00_00_00>(a, b);
        assert_eq_m128d(r, expected);
    }
5342
    // move_sd takes the low lane from `b` and the high lane from `a`.
    #[simd_test(enable = "sse2")]
    const fn test_mm_move_sd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(3., 4.);
        let expected = _mm_setr_pd(3., 2.);
        let r = _mm_move_sd(a, b);
        assert_eq_m128d(r, expected);
    }
5351
    // Bit-cast __m128d → __m128; all-zero bits are all-zero in either view.
    #[simd_test(enable = "sse2")]
    const fn test_mm_castpd_ps() {
        let a = _mm_set1_pd(0.);
        let expected = _mm_set1_ps(0.);
        let r = _mm_castpd_ps(a);
        assert_eq_m128(r, expected);
    }
5359
    // Bit-cast __m128d → __m128i; zero stays zero.
    #[simd_test(enable = "sse2")]
    const fn test_mm_castpd_si128() {
        let a = _mm_set1_pd(0.);
        let expected = _mm_set1_epi64x(0);
        let r = _mm_castpd_si128(a);
        assert_eq_m128i(r, expected);
    }
5367
    // Bit-cast __m128 → __m128d; zero stays zero.
    #[simd_test(enable = "sse2")]
    const fn test_mm_castps_pd() {
        let a = _mm_set1_ps(0.);
        let expected = _mm_set1_pd(0.);
        let r = _mm_castps_pd(a);
        assert_eq_m128d(r, expected);
    }
5375
    // Bit-cast __m128 → __m128i; zero stays zero.
    #[simd_test(enable = "sse2")]
    const fn test_mm_castps_si128() {
        let a = _mm_set1_ps(0.);
        let expected = _mm_set1_epi32(0);
        let r = _mm_castps_si128(a);
        assert_eq_m128i(r, expected);
    }
5383
    // Bit-cast __m128i → __m128d; zero stays zero.
    #[simd_test(enable = "sse2")]
    const fn test_mm_castsi128_pd() {
        let a = _mm_set1_epi64x(0);
        let expected = _mm_set1_pd(0.);
        let r = _mm_castsi128_pd(a);
        assert_eq_m128d(r, expected);
    }
5391
    // Bit-cast __m128i → __m128; zero stays zero.
    #[simd_test(enable = "sse2")]
    const fn test_mm_castsi128_ps() {
        let a = _mm_set1_epi32(0);
        let expected = _mm_set1_ps(0.);
        let r = _mm_castsi128_ps(a);
        assert_eq_m128(r, expected);
    }
5399}